From a45027f22c73b321f1f9cc110f0782fac10d5748 Mon Sep 17 00:00:00 2001
From: serxa <sergei@clickhouse.com>
Date: Wed, 31 Jul 2024 15:22:48 +0000
Subject: [PATCH 01/88] Fix flaky `test_delayed_replica_failover`

---
 tests/integration/test_delayed_replica_failover/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/test_delayed_replica_failover/test.py b/tests/integration/test_delayed_replica_failover/test.py
index a480ee3f278..c545877797a 100644
--- a/tests/integration/test_delayed_replica_failover/test.py
+++ b/tests/integration/test_delayed_replica_failover/test.py
@@ -101,7 +101,7 @@ SELECT sum(x) FROM distributed WITH TOTALS SETTINGS
         # allow pings to zookeeper to timeout (must be greater than ZK session timeout).
         for _ in range(30):
             try:
-                node_2_2.query("SELECT * FROM system.zookeeper where path = '/'")
+                node_2_2.query("SELECT * FROM system.zookeeper where path = '/' SETTINGS insert_keeper_max_retries = 0")
                 time.sleep(0.5)
             except:
                 break
@@ -120,7 +120,7 @@ SELECT sum(x) FROM distributed SETTINGS
             == "3"
         )
 
-        # Regression for skip_unavailable_shards in conjunction with skip_unavailable_shards
+        # Prefer fallback_to_stale_replicas over skip_unavailable_shards
         assert (
             instance_with_dist_table.query(
                 """

From f9c9d85e4109511bed14f5e7edb0f31b0bf0beae Mon Sep 17 00:00:00 2001
From: robot-clickhouse <robot-clickhouse@users.noreply.github.com>
Date: Wed, 31 Jul 2024 16:50:56 +0000
Subject: [PATCH 02/88] Automatic style fix

---
 tests/integration/test_delayed_replica_failover/test.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/integration/test_delayed_replica_failover/test.py b/tests/integration/test_delayed_replica_failover/test.py
index c545877797a..1116d225b8c 100644
--- a/tests/integration/test_delayed_replica_failover/test.py
+++ b/tests/integration/test_delayed_replica_failover/test.py
@@ -101,7 +101,9 @@ SELECT sum(x) FROM distributed WITH TOTALS SETTINGS
         # allow pings to zookeeper to timeout (must be greater than ZK session timeout).
         for _ in range(30):
             try:
-                node_2_2.query("SELECT * FROM system.zookeeper where path = '/' SETTINGS insert_keeper_max_retries = 0")
+                node_2_2.query(
+                    "SELECT * FROM system.zookeeper where path = '/' SETTINGS insert_keeper_max_retries = 0"
+                )
                 time.sleep(0.5)
             except:
                 break

From 2251ad963992d4656c5ae20a7221aff36a86cc1d Mon Sep 17 00:00:00 2001
From: taiyang-li <654010905@qq.com>
Date: Mon, 5 Aug 2024 16:44:53 +0800
Subject: [PATCH 03/88] optimize orc string column reading

---
 .../Impl/NativeORCBlockInputFormat.cpp        | 34 ++++++++++++++-----
 1 file changed, 26 insertions(+), 8 deletions(-)

diff --git a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
index 649721f28bf..a0a80ec4a58 100644
--- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
@@ -1143,24 +1143,42 @@ readColumnWithStringData(const orc::ColumnVectorBatch * orc_column, const orc::T
         reserver_size += 1;
     }
 
-    column_chars_t.reserve(reserver_size);
-    column_offsets.reserve(orc_str_column->numElements);
+    column_chars_t.resize_exact(reserver_size); /// Sized up front: the loops below write by index instead of appending.
+    column_offsets.resize_exact(orc_str_column->numElements);
 
     size_t curr_offset = 0;
-    for (size_t i = 0; i < orc_str_column->numElements; ++i)
+    if (!orc_str_column->hasNulls) /// hasNulls is tested once here rather than for every row.
     {
-        if (!orc_str_column->hasNulls || orc_str_column->notNull[i])
+        for (size_t i = 0; i < orc_str_column->numElements; ++i)
         {
             const auto * buf = orc_str_column->data[i];
             size_t buf_size = orc_str_column->length[i];
-            column_chars_t.insert_assume_reserved(buf, buf + buf_size);
+            memcpy(&column_chars_t[curr_offset], buf, buf_size);
             curr_offset += buf_size;
+
+            column_chars_t[curr_offset] = 0; /// Zero byte written after each value.
+            ++curr_offset;
+
+            column_offsets[i] = curr_offset;
         }
+    }
+    else
+    {
+        for (size_t i = 0; i < orc_str_column->numElements; ++i)
+        {
+            if (orc_str_column->notNull[i])
+            {
+                const auto * buf = orc_str_column->data[i];
+                size_t buf_size = orc_str_column->length[i];
+                memcpy(&column_chars_t[curr_offset], buf, buf_size);
+                curr_offset += buf_size;
+            }
 
-        column_chars_t.push_back(0);
-        ++curr_offset;
+            column_chars_t[curr_offset] = 0; /// A null row contributes only this zero byte.
+            ++curr_offset;
 
-        column_offsets.push_back(curr_offset);
+            column_offsets[i] = curr_offset;
+        }
     }
     return {std::move(internal_column), std::move(internal_type), column_name};
 }

From 719ccaba5acb87734d1bb4cc2c4f5e76ad978c0a Mon Sep 17 00:00:00 2001
From: taiyang-li <654010905@qq.com>
Date: Mon, 5 Aug 2024 18:36:46 +0800
Subject: [PATCH 04/88] optimize parquet string column reading

---
 .../Formats/Impl/ArrowColumnToCHColumn.cpp    | 25 +++++++++++++++----
 .../Impl/NativeORCBlockInputFormat.cpp        |  1 +
 2 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
index ed91913de4d..fb56fdd4fe0 100644
--- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
+++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp
@@ -133,16 +133,31 @@ static ColumnWithTypeAndName readColumnWithStringData(const std::shared_ptr<arro
         std::shared_ptr<arrow::Buffer> buffer = chunk.value_data();
         const size_t chunk_length = chunk.length();
 
-        for (size_t offset_i = 0; offset_i != chunk_length; ++offset_i)
+        const size_t null_count = chunk.null_count();
+        if (null_count == 0 && buffer) /// Fast path needs a data buffer; without one fall through to the guarded loop below.
         {
-            if (!chunk.IsNull(offset_i) && buffer)
+            for (size_t offset_i = 0; offset_i != chunk_length; ++offset_i)
             {
                 const auto * raw_data = buffer->data() + chunk.value_offset(offset_i);
                 column_chars_t.insert_assume_reserved(raw_data, raw_data + chunk.value_length(offset_i));
-            }
-            column_chars_t.emplace_back('\0');
+                column_chars_t.emplace_back('\0');
 
-            column_offsets.emplace_back(column_chars_t.size());
+                column_offsets.emplace_back(column_chars_t.size());
+            }
+        }
+        else
+        {
+            for (size_t offset_i = 0; offset_i != chunk_length; ++offset_i)
+            {
+                if (!chunk.IsNull(offset_i) && buffer) /// Null rows (or a missing buffer) append only the terminating zero.
+                {
+                    const auto * raw_data = buffer->data() + chunk.value_offset(offset_i);
+                    column_chars_t.insert_assume_reserved(raw_data, raw_data + chunk.value_length(offset_i));
+                }
+                column_chars_t.emplace_back('\0');
+
+                column_offsets.emplace_back(column_chars_t.size());
+            }
         }
     }
     return {std::move(internal_column), std::move(internal_type), column_name};
diff --git a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
index a0a80ec4a58..81bea0af53b 100644
--- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
@@ -1,4 +1,5 @@
 #include "NativeORCBlockInputFormat.h"
+#include "Columns/ColumnsCommon.h"
 
 #if USE_ORC
 #    include <Columns/ColumnDecimal.h>

From f147e5c39e19d1097361571ebedf4507c744c700 Mon Sep 17 00:00:00 2001
From: taiyang-li <654010905@qq.com>
Date: Mon, 5 Aug 2024 18:37:55 +0800
Subject: [PATCH 05/88] optimize parquet string column reading

---
 src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
index 81bea0af53b..a0a80ec4a58 100644
--- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
+++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp
@@ -1,5 +1,4 @@
 #include "NativeORCBlockInputFormat.h"
-#include "Columns/ColumnsCommon.h"
 
 #if USE_ORC
 #    include <Columns/ColumnDecimal.h>

From 42aa967311a55d3da0e1230595b0e0ca9928e777 Mon Sep 17 00:00:00 2001
From: Anton Popov <anton@clickhouse.com>
Date: Thu, 8 Aug 2024 00:38:05 +0000
Subject: [PATCH 06/88] add profile events for merges

---
 src/Common/ProfileEvents.cpp                  | 25 +++++-
 .../Merges/AggregatingSortedTransform.h       | 10 +++
 .../Algorithms/AggregatingSortedAlgorithm.h   |  2 +
 .../FinishAggregatingInOrderAlgorithm.cpp     |  3 +
 .../FinishAggregatingInOrderAlgorithm.h       |  5 ++
 .../GraphiteRollupSortedAlgorithm.h           |  2 +
 .../Merges/Algorithms/IMergingAlgorithm.h     | 11 ++-
 .../IMergingAlgorithmWithSharedChunks.h       |  2 +
 src/Processors/Merges/Algorithms/MergedData.h |  2 +
 .../Algorithms/MergingSortedAlgorithm.h       |  2 +-
 .../Algorithms/SummingSortedAlgorithm.h       |  2 +
 .../Merges/CollapsingSortedTransform.h        | 10 +++
 src/Processors/Merges/IMergingTransform.h     | 35 +++++++-
 .../Merges/MergingSortedTransform.cpp         | 26 ++----
 .../Merges/MergingSortedTransform.h           |  4 -
 .../Merges/ReplacingSortedTransform.h         |  9 ++
 .../Merges/SummingSortedTransform.h           | 10 +++
 .../Merges/VersionedCollapsingTransform.h     |  9 ++
 .../Transforms/ColumnGathererTransform.cpp    | 57 ++++++-------
 .../Transforms/ColumnGathererTransform.h      | 11 ++-
 .../Transforms/MergeJoinTransform.cpp         | 12 ++-
 .../Transforms/MergeJoinTransform.h           |  2 +
 .../Transforms/MergeSortingTransform.cpp      |  2 -
 .../Transforms/PasteJoinTransform.cpp         | 10 +++
 .../Transforms/PasteJoinTransform.h           |  3 +-
 .../gtest_blocks_size_merging_streams.cpp     |  4 +-
 src/Storages/MergeTree/MergeList.h            |  1 +
 src/Storages/MergeTree/MergeProgress.h        | 27 +++---
 src/Storages/MergeTree/MergeTask.cpp          | 84 +++++++++++++++----
 src/Storages/MergeTree/MergeTask.h            | 20 ++++-
 src/Storages/MergeTree/MutateTask.cpp         | 10 ++-
 31 files changed, 308 insertions(+), 104 deletions(-)

diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp
index ccdce7ff584..857a08d8a5d 100644
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@@ -210,7 +210,29 @@
     M(Merge, "Number of launched background merges.") \
     M(MergedRows, "Rows read for background merges. This is the number of rows before merge.") \
     M(MergedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for background merges. This is the number before merge.") \
-    M(MergesTimeMilliseconds, "Total time spent for background merges.")\
+    M(MergeTotalMilliseconds, "Total time spent for background merges") \
+    M(MergeExecuteMilliseconds, "Total busy time spent for execution of background merges") \
+    M(MergeHorizontalStageTotalMilliseconds, "Total time spent for horizontal stage of background merges") \
+    M(MergeHorizontalStageExecuteMilliseconds, "Total busy time spent for execution of horizontal stage of background merges") \
+    M(MergeVerticalStageTotalMilliseconds, "Total time spent for vertical stage of background merges") \
+    M(MergeVerticalStageExecuteMilliseconds, "Total busy time spent for execution of vertical stage of background merges") \
+    M(MergeProjectionStageTotalMilliseconds, "Total time spent for projection stage of background merges") \
+    M(MergeProjectionStageExecuteMilliseconds, "Total busy time spent for execution of projection stage of background merges") \
+    \
+    M(MergingSortedMilliseconds, "Total time spent while merging sorted columns") \
+    M(AggregatingSortedMilliseconds, "Total time spent while aggregating sorted columns") \
+    M(CollapsingSortedMilliseconds, "Total time spent while collapsing sorted columns") \
+    M(ReplacingSortedMilliseconds, "Total time spent while replacing sorted columns") \
+    M(SummingSortedMilliseconds, "Total time spent while summing sorted columns") \
+    M(VersionedCollapsingSortedMilliseconds, "Total time spent while version collapsing sorted columns") \
+    M(GatheringColumnMilliseconds, "Total time spent while gathering columns for vertical merge") \
+    \
+    M(MutationTotalParts, "Number of total parts for which mutations tried to be applied") \
+    M(MutationUntouchedParts, "Number of total parts for which mutations tried to be applied but which was completely skipped according to predicate") \
+    M(MutatedRows, "Rows read for mutations. This is the number of rows before mutation") \
+    M(MutatedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for mutations. This is the number before mutation.") \
+    M(MutationTimeMilliseconds, "Total time spent for mutations.") \
+    M(MutateTaskProjectionsCalculationMicroseconds, "Time spent calculating projections") \
     \
     M(MergeTreeDataWriterRows, "Number of rows INSERTed to MergeTree tables.") \
     M(MergeTreeDataWriterUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) INSERTed to MergeTree tables.") \
@@ -225,7 +247,6 @@
     M(MergeTreeDataWriterProjectionsCalculationMicroseconds, "Time spent calculating projections") \
     M(MergeTreeDataProjectionWriterSortingBlocksMicroseconds, "Time spent sorting blocks (for projection it might be a key different from table's sorting key)") \
     M(MergeTreeDataProjectionWriterMergingBlocksMicroseconds, "Time spent merging blocks") \
-    M(MutateTaskProjectionsCalculationMicroseconds, "Time spent calculating projections") \
     \
     M(InsertedWideParts, "Number of parts inserted in Wide format.") \
     M(InsertedCompactParts, "Number of parts inserted in Compact format.") \
diff --git a/src/Processors/Merges/AggregatingSortedTransform.h b/src/Processors/Merges/AggregatingSortedTransform.h
index c6d7e844c65..c96ad3db525 100644
--- a/src/Processors/Merges/AggregatingSortedTransform.h
+++ b/src/Processors/Merges/AggregatingSortedTransform.h
@@ -3,6 +3,11 @@
 #include <Processors/Merges/IMergingTransform.h>
 #include <Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h>
 
+namespace ProfileEvents
+{
+    extern const Event AggregatingSortedMilliseconds;
+}
+
 namespace DB
 {
 
@@ -29,6 +34,11 @@ public:
     }
 
     String getName() const override { return "AggregatingSortedTransform"; }
+
+    void onFinish() override
+    {
+        logMergedStats(ProfileEvents::AggregatingSortedMilliseconds, "Aggregated sorted", getLogger("AggregatingSortedTransform"));
+    }
 };
 
 }
diff --git a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h
index 53c103e7038..908994e1851 100644
--- a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h
+++ b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.h
@@ -30,6 +30,8 @@ public:
     void consume(Input & input, size_t source_num) override;
     Status merge() override;
 
+    MergedStats getMergedStats() const override { return merged_data.getMergedStats(); }
+
     /// Stores information for aggregation of SimpleAggregateFunction columns
     struct SimpleAggregateDescription
     {
diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp
index 86675bcb237..477566d8a94 100644
--- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp
+++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp
@@ -126,6 +126,9 @@ IMergingAlgorithm::Status FinishAggregatingInOrderAlgorithm::merge()
 
 Chunk FinishAggregatingInOrderAlgorithm::prepareToMerge()
 {
+    total_merged_rows += accumulated_rows;
+    total_merged_bytes += accumulated_bytes;
+
     accumulated_rows = 0;
     accumulated_bytes = 0;
 
diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h
index cc6578e79be..39171c5a978 100644
--- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h
+++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h
@@ -50,6 +50,8 @@ public:
     void consume(Input & input, size_t source_num) override;
     Status merge() override;
 
+    MergedStats getMergedStats() const override { return {.bytes = total_merged_bytes, .rows = total_merged_rows, .blocks = chunk_num}; } /// Totals: accumulated_* are zeroed by prepareToMerge().
+
 private:
     Chunk prepareToMerge();
     void addToAggregation();
@@ -92,6 +94,9 @@ private:
     UInt64 chunk_num = 0;
     size_t accumulated_rows = 0;
     size_t accumulated_bytes = 0;
+
+    size_t total_merged_rows = 0;
+    size_t total_merged_bytes = 0;
 };
 
 }
diff --git a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h
index aaa3859efb6..cb2775c968d 100644
--- a/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h
+++ b/src/Processors/Merges/Algorithms/GraphiteRollupSortedAlgorithm.h
@@ -33,6 +33,8 @@ public:
     const char * getName() const override { return "GraphiteRollupSortedAlgorithm"; }
     Status merge() override;
 
+    MergedStats getMergedStats() const override { return merged_data->getMergedStats(); }
+
     struct ColumnsDefinition
     {
         size_t path_column_num;
diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithm.h b/src/Processors/Merges/Algorithms/IMergingAlgorithm.h
index 9a1c7c24270..83f11232b71 100644
--- a/src/Processors/Merges/Algorithms/IMergingAlgorithm.h
+++ b/src/Processors/Merges/Algorithms/IMergingAlgorithm.h
@@ -1,7 +1,7 @@
 #pragma once
 
 #include <Processors/Chunk.h>
-#include <variant>
+#include <Common/ProfileEvents.h>
 
 namespace DB
 {
@@ -65,6 +65,15 @@ public:
 
     IMergingAlgorithm() = default;
     virtual ~IMergingAlgorithm() = default;
+
+    struct MergedStats /// Counters an algorithm reports about the data it has merged.
+    {
+        UInt64 bytes = 0; /// Bytes of merged data (e.g. MergedData reports its total allocated bytes).
+        UInt64 rows = 0; /// Merged rows.
+        UInt64 blocks = 0; /// Merged blocks/chunks.
+    };
+
+    virtual MergedStats getMergedStats() const = 0; /// Pure virtual: each algorithm supplies its own counters.
 };
 
 // TODO: use when compile with clang which could support it
diff --git a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h
index bc1aafe93f7..1725108ac5d 100644
--- a/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h
+++ b/src/Processors/Merges/Algorithms/IMergingAlgorithmWithSharedChunks.h
@@ -16,6 +16,8 @@ public:
     void initialize(Inputs inputs) override;
     void consume(Input & input, size_t source_num) override;
 
+    MergedStats getMergedStats() const override { return merged_data->getMergedStats(); }
+
 private:
     Block header;
     SortDescription description;
diff --git a/src/Processors/Merges/Algorithms/MergedData.h b/src/Processors/Merges/Algorithms/MergedData.h
index c5bb074bb0c..8f47f89d8ee 100644
--- a/src/Processors/Merges/Algorithms/MergedData.h
+++ b/src/Processors/Merges/Algorithms/MergedData.h
@@ -183,6 +183,8 @@ public:
     UInt64 totalAllocatedBytes() const { return total_allocated_bytes; }
     UInt64 maxBlockSize() const { return max_block_size; }
 
+    IMergingAlgorithm::MergedStats getMergedStats() const { return {.bytes = total_allocated_bytes, .rows = total_merged_rows, .blocks = total_chunks}; }
+
     virtual ~MergedData() = default;
 
 protected:
diff --git a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h
index bcb111baadf..c889668a38e 100644
--- a/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h
+++ b/src/Processors/Merges/Algorithms/MergingSortedAlgorithm.h
@@ -31,7 +31,7 @@ public:
     void consume(Input & input, size_t source_num) override;
     Status merge() override;
 
-    const MergedData & getMergedData() const { return merged_data; }
+    MergedStats getMergedStats() const override { return merged_data.getMergedStats(); }
 
 private:
     Block header;
diff --git a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.h
index 664b171c4b9..74b4e397831 100644
--- a/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.h
+++ b/src/Processors/Merges/Algorithms/SummingSortedAlgorithm.h
@@ -30,6 +30,8 @@ public:
     void consume(Input & input, size_t source_num) override;
     Status merge() override;
 
+    MergedStats getMergedStats() const override { return merged_data.getMergedStats(); }
+
     struct AggregateDescription;
     struct MapDescription;
 
diff --git a/src/Processors/Merges/CollapsingSortedTransform.h b/src/Processors/Merges/CollapsingSortedTransform.h
index 4479ac82f66..99fb700abf1 100644
--- a/src/Processors/Merges/CollapsingSortedTransform.h
+++ b/src/Processors/Merges/CollapsingSortedTransform.h
@@ -3,6 +3,11 @@
 #include <Processors/Merges/IMergingTransform.h>
 #include <Processors/Merges/Algorithms/CollapsingSortedAlgorithm.h>
 
+namespace ProfileEvents
+{
+    extern const Event CollapsingSortedMilliseconds;
+}
+
 namespace DB
 {
 
@@ -36,6 +41,11 @@ public:
     }
 
     String getName() const override { return "CollapsingSortedTransform"; }
+
+    void onFinish() override
+    {
+        logMergedStats(ProfileEvents::CollapsingSortedMilliseconds, "Collapsed sorted", getLogger("CollapsingSortedTransform"));
+    }
 };
 
 }
diff --git a/src/Processors/Merges/IMergingTransform.h b/src/Processors/Merges/IMergingTransform.h
index be629271736..fba5b038618 100644
--- a/src/Processors/Merges/IMergingTransform.h
+++ b/src/Processors/Merges/IMergingTransform.h
@@ -2,7 +2,10 @@
 
 #include <Processors/Merges/Algorithms/IMergingAlgorithm.h>
 #include <Processors/IProcessor.h>
+#include <Common/ProfileEvents.h>
 #include <Common/Stopwatch.h>
+#include <Common/logger_useful.h>
+#include <Common/formatReadable.h>
 
 namespace DB
 {
@@ -110,6 +113,8 @@ public:
 
     void work() override
     {
+        Stopwatch watch;
+
         if (!state.init_chunks.empty())
             algorithm.initialize(std::move(state.init_chunks));
 
@@ -147,6 +152,8 @@ public:
             // std::cerr << "Finished" << std::endl;
             state.is_finished = true;
         }
+
+        merging_elapsed_ns += watch.elapsedNanoseconds();
     }
 
 protected:
@@ -156,7 +163,33 @@ protected:
     Algorithm algorithm;
 
     /// Profile info.
-    Stopwatch total_stopwatch {CLOCK_MONOTONIC_COARSE};
+    UInt64 merging_elapsed_ns = 0;
+
+    void logMergedStats(ProfileEvents::Event elapsed_ms_event, std::string_view transform_message, LoggerPtr log) const
+    { /// Adds the time accumulated in work() to `elapsed_ms_event` and logs merge throughput to `log`.
+        auto stats = algorithm.getMergedStats();
+
+        UInt64 elapsed_ms = merging_elapsed_ns / 1000000LL; /// ns -> ms
+        ProfileEvents::increment(elapsed_ms_event, elapsed_ms); /// Incremented for every merge, including ones too small to be logged below.
+
+        /// Don't print info for small parts (< 1M rows)
+        if (stats.rows < 1000000)
+            return;
+
+        double seconds = static_cast<double>(merging_elapsed_ns) / 1000000000ULL; /// ns -> sec
+
+        if (seconds == 0.0) /// The rates in the other branch divide by `seconds`.
+        {
+            LOG_DEBUG(log, "{}: {} blocks, {} rows, {} bytes in 0 sec.",
+                transform_message, stats.blocks, stats.rows, stats.bytes);
+        }
+        else
+        {
+            LOG_DEBUG(log, "{}: {} blocks, {} rows, {} bytes in {} sec., {} rows/sec., {}/sec.",
+                transform_message, stats.blocks, stats.rows, stats.bytes,
+                seconds, stats.rows / seconds, ReadableSize(stats.bytes / seconds));
+        }
+    }
 
 private:
     using IMergingTransformBase::state;
diff --git a/src/Processors/Merges/MergingSortedTransform.cpp b/src/Processors/Merges/MergingSortedTransform.cpp
index 338b1ff7935..d2895a2a2e9 100644
--- a/src/Processors/Merges/MergingSortedTransform.cpp
+++ b/src/Processors/Merges/MergingSortedTransform.cpp
@@ -1,9 +1,12 @@
 #include <Processors/Merges/MergingSortedTransform.h>
 #include <Processors/Transforms/ColumnGathererTransform.h>
 #include <IO/WriteBuffer.h>
-
 #include <Common/logger_useful.h>
-#include <Common/formatReadable.h>
+
+namespace ProfileEvents
+{
+    extern const Event MergingSortedMilliseconds;
+}
 
 namespace DB
 {
@@ -18,7 +21,6 @@ MergingSortedTransform::MergingSortedTransform(
     UInt64 limit_,
     bool always_read_till_end_,
     WriteBuffer * out_row_sources_buf_,
-    bool quiet_,
     bool use_average_block_sizes,
     bool have_all_inputs_)
     : IMergingTransform(
@@ -37,7 +39,6 @@ MergingSortedTransform::MergingSortedTransform(
         limit_,
         out_row_sources_buf_,
         use_average_block_sizes)
-    , quiet(quiet_)
 {
 }
 
@@ -48,22 +49,7 @@ void MergingSortedTransform::onNewInput()
 
 void MergingSortedTransform::onFinish()
 {
-    if (quiet)
-        return;
-
-    const auto & merged_data = algorithm.getMergedData();
-
-    auto log = getLogger("MergingSortedTransform");
-
-    double seconds = total_stopwatch.elapsedSeconds();
-
-    if (seconds == 0.0)
-        LOG_DEBUG(log, "Merge sorted {} blocks, {} rows in 0 sec.", merged_data.totalChunks(), merged_data.totalMergedRows());
-    else
-        LOG_DEBUG(log, "Merge sorted {} blocks, {} rows in {} sec., {} rows/sec., {}/sec",
-            merged_data.totalChunks(), merged_data.totalMergedRows(), seconds,
-            merged_data.totalMergedRows() / seconds,
-            ReadableSize(merged_data.totalAllocatedBytes() / seconds));
+    logMergedStats(ProfileEvents::MergingSortedMilliseconds, "Merged sorted", getLogger("MergingSortedTransform"));
 }
 
 }
diff --git a/src/Processors/Merges/MergingSortedTransform.h b/src/Processors/Merges/MergingSortedTransform.h
index 2b53939f309..6e52450efa7 100644
--- a/src/Processors/Merges/MergingSortedTransform.h
+++ b/src/Processors/Merges/MergingSortedTransform.h
@@ -21,7 +21,6 @@ public:
         UInt64 limit_ = 0,
         bool always_read_till_end_ = false,
         WriteBuffer * out_row_sources_buf_ = nullptr,
-        bool quiet_ = false,
         bool use_average_block_sizes = false,
         bool have_all_inputs_ = true);
 
@@ -30,9 +29,6 @@ public:
 protected:
     void onNewInput() override;
     void onFinish() override;
-
-private:
-    bool quiet = false;
 };
 
 }
diff --git a/src/Processors/Merges/ReplacingSortedTransform.h b/src/Processors/Merges/ReplacingSortedTransform.h
index 2657987f161..dc262aab9ee 100644
--- a/src/Processors/Merges/ReplacingSortedTransform.h
+++ b/src/Processors/Merges/ReplacingSortedTransform.h
@@ -3,6 +3,10 @@
 #include <Processors/Merges/IMergingTransform.h>
 #include <Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h>
 
+namespace ProfileEvents
+{
+    extern const Event ReplacingSortedMilliseconds;
+}
 
 namespace DB
 {
@@ -38,6 +42,11 @@ public:
     }
 
     String getName() const override { return "ReplacingSorted"; }
+
+    void onFinish() override
+    {
+        logMergedStats(ProfileEvents::ReplacingSortedMilliseconds, "Replaced sorted", getLogger("ReplacingSortedTransform"));
+    }
 };
 
 }
diff --git a/src/Processors/Merges/SummingSortedTransform.h b/src/Processors/Merges/SummingSortedTransform.h
index 70ddebfea95..d7c20223d7e 100644
--- a/src/Processors/Merges/SummingSortedTransform.h
+++ b/src/Processors/Merges/SummingSortedTransform.h
@@ -3,6 +3,11 @@
 #include <Processors/Merges/IMergingTransform.h>
 #include <Processors/Merges/Algorithms/SummingSortedAlgorithm.h>
 
+namespace ProfileEvents
+{
+    extern const Event SummingSortedMilliseconds;
+}
+
 namespace DB
 {
 
@@ -33,6 +38,11 @@ public:
     }
 
     String getName() const override { return "SummingSortedTransform"; }
+
+    void onFinish() override
+    {
+        logMergedStats(ProfileEvents::SummingSortedMilliseconds, "Summed sorted", getLogger("SummingSortedTransform"));
+    }
 };
 
 }
diff --git a/src/Processors/Merges/VersionedCollapsingTransform.h b/src/Processors/Merges/VersionedCollapsingTransform.h
index 18244469bd7..32b5d7bf343 100644
--- a/src/Processors/Merges/VersionedCollapsingTransform.h
+++ b/src/Processors/Merges/VersionedCollapsingTransform.h
@@ -3,6 +3,10 @@
 #include <Processors/Merges/IMergingTransform.h>
 #include <Processors/Merges/Algorithms/VersionedCollapsingAlgorithm.h>
 
+namespace ProfileEvents
+{
+    extern const Event VersionedCollapsingSortedMilliseconds;
+}
 
 namespace DB
 {
@@ -33,6 +37,11 @@ public:
     }
 
     String getName() const override { return "VersionedCollapsingTransform"; }
+
+    void onFinish() override
+    {
+        logMergedStats(ProfileEvents::VersionedCollapsingSortedMilliseconds, "Versioned collapsed sorted", getLogger("VersionedCollapsingTransform"));
+    }
 };
 
 }
diff --git a/src/Processors/Transforms/ColumnGathererTransform.cpp b/src/Processors/Transforms/ColumnGathererTransform.cpp
index 15f8355bdc7..52fa42fdb51 100644
--- a/src/Processors/Transforms/ColumnGathererTransform.cpp
+++ b/src/Processors/Transforms/ColumnGathererTransform.cpp
@@ -1,11 +1,15 @@
 #include <Processors/Transforms/ColumnGathererTransform.h>
+#include <Common/ProfileEvents.h>
 #include <Common/logger_useful.h>
 #include <Common/typeid_cast.h>
 #include <Common/formatReadable.h>
 #include <Columns/ColumnSparse.h>
 #include <IO/WriteHelpers.h>
-#include <iomanip>
 
+namespace ProfileEvents
+{
+    extern const Event GatheringColumnMilliseconds;
+}
 
 namespace DB
 {
@@ -33,6 +37,13 @@ ColumnGathererStream::ColumnGathererStream(
         throw Exception(ErrorCodes::EMPTY_DATA_PASSED, "There are no streams to gather");
 }
 
+void ColumnGathererStream::updateStats(const IColumn & column) /// Accounts one column that merge() hands to its result chunk.
+{
+    merged_rows += column.size();
+    merged_bytes += column.allocatedBytes();
+    ++merged_blocks; /// One call per returned chunk at the visible call sites in merge().
+}
+
 void ColumnGathererStream::initialize(Inputs inputs)
 {
     Columns source_columns;
@@ -82,7 +93,9 @@ IMergingAlgorithm::Status ColumnGathererStream::merge()
         {
             res.addColumn(source_to_fully_copy->column);
         }
-        merged_rows += source_to_fully_copy->size;
+
+        updateStats(*source_to_fully_copy->column);
+
         source_to_fully_copy->pos = source_to_fully_copy->size;
         source_to_fully_copy = nullptr;
         return Status(std::move(res));
@@ -96,8 +109,7 @@ IMergingAlgorithm::Status ColumnGathererStream::merge()
         {
             next_required_source = 0;
             Chunk res;
-            merged_rows += sources.front().column->size();
-            merged_bytes += sources.front().column->allocatedBytes();
+            updateStats(*sources.front().column);
             res.addColumn(std::move(sources.front().column));
             sources.front().pos = sources.front().size = 0;
             return Status(std::move(res));
@@ -123,8 +135,8 @@ IMergingAlgorithm::Status ColumnGathererStream::merge()
     if (source_to_fully_copy && result_column->empty())
     {
         Chunk res;
-        merged_rows += source_to_fully_copy->column->size();
-        merged_bytes += source_to_fully_copy->column->allocatedBytes();
+        updateStats(*source_to_fully_copy->column);
+
         if (result_column->hasDynamicStructure())
         {
             auto col = result_column->cloneEmpty();
@@ -140,13 +152,13 @@ IMergingAlgorithm::Status ColumnGathererStream::merge()
         return Status(std::move(res));
     }
 
-    auto col = result_column->cloneEmpty();
-    result_column.swap(col);
+    auto return_column = result_column->cloneEmpty();
+    result_column.swap(return_column);
 
     Chunk res;
-    merged_rows += col->size();
-    merged_bytes += col->allocatedBytes();
-    res.addColumn(std::move(col));
+    updateStats(*return_column);
+
+    res.addColumn(std::move(return_column));
     return Status(std::move(res), row_sources_buf.eof() && !source_to_fully_copy);
 }
 
@@ -185,31 +197,10 @@ ColumnGathererTransform::ColumnGathererTransform(
             toString(header.columns()));
 }
 
-void ColumnGathererTransform::work()
-{
-    Stopwatch stopwatch;
-    IMergingTransform<ColumnGathererStream>::work();
-    elapsed_ns += stopwatch.elapsedNanoseconds();
-}
-
 void ColumnGathererTransform::onFinish()
 {
-    auto merged_rows = algorithm.getMergedRows();
-    auto merged_bytes = algorithm.getMergedRows();
-    /// Don't print info for small parts (< 10M rows)
-    if (merged_rows < 10000000)
-        return;
-
-    double seconds = static_cast<double>(elapsed_ns) / 1000000000ULL;
     const auto & column_name = getOutputPort().getHeader().getByPosition(0).name;
-
-    if (seconds == 0.0)
-        LOG_DEBUG(log, "Gathered column {} ({} bytes/elem.) in 0 sec.",
-            column_name, static_cast<double>(merged_bytes) / merged_rows);
-    else
-        LOG_DEBUG(log, "Gathered column {} ({} bytes/elem.) in {} sec., {} rows/sec., {}/sec.",
-            column_name, static_cast<double>(merged_bytes) / merged_rows, seconds,
-            merged_rows / seconds, ReadableSize(merged_bytes / seconds));
+    logMergedStats(ProfileEvents::GatheringColumnMilliseconds, fmt::format("Gathered column {}", column_name), log);
 }
 
 }
diff --git a/src/Processors/Transforms/ColumnGathererTransform.h b/src/Processors/Transforms/ColumnGathererTransform.h
index ec5691316ce..a535b2669d0 100644
--- a/src/Processors/Transforms/ColumnGathererTransform.h
+++ b/src/Processors/Transforms/ColumnGathererTransform.h
@@ -2,6 +2,7 @@
 
 #include <IO/ReadBuffer.h>
 #include <Common/PODArray.h>
+#include "base/types.h"
 #include <Processors/Merges/Algorithms/IMergingAlgorithm.h>
 #include <Processors/Merges/IMergingTransform.h>
 
@@ -72,10 +73,11 @@ public:
     template <typename Column>
     void gather(Column & column_res);
 
-    UInt64 getMergedRows() const { return merged_rows; }
-    UInt64 getMergedBytes() const { return merged_bytes; }
+    MergedStats getMergedStats() const override { return {.bytes = merged_bytes, .rows = merged_rows, .blocks = merged_blocks}; }
 
 private:
+    void updateStats(const IColumn & column);
+
     /// Cache required fields
     struct Source
     {
@@ -105,6 +107,7 @@ private:
     ssize_t next_required_source = -1;
     UInt64 merged_rows = 0;
     UInt64 merged_bytes = 0;
+    UInt64 merged_blocks = 0;
 };
 
 class ColumnGathererTransform final : public IMergingTransform<ColumnGathererStream>
@@ -120,12 +123,8 @@ public:
 
     String getName() const override { return "ColumnGathererTransform"; }
 
-    void work() override;
-
 protected:
     void onFinish() override;
-    UInt64 elapsed_ns = 0;
-
     LoggerPtr log;
 };
 
diff --git a/src/Processors/Transforms/MergeJoinTransform.cpp b/src/Processors/Transforms/MergeJoinTransform.cpp
index e96a75d277b..26601207da8 100644
--- a/src/Processors/Transforms/MergeJoinTransform.cpp
+++ b/src/Processors/Transforms/MergeJoinTransform.cpp
@@ -511,6 +511,16 @@ void MergeJoinAlgorithm::logElapsed(double seconds)
         stat.max_blocks_loaded);
 }
 
+IMergingAlgorithm::MergedStats MergeJoinAlgorithm::getMergedStats() const
+{
+    return
+    {
+        .bytes = 0,
+        .rows = stat.num_rows[0] + stat.num_rows[1],
+        .blocks = stat.num_blocks[0] + stat.num_blocks[1],
+    };
+}
+
 static void prepareChunk(Chunk & chunk)
 {
     if (!chunk)
@@ -1271,7 +1281,7 @@ MergeJoinTransform::MergeJoinTransform(
 
 void MergeJoinTransform::onFinish()
 {
-    algorithm.logElapsed(total_stopwatch.elapsedSeconds());
+    algorithm.logElapsed(static_cast<double>(merging_elapsed_ns) / 1000000000ULL); /// ns -> fractional seconds
 }
 
 }
diff --git a/src/Processors/Transforms/MergeJoinTransform.h b/src/Processors/Transforms/MergeJoinTransform.h
index d37a0b9f3ae..841a3f15a92 100644
--- a/src/Processors/Transforms/MergeJoinTransform.h
+++ b/src/Processors/Transforms/MergeJoinTransform.h
@@ -245,6 +245,8 @@ public:
     void setAsofInequality(ASOFJoinInequality asof_inequality_);
 
     void logElapsed(double seconds);
+    MergedStats getMergedStats() const override;
+
 private:
     std::optional<Status> handleAnyJoinState();
     Status anyJoin();
diff --git a/src/Processors/Transforms/MergeSortingTransform.cpp b/src/Processors/Transforms/MergeSortingTransform.cpp
index ede13b29219..c45192e7118 100644
--- a/src/Processors/Transforms/MergeSortingTransform.cpp
+++ b/src/Processors/Transforms/MergeSortingTransform.cpp
@@ -185,7 +185,6 @@ void MergeSortingTransform::consume(Chunk chunk)
 
         if (!external_merging_sorted)
         {
-            bool quiet = false;
             bool have_all_inputs = false;
             bool use_average_block_sizes = false;
 
@@ -199,7 +198,6 @@ void MergeSortingTransform::consume(Chunk chunk)
                     limit,
                     /*always_read_till_end_=*/ false,
                     nullptr,
-                    quiet,
                     use_average_block_sizes,
                     have_all_inputs);
 
diff --git a/src/Processors/Transforms/PasteJoinTransform.cpp b/src/Processors/Transforms/PasteJoinTransform.cpp
index d2fa7eed256..ad01b721726 100644
--- a/src/Processors/Transforms/PasteJoinTransform.cpp
+++ b/src/Processors/Transforms/PasteJoinTransform.cpp
@@ -58,6 +58,16 @@ static void prepareChunk(Chunk & chunk)
     chunk.setColumns(std::move(columns), num_rows);
 }
 
+IMergingAlgorithm::MergedStats PasteJoinAlgorithm::getMergedStats() const
+{
+    return
+    {
+        .bytes = 0,
+        .rows = stat.num_rows[0] + stat.num_rows[1],
+        .blocks = stat.num_blocks[0] + stat.num_blocks[1],
+    };
+}
+
 void PasteJoinAlgorithm::initialize(Inputs inputs)
 {
     if (inputs.size() != 2)
diff --git a/src/Processors/Transforms/PasteJoinTransform.h b/src/Processors/Transforms/PasteJoinTransform.h
index 6a7e65ee27c..fbe85f6993b 100644
--- a/src/Processors/Transforms/PasteJoinTransform.h
+++ b/src/Processors/Transforms/PasteJoinTransform.h
@@ -35,8 +35,7 @@ public:
     void initialize(Inputs inputs) override;
     void consume(Input & input, size_t source_num) override;
     Status merge() override;
-
-    void logElapsed(double seconds);
+    MergedStats getMergedStats() const override;
 
 private:
     Chunk createBlockWithDefaults(size_t source_num);
diff --git a/src/QueryPipeline/tests/gtest_blocks_size_merging_streams.cpp b/src/QueryPipeline/tests/gtest_blocks_size_merging_streams.cpp
index bc22f249f97..f41a447049c 100644
--- a/src/QueryPipeline/tests/gtest_blocks_size_merging_streams.cpp
+++ b/src/QueryPipeline/tests/gtest_blocks_size_merging_streams.cpp
@@ -83,7 +83,7 @@ TEST(MergingSortedTest, SimpleBlockSizeTest)
     EXPECT_EQ(pipe.numOutputPorts(), 3);
 
     auto transform = std::make_shared<MergingSortedTransform>(pipe.getHeader(), pipe.numOutputPorts(), sort_description,
-        8192, /*max_block_size_bytes=*/0, SortingQueueStrategy::Batch, 0, false, nullptr, false, true);
+        8192, /*max_block_size_bytes=*/0, SortingQueueStrategy::Batch, 0, false, nullptr, true);
 
     pipe.addTransform(std::move(transform));
 
@@ -125,7 +125,7 @@ TEST(MergingSortedTest, MoreInterestingBlockSizes)
     EXPECT_EQ(pipe.numOutputPorts(), 3);
 
     auto transform = std::make_shared<MergingSortedTransform>(pipe.getHeader(), pipe.numOutputPorts(), sort_description,
-        8192, /*max_block_size_bytes=*/0, SortingQueueStrategy::Batch, 0, false, nullptr, false, true);
+        8192, /*max_block_size_bytes=*/0, SortingQueueStrategy::Batch, 0, false, nullptr, true);
 
     pipe.addTransform(std::move(transform));
 
diff --git a/src/Storages/MergeTree/MergeList.h b/src/Storages/MergeTree/MergeList.h
index d40af6abf43..3a96ba0abae 100644
--- a/src/Storages/MergeTree/MergeList.h
+++ b/src/Storages/MergeTree/MergeList.h
@@ -6,6 +6,7 @@
 #include <Common/CurrentMetrics.h>
 #include <Common/MemoryTracker.h>
 #include <Common/ThreadStatus.h>
+#include "base/types.h"
 #include <Storages/MergeTree/MergeType.h>
 #include <Storages/MergeTree/MergeAlgorithm.h>
 #include <Storages/MergeTree/MergeTreePartInfo.h>
diff --git a/src/Storages/MergeTree/MergeProgress.h b/src/Storages/MergeTree/MergeProgress.h
index dd4922051b5..8562e81e761 100644
--- a/src/Storages/MergeTree/MergeProgress.h
+++ b/src/Storages/MergeTree/MergeProgress.h
@@ -8,10 +8,10 @@
 
 namespace ProfileEvents
 {
-    extern const Event MergesTimeMilliseconds;
     extern const Event MergedUncompressedBytes;
     extern const Event MergedRows;
-    extern const Event Merge;
+    extern const Event MutatedRows;
+    extern const Event MutatedUncompressedBytes;
 }
 
 namespace DB
@@ -63,18 +63,17 @@ public:
     void updateWatch()
     {
         UInt64 watch_curr_elapsed = merge_list_element_ptr->watch.elapsed();
-        ProfileEvents::increment(ProfileEvents::MergesTimeMilliseconds, (watch_curr_elapsed - watch_prev_elapsed) / 1000000);
         watch_prev_elapsed = watch_curr_elapsed;
     }
 
-    void operator() (const Progress & value)
+    void operator()(const Progress & value)
     {
-        ProfileEvents::increment(ProfileEvents::MergedUncompressedBytes, value.read_bytes);
-        if (stage.is_first)
-        {
-            ProfileEvents::increment(ProfileEvents::MergedRows, value.read_rows);
-            ProfileEvents::increment(ProfileEvents::Merge);
-        }
+        if (merge_list_element_ptr->is_mutation)
+            updateProfileEvents(value, ProfileEvents::MutatedRows, ProfileEvents::MutatedUncompressedBytes);
+        else
+            updateProfileEvents(value, ProfileEvents::MergedRows, ProfileEvents::MergedUncompressedBytes);
+
+
         updateWatch();
 
         merge_list_element_ptr->bytes_read_uncompressed += value.read_bytes;
@@ -90,6 +89,14 @@ public:
                 std::memory_order_relaxed);
         }
     }
+
+private:
+    void updateProfileEvents(const Progress & value, ProfileEvents::Event rows_event, ProfileEvents::Event bytes_event) const
+    {
+        ProfileEvents::increment(bytes_event, value.read_bytes);
+        if (stage.is_first)
+            ProfileEvents::increment(rows_event, value.read_rows);
+    }
 };
 
 }
diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp
index ce06adf110c..5f178f08ec3 100644
--- a/src/Storages/MergeTree/MergeTask.cpp
+++ b/src/Storages/MergeTree/MergeTask.cpp
@@ -5,9 +5,13 @@
 #include <memory>
 #include <fmt/format.h>
 
+#include "Common/ElapsedTimeProfileEventIncrement.h"
+#include "Common/Logger.h"
+#include "Common/Stopwatch.h"
 #include <Common/logger_useful.h>
 #include <Common/ActionBlocker.h>
 #include <Core/Settings.h>
+#include <Common/ProfileEvents.h>
 #include <Processors/Transforms/CheckSortedTransform.h>
 #include <Storages/MergeTree/DataPartStorageOnDiskFull.h>
 #include <Compression/CompressedWriteBuffer.h>
@@ -39,6 +43,16 @@
 #include <Interpreters/MergeTreeTransaction.h>
 #include <QueryPipeline/QueryPipelineBuilder.h>
 
+namespace ProfileEvents
+{
+    extern const Event Merge;
+    extern const Event MergeTotalMilliseconds;
+    extern const Event MergeExecuteMilliseconds;
+    extern const Event MergeHorizontalStageExecuteMilliseconds;
+    extern const Event MergeVerticalStageExecuteMilliseconds;
+    extern const Event MergeProjectionStageExecuteMilliseconds;
+}
+
 namespace DB
 {
 
@@ -186,6 +200,8 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
     if (isTTLMergeType(global_ctx->future_part->merge_type) && global_ctx->ttl_merges_blocker->isCancelled())
         throw Exception(ErrorCodes::ABORTED, "Cancelled merging parts with TTL");
 
+    ProfileEvents::increment(ProfileEvents::Merge);
+
     LOG_DEBUG(ctx->log, "Merging {} parts: from {} to {} into {} with storage {}",
         global_ctx->future_part->parts.size(),
         global_ctx->future_part->parts.front()->name,
@@ -446,6 +462,9 @@ void MergeTask::addGatheringColumn(GlobalRuntimeContextPtr global_ctx, const Str
 
 MergeTask::StageRuntimeContextPtr MergeTask::ExecuteAndFinalizeHorizontalPart::getContextForNextStage()
 {
+    ProfileEvents::increment(ProfileEvents::MergeExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+    ProfileEvents::increment(ProfileEvents::MergeHorizontalStageExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+
     auto new_ctx = std::make_shared<VerticalMergeRuntimeContext>();
 
     new_ctx->rows_sources_write_buf = std::move(ctx->rows_sources_write_buf);
@@ -463,8 +482,10 @@ MergeTask::StageRuntimeContextPtr MergeTask::ExecuteAndFinalizeHorizontalPart::g
 
 MergeTask::StageRuntimeContextPtr MergeTask::VerticalMergeStage::getContextForNextStage()
 {
-    auto new_ctx = std::make_shared<MergeProjectionsRuntimeContext>();
+    ProfileEvents::increment(ProfileEvents::MergeExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+    ProfileEvents::increment(ProfileEvents::MergeVerticalStageExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
 
+    auto new_ctx = std::make_shared<MergeProjectionsRuntimeContext>();
     new_ctx->need_sync = std::move(ctx->need_sync);
 
     ctx.reset();
@@ -474,9 +495,14 @@ MergeTask::StageRuntimeContextPtr MergeTask::VerticalMergeStage::getContextForNe
 
 bool MergeTask::ExecuteAndFinalizeHorizontalPart::execute()
 {
-    assert(subtasks_iterator != subtasks.end());
-    if ((this->**subtasks_iterator)())
-        return true;
+    chassert(subtasks_iterator != subtasks.end());
+
+    Stopwatch watch;
+    bool res = (this->**subtasks_iterator)();
+    ctx->elapsed_execute_ns += watch.elapsedNanoseconds();
+
+    if (res)
+        return res;
 
     /// Move to the next subtask in an array of subtasks
     ++subtasks_iterator;
@@ -534,7 +560,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::executeImpl()
 
 bool MergeTask::VerticalMergeStage::prepareVerticalMergeForAllColumns() const
 {
-     /// No need to execute this part if it is horizontal merge.
+    /// No need to execute this part if it is horizontal merge.
     if (global_ctx->chosen_merge_algorithm != MergeAlgorithm::Vertical)
         return false;
 
@@ -906,12 +932,24 @@ bool MergeTask::MergeProjectionsStage::finalizeProjectionsAndWholeMerge() const
     return false;
 }
 
+MergeTask::StageRuntimeContextPtr MergeTask::MergeProjectionsStage::getContextForNextStage()
+{
+    ProfileEvents::increment(ProfileEvents::MergeExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+    ProfileEvents::increment(ProfileEvents::MergeProjectionStageExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+
+    return nullptr;
+}
 
 bool MergeTask::VerticalMergeStage::execute()
 {
-    assert(subtasks_iterator != subtasks.end());
-    if ((this->**subtasks_iterator)())
-        return true;
+    chassert(subtasks_iterator != subtasks.end());
+
+    Stopwatch watch;
+    bool res = (this->**subtasks_iterator)();
+    ctx->elapsed_execute_ns += watch.elapsedNanoseconds();
+
+    if (res)
+        return res;
 
     /// Move to the next subtask in an array of subtasks
     ++subtasks_iterator;
@@ -920,9 +958,14 @@ bool MergeTask::VerticalMergeStage::execute()
 
 bool MergeTask::MergeProjectionsStage::execute()
 {
-    assert(subtasks_iterator != subtasks.end());
-    if ((this->**subtasks_iterator)())
-        return true;
+    chassert(subtasks_iterator != subtasks.end());
+
+    Stopwatch watch;
+    bool res = (this->**subtasks_iterator)();
+    ctx->elapsed_execute_ns += watch.elapsedNanoseconds();
+
+    if (res)
+        return res;
 
     /// Move to the next subtask in an array of subtasks
     ++subtasks_iterator;
@@ -969,12 +1012,22 @@ bool MergeTask::VerticalMergeStage::executeVerticalMergeForAllColumns() const
 
 bool MergeTask::execute()
 {
-    assert(stages_iterator != stages.end());
-    if ((*stages_iterator)->execute())
+    chassert(stages_iterator != stages.end());
+    const auto & current_stage = *stages_iterator;
+
+    if (current_stage->execute())
         return true;
 
-    /// Stage is finished, need initialize context for the next stage
-    auto next_stage_context = (*stages_iterator)->getContextForNextStage();
+    /// Stage is finished, need to initialize context for the next stage and update profile events.
+
+    UInt64 current_elapsed_ms = global_ctx->merge_list_element_ptr->watch.elapsedMilliseconds();
+    UInt64 stage_elapsed_ms = current_elapsed_ms - global_ctx->prev_elapesed_ms;
+    global_ctx->prev_elapesed_ms = current_elapsed_ms;
+
+    ProfileEvents::increment(current_stage->getTotalTimeProfileEvent(), stage_elapsed_ms);
+    ProfileEvents::increment(ProfileEvents::MergeTotalMilliseconds, stage_elapsed_ms);
+
+    auto next_stage_context = current_stage->getContextForNextStage();
 
     /// Move to the next stage in an array of stages
     ++stages_iterator;
@@ -1099,7 +1152,6 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream()
                 /* limit_= */0,
                 /* always_read_till_end_= */false,
                 ctx->rows_sources_write_buf.get(),
-                true,
                 ctx->blocks_are_granules_size);
             break;
 
diff --git a/src/Storages/MergeTree/MergeTask.h b/src/Storages/MergeTree/MergeTask.h
index 8b0f2130e8e..979c85482e5 100644
--- a/src/Storages/MergeTree/MergeTask.h
+++ b/src/Storages/MergeTree/MergeTask.h
@@ -3,6 +3,7 @@
 #include <list>
 #include <memory>
 
+#include <Common/ProfileEvents.h>
 #include <Common/filesystemHelpers.h>
 
 #include <Compression/CompressedReadBuffer.h>
@@ -26,6 +27,12 @@
 #include <Storages/MergeTree/MergeTreeData.h>
 #include <Storages/MergeTree/MergeTreeIndices.h>
 
+namespace ProfileEvents
+{
+    extern const Event MergeHorizontalStageTotalMilliseconds;
+    extern const Event MergeVerticalStageTotalMilliseconds;
+    extern const Event MergeProjectionStageTotalMilliseconds;
+}
 
 namespace DB
 {
@@ -134,6 +141,7 @@ private:
     {
         virtual void setRuntimeContext(StageRuntimeContextPtr local, StageRuntimeContextPtr global) = 0;
         virtual StageRuntimeContextPtr getContextForNextStage() = 0;
+        virtual ProfileEvents::Event getTotalTimeProfileEvent() const = 0;
         virtual bool execute() = 0;
         virtual ~IStage() = default;
     };
@@ -195,6 +203,7 @@ private:
         bool need_prefix;
 
         scope_guard temporary_directory_lock;
+        UInt64 prev_elapesed_ms{0};
     };
 
     using GlobalRuntimeContextPtr = std::shared_ptr<GlobalRuntimeContext>;
@@ -233,6 +242,7 @@ private:
         /// Dependencies for next stages
         std::list<DB::NameAndTypePair>::const_iterator it_name_and_type;
         bool need_sync{false};
+        UInt64 elapsed_execute_ns{0};
     };
 
     using ExecuteAndFinalizeHorizontalPartRuntimeContextPtr = std::shared_ptr<ExecuteAndFinalizeHorizontalPartRuntimeContext>;
@@ -256,7 +266,6 @@ private:
 
         ExecuteAndFinalizeHorizontalPartSubtasks::const_iterator subtasks_iterator = subtasks.begin();
 
-
         MergeAlgorithm chooseMergeAlgorithm() const;
         void createMergedStream();
         void extractMergingAndGatheringColumns() const;
@@ -268,6 +277,7 @@ private:
         }
 
         StageRuntimeContextPtr getContextForNextStage() override;
+        ProfileEvents::Event getTotalTimeProfileEvent() const override { return ProfileEvents::MergeHorizontalStageTotalMilliseconds; }
 
         ExecuteAndFinalizeHorizontalPartRuntimeContextPtr ctx;
         GlobalRuntimeContextPtr global_ctx;
@@ -307,6 +317,7 @@ private:
         QueryPipeline column_parts_pipeline;
         std::unique_ptr<PullingPipelineExecutor> executor;
         std::unique_ptr<CompressedReadBufferFromFile> rows_sources_read_buf{nullptr};
+        UInt64 elapsed_execute_ns{0};
     };
 
     using VerticalMergeRuntimeContextPtr = std::shared_ptr<VerticalMergeRuntimeContext>;
@@ -321,6 +332,7 @@ private:
             global_ctx = static_pointer_cast<GlobalRuntimeContext>(global);
         }
         StageRuntimeContextPtr getContextForNextStage() override;
+        ProfileEvents::Event getTotalTimeProfileEvent() const override { return ProfileEvents::MergeVerticalStageTotalMilliseconds; }
 
         bool prepareVerticalMergeForAllColumns() const;
         bool executeVerticalMergeForAllColumns() const;
@@ -361,6 +373,7 @@ private:
         MergeTasks::iterator projections_iterator;
 
         LoggerPtr log{getLogger("MergeTask::MergeProjectionsStage")};
+        UInt64 elapsed_execute_ns{0};
     };
 
     using MergeProjectionsRuntimeContextPtr = std::shared_ptr<MergeProjectionsRuntimeContext>;
@@ -368,12 +381,15 @@ private:
     struct MergeProjectionsStage : public IStage
     {
         bool execute() override;
+
         void setRuntimeContext(StageRuntimeContextPtr local, StageRuntimeContextPtr global) override
         {
             ctx = static_pointer_cast<MergeProjectionsRuntimeContext>(local);
             global_ctx = static_pointer_cast<GlobalRuntimeContext>(global);
         }
-        StageRuntimeContextPtr getContextForNextStage() override { return nullptr; }
+
+        StageRuntimeContextPtr getContextForNextStage() override;
+        ProfileEvents::Event getTotalTimeProfileEvent() const override { return ProfileEvents::MergeProjectionStageTotalMilliseconds; }
 
         bool mergeMinMaxIndexAndPrepareProjections() const;
         bool executeProjections() const;
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index 9a775db73e2..fe78964a241 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -38,7 +38,10 @@
 
 namespace ProfileEvents
 {
-extern const Event MutateTaskProjectionsCalculationMicroseconds;
+    extern const Event MutationTotalParts;
+    extern const Event MutationUntouchedParts;
+    extern const Event MutationTimeMilliseconds;
+    extern const Event MutateTaskProjectionsCalculationMicroseconds;
 }
 
 namespace CurrentMetrics
@@ -2034,6 +2037,9 @@ bool MutateTask::execute()
             if (task->executeStep())
                 return true;
 
+            auto total_elapsed_ms = (*ctx->mutate_entry)->watch.elapsedMilliseconds();
+            ProfileEvents::increment(ProfileEvents::MutationTimeMilliseconds, total_elapsed_ms);
+
             // The `new_data_part` is a shared pointer and must be moved to allow
             // part deletion in case it is needed in `MutateFromLogEntryTask::finalize`.
             //
@@ -2118,6 +2124,7 @@ bool MutateTask::prepare()
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to mutate {} parts, not one. "
             "This is a bug.", ctx->future_part->parts.size());
 
+    ProfileEvents::increment(ProfileEvents::MutationTotalParts);
     ctx->num_mutations = std::make_unique<CurrentMetrics::Increment>(CurrentMetrics::PartMutation);
 
     auto context_for_reading = Context::createCopy(ctx->context);
@@ -2174,6 +2181,7 @@ bool MutateTask::prepare()
             ctx->temporary_directory_lock = std::move(lock);
         }
 
+        ProfileEvents::increment(ProfileEvents::MutationUntouchedParts);
         promise.set_value(std::move(part));
         return false;
     }

From 05febdfb2bdfa78f2d017758ce2261fb554e9546 Mon Sep 17 00:00:00 2001
From: Anton Popov <anton@clickhouse.com>
Date: Thu, 8 Aug 2024 13:47:44 +0000
Subject: [PATCH 07/88] add more events and add tests

---
 src/Common/ProfileEvents.cpp                  |  9 +-
 .../Transforms/ColumnGathererTransform.h      |  1 -
 .../Transforms/MergeJoinTransform.cpp         |  3 +-
 .../Transforms/MergeJoinTransform.h           |  1 +
 .../Transforms/PasteJoinTransform.cpp         |  2 +-
 .../Transforms/PasteJoinTransform.h           |  1 +
 src/Storages/MergeTree/MergeList.h            |  1 -
 src/Storages/MergeTree/MergeTask.cpp          | 16 ++--
 src/Storages/MergeTree/MergeTask.h            |  2 +-
 .../MergeTree/MutateFromLogEntryTask.cpp      |  2 +
 .../MergeTree/MutatePlainMergeTreeTask.cpp    |  2 +
 src/Storages/MergeTree/MutateTask.cpp         | 25 ++++--
 src/Storages/MergeTree/MutateTask.h           |  1 +
 .../02378_part_log_profile_events.sql         |  2 +-
 .../03221_merge_profile_events.reference      |  3 +
 .../03221_merge_profile_events.sql            | 88 +++++++++++++++++++
 .../03221_mutate_profile_events.reference     |  2 +
 .../03221_mutate_profile_events.sql           | 33 +++++++
 18 files changed, 174 insertions(+), 20 deletions(-)
 create mode 100644 tests/queries/0_stateless/03221_merge_profile_events.reference
 create mode 100644 tests/queries/0_stateless/03221_merge_profile_events.sql
 create mode 100644 tests/queries/0_stateless/03221_mutate_profile_events.reference
 create mode 100644 tests/queries/0_stateless/03221_mutate_profile_events.sql

diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp
index 857a08d8a5d..d43d9fdcea8 100644
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@@ -209,6 +209,8 @@
     \
     M(Merge, "Number of launched background merges.") \
     M(MergedRows, "Rows read for background merges. This is the number of rows before merge.") \
+    M(MergedColumns, "Number of columns merged during the horizontal stage of merges.") \
+    M(GatheredColumns, "Number of columns gathered during the vertical stage of merges.") \
     M(MergedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for background merges. This is the number before merge.") \
     M(MergeTotalMilliseconds, "Total time spent for background merges") \
     M(MergeExecuteMilliseconds, "Total busy time spent for execution of background merges") \
@@ -231,8 +233,11 @@
     M(MutationUntouchedParts, "Number of total parts for which mutations tried to be applied but which was completely skipped according to predicate") \
     M(MutatedRows, "Rows read for mutations. This is the number of rows before mutation") \
     M(MutatedUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) that was read for mutations. This is the number before mutation.") \
-    M(MutationTimeMilliseconds, "Total time spent for mutations.") \
-    M(MutateTaskProjectionsCalculationMicroseconds, "Time spent calculating projections") \
+    M(MutationTotalMilliseconds, "Total time spent for mutations.") \
+    M(MutationExecuteMilliseconds, "Total busy time spent for execution of mutations.") \
+    M(MutationAllPartColumns, "Number of times when task to mutate all columns in part was created") \
+    M(MutationSomePartColumns, "Number of times when task to mutate some columns in part was created") \
+    M(MutateTaskProjectionsCalculationMicroseconds, "Time spent calculating projections in mutations.") \
     \
     M(MergeTreeDataWriterRows, "Number of rows INSERTed to MergeTree tables.") \
     M(MergeTreeDataWriterUncompressedBytes, "Uncompressed bytes (for columns as they stored in memory) INSERTed to MergeTree tables.") \
diff --git a/src/Processors/Transforms/ColumnGathererTransform.h b/src/Processors/Transforms/ColumnGathererTransform.h
index a535b2669d0..fbc9a6bfcc6 100644
--- a/src/Processors/Transforms/ColumnGathererTransform.h
+++ b/src/Processors/Transforms/ColumnGathererTransform.h
@@ -2,7 +2,6 @@
 
 #include <IO/ReadBuffer.h>
 #include <Common/PODArray.h>
-#include "base/types.h"
 #include <Processors/Merges/Algorithms/IMergingAlgorithm.h>
 #include <Processors/Merges/IMergingTransform.h>
 
diff --git a/src/Processors/Transforms/MergeJoinTransform.cpp b/src/Processors/Transforms/MergeJoinTransform.cpp
index 26601207da8..ec7f567ea57 100644
--- a/src/Processors/Transforms/MergeJoinTransform.cpp
+++ b/src/Processors/Transforms/MergeJoinTransform.cpp
@@ -515,7 +515,7 @@ IMergingAlgorithm::MergedStats MergeJoinAlgorithm::getMergedStats() const
 {
     return
     {
-        .bytes = 0,
+        .bytes = stat.num_bytes[0] + stat.num_bytes[1],
         .rows = stat.num_rows[0] + stat.num_rows[1],
         .blocks = stat.num_blocks[0] + stat.num_blocks[1],
     };
@@ -557,6 +557,7 @@ void MergeJoinAlgorithm::consume(Input & input, size_t source_num)
     {
         stat.num_blocks[source_num] += 1;
         stat.num_rows[source_num] += input.chunk.getNumRows();
+        stat.num_bytes[source_num] += input.chunk.allocatedBytes();
     }
 
     prepareChunk(input.chunk);
diff --git a/src/Processors/Transforms/MergeJoinTransform.h b/src/Processors/Transforms/MergeJoinTransform.h
index 841a3f15a92..8f74974af0f 100644
--- a/src/Processors/Transforms/MergeJoinTransform.h
+++ b/src/Processors/Transforms/MergeJoinTransform.h
@@ -282,6 +282,7 @@ private:
     {
         size_t num_blocks[2] = {0, 0};
         size_t num_rows[2] = {0, 0};
+        size_t num_bytes[2] = {0, 0};
 
         size_t max_blocks_loaded = 0;
     };
diff --git a/src/Processors/Transforms/PasteJoinTransform.cpp b/src/Processors/Transforms/PasteJoinTransform.cpp
index ad01b721726..982a347a70f 100644
--- a/src/Processors/Transforms/PasteJoinTransform.cpp
+++ b/src/Processors/Transforms/PasteJoinTransform.cpp
@@ -62,7 +62,7 @@ IMergingAlgorithm::MergedStats PasteJoinAlgorithm::getMergedStats() const
 {
     return
     {
-        .bytes = 0,
+        .bytes = stat.num_bytes[0] + stat.num_bytes[1],
         .rows = stat.num_rows[0] + stat.num_rows[1],
         .blocks = stat.num_blocks[0] + stat.num_blocks[1],
     };
diff --git a/src/Processors/Transforms/PasteJoinTransform.h b/src/Processors/Transforms/PasteJoinTransform.h
index fbe85f6993b..c184f20362d 100644
--- a/src/Processors/Transforms/PasteJoinTransform.h
+++ b/src/Processors/Transforms/PasteJoinTransform.h
@@ -54,6 +54,7 @@ private:
     {
         size_t num_blocks[2] = {0, 0};
         size_t num_rows[2] = {0, 0};
+        size_t num_bytes[2] = {0, 0};
 
         size_t max_blocks_loaded = 0;
     };
diff --git a/src/Storages/MergeTree/MergeList.h b/src/Storages/MergeTree/MergeList.h
index 3a96ba0abae..d40af6abf43 100644
--- a/src/Storages/MergeTree/MergeList.h
+++ b/src/Storages/MergeTree/MergeList.h
@@ -6,7 +6,6 @@
 #include <Common/CurrentMetrics.h>
 #include <Common/MemoryTracker.h>
 #include <Common/ThreadStatus.h>
-#include "base/types.h"
 #include <Storages/MergeTree/MergeType.h>
 #include <Storages/MergeTree/MergeAlgorithm.h>
 #include <Storages/MergeTree/MergeTreePartInfo.h>
diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp
index 5f178f08ec3..39bac8f7c24 100644
--- a/src/Storages/MergeTree/MergeTask.cpp
+++ b/src/Storages/MergeTree/MergeTask.cpp
@@ -5,9 +5,6 @@
 #include <memory>
 #include <fmt/format.h>
 
-#include "Common/ElapsedTimeProfileEventIncrement.h"
-#include "Common/Logger.h"
-#include "Common/Stopwatch.h"
 #include <Common/logger_useful.h>
 #include <Common/ActionBlocker.h>
 #include <Core/Settings.h>
@@ -46,6 +43,8 @@
 namespace ProfileEvents
 {
     extern const Event Merge;
+    extern const Event MergedColumns;
+    extern const Event GatheredColumns;
     extern const Event MergeTotalMilliseconds;
     extern const Event MergeExecuteMilliseconds;
     extern const Event MergeHorizontalStageExecuteMilliseconds;
@@ -183,6 +182,8 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::extractMergingAndGatheringColu
 
 bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
 {
+    ProfileEvents::increment(ProfileEvents::Merge);
+
     String local_tmp_prefix;
     if (global_ctx->need_prefix)
     {
@@ -200,8 +201,6 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
     if (isTTLMergeType(global_ctx->future_part->merge_type) && global_ctx->ttl_merges_blocker->isCancelled())
         throw Exception(ErrorCodes::ABORTED, "Cancelled merging parts with TTL");
 
-    ProfileEvents::increment(ProfileEvents::Merge);
-
     LOG_DEBUG(ctx->log, "Merging {} parts: from {} to {} into {} with storage {}",
         global_ctx->future_part->parts.size(),
         global_ctx->future_part->parts.front()->name,
@@ -810,6 +809,9 @@ bool MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() c
 
     /// Print overall profiling info. NOTE: it may duplicates previous messages
     {
+        ProfileEvents::increment(ProfileEvents::MergedColumns, global_ctx->merging_columns.size());
+        ProfileEvents::increment(ProfileEvents::GatheredColumns, global_ctx->gathering_columns.size());
+
         double elapsed_seconds = global_ctx->merge_list_element_ptr->watch.elapsedSeconds();
         LOG_DEBUG(ctx->log,
             "Merge sorted {} rows, containing {} columns ({} merged, {} gathered) in {} sec., {} rows/sec., {}/sec.",
@@ -1021,8 +1023,8 @@ bool MergeTask::execute()
     /// Stage is finished, need to initialize context for the next stage and update profile events.
 
     UInt64 current_elapsed_ms = global_ctx->merge_list_element_ptr->watch.elapsedMilliseconds();
-    UInt64 stage_elapsed_ms = current_elapsed_ms - global_ctx->prev_elapesed_ms;
-    global_ctx->prev_elapesed_ms = current_elapsed_ms;
+    UInt64 stage_elapsed_ms = current_elapsed_ms - global_ctx->prev_elapsed_ms;
+    global_ctx->prev_elapsed_ms = current_elapsed_ms;
 
     ProfileEvents::increment(current_stage->getTotalTimeProfileEvent(), stage_elapsed_ms);
     ProfileEvents::increment(ProfileEvents::MergeTotalMilliseconds, stage_elapsed_ms);
diff --git a/src/Storages/MergeTree/MergeTask.h b/src/Storages/MergeTree/MergeTask.h
index 979c85482e5..38ccc287187 100644
--- a/src/Storages/MergeTree/MergeTask.h
+++ b/src/Storages/MergeTree/MergeTask.h
@@ -203,7 +203,7 @@ private:
         bool need_prefix;
 
         scope_guard temporary_directory_lock;
-        UInt64 prev_elapesed_ms{0};
+        UInt64 prev_elapsed_ms{0};
     };
 
     using GlobalRuntimeContextPtr = std::shared_ptr<GlobalRuntimeContext>;
diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp
index 73084f487b9..56f68fd265a 100644
--- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp
+++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp
@@ -254,6 +254,7 @@ bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrit
             LOG_ERROR(log, "{}. Data after mutation is not byte-identical to data on another replicas. "
                            "We will download merged part from replica to force byte-identical result.", getCurrentExceptionMessage(false));
 
+            mutate_task->updateProfileEvents();
             write_part_log(ExecutionStatus::fromCurrentException("", true));
 
             if (storage.getSettings()->detach_not_byte_identical_parts)
@@ -281,6 +282,7 @@ bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrit
          */
     finish_callback = [storage_ptr = &storage]() { storage_ptr->merge_selecting_task->schedule(); };
     ProfileEvents::increment(ProfileEvents::ReplicatedPartMutations);
+    mutate_task->updateProfileEvents();
     write_part_log({});
 
     return true;
diff --git a/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp b/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp
index 9aec074deae..10461eb5942 100644
--- a/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp
+++ b/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp
@@ -102,6 +102,7 @@ bool MutatePlainMergeTreeTask::executeStep()
                 transaction.commit();
 
                 storage.updateMutationEntriesErrors(future_part, true, "");
+                mutate_task->updateProfileEvents();
                 write_part_log({});
 
                 state = State::NEED_FINISH;
@@ -114,6 +115,7 @@ bool MutatePlainMergeTreeTask::executeStep()
                 PreformattedMessage exception_message = getCurrentExceptionMessageAndPattern(/* with_stacktrace */ false);
                 LOG_ERROR(getLogger("MutatePlainMergeTreeTask"), exception_message);
                 storage.updateMutationEntriesErrors(future_part, false, exception_message.text);
+                mutate_task->updateProfileEvents();
                 write_part_log(ExecutionStatus::fromCurrentException("", true));
                 tryLogCurrentException(__PRETTY_FUNCTION__);
                 return false;
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index fe78964a241..f4af38d3b67 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -40,7 +40,10 @@ namespace ProfileEvents
 {
     extern const Event MutationTotalParts;
     extern const Event MutationUntouchedParts;
-    extern const Event MutationTimeMilliseconds;
+    extern const Event MutationTotalMilliseconds;
+    extern const Event MutationExecuteMilliseconds;
+    extern const Event MutationAllPartColumns;
+    extern const Event MutationSomePartColumns;
     extern const Event MutateTaskProjectionsCalculationMicroseconds;
 }
 
@@ -1049,6 +1052,7 @@ struct MutationContext
 
     /// Whether we need to count lightweight delete rows in this mutation
     bool count_lightweight_deleted_rows;
+    UInt64 execute_elapsed_ns = 0;
 };
 
 using MutationContextPtr = std::shared_ptr<MutationContext>;
@@ -2020,6 +2024,9 @@ MutateTask::MutateTask(
 
 bool MutateTask::execute()
 {
+    Stopwatch watch;
+    SCOPE_EXIT({ ctx->execute_elapsed_ns += watch.elapsedNanoseconds(); });
+
     switch (state)
     {
         case State::NEED_PREPARE:
@@ -2037,9 +2044,6 @@ bool MutateTask::execute()
             if (task->executeStep())
                 return true;
 
-            auto total_elapsed_ms = (*ctx->mutate_entry)->watch.elapsedMilliseconds();
-            ProfileEvents::increment(ProfileEvents::MutationTimeMilliseconds, total_elapsed_ms);
-
             // The `new_data_part` is a shared pointer and must be moved to allow
             // part deletion in case it is needed in `MutateFromLogEntryTask::finalize`.
             //
@@ -2056,6 +2060,15 @@ bool MutateTask::execute()
     return false;
 }
 
+void MutateTask::updateProfileEvents() const
+{
+    UInt64 total_elapsed_ms = (*ctx->mutate_entry)->watch.elapsedMilliseconds();
+    UInt64 execute_elapsed_ms = ctx->execute_elapsed_ns / 1000000UL;
+
+    ProfileEvents::increment(ProfileEvents::MutationTotalMilliseconds, total_elapsed_ms);
+    ProfileEvents::increment(ProfileEvents::MutationExecuteMilliseconds, execute_elapsed_ms);
+}
+
 static bool canSkipConversionToNullable(const MergeTreeDataPartPtr & part, const MutationCommand & command)
 {
     if (command.type != MutationCommand::READ_COLUMN)
@@ -2118,13 +2131,13 @@ static bool canSkipMutationCommandForPart(const MergeTreeDataPartPtr & part, con
 
 bool MutateTask::prepare()
 {
+    ProfileEvents::increment(ProfileEvents::MutationTotalParts);
     MutationHelpers::checkOperationIsNotCanceled(*ctx->merges_blocker, ctx->mutate_entry);
 
     if (ctx->future_part->parts.size() != 1)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to mutate {} parts, not one. "
             "This is a bug.", ctx->future_part->parts.size());
 
-    ProfileEvents::increment(ProfileEvents::MutationTotalParts);
     ctx->num_mutations = std::make_unique<CurrentMetrics::Increment>(CurrentMetrics::PartMutation);
 
     auto context_for_reading = Context::createCopy(ctx->context);
@@ -2291,6 +2304,7 @@ bool MutateTask::prepare()
         ctx->new_data_part->remove_tmp_policy = IMergeTreeDataPart::BlobsRemovalPolicyForTemporaryParts::REMOVE_BLOBS;
 
         task = std::make_unique<MutateAllPartColumnsTask>(ctx);
+        ProfileEvents::increment(ProfileEvents::MutationAllPartColumns);
     }
     else /// TODO: check that we modify only non-key columns in this case.
     {
@@ -2330,6 +2344,7 @@ bool MutateTask::prepare()
         ctx->new_data_part->remove_tmp_policy = IMergeTreeDataPart::BlobsRemovalPolicyForTemporaryParts::ASK_KEEPER;
 
         task = std::make_unique<MutateSomePartColumnsTask>(ctx);
+        ProfileEvents::increment(ProfileEvents::MutationSomePartColumns);
     }
 
     return true;
diff --git a/src/Storages/MergeTree/MutateTask.h b/src/Storages/MergeTree/MutateTask.h
index dc22b90f0e9..08427bff6d8 100644
--- a/src/Storages/MergeTree/MutateTask.h
+++ b/src/Storages/MergeTree/MutateTask.h
@@ -39,6 +39,7 @@ public:
         bool need_prefix_);
 
     bool execute();
+    void updateProfileEvents() const;
 
     std::future<MergeTreeData::MutableDataPartPtr> getFuture()
     {
diff --git a/tests/queries/0_stateless/02378_part_log_profile_events.sql b/tests/queries/0_stateless/02378_part_log_profile_events.sql
index 38d3f8b4c05..eec76d6f50e 100644
--- a/tests/queries/0_stateless/02378_part_log_profile_events.sql
+++ b/tests/queries/0_stateless/02378_part_log_profile_events.sql
@@ -39,7 +39,7 @@ SYSTEM FLUSH LOGS;
 
 SELECT
     if(count() == 2, 'Ok', 'Error: ' || toString(count())),
-    if(SUM(ProfileEvents['MergedRows']) == 512, 'Ok', 'Error: ' || toString(SUM(ProfileEvents['MergedRows']))),
+    if(SUM(ProfileEvents['MutatedRows']) == 512, 'Ok', 'Error: ' || toString(SUM(ProfileEvents['MutatedRows']))),
     if(SUM(ProfileEvents['FileOpen']) > 1, 'Ok', 'Error: ' || toString(SUM(ProfileEvents['FileOpen'])))
 FROM system.part_log
 WHERE event_time > now() - INTERVAL 10 MINUTE
diff --git a/tests/queries/0_stateless/03221_merge_profile_events.reference b/tests/queries/0_stateless/03221_merge_profile_events.reference
new file mode 100644
index 00000000000..729e53eae79
--- /dev/null
+++ b/tests/queries/0_stateless/03221_merge_profile_events.reference
@@ -0,0 +1,3 @@
+Horizontal	1	20000	3	0	480000	1	1	1	1
+Vertical	1	20000	1	2	480000	1	1	1	1	1	1
+Vertical	2	20020	4	2	480660	1	1	1	1	1	1	1	1
diff --git a/tests/queries/0_stateless/03221_merge_profile_events.sql b/tests/queries/0_stateless/03221_merge_profile_events.sql
new file mode 100644
index 00000000000..787aff93ffc
--- /dev/null
+++ b/tests/queries/0_stateless/03221_merge_profile_events.sql
@@ -0,0 +1,88 @@
+-- Tags: no-random-settings, no-random-merge-tree-settings
+
+DROP TABLE IF EXISTS t_merge_profile_events_1;
+
+CREATE TABLE t_merge_profile_events_1 (id UInt64, v1 UInt64, v2 UInt64)
+ENGINE = MergeTree ORDER BY id
+SETTINGS min_bytes_for_wide_part = 0;
+
+INSERT INTO t_merge_profile_events_1 SELECT number, number, number FROM numbers(10000);
+INSERT INTO t_merge_profile_events_1 SELECT number, number, number FROM numbers(10000);
+
+OPTIMIZE TABLE t_merge_profile_events_1 FINAL;
+SYSTEM FLUSH LOGS;
+
+SELECT
+    merge_algorithm,
+    ProfileEvents['Merge'],
+    ProfileEvents['MergedRows'],
+    ProfileEvents['MergedColumns'],
+    ProfileEvents['GatheredColumns'],
+    ProfileEvents['MergedUncompressedBytes'],
+    ProfileEvents['MergeTotalMilliseconds'] > 0,
+    ProfileEvents['MergeExecuteMilliseconds'] > 0,
+    ProfileEvents['MergeHorizontalStageTotalMilliseconds'] > 0,
+    ProfileEvents['MergeHorizontalStageExecuteMilliseconds'] > 0
+FROM system.part_log WHERE database = currentDatabase() AND table = 't_merge_profile_events_1' AND event_type = 'MergeParts' AND part_name = 'all_1_2_1';
+
+DROP TABLE IF EXISTS t_merge_profile_events_1;
+
+DROP TABLE IF EXISTS t_merge_profile_events_2;
+
+CREATE TABLE t_merge_profile_events_2 (id UInt64, v1 UInt64, v2 UInt64)
+ENGINE = MergeTree ORDER BY id
+SETTINGS min_bytes_for_wide_part = 0, vertical_merge_algorithm_min_rows_to_activate = 1, vertical_merge_algorithm_min_columns_to_activate = 1;
+
+INSERT INTO t_merge_profile_events_2 SELECT number, number, number FROM numbers(10000);
+INSERT INTO t_merge_profile_events_2 SELECT number, number, number FROM numbers(10000);
+
+OPTIMIZE TABLE t_merge_profile_events_2 FINAL;
+SYSTEM FLUSH LOGS;
+
+SELECT
+    merge_algorithm,
+    ProfileEvents['Merge'],
+    ProfileEvents['MergedRows'],
+    ProfileEvents['MergedColumns'],
+    ProfileEvents['GatheredColumns'],
+    ProfileEvents['MergedUncompressedBytes'],
+    ProfileEvents['MergeTotalMilliseconds'] > 0,
+    ProfileEvents['MergeExecuteMilliseconds'] > 0,
+    ProfileEvents['MergeHorizontalStageTotalMilliseconds'] > 0,
+    ProfileEvents['MergeHorizontalStageExecuteMilliseconds'] > 0,
+    ProfileEvents['MergeVerticalStageTotalMilliseconds'] > 0,
+    ProfileEvents['MergeVerticalStageExecuteMilliseconds'] > 0,
+FROM system.part_log WHERE database = currentDatabase() AND table = 't_merge_profile_events_2' AND event_type = 'MergeParts' AND part_name = 'all_1_2_1';
+
+DROP TABLE IF EXISTS t_merge_profile_events_2;
+
+DROP TABLE IF EXISTS t_merge_profile_events_3;
+
+CREATE TABLE t_merge_profile_events_3 (id UInt64, v1 UInt64, v2 UInt64, PROJECTION p (SELECT sum(v1), sum(v2) GROUP BY id % 10))
+ENGINE = MergeTree ORDER BY id
+SETTINGS min_bytes_for_wide_part = 0, vertical_merge_algorithm_min_rows_to_activate = 1, vertical_merge_algorithm_min_columns_to_activate = 1;
+
+INSERT INTO t_merge_profile_events_3 SELECT number, number, number FROM numbers(10000);
+INSERT INTO t_merge_profile_events_3 SELECT number, number, number FROM numbers(10000);
+
+OPTIMIZE TABLE t_merge_profile_events_3 FINAL;
+SYSTEM FLUSH LOGS;
+
+SELECT
+    merge_algorithm,
+    ProfileEvents['Merge'],
+    ProfileEvents['MergedRows'],
+    ProfileEvents['MergedColumns'],
+    ProfileEvents['GatheredColumns'],
+    ProfileEvents['MergedUncompressedBytes'],
+    ProfileEvents['MergeTotalMilliseconds'] > 0,
+    ProfileEvents['MergeExecuteMilliseconds'] > 0,
+    ProfileEvents['MergeHorizontalStageTotalMilliseconds'] > 0,
+    ProfileEvents['MergeHorizontalStageExecuteMilliseconds'] > 0,
+    ProfileEvents['MergeVerticalStageTotalMilliseconds'] > 0,
+    ProfileEvents['MergeVerticalStageExecuteMilliseconds'] > 0,
+    ProfileEvents['MergeProjectionStageTotalMilliseconds'] > 0,
+    ProfileEvents['MergeProjectionStageExecuteMilliseconds'] > 0,
+FROM system.part_log WHERE database = currentDatabase() AND table = 't_merge_profile_events_3' AND event_type = 'MergeParts' AND part_name = 'all_1_2_1';
+
+DROP TABLE IF EXISTS t_merge_profile_events_3;
diff --git a/tests/queries/0_stateless/03221_mutate_profile_events.reference b/tests/queries/0_stateless/03221_mutate_profile_events.reference
new file mode 100644
index 00000000000..d094c37ff88
--- /dev/null
+++ b/tests/queries/0_stateless/03221_mutate_profile_events.reference
@@ -0,0 +1,2 @@
+3	2	1	10000	160000	0	1	1	1
+4	2	1	10000	320000	1	0	1	1
diff --git a/tests/queries/0_stateless/03221_mutate_profile_events.sql b/tests/queries/0_stateless/03221_mutate_profile_events.sql
new file mode 100644
index 00000000000..e9f7f9670bd
--- /dev/null
+++ b/tests/queries/0_stateless/03221_mutate_profile_events.sql
@@ -0,0 +1,33 @@
+-- Tags: no-random-settings, no-random-merge-tree-settings
+
+DROP TABLE IF EXISTS t_mutate_profile_events;
+
+CREATE TABLE t_mutate_profile_events (key UInt64, id UInt64, v1 UInt64, v2 UInt64)
+ENGINE = MergeTree ORDER BY id PARTITION BY key
+SETTINGS min_bytes_for_wide_part = 0;
+
+INSERT INTO t_mutate_profile_events SELECT 1, number, number, number FROM numbers(10000);
+INSERT INTO t_mutate_profile_events SELECT 2, number, number, number FROM numbers(10000);
+
+SET mutations_sync = 2;
+
+ALTER TABLE t_mutate_profile_events UPDATE v1 = 1000 WHERE key = 1;
+ALTER TABLE t_mutate_profile_events DELETE WHERE key = 2 AND v2 % 10 = 0;
+
+SYSTEM FLUSH LOGS;
+
+SELECT
+    splitByChar('_', part_name)[-1] AS version,
+    sum(ProfileEvents['MutationTotalParts']),
+    sum(ProfileEvents['MutationUntouchedParts']),
+    sum(ProfileEvents['MutatedRows']),
+    sum(ProfileEvents['MutatedUncompressedBytes']),
+    sum(ProfileEvents['MutationAllPartColumns']),
+    sum(ProfileEvents['MutationSomePartColumns']),
+    sum(ProfileEvents['MutationTotalMilliseconds']) > 0,
+    sum(ProfileEvents['MutationExecuteMilliseconds']) > 0,
+FROM system.part_log
+WHERE database = currentDatabase() AND table = 't_mutate_profile_events' AND event_type = 'MutatePart'
+GROUP BY version ORDER BY version;
+
+DROP TABLE IF EXISTS t_mutate_profile_events

From 6f0f27838745d427966a3c949316a541d9fda7ac Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 9 Aug 2024 03:45:07 +0200
Subject: [PATCH 08/88] Fix trash (low-quality code) in AWS S3

---
 src/IO/S3/URI.cpp | 29 ++++++++++++-----------------
 src/IO/S3/URI.h   |  8 ++++----
 2 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/src/IO/S3/URI.cpp b/src/IO/S3/URI.cpp
index fead18315d8..446c2aa355b 100644
--- a/src/IO/S3/URI.cpp
+++ b/src/IO/S3/URI.cpp
@@ -1,8 +1,8 @@
 #include <IO/S3/URI.h>
-#include <Interpreters/Context.h>
-#include <Storages/NamedCollectionsHelpers.h>
-#include "Common/Macros.h"
+
 #if USE_AWS_S3
+#include <Interpreters/Context.h>
+#include <Common/Macros.h>
 #include <Common/Exception.h>
 #include <Common/quoteString.h>
 #include <Common/re2.h>
@@ -10,6 +10,7 @@
 
 #include <boost/algorithm/string/case_conv.hpp>
 
+
 namespace DB
 {
 
@@ -47,14 +48,6 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
     /// https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html#path-style-access
     static const RE2 path_style_pattern("^/([^/]*)/(.*)");
 
-    static constexpr auto S3 = "S3";
-    static constexpr auto S3EXPRESS = "S3EXPRESS";
-    static constexpr auto COSN = "COSN";
-    static constexpr auto COS = "COS";
-    static constexpr auto OBS = "OBS";
-    static constexpr auto OSS = "OSS";
-    static constexpr auto EOS = "EOS";
-
     if (allow_archive_path_syntax)
         std::tie(uri_str, archive_pattern) = getURIAndArchivePattern(uri_);
     else
@@ -85,7 +78,7 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
             URIConverter::modifyURI(uri, mapper);
     }
 
-    storage_name = S3;
+    storage_name = "S3";
 
     if (uri.getHost().empty())
         throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI.");
@@ -93,11 +86,13 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
     /// Extract object version ID from query string.
     bool has_version_id = false;
     for (const auto & [query_key, query_value] : uri.getQueryParameters())
+    {
         if (query_key == "versionId")
         {
             version_id = query_value;
             has_version_id = true;
         }
+    }
 
     /// Poco::URI will ignore '?' when parsing the path, but if there is a versionId in the http parameter,
     /// '?' can not be used as a wildcard, otherwise it will be ambiguous.
@@ -130,14 +125,14 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
 
         boost::to_upper(name);
         /// For S3Express it will look like s3express-eun1-az1, i.e. contain region and AZ info
-        if (name != S3 && !name.starts_with(S3EXPRESS) && name != COS && name != OBS && name != OSS && name != EOS)
+        if (name != "S3" && !name.starts_with("S3EXPRESS") && name != "COS" && name != "OBS" && name != "OSS" && name != "EOS")
             throw Exception(
                 ErrorCodes::BAD_ARGUMENTS,
                 "Object storage system name is unrecognized in virtual hosted style S3 URI: {}",
                 quoteString(name));
 
-        if (name == COS)
-            storage_name = COSN;
+        if (name == "COS")
+            storage_name = "COSN";
         else
             storage_name = name;
     }
@@ -153,8 +148,8 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
 
 void URI::addRegionToURI(const std::string &region)
 {
-    if (auto pos = endpoint.find("amazonaws.com"); pos != std::string::npos)
-        endpoint = endpoint.substr(0, pos) + region + "." + endpoint.substr(pos);
+    if (auto pos = endpoint.find(".amazonaws.com"); pos != std::string::npos)
+        endpoint = endpoint.substr(0, pos) + "." + region + endpoint.substr(pos);
 }
 
 void URI::validateBucket(const String & bucket, const Poco::URI & uri)
diff --git a/src/IO/S3/URI.h b/src/IO/S3/URI.h
index 80e2da96cd4..c8d0b28cd15 100644
--- a/src/IO/S3/URI.h
+++ b/src/IO/S3/URI.h
@@ -1,14 +1,14 @@
 #pragma once
 
-#include <optional>
-#include <string>
-
 #include "config.h"
 
 #if USE_AWS_S3
 
+#include <optional>
+#include <string>
 #include <Poco/URI.h>
 
+
 namespace DB::S3
 {
 
@@ -23,7 +23,7 @@ namespace DB::S3
 struct URI
 {
     Poco::URI uri;
-    // Custom endpoint if URI scheme is not S3.
+    // Custom endpoint, used when the URI scheme is not S3.
     std::string endpoint;
     std::string bucket;
     std::string key;

From d0a1ee821b609856c2692abc01d132f8d7a8b88f Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 9 Aug 2024 03:51:12 +0200
Subject: [PATCH 09/88] You don't know regular expressions

---
 src/IO/S3/URI.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/IO/S3/URI.cpp b/src/IO/S3/URI.cpp
index 446c2aa355b..eea73474c44 100644
--- a/src/IO/S3/URI.cpp
+++ b/src/IO/S3/URI.cpp
@@ -41,7 +41,7 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
     /// Case when AWS Private Link Interface is being used
     /// E.g. (bucket.vpce-07a1cd78f1bd55c5f-j3a3vg6w.s3.us-east-1.vpce.amazonaws.com/bucket-name/key)
     /// https://docs.aws.amazon.com/AmazonS3/latest/userguide/privatelink-interface-endpoints.html
-    static const RE2 aws_private_link_style_pattern(R"(bucket\.vpce\-([a-z0-9\-.]+)\.vpce.amazonaws.com(:\d{1,5})?)");
+    static const RE2 aws_private_link_style_pattern(R"(bucket\.vpce\-([a-z0-9\-.]+)\.vpce\.amazonaws\.com(:\d{1,5})?)");
 
     /// Case when bucket name and key represented in path of S3 URL.
     /// E.g. (https://s3.region.amazonaws.com/bucket-name/key)

From 978d36f9fe0e8cc34b6529276b7435b56a16bbb7 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 9 Aug 2024 03:57:47 +0200
Subject: [PATCH 10/88] It's strange that we cared about this

---
 src/IO/S3/URI.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/IO/S3/URI.cpp b/src/IO/S3/URI.cpp
index eea73474c44..64962f63edb 100644
--- a/src/IO/S3/URI.cpp
+++ b/src/IO/S3/URI.cpp
@@ -43,7 +43,7 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
     /// https://docs.aws.amazon.com/AmazonS3/latest/userguide/privatelink-interface-endpoints.html
     static const RE2 aws_private_link_style_pattern(R"(bucket\.vpce\-([a-z0-9\-.]+)\.vpce\.amazonaws\.com(:\d{1,5})?)");
 
-    /// Case when bucket name and key represented in path of S3 URL.
+    /// Case when bucket name and key represented in the path of S3 URL.
     /// E.g. (https://s3.region.amazonaws.com/bucket-name/key)
     /// https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html#path-style-access
     static const RE2 path_style_pattern("^/([^/]*)/(.*)");
@@ -124,13 +124,6 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
         }
 
         boost::to_upper(name);
-        /// For S3Express it will look like s3express-eun1-az1, i.e. contain region and AZ info
-        if (name != "S3" && !name.starts_with("S3EXPRESS") && name != "COS" && name != "OBS" && name != "OSS" && name != "EOS")
-            throw Exception(
-                ErrorCodes::BAD_ARGUMENTS,
-                "Object storage system name is unrecognized in virtual hosted style S3 URI: {}",
-                quoteString(name));
-
         if (name == "COS")
             storage_name = "COSN";
         else

From dbd4a6d551a9c242c0ce8025ebcdb1304f0d448c Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 9 Aug 2024 04:41:06 +0200
Subject: [PATCH 11/88] Speed up from 4 sec to 2 sec #52771

---
 src/IO/S3/Credentials.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/IO/S3/Credentials.cpp b/src/IO/S3/Credentials.cpp
index dfb7727fca4..11779c4dbd5 100644
--- a/src/IO/S3/Credentials.cpp
+++ b/src/IO/S3/Credentials.cpp
@@ -145,12 +145,16 @@ Aws::String AWSEC2MetadataClient::getDefaultCredentialsSecurely() const
 {
     String user_agent_string = awsComputeUserAgentString();
     auto [new_token, response_code] = getEC2MetadataToken(user_agent_string);
-    if (response_code == Aws::Http::HttpResponseCode::BAD_REQUEST)
+    if (response_code == Aws::Http::HttpResponseCode::BAD_REQUEST
+        || response_code == Aws::Http::HttpResponseCode::REQUEST_NOT_MADE)
+    {
+        /// At least the host should be available and reply, otherwise neither IMDSv2 nor IMDSv1 is usable.
         return {};
+    }
     else if (response_code != Aws::Http::HttpResponseCode::OK || new_token.empty())
     {
         LOG_TRACE(logger, "Calling EC2MetadataService to get token failed, "
-                  "falling back to less secure way. HTTP response code: {}", response_code);
+                  "falling back to a less secure way. HTTP response code: {}", response_code);
         return getDefaultCredentials();
     }
 

From 8bfe4ee23f8b37ff8780e44ede9dc785ac563f75 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 9 Aug 2024 04:57:18 +0200
Subject: [PATCH 12/88] Speed up requests when IMDS is not available

---
 src/IO/S3/Credentials.cpp | 6 +++---
 src/IO/S3/Credentials.h   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/IO/S3/Credentials.cpp b/src/IO/S3/Credentials.cpp
index 11779c4dbd5..9c5f6547933 100644
--- a/src/IO/S3/Credentials.cpp
+++ b/src/IO/S3/Credentials.cpp
@@ -251,7 +251,7 @@ static Aws::String getAWSMetadataEndpoint()
     return ec2_metadata_service_endpoint;
 }
 
-std::shared_ptr<AWSEC2MetadataClient> InitEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration)
+std::shared_ptr<AWSEC2MetadataClient> createEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration)
 {
     auto endpoint = getAWSMetadataEndpoint();
     return std::make_shared<AWSEC2MetadataClient>(client_configuration, endpoint.c_str());
@@ -785,11 +785,11 @@ S3CredentialsProviderChain::S3CredentialsProviderChain(
 
             /// EC2MetadataService throttles by delaying the response so the service client should set a large read timeout.
             /// EC2MetadataService delay is in order of seconds so it only make sense to retry after a couple of seconds.
-            aws_client_configuration.connectTimeoutMs = 1000;
+            aws_client_configuration.connectTimeoutMs = 10;
             aws_client_configuration.requestTimeoutMs = 1000;
 
             aws_client_configuration.retryStrategy = std::make_shared<Aws::Client::DefaultRetryStrategy>(1, 1000);
-            auto ec2_metadata_client = InitEC2MetadataClient(aws_client_configuration);
+            auto ec2_metadata_client = createEC2MetadataClient(aws_client_configuration);
             auto config_loader = std::make_shared<AWSEC2InstanceProfileConfigLoader>(ec2_metadata_client, !credentials_configuration.use_insecure_imds_request);
 
             AddProvider(std::make_shared<AWSInstanceProfileCredentialsProvider>(config_loader));
diff --git a/src/IO/S3/Credentials.h b/src/IO/S3/Credentials.h
index 95297ab0538..042c48ec15a 100644
--- a/src/IO/S3/Credentials.h
+++ b/src/IO/S3/Credentials.h
@@ -70,7 +70,7 @@ private:
     LoggerPtr logger;
 };
 
-std::shared_ptr<AWSEC2MetadataClient> InitEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration);
+std::shared_ptr<AWSEC2MetadataClient> createEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration);
 
 class AWSEC2InstanceProfileConfigLoader : public Aws::Config::AWSProfileConfigLoader
 {

From 9470ceb34d6519c820f7c7ddccbe27b48a046f2a Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 9 Aug 2024 05:41:10 +0200
Subject: [PATCH 13/88] Minor changes

---
 src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 2 +-
 src/IO/S3/PocoHTTPClient.cpp                    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
index 433a0e96d2e..9854bada9ec 100644
--- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
@@ -63,7 +63,7 @@ void throwIfError(const Aws::Utils::Outcome<Result, Error> & response)
     {
         const auto & err = response.GetError();
         throw S3Exception(
-            fmt::format("{} (Code: {}, s3 exception: {})",
+            fmt::format("{} (Code: {}, S3 exception: '{}')",
                         err.GetMessage(), static_cast<size_t>(err.GetErrorType()), err.GetExceptionName()),
             err.GetErrorType());
     }
diff --git a/src/IO/S3/PocoHTTPClient.cpp b/src/IO/S3/PocoHTTPClient.cpp
index aab7a39534d..de43f34d838 100644
--- a/src/IO/S3/PocoHTTPClient.cpp
+++ b/src/IO/S3/PocoHTTPClient.cpp
@@ -128,7 +128,7 @@ void PocoHTTPClientConfiguration::updateSchemeAndRegion()
             }
             else
             {
-                /// In global mode AWS C++ SDK send `us-east-1` but accept switching to another one if being suggested.
+                /// In global mode, the AWS C++ SDK sends `us-east-1` but accepts switching to another region if one is suggested.
                 region = Aws::Region::AWS_GLOBAL;
             }
         }

From a74cc601cd11ab32df61c46f9950c051fd8bb4da Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 9 Aug 2024 16:04:27 +0200
Subject: [PATCH 14/88] A catch-all URL style for S3

---
 src/IO/S3/URI.cpp | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/IO/S3/URI.cpp b/src/IO/S3/URI.cpp
index 64962f63edb..9c80b377661 100644
--- a/src/IO/S3/URI.cpp
+++ b/src/IO/S3/URI.cpp
@@ -136,7 +136,16 @@ URI::URI(const std::string & uri_, bool allow_archive_path_syntax)
         validateBucket(bucket, uri);
     }
     else
-        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI.");
+    {
+        /// Custom endpoint, e.g. a public domain of Cloudflare R2,
+        /// which could be served by custom server-side code.
+        storage_name = "S3";
+        bucket = "default";
+        is_virtual_hosted_style = false;
+        endpoint = uri.getScheme() + "://" + uri.getAuthority();
+        if (!uri.getPath().empty())
+            key = uri.getPath().substr(1);
+    }
 }
 
 void URI::addRegionToURI(const std::string &region)

From 9a6d98cd203ee9a6cdde4450aaf72d109a9719c7 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 9 Aug 2024 16:15:28 +0200
Subject: [PATCH 15/88] Update test

---
 .../integration/test_odbc_interaction/test.py | 44 +++++++++----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/tests/integration/test_odbc_interaction/test.py b/tests/integration/test_odbc_interaction/test.py
index 0d0d7a0afb1..9d4ca5ad49f 100644
--- a/tests/integration/test_odbc_interaction/test.py
+++ b/tests/integration/test_odbc_interaction/test.py
@@ -51,9 +51,9 @@ create_table_sql_nullable_template = """
     """
 
 
-def skip_test_msan(instance):
-    if instance.is_built_with_memory_sanitizer():
-        pytest.skip("Memory Sanitizer cannot work with third-party shared libraries")
+def skip_test_sanitizers(instance):
+    if instance.is_built_with_sanitizer():
+        pytest.skip("Sanitizers cannot work with third-party shared libraries")
 
 
 def get_mysql_conn():
@@ -208,7 +208,7 @@ def started_cluster():
 
 
 def test_mysql_odbc_select_nullable(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
     mysql_setup = node1.odbc_drivers["MySQL"]
 
     table_name = "test_insert_nullable_select"
@@ -248,7 +248,7 @@ def test_mysql_odbc_select_nullable(started_cluster):
 
 
 def test_mysql_simple_select_works(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     mysql_setup = node1.odbc_drivers["MySQL"]
 
@@ -331,7 +331,7 @@ CREATE TABLE {}(id UInt32, name String, age UInt32, money UInt32, column_x Nulla
 
 
 def test_mysql_insert(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     mysql_setup = node1.odbc_drivers["MySQL"]
     table_name = "test_insert"
@@ -374,7 +374,7 @@ def test_mysql_insert(started_cluster):
 
 
 def test_sqlite_simple_select_function_works(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     sqlite_setup = node1.odbc_drivers["SQLite3"]
     sqlite_db = sqlite_setup["Database"]
@@ -438,7 +438,7 @@ def test_sqlite_simple_select_function_works(started_cluster):
 
 
 def test_sqlite_table_function(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     sqlite_setup = node1.odbc_drivers["SQLite3"]
     sqlite_db = sqlite_setup["Database"]
@@ -470,7 +470,7 @@ def test_sqlite_table_function(started_cluster):
 
 
 def test_sqlite_simple_select_storage_works(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     sqlite_setup = node1.odbc_drivers["SQLite3"]
     sqlite_db = sqlite_setup["Database"]
@@ -503,7 +503,7 @@ def test_sqlite_simple_select_storage_works(started_cluster):
 
 
 def test_sqlite_odbc_hashed_dictionary(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     sqlite_db = node1.odbc_drivers["SQLite3"]["Database"]
     node1.exec_in_container(
@@ -586,7 +586,7 @@ def test_sqlite_odbc_hashed_dictionary(started_cluster):
 
 
 def test_sqlite_odbc_cached_dictionary(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     sqlite_db = node1.odbc_drivers["SQLite3"]["Database"]
     node1.exec_in_container(
@@ -635,7 +635,7 @@ def test_sqlite_odbc_cached_dictionary(started_cluster):
 
 
 def test_postgres_odbc_hashed_dictionary_with_schema(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     try:
         conn = get_postgres_conn(started_cluster)
@@ -663,7 +663,7 @@ def test_postgres_odbc_hashed_dictionary_with_schema(started_cluster):
 
 
 def test_postgres_odbc_hashed_dictionary_no_tty_pipe_overflow(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     try:
         conn = get_postgres_conn(started_cluster)
@@ -685,7 +685,7 @@ def test_postgres_odbc_hashed_dictionary_no_tty_pipe_overflow(started_cluster):
 
 
 def test_no_connection_pooling(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     try:
         conn = get_postgres_conn(started_cluster)
@@ -717,7 +717,7 @@ def test_no_connection_pooling(started_cluster):
 
 
 def test_postgres_insert(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     conn = get_postgres_conn(started_cluster)
 
@@ -754,7 +754,7 @@ def test_postgres_insert(started_cluster):
 
 
 def test_odbc_postgres_date_data_type(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     try:
         conn = get_postgres_conn(started_cluster)
@@ -783,7 +783,7 @@ def test_odbc_postgres_date_data_type(started_cluster):
 
 
 def test_odbc_postgres_conversions(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     try:
         conn = get_postgres_conn(started_cluster)
@@ -841,7 +841,7 @@ def test_odbc_postgres_conversions(started_cluster):
 
 
 def test_odbc_cyrillic_with_varchar(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     conn = get_postgres_conn(started_cluster)
     cursor = conn.cursor()
@@ -868,7 +868,7 @@ def test_odbc_cyrillic_with_varchar(started_cluster):
 
 
 def test_many_connections(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     conn = get_postgres_conn(started_cluster)
     cursor = conn.cursor()
@@ -894,7 +894,7 @@ def test_many_connections(started_cluster):
 
 
 def test_concurrent_queries(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     conn = get_postgres_conn(started_cluster)
     cursor = conn.cursor()
@@ -948,7 +948,7 @@ def test_concurrent_queries(started_cluster):
 
 
 def test_odbc_long_column_names(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     conn = get_postgres_conn(started_cluster)
     cursor = conn.cursor()
@@ -986,7 +986,7 @@ def test_odbc_long_column_names(started_cluster):
 
 
 def test_odbc_long_text(started_cluster):
-    skip_test_msan(node1)
+    skip_test_sanitizers(node1)
 
     conn = get_postgres_conn(started_cluster)
     cursor = conn.cursor()

From 9d91b600caa0d5cddcf4d13a8c80ac85c9077fca Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 9 Aug 2024 16:27:35 +0200
Subject: [PATCH 16/88] Add a test

---
 tests/integration/test_s3_imds/test_simple.py    |  2 +-
 .../03221_s3_imds_decent_timeout.reference       |  1 +
 .../0_stateless/03221_s3_imds_decent_timeout.sh  | 16 ++++++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 tests/queries/0_stateless/03221_s3_imds_decent_timeout.reference
 create mode 100755 tests/queries/0_stateless/03221_s3_imds_decent_timeout.sh

diff --git a/tests/integration/test_s3_imds/test_simple.py b/tests/integration/test_s3_imds/test_simple.py
index 0dacac2b0b9..4884c824f99 100644
--- a/tests/integration/test_s3_imds/test_simple.py
+++ b/tests/integration/test_s3_imds/test_simple.py
@@ -56,7 +56,7 @@ def test_credentials_from_metadata():
     )
 
     expected_logs = [
-        "Calling EC2MetadataService to get token failed, falling back to less secure way",
+        "Calling EC2MetadataService to get token failed, falling back to a less secure way",
         "Getting default credentials for ec2 instance from resolver:8080",
         "Calling EC2MetadataService resource, /latest/meta-data/iam/security-credentials returned credential string myrole",
         "Calling EC2MetadataService resource /latest/meta-data/iam/security-credentials/myrole",
diff --git a/tests/queries/0_stateless/03221_s3_imds_decent_timeout.reference b/tests/queries/0_stateless/03221_s3_imds_decent_timeout.reference
new file mode 100644
index 00000000000..d00491fd7e5
--- /dev/null
+++ b/tests/queries/0_stateless/03221_s3_imds_decent_timeout.reference
@@ -0,0 +1 @@
+1
diff --git a/tests/queries/0_stateless/03221_s3_imds_decent_timeout.sh b/tests/queries/0_stateless/03221_s3_imds_decent_timeout.sh
new file mode 100755
index 00000000000..fb55539d04a
--- /dev/null
+++ b/tests/queries/0_stateless/03221_s3_imds_decent_timeout.sh
@@ -0,0 +1,16 @@
+#!/usr/bin/env bash
+# Tags: no-fasttest
+# ^ requires S3
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
+
+# Inaccessible IMDS should not introduce large delays, so this query should reply quickly at least sometimes:
+while true
+do
+    # This host (likely) drops packets sent to it (does not reply), so it is good for testing timeouts.
+    # At the same time, we expect that google.com does not drop packets and quickly replies with 404, which is a non-retriable error for S3.
+    AWS_EC2_METADATA_SERVICE_ENDPOINT='https://10.255.255.255/' ${CLICKHOUSE_LOCAL} --time --query "SELECT * FROM s3('https://google.com/test')" |& grep -v -F 404 |
+        ${CLICKHOUSE_LOCAL} --input-format TSV "SELECT c1::Float64 < 1 FROM table" | grep 1 && break
+done

From f4173546a93ab72986d1239a812ac0bc7fda11cd Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Fri, 9 Aug 2024 21:47:01 +0200
Subject: [PATCH 17/88] Remove obsolete test

---
 src/IO/tests/gtest_s3_uri.cpp | 20 --------------------
 1 file changed, 20 deletions(-)

diff --git a/src/IO/tests/gtest_s3_uri.cpp b/src/IO/tests/gtest_s3_uri.cpp
index 0ec28f80072..7216c8077e3 100644
--- a/src/IO/tests/gtest_s3_uri.cpp
+++ b/src/IO/tests/gtest_s3_uri.cpp
@@ -206,11 +206,6 @@ TEST(S3UriTest, validPatterns)
     }
 }
 
-TEST_P(S3UriTest, invalidPatterns)
-{
-    ASSERT_ANY_THROW(S3::URI new_uri(GetParam()));
-}
-
 TEST(S3UriTest, versionIdChecks)
 {
     for (const auto& test_case : TestCases)
@@ -223,19 +218,4 @@ TEST(S3UriTest, versionIdChecks)
     }
 }
 
-INSTANTIATE_TEST_SUITE_P(
-    S3,
-    S3UriTest,
-    testing::Values(
-        "https:///",
-        "https://.s3.amazonaws.com/key",
-        "https://s3.amazonaws.com/key",
-        "https://jokserfn.s3amazonaws.com/key",
-        "https://s3.amazonaws.com//",
-        "https://amazonaws.com/",
-        "https://amazonaws.com//",
-        "https://amazonaws.com//key"));
-
-}
-
 #endif

From cebcc88b312b0702de9866c229e67097c738c4d1 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sat, 10 Aug 2024 00:18:54 +0200
Subject: [PATCH 18/88] Fix build

---
 src/IO/tests/gtest_s3_uri.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/IO/tests/gtest_s3_uri.cpp b/src/IO/tests/gtest_s3_uri.cpp
index 7216c8077e3..c0bf7fcb28a 100644
--- a/src/IO/tests/gtest_s3_uri.cpp
+++ b/src/IO/tests/gtest_s3_uri.cpp
@@ -218,4 +218,5 @@ TEST(S3UriTest, versionIdChecks)
     }
 }
 
+}
 #endif

From 3f718626da61f0502115586425d728492d3a3ae3 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sat, 10 Aug 2024 19:55:11 +0200
Subject: [PATCH 19/88] Better test

---
 tests/queries/0_stateless/03221_s3_imds_decent_timeout.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/queries/0_stateless/03221_s3_imds_decent_timeout.sh b/tests/queries/0_stateless/03221_s3_imds_decent_timeout.sh
index fb55539d04a..021278955cd 100755
--- a/tests/queries/0_stateless/03221_s3_imds_decent_timeout.sh
+++ b/tests/queries/0_stateless/03221_s3_imds_decent_timeout.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Tags: no-fasttest
+# Tags: no-fasttest, no-asan, no-msan, no-tsan
 # ^ requires S3
 
 CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
@@ -10,7 +10,7 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 while true
 do
     # This host (likely) drops packets sent to it (does not reply), so it is good for testing timeouts.
-    # At the same time, we expect that google.com does not drop packets and quickly replies with 404, which is a non-retriable error for S3.
-    AWS_EC2_METADATA_SERVICE_ENDPOINT='https://10.255.255.255/' ${CLICKHOUSE_LOCAL} --time --query "SELECT * FROM s3('https://google.com/test')" |& grep -v -F 404 |
+    # At the same time, we expect that the clickhouse host does not drop packets and quickly replies with 4xx, which is a non-retriable error for S3.
+    AWS_EC2_METADATA_SERVICE_ENDPOINT='https://10.255.255.255/' ${CLICKHOUSE_LOCAL} --time --query "SELECT * FROM s3('${CLICKHOUSE_PORT_HTTP_PROTO}://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT_HTTP}/nonexistent')" |& grep -v -F 404 |
         ${CLICKHOUSE_LOCAL} --input-format TSV "SELECT c1::Float64 < 1 FROM table" | grep 1 && break
 done

From 7524b8f76712ce421fdebd1fe86c79128fea3ceb Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sat, 10 Aug 2024 19:55:22 +0200
Subject: [PATCH 20/88] A slight improvement

---
 src/Storages/StorageMergeTree.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp
index 78dbb72c199..f7701a2aab8 100644
--- a/src/Storages/StorageMergeTree.cpp
+++ b/src/Storages/StorageMergeTree.cpp
@@ -895,7 +895,7 @@ void StorageMergeTree::loadDeduplicationLog()
     std::string path = fs::path(relative_data_path) / "deduplication_logs";
 
     /// If either there is already a deduplication log, or we will be able to use it.
-    if (disk->exists(path) || !disk->isReadOnly())
+    if (!disk->isReadOnly() || disk->exists(path))
     {
         deduplication_log = std::make_unique<MergeTreeDeduplicationLog>(path, settings->non_replicated_deduplication_window, format_version, disk);
         deduplication_log->load();

From ee3eec0a2a15592b2020d34b74d9f595e707c092 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Sun, 11 Aug 2024 04:47:26 +0200
Subject: [PATCH 21/88] Update Credentials.cpp

---
 src/IO/S3/Credentials.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/IO/S3/Credentials.cpp b/src/IO/S3/Credentials.cpp
index 9c5f6547933..d6f7542da6b 100644
--- a/src/IO/S3/Credentials.cpp
+++ b/src/IO/S3/Credentials.cpp
@@ -785,6 +785,8 @@ S3CredentialsProviderChain::S3CredentialsProviderChain(
 
             /// EC2MetadataService throttles by delaying the response so the service client should set a large read timeout.
             /// EC2MetadataService delay is in order of seconds so it only make sense to retry after a couple of seconds.
+            /// But the connection timeout should be small because there is the case when there is no IMDS at all,
+            /// like outside of the cloud, on your own machines.
             aws_client_configuration.connectTimeoutMs = 10;
             aws_client_configuration.requestTimeoutMs = 1000;
 

From 967bd0566336f8c239f0045f703fc8fe428cb28f Mon Sep 17 00:00:00 2001
From: Peter Nguyen <petern0408@gmail.com>
Date: Sun, 11 Aug 2024 12:24:13 -0600
Subject: [PATCH 22/88] Add create_if_not_exists setting to Settings.h

---
 src/Core/Settings.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 6f24db57026..1b52df76c45 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -896,6 +896,7 @@ class IColumn;
     M(UInt64, extract_key_value_pairs_max_pairs_per_row, 1000, "Max number of pairs that can be produced by the `extractKeyValuePairs` function. Used as a safeguard against consuming too much memory.", 0) ALIAS(extract_kvp_max_pairs_per_row) \
     M(Bool, restore_replace_external_engines_to_null, false, "Replace all the external table engines to Null on restore. Useful for testing purposes", 0) \
     M(Bool, restore_replace_external_table_functions_to_null, false, "Replace all table functions to Null on restore. Useful for testing purposes", 0) \
+    M(Bool, create_if_not_exists, false, "Enable IF NOT EXISTS for CREATE statements by default", 0) \
     \
     \
     /* ###################################### */ \

From f90b8327bea16ee81c12f0210d2602889a5944bc Mon Sep 17 00:00:00 2001
From: Peter Nguyen <petern0408@gmail.com>
Date: Sun, 11 Aug 2024 11:43:57 -0600
Subject: [PATCH 23/88] Update SettingsChangesHistory.cpp with new
 create_if_not_exists settings

---
 src/Core/SettingsChangesHistory.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp
index 511723f1873..202b21a92f0 100644
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@@ -75,6 +75,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
     },
     {"24.8",
         {
+            {"create_if_not_exists", false, false, "New setting."},
             {"rows_before_aggregation", true, true, "Provide exact value for rows_before_aggregation statistic, represents the number of rows read before aggregation"},
             {"restore_replace_external_table_functions_to_null", false, false, "New setting."},
             {"restore_replace_external_engines_to_null", false, false, "New setting."},

From 868a1e78f2d919ed7c581c9a74102887dbd204bf Mon Sep 17 00:00:00 2001
From: Peter Nguyen <petern0408@gmail.com>
Date: Sun, 11 Aug 2024 11:47:30 -0600
Subject: [PATCH 24/88] Add support for new create_if_not_exists setting in
 InterpreterCreateQuery.cpp

---
 src/Interpreters/InterpreterCreateQuery.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp
index a101e5e8f09..d899b8e111e 100644
--- a/src/Interpreters/InterpreterCreateQuery.cpp
+++ b/src/Interpreters/InterpreterCreateQuery.cpp
@@ -1946,6 +1946,8 @@ BlockIO InterpreterCreateQuery::execute()
     FunctionNameNormalizer::visit(query_ptr.get());
     auto & create = query_ptr->as<ASTCreateQuery &>();
 
+    create.if_not_exists |= getContext()->getSettingsRef().create_if_not_exists;
+
     bool is_create_database = create.database && !create.table;
     if (!create.cluster.empty() && !maybeRemoveOnCluster(query_ptr, getContext()))
     {

From 2af1134c08ab164b2d77af854166fa30d96fddd9 Mon Sep 17 00:00:00 2001
From: Peter Nguyen <petern0408@gmail.com>
Date: Sun, 11 Aug 2024 12:17:24 -0600
Subject: [PATCH 25/88] Update settings.md docs with new create_if_not_exists
 settings

---
 docs/en/operations/settings/settings.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index e432f4e038f..22f73a03729 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -5637,3 +5637,9 @@ Possible values:
 - 1 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is enabled.
 
 Default value: `0`.
+
+## create_if_not_exists
+
+Enable IF NOT EXISTS for CREATE statements by default. If either this setting or IF NOT EXISTS is specified, then no Exception will be thrown when trying to create a new table.
+
+Default value: `false`.

From cc0412c55372108116b6b05fb4fc66ebd4eccae2 Mon Sep 17 00:00:00 2001
From: Peter Nguyen <petern0408@gmail.com>
Date: Sun, 11 Aug 2024 16:01:48 -0600
Subject: [PATCH 26/88] Add test 03221_create_if_not_exists_setting

---
 ...221_create_if_not_exists_setting.reference |  4 ++
 .../03221_create_if_not_exists_setting.sh     | 43 +++++++++++++++++++
 2 files changed, 47 insertions(+)
 create mode 100644 tests/queries/0_stateless/03221_create_if_not_exists_setting.reference
 create mode 100755 tests/queries/0_stateless/03221_create_if_not_exists_setting.sh

diff --git a/tests/queries/0_stateless/03221_create_if_not_exists_setting.reference b/tests/queries/0_stateless/03221_create_if_not_exists_setting.reference
new file mode 100644
index 00000000000..8740b05c9ca
--- /dev/null
+++ b/tests/queries/0_stateless/03221_create_if_not_exists_setting.reference
@@ -0,0 +1,4 @@
+57
+82
+0
+0
diff --git a/tests/queries/0_stateless/03221_create_if_not_exists_setting.sh b/tests/queries/0_stateless/03221_create_if_not_exists_setting.sh
new file mode 100755
index 00000000000..cfbe2eb8fd9
--- /dev/null
+++ b/tests/queries/0_stateless/03221_create_if_not_exists_setting.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+# $CLICKHOUSE_CLIENT -mn -q "SET create_if_not_exists=0;"  # Default
+$CLICKHOUSE_CLIENT -mn -q "
+DROP TABLE IF EXISTS example_table;
+CREATE TABLE example_table (id UInt32) ENGINE=MergeTree() ORDER BY id;
+CREATE TABLE example_table (id UInt32) ENGINE=MergeTree() ORDER BY id;
+" 2> /dev/null
+# ensure failed error code
+echo $?
+$CLICKHOUSE_CLIENT -mn -q "
+DROP DATABASE IF EXISTS example_database;
+CREATE DATABASE example_database;
+CREATE DATABASE example_database;
+" 2> /dev/null
+echo $?
+
+$CLICKHOUSE_CLIENT -mn -q "
+SET create_if_not_exists=1;
+DROP TABLE IF EXISTS example_table;
+CREATE TABLE example_table (id UInt32) ENGINE=MergeTree() ORDER BY id;
+CREATE TABLE example_table (id UInt32) ENGINE=MergeTree() ORDER BY id;
+"
+# ensure successful error code
+echo $?
+
+
+$CLICKHOUSE_CLIENT -mn -q "
+SET create_if_not_exists=1;
+DROP DATABASE IF EXISTS example_database;
+CREATE DATABASE example_database;
+CREATE DATABASE example_database;
+"
+echo $?
+
+$CLICKHOUSE_CLIENT -mn -q "
+DROP DATABASE example_database;
+DROP TABLE example_table;
+"
\ No newline at end of file

From d31c56e0d9756642d90885016369e9ca7994e3f0 Mon Sep 17 00:00:00 2001
From: Peter Nguyen <petern0408@gmail.com>
Date: Sun, 11 Aug 2024 23:17:24 -0600
Subject: [PATCH 27/88] Update 03221_create_if_not_exists_setting.sh

---
 .../queries/0_stateless/03221_create_if_not_exists_setting.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/queries/0_stateless/03221_create_if_not_exists_setting.sh b/tests/queries/0_stateless/03221_create_if_not_exists_setting.sh
index cfbe2eb8fd9..8dcde8977bc 100755
--- a/tests/queries/0_stateless/03221_create_if_not_exists_setting.sh
+++ b/tests/queries/0_stateless/03221_create_if_not_exists_setting.sh
@@ -38,6 +38,6 @@ CREATE DATABASE example_database;
 echo $?
 
 $CLICKHOUSE_CLIENT -mn -q "
-DROP DATABASE example_database;
-DROP TABLE example_table;
+DROP DATABASE IF EXISTS example_database;
+DROP TABLE IF EXISTS example_table;
 "
\ No newline at end of file

From 5c54c7025bd87f1e4239354b5f3c0adff188dd3a Mon Sep 17 00:00:00 2001
From: vdimir <vdimir@clickhouse.com>
Date: Mon, 12 Aug 2024 08:25:54 +0000
Subject: [PATCH 28/88] Followup for #56996

---
 src/Interpreters/HashJoin/HashJoinMethodsImpl.h |  4 ++--
 src/Interpreters/HashJoin/JoinFeatures.h        | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/Interpreters/HashJoin/HashJoinMethodsImpl.h b/src/Interpreters/HashJoin/HashJoinMethodsImpl.h
index aedd24630d1..39ba9fc6e93 100644
--- a/src/Interpreters/HashJoin/HashJoinMethodsImpl.h
+++ b/src/Interpreters/HashJoin/HashJoinMethodsImpl.h
@@ -354,8 +354,8 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumns(
         {
             if (unlikely(current_offset >= max_joined_block_rows))
             {
-                added_columns.offsets_to_replicate->resize_assume_reserved(i);
-                added_columns.filter.resize_assume_reserved(i);
+                added_columns.offsets_to_replicate->resize(i);
+                added_columns.filter.resize(i);
                 break;
             }
         }
diff --git a/src/Interpreters/HashJoin/JoinFeatures.h b/src/Interpreters/HashJoin/JoinFeatures.h
index b8de606c51e..b39593e7cac 100644
--- a/src/Interpreters/HashJoin/JoinFeatures.h
+++ b/src/Interpreters/HashJoin/JoinFeatures.h
@@ -18,11 +18,25 @@ struct JoinFeatures
     static constexpr bool inner = KIND == JoinKind::Inner;
     static constexpr bool full = KIND == JoinKind::Full;
 
+    /** Whether we may need to duplicate rows from the left table.
+      * For example, when we have row (key1, attr1) in left table
+      * and rows (key1, attr2), (key1, attr3) in right table,
+      * then we need to duplicate row (key1, attr1) for each of joined rows from right table, so result will be
+      * (key1, attr1, key1, attr2)
+      * (key1, attr1, key1, attr3)
+      */
     static constexpr bool need_replication = is_all_join || (is_any_join && right) || (is_semi_join && right);
+
+    /// Whether we need to filter rows from the left table that do not have matches in the right table.
     static constexpr bool need_filter = !need_replication && (inner || right || (is_semi_join && left) || (is_anti_join && left));
+
+    /// Whether we need to add default values for columns from the left table.
     static constexpr bool add_missing = (left || full) && !is_semi_join;
 
+    /// Whether we need to store flags for rows from the right table
+    /// that indicate if they have matches in the left table.
     static constexpr bool need_flags = MapGetter<KIND, STRICTNESS, std::is_same_v<std::decay_t<Map>, HashJoin::MapsAll>>::flagged;
+
     static constexpr bool is_maps_all = std::is_same_v<std::decay_t<Map>, HashJoin::MapsAll>;
 };
 

From 3172bf8d76534bb46ce54ae6af96e14443d2b59b Mon Sep 17 00:00:00 2001
From: Anton Popov <anton@clickhouse.com>
Date: Mon, 12 Aug 2024 12:23:32 +0000
Subject: [PATCH 29/88] better accounting of time for merge of projections

---
 .../Transforms/MergeJoinTransform.cpp         |  2 +-
 src/Storages/MergeTree/MergeTask.cpp          | 24 ++++++++++++++-----
 .../03221_merge_profile_events.reference      |  2 +-
 .../03221_merge_profile_events.sql            |  8 ++++---
 4 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/src/Processors/Transforms/MergeJoinTransform.cpp b/src/Processors/Transforms/MergeJoinTransform.cpp
index ec7f567ea57..6abfa0fccd0 100644
--- a/src/Processors/Transforms/MergeJoinTransform.cpp
+++ b/src/Processors/Transforms/MergeJoinTransform.cpp
@@ -1282,7 +1282,7 @@ MergeJoinTransform::MergeJoinTransform(
 
 void MergeJoinTransform::onFinish()
 {
-    algorithm.logElapsed(merging_elapsed_ns / 1000000000ULL);
+    algorithm.logElapsed(static_cast<double>(merging_elapsed_ns) / 1000000000ULL);
 }
 
 }
diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp
index 39bac8f7c24..cb1921ede2b 100644
--- a/src/Storages/MergeTree/MergeTask.cpp
+++ b/src/Storages/MergeTree/MergeTask.cpp
@@ -461,8 +461,12 @@ void MergeTask::addGatheringColumn(GlobalRuntimeContextPtr global_ctx, const Str
 
 MergeTask::StageRuntimeContextPtr MergeTask::ExecuteAndFinalizeHorizontalPart::getContextForNextStage()
 {
-    ProfileEvents::increment(ProfileEvents::MergeExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
-    ProfileEvents::increment(ProfileEvents::MergeHorizontalStageExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+    /// Do not increment for projection stage because time is already accounted in main task.
+    if (global_ctx->parent_part == nullptr)
+    {
+        ProfileEvents::increment(ProfileEvents::MergeExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+        ProfileEvents::increment(ProfileEvents::MergeHorizontalStageExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+    }
 
     auto new_ctx = std::make_shared<VerticalMergeRuntimeContext>();
 
@@ -481,8 +485,12 @@ MergeTask::StageRuntimeContextPtr MergeTask::ExecuteAndFinalizeHorizontalPart::g
 
 MergeTask::StageRuntimeContextPtr MergeTask::VerticalMergeStage::getContextForNextStage()
 {
-    ProfileEvents::increment(ProfileEvents::MergeExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
-    ProfileEvents::increment(ProfileEvents::MergeVerticalStageExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+    /// Do not increment for projection stage because time is already accounted in main task.
+    if (global_ctx->parent_part == nullptr)
+    {
+        ProfileEvents::increment(ProfileEvents::MergeExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+        ProfileEvents::increment(ProfileEvents::MergeVerticalStageExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+    }
 
     auto new_ctx = std::make_shared<MergeProjectionsRuntimeContext>();
     new_ctx->need_sync = std::move(ctx->need_sync);
@@ -1026,8 +1034,12 @@ bool MergeTask::execute()
     UInt64 stage_elapsed_ms = current_elapsed_ms - global_ctx->prev_elapsed_ms;
     global_ctx->prev_elapsed_ms = current_elapsed_ms;
 
-    ProfileEvents::increment(current_stage->getTotalTimeProfileEvent(), stage_elapsed_ms);
-    ProfileEvents::increment(ProfileEvents::MergeTotalMilliseconds, stage_elapsed_ms);
+    /// Do not increment for projection stage because time is already accounted in main task.
+    if (global_ctx->parent_part == nullptr)
+    {
+        ProfileEvents::increment(current_stage->getTotalTimeProfileEvent(), stage_elapsed_ms);
+        ProfileEvents::increment(ProfileEvents::MergeTotalMilliseconds, stage_elapsed_ms);
+    }
 
     auto next_stage_context = current_stage->getContextForNextStage();
 
diff --git a/tests/queries/0_stateless/03221_merge_profile_events.reference b/tests/queries/0_stateless/03221_merge_profile_events.reference
index 729e53eae79..d969717336b 100644
--- a/tests/queries/0_stateless/03221_merge_profile_events.reference
+++ b/tests/queries/0_stateless/03221_merge_profile_events.reference
@@ -1,3 +1,3 @@
 Horizontal	1	20000	3	0	480000	1	1	1	1
 Vertical	1	20000	1	2	480000	1	1	1	1	1	1
-Vertical	2	20020	4	2	480660	1	1	1	1	1	1	1	1
+Vertical	2	400000	2	6	12800000	1	1	1	1	1	1	1	1	1	1
diff --git a/tests/queries/0_stateless/03221_merge_profile_events.sql b/tests/queries/0_stateless/03221_merge_profile_events.sql
index 787aff93ffc..1aa3dd266f8 100644
--- a/tests/queries/0_stateless/03221_merge_profile_events.sql
+++ b/tests/queries/0_stateless/03221_merge_profile_events.sql
@@ -58,12 +58,12 @@ DROP TABLE IF EXISTS t_merge_profile_events_2;
 
 DROP TABLE IF EXISTS t_merge_profile_events_3;
 
-CREATE TABLE t_merge_profile_events_3 (id UInt64, v1 UInt64, v2 UInt64, PROJECTION p (SELECT sum(v1), sum(v2) GROUP BY id % 10))
+CREATE TABLE t_merge_profile_events_3 (id UInt64, v1 UInt64, v2 UInt64, PROJECTION p (SELECT v2, v2 * v2, v2 * 2, v2 * 10, v1 ORDER BY v1))
 ENGINE = MergeTree ORDER BY id
 SETTINGS min_bytes_for_wide_part = 0, vertical_merge_algorithm_min_rows_to_activate = 1, vertical_merge_algorithm_min_columns_to_activate = 1;
 
-INSERT INTO t_merge_profile_events_3 SELECT number, number, number FROM numbers(10000);
-INSERT INTO t_merge_profile_events_3 SELECT number, number, number FROM numbers(10000);
+INSERT INTO t_merge_profile_events_3 SELECT number, number, number FROM numbers(100000);
+INSERT INTO t_merge_profile_events_3 SELECT number, number, number FROM numbers(100000);
 
 OPTIMIZE TABLE t_merge_profile_events_3 FINAL;
 SYSTEM FLUSH LOGS;
@@ -83,6 +83,8 @@ SELECT
     ProfileEvents['MergeVerticalStageExecuteMilliseconds'] > 0,
     ProfileEvents['MergeProjectionStageTotalMilliseconds'] > 0,
     ProfileEvents['MergeProjectionStageExecuteMilliseconds'] > 0,
+    ProfileEvents['MergeExecuteMilliseconds'] <= duration_ms,
+    ProfileEvents['MergeTotalMilliseconds'] <= duration_ms
 FROM system.part_log WHERE database = currentDatabase() AND table = 't_merge_profile_events_3' AND event_type = 'MergeParts' AND part_name = 'all_1_2_1';
 
 DROP TABLE IF EXISTS t_merge_profile_events_3;

From 83d20bee00a4973cbffc3bcd9ba4073c79efb073 Mon Sep 17 00:00:00 2001
From: Peter Nguyen <petern0408@gmail.com>
Date: Mon, 12 Aug 2024 07:42:55 -0600
Subject: [PATCH 30/88] Update 03221_create_if_not_exists_setting test to a
 .sql test

---
 ...221_create_if_not_exists_setting.reference |  4 ---
 ...=> 03221_create_if_not_exists_setting.sql} | 32 ++++---------------
 2 files changed, 6 insertions(+), 30 deletions(-)
 rename tests/queries/0_stateless/{03221_create_if_not_exists_setting.sh => 03221_create_if_not_exists_setting.sql} (51%)
 mode change 100755 => 100644

diff --git a/tests/queries/0_stateless/03221_create_if_not_exists_setting.reference b/tests/queries/0_stateless/03221_create_if_not_exists_setting.reference
index 8740b05c9ca..e69de29bb2d 100644
--- a/tests/queries/0_stateless/03221_create_if_not_exists_setting.reference
+++ b/tests/queries/0_stateless/03221_create_if_not_exists_setting.reference
@@ -1,4 +0,0 @@
-57
-82
-0
-0
diff --git a/tests/queries/0_stateless/03221_create_if_not_exists_setting.sh b/tests/queries/0_stateless/03221_create_if_not_exists_setting.sql
old mode 100755
new mode 100644
similarity index 51%
rename from tests/queries/0_stateless/03221_create_if_not_exists_setting.sh
rename to tests/queries/0_stateless/03221_create_if_not_exists_setting.sql
index 8dcde8977bc..59535981e7a
--- a/tests/queries/0_stateless/03221_create_if_not_exists_setting.sh
+++ b/tests/queries/0_stateless/03221_create_if_not_exists_setting.sql
@@ -1,43 +1,23 @@
-#!/usr/bin/env bash
 
-CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
-# shellcheck source=../shell_config.sh
-. "$CURDIR"/../shell_config.sh
+SET create_if_not_exists=0;  -- Default
 
-# $CLICKHOUSE_CLIENT -mn -q "SET create_if_not_exists=0;"  # Default
-$CLICKHOUSE_CLIENT -mn -q "
 DROP TABLE IF EXISTS example_table;
 CREATE TABLE example_table (id UInt32) ENGINE=MergeTree() ORDER BY id;
-CREATE TABLE example_table (id UInt32) ENGINE=MergeTree() ORDER BY id;
-" 2> /dev/null
-# ensure failed error code
-echo $?
-$CLICKHOUSE_CLIENT -mn -q "
+CREATE TABLE example_table (id UInt32) ENGINE=MergeTree() ORDER BY id; -- { serverError TABLE_ALREADY_EXISTS }
+
 DROP DATABASE IF EXISTS example_database;
 CREATE DATABASE example_database;
-CREATE DATABASE example_database;
-" 2> /dev/null
-echo $?
+CREATE DATABASE example_database; -- { serverError DATABASE_ALREADY_EXISTS }
 
-$CLICKHOUSE_CLIENT -mn -q "
 SET create_if_not_exists=1;
+
 DROP TABLE IF EXISTS example_table;
 CREATE TABLE example_table (id UInt32) ENGINE=MergeTree() ORDER BY id;
 CREATE TABLE example_table (id UInt32) ENGINE=MergeTree() ORDER BY id;
-"
-# ensure successful error code
-echo $?
 
-
-$CLICKHOUSE_CLIENT -mn -q "
-SET create_if_not_exists=1;
 DROP DATABASE IF EXISTS example_database;
 CREATE DATABASE example_database;
 CREATE DATABASE example_database;
-"
-echo $?
 
-$CLICKHOUSE_CLIENT -mn -q "
 DROP DATABASE IF EXISTS example_database;
-DROP TABLE IF EXISTS example_table;
-"
\ No newline at end of file
+DROP TABLE IF EXISTS example_table;
\ No newline at end of file

From c817a4e8adfba37b23156ed75e1a501068e10cc1 Mon Sep 17 00:00:00 2001
From: Peter Nguyen <petern0408@gmail.com>
Date: Mon, 12 Aug 2024 07:45:51 -0600
Subject: [PATCH 31/88] Update settings.md to clarify create_if_not_exists
 behavior

Co-authored-by: Nikita Taranov <nickita.taranov@gmail.com>
---
 docs/en/operations/settings/settings.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index 22f73a03729..b9d5dde8522 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -5640,6 +5640,6 @@ Default value: `0`.
 
 ## create_if_not_exists
 
-Enable IF NOT EXISTS for CREATE statements by default. If either this setting or IF NOT EXISTS is specified, then no Exception will be thrown when trying to create a new table.
+Enable `IF NOT EXISTS` for `CREATE` statements by default. If either this setting or `IF NOT EXISTS` is specified and a table with the provided name already exists, no exception will be thrown.
 
 Default value: `false`.

From 897b8d5a88a69b8831ab489c2bea9d32d0cf06dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?=
 <benjamin.antal@clickhouse.com>
Date: Mon, 12 Aug 2024 15:21:01 +0000
Subject: [PATCH 32/88] Try to give more chances to `node2` to steal some work

---
 tests/integration/test_storage_s3_queue/test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py
index 8f197e09e61..00ef8499594 100644
--- a/tests/integration/test_storage_s3_queue/test.py
+++ b/tests/integration/test_storage_s3_queue/test.py
@@ -1300,7 +1300,7 @@ where zookeeper_path ilike '%{table_name}%' and status = 'Processed' and rows_pr
         pytest.param("unordered", 1),
         pytest.param("unordered", 8),
         pytest.param("ordered", 1),
-        pytest.param("ordered", 8),
+        pytest.param("ordered", 2),
     ],
 )
 def test_shards_distributed(started_cluster, mode, processing_threads):
@@ -1311,7 +1311,7 @@ def test_shards_distributed(started_cluster, mode, processing_threads):
     keeper_path = f"/clickhouse/test_{table_name}"
     files_path = f"{table_name}_data"
     files_to_generate = 300
-    row_num = 50
+    row_num = 300
     total_rows = row_num * files_to_generate
     shards_num = 2
 

From dccf34dc9565699b79242ee9e2c36a0e021e0f21 Mon Sep 17 00:00:00 2001
From: Anton Popov <anton@clickhouse.com>
Date: Mon, 12 Aug 2024 21:20:57 +0000
Subject: [PATCH 33/88] fix drift of profile event time

---
 src/Storages/MergeTree/MergeTask.cpp | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp
index cb1921ede2b..3aa4d764685 100644
--- a/src/Storages/MergeTree/MergeTask.cpp
+++ b/src/Storages/MergeTree/MergeTask.cpp
@@ -944,8 +944,13 @@ bool MergeTask::MergeProjectionsStage::finalizeProjectionsAndWholeMerge() const
 
 MergeTask::StageRuntimeContextPtr MergeTask::MergeProjectionsStage::getContextForNextStage()
 {
-    ProfileEvents::increment(ProfileEvents::MergeExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
-    ProfileEvents::increment(ProfileEvents::MergeProjectionStageExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+    /// Do not increment for projection stage because time is already accounted in main task.
+    /// The projection stage has its own empty projection stage which may add a drift of several milliseconds.
+    if (global_ctx->parent_part == nullptr)
+    {
+        ProfileEvents::increment(ProfileEvents::MergeExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+        ProfileEvents::increment(ProfileEvents::MergeProjectionStageExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);
+    }
 
     return nullptr;
 }
@@ -1034,6 +1039,8 @@ bool MergeTask::execute()
     UInt64 stage_elapsed_ms = current_elapsed_ms - global_ctx->prev_elapsed_ms;
     global_ctx->prev_elapsed_ms = current_elapsed_ms;
 
+    auto next_stage_context = current_stage->getContextForNextStage();
+
     /// Do not increment for projection stage because time is already accounted in main task.
     if (global_ctx->parent_part == nullptr)
     {
@@ -1041,8 +1048,6 @@ bool MergeTask::execute()
         ProfileEvents::increment(ProfileEvents::MergeTotalMilliseconds, stage_elapsed_ms);
     }
 
-    auto next_stage_context = current_stage->getContextForNextStage();
-
     /// Move to the next stage in an array of stages
     ++stages_iterator;
     if (stages_iterator == stages.end())

From 3b1d6f30bec5a8a568ab477e639d97c9c95a3f2c Mon Sep 17 00:00:00 2001
From: Alexey Milovidov <milovidov@clickhouse.com>
Date: Mon, 12 Aug 2024 23:55:01 +0200
Subject: [PATCH 34/88] Debug test

---
 .../0_stateless/02490_benchmark_max_consecutive_errors.sh        | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/queries/0_stateless/02490_benchmark_max_consecutive_errors.sh b/tests/queries/0_stateless/02490_benchmark_max_consecutive_errors.sh
index f747b3156a5..df7e9386662 100755
--- a/tests/queries/0_stateless/02490_benchmark_max_consecutive_errors.sh
+++ b/tests/queries/0_stateless/02490_benchmark_max_consecutive_errors.sh
@@ -11,5 +11,6 @@ if [ "$RES" -eq 10 ]
 then
     echo "$RES"
 else
+    echo "$RES"
     cat "${CLICKHOUSE_TMP}/${CLICKHOUSE_DATABASE}.log"
 fi

From 2a51b6c403c8f86fb9e68f358ca490630c40fec6 Mon Sep 17 00:00:00 2001
From: lgbo-ustc <lgbo.ustc@gmail.com>
Date: Tue, 13 Aug 2024 10:20:05 +0800
Subject: [PATCH 35/88] fix crash in lag/lead

---
 src/Processors/Transforms/WindowTransform.cpp |  6 ++++++
 .../03210_lag_lead_inframe_types.reference    | 16 +++++++++++++++
 .../03210_lag_lead_inframe_types.sql          | 20 +++++++++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp
index 85e6b2ec55e..cae817380e0 100644
--- a/src/Processors/Transforms/WindowTransform.cpp
+++ b/src/Processors/Transforms/WindowTransform.cpp
@@ -1159,6 +1159,12 @@ void WindowTransform::appendChunk(Chunk & chunk)
         {
             if (ws.window_function_impl)
                 block.casted_columns.push_back(ws.window_function_impl->castColumn(block.input_columns, ws.argument_column_indices));
+            else
+            {
+                /// `castColumn` returns nullptr by default, so it's OK to put nullptr as a placeholder here;
+                /// the placeholder should never actually be used.
+                block.casted_columns.push_back(nullptr);
+            }
 
             block.output_columns.push_back(ws.aggregate_function->getResultType()
                 ->createColumn());
diff --git a/tests/queries/0_stateless/03210_lag_lead_inframe_types.reference b/tests/queries/0_stateless/03210_lag_lead_inframe_types.reference
index d4734a85e72..4ecf7f56b07 100644
--- a/tests/queries/0_stateless/03210_lag_lead_inframe_types.reference
+++ b/tests/queries/0_stateless/03210_lag_lead_inframe_types.reference
@@ -38,3 +38,19 @@
 7
 8
 9
+15	\N	3	15	15	15	15
+14	\N	2	10	10	10	154
+13	\N	2	10	10	10	143
+12	\N	2	10	10	10	14
+11	\N	2	10	10	10	12
+10	\N	2	10	10	10	10
+9	\N	1	5	5	5	99
+8	\N	1	5	5	5	88
+7	\N	1	5	5	5	9
+6	\N	1	5	5	5	7
+5	\N	1	5	5	5	5
+4	\N	0	0	0	0	44
+3	\N	0	0	0	0	33
+2	\N	0	0	0	0	4
+1	\N	0	0	0	0	2
+0	\N	0	0	0	0	0
diff --git a/tests/queries/0_stateless/03210_lag_lead_inframe_types.sql b/tests/queries/0_stateless/03210_lag_lead_inframe_types.sql
index f6017ee6690..cc6746e428f 100644
--- a/tests/queries/0_stateless/03210_lag_lead_inframe_types.sql
+++ b/tests/queries/0_stateless/03210_lag_lead_inframe_types.sql
@@ -2,3 +2,23 @@ SELECT lagInFrame(2::UInt128, 2, number) OVER w FROM numbers(10) WINDOW w AS (OR
 SELECT leadInFrame(2::UInt128, 2, number) OVER w FROM numbers(10) WINDOW w AS (ORDER BY number);
 SELECT lagInFrame(2::UInt64, 2, number) OVER w FROM numbers(10) WINDOW w AS (ORDER BY number);
 SELECT leadInFrame(2::UInt64, 2, number) OVER w FROM numbers(10) WINDOW w AS (ORDER BY number);
+
+SELECT
+    number,
+    YYYYMMDDToDate(1, toLowCardinality(11), max(YYYYMMDDToDate(YYYYMMDDToDate(toLowCardinality(1), 11, materialize(NULL), 19700101.1, 1, 27, 7, materialize(toUInt256(37)), 9, 19, 9), 1, toUInt128(11), NULL, 19700101.1, 1, 27, 7, 37, 9, 19, 9), toUInt256(30)) IGNORE NULLS OVER w, NULL, 19700101.1, toNullable(1), 27, materialize(7), 37, 9, 19, 9),
+    p,
+    pp,
+    lagInFrame(number, number - pp) OVER w AS lag2,
+    lagInFrame(number, number - pp, number * 11) OVER w AS lag,
+    leadInFrame(number, number - pp, number * 11) OVER w AS lead
+FROM
+(
+    SELECT
+        number,
+        intDiv(number, 5) AS p,
+        p * 5 AS pp
+    FROM numbers(16)
+)
+WHERE toLowCardinality(1)
+WINDOW w AS (PARTITION BY p ORDER BY number ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)
+ORDER BY number DESC NULLS LAST;

From a0f617c6cc06cd80ecbb485965f6b7e7763be18c Mon Sep 17 00:00:00 2001
From: Azat Khuzhin <a3at.mail@gmail.com>
Date: Tue, 13 Aug 2024 10:14:37 +0200
Subject: [PATCH 36/88] tests: make 01600_parts_states_metrics_long better

- better bash
- HTTP protocol cannot handle multiple queries; fix this
- decrease number of retries (this should be ok after no-parallel) to
  print final debug info

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
---
 .../01600_parts_states_metrics_long.sh        | 43 +++++++++++--------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/tests/queries/0_stateless/01600_parts_states_metrics_long.sh b/tests/queries/0_stateless/01600_parts_states_metrics_long.sh
index a07dd306b3e..0a9f94cc451 100755
--- a/tests/queries/0_stateless/01600_parts_states_metrics_long.sh
+++ b/tests/queries/0_stateless/01600_parts_states_metrics_long.sh
@@ -11,33 +11,40 @@ function query()
     ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&database_atomic_wait_for_drop_and_detach_synchronously=1" -d "$*"
 }
 
-# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
-verify_sql="SELECT
-    (SELECT sumIf(value, metric = 'PartsActive'), sumIf(value, metric = 'PartsOutdated') FROM system.metrics)
-    = (SELECT sum(active), sum(NOT active) FROM
-    (SELECT active FROM system.parts UNION ALL SELECT active FROM system.projection_parts UNION ALL SELECT 1 FROM system.dropped_tables_parts))"
 
 # The query is not atomic - it can compare states between system.parts and system.metrics from different points in time.
 # So, there is inherent race condition. But it should get expected result eventually.
 # In case of test failure, this code will do infinite loop and timeout.
 verify()
 {
-    for i in {1..5000}
-    do
-        result=$( query "$verify_sql" )
-        [ "$result" = "1" ] && echo "$result" && break
-        sleep 0.1
+    local result
 
-        if [[ $i -eq 5000 ]]
-        then
-            query "
-              SELECT sumIf(value, metric = 'PartsActive'), sumIf(value, metric = 'PartsOutdated') FROM system.metrics;
-              SELECT sum(active), sum(NOT active) FROM system.parts;
-              SELECT sum(active), sum(NOT active) FROM system.projection_parts;
-              SELECT count() FROM system.dropped_tables_parts;
-            "
+    for _ in {1..100}; do
+        # NOTE: database = $CLICKHOUSE_DATABASE is unwanted
+        result=$( query "SELECT
+            (SELECT sumIf(value, metric = 'PartsActive'), sumIf(value, metric = 'PartsOutdated') FROM system.metrics)
+                =
+            (SELECT sum(active), sum(NOT active) FROM (
+                SELECT active FROM system.parts
+                UNION ALL SELECT active FROM system.projection_parts
+                UNION ALL SELECT 1 FROM system.dropped_tables_parts
+            ))"
+        )
+
+        if [ "$result" = "1" ]; then
+            echo "$result"
+            return
         fi
+
+        sleep 0.5
     done
+
+    $CLICKHOUSE_CLIENT -q "
+        SELECT sumIf(value, metric = 'PartsActive'), sumIf(value, metric = 'PartsOutdated') FROM system.metrics;
+        SELECT sum(active), sum(NOT active) FROM system.parts;
+        SELECT sum(active), sum(NOT active) FROM system.projection_parts;
+        SELECT count() FROM system.dropped_tables_parts;
+    "
 }
 
 query "DROP TABLE IF EXISTS test_table"

From 85bd63a2ac54c8665e99c1b07c4a5e0189212635 Mon Sep 17 00:00:00 2001
From: kevinyhzou <kevinyunhe8@gmail.com>
Date: Fri, 10 May 2024 12:18:06 +0800
Subject: [PATCH 37/88] rebase and resolve conflict

---
 src/Core/Settings.h                           |   1 +
 src/Core/SettingsChangesHistory.cpp           |   1 +
 src/Interpreters/HashJoin/AddedColumns.cpp    | 157 ++++++++++++------
 src/Interpreters/HashJoin/AddedColumns.h      |  28 +++-
 src/Interpreters/HashJoin/HashJoin.cpp        |   4 +-
 src/Interpreters/HashJoin/HashJoin.h          |  12 ++
 src/Interpreters/HashJoin/HashJoinMethods.h   |   3 +-
 .../HashJoin/HashJoinMethodsImpl.h            |  49 +++---
 src/Interpreters/HashJoin/KnowRowsHolder.h    |   9 +-
 src/Interpreters/RowRefs.cpp                  |   6 +-
 src/Interpreters/RowRefs.h                    |   7 +-
 src/Interpreters/TableJoin.cpp                |   1 +
 src/Interpreters/TableJoin.h                  |   2 +
 tests/performance/all_join_opt.xml            |  15 ++
 14 files changed, 206 insertions(+), 89 deletions(-)
 create mode 100644 tests/performance/all_join_opt.xml

diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 6f24db57026..e2740026e58 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -325,6 +325,7 @@ class IColumn;
     \
     M(Bool, join_use_nulls, false, "Use NULLs for non-joined rows of outer JOINs for types that can be inside Nullable. If false, use default value of corresponding columns data type.", IMPORTANT) \
     \
+    M(Int32, join_output_by_rowlist_perkey_rows_threshold, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join.", 0) \
     M(JoinStrictness, join_default_strictness, JoinStrictness::All, "Set default strictness in JOIN query. Possible values: empty string, 'ANY', 'ALL'. If empty, query without strictness will throw exception.", 0) \
     M(Bool, any_join_distinct_right_table_keys, false, "Enable old ANY JOIN logic with many-to-one left-to-right table keys mapping for all ANY JOINs. It leads to confusing not equal results for 't1 ANY LEFT JOIN t2' and 't2 ANY RIGHT JOIN t1'. ANY RIGHT JOIN needs one-to-many keys mapping to be consistent with LEFT one.", IMPORTANT) \
     M(Bool, single_join_prefer_left_table, true, "For single JOIN in case of identifier ambiguity prefer left table", IMPORTANT) \
diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp
index 511723f1873..cc9524bea2e 100644
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@@ -85,6 +85,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
             {"allow_experimental_time_series_table", false, false, "Added new setting to allow the TimeSeries table engine"},
             {"enable_analyzer", 1, 1, "Added an alias to a setting `allow_experimental_analyzer`."},
             {"optimize_functions_to_subcolumns", false, true, "Enabled settings by default"},
+            {"join_output_by_rowlist_perkey_rows_threshold", 0, 5, "The lower limit of per-key average rows in the right table to determine whether to output by row list in hash join."},
         }
     },
     {"24.7",
diff --git a/src/Interpreters/HashJoin/AddedColumns.cpp b/src/Interpreters/HashJoin/AddedColumns.cpp
index 930a352744d..21cb6e401ed 100644
--- a/src/Interpreters/HashJoin/AddedColumns.cpp
+++ b/src/Interpreters/HashJoin/AddedColumns.cpp
@@ -15,48 +15,115 @@ JoinOnKeyColumns::JoinOnKeyColumns(const Block & block, const Names & key_names_
 {
 }
 
-template<> void AddedColumns<false>::buildOutput()
-{
-}
+template<>
+void AddedColumns<false>::buildOutput() {}
+
+template<>
+void AddedColumns<false>::buildJoinGetOutput() {}
+
+template<>
+template<bool from_row_list>
+void AddedColumns<false>::buildOutputFromBlocks() {}
 
 template<>
 void AddedColumns<true>::buildOutput()
 {
-    for (size_t i = 0; i < this->size(); ++i)
+    if (!output_by_row_list)
+        buildOutputFromBlocks<false>();
+    else
     {
-        auto& col = columns[i];
-        size_t default_count = 0;
-        auto apply_default = [&]()
+        if (join_data_avg_perkey_rows < output_by_row_list_threshold)
+            buildOutputFromBlocks<true>();
+        else
         {
-            if (default_count > 0)
+            for (size_t i = 0; i < this->size(); ++i)
             {
-                JoinCommon::addDefaultValues(*col, type_name[i].type, default_count);
-                default_count = 0;
-            }
-        };
-
-        for (size_t j = 0; j < lazy_output.blocks.size(); ++j)
-        {
-            if (!lazy_output.blocks[j])
-            {
-                default_count++;
-                continue;
-            }
-            apply_default();
-            const auto & column_from_block = reinterpret_cast<const Block *>(lazy_output.blocks[j])->getByPosition(right_indexes[i]);
-            /// If it's joinGetOrNull, we need to wrap not-nullable columns in StorageJoin.
-            if (is_join_get)
-            {
-                if (auto * nullable_col = typeid_cast<ColumnNullable *>(col.get());
-                    nullable_col && !column_from_block.column->isNullable())
+                auto & col = columns[i];
+                for (auto row_ref_i : lazy_output.row_refs)
                 {
-                    nullable_col->insertFromNotNullable(*column_from_block.column, lazy_output.row_nums[j]);
-                    continue;
+                    if (row_ref_i)
+                    {
+                        const RowRefList * row_ref_list = reinterpret_cast<const RowRefList *>(row_ref_i);
+                        for (auto it = row_ref_list->begin(); it.ok(); ++it)
+                            col->insertFrom(*it->block->getByPosition(right_indexes[i]).column, it->row_num);
+                    }
+                    else
+                        type_name[i].type->insertDefaultInto(*col);
                 }
             }
-            col->insertFrom(*column_from_block.column, lazy_output.row_nums[j]);
         }
-        apply_default();
+    }
+}
+
+template<>
+void AddedColumns<true>::buildJoinGetOutput()
+{
+    for (size_t i = 0; i < this->size(); ++i)
+    {
+        auto & col = columns[i];
+        for (auto row_ref_i : lazy_output.row_refs)
+        {
+            if (!row_ref_i)
+            {
+                type_name[i].type->insertDefaultInto(*col);
+                continue;
+            }
+            const auto * row_ref = reinterpret_cast<const RowRef *>(row_ref_i);
+            const auto & column_from_block = row_ref->block->getByPosition(right_indexes[i]);
+            if (auto * nullable_col = typeid_cast<ColumnNullable *>(col.get()); nullable_col && !column_from_block.column->isNullable())
+                nullable_col->insertFromNotNullable(*column_from_block.column, row_ref->row_num);
+            else
+                col->insertFrom(*column_from_block.column, row_ref->row_num);
+        }
+    }
+}
+
+template<>
+template<bool from_row_list>
+void AddedColumns<true>::buildOutputFromBlocks()
+{
+    if (this->size() == 0)
+        return;
+    std::vector<const Block *> blocks;
+    std::vector<UInt32> row_nums;
+    blocks.reserve(lazy_output.row_refs.size());
+    row_nums.reserve(lazy_output.row_refs.size());
+    for (auto row_ref_i : lazy_output.row_refs)
+    {
+        if (row_ref_i)
+        {
+            if constexpr (from_row_list)
+            {
+                const RowRefList * row_ref_list = reinterpret_cast<const RowRefList *>(row_ref_i);
+                for (auto it = row_ref_list->begin(); it.ok(); ++it)
+                {
+                    blocks.emplace_back(it->block);
+                    row_nums.emplace_back(it->row_num);
+                }
+            }
+            else
+            {
+                const RowRef * row_ref = reinterpret_cast<const RowRefList *>(row_ref_i);
+                blocks.emplace_back(row_ref->block);
+                row_nums.emplace_back(row_ref->row_num);
+            }
+        }
+        else
+        {
+            blocks.emplace_back(nullptr);
+            row_nums.emplace_back(0);
+        }
+    }
+    for (size_t i = 0; i < this->size(); ++i)
+    {
+        auto & col = columns[i];
+        for (size_t j = 0; j < blocks.size(); ++j)
+        {
+            if (blocks[j])
+                col->insertFrom(*blocks[j]->getByPosition(right_indexes[i]).column, row_nums[j]);
+            else
+                type_name[i].type->insertDefaultInto(*col);
+        }
     }
 }
 
@@ -72,29 +139,27 @@ void AddedColumns<false>::applyLazyDefaults()
 }
 
 template<>
-void AddedColumns<true>::applyLazyDefaults()
-{
-}
+void AddedColumns<true>::applyLazyDefaults() {}
 
 template <>
-void AddedColumns<false>::appendFromBlock(const Block & block, size_t row_num,const bool has_defaults)
+void AddedColumns<false>::appendFromBlock(const RowRef * row_ref, const bool has_defaults)
 {
     if (has_defaults)
         applyLazyDefaults();
 
 #ifndef NDEBUG
-    checkBlock(block);
+    checkBlock(*row_ref->block);
 #endif
     if (is_join_get)
     {
         size_t right_indexes_size = right_indexes.size();
         for (size_t j = 0; j < right_indexes_size; ++j)
         {
-            const auto & column_from_block = block.getByPosition(right_indexes[j]);
+            const auto & column_from_block = row_ref->block->getByPosition(right_indexes[j]);
             if (auto * nullable_col = nullable_column_ptrs[j])
-                nullable_col->insertFromNotNullable(*column_from_block.column, row_num);
+                nullable_col->insertFromNotNullable(*column_from_block.column, row_ref->row_num);
             else
-                columns[j]->insertFrom(*column_from_block.column, row_num);
+                columns[j]->insertFrom(*column_from_block.column, row_ref->row_num);
         }
     }
     else
@@ -102,22 +167,21 @@ void AddedColumns<false>::appendFromBlock(const Block & block, size_t row_num,co
         size_t right_indexes_size = right_indexes.size();
         for (size_t j = 0; j < right_indexes_size; ++j)
         {
-            const auto & column_from_block = block.getByPosition(right_indexes[j]);
-            columns[j]->insertFrom(*column_from_block.column, row_num);
+            const auto & column_from_block = row_ref->block->getByPosition(right_indexes[j]);
+            columns[j]->insertFrom(*column_from_block.column, row_ref->row_num);
         }
     }
 }
 
 template <>
-void AddedColumns<true>::appendFromBlock(const Block & block, size_t row_num, bool)
+void AddedColumns<true>::appendFromBlock(const RowRef * row_ref, bool)
 {
 #ifndef NDEBUG
-    checkBlock(block);
+    checkBlock(*row_ref->block);
 #endif
     if (has_columns_to_add)
     {
-        lazy_output.blocks.emplace_back(reinterpret_cast<UInt64>(&block));
-        lazy_output.row_nums.emplace_back(static_cast<uint32_t>(row_num));
+        lazy_output.row_refs.emplace_back(reinterpret_cast<UInt64>(row_ref));
     }
 }
 template<>
@@ -131,8 +195,7 @@ void AddedColumns<true>::appendDefaultRow()
 {
     if (has_columns_to_add)
     {
-        lazy_output.blocks.emplace_back(0);
-        lazy_output.row_nums.emplace_back(0);
+        lazy_output.row_refs.emplace_back(0);
     }
 }
 }
diff --git a/src/Interpreters/HashJoin/AddedColumns.h b/src/Interpreters/HashJoin/AddedColumns.h
index 13a7df6f498..f1b95a63be6 100644
--- a/src/Interpreters/HashJoin/AddedColumns.h
+++ b/src/Interpreters/HashJoin/AddedColumns.h
@@ -50,8 +50,7 @@ public:
 
     struct LazyOutput
     {
-        PaddedPODArray<UInt64> blocks;
-        PaddedPODArray<UInt32> row_nums;
+        PaddedPODArray<UInt64> row_refs;
     };
 
     AddedColumns(
@@ -76,8 +75,7 @@ public:
         if constexpr (lazy)
         {
             has_columns_to_add = num_columns_to_add > 0;
-            lazy_output.blocks.reserve(rows_to_add);
-            lazy_output.row_nums.reserve(rows_to_add);
+            lazy_output.row_refs.reserve(rows_to_add);
         }
 
         columns.reserve(num_columns_to_add);
@@ -115,18 +113,22 @@ public:
             if (columns[j]->isNullable() && !saved_column->isNullable())
                 nullable_column_ptrs[j] = typeid_cast<ColumnNullable *>(columns[j].get());
         }
+        join_data_avg_perkey_rows = join.getJoinedData()->avgPerKeyRows();
+        output_by_row_list_threshold = join.getTableJoin().outputByRowListPerkeyRowsThreshold();
     }
 
     size_t size() const { return columns.size(); }
 
     void buildOutput();
 
+    void buildJoinGetOutput();
+
     ColumnWithTypeAndName moveColumn(size_t i)
     {
         return ColumnWithTypeAndName(std::move(columns[i]), type_name[i].type, type_name[i].qualified_name);
     }
 
-    void appendFromBlock(const Block & block, size_t row_num, bool has_default);
+    void appendFromBlock(const RowRef * row_ref, bool has_default);
 
     void appendDefaultRow();
 
@@ -134,6 +136,8 @@ public:
 
     const IColumn & leftAsofKey() const { return *left_asof_key; }
 
+    static constexpr bool isLazy() { return lazy; }
+
     Block left_block;
     std::vector<JoinOnKeyColumns> join_on_keys;
     ExpressionActionsPtr additional_filter_expression;
@@ -142,6 +146,9 @@ public:
     size_t rows_to_add;
     std::unique_ptr<IColumn::Offsets> offsets_to_replicate;
     bool need_filter = false;
+    bool output_by_row_list = false;
+    size_t join_data_avg_perkey_rows = 0;
+    size_t output_by_row_list_threshold = 0;
     IColumn::Filter filter;
 
     void reserve(bool need_replicate)
@@ -212,15 +219,22 @@ private:
         columns.back()->reserve(src_column.column->size());
         type_name.emplace_back(src_column.type, src_column.name, qualified_name);
     }
+
+    /** Build output from the blocks extracted from `RowRef` or `RowRefList`, to avoid block cache misses which may cause a performance slowdown.
+     *  This problem would happen if we built the output directly from `RowRef` or `RowRefList`.
+     */
+    template<bool from_row_list>
+    void buildOutputFromBlocks();
 };
 
 /// Adapter class to pass into addFoundRowAll
 /// In joinRightColumnsWithAdditionalFilter we don't want to add rows directly into AddedColumns,
 /// because they need to be filtered by additional_filter_expression.
-class PreSelectedRows : public std::vector<RowRef>
+class PreSelectedRows : public std::vector<const RowRef *>
 {
 public:
-    void appendFromBlock(const Block & block, size_t row_num, bool /* has_default */) { this->emplace_back(&block, row_num); }
+    void appendFromBlock(const RowRef * row_ref, bool /* has_default */) { this->emplace_back(row_ref); }
+    static constexpr bool isLazy() { return false; }
 };
 
 }
diff --git a/src/Interpreters/HashJoin/HashJoin.cpp b/src/Interpreters/HashJoin/HashJoin.cpp
index dd7d42de63e..9c07a71e614 100644
--- a/src/Interpreters/HashJoin/HashJoin.cpp
+++ b/src/Interpreters/HashJoin/HashJoin.cpp
@@ -495,7 +495,7 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits)
     }
 
     size_t rows = source_block.rows();
-
+    data->rows_to_join += rows;
     const auto & right_key_names = table_join->getAllNames(JoinTableSide::Right);
     ColumnPtrMap all_key_columns(right_key_names.size());
     for (const auto & column_name : right_key_names)
@@ -647,7 +647,7 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits)
             total_bytes = getTotalByteCount();
         }
     }
-
+    data->keys_to_join = total_rows;
     shrinkStoredBlocksToFit(total_bytes);
 
     return table_join->sizeLimits().check(total_rows, total_bytes, "JOIN", ErrorCodes::SET_SIZE_LIMIT_EXCEEDED);
diff --git a/src/Interpreters/HashJoin/HashJoin.h b/src/Interpreters/HashJoin/HashJoin.h
index 00f5ef6d214..d645b8e9273 100644
--- a/src/Interpreters/HashJoin/HashJoin.h
+++ b/src/Interpreters/HashJoin/HashJoin.h
@@ -345,6 +345,18 @@ public:
 
         size_t blocks_allocated_size = 0;
         size_t blocks_nullmaps_allocated_size = 0;
+
+        /// Number of rows of right table to join
+        size_t rows_to_join = 0;
+        /// Number of keys of right table to join
+        size_t keys_to_join = 0;
+
+        size_t avgPerKeyRows() const
+        {
+            if (keys_to_join == 0)
+                return 0;
+            return rows_to_join / keys_to_join;
+        }
     };
 
     using RightTableDataPtr = std::shared_ptr<RightTableData>;
diff --git a/src/Interpreters/HashJoin/HashJoinMethods.h b/src/Interpreters/HashJoin/HashJoinMethods.h
index 3b7a67467e3..97ad57d26ea 100644
--- a/src/Interpreters/HashJoin/HashJoinMethods.h
+++ b/src/Interpreters/HashJoin/HashJoinMethods.h
@@ -83,6 +83,7 @@ public:
         const Block & block_with_columns_to_add,
         const MapsTemplateVector & maps_,
         bool is_join_get = false);
+
 private:
     template <typename KeyGetter, bool is_asof_join>
     static KeyGetter createKeyGetter(const ColumnRawPtrs & key_columns, const Sizes & key_sizes);
@@ -128,7 +129,7 @@ private:
     template <typename AddedColumns>
     static ColumnPtr buildAdditionalFilter(
         size_t left_start_row,
-        const std::vector<RowRef> & selected_rows,
+        const std::vector<const RowRef *> & selected_rows,
         const std::vector<size_t> & row_replicate_offset,
         AddedColumns & added_columns);
 
diff --git a/src/Interpreters/HashJoin/HashJoinMethodsImpl.h b/src/Interpreters/HashJoin/HashJoinMethodsImpl.h
index aedd24630d1..0d90bad2d8a 100644
--- a/src/Interpreters/HashJoin/HashJoinMethodsImpl.h
+++ b/src/Interpreters/HashJoin/HashJoinMethodsImpl.h
@@ -95,7 +95,10 @@ Block HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinBlockImpl(
     added_columns.join_on_keys.clear();
     Block remaining_block = sliceBlock(block, num_joined);
 
-    added_columns.buildOutput();
+    if (is_join_get)
+        added_columns.buildJoinGetOutput();
+    else
+        added_columns.buildOutput();
     for (size_t i = 0; i < added_columns.size(); ++i)
         block.insert(added_columns.moveColumn(i));
 
@@ -339,6 +342,8 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumns(
     size_t rows = added_columns.rows_to_add;
     if constexpr (need_filter)
         added_columns.filter = IColumn::Filter(rows, 0);
+    if constexpr (!flag_per_row && (STRICTNESS == JoinStrictness::All || (STRICTNESS == JoinStrictness::Semi && KIND == JoinKind::Right)))
+        added_columns.output_by_row_list = true;
 
     Arena pool;
 
@@ -381,15 +386,15 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumns(
                     const IColumn & left_asof_key = added_columns.leftAsofKey();
 
                     auto row_ref = mapped->findAsof(left_asof_key, i);
-                    if (row_ref.block)
+                    if (row_ref && row_ref->block)
                     {
                         setUsed<need_filter>(added_columns.filter, i);
                         if constexpr (flag_per_row)
-                            used_flags.template setUsed<join_features.need_flags, flag_per_row>(row_ref.block, row_ref.row_num, 0);
+                            used_flags.template setUsed<join_features.need_flags, flag_per_row>(row_ref->block, row_ref->row_num, 0);
                         else
                             used_flags.template setUsed<join_features.need_flags, flag_per_row>(find_result);
 
-                        added_columns.appendFromBlock(*row_ref.block, row_ref.row_num, join_features.add_missing);
+                        added_columns.appendFromBlock(row_ref, join_features.add_missing);
                     }
                     else
                         addNotFoundRow<join_features.add_missing, join_features.need_replication>(added_columns, current_offset);
@@ -420,7 +425,7 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumns(
                     if (used_once)
                     {
                         setUsed<need_filter>(added_columns.filter, i);
-                        added_columns.appendFromBlock(*mapped.block, mapped.row_num, join_features.add_missing);
+                        added_columns.appendFromBlock(&mapped, join_features.add_missing);
                     }
 
                     break;
@@ -438,7 +443,7 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumns(
                 {
                     setUsed<need_filter>(added_columns.filter, i);
                     used_flags.template setUsed<join_features.need_flags, flag_per_row>(find_result);
-                    added_columns.appendFromBlock(*mapped.block, mapped.row_num, join_features.add_missing);
+                    added_columns.appendFromBlock(&mapped, join_features.add_missing);
 
                     if (join_features.is_any_or_semi_join)
                     {
@@ -477,7 +482,7 @@ template <JoinKind KIND, JoinStrictness STRICTNESS, typename MapsTemplate>
 template <typename AddedColumns>
 ColumnPtr HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::buildAdditionalFilter(
     size_t left_start_row,
-    const std::vector<RowRef> & selected_rows,
+    const std::vector<const RowRef *> & selected_rows,
     const std::vector<size_t> & row_replicate_offset,
     AddedColumns & added_columns)
 {
@@ -489,7 +494,7 @@ ColumnPtr HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::buildAdditionalFilter
             result_column = ColumnUInt8::create();
             break;
         }
-        const Block & sample_right_block = *selected_rows.begin()->block;
+        const Block & sample_right_block = *((*selected_rows.begin())->block);
         if (!sample_right_block || !added_columns.additional_filter_expression)
         {
             auto filter = ColumnUInt8::create();
@@ -519,8 +524,8 @@ ColumnPtr HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::buildAdditionalFilter
                 auto new_col = col.column->cloneEmpty();
                 for (const auto & selected_row : selected_rows)
                 {
-                    const auto & src_col = selected_row.block->getByPosition(right_col_pos);
-                    new_col->insertFrom(*src_col.column, selected_row.row_num);
+                    const auto & src_col = selected_row->block->getByPosition(right_col_pos);
+                    new_col->insertFrom(*src_col.column, selected_row->row_num);
                 }
                 executed_block.insert({std::move(new_col), col.type, col.name});
             }
@@ -700,26 +705,24 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumnsWithAddt
                                 {
                                     // For inner join, we need mark each right row'flag, because we only use each right row once.
                                     auto used_once = used_flags.template setUsedOnce<join_features.need_flags, true>(
-                                        selected_right_row_it->block, selected_right_row_it->row_num, 0);
+                                        (*selected_right_row_it)->block, (*selected_right_row_it)->row_num, 0);
                                     if (used_once)
                                     {
                                         any_matched = true;
                                         total_added_rows += 1;
-                                        added_columns.appendFromBlock(
-                                            *selected_right_row_it->block, selected_right_row_it->row_num, join_features.add_missing);
+                                        added_columns.appendFromBlock(*selected_right_row_it, join_features.add_missing);
                                     }
                                 }
                             }
                             else
                             {
                                 auto used_once = used_flags.template setUsedOnce<join_features.need_flags, true>(
-                                    selected_right_row_it->block, selected_right_row_it->row_num, 0);
+                                    (*selected_right_row_it)->block, (*selected_right_row_it)->row_num, 0);
                                 if (used_once)
                                 {
                                     any_matched = true;
                                     total_added_rows += 1;
-                                    added_columns.appendFromBlock(
-                                        *selected_right_row_it->block, selected_right_row_it->row_num, join_features.add_missing);
+                                    added_columns.appendFromBlock(*selected_right_row_it, join_features.add_missing);
                                 }
                             }
                         }
@@ -727,16 +730,14 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumnsWithAddt
                         {
                             any_matched = true;
                             if constexpr (join_features.right && join_features.need_flags)
-                                used_flags.template setUsed<true, true>(selected_right_row_it->block, selected_right_row_it->row_num, 0);
+                                used_flags.template setUsed<true, true>((*selected_right_row_it)->block, (*selected_right_row_it)->row_num, 0);
                         }
                         else
                         {
                             any_matched = true;
                             total_added_rows += 1;
-                            added_columns.appendFromBlock(
-                                *selected_right_row_it->block, selected_right_row_it->row_num, join_features.add_missing);
-                            used_flags.template setUsed<join_features.need_flags, true>(
-                                selected_right_row_it->block, selected_right_row_it->row_num, 0);
+                            added_columns.appendFromBlock(*selected_right_row_it, join_features.add_missing);
+                            used_flags.template setUsed<join_features.need_flags, true>((*selected_right_row_it)->block, (*selected_right_row_it)->row_num, 0);
                         }
                     }
 
@@ -756,8 +757,7 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumnsWithAddt
                         if (filter_flags[replicated_row])
                         {
                             any_matched = true;
-                            added_columns.appendFromBlock(
-                                *selected_right_row_it->block, selected_right_row_it->row_num, join_features.add_missing);
+                            added_columns.appendFromBlock(*selected_right_row_it, join_features.add_missing);
                             total_added_rows += 1;
                         }
                         ++selected_right_row_it;
@@ -767,8 +767,7 @@ size_t HashJoinMethods<KIND, STRICTNESS, MapsTemplate>::joinRightColumnsWithAddt
                         if (filter_flags[replicated_row])
                         {
                             any_matched = true;
-                            added_columns.appendFromBlock(
-                                *selected_right_row_it->block, selected_right_row_it->row_num, join_features.add_missing);
+                            added_columns.appendFromBlock(*selected_right_row_it, join_features.add_missing);
                             total_added_rows += 1;
                             selected_right_row_it = selected_right_row_it + row_replicate_offset[i] - replicated_row;
                             break;
diff --git a/src/Interpreters/HashJoin/KnowRowsHolder.h b/src/Interpreters/HashJoin/KnowRowsHolder.h
index d51c96893c5..9223e98d13c 100644
--- a/src/Interpreters/HashJoin/KnowRowsHolder.h
+++ b/src/Interpreters/HashJoin/KnowRowsHolder.h
@@ -104,7 +104,7 @@ void addFoundRowAll(
         {
             if (!known_rows.isKnown(std::make_pair(it->block, it->row_num)))
             {
-                added.appendFromBlock(*it->block, it->row_num, false);
+                added.appendFromBlock(*it, false);
                 ++current_offset;
                 if (!new_known_rows_ptr)
                 {
@@ -124,11 +124,16 @@ void addFoundRowAll(
             known_rows.add(std::cbegin(*new_known_rows_ptr), std::cend(*new_known_rows_ptr));
         }
     }
+    else if constexpr (AddedColumns::isLazy())
+    {
+        added.appendFromBlock(&mapped, false);
+        current_offset += mapped.rows;
+    }
     else
     {
         for (auto it = mapped.begin(); it.ok(); ++it)
         {
-            added.appendFromBlock(*it->block, it->row_num, false);
+            added.appendFromBlock(*it, false);
             ++current_offset;
         }
     }
diff --git a/src/Interpreters/RowRefs.cpp b/src/Interpreters/RowRefs.cpp
index 9785ba46dab..1b397ab56ef 100644
--- a/src/Interpreters/RowRefs.cpp
+++ b/src/Interpreters/RowRefs.cpp
@@ -144,7 +144,7 @@ public:
         return low;
     }
 
-    RowRef findAsof(const IColumn & asof_column, size_t row_num) override
+    RowRef * findAsof(const IColumn & asof_column, size_t row_num) override
     {
         sort();
 
@@ -156,10 +156,10 @@ public:
         if (pos != entries.size())
         {
             size_t row_ref_index = entries[pos].row_ref_index;
-            return row_refs[row_ref_index];
+            return &row_refs[row_ref_index];
         }
 
-        return {nullptr, 0};
+        return nullptr;
     }
 
 private:
diff --git a/src/Interpreters/RowRefs.h b/src/Interpreters/RowRefs.h
index 650b2311ba7..7c98c47dd11 100644
--- a/src/Interpreters/RowRefs.h
+++ b/src/Interpreters/RowRefs.h
@@ -122,7 +122,7 @@ struct RowRefList : RowRef
     };
 
     RowRefList() {} /// NOLINT
-    RowRefList(const Block * block_, size_t row_num_) : RowRef(block_, row_num_) {}
+    RowRefList(const Block * block_, size_t row_num_) : RowRef(block_, row_num_), rows(1) {}
 
     ForwardIterator begin() const { return ForwardIterator(this); }
 
@@ -135,8 +135,11 @@ struct RowRefList : RowRef
             *next = Batch(nullptr);
         }
         next = next->insert(std::move(row_ref), pool);
+        ++rows;
     }
 
+public:
+    SizeT rows = 0;
 private:
     Batch * next = nullptr;
 };
@@ -158,7 +161,7 @@ struct SortedLookupVectorBase
     virtual void insert(const IColumn &, const Block *, size_t) = 0;
 
     // This needs to be synchronized internally
-    virtual RowRef findAsof(const IColumn &, size_t) = 0;
+    virtual RowRef * findAsof(const IColumn &, size_t) = 0;
 };
 
 
diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp
index c8c926db13c..138085f0710 100644
--- a/src/Interpreters/TableJoin.cpp
+++ b/src/Interpreters/TableJoin.cpp
@@ -115,6 +115,7 @@ TableJoin::TableJoin(const Settings & settings, VolumePtr tmp_volume_, Temporary
     , partial_merge_join_left_table_buffer_bytes(settings.partial_merge_join_left_table_buffer_bytes)
     , max_files_to_merge(settings.join_on_disk_max_files_to_merge)
     , temporary_files_codec(settings.temporary_files_codec)
+    , output_by_rowlist_perkey_rows_threshold(settings.join_output_by_rowlist_perkey_rows_threshold)
     , max_memory_usage(settings.max_memory_usage)
     , tmp_volume(tmp_volume_)
     , tmp_data(tmp_data_)
diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h
index 3f2bebb5816..4d626084d81 100644
--- a/src/Interpreters/TableJoin.h
+++ b/src/Interpreters/TableJoin.h
@@ -148,6 +148,7 @@ private:
     const size_t partial_merge_join_left_table_buffer_bytes = 0;
     const size_t max_files_to_merge = 0;
     const String temporary_files_codec = "LZ4";
+    const size_t output_by_rowlist_perkey_rows_threshold = 0;
 
     /// Value if setting max_memory_usage for query, can be used when max_bytes_in_join is not specified.
     size_t max_memory_usage = 0;
@@ -295,6 +296,7 @@ public:
         return join_use_nulls && isRightOrFull(kind());
     }
 
+    size_t outputByRowListPerkeyRowsThreshold() const { return output_by_rowlist_perkey_rows_threshold; }
     size_t defaultMaxBytes() const { return default_max_bytes; }
     size_t maxJoinedBlockRows() const { return max_joined_block_rows; }
     size_t maxRowsInRightBlock() const { return partial_merge_join_rows_in_right_blocks; }
diff --git a/tests/performance/all_join_opt.xml b/tests/performance/all_join_opt.xml
new file mode 100644
index 00000000000..0ab9c39f67c
--- /dev/null
+++ b/tests/performance/all_join_opt.xml
@@ -0,0 +1,15 @@
+<test>
+    <create_query>CREATE TABLE test (a Int64, b String, c LowCardinality(String)) ENGINE = MergeTree() ORDER BY a</create_query>
+    <create_query>CREATE TABLE test1 (a Int64, b String, c LowCardinality(String)) ENGINE = MergeTree() ORDER BY a</create_query>
+
+    <fill_query>INSERT INTO test SELECT number % 10000, number % 10000, number % 10000 FROM numbers(10000000)</fill_query>
+    <fill_query>INSERT INTO test1 SELECT number % 1000 , number % 1000, number % 1000 FROM numbers(100000)</fill_query>
+
+    <query tag='INNER'>SELECT MAX(test1.a) FROM test INNER JOIN test1 on test.b = test1.b</query>
+    <query tag='LEFT'>SELECT MAX(test1.a) FROM test LEFT JOIN test1 on test.b = test1.b</query>
+    <query tag='RIGHT'>SELECT MAX(test1.a) FROM test RIGHT JOIN test1 on test.b = test1.b</query>
+    <query tag='FULL'>SELECT MAX(test1.a) FROM test FULL JOIN test1 on test.b = test1.b</query>
+
+    <drop_query>DROP TABLE IF EXISTS test</drop_query>
+    <drop_query>DROP TABLE IF EXISTS test1</drop_query>
+</test>
\ No newline at end of file

From fbe08cc24ca3f1b4472eba0960b14227917c0329 Mon Sep 17 00:00:00 2001
From: Peter Nguyen <petern0408@gmail.com>
Date: Tue, 13 Aug 2024 07:24:41 -0600
Subject: [PATCH 38/88] Add no-parallel flag to
 03221_create_if_not_exists_setting.sql

---
 tests/queries/0_stateless/03221_create_if_not_exists_setting.sql | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/queries/0_stateless/03221_create_if_not_exists_setting.sql b/tests/queries/0_stateless/03221_create_if_not_exists_setting.sql
index 59535981e7a..18b3ed7bcec 100644
--- a/tests/queries/0_stateless/03221_create_if_not_exists_setting.sql
+++ b/tests/queries/0_stateless/03221_create_if_not_exists_setting.sql
@@ -1,3 +1,4 @@
+-- Tags: no-parallel
 
 SET create_if_not_exists=0;  -- Default
 

From 04286bc270f9f473a07bc4ae27ae61d96256f775 Mon Sep 17 00:00:00 2001
From: Nikolai Kochetov <nk@clickhouse.com>
Date: Tue, 13 Aug 2024 14:45:05 +0000
Subject: [PATCH 39/88] Add status to PipelineExecutor. Verify status of pushing
 pipeline.

---
 programs/obfuscator/Obfuscator.cpp            |  2 +-
 src/Client/LocalConnection.cpp                |  8 +++-
 src/Interpreters/SystemLog.cpp                |  2 +-
 src/Processors/Executors/ExecutingGraph.cpp   | 19 ++++-----
 src/Processors/Executors/ExecutingGraph.h     | 11 ++++-
 src/Processors/Executors/PipelineExecutor.cpp | 31 +++++++++-----
 src/Processors/Executors/PipelineExecutor.h   | 20 +++++++++-
 .../PushingAsyncPipelineExecutor.cpp          | 21 +++++++---
 .../Executors/PushingAsyncPipelineExecutor.h  |  7 +++-
 .../Executors/PushingPipelineExecutor.cpp     | 40 ++++++++++++++-----
 .../Executors/PushingPipelineExecutor.h       |  9 +++--
 .../Transforms/CreatingSetsTransform.cpp      |  3 +-
 src/Server/GRPCServer.cpp                     |  3 +-
 src/Server/TCPHandler.cpp                     | 13 ++++--
 src/Storages/Distributed/DistributedSink.cpp  |  6 ++-
 src/Storages/StorageBuffer.cpp                |  3 +-
 src/Storages/tests/gtest_storage_log.cpp      |  2 +-
 ...221_insert_timeout_overflow_mode.reference |  2 +
 .../03221_insert_timeout_overflow_mode.sh     |  8 ++++
 19 files changed, 153 insertions(+), 57 deletions(-)
 create mode 100644 tests/queries/0_stateless/03221_insert_timeout_overflow_mode.reference
 create mode 100755 tests/queries/0_stateless/03221_insert_timeout_overflow_mode.sh

diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp
index 688ae1a1143..7c13215e350 100644
--- a/programs/obfuscator/Obfuscator.cpp
+++ b/programs/obfuscator/Obfuscator.cpp
@@ -1462,7 +1462,7 @@ try
         while (in_executor.pull(block))
         {
             Columns columns = obfuscator.generate(block.getColumns());
-            out_executor.push(header.cloneWithColumns(columns));
+            std::ignore = out_executor.push(header.cloneWithColumns(columns));
             processed_rows += block.rows();
             if (!silent)
                 std::cerr << "Processed " << processed_rows << " rows\n";
diff --git a/src/Client/LocalConnection.cpp b/src/Client/LocalConnection.cpp
index 072184e0a66..8f1e0958002 100644
--- a/src/Client/LocalConnection.cpp
+++ b/src/Client/LocalConnection.cpp
@@ -287,13 +287,17 @@ void LocalConnection::sendData(const Block & block, const String &, bool)
     if (!block)
         return;
 
+    bool inserted = false;
     if (state->pushing_async_executor)
-        state->pushing_async_executor->push(block);
+        inserted = state->pushing_async_executor->push(block);
     else if (state->pushing_executor)
-        state->pushing_executor->push(block);
+        inserted = state->pushing_executor->push(block);
     else
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown executor");
 
+    if (!inserted)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot send data");
+
     if (send_profile_events)
         sendProfileEvents();
 }
diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp
index 572481e6b12..0cad56af00a 100644
--- a/src/Interpreters/SystemLog.cpp
+++ b/src/Interpreters/SystemLog.cpp
@@ -556,7 +556,7 @@ void SystemLog<LogElement>::flushImpl(const std::vector<LogElement> & to_flush,
         PushingPipelineExecutor executor(io.pipeline);
 
         executor.start();
-        executor.push(block);
+        std::ignore = executor.push(block);
         executor.finish();
     }
     catch (...)
diff --git a/src/Processors/Executors/ExecutingGraph.cpp b/src/Processors/Executors/ExecutingGraph.cpp
index 6d5b60d8159..10470325bb8 100644
--- a/src/Processors/Executors/ExecutingGraph.cpp
+++ b/src/Processors/Executors/ExecutingGraph.cpp
@@ -96,7 +96,7 @@ bool ExecutingGraph::addEdges(uint64_t node)
     return was_edge_added;
 }
 
-bool ExecutingGraph::expandPipeline(std::stack<uint64_t> & stack, uint64_t pid)
+ExecutingGraph::UpdateNodeStatus ExecutingGraph::expandPipeline(std::stack<uint64_t> & stack, uint64_t pid)
 {
     auto & cur_node = *nodes[pid];
     Processors new_processors;
@@ -108,7 +108,7 @@ bool ExecutingGraph::expandPipeline(std::stack<uint64_t> & stack, uint64_t pid)
     catch (...)
     {
         cur_node.exception = std::current_exception();
-        return false;
+        return UpdateNodeStatus::Exception;
     }
 
     {
@@ -118,7 +118,7 @@ bool ExecutingGraph::expandPipeline(std::stack<uint64_t> & stack, uint64_t pid)
         {
             for (auto & processor : new_processors)
                 processor->cancel();
-            return false;
+            return UpdateNodeStatus::Cancelled;
         }
         processors->insert(processors->end(), new_processors.begin(), new_processors.end());
 
@@ -178,7 +178,7 @@ bool ExecutingGraph::expandPipeline(std::stack<uint64_t> & stack, uint64_t pid)
         }
     }
 
-    return true;
+    return UpdateNodeStatus::Done;
 }
 
 void ExecutingGraph::initializeExecution(Queue & queue)
@@ -213,7 +213,7 @@ void ExecutingGraph::initializeExecution(Queue & queue)
 }
 
 
-bool ExecutingGraph::updateNode(uint64_t pid, Queue & queue, Queue & async_queue)
+ExecutingGraph::UpdateNodeStatus ExecutingGraph::updateNode(uint64_t pid, Queue & queue, Queue & async_queue)
 {
     std::stack<Edge *> updated_edges;
     std::stack<uint64_t> updated_processors;
@@ -309,7 +309,7 @@ bool ExecutingGraph::updateNode(uint64_t pid, Queue & queue, Queue & async_queue
                 catch (...)
                 {
                     node.exception = std::current_exception();
-                    return false;
+                    return UpdateNodeStatus::Exception;
                 }
 
 #ifndef NDEBUG
@@ -386,8 +386,9 @@ bool ExecutingGraph::updateNode(uint64_t pid, Queue & queue, Queue & async_queue
                 read_lock.unlock();
                 {
                     std::unique_lock lock(nodes_mutex);
-                    if (!expandPipeline(updated_processors, pid))
-                        return false;
+                    auto status = expandPipeline(updated_processors, pid);
+                    if (status != UpdateNodeStatus::Done)
+                        return status;
                 }
                 read_lock.lock();
 
@@ -397,7 +398,7 @@ bool ExecutingGraph::updateNode(uint64_t pid, Queue & queue, Queue & async_queue
         }
     }
 
-    return true;
+    return UpdateNodeStatus::Done;
 }
 
 void ExecutingGraph::cancel(bool cancel_all_processors)
diff --git a/src/Processors/Executors/ExecutingGraph.h b/src/Processors/Executors/ExecutingGraph.h
index 71dcd360a2c..e1a6ac96203 100644
--- a/src/Processors/Executors/ExecutingGraph.h
+++ b/src/Processors/Executors/ExecutingGraph.h
@@ -138,10 +138,17 @@ public:
     /// Traverse graph the first time to update all the childless nodes.
     void initializeExecution(Queue & queue);
 
+    enum class UpdateNodeStatus
+    {
+        Done,
+        Exception,
+        Cancelled,
+    };
+
     /// Update processor with pid number (call IProcessor::prepare).
     /// Check parents and children of current processor and push them to stacks if they also need to be updated.
     /// If processor wants to be expanded, lock will be upgraded to get write access to pipeline.
-    bool updateNode(uint64_t pid, Queue & queue, Queue & async_queue);
+    UpdateNodeStatus updateNode(uint64_t pid, Queue & queue, Queue & async_queue);
 
     void cancel(bool cancel_all_processors = true);
 
@@ -155,7 +162,7 @@ private:
 
     /// Update graph after processor (pid) returned ExpandPipeline status.
     /// All new nodes and nodes with updated ports are pushed into stack.
-    bool expandPipeline(std::stack<uint64_t> & stack, uint64_t pid);
+    UpdateNodeStatus expandPipeline(std::stack<uint64_t> & stack, uint64_t pid);
 
     std::shared_ptr<Processors> processors;
     std::vector<bool> source_processors;
diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp
index 82cad471a29..23b3a6d9f5f 100644
--- a/src/Processors/Executors/PipelineExecutor.cpp
+++ b/src/Processors/Executors/PipelineExecutor.cpp
@@ -77,9 +77,9 @@ const Processors & PipelineExecutor::getProcessors() const
     return graph->getProcessors();
 }
 
-void PipelineExecutor::cancel()
+void PipelineExecutor::cancel(ExecutionStatus reason)
 {
-    cancelled = true;
+    tryUpdateExecutionStatus(ExecutionStatus::Executing, reason);
     finish();
     graph->cancel();
 }
@@ -98,6 +98,11 @@ void PipelineExecutor::finish()
     tasks.finish();
 }
 
+bool PipelineExecutor::tryUpdateExecutionStatus(ExecutionStatus expected, ExecutionStatus desired)
+{
+    return execution_status.compare_exchange_strong(expected, desired);
+}
+
 void PipelineExecutor::execute(size_t num_threads, bool concurrency_control)
 {
     checkTimeLimit();
@@ -120,7 +125,7 @@ void PipelineExecutor::execute(size_t num_threads, bool concurrency_control)
     }
     catch (...)
     {
-        span.addAttribute(ExecutionStatus::fromCurrentException());
+        span.addAttribute(DB::ExecutionStatus::fromCurrentException());
 
 #ifndef NDEBUG
         LOG_TRACE(log, "Exception while executing query. Current state:\n{}", dumpPipeline());
@@ -169,7 +174,7 @@ bool PipelineExecutor::checkTimeLimitSoft()
         // We call cancel here so that all processors are notified and tasks waken up
         // so that the "break" is faster and doesn't wait for long events
         if (!continuing)
-            cancel();
+            cancel(ExecutionStatus::CancelledByTimeout);
 
         return continuing;
     }
@@ -195,7 +200,8 @@ void PipelineExecutor::finalizeExecution()
 {
     checkTimeLimit();
 
-    if (cancelled)
+    auto status = execution_status.load();
+    if (status == ExecutionStatus::CancelledByTimeout || status == ExecutionStatus::CancelledByUser)
         return;
 
     bool all_processors_finished = true;
@@ -271,7 +277,7 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, std::atomic_bool * yie
                 break;
 
             if (!context.executeTask())
-                cancel();
+                cancel(ExecutionStatus::Exception);
 
             if (tasks.isFinished())
                 break;
@@ -289,11 +295,13 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, std::atomic_bool * yie
                 Queue async_queue;
 
                 /// Prepare processor after execution.
-                if (!graph->updateNode(context.getProcessorID(), queue, async_queue))
-                    cancel();
+                auto status = graph->updateNode(context.getProcessorID(), queue, async_queue);
+                if (status == ExecutingGraph::UpdateNodeStatus::Exception)
+                    cancel(ExecutionStatus::Exception);
 
                 /// Push other tasks to global queue.
-                tasks.pushTasks(queue, async_queue, context);
+                if (status == ExecutingGraph::UpdateNodeStatus::Done)
+                    tasks.pushTasks(queue, async_queue, context);
             }
 
 #ifndef NDEBUG
@@ -309,7 +317,7 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, std::atomic_bool * yie
             {
                 /// spawnThreads can throw an exception, for example CANNOT_SCHEDULE_TASK.
                 /// We should cancel execution properly before rethrow.
-                cancel();
+                cancel(ExecutionStatus::Exception);
                 throw;
             }
 
@@ -328,6 +336,7 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, std::atomic_bool * yie
 void PipelineExecutor::initializeExecution(size_t num_threads, bool concurrency_control)
 {
     is_execution_initialized = true;
+    tryUpdateExecutionStatus(ExecutionStatus::NotStarted, ExecutionStatus::Executing);
 
     size_t use_threads = num_threads;
 
@@ -393,7 +402,7 @@ void PipelineExecutor::executeImpl(size_t num_threads, bool concurrency_control)
         {
             /// If finished_flag is not set, there was an exception.
             /// Cancel execution in this case.
-            cancel();
+            cancel(ExecutionStatus::Exception);
             if (pool)
                 pool->wait();
         }
diff --git a/src/Processors/Executors/PipelineExecutor.h b/src/Processors/Executors/PipelineExecutor.h
index ae119355cb5..79d0a29d4e1 100644
--- a/src/Processors/Executors/PipelineExecutor.h
+++ b/src/Processors/Executors/PipelineExecutor.h
@@ -48,8 +48,20 @@ public:
 
     const Processors & getProcessors() const;
 
+    enum class ExecutionStatus
+    {
+        NotStarted,
+        Executing,
+        Finished,
+        Exception,
+        CancelledByUser,
+        CancelledByTimeout,
+    };
+
     /// Cancel execution. May be called from another thread.
-    void cancel();
+    void cancel() { cancel(ExecutionStatus::CancelledByUser); }
+
+    ExecutionStatus getExecutionStatus() const { return execution_status.load(); }
 
     /// Cancel processors which only read data from source. May be called from another thread.
     void cancelReading();
@@ -81,7 +93,7 @@ private:
     /// system.opentelemetry_span_log
     bool trace_processors = false;
 
-    std::atomic_bool cancelled = false;
+    std::atomic<ExecutionStatus> execution_status = ExecutionStatus::NotStarted;
     std::atomic_bool cancelled_reading = false;
 
     LoggerPtr log = getLogger("PipelineExecutor");
@@ -105,6 +117,10 @@ private:
     void executeStepImpl(size_t thread_num, std::atomic_bool * yield_flag = nullptr);
     void executeSingleThread(size_t thread_num);
     void finish();
+    void cancel(ExecutionStatus reason);
+
+    /// If execution_status == expected, change it to desired. Returns true if the status was changed.
+    bool tryUpdateExecutionStatus(ExecutionStatus expected, ExecutionStatus desired);
 
     String dumpPipeline() const;
 };
diff --git a/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp b/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp
index 830a96533ed..db5cf451c9e 100644
--- a/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp
+++ b/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp
@@ -176,7 +176,17 @@ void PushingAsyncPipelineExecutor::start()
     data->thread = ThreadFromGlobalPool(std::move(func));
 }
 
-void PushingAsyncPipelineExecutor::push(Chunk chunk)
+static void checkExecutionStatus(PipelineExecutor::ExecutionStatus status)
+{
+    if (status == PipelineExecutor::ExecutionStatus::CancelledByTimeout
+        || status == PipelineExecutor::ExecutionStatus::CancelledByUser)
+        return;
+
+    throw Exception(ErrorCodes::LOGICAL_ERROR,
+        "Pipeline for PushingPipelineExecutor was finished before all data was inserted");
+}
+
+bool PushingAsyncPipelineExecutor::push(Chunk chunk)
 {
     if (!started)
         start();
@@ -185,13 +195,14 @@ void PushingAsyncPipelineExecutor::push(Chunk chunk)
     data->rethrowExceptionIfHas();
 
     if (!is_pushed)
-        throw Exception(ErrorCodes::LOGICAL_ERROR,
-                        "Pipeline for PushingAsyncPipelineExecutor was finished before all data was inserted");
+        checkExecutionStatus(data->executor->getExecutionStatus());
+
+    return is_pushed;
 }
 
-void PushingAsyncPipelineExecutor::push(Block block)
+bool PushingAsyncPipelineExecutor::push(Block block)
 {
-    push(Chunk(block.getColumns(), block.rows()));
+    return push(Chunk(block.getColumns(), block.rows()));
 }
 
 void PushingAsyncPipelineExecutor::finish()
diff --git a/src/Processors/Executors/PushingAsyncPipelineExecutor.h b/src/Processors/Executors/PushingAsyncPipelineExecutor.h
index f976cd4c339..7835aaf596f 100644
--- a/src/Processors/Executors/PushingAsyncPipelineExecutor.h
+++ b/src/Processors/Executors/PushingAsyncPipelineExecutor.h
@@ -36,8 +36,11 @@ public:
 
     void start();
 
-    void push(Chunk chunk);
-    void push(Block block);
+    /// Return 'true' if push was successful.
+    /// Return 'false' if pipline was cancelled without exception.
+    /// This may happen in case of timeout_overflow_mode = 'break' OR internal bug.
+    [[nodiscard]] bool push(Chunk chunk);
+    [[nodiscard]] bool push(Block block);
 
     void finish();
 
diff --git a/src/Processors/Executors/PushingPipelineExecutor.cpp b/src/Processors/Executors/PushingPipelineExecutor.cpp
index 696932932df..3133cfd9a1e 100644
--- a/src/Processors/Executors/PushingPipelineExecutor.cpp
+++ b/src/Processors/Executors/PushingPipelineExecutor.cpp
@@ -80,36 +80,56 @@ const Block & PushingPipelineExecutor::getHeader() const
     return pushing_source->getPort().getHeader();
 }
 
+static void checkExecutionStatus(PipelineExecutor::ExecutionStatus status)
+{
+    if (status == PipelineExecutor::ExecutionStatus::CancelledByTimeout
+        || status == PipelineExecutor::ExecutionStatus::CancelledByUser)
+        return;
 
-void PushingPipelineExecutor::start()
+    throw Exception(ErrorCodes::LOGICAL_ERROR,
+        "Pipeline for PushingPipelineExecutor was finished before all data was inserted");
+}
+
+bool PushingPipelineExecutor::start()
 {
     if (started)
-        return;
+        return true;
 
     started = true;
     executor = std::make_shared<PipelineExecutor>(pipeline.processors, pipeline.process_list_element);
     executor->setReadProgressCallback(pipeline.getReadProgressCallback());
 
     if (!executor->executeStep(&input_wait_flag))
-        throw Exception(ErrorCodes::LOGICAL_ERROR,
-                        "Pipeline for PushingPipelineExecutor was finished before all data was inserted");
+    {
+        checkExecutionStatus(executor->getExecutionStatus());
+        return false;
+    }
+
+    return true;
 }
 
-void PushingPipelineExecutor::push(Chunk chunk)
+bool PushingPipelineExecutor::push(Chunk chunk)
 {
     if (!started)
-        start();
+    {
+        if (!start())
+            return false;
+    }
 
     pushing_source->setData(std::move(chunk));
 
     if (!executor->executeStep(&input_wait_flag))
-        throw Exception(ErrorCodes::LOGICAL_ERROR,
-                        "Pipeline for PushingPipelineExecutor was finished before all data was inserted");
+    {
+        checkExecutionStatus(executor->getExecutionStatus());
+        return false;
+    }
+
+    return true;
 }
 
-void PushingPipelineExecutor::push(Block block)
+bool PushingPipelineExecutor::push(Block block)
 {
-    push(Chunk(block.getColumns(), block.rows()));
+    return push(Chunk(block.getColumns(), block.rows()));
 }
 
 void PushingPipelineExecutor::finish()
diff --git a/src/Processors/Executors/PushingPipelineExecutor.h b/src/Processors/Executors/PushingPipelineExecutor.h
index f549c9482db..4021f61fb6b 100644
--- a/src/Processors/Executors/PushingPipelineExecutor.h
+++ b/src/Processors/Executors/PushingPipelineExecutor.h
@@ -35,10 +35,13 @@ public:
     /// Get structure of returned block or chunk.
     const Block & getHeader() const;
 
-    void start();
+    bool start();
 
-    void push(Chunk chunk);
-    void push(Block block);
+    /// Return 'true' if push was successful.
+    /// Return 'false' if pipline was cancelled without exception.
+    /// This may happen in case of timeout_overflow_mode = 'break' OR internal bug.
+    [[nodiscard]] bool push(Chunk chunk);
+    [[nodiscard]] bool push(Block block);
 
     void finish();
 
diff --git a/src/Processors/Transforms/CreatingSetsTransform.cpp b/src/Processors/Transforms/CreatingSetsTransform.cpp
index eeb8f4a6060..857233ac028 100644
--- a/src/Processors/Transforms/CreatingSetsTransform.cpp
+++ b/src/Processors/Transforms/CreatingSetsTransform.cpp
@@ -215,7 +215,8 @@ void CreatingSetsTransform::consume(Chunk chunk)
     if (!done_with_table)
     {
         block = materializeBlock(block);
-        executor->push(block);
+        if (!executor->push(block))
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot insert into a table");
 
         rows_to_transfer += block.rows();
         bytes_to_transfer += block.bytes();
diff --git a/src/Server/GRPCServer.cpp b/src/Server/GRPCServer.cpp
index d8a4d7f0e1f..c261d76ef33 100644
--- a/src/Server/GRPCServer.cpp
+++ b/src/Server/GRPCServer.cpp
@@ -1012,7 +1012,8 @@ namespace
         while (pipeline_executor->pull(block))
         {
             if (block)
-                executor.push(block);
+                if (!executor.push(block))
+                    throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot send data");
         }
 
         if (isQueryCancelled())
diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp
index 448dfafbd9d..283b60b533c 100644
--- a/src/Server/TCPHandler.cpp
+++ b/src/Server/TCPHandler.cpp
@@ -932,12 +932,18 @@ void TCPHandler::processInsertQuery()
         executor.start();
 
         if (processed_data)
-            executor.push(std::move(processed_data));
+        {
+            if (!executor.push(std::move(processed_data)))
+                throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled");
+        }
         else
             startInsertQuery();
 
         while (readDataNext())
-            executor.push(std::move(state.block_for_insert));
+        {
+            if (!executor.push(std::move(state.block_for_insert)))
+                throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled");
+        }
 
         if (state.cancellation_status == CancellationStatus::FULLY_CANCELLED)
             executor.cancel();
@@ -2034,7 +2040,8 @@ bool TCPHandler::receiveData(bool scalar)
         QueryPipeline temporary_table_out(storage->write(ASTPtr(), metadata_snapshot, query_context, /*async_insert=*/false));
         PushingPipelineExecutor executor(temporary_table_out);
         executor.start();
-        executor.push(block);
+        if (!executor.push(block))
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot insert into temporary table");
         executor.finish();
     }
     else if (state.need_receive_data_for_input)
diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp
index e3e73e42096..69f9e5b9380 100644
--- a/src/Storages/Distributed/DistributedSink.cpp
+++ b/src/Storages/Distributed/DistributedSink.cpp
@@ -89,7 +89,8 @@ static void writeBlockConvert(PushingPipelineExecutor & executor, const Block &
 {
     Block adopted_block = adoptBlock(executor.getHeader(), block, log);
     for (size_t i = 0; i < repeats; ++i)
-        executor.push(adopted_block);
+        if (!executor.push(adopted_block))
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot send data");
 }
 
 
@@ -408,7 +409,8 @@ DistributedSink::runWritingJob(JobReplica & job, const Block & current_block, si
             CurrentMetrics::Increment metric_increment{CurrentMetrics::DistributedSend};
 
             Block adopted_shard_block = adoptBlock(job.executor->getHeader(), shard_block, log);
-            job.executor->push(adopted_shard_block);
+            if (!job.executor->push(adopted_shard_block))
+                throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot send data");
         }
         else // local
         {
diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp
index f753d369d2d..3223a2813a3 100644
--- a/src/Storages/StorageBuffer.cpp
+++ b/src/Storages/StorageBuffer.cpp
@@ -1069,7 +1069,8 @@ void StorageBuffer::writeBlockToDestination(const Block & block, StoragePtr tabl
     auto block_io = interpreter.execute();
     PushingPipelineExecutor executor(block_io.pipeline);
     executor.start();
-    executor.push(std::move(block_to_write));
+    if (!executor.push(std::move(block_to_write)))
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "StorageBuffer could not write data to destination table");
     executor.finish();
 }
 
diff --git a/src/Storages/tests/gtest_storage_log.cpp b/src/Storages/tests/gtest_storage_log.cpp
index d75f3616f21..60890337cb4 100644
--- a/src/Storages/tests/gtest_storage_log.cpp
+++ b/src/Storages/tests/gtest_storage_log.cpp
@@ -98,7 +98,7 @@ std::string writeData(int rows, DB::StoragePtr & table, const DB::ContextPtr con
     QueryPipeline pipeline(table->write({}, metadata_snapshot, context, /*async_insert=*/false));
 
     PushingPipelineExecutor executor(pipeline);
-    executor.push(block);
+    std::ignore = executor.push(block);
     executor.finish();
 
     return data;
diff --git a/tests/queries/0_stateless/03221_insert_timeout_overflow_mode.reference b/tests/queries/0_stateless/03221_insert_timeout_overflow_mode.reference
new file mode 100644
index 00000000000..68538c3f75b
--- /dev/null
+++ b/tests/queries/0_stateless/03221_insert_timeout_overflow_mode.reference
@@ -0,0 +1,2 @@
+QUERY_WAS_CANCELLED
+QUERY_WAS_CANCELLED
diff --git a/tests/queries/0_stateless/03221_insert_timeout_overflow_mode.sh b/tests/queries/0_stateless/03221_insert_timeout_overflow_mode.sh
new file mode 100755
index 00000000000..030c5211b2d
--- /dev/null
+++ b/tests/queries/0_stateless/03221_insert_timeout_overflow_mode.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+
+CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CUR_DIR"/../shell_config.sh
+
+${CLICKHOUSE_CLIENT} --query "create table null_t (number UInt64) engine = Null;"
+${CLICKHOUSE_CLIENT} --query "select sleep(0.1) from system.numbers settings max_block_size = 1 format Native" 2>/dev/null | ${CLICKHOUSE_CLIENT} --max_execution_time = 0.3 --timeout_overflow_mode = 'break' --query "insert into null_t format Native" 2>&1 | grep -o "QUERY_WAS_CANCELLED"

From c5ae139c972d46d1e0bfa6ab5f165a049b6786f5 Mon Sep 17 00:00:00 2001
From: Nikolai Kochetov <nk@clickhouse.com>
Date: Tue, 13 Aug 2024 15:18:07 +0000
Subject: [PATCH 40/88] Cleanup.

---
 programs/obfuscator/Obfuscator.cpp            |  2 +-
 src/Client/LocalConnection.cpp                |  8 ++---
 src/Interpreters/SystemLog.cpp                |  2 +-
 .../PushingAsyncPipelineExecutor.cpp          | 15 ++++----
 .../Executors/PushingAsyncPipelineExecutor.h  |  7 ++--
 .../Executors/PushingPipelineExecutor.cpp     | 34 ++++++-------------
 .../Executors/PushingPipelineExecutor.h       |  9 ++---
 .../Transforms/CreatingSetsTransform.cpp      |  3 +-
 src/Server/GRPCServer.cpp                     |  3 +-
 src/Server/TCPHandler.cpp                     | 13 ++-----
 src/Storages/Distributed/DistributedSink.cpp  |  6 ++--
 src/Storages/StorageBuffer.cpp                |  3 +-
 src/Storages/tests/gtest_storage_log.cpp      |  2 +-
 13 files changed, 36 insertions(+), 71 deletions(-)

diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp
index 7c13215e350..688ae1a1143 100644
--- a/programs/obfuscator/Obfuscator.cpp
+++ b/programs/obfuscator/Obfuscator.cpp
@@ -1462,7 +1462,7 @@ try
         while (in_executor.pull(block))
         {
             Columns columns = obfuscator.generate(block.getColumns());
-            std::ignore = out_executor.push(header.cloneWithColumns(columns));
+            out_executor.push(header.cloneWithColumns(columns));
             processed_rows += block.rows();
             if (!silent)
                 std::cerr << "Processed " << processed_rows << " rows\n";
diff --git a/src/Client/LocalConnection.cpp b/src/Client/LocalConnection.cpp
index 8f1e0958002..072184e0a66 100644
--- a/src/Client/LocalConnection.cpp
+++ b/src/Client/LocalConnection.cpp
@@ -287,17 +287,13 @@ void LocalConnection::sendData(const Block & block, const String &, bool)
     if (!block)
         return;
 
-    bool inserted = false;
     if (state->pushing_async_executor)
-        inserted = state->pushing_async_executor->push(block);
+        state->pushing_async_executor->push(block);
     else if (state->pushing_executor)
-        inserted = state->pushing_executor->push(block);
+        state->pushing_executor->push(block);
     else
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown executor");
 
-    if (!inserted)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot send data");
-
     if (send_profile_events)
         sendProfileEvents();
 }
diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp
index 0cad56af00a..572481e6b12 100644
--- a/src/Interpreters/SystemLog.cpp
+++ b/src/Interpreters/SystemLog.cpp
@@ -556,7 +556,7 @@ void SystemLog<LogElement>::flushImpl(const std::vector<LogElement> & to_flush,
         PushingPipelineExecutor executor(io.pipeline);
 
         executor.start();
-        std::ignore = executor.push(block);
+        executor.push(block);
         executor.finish();
     }
     catch (...)
diff --git a/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp b/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp
index db5cf451c9e..866d224a08d 100644
--- a/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp
+++ b/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp
@@ -15,6 +15,7 @@ namespace DB
 namespace ErrorCodes
 {
     extern const int LOGICAL_ERROR;
+    extern const int QUERY_WAS_CANCELLED;
 }
 
 class PushingAsyncSource : public ISource
@@ -176,17 +177,17 @@ void PushingAsyncPipelineExecutor::start()
     data->thread = ThreadFromGlobalPool(std::move(func));
 }
 
-static void checkExecutionStatus(PipelineExecutor::ExecutionStatus status)
+[[noreturn]] static void throwOnExecutionStatus(PipelineExecutor::ExecutionStatus status)
 {
     if (status == PipelineExecutor::ExecutionStatus::CancelledByTimeout
         || status == PipelineExecutor::ExecutionStatus::CancelledByUser)
-        return;
+        throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled");
 
     throw Exception(ErrorCodes::LOGICAL_ERROR,
         "Pipeline for PushingPipelineExecutor was finished before all data was inserted");
 }
 
-bool PushingAsyncPipelineExecutor::push(Chunk chunk)
+void PushingAsyncPipelineExecutor::push(Chunk chunk)
 {
     if (!started)
         start();
@@ -195,14 +196,12 @@ bool PushingAsyncPipelineExecutor::push(Chunk chunk)
     data->rethrowExceptionIfHas();
 
     if (!is_pushed)
-        checkExecutionStatus(data->executor->getExecutionStatus());
-
-    return is_pushed;
+        throwOnExecutionStatus(data->executor->getExecutionStatus());
 }
 
-bool PushingAsyncPipelineExecutor::push(Block block)
+void PushingAsyncPipelineExecutor::push(Block block)
 {
-    return push(Chunk(block.getColumns(), block.rows()));
+    push(Chunk(block.getColumns(), block.rows()));
 }
 
 void PushingAsyncPipelineExecutor::finish()
diff --git a/src/Processors/Executors/PushingAsyncPipelineExecutor.h b/src/Processors/Executors/PushingAsyncPipelineExecutor.h
index 7835aaf596f..f976cd4c339 100644
--- a/src/Processors/Executors/PushingAsyncPipelineExecutor.h
+++ b/src/Processors/Executors/PushingAsyncPipelineExecutor.h
@@ -36,11 +36,8 @@ public:
 
     void start();
 
-    /// Return 'true' if push was successful.
-    /// Return 'false' if pipline was cancelled without exception.
-    /// This may happen in case of timeout_overflow_mode = 'break' OR internal bug.
-    [[nodiscard]] bool push(Chunk chunk);
-    [[nodiscard]] bool push(Block block);
+    void push(Chunk chunk);
+    void push(Block block);
 
     void finish();
 
diff --git a/src/Processors/Executors/PushingPipelineExecutor.cpp b/src/Processors/Executors/PushingPipelineExecutor.cpp
index 3133cfd9a1e..7a1c0111a3a 100644
--- a/src/Processors/Executors/PushingPipelineExecutor.cpp
+++ b/src/Processors/Executors/PushingPipelineExecutor.cpp
@@ -11,6 +11,7 @@ namespace DB
 namespace ErrorCodes
 {
     extern const int LOGICAL_ERROR;
+    extern const int QUERY_WAS_CANCELLED;
 }
 
 class PushingSource : public ISource
@@ -80,56 +81,43 @@ const Block & PushingPipelineExecutor::getHeader() const
     return pushing_source->getPort().getHeader();
 }
 
-static void checkExecutionStatus(PipelineExecutor::ExecutionStatus status)
+[[noreturn]] static void throwOnExecutionStatus(PipelineExecutor::ExecutionStatus status)
 {
     if (status == PipelineExecutor::ExecutionStatus::CancelledByTimeout
         || status == PipelineExecutor::ExecutionStatus::CancelledByUser)
-        return;
+        throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled");
 
     throw Exception(ErrorCodes::LOGICAL_ERROR,
         "Pipeline for PushingPipelineExecutor was finished before all data was inserted");
 }
 
-bool PushingPipelineExecutor::start()
+void PushingPipelineExecutor::start()
 {
     if (started)
-        return true;
+        return;
 
     started = true;
     executor = std::make_shared<PipelineExecutor>(pipeline.processors, pipeline.process_list_element);
     executor->setReadProgressCallback(pipeline.getReadProgressCallback());
 
     if (!executor->executeStep(&input_wait_flag))
-    {
-        checkExecutionStatus(executor->getExecutionStatus());
-        return false;
-    }
-
-    return true;
+        throwOnExecutionStatus(executor->getExecutionStatus());
 }
 
-bool PushingPipelineExecutor::push(Chunk chunk)
+void PushingPipelineExecutor::push(Chunk chunk)
 {
     if (!started)
-    {
-        if (!start())
-            return false;
-    }
+        start();
 
     pushing_source->setData(std::move(chunk));
 
     if (!executor->executeStep(&input_wait_flag))
-    {
-        checkExecutionStatus(executor->getExecutionStatus());
-        return false;
-    }
-
-    return true;
+        throwOnExecutionStatus(executor->getExecutionStatus());
 }
 
-bool PushingPipelineExecutor::push(Block block)
+void PushingPipelineExecutor::push(Block block)
 {
-    return push(Chunk(block.getColumns(), block.rows()));
+    push(Chunk(block.getColumns(), block.rows()));
 }
 
 void PushingPipelineExecutor::finish()
diff --git a/src/Processors/Executors/PushingPipelineExecutor.h b/src/Processors/Executors/PushingPipelineExecutor.h
index 4021f61fb6b..f549c9482db 100644
--- a/src/Processors/Executors/PushingPipelineExecutor.h
+++ b/src/Processors/Executors/PushingPipelineExecutor.h
@@ -35,13 +35,10 @@ public:
     /// Get structure of returned block or chunk.
     const Block & getHeader() const;
 
-    bool start();
+    void start();
 
-    /// Return 'true' if push was successful.
-    /// Return 'false' if pipline was cancelled without exception.
-    /// This may happen in case of timeout_overflow_mode = 'break' OR internal bug.
-    [[nodiscard]] bool push(Chunk chunk);
-    [[nodiscard]] bool push(Block block);
+    void push(Chunk chunk);
+    void push(Block block);
 
     void finish();
 
diff --git a/src/Processors/Transforms/CreatingSetsTransform.cpp b/src/Processors/Transforms/CreatingSetsTransform.cpp
index 857233ac028..eeb8f4a6060 100644
--- a/src/Processors/Transforms/CreatingSetsTransform.cpp
+++ b/src/Processors/Transforms/CreatingSetsTransform.cpp
@@ -215,8 +215,7 @@ void CreatingSetsTransform::consume(Chunk chunk)
     if (!done_with_table)
     {
         block = materializeBlock(block);
-        if (!executor->push(block))
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot insert into a table");
+        executor->push(block);
 
         rows_to_transfer += block.rows();
         bytes_to_transfer += block.bytes();
diff --git a/src/Server/GRPCServer.cpp b/src/Server/GRPCServer.cpp
index c261d76ef33..d8a4d7f0e1f 100644
--- a/src/Server/GRPCServer.cpp
+++ b/src/Server/GRPCServer.cpp
@@ -1012,8 +1012,7 @@ namespace
         while (pipeline_executor->pull(block))
         {
             if (block)
-                if (!executor.push(block))
-                    throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot send data");
+                executor.push(block);
         }
 
         if (isQueryCancelled())
diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp
index 283b60b533c..448dfafbd9d 100644
--- a/src/Server/TCPHandler.cpp
+++ b/src/Server/TCPHandler.cpp
@@ -932,18 +932,12 @@ void TCPHandler::processInsertQuery()
         executor.start();
 
         if (processed_data)
-        {
-            if (!executor.push(std::move(processed_data)))
-                throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled");
-        }
+            executor.push(std::move(processed_data));
         else
             startInsertQuery();
 
         while (readDataNext())
-        {
-            if (!executor.push(std::move(state.block_for_insert)))
-                throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled");
-        }
+            executor.push(std::move(state.block_for_insert));
 
         if (state.cancellation_status == CancellationStatus::FULLY_CANCELLED)
             executor.cancel();
@@ -2040,8 +2034,7 @@ bool TCPHandler::receiveData(bool scalar)
         QueryPipeline temporary_table_out(storage->write(ASTPtr(), metadata_snapshot, query_context, /*async_insert=*/false));
         PushingPipelineExecutor executor(temporary_table_out);
         executor.start();
-        if (!executor.push(block))
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot insert into temporary table");
+        executor.push(block);
         executor.finish();
     }
     else if (state.need_receive_data_for_input)
diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp
index 69f9e5b9380..e3e73e42096 100644
--- a/src/Storages/Distributed/DistributedSink.cpp
+++ b/src/Storages/Distributed/DistributedSink.cpp
@@ -89,8 +89,7 @@ static void writeBlockConvert(PushingPipelineExecutor & executor, const Block &
 {
     Block adopted_block = adoptBlock(executor.getHeader(), block, log);
     for (size_t i = 0; i < repeats; ++i)
-        if (!executor.push(adopted_block))
-            throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot send data");
+        executor.push(adopted_block);
 }
 
 
@@ -409,8 +408,7 @@ DistributedSink::runWritingJob(JobReplica & job, const Block & current_block, si
             CurrentMetrics::Increment metric_increment{CurrentMetrics::DistributedSend};
 
             Block adopted_shard_block = adoptBlock(job.executor->getHeader(), shard_block, log);
-            if (!job.executor->push(adopted_shard_block))
-                throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot send data");
+            job.executor->push(adopted_shard_block);
         }
         else // local
         {
diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp
index 3223a2813a3..f753d369d2d 100644
--- a/src/Storages/StorageBuffer.cpp
+++ b/src/Storages/StorageBuffer.cpp
@@ -1069,8 +1069,7 @@ void StorageBuffer::writeBlockToDestination(const Block & block, StoragePtr tabl
     auto block_io = interpreter.execute();
     PushingPipelineExecutor executor(block_io.pipeline);
     executor.start();
-    if (!executor.push(std::move(block_to_write)))
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "StorageBuffer could not write data to destination table");
+    executor.push(std::move(block_to_write));
     executor.finish();
 }
 
diff --git a/src/Storages/tests/gtest_storage_log.cpp b/src/Storages/tests/gtest_storage_log.cpp
index 60890337cb4..d75f3616f21 100644
--- a/src/Storages/tests/gtest_storage_log.cpp
+++ b/src/Storages/tests/gtest_storage_log.cpp
@@ -98,7 +98,7 @@ std::string writeData(int rows, DB::StoragePtr & table, const DB::ContextPtr con
     QueryPipeline pipeline(table->write({}, metadata_snapshot, context, /*async_insert=*/false));
 
     PushingPipelineExecutor executor(pipeline);
-    std::ignore = executor.push(block);
+    executor.push(block);
     executor.finish();
 
     return data;

From 94cc37a39f53f884cc7c6cd81a76c41bb2ea8565 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Tue, 13 Aug 2024 11:49:42 +0000
Subject: [PATCH 41/88] Remove robin-map submodule

At some point, usearch stopped using robin-map.
---
 .gitmodules                            | 3 ---
 contrib/CMakeLists.txt                 | 3 +--
 contrib/robin-map                      | 1 -
 contrib/robin-map-cmake/CMakeLists.txt | 1 -
 contrib/usearch-cmake/CMakeLists.txt   | 2 --
 5 files changed, 1 insertion(+), 9 deletions(-)
 delete mode 160000 contrib/robin-map
 delete mode 100644 contrib/robin-map-cmake/CMakeLists.txt

diff --git a/.gitmodules b/.gitmodules
index 0a66031de8d..cdee6a43ad8 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -345,9 +345,6 @@
 [submodule "contrib/FP16"]
 	path = contrib/FP16
 	url = https://github.com/Maratyszcza/FP16.git
-[submodule "contrib/robin-map"]
-	path = contrib/robin-map
-	url = https://github.com/Tessil/robin-map.git
 [submodule "contrib/aklomp-base64"]
 	path = contrib/aklomp-base64
 	url = https://github.com/aklomp/base64.git
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index dc2ad2a3150..d7489bc5c0e 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -209,9 +209,8 @@ endif()
 option(ENABLE_USEARCH "Enable USearch" ${ENABLE_LIBRARIES})
 if (ENABLE_USEARCH)
     add_contrib (FP16-cmake FP16)
-    add_contrib (robin-map-cmake robin-map)
     add_contrib (SimSIMD-cmake SimSIMD)
-    add_contrib (usearch-cmake usearch) # requires: FP16, robin-map, SimdSIMD
+    add_contrib (usearch-cmake usearch) # requires: FP16, SimdSIMD
 else ()
     message(STATUS "Not using USearch")
 endif ()
diff --git a/contrib/robin-map b/contrib/robin-map
deleted file mode 160000
index 851a59e0e30..00000000000
--- a/contrib/robin-map
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 851a59e0e3063ee0e23089062090a73fd3de482d
diff --git a/contrib/robin-map-cmake/CMakeLists.txt b/contrib/robin-map-cmake/CMakeLists.txt
deleted file mode 100644
index f82ad705dcc..00000000000
--- a/contrib/robin-map-cmake/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-# See contrib/usearch-cmake/CMakeLists.txt
diff --git a/contrib/usearch-cmake/CMakeLists.txt b/contrib/usearch-cmake/CMakeLists.txt
index 6be622275ae..83221e3810f 100644
--- a/contrib/usearch-cmake/CMakeLists.txt
+++ b/contrib/usearch-cmake/CMakeLists.txt
@@ -1,5 +1,4 @@
 set(FP16_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/FP16")
-set(ROBIN_MAP_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/robin-map")
 set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD")
 set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch")
 
@@ -7,7 +6,6 @@ add_library(_usearch INTERFACE)
 
 target_include_directories(_usearch SYSTEM INTERFACE
     ${FP16_PROJECT_DIR}/include
-    ${ROBIN_MAP_PROJECT_DIR}/include
     ${SIMSIMD_PROJECT_DIR}/include
     ${USEARCH_PROJECT_DIR}/include)
 

From 5ca85674e6fd51633c8fa636bed71002dd2cd281 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Mon, 12 Aug 2024 19:29:04 +0000
Subject: [PATCH 42/88] Bump usearch to 2.3.2

---
 contrib/SimSIMD | 2 +-
 contrib/usearch | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/SimSIMD b/contrib/SimSIMD
index de2cb75b9e9..c98e4635f3c 160000
--- a/contrib/SimSIMD
+++ b/contrib/SimSIMD
@@ -1 +1 @@
-Subproject commit de2cb75b9e9e3389d5e1e51fd9f8ed151f3c17cf
+Subproject commit c98e4635f3cca9e33918fe1bdca23571162e0c28
diff --git a/contrib/usearch b/contrib/usearch
index 30810452bec..65b5d178f05 160000
--- a/contrib/usearch
+++ b/contrib/usearch
@@ -1 +1 @@
-Subproject commit 30810452bec5d3d3aa0931bb5d761e2f09aa6356
+Subproject commit 65b5d178f053d21480796d214b6ca04172d854a4

From 98c18eb341481d3daf610282c58960eb89188960 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Mon, 12 Aug 2024 19:50:53 +0000
Subject: [PATCH 43/88] Bump usearch to 2.4.1

---
 contrib/usearch                                          | 2 +-
 .../MergeTree/MergeTreeIndexVectorSimilarity.cpp         | 9 +++++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/contrib/usearch b/contrib/usearch
index 65b5d178f05..e811aa8c1d0 160000
--- a/contrib/usearch
+++ b/contrib/usearch
@@ -1 +1 @@
-Subproject commit 65b5d178f053d21480796d214b6ca04172d854a4
+Subproject commit e811aa8c1d07dfb3725e05fedb550f91fe44a324
diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
index 5b0793fa0c8..083311a6602 100644
--- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
@@ -95,9 +95,14 @@ USearchIndexWithSerialization::USearchIndexWithSerialization(
     unum::usearch::metric_kind_t metric_kind,
     unum::usearch::scalar_kind_t scalar_kind,
     UsearchHnswParams usearch_hnsw_params)
-    : Base(Base::make(unum::usearch::metric_punned_t(dimensions, metric_kind, scalar_kind),
-                      unum::usearch::index_dense_config_t(usearch_hnsw_params.m, usearch_hnsw_params.ef_construction, usearch_hnsw_params.ef_search)))
 {
+    unum::usearch::metric_punned_t metric(dimensions, metric_kind, scalar_kind);
+
+    unum::usearch::index_dense_config_t config(usearch_hnsw_params.m, usearch_hnsw_params.ef_construction, usearch_hnsw_params.ef_search);
+    config.enable_key_lookups = false; /// we don't do row-to-vector lookups
+
+    USearchIndex usearch_index = USearchIndex::make(metric, config);
+    swap(usearch_index);
 }
 
 void USearchIndexWithSerialization::serialize(WriteBuffer & ostr) const

From dbe66e6092f6a6e232108f5b586253d0a017f66a Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Tue, 13 Aug 2024 09:47:17 +0000
Subject: [PATCH 44/88] Bump usearch to 2.5.1

---
 contrib/usearch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/usearch b/contrib/usearch
index e811aa8c1d0..f2b4bff52b7 160000
--- a/contrib/usearch
+++ b/contrib/usearch
@@ -1 +1 @@
-Subproject commit e811aa8c1d07dfb3725e05fedb550f91fe44a324
+Subproject commit f2b4bff52b74a0bf33067bc034ba68bb785753ee

From 383d2816e66c1ad67c9f4b962b52efd2cb8d1b53 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Tue, 13 Aug 2024 09:56:11 +0000
Subject: [PATCH 45/88] Bump usearch to 2.6.1

---
 contrib/usearch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/usearch b/contrib/usearch
index f2b4bff52b7..a7bc711dfb9 160000
--- a/contrib/usearch
+++ b/contrib/usearch
@@ -1 +1 @@
-Subproject commit f2b4bff52b74a0bf33067bc034ba68bb785753ee
+Subproject commit a7bc711dfb9e5665a1aee89d3a0297a211f2b97d

From 88f2d2e67df5acbca8c4df45be00ab23d42e090c Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Tue, 13 Aug 2024 10:00:12 +0000
Subject: [PATCH 46/88] Bump usearch to v2.7.8

---
 contrib/SimSIMD                                           | 2 +-
 contrib/usearch                                           | 2 +-
 src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/contrib/SimSIMD b/contrib/SimSIMD
index c98e4635f3c..8f2c8881e44 160000
--- a/contrib/SimSIMD
+++ b/contrib/SimSIMD
@@ -1 +1 @@
-Subproject commit c98e4635f3cca9e33918fe1bdca23571162e0c28
+Subproject commit 8f2c8881e440a55cfea246996984662623b4d5dd
diff --git a/contrib/usearch b/contrib/usearch
index a7bc711dfb9..b58cdb4025b 160000
--- a/contrib/usearch
+++ b/contrib/usearch
@@ -1 +1 @@
-Subproject commit a7bc711dfb9e5665a1aee89d3a0297a211f2b97d
+Subproject commit b58cdb4025b68b55800dcc9f36fa33b43c003a7e
diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
index 083311a6602..346f69140bb 100644
--- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
@@ -364,7 +364,7 @@ std::vector<size_t> MergeTreeIndexConditionVectorSimilarity::getUsefulRanges(Mer
     ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, result.visited_members);
     ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, result.computed_distances);
 
-    std::vector<USearchIndex::key_t> neighbors(result.size()); /// indexes of dots which were closest to the reference vector
+    std::vector<USearchIndex::vector_key_t> neighbors(result.size()); /// indexes of dots which were closest to the reference vector
     std::vector<USearchIndex::distance_t> distances(result.size());
     result.dump_to(neighbors.data(), distances.data());
 

From fe7da4e7d1f9f76c9bb1d13fc0baa01433f069a6 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Tue, 13 Aug 2024 10:23:12 +0000
Subject: [PATCH 47/88] Bump usearch to 2.8.16

---
 contrib/SimSIMD | 2 +-
 contrib/usearch | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/SimSIMD b/contrib/SimSIMD
index 8f2c8881e44..fed0b4f8ec6 160000
--- a/contrib/SimSIMD
+++ b/contrib/SimSIMD
@@ -1 +1 @@
-Subproject commit 8f2c8881e440a55cfea246996984662623b4d5dd
+Subproject commit fed0b4f8ec6c1fb75d47e554ae8ca9188fc068f4
diff --git a/contrib/usearch b/contrib/usearch
index b58cdb4025b..81edcb7936b 160000
--- a/contrib/usearch
+++ b/contrib/usearch
@@ -1 +1 @@
-Subproject commit b58cdb4025b68b55800dcc9f36fa33b43c003a7e
+Subproject commit 81edcb7936b3aba701997ae6b1af59a61df280e1

From bd09e948ba710a9013fdfa477288162936bc6f85 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Tue, 13 Aug 2024 12:21:43 +0000
Subject: [PATCH 48/88] Bump usearch to 2.9.2

---
 contrib/SimSIMD | 2 +-
 contrib/usearch | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/SimSIMD b/contrib/SimSIMD
index fed0b4f8ec6..02665027985 160000
--- a/contrib/SimSIMD
+++ b/contrib/SimSIMD
@@ -1 +1 @@
-Subproject commit fed0b4f8ec6c1fb75d47e554ae8ca9188fc068f4
+Subproject commit 02665027985a578bd91514011c31a0bbe302304d
diff --git a/contrib/usearch b/contrib/usearch
index 81edcb7936b..3ba2661f46f 160000
--- a/contrib/usearch
+++ b/contrib/usearch
@@ -1 +1 @@
-Subproject commit 81edcb7936b3aba701997ae6b1af59a61df280e1
+Subproject commit 3ba2661f46fbc0065113e11f29404020210ebb53

From 92aed17e7cfdd78340df66075520713b2dee5c66 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Tue, 13 Aug 2024 12:24:01 +0000
Subject: [PATCH 49/88] Bump usearch to 2.10.5

---
 contrib/SimSIMD | 2 +-
 contrib/usearch | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/SimSIMD b/contrib/SimSIMD
index 02665027985..127ead1da7c 160000
--- a/contrib/SimSIMD
+++ b/contrib/SimSIMD
@@ -1 +1 @@
-Subproject commit 02665027985a578bd91514011c31a0bbe302304d
+Subproject commit 127ead1da7c39957b30a50dd85e74814edb022d6
diff --git a/contrib/usearch b/contrib/usearch
index 3ba2661f46f..fa1019941fe 160000
--- a/contrib/usearch
+++ b/contrib/usearch
@@ -1 +1 @@
-Subproject commit 3ba2661f46fbc0065113e11f29404020210ebb53
+Subproject commit fa1019941fe71f359516543ff4ec9f6fa8f0cb80

From 72efc8308c8cda9a2015fd1c5a1057c5b4a5675a Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Tue, 13 Aug 2024 12:26:49 +0000
Subject: [PATCH 50/88] Bump usearch to 2.11.7

---
 contrib/SimSIMD | 2 +-
 contrib/usearch | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/contrib/SimSIMD b/contrib/SimSIMD
index 127ead1da7c..18d17686124 160000
--- a/contrib/SimSIMD
+++ b/contrib/SimSIMD
@@ -1 +1 @@
-Subproject commit 127ead1da7c39957b30a50dd85e74814edb022d6
+Subproject commit 18d17686124ddebd9fe55eee56b2e0273a613d4b
diff --git a/contrib/usearch b/contrib/usearch
index fa1019941fe..bc83df3c0b7 160000
--- a/contrib/usearch
+++ b/contrib/usearch
@@ -1 +1 @@
-Subproject commit fa1019941fe71f359516543ff4ec9f6fa8f0cb80
+Subproject commit bc83df3c0b7da8376574a3ca2b48f0738365c205

From 58d76fabf68c48c08e25fca8d4f6318f86010625 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Tue, 13 Aug 2024 12:28:09 +0000
Subject: [PATCH 51/88] Bump usearch to 2.12.0

---
 contrib/usearch | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/usearch b/contrib/usearch
index bc83df3c0b7..e6c81f78c64 160000
--- a/contrib/usearch
+++ b/contrib/usearch
@@ -1 +1 @@
-Subproject commit bc83df3c0b7da8376574a3ca2b48f0738365c205
+Subproject commit e6c81f78c64c0d8119f854691a06e60660638a25

From dcf96fa9f4363f7607e9b5ed82056d94c49a6ee3 Mon Sep 17 00:00:00 2001
From: Nikolai Kochetov <nk@clickhouse.com>
Date: Tue, 13 Aug 2024 17:57:06 +0200
Subject: [PATCH 52/88] Update 03221_insert_timeout_overflow_mode.sh

---
 tests/queries/0_stateless/03221_insert_timeout_overflow_mode.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/queries/0_stateless/03221_insert_timeout_overflow_mode.sh b/tests/queries/0_stateless/03221_insert_timeout_overflow_mode.sh
index 030c5211b2d..db943a665cb 100755
--- a/tests/queries/0_stateless/03221_insert_timeout_overflow_mode.sh
+++ b/tests/queries/0_stateless/03221_insert_timeout_overflow_mode.sh
@@ -5,4 +5,4 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 . "$CUR_DIR"/../shell_config.sh
 
 ${CLICKHOUSE_CLIENT} --query "create table null_t (number UInt64) engine = Null;"
-${CLICKHOUSE_CLIENT} --query "select sleep(0.1) from system.numbers settings max_block_size = 1 format Native" 2>/dev/null | ${CLICKHOUSE_CLIENT} --max_execution_time = 0.3 --timeout_overflow_mode = 'break' --query "insert into null_t format Native" 2>&1 | grep -o "QUERY_WAS_CANCELLED"
+${CLICKHOUSE_CLIENT} --query "select sleep(0.1) from system.numbers settings max_block_size = 1 format Native" 2>/dev/null | ${CLICKHOUSE_CLIENT} --max_execution_time 0.3 --timeout_overflow_mode break --query "insert into null_t format Native" 2>&1 | grep -o "QUERY_WAS_CANCELLED"

From 9833ef0bed218afdc1927181c11bac306fe21dda Mon Sep 17 00:00:00 2001
From: Anton Popov <anton@clickhouse.com>
Date: Tue, 13 Aug 2024 17:50:13 +0000
Subject: [PATCH 53/88] slightly better

---
 .../Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h   | 2 +-
 src/Processors/Merges/IMergingTransform.h                   | 6 +++---
 src/Storages/MergeTree/MergeTask.cpp                        | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h
index 39171c5a978..c34028b1cba 100644
--- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h
+++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h
@@ -50,7 +50,7 @@ public:
     void consume(Input & input, size_t source_num) override;
     Status merge() override;
 
-    MergedStats getMergedStats() const override { return  {.bytes = accumulated_bytes, .rows = accumulated_rows, .blocks = chunk_num}; }
+    MergedStats getMergedStats() const override { return {.bytes = accumulated_bytes, .rows = accumulated_rows, .blocks = chunk_num}; }
 
 private:
     Chunk prepareToMerge();
diff --git a/src/Processors/Merges/IMergingTransform.h b/src/Processors/Merges/IMergingTransform.h
index fba5b038618..e5cd3bdde46 100644
--- a/src/Processors/Merges/IMergingTransform.h
+++ b/src/Processors/Merges/IMergingTransform.h
@@ -113,7 +113,7 @@ public:
 
     void work() override
     {
-        Stopwatch watch;
+        Stopwatch watch{CLOCK_MONOTONIC_COARSE};
 
         if (!state.init_chunks.empty())
             algorithm.initialize(std::move(state.init_chunks));
@@ -180,12 +180,12 @@ protected:
 
         if (seconds == 0.0)
         {
-            LOG_DEBUG(log, "{}: {} blocks, {} rows, {} bytes in 0 sec.",
+            LOG_DEBUG(log, "{}, {} blocks, {} rows, {} bytes in 0 sec.",
                 transform_message, stats.blocks, stats.rows, stats.bytes);
         }
         else
         {
-            LOG_DEBUG(log, "{}: {} blocks, {} rows, {} bytes in {} sec., {} rows/sec., {}/sec.",
+            LOG_DEBUG(log, "{}, {} blocks, {} rows, {} bytes in {} sec., {} rows/sec., {}/sec.",
                 transform_message, stats.blocks, stats.rows, stats.bytes,
                 seconds, stats.rows / seconds, ReadableSize(stats.bytes / seconds));
         }
diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp
index 3aa4d764685..95e00773bae 100644
--- a/src/Storages/MergeTree/MergeTask.cpp
+++ b/src/Storages/MergeTree/MergeTask.cpp
@@ -945,7 +945,7 @@ bool MergeTask::MergeProjectionsStage::finalizeProjectionsAndWholeMerge() const
 MergeTask::StageRuntimeContextPtr MergeTask::MergeProjectionsStage::getContextForNextStage()
 {
     /// Do not increment for projection stage because time is already accounted in main task.
-    /// The projection stage has its own empty projection stage which may add a drift of severals milliseconds.
+    /// The projection stage has its own empty projection stage which may add a drift of several milliseconds.
     if (global_ctx->parent_part == nullptr)
     {
         ProfileEvents::increment(ProfileEvents::MergeExecuteMilliseconds, ctx->elapsed_execute_ns / 1000000UL);

From b9ffa929ba418d54e5e140470f21d4347ac0eab9 Mon Sep 17 00:00:00 2001
From: Igor Nikonov <igor@clickhouse.com>
Date: Tue, 13 Aug 2024 21:08:53 +0000
Subject: [PATCH 54/88] Fix: min marks to read overflow with parallel replicas

---
 .../MergeTree/MergeTreeIndexGranularity.cpp       | 12 ++++++++++--
 ..._replicas_min_marks_to_read_overflow.reference | 10 ++++++++++
 ...rallel_replicas_min_marks_to_read_overflow.sql | 15 +++++++++++++++
 3 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.reference
 create mode 100644 tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.sql

diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp b/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp
index 2a45ab1d927..2f9a4a47b11 100644
--- a/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp
@@ -103,8 +103,16 @@ size_t MergeTreeIndexGranularity::countMarksForRows(size_t from_mark, size_t num
 
     /// This is a heuristic to respect min_marks_to_read which is ignored by MergeTreeReadPool in case of remote disk.
     /// See comment in IMergeTreeSelectAlgorithm.
-    if (min_marks_to_read && from_mark + 2 * min_marks_to_read <= to_mark)
-        to_mark = from_mark + min_marks_to_read;
+    if (min_marks_to_read)
+    {
+        // check that 'from_mark + 2 * min_marks_to_read' below cannot overflow:
+        bool overflow = ((1ULL << 63) & min_marks_to_read); // further multiplication by 2 will not overflow
+        if (!overflow)
+            overflow = (std::numeric_limits<size_t>::max() - from_mark) < 2 * min_marks_to_read; // further addition will not overflow
+
+        if (!overflow && from_mark + 2 * min_marks_to_read <= to_mark)
+            to_mark = from_mark + min_marks_to_read;
+    }
 
     return getRowsCountInRange(from_mark, std::max(1UL, to_mark)) - offset_in_rows;
 }
diff --git a/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.reference b/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.reference
new file mode 100644
index 00000000000..7fafd4d13ea
--- /dev/null
+++ b/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.reference
@@ -0,0 +1,10 @@
+100	100
+101	101
+102	102
+103	103
+104	104
+105	105
+106	106
+107	107
+108	108
+109	109
diff --git a/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.sql b/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.sql
new file mode 100644
index 00000000000..112373e5db2
--- /dev/null
+++ b/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.sql
@@ -0,0 +1,15 @@
+DROP TABLE IF EXISTS test__fuzz_22 SYNC;
+
+CREATE TABLE test__fuzz_22 (k Float32, v String) ENGINE = ReplicatedMergeTree('/clickhouse/03222/{database}/test__fuzz_22', 'r1') ORDER BY k SETTINGS index_granularity = 1;
+
+INSERT INTO test__fuzz_22 SELECT number, toString(number) FROM numbers(10_000);
+
+SET allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 3, cluster_for_parallel_replicas='test_cluster_one_shard_three_replicas_localhost';
+
+SELECT k, v
+FROM test__fuzz_22
+ORDER BY k
+LIMIT 100, 10
+SETTINGS merge_tree_min_rows_for_concurrent_read = 9223372036854775806;
+
+DROP TABLE test__fuzz_22 SYNC;

From 6170a8663fc85b95cda4a7617975b06cc6c007f6 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Tue, 13 Aug 2024 12:31:13 +0000
Subject: [PATCH 55/88] Bump usearch to 2.13.2

---
 contrib/SimSIMD                               |   2 +-
 contrib/usearch                               |   2 +-
 .../mergetree-family/annindexes.md            |   2 +
 .../MergeTreeIndexVectorSimilarity.cpp        | 121 +++++++++++-------
 .../MergeTreeIndexVectorSimilarity.h          |  26 ++--
 ...r_search_index_creation_negative.reference |   2 -
 ..._vector_search_index_creation_negative.sql |   6 -
 7 files changed, 90 insertions(+), 71 deletions(-)

diff --git a/contrib/SimSIMD b/contrib/SimSIMD
index 18d17686124..91a76d1ac51 160000
--- a/contrib/SimSIMD
+++ b/contrib/SimSIMD
@@ -1 +1 @@
-Subproject commit 18d17686124ddebd9fe55eee56b2e0273a613d4b
+Subproject commit 91a76d1ac519b3b9dc8957734a3dabd985f00c26
diff --git a/contrib/usearch b/contrib/usearch
index e6c81f78c64..e21a5778a0d 160000
--- a/contrib/usearch
+++ b/contrib/usearch
@@ -1 +1 @@
-Subproject commit e6c81f78c64c0d8119f854691a06e60660638a25
+Subproject commit e21a5778a0d4469ddaf38c94b7be0196bb701ee4
diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md
index e73d6f07a32..097b0f5850a 100644
--- a/docs/en/engines/table-engines/mergetree-family/annindexes.md
+++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md
@@ -59,6 +59,8 @@ Parameters:
 - `ef_construction`: (optional, default: 128)
 - `ef_search`: (optional, default: 64)
 
+Value 0 for parameters `m`, `ef_construction`, and `ef_search` refers to the default value.
+
 Example:
 
 ```sql
diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
index 346f69140bb..fbbc66bd8db 100644
--- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp
@@ -2,9 +2,6 @@
 
 #if USE_USEARCH
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wpass-failed"
-
 #include <Columns/ColumnArray.h>
 #include <Common/BitHelpers.h>
 #include <Common/formatReadable.h>
@@ -46,15 +43,15 @@ namespace
 {
 
 /// The only indexing method currently supported by USearch
-std::set<String> methods = {"hnsw"};
+const std::set<String> methods = {"hnsw"};
 
 /// Maps from user-facing name to internal name
-std::unordered_map<String, unum::usearch::metric_kind_t> distanceFunctionToMetricKind = {
+const std::unordered_map<String, unum::usearch::metric_kind_t> distanceFunctionToMetricKind = {
     {"L2Distance", unum::usearch::metric_kind_t::l2sq_k},
     {"cosineDistance", unum::usearch::metric_kind_t::cos_k}};
 
 /// Maps from user-facing name to internal name
-std::unordered_map<String, unum::usearch::scalar_kind_t> quantizationToScalarKind = {
+const std::unordered_map<String, unum::usearch::scalar_kind_t> quantizationToScalarKind = {
     {"f32", unum::usearch::scalar_kind_t::f32_k},
     {"f16", unum::usearch::scalar_kind_t::f16_k},
     {"i8", unum::usearch::scalar_kind_t::i8_k}};
@@ -96,13 +93,18 @@ USearchIndexWithSerialization::USearchIndexWithSerialization(
     unum::usearch::scalar_kind_t scalar_kind,
     UsearchHnswParams usearch_hnsw_params)
 {
-    unum::usearch::metric_punned_t metric(dimensions, metric_kind, scalar_kind);
+    USearchIndex::metric_t metric(dimensions, metric_kind, scalar_kind);
 
     unum::usearch::index_dense_config_t config(usearch_hnsw_params.m, usearch_hnsw_params.ef_construction, usearch_hnsw_params.ef_search);
     config.enable_key_lookups = false; /// we don't do row-to-vector lookups
 
-    USearchIndex usearch_index = USearchIndex::make(metric, config);
-    swap(usearch_index);
+    if (auto error = config.validate(); error) /// already called in vectorSimilarityIndexValidator, call again because usearch may change the config in-place
+        throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid parameters passed to vector similarity index. Error: {}", String(error.release()));
+
+    if (auto result = USearchIndex::make(metric, config); !result)
+        throw Exception(ErrorCodes::INCORRECT_DATA, "Could not create vector similarity index. Error: {}", String(result.error.release()));
+    else
+        swap(result.index);
 }
 
 void USearchIndexWithSerialization::serialize(WriteBuffer & ostr) const
@@ -113,9 +115,8 @@ void USearchIndexWithSerialization::serialize(WriteBuffer & ostr) const
         return true;
     };
 
-    auto result = Base::save_to_stream(callback);
-    if (result.error)
-        throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not save vector similarity index, error: " + String(result.error.release()));
+    if (auto result = Base::save_to_stream(callback); !result)
+        throw Exception(ErrorCodes::INCORRECT_DATA, "Could not save vector similarity index. Error: {}", String(result.error.release()));
 }
 
 void USearchIndexWithSerialization::deserialize(ReadBuffer & istr)
@@ -126,26 +127,43 @@ void USearchIndexWithSerialization::deserialize(ReadBuffer & istr)
         return true;
     };
 
-    auto result = Base::load_from_stream(callback);
-    if (result.error)
+    if (auto result = Base::load_from_stream(callback); !result)
         /// See the comment in MergeTreeIndexGranuleVectorSimilarity::deserializeBinary why we throw here
-        throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not load vector similarity index, error: " + String(result.error.release()) + " Please drop the index and create it again.");
+        throw Exception(ErrorCodes::INCORRECT_DATA, "Could not load vector similarity index. Please drop the index and create it again. Error: {}", String(result.error.release()));
+
+    if (!try_reserve(limits()))
+        throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for usearch index");
 }
 
 USearchIndexWithSerialization::Statistics USearchIndexWithSerialization::getStatistics() const
 {
+    USearchIndex::stats_t global_stats = Base::stats();
+
     Statistics statistics = {
         .max_level = max_level(),
         .connectivity = connectivity(),
-        .size = size(),                         /// number of vectors
-        .capacity = capacity(),                 /// number of vectors reserved
-        .memory_usage = memory_usage(),         /// in bytes, the value is not exact
+        .size = size(),
+        .capacity = capacity(),
+        .memory_usage = memory_usage(),
         .bytes_per_vector = bytes_per_vector(),
         .scalar_words = scalar_words(),
-        .statistics = stats()};
+        .nodes = global_stats.nodes,
+        .edges = global_stats.edges,
+        .max_edges = global_stats.max_edges,
+        .level_stats = {}};
+
+    for (size_t i = 0; i < statistics.max_level; ++i)
+        statistics.level_stats.push_back(Base::stats(i));
+
     return statistics;
 }
 
+String USearchIndexWithSerialization::Statistics::toString() const
+{
+    return fmt::format("max_level = {}, connectivity = {}, size = {}, capacity = {}, memory_usage = {}, bytes_per_vector = {}, scalar_words = {}, nodes = {}, edges = {}, max_edges = {}",
+            max_level, connectivity, size, capacity, ReadableSize(memory_usage), bytes_per_vector, scalar_words, nodes, edges, max_edges);
+
+}
 MergeTreeIndexGranuleVectorSimilarity::MergeTreeIndexGranuleVectorSimilarity(
     const String & index_name_,
     const Block & index_sample_block_,
@@ -186,8 +204,7 @@ void MergeTreeIndexGranuleVectorSimilarity::serializeBinary(WriteBuffer & ostr)
     index->serialize(ostr);
 
     auto statistics = index->getStatistics();
-    LOG_TRACE(logger, "Wrote vector similarity index: max_level = {}, connectivity = {}, size = {}, capacity = {}, memory_usage = {}",
-                      statistics.max_level, statistics.connectivity, statistics.size, statistics.capacity, ReadableSize(statistics.memory_usage));
+    LOG_TRACE(logger, "Wrote vector similarity index: {}", statistics.toString());
 }
 
 void MergeTreeIndexGranuleVectorSimilarity::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/)
@@ -209,8 +226,7 @@ void MergeTreeIndexGranuleVectorSimilarity::deserializeBinary(ReadBuffer & istr,
     index->deserialize(istr);
 
     auto statistics = index->getStatistics();
-    LOG_TRACE(logger, "Loaded vector similarity index: max_level = {}, connectivity = {}, size = {}, capacity = {}, memory_usage = {}",
-                      statistics.max_level, statistics.connectivity, statistics.size, statistics.capacity, ReadableSize(statistics.memory_usage));
+    LOG_TRACE(logger, "Loaded vector similarity index: {}", statistics.toString());
 }
 
 MergeTreeIndexAggregatorVectorSimilarity::MergeTreeIndexAggregatorVectorSimilarity(
@@ -290,19 +306,24 @@ void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_
         if (!index)
             index = std::make_shared<USearchIndexWithSerialization>(dimensions, metric_kind, scalar_kind, usearch_hnsw_params);
 
+        /// We use Usearch's index_dense_t as index type which supports only 4 billion entries according to https://github.com/unum-cloud/usearch/tree/main/cpp
+        if (index->size() + num_rows > std::numeric_limits<UInt32>::max())
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Size of vector similarity index in column {} would exceed 4 billion entries", index_column_name);
+
         /// Reserving space is mandatory
-        if (!index->reserve(roundUpToPowerOfTwoOrZero(index->size() + num_rows)))
+        if (!index->try_reserve(roundUpToPowerOfTwoOrZero(index->size() + num_rows)))
             throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for vector similarity index");
 
         for (size_t row = 0; row < num_rows; ++row)
         {
-            auto rc = index->add(static_cast<UInt32>(index->size()), &column_array_data_float_data[column_array_offsets[row - 1]]);
-            if (!rc)
-                throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index, error: " + String(rc.error.release()));
-
-            ProfileEvents::increment(ProfileEvents::USearchAddCount);
-            ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, rc.visited_members);
-            ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, rc.computed_distances);
+            if (auto result = index->add(static_cast<UInt32>(index->size()), &column_array_data_float_data[column_array_offsets[row - 1]]); !result)
+                throw Exception(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index. Error: {}", String(result.error.release()));
+            else
+            {
+                ProfileEvents::increment(ProfileEvents::USearchAddCount);
+                ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, result.visited_members);
+                ProfileEvents::increment(ProfileEvents::USearchAddComputedDistances, result.computed_distances);
+            }
         }
     }
     else
@@ -356,17 +377,16 @@ std::vector<size_t> MergeTreeIndexConditionVectorSimilarity::getUsefulRanges(Mer
 
     const std::vector<float> reference_vector = vector_similarity_condition.getReferenceVector();
 
-    auto result = index->search(reference_vector.data(), limit);
-    if (result.error)
-        throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index, error: " + String(result.error.release()));
+    auto search_result = index->search(reference_vector.data(), limit);
+    if (!search_result)
+        throw Exception(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index. Error: {}", String(search_result.error.release()));
 
     ProfileEvents::increment(ProfileEvents::USearchSearchCount);
-    ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, result.visited_members);
-    ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, result.computed_distances);
+    ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, search_result.visited_members);
+    ProfileEvents::increment(ProfileEvents::USearchSearchComputedDistances, search_result.computed_distances);
 
-    std::vector<USearchIndex::vector_key_t> neighbors(result.size()); /// indexes of dots which were closest to the reference vector
-    std::vector<USearchIndex::distance_t> distances(result.size());
-    result.dump_to(neighbors.data(), distances.data());
+    std::vector<USearchIndex::vector_key_t> neighbors(search_result.size()); /// indexes of vectors which were closest to the reference vector
+    search_result.dump_to(neighbors.data());
 
     std::vector<size_t> granules;
     granules.reserve(neighbors.size());
@@ -414,14 +434,13 @@ MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition(
 
 MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index)
 {
-    const bool has_six_args = (index.arguments.size() == 6);
-
+    /// Default parameters:
     unum::usearch::metric_kind_t metric_kind = distanceFunctionToMetricKind.at(index.arguments[1].safeGet<String>());
-
-    /// use defaults for the other parameters
     unum::usearch::scalar_kind_t scalar_kind = unum::usearch::scalar_kind_t::f32_k;
     UsearchHnswParams usearch_hnsw_params;
 
+    /// Optional parameters:
+    const bool has_six_args = (index.arguments.size() == 6);
     if (has_six_args)
     {
         scalar_kind = quantizationToScalarKind.at(index.arguments[2].safeGet<String>());
@@ -466,12 +485,16 @@ void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* atta
     {
         if (!quantizationToScalarKind.contains(index.arguments[2].safeGet<String>()))
             throw Exception(ErrorCodes::INCORRECT_DATA, "Third argument (quantization) of vector similarity index is not supported. Supported quantizations are: {}", joinByComma(quantizationToScalarKind));
-        if (index.arguments[3].safeGet<UInt64>() < 2)
-            throw Exception(ErrorCodes::INCORRECT_DATA, "Fourth argument (M) of vector similarity index must be > 1");
-        if (index.arguments[4].safeGet<UInt64>() < 1)
-            throw Exception(ErrorCodes::INCORRECT_DATA, "Fifth argument (ef_construction) of vector similarity index must be > 0");
-        if (index.arguments[5].safeGet<UInt64>() < 1)
-            throw Exception(ErrorCodes::INCORRECT_DATA, "Sixth argument (ef_search) of vector similarity index must be > 0");
+
+        /// Call Usearch's own parameter validation method for HNSW-specific parameters
+        UInt64 m = index.arguments[3].safeGet<UInt64>();
+        UInt64 ef_construction = index.arguments[4].safeGet<UInt64>();
+        UInt64 ef_search = index.arguments[5].safeGet<UInt64>();
+
+        unum::usearch::index_dense_config_t config(m, ef_construction, ef_search);
+
+        if (auto error = config.validate(); error)
+            throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid parameters passed to vector similarity index. Error: {}", String(error.release()));
     }
 
     /// Check that the index is created on a single column
diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h
index f7098c1626c..c4c03254d2d 100644
--- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h
+++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h
@@ -4,12 +4,9 @@
 
 #if USE_USEARCH
 
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wpass-failed"
-#  include <Storages/MergeTree/VectorSimilarityCondition.h>
-#  include <Common/Logger.h>
-#  include <usearch/index_dense.hpp>
-#pragma clang diagnostic pop
+#include <Storages/MergeTree/VectorSimilarityCondition.h>
+#include <Common/Logger.h>
+#include <usearch/index_dense.hpp>
 
 namespace DB
 {
@@ -21,7 +18,7 @@ struct UsearchHnswParams
     size_t ef_search = unum::usearch::default_expansion_search();
 };
 
-using USearchIndex = unum::usearch::index_dense_gt</*key_at*/ uint32_t, /*compressed_slot_at*/ uint32_t>;
+using USearchIndex = unum::usearch::index_dense_t;
 
 class USearchIndexWithSerialization : public USearchIndex
 {
@@ -41,13 +38,18 @@ public:
     {
         size_t max_level;
         size_t connectivity;
-        size_t size;
-        size_t capacity;
-        size_t memory_usage;
-        /// advanced stats:
+        size_t size;                /// number of indexed vectors
+        size_t capacity;            /// reserved number of indexed vectors
+        size_t memory_usage;        /// byte size (not exact)
         size_t bytes_per_vector;
         size_t scalar_words;
-        Base::stats_t statistics;
+        size_t nodes;
+        size_t edges;
+        size_t max_edges;
+
+        std::vector<USearchIndex::stats_t> level_stats; /// for debugging, excluded from toString()
+
+        String toString() const;
     };
 
     Statistics getStatistics() const;
diff --git a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference
index b6d034208d0..f18daa6e02e 100644
--- a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference
+++ b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference
@@ -3,8 +3,6 @@ Two or six index arguments
 2nd argument (distance function) must be String and L2Distance or cosineDistance
 3nd argument (quantization), if given, must be String and f32, f16, ...
 4nd argument (M), if given, must be UInt64 and > 1
-5nd argument (ef_construction), if given, must be UInt64 and > 0
-6nd argument (ef_search), if given, must be UInt64 and > 0
 Must be created on single column
 Must be created on Array(Float32) columns
 Rejects INSERTs of Arrays with different sizes
diff --git a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql
index 7c2ddfe81fc..de9d37e1000 100644
--- a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql
+++ b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql
@@ -27,12 +27,6 @@ CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similar
 SELECT '4nd argument (M), if given, must be UInt64 and > 1';
 CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 'invalid', 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
 CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
-SELECT '5nd argument (ef_construction), if given, must be UInt64 and > 0';
-CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 'invalid', 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
-CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 0, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
-SELECT '6nd argument (ef_search), if given, must be UInt64 and > 0';
-CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 1, 'invalid')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY }
-CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 1, 0)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA }
 
 SELECT 'Must be created on single column';
 CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS }

From 8dec996686449e5a541157fdfd5ffb65b2208998 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Mon, 12 Aug 2024 06:35:44 +0000
Subject: [PATCH 56/88] Fix non-deterministic result order in
 test_storage_mysql.test_mysql_distributed

---
 tests/integration/test_storage_mysql/test.py | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/tests/integration/test_storage_mysql/test.py b/tests/integration/test_storage_mysql/test.py
index 5948954ff5f..c724c5bb498 100644
--- a/tests/integration/test_storage_mysql/test.py
+++ b/tests/integration/test_storage_mysql/test.py
@@ -445,7 +445,7 @@ def test_mysql_distributed(started_cluster):
     query = "SELECT * FROM ("
     for i in range(3):
         query += "SELECT name FROM test_replicas UNION DISTINCT "
-    query += "SELECT name FROM test_replicas)"
+    query += "SELECT name FROM test_replicas) ORDER BY name"
 
     result = node2.query(query)
     assert result == "host2\nhost3\nhost4\n"
@@ -827,6 +827,9 @@ def test_settings(started_cluster):
         f"with settings: connect_timeout={connect_timeout}, read_write_timeout={rw_timeout}"
     )
 
+    node1.query("DROP DATABASE IF EXISTS m")
+    node1.query("DROP DATABASE IF EXISTS mm")
+
     rw_timeout = 40123001
     connect_timeout = 40123002
     node1.query(
@@ -855,6 +858,9 @@ def test_settings(started_cluster):
         f"with settings: connect_timeout={connect_timeout}, read_write_timeout={rw_timeout}"
     )
 
+    node1.query("DROP DATABASE m")
+    node1.query("DROP DATABASE mm")
+
     drop_mysql_table(conn, table_name)
     conn.close()
 
@@ -930,6 +936,9 @@ def test_joins(started_cluster):
 
     conn.commit()
 
+    node1.query("DROP TABLE IF EXISTS test_joins_table_users")
+    node1.query("DROP TABLE IF EXISTS test_joins_table_tickets")
+
     node1.query(
         """
         CREATE TABLE test_joins_table_users
@@ -964,6 +973,9 @@ def test_joins(started_cluster):
         """
     ) == "281607\tFeedback\t2024-06-25 12:09:41\tuser@example.com\n"
 
+    node1.query("DROP TABLE test_joins_table_users")
+    node1.query("DROP TABLE test_joins_table_tickets")
+
 
 if __name__ == "__main__":
     with contextmanager(started_cluster)() as cluster:

From 9fe31773bdeeeada849a60822a7409ee1aa8782f Mon Sep 17 00:00:00 2001
From: vdimir <vdimir@clickhouse.com>
Date: Wed, 14 Aug 2024 09:52:12 +0000
Subject: [PATCH 57/88] Fix part name in 00961_check_table

---
 tests/queries/0_stateless/00961_check_table.reference | 2 +-
 tests/queries/0_stateless/00961_check_table.sql       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/queries/0_stateless/00961_check_table.reference b/tests/queries/0_stateless/00961_check_table.reference
index a0a054898b9..686285bb6aa 100644
--- a/tests/queries/0_stateless/00961_check_table.reference
+++ b/tests/queries/0_stateless/00961_check_table.reference
@@ -14,4 +14,4 @@
 ========
 201902_4_5_1	1	
 ========
-201801_1_1_0	1	
+201801_1_1_2	1	
diff --git a/tests/queries/0_stateless/00961_check_table.sql b/tests/queries/0_stateless/00961_check_table.sql
index a6abe8103d5..fc3c5435670 100644
--- a/tests/queries/0_stateless/00961_check_table.sql
+++ b/tests/queries/0_stateless/00961_check_table.sql
@@ -39,6 +39,6 @@ CHECK TABLE mt_table PARTITION 201902 SETTINGS max_threads = 1;
 
 SELECT '========';
 
-CHECK TABLE mt_table PART '201801_1_1_0';
+CHECK TABLE mt_table PART '201801_1_1_2';
 
 DROP TABLE IF EXISTS mt_table;

From 8d7319ccab75d8f5a401bdd63697f7bd6508d27f Mon Sep 17 00:00:00 2001
From: Duc Canh Le <duccanh.le@ahrefs.com>
Date: Wed, 14 Aug 2024 10:18:00 +0000
Subject: [PATCH 58/88] fix wrong format of SYSTEM SYNC REPLICA query

Signed-off-by: Duc Canh Le <duccanh.le@ahrefs.com>
---
 src/Parsers/ASTSystemQuery.cpp                | 45 ++++++++++---------
 ...03205_system_sync_replica_format.reference |  1 +
 .../03205_system_sync_replica_format.sql      |  1 +
 3 files changed, 25 insertions(+), 22 deletions(-)
 create mode 100644 tests/queries/0_stateless/03205_system_sync_replica_format.reference
 create mode 100644 tests/queries/0_stateless/03205_system_sync_replica_format.sql

diff --git a/src/Parsers/ASTSystemQuery.cpp b/src/Parsers/ASTSystemQuery.cpp
index a730ea0ba3d..7780544d5c2 100644
--- a/src/Parsers/ASTSystemQuery.cpp
+++ b/src/Parsers/ASTSystemQuery.cpp
@@ -198,6 +198,29 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState & s
                 print_database_table();
             }
 
+            if (sync_replica_mode != SyncReplicaMode::DEFAULT)
+            {
+                settings.ostr << ' ';
+                print_keyword(magic_enum::enum_name(sync_replica_mode));
+
+                // If the mode is LIGHTWEIGHT and specific source replicas are specified
+                if (sync_replica_mode == SyncReplicaMode::LIGHTWEIGHT && !src_replicas.empty())
+                {
+                    settings.ostr << ' ';
+                    print_keyword("FROM");
+                    settings.ostr << ' ';
+
+                    bool first = true;
+                    for (const auto & src : src_replicas)
+                    {
+                        if (!first)
+                            settings.ostr << ", ";
+                        first = false;
+                        settings.ostr << quoteString(src);
+                    }
+                }
+            }
+
             if (query_settings)
             {
                 settings.ostr << (settings.hilite ? hilite_keyword : "") << settings.nl_or_ws << "SETTINGS " << (settings.hilite ? hilite_none : "");
@@ -233,28 +256,6 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState & s
                 print_identifier(disk);
             }
 
-            if (sync_replica_mode != SyncReplicaMode::DEFAULT)
-            {
-                settings.ostr << ' ';
-                print_keyword(magic_enum::enum_name(sync_replica_mode));
-
-                // If the mode is LIGHTWEIGHT and specific source replicas are specified
-                if (sync_replica_mode == SyncReplicaMode::LIGHTWEIGHT && !src_replicas.empty())
-                {
-                    settings.ostr << ' ';
-                    print_keyword("FROM");
-                    settings.ostr << ' ';
-
-                    bool first = true;
-                    for (const auto & src : src_replicas)
-                    {
-                        if (!first)
-                            settings.ostr << ", ";
-                        first = false;
-                        settings.ostr << quoteString(src);
-                    }
-                }
-            }
             break;
         }
         case Type::SYNC_DATABASE_REPLICA:
diff --git a/tests/queries/0_stateless/03205_system_sync_replica_format.reference b/tests/queries/0_stateless/03205_system_sync_replica_format.reference
new file mode 100644
index 00000000000..aad51dd90b0
--- /dev/null
+++ b/tests/queries/0_stateless/03205_system_sync_replica_format.reference
@@ -0,0 +1 @@
+SYSTEM SYNC REPLICA db.`table` LIGHTWEIGHT
diff --git a/tests/queries/0_stateless/03205_system_sync_replica_format.sql b/tests/queries/0_stateless/03205_system_sync_replica_format.sql
new file mode 100644
index 00000000000..329bce80afc
--- /dev/null
+++ b/tests/queries/0_stateless/03205_system_sync_replica_format.sql
@@ -0,0 +1 @@
+SELECT formatQuery('SYSTEM SYNC REPLICA db.table LIGHTWEIGHT');

From 0e0272b2ffdf4286c5ef9766c90b88e096469e92 Mon Sep 17 00:00:00 2001
From: Igor Nikonov <igor@clickhouse.com>
Date: Wed, 14 Aug 2024 10:21:23 +0000
Subject: [PATCH 59/88] Better check for overflow

+ limit min_marks_for_concurrent_read
---
 src/Processors/QueryPlan/ReadFromMergeTree.cpp | 18 ++++++++++++++++--
 .../MergeTree/MergeTreeIndexGranularity.cpp    | 15 +++++++++------
 ...plicas_min_marks_to_read_overflow.reference | 11 +++++++++++
 ...lel_replicas_min_marks_to_read_overflow.sql | 10 +++++++++-
 4 files changed, 45 insertions(+), 9 deletions(-)

diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
index 901d7c61167..b5b46ef9f41 100644
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@@ -350,7 +350,14 @@ Pipe ReadFromMergeTree::readFromPoolParallelReplicas(
 
     /// We have a special logic for local replica. It has to read less data, because in some cases it should
     /// merge states of aggregate functions or do some other important stuff other than reading from Disk.
-    const auto multiplier = context->getSettingsRef().parallel_replicas_single_task_marks_count_multiplier;
+    auto multiplier = context->getSettingsRef().parallel_replicas_single_task_marks_count_multiplier;
+    if (pool_settings.min_marks_for_concurrent_read > std::numeric_limits<Int64>::max())
+    {
+        /// Limit min marks to read in case it's too big; this happened in tests due to settings randomization
+        pool_settings.min_marks_for_concurrent_read = std::numeric_limits<Int64>::max();
+        multiplier = 1.0f;
+    }
+
     if (auto result = pool_settings.min_marks_for_concurrent_read * multiplier; canConvertTo<size_t>(result))
         pool_settings.min_marks_for_concurrent_read = static_cast<size_t>(result);
     else
@@ -519,7 +526,14 @@ Pipe ReadFromMergeTree::readInOrder(
             .number_of_current_replica = client_info.number_of_current_replica,
         };
 
-        const auto multiplier = context->getSettingsRef().parallel_replicas_single_task_marks_count_multiplier;
+        auto multiplier = context->getSettingsRef().parallel_replicas_single_task_marks_count_multiplier;
+        if (pool_settings.min_marks_for_concurrent_read > std::numeric_limits<Int64>::max())
+        {
+            /// Limit min marks to read in case it's too big; this happened in tests due to settings randomization
+            pool_settings.min_marks_for_concurrent_read = std::numeric_limits<Int64>::max();
+            multiplier = 1.0f;
+        }
+
         if (auto result = pool_settings.min_marks_for_concurrent_read * multiplier; canConvertTo<size_t>(result))
             pool_settings.min_marks_for_concurrent_read = static_cast<size_t>(result);
         else
diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp b/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp
index 2f9a4a47b11..2b924284857 100644
--- a/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp
@@ -105,13 +105,16 @@ size_t MergeTreeIndexGranularity::countMarksForRows(size_t from_mark, size_t num
     /// See comment in IMergeTreeSelectAlgorithm.
     if (min_marks_to_read)
     {
-        // check that ...
-        bool overflow = ((1ULL << 63) & min_marks_to_read); // further multiplication by 2 will not overflow
-        if (!overflow)
-            overflow = (std::numeric_limits<size_t>::max() - from_mark) < 2 * min_marks_to_read; // further addition will not overflow
+        // check overflow
+        size_t min_marks_to_read_2 = 0;
+        bool overflow = common::mulOverflow(min_marks_to_read, 2, min_marks_to_read_2);
 
-        if (!overflow && from_mark + 2 * min_marks_to_read <= to_mark)
-            to_mark = from_mark + min_marks_to_read;
+        size_t to_mark_overwrite = 0;
+        if (!overflow)
+            overflow = common::addOverflow(from_mark, min_marks_to_read_2, to_mark_overwrite);
+
+        if (!overflow && to_mark_overwrite < to_mark)
+            to_mark = to_mark_overwrite;
     }
 
     return getRowsCountInRange(from_mark, std::max(1UL, to_mark)) - offset_in_rows;
diff --git a/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.reference b/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.reference
index 7fafd4d13ea..b6c452ba328 100644
--- a/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.reference
+++ b/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.reference
@@ -1,3 +1,14 @@
+1006
+1007
+1008
+1009
+101
+1010
+1011
+1012
+1013
+1014
+---
 100	100
 101	101
 102	102
diff --git a/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.sql b/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.sql
index 112373e5db2..6f486f8f0fe 100644
--- a/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.sql
+++ b/tests/queries/0_stateless/03222_parallel_replicas_min_marks_to_read_overflow.sql
@@ -6,10 +6,18 @@ INSERT INTO test__fuzz_22 SELECT number, toString(number) FROM numbers(10_000);
 
 SET allow_experimental_parallel_reading_from_replicas = 2, max_parallel_replicas = 3, cluster_for_parallel_replicas='test_cluster_one_shard_three_replicas_localhost';
 
+SELECT v
+FROM test__fuzz_22
+ORDER BY v
+LIMIT 10, 10
+SETTINGS merge_tree_min_rows_for_concurrent_read = 9223372036854775806;
+
+SELECT '---';
+
 SELECT k, v
 FROM test__fuzz_22
 ORDER BY k
 LIMIT 100, 10
-SETTINGS merge_tree_min_rows_for_concurrent_read = 9223372036854775806;
+SETTINGS optimize_read_in_order=1, merge_tree_min_rows_for_concurrent_read = 9223372036854775806;
 
 DROP TABLE test__fuzz_22 SYNC;

From 56d6ef5c4a015f5851923f2c420538456564e790 Mon Sep 17 00:00:00 2001
From: vdimir <vdimir@clickhouse.com>
Date: Wed, 14 Aug 2024 10:53:07 +0000
Subject: [PATCH 60/88] Fix 02995_index_10 timeout

---
 tests/queries/0_stateless/02995_index_10.sh | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/tests/queries/0_stateless/02995_index_10.sh b/tests/queries/0_stateless/02995_index_10.sh
index 813cc49cbd8..e7e7d3c3b42 100755
--- a/tests/queries/0_stateless/02995_index_10.sh
+++ b/tests/queries/0_stateless/02995_index_10.sh
@@ -5,7 +5,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # shellcheck source=../shell_config.sh
 . "$CURDIR"/../shell_config.sh
 
-${CLICKHOUSE_CLIENT} "
+${CLICKHOUSE_CLIENT} -q "
 
 DROP TABLE IF EXISTS test;
 CREATE TABLE test (a String, b String, c String) ENGINE = MergeTree ORDER BY (a, b, c) SETTINGS index_granularity = 11;
@@ -37,8 +37,9 @@ WHERE a >= (round(pow(sipHash64(1, try), 1 / (3 + sipHash64(2, try) % 8))) AS a1
   AND b <= (b1 + round(pow(sipHash64(7, try), 1 / (3 + sipHash64(8, try) % 8))))::String
   AND c >= (round(pow(sipHash64(9, try), 1 / (3 + sipHash64(10, try) % 8))) AS c1)::String
   AND c <= (c1 + round(pow(sipHash64(11, try), 1 / (3 + sipHash64(12, try) % 8))))::String
-HAVING count() > 0;
-"
+HAVING count() > 0
+SETTINGS trace_profile_events=0 -- test is too slow with profiling
+;"
 done | ${CLICKHOUSE_CLIENT}
 
-${CLICKHOUSE_CLIENT} "DROP TABLE test"
+${CLICKHOUSE_CLIENT} -q "DROP TABLE test"

From fc9929dc3d87a6f8065d4f46fe5002fab8d5537e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?=
 <benjamin.antal@clickhouse.com>
Date: Wed, 14 Aug 2024 14:15:05 +0000
Subject: [PATCH 61/88] Make tests with azurite repeatable

---
 tests/integration/helpers/cluster.py          | 16 ++++++++++++++
 .../integration/test_storage_s3_queue/test.py | 21 ++++++++++++-------
 2 files changed, 29 insertions(+), 8 deletions(-)

diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py
index 215718463e8..a97d0f9c340 100644
--- a/tests/integration/helpers/cluster.py
+++ b/tests/integration/helpers/cluster.py
@@ -570,6 +570,8 @@ class ClickHouseCluster:
         self.spark_session = None
 
         self.with_azurite = False
+        self.azurite_container = "cont"
+        self.blob_service_client = None
         self._azurite_port = 0
 
         # available when with_hdfs == True
@@ -2692,6 +2694,20 @@ class ClickHouseCluster:
                     connection_string
                 )
                 logging.debug(blob_service_client.get_account_information())
+                containers = [c for c in blob_service_client.list_containers(name_starts_with=self.azurite_container) if c.name == self.azurite_container]
+                if len(containers) > 0:
+                    for c in containers:
+                        blob_service_client.delete_container(c)
+
+                container_client = blob_service_client.get_container_client(self.azurite_container)
+                if container_client.exists():
+                    logging.debug(f"azurite container '{self.azurite_container}' exist, deleting all blobs")
+                    for b in container_client.list_blobs():
+                        container_client.delete_blob(b.name)
+                else:
+                    logging.debug(f"azurite container '{self.azurite_container}' doesn't exist, creating it")
+                    container_client.create_container()
+
                 self.blob_service_client = blob_service_client
                 return
             except Exception as ex:
diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py
index 00ef8499594..ff723d0792a 100644
--- a/tests/integration/test_storage_s3_queue/test.py
+++ b/tests/integration/test_storage_s3_queue/test.py
@@ -13,7 +13,6 @@ from uuid import uuid4
 AVAILABLE_MODES = ["unordered", "ordered"]
 DEFAULT_AUTH = ["'minio'", "'minio123'"]
 NO_AUTH = ["NOSIGN"]
-AZURE_CONTAINER_NAME = "cont"
 
 
 def prepare_public_s3_bucket(started_cluster):
@@ -75,6 +74,17 @@ def s3_queue_setup_teardown(started_cluster):
     objects = list(minio.list_objects(started_cluster.minio_bucket, recursive=True))
     for obj in objects:
         minio.remove_object(started_cluster.minio_bucket, obj.object_name)
+
+    container_client = started_cluster.blob_service_client.get_container_client(
+        started_cluster.azurite_container
+    )
+
+    if container_client.exists():
+        blob_names = [b.name for b in container_client.list_blobs()]
+        logging.debug(f"Deleting blobs: {blob_names}")
+        for b in blob_names:
+            container_client.delete_blob(b)
+
     yield  # run test
 
 
@@ -129,11 +139,6 @@ def started_cluster():
         cluster.start()
         logging.info("Cluster started")
 
-        container_client = cluster.blob_service_client.get_container_client(
-            AZURE_CONTAINER_NAME
-        )
-        container_client.create_container()
-
         yield cluster
     finally:
         cluster.shutdown()
@@ -190,7 +195,7 @@ def put_s3_file_content(started_cluster, filename, data, bucket=None):
 
 def put_azure_file_content(started_cluster, filename, data, bucket=None):
     client = started_cluster.blob_service_client.get_blob_client(
-        AZURE_CONTAINER_NAME, filename
+        started_cluster.azurite_container, filename
     )
     buf = io.BytesIO(data)
     client.upload_blob(buf, "BlockBlob", len(data))
@@ -313,7 +318,7 @@ def test_delete_after_processing(started_cluster, mode, engine_name):
         assert len(objects) == 0
     else:
         client = started_cluster.blob_service_client.get_container_client(
-            AZURE_CONTAINER_NAME
+            started_cluster.azurite_container
         )
         objects_iterator = client.list_blobs(files_path)
         for objects in objects_iterator:

From 920b88846b585a335b71c6f36a208f8cce8a5a74 Mon Sep 17 00:00:00 2001
From: Robert Schulze <robert@clickhouse.com>
Date: Wed, 14 Aug 2024 13:14:01 +0000
Subject: [PATCH 62/88] Optionally re-enable compilation with -O0

---
 CMakeLists.txt | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 884d5be42de..6fa91fa002f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -322,17 +322,21 @@ if (DISABLE_OMIT_FRAME_POINTER)
     set (CMAKE_ASM_FLAGS_ADD "${CMAKE_ASM_FLAGS_ADD} -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer")
 endif()
 
+# Before you start hating your debugger because it refuses to show variables ('<optimized out>'), try building with -DDEBUG_O_LEVEL="0"
+# https://stackoverflow.com/questions/63386189/whats-the-difference-between-a-compilers-o0-option-and-og-option/63386263#63386263
+set(DEBUG_O_LEVEL "g" CACHE STRING "The -Ox level used for debug builds")
+
 set (CMAKE_CXX_FLAGS                     "${CMAKE_CXX_FLAGS} ${COMPILER_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
 set (CMAKE_CXX_FLAGS_RELWITHDEBINFO      "${CMAKE_CXX_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
-set (CMAKE_CXX_FLAGS_DEBUG               "${CMAKE_CXX_FLAGS_DEBUG} -Og ${DEBUG_INFO_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
+set (CMAKE_CXX_FLAGS_DEBUG               "${CMAKE_CXX_FLAGS_DEBUG} -O${DEBUG_O_LEVEL} ${DEBUG_INFO_FLAGS} ${CMAKE_CXX_FLAGS_ADD}")
 
 set (CMAKE_C_FLAGS                       "${CMAKE_C_FLAGS} ${COMPILER_FLAGS} ${CMAKE_C_FLAGS_ADD}")
 set (CMAKE_C_FLAGS_RELWITHDEBINFO        "${CMAKE_C_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_C_FLAGS_ADD}")
-set (CMAKE_C_FLAGS_DEBUG                 "${CMAKE_C_FLAGS_DEBUG} -Og ${DEBUG_INFO_FLAGS} ${CMAKE_C_FLAGS_ADD}")
+set (CMAKE_C_FLAGS_DEBUG                 "${CMAKE_C_FLAGS_DEBUG} -O${DEBUG_O_LEVEL} ${DEBUG_INFO_FLAGS} ${CMAKE_C_FLAGS_ADD}")
 
 set (CMAKE_ASM_FLAGS                     "${CMAKE_ASM_FLAGS} ${COMPILER_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
 set (CMAKE_ASM_FLAGS_RELWITHDEBINFO      "${CMAKE_ASM_FLAGS_RELWITHDEBINFO} -O3 ${DEBUG_INFO_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
-set (CMAKE_ASM_FLAGS_DEBUG               "${CMAKE_ASM_FLAGS_DEBUG} -Og ${DEBUG_INFO_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
+set (CMAKE_ASM_FLAGS_DEBUG               "${CMAKE_ASM_FLAGS_DEBUG} -O${DEBUG_O_LEVEL} ${DEBUG_INFO_FLAGS} ${CMAKE_ASM_FLAGS_ADD}")
 
 if (OS_DARWIN)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++")

From 07c4a072fe14567ebe9809d19f869fc7a948f73e Mon Sep 17 00:00:00 2001
From: kssenii <sumarokovakseniia@mail.ru>
Date: Wed, 14 Aug 2024 16:45:02 +0200
Subject: [PATCH 63/88] Add debug logging

---
 src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp     |  9 ++++++---
 src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp    |  3 ++-
 src/Interpreters/Cache/FileCache.cpp                | 11 ++++++++++-
 src/Interpreters/Cache/FileCache.h                  |  3 ++-
 src/Interpreters/Cache/FileSegment.cpp              |  8 ++++++--
 src/Interpreters/Cache/FileSegment.h                |  6 +++++-
 src/Interpreters/Cache/WriteBufferToFileSegment.cpp |  6 ++++--
 7 files changed, 35 insertions(+), 11 deletions(-)

diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp
index b471f3fc58f..286d06bc424 100644
--- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp
+++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp
@@ -645,8 +645,9 @@ void CachedOnDiskReadBufferFromFile::predownload(FileSegment & file_segment)
 
             ProfileEvents::increment(ProfileEvents::CachedReadBufferReadFromSourceBytes, current_impl_buffer_size);
 
+            std::string failure_reason;
             bool continue_predownload = file_segment.reserve(
-                current_predownload_size, settings.filesystem_cache_reserve_space_wait_lock_timeout_milliseconds);
+                current_predownload_size, settings.filesystem_cache_reserve_space_wait_lock_timeout_milliseconds, failure_reason);
             if (continue_predownload)
             {
                 LOG_TEST(log, "Left to predownload: {}, buffer size: {}", bytes_to_predownload, current_impl_buffer_size);
@@ -1002,7 +1003,8 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep()
         {
             chassert(file_offset_of_buffer_end + size - 1 <= file_segment.range().right);
 
-            bool success = file_segment.reserve(size, settings.filesystem_cache_reserve_space_wait_lock_timeout_milliseconds);
+            std::string failure_reason;
+            bool success = file_segment.reserve(size, settings.filesystem_cache_reserve_space_wait_lock_timeout_milliseconds, failure_reason);
             if (success)
             {
                 chassert(file_segment.getCurrentWriteOffset() == static_cast<size_t>(implementation_buffer->getPosition()));
@@ -1028,7 +1030,8 @@ bool CachedOnDiskReadBufferFromFile::nextImplStep()
                     LOG_TRACE(log, "Bypassing cache because writeCache method failed");
             }
             else
-                LOG_TRACE(log, "No space left in cache to reserve {} bytes, will continue without cache download", size);
+                LOG_TRACE(log, "No space left in cache to reserve {} bytes, reason: {}, "
+                          "will continue without cache download", size, failure_reason);
 
             if (!success)
             {
diff --git a/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp
index 382c4a80cc4..103ae0e1832 100644
--- a/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp
+++ b/src/Disks/IO/CachedOnDiskWriteBufferFromFile.cpp
@@ -91,7 +91,8 @@ bool FileSegmentRangeWriter::write(char * data, size_t size, size_t offset, File
 
         size_t size_to_write = std::min(available_size, size);
 
-        bool reserved = file_segment->reserve(size_to_write, reserve_space_lock_wait_timeout_milliseconds);
+        std::string failure_reason;
+        bool reserved = file_segment->reserve(size_to_write, reserve_space_lock_wait_timeout_milliseconds, failure_reason);
         if (!reserved)
         {
             appendFilesystemCacheLog(*file_segment);
diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp
index 13c70b38543..b1a1f629b00 100644
--- a/src/Interpreters/Cache/FileCache.cpp
+++ b/src/Interpreters/Cache/FileCache.cpp
@@ -804,7 +804,8 @@ bool FileCache::tryReserve(
     const size_t size,
     FileCacheReserveStat & reserve_stat,
     const UserInfo & user,
-    size_t lock_wait_timeout_milliseconds)
+    size_t lock_wait_timeout_milliseconds,
+    std::string & failure_reason)
 {
     ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::FilesystemCacheReserveMicroseconds);
 
@@ -817,6 +818,7 @@ bool FileCache::tryReserve(
     if (cache_is_being_resized.load(std::memory_order_relaxed))
     {
         ProfileEvents::increment(ProfileEvents::FilesystemCacheFailToReserveSpaceBecauseOfCacheResize);
+        failure_reason = "cache is being resized";
         return false;
     }
 
@@ -824,6 +826,7 @@ bool FileCache::tryReserve(
     if (!cache_lock)
     {
         ProfileEvents::increment(ProfileEvents::FilesystemCacheFailToReserveSpaceBecauseOfLockContention);
+        failure_reason = "cache contention";
         return false;
     }
 
@@ -847,6 +850,7 @@ bool FileCache::tryReserve(
             LOG_TEST(log, "Query limit exceeded, space reservation failed, "
                      "recache_on_query_limit_exceeded is disabled (while reserving for {}:{})",
                      file_segment.key(), file_segment.offset());
+            failure_reason = "query limit exceeded";
             return false;
         }
 
@@ -877,6 +881,7 @@ bool FileCache::tryReserve(
         if (!query_priority->collectCandidatesForEviction(
                 size, required_elements_num, reserve_stat, eviction_candidates, {}, user.user_id, cache_lock))
         {
+            failure_reason = "cannot evict enough space for query limit";
             return false;
         }
 
@@ -891,11 +896,15 @@ bool FileCache::tryReserve(
     if (!main_priority->collectCandidatesForEviction(
             size, required_elements_num, reserve_stat, eviction_candidates, queue_iterator, user.user_id, cache_lock))
     {
+        failure_reason = "cannot evict enough space";
         return false;
     }
 
     if (!file_segment.getKeyMetadata()->createBaseDirectory())
+    {
+        failure_reason = "not enough space on device";
         return false;
+    }
 
     if (eviction_candidates.size() > 0)
     {
diff --git a/src/Interpreters/Cache/FileCache.h b/src/Interpreters/Cache/FileCache.h
index 07be802a940..efa504689eb 100644
--- a/src/Interpreters/Cache/FileCache.h
+++ b/src/Interpreters/Cache/FileCache.h
@@ -165,7 +165,8 @@ public:
         size_t size,
         FileCacheReserveStat & stat,
         const UserInfo & user,
-        size_t lock_wait_timeout_milliseconds);
+        size_t lock_wait_timeout_milliseconds,
+        std::string & failure_reason);
 
     std::vector<FileSegment::Info> getFileSegmentInfos(const UserID & user_id);
 
diff --git a/src/Interpreters/Cache/FileSegment.cpp b/src/Interpreters/Cache/FileSegment.cpp
index c46fb978ae4..cfbdfbaa257 100644
--- a/src/Interpreters/Cache/FileSegment.cpp
+++ b/src/Interpreters/Cache/FileSegment.cpp
@@ -502,7 +502,11 @@ LockedKeyPtr FileSegment::lockKeyMetadata(bool assert_exists) const
     return metadata->tryLock();
 }
 
-bool FileSegment::reserve(size_t size_to_reserve, size_t lock_wait_timeout_milliseconds, FileCacheReserveStat * reserve_stat)
+bool FileSegment::reserve(
+    size_t size_to_reserve,
+    size_t lock_wait_timeout_milliseconds,
+    std::string & failure_reason,
+    FileCacheReserveStat * reserve_stat)
 {
     if (!size_to_reserve)
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Zero space reservation is not allowed");
@@ -554,7 +558,7 @@ bool FileSegment::reserve(size_t size_to_reserve, size_t lock_wait_timeout_milli
     if (!reserve_stat)
         reserve_stat = &dummy_stat;
 
-    bool reserved = cache->tryReserve(*this, size_to_reserve, *reserve_stat, getKeyMetadata()->user, lock_wait_timeout_milliseconds);
+    bool reserved = cache->tryReserve(*this, size_to_reserve, *reserve_stat, getKeyMetadata()->user, lock_wait_timeout_milliseconds, failure_reason);
 
     if (!reserved)
         setDownloadFailedUnlocked(lock());
diff --git a/src/Interpreters/Cache/FileSegment.h b/src/Interpreters/Cache/FileSegment.h
index 25ffb880b45..e90ebdbf8fe 100644
--- a/src/Interpreters/Cache/FileSegment.h
+++ b/src/Interpreters/Cache/FileSegment.h
@@ -201,7 +201,11 @@ public:
 
     /// Try to reserve exactly `size` bytes (in addition to the getDownloadedSize() bytes already downloaded).
     /// Returns true if reservation was successful, false otherwise.
-    bool reserve(size_t size_to_reserve, size_t lock_wait_timeout_milliseconds, FileCacheReserveStat * reserve_stat = nullptr);
+    bool reserve(
+        size_t size_to_reserve,
+        size_t lock_wait_timeout_milliseconds,
+        std::string & failure_reason,
+        FileCacheReserveStat * reserve_stat = nullptr);
 
     /// Write data into reserved space.
     void write(char * from, size_t size, size_t offset_in_file);
diff --git a/src/Interpreters/Cache/WriteBufferToFileSegment.cpp b/src/Interpreters/Cache/WriteBufferToFileSegment.cpp
index e6ebf6ad50c..e43bbacdbc5 100644
--- a/src/Interpreters/Cache/WriteBufferToFileSegment.cpp
+++ b/src/Interpreters/Cache/WriteBufferToFileSegment.cpp
@@ -75,7 +75,8 @@ void WriteBufferToFileSegment::nextImpl()
     FileCacheReserveStat reserve_stat;
     /// In case of an error, we don't need to finalize the file segment
     /// because it will be deleted soon and completed in the holder's destructor.
-    bool ok = file_segment->reserve(bytes_to_write, reserve_space_lock_wait_timeout_milliseconds, &reserve_stat);
+    std::string failure_reason;
+    bool ok = file_segment->reserve(bytes_to_write, reserve_space_lock_wait_timeout_milliseconds, failure_reason, &reserve_stat);
 
     if (!ok)
     {
@@ -84,9 +85,10 @@ void WriteBufferToFileSegment::nextImpl()
             reserve_stat_msg += fmt::format("{} hold {}, can release {}; ",
                 toString(kind), ReadableSize(stat.non_releasable_size), ReadableSize(stat.releasable_size));
 
-        throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Failed to reserve {} bytes for {}: {}(segment info: {})",
+        throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Failed to reserve {} bytes for {}: reason {}, {}(segment info: {})",
             bytes_to_write,
             file_segment->getKind() == FileSegmentKind::Temporary ? "temporary file" : "the file in cache",
+            failure_reason,
             reserve_stat_msg,
             file_segment->getInfoForLog()
         );

From 4827b8bb1c7a77e50912ab40d5c009c43d20f6ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?=
 <benjamin.antal@clickhouse.com>
Date: Wed, 14 Aug 2024 14:56:02 +0000
Subject: [PATCH 64/88] Make S3Queue tests repeatable

---
 .../integration/test_storage_s3_queue/test.py | 97 ++++++++++++++-----
 1 file changed, 72 insertions(+), 25 deletions(-)

diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py
index ff723d0792a..08a8a7cac81 100644
--- a/tests/integration/test_storage_s3_queue/test.py
+++ b/tests/integration/test_storage_s3_queue/test.py
@@ -1,6 +1,7 @@
 import io
 import logging
 import random
+import string
 import time
 
 import pytest
@@ -267,6 +268,10 @@ def create_mv(
     )
 
 
+def generate_random_string(length=6):
+    return "".join(random.choice(string.ascii_lowercase) for i in range(length))
+
+
 @pytest.mark.parametrize("mode", ["unordered", "ordered"])
 @pytest.mark.parametrize("engine_name", ["S3Queue", "AzureQueue"])
 def test_delete_after_processing(started_cluster, mode, engine_name):
@@ -276,6 +281,8 @@ def test_delete_after_processing(started_cluster, mode, engine_name):
     files_path = f"{table_name}_data"
     files_num = 5
     row_num = 10
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     if engine_name == "S3Queue":
         storage = "s3"
     else:
@@ -290,7 +297,7 @@ def test_delete_after_processing(started_cluster, mode, engine_name):
         table_name,
         mode,
         files_path,
-        additional_settings={"after_processing": "delete"},
+        additional_settings={"after_processing": "delete", "keeper_path": keeper_path},
         engine_name=engine_name,
     )
     create_mv(node, table_name, dst_table_name)
@@ -333,7 +340,8 @@ def test_failed_retry(started_cluster, mode, engine_name):
     dst_table_name = f"{table_name}_dst"
     files_path = f"{table_name}_data"
     file_path = f"{files_path}/trash_test.csv"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     retries_num = 3
 
     values = [
@@ -391,7 +399,8 @@ def test_failed_retry(started_cluster, mode, engine_name):
 def test_direct_select_file(started_cluster, mode):
     node = started_cluster.instances["instance"]
     table_name = f"test.direct_select_file_{mode}"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     file_path = f"{files_path}/test.csv"
 
@@ -496,8 +505,17 @@ def test_direct_select_multiple_files(started_cluster, mode):
     node = started_cluster.instances["instance"]
     table_name = f"direct_select_multiple_files_{mode}"
     files_path = f"{table_name}_data"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
 
-    create_table(started_cluster, node, table_name, mode, files_path)
+    create_table(
+        started_cluster,
+        node,
+        table_name,
+        mode,
+        files_path,
+        additional_settings={"keeper_path": keeper_path},
+    )
     for i in range(5):
         rand_values = [[random.randint(0, 50) for _ in range(3)] for _ in range(10)]
         values_csv = (
@@ -520,14 +538,23 @@ def test_direct_select_multiple_files(started_cluster, mode):
 
 
 @pytest.mark.parametrize("mode", AVAILABLE_MODES)
-def test_streaming_to_view_(started_cluster, mode):
+def test_streaming_to_view(started_cluster, mode):
     node = started_cluster.instances["instance"]
     table_name = f"streaming_to_view_{mode}"
     dst_table_name = f"{table_name}_dst"
     files_path = f"{table_name}_data"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
 
     total_values = generate_random_files(started_cluster, files_path, 10)
-    create_table(started_cluster, node, table_name, mode, files_path)
+    create_table(
+        started_cluster,
+        node,
+        table_name,
+        mode,
+        files_path,
+        additional_settings={"keeper_path": keeper_path},
+    )
     create_mv(node, table_name, dst_table_name)
 
     expected_values = set([tuple(i) for i in total_values])
@@ -549,7 +576,8 @@ def test_streaming_to_many_views(started_cluster, mode):
     node = started_cluster.instances["instance"]
     table_name = f"streaming_to_many_views_{mode}"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
 
     for i in range(3):
@@ -587,7 +615,8 @@ def test_streaming_to_many_views(started_cluster, mode):
 def test_multiple_tables_meta_mismatch(started_cluster):
     node = started_cluster.instances["instance"]
     table_name = f"multiple_tables_meta_mismatch"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
 
     create_table(
@@ -680,7 +709,8 @@ def test_multiple_tables_streaming_sync(started_cluster, mode):
     node = started_cluster.instances["instance"]
     table_name = f"multiple_tables_streaming_sync_{mode}"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     files_to_generate = 300
 
@@ -763,7 +793,8 @@ def test_multiple_tables_streaming_sync_distributed(started_cluster, mode):
     node_2 = started_cluster.instances["instance2"]
     table_name = f"multiple_tables_streaming_sync_distributed_{mode}"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     files_to_generate = 300
     row_num = 50
@@ -838,7 +869,8 @@ def test_max_set_age(started_cluster):
     node = started_cluster.instances["instance"]
     table_name = "max_set_age"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     max_age = 20
     files_to_generate = 10
@@ -949,10 +981,9 @@ def test_max_set_age(started_cluster):
 def test_max_set_size(started_cluster):
     node = started_cluster.instances["instance"]
     table_name = f"max_set_size"
-    dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
-    max_age = 10
     files_to_generate = 10
 
     create_table(
@@ -996,7 +1027,8 @@ def test_drop_table(started_cluster):
     node = started_cluster.instances["instance"]
     table_name = f"test_drop"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     files_to_generate = 300
 
@@ -1029,6 +1061,8 @@ def test_s3_client_reused(started_cluster):
     table_name = f"test.test_s3_client_reused"
     dst_table_name = f"{table_name}_dst"
     files_path = f"{table_name}_data"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     row_num = 10
 
     def get_created_s3_clients_count():
@@ -1062,6 +1096,7 @@ def test_s3_client_reused(started_cluster):
         additional_settings={
             "after_processing": "delete",
             "s3queue_processing_threads_num": 1,
+            "keeper_path": keeper_path,
         },
         auth=NO_AUTH,
         bucket=started_cluster.minio_public_bucket,
@@ -1119,7 +1154,8 @@ def test_processing_threads(started_cluster, mode):
     node = started_cluster.instances["instance"]
     table_name = f"processing_threads_{mode}"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     files_to_generate = 300
     processing_threads = 32
@@ -1186,7 +1222,8 @@ def test_shards(started_cluster, mode, processing_threads):
     node = started_cluster.instances["instance"]
     table_name = f"test_shards_{mode}_{processing_threads}"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     files_to_generate = 300
     shards_num = 3
@@ -1313,7 +1350,8 @@ def test_shards_distributed(started_cluster, mode, processing_threads):
     node_2 = started_cluster.instances["instance2"]
     table_name = f"test_shards_distributed_{mode}_{processing_threads}"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     files_to_generate = 300
     row_num = 300
@@ -1466,8 +1504,8 @@ def test_settings_check(started_cluster):
     node = started_cluster.instances["instance"]
     node_2 = started_cluster.instances["instance2"]
     table_name = f"test_settings_check"
-    dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     mode = "ordered"
 
@@ -1509,7 +1547,10 @@ def test_processed_file_setting(started_cluster, processing_threads):
     node = started_cluster.instances["instance"]
     table_name = f"test_processed_file_setting_{processing_threads}"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}_{processing_threads}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = (
+        f"/clickhouse/test_{table_name}_{processing_threads}_{generate_random_string()}"
+    )
     files_path = f"{table_name}_data"
     files_to_generate = 10
 
@@ -1560,7 +1601,10 @@ def test_processed_file_setting_distributed(started_cluster, processing_threads)
     node_2 = started_cluster.instances["instance2"]
     table_name = f"test_processed_file_setting_distributed_{processing_threads}"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = (
+        f"/clickhouse/test_{table_name}_{processing_threads}_{generate_random_string()}"
+    )
     files_path = f"{table_name}_data"
     files_to_generate = 10
 
@@ -1614,7 +1658,8 @@ def test_upgrade(started_cluster):
 
     table_name = f"test_upgrade"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     files_to_generate = 10
 
@@ -1655,7 +1700,8 @@ def test_exception_during_insert(started_cluster):
 
     table_name = f"test_exception_during_insert"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     files_to_generate = 10
 
@@ -1708,7 +1754,8 @@ def test_commit_on_limit(started_cluster):
 
     table_name = f"test_commit_on_limit"
     dst_table_name = f"{table_name}_dst"
-    keeper_path = f"/clickhouse/test_{table_name}"
+    # A unique path is necessary for repeatable tests
+    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     files_to_generate = 10
 

From 2aef696856cbded795aedc23b7b8963799b8ebe6 Mon Sep 17 00:00:00 2001
From: serxa <sergei@clickhouse.com>
Date: Wed, 14 Aug 2024 15:27:37 +0000
Subject: [PATCH 65/88] make test runnable multiple times

---
 .../test_delayed_replica_failover/test.py     | 27 ++++++++++++++-----
 1 file changed, 20 insertions(+), 7 deletions(-)

diff --git a/tests/integration/test_delayed_replica_failover/test.py b/tests/integration/test_delayed_replica_failover/test.py
index 1116d225b8c..ed63a47e030 100644
--- a/tests/integration/test_delayed_replica_failover/test.py
+++ b/tests/integration/test_delayed_replica_failover/test.py
@@ -20,21 +20,29 @@ node_1_2 = cluster.add_instance("node_1_2", with_zookeeper=True)
 node_2_1 = cluster.add_instance("node_2_1", with_zookeeper=True)
 node_2_2 = cluster.add_instance("node_2_2", with_zookeeper=True)
 
+# For test to be runnable multiple times
+seqno = 0
 
 @pytest.fixture(scope="module")
 def started_cluster():
     try:
         cluster.start()
+        yield cluster
+    finally:
+        cluster.shutdown()
 
+
+@pytest.fixture(scope="function", autouse=True)
+def create_tables():
+    global seqno
+    try:
+        seqno += 1
         for shard in (1, 2):
             for replica in (1, 2):
                 node = cluster.instances["node_{}_{}".format(shard, replica)]
                 node.query(
-                    """
-CREATE TABLE replicated (d Date, x UInt32) ENGINE =
-    ReplicatedMergeTree('/clickhouse/tables/{shard}/replicated', '{instance}') PARTITION BY toYYYYMM(d) ORDER BY d""".format(
-                        shard=shard, instance=node.name
-                    )
+                    f"CREATE TABLE replicated (d Date, x UInt32) ENGINE = "
+                    f"ReplicatedMergeTree('/clickhouse/tables/{shard}/replicated_{seqno}', '{node.name}') PARTITION BY toYYYYMM(d) ORDER BY d"
                 )
 
         node_1_1.query(
@@ -42,10 +50,15 @@ CREATE TABLE replicated (d Date, x UInt32) ENGINE =
             "Distributed('test_cluster', 'default', 'replicated')"
         )
 
-        yield cluster
+        yield
 
     finally:
-        cluster.shutdown()
+        node_1_1.query("DROP TABLE distributed")
+
+        node_1_1.query("DROP TABLE replicated")
+        node_1_2.query("DROP TABLE replicated")
+        node_2_1.query("DROP TABLE replicated")
+        node_2_2.query("DROP TABLE replicated")
 
 
 def test(started_cluster):

From 537f3bcd76fbbbfb1ee4b7b718b0630fad95509c Mon Sep 17 00:00:00 2001
From: robot-clickhouse <robot-clickhouse@users.noreply.github.com>
Date: Wed, 14 Aug 2024 16:13:17 +0000
Subject: [PATCH 66/88] Automatic style fix

---
 tests/integration/helpers/cluster.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py
index a97d0f9c340..0b6cf03d467 100644
--- a/tests/integration/helpers/cluster.py
+++ b/tests/integration/helpers/cluster.py
@@ -2694,18 +2694,30 @@ class ClickHouseCluster:
                     connection_string
                 )
                 logging.debug(blob_service_client.get_account_information())
-                containers = [c for c in blob_service_client.list_containers(name_starts_with=self.azurite_container) if c.name == self.azurite_container]
+                containers = [
+                    c
+                    for c in blob_service_client.list_containers(
+                        name_starts_with=self.azurite_container
+                    )
+                    if c.name == self.azurite_container
+                ]
                 if len(containers) > 0:
                     for c in containers:
                         blob_service_client.delete_container(c)
 
-                container_client = blob_service_client.get_container_client(self.azurite_container)
+                container_client = blob_service_client.get_container_client(
+                    self.azurite_container
+                )
                 if container_client.exists():
-                    logging.debug(f"azurite container '{self.azurite_container}' exist, deleting all blobs")
+                    logging.debug(
+                        f"azurite container '{self.azurite_container}' exist, deleting all blobs"
+                    )
                     for b in container_client.list_blobs():
                         container_client.delete_blob(b.name)
                 else:
-                    logging.debug(f"azurite container '{self.azurite_container}' doesn't exist, creating it")
+                    logging.debug(
+                        f"azurite container '{self.azurite_container}' doesn't exist, creating it"
+                    )
                     container_client.create_container()
 
                 self.blob_service_client = blob_service_client

From 8e6096dee72acc5ee75eb05ddfb9384767f52648 Mon Sep 17 00:00:00 2001
From: robot-clickhouse <robot-clickhouse@users.noreply.github.com>
Date: Wed, 14 Aug 2024 16:13:22 +0000
Subject: [PATCH 67/88] Automatic style fix

---
 tests/integration/test_delayed_replica_failover/test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration/test_delayed_replica_failover/test.py b/tests/integration/test_delayed_replica_failover/test.py
index ed63a47e030..f1034e26b25 100644
--- a/tests/integration/test_delayed_replica_failover/test.py
+++ b/tests/integration/test_delayed_replica_failover/test.py
@@ -23,6 +23,7 @@ node_2_2 = cluster.add_instance("node_2_2", with_zookeeper=True)
 # For test to be runnable multiple times
 seqno = 0
 
+
 @pytest.fixture(scope="module")
 def started_cluster():
     try:

From cf58e8c1e37dda01b73b0cbd2553e3b460aa28ad Mon Sep 17 00:00:00 2001
From: serxa <sergei@clickhouse.com>
Date: Wed, 14 Aug 2024 16:35:45 +0000
Subject: [PATCH 68/88] fix data race in
 `DynamicResourceManager::updateConfiguration`

---
 .../Nodes/DynamicResourceManager.cpp          | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/Common/Scheduler/Nodes/DynamicResourceManager.cpp b/src/Common/Scheduler/Nodes/DynamicResourceManager.cpp
index 01aa7df48d3..6b9f6318903 100644
--- a/src/Common/Scheduler/Nodes/DynamicResourceManager.cpp
+++ b/src/Common/Scheduler/Nodes/DynamicResourceManager.cpp
@@ -184,14 +184,20 @@ void DynamicResourceManager::updateConfiguration(const Poco::Util::AbstractConfi
 
     // Resource update leads to loss of runtime data of nodes and may lead to temporary violation of constraints (e.g. limits)
     // Try to minimise this by reusing "equal" resources (initialized with the same configuration).
+    std::vector<State::ResourcePtr> resources_to_attach;
     for (auto & [name, new_resource] : new_state->resources)
     {
         if (auto iter = state->resources.find(name); iter != state->resources.end()) // Resource update
         {
             State::ResourcePtr old_resource = iter->second;
             if (old_resource->equals(*new_resource))
+            {
                 new_resource = old_resource; // Rewrite with older version to avoid loss of runtime data
+                continue;
+            }
         }
+        // It is new or updated resource
+        resources_to_attach.emplace_back(new_resource);
     }
 
     // Commit new state
@@ -199,17 +205,14 @@ void DynamicResourceManager::updateConfiguration(const Poco::Util::AbstractConfi
     state = new_state;
 
     // Attach new and updated resources to the scheduler
-    for (auto & [name, resource] : new_state->resources)
+    for (auto & resource : resources_to_attach)
     {
         const SchedulerNodePtr & root = resource->nodes.find("/")->second.ptr;
-        if (root->parent == nullptr)
+        resource->attached_to = &scheduler;
+        scheduler.event_queue->enqueue([this, root]
         {
-            resource->attached_to = &scheduler;
-            scheduler.event_queue->enqueue([this, root]
-            {
-                scheduler.attachChild(root);
-            });
-        }
+            scheduler.attachChild(root);
+        });
     }
 
     // NOTE: after mutex unlock `state` became available for Classifier(s) and must be immutable

From 61b96ed7498a19c315d03d6d23330c06837dc990 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?=
 <benjamin.antal@clickhouse.com>
Date: Wed, 14 Aug 2024 16:43:12 +0000
Subject: [PATCH 69/88] Make rest of the tests repeatable

---
 .../integration/test_storage_s3_queue/test.py | 77 ++++++++++++-------
 1 file changed, 48 insertions(+), 29 deletions(-)

diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py
index 08a8a7cac81..664d537a8d1 100644
--- a/tests/integration/test_storage_s3_queue/test.py
+++ b/tests/integration/test_storage_s3_queue/test.py
@@ -68,8 +68,8 @@ def s3_queue_setup_teardown(started_cluster):
     instance = started_cluster.instances["instance"]
     instance_2 = started_cluster.instances["instance2"]
 
-    instance.query("DROP DATABASE IF EXISTS test; CREATE DATABASE test;")
-    instance_2.query("DROP DATABASE IF EXISTS test; CREATE DATABASE test;")
+    instance.query("DROP DATABASE IF EXISTS default; CREATE DATABASE default;")
+    instance_2.query("DROP DATABASE IF EXISTS default; CREATE DATABASE default;")
 
     minio = started_cluster.minio_client
     objects = list(minio.list_objects(started_cluster.minio_bucket, recursive=True))
@@ -276,7 +276,7 @@ def generate_random_string(length=6):
 @pytest.mark.parametrize("engine_name", ["S3Queue", "AzureQueue"])
 def test_delete_after_processing(started_cluster, mode, engine_name):
     node = started_cluster.instances["instance"]
-    table_name = f"test.delete_after_processing_{mode}_{engine_name}"
+    table_name = f"delete_after_processing_{mode}_{engine_name}"
     dst_table_name = f"{table_name}_dst"
     files_path = f"{table_name}_data"
     files_num = 5
@@ -336,7 +336,7 @@ def test_delete_after_processing(started_cluster, mode, engine_name):
 @pytest.mark.parametrize("engine_name", ["S3Queue", "AzureQueue"])
 def test_failed_retry(started_cluster, mode, engine_name):
     node = started_cluster.instances["instance"]
-    table_name = f"test.failed_retry_{mode}_{engine_name}"
+    table_name = f"failed_retry_{mode}_{engine_name}"
     dst_table_name = f"{table_name}_dst"
     files_path = f"{table_name}_data"
     file_path = f"{files_path}/trash_test.csv"
@@ -398,9 +398,9 @@ def test_failed_retry(started_cluster, mode, engine_name):
 @pytest.mark.parametrize("mode", AVAILABLE_MODES)
 def test_direct_select_file(started_cluster, mode):
     node = started_cluster.instances["instance"]
-    table_name = f"test.direct_select_file_{mode}"
+    table_name = f"direct_select_file_{mode}"
     # A unique path is necessary for repeatable tests
-    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
+    keeper_path = f"/clickhouse/test_{table_name}_{mode}_{generate_random_string()}"
     files_path = f"{table_name}_data"
     file_path = f"{files_path}/test.csv"
 
@@ -461,7 +461,7 @@ def test_direct_select_file(started_cluster, mode):
     ] == []
 
     # New table with different zookeeper path
-    keeper_path = f"/clickhouse/test_{table_name}_{mode}_2"
+    keeper_path = f"{keeper_path}_2"
     create_table(
         started_cluster,
         node,
@@ -791,10 +791,12 @@ def test_multiple_tables_streaming_sync(started_cluster, mode):
 def test_multiple_tables_streaming_sync_distributed(started_cluster, mode):
     node = started_cluster.instances["instance"]
     node_2 = started_cluster.instances["instance2"]
-    table_name = f"multiple_tables_streaming_sync_distributed_{mode}"
+    # A unique table name is necessary for repeatable tests
+    table_name = (
+        f"multiple_tables_streaming_sync_distributed_{mode}_{generate_random_string()}"
+    )
     dst_table_name = f"{table_name}_dst"
-    # A unique path is necessary for repeatable tests
-    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
+    keeper_path = f"/clickhouse/test_{table_name}"
     files_path = f"{table_name}_data"
     files_to_generate = 300
     row_num = 50
@@ -1058,7 +1060,7 @@ def test_drop_table(started_cluster):
 
 def test_s3_client_reused(started_cluster):
     node = started_cluster.instances["instance"]
-    table_name = f"test.test_s3_client_reused"
+    table_name = f"test_s3_client_reused"
     dst_table_name = f"{table_name}_dst"
     files_path = f"{table_name}_data"
     # A unique path is necessary for repeatable tests
@@ -1698,10 +1700,10 @@ def test_upgrade(started_cluster):
 def test_exception_during_insert(started_cluster):
     node = started_cluster.instances["instance_too_many_parts"]
 
-    table_name = f"test_exception_during_insert"
+    # A unique table name is necessary for repeatable tests
+    table_name = f"test_exception_during_insert_{generate_random_string()}"
     dst_table_name = f"{table_name}_dst"
-    # A unique path is necessary for repeatable tests
-    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
+    keeper_path = f"/clickhouse/test_{table_name}"
     files_path = f"{table_name}_data"
     files_to_generate = 10
 
@@ -1715,6 +1717,7 @@ def test_exception_during_insert(started_cluster):
             "keeper_path": keeper_path,
         },
     )
+    node.rotate_logs()
     total_values = generate_random_files(
         started_cluster, files_path, files_to_generate, start_ind=0, row_num=1
     )
@@ -1731,34 +1734,49 @@ def test_exception_during_insert(started_cluster):
     )
     assert "Too many parts" in exception
 
+    original_parts_to_throw_insert = 0
+    modified_parts_to_throw_insert = 10
     node.replace_in_config(
         "/etc/clickhouse-server/config.d/merge_tree.xml",
-        "parts_to_throw_insert>0",
-        "parts_to_throw_insert>10",
+        f"parts_to_throw_insert>{original_parts_to_throw_insert}",
+        f"parts_to_throw_insert>{modified_parts_to_throw_insert}",
     )
-    node.restart_clickhouse()
+    try:
+        node.restart_clickhouse()
 
-    def get_count():
-        return int(node.query(f"SELECT count() FROM {dst_table_name}"))
+        def get_count():
+            return int(node.query(f"SELECT count() FROM {dst_table_name}"))
 
-    expected_rows = 10
-    for _ in range(20):
-        if expected_rows == get_count():
-            break
-        time.sleep(1)
-    assert expected_rows == get_count()
+        expected_rows = 10
+        for _ in range(20):
+            if expected_rows == get_count():
+                break
+            time.sleep(1)
+        assert expected_rows == get_count()
+    finally:
+        node.replace_in_config(
+            "/etc/clickhouse-server/config.d/merge_tree.xml",
+            f"parts_to_throw_insert>{modified_parts_to_throw_insert}",
+            f"parts_to_throw_insert>{original_parts_to_throw_insert}",
+        )
+        node.restart_clickhouse()
 
 
 def test_commit_on_limit(started_cluster):
     node = started_cluster.instances["instance"]
 
-    table_name = f"test_commit_on_limit"
+    # A unique table name is necessary for repeatable tests
+    table_name = f"test_commit_on_limit_{generate_random_string()}"
     dst_table_name = f"{table_name}_dst"
-    # A unique path is necessary for repeatable tests
-    keeper_path = f"/clickhouse/test_{table_name}_{generate_random_string()}"
+    keeper_path = f"/clickhouse/test_{table_name}"
     files_path = f"{table_name}_data"
     files_to_generate = 10
 
+    failed_files_event_before = int(
+        node.query(
+            "SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"
+        )
+    )
     create_table(
         started_cluster,
         node,
@@ -1833,7 +1851,8 @@ def test_commit_on_limit(started_cluster):
 
     assert "test_999999.csv" in get_processed_files()
 
-    assert 1 == int(
+    node.count_in_log(f"Setting file {files_path}/test_9999.csv as failed")
+    assert failed_files_event_before + 1 == int(
         node.query(
             "SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"
         )

From 7b1bca2b488685e3953c9b2950d788d565bff73d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?=
 <benjamin.antal@clickhouse.com>
Date: Wed, 14 Aug 2024 16:55:59 +0000
Subject: [PATCH 70/88] Add missing assertion

---
 tests/integration/test_storage_s3_queue/test.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py
index 664d537a8d1..34fb1eaf1fe 100644
--- a/tests/integration/test_storage_s3_queue/test.py
+++ b/tests/integration/test_storage_s3_queue/test.py
@@ -1851,7 +1851,9 @@ def test_commit_on_limit(started_cluster):
 
     assert "test_999999.csv" in get_processed_files()
 
-    node.count_in_log(f"Setting file {files_path}/test_9999.csv as failed")
+    assert 1 == int(
+        node.count_in_log(f"Setting file {files_path}/test_9999.csv as failed")
+    )
     assert failed_files_event_before + 1 == int(
         node.query(
             "SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"

From 209d4eb016a58acb27826ce96e61db884490b66f Mon Sep 17 00:00:00 2001
From: kssenii <sumarokovakseniia@mail.ru>
Date: Wed, 14 Aug 2024 19:00:17 +0200
Subject: [PATCH 71/88] Fix build

---
 src/Interpreters/Cache/Metadata.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Interpreters/Cache/Metadata.cpp b/src/Interpreters/Cache/Metadata.cpp
index 7e4b76d3cc6..6399691bcf6 100644
--- a/src/Interpreters/Cache/Metadata.cpp
+++ b/src/Interpreters/Cache/Metadata.cpp
@@ -705,7 +705,8 @@ void CacheMetadata::downloadImpl(FileSegment & file_segment, std::optional<Memor
     {
         auto size = reader->available();
 
-        if (!file_segment.reserve(size, reserve_space_lock_wait_timeout_milliseconds))
+        std::string failure_reason;
+        if (!file_segment.reserve(size, reserve_space_lock_wait_timeout_milliseconds, failure_reason))
         {
             LOG_TEST(
                 log, "Failed to reserve space during background download "

From 2e5f45a7ad4924affb1ff8b0e5a40b59b6549621 Mon Sep 17 00:00:00 2001
From: Michael Stetsyuk <michael.stetsyuk@clickhouse.com>
Date: Wed, 14 Aug 2024 17:53:33 +0000
Subject: [PATCH 72/88] rename: S3DiskNoKeyErrors -> DiskS3NoSuchKeyErrors

---
 src/Common/CurrentMetrics.cpp                             | 2 +-
 src/IO/S3/Client.cpp                                      | 4 ++--
 tests/integration/test_checking_s3_blobs_paranoid/test.py | 2 +-
 tests/integration/test_storage_delta/test.py              | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp
index b6dd14d292c..67890568941 100644
--- a/src/Common/CurrentMetrics.cpp
+++ b/src/Common/CurrentMetrics.cpp
@@ -307,7 +307,7 @@
     M(FilteringMarksWithPrimaryKey, "Number of threads currently doing filtering of mark ranges by the primary key") \
     M(FilteringMarksWithSecondaryKeys, "Number of threads currently doing filtering of mark ranges by secondary keys") \
     \
-    M(S3DiskNoKeyErrors, "The number of `NoSuchKey` errors that occur when reading data from S3 cloud storage through ClickHouse disks.") \
+    M(DiskS3NoSuchKeyErrors, "The number of `NoSuchKey` errors that occur when reading data from S3 cloud storage through ClickHouse disks.") \
 
 #ifdef APPLY_FOR_EXTERNAL_METRICS
     #define APPLY_FOR_METRICS(M) APPLY_FOR_BUILTIN_METRICS(M) APPLY_FOR_EXTERNAL_METRICS(M)
diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp
index a966e370ca1..8338a235387 100644
--- a/src/IO/S3/Client.cpp
+++ b/src/IO/S3/Client.cpp
@@ -46,7 +46,7 @@ namespace ProfileEvents
 
 namespace CurrentMetrics
 {
-    extern const Metric S3DiskNoKeyErrors;
+    extern const Metric DiskS3NoSuchKeyErrors;
 }
 
 namespace DB
@@ -701,7 +701,7 @@ RequestResult Client::processRequestResult(RequestResult && outcome) const
         return std::forward<RequestResult>(outcome);
 
     if (outcome.GetError().GetErrorType() == Aws::S3::S3Errors::NO_SUCH_KEY)
-        CurrentMetrics::add(CurrentMetrics::S3DiskNoKeyErrors);
+        CurrentMetrics::add(CurrentMetrics::DiskS3NoSuchKeyErrors);
 
     String enriched_message = fmt::format(
         "{} {}",
diff --git a/tests/integration/test_checking_s3_blobs_paranoid/test.py b/tests/integration/test_checking_s3_blobs_paranoid/test.py
index 73f2888ce00..76a0f30f82e 100644
--- a/tests/integration/test_checking_s3_blobs_paranoid/test.py
+++ b/tests/integration/test_checking_s3_blobs_paranoid/test.py
@@ -708,7 +708,7 @@ def test_no_key_found_disk(cluster, broken_s3):
             """
             SELECT value
             FROM system.metrics
-            WHERE metric = 'S3DiskNoKeyErrors'
+            WHERE metric = 'DiskS3NoSuchKeyErrors'
             """
         ).strip()
     )
diff --git a/tests/integration/test_storage_delta/test.py b/tests/integration/test_storage_delta/test.py
index 054b79ff6fe..a595d01e6b3 100644
--- a/tests/integration/test_storage_delta/test.py
+++ b/tests/integration/test_storage_delta/test.py
@@ -464,7 +464,7 @@ def test_restart_broken(started_cluster):
             """
             SELECT value
             FROM system.metrics
-            WHERE metric = 'S3DiskNoKeyErrors'
+            WHERE metric = 'DiskS3NoSuchKeyErrors'
             """
         ).strip()
     )

From 5e037e5ba852ebd1984d957f21a4925fea8de2ff Mon Sep 17 00:00:00 2001
From: Max Kainov <maxkaynov@gmail.com>
Date: Wed, 14 Aug 2024 20:45:50 +0200
Subject: [PATCH 73/88] CI: Minor fixes for changelog and release exceptions

---
 .github/workflows/create_release.yml | 1 +
 tests/ci/changelog.py                | 2 --
 tests/ci/create_release.py           | 2 +-
 3 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml
index 73613c65266..eb16c25f604 100644
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@@ -101,6 +101,7 @@ jobs:
             --volume=".:/wd" --workdir="/wd" \
             clickhouse/style-test \
             ./tests/ci/changelog.py -v --debug-helpers \
+            --gh-user-or-token ${{ secrets.ROBOT_CLICKHOUSE_COMMIT_TOKEN }} \
             --jobs=5 \
             --output="./docs/changelogs/${{ env.RELEASE_TAG }}.md" ${{ env.RELEASE_TAG }}
           git add ./docs/changelogs/${{ env.RELEASE_TAG }}.md
diff --git a/tests/ci/changelog.py b/tests/ci/changelog.py
index b7f73f22016..554ba339892 100755
--- a/tests/ci/changelog.py
+++ b/tests/ci/changelog.py
@@ -19,7 +19,6 @@ from env_helper import TEMP_PATH
 from git_helper import git_runner, is_shallow
 from github_helper import GitHub, PullRequest, PullRequests, Repository
 from s3_helper import S3Helper
-from get_robot_token import get_best_robot_token
 from ci_utils import Shell
 from version_helper import (
     FILE_WITH_VERSION_PATH,
@@ -172,7 +171,6 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--gh-user-or-token",
         help="user name or GH token to authenticate",
-        default=get_best_robot_token(),
     )
     parser.add_argument(
         "--gh-password",
diff --git a/tests/ci/create_release.py b/tests/ci/create_release.py
index b5ea61e1952..68268b033fe 100755
--- a/tests/ci/create_release.py
+++ b/tests/ci/create_release.py
@@ -484,7 +484,7 @@ class ReleaseInfo:
             )
         else:
             if not dry_run:
-                assert not self.changelog_pr
+                assert not self.version_bump_pr
 
         self.prs_merged = res
 

From aa38024b0e0cdc4a839446df2e7de974efc6b7e7 Mon Sep 17 00:00:00 2001
From: Igor Nikonov <igor@clickhouse.com>
Date: Wed, 14 Aug 2024 20:59:08 +0000
Subject: [PATCH 74/88] Fix UBSan: lower upper bound for
 min_marks_for_concurrent_read

---
 src/Processors/QueryPlan/ReadFromMergeTree.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
index 3ece7b1c5c8..734e67bda24 100644
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@@ -353,10 +353,11 @@ Pipe ReadFromMergeTree::readFromPoolParallelReplicas(
     /// We have a special logic for local replica. It has to read less data, because in some cases it should
     /// merge states of aggregate functions or do some other important stuff other than reading from Disk.
     auto multiplier = context->getSettingsRef().parallel_replicas_single_task_marks_count_multiplier;
-    if (pool_settings.min_marks_for_concurrent_read > std::numeric_limits<Int64>::max())
+    const auto min_marks_for_concurrent_read_limit = std::numeric_limits<Int64>::max() >> 1;
+    if (pool_settings.min_marks_for_concurrent_read > min_marks_for_concurrent_read_limit)
     {
         /// limit min marks to read in case it's big, happened in test since due to settings randomzation
-        pool_settings.min_marks_for_concurrent_read = std::numeric_limits<Int64>::max();
+        pool_settings.min_marks_for_concurrent_read = min_marks_for_concurrent_read_limit;
         multiplier = 1.0f;
     }
 
@@ -529,10 +530,11 @@ Pipe ReadFromMergeTree::readInOrder(
         };
 
         auto multiplier = context->getSettingsRef().parallel_replicas_single_task_marks_count_multiplier;
-        if (pool_settings.min_marks_for_concurrent_read > std::numeric_limits<Int64>::max())
+        const auto min_marks_for_concurrent_read_limit = std::numeric_limits<Int64>::max() >> 1;
+        if (pool_settings.min_marks_for_concurrent_read > min_marks_for_concurrent_read_limit)
         {
             /// limit min marks to read in case it's big, happened in test since due to settings randomzation
-            pool_settings.min_marks_for_concurrent_read = std::numeric_limits<Int64>::max();
+            pool_settings.min_marks_for_concurrent_read = min_marks_for_concurrent_read_limit;
             multiplier = 1.0f;
         }
 

From b077f2cc9c11b01c443eb1ff976457965f7297ee Mon Sep 17 00:00:00 2001
From: maxvostrikov <max.vostrikov@clickhouse.com>
Date: Thu, 15 Aug 2024 02:31:10 +0200
Subject: [PATCH 75/88] Add performance comparison test for
 output_format_parquet_write_page_index setting

---
 tests/performance/parquet_read_with_index.xml | 30 +++++++++++++++++++
 1 file changed, 30 insertions(+)
 create mode 100644 tests/performance/parquet_read_with_index.xml

diff --git a/tests/performance/parquet_read_with_index.xml b/tests/performance/parquet_read_with_index.xml
new file mode 100644
index 00000000000..1bb2d8eb4a2
--- /dev/null
+++ b/tests/performance/parquet_read_with_index.xml
@@ -0,0 +1,30 @@
+<test>
+    <fill_query>
+        INSERT INTO FUNCTION file('test_pq_index', Parquet) SELECT * FROM generateRandom('int64_column Nullable(Int64), tuple_column Tuple(a Nullable(String), b Nullable(Float64), c Tuple(i UInt32, j UInt32)),array_tuple_column Array(Tuple(a Nullable(String), b Nullable(Float64), c Nullable(Int64))), map_tuple_column Map(String, Tuple(a Nullable(String), b Nullable(Float64), c Nullable(Int64)))') limit 1000000 SETTINGS output_format_parquet_use_custom_encoder=false, output_format_parquet_write_page_index=true
+    </fill_query>
+
+    <query>
+        SELECT * FROM file('test_pq_index', Parquet, 'tuple_column Tuple(a Nullable(String))') Format Null
+    </query>
+
+    <query>
+        SELECT tuple_column.a FROM file('test_pq_index', Parquet) Format Null
+    </query>
+
+    <query>
+        SELECT tuple_column.a FROM file('test_pq_index', Parquet, 'tuple_column Tuple(a Nullable(String))') Format Null
+    </query>
+
+    <query>
+        SELECT tuple_column.c.i FROM file('test_pq_index', Parquet) Format Null
+    </query>
+
+    <query>
+        SELECT * FROM file('test_pq_index', Parquet, 'array_tuple_column Array (Tuple(a Nullable(String)))') Format Null
+    </query>
+
+    <query>
+        SELECT * FROM file('test_pq_index', Parquet, 'map_tuple_column Map(String, Tuple(a Nullable(String)))') Format Null
+    </query>
+
+</test>

From 690c4d0803366a3a6fe1887e5c01b35d80f501f9 Mon Sep 17 00:00:00 2001
From: lgbo-ustc <lgbo.ustc@gmail.com>
Date: Thu, 15 Aug 2024 09:04:22 +0800
Subject: [PATCH 76/88] update

---
 src/Processors/Transforms/WindowTransform.cpp | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp
index cae817380e0..bd11aa4cd28 100644
--- a/src/Processors/Transforms/WindowTransform.cpp
+++ b/src/Processors/Transforms/WindowTransform.cpp
@@ -1157,14 +1157,7 @@ void WindowTransform::appendChunk(Chunk & chunk)
         // Initialize output columns.
         for (auto & ws : workspaces)
         {
-            if (ws.window_function_impl)
-                block.casted_columns.push_back(ws.window_function_impl->castColumn(block.input_columns, ws.argument_column_indices));
-            else
-            {
-                /// `castColumn` returns nullptr at default, so it's OK to put nullptr as a placeholder here
-                /// it should not be used in fact.
-                block.casted_columns.push_back(nullptr);
-            }
+            block.casted_columns.push_back(ws.window_function_impl ? ws.window_function_impl->castColumn(block.input_columns, ws.argument_column_indices) : nullptr);
 
             block.output_columns.push_back(ws.aggregate_function->getResultType()
                 ->createColumn());

From 0bb076a4d381fcc4e9827bebedbbe46ded9b9278 Mon Sep 17 00:00:00 2001
From: avogar <avogar@clickhouse.com>
Date: Thu, 15 Aug 2024 08:08:00 +0000
Subject: [PATCH 77/88] Improve schema inference of date times

---
 src/Core/Settings.h                           |   1 +
 src/Core/SettingsChangesHistory.cpp           |   1 +
 src/Formats/EscapingRuleUtils.cpp             |   5 +-
 src/Formats/FormatFactory.cpp                 |   1 +
 src/Formats/FormatSettings.h                  |   1 +
 src/Formats/SchemaInferenceUtils.cpp          | 119 +++++---
 src/IO/ReadHelpers.cpp                        |  29 +-
 src/IO/ReadHelpers.h                          |  77 +++--
 src/IO/parseDateTimeBestEffort.cpp            |  77 ++++-
 src/IO/parseDateTimeBestEffort.h              |   8 +
 .../03222_date_time_inference.reference       | 253 +++++++++++++++++
 .../0_stateless/03222_date_time_inference.sql | 268 ++++++++++++++++++
 12 files changed, 761 insertions(+), 79 deletions(-)
 create mode 100644 tests/queries/0_stateless/03222_date_time_inference.reference
 create mode 100644 tests/queries/0_stateless/03222_date_time_inference.sql

diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 0808e8eb49f..ad6cc89c5cd 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -1136,6 +1136,7 @@ class IColumn;
     M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
     M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
     M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \
+    M(Bool, input_format_try_infer_datetimes_only_datetime64, false, "When input_format_try_infer_datetimes is enabled, infer only DateTime64 but not DateTime types", 0) \
     M(Bool, input_format_try_infer_exponent_floats, false, "Try to infer floats in exponential notation while schema inference in text formats (except JSON, where exponent numbers are always inferred)", 0) \
     M(Bool, output_format_markdown_escape_special_characters, false, "Escape special characters in Markdown", 0) \
     M(Bool, input_format_protobuf_flatten_google_wrappers, false, "Enable Google wrappers for regular non-nested columns, e.g. google.protobuf.StringValue 'str' for String column 'str'. For Nullable columns empty wrappers are recognized as defaults, and missing as nulls", 0) \
diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp
index 20a8721c10e..b344a141a46 100644
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@@ -88,6 +88,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
             {"enable_analyzer", 1, 1, "Added an alias to a setting `allow_experimental_analyzer`."},
             {"optimize_functions_to_subcolumns", false, true, "Enabled settings by default"},
             {"allow_experimental_vector_similarity_index", false, false, "Added new setting to allow experimental vector similarity indexes"},
+            {"input_format_try_infer_datetimes_only_datetime64", true, false, "Allow to infer DateTime instead of DateTime64 in data formats"}
         }
     },
     {"24.7",
diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp
index 58407a810c5..e7d9be39ec9 100644
--- a/src/Formats/EscapingRuleUtils.cpp
+++ b/src/Formats/EscapingRuleUtils.cpp
@@ -419,10 +419,11 @@ String getAdditionalFormatInfoByEscapingRule(const FormatSettings & settings, Fo
     String result = getAdditionalFormatInfoForAllRowBasedFormats(settings);
     /// First, settings that are common for all text formats:
     result += fmt::format(
-        ", try_infer_integers={}, try_infer_dates={}, try_infer_datetimes={}",
+        ", try_infer_integers={}, try_infer_dates={}, try_infer_datetimes={}, try_infer_datetimes_only_datetime64={}",
         settings.try_infer_integers,
         settings.try_infer_dates,
-        settings.try_infer_datetimes);
+        settings.try_infer_datetimes,
+        settings.try_infer_datetimes_only_datetime64);
 
     /// Second, format-specific settings:
     switch (escaping_rule)
diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp
index a78836ff63c..da57c59bdfc 100644
--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@@ -266,6 +266,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
     format_settings.try_infer_integers = settings.input_format_try_infer_integers;
     format_settings.try_infer_dates = settings.input_format_try_infer_dates;
     format_settings.try_infer_datetimes = settings.input_format_try_infer_datetimes;
+    format_settings.try_infer_datetimes_only_datetime64 = settings.input_format_try_infer_datetimes_only_datetime64;
     format_settings.try_infer_exponent_floats = settings.input_format_try_infer_exponent_floats;
     format_settings.markdown.escape_special_characters = settings.output_format_markdown_escape_special_characters;
     format_settings.bson.output_string_as_string = settings.output_format_bson_string_as_string;
diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h
index f0359218775..3970c776ad2 100644
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@@ -46,6 +46,7 @@ struct FormatSettings
     bool try_infer_integers = true;
     bool try_infer_dates = true;
     bool try_infer_datetimes = true;
+    bool try_infer_datetimes_only_datetime64 = false;
     bool try_infer_exponent_floats = false;
 
     enum class DateTimeInputFormat : uint8_t
diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp
index 3c374ada9e6..5bd41e33f58 100644
--- a/src/Formats/SchemaInferenceUtils.cpp
+++ b/src/Formats/SchemaInferenceUtils.cpp
@@ -306,37 +306,45 @@ namespace
         type_indexes.erase(TypeIndex::UInt64);
     }
 
-    /// If we have only Date and DateTime types, convert Date to DateTime,
-    /// otherwise, convert all Date and DateTime to String.
+    /// If we have only date/datetimes types (Date/DateTime/DateTime64), convert all of them to the common type,
+    /// otherwise, convert all Date, DateTime and DateTime64 to String.
     void transformDatesAndDateTimes(DataTypes & data_types, TypeIndexesSet & type_indexes)
     {
         bool have_dates = type_indexes.contains(TypeIndex::Date);
-        bool have_datetimes = type_indexes.contains(TypeIndex::DateTime64);
-        bool all_dates_or_datetimes = (type_indexes.size() == (static_cast<size_t>(have_dates) + static_cast<size_t>(have_datetimes)));
+        bool have_datetimes = type_indexes.contains(TypeIndex::DateTime);
+        bool have_datetimes64 = type_indexes.contains(TypeIndex::DateTime64);
+        bool all_dates_or_datetimes = (type_indexes.size() == (static_cast<size_t>(have_dates) + static_cast<size_t>(have_datetimes) + static_cast<size_t>(have_datetimes64)));
 
-        if (!all_dates_or_datetimes && (have_dates || have_datetimes))
+        if (!all_dates_or_datetimes && (have_dates || have_datetimes || have_datetimes64))
         {
             for (auto & type : data_types)
             {
-                if (isDate(type) || isDateTime64(type))
+                if (isDate(type) || isDateTime(type) || isDateTime64(type))
                     type = std::make_shared<DataTypeString>();
             }
 
             type_indexes.erase(TypeIndex::Date);
             type_indexes.erase(TypeIndex::DateTime);
+            type_indexes.erase(TypeIndex::DateTime64);
             type_indexes.insert(TypeIndex::String);
             return;
         }
 
-        if (have_dates && have_datetimes)
+        for (auto & type : data_types)
         {
-            for (auto & type : data_types)
+            if (isDate(type) && (have_datetimes || have_datetimes64))
             {
-                if (isDate(type))
+                if (have_datetimes64)
                     type = std::make_shared<DataTypeDateTime64>(9);
+                else
+                    type = std::make_shared<DataTypeDateTime>();
+                type_indexes.erase(TypeIndex::Date);
+            }
+            else if (isDateTime(type) && have_datetimes64)
+            {
+                type = std::make_shared<DataTypeDateTime64>(9);
+                type_indexes.erase(TypeIndex::DateTime);
             }
-
-            type_indexes.erase(TypeIndex::Date);
         }
     }
 
@@ -697,55 +705,87 @@ namespace
 
     bool tryInferDate(std::string_view field)
     {
-        if (field.empty())
+        /// Minimum length of Date text representation is 8 (YYYY-M-D) and maximum is 10 (YYYY-MM-DD)
+        if (field.size() < 8 || field.size() > 10)
             return false;
 
-        ReadBufferFromString buf(field);
-        Float64 tmp_float;
         /// Check if it's just a number, and if so, don't try to infer Date from it,
         /// because we can interpret this number as a Date (for example 20000101 will be 2000-01-01)
         /// and it will lead to inferring Date instead of simple Int64/UInt64 in some cases.
-        if (tryReadFloatText(tmp_float, buf) && buf.eof())
-            return false;
-
-        buf.seek(0, SEEK_SET); /// Return position to the beginning
-
-        DayNum tmp;
-        return tryReadDateText(tmp, buf) && buf.eof();
-    }
-
-    bool tryInferDateTime(std::string_view field, const FormatSettings & settings)
-    {
-        if (field.empty())
+        if (std::all_of(field.begin(), field.end(), isNumericASCII))
             return false;
 
         ReadBufferFromString buf(field);
-        Float64 tmp_float;
+        DayNum tmp;
+        return tryReadDateText(tmp, buf, DateLUT::instance(), /*allowed_delimiters=*/"-/:") && buf.eof();
+    }
+
+    DataTypePtr tryInferDateTimeOrDateTime64(std::string_view field, const FormatSettings & settings)
+    {
+        /// Don't try to infer DateTime if string is too long.
+        /// It's difficult to say what is the real maximum length of
+        /// DateTime we can parse using BestEffort approach.
+        /// 50 symbols is more or less valid limit for date times that makes sense.
+        if (field.empty() || field.size() > 50)
+            return nullptr;
+
+        /// Check that we have at least one digit, don't infer datetime from strings like "Apr"/"May"/etc.
+        if (!std::any_of(field.begin(), field.end(), isNumericASCII))
+            return nullptr;
+
         /// Check if it's just a number, and if so, don't try to infer DateTime from it,
         /// because we can interpret this number as a timestamp and it will lead to
-        /// inferring DateTime instead of simple Int64/Float64 in some cases.
+        /// inferring DateTime instead of simple Int64 in some cases.
+        if (std::all_of(field.begin(), field.end(), isNumericASCII))
+            return nullptr;
+
+        ReadBufferFromString buf(field);
+        Float64 tmp_float;
+        /// Check if it's a float value, and if so, don't try to infer DateTime from it,
+        /// because it will lead to inferring DateTime instead of simple Float64 in some cases.
         if (tryReadFloatText(tmp_float, buf) && buf.eof())
-            return false;
+            return nullptr;
+
+        buf.seek(0, SEEK_SET); /// Return position to the beginning
+        if (!settings.try_infer_datetimes_only_datetime64)
+        {
+            time_t tmp;
+            switch (settings.date_time_input_format)
+            {
+                case FormatSettings::DateTimeInputFormat::Basic:
+                    if (tryReadDateTimeText(tmp, buf, DateLUT::instance(), /*allowed_date_delimiters=*/"-/:", /*allowed_time_delimiters=*/":") && buf.eof())
+                        return std::make_shared<DataTypeDateTime>();
+                    break;
+                case FormatSettings::DateTimeInputFormat::BestEffort:
+                    if (tryParseDateTimeBestEffortStrict(tmp, buf, DateLUT::instance(), DateLUT::instance("UTC"), /*allowed_date_delimiters=*/"-/:") && buf.eof())
+                        return std::make_shared<DataTypeDateTime>();
+                    break;
+                case FormatSettings::DateTimeInputFormat::BestEffortUS:
+                    if (tryParseDateTimeBestEffortUSStrict(tmp, buf, DateLUT::instance(), DateLUT::instance("UTC"), /*allowed_date_delimiters=*/"-/:") && buf.eof())
+                        return std::make_shared<DataTypeDateTime>();
+                    break;
+            }
+        }
 
         buf.seek(0, SEEK_SET); /// Return position to the beginning
         DateTime64 tmp;
         switch (settings.date_time_input_format)
         {
             case FormatSettings::DateTimeInputFormat::Basic:
-                if (tryReadDateTime64Text(tmp, 9, buf) && buf.eof())
-                    return true;
+                if (tryReadDateTime64Text(tmp, 9, buf, DateLUT::instance(), /*allowed_date_delimiters=*/"-/:", /*allowed_time_delimiters=*/":") && buf.eof())
+                    return std::make_shared<DataTypeDateTime64>(9);
                 break;
             case FormatSettings::DateTimeInputFormat::BestEffort:
-                if (tryParseDateTime64BestEffort(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
-                    return true;
+                if (tryParseDateTime64BestEffortStrict(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC"), /*allowed_date_delimiters=*/"-/:") && buf.eof())
+                    return std::make_shared<DataTypeDateTime64>(9);
                 break;
             case FormatSettings::DateTimeInputFormat::BestEffortUS:
-                if (tryParseDateTime64BestEffortUS(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC")) && buf.eof())
-                    return true;
+                if (tryParseDateTime64BestEffortUSStrict(tmp, 9, buf, DateLUT::instance(), DateLUT::instance("UTC"), /*allowed_date_delimiters=*/"-/:") && buf.eof())
+                    return std::make_shared<DataTypeDateTime64>(9);
                 break;
         }
 
-        return false;
+        return nullptr;
     }
 
     template <bool is_json>
@@ -1439,8 +1479,11 @@ DataTypePtr tryInferDateOrDateTimeFromString(std::string_view field, const Forma
     if (settings.try_infer_dates && tryInferDate(field))
         return std::make_shared<DataTypeDate>();
 
-    if (settings.try_infer_datetimes && tryInferDateTime(field, settings))
-        return std::make_shared<DataTypeDateTime64>(9);
+    if (settings.try_infer_datetimes)
+    {
+        if (auto type = tryInferDateTimeOrDateTime64(field, settings))
+            return type;
+    }
 
     return nullptr;
 }
diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp
index 9559462e62b..48d788512e4 100644
--- a/src/IO/ReadHelpers.cpp
+++ b/src/IO/ReadHelpers.cpp
@@ -1271,7 +1271,7 @@ template void readJSONArrayInto<PaddedPODArray<UInt8>, void>(PaddedPODArray<UInt
 template bool readJSONArrayInto<PaddedPODArray<UInt8>, bool>(PaddedPODArray<UInt8> & s, ReadBuffer & buf);
 
 template <typename ReturnType>
-ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf)
+ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf, const char * allowed_delimiters)
 {
     static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
 
@@ -1318,6 +1318,9 @@ ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf)
     }
     else
     {
+        if (!isSymbolIn(*buf.position(), allowed_delimiters))
+            return error();
+
         ++buf.position();
 
         if (!append_digit(month))
@@ -1325,7 +1328,11 @@ ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf)
         append_digit(month);
 
         if (!buf.eof() && !isNumericASCII(*buf.position()))
+        {
+            if (!isSymbolIn(*buf.position(), allowed_delimiters))
+                return error();
             ++buf.position();
+        }
         else
             return error();
 
@@ -1338,12 +1345,12 @@ ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf)
     return ReturnType(true);
 }
 
-template void readDateTextFallback<void>(LocalDate &, ReadBuffer &);
-template bool readDateTextFallback<bool>(LocalDate &, ReadBuffer &);
+template void readDateTextFallback<void>(LocalDate &, ReadBuffer &, const char * allowed_delimiters);
+template bool readDateTextFallback<bool>(LocalDate &, ReadBuffer &, const char * allowed_delimiters);
 
 
 template <typename ReturnType, bool dt64_mode>
-ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut)
+ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut, const char * allowed_date_delimiters, const char * allowed_time_delimiters)
 {
     static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
 
@@ -1413,6 +1420,9 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
             if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[2]) || !isNumericASCII(s[3])
                 || !isNumericASCII(s[5]) || !isNumericASCII(s[6]) || !isNumericASCII(s[8]) || !isNumericASCII(s[9]))
                 return false;
+
+            if (!isSymbolIn(s[4], allowed_date_delimiters) || !isSymbolIn(s[7], allowed_date_delimiters))
+                return false;
         }
 
         UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0');
@@ -1443,6 +1453,9 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
                 if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[3]) || !isNumericASCII(s[4])
                     || !isNumericASCII(s[6]) || !isNumericASCII(s[7]))
                     return false;
+
+                if (!isSymbolIn(s[2], allowed_time_delimiters) || !isSymbolIn(s[5], allowed_time_delimiters))
+                    return false;
             }
 
             hour = (s[0] - '0') * 10 + (s[1] - '0');
@@ -1488,10 +1501,10 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
     return ReturnType(true);
 }
 
-template void readDateTimeTextFallback<void, false>(time_t &, ReadBuffer &, const DateLUTImpl &);
-template void readDateTimeTextFallback<void, true>(time_t &, ReadBuffer &, const DateLUTImpl &);
-template bool readDateTimeTextFallback<bool, false>(time_t &, ReadBuffer &, const DateLUTImpl &);
-template bool readDateTimeTextFallback<bool, true>(time_t &, ReadBuffer &, const DateLUTImpl &);
+template void readDateTimeTextFallback<void, false>(time_t &, ReadBuffer &, const DateLUTImpl &, const char *, const char *);
+template void readDateTimeTextFallback<void, true>(time_t &, ReadBuffer &, const DateLUTImpl &, const char *, const char *);
+template bool readDateTimeTextFallback<bool, false>(time_t &, ReadBuffer &, const DateLUTImpl &, const char *, const char *);
+template bool readDateTimeTextFallback<bool, true>(time_t &, ReadBuffer &, const DateLUTImpl &, const char *, const char *);
 
 
 template <typename ReturnType>
diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h
index ffba4fafb5c..39e1cb12b5c 100644
--- a/src/IO/ReadHelpers.h
+++ b/src/IO/ReadHelpers.h
@@ -703,13 +703,28 @@ struct NullOutput
 };
 
 template <typename ReturnType>
-ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf);
+ReturnType readDateTextFallback(LocalDate & date, ReadBuffer & buf, const char * allowed_delimiters);
+
+inline bool isSymbolIn(char symbol, const char * symbols)
+{
+    if (symbols == nullptr)
+        return true;
+
+    const char * pos = symbols;
+    while (*pos)
+    {
+        if (*pos == symbol)
+            return true;
+        ++pos;
+    }
+    return false;
+}
 
 /// In YYYY-MM-DD format.
 /// For convenience, Month and Day parts can have single digit instead of two digits.
 /// Any separators other than '-' are supported.
 template <typename ReturnType = void>
-inline ReturnType readDateTextImpl(LocalDate & date, ReadBuffer & buf)
+inline ReturnType readDateTextImpl(LocalDate & date, ReadBuffer & buf, const char * allowed_delimiters = nullptr)
 {
     static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
 
@@ -753,6 +768,9 @@ inline ReturnType readDateTextImpl(LocalDate & date, ReadBuffer & buf)
         }
         else
         {
+            if (!isSymbolIn(pos[-1], allowed_delimiters))
+                return error();
+
             if (!isNumericASCII(pos[0]))
                 return error();
 
@@ -768,6 +786,9 @@ inline ReturnType readDateTextImpl(LocalDate & date, ReadBuffer & buf)
             if (isNumericASCII(pos[-1]) || !isNumericASCII(pos[0]))
                 return error();
 
+            if (!isSymbolIn(pos[-1], allowed_delimiters))
+                return error();
+
             day = pos[0] - '0';
             if (isNumericASCII(pos[1]))
             {
@@ -783,7 +804,7 @@ inline ReturnType readDateTextImpl(LocalDate & date, ReadBuffer & buf)
         return ReturnType(true);
     }
     else
-        return readDateTextFallback<ReturnType>(date, buf);
+        return readDateTextFallback<ReturnType>(date, buf, allowed_delimiters);
 }
 
 inline void convertToDayNum(DayNum & date, ExtendedDayNum & from)
@@ -797,15 +818,15 @@ inline void convertToDayNum(DayNum & date, ExtendedDayNum & from)
 }
 
 template <typename ReturnType = void>
-inline ReturnType readDateTextImpl(DayNum & date, ReadBuffer & buf, const DateLUTImpl & date_lut)
+inline ReturnType readDateTextImpl(DayNum & date, ReadBuffer & buf, const DateLUTImpl & date_lut, const char * allowed_delimiters = nullptr)
 {
     static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
 
     LocalDate local_date;
 
     if constexpr (throw_exception)
-        readDateTextImpl<ReturnType>(local_date, buf);
-    else if (!readDateTextImpl<ReturnType>(local_date, buf))
+        readDateTextImpl<ReturnType>(local_date, buf, allowed_delimiters);
+    else if (!readDateTextImpl<ReturnType>(local_date, buf, allowed_delimiters))
         return false;
 
     ExtendedDayNum ret = date_lut.makeDayNum(local_date.year(), local_date.month(), local_date.day());
@@ -814,15 +835,15 @@ inline ReturnType readDateTextImpl(DayNum & date, ReadBuffer & buf, const DateLU
 }
 
 template <typename ReturnType = void>
-inline ReturnType readDateTextImpl(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTImpl & date_lut)
+inline ReturnType readDateTextImpl(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTImpl & date_lut, const char * allowed_delimiters = nullptr)
 {
     static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
 
     LocalDate local_date;
 
     if constexpr (throw_exception)
-        readDateTextImpl<ReturnType>(local_date, buf);
-    else if (!readDateTextImpl<ReturnType>(local_date, buf))
+        readDateTextImpl<ReturnType>(local_date, buf, allowed_delimiters);
+    else if (!readDateTextImpl<ReturnType>(local_date, buf, allowed_delimiters))
         return false;
 
     /// When the parameter is out of rule or out of range, Date32 uses 1925-01-01 as the default value (-DateLUT::instance().getDayNumOffsetEpoch(), -16436) and Date uses 1970-01-01.
@@ -846,19 +867,19 @@ inline void readDateText(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTI
     readDateTextImpl<void>(date, buf, date_lut);
 }
 
-inline bool tryReadDateText(LocalDate & date, ReadBuffer & buf)
+inline bool tryReadDateText(LocalDate & date, ReadBuffer & buf, const char * allowed_delimiters = nullptr)
 {
-    return readDateTextImpl<bool>(date, buf);
+    return readDateTextImpl<bool>(date, buf, allowed_delimiters);
 }
 
-inline bool tryReadDateText(DayNum & date, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
+inline bool tryReadDateText(DayNum & date, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance(), const char * allowed_delimiters = nullptr)
 {
-    return readDateTextImpl<bool>(date, buf, time_zone);
+    return readDateTextImpl<bool>(date, buf, time_zone, allowed_delimiters);
 }
 
-inline bool tryReadDateText(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
+inline bool tryReadDateText(ExtendedDayNum & date, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance(), const char * allowed_delimiters = nullptr)
 {
-    return readDateTextImpl<bool>(date, buf, time_zone);
+    return readDateTextImpl<bool>(date, buf, time_zone, allowed_delimiters);
 }
 
 UUID parseUUID(std::span<const UInt8> src);
@@ -975,13 +996,13 @@ inline T parseFromString(std::string_view str)
 
 
 template <typename ReturnType = void, bool dt64_mode = false>
-ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut);
+ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut, const char * allowed_date_delimiters = nullptr, const char * allowed_time_delimiters = nullptr);
 
 /** In YYYY-MM-DD hh:mm:ss or YYYY-MM-DD format, according to specified time zone.
   * As an exception, also supported parsing of unix timestamp in form of decimal number.
   */
 template <typename ReturnType = void, bool dt64_mode = false>
-inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut)
+inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut, const char * allowed_date_delimiters = nullptr, const char * allowed_time_delimiters = nullptr)
 {
     static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
 
@@ -1014,6 +1035,9 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons
                 if (!isNumericASCII(s[0]) || !isNumericASCII(s[1]) || !isNumericASCII(s[2]) || !isNumericASCII(s[3])
                     || !isNumericASCII(s[5]) || !isNumericASCII(s[6]) || !isNumericASCII(s[8]) || !isNumericASCII(s[9]))
                     return ReturnType(false);
+
+                if (!isSymbolIn(s[4], allowed_date_delimiters) || !isSymbolIn(s[7], allowed_date_delimiters))
+                    return ReturnType(false);
             }
 
             UInt16 year = (s[0] - '0') * 1000 + (s[1] - '0') * 100 + (s[2] - '0') * 10 + (s[3] - '0');
@@ -1033,6 +1057,9 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons
                     if (!isNumericASCII(s[11]) || !isNumericASCII(s[12]) || !isNumericASCII(s[14]) || !isNumericASCII(s[15])
                         || !isNumericASCII(s[17]) || !isNumericASCII(s[18]))
                         return ReturnType(false);
+
+                    if (!isSymbolIn(s[13], allowed_time_delimiters) || !isSymbolIn(s[16], allowed_time_delimiters))
+                        return ReturnType(false);
                 }
 
                 hour = (s[11] - '0') * 10 + (s[12] - '0');
@@ -1057,11 +1084,11 @@ inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, cons
             return readIntTextImpl<time_t, ReturnType, ReadIntTextCheckOverflow::CHECK_OVERFLOW>(datetime, buf);
     }
     else
-        return readDateTimeTextFallback<ReturnType, dt64_mode>(datetime, buf, date_lut);
+        return readDateTimeTextFallback<ReturnType, dt64_mode>(datetime, buf, date_lut, allowed_date_delimiters, allowed_time_delimiters);
 }
 
 template <typename ReturnType>
-inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, ReadBuffer & buf, const DateLUTImpl & date_lut)
+inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, ReadBuffer & buf, const DateLUTImpl & date_lut, const char * allowed_date_delimiters = nullptr, const char * allowed_time_delimiters = nullptr)
 {
     static constexpr bool throw_exception = std::is_same_v<ReturnType, void>;
 
@@ -1075,7 +1102,7 @@ inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, Re
         {
             try
             {
-                readDateTimeTextImpl<ReturnType, true>(whole, buf, date_lut);
+                readDateTimeTextImpl<ReturnType, true>(whole, buf, date_lut, allowed_date_delimiters, allowed_time_delimiters);
             }
             catch (const DB::Exception &)
             {
@@ -1085,7 +1112,7 @@ inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, Re
         }
         else
         {
-            auto ok = readDateTimeTextImpl<ReturnType, true>(whole, buf, date_lut);
+            auto ok = readDateTimeTextImpl<ReturnType, true>(whole, buf, date_lut, allowed_date_delimiters, allowed_time_delimiters);
             if (!ok && (buf.eof() || *buf.position() != '.'))
                 return ReturnType(false);
         }
@@ -1168,14 +1195,14 @@ inline void readDateTime64Text(DateTime64 & datetime64, UInt32 scale, ReadBuffer
     readDateTimeTextImpl<void>(datetime64, scale, buf, date_lut);
 }
 
-inline bool tryReadDateTimeText(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
+inline bool tryReadDateTimeText(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance(), const char * allowed_date_delimiters = nullptr, const char * allowed_time_delimiters = nullptr)
 {
-    return readDateTimeTextImpl<bool>(datetime, buf, time_zone);
+    return readDateTimeTextImpl<bool>(datetime, buf, time_zone, allowed_date_delimiters, allowed_time_delimiters);
 }
 
-inline bool tryReadDateTime64Text(DateTime64 & datetime64, UInt32 scale, ReadBuffer & buf, const DateLUTImpl & date_lut = DateLUT::instance())
+inline bool tryReadDateTime64Text(DateTime64 & datetime64, UInt32 scale, ReadBuffer & buf, const DateLUTImpl & date_lut = DateLUT::instance(), const char * allowed_date_delimiters = nullptr, const char * allowed_time_delimiters = nullptr)
 {
-    return readDateTimeTextImpl<bool>(datetime64, scale, buf, date_lut);
+    return readDateTimeTextImpl<bool>(datetime64, scale, buf, date_lut, allowed_date_delimiters, allowed_time_delimiters);
 }
 
 inline void readDateTimeText(LocalDateTime & datetime, ReadBuffer & buf)
diff --git a/src/IO/parseDateTimeBestEffort.cpp b/src/IO/parseDateTimeBestEffort.cpp
index e046e837689..68122a37df6 100644
--- a/src/IO/parseDateTimeBestEffort.cpp
+++ b/src/IO/parseDateTimeBestEffort.cpp
@@ -82,13 +82,14 @@ struct DateTimeSubsecondPart
     UInt8 digits;
 };
 
-template <typename ReturnType, bool is_us_style>
+template <typename ReturnType, bool is_us_style, bool strict = false>
 ReturnType parseDateTimeBestEffortImpl(
     time_t & res,
     ReadBuffer & in,
     const DateLUTImpl & local_time_zone,
     const DateLUTImpl & utc_time_zone,
-    DateTimeSubsecondPart * fractional)
+    DateTimeSubsecondPart * fractional,
+    const char * allowed_date_delimiters = nullptr)
 {
     auto on_error = [&]<typename... FmtArgs>(int error_code [[maybe_unused]],
                                              FormatStringHelper<FmtArgs...> fmt_string [[maybe_unused]],
@@ -170,22 +171,36 @@ ReturnType parseDateTimeBestEffortImpl(
                     fractional->digits = 3;
                     readDecimalNumber<3>(fractional->value, digits + 10);
                 }
+                else if constexpr (strict)
+                {
+                    /// Fractional part is not allowed.
+                    return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected fractional part");
+                }
                 return ReturnType(true);
             }
             else if (num_digits == 10 && !year && !has_time)
             {
+                if (strict && month)
+                    return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: month component is duplicated");
+
                 /// This is unix timestamp.
                 readDecimalNumber<10>(res, digits);
                 return ReturnType(true);
             }
             else if (num_digits == 9 && !year && !has_time)
             {
+                if (strict && month)
+                    return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: month component is duplicated");
+
                 /// This is unix timestamp.
                 readDecimalNumber<9>(res, digits);
                 return ReturnType(true);
             }
             else if (num_digits == 14 && !year && !has_time)
             {
+                if (strict && month)
+                    return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: month component is duplicated");
+
                 /// This is YYYYMMDDhhmmss
                 readDecimalNumber<4>(year, digits);
                 readDecimalNumber<2>(month, digits + 4);
@@ -197,6 +212,9 @@ ReturnType parseDateTimeBestEffortImpl(
             }
             else if (num_digits == 8 && !year)
             {
+                if (strict && month)
+                    return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: month component is duplicated");
+
                 /// This is YYYYMMDD
                 readDecimalNumber<4>(year, digits);
                 readDecimalNumber<2>(month, digits + 4);
@@ -272,6 +290,9 @@ ReturnType parseDateTimeBestEffortImpl(
                         else
                             return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected number of decimal digits after year and month: {}", num_digits);
                     }
+
+                    if (!isSymbolIn(delimiter_after_year, allowed_date_delimiters))
+                        return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: '{}' delimiter between date parts is not allowed", delimiter_after_year);
                 }
             }
             else if (num_digits == 2 || num_digits == 1)
@@ -403,9 +424,16 @@ ReturnType parseDateTimeBestEffortImpl(
                 else
                 {
                     if (day_of_month)
+                    {
+                        if (strict && hour)
+                            return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: hour component is duplicated");
+
                         hour = hour_or_day_of_month_or_month;
+                    }
                     else
+                    {
                         day_of_month = hour_or_day_of_month_or_month;
+                    }
                 }
             }
             else if (num_digits != 0)
@@ -446,6 +474,11 @@ ReturnType parseDateTimeBestEffortImpl(
                     fractional->digits = num_digits;
                     readDecimalNumber(fractional->value, num_digits, digits);
                 }
+                else if (strict)
+                {
+                    /// Fractional part is not allowed.
+                    return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: unexpected fractional part");
+                }
             }
             else if (c == '+' || c == '-')
             {
@@ -582,12 +615,24 @@ ReturnType parseDateTimeBestEffortImpl(
         return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: neither Date nor Time was parsed successfully");
 
     if (!day_of_month)
+    {
+        if constexpr (strict)
+            return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: day of month is required");
         day_of_month = 1;
+    }
+
     if (!month)
+    {
+        if constexpr (strict)
+            return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: month is required");
         month = 1;
+    }
 
     if (!year)
     {
+        if constexpr (strict)
+            return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: year is required");
+
         /// If year is not specified, it will be the current year if the date is unknown or not greater than today,
         /// otherwise it will be the previous year.
         /// This convoluted logic is needed to parse the syslog format, which looks as follows: "Mar  3 01:33:48".
@@ -654,20 +699,20 @@ ReturnType parseDateTimeBestEffortImpl(
     return ReturnType(true);
 }
 
-template <typename ReturnType, bool is_us_style>
-ReturnType parseDateTime64BestEffortImpl(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone)
+template <typename ReturnType, bool is_us_style, bool strict = false>
+ReturnType parseDateTime64BestEffortImpl(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters = nullptr)
 {
     time_t whole;
     DateTimeSubsecondPart subsecond = {0, 0}; // needs to be explicitly initialized sine it could be missing from input string
 
     if constexpr (std::is_same_v<ReturnType, bool>)
     {
-        if (!parseDateTimeBestEffortImpl<bool, is_us_style>(whole, in, local_time_zone, utc_time_zone, &subsecond))
+        if (!parseDateTimeBestEffortImpl<bool, is_us_style, strict>(whole, in, local_time_zone, utc_time_zone, &subsecond, allowed_date_delimiters))
             return false;
     }
     else
     {
-        parseDateTimeBestEffortImpl<ReturnType, is_us_style>(whole, in, local_time_zone, utc_time_zone, &subsecond);
+        parseDateTimeBestEffortImpl<ReturnType, is_us_style, strict>(whole, in, local_time_zone, utc_time_zone, &subsecond, allowed_date_delimiters);
     }
 
 
@@ -730,4 +775,24 @@ bool tryParseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer &
     return parseDateTime64BestEffortImpl<bool, true>(res, scale, in, local_time_zone, utc_time_zone);
 }
 
+bool tryParseDateTimeBestEffortStrict(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters)
+{
+    return parseDateTimeBestEffortImpl<bool, false, true>(res, in, local_time_zone, utc_time_zone, nullptr, allowed_date_delimiters);
+}
+
+bool tryParseDateTimeBestEffortUSStrict(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters)
+{
+    return parseDateTimeBestEffortImpl<bool, true, true>(res, in, local_time_zone, utc_time_zone, nullptr, allowed_date_delimiters);
+}
+
+bool tryParseDateTime64BestEffortStrict(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters)
+{
+    return parseDateTime64BestEffortImpl<bool, false, true>(res, scale, in, local_time_zone, utc_time_zone, allowed_date_delimiters);
+}
+
+bool tryParseDateTime64BestEffortUSStrict(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters)
+{
+    return parseDateTime64BestEffortImpl<bool, true, true>(res, scale, in, local_time_zone, utc_time_zone, allowed_date_delimiters);
+}
+
 }
diff --git a/src/IO/parseDateTimeBestEffort.h b/src/IO/parseDateTimeBestEffort.h
index 22af44f9e76..6dd052b67a3 100644
--- a/src/IO/parseDateTimeBestEffort.h
+++ b/src/IO/parseDateTimeBestEffort.h
@@ -63,4 +63,12 @@ void parseDateTime64BestEffort(DateTime64 & res, UInt32 scale, ReadBuffer & in,
 bool tryParseDateTime64BestEffort(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
 void parseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
 bool tryParseDateTime64BestEffortUS(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone);
+
+/// More strict version of best effort parsing. Requires day, month and year to be present, checks for allowed
+/// delimiters between date components, makes additional correctness checks. Used in schema inference of date times.
+bool tryParseDateTimeBestEffortStrict(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters);
+bool tryParseDateTimeBestEffortUSStrict(time_t & res, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters);
+bool tryParseDateTime64BestEffortStrict(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters);
+bool tryParseDateTime64BestEffortUSStrict(DateTime64 & res, UInt32 scale, ReadBuffer & in, const DateLUTImpl & local_time_zone, const DateLUTImpl & utc_time_zone, const char * allowed_date_delimiters);
+
 }
diff --git a/tests/queries/0_stateless/03222_date_time_inference.reference b/tests/queries/0_stateless/03222_date_time_inference.reference
new file mode 100644
index 00000000000..3288308a1d0
--- /dev/null
+++ b/tests/queries/0_stateless/03222_date_time_inference.reference
@@ -0,0 +1,253 @@
+Date
+2020-01-01	Date
+2020-01-01	Date
+2020-01-01	Date
+2020-01-01	Date
+2020-01-01	Date
+2020-01-01	Date
+2020-01-01	Date
+2020-01-01	Date
+2020-01-01	Date
+2020-01-01	Date
+2020-01-01	Date
+2020-01-01	Date
+String
+2020_01_01	String
+2020_1_01	String
+2020_01_1	String
+2020_1_1	String
+2020a01a01	String
+2020a1a01	String
+2020a01a1	String
+2020a1a1	String
+20200101	String
+DateTime
+2020-01-02 18:42:42	DateTime
+2020-01-02 18:42:42	DateTime
+2020-01-02 18:42:42	DateTime
+String
+2020_01_01 42:42:42	String
+2020a01a01 42:42:42	String
+2020-01-01 42.42.42	String
+2020-01-01 42 42 42	String
+2020-01-01 42a42a42	String
+DateTime64
+2020-01-02 18:42:42.424200000	DateTime64(9)
+2020-01-02 18:42:42.424200000	DateTime64(9)
+2020-01-02 18:42:42.424200000	DateTime64(9)
+String
+2020_01_01 42:42:42.4242	String
+2020a01a01 42:42:42.4242	String
+2020-01-01 42.42.42.4242	String
+2020-01-01 42 42 42.4242	String
+2020-01-01 42a42a42.4242	String
+DateTime/DateTime64 best effort
+2000-01-01 00:00:00	DateTime
+2000-01-01 01:00:00	DateTime
+2000-01-01 01:00:00.000000000	DateTime64(9)
+2017-01-01 22:02:03	DateTime
+2017-01-01 22:02:03.000000000	DateTime64(9)
+2017-01-01 21:02:03	DateTime
+2017-01-01 21:02:03.000000000	DateTime64(9)
+2017-01-01 22:02:03	DateTime
+2017-01-01 22:02:03.000000000	DateTime64(9)
+2017-01-02 01:02:03	DateTime
+2017-01-02 01:02:03.000000000	DateTime64(9)
+1970-01-02 01:02:03	DateTime
+1970-01-02 01:02:03.000000000	DateTime64(9)
+1970-01-02 01:02:03	DateTime
+1970-01-02 01:02:03.000000000	DateTime64(9)
+2018-02-11 03:40:50	DateTime
+2018-02-11 03:40:50.000000000	DateTime64(9)
+2000-04-17 01:02:03	DateTime
+2000-04-17 01:02:03.000000000	DateTime64(9)
+1970-01-02 01:00:00	DateTime
+1970-01-02 01:00:00.000000000	DateTime64(9)
+1970-01-02 01:02:03	DateTime
+1970-01-02 01:02:03.000000000	DateTime64(9)
+1970-01-02 01:02:03	DateTime
+1970-01-02 01:02:03.000000000	DateTime64(9)
+2015-12-31 20:00:00	DateTime
+2015-12-31 20:00:00	DateTime
+2016-01-01 00:00:00	DateTime
+2016-01-01 00:00:00	DateTime
+2017-01-01 22:02:03	DateTime
+2017-01-01 22:02:03.000000000	DateTime64(9)
+2017-01-02 03:04:05	DateTime
+2017-01-02 03:04:05.000000000	DateTime64(9)
+2017-01-02 03:04:05	DateTime
+2017-01-02 03:04:05.000000000	DateTime64(9)
+2017-01-02 03:04:05	DateTime
+2017-01-02 03:04:05.000000000	DateTime64(9)
+2017-01-02 03:04:05	DateTime
+2017-01-02 03:04:05.000000000	DateTime64(9)
+2017-01-02 04:04:05	DateTime
+2017-01-02 04:04:05.000000000	DateTime64(9)
+2017-01-02 02:34:05	DateTime
+2017-01-02 02:34:05.000000000	DateTime64(9)
+2017-01-02 00:04:05	DateTime
+2017-01-02 00:04:05.000000000	DateTime64(9)
+2017-01-02 02:04:05	DateTime
+2017-01-02 02:04:05.000000000	DateTime64(9)
+2017-01-02 00:04:05	DateTime
+2017-01-02 00:04:05.000000000	DateTime64(9)
+2017-01-01 18:04:05	DateTime
+2017-01-01 18:04:05.000000000	DateTime64(9)
+2017-01-02 03:04:05	DateTime
+2017-01-02 03:04:05.000000000	DateTime64(9)
+2017-01-01 23:04:05	DateTime
+2017-01-01 23:04:05.000000000	DateTime64(9)
+2017-02-01 23:04:05	DateTime
+2017-02-01 23:04:05.000000000	DateTime64(9)
+2017-06-01 23:04:05	DateTime
+2017-06-01 23:04:05.000000000	DateTime64(9)
+2017-01-02 00:04:05	DateTime
+2017-01-02 00:04:05.000000000	DateTime64(9)
+2017-01-02 03:04:05	DateTime
+2017-01-02 03:04:05.000000000	DateTime64(9)
+2017-01-02 03:04:05	DateTime
+2017-01-02 03:04:05.000000000	DateTime64(9)
+2017-01-02 04:04:05	DateTime
+2017-01-02 04:04:05.000000000	DateTime64(9)
+2017-01-02 04:04:05	DateTime
+2017-01-02 04:04:05.000000000	DateTime64(9)
+2017-01-02 02:04:05	DateTime
+2017-01-02 02:04:05.000000000	DateTime64(9)
+2017-01-02 03:04:05	DateTime
+2017-01-02 03:04:05.000000000	DateTime64(9)
+2017-04-01 11:22:33	DateTime
+2017-04-01 11:22:33.000000000	DateTime64(9)
+2017-04-01 22:02:03	DateTime
+2017-04-01 22:02:03.000000000	DateTime64(9)
+2017-04-01 22:02:03	DateTime
+2017-04-01 22:02:03.000000000	DateTime64(9)
+2017-04-02 01:02:03	DateTime
+2017-04-02 01:02:03.000000000	DateTime64(9)
+2017-04-02 11:22:33	DateTime
+2017-04-02 11:22:33.000000000	DateTime64(9)
+2017-04-02 01:02:03	DateTime
+2017-04-02 01:02:03.000000000	DateTime64(9)
+2017-04-02 01:22:33	DateTime
+2017-04-02 01:22:33.000000000	DateTime64(9)
+2017-04-02 01:02:03	DateTime
+2017-04-02 01:02:03.000000000	DateTime64(9)
+2017-04-02 01:02:33	DateTime
+2017-04-02 01:02:33.000000000	DateTime64(9)
+2017-04-01 22:02:03	DateTime
+2017-04-01 22:02:03.000000000	DateTime64(9)
+2017-04-02 01:02:03	DateTime
+2017-04-02 01:02:03.000000000	DateTime64(9)
+2017-04-01 22:02:03	DateTime
+2017-04-01 22:02:03.000000000	DateTime64(9)
+2017-04-01 21:02:03	DateTime
+2017-04-01 21:02:03.000000000	DateTime64(9)
+2017-04-02 01:02:03	DateTime
+2017-04-02 01:02:03.000000000	DateTime64(9)
+2017-01-01 22:02:03	DateTime
+2017-01-01 22:02:03.000000000	DateTime64(9)
+2017-04-25 01:02:03	DateTime
+2017-04-25 01:02:03.000000000	DateTime64(9)
+2017-04-25 01:02:03	DateTime
+2017-04-25 01:02:03.000000000	DateTime64(9)
+2017-01-25 01:02:03	DateTime
+2017-01-25 01:02:03.000000000	DateTime64(9)
+2017-01-24 22:02:03	DateTime
+2017-01-24 22:02:03.000000000	DateTime64(9)
+2017-01-25 13:02:03	DateTime
+2017-01-25 13:02:03.000000000	DateTime64(9)
+2017-01-25 01:02:03	DateTime
+2017-01-25 01:02:03.000000000	DateTime64(9)
+2017-01-25 01:02:03	DateTime
+2017-01-25 01:02:03.000000000	DateTime64(9)
+2017-01-24 22:02:03	DateTime
+2017-01-24 22:02:03.000000000	DateTime64(9)
+2017-01-24 22:02:03	DateTime
+2017-01-24 22:02:03.000000000	DateTime64(9)
+2017-01-25 10:02:03	DateTime
+2017-01-25 10:02:03.000000000	DateTime64(9)
+2017-01-25 10:02:03	DateTime
+2017-01-25 10:02:03.000000000	DateTime64(9)
+2017-01-25 10:02:03	DateTime
+2017-01-25 10:02:03.000000000	DateTime64(9)
+2017-01-25 09:32:03	DateTime
+2017-01-25 09:32:03.000000000	DateTime64(9)
+2017-01-25 01:02:03	DateTime
+2017-01-25 01:02:03.000000000	DateTime64(9)
+2017-01-25 13:02:03	DateTime
+2017-01-25 13:02:03.000000000	DateTime64(9)
+2017-01-25 13:02:03	DateTime
+2017-01-25 13:02:03.000000000	DateTime64(9)
+2017-01-25 10:02:03	DateTime
+2017-01-25 10:02:03.000000000	DateTime64(9)
+2018-02-11 03:40:50	DateTime
+2018-02-11 03:40:50.000000000	DateTime64(9)
+2018-02-11 03:40:50	DateTime
+2018-02-11 03:40:50.000000000	DateTime64(9)
+String
+2	String
+20	String
+200	String
+2000	String
+20000	String
+200001	String
+2000010	String
+20000101	String
+200001010	String
+2000010101	String
+20000101010	String
+200001010101	String
+2000010101010	String
+20000101010101	String
+2.1	String
+20.1	String
+200.1	String
+2000.1	String
+20000.1	String
+200001.1	String
+2000010.1	String
+20000101.1	String
+200001010.1	String
+2000010101.1	String
+20000101010.1	String
+200001010101.1	String
+2000010101010.1	String
+20000101010101.1	String
+Mar	String
+Mar1	String
+Mar 1	String
+Mar01	String
+Mar 01	String
+Mar2020	String
+Mar 2020	String
+Mar012020	String
+Mar 012020	String
+Mar01012020	String
+Mar 01012020	String
+Mar0101202001	String
+Mar 0101202001	String
+Mar010120200101	String
+Mar 010120200101	String
+Mar01012020010101	String
+Mar 01012020010101	String
+Mar01012020010101.000	String
+Mar 0101202001010101.000	String
+2000 01 01 01:00:00	String
+2000 01 01 01:00:00.000	String
+2000a01a01 01:00:00	String
+2000a01a01 01:00:00.000	String
+2000-01-01 01 00 00	String
+2000-01-01 01 00 00.000	String
+2000-01-01 01-00-00	String
+2000-01-01 01-00-00.000	String
+2000-01-01 01a00a00	String
+2000-01-01 01a00a00.000	String
+2000-01 01:00:00	String
+2000-01 01:00:00.000	String
+2000 01	String
+2000-01	String
+Mar 2000 00:00:00	String
+Mar 2000 00:00:00.000	String
+2000 00:00:00	String
+2000 00:00:00.000	String
+Mar 2000-01-01 00:00:00	String
+Mar 2000-01-01 00:00:00.000	String
diff --git a/tests/queries/0_stateless/03222_date_time_inference.sql b/tests/queries/0_stateless/03222_date_time_inference.sql
new file mode 100644
index 00000000000..01266a88d55
--- /dev/null
+++ b/tests/queries/0_stateless/03222_date_time_inference.sql
@@ -0,0 +1,268 @@
+set input_format_try_infer_datetimes = 1;
+set input_format_try_infer_dates = 1;
+set schema_inference_make_columns_nullable = 0;
+set input_format_json_try_infer_numbers_from_strings = 0;
+
+select 'Date';
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020:01:01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020:1:01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020:01:1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020:1:1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020-01-01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020-1-01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020-01-1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020-1-1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020/01/01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020/1/01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020/01/1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020/1/1"}');
+
+select 'String';
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020_01_01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020_1_01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020_01_1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020_1_1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020a01a01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020a1a01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020a01a1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020a1a1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "20200101"}');
+
+select 'DateTime';
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020:01:01 42:42:42"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020/01/01 42:42:42"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020-01-01 42:42:42"}');
+
+select 'String';
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020_01_01 42:42:42"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020a01a01 42:42:42"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020-01-01 42.42.42"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020-01-01 42 42 42"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020-01-01 42a42a42"}');
+
+select 'DateTime64';
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020:01:01 42:42:42.4242"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020/01/01 42:42:42.4242"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020-01-01 42:42:42.4242"}');
+
+select 'String';
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020_01_01 42:42:42.4242"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020a01a01 42:42:42.4242"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020-01-01 42.42.42.4242"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020-01-01 42 42 42.4242"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020-01-01 42a42a42.4242"}');
+
+set date_time_input_format='best_effort';
+select 'DateTime/DateTime64 best effort';
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000-01-01 00:00:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000-01-01 01:00:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000-01-01 01:00:00.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "02/01/17 010203 MSK"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "02/01/17 010203.000 MSK"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "02/01/17 010203 MSK+0100"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "02/01/17 010203.000 MSK+0100"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "02/01/17 010203 UTC+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "02/01/17 010203.000 UTC+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "02/01/17 010203Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "02/01/17 010203.000Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "02/01/1970 010203Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "02/01/1970 010203.000Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "02/01/70 010203Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "02/01/70 010203.000Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "11 Feb 2018 06:40:50 +0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "11 Feb 2018 06:40:50.000 +0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "17 Apr 2000 2 1:2:3"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "17 Apr 2000 2 1:2:3.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "19700102 01:00:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "19700102 01:00:00.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "19700102010203Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "19700102010203Z.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "1970/01/02 010203Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "1970/01/02 010203.000Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2016-01-01MSD"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2016-01-01 MSD"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2016-01-01UTC"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2016-01-01Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "201701 02 010203 UTC+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "201701 02 010203.000 UTC+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05+0"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000+0"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05+00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000+00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05+0000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000+0000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05 -0100"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000 -0100"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05+030"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000+030"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05+1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000+1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05+300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000+300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05+900"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000+900"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05GMT"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000GMT"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05 MSD"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000 MSD"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05 MSD Feb"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000 MSD Feb"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05 MSD Jun"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000 MSD Jun"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05 MSK"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02 03:04:05.000 MSK"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02T03:04:05"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02T03:04:05.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02T03:04:05+00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02T03:04:05.000+00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02T03:04:05 -0100"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02T03:04:05.000 -0100"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02T03:04:05-0100"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02T03:04:05.000-0100"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02T03:04:05+0100"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02T03:04:05.000+0100"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02T03:04:05Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017-01-02T03:04:05.000Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 01 11:22:33"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 01 11:22:33.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 010203 UTC+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 010203.000 UTC+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 01:2:3 UTC+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 01:2:3.000 UTC+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:02:3"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:02:3.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 11:22:33"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 11:22:33.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:03"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:03.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:22:33"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:22:33.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:3"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:3.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:33"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:33.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:3 MSK"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:3.000 MSK"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:3 UTC+0000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:3.000 UTC+0000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:3 UTC+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:3.000 UTC+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:3 UTC+0400"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 02 1:2:3.000 UTC+0400"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 2 1:2:3"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Apr 2 1:2:3.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Jan 02 010203 UTC+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2017 Jan 02 010203.000 UTC+0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Apr 2017 01:02:03"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Apr 2017 01:02:03.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Apr 2017 1:2:3"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Apr 2017 1:2:3.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3 MSK"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000 MSK"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3 PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000 PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3 Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000 Z"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3 Z +0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000 Z +0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3 Z+03:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000 Z+03:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3 Z +03:00 PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000 Z +03:00 PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3 Z +0300 PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000 Z +0300 PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3 Z+03:00 PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000 Z+03:00 PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3 Z +03:30 PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000 Z +03:30 PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3Z Mon"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000Z Mon"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3 Z PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000 Z PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3Z PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000Z PM"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3 Z PM +03:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "25 Jan 2017 1:2:3.000 Z PM +03:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Sun 11 Feb 2018 06:40:50 +0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Sun 11 Feb 2018 06:40:50.000 +0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Sun, 11 Feb 2018 06:40:50 +0300"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Sun, 11 Feb 2018 06:40:50.000 +0300"}');
+
+select 'String';
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "20"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "200"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "20000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "200001"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000010"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "20000101"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "200001010"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000010101"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "20000101010"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "200001010101"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000010101010"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "20000101010101"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "20.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "200.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "20000.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "200001.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000010.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "20000101.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "200001010.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000010101.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "20000101010.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "200001010101.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000010101010.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "20000101010101.1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 1"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar2020"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 2020"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar012020"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 012020"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar01012020"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 01012020"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar0101202001"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 0101202001"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar010120200101"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 010120200101"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar01012020010101"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 01012020010101"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar01012020010101.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 0101202001010101.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000 01 01 01:00:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000 01 01 01:00:00.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000a01a01 01:00:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000a01a01 01:00:00.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000-01-01 01 00 00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000-01-01 01 00 00.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000-01-01 01-00-00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000-01-01 01-00-00.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000-01-01 01a00a00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000-01-01 01a00a00.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000-01 01:00:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000-01 01:00:00.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000 01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000-01"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 2000 00:00:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 2000 00:00:00.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000 00:00:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2000 00:00:00.000"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 2000-01-01 00:00:00"}');
+select x, toTypeName(x) from format(JSONEachRow, '{"x" : "Mar 2000-01-01 00:00:00.000"}');
+
+

From 8950491fa5af2f1abacbba86181f16fb512b8004 Mon Sep 17 00:00:00 2001
From: kssenii <sumarokovakseniia@mail.ru>
Date: Thu, 15 Aug 2024 11:18:15 +0200
Subject: [PATCH 78/88] Fix unit test build

---
 src/Interpreters/tests/gtest_filecache.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/Interpreters/tests/gtest_filecache.cpp b/src/Interpreters/tests/gtest_filecache.cpp
index 36acc319f4e..fd602ab5918 100644
--- a/src/Interpreters/tests/gtest_filecache.cpp
+++ b/src/Interpreters/tests/gtest_filecache.cpp
@@ -246,7 +246,8 @@ void download(FileSegment & file_segment)
     ASSERT_EQ(file_segment.state(), State::DOWNLOADING);
     ASSERT_EQ(file_segment.getDownloadedSize(), 0);
 
-    ASSERT_TRUE(file_segment.reserve(file_segment.range().size(), 1000));
+    std::string failure_reason;
+    ASSERT_TRUE(file_segment.reserve(file_segment.range().size(), 1000, failure_reason));
     download(cache_base_path, file_segment);
     ASSERT_EQ(file_segment.state(), State::DOWNLOADING);
 
@@ -258,7 +259,8 @@ void assertDownloadFails(FileSegment & file_segment)
 {
     ASSERT_EQ(file_segment.getOrSetDownloader(), FileSegment::getCallerId());
     ASSERT_EQ(file_segment.getDownloadedSize(), 0);
-    ASSERT_FALSE(file_segment.reserve(file_segment.range().size(), 1000));
+    std::string failure_reason;
+    ASSERT_FALSE(file_segment.reserve(file_segment.range().size(), 1000, failure_reason));
     file_segment.complete();
 }
 
@@ -957,10 +959,11 @@ TEST_F(FileCacheTest, temporaryData)
 
     {
         ASSERT_EQ(some_data_holder->size(), 5);
+        std::string failure_reason;
         for (auto & segment : *some_data_holder)
         {
             ASSERT_TRUE(segment->getOrSetDownloader() == DB::FileSegment::getCallerId());
-            ASSERT_TRUE(segment->reserve(segment->range().size(), 1000));
+            ASSERT_TRUE(segment->reserve(segment->range().size(), 1000, failure_reason));
             download(*segment);
             segment->complete();
         }

From 3af8ba2deb99a0450c448d12a9c1da6858aec987 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?=
 <antaljanosbenjamin@users.noreply.github.com>
Date: Thu, 15 Aug 2024 11:21:20 +0200
Subject: [PATCH 79/88] Revert "[RFC] Fix settings/current_database in
 system.processes for async BACKUP/RESTORE"

---
 src/Backups/BackupsWorker.cpp  | 4 ----
 src/Interpreters/ProcessList.h | 3 ---
 2 files changed, 7 deletions(-)

diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp
index 8b45c816817..0b93ae6d547 100644
--- a/src/Backups/BackupsWorker.cpp
+++ b/src/Backups/BackupsWorker.cpp
@@ -490,8 +490,6 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
 
             /// process_list_element_holder is used to make an element in ProcessList live while BACKUP is working asynchronously.
             auto process_list_element = context_in_use->getProcessListElement();
-            /// Update context to preserve query information in processlist (settings, current_database)
-            process_list_element->updateContext(context_in_use);
 
             thread_pool.scheduleOrThrowOnError(
                 [this,
@@ -855,8 +853,6 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
 
             /// process_list_element_holder is used to make an element in ProcessList live while RESTORE is working asynchronously.
             auto process_list_element = context_in_use->getProcessListElement();
-            /// Update context to preserve query information in processlist (settings, current_database)
-            process_list_element->updateContext(context_in_use);
 
             thread_pool.scheduleOrThrowOnError(
                 [this,
diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h
index 248ba947bc1..accb73e12df 100644
--- a/src/Interpreters/ProcessList.h
+++ b/src/Interpreters/ProcessList.h
@@ -244,9 +244,6 @@ public:
     /// Same as checkTimeLimit but it never throws
     [[nodiscard]] bool checkTimeLimitSoft();
 
-    /// Use it in case of the query left in background to execute asynchronously
-    void updateContext(ContextWeakPtr weak_context) { context = std::move(weak_context); }
-
     /// Get the reference for the start of the query. Used to synchronize with other Stopwatches
     UInt64 getQueryCPUStartTime() { return watch.getStart(); }
 };

From d18b6c63d408c4a70f3e541ce1956a4229bfe452 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?=
 <benjamin.antal@clickhouse.com>
Date: Thu, 15 Aug 2024 09:41:03 +0000
Subject: [PATCH 80/88] Change name of default azurite container to avoid
 clashing with azure blob storage tests

---
 tests/integration/helpers/cluster.py            | 2 +-
 tests/integration/test_storage_s3_queue/test.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py
index 0b6cf03d467..53f4f1e1f26 100644
--- a/tests/integration/helpers/cluster.py
+++ b/tests/integration/helpers/cluster.py
@@ -570,7 +570,7 @@ class ClickHouseCluster:
         self.spark_session = None
 
         self.with_azurite = False
-        self.azurite_container = "cont"
+        self.azurite_container = "azurite-container"
         self.blob_service_client = None
         self._azurite_port = 0
 
diff --git a/tests/integration/test_storage_s3_queue/test.py b/tests/integration/test_storage_s3_queue/test.py
index 34fb1eaf1fe..9e3ee19179a 100644
--- a/tests/integration/test_storage_s3_queue/test.py
+++ b/tests/integration/test_storage_s3_queue/test.py
@@ -232,7 +232,7 @@ def create_table(
         url = f"http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{files_path}/"
         engine_def = f"{engine_name}('{url}', {auth_params}, {file_format})"
     else:
-        engine_def = f"{engine_name}('{started_cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', '{files_path}/', 'CSV')"
+        engine_def = f"{engine_name}('{started_cluster.env_variables['AZURITE_CONNECTION_STRING']}', '{started_cluster.azurite_container}', '{files_path}/', 'CSV')"
 
     node.query(f"DROP TABLE IF EXISTS {table_name}")
     create_query = f"""

From 9f6e472b0cf03f8018b149ad5f7541b4ddec5735 Mon Sep 17 00:00:00 2001
From: Han Fei <hanfei19910905@gmail.com>
Date: Thu, 15 Aug 2024 11:47:41 +0200
Subject: [PATCH 81/88] process regexp flags correctly

---
 src/Common/OptimizedRegularExpression.cpp | 40 ++++++++++++++---------
 src/Common/tests/gtest_optimize_re.cpp    |  2 ++
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp
index 712cab80aff..04e5f846adf 100644
--- a/src/Common/OptimizedRegularExpression.cpp
+++ b/src/Common/OptimizedRegularExpression.cpp
@@ -244,33 +244,41 @@ const char * analyzeImpl(
                 is_trivial = false;
                 if (!in_square_braces)
                 {
-                    /// Check for case-insensitive flag.
-                    if (pos + 1 < end && pos[1] == '?')
+                    /// '-' means flag negation.
+                    /// There are various possible flags,
+                    /// but actually only imsU are supported by re2.
+                    auto is_flag_char = [](char x)
                     {
-                        for (size_t offset = 2; pos + offset < end; ++offset)
+                        return x == '-' || x == 'i' || x == 'm' || x == 's' || x == 'U' || x == 'u';
+                    };
+                    /// Check for case-insensitive flag.
+                    if (pos + 2 < end && pos[1] == '?' && is_flag_char(pos[2]))
+                    {
+                        size_t offset = 2;
+                        for (; pos + offset < end; ++offset)
                         {
-                            if (pos[offset] == '-'  /// it means flag negation
-                                /// various possible flags, actually only imsU are supported by re2
-                                || (pos[offset] >= 'a' && pos[offset] <= 'z')
-                                || (pos[offset] >= 'A' && pos[offset] <= 'Z'))
+                            if (pos[offset] == 'i')
                             {
-                                if (pos[offset] == 'i')
-                                {
-                                    /// Actually it can be negated case-insensitive flag. But we don't care.
-                                    has_case_insensitive_flag = true;
-                                    break;
-                                }
+                                /// Actually it can be negated case-insensitive flag. But we don't care.
+                                has_case_insensitive_flag = true;
                             }
-                            else
+                            else if (!is_flag_char(pos[offset]))
                                 break;
                         }
+                        pos += offset;
+                        /// if this group only contains flags, we have nothing to do.
+                        if (*pos == ')')
+                        {
+                            ++pos;
+                            break;
+                        }
                     }
                     /// (?:regex) means non-capturing parentheses group
-                    if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
+                    else if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
                     {
                         pos += 2;
                     }
-                    if (pos + 3 < end && pos[1] == '?' && (pos[2] == '<' || pos[2] == '\'' || (pos[2] == 'P' && pos[3] == '<')))
+                    else if (pos + 3 < end && pos[1] == '?' && (pos[2] == '<' || pos[2] == '\'' || (pos[2] == 'P' && pos[3] == '<')))
                     {
                         pos = skipNameCapturingGroup(pos, pos[2] == 'P' ? 3: 2, end);
                     }
diff --git a/src/Common/tests/gtest_optimize_re.cpp b/src/Common/tests/gtest_optimize_re.cpp
index a9fcb918b24..0730a13f160 100644
--- a/src/Common/tests/gtest_optimize_re.cpp
+++ b/src/Common/tests/gtest_optimize_re.cpp
@@ -19,6 +19,8 @@ TEST(OptimizeRE, analyze)
     };
     test_f("abc", "abc", {}, true, true);
     test_f("c([^k]*)de", "");
+    test_f("(?-s)bob", "bob", {}, false, true);
+    test_f("(?s)bob", "bob", {}, false, true);
     test_f("abc(de)fg", "abcdefg", {}, false, true);
     test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"}, false, true);
     test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"}, false, true);

From bea8e65f4fcfa99cdae21ff4776d509ba4fcd0d7 Mon Sep 17 00:00:00 2001
From: avogar <avogar@clickhouse.com>
Date: Thu, 15 Aug 2024 09:48:28 +0000
Subject: [PATCH 82/88] Fix tests

---
 src/IO/parseDateTimeBestEffort.cpp            | 20 ++++++++++++++++---
 ...ed_dates_in_csv_schema_inference.reference |  2 +-
 ...03033_dynamic_text_serialization.reference | 10 +++++-----
 .../03199_json_extract_dynamic.reference      |  2 +-
 ...ad_for_schema_inference_in_cache.reference |  2 +-
 .../0_stateless/03222_date_time_inference.sql |  1 +
 6 files changed, 26 insertions(+), 11 deletions(-)

diff --git a/src/IO/parseDateTimeBestEffort.cpp b/src/IO/parseDateTimeBestEffort.cpp
index 68122a37df6..f220577f2cb 100644
--- a/src/IO/parseDateTimeBestEffort.cpp
+++ b/src/IO/parseDateTimeBestEffort.cpp
@@ -82,7 +82,7 @@ struct DateTimeSubsecondPart
     UInt8 digits;
 };
 
-template <typename ReturnType, bool is_us_style, bool strict = false>
+template <typename ReturnType, bool is_us_style, bool strict = false, bool is_64 = false>
 ReturnType parseDateTimeBestEffortImpl(
     time_t & res,
     ReadBuffer & in,
@@ -686,6 +686,20 @@ ReturnType parseDateTimeBestEffortImpl(
         }
     };
 
+    if constexpr (strict)
+    {
+        if constexpr (is_64)
+        {
+            if (year < 1900)
+                return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime64: year {} is less than minimum supported year 1900", year);
+        }
+        else
+        {
+            if (year < 1970)
+                return on_error(ErrorCodes::CANNOT_PARSE_DATETIME, "Cannot read DateTime: year {} is less than minimum supported year 1970", year);
+        }
+    }
+
     if (has_time_zone_offset)
     {
         res = utc_time_zone.makeDateTime(year, month, day_of_month, hour, minute, second);
@@ -707,12 +721,12 @@ ReturnType parseDateTime64BestEffortImpl(DateTime64 & res, UInt32 scale, ReadBuf
 
     if constexpr (std::is_same_v<ReturnType, bool>)
     {
-        if (!parseDateTimeBestEffortImpl<bool, is_us_style, strict>(whole, in, local_time_zone, utc_time_zone, &subsecond, allowed_date_delimiters))
+        if (!parseDateTimeBestEffortImpl<bool, is_us_style, strict, true>(whole, in, local_time_zone, utc_time_zone, &subsecond, allowed_date_delimiters))
             return false;
     }
     else
     {
-        parseDateTimeBestEffortImpl<ReturnType, is_us_style, strict>(whole, in, local_time_zone, utc_time_zone, &subsecond, allowed_date_delimiters);
+        parseDateTimeBestEffortImpl<ReturnType, is_us_style, strict, true>(whole, in, local_time_zone, utc_time_zone, &subsecond, allowed_date_delimiters);
     }
 
 
diff --git a/tests/queries/0_stateless/02228_unquoted_dates_in_csv_schema_inference.reference b/tests/queries/0_stateless/02228_unquoted_dates_in_csv_schema_inference.reference
index be82d744a3b..56293ca0e5d 100644
--- a/tests/queries/0_stateless/02228_unquoted_dates_in_csv_schema_inference.reference
+++ b/tests/queries/0_stateless/02228_unquoted_dates_in_csv_schema_inference.reference
@@ -1 +1 @@
-c1	Nullable(DateTime64(9))					
+c1	Nullable(DateTime)					
diff --git a/tests/queries/0_stateless/03033_dynamic_text_serialization.reference b/tests/queries/0_stateless/03033_dynamic_text_serialization.reference
index d965245266c..aa7b3fc83a2 100644
--- a/tests/queries/0_stateless/03033_dynamic_text_serialization.reference
+++ b/tests/queries/0_stateless/03033_dynamic_text_serialization.reference
@@ -4,7 +4,7 @@ JSON
 {"d":"str","dynamicType(d)":"String"}
 {"d":["1","2","3"],"dynamicType(d)":"Array(Int64)"}
 {"d":"2020-01-01","dynamicType(d)":"Date"}
-{"d":"2020-01-01 10:00:00.000000000","dynamicType(d)":"DateTime64(9)"}
+{"d":"2020-01-01 10:00:00","dynamicType(d)":"DateTime"}
 {"d":{"a":"42","b":"str"},"dynamicType(d)":"Tuple(a Int64, b String)"}
 {"d":{"a":"43"},"dynamicType(d)":"Tuple(a Int64)"}
 {"d":{"a":"44","c":["1","2","3"]},"dynamicType(d)":"Tuple(a Int64, c Array(Int64))"}
@@ -22,7 +22,7 @@ CSV
 "str","String"
 "[1,2,3]","Array(Int64)"
 "2020-01-01","Date"
-"2020-01-01 10:00:00.000000000","DateTime64(9)"
+"2020-01-01 10:00:00","DateTime"
 "[1, 'str', [1, 2, 3]]","String"
 \N,"None"
 true,"Bool"
@@ -32,18 +32,18 @@ TSV
 str	String
 [1,2,3]	Array(Int64)
 2020-01-01	Date
-2020-01-01 10:00:00.000000000	DateTime64(9)
+2020-01-01 10:00:00	DateTime
 [1, \'str\', [1, 2, 3]]	String
 \N	None
 true	Bool
 Values
-(42,'Int64'),(42.42,'Float64'),('str','String'),([1,2,3],'Array(Int64)'),('2020-01-01','Date'),('2020-01-01 10:00:00.000000000','DateTime64(9)'),(NULL,'None'),(true,'Bool')
+(42,'Int64'),(42.42,'Float64'),('str','String'),([1,2,3],'Array(Int64)'),('2020-01-01','Date'),('2020-01-01 10:00:00','DateTime'),(NULL,'None'),(true,'Bool')
 Cast using parsing
 42	Int64
 42.42	Float64
 [1,2,3]	Array(Int64)
 2020-01-01	Date
-2020-01-01 10:00:00.000000000	DateTime64(9)
+2020-01-01 10:00:00	DateTime
 \N	None
 true	Bool
 42	Int64
diff --git a/tests/queries/0_stateless/03199_json_extract_dynamic.reference b/tests/queries/0_stateless/03199_json_extract_dynamic.reference
index 759b7763cd1..955106946ea 100644
--- a/tests/queries/0_stateless/03199_json_extract_dynamic.reference
+++ b/tests/queries/0_stateless/03199_json_extract_dynamic.reference
@@ -12,7 +12,7 @@ Hello	String
 [1,2,3]	Array(Nullable(Int64))
 ['str1','str2','str3']	Array(Nullable(String))
 [[[1],[2,3,4]],[[5,6],[7]]]	Array(Array(Array(Nullable(Int64))))
-['2020-01-01 00:00:00.000000000','2020-01-01 00:00:00.000000000']	Array(Nullable(DateTime64(9)))
+['2020-01-01 00:00:00','2020-01-01 00:00:00']	Array(Nullable(DateTime))
 ['2020-01-01','2020-01-01 date']	Array(Nullable(String))
 ['2020-01-01','2020-01-01 00:00:00','str']	Array(Nullable(String))
 ['2020-01-01','2020-01-01 00:00:00','42']	Array(Nullable(String))
diff --git a/tests/queries/0_stateless/03212_max_bytes_to_read_for_schema_inference_in_cache.reference b/tests/queries/0_stateless/03212_max_bytes_to_read_for_schema_inference_in_cache.reference
index cd109daac52..13b1138d1c4 100644
--- a/tests/queries/0_stateless/03212_max_bytes_to_read_for_schema_inference_in_cache.reference
+++ b/tests/queries/0_stateless/03212_max_bytes_to_read_for_schema_inference_in_cache.reference
@@ -1,2 +1,2 @@
 x	Nullable(Int64)					
-schema_inference_hints=, max_rows_to_read_for_schema_inference=25000, max_bytes_to_read_for_schema_inference=1000, schema_inference_make_columns_nullable=true, try_infer_integers=true, try_infer_dates=true, try_infer_datetimes=true, try_infer_numbers_from_strings=false, read_bools_as_numbers=true, read_bools_as_strings=true, read_objects_as_strings=true, read_numbers_as_strings=true, read_arrays_as_strings=true, try_infer_objects_as_tuples=true, infer_incomplete_types_as_strings=true, try_infer_objects=false, use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects=false
+schema_inference_hints=, max_rows_to_read_for_schema_inference=25000, max_bytes_to_read_for_schema_inference=1000, schema_inference_make_columns_nullable=true, try_infer_integers=true, try_infer_dates=true, try_infer_datetimes=true, try_infer_datetimes_only_datetime64=false, try_infer_numbers_from_strings=false, read_bools_as_numbers=true, read_bools_as_strings=true, read_objects_as_strings=true, read_numbers_as_strings=true, read_arrays_as_strings=true, try_infer_objects_as_tuples=true, infer_incomplete_types_as_strings=true, try_infer_objects=false, use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects=false
diff --git a/tests/queries/0_stateless/03222_date_time_inference.sql b/tests/queries/0_stateless/03222_date_time_inference.sql
index 01266a88d55..ebd472294be 100644
--- a/tests/queries/0_stateless/03222_date_time_inference.sql
+++ b/tests/queries/0_stateless/03222_date_time_inference.sql
@@ -2,6 +2,7 @@ set input_format_try_infer_datetimes = 1;
 set input_format_try_infer_dates = 1;
 set schema_inference_make_columns_nullable = 0;
 set input_format_json_try_infer_numbers_from_strings = 0;
+set session_timezone = 'UTC';
 
 select 'Date';
 select x, toTypeName(x) from format(JSONEachRow, '{"x" : "2020:01:01"}');

From 0a10f0ceb3fab80e5dcfab5ebbebbdbfcdaff6c1 Mon Sep 17 00:00:00 2001
From: avogar <avogar@clickhouse.com>
Date: Thu, 15 Aug 2024 11:27:12 +0000
Subject: [PATCH 83/88] Update tests

---
 .../02325_dates_schema_inference.reference    | 58 +++++++++----------
 tests/queries/0_stateless/02404_data.CSV      | 10 ++++
 .../0_stateless/02404_data.CSVWithNames       | 11 ++++
 .../0_stateless/02404_data.CustomSeparated    | 10 ++++
 .../0_stateless/02404_data.JSONCompactEachRow | 10 ++++
 .../0_stateless/02404_data.JSONEachRow        | 10 ++++
 tests/queries/0_stateless/02404_data.TSKV     | 10 ++++
 tests/queries/0_stateless/02404_data.TSV      | 10 ++++
 .../0_stateless/02404_data.TSVWithNames       | 11 ++++
 tests/queries/0_stateless/02404_data.Values   |  1 +
 ...ce_cache_respect_format_settings.reference | 18 +++---
 11 files changed, 121 insertions(+), 38 deletions(-)
 create mode 100644 tests/queries/0_stateless/02404_data.CSV
 create mode 100644 tests/queries/0_stateless/02404_data.CSVWithNames
 create mode 100644 tests/queries/0_stateless/02404_data.CustomSeparated
 create mode 100644 tests/queries/0_stateless/02404_data.JSONCompactEachRow
 create mode 100644 tests/queries/0_stateless/02404_data.JSONEachRow
 create mode 100644 tests/queries/0_stateless/02404_data.TSKV
 create mode 100644 tests/queries/0_stateless/02404_data.TSV
 create mode 100644 tests/queries/0_stateless/02404_data.TSVWithNames
 create mode 100644 tests/queries/0_stateless/02404_data.Values

diff --git a/tests/queries/0_stateless/02325_dates_schema_inference.reference b/tests/queries/0_stateless/02325_dates_schema_inference.reference
index c8eebd3262e..124f105220d 100644
--- a/tests/queries/0_stateless/02325_dates_schema_inference.reference
+++ b/tests/queries/0_stateless/02325_dates_schema_inference.reference
@@ -1,29 +1,29 @@
 JSONEachRow
 x	Nullable(Date)					
 x	Nullable(DateTime64(9))					
-x	Nullable(DateTime64(9))					
+x	Nullable(DateTime)					
 x	Array(Nullable(Date))					
-x	Array(Nullable(DateTime64(9)))					
-x	Array(Nullable(DateTime64(9)))					
-x	Tuple(\n    date1 Nullable(DateTime64(9)),\n    date2 Nullable(Date))					
-x	Array(Nullable(DateTime64(9)))					
-x	Array(Nullable(DateTime64(9)))					
-x	Nullable(DateTime64(9))					
+x	Array(Nullable(DateTime))					
+x	Array(Nullable(DateTime))					
+x	Tuple(\n    date1 Nullable(DateTime),\n    date2 Nullable(Date))					
+x	Array(Nullable(DateTime))					
+x	Array(Nullable(DateTime))					
+x	Nullable(DateTime)					
 x	Array(Nullable(String))					
 x	Nullable(String)					
 x	Array(Nullable(String))					
-x	Tuple(\n    key1 Array(Array(Nullable(DateTime64(9)))),\n    key2 Array(Array(Nullable(String))))					
+x	Tuple(\n    key1 Array(Array(Nullable(DateTime))),\n    key2 Array(Array(Nullable(String))))					
 CSV
 c1	Nullable(Date)					
 c1	Nullable(DateTime64(9))					
-c1	Nullable(DateTime64(9))					
+c1	Nullable(DateTime)					
 c1	Array(Nullable(Date))					
-c1	Array(Nullable(DateTime64(9)))					
-c1	Array(Nullable(DateTime64(9)))					
-c1	Map(String, Nullable(DateTime64(9)))					
-c1	Array(Nullable(DateTime64(9)))					
-c1	Array(Nullable(DateTime64(9)))					
-c1	Nullable(DateTime64(9))					
+c1	Array(Nullable(DateTime))					
+c1	Array(Nullable(DateTime))					
+c1	Map(String, Nullable(DateTime))					
+c1	Array(Nullable(DateTime))					
+c1	Array(Nullable(DateTime))					
+c1	Nullable(DateTime)					
 c1	Array(Nullable(String))					
 c1	Nullable(String)					
 c1	Array(Nullable(String))					
@@ -31,14 +31,14 @@ c1	Map(String, Array(Array(Nullable(String))))
 TSV
 c1	Nullable(Date)					
 c1	Nullable(DateTime64(9))					
-c1	Nullable(DateTime64(9))					
+c1	Nullable(DateTime)					
 c1	Array(Nullable(Date))					
-c1	Array(Nullable(DateTime64(9)))					
-c1	Array(Nullable(DateTime64(9)))					
-c1	Map(String, Nullable(DateTime64(9)))					
-c1	Array(Nullable(DateTime64(9)))					
-c1	Array(Nullable(DateTime64(9)))					
-c1	Nullable(DateTime64(9))					
+c1	Array(Nullable(DateTime))					
+c1	Array(Nullable(DateTime))					
+c1	Map(String, Nullable(DateTime))					
+c1	Array(Nullable(DateTime))					
+c1	Array(Nullable(DateTime))					
+c1	Nullable(DateTime)					
 c1	Array(Nullable(String))					
 c1	Nullable(String)					
 c1	Array(Nullable(String))					
@@ -46,14 +46,14 @@ c1	Map(String, Array(Array(Nullable(String))))
 Values
 c1	Nullable(Date)					
 c1	Nullable(DateTime64(9))					
-c1	Nullable(DateTime64(9))					
+c1	Nullable(DateTime)					
 c1	Array(Nullable(Date))					
-c1	Array(Nullable(DateTime64(9)))					
-c1	Array(Nullable(DateTime64(9)))					
-c1	Map(String, Nullable(DateTime64(9)))					
-c1	Array(Nullable(DateTime64(9)))					
-c1	Array(Nullable(DateTime64(9)))					
-c1	Nullable(DateTime64(9))					
+c1	Array(Nullable(DateTime))					
+c1	Array(Nullable(DateTime))					
+c1	Map(String, Nullable(DateTime))					
+c1	Array(Nullable(DateTime))					
+c1	Array(Nullable(DateTime))					
+c1	Nullable(DateTime)					
 c1	Array(Nullable(String))					
 c1	Nullable(String)					
 c1	Array(Nullable(String))					
diff --git a/tests/queries/0_stateless/02404_data.CSV b/tests/queries/0_stateless/02404_data.CSV
new file mode 100644
index 00000000000..2d8b5c8daa8
--- /dev/null
+++ b/tests/queries/0_stateless/02404_data.CSV
@@ -0,0 +1,10 @@
+0,"1970-01-01"
+1,"1970-01-02"
+2,"1970-01-03"
+3,"1970-01-04"
+4,"1970-01-05"
+5,"1970-01-06"
+6,"1970-01-07"
+7,"1970-01-08"
+8,"1970-01-09"
+9,"1970-01-10"
diff --git a/tests/queries/0_stateless/02404_data.CSVWithNames b/tests/queries/0_stateless/02404_data.CSVWithNames
new file mode 100644
index 00000000000..34647008916
--- /dev/null
+++ b/tests/queries/0_stateless/02404_data.CSVWithNames
@@ -0,0 +1,11 @@
+"number","toDate(number)"
+0,"1970-01-01"
+1,"1970-01-02"
+2,"1970-01-03"
+3,"1970-01-04"
+4,"1970-01-05"
+5,"1970-01-06"
+6,"1970-01-07"
+7,"1970-01-08"
+8,"1970-01-09"
+9,"1970-01-10"
diff --git a/tests/queries/0_stateless/02404_data.CustomSeparated b/tests/queries/0_stateless/02404_data.CustomSeparated
new file mode 100644
index 00000000000..f3ae1663536
--- /dev/null
+++ b/tests/queries/0_stateless/02404_data.CustomSeparated
@@ -0,0 +1,10 @@
+0	1970-01-01
+1	1970-01-02
+2	1970-01-03
+3	1970-01-04
+4	1970-01-05
+5	1970-01-06
+6	1970-01-07
+7	1970-01-08
+8	1970-01-09
+9	1970-01-10
diff --git a/tests/queries/0_stateless/02404_data.JSONCompactEachRow b/tests/queries/0_stateless/02404_data.JSONCompactEachRow
new file mode 100644
index 00000000000..de2e0986aab
--- /dev/null
+++ b/tests/queries/0_stateless/02404_data.JSONCompactEachRow
@@ -0,0 +1,10 @@
+["0", "1970-01-01"]
+["1", "1970-01-02"]
+["2", "1970-01-03"]
+["3", "1970-01-04"]
+["4", "1970-01-05"]
+["5", "1970-01-06"]
+["6", "1970-01-07"]
+["7", "1970-01-08"]
+["8", "1970-01-09"]
+["9", "1970-01-10"]
diff --git a/tests/queries/0_stateless/02404_data.JSONEachRow b/tests/queries/0_stateless/02404_data.JSONEachRow
new file mode 100644
index 00000000000..e77256ac7fc
--- /dev/null
+++ b/tests/queries/0_stateless/02404_data.JSONEachRow
@@ -0,0 +1,10 @@
+{"number":"0","toDate(number)":"1970-01-01"}
+{"number":"1","toDate(number)":"1970-01-02"}
+{"number":"2","toDate(number)":"1970-01-03"}
+{"number":"3","toDate(number)":"1970-01-04"}
+{"number":"4","toDate(number)":"1970-01-05"}
+{"number":"5","toDate(number)":"1970-01-06"}
+{"number":"6","toDate(number)":"1970-01-07"}
+{"number":"7","toDate(number)":"1970-01-08"}
+{"number":"8","toDate(number)":"1970-01-09"}
+{"number":"9","toDate(number)":"1970-01-10"}
diff --git a/tests/queries/0_stateless/02404_data.TSKV b/tests/queries/0_stateless/02404_data.TSKV
new file mode 100644
index 00000000000..70f7ad33c8b
--- /dev/null
+++ b/tests/queries/0_stateless/02404_data.TSKV
@@ -0,0 +1,10 @@
+number=0	toDate(number)=1970-01-01
+number=1	toDate(number)=1970-01-02
+number=2	toDate(number)=1970-01-03
+number=3	toDate(number)=1970-01-04
+number=4	toDate(number)=1970-01-05
+number=5	toDate(number)=1970-01-06
+number=6	toDate(number)=1970-01-07
+number=7	toDate(number)=1970-01-08
+number=8	toDate(number)=1970-01-09
+number=9	toDate(number)=1970-01-10
diff --git a/tests/queries/0_stateless/02404_data.TSV b/tests/queries/0_stateless/02404_data.TSV
new file mode 100644
index 00000000000..f3ae1663536
--- /dev/null
+++ b/tests/queries/0_stateless/02404_data.TSV
@@ -0,0 +1,10 @@
+0	1970-01-01
+1	1970-01-02
+2	1970-01-03
+3	1970-01-04
+4	1970-01-05
+5	1970-01-06
+6	1970-01-07
+7	1970-01-08
+8	1970-01-09
+9	1970-01-10
diff --git a/tests/queries/0_stateless/02404_data.TSVWithNames b/tests/queries/0_stateless/02404_data.TSVWithNames
new file mode 100644
index 00000000000..23310234a8c
--- /dev/null
+++ b/tests/queries/0_stateless/02404_data.TSVWithNames
@@ -0,0 +1,11 @@
+number	toDate(number)
+0	1970-01-01
+1	1970-01-02
+2	1970-01-03
+3	1970-01-04
+4	1970-01-05
+5	1970-01-06
+6	1970-01-07
+7	1970-01-08
+8	1970-01-09
+9	1970-01-10
diff --git a/tests/queries/0_stateless/02404_data.Values b/tests/queries/0_stateless/02404_data.Values
new file mode 100644
index 00000000000..d9a621d7ec9
--- /dev/null
+++ b/tests/queries/0_stateless/02404_data.Values
@@ -0,0 +1 @@
+(0,'1970-01-01'),(1,'1970-01-02'),(2,'1970-01-03'),(3,'1970-01-04'),(4,'1970-01-05'),(5,'1970-01-06'),(6,'1970-01-07'),(7,'1970-01-08'),(8,'1970-01-09'),(9,'1970-01-10')
\ No newline at end of file
diff --git a/tests/queries/0_stateless/02404_schema_inference_cache_respect_format_settings.reference b/tests/queries/0_stateless/02404_schema_inference_cache_respect_format_settings.reference
index 049603328d9..3d6b1021916 100644
--- a/tests/queries/0_stateless/02404_schema_inference_cache_respect_format_settings.reference
+++ b/tests/queries/0_stateless/02404_schema_inference_cache_respect_format_settings.reference
@@ -4,7 +4,7 @@ c2	Nullable(Date)
 c1	Nullable(Float64)					
 c2	Nullable(Date)					
 c1	Nullable(Int64)					
-c2	Nullable(DateTime64(9))					
+c2	Nullable(DateTime)					
 c1	UInt8					
 c2	Nullable(Date)					
 4
@@ -14,7 +14,7 @@ toDate(number)	Nullable(Date)
 number	Nullable(Float64)					
 toDate(number)	Nullable(Date)					
 number	Nullable(Int64)					
-toDate(number)	Nullable(DateTime64(9))					
+toDate(number)	Nullable(DateTime)					
 number	Nullable(Int64)					
 toDate(number)	Nullable(Date)					
 4
@@ -24,7 +24,7 @@ c2	Nullable(Date)
 c1	Nullable(Float64)					
 c2	Nullable(Date)					
 c1	Nullable(Int64)					
-c2	Nullable(DateTime64(9))					
+c2	Nullable(DateTime)					
 c1	UInt8					
 c2	Nullable(Date)					
 4
@@ -34,7 +34,7 @@ toDate(number)	Nullable(Date)
 number	Nullable(Float64)					
 toDate(number)	Nullable(Date)					
 number	Nullable(Int64)					
-toDate(number)	Nullable(DateTime64(9))					
+toDate(number)	Nullable(DateTime)					
 number	Nullable(Int64)					
 toDate(number)	Nullable(Date)					
 4
@@ -44,7 +44,7 @@ toDate(number)	Nullable(Date)
 number	Nullable(Float64)					
 toDate(number)	Nullable(Date)					
 number	Nullable(Int64)					
-toDate(number)	Nullable(DateTime64(9))					
+toDate(number)	Nullable(DateTime)					
 number	Nullable(Int64)					
 toDate(number)	Nullable(Date)					
 4
@@ -54,7 +54,7 @@ c2	Nullable(Date)
 c1	Nullable(Float64)					
 c2	Nullable(Date)					
 c1	Nullable(Int64)					
-c2	Nullable(DateTime64(9))					
+c2	Nullable(DateTime)					
 c1	UInt8					
 c2	Nullable(Date)					
 4
@@ -64,7 +64,7 @@ toDate(number)	Nullable(Date)
 number	Nullable(Float64)					
 toDate(number)	Nullable(Date)					
 number	Nullable(Int64)					
-toDate(number)	Nullable(DateTime64(9))					
+toDate(number)	Nullable(DateTime)					
 number	Nullable(Int64)					
 toDate(number)	Nullable(Date)					
 4
@@ -74,7 +74,7 @@ c2	Nullable(Date)
 c1	Nullable(Float64)					
 c2	Nullable(Date)					
 c1	Nullable(Int64)					
-c2	Nullable(DateTime64(9))					
+c2	Nullable(DateTime)					
 c1	UInt8					
 c2	Nullable(Date)					
 4
@@ -84,7 +84,7 @@ c2	Nullable(Date)
 c1	Nullable(Float64)					
 c2	Nullable(Date)					
 c1	Nullable(Int64)					
-c2	Nullable(DateTime64(9))					
+c2	Nullable(DateTime)					
 c1	UInt8					
 c2	Nullable(Date)					
 4

From 50a8cee0c5a4cd067cee2dc5584401b15283b3cd Mon Sep 17 00:00:00 2001
From: avogar <avogar@clickhouse.com>
Date: Thu, 15 Aug 2024 11:39:04 +0000
Subject: [PATCH 84/88] Update docs

---
 docs/en/interfaces/schema-inference.md | 93 +++++++++++++++++---------
 1 file changed, 62 insertions(+), 31 deletions(-)

diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md
index 05fae994cbe..4afba20d76c 100644
--- a/docs/en/interfaces/schema-inference.md
+++ b/docs/en/interfaces/schema-inference.md
@@ -359,13 +359,14 @@ DESC format(JSONEachRow, '{"int" : 42, "float" : 42.42, "string" : "Hello, World
 Dates, DateTimes:
 
 ```sql
-DESC format(JSONEachRow, '{"date" : "2022-01-01", "datetime" : "2022-01-01 00:00:00"}')
+DESC format(JSONEachRow, '{"date" : "2022-01-01", "datetime" : "2022-01-01 00:00:00", "datetime64" : "2022-01-01 00:00:00.000"}')
 ```
 ```response
-┌─name─────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
-│ date     │ Nullable(Date)          │              │                    │         │                  │                │
-│ datetime │ Nullable(DateTime64(9)) │              │                    │         │                  │                │
-└──────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+┌─name───────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ date       │ Nullable(Date)          │              │                    │         │                  │                │
+│ datetime   │ Nullable(DateTime)      │              │                    │         │                  │                │
+│ datetime64 │ Nullable(DateTime64(9)) │              │                    │         │                  │                │
+└────────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```
 
 Arrays:
@@ -759,12 +760,13 @@ DESC format(CSV, 'Hello world!,World hello!')
 Dates, DateTimes:
 
 ```sql
-DESC format(CSV, '"2020-01-01","2020-01-01 00:00:00"')
+DESC format(CSV, '"2020-01-01","2020-01-01 00:00:00","2022-01-01 00:00:00.000"')
 ```
 ```response
 ┌─name─┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
 │ c1   │ Nullable(Date)          │              │                    │         │                  │                │
-│ c2   │ Nullable(DateTime64(9)) │              │                    │         │                  │                │
+│ c2   │ Nullable(DateTime)      │              │                    │         │                  │                │
+│ c3   │ Nullable(DateTime64(9)) │              │                    │         │                  │                │
 └──────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```
 
@@ -956,12 +958,13 @@ DESC format(TSKV, 'int=42	float=42.42	bool=true	string=Hello,World!\n')
 Dates, DateTimes:
 
 ```sql
-DESC format(TSV, '2020-01-01	2020-01-01 00:00:00')
+DESC format(TSV, '2020-01-01	2020-01-01 00:00:00	2022-01-01 00:00:00.000')
 ```
 ```response
 ┌─name─┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
 │ c1   │ Nullable(Date)          │              │                    │         │                  │                │
-│ c2   │ Nullable(DateTime64(9)) │              │                    │         │                  │                │
+│ c2   │ Nullable(DateTime)      │              │                    │         │                  │                │
+│ c3   │ Nullable(DateTime64(9)) │              │                    │         │                  │                │
 └──────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```
 
@@ -1126,12 +1129,13 @@ DESC format(Values, $$(42, 42.42, true, 'Hello,World!')$$)
 Dates, DateTimes:
 
 ```sql
-DESC format(Values, $$('2020-01-01', '2020-01-01 00:00:00')$$)
-```
+DESC format(Values, $$('2020-01-01', '2020-01-01 00:00:00', '2022-01-01 00:00:00.000')$$)
+```
 ```response
 ┌─name─┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
 │ c1   │ Nullable(Date)          │              │                    │         │                  │                │
-│ c2   │ Nullable(DateTime64(9)) │              │                    │         │                  │                │
+│ c2   │ Nullable(DateTime)      │              │                    │         │                  │                │
+│ c3   │ Nullable(DateTime64(9)) │              │                    │         │                  │                │
 └──────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```
 
@@ -1504,8 +1508,8 @@ DESC format(JSONEachRow, $$
 
 #### input_format_try_infer_datetimes
 
-If enabled, ClickHouse will try to infer type `DateTime64` from string fields in schema inference for text formats.
-If all fields from a column in sample data were successfully parsed as datetimes, the result type will be `DateTime64(9)`,
+If enabled, ClickHouse will try to infer type `DateTime` or `DateTime64` from string fields in schema inference for text formats.
+If all fields from a column in sample data were successfully parsed as datetimes, the result type will be `DateTime` or `DateTime64(9)` (if any datetime had a fractional part),
 if at least one field was not parsed as datetime, the result type will be `String`.
 
 Enabled by default.
@@ -1513,39 +1517,66 @@ Enabled by default.
 **Examples**
 
 ```sql
-SET input_format_try_infer_datetimes = 0
+SET input_format_try_infer_datetimes = 0;
 DESC format(JSONEachRow, $$
-                                {"datetime" : "2021-01-01 00:00:00.000"}
-                                {"datetime" : "2022-01-01 00:00:00.000"}
+                                {"datetime" : "2021-01-01 00:00:00", "datetime64" : "2021-01-01 00:00:00.000"}
+                                {"datetime" : "2022-01-01 00:00:00", "datetime64" : "2022-01-01 00:00:00.000"}
                          $$)
 ```
 ```response
-┌─name─────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
-│ datetime │ Nullable(String) │              │                    │         │                  │                │
-└──────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+┌─name───────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ datetime   │ Nullable(String) │              │                    │         │                  │                │
+│ datetime64 │ Nullable(String) │              │                    │         │                  │                │
+└────────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```
 ```sql
-SET input_format_try_infer_datetimes = 1
+SET input_format_try_infer_datetimes = 1;
 DESC format(JSONEachRow, $$
-                                {"datetime" : "2021-01-01 00:00:00.000"}
-                                {"datetime" : "2022-01-01 00:00:00.000"}
+                                {"datetime" : "2021-01-01 00:00:00", "datetime64" : "2021-01-01 00:00:00.000"}
+                                {"datetime" : "2022-01-01 00:00:00", "datetime64" : "2022-01-01 00:00:00.000"}
                          $$)
 ```
 ```response
-┌─name─────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
-│ datetime │ Nullable(DateTime64(9)) │              │                    │         │                  │                │
-└──────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+┌─name───────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ datetime   │ Nullable(DateTime)      │              │                    │         │                  │                │
+│ datetime64 │ Nullable(DateTime64(9)) │              │                    │         │                  │                │
+└────────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```
 ```sql
 DESC format(JSONEachRow, $$
-                                {"datetime" : "2021-01-01 00:00:00.000"}
-                                {"datetime" : "unknown"}
+                                {"datetime" : "2021-01-01 00:00:00", "datetime64" : "2021-01-01 00:00:00.000"}
+                                {"datetime" : "unknown", "datetime64" : "unknown"}
                          $$)
 ```
 ```response
-┌─name─────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
-│ datetime │ Nullable(String) │              │                    │         │                  │                │
-└──────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+┌─name───────┬─type─────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ datetime   │ Nullable(String) │              │                    │         │                  │                │
+│ datetime64 │ Nullable(String) │              │                    │         │                  │                │
+└────────────┴──────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
+```
+
+#### input_format_try_infer_datetimes_only_datetime64
+
+If enabled, ClickHouse will always infer `DateTime64(9)` when `input_format_try_infer_datetimes` is enabled, even if datetime values don't contain a fractional part.
+
+Disabled by default.
+
+**Examples**
+
+```sql
+SET input_format_try_infer_datetimes = 1;
+SET input_format_try_infer_datetimes_only_datetime64 = 1;
+DESC format(JSONEachRow, $$
+                                {"datetime" : "2021-01-01 00:00:00", "datetime64" : "2021-01-01 00:00:00.000"}
+                                {"datetime" : "2022-01-01 00:00:00", "datetime64" : "2022-01-01 00:00:00.000"}
+                         $$)
+```
+
+```text
+┌─name───────┬─type────────────────────┬─default_type─┬─default_expression─┬─comment─┬─codec_expression─┬─ttl_expression─┐
+│ datetime   │ Nullable(DateTime64(9)) │              │                    │         │                  │                │
+│ datetime64 │ Nullable(DateTime64(9)) │              │                    │         │                  │                │
+└────────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘
 ```
 
 Note: Parsing datetimes during schema inference respect setting [date_time_input_format](/docs/en/operations/settings/settings-formats.md#date_time_input_format)

From 03bfb1562b56d96963e75bbd14b6759ae103e52a Mon Sep 17 00:00:00 2001
From: Han Fei <hanfei19910905@gmail.com>
Date: Thu, 15 Aug 2024 14:26:01 +0200
Subject: [PATCH 85/88] fix overflow

---
 src/Common/OptimizedRegularExpression.cpp | 2 ++
 src/Common/tests/gtest_optimize_re.cpp    | 1 +
 2 files changed, 3 insertions(+)

diff --git a/src/Common/OptimizedRegularExpression.cpp b/src/Common/OptimizedRegularExpression.cpp
index 04e5f846adf..2cdb3409487 100644
--- a/src/Common/OptimizedRegularExpression.cpp
+++ b/src/Common/OptimizedRegularExpression.cpp
@@ -266,6 +266,8 @@ const char * analyzeImpl(
                                 break;
                         }
                         pos += offset;
+                        if (pos == end)
+                            return pos;
                         /// if this group only contains flags, we have nothing to do.
                         if (*pos == ')')
                         {
diff --git a/src/Common/tests/gtest_optimize_re.cpp b/src/Common/tests/gtest_optimize_re.cpp
index 0730a13f160..d6735c3ccfe 100644
--- a/src/Common/tests/gtest_optimize_re.cpp
+++ b/src/Common/tests/gtest_optimize_re.cpp
@@ -21,6 +21,7 @@ TEST(OptimizeRE, analyze)
     test_f("c([^k]*)de", "");
     test_f("(?-s)bob", "bob", {}, false, true);
     test_f("(?s)bob", "bob", {}, false, true);
+    test_f("(?ssss", "");
     test_f("abc(de)fg", "abcdefg", {}, false, true);
     test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"}, false, true);
     test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"}, false, true);

From 657bbce23f6d764dc0172f9de3d6bc7fcd06fe10 Mon Sep 17 00:00:00 2001
From: kssenii <sumarokovakseniia@mail.ru>
Date: Thu, 15 Aug 2024 14:38:20 +0200
Subject: [PATCH 86/88] Add a test

---
 tests/integration/test_mask_sensitive_info/test.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tests/integration/test_mask_sensitive_info/test.py b/tests/integration/test_mask_sensitive_info/test.py
index 6f6dc4d287f..8d5345082ff 100644
--- a/tests/integration/test_mask_sensitive_info/test.py
+++ b/tests/integration/test_mask_sensitive_info/test.py
@@ -202,6 +202,10 @@ def test_create_table():
         f"S3Queue('http://minio1:9001/root/data/', 'CSV', 'gzip') settings mode = 'ordered'",
         f"S3Queue('http://minio1:9001/root/data/', 'minio', '{password}', 'CSV') settings mode = 'ordered'",
         f"S3Queue('http://minio1:9001/root/data/', 'minio', '{password}', 'CSV', 'gzip') settings mode = 'ordered'",
+        (
+            f"Iceberg('http://minio1:9001/root/data/test11.csv.gz', 'minio', '{password}')",
+            "DNS_ERROR",
+        ),
     ]
 
     def make_test_case(i):
@@ -266,6 +270,7 @@ def test_create_table():
             # due to sensitive data substituion the query will be normalized, so not "settings" but "SETTINGS"
             "CREATE TABLE table19 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV') SETTINGS mode = 'ordered'",
             "CREATE TABLE table20 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV', 'gzip') SETTINGS mode = 'ordered'",
+            "CREATE TABLE table21 (`x` int) ENGINE = Iceberg('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')",
         ],
         must_not_contain=[password],
     )
@@ -387,6 +392,7 @@ def test_table_functions():
         f"azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_15.csv', '{azure_account_name}', '{azure_account_key}', 'CSV', 'none', 'auto')",
         f"azureBlobStorageCluster('test_shard_localhost', named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_16.csv', format = 'CSV')",
         f"azureBlobStorageCluster('test_shard_localhost', named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_17.csv', account_name = '{azure_account_name}', account_key = '{azure_account_key}')",
+        f"iceberg('http://minio1:9001/root/data/test11.csv.gz', 'minio', '{password}')",
     ]
 
     def make_test_case(i):
@@ -478,6 +484,7 @@ def test_table_functions():
             f"CREATE TABLE tablefunc48 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_15.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')",
             f"CREATE TABLE tablefunc49 (x int) AS azureBlobStorageCluster('test_shard_localhost', named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_16.csv', format = 'CSV')",
             f"CREATE TABLE tablefunc50 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_17.csv', account_name = '{azure_account_name}', account_key = '[HIDDEN]')",
+            "CREATE TABLE tablefunc51 (`x` int) AS iceberg('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')",
         ],
         must_not_contain=[password],
     )

From fb037bcc722939f8d01fbd63c155a9a816c83f94 Mon Sep 17 00:00:00 2001
From: jsc0218 <jsc0218@gmail.com>
Date: Thu, 15 Aug 2024 13:48:43 +0000
Subject: [PATCH 87/88] move to mergetree setting and add more info

---
 .../operations/settings/merge-tree-settings.md  | 17 +++++++++++++++--
 docs/en/sql-reference/statements/delete.md      |  3 +--
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/docs/en/operations/settings/merge-tree-settings.md b/docs/en/operations/settings/merge-tree-settings.md
index a3bd919d3ce..a13aacc76e6 100644
--- a/docs/en/operations/settings/merge-tree-settings.md
+++ b/docs/en/operations/settings/merge-tree-settings.md
@@ -1042,10 +1042,23 @@ Compression rates of LZ4 or ZSTD improve on average by 20-40%.
 This setting works best for tables with no primary key or a low-cardinality primary key, i.e. a table with only few distinct primary key values.
 High-cardinality primary keys, e.g. involving timestamp columns of type `DateTime64`, are not expected to benefit from this setting.
 
-### deduplicate_merge_projection_mode
+## lightweight_mutation_projection_mode
+
+By default, lightweight `DELETE` does not work for tables with projections, because rows in a projection may be affected by a `DELETE` operation. This is why the default value is `throw`.
+However, this setting can change the behavior. With the value `drop` or `rebuild`, deletes will work with projections. `drop` deletes the projection, so the current query might be fast because the projection is simply removed, but future queries might be slow because no projection is attached.
+`rebuild` rebuilds the projection, which might slow down the current query but might speed up future queries. Note that both options work at the part level only,
+which means that projections in parts that are not touched stay intact and do not trigger any action such as drop or rebuild.
+
+Possible values:
+
+- throw, drop, rebuild
+
+Default value: throw
+
+## deduplicate_merge_projection_mode
 
 Whether to allow create projection for the table with non-classic MergeTree, that is not (Replicated, Shared) MergeTree. If allowed, what is the action when merge projections, either drop or rebuild. So classic MergeTree would ignore this setting.
-It also controls `OPTIMIZE DEDUPLICATE` as well, but has effect on all MergeTree family members.
+It also controls `OPTIMIZE DEDUPLICATE`, but there it has an effect on all MergeTree family members. Similar to the setting `lightweight_mutation_projection_mode`, it works at the part level.
 
 Possible values:
 
diff --git a/docs/en/sql-reference/statements/delete.md b/docs/en/sql-reference/statements/delete.md
index 88a9c933519..78142f880fe 100644
--- a/docs/en/sql-reference/statements/delete.md
+++ b/docs/en/sql-reference/statements/delete.md
@@ -38,8 +38,7 @@ If you anticipate frequent deletes, consider using a [custom partitioning key](/
 
 ### Lightweight `DELETE`s with projections
 
-By default, `DELETE` does not work for tables with projections. This is because rows in a projection may be affected by a `DELETE` operation and may require the projection to be rebuilt, negatively affecting `DELETE` performance.
-However, there is an option to change this behavior. By changing setting `lightweight_mutation_projection_mode = 'drop'`, deletes will work with projections.
+By default, `DELETE` does not work for tables with projections. This is because rows in a projection may be affected by a `DELETE` operation. However, the [MergeTree setting](https://clickhouse.com/docs/en/operations/settings/merge-tree-settings) `lightweight_mutation_projection_mode` can change this behavior.
 
 ## Performance considerations when using lightweight `DELETE`
 

From bdd0e01545a93b8bda34f667ede94f0e0faaa665 Mon Sep 17 00:00:00 2001
From: Max Kainov <maxkaynov@gmail.com>
Date: Thu, 15 Aug 2024 14:06:21 +0200
Subject: [PATCH 88/88] CI: Auto release workflow

---
 .github/actions/debug/action.yml     |  18 +++++
 .github/workflows/auto_releases.yml  | 109 +++++++++++++++++++++++++++
 .github/workflows/create_release.yml |  21 ++++++
 3 files changed, 148 insertions(+)
 create mode 100644 .github/actions/debug/action.yml
 create mode 100644 .github/workflows/auto_releases.yml

diff --git a/.github/actions/debug/action.yml b/.github/actions/debug/action.yml
new file mode 100644
index 00000000000..e1fe3f28024
--- /dev/null
+++ b/.github/actions/debug/action.yml
@@ -0,0 +1,18 @@
+name: DebugInfo
+description: Prints workflow debug info
+
+runs:
+  using: "composite"
+  steps:
+    - name: Print envs
+      shell: bash
+      run: |
+          echo "::group::Envs"
+          env
+          echo "::endgroup::"
+    - name: Print Event.json
+      shell: bash
+      run: |
+          echo "::group::Event.json"
+          python3 -m json.tool "$GITHUB_EVENT_PATH"
+          echo "::endgroup::"
diff --git a/.github/workflows/auto_releases.yml b/.github/workflows/auto_releases.yml
new file mode 100644
index 00000000000..c159907187c
--- /dev/null
+++ b/.github/workflows/auto_releases.yml
@@ -0,0 +1,109 @@
+name: AutoReleases
+
+env:
+  PYTHONUNBUFFERED: 1
+
+concurrency:
+  group: autoreleases
+
+on:
+  #  schedule:
+  #    - cron: '0 9 * * *'
+  workflow_dispatch:
+    inputs:
+      dry-run:
+        description: 'Dry run'
+        required: false
+        default: true
+        type: boolean
+
+jobs:
+  AutoReleaseInfo:
+    runs-on: [self-hosted, style-checker-aarch64]
+    outputs:
+      data: ${{ steps.info.outputs.AUTO_RELEASE_PARAMS }}
+      dry_run: ${{ steps.info.outputs.DRY_RUN }}
+    steps:
+      - name: Debug Info
+        uses: ./.github/actions/debug
+      - name: Set envs
+        run: |
+          cat >> "$GITHUB_ENV" << 'EOF'
+          ROBOT_CLICKHOUSE_SSH_KEY<<RCSK
+          ${{secrets.ROBOT_CLICKHOUSE_SSH_KEY}}
+          RCSK
+          EOF
+          echo "DRY_RUN=true" >> "$GITHUB_ENV"
+      - name: Check out repository code
+        uses: ClickHouse/checkout@v1
+      - name: Prepare Info
+        id: info
+        run: |
+          cd "$GITHUB_WORKSPACE/tests/ci"
+          python3 auto_release.py --prepare
+          echo "::group::Auto Release Info"
+          python3 -m json.tool /tmp/autorelease_info.json
+          echo "::endgroup::"
+          {
+              echo 'AUTO_RELEASE_PARAMS<<EOF'
+              cat  /tmp/autorelease_info.json
+              echo 'EOF'
+          } >> "$GITHUB_ENV"
+          {
+              echo 'AUTO_RELEASE_PARAMS<<EOF'
+              cat  /tmp/autorelease_info.json
+              echo 'EOF'
+          } >> "$GITHUB_OUTPUT"
+          echo "DRY_RUN=true" >> "$GITHUB_OUTPUT"
+      - name: Post Release Branch statuses
+        run: |
+          cd "$GITHUB_WORKSPACE/tests/ci"
+          python3 auto_release.py --post-status
+      - name: Clean up
+        uses: ./.github/actions/clean
+
+  Release_0:
+    needs: AutoReleaseInfo
+    name: Release ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[0].release_branch }}
+    if: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[0] && fromJson(needs.AutoReleaseInfo.outputs.data).releases[0].ready }}
+    uses: ./.github/workflows/create_release.yml
+    with:
+      ref: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[0].commit_sha }}
+      type: patch
+      dry-run: ${{ needs.AutoReleaseInfo.outputs.dry_run }}
+#
+#  Release_1:
+#    needs: [AutoReleaseInfo, Release_0]
+#    name: Release ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[1].release_branch }}
+#    if: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[1] && fromJson(needs.AutoReleaseInfo.outputs.data).releases[1].ready }}
+#    uses: ./.github/workflows/create_release.yml
+#    with:
+#      ref: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[1].commit_sha }}
+#      type: patch
+#      dry-run: ${{ env.DRY_RUN }}
+#
+#  Release_2:
+#    needs: [AutoReleaseInfo, Release_1]
+#    name: Release ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[2].release_branch }}
+#    if: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[2] && fromJson(needs.AutoReleaseInfo.outputs.data).releases[2].ready }}
+#    uses: ./.github/workflows/create_release.yml
+#    with:
+#      ref: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[2].commit_sha }}
+#      type: patch
+#      dry-run: ${{ env.DRY_RUN }}
+#
+#  Release_3:
+#    needs: [AutoReleaseInfo, Release_2]
+#    name: Release ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[3].release_branch }}
+#    if: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[3] && fromJson(needs.AutoReleaseInfo.outputs.data).releases[3].ready }}
+#    uses: ./.github/workflows/create_release.yml
+#    with:
+#      ref: ${{ fromJson(needs.AutoReleaseInfo.outputs.data).releases[3].commit_sha }}
+#      type: patch
+#      dry-run: ${{ env.DRY_RUN }}
+
+#  - name: Post Slack Message
+#    if: ${{ !cancelled() }}
+#    run: |
+#      cd "$GITHUB_WORKSPACE/tests/ci"
+#      python3 auto_release.py --post-auto-release-complete --wf-status ${{ job.status }}
diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml
index eb16c25f604..1553d689227 100644
--- a/.github/workflows/create_release.yml
+++ b/.github/workflows/create_release.yml
@@ -2,6 +2,7 @@ name: CreateRelease
 
 concurrency:
   group: release
+
 'on':
   workflow_dispatch:
     inputs:
@@ -26,6 +27,26 @@ concurrency:
         required: false
         default: false
         type: boolean
+  workflow_call:
+    inputs:
+      ref:
+        description: 'Git reference (branch or commit sha) from which to create the release'
+        required: true
+        type: string
+      type:
+        description: 'The type of release: "new" for a new release or "patch" for a patch release'
+        required: true
+        type: string
+      only-repo:
+        description: 'Run only repos updates including docker (repo-recovery, tests)'
+        required: false
+        default: false
+        type: boolean
+      dry-run:
+        description: 'Dry run'
+        required: false
+        default: false
+        type: boolean
 
 jobs:
   CreateRelease: