From 32d892c9db424b7df8616ca5d6459cc78ba0cfde Mon Sep 17 00:00:00 2001
From: MikhailBurdukov
Date: Tue, 15 Oct 2024 15:06:57 +0000
Subject: [PATCH 01/80] tests fix

---
 src/Interpreters/InterpreterSystemQuery.cpp |  2 +-
 tests/integration/test_drop_replica/test.py | 19 ++++++++++---------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp
index b743095e6f6..3016f62f20d 100644
--- a/src/Interpreters/InterpreterSystemQuery.cpp
+++ b/src/Interpreters/InterpreterSystemQuery.cpp
@@ -1010,7 +1010,7 @@ void InterpreterSystemQuery::dropReplica(ASTSystemQuery & query)
         {
             ReplicatedTableStatus status;
             storage_replicated->getStatus(status);
-            if (status.zookeeper_info.path == query.replica_zk_path)
+            if (status.replica_path == remote_replica_path)
                 throw Exception(ErrorCodes::TABLE_WAS_NOT_DROPPED,
                                 "There is a local table {}, which has the same table path in ZooKeeper. "
                                 "Please check the path in query. "
diff --git a/tests/integration/test_drop_replica/test.py b/tests/integration/test_drop_replica/test.py
index e0928c6ab08..b959e80fc19 100644
--- a/tests/integration/test_drop_replica/test.py
+++ b/tests/integration/test_drop_replica/test.py
@@ -141,11 +141,7 @@ def test_drop_replica(start_cluster):
             shard=1
         )
     )
-    assert "There is a local table" in node_1_2.query_and_get_error(
-        "SYSTEM DROP REPLICA 'node_1_1' FROM ZKPATH '/clickhouse/tables/test/{shard}/replicated/test_table'".format(
-            shard=1
-        )
-    )
+
     assert "There is a local table" in node_1_1.query_and_get_error(
         "SYSTEM DROP REPLICA 'node_1_1' FROM ZKPATH '/clickhouse/tables/test/{shard}/replicated/test_table'".format(
             shard=1
@@ -221,11 +217,16 @@ def test_drop_replica(start_cluster):
     )
     assert exists_replica_1_1 == None

-    node_1_2.query("SYSTEM DROP REPLICA 'node_1_1'")
-    exists_replica_1_1 = check_exists(
+    node_1_2.query("DETACH TABLE test4.test_table")
+    node_1_1.query(
+        "SYSTEM DROP REPLICA 'node_1_2' FROM ZKPATH '/clickhouse/tables/test4/{shard}/replicated/test_table'".format(
+            shard=1
+        )
+    )
+    exists_replica_1_2 = check_exists(
         zk,
         "/clickhouse/tables/test4/{shard}/replicated/test_table/replicas/{replica}".format(
-            shard=1, replica="node_1_1"
+            shard=1, replica="node_1_2"
         ),
     )
-    assert exists_replica_1_1 == None
+    assert exists_replica_1_2 == None

From 42dd97b78c189ffdd1dcba13c5bc3ca84e1388f6 Mon Sep 17 00:00:00 2001
From: MikhailBurdukov
Date: Tue, 15 Oct 2024 15:23:47 +0000
Subject: [PATCH 02/80] Empty

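The assertions in the test above reduce to one primitive: checking whether a replica's znode is still present after SYSTEM DROP REPLICA. For reference, a minimal standalone sketch of that check, assuming a reachable ZooKeeper on localhost:2181 and the kazoo client (the integration tests use their own cluster helper instead):

```python
from kazoo.client import KazooClient

def check_exists(zk, path):
    # Returns a ZnodeStat when the node is present, None otherwise,
    # which is the contract the test's assertions rely on.
    return zk.exists(path)

zk = KazooClient(hosts="localhost:2181")
zk.start()
path = "/clickhouse/tables/test4/1/replicated/test_table/replicas/node_1_2"
assert check_exists(zk, path) is None  # replica metadata is gone after DROP REPLICA
zk.stop()
```
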
From d1b3f364fb55404427c756dafe959b8d05b31c99 Mon Sep 17 00:00:00 2001
From: MikhailBurdukov
Date: Wed, 16 Oct 2024 08:59:11 +0000
Subject: [PATCH 03/80] Fix flaky check

---
 tests/integration/test_drop_replica/test.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/tests/integration/test_drop_replica/test.py b/tests/integration/test_drop_replica/test.py
index b959e80fc19..b70a0725039 100644
--- a/tests/integration/test_drop_replica/test.py
+++ b/tests/integration/test_drop_replica/test.py
@@ -8,6 +8,7 @@ def fill_nodes(nodes, shard):
     for node in nodes:
         node.query(
             """
+            DROP DATABASE IF EXISTS test SYNC;
             CREATE DATABASE test;

             CREATE TABLE test.test_table(date Date, id UInt32)
@@ -20,6 +21,7 @@ def fill_nodes(nodes, shard):
         node.query(
             """
+            DROP DATABASE IF EXISTS test1 SYNC;
             CREATE DATABASE test1;

             CREATE TABLE test1.test_table(date Date, id UInt32)
@@ -32,6 +34,7 @@ def fill_nodes(nodes, shard):
         node.query(
             """
+            DROP DATABASE IF EXISTS test2 SYNC;
             CREATE DATABASE test2;

             CREATE TABLE test2.test_table(date Date, id UInt32)
@@ -44,7 +47,8 @@ def fill_nodes(nodes, shard):
         node.query(
             """
-            CREATE DATABASE test3;
+            DROP DATABASE IF EXISTS test3 SYNC;
+            CREATE DATABASE test3;

             CREATE TABLE test3.test_table(date Date, id UInt32)
             ENGINE = ReplicatedMergeTree('/clickhouse/tables/test3/{shard}/replicated/test_table', '{replica}')
             ORDER BY id PARTITION BY toYYYYMM(date)
@@ -56,6 +60,7 @@ def fill_nodes(nodes, shard):
         node.query(
             """
+            DROP DATABASE IF EXISTS test4 SYNC;
             CREATE DATABASE test4;

             CREATE TABLE test4.test_table(date Date, id UInt32)
@@ -83,9 +88,6 @@ node_1_3 = cluster.add_instance(
 def start_cluster():
     try:
         cluster.start()
-
-        fill_nodes([node_1_1, node_1_2], 1)
-
         yield cluster

     except Exception as ex:
@@ -101,6 +103,8 @@ def check_exists(zk, path):


 def test_drop_replica(start_cluster):
+    fill_nodes([node_1_1, node_1_2], 1)
+
     node_1_1.query(
         "INSERT INTO test.test_table SELECT number, toString(number) FROM numbers(100)"
     )
@@ -230,3 +234,7 @@ def test_drop_replica(start_cluster):
         ),
     )
     assert exists_replica_1_2 == None
+
+    node_1_1.query("ATTACH DATABASE test")
+    for i in range(1, 5):
+        node_1_1.query("ATTACH DATABASE test{}".format(i))

From 09a1f86db7d135d35b1dcb826776bca63f899abc Mon Sep 17 00:00:00 2001
From: MikhailBurdukov
Date: Mon, 21 Oct 2024 08:36:08 +0000
Subject: [PATCH 04/80] Test fix

---
 tests/integration/test_drop_replica/test.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/integration/test_drop_replica/test.py b/tests/integration/test_drop_replica/test.py
index b70a0725039..201507a1734 100644
--- a/tests/integration/test_drop_replica/test.py
+++ b/tests/integration/test_drop_replica/test.py
@@ -221,6 +221,8 @@ def test_drop_replica(start_cluster):
     )
     assert exists_replica_1_1 == None

+    node_1_1.query("ATTACH DATABASE test4")
+
     node_1_2.query("DETACH TABLE test4.test_table")
     node_1_1.query(
         "SYSTEM DROP REPLICA 'node_1_2' FROM ZKPATH '/clickhouse/tables/test4/{shard}/replicated/test_table'".format(
@@ -236,5 +238,5 @@ def test_drop_replica(start_cluster):
     assert exists_replica_1_2 == None

     node_1_1.query("ATTACH DATABASE test")
-    for i in range(1, 5):
+    for i in range(1, 4):
         node_1_1.query("ATTACH DATABASE test{}".format(i))

From 15aa07ba8862db3ba619ec74e3a5393999fc5ce4 Mon Sep 17 00:00:00 2001
From: Anton Popov
Date: Wed, 23 Oct 2024 15:06:57 +0000
Subject: [PATCH 05/80] optimize replacing merge for non intersecting parts

---
 src/Core/SortCursor.h                       |  38 ++++++-
 .../Algorithms/ReplacingSortedAlgorithm.cpp | 100 +++++++++++++-----
 .../Algorithms/ReplacingSortedAlgorithm.h   |   1 +
 3 files changed, 110 insertions(+), 29 deletions(-)

diff --git a/src/Core/SortCursor.h b/src/Core/SortCursor.h
index f41664a1607..3d568be199c 100644
--- a/src/Core/SortCursor.h
+++ b/src/Core/SortCursor.h
@@ -195,6 +195,15 @@ struct SortCursorHelper
         /// The last row of this cursor is no larger than the first row of the another cursor.
         return !derived().greaterAt(rhs.derived(), impl->rows - 1, 0);
     }
+
+    bool ALWAYS_INLINE totallyLess(const SortCursorHelper & rhs) const
+    {
+        if (impl->rows == 0 || rhs.impl->rows == 0)
+            return false;
+
+        /// The last row of this cursor is less than the first row of the other cursor.
+        return rhs.derived().template greaterAt<false>(derived(), 0, impl->rows - 1);
+    }
 };


@@ -203,6 +212,7 @@ struct SortCursor : SortCursorHelper<SortCursor>
     using SortCursorHelper<SortCursor>::SortCursorHelper;

     /// The specified row of this cursor is greater than the specified row of another cursor.
+    template <bool consider_order = true>
     bool ALWAYS_INLINE greaterAt(const SortCursor & rhs, size_t lhs_pos, size_t rhs_pos) const
     {
 #if USE_EMBEDDED_COMPILER
@@ -218,7 +228,10 @@ struct SortCursor : SortCursorHelper<SortCursor>
             if (res < 0)
                 return false;

-            return impl->order > rhs.impl->order;
+            if constexpr (consider_order)
+                return impl->order > rhs.impl->order;
+            else
+                return false;
         }
 #endif

@@ -235,7 +248,10 @@ struct SortCursor : SortCursorHelper<SortCursor>
                 return false;
         }

-        return impl->order > rhs.impl->order;
+        if constexpr (consider_order)
+            return impl->order > rhs.impl->order;
+        else
+            return false;
     }
 };

@@ -245,6 +261,7 @@ struct SimpleSortCursor : SortCursorHelper<SimpleSortCursor>
 {
     using SortCursorHelper<SimpleSortCursor>::SortCursorHelper;

+    template <bool consider_order = true>
     bool ALWAYS_INLINE greaterAt(const SimpleSortCursor & rhs, size_t lhs_pos, size_t rhs_pos) const
     {
         int res = 0;
@@ -271,7 +288,10 @@ struct SimpleSortCursor : SortCursorHelper<SimpleSortCursor>
         if (res < 0)
             return false;

-        return impl->order > rhs.impl->order;
+        if constexpr (consider_order)
+            return impl->order > rhs.impl->order;
+        else
+            return false;
     }
 };

@@ -280,6 +300,7 @@ struct SpecializedSingleColumnSortCursor : SortCursorHelper<SpecializedSingleColumnSortCursor<ColumnType>>
     using SortCursorHelper<SpecializedSingleColumnSortCursor<ColumnType>>::SortCursorHelper;

+    template <bool consider_order = true>
     bool ALWAYS_INLINE greaterAt(const SortCursorHelper<SpecializedSingleColumnSortCursor<ColumnType>> & rhs, size_t lhs_pos, size_t rhs_pos) const
     {
         auto & this_impl = this->impl;
@@ -302,7 +323,10 @@ struct SpecializedSingleColumnSortCursor : SortCursorHelper<SpecializedSingleColumnSortCursor<ColumnType>>
             if (res < 0)
                 return false;
         }

-        return this_impl->order > rhs.impl->order;
+        if constexpr (consider_order)
+            return this_impl->order > rhs.impl->order;
+        else
+            return false;
     }
 };

@@ -311,6 +335,7 @@ struct SortCursorWithCollation : SortCursorHelper<SortCursorWithCollation>
 {
     using SortCursorHelper<SortCursorWithCollation>::SortCursorHelper;

+    template <bool consider_order = true>
     bool ALWAYS_INLINE greaterAt(const SortCursorWithCollation & rhs, size_t lhs_pos, size_t rhs_pos) const
     {
         for (size_t i = 0; i < impl->sort_columns_size; ++i)
@@ -330,7 +355,10 @@ struct SortCursorWithCollation : SortCursorHelper<SortCursorWithCollation>
             if (res < 0)
                 return false;
         }
-        return impl->order > rhs.impl->order;
+        if constexpr (consider_order)
+            return impl->order > rhs.impl->order;
+        else
+            return false;
     }
 };
diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp
index cd347d371d9..d7ff8b9336b 100644
--- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp
+++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp
@@ -46,11 +46,28 @@ ReplacingSortedAlgorithm::ReplacingSortedAlgorithm(
 {
     if (!is_deleted_column.empty())
         is_deleted_column_number = header_.getPositionByName(is_deleted_column);
+
     if (!version_column.empty())
         version_column_number = header_.getPositionByName(version_column);
 }

 void ReplacingSortedAlgorithm::insertRow()
+{
+    if (is_deleted_column_number != -1)
+    {
+        if (!(cleanup && assert_cast<const ColumnUInt8 &>(*(*selected_row.all_columns)[is_deleted_column_number]).getData()[selected_row.row_num]))
+            insertRowImpl();
+    }
+    else
+    {
+        insertRowImpl();
+    }
+
+    /// insertRowImpl() may not have been called
+    saveChunkForSkippingFinalFromSelectedRow();
+}
+
+void ReplacingSortedAlgorithm::insertRowImpl()
 {
     if (out_row_sources_buf)
     {
@@ -67,6 +84,7 @@ void ReplacingSortedAlgorithm::insertRow()
         /// We just record the position to be selected in the chunk
         if (!selected_row.owned_chunk->replace_final_selection)
             selected_row.owned_chunk->replace_final_selection = ColumnUInt64::create();
+
         selected_row.owned_chunk->replace_final_selection->insert(selected_row.row_num);

         /// This is the last row we can select from `selected_row.owned_chunk`, keep it to emit later
@@ -74,7 +92,9 @@ void ReplacingSortedAlgorithm::insertRow()
         to_be_emitted.push(std::move(selected_row.owned_chunk));
     }
     else
+    {
         merged_data->insertRow(*selected_row.all_columns, selected_row.row_num, selected_row.owned_chunk->getNumRows());
+    }

     selected_row.clear();
 }
@@ -101,6 +121,58 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge()
             return Status(current.impl->order);
         }

+        if (current.impl->isFirst()
+            && is_deleted_column_number == -1 /// Ignore optimization if we need to filter deleted rows.
+            && sources_origin_merge_tree_part_level[current->order] > 0
+            && !skipLastRowFor(current->order) /// Ignore optimization if last row should be skipped.
+            && (queue.size() == 1 || (queue.size() >= 2 && current.totallyLess(queue.nextChild()))))
+        {
+            /// This is special optimization if current cursor is totally less than next cursor
+            /// and current chunk has no duplicates (we assume that parts with non-zero level have no duplicates)
+            /// We want to insert current cursor chunk directly in merged data.
+
+            size_t source_num = current->order;
+            auto current_chunk = std::move(*sources[source_num].chunk);
+            size_t chunk_num_rows = current_chunk.getNumRows();
+
+            /// First if merged_data is not empty we need to flush it.
+            /// We will get into the same condition on next merge call.
+            if (merged_data->mergedRows() != 0)
+                return Status(merged_data->pull());
+
+            /// We will get the next block from the corresponding source, if there is one.
+            queue.removeTop();
+
+            if (enable_vertical_final)
+            {
+                auto replace_final_selection = ColumnUInt64::create(chunk_num_rows);
+                auto & replace_final_data = replace_final_selection->getData();
+
+                std::iota(replace_final_data.begin(), replace_final_data.end(), 0);
+                current_chunk.getChunkInfos().add(std::make_shared<ChunkSelectFinalIndices>(std::move(replace_final_selection)));
+
+                Status status(merged_data->pull(), false);
+                status.required_source = source_num;
+                return Status(std::move(current_chunk), false);
+            }
+
+            merged_data->insertChunk(std::move(current_chunk), chunk_num_rows);
+            sources[source_num].chunk = {};
+
+            /// Write order of rows for other columns this data will be used in gather stream
+            if (out_row_sources_buf)
+            {
+                /// All rows are not skipped.
+                RowSourcePart row_source(source_num);
+                for (size_t i = 0; i < chunk_num_rows; ++i)
+                    out_row_sources_buf->write(row_source.data);
+            }
+
+            Status status(merged_data->pull(), false);
+            status.required_source = source_num;
+            return status;
+        }
+
         RowRef current_row;
         setRowRef(current_row, current);
@@ -113,17 +185,7 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge()

             /// Write the data for the previous primary key.
             if (!selected_row.empty())
-            {
-                if (is_deleted_column_number!=-1)
-                {
-                    if (!(cleanup && assert_cast<const ColumnUInt8 &>(*(*selected_row.all_columns)[is_deleted_column_number]).getData()[selected_row.row_num]))
-                        insertRow();
-                }
-                else
-                    insertRow();
-                /// insertRow() may has not been called
-                saveChunkForSkippingFinalFromSelectedRow();
-            }
+                insertRow();

             selected_row.clear();
         }
@@ -133,10 +195,10 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge()
         if (out_row_sources_buf)
             current_row_sources.emplace_back(current.impl->order, true);

-        if ((is_deleted_column_number!=-1))
+        if (is_deleted_column_number != -1)
         {
             const UInt8 is_deleted = assert_cast<const ColumnUInt8 &>(*current->all_columns[is_deleted_column_number]).getData()[current->getRow()];
-            if ((is_deleted != 1) && (is_deleted != 0))
+            if (is_deleted > 1)
                 throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect data: is_deleted = {} (must be 1 or 0).", toString(is_deleted));
         }

@@ -172,17 +234,7 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge()

     /// We will write the data for the last primary key.
     if (!selected_row.empty())
-    {
-        if (is_deleted_column_number!=-1)
-        {
-            if (!(cleanup && assert_cast<const ColumnUInt8 &>(*(*selected_row.all_columns)[is_deleted_column_number]).getData()[selected_row.row_num]))
-                insertRow();
-        }
-        else
-            insertRow();
-        /// insertRow() may has not been called
-        saveChunkForSkippingFinalFromSelectedRow();
-    }
+        insertRow();

     /// Skipping final: emit the remaining chunks
     if (!to_be_emitted.empty())
diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h
index 2f23f2a5c4d..b0dd4fe4b08 100644
--- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h
+++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h
@@ -63,6 +63,7 @@ private:
     PODArray<RowSourcePart> current_row_sources;

     void insertRow();
+    void insertRowImpl();

     /// Method for using in skipping FINAL logic
     /// Skipping FINAL doesn't merge rows to new chunks but marks selected rows in input chunks and emit them
From 06b31d7669c87f66b548b1f39272150cb98ebc7c Mon Sep 17 00:00:00 2001
From: Anton Popov
Date: Wed, 23 Oct 2024 16:53:25 +0000
Subject: [PATCH 06/80] fix build

---
 src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp
index d7ff8b9336b..0ce626b1dc9 100644
--- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp
+++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp
@@ -5,6 +5,7 @@
 #include
 #include
 #include
 #include
+#include <numeric>

 namespace DB
 {

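Before moving on: the fast path introduced in PATCH 05 rests on a simple invariant, namely that a sorted chunk which is totally less than every other cursor cannot contain duplicates of their keys. A rough Python model of the idea (illustration only, not the actual C++ interfaces; keys are assumed unique within each run, as in parts with non-zero level):

```python
import heapq

def replacing_merge(runs):
    """k-way merge of sorted, internally deduplicated runs.

    For equal keys the row from the higher-numbered run wins, loosely
    mimicking ReplacingMergeTree. Runs model chunks of non-zero-level parts.
    """
    heap = [(run[0][0], -idx, idx, 0) for idx, run in enumerate(runs) if run]
    heapq.heapify(heap)
    out = []
    while heap:
        key, _, idx, pos = heapq.heappop(heap)
        run = runs[idx]
        # Fast path, as in the patch: the cursor is at the first row of its chunk
        # and the whole chunk is strictly below every other cursor's first row,
        # so it cannot intersect anything else and is emitted wholesale.
        if pos == 0 and (not heap or run[-1][0] < heap[0][0]) and (not out or out[-1][0] < key):
            out.extend(run)
            continue
        if not out or out[-1][0] != key:  # keep only the first (winning) row per key
            out.append(run[pos])
        if pos + 1 < len(run):
            heapq.heappush(heap, (run[pos + 1][0], -idx, idx, pos + 1))
    return out

print(replacing_merge([[(1, "a"), (2, "b")], [(3, "c"), (4, "d")], [(4, "e"), (7, "f")]]))
# [(1, 'a'), (2, 'b'), (3, 'c'), (4, 'e'), (7, 'f')]
```

The `pos == 0` guard plays the same role as `current.impl->isFirst()` in the patch: the wholesale append is only safe while nothing from the chunk has been consumed yet.
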
From de8c5eaed016de240d203ef0bebb20d94b8eb2a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ji=C5=99=C3=AD=20Kozlovsk=C3=BD?=
Date: Thu, 24 Oct 2024 12:48:40 +0200
Subject: [PATCH 07/80] Make the Replxx client history size configurable

---
 docs/en/interfaces/cli.md               | 1 +
 programs/client/Client.cpp              | 4 ++++
 programs/disks/DisksApp.cpp             | 3 +++
 programs/disks/DisksApp.h               | 2 ++
 programs/keeper-client/KeeperClient.cpp | 3 +++
 programs/keeper-client/KeeperClient.h   | 2 ++
 src/Client/ClientApplicationBase.cpp    | 5 ++++-
 src/Client/ClientBase.cpp               | 3 +++
 src/Client/ClientBase.h                 | 1 +
 src/Client/ReplxxLineReader.cpp         | 3 +++
 src/Client/ReplxxLineReader.h           | 1 +
 11 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/docs/en/interfaces/cli.md b/docs/en/interfaces/cli.md
index 66291014ed7..504f6eec6de 100644
--- a/docs/en/interfaces/cli.md
+++ b/docs/en/interfaces/cli.md
@@ -190,6 +190,7 @@ You can pass parameters to `clickhouse-client` (all parameters have a default va
 - `--config-file` – The name of the configuration file.
 - `--secure` – If specified, will connect to server over secure connection (TLS). You might need to configure your CA certificates in the [configuration file](#configuration_files). The available configuration settings are the same as for [server-side TLS configuration](../operations/server-configuration-parameters/settings.md#openssl).
 - `--history_file` — Path to a file containing command history.
+- `--history_max_entries` — Maximum number of entries in the history file. Default value: 1 000 000.
 - `--param_<name>` — Value for a [query with parameters](#cli-queries-with-parameters).
 - `--hardware-utilization` — Print hardware utilization information in progress bar.
 - `--print-profile-events` – Print `ProfileEvents` packets.
diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp
index ffb029404d3..a0a40aa36ad 100644
--- a/programs/client/Client.cpp
+++ b/programs/client/Client.cpp
@@ -192,6 +192,10 @@ void Client::parseConnectionsCredentials(Poco::Util::AbstractConfiguration & con
             history_file = home_path + "/" + history_file.substr(1);
         config.setString("history_file", history_file);
     }
+    if (config.has(prefix + ".history_max_entries"))
+    {
+        config.setUInt("history_max_entries", history_max_entries);
+    }
     if (config.has(prefix + ".accept-invalid-certificate"))
         config.setBool("accept-invalid-certificate", config.getBool(prefix + ".accept-invalid-certificate"));
 }
diff --git a/programs/disks/DisksApp.cpp b/programs/disks/DisksApp.cpp
index 5fddfce0678..610d8eaa638 100644
--- a/programs/disks/DisksApp.cpp
+++ b/programs/disks/DisksApp.cpp
@@ -236,6 +236,7 @@ void DisksApp::runInteractiveReplxx()
     ReplxxLineReader lr(
         suggest,
         history_file,
+        history_max_entries,
         /* multiline= */ false,
         /* ignore_shell_suspend= */ false,
         query_extenders,
@@ -398,6 +399,8 @@ void DisksApp::initializeHistoryFile()
             throw;
         }
     }
+
+    history_max_entries = config().getUInt("history-max-entries", 1000000);
 }

 void DisksApp::init(const std::vector<String> & common_arguments)
diff --git a/programs/disks/DisksApp.h b/programs/disks/DisksApp.h
index 5b240648508..4f2bd7fcad6 100644
--- a/programs/disks/DisksApp.h
+++ b/programs/disks/DisksApp.h
@@ -62,6 +62,8 @@ private:
     // Fields responsible for the REPL work
     String history_file;
+    UInt32 history_max_entries = 0; /// Maximum number of entries in the history file. Initialized to 0 here because there is no proper constructor; the actual value is set in the initializeHistoryFile method.
+
     LineReader::Suggest suggest;

     static LineReader::Patterns query_extenders;
     static LineReader::Patterns query_delimiters;
diff --git a/programs/keeper-client/KeeperClient.cpp b/programs/keeper-client/KeeperClient.cpp
index 97caa142124..ad850cfa704 100644
--- a/programs/keeper-client/KeeperClient.cpp
+++ b/programs/keeper-client/KeeperClient.cpp
@@ -239,6 +239,8 @@ void KeeperClient::initialize(Poco::Util::Application & /* self */)
         }
     }

+    history_max_entries = config().getUInt("history-max-entries", 1000000);
+
     String default_log_level;
     if (config().has("query"))
         /// We don't want to see any information log in query mode, unless it was set explicitly
@@ -315,6 +317,7 @@ void KeeperClient::runInteractiveReplxx()
     ReplxxLineReader lr(
         suggest,
         history_file,
+        history_max_entries,
         /* multiline= */ false,
         /* ignore_shell_suspend= */ false,
         query_extenders,
diff --git a/programs/keeper-client/KeeperClient.h b/programs/keeper-client/KeeperClient.h
index 0d3db3c2f02..359663c6a13 100644
--- a/programs/keeper-client/KeeperClient.h
+++ b/programs/keeper-client/KeeperClient.h
@@ -59,6 +59,8 @@ protected:
     std::vector<String> getCompletions(const String & prefix) const;

     String history_file;
+    UInt32 history_max_entries; /// Maximum number of entries in the history file.
+
     LineReader::Suggest suggest;

     zkutil::ZooKeeperArgs zk_args;
diff --git a/src/Client/ClientApplicationBase.cpp b/src/Client/ClientApplicationBase.cpp
index d26641fe5f9..bceb80eb9f7 100644
--- a/src/Client/ClientApplicationBase.cpp
+++ b/src/Client/ClientApplicationBase.cpp
@@ -167,7 +167,8 @@ void ClientApplicationBase::init(int argc, char ** argv)
         ("query_kind", po::value<std::string>()->default_value("initial_query"), "One of initial_query/secondary_query/no_query")
         ("query_id", po::value<std::string>(), "query_id")

-        ("history_file", po::value<std::string>(), "path to history file")
+        ("history_file", po::value<std::string>(), "Path to a file containing command history.")
+        ("history_max_entries", po::value<UInt32>()->default_value(1000000), "Maximum number of entries in the history file.")

         ("stage", po::value<std::string>()->default_value("complete"), "Request query processing up to specified stage: complete,fetch_columns,with_mergeable_state,with_mergeable_state_after_aggregation,with_mergeable_state_after_aggregation_and_limit")
         ("progress", po::value<ProgressOption>()->implicit_value(ProgressOption::TTY, "tty")->default_value(ProgressOption::DEFAULT, "default"), "Print progress of queries execution - to TTY: tty|on|1|true|yes; to STDERR non-interactive mode: err; OFF: off|0|false|no; DEFAULT - interactive to TTY, non-interactive is off")
@@ -350,6 +351,8 @@ void ClientApplicationBase::init(int argc, char ** argv)
         getClientConfiguration().setBool("highlight", options["highlight"].as<bool>());
     if (options.count("history_file"))
         getClientConfiguration().setString("history_file", options["history_file"].as<std::string>());
+    if (options.count("history_max_entries"))
+        getClientConfiguration().setUInt("history_max_entries", options["history_max_entries"].as<UInt32>());
     if (options.count("interactive"))
         getClientConfiguration().setBool("interactive", true);
     if (options.count("pager"))
diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp
index 8f7cced73ef..e667e5f6a4a 100644
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@@ -2674,6 +2674,8 @@ void ClientBase::runInteractive()
         }
     }

+    history_max_entries = getClientConfiguration().getUInt("history_max_entries");
+
     LineReader::Patterns query_extenders = {"\\"};
     LineReader::Patterns query_delimiters = {";", "\\G", "\\G;"};
     char word_break_characters[] = " \t\v\f\a\b\r\n`~!@#$%^&*()-=+[{]}\\|;:'\",<.>/?";
@@ -2686,6 +2688,7 @@ void ClientBase::runInteractive()
     ReplxxLineReader lr(
         *suggest,
         history_file,
+        history_max_entries,
         getClientConfiguration().has("multiline"),
         getClientConfiguration().getBool("ignore_shell_suspend", true),
         query_extenders,
diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h
index b06958f1d14..5ca177af0e3 100644
--- a/src/Client/ClientBase.h
+++ b/src/Client/ClientBase.h
@@ -328,6 +328,7 @@ protected:
     String home_path;

     String history_file; /// Path to a file containing command history.
+    UInt32 history_max_entries; /// Maximum number of entries in the history file.

     String current_profile;
diff --git a/src/Client/ReplxxLineReader.cpp b/src/Client/ReplxxLineReader.cpp
index 37ceb471e5b..ee90a6cc7b7 100644
--- a/src/Client/ReplxxLineReader.cpp
+++ b/src/Client/ReplxxLineReader.cpp
@@ -293,6 +293,7 @@ void ReplxxLineReader::setLastIsDelimiter(bool flag)
 ReplxxLineReader::ReplxxLineReader(
     Suggest & suggest,
     const String & history_file_path_,
+    UInt32 history_max_entries_,
     bool multiline_,
     bool ignore_shell_suspend,
     Patterns extenders_,
@@ -313,6 +314,8 @@ ReplxxLineReader::ReplxxLineReader(
 {
     using Replxx = replxx::Replxx;

+    rx.set_max_history_size(static_cast<int>(history_max_entries_));
+
     if (!history_file_path.empty())
     {
         history_file_fd = open(history_file_path.c_str(), O_RDWR);
diff --git a/src/Client/ReplxxLineReader.h b/src/Client/ReplxxLineReader.h
index 1dbad2c70dd..ccda47170e6 100644
--- a/src/Client/ReplxxLineReader.h
+++ b/src/Client/ReplxxLineReader.h
@@ -14,6 +14,7 @@ public:
     (
         Suggest & suggest,
         const String & history_file_path,
+        UInt32 history_max_entries,
         bool multiline,
         bool ignore_shell_suspend,
         Patterns extenders_,

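The new option ultimately just feeds replxx's `set_max_history_size`, which keeps a bounded history that evicts the oldest entries first. A toy model of that behaviour using only the standard library (nothing here reflects replxx's real file format):

```python
from collections import deque

class BoundedHistory:
    """Keeps at most `max_entries` commands, discarding the oldest first,
    roughly what rx.set_max_history_size() asks replxx to do."""

    def __init__(self, max_entries):
        self.entries = deque(maxlen=max_entries)

    def add(self, command):
        self.entries.append(command)

history = BoundedHistory(max_entries=2)
for query in ["SELECT 1", "SELECT 2", "SELECT 3"]:
    history.add(query)

print(list(history.entries))  # ['SELECT 2', 'SELECT 3'], the oldest entry evicted
```
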
From 60c049375ceb7529846d070f0fcd4043ea646e6b Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Tue, 15 Oct 2024 21:58:30 +0200
Subject: [PATCH 08/80] Add ability to set user/password in http_handlers

This makes it possible to omit them in requests for
dynamic_query_handler/predefined_query_handler, which allows building more
independent handlers/apps.

Signed-off-by: Azat Khuzhin
---
 src/Access/Credentials.h                   |  3 +
 src/Server/HTTP/authenticateUserByHTTP.cpp | 18 ++++++
 src/Server/HTTP/authenticateUserByHTTP.h   | 18 ++++--
 src/Server/HTTPHandler.cpp                 | 60 +++++++++++++------
 src/Server/HTTPHandler.h                   | 36 +++++++----
 src/Server/HTTPHandlerFactory.cpp          |  2 +-
 src/Server/PrometheusRequestHandler.cpp    |  3 +-
 .../test_http_handlers_config/test.py      | 25 +++++++-
 .../test_dynamic_handler/config.xml        | 27 +++++++++
 .../test_predefined_handler/config.xml     | 30 ++++++++++
 .../users.d/users.yaml                     |  7 +++
 11 files changed, 192 insertions(+), 37 deletions(-)
 create mode 100644 tests/integration/test_http_handlers_config/users.d/users.yaml

diff --git a/src/Access/Credentials.h b/src/Access/Credentials.h
index f220b8d2c48..b21b7e6921f 100644
--- a/src/Access/Credentials.h
+++ b/src/Access/Credentials.h
@@ -15,6 +15,9 @@ public:
     explicit Credentials() = default;
     explicit Credentials(const String & user_name_);

+    Credentials(const Credentials &) = default;
+    Credentials(Credentials &&) = default;
+
     virtual ~Credentials() = default;

     const String & getUserName() const;
diff --git a/src/Server/HTTP/authenticateUserByHTTP.cpp b/src/Server/HTTP/authenticateUserByHTTP.cpp
index cbad91cc292..61029ed9560 100644
--- a/src/Server/HTTP/authenticateUserByHTTP.cpp
+++ b/src/Server/HTTP/authenticateUserByHTTP.cpp
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -54,11 +55,13 @@ bool authenticateUserByHTTP(
     HTTPServerResponse & response,
     Session & session,
     std::unique_ptr<Credentials> & request_credentials,
+    const HTTPHandlerConnectionConfig & connection_config,
     ContextPtr global_context,
     LoggerPtr log)
 {
     /// Get the credentials created by the previous call of authenticateUserByHTTP() while handling the previous HTTP request.
     auto current_credentials = std::move(request_credentials);
+    const auto & config_credentials = connection_config.credentials;

     /// The user and password can be passed by headers (similar to X-Auth-*),
     /// which is used by load balancers to pass authentication information.
@@ -70,6 +73,7 @@ bool authenticateUserByHTTP(
     /// The header 'X-ClickHouse-SSL-Certificate-Auth: on' enables checking the common name
     /// extracted from the SSL certificate used for this connection instead of checking password.
     bool has_ssl_certificate_auth = (request.get("X-ClickHouse-SSL-Certificate-Auth", "") == "on");
+    bool has_config_credentials = config_credentials.has_value();

     /// User name and password can be passed using HTTP Basic auth or query parameters
     /// (both methods are insecure).
@@ -79,6 +83,10 @@ bool authenticateUserByHTTP(
     std::string spnego_challenge;
     SSLCertificateSubjects certificate_subjects;

+    if (config_credentials)
+    {
+        checkUserNameNotEmpty(config_credentials->getUserName(), "config authentication");
+    }
     if (has_ssl_certificate_auth)
     {
 #if USE_SSL
@@ -86,6 +94,8 @@ bool authenticateUserByHTTP(
         checkUserNameNotEmpty(user, "X-ClickHouse HTTP headers");

         /// It is prohibited to mix different authorization schemes.
+        if (has_config_credentials)
+            throwMultipleAuthenticationMethods("SSL certificate authentication", "authentication set in config");
         if (!password.empty())
             throwMultipleAuthenticationMethods("SSL certificate authentication", "authentication via password");
         if (has_http_credentials)
@@ -109,6 +119,8 @@ bool authenticateUserByHTTP(
         checkUserNameNotEmpty(user, "X-ClickHouse HTTP headers");

         /// It is prohibited to mix different authorization schemes.
+        if (has_config_credentials)
+            throwMultipleAuthenticationMethods("X-ClickHouse HTTP headers", "authentication set in config");
         if (has_http_credentials)
             throwMultipleAuthenticationMethods("X-ClickHouse HTTP headers", "Authorization HTTP header");
         if (has_credentials_in_query_params)
@@ -117,6 +129,8 @@ bool authenticateUserByHTTP(
     else if (has_http_credentials)
     {
         /// It is prohibited to mix different authorization schemes.
+        if (has_config_credentials)
+            throwMultipleAuthenticationMethods("Authorization HTTP header", "authentication set in config");
         if (has_credentials_in_query_params)
             throwMultipleAuthenticationMethods("Authorization HTTP header", "authentication via parameters");
@@ -190,6 +204,10 @@ bool authenticateUserByHTTP(
             return false;
         }
     }
+    else if (has_config_credentials)
+    {
+        current_credentials = std::make_unique<BasicCredentials>(*config_credentials);
+    }
     else // I.e., now using user name and password strings ("Basic").
     {
         if (!current_credentials)
diff --git a/src/Server/HTTP/authenticateUserByHTTP.h b/src/Server/HTTP/authenticateUserByHTTP.h
index 3b5a04cae68..02dcf828faa 100644
--- a/src/Server/HTTP/authenticateUserByHTTP.h
+++ b/src/Server/HTTP/authenticateUserByHTTP.h
@@ -11,13 +11,22 @@ class HTMLForm;
 class HTTPServerResponse;
 class Session;
 class Credentials;
+class BasicCredentials;
+struct HTTPHandlerConnectionConfig;

 /// Authenticates a user via HTTP protocol and initializes a session.
+///
 /// Usually retrieves the name and the password for that user from either the request's headers or from the query parameters.
-/// Returns true when the user successfully authenticated,
-/// the session instance will be configured accordingly, and the request_credentials instance will be dropped.
-/// Returns false when the user is not authenticated yet, and the HTTP_UNAUTHORIZED response is sent with the "WWW-Authenticate" header,
-/// in this case the `request_credentials` instance must be preserved until the next request or until any exception.
+/// You can also pass user/password explicitly via `config_credentials`.
+///
+/// Returns true when the user successfully authenticated:
+/// - the session instance will be configured accordingly
+/// - and the request_credentials instance will be dropped.
+///
+/// Returns false when the user is not authenticated yet:
+/// - the HTTP_UNAUTHORIZED response is sent with the "WWW-Authenticate" header
+/// - the `request_credentials` instance must be preserved until the next request or until any exception.
+///
 /// Throws an exception if authentication failed.
 bool authenticateUserByHTTP(
@@ -25,6 +34,7 @@ bool authenticateUserByHTTP(
     HTTPServerResponse & response,
     Session & session,
     std::unique_ptr<Credentials> & request_credentials,
+    const HTTPHandlerConnectionConfig & connection_config,
     ContextPtr global_context,
     LoggerPtr log);
diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp
index 8a9ae05b355..5fd92d99b3c 100644
--- a/src/Server/HTTPHandler.cpp
+++ b/src/Server/HTTPHandler.cpp
@@ -1,6 +1,5 @@
 #include
-#include
 #include
 #include
 #include
@@ -145,6 +144,15 @@ static std::chrono::steady_clock::duration parseSessionTimeout(
     return std::chrono::seconds(session_timeout);
 }

+HTTPHandlerConnectionConfig::HTTPHandlerConnectionConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix)
+{
+    if (config.has(config_prefix + ".handler.user") || config.has(config_prefix + ".handler.password"))
+    {
+        credentials.emplace(
+            config.getString(config_prefix + ".handler.user", "default"),
+            config.getString(config_prefix + ".handler.password", ""));
+    }
+}

 void HTTPHandler::pushDelayedResults(Output & used_output)
 {
@@ -182,11 +190,12 @@ void HTTPHandler::pushDelayedResults(Output & used_output)
 }

-HTTPHandler::HTTPHandler(IServer & server_, const std::string & name, const HTTPResponseHeaderSetup & http_response_headers_override_)
+HTTPHandler::HTTPHandler(IServer & server_, const HTTPHandlerConnectionConfig & connection_config_, const std::string & name, const HTTPResponseHeaderSetup & http_response_headers_override_)
     : server(server_)
     , log(getLogger(name))
     , default_settings(server.context()->getSettingsRef())
     , http_response_headers_override(http_response_headers_override_)
+    , connection_config(connection_config_)
 {
     server_display_name = server.config().getString("display_name", getFQDNOrHostName());
 }
@@ -199,7 +208,7 @@ HTTPHandler::~HTTPHandler() = default;

 bool HTTPHandler::authenticateUser(HTTPServerRequest & request, HTMLForm & params, HTTPServerResponse & response)
 {
-    return authenticateUserByHTTP(request, params, response, *session, request_credentials, server.context(), log);
+    return authenticateUserByHTTP(request, params, response, *session, request_credentials, connection_config, server.context(), log);
 }

@@ -768,8 +777,12 @@ void HTTPHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse
 }

 DynamicQueryHandler::DynamicQueryHandler(
-    IServer & server_, const std::string & param_name_, const HTTPResponseHeaderSetup & http_response_headers_override_)
-    : HTTPHandler(server_, "DynamicQueryHandler", http_response_headers_override_), param_name(param_name_)
+    IServer & server_,
+    const HTTPHandlerConnectionConfig & connection_config,
+    const std::string & param_name_,
+    const HTTPResponseHeaderSetup & http_response_headers_override_)
+    : HTTPHandler(server_, connection_config, "DynamicQueryHandler", http_response_headers_override_)
+    , param_name(param_name_)
 {
 }

@@ -826,12 +839,13 @@ std::string DynamicQueryHandler::getQuery(HTTPServerRequest & request, HTMLForm

 PredefinedQueryHandler::PredefinedQueryHandler(
     IServer & server_,
+    const HTTPHandlerConnectionConfig & connection_config,
     const NameSet & receive_params_,
     const std::string & predefined_query_,
     const CompiledRegexPtr & url_regex_,
     const std::unordered_map<String, CompiledRegexPtr> & header_name_with_regex_,
     const HTTPResponseHeaderSetup & http_response_headers_override_)
-    : HTTPHandler(server_, "PredefinedQueryHandler", http_response_headers_override_)
+    : HTTPHandler(server_, connection_config, "PredefinedQueryHandler", http_response_headers_override_)
     , receive_params(receive_params_)
     , predefined_query(predefined_query_)
     , url_regex(url_regex_)
@@ -923,10 +937,11 @@ HTTPRequestHandlerFactoryPtr createDynamicHandlerFactory(IServer & server,
 {
     auto query_param_name = config.getString(config_prefix + ".handler.query_param_name", "query");

+    HTTPHandlerConnectionConfig connection_config(config, config_prefix);
     HTTPResponseHeaderSetup http_response_headers_override = parseHTTPResponseHeaders(config, config_prefix);

-    auto creator = [&server, query_param_name, http_response_headers_override]() -> std::unique_ptr<DynamicQueryHandler>
-    { return std::make_unique<DynamicQueryHandler>(server, query_param_name, http_response_headers_override); };
+    auto creator = [&server, query_param_name, http_response_headers_override, connection_config]() -> std::unique_ptr<DynamicQueryHandler>
+    { return std::make_unique<DynamicQueryHandler>(server, connection_config, query_param_name, http_response_headers_override); };

     auto factory = std::make_shared<HandlingRuleHTTPHandlerFactory<DynamicQueryHandler>>(std::move(creator));
     factory->addFiltersFromConfig(config, config_prefix);
@@ -968,6 +983,8 @@ HTTPRequestHandlerFactoryPtr createPredefinedHandlerFactory(IServer & server,
     Poco::Util::AbstractConfiguration::Keys headers_name;
     config.keys(config_prefix + ".headers", headers_name);

+    HTTPHandlerConnectionConfig connection_config(config, config_prefix);
+
     for (const auto & header_name : headers_name)
     {
         auto expression = config.getString(config_prefix + ".headers." + header_name);
@@ -1001,12 +1018,18 @@ HTTPRequestHandlerFactoryPtr createPredefinedHandlerFactory(IServer & server,
             predefined_query,
             regex,
             headers_name_with_regex,
-            http_response_headers_override]
+            http_response_headers_override,
+            connection_config]
             -> std::unique_ptr<PredefinedQueryHandler>
         {
             return std::make_unique<PredefinedQueryHandler>(
-                server, analyze_receive_params, predefined_query, regex,
-                headers_name_with_regex, http_response_headers_override);
+                server,
+                connection_config,
+                analyze_receive_params,
+                predefined_query,
+                regex,
+                headers_name_with_regex,
+                http_response_headers_override);
         };
         factory = std::make_shared<HandlingRuleHTTPHandlerFactory<PredefinedQueryHandler>>(std::move(creator));
         factory->addFiltersFromConfig(config, config_prefix);
@@ -1019,18 +1042,21 @@ HTTPRequestHandlerFactoryPtr createPredefinedHandlerFactory(IServer & server,
         analyze_receive_params,
         predefined_query,
         headers_name_with_regex,
-        http_response_headers_override]
+        http_response_headers_override,
+        connection_config]
         -> std::unique_ptr<PredefinedQueryHandler>
     {
         return std::make_unique<PredefinedQueryHandler>(
-            server, analyze_receive_params, predefined_query, CompiledRegexPtr{},
-            headers_name_with_regex, http_response_headers_override);
+            server,
+            connection_config,
+            analyze_receive_params,
+            predefined_query,
+            CompiledRegexPtr{},
+            headers_name_with_regex,
+            http_response_headers_override);
     };
-    factory = std::make_shared<HandlingRuleHTTPHandlerFactory<PredefinedQueryHandler>>(std::move(creator));
-    factory->addFiltersFromConfig(config, config_prefix);
-    return factory;
 }
diff --git a/src/Server/HTTPHandler.h b/src/Server/HTTPHandler.h
index 6580b317f6e..2296fa70aeb 100644
--- a/src/Server/HTTPHandler.h
+++ b/src/Server/HTTPHandler.h
@@ -12,6 +12,7 @@
 #include
 #include
 #include
+#include

 #include "HTTPResponseHeaderWriter.h"

@@ -26,17 +27,28 @@ namespace DB
 {

 class Session;
-class Credentials;
 class IServer;
 struct Settings;
 class WriteBufferFromHTTPServerResponse;

 using CompiledRegexPtr = std::shared_ptr<const re2::RE2>;

+struct HTTPHandlerConnectionConfig
+{
+    std::optional<BasicCredentials> credentials;
+
+    /// TODO:
+    /// String quota;
+    /// String default_database;
+
+    HTTPHandlerConnectionConfig() = default;
+    HTTPHandlerConnectionConfig(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix);
+};
+
 class HTTPHandler : public HTTPRequestHandler
 {
 public:
-    HTTPHandler(IServer & server_, const std::string & name, const HTTPResponseHeaderSetup & http_response_headers_override_);
+    HTTPHandler(IServer & server_, const HTTPHandlerConnectionConfig & connection_config_, const std::string & name, const HTTPResponseHeaderSetup & http_response_headers_override_);
     ~HTTPHandler() override;

     void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override;
@@ -146,16 +158,7 @@ private:
     // The request_credential instance may outlive a single request/response loop.
     // This happens only when the authentication mechanism requires more than a single request/response exchange (e.g., SPNEGO).
     std::unique_ptr<Credentials> request_credentials;
-
-    // Returns true when the user successfully authenticated,
-    // the session instance will be configured accordingly, and the request_credentials instance will be dropped.
-    // Returns false when the user is not authenticated yet, and the 'Negotiate' response is sent,
-    // the session and request_credentials instances are preserved.
-    // Throws an exception if authentication failed.
-    bool authenticateUser(
-        HTTPServerRequest & request,
-        HTMLForm & params,
-        HTTPServerResponse & response);
+    HTTPHandlerConnectionConfig connection_config;

     /// Also initializes 'used_output'.
     void processQuery(
@@ -174,6 +177,13 @@ private:
         Output & used_output);

     static void pushDelayedResults(Output & used_output);
+
+protected:
+    // @see authenticateUserByHTTP()
+    virtual bool authenticateUser(
+        HTTPServerRequest & request,
+        HTMLForm & params,
+        HTTPServerResponse & response);
 };

 class DynamicQueryHandler : public HTTPHandler
@@ -184,6 +194,7 @@ private:
 public:
     explicit DynamicQueryHandler(
         IServer & server_,
+        const HTTPHandlerConnectionConfig & connection_config,
         const std::string & param_name_ = "query",
         const HTTPResponseHeaderSetup & http_response_headers_override_ = std::nullopt);

@@ -203,6 +214,7 @@ private:
 public:
     PredefinedQueryHandler(
         IServer & server_,
+        const HTTPHandlerConnectionConfig & connection_config,
         const NameSet & receive_params_,
         const std::string & predefined_query_,
         const CompiledRegexPtr & url_regex_,
diff --git a/src/Server/HTTPHandlerFactory.cpp b/src/Server/HTTPHandlerFactory.cpp
index a99f0a50a4b..cd7a3177ad9 100644
--- a/src/Server/HTTPHandlerFactory.cpp
+++ b/src/Server/HTTPHandlerFactory.cpp
@@ -263,7 +263,7 @@ void addDefaultHandlersFactory(

     auto dynamic_creator = [&server] () -> std::unique_ptr<DynamicQueryHandler>
     {
-        return std::make_unique<DynamicQueryHandler>(server, "query");
+        return std::make_unique<DynamicQueryHandler>(server, HTTPHandlerConnectionConfig{}, "query");
     };
     auto query_handler = std::make_shared<HandlingRuleHTTPHandlerFactory<DynamicQueryHandler>>(std::move(dynamic_creator));
     query_handler->addFilter([](const auto & request)
diff --git a/src/Server/PrometheusRequestHandler.cpp b/src/Server/PrometheusRequestHandler.cpp
index cd18eac50a7..9c521e06667 100644
--- a/src/Server/PrometheusRequestHandler.cpp
+++ b/src/Server/PrometheusRequestHandler.cpp
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include "config.h"

 #include
@@ -137,7 +138,7 @@ protected:

     bool authenticateUser(HTTPServerRequest & request, HTTPServerResponse & response)
     {
-        return authenticateUserByHTTP(request, *params, response, *session, request_credentials, server().context(), log());
+        return authenticateUserByHTTP(request, *params, response, *session, request_credentials, HTTPHandlerConnectionConfig{}, server().context(), log());
     }

     void makeContext(HTTPServerRequest & request)
diff --git a/tests/integration/test_http_handlers_config/test.py b/tests/integration/test_http_handlers_config/test.py
index efba4f05748..cf291c6dedd 100644
--- a/tests/integration/test_http_handlers_config/test.py
+++ b/tests/integration/test_http_handlers_config/test.py
@@ -17,9 +17,10 @@ class SimpleCluster:
         cluster.start()

     def add_instance(self, name, config_dir):
-        script_path = os.path.dirname(os.path.realpath(__file__))
         return self.cluster.add_instance(
-            name, main_configs=[os.path.join(script_path, config_dir, "config.xml")]
+            name,
+            main_configs=[os.path.join(config_dir, "config.xml")],
+            user_configs=["users.d/users.yaml"],
         )

@@ -96,6 +97,16 @@ def test_dynamic_query_handler():
             == res_custom_ct.headers["X-Test-Http-Response-Headers-Even-Multiple"]
         )

+        assert cluster.instance.http_request(
+            "test_dynamic_handler_auth_with_password?query=select+currentUser()"
+        ).content, "with_password"
+        assert cluster.instance.http_request(
+            "test_dynamic_handler_auth_with_password_fail?query=select+currentUser()"
+        ).status_code, 403
+        assert cluster.instance.http_request(
+            "test_dynamic_handler_auth_without_password?query=select+currentUser()"
+        ).content, "without_password"
+

 def test_predefined_query_handler():
     with contextlib.closing(
@@ -177,6 +188,16 @@ def test_predefined_query_handler():
         )
         assert b"max_threads\t1\n" == res1.content

+        assert cluster.instance.http_request(
+            "test_predefined_handler_auth_with_password"
+        ).content, "with_password"
+        assert cluster.instance.http_request(
+            "test_predefined_handler_auth_with_password_fail"
+        ).status_code, 403
+        assert cluster.instance.http_request(
+            "test_predefined_handler_auth_without_password"
+        ).content, "without_password"
+

 def test_fixed_static_handler():
     with contextlib.closing(
diff --git a/tests/integration/test_http_handlers_config/test_dynamic_handler/config.xml b/tests/integration/test_http_handlers_config/test_dynamic_handler/config.xml
index 58fedbd9078..4900219f595 100644
--- a/tests/integration/test_http_handlers_config/test_dynamic_handler/config.xml
+++ b/tests/integration/test_http_handlers_config/test_dynamic_handler/config.xml
@@ -24,5 +24,32 @@
             </handler>
         </rule>
+
+        <rule>
+            <methods>GET</methods>
+            <url>/test_dynamic_handler_auth_with_password</url>
+            <handler>
+                <type>dynamic_query_handler</type>
+                <user>with_password</user>
+                <password>password</password>
+            </handler>
+        </rule>
+        <rule>
+            <methods>GET</methods>
+            <url>/test_dynamic_handler_auth_with_password_fail</url>
+            <handler>
+                <type>dynamic_query_handler</type>
+                <user>with_password</user>
+                <password></password>
+            </handler>
+        </rule>
+        <rule>
+            <methods>GET</methods>
+            <url>/test_dynamic_handler_auth_without_password</url>
+            <handler>
+                <type>dynamic_query_handler</type>
+                <user>without_password</user>
+            </handler>
+        </rule>
     </http_handlers>
 </clickhouse>
diff --git a/tests/integration/test_http_handlers_config/test_predefined_handler/config.xml b/tests/integration/test_http_handlers_config/test_predefined_handler/config.xml
index a7804721f12..3c0ee3cd09a 100644
--- a/tests/integration/test_http_handlers_config/test_predefined_handler/config.xml
+++ b/tests/integration/test_http_handlers_config/test_predefined_handler/config.xml
@@ -33,5 +33,35 @@
                 <query>INSERT INTO test_table(id, data) SELECT {id:UInt32}, {_request_body:String}</query>
             </handler>
         </rule>
+
+        <rule>
+            <methods>GET</methods>
+            <url>/test_predefined_handler_auth_with_password</url>
+            <handler>
+                <type>predefined_query_handler</type>
+                <user>with_password</user>
+                <password>password</password>
+                <query>SELECT currentUser()</query>
+            </handler>
+        </rule>
+        <rule>
+            <methods>GET</methods>
+            <url>/test_predefined_handler_auth_with_password_fail</url>
+            <handler>
+                <type>predefined_query_handler</type>
+                <user>with_password</user>
+                <password></password>
+                <query>SELECT currentUser()</query>
+            </handler>
+        </rule>
+        <rule>
+            <methods>GET</methods>
+            <url>/test_predefined_handler_auth_without_password</url>
+            <handler>
+                <type>predefined_query_handler</type>
+                <user>without_password</user>
+                <query>SELECT currentUser()</query>
+            </handler>
+        </rule>
     </http_handlers>
 </clickhouse>
diff --git a/tests/integration/test_http_handlers_config/users.d/users.yaml b/tests/integration/test_http_handlers_config/users.d/users.yaml
new file mode 100644
index 00000000000..9ab8a84ae5a
--- /dev/null
+++ b/tests/integration/test_http_handlers_config/users.d/users.yaml
@@ -0,0 +1,7 @@
+users:
+  with_password:
+    profile: default
+    password: password
+  without_password:
+    profile: default
+    no_password: 1

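With a server running this configuration, the credentials baked into a handler mean a plain unauthenticated request already executes as the configured user. A quick check using only the Python standard library (assumes a local ClickHouse listening on port 8123 with the config above loaded):

```python
import urllib.error
import urllib.request

base = "http://localhost:8123"

# No Authorization header and no user/password query parameters:
# the handler itself supplies the credentials from the config.
with urllib.request.urlopen(f"{base}/test_predefined_handler_auth_with_password") as r:
    assert r.read().decode().strip() == "with_password"

# The "fail" endpoint is configured with an empty (wrong) password,
# so the server should reject the request.
try:
    urllib.request.urlopen(f"{base}/test_predefined_handler_auth_with_password_fail")
except urllib.error.HTTPError as e:
    assert e.code == 403
```
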
From 54a00e875160d87a3793c60d281bd321a5826aec Mon Sep 17 00:00:00 2001
From: Anton Popov
Date: Fri, 25 Oct 2024 14:01:55 +0000
Subject: [PATCH 09/80] fix optimization of replacing algorithm

---
 .../Merges/Algorithms/ReplacingSortedAlgorithm.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp
index 0ce626b1dc9..b22f1271687 100644
--- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp
+++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp
@@ -132,15 +132,15 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge()
             /// and current chunk has no duplicates (we assume that parts with non-zero level have no duplicates)
             /// We want to insert current cursor chunk directly in merged data.

-            size_t source_num = current->order;
-            auto current_chunk = std::move(*sources[source_num].chunk);
-            size_t chunk_num_rows = current_chunk.getNumRows();
-
             /// First if merged_data is not empty we need to flush it.
             /// We will get into the same condition on next merge call.
             if (merged_data->mergedRows() != 0)
                 return Status(merged_data->pull());

+            size_t source_num = current->order;
+            auto current_chunk = std::move(*sources[source_num].chunk);
+            size_t chunk_num_rows = current_chunk.getNumRows();
+
             /// We will get the next block from the corresponding source, if there is one.
             queue.removeTop();

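The fix above is purely an ordering fix: the source chunk was moved out of its source before the "flush buffered rows first" early return, so the early-return path abandoned a chunk that had already been stolen. A small hedged Python model of why the flush has to happen before the chunk is taken:

```python
def merge_step(buffered, sources, top):
    """One step of a buffered merge with a pass-through fast path.

    Popping sources[top] *before* the flush check would lose that chunk
    whenever we return early to flush `buffered`.
    """
    if buffered:                      # flush pending rows first ...
        out, buffered[:] = buffered[:], []
        return out
    chunk = sources[top].pop(0)       # ... only then consume the source chunk
    return chunk

sources = [[[1, 2, 3]], [[10, 20]]]
buffered = [0]
print(merge_step(buffered, sources, 0))  # [0]  (flush; chunk untouched)
print(merge_step(buffered, sources, 0))  # [1, 2, 3]
```
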
From 845a9d5a2c0173837052fd22aef420858abdae45 Mon Sep 17 00:00:00 2001
From: vdimir
Date: Fri, 25 Oct 2024 12:00:56 +0000
Subject: [PATCH 10/80] add test 03257_client_history_max_entries

---
 .../03257_client_history_max_entries.py        | 38 +++++++++++++++++++
 .../03257_client_history_max_entries.reference |  2 +
 tests/queries/0_stateless/helpers/client.py    |  6 ++-
 3 files changed, 44 insertions(+), 2 deletions(-)
 create mode 100755 tests/queries/0_stateless/03257_client_history_max_entries.py
 create mode 100644 tests/queries/0_stateless/03257_client_history_max_entries.reference

diff --git a/tests/queries/0_stateless/03257_client_history_max_entries.py b/tests/queries/0_stateless/03257_client_history_max_entries.py
new file mode 100755
index 00000000000..8ba402138ed
--- /dev/null
+++ b/tests/queries/0_stateless/03257_client_history_max_entries.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+# Tags: no-parallel, no-fasttest
+
+import os
+import signal
+import sys
+
+CURDIR = os.path.dirname(os.path.realpath(__file__))
+sys.path.insert(0, os.path.join(CURDIR, "helpers"))
+
+from client import client, end_of_block, prompt
+
+log = None
+# uncomment the line below for debugging
+# log=sys.stdout
+
+TMP_FILE = os.path.join(
+    os.environ.get("CLICKHOUSE_TMP", "/tmp"),
+    os.path.basename(os.path.abspath(__file__)) + ".hist",
+)
+
+with client(
+    name="client1>",
+    log=log,
+    extra_options={"history_file": TMP_FILE, "history_max_entries": 2},
+) as client:
+    client.expect(prompt)
+    client.send("SELECT 1")
+    client.expect(prompt)
+    client.send("SELECT 2")
+    client.expect(prompt)
+    client.send("SELECT 3")
+    client.expect(prompt)
+
+with open(TMP_FILE, "r") as f:
+    for line in f:
+        if not line.startswith("###"):
+            print(line, end="")
diff --git a/tests/queries/0_stateless/03257_client_history_max_entries.reference b/tests/queries/0_stateless/03257_client_history_max_entries.reference
new file mode 100644
index 00000000000..4c85f1227c6
--- /dev/null
+++ b/tests/queries/0_stateless/03257_client_history_max_entries.reference
@@ -0,0 +1,2 @@
+SELECT 2
+SELECT 3
diff --git a/tests/queries/0_stateless/helpers/client.py b/tests/queries/0_stateless/helpers/client.py
index b721931e46d..844a2da1026 100644
--- a/tests/queries/0_stateless/helpers/client.py
+++ b/tests/queries/0_stateless/helpers/client.py
@@ -13,10 +13,12 @@ end_of_block = r".*\r\n.*\r\n"


 class client(object):
-    def __init__(self, command=None, name="", log=None):
+    def __init__(self, command=None, name="", log=None, extra_options=None):
         self.client = uexpect.spawn(["/bin/bash", "--noediting"])
         if command is None:
-            options = "--enable-progress-table-toggle=0"
+            extra_options = extra_options or {}
+            extra_options["enable-progress-table-toggle"] = 0
+            options = " ".join(f"--{k}={v}" for k, v in extra_options.items())
             command = (
                 os.environ.get("CLICKHOUSE_BINARY", "clickhouse") + " client " + options
             )

From 1c9ac878914515aad4f97ee04f9fed99cae47e68 Mon Sep 17 00:00:00 2001
From: vdimir
Date: Mon, 28 Oct 2024 07:37:30 +0000
Subject: [PATCH 11/80] Revert "add test 03257_client_history_max_entries"

This reverts commit 845a9d5a2c0173837052fd22aef420858abdae45.

---
 .../03257_client_history_max_entries.py        | 38 -------------------
 .../03257_client_history_max_entries.reference |  2 -
 tests/queries/0_stateless/helpers/client.py    |  6 +--
 3 files changed, 2 insertions(+), 44 deletions(-)
 delete mode 100755 tests/queries/0_stateless/03257_client_history_max_entries.py
 delete mode 100644 tests/queries/0_stateless/03257_client_history_max_entries.reference

diff --git a/tests/queries/0_stateless/03257_client_history_max_entries.py b/tests/queries/0_stateless/03257_client_history_max_entries.py
deleted file mode 100755
index 8ba402138ed..00000000000
--- a/tests/queries/0_stateless/03257_client_history_max_entries.py
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env python3
-# Tags: no-parallel, no-fasttest
-
-import os
-import signal
-import sys
-
-CURDIR = os.path.dirname(os.path.realpath(__file__))
-sys.path.insert(0, os.path.join(CURDIR, "helpers"))
-
-from client import client, end_of_block, prompt
-
-log = None
-# uncomment the line below for debugging
-# log=sys.stdout
-
-TMP_FILE = os.path.join(
-    os.environ.get("CLICKHOUSE_TMP", "/tmp"),
-    os.path.basename(os.path.abspath(__file__)) + ".hist",
-)
-
-with client(
-    name="client1>",
-    log=log,
-    extra_options={"history_file": TMP_FILE, "history_max_entries": 2},
-) as client:
-    client.expect(prompt)
-    client.send("SELECT 1")
-    client.expect(prompt)
-    client.send("SELECT 2")
-    client.expect(prompt)
-    client.send("SELECT 3")
-    client.expect(prompt)
-
-with open(TMP_FILE, "r") as f:
-    for line in f:
-        if not line.startswith("###"):
-            print(line, end="")
diff --git a/tests/queries/0_stateless/03257_client_history_max_entries.reference b/tests/queries/0_stateless/03257_client_history_max_entries.reference
deleted file mode 100644
index 4c85f1227c6..00000000000
--- a/tests/queries/0_stateless/03257_client_history_max_entries.reference
+++ /dev/null
@@ -1,2 +0,0 @@
-SELECT 2
-SELECT 3
diff --git a/tests/queries/0_stateless/helpers/client.py b/tests/queries/0_stateless/helpers/client.py
index 844a2da1026..b721931e46d 100644
--- a/tests/queries/0_stateless/helpers/client.py
+++ b/tests/queries/0_stateless/helpers/client.py
@@ -13,12 +13,10 @@ end_of_block = r".*\r\n.*\r\n"


 class client(object):
-    def __init__(self, command=None, name="", log=None, extra_options=None):
+    def __init__(self, command=None, name="", log=None):
         self.client = uexpect.spawn(["/bin/bash", "--noediting"])
         if command is None:
-            extra_options = extra_options or {}
-            extra_options["enable-progress-table-toggle"] = 0
-            options = " ".join(f"--{k}={v}" for k, v in extra_options.items())
+            options = "--enable-progress-table-toggle=0"
             command = (
                 os.environ.get("CLICKHOUSE_BINARY", "clickhouse") + " client " + options
             )

From 01cb0eb32fa353517c31f0b2ec7da613a5abffaa Mon Sep 17 00:00:00 2001
From: Michael Stetsyuk
Date: Thu, 3 Oct 2024 08:30:52 +0000
Subject: [PATCH 12/80] mv fixReplicaMetadataVersionIfNeeded from attach thread
 to restarting thread

---
 .../ReplicatedMergeTreeAttachThread.cpp       | 90 +-----------------
 .../ReplicatedMergeTreeAttachThread.h         |  2 -
 .../MergeTree/ReplicatedMergeTreeQueue.cpp    |  2 +-
 .../MergeTree/ReplicatedMergeTreeQueue.h      |  1 +
 .../ReplicatedMergeTreeRestartingThread.cpp   | 92 +++++++++++++++++++
 .../ReplicatedMergeTreeRestartingThread.h     |  4 +
 tests/integration/helpers/cluster.py          | 27 ++++--
 .../test_fix_metadata_version/__init__.py     |  0
 .../configs/config.xml                        | 16 ++++
 .../test_fix_metadata_version/test.py         | 73 +++++++++++++++
 10 files changed, 206 insertions(+), 101 deletions(-)
 create mode 100644 tests/integration/test_fix_metadata_version/__init__.py
 create mode 100644 tests/integration/test_fix_metadata_version/configs/config.xml
 create mode 100644 tests/integration/test_fix_metadata_version/test.py

diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp
index 22b8ccca151..c258048354e 100644
--- a/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp
@@ -1,5 +1,6 @@
 #include
 #include
+#include
 #include
 #include

@@ -20,7 +21,6 @@ namespace ErrorCodes
 {
     extern const int SUPPORT_IS_DISABLED;
     extern const int REPLICA_STATUS_CHANGED;
-    extern const int LOGICAL_ERROR;
 }

 ReplicatedMergeTreeAttachThread::ReplicatedMergeTreeAttachThread(StorageReplicatedMergeTree & storage_)
@@ -123,67 +123,6 @@ void ReplicatedMergeTreeAttachThread::checkHasReplicaMetadataInZooKeeper(const z
     }
 }

-Int32 ReplicatedMergeTreeAttachThread::fixReplicaMetadataVersionIfNeeded(zkutil::ZooKeeperPtr zookeeper)
-{
-    const String & zookeeper_path = storage.zookeeper_path;
-    const String & replica_path = storage.replica_path;
-    const bool replica_readonly = storage.is_readonly;
-
-    for (size_t i = 0; i != 2; ++i)
-    {
-        String replica_metadata_version_str;
-        const bool replica_metadata_version_exists = zookeeper->tryGet(replica_path + "/metadata_version", replica_metadata_version_str);
-        if (!replica_metadata_version_exists)
-            return -1;
-
-        const Int32 metadata_version = parse<Int32>(replica_metadata_version_str);
-
-        if (metadata_version != 0 || replica_readonly)
-        {
-            /// No need to fix anything
-            return metadata_version;
-        }
-
-        Coordination::Stat stat;
-        zookeeper->get(fs::path(zookeeper_path) / "metadata", &stat);
-        if (stat.version == 0)
-        {
-            /// No need to fix anything
-            return metadata_version;
-        }
-
-        ReplicatedMergeTreeQueue & queue = storage.queue;
-        queue.pullLogsToQueue(zookeeper);
-        if (queue.getStatus().metadata_alters_in_queue != 0)
-        {
-            LOG_DEBUG(log, "No need to update metadata_version as there are ALTER_METADATA entries in the queue");
-            return metadata_version;
-        }
-
-        const Coordination::Requests ops = {
-            zkutil::makeSetRequest(fs::path(replica_path) / "metadata_version", std::to_string(stat.version), 0),
-            zkutil::makeCheckRequest(fs::path(zookeeper_path) / "metadata", stat.version),
-        };
-        Coordination::Responses ops_responses;
-        const auto code = zookeeper->tryMulti(ops, ops_responses);
-        if (code == Coordination::Error::ZOK)
-        {
-            LOG_DEBUG(log, "Successfully set metadata_version to {}", stat.version);
-            return stat.version;
-        }
-        if (code != Coordination::Error::ZBADVERSION)
-        {
-            throw zkutil::KeeperException(code);
-        }
-    }
-
-    /// Second attempt is only possible if metadata_version != 0 or metadata.version changed during the first attempt.
-    /// If metadata_version != 0, on second attempt we will return the new metadata_version.
-    /// If metadata.version changed, on second attempt we will either get metadata_version != 0 and return the new metadata_version or we will get metadata_alters_in_queue != 0 and return 0.
-    /// Either way, on second attempt this method should return.
-    throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to fix replica metadata_version in ZooKeeper after two attempts");
-}
-
 void ReplicatedMergeTreeAttachThread::runImpl()
 {
     storage.setZooKeeper();
@@ -227,33 +166,6 @@ void ReplicatedMergeTreeAttachThread::runImpl()
     /// Just in case it was not removed earlier due to connection loss
     zookeeper->tryRemove(replica_path + "/flags/force_restore_data");

-    const Int32 replica_metadata_version = fixReplicaMetadataVersionIfNeeded(zookeeper);
-    const bool replica_metadata_version_exists = replica_metadata_version != -1;
-    if (replica_metadata_version_exists)
-    {
-        storage.setInMemoryMetadata(metadata_snapshot->withMetadataVersion(replica_metadata_version));
-    }
-    else
-    {
-        /// Table was created before 20.4 and was never altered,
-        /// let's initialize replica metadata version from global metadata version.
- Coordination::Stat table_metadata_version_stat; - zookeeper->get(zookeeper_path + "/metadata", &table_metadata_version_stat); - - Coordination::Requests ops; - ops.emplace_back(zkutil::makeCheckRequest(zookeeper_path + "/metadata", table_metadata_version_stat.version)); - ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/metadata_version", toString(table_metadata_version_stat.version), zkutil::CreateMode::Persistent)); - - Coordination::Responses res; - auto code = zookeeper->tryMulti(ops, res); - - if (code == Coordination::Error::ZBADVERSION) - throw Exception(ErrorCodes::REPLICA_STATUS_CHANGED, "Failed to initialize metadata_version " - "because table was concurrently altered, will retry"); - - zkutil::KeeperMultiException::check(code, ops, res); - } - storage.checkTableStructure(replica_path, metadata_snapshot); storage.checkParts(skip_sanity_checks); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.h b/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.h index bfc97442598..250a5ed34d1 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.h @@ -48,8 +48,6 @@ private: void runImpl(); void finalizeInitialization(); - - Int32 fixReplicaMetadataVersionIfNeeded(zkutil::ZooKeeperPtr zookeeper); }; } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 6b1581645f8..b1564b58a6c 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -615,7 +615,7 @@ std::pair ReplicatedMergeTreeQueue::pullLogsToQueue(zkutil::Zo { std::lock_guard lock(pull_logs_to_queue_mutex); - if (reason != LOAD) + if (reason != LOAD && reason != FIX_METADATA_VERSION) { /// It's totally ok to load queue on readonly replica (that's what RestartingThread does on initialization). /// It's ok if replica became readonly due to connection loss after we got current zookeeper (in this case zookeeper must be expired). diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h index 9d3349663e2..6ec8818b0c6 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h @@ -334,6 +334,7 @@ public: UPDATE, MERGE_PREDICATE, SYNC, + FIX_METADATA_VERSION, OTHER, }; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp index 9d3e26cdc8d..93124e634bd 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp @@ -29,6 +29,8 @@ namespace MergeTreeSetting namespace ErrorCodes { extern const int REPLICA_IS_ALREADY_ACTIVE; + extern const int REPLICA_STATUS_CHANGED; + extern const int LOGICAL_ERROR; } namespace FailPoints @@ -207,6 +209,36 @@ bool ReplicatedMergeTreeRestartingThread::tryStartup() throw; } + const Int32 replica_metadata_version = fixReplicaMetadataVersionIfNeeded(zookeeper); + const bool replica_metadata_version_exists = replica_metadata_version != -1; + if (replica_metadata_version_exists) + { + storage.setInMemoryMetadata(storage.getInMemoryMetadataPtr()->withMetadataVersion(replica_metadata_version)); + } + else + { + /// Table was created before 20.4 and was never altered, + /// let's initialize replica metadata version from global metadata version. 
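+    /// (Reading of the multi-op below, inferred from the code rather than stated by the
+    ///  patch: the Check on <zookeeper_path>/metadata pins the table-level metadata version,
+    ///  so if a concurrent ALTER bumps it the Create is rejected with ZBADVERSION and the
+    ///  REPLICA_STATUS_CHANGED branch retries instead of recording a stale version.)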
+ + const String & zookeeper_path = storage.zookeeper_path, & replica_path = storage.replica_path; + + Coordination::Stat table_metadata_version_stat; + zookeeper->get(zookeeper_path + "/metadata", &table_metadata_version_stat); + + Coordination::Requests ops; + ops.emplace_back(zkutil::makeCheckRequest(zookeeper_path + "/metadata", table_metadata_version_stat.version)); + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/metadata_version", toString(table_metadata_version_stat.version), zkutil::CreateMode::Persistent)); + + Coordination::Responses res; + auto code = zookeeper->tryMulti(ops, res); + + if (code == Coordination::Error::ZBADVERSION) + throw Exception(ErrorCodes::REPLICA_STATUS_CHANGED, "Failed to initialize metadata_version " + "because table was concurrently altered, will retry"); + + zkutil::KeeperMultiException::check(code, ops, res); + } + storage.queue.removeCurrentPartsFromMutations(); storage.last_queue_update_finish_time.store(time(nullptr)); @@ -424,4 +456,64 @@ void ReplicatedMergeTreeRestartingThread::setNotReadonly() storage.readonly_start_time.store(0, std::memory_order_relaxed); } + +Int32 ReplicatedMergeTreeRestartingThread::fixReplicaMetadataVersionIfNeeded(zkutil::ZooKeeperPtr zookeeper) +{ + const String & zookeeper_path = storage.zookeeper_path; + const String & replica_path = storage.replica_path; + + const size_t num_attempts = 2; + for (size_t attempt = 0; attempt != num_attempts; ++attempt) + { + String replica_metadata_version_str; + Coordination::Stat replica_stat; + const bool replica_metadata_version_exists = zookeeper->tryGet(replica_path + "/metadata_version", replica_metadata_version_str, &replica_stat); + if (!replica_metadata_version_exists) + return -1; + + const Int32 metadata_version = parse(replica_metadata_version_str); + if (metadata_version != 0) + return metadata_version; + + Coordination::Stat table_stat; + zookeeper->get(fs::path(zookeeper_path) / "metadata", &table_stat); + if (table_stat.version == 0) + return metadata_version; + + ReplicatedMergeTreeQueue & queue = storage.queue; + queue.pullLogsToQueue(zookeeper, {}, ReplicatedMergeTreeQueue::FIX_METADATA_VERSION); + if (queue.getStatus().metadata_alters_in_queue != 0) + { + LOG_INFO(log, "Skipping updating metadata_version as there are ALTER_METADATA entries in the queue"); + return metadata_version; + } + + const Coordination::Requests ops = { + zkutil::makeSetRequest(fs::path(replica_path) / "metadata_version", std::to_string(table_stat.version), replica_stat.version), + zkutil::makeCheckRequest(fs::path(zookeeper_path) / "metadata", table_stat.version), + }; + Coordination::Responses ops_responses; + const Coordination::Error code = zookeeper->tryMulti(ops, ops_responses); + if (code == Coordination::Error::ZOK) + { + LOG_DEBUG(log, "Successfully set metadata_version to {}", table_stat.version); + return table_stat.version; + } + + if (code == Coordination::Error::ZBADVERSION) + { + LOG_WARNING(log, "Cannot fix metadata_version because either metadata.version or metadata_version.version changed, attempts left = {}", num_attempts - attempt - 1); + continue; + } + + throw zkutil::KeeperException(code); + } + + /// Second attempt is only possible if either metadata_version.version or metadata.version changed during the first attempt. + /// If metadata_version changed to non-zero value during the first attempt, on second attempt we will return the new metadata_version. 
+ /// If metadata.version changed during first attempt, on second attempt we will either get metadata_version != 0 and return the new metadata_version or we will get metadata_alters_in_queue != 0 and return 0. + /// So either first or second attempt should return unless metadata_version was rewritten from 0 to 0 during the first attempt which is highly unlikely. + throw Exception(ErrorCodes::LOGICAL_ERROR, "Failed to fix replica metadata_version in ZooKeeper after two attempts"); +} + } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.h b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.h index d719505ae5e..6f450dc1d40 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB @@ -68,6 +69,9 @@ private: /// Disable readonly mode for table void setNotReadonly(); + + /// Fix replica metadata_version if needed + Int32 fixReplicaMetadataVersionIfNeeded(zkutil::ZooKeeperPtr zookeeper); }; diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 3c92df51ac4..1e98561b9f7 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -83,6 +83,8 @@ CLICKHOUSE_ERROR_LOG_FILE = "/var/log/clickhouse-server/clickhouse-server.err.lo # This means that this minimum need to be, at least, 1 year older than the current release CLICKHOUSE_CI_MIN_TESTED_VERSION = "23.3" +ZOOKEEPER_CONTAINERS = ("zoo1", "zoo2", "zoo3") + # to create docker-compose env file def _create_env_file(path, variables): @@ -2061,6 +2063,11 @@ class ClickHouseCluster: container_id = self.get_container_id(instance_name) return self.docker_client.api.logs(container_id).decode() + def query_zookeeper(self, query, node=ZOOKEEPER_CONTAINERS[0], nothrow=False): + cmd = f'clickhouse keeper-client -p {self.zookeeper_port} -q "{query}"' + container_id = self.get_container_id(node) + return self.exec_in_container(container_id, cmd, nothrow=nothrow, use_cli=False) + def exec_in_container( self, container_id: str, @@ -2391,16 +2398,16 @@ class ClickHouseCluster: def wait_zookeeper_secure_to_start(self, timeout=20): logging.debug("Wait ZooKeeper Secure to start") - nodes = ["zoo1", "zoo2", "zoo3"] - self.wait_zookeeper_nodes_to_start(nodes, timeout) + self.wait_zookeeper_nodes_to_start(ZOOKEEPER_CONTAINERS, timeout) def wait_zookeeper_to_start(self, timeout: float = 180) -> None: logging.debug("Wait ZooKeeper to start") - nodes = ["zoo1", "zoo2", "zoo3"] - self.wait_zookeeper_nodes_to_start(nodes, timeout) + self.wait_zookeeper_nodes_to_start(ZOOKEEPER_CONTAINERS, timeout) def wait_zookeeper_nodes_to_start( - self, nodes: List[str], timeout: float = 60 + self, + nodes: List[str], + timeout: float = 60, ) -> None: start = time.time() err = Exception("") @@ -3226,7 +3233,11 @@ class ClickHouseCluster: return zk def run_kazoo_commands_with_retries( - self, kazoo_callback, zoo_instance_name="zoo1", repeats=1, sleep_for=1 + self, + kazoo_callback, + zoo_instance_name=ZOOKEEPER_CONTAINERS[0], + repeats=1, + sleep_for=1, ): zk = self.get_kazoo_client(zoo_instance_name) logging.debug( @@ -4648,9 +4659,7 @@ class ClickHouseInstance: depends_on.append("nats1") if self.with_zookeeper: - depends_on.append("zoo1") - depends_on.append("zoo2") - depends_on.append("zoo3") + depends_on += list(ZOOKEEPER_CONTAINERS) if self.with_minio: depends_on.append("minio1") diff --git 
a/tests/integration/test_fix_metadata_version/__init__.py b/tests/integration/test_fix_metadata_version/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_fix_metadata_version/configs/config.xml b/tests/integration/test_fix_metadata_version/configs/config.xml new file mode 100644 index 00000000000..4662e6794e3 --- /dev/null +++ b/tests/integration/test_fix_metadata_version/configs/config.xml @@ -0,0 +1,16 @@ + + 9000 + + + + + + + + + default + + + + + diff --git a/tests/integration/test_fix_metadata_version/test.py b/tests/integration/test_fix_metadata_version/test.py new file mode 100644 index 00000000000..085872bba05 --- /dev/null +++ b/tests/integration/test_fix_metadata_version/test.py @@ -0,0 +1,73 @@ +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance( + "node", + main_configs=["configs/config.xml"], + stay_alive=True, + with_zookeeper=True, +) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + + +def test_fix_metadata_version(start_cluster): + zookeeper_path = "/clickhouse/test_fix_metadata_version" + replica = "replica1" + replica_path = f"{zookeeper_path}/replicas/{replica}" + + def get_metadata_versions(): + table_metadata_version = int( + node.query( + f""" + SELECT version + FROM system.zookeeper + WHERE path = '{zookeeper_path}' AND name = 'metadata' + """ + ).strip() + ) + + replica_metadata_version = int( + node.query( + f""" + SELECT value + FROM system.zookeeper + WHERE path = '{replica_path}' AND name = 'metadata_version' + """ + ).strip() + ) + + return table_metadata_version, replica_metadata_version + + node.query( + f""" + DROP TABLE IF EXISTS t SYNC; + CREATE TABLE t + ( + `x` UInt32 + ) + ENGINE = ReplicatedMergeTree('{zookeeper_path}', '{replica}') + ORDER BY x + """ + ) + + node.query("ALTER TABLE t (ADD COLUMN `y` UInt32)") + + assert get_metadata_versions() == (1, 1) + + cluster.query_zookeeper(f"set '{replica_path}/metadata_version' '0'") + + assert get_metadata_versions() == (1, 0) + + node.restart_clickhouse() + + assert get_metadata_versions() == (1, 1) From f8c13061a743fc162fc0094ceb773292df4677f6 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Oct 2024 15:20:01 +0000 Subject: [PATCH 13/80] fix optimization of replacing algorithm --- .../Algorithms/ReplacingSortedAlgorithm.cpp | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp index b22f1271687..5059bc806a8 100644 --- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp @@ -5,6 +5,7 @@ #include #include #include +#include "Common/Logger.h" #include namespace DB @@ -122,7 +123,25 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge() return Status(current.impl->order); } - if (current.impl->isFirst() + RowRef current_row; + setRowRef(current_row, current); + + bool key_differs = selected_row.empty() || rowsHaveDifferentSortColumns(selected_row, current_row); + if (key_differs) + { + /// If there are enough rows and the last one is calculated completely + if (merged_data->hasEnoughRows()) + return Status(merged_data->pull()); + + /// Write the data for the previous primary key. 
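+        /// (Why this block now precedes the whole-chunk fast path, per the reordering in
+        ///  this hunk: the fast path may emit or skip an entire chunk, so a row still
+        ///  buffered in selected_row for the previous key must be flushed first, and the
+        ///  fast path itself gains a key_differs condition so it never fires while a row
+        ///  with the same key is pending. The second fix below builds the early-return
+        ///  Status from current_chunk with required_source set; previously the code pulled
+        ///  merged_data and returned a different Status that had lost required_source.)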
+ if (!selected_row.empty()) + insertRow(); + + selected_row.clear(); + } + + if (current->isFirst() + && key_differs && is_deleted_column_number == -1 /// Ignore optimization if we need to filter deleted rows. && sources_origin_merge_tree_part_level[current->order] > 0 && !skipLastRowFor(current->order) /// Ignore optimization if last row should be skipped. @@ -152,9 +171,9 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge() std::iota(replace_final_data.begin(), replace_final_data.end(), 0); current_chunk.getChunkInfos().add(std::make_shared(std::move(replace_final_selection))); - Status status(merged_data->pull(), false); + Status status(std::move(current_chunk), false); status.required_source = source_num; - return Status(std::move(current_chunk), false); + return status; } merged_data->insertChunk(std::move(current_chunk), chunk_num_rows); @@ -174,23 +193,6 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge() return status; } - RowRef current_row; - setRowRef(current_row, current); - - bool key_differs = selected_row.empty() || rowsHaveDifferentSortColumns(selected_row, current_row); - if (key_differs) - { - /// If there are enough rows and the last one is calculated completely - if (merged_data->hasEnoughRows()) - return Status(merged_data->pull()); - - /// Write the data for the previous primary key. - if (!selected_row.empty()) - insertRow(); - - selected_row.clear(); - } - /// Initially, skip all rows. Unskip last on insert. size_t current_pos = current_row_sources.size(); if (out_row_sources_buf) From 5e9aa01f33a2a5745e4d4a131f3d3ddbe84a5808 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 28 Oct 2024 17:25:52 +0100 Subject: [PATCH 14/80] Introduce Setting tiers --- docs/en/operations/system-tables/settings.md | 105 ++++++++++++++++-- src/Core/BaseSettings.cpp | 9 ++ src/Core/BaseSettings.h | 59 +++++----- src/Core/ServerSettings.cpp | 2 +- src/Core/Settings.cpp | 18 +-- src/Core/SettingsObsoleteMacros.h | 4 +- src/Core/SettingsTierType.cpp | 19 ++++ src/Core/SettingsTierType.h | 26 +++++ src/Storages/MergeTree/MergeTreeSettings.cpp | 4 +- src/Storages/System/StorageSystemSettings.cpp | 10 ++ 10 files changed, 206 insertions(+), 50 deletions(-) create mode 100644 src/Core/SettingsTierType.cpp create mode 100644 src/Core/SettingsTierType.h diff --git a/docs/en/operations/system-tables/settings.md b/docs/en/operations/system-tables/settings.md index a04e095e990..1cfee0ba5f4 100644 --- a/docs/en/operations/system-tables/settings.md +++ b/docs/en/operations/system-tables/settings.md @@ -18,6 +18,11 @@ Columns: - `1` — Current user can’t change the setting. - `default` ([String](../../sql-reference/data-types/string.md)) — Setting default value. - `is_obsolete` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) - Shows whether a setting is obsolete. +- `tier` ([Enum8](../../sql-reference/data-types/enum.md)) — Support level for this feature. ClickHouse features are organized in tiers, varying depending on the current status of their development and the expectations one might have when using them. Values: + - `'Production'` — The feature is stable, safe to use and does not have issues interacting with other **production** features. . + - `'Beta'` — The feature is stable and safe. The outcome of using it together with other features is unknown and correctness is not guaranteed. Testing and reports are welcome. + - `'Experimental'` — The feature is under development. Only intended for developers and ClickHouse enthusiasts. 
The feature might or might not work and could be removed at any time. + - `'Obsolete'` — No longer supported. Either it is already removed or it will be removed in future releases. **Example** @@ -26,19 +31,99 @@ The following example shows how to get information about settings which name con ``` sql SELECT * FROM system.settings -WHERE name LIKE '%min_i%' +WHERE name LIKE '%min_insert_block_size_%' +FORMAT Vertical ``` ``` text -┌─name───────────────────────────────────────────────_─value─────_─changed─_─description───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────_─min──_─max──_─readonly─_─type─────────_─default───_─alias_for─_─is_obsolete─┐ -│ min_insert_block_size_rows │ 1048449 │ 0 │ Squash blocks passed to INSERT query to specified size in rows, if blocks are not big enough. │ ____ │ ____ │ 0 │ UInt64 │ 1048449 │ │ 0 │ -│ min_insert_block_size_bytes │ 268402944 │ 0 │ Squash blocks passed to INSERT query to specified size in bytes, if blocks are not big enough. │ ____ │ ____ │ 0 │ UInt64 │ 268402944 │ │ 0 │ -│ min_insert_block_size_rows_for_materialized_views │ 0 │ 0 │ Like min_insert_block_size_rows, but applied only during pushing to MATERIALIZED VIEW (default: min_insert_block_size_rows) │ ____ │ ____ │ 0 │ UInt64 │ 0 │ │ 0 │ -│ min_insert_block_size_bytes_for_materialized_views │ 0 │ 0 │ Like min_insert_block_size_bytes, but applied only during pushing to MATERIALIZED VIEW (default: min_insert_block_size_bytes) │ ____ │ ____ │ 0 │ UInt64 │ 0 │ │ 0 │ -│ read_backoff_min_interval_between_events_ms │ 1000 │ 0 │ Settings to reduce the number of threads in case of slow reads. Do not pay attention to the event, if the previous one has passed less than a certain amount of time. │ ____ │ ____ │ 0 │ Milliseconds │ 1000 │ │ 0 │ -└────────────────────────────────────────────────────┴───────────┴─────────┴───────────────────────────────────────────────────────────────────────────────────────────────────────────────── -──────────────────────────────────────────────────────┴──────┴──────┴──────────┴──────────────┴───────────┴───────────┴─────────────┘ -``` +Row 1: +────── +name: min_insert_block_size_rows +value: 1048449 +changed: 0 +description: Sets the minimum number of rows in the block that can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. + +Possible values: + +- Positive integer. +- 0 — Squashing disabled. +min: ᴺᵁᴸᴸ +max: ᴺᵁᴸᴸ +readonly: 0 +type: UInt64 +default: 1048449 +alias_for: +is_obsolete: 0 +tier: Production + +Row 2: +────── +name: min_insert_block_size_bytes +value: 268402944 +changed: 0 +description: Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. + +Possible values: + +- Positive integer. +- 0 — Squashing disabled. +min: ᴺᵁᴸᴸ +max: ᴺᵁᴸᴸ +readonly: 0 +type: UInt64 +default: 268402944 +alias_for: +is_obsolete: 0 +tier: Production + +Row 3: +────── +name: min_insert_block_size_rows_for_materialized_views +value: 0 +changed: 0 +description: Sets the minimum number of rows in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). 
By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage. + +Possible values: + +- Any positive integer. +- 0 — Squashing disabled. + +**See Also** + +- [min_insert_block_size_rows](#min-insert-block-size-rows) +min: ᴺᵁᴸᴸ +max: ᴺᵁᴸᴸ +readonly: 0 +type: UInt64 +default: 0 +alias_for: +is_obsolete: 0 +tier: Production + +Row 4: +────── +name: min_insert_block_size_bytes_for_materialized_views +value: 0 +changed: 0 +description: Sets the minimum number of bytes in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones. This setting is applied only for blocks inserted into [materialized view](../../sql-reference/statements/create/view.md). By adjusting this setting, you control blocks squashing while pushing to materialized view and avoid excessive memory usage. + +Possible values: + +- Any positive integer. +- 0 — Squashing disabled. + +**See also** + +- [min_insert_block_size_bytes](#min-insert-block-size-bytes) +min: ᴺᵁᴸᴸ +max: ᴺᵁᴸᴸ +readonly: 0 +type: UInt64 +default: 0 +alias_for: +is_obsolete: 0 +tier: Production + ``` Using of `WHERE changed` can be useful, for example, when you want to check: diff --git a/src/Core/BaseSettings.cpp b/src/Core/BaseSettings.cpp index c535b9ce65e..7bfa581598d 100644 --- a/src/Core/BaseSettings.cpp +++ b/src/Core/BaseSettings.cpp @@ -8,6 +8,7 @@ namespace DB { namespace ErrorCodes { + extern const int INCORRECT_DATA; extern const int UNKNOWN_SETTING; } @@ -38,6 +39,14 @@ BaseSettingsHelpers::Flags BaseSettingsHelpers::readFlags(ReadBuffer & in) return static_cast(res); } +SettingsTierType BaseSettingsHelpers::getTier(Flags flags) +{ + int8_t tier = (flags & Flags::TIER); + if (tier > SettingsTierType::OBSOLETE) + throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown tier value: '{}'", tier); + return SettingsTierType{tier}; +} + void BaseSettingsHelpers::throwSettingNotFound(std::string_view name) { diff --git a/src/Core/BaseSettings.h b/src/Core/BaseSettings.h index 2a2e0bb334e..218460330f4 100644 --- a/src/Core/BaseSettings.h +++ b/src/Core/BaseSettings.h @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -21,6 +22,27 @@ namespace DB class ReadBuffer; class WriteBuffer; +struct BaseSettingsHelpers +{ + [[noreturn]] static void throwSettingNotFound(std::string_view name); + static void warningSettingNotFound(std::string_view name); + + static void writeString(std::string_view str, WriteBuffer & out); + static String readString(ReadBuffer & in); + + enum Flags : UInt64 + { + IMPORTANT = 0x01, + CUSTOM = 0x02, + TIER = 0x0c, /// 0b1100 == 2 bits + /// If adding new flags, consider first if Tier might need more bits + }; + + static SettingsTierType getTier(Flags flags); + static void writeFlags(Flags flags, WriteBuffer & out); + static Flags readFlags(ReadBuffer & in); +}; + /** Template class to define collections of settings. 
* If you create a new setting, please also add it to ./utils/check-style/check-settings-style * for validation @@ -138,7 +160,7 @@ public: const char * getTypeName() const; const char * getDescription() const; bool isCustom() const; - bool isObsolete() const; + SettingsTierType getTier() const; bool operator==(const SettingFieldRef & other) const { return (getName() == other.getName()) && (getValue() == other.getValue()); } bool operator!=(const SettingFieldRef & other) const { return !(*this == other); } @@ -225,24 +247,6 @@ private: std::conditional_t custom_settings_map; }; -struct BaseSettingsHelpers -{ - [[noreturn]] static void throwSettingNotFound(std::string_view name); - static void warningSettingNotFound(std::string_view name); - - static void writeString(std::string_view str, WriteBuffer & out); - static String readString(ReadBuffer & in); - - enum Flags : UInt64 - { - IMPORTANT = 0x01, - CUSTOM = 0x02, - OBSOLETE = 0x04, - }; - static void writeFlags(Flags flags, WriteBuffer & out); - static Flags readFlags(ReadBuffer & in); -}; - template void BaseSettings::set(std::string_view name, const Field & value) { @@ -797,14 +801,14 @@ bool BaseSettings::SettingFieldRef::isCustom() const } template -bool BaseSettings::SettingFieldRef::isObsolete() const +SettingsTierType BaseSettings::SettingFieldRef::getTier() const { if constexpr (Traits::allow_custom_settings) { if (custom_setting) - return false; + return SettingsTierType::PRODUCTION; } - return accessor->isObsolete(index); + return accessor->getTier(index); } using AliasMap = std::unordered_map; @@ -835,8 +839,8 @@ using AliasMap = std::unordered_map; const String & getName(size_t index) const { return field_infos[index].name; } \ const char * getTypeName(size_t index) const { return field_infos[index].type; } \ const char * getDescription(size_t index) const { return field_infos[index].description; } \ - bool isImportant(size_t index) const { return field_infos[index].is_important; } \ - bool isObsolete(size_t index) const { return field_infos[index].is_obsolete; } \ + bool isImportant(size_t index) const { return field_infos[index].flags & BaseSettingsHelpers::Flags::IMPORTANT; } \ + SettingsTierType getTier(size_t index) const { return BaseSettingsHelpers::getTier(field_infos[index].flags); } \ Field castValueUtil(size_t index, const Field & value) const { return field_infos[index].cast_value_util_function(value); } \ String valueToStringUtil(size_t index, const Field & value) const { return field_infos[index].value_to_string_util_function(value); } \ Field stringToValueUtil(size_t index, const String & str) const { return field_infos[index].string_to_value_util_function(str); } \ @@ -856,8 +860,7 @@ using AliasMap = std::unordered_map; String name; \ const char * type; \ const char * description; \ - bool is_important; \ - bool is_obsolete; \ + BaseSettingsHelpers::Flags flags; \ Field (*cast_value_util_function)(const Field &); \ String (*value_to_string_util_function)(const Field &); \ Field (*string_to_value_util_function)(const String &); \ @@ -968,8 +971,8 @@ struct DefineAliases /// NOLINTNEXTLINE #define IMPLEMENT_SETTINGS_TRAITS_(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) \ res.field_infos.emplace_back( \ - FieldInfo{#NAME, #TYPE, DESCRIPTION, (FLAGS) & IMPORTANT, \ - static_cast((FLAGS) & BaseSettingsHelpers::Flags::OBSOLETE), \ + FieldInfo{#NAME, #TYPE, DESCRIPTION, \ + static_cast(FLAGS), \ [](const Field & value) -> Field { return static_cast(SettingField##TYPE{value}); }, \ [](const Field & value) -> String { 
return SettingField##TYPE{value}.toString(); }, \ [](const String & str) -> Field { SettingField##TYPE temp; temp.parseFromString(str); return static_cast(temp); }, \ diff --git a/src/Core/ServerSettings.cpp b/src/Core/ServerSettings.cpp index 7c2cb49a2ba..326f151b12f 100644 --- a/src/Core/ServerSettings.cpp +++ b/src/Core/ServerSettings.cpp @@ -337,7 +337,7 @@ void ServerSettings::dumpToSystemServerSettingsColumns(ServerSettingColumnsParam res_columns[4]->insert(setting.getDescription()); res_columns[5]->insert(setting.getTypeName()); res_columns[6]->insert(is_changeable ? changeable_settings_it->second.second : ChangeableWithoutRestart::No); - res_columns[7]->insert(setting.isObsolete()); + res_columns[7]->insert(setting.getTier() == SettingsTierType::OBSOLETE); } } } diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 0aecb7cf941..54cd3ad9a4f 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -1,7 +1,5 @@ -#include #include #include -#include #include #include #include @@ -40,10 +38,15 @@ namespace ErrorCodes * Note: as an alternative, we could implement settings to be completely dynamic in the form of the map: String -> Field, * but we are not going to do it, because settings are used everywhere as static struct fields. * - * `flags` can be either 0 or IMPORTANT. + * `flags` can be either 0 or IMPORTANT + a Tier (PRODUCTION | BETA | EXPERIMENTAL) * A setting is "IMPORTANT" if it affects the results of queries and can't be ignored by older versions. + * Tiers: + * EXPERIMENTAL: The feature is in active development stage. Mostly for developers or for ClickHouse enthusiasts. + * BETA: There are no known bugs problems in the functionality, but the outcome of using it together with other + * features/components is unknown and correctness is not guaranteed. + * PRODUCTION (Default): The feature is safe to use along with other features from the PRODUCTION tier. * - * When adding new or changing existing settings add them to the settings changes history in SettingsChangesHistory.h + * When adding new or changing existing settings add them to the settings changes history in SettingsChangesHistory.cpp * for tracking settings changes in different versions and for special `compatibility` settings to work correctly. */ @@ -6007,7 +6010,7 @@ void SettingsImpl::checkNoSettingNamesAtTopLevel(const Poco::Util::AbstractConfi { const auto & name = setting.getName(); bool should_skip_check = name == "max_table_size_to_drop" || name == "max_partition_size_to_drop"; - if (config.has(name) && !setting.isObsolete() && !should_skip_check) + if (config.has(name) && (setting.getTier() != SettingsTierType::OBSOLETE) && !should_skip_check) { throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, "A setting '{}' appeared at top level in config {}." " But it is user-level setting that should be located in users.xml inside section for specific profile." 
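The two tier bits ride in the same flags word as IMPORTANT and CUSTOM, which is what makes getTier(index) a plain mask over field_infos[index].flags. Below is a self-contained sketch of that encoding; the names mirror the patch, but this is a standalone illustration, not the ClickHouse sources:

#include <cstdint>
#include <cstdio>

enum Flags : uint64_t
{
    IMPORTANT = 0x01,
    CUSTOM    = 0x02,
    TIER      = 0x0c, /// two bits reserved for the tier: 0b1100
};

enum Tier : int8_t
{
    PRODUCTION   = 0b0000,
    OBSOLETE     = 0b0100,
    EXPERIMENTAL = 0b1000,
    BETA         = 0b1100, /// numerically the largest encodable tier
};

Tier getTier(uint64_t flags)
{
    return static_cast<Tier>(flags & TIER); /// mask off IMPORTANT/CUSTOM, keep the tier bits
}

int main()
{
    uint64_t flags = IMPORTANT | BETA; /// tiers compose with IMPORTANT by bitwise OR
    std::printf("important=%d tier=%d\n", int(bool(flags & IMPORTANT)), int(getTier(flags))); /// important=1 tier=12
}

Note that with this layout BETA (0b1100) is the largest valid value of the tier field, which is why the bound in BaseSettingsHelpers::getTier's sanity check needs to compare against BETA rather than OBSOLETE — a later patch in this series adjusts exactly that.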
@@ -6183,7 +6186,7 @@ std::vector Settings::getChangedAndObsoleteNames() const std::vector setting_names; for (const auto & setting : impl->allChanged()) { - if (setting.isObsolete()) + if (setting.getTier() == SettingsTierType::OBSOLETE) setting_names.emplace_back(setting.getName()); } return setting_names; @@ -6232,7 +6235,8 @@ void Settings::dumpToSystemSettingsColumns(MutableColumnsAndConstraints & params res_columns[6]->insert(writability == SettingConstraintWritability::CONST); res_columns[7]->insert(setting.getTypeName()); res_columns[8]->insert(setting.getDefaultValueString()); - res_columns[10]->insert(setting.isObsolete()); + res_columns[10]->insert(setting.getTier() == SettingsTierType::OBSOLETE); + res_columns[11]->insert(setting.getTier()); }; const auto & settings_to_aliases = SettingsImpl::Traits::settingsToAliases(); diff --git a/src/Core/SettingsObsoleteMacros.h b/src/Core/SettingsObsoleteMacros.h index 97db1def294..c680cdc45b6 100644 --- a/src/Core/SettingsObsoleteMacros.h +++ b/src/Core/SettingsObsoleteMacros.h @@ -2,8 +2,8 @@ // clang-format off #define MAKE_OBSOLETE(M, TYPE, NAME, DEFAULT) \ - M(TYPE, NAME, DEFAULT, "Obsolete setting, does nothing.", BaseSettingsHelpers::Flags::OBSOLETE) + M(TYPE, NAME, DEFAULT, "Obsolete setting, does nothing.", SettingsTierType::OBSOLETE) /// NOTE: ServerSettings::loadSettingsFromConfig() should be updated to include this settings #define MAKE_DEPRECATED_BY_SERVER_CONFIG(M, TYPE, NAME, DEFAULT) \ - M(TYPE, NAME, DEFAULT, "User-level setting is deprecated, and it must be defined in the server configuration instead.", BaseSettingsHelpers::Flags::OBSOLETE) + M(TYPE, NAME, DEFAULT, "User-level setting is deprecated, and it must be defined in the server configuration instead.", SettingsTierType::OBSOLETE) diff --git a/src/Core/SettingsTierType.cpp b/src/Core/SettingsTierType.cpp new file mode 100644 index 00000000000..48090f26fae --- /dev/null +++ b/src/Core/SettingsTierType.cpp @@ -0,0 +1,19 @@ +#include +#include + +namespace DB +{ + +std::shared_ptr getSettingsTierEnum() +{ + return std::make_shared( + DataTypeEnum8::Values + { + {"Production", static_cast(SettingsTierType::PRODUCTION)}, + {"Obsolete", static_cast(SettingsTierType::OBSOLETE)}, + {"Experimental", static_cast(SettingsTierType::EXPERIMENTAL)}, + {"Beta", static_cast(SettingsTierType::BETA)} + }); +} + +} diff --git a/src/Core/SettingsTierType.h b/src/Core/SettingsTierType.h new file mode 100644 index 00000000000..d8bba89bc18 --- /dev/null +++ b/src/Core/SettingsTierType.h @@ -0,0 +1,26 @@ +#pragma once + +#include + +#include +#include + +namespace DB +{ + +template +class DataTypeEnum; +using DataTypeEnum8 = DataTypeEnum; + +// Make it signed for compatibility with DataTypeEnum8 +enum SettingsTierType : int8_t +{ + PRODUCTION = 0b0000, + OBSOLETE = 0b0100, + EXPERIMENTAL = 0b1000, + BETA = 0b1100 +}; + +std::shared_ptr getSettingsTierEnum(); + +} diff --git a/src/Storages/MergeTree/MergeTreeSettings.cpp b/src/Storages/MergeTree/MergeTreeSettings.cpp index 8c6aafe48f2..b95b3a856de 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.cpp +++ b/src/Storages/MergeTree/MergeTreeSettings.cpp @@ -238,7 +238,7 @@ namespace ErrorCodes DECLARE(DeduplicateMergeProjectionMode, deduplicate_merge_projection_mode, DeduplicateMergeProjectionMode::THROW, "Whether to allow create projection for the table with non-classic MergeTree. Ignore option is purely for compatibility which might result in incorrect answer. 
Otherwise, if allowed, what is the action when merge, drop or rebuild.", 0) \ #define MAKE_OBSOLETE_MERGE_TREE_SETTING(M, TYPE, NAME, DEFAULT) \ - M(TYPE, NAME, DEFAULT, "Obsolete setting, does nothing.", BaseSettingsHelpers::Flags::OBSOLETE) + M(TYPE, NAME, DEFAULT, "Obsolete setting, does nothing.", SettingsTierType::OBSOLETE) #define OBSOLETE_MERGE_TREE_SETTINGS(M, ALIAS) \ /** Obsolete settings that do nothing but left for compatibility reasons. */ \ @@ -648,7 +648,7 @@ void MergeTreeSettings::dumpToSystemMergeTreeSettingsColumns(MutableColumnsAndCo res_columns[5]->insert(max); res_columns[6]->insert(writability == SettingConstraintWritability::CONST); res_columns[7]->insert(setting.getTypeName()); - res_columns[8]->insert(setting.isObsolete()); + res_columns[8]->insert(setting.getTier() == SettingsTierType::OBSOLETE); } } diff --git a/src/Storages/System/StorageSystemSettings.cpp b/src/Storages/System/StorageSystemSettings.cpp index 9309f10378e..debd40386a6 100644 --- a/src/Storages/System/StorageSystemSettings.cpp +++ b/src/Storages/System/StorageSystemSettings.cpp @@ -2,6 +2,8 @@ #include #include +#include +#include #include #include #include @@ -34,6 +36,14 @@ ColumnsDescription StorageSystemSettings::getColumnsDescription() {"default", std::make_shared(), "Setting default value."}, {"alias_for", std::make_shared(), "Flag that shows whether this name is an alias to another setting."}, {"is_obsolete", std::make_shared(), "Shows whether a setting is obsolete."}, + {"tier", getSettingsTierEnum(), R"( +Support level for this feature. ClickHouse features are organized in tiers, varying depending on the current status of their +development and the expectations one might have when using them: +* PRODUCTION: The feature is stable, safe to use and does not have issues interacting with other PRODUCTION features. +* BETA: The feature is stable and safe. The outcome of using it together with other features is unknown and correctness is not guaranteed. Testing and reports are welcome. +* EXPERIMENTAL: The feature is under development. Only intended for developers and ClickHouse enthusiasts. The feature might or might not work and could be removed at any time. +* OBSOLETE: No longer supported. Either it is already removed or it will be removed in future releases. +)"}, }; } From 309f18debef94455e1d50ca08fc9dbe3baa54796 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 28 Oct 2024 17:26:20 +0100 Subject: [PATCH 15/80] Mark some NON-PRODUCTION settings --- src/Core/BaseSettings.cpp | 2 +- src/Core/Settings.cpp | 403 +++++++++--------- src/Storages/MergeTree/MergeTreeSettings.cpp | 19 +- .../System/StorageSystemMergeTreeSettings.cpp | 11 +- 4 files changed, 227 insertions(+), 208 deletions(-) diff --git a/src/Core/BaseSettings.cpp b/src/Core/BaseSettings.cpp index 7bfa581598d..51e99262bdb 100644 --- a/src/Core/BaseSettings.cpp +++ b/src/Core/BaseSettings.cpp @@ -42,7 +42,7 @@ BaseSettingsHelpers::Flags BaseSettingsHelpers::readFlags(ReadBuffer & in) SettingsTierType BaseSettingsHelpers::getTier(Flags flags) { int8_t tier = (flags & Flags::TIER); - if (tier > SettingsTierType::OBSOLETE) + if (tier > SettingsTierType::BETA) throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown tier value: '{}'", tier); return SettingsTierType{tier}; } diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 54cd3ad9a4f..4159758fe76 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -5506,90 +5506,102 @@ For testing purposes. 
Replaces all external table functions to Null to not initi DECLARE(Bool, restore_replace_external_dictionary_source_to_null, false, R"( Replace external dictionary sources to Null on restore. Useful for testing purposes )", 0) \ - DECLARE(Bool, create_if_not_exists, false, R"( -Enable `IF NOT EXISTS` for `CREATE` statement by default. If either this setting or `IF NOT EXISTS` is specified and a table with the provided name already exists, no exception will be thrown. -)", 0) \ - DECLARE(Bool, enforce_strict_identifier_format, false, R"( -If enabled, only allow identifiers containing alphanumeric characters and underscores. -)", 0) \ - DECLARE(Bool, mongodb_throw_on_unsupported_query, true, R"( -If enabled, MongoDB tables will return an error when a MongoDB query cannot be built. Otherwise, ClickHouse reads the full table and processes it locally. This option does not apply to the legacy implementation or when 'allow_experimental_analyzer=0'. -)", 0) \ - \ - /* ###################################### */ \ - /* ######## EXPERIMENTAL FEATURES ####### */ \ - /* ###################################### */ \ - DECLARE(Bool, allow_experimental_materialized_postgresql_table, false, R"( -Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental -)", 0) \ - DECLARE(Bool, allow_experimental_funnel_functions, false, R"( -Enable experimental functions for funnel analysis. -)", 0) \ - DECLARE(Bool, allow_experimental_nlp_functions, false, R"( -Enable experimental functions for natural language processing. -)", 0) \ - DECLARE(Bool, allow_experimental_hash_functions, false, R"( -Enable experimental hash functions -)", 0) \ - DECLARE(Bool, allow_experimental_object_type, false, R"( -Allow Object and JSON data types -)", 0) \ - DECLARE(Bool, allow_experimental_time_series_table, false, R"( -Allows creation of tables with the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine. + /* Parallel replicas */ \ + DECLARE(UInt64, allow_experimental_parallel_reading_from_replicas, 0, R"( +Use up to `max_parallel_replicas` the number of replicas from each shard for SELECT query execution. Reading is parallelized and coordinated dynamically. 0 - disabled, 1 - enabled, silently disable them in case of failure, 2 - enabled, throw an exception in case of failure +)", BETA) ALIAS(enable_parallel_replicas) \ + DECLARE(NonZeroUInt64, max_parallel_replicas, 1, R"( +The maximum number of replicas for each shard when executing a query. Possible values: -- 0 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is disabled. -- 1 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is enabled. -)", 0) \ - DECLARE(Bool, allow_experimental_vector_similarity_index, false, R"( -Allow experimental vector similarity index -)", 0) \ - DECLARE(Bool, allow_experimental_variant_type, false, R"( -Allows creation of experimental [Variant](../../sql-reference/data-types/variant.md). -)", 0) \ - DECLARE(Bool, allow_experimental_dynamic_type, false, R"( -Allow Dynamic data type -)", 0) \ - DECLARE(Bool, allow_experimental_json_type, false, R"( -Allow JSON data type -)", 0) \ - DECLARE(Bool, allow_experimental_codecs, false, R"( -If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing). -)", 0) \ - DECLARE(Bool, allow_experimental_shared_set_join, true, R"( -Only in ClickHouse Cloud. 
Allow to create ShareSet and SharedJoin -)", 0) \ - DECLARE(UInt64, max_limit_for_ann_queries, 1'000'000, R"( -SELECT queries with LIMIT bigger than this setting cannot use vector similarity indexes. Helps to prevent memory overflows in vector similarity indexes. -)", 0) \ - DECLARE(UInt64, hnsw_candidate_list_size_for_search, 256, R"( -The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'. -)", 0) \ - DECLARE(Bool, throw_on_unsupported_query_inside_transaction, true, R"( -Throw exception if unsupported query is used inside transaction -)", 0) \ - DECLARE(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, R"( -Wait for committed changes to become actually visible in the latest snapshot -)", 0) \ - DECLARE(Bool, implicit_transaction, false, R"( -If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback) -)", 0) \ - DECLARE(UInt64, grace_hash_join_initial_buckets, 1, R"( -Initial number of grace hash join buckets -)", 0) \ - DECLARE(UInt64, grace_hash_join_max_buckets, 1024, R"( -Limit on the number of grace hash join buckets -)", 0) \ - DECLARE(UInt64, join_to_sort_minimum_perkey_rows, 40, R"( -The lower limit of per-key average rows in the right table to determine whether to rerange the right table by key in left or inner join. This setting ensures that the optimization is not applied for sparse table keys -)", 0) \ - DECLARE(UInt64, join_to_sort_maximum_table_rows, 10000, R"( -The maximum number of rows in the right table to determine whether to rerange the right table by key in left or inner join. -)", 0) \ - DECLARE(Bool, allow_experimental_join_right_table_sorting, false, R"( -If it is set to true, and the conditions of `join_to_sort_minimum_perkey_rows` and `join_to_sort_maximum_table_rows` are met, rerange the right table by key to improve the performance in left or inner hash join. +- Positive integer. + +**Additional Info** + +This options will produce different results depending on the settings used. + +:::note +This setting will produce incorrect results when joins or subqueries are involved, and all tables don't meet certain requirements. See [Distributed Subqueries and max_parallel_replicas](../../sql-reference/operators/in.md/#max_parallel_replica-subqueries) for more details. +::: + +### Parallel processing using `SAMPLE` key + +A query may be processed faster if it is executed on several servers in parallel. But the query performance may degrade in the following cases: + +- The position of the sampling key in the partitioning key does not allow efficient range scans. +- Adding a sampling key to the table makes filtering by other columns less efficient. +- The sampling key is an expression that is expensive to calculate. +- The cluster latency distribution has a long tail, so that querying more servers increases the query overall latency. + +### Parallel processing using [parallel_replicas_custom_key](#parallel_replicas_custom_key) + +This setting is useful for any replicated table. )", 0) \ + DECLARE(ParallelReplicasMode, parallel_replicas_mode, ParallelReplicasMode::READ_TASKS, R"( +Type of filter to use with custom key for parallel replicas. default - use modulo operation on the custom key, range - use range filter on custom key using all possible values for the value type of custom key. 
+)", BETA) \ + DECLARE(UInt64, parallel_replicas_count, 0, R"( +This is internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the number of parallel replicas participating in query processing. +)", BETA) \ + DECLARE(UInt64, parallel_replica_offset, 0, R"( +This is internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the index of the replica participating in query processing among parallel replicas. +)", BETA) \ + DECLARE(String, parallel_replicas_custom_key, "", R"( +An arbitrary integer expression that can be used to split work between replicas for a specific table. +The value can be any integer expression. + +Simple expressions using primary keys are preferred. + +If the setting is used on a cluster that consists of a single shard with multiple replicas, those replicas will be converted into virtual shards. +Otherwise, it will behave same as for `SAMPLE` key, it will use multiple replicas of each shard. +)", BETA) \ + DECLARE(UInt64, parallel_replicas_custom_key_range_lower, 0, R"( +Allows the filter type `range` to split the work evenly between replicas based on the custom range `[parallel_replicas_custom_key_range_lower, INT_MAX]`. + +When used in conjunction with [parallel_replicas_custom_key_range_upper](#parallel_replicas_custom_key_range_upper), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`. + +Note: This setting will not cause any additional data to be filtered during query processing, rather it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing. +)", BETA) \ + DECLARE(UInt64, parallel_replicas_custom_key_range_upper, 0, R"( +Allows the filter type `range` to split the work evenly between replicas based on the custom range `[0, parallel_replicas_custom_key_range_upper]`. A value of 0 disables the upper bound, setting it the max value of the custom key expression. + +When used in conjunction with [parallel_replicas_custom_key_range_lower](#parallel_replicas_custom_key_range_lower), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`. + +Note: This setting will not cause any additional data to be filtered during query processing, rather it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing +)", BETA) \ + DECLARE(String, cluster_for_parallel_replicas, "", R"( +Cluster for a shard in which current server is located +)", BETA) \ + DECLARE(Bool, parallel_replicas_allow_in_with_subquery, true, R"( +If true, subquery for IN will be executed on every follower replica. +)", BETA) \ + DECLARE(Float, parallel_replicas_single_task_marks_count_multiplier, 2, R"( +A multiplier which will be added during calculation for minimal number of marks to retrieve from coordinator. This will be applied only for remote replicas. 
+)", BETA) \ + DECLARE(Bool, parallel_replicas_for_non_replicated_merge_tree, false, R"( +If true, ClickHouse will use parallel replicas algorithm also for non-replicated MergeTree tables +)", BETA) \ + DECLARE(UInt64, parallel_replicas_min_number_of_rows_per_replica, 0, R"( +Limit the number of replicas used in a query to (estimated rows to read / min_number_of_rows_per_replica). The max is still limited by 'max_parallel_replicas' +)", BETA) \ + DECLARE(Bool, parallel_replicas_prefer_local_join, true, R"( +If true, and JOIN can be executed with parallel replicas algorithm, and all storages of right JOIN part are *MergeTree, local JOIN will be used instead of GLOBAL JOIN. +)", BETA) \ + DECLARE(UInt64, parallel_replicas_mark_segment_size, 0, R"( +Parts virtually divided into segments to be distributed between replicas for parallel reading. This setting controls the size of these segments. Not recommended to change until you're absolutely sure in what you're doing. Value should be in range [128; 16384] +)", BETA) \ + DECLARE(Bool, parallel_replicas_local_plan, false, R"( +Build local plan for local replica +)", BETA) \ + \ + DECLARE(Bool, allow_experimental_analyzer, true, R"( +Allow new query analyzer. +)", IMPORTANT | BETA) ALIAS(enable_analyzer) \ + DECLARE(Bool, analyzer_compatibility_join_using_top_level_identifier, false, R"( +Force to resolve identifier in JOIN USING from projection (for example, in `SELECT a + 1 AS b FROM t1 JOIN t2 USING (b)` join will be performed by `t1.a + 1 = t2.b`, rather then `t1.b = t2.b`). +)", BETA) \ + \ DECLARE(Timezone, session_timezone, "", R"( Sets the implicit time zone of the current session or query. The implicit time zone is the time zone applied to values of type DateTime/DateTime64 which have no explicitly specified time zone. @@ -5649,126 +5661,121 @@ This happens due to different parsing pipelines: **See also** - [timezone](../server-configuration-parameters/settings.md#timezone) +)", BETA) \ +DECLARE(Bool, create_if_not_exists, false, R"( +Enable `IF NOT EXISTS` for `CREATE` statement by default. If either this setting or `IF NOT EXISTS` is specified and a table with the provided name already exists, no exception will be thrown. +)", 0) \ + DECLARE(Bool, enforce_strict_identifier_format, false, R"( +If enabled, only allow identifiers containing alphanumeric characters and underscores. +)", 0) \ + DECLARE(Bool, mongodb_throw_on_unsupported_query, true, R"( +If enabled, MongoDB tables will return an error when a MongoDB query cannot be built. Otherwise, ClickHouse reads the full table and processes it locally. This option does not apply to the legacy implementation or when 'allow_experimental_analyzer=0'. +)", 0) \ + DECLARE(Bool, implicit_select, false, R"( +Allow writing simple SELECT queries without the leading SELECT keyword, which makes it simple for calculator-style usage, e.g. `1 + 2` becomes a valid query. 
)", 0) \ - DECLARE(Bool, use_hive_partitioning, false, R"( -When enabled, ClickHouse will detect Hive-style partitioning in path (`/name=value/`) in file-like table engines [File](../../engines/table-engines/special/file.md#hive-style-partitioning)/[S3](../../engines/table-engines/integrations/s3.md#hive-style-partitioning)/[URL](../../engines/table-engines/special/url.md#hive-style-partitioning)/[HDFS](../../engines/table-engines/integrations/hdfs.md#hive-style-partitioning)/[AzureBlobStorage](../../engines/table-engines/integrations/azureBlobStorage.md#hive-style-partitioning) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. -)", 0)\ \ - DECLARE(Bool, allow_statistics_optimize, false, R"( -Allows using statistics to optimize queries -)", 0) ALIAS(allow_statistic_optimize) \ - DECLARE(Bool, allow_experimental_statistics, false, R"( -Allows defining columns with [statistics](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) and [manipulate statistics](../../engines/table-engines/mergetree-family/mergetree.md#column-statistics). -)", 0) ALIAS(allow_experimental_statistic) \ \ - /* Parallel replicas */ \ - DECLARE(UInt64, allow_experimental_parallel_reading_from_replicas, 0, R"( -Use up to `max_parallel_replicas` the number of replicas from each shard for SELECT query execution. Reading is parallelized and coordinated dynamically. 0 - disabled, 1 - enabled, silently disable them in case of failure, 2 - enabled, throw an exception in case of failure -)", 0) ALIAS(enable_parallel_replicas) \ - DECLARE(NonZeroUInt64, max_parallel_replicas, 1, R"( -The maximum number of replicas for each shard when executing a query. + /* ####################################################### */ \ + /* ########### START OF EXPERIMENTAL FEATURES ############ */ \ + /* ## ADD PRODUCTION / BETA FEATURES BEFORE THIS BLOCK ## */ \ + /* ####################################################### */ \ + \ + DECLARE(Bool, allow_experimental_materialized_postgresql_table, false, R"( +Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_funnel_functions, false, R"( +Enable experimental functions for funnel analysis. +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_nlp_functions, false, R"( +Enable experimental functions for natural language processing. +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_hash_functions, false, R"( +Enable experimental hash functions +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_object_type, false, R"( +Allow Object and JSON data types +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_time_series_table, false, R"( +Allows creation of tables with the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine. Possible values: -- Positive integer. - -**Additional Info** - -This options will produce different results depending on the settings used. - -:::note -This setting will produce incorrect results when joins or subqueries are involved, and all tables don't meet certain requirements. See [Distributed Subqueries and max_parallel_replicas](../../sql-reference/operators/in.md/#max_parallel_replica-subqueries) for more details. -::: - -### Parallel processing using `SAMPLE` key - -A query may be processed faster if it is executed on several servers in parallel. 
But the query performance may degrade in the following cases: - -- The position of the sampling key in the partitioning key does not allow efficient range scans. -- Adding a sampling key to the table makes filtering by other columns less efficient. -- The sampling key is an expression that is expensive to calculate. -- The cluster latency distribution has a long tail, so that querying more servers increases the query overall latency. - -### Parallel processing using [parallel_replicas_custom_key](#parallel_replicas_custom_key) - -This setting is useful for any replicated table. -)", 0) \ - DECLARE(ParallelReplicasMode, parallel_replicas_mode, ParallelReplicasMode::READ_TASKS, R"( -Type of filter to use with custom key for parallel replicas. default - use modulo operation on the custom key, range - use range filter on custom key using all possible values for the value type of custom key. -)", 0) \ - DECLARE(UInt64, parallel_replicas_count, 0, R"( -This is internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the number of parallel replicas participating in query processing. -)", 0) \ - DECLARE(UInt64, parallel_replica_offset, 0, R"( -This is internal setting that should not be used directly and represents an implementation detail of the 'parallel replicas' mode. This setting will be automatically set up by the initiator server for distributed queries to the index of the replica participating in query processing among parallel replicas. -)", 0) \ - DECLARE(String, parallel_replicas_custom_key, "", R"( -An arbitrary integer expression that can be used to split work between replicas for a specific table. -The value can be any integer expression. - -Simple expressions using primary keys are preferred. - -If the setting is used on a cluster that consists of a single shard with multiple replicas, those replicas will be converted into virtual shards. -Otherwise, it will behave same as for `SAMPLE` key, it will use multiple replicas of each shard. -)", 0) \ - DECLARE(UInt64, parallel_replicas_custom_key_range_lower, 0, R"( -Allows the filter type `range` to split the work evenly between replicas based on the custom range `[parallel_replicas_custom_key_range_lower, INT_MAX]`. - -When used in conjunction with [parallel_replicas_custom_key_range_upper](#parallel_replicas_custom_key_range_upper), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`. - -Note: This setting will not cause any additional data to be filtered during query processing, rather it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing. -)", 0) \ - DECLARE(UInt64, parallel_replicas_custom_key_range_upper, 0, R"( -Allows the filter type `range` to split the work evenly between replicas based on the custom range `[0, parallel_replicas_custom_key_range_upper]`. A value of 0 disables the upper bound, setting it the max value of the custom key expression. - -When used in conjunction with [parallel_replicas_custom_key_range_lower](#parallel_replicas_custom_key_range_lower), it lets the filter evenly split the work over replicas for the range `[parallel_replicas_custom_key_range_lower, parallel_replicas_custom_key_range_upper]`. 
- -Note: This setting will not cause any additional data to be filtered during query processing, rather it changes the points at which the range filter breaks up the range `[0, INT_MAX]` for parallel processing -)", 0) \ - DECLARE(String, cluster_for_parallel_replicas, "", R"( -Cluster for a shard in which current server is located -)", 0) \ - DECLARE(Bool, parallel_replicas_allow_in_with_subquery, true, R"( -If true, subquery for IN will be executed on every follower replica. -)", 0) \ - DECLARE(Float, parallel_replicas_single_task_marks_count_multiplier, 2, R"( -A multiplier which will be added during calculation for minimal number of marks to retrieve from coordinator. This will be applied only for remote replicas. -)", 0) \ - DECLARE(Bool, parallel_replicas_for_non_replicated_merge_tree, false, R"( -If true, ClickHouse will use parallel replicas algorithm also for non-replicated MergeTree tables -)", 0) \ - DECLARE(UInt64, parallel_replicas_min_number_of_rows_per_replica, 0, R"( -Limit the number of replicas used in a query to (estimated rows to read / min_number_of_rows_per_replica). The max is still limited by 'max_parallel_replicas' -)", 0) \ - DECLARE(Bool, parallel_replicas_prefer_local_join, true, R"( -If true, and JOIN can be executed with parallel replicas algorithm, and all storages of right JOIN part are *MergeTree, local JOIN will be used instead of GLOBAL JOIN. -)", 0) \ - DECLARE(UInt64, parallel_replicas_mark_segment_size, 0, R"( -Parts virtually divided into segments to be distributed between replicas for parallel reading. This setting controls the size of these segments. Not recommended to change until you're absolutely sure in what you're doing. Value should be in range [128; 16384] +- 0 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is disabled. +- 1 — the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine is enabled. )", 0) \ + DECLARE(Bool, allow_experimental_vector_similarity_index, false, R"( +Allow experimental vector similarity index +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_variant_type, false, R"( +Allows creation of experimental [Variant](../../sql-reference/data-types/variant.md). +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_dynamic_type, false, R"( +Allow Dynamic data type +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_json_type, false, R"( +Allow JSON data type +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_codecs, false, R"( +If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing). +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_shared_set_join, true, R"( +Only in ClickHouse Cloud. Allow to create ShareSet and SharedJoin +)", EXPERIMENTAL) \ + DECLARE(UInt64, max_limit_for_ann_queries, 1'000'000, R"( +SELECT queries with LIMIT bigger than this setting cannot use vector similarity indexes. Helps to prevent memory overflows in vector similarity indexes. +)", EXPERIMENTAL) \ + DECLARE(UInt64, hnsw_candidate_list_size_for_search, 256, R"( +The size of the dynamic candidate list when searching the vector similarity index, also known as 'ef_search'. 
+)", EXPERIMENTAL) \ + DECLARE(Bool, throw_on_unsupported_query_inside_transaction, true, R"( +Throw exception if unsupported query is used inside transaction +)", EXPERIMENTAL) \ + DECLARE(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, R"( +Wait for committed changes to become actually visible in the latest snapshot +)", EXPERIMENTAL) \ + DECLARE(Bool, implicit_transaction, false, R"( +If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback) +)", EXPERIMENTAL) \ + DECLARE(UInt64, grace_hash_join_initial_buckets, 1, R"( +Initial number of grace hash join buckets +)", EXPERIMENTAL) \ + DECLARE(UInt64, grace_hash_join_max_buckets, 1024, R"( +Limit on the number of grace hash join buckets +)", EXPERIMENTAL) \ + DECLARE(UInt64, join_to_sort_minimum_perkey_rows, 40, R"( +The lower limit of per-key average rows in the right table to determine whether to rerange the right table by key in left or inner join. This setting ensures that the optimization is not applied for sparse table keys +)", EXPERIMENTAL) \ + DECLARE(UInt64, join_to_sort_maximum_table_rows, 10000, R"( +The maximum number of rows in the right table to determine whether to rerange the right table by key in left or inner join. +)", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_join_right_table_sorting, false, R"( +If it is set to true, and the conditions of `join_to_sort_minimum_perkey_rows` and `join_to_sort_maximum_table_rows` are met, rerange the right table by key to improve the performance in left or inner hash join. +)", EXPERIMENTAL) \ + DECLARE(Bool, use_hive_partitioning, false, R"( +When enabled, ClickHouse will detect Hive-style partitioning in path (`/name=value/`) in file-like table engines [File](../../engines/table-engines/special/file.md#hive-style-partitioning)/[S3](../../engines/table-engines/integrations/s3.md#hive-style-partitioning)/[URL](../../engines/table-engines/special/url.md#hive-style-partitioning)/[HDFS](../../engines/table-engines/integrations/hdfs.md#hive-style-partitioning)/[AzureBlobStorage](../../engines/table-engines/integrations/azureBlobStorage.md#hive-style-partitioning) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. +)", EXPERIMENTAL)\ + \ + DECLARE(Bool, allow_statistics_optimize, false, R"( +Allows using statistics to optimize queries +)", EXPERIMENTAL) ALIAS(allow_statistic_optimize) \ + DECLARE(Bool, allow_experimental_statistics, false, R"( +Allows defining columns with [statistics](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) and [manipulate statistics](../../engines/table-engines/mergetree-family/mergetree.md#column-statistics). +)", EXPERIMENTAL) ALIAS(allow_experimental_statistic) \ + \ DECLARE(Bool, allow_archive_path_syntax, true, R"( File/S3 engines/table function will parse paths with '::' as '\\ :: \\' if archive has correct extension -)", 0) \ - DECLARE(Bool, parallel_replicas_local_plan, false, R"( -Build local plan for local replica -)", 0) \ +)", EXPERIMENTAL) \ \ DECLARE(Bool, allow_experimental_inverted_index, false, R"( If it is set to true, allow to use experimental inverted index. -)", 0) \ +)", EXPERIMENTAL) \ DECLARE(Bool, allow_experimental_full_text_index, false, R"( If it is set to true, allow to use experimental full-text index. 
-)", 0) \ +)", EXPERIMENTAL) \ \ DECLARE(Bool, allow_experimental_join_condition, false, R"( Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y. -)", 0) \ - \ - DECLARE(Bool, allow_experimental_analyzer, true, R"( -Allow new query analyzer. -)", IMPORTANT) ALIAS(enable_analyzer) \ - DECLARE(Bool, analyzer_compatibility_join_using_top_level_identifier, false, R"( -Force to resolve identifier in JOIN USING from projection (for example, in `SELECT a + 1 AS b FROM t1 JOIN t2 USING (b)` join will be performed by `t1.a + 1 = t2.b`, rather then `t1.b = t2.b`). )", 0) \ \ DECLARE(Bool, allow_experimental_live_view, false, R"( @@ -5781,43 +5788,45 @@ Possible values: )", 0) \ DECLARE(Seconds, live_view_heartbeat_interval, 15, R"( The heartbeat interval in seconds to indicate live query is alive. -)", 0) \ +)", EXPERIMENTAL) \ DECLARE(UInt64, max_live_view_insert_blocks_before_refresh, 64, R"( Limit maximum number of inserted blocks after which mergeable blocks are dropped and query is re-executed. -)", 0) \ +)", EXPERIMENTAL) \ \ DECLARE(Bool, allow_experimental_window_view, false, R"( Enable WINDOW VIEW. Not mature enough. -)", 0) \ +)", EXPERIMENTAL) \ DECLARE(Seconds, window_view_clean_interval, 60, R"( The clean interval of window view in seconds to free outdated data. -)", 0) \ +)", EXPERIMENTAL) \ DECLARE(Seconds, window_view_heartbeat_interval, 15, R"( The heartbeat interval in seconds to indicate watch query is alive. -)", 0) \ +)", EXPERIMENTAL) \ DECLARE(Seconds, wait_for_window_view_fire_signal_timeout, 10, R"( Timeout for waiting for window view fire signal in event time processing -)", 0) \ +)", EXPERIMENTAL) \ \ DECLARE(Bool, stop_refreshable_materialized_views_on_startup, false, R"( On server startup, prevent scheduling of refreshable materialized views, as if with SYSTEM STOP VIEWS. You can manually start them with SYSTEM START VIEWS or SYSTEM START VIEW \\ afterwards. Also applies to newly created views. Has no effect on non-refreshable materialized views. -)", 0) \ +)", EXPERIMENTAL) \ \ DECLARE(Bool, allow_experimental_database_materialized_mysql, false, R"( Allow to create database with Engine=MaterializedMySQL(...). -)", 0) \ +)", EXPERIMENTAL) \ DECLARE(Bool, allow_experimental_database_materialized_postgresql, false, R"( Allow to create database with Engine=MaterializedPostgreSQL(...). -)", 0) \ +)", EXPERIMENTAL) \ \ /** Experimental feature for moving data between shards. */ \ DECLARE(Bool, allow_experimental_query_deduplication, false, R"( Experimental data deduplication for SELECT queries based on part UUIDs -)", 0) \ - DECLARE(Bool, implicit_select, false, R"( -Allow writing simple SELECT queries without the leading SELECT keyword, which makes it simple for calculator-style usage, e.g. `1 + 2` becomes a valid query. -)", 0) - +)", EXPERIMENTAL) \ + \ + /* ####################################################### */ \ + /* ############ END OF EXPERIMENTAL FEATURES ############# */ \ + /* ## ADD PRODUCTION / BETA FEATURES BEFORE THIS BLOCK ## */ \ + /* ####################################################### */ \ + /* ####################################################### */ \ // End of COMMON_SETTINGS // Please add settings related to formats in Core/FormatFactorySettings.h, move obsolete settings to OBSOLETE_SETTINGS and obsolete format settings to OBSOLETE_FORMAT_SETTINGS. 
diff --git a/src/Storages/MergeTree/MergeTreeSettings.cpp b/src/Storages/MergeTree/MergeTreeSettings.cpp index b95b3a856de..36e146f4624 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.cpp +++ b/src/Storages/MergeTree/MergeTreeSettings.cpp @@ -88,7 +88,7 @@ namespace ErrorCodes DECLARE(UInt64, min_age_to_force_merge_seconds, 0, "If all parts in a certain range are older than this value, range will be always eligible for merging. Set to 0 to disable.", 0) \ DECLARE(Bool, min_age_to_force_merge_on_partition_only, false, "Whether min_age_to_force_merge_seconds should be applied only on the entire partition and not on subset.", false) \ DECLARE(UInt64, number_of_free_entries_in_pool_to_execute_optimize_entire_partition, 25, "When there is less than specified number of free entries in pool, do not try to execute optimize entire partition with a merge (this merge is created when set min_age_to_force_merge_seconds > 0 and min_age_to_force_merge_on_partition_only = true). This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ - DECLARE(Bool, remove_rolled_back_parts_immediately, 1, "Setting for an incomplete experimental feature.", 0) \ + DECLARE(Bool, remove_rolled_back_parts_immediately, 1, "Setting for an incomplete experimental feature.", EXPERIMENTAL) \ DECLARE(UInt64, replicated_max_mutations_in_one_entry, 10000, "Max number of mutation commands that can be merged together and executed in one MUTATE_PART entry (0 means unlimited)", 0) \ DECLARE(UInt64, number_of_mutations_to_delay, 500, "If table has at least that many unfinished mutations, artificially slow down mutations of table. Disabled if set to 0", 0) \ DECLARE(UInt64, number_of_mutations_to_throw, 1000, "If table has at least that many unfinished mutations, throw 'Too many mutations' exception. Disabled if set to 0", 0) \ @@ -214,14 +214,14 @@ namespace ErrorCodes DECLARE(Bool, enable_block_offset_column, false, "Enable persisting column _block_offset for each row.", 0) \ \ /** Experimental/work in progress feature. Unsafe for production. */ \ - DECLARE(UInt64, part_moves_between_shards_enable, 0, "Experimental/Incomplete feature to move parts between shards. Does not take into account sharding expressions.", 0) \ - DECLARE(UInt64, part_moves_between_shards_delay_seconds, 30, "Time to wait before/after moving parts between shards.", 0) \ - DECLARE(Bool, allow_remote_fs_zero_copy_replication, false, "Don't use this setting in production, because it is not ready.", 0) \ - DECLARE(String, remote_fs_zero_copy_zookeeper_path, "/clickhouse/zero_copy", "ZooKeeper path for zero-copy table-independent info.", 0) \ - DECLARE(Bool, remote_fs_zero_copy_path_compatible_mode, false, "Run zero-copy in compatible mode during conversion process.", 0) \ - DECLARE(Bool, cache_populated_by_fetch, false, "Only available in ClickHouse Cloud", 0) \ - DECLARE(Bool, force_read_through_cache_for_merges, false, "Force read-through filesystem cache for merges", 0) \ - DECLARE(Bool, allow_experimental_replacing_merge_with_cleanup, false, "Allow experimental CLEANUP merges for ReplacingMergeTree with is_deleted column.", 0) \ + DECLARE(UInt64, part_moves_between_shards_enable, 0, "Experimental/Incomplete feature to move parts between shards. 
Does not take into account sharding expressions.", EXPERIMENTAL) \ + DECLARE(UInt64, part_moves_between_shards_delay_seconds, 30, "Time to wait before/after moving parts between shards.", EXPERIMENTAL) \ + DECLARE(Bool, allow_remote_fs_zero_copy_replication, false, "Don't use this setting in production, because it is not ready.", BETA) \ + DECLARE(String, remote_fs_zero_copy_zookeeper_path, "/clickhouse/zero_copy", "ZooKeeper path for zero-copy table-independent info.", EXPERIMENTAL) \ + DECLARE(Bool, remote_fs_zero_copy_path_compatible_mode, false, "Run zero-copy in compatible mode during conversion process.", EXPERIMENTAL) \ + DECLARE(Bool, cache_populated_by_fetch, false, "Only available in ClickHouse Cloud", EXPERIMENTAL) \ + DECLARE(Bool, force_read_through_cache_for_merges, false, "Force read-through filesystem cache for merges", EXPERIMENTAL) \ + DECLARE(Bool, allow_experimental_replacing_merge_with_cleanup, false, "Allow experimental CLEANUP merges for ReplacingMergeTree with is_deleted column.", EXPERIMENTAL) \ \ /** Compress marks and primary key. */ \ DECLARE(Bool, compress_marks, true, "Marks support compression, reduce mark file size and speed up network transmission.", 0) \ @@ -649,6 +649,7 @@ void MergeTreeSettings::dumpToSystemMergeTreeSettingsColumns(MutableColumnsAndCo res_columns[6]->insert(writability == SettingConstraintWritability::CONST); res_columns[7]->insert(setting.getTypeName()); res_columns[8]->insert(setting.getTier() == SettingsTierType::OBSOLETE); + res_columns[9]->insert(setting.getTier()); } } diff --git a/src/Storages/System/StorageSystemMergeTreeSettings.cpp b/src/Storages/System/StorageSystemMergeTreeSettings.cpp index 35d975216f6..1da4835dba5 100644 --- a/src/Storages/System/StorageSystemMergeTreeSettings.cpp +++ b/src/Storages/System/StorageSystemMergeTreeSettings.cpp @@ -1,4 +1,5 @@ -#include +#include +#include #include #include #include @@ -30,6 +31,14 @@ ColumnsDescription SystemMergeTreeSettings::getColumnsDescription() }, {"type", std::make_shared(), "Setting type (implementation specific string value)."}, {"is_obsolete", std::make_shared(), "Shows whether a setting is obsolete."}, + {"tier", getSettingsTierEnum(), R"( +Support level for this feature. ClickHouse features are organized in tiers, varying depending on the current status of their +development and the expectations one might have when using them: +* PRODUCTION: The feature is stable, safe to use and does not have issues interacting with other PRODUCTION features. +* BETA: The feature is stable and safe. The outcome of using it together with other features is unknown and correctness is not guaranteed. Testing and reports are welcome. +* EXPERIMENTAL: The feature is under development. Only intended for developers and ClickHouse enthusiasts. The feature might or might not work and could be removed at any time. +* OBSOLETE: No longer supported. Either it is already removed or it will be removed in future releases. 
+)"}, }; } From 08d070d982ababa39f726480efc4ba76d85f365e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 28 Oct 2024 17:46:11 +0100 Subject: [PATCH 16/80] Add basic test for setting tiers --- .../queries/0_stateless/03257_setting_tiers.reference | 10 ++++++++++ tests/queries/0_stateless/03257_setting_tiers.sql | 11 +++++++++++ 2 files changed, 21 insertions(+) create mode 100644 tests/queries/0_stateless/03257_setting_tiers.reference create mode 100644 tests/queries/0_stateless/03257_setting_tiers.sql diff --git a/tests/queries/0_stateless/03257_setting_tiers.reference b/tests/queries/0_stateless/03257_setting_tiers.reference new file mode 100644 index 00000000000..d3d171221e8 --- /dev/null +++ b/tests/queries/0_stateless/03257_setting_tiers.reference @@ -0,0 +1,10 @@ +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/03257_setting_tiers.sql b/tests/queries/0_stateless/03257_setting_tiers.sql new file mode 100644 index 00000000000..c7ffe87a80b --- /dev/null +++ b/tests/queries/0_stateless/03257_setting_tiers.sql @@ -0,0 +1,11 @@ +SELECT count() > 0 FROM system.settings WHERE tier = 'Production'; +SELECT count() > 0 FROM system.settings WHERE tier = 'Beta'; +SELECT count() > 0 FROM system.settings WHERE tier = 'Experimental'; +SELECT count() > 0 FROM system.settings WHERE tier = 'Obsolete'; +SELECT count() == countIf(tier IN ['Production', 'Beta', 'Experimental', 'Obsolete']) FROM system.settings; + +SELECT count() > 0 FROM system.merge_tree_settings WHERE tier = 'Production'; +SELECT count() > 0 FROM system.merge_tree_settings WHERE tier = 'Beta'; +SELECT count() > 0 FROM system.merge_tree_settings WHERE tier = 'Experimental'; +SELECT count() > 0 FROM system.merge_tree_settings WHERE tier = 'Obsolete'; +SELECT count() == countIf(tier IN ['Production', 'Beta', 'Experimental', 'Obsolete']) FROM system.merge_tree_settings; From f89887de6a6d5ffa7a5e8eec20a4a2358fed4410 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 28 Oct 2024 18:18:09 +0100 Subject: [PATCH 17/80] Adjust existing tests --- .../queries/0_stateless/01221_system_settings.reference | 4 ++-- .../0_stateless/02117_show_create_table_system.reference | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/01221_system_settings.reference b/tests/queries/0_stateless/01221_system_settings.reference index 32a0ed11b6c..821d2e386a9 100644 --- a/tests/queries/0_stateless/01221_system_settings.reference +++ b/tests/queries/0_stateless/01221_system_settings.reference @@ -1,4 +1,4 @@ -send_timeout 300 0 Timeout for sending data to the network, in seconds. If a client needs to send some data but is not able to send any bytes in this interval, the exception is thrown. If you set this setting on the client, the \'receive_timeout\' for the socket will also be set on the corresponding connection end on the server. \N \N 0 Seconds 300 0 -storage_policy default 0 Name of storage disk policy \N \N 0 String 0 +send_timeout 300 0 Timeout for sending data to the network, in seconds. If a client needs to send some data but is not able to send any bytes in this interval, the exception is thrown. If you set this setting on the client, the \'receive_timeout\' for the socket will also be set on the corresponding connection end on the server. 
\N \N 0 Seconds 300 0 Production +storage_policy default 0 Name of storage disk policy \N \N 0 String 0 Production 1 1 diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index b260e2dce6c..2ea62444cff 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -342,7 +342,8 @@ CREATE TABLE system.merge_tree_settings `max` Nullable(String), `readonly` UInt8, `type` String, - `is_obsolete` UInt8 + `is_obsolete` UInt8, + `tier` Enum8('Production' = 0, 'Obsolete' = 4, 'Experimental' = 8, 'Beta' = 12) ) ENGINE = SystemMergeTreeSettings COMMENT 'Contains a list of all MergeTree engine specific settings, their current and default values along with descriptions. You may change any of them in SETTINGS section in CREATE query.' @@ -932,7 +933,8 @@ CREATE TABLE system.replicated_merge_tree_settings `max` Nullable(String), `readonly` UInt8, `type` String, - `is_obsolete` UInt8 + `is_obsolete` UInt8, + `tier` Enum8('Production' = 0, 'Obsolete' = 4, 'Experimental' = 8, 'Beta' = 12) ) ENGINE = SystemReplicatedMergeTreeSettings COMMENT 'Contains a list of all ReplicatedMergeTree engine specific settings, their current and default values along with descriptions. You may change any of them in SETTINGS section in CREATE query. ' @@ -1009,7 +1011,8 @@ CREATE TABLE system.settings `type` String, `default` String, `alias_for` String, - `is_obsolete` UInt8 + `is_obsolete` UInt8, + `tier` Enum8('Production' = 0, 'Obsolete' = 4, 'Experimental' = 8, 'Beta' = 12) ) ENGINE = SystemSettings COMMENT 'Contains a list of all user-level settings (which can be modified in a scope of query or session), their current and default values along with descriptions.' From 49655e71f5dc6ca87a41ef30de6bd8b2b53be354 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 28 Oct 2024 18:20:43 +0100 Subject: [PATCH 18/80] Update docs --- docs/en/operations/system-tables/merge_tree_settings.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/docs/en/operations/system-tables/merge_tree_settings.md b/docs/en/operations/system-tables/merge_tree_settings.md index 48217d63f9d..473315d3941 100644 --- a/docs/en/operations/system-tables/merge_tree_settings.md +++ b/docs/en/operations/system-tables/merge_tree_settings.md @@ -18,6 +18,11 @@ Columns: - `1` — Current user can’t change the setting. - `type` ([String](../../sql-reference/data-types/string.md)) — Setting type (implementation specific string value). - `is_obsolete` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) - Shows whether a setting is obsolete. +- `tier` ([Enum8](../../sql-reference/data-types/enum.md)) — Support level for this feature. ClickHouse features are organized in tiers, varying depending on the current status of their development and the expectations one might have when using them. Values: + - `'Production'` — The feature is stable, safe to use and does not have issues interacting with other **production** features. . + - `'Beta'` — The feature is stable and safe. The outcome of using it together with other features is unknown and correctness is not guaranteed. Testing and reports are welcome. + - `'Experimental'` — The feature is under development. Only intended for developers and ClickHouse enthusiasts. The feature might or might not work and could be removed at any time. + - `'Obsolete'` — No longer supported. 
Either it is already removed or it will be removed in future releases. **Example** ```sql From 6cf3da7982cf9c678388bf45e4092d778560eade Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 28 Oct 2024 20:53:39 +0000 Subject: [PATCH 19/80] better vertical final in Replacing --- .../Merges/Algorithms/ReplacingSortedAlgorithm.cpp | 13 +++---------- .../Merges/Algorithms/ReplacingSortedAlgorithm.h | 8 ++++++-- .../Transforms/SelectByIndicesTransform.h | 6 +++++- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp index 5059bc806a8..dbce348d1aa 100644 --- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.cpp @@ -5,8 +5,6 @@ #include #include #include -#include "Common/Logger.h" -#include namespace DB { @@ -165,13 +163,8 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge() if (enable_vertical_final) { - auto replace_final_selection = ColumnUInt64::create(chunk_num_rows); - auto & replace_final_data = replace_final_selection->getData(); - - std::iota(replace_final_data.begin(), replace_final_data.end(), 0); - current_chunk.getChunkInfos().add(std::make_shared(std::move(replace_final_selection))); - - Status status(std::move(current_chunk), false); + current_chunk.getChunkInfos().add(std::make_shared()); + Status status(std::move(current_chunk)); status.required_source = source_num; return status; } @@ -188,7 +181,7 @@ IMergingAlgorithm::Status ReplacingSortedAlgorithm::merge() out_row_sources_buf->write(row_source.data); } - Status status(merged_data->pull(), false); + Status status(merged_data->pull()); status.required_source = source_num; return status; } diff --git a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h index b0dd4fe4b08..ec366b900f5 100644 --- a/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h +++ b/src/Processors/Merges/Algorithms/ReplacingSortedAlgorithm.h @@ -13,8 +13,7 @@ class Logger; namespace DB { -/** Use in skipping final to keep list of indices of selected row after merging final - */ +//// Used in skipping final to keep the list of indices of selected rows after merging. struct ChunkSelectFinalIndices : public ChunkInfoCloneable { explicit ChunkSelectFinalIndices(MutableColumnPtr select_final_indices_); @@ -24,6 +23,11 @@ struct ChunkSelectFinalIndices : public ChunkInfoCloneable +{ +}; + /** Merges several sorted inputs into one. * For each group of consecutive identical values of the primary key (the columns by which the data is sorted), * keeps row with max `version` value. 
diff --git a/src/Processors/Transforms/SelectByIndicesTransform.h b/src/Processors/Transforms/SelectByIndicesTransform.h
index b44f5a3203e..e67d3bfde51 100644
--- a/src/Processors/Transforms/SelectByIndicesTransform.h
+++ b/src/Processors/Transforms/SelectByIndicesTransform.h
@@ -26,8 +26,12 @@ public:
     void transform(Chunk & chunk) override
     {
         size_t num_rows = chunk.getNumRows();
-        auto select_final_indices_info = chunk.getChunkInfos().extract<ChunkSelectFinalIndices>();
+        auto select_all_rows_info = chunk.getChunkInfos().extract<ChunkSelectFinalAllRows>();
+        if (select_all_rows_info)
+            return;
+
+        auto select_final_indices_info = chunk.getChunkInfos().extract<ChunkSelectFinalIndices>();
         if (!select_final_indices_info || !select_final_indices_info->select_final_indices)
             throw Exception(ErrorCodes::LOGICAL_ERROR, "Chunk passed to SelectByIndicesTransform without indices column");

From 4839c1d9cebd8e0a8f3221b250f4c90ae2910196 Mon Sep 17 00:00:00 2001
From: xmy
Date: Tue, 29 Oct 2024 18:42:35 +0800
Subject: [PATCH 20/80] Support write hdfs files with space

---
 .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 14 ++++++-------
 .../HDFS/WriteBufferFromHDFS.cpp              | 21 ++++++++++---------
 .../ObjectStorage/HDFS/WriteBufferFromHDFS.h  |  3 ++-
 tests/integration/test_storage_hdfs/test.py   | 15 +++++++++++++
 4 files changed, 35 insertions(+), 18 deletions(-)

diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
index 182534529ea..7698193ee2f 100644
--- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp
@@ -103,15 +103,15 @@ std::unique_ptr<WriteBufferFromFileBase> HDFSObjectStorage::writeObject( /// NOLINT
             ErrorCodes::UNSUPPORTED_METHOD,
             "HDFS API doesn't support custom attributes/metadata for stored objects");

-    std::string path = object.remote_path;
-    if (path.starts_with("/"))
-        path = path.substr(1);
-    if (!path.starts_with(url))
-        path = fs::path(url) / path;
-
+    auto path = extractObjectKeyFromURL(object);
    /// Single O_WRONLY in libhdfs adds O_TRUNC
     return std::make_unique<WriteBufferFromHDFS>(
-        path, config, settings->replication, patchSettings(write_settings), buf_size,
+        url_without_path,
+        fs::path(data_directory) / path,
+        config,
+        settings->replication,
+        patchSettings(write_settings),
+        buf_size,
        mode == WriteMode::Rewrite ? 
O_WRONLY : O_WRONLY | O_APPEND); } diff --git a/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp b/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp index 4f6f8c782f2..4879dc41d53 100644 --- a/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp +++ b/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.cpp @@ -29,6 +29,7 @@ extern const int CANNOT_FSYNC; struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl { std::string hdfs_uri; + std::string hdfs_file_path; hdfsFile fout; HDFSBuilderWrapper builder; HDFSFSPtr fs; @@ -36,25 +37,24 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl WriteBufferFromHDFSImpl( const std::string & hdfs_uri_, + const std::string & hdfs_file_path_, const Poco::Util::AbstractConfiguration & config_, int replication_, const WriteSettings & write_settings_, int flags) : hdfs_uri(hdfs_uri_) + , hdfs_file_path(hdfs_file_path_) , builder(createHDFSBuilder(hdfs_uri, config_)) , fs(createHDFSFS(builder.get())) , write_settings(write_settings_) { - const size_t begin_of_path = hdfs_uri.find('/', hdfs_uri.find("//") + 2); - const String path = hdfs_uri.substr(begin_of_path); - /// O_WRONLY meaning create or overwrite i.e., implies O_TRUNCAT here - fout = hdfsOpenFile(fs.get(), path.c_str(), flags, 0, replication_, 0); + fout = hdfsOpenFile(fs.get(), hdfs_file_path.c_str(), flags, 0, replication_, 0); if (fout == nullptr) { throw Exception(ErrorCodes::CANNOT_OPEN_FILE, "Unable to open HDFS file: {} ({}) error: {}", - path, hdfs_uri, std::string(hdfsGetLastError())); + hdfs_file_path, hdfs_uri, std::string(hdfsGetLastError())); } } @@ -71,7 +71,7 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl rlock.unlock(std::max(0, bytes_written)); if (bytes_written < 0) - throw Exception(ErrorCodes::NETWORK_ERROR, "Fail to write HDFS file: {} {}", hdfs_uri, std::string(hdfsGetLastError())); + throw Exception(ErrorCodes::NETWORK_ERROR, "Fail to write HDFS file: {}, hdfs_uri: {}, {}", hdfs_file_path, hdfs_uri, std::string(hdfsGetLastError())); if (write_settings.remote_throttler) write_settings.remote_throttler->add(bytes_written, ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds); @@ -83,20 +83,21 @@ struct WriteBufferFromHDFS::WriteBufferFromHDFSImpl { int result = hdfsSync(fs.get(), fout); if (result < 0) - throw ErrnoException(ErrorCodes::CANNOT_FSYNC, "Cannot HDFS sync {} {}", hdfs_uri, std::string(hdfsGetLastError())); + throw ErrnoException(ErrorCodes::CANNOT_FSYNC, "Cannot HDFS sync {}, hdfs_url: {}, {}", hdfs_file_path, hdfs_uri, std::string(hdfsGetLastError())); } }; WriteBufferFromHDFS::WriteBufferFromHDFS( - const std::string & hdfs_name_, + const std::string & hdfs_uri_, + const std::string & hdfs_file_path_, const Poco::Util::AbstractConfiguration & config_, int replication_, const WriteSettings & write_settings_, size_t buf_size_, int flags_) : WriteBufferFromFileBase(buf_size_, nullptr, 0) - , impl(std::make_unique(hdfs_name_, config_, replication_, write_settings_, flags_)) - , filename(hdfs_name_) + , impl(std::make_unique(hdfs_uri_, hdfs_file_path_, config_, replication_, write_settings_, flags_)) + , filename(hdfs_file_path_) { } diff --git a/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.h b/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.h index e3f0ae96a8f..8166da92e16 100644 --- a/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.h +++ b/src/Storages/ObjectStorage/HDFS/WriteBufferFromHDFS.h @@ -22,7 +22,8 @@ class WriteBufferFromHDFS final : public WriteBufferFromFileBase public: 
WriteBufferFromHDFS( - const String & hdfs_name_, + const String & hdfs_uri_, + const String & hdfs_file_path_, const Poco::Util::AbstractConfiguration & config_, int replication_, const WriteSettings & write_settings_ = {}, diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 362ea7d5bda..366bc28d2c9 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -396,6 +396,21 @@ def test_read_files_with_spaces(started_cluster): node1.query(f"drop table test") +def test_write_files_with_spaces(started_cluster): + fs = HdfsClient(hosts=started_cluster.hdfs_ip) + dir = "/itime=2024-10-24 10%3A02%3A04" + fs.mkdirs(dir) + + node1.query( + f"insert into function hdfs('hdfs://hdfs1:9000{dir}/test.csv', TSVRaw) select 123 settings hdfs_truncate_on_insert=1" + ) + result = node1.query( + f"select * from hdfs('hdfs://hdfs1:9000{dir}/test.csv', TSVRaw)" + ) + assert int(result) == 123 + fs.delete(dir, recursive=True) + + def test_truncate_table(started_cluster): hdfs_api = started_cluster.hdfs_api node1.query( From 646a48e36b1ecc00dd78a1af42c121eacb225575 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 29 Oct 2024 10:51:15 +0000 Subject: [PATCH 21/80] Escape special symbols in files for JSON subcolumns --- src/DataTypes/Serializations/ISerialization.cpp | 2 +- .../0_stateless/03257_json_escape_file_names.reference | 3 +++ .../0_stateless/03257_json_escape_file_names.sql | 10 ++++++++++ 3 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03257_json_escape_file_names.reference create mode 100644 tests/queries/0_stateless/03257_json_escape_file_names.sql diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index fdcdf9e0cda..3c3e9bdc9f9 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -206,7 +206,7 @@ String getNameForSubstreamPath( else if (it->type == SubstreamType::ObjectSharedData) stream_name += ".object_shared_data"; else if (it->type == SubstreamType::ObjectTypedPath || it->type == SubstreamType::ObjectDynamicPath) - stream_name += "." + it->object_path_name; + stream_name += "." 
+ escapeForFileName(it->object_path_name);
     }

     return stream_name;
diff --git a/tests/queries/0_stateless/03257_json_escape_file_names.reference b/tests/queries/0_stateless/03257_json_escape_file_names.reference
new file mode 100644
index 00000000000..f44e7d62cc1
--- /dev/null
+++ b/tests/queries/0_stateless/03257_json_escape_file_names.reference
@@ -0,0 +1,3 @@
+{"a-b-c":"43","a-b\\/c-d\\/e":"44","a\\/b\\/c":"42"}
+42 43 44
+42 43 44
diff --git a/tests/queries/0_stateless/03257_json_escape_file_names.sql b/tests/queries/0_stateless/03257_json_escape_file_names.sql
new file mode 100644
index 00000000000..9cc150170fd
--- /dev/null
+++ b/tests/queries/0_stateless/03257_json_escape_file_names.sql
@@ -0,0 +1,10 @@
+set allow_experimental_json_type = 1;
+drop table if exists test;
+create table test (json JSON) engine=MergeTree order by tuple() settings min_rows_for_wide_part=0, min_bytes_for_wide_part=0;
+insert into test format JSONAsObject {"a/b/c" : 42, "a-b-c" : 43, "a-b/c-d/e" : 44};
+
+select * from test;
+select json.`a/b/c`, json.`a-b-c`, json.`a-b/c-d/e` from test;
+select json.`a/b/c`.:Int64, json.`a-b-c`.:Int64, json.`a-b/c-d/e`.:Int64 from test;
+drop table test;
+
From 0d22cbe47fc7b38049157f5f8466e45b43f3e691 Mon Sep 17 00:00:00 2001
From: avogar
Date: Tue, 29 Oct 2024 11:08:27 +0000
Subject: [PATCH 22/80] Fix bad_weak_ptr exception with Dynamic in functions comparison

---
 src/Functions/FunctionsComparison.h | 4 ++--
 src/Functions/transform.cpp | 2 +-
 .../03258_dynamic_in_functions_weak_ptr_exception.reference | 0
 .../03258_dynamic_in_functions_weak_ptr_exception.sql | 6 ++++++
 4 files changed, 9 insertions(+), 3 deletions(-)
 create mode 100644 tests/queries/0_stateless/03258_dynamic_in_functions_weak_ptr_exception.reference
 create mode 100644 tests/queries/0_stateless/03258_dynamic_in_functions_weak_ptr_exception.sql

diff --git a/src/Functions/FunctionsComparison.h b/src/Functions/FunctionsComparison.h
index bd6f0361307..be0875581a5 100644
--- a/src/Functions/FunctionsComparison.h
+++ b/src/Functions/FunctionsComparison.h
@@ -1171,7 +1171,7 @@ public:

         if (left_tuple && right_tuple)
         {
-            auto func = FunctionToOverloadResolverAdaptor(std::make_shared<FunctionComparison<Op, Name>>(check_decimal_overflow));
+            auto func = std::make_shared<FunctionToOverloadResolverAdaptor>(std::make_shared<FunctionComparison<Op, Name>>(check_decimal_overflow));

             bool has_nullable = false;
             bool has_null = false;
@@ -1181,7 +1181,7 @@
             {
                 ColumnsWithTypeAndName args = {{nullptr, left_tuple->getElements()[i], ""},
                                                {nullptr, right_tuple->getElements()[i], ""}};
-                auto element_type = func.build(args)->getResultType();
+                auto element_type = func->build(args)->getResultType();
                 has_nullable = has_nullable || element_type->isNullable();
                 has_null = has_null || element_type->onlyNull();
             }
diff --git a/src/Functions/transform.cpp b/src/Functions/transform.cpp
index 45f0a7f5c17..e5445b36809 100644
--- a/src/Functions/transform.cpp
+++ b/src/Functions/transform.cpp
@@ -211,7 +211,7 @@ namespace

             ColumnsWithTypeAndName args = arguments;
             args[0].column = args[0].column->cloneResized(input_rows_count)->convertToFullColumnIfConst();
-            auto impl = FunctionToOverloadResolverAdaptor(std::make_shared<FunctionTransform>()).build(args);
+            auto impl = std::make_shared<FunctionToOverloadResolverAdaptor>(std::make_shared<FunctionTransform>())->build(args);

             return impl->execute(args, result_type, input_rows_count);
         }
diff --git a/tests/queries/0_stateless/03258_dynamic_in_functions_weak_ptr_exception.reference b/tests/queries/0_stateless/03258_dynamic_in_functions_weak_ptr_exception.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git 
a/tests/queries/0_stateless/03258_dynamic_in_functions_weak_ptr_exception.sql b/tests/queries/0_stateless/03258_dynamic_in_functions_weak_ptr_exception.sql new file mode 100644 index 00000000000..f825353c135 --- /dev/null +++ b/tests/queries/0_stateless/03258_dynamic_in_functions_weak_ptr_exception.sql @@ -0,0 +1,6 @@ +SET allow_experimental_dynamic_type = 1; +DROP TABLE IF EXISTS t0; +CREATE TABLE t0 (c0 Tuple(c1 Int,c2 Dynamic)) ENGINE = Memory(); +SELECT 1 FROM t0 tx JOIN t0 ty ON tx.c0 = ty.c0; +DROP TABLE t0; + From e2459c663deb7c1f573b0ee5418d0c5042193f16 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 29 Oct 2024 12:38:59 +0100 Subject: [PATCH 23/80] Fix tidy report --- src/Core/BaseSettings.cpp | 6 +++--- src/Core/BaseSettings.h | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Core/BaseSettings.cpp b/src/Core/BaseSettings.cpp index 51e99262bdb..9d55179a5db 100644 --- a/src/Core/BaseSettings.cpp +++ b/src/Core/BaseSettings.cpp @@ -32,14 +32,14 @@ void BaseSettingsHelpers::writeFlags(Flags flags, WriteBuffer & out) } -BaseSettingsHelpers::Flags BaseSettingsHelpers::readFlags(ReadBuffer & in) +UInt64 BaseSettingsHelpers::readFlags(ReadBuffer & in) { UInt64 res; readVarUInt(res, in); - return static_cast(res); + return res; } -SettingsTierType BaseSettingsHelpers::getTier(Flags flags) +SettingsTierType BaseSettingsHelpers::getTier(UInt64 flags) { int8_t tier = (flags & Flags::TIER); if (tier > SettingsTierType::BETA) diff --git a/src/Core/BaseSettings.h b/src/Core/BaseSettings.h index 218460330f4..949b884636f 100644 --- a/src/Core/BaseSettings.h +++ b/src/Core/BaseSettings.h @@ -38,9 +38,9 @@ struct BaseSettingsHelpers /// If adding new flags, consider first if Tier might need more bits }; - static SettingsTierType getTier(Flags flags); + static SettingsTierType getTier(UInt64 flags); static void writeFlags(Flags flags, WriteBuffer & out); - static Flags readFlags(ReadBuffer & in); + static UInt64 readFlags(ReadBuffer & in); }; /** Template class to define collections of settings. 
@@ -481,7 +481,7 @@ void BaseSettings::read(ReadBuffer & in, SettingsWriteFormat format) size_t index = accessor.find(name); using Flags = BaseSettingsHelpers::Flags; - Flags flags{0}; + UInt64 flags{0}; if (format >= SettingsWriteFormat::STRINGS_WITH_FLAGS) flags = BaseSettingsHelpers::readFlags(in); bool is_important = (flags & Flags::IMPORTANT); @@ -860,7 +860,7 @@ using AliasMap = std::unordered_map; String name; \ const char * type; \ const char * description; \ - BaseSettingsHelpers::Flags flags; \ + UInt64 flags; \ Field (*cast_value_util_function)(const Field &); \ String (*value_to_string_util_function)(const Field &); \ Field (*string_to_value_util_function)(const String &); \ @@ -972,7 +972,7 @@ struct DefineAliases #define IMPLEMENT_SETTINGS_TRAITS_(TYPE, NAME, DEFAULT, DESCRIPTION, FLAGS) \ res.field_infos.emplace_back( \ FieldInfo{#NAME, #TYPE, DESCRIPTION, \ - static_cast(FLAGS), \ + static_cast(FLAGS), \ [](const Field & value) -> Field { return static_cast(SettingField##TYPE{value}); }, \ [](const Field & value) -> String { return SettingField##TYPE{value}.toString(); }, \ [](const String & str) -> Field { SettingField##TYPE temp; temp.parseFromString(str); return static_cast(temp); }, \ From 6fa8153d1aac4d5a0b500cf040ca697a97b0c6f1 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 29 Oct 2024 14:23:34 +0000 Subject: [PATCH 24/80] Fix ignoring format settings in Native format via HTTP and Async Inserts --- src/Processors/Formats/Impl/NativeFormat.cpp | 10 ++++++---- .../Transforms/getSourceFromASTInsertQuery.cpp | 1 + ..._native_http_async_insert_settings.reference | 1 + .../03259_native_http_async_insert_settings.sh | 17 +++++++++++++++++ 4 files changed, 25 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/03259_native_http_async_insert_settings.reference create mode 100755 tests/queries/0_stateless/03259_native_http_async_insert_settings.sh diff --git a/src/Processors/Formats/Impl/NativeFormat.cpp b/src/Processors/Formats/Impl/NativeFormat.cpp index 5411e2e7811..022cb38596b 100644 --- a/src/Processors/Formats/Impl/NativeFormat.cpp +++ b/src/Processors/Formats/Impl/NativeFormat.cpp @@ -15,16 +15,17 @@ namespace DB class NativeInputFormat final : public IInputFormat { public: - NativeInputFormat(ReadBuffer & buf, const Block & header_, const FormatSettings & settings) + NativeInputFormat(ReadBuffer & buf, const Block & header_, const FormatSettings & settings_) : IInputFormat(header_, &buf) , reader(std::make_unique( buf, header_, 0, - settings, - settings.defaults_for_omitted_fields ? &block_missing_values : nullptr)) + settings_, + settings_.defaults_for_omitted_fields ? &block_missing_values : nullptr)) , header(header_) , block_missing_values(header.columns()) + , settings(settings_) { } @@ -55,7 +56,7 @@ public: void setReadBuffer(ReadBuffer & in_) override { - reader = std::make_unique(in_, header, 0); + reader = std::make_unique(in_, header, 0, settings, settings.defaults_for_omitted_fields ? 
&block_missing_values : nullptr); IInputFormat::setReadBuffer(in_); } @@ -67,6 +68,7 @@ private: std::unique_ptr reader; Block header; BlockMissingValues block_missing_values; + const FormatSettings settings; size_t approx_bytes_read_for_chunk = 0; }; diff --git a/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp b/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp index 648ed9751ff..0c00baeabf7 100644 --- a/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp +++ b/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp @@ -16,6 +16,7 @@ #include "IO/CompressionMethod.h" #include #include +#include namespace DB diff --git a/tests/queries/0_stateless/03259_native_http_async_insert_settings.reference b/tests/queries/0_stateless/03259_native_http_async_insert_settings.reference new file mode 100644 index 00000000000..573541ac970 --- /dev/null +++ b/tests/queries/0_stateless/03259_native_http_async_insert_settings.reference @@ -0,0 +1 @@ +0 diff --git a/tests/queries/0_stateless/03259_native_http_async_insert_settings.sh b/tests/queries/0_stateless/03259_native_http_async_insert_settings.sh new file mode 100755 index 00000000000..c0934b06cc7 --- /dev/null +++ b/tests/queries/0_stateless/03259_native_http_async_insert_settings.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + + +$CLICKHOUSE_CLIENT -q "drop table if exists test" +$CLICKHOUSE_CLIENT -q "create table test (x UInt32) engine=Memory"; + +url="${CLICKHOUSE_URL}&async_insert=1&wait_for_async_insert=1" + +$CLICKHOUSE_LOCAL -q "select NULL::Nullable(UInt32) as x format Native" | ${CLICKHOUSE_CURL} -sS "$url&query=INSERT%20INTO%20test%20FORMAT%20Native" --data-binary @- + +$CLICKHOUSE_CLIENT -q "select * from test"; +$CLICKHOUSE_CLIENT -q "drop table test" + From 414d04690e9f431d34b175d3c732eaea350d6cb3 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 29 Oct 2024 14:25:27 +0000 Subject: [PATCH 25/80] Remove unneeded include --- src/Processors/Transforms/getSourceFromASTInsertQuery.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp b/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp index 0c00baeabf7..648ed9751ff 100644 --- a/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp +++ b/src/Processors/Transforms/getSourceFromASTInsertQuery.cpp @@ -16,7 +16,6 @@ #include "IO/CompressionMethod.h" #include #include -#include namespace DB From 858162ce2c229002e808d0d1acc2e100df79b8e0 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 29 Oct 2024 15:06:03 +0000 Subject: [PATCH 26/80] add a perf test --- .../replacing_final_non_intersecting.xml | 26 +++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 tests/performance/replacing_final_non_intersecting.xml diff --git a/tests/performance/replacing_final_non_intersecting.xml b/tests/performance/replacing_final_non_intersecting.xml new file mode 100644 index 00000000000..b3d32f1ca2e --- /dev/null +++ b/tests/performance/replacing_final_non_intersecting.xml @@ -0,0 +1,26 @@ + + + + 0 + 0 + + + + CREATE TABLE replacing_final_non_intersecting (d DateTime, c1 UInt64, c2 String, c3 LowCardinality(String)) + ENGINE = ReplacingMergeTree() + ORDER BY d + + + INSERT INTO replacing_final_non_intersecting SELECT toDateTime('2020-10-10 00:00:00') - number, number, toString(number), toString(number % 1000) FROM numbers(0, 5000000) + OPTIMIZE TABLE 
replacing_final_non_intersecting FINAL + SYSTEM STOP MERGES replacing_final_non_intersecting + INSERT INTO replacing_final_non_intersecting SELECT toDateTime('2020-10-10 00:00:00') - number, number, toString(number), toString(number % 1000) FROM numbers(5000000, 500000) + + SELECT * FROM replacing_final_non_intersecting FINAL FORMAT Null SETTINGS enable_vertical_final = 0 + SELECT * FROM replacing_final_non_intersecting FINAL FORMAT Null SETTINGS enable_vertical_final = 1 + + DROP TABLE IF EXISTS replacing_final_non_intersecting + From f73ff65edb2824a744462dbb4dc760fe22ee3648 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 29 Oct 2024 19:00:38 +0000 Subject: [PATCH 27/80] Fix tests --- src/DataTypes/Serializations/ISerialization.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index 3c3e9bdc9f9..7522248e088 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -161,7 +161,7 @@ String getNameForSubstreamPath( String stream_name, SubstreamIterator begin, SubstreamIterator end, - bool escape_tuple_delimiter) + bool escape_for_file_name) { using Substream = ISerialization::Substream; @@ -186,7 +186,7 @@ String getNameForSubstreamPath( /// Because nested data may be represented not by Array of Tuple, /// but by separate Array columns with names in a form of a.b, /// and name is encoded as a whole. - if (it->type == Substream::TupleElement && escape_tuple_delimiter) + if (it->type == Substream::TupleElement && escape_for_file_name) stream_name += escapeForFileName(substream_name); else stream_name += substream_name; @@ -206,7 +206,7 @@ String getNameForSubstreamPath( else if (it->type == SubstreamType::ObjectSharedData) stream_name += ".object_shared_data"; else if (it->type == SubstreamType::ObjectTypedPath || it->type == SubstreamType::ObjectDynamicPath) - stream_name += "." + escapeForFileName(it->object_path_name); + stream_name += "." + (escape_for_file_name ? 
escapeForFileName(it->object_path_name) : it->object_path_name); } return stream_name; From 091db0a9845099a2b88b22eb2a73996c9ab8d1bf Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 30 Oct 2024 10:19:08 +0000 Subject: [PATCH 28/80] Fix kafka test --- tests/integration/test_storage_kafka/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 0bade55415f..999324b563a 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -4193,7 +4193,7 @@ def test_kafka_formats_with_broken_message(kafka_cluster, create_query_generator ], "expected": { "raw_message": "050102696405496E743634000000000000000007626C6F636B4E6F06537472696E67034241440476616C3106537472696E6702414D0476616C3207466C6F617433320000003F0476616C330555496E743801", - "error": "Cannot convert: String to UInt16", + "error": "Cannot parse string \'BAD\' as UInt16", }, "printable": False, }, From d6acaeae5ac604816159e33dbd29abbde819086c Mon Sep 17 00:00:00 2001 From: Hiroaki Nakamura Date: Wed, 30 Oct 2024 19:43:04 +0900 Subject: [PATCH 29/80] Fix doc for CREATE MATERIALIZED VIEW ON CLUSTER --- docs/en/sql-reference/statements/create/view.md | 2 +- docs/ru/sql-reference/statements/create/view.md | 2 +- docs/zh/sql-reference/statements/create/view.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 0e5d5250e0f..c770348bce0 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -55,7 +55,7 @@ SELECT * FROM view(column1=value1, column2=value2 ...) ## Materialized View ``` sql -CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] +CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] [TO[db.]name] [ENGINE = engine] [POPULATE] [DEFINER = { user | CURRENT_USER }] [SQL SECURITY { DEFINER | INVOKER | NONE }] AS SELECT ... [COMMENT 'comment'] diff --git a/docs/ru/sql-reference/statements/create/view.md b/docs/ru/sql-reference/statements/create/view.md index 8fa30446bb3..5dbffd90205 100644 --- a/docs/ru/sql-reference/statements/create/view.md +++ b/docs/ru/sql-reference/statements/create/view.md @@ -39,7 +39,7 @@ SELECT a, b, c FROM (SELECT ...) ## Материализованные представления {#materialized} ``` sql -CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] +CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] [TO[db.]name] [ENGINE = engine] [POPULATE] [DEFINER = { user | CURRENT_USER }] [SQL SECURITY { DEFINER | INVOKER | NONE }] AS SELECT ... ``` diff --git a/docs/zh/sql-reference/statements/create/view.md b/docs/zh/sql-reference/statements/create/view.md index 49a1d66bdf1..6c93240644d 100644 --- a/docs/zh/sql-reference/statements/create/view.md +++ b/docs/zh/sql-reference/statements/create/view.md @@ -39,7 +39,7 @@ SELECT a, b, c FROM (SELECT ...) ## Materialized {#materialized} ``` sql -CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ... +CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster_name] [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ... 
``` 物化视图存储由相应的[SELECT](../../../sql-reference/statements/select/index.md)管理. From 486f4512d17492d2e950843f18d358f5a060a3c4 Mon Sep 17 00:00:00 2001 From: Christoph Wurm Date: Wed, 30 Oct 2024 10:53:54 +0000 Subject: [PATCH 30/80] Add missing sources grants for Kafka, NATS and RabbitMQ. --- src/Access/Common/AccessType.h | 3 +++ src/Access/ContextAccess.cpp | 5 ++++- src/Storages/Kafka/StorageKafkaUtils.cpp | 1 + src/Storages/NATS/StorageNATS.cpp | 8 +++++++- src/Storages/RabbitMQ/StorageRabbitMQ.cpp | 8 +++++++- 5 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index e9f24a8c685..fe34618c490 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -237,6 +237,9 @@ enum class AccessType : uint8_t M(S3, "", GLOBAL, SOURCES) \ M(HIVE, "", GLOBAL, SOURCES) \ M(AZURE, "", GLOBAL, SOURCES) \ + M(KAFKA, "", GLOBAL, SOURCES) \ + M(NATS, "", GLOBAL, SOURCES) \ + M(RABBITMQ, "", GLOBAL, SOURCES) \ M(SOURCES, "", GROUP, ALL) \ \ M(CLUSTER, "", GLOBAL, ALL) /* ON CLUSTER queries */ \ diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index 949fd37e403..f47cd53b137 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -52,7 +52,10 @@ namespace {AccessType::HDFS, "HDFS"}, {AccessType::S3, "S3"}, {AccessType::HIVE, "Hive"}, - {AccessType::AZURE, "AzureBlobStorage"} + {AccessType::AZURE, "AzureBlobStorage"}, + {AccessType::KAFKA, "Kafka"}, + {AccessType::NATS, "NATS"}, + {AccessType::RABBITMQ, "RabbitMQ"} }; diff --git a/src/Storages/Kafka/StorageKafkaUtils.cpp b/src/Storages/Kafka/StorageKafkaUtils.cpp index dd954d6a7c2..119aadd11d8 100644 --- a/src/Storages/Kafka/StorageKafkaUtils.cpp +++ b/src/Storages/Kafka/StorageKafkaUtils.cpp @@ -308,6 +308,7 @@ void registerStorageKafka(StorageFactory & factory) creator_fn, StorageFactory::StorageFeatures{ .supports_settings = true, + .source_access_type = AccessType::KAFKA, }); } diff --git a/src/Storages/NATS/StorageNATS.cpp b/src/Storages/NATS/StorageNATS.cpp index 123f5adc22d..5a51f078e7b 100644 --- a/src/Storages/NATS/StorageNATS.cpp +++ b/src/Storages/NATS/StorageNATS.cpp @@ -786,7 +786,13 @@ void registerStorageNATS(StorageFactory & factory) return std::make_shared(args.table_id, args.getContext(), args.columns, args.comment, std::move(nats_settings), args.mode); }; - factory.registerStorage("NATS", creator_fn, StorageFactory::StorageFeatures{ .supports_settings = true, }); + factory.registerStorage( + "NATS", + creator_fn, + StorageFactory::StorageFeatures{ + .supports_settings = true, + .source_access_type = AccessType::NATS, + }); } } diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index 0f3ac2d5289..3e922b541f7 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -1322,7 +1322,13 @@ void registerStorageRabbitMQ(StorageFactory & factory) return std::make_shared(args.table_id, args.getContext(), args.columns, args.comment, std::move(rabbitmq_settings), args.mode); }; - factory.registerStorage("RabbitMQ", creator_fn, StorageFactory::StorageFeatures{ .supports_settings = true, }); + factory.registerStorage( + "RabbitMQ", + creator_fn, + StorageFactory::StorageFeatures{ + .supports_settings = true, + .source_access_type = AccessType::RABBITMQ, + }); } } From 3b0273a5d30b447ca00c1ded40ce937fd358604f Mon Sep 17 00:00:00 2001 From: Christoph Wurm Date: Wed, 30 Oct 2024 11:02:22 +0000 Subject: [PATCH 31/80] 
Docs --- docs/en/sql-reference/statements/grant.md | 6 ++++ docs/ru/sql-reference/statements/grant.md | 38 +++++++++++++++++------ docs/zh/sql-reference/statements/grant.md | 38 +++++++++++++++++------ 3 files changed, 62 insertions(+), 20 deletions(-) diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index c11299baf38..d4a3e128b13 100644 --- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -238,10 +238,13 @@ Hierarchy of privileges: - `HDFS` - `HIVE` - `JDBC` + - `KAFKA` - `MONGO` - `MYSQL` + - `NATS` - `ODBC` - `POSTGRES` + - `RABBITMQ` - `REDIS` - `REMOTE` - `S3` @@ -520,10 +523,13 @@ Allows using external data sources. Applies to [table engines](../../engines/tab - `HDFS`. Level: `GLOBAL` - `HIVE`. Level: `GLOBAL` - `JDBC`. Level: `GLOBAL` + - `KAFKA`. Level: `GLOBAL` - `MONGO`. Level: `GLOBAL` - `MYSQL`. Level: `GLOBAL` + - `NATS`. Level: `GLOBAL` - `ODBC`. Level: `GLOBAL` - `POSTGRES`. Level: `GLOBAL` + - `RABBITMQ`. Level: `GLOBAL` - `REDIS`. Level: `GLOBAL` - `REMOTE`. Level: `GLOBAL` - `S3`. Level: `GLOBAL` diff --git a/docs/ru/sql-reference/statements/grant.md b/docs/ru/sql-reference/statements/grant.md index 2ccc2d05452..79682dc42cd 100644 --- a/docs/ru/sql-reference/statements/grant.md +++ b/docs/ru/sql-reference/statements/grant.md @@ -192,14 +192,23 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION - `addressToSymbol` - `demangle` - [SOURCES](#grant-sources) + - `AZURE` - `FILE` - - `URL` - - `REMOTE` - - `MYSQL` - - `ODBC` - - `JDBC` - `HDFS` + - `HIVE` + - `JDBC` + - `KAFKA` + - `MONGO` + - `MYSQL` + - `NATS` + - `ODBC` + - `POSTGRES` + - `RABBITMQ` + - `REDIS` + - `REMOTE` - `S3` + - `SQLITE` + - `URL` - [dictGet](#grant-dictget) Примеры того, как трактуется данная иерархия: @@ -461,14 +470,23 @@ GRANT INSERT(x,y) ON db.table TO john Разрешает использовать внешние источники данных. Применяется к [движкам таблиц](../../engines/table-engines/index.md) и [табличным функциям](../table-functions/index.md#table-functions). - `SOURCES`. Уровень: `GROUP` + - `AZURE`. Уровень: `GLOBAL` - `FILE`. Уровень: `GLOBAL` - - `URL`. Уровень: `GLOBAL` - - `REMOTE`. Уровень: `GLOBAL` - - `MYSQL`. Уровень: `GLOBAL` - - `ODBC`. Уровень: `GLOBAL` - - `JDBC`. Уровень: `GLOBAL` - `HDFS`. Уровень: `GLOBAL` + - `HIVE`. Уровень: `GLOBAL` + - `JDBC`. Уровень: `GLOBAL` + - `KAFKA`. Уровень: `GLOBAL` + - `MONGO`. Уровень: `GLOBAL` + - `MYSQL`. Уровень: `GLOBAL` + - `NATS`. Уровень: `GLOBAL` + - `ODBC`. Уровень: `GLOBAL` + - `POSTGRES`. Уровень: `GLOBAL` + - `RABBITMQ`. Уровень: `GLOBAL` + - `REDIS`. Уровень: `GLOBAL` + - `REMOTE`. Уровень: `GLOBAL` - `S3`. Уровень: `GLOBAL` + - `SQLITE`. Уровень: `GLOBAL` + - `URL`. Уровень: `GLOBAL` Привилегия `SOURCES` разрешает использование всех источников. Также вы можете присвоить привилегию для каждого источника отдельно. Для использования источников необходимы дополнительные привилегии. 
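Below, the same lists are mirrored in the Chinese documentation. As a hedged usage sketch (the user name `ingest_user` is invented; the privilege keywords are the ones documented above), granting the new message-queue sources follows the existing source-grant syntax:

```sql
-- Sketch: grant the new source-level privileges globally, then revoke one.
GRANT KAFKA, NATS, RABBITMQ ON *.* TO ingest_user;
REVOKE RABBITMQ ON *.* FROM ingest_user;
```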
diff --git a/docs/zh/sql-reference/statements/grant.md b/docs/zh/sql-reference/statements/grant.md index fea51d590d5..3fd314c791f 100644 --- a/docs/zh/sql-reference/statements/grant.md +++ b/docs/zh/sql-reference/statements/grant.md @@ -170,14 +170,23 @@ GRANT SELECT(x,y) ON db.table TO john WITH GRANT OPTION - `addressToSymbol` - `demangle` - [SOURCES](#grant-sources) + - `AZURE` - `FILE` - - `URL` - - `REMOTE` - - `YSQL` - - `ODBC` - - `JDBC` - `HDFS` + - `HIVE` + - `JDBC` + - `KAFKA` + - `MONGO` + - `MYSQL` + - `NATS` + - `ODBC` + - `POSTGRES` + - `RABBITMQ` + - `REDIS` + - `REMOTE` - `S3` + - `SQLITE` + - `URL` - [dictGet](#grant-dictget) 如何对待该层级的示例: @@ -428,14 +437,23 @@ GRANT INSERT(x,y) ON db.table TO john 允许在 [table engines](../../engines/table-engines/index.md) 和 [table functions](../../sql-reference/table-functions/index.md#table-functions)中使用外部数据源。 - `SOURCES`. 级别: `GROUP` + - `AZURE`. 级别: `GLOBAL` - `FILE`. 级别: `GLOBAL` - - `URL`. 级别: `GLOBAL` - - `REMOTE`. 级别: `GLOBAL` - - `YSQL`. 级别: `GLOBAL` - - `ODBC`. 级别: `GLOBAL` - - `JDBC`. 级别: `GLOBAL` - `HDFS`. 级别: `GLOBAL` + - `HIVE`. 级别: `GLOBAL` + - `JDBC`. 级别: `GLOBAL` + - `KAFKA`. 级别: `GLOBAL` + - `MONGO`. 级别: `GLOBAL` + - `MYSQL`. 级别: `GLOBAL` + - `NATS`. 级别: `GLOBAL` + - `ODBC`. 级别: `GLOBAL` + - `POSTGRES`. 级别: `GLOBAL` + - `RABBITMQ`. 级别: `GLOBAL` + - `REDIS`. 级别: `GLOBAL` + - `REMOTE`. 级别: `GLOBAL` - `S3`. 级别: `GLOBAL` + - `SQLITE`. 级别: `GLOBAL` + - `URL`. 级别: `GLOBAL` `SOURCES` 权限允许使用所有数据源。当然也可以单独对每个数据源进行授权。要使用数据源时,还需要额外的权限。 From fae5b1170910d8e2b6cd0bf7e12b9a72cbb9bb67 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 30 Oct 2024 12:06:30 +0000 Subject: [PATCH 32/80] Fix #69010 --- src/Interpreters/Cache/QueryCache.cpp | 37 +++++++++++++++++-- .../02494_query_cache_system_tables.sql | 8 +++- 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/src/Interpreters/Cache/QueryCache.cpp b/src/Interpreters/Cache/QueryCache.cpp index c766c5209fc..cfd7608b6c6 100644 --- a/src/Interpreters/Cache/QueryCache.cpp +++ b/src/Interpreters/Cache/QueryCache.cpp @@ -89,11 +89,40 @@ struct HasSystemTablesMatcher { database_table = identifier->name(); } - /// Handle SELECT [...] FROM clusterAllReplicas(, '') - else if (const auto * literal = node->as()) + /// SELECT [...] FROM clusterAllReplicas(, '
')
+    /// This SQL syntax is quite common but we need to be careful. A naive attempt to cast 'node' to an ASTLiteral will be too general
+    /// and introduce false positives in queries like
+    /// 'SELECT * FROM users WHERE name = 'system.metrics' SETTINGS use_query_cache = true;'
+    /// Therefore, make sure we are really in `clusterAllReplicas`. EXPLAIN AST for
+    /// 'SELECT * FROM clusterAllReplicas('default', system.one) SETTINGS use_query_cache = 1'
+    /// returns:
+    /// [...]
+    /// Function clusterAllReplicas (children 1)
+    /// ExpressionList (children 2)
+    /// Literal 'test_shard_localhost'
+    /// Literal 'system.one'
+    /// [...]
+    else if (const auto * function = node->as<ASTFunction>())
     {
-        const auto & value = literal->value;
-        database_table = toString(value);
+        if (function->name == "clusterAllReplicas")
+        {
+            const ASTs & function_children = function->children;
+            if (!function_children.empty())
+            {
+                if (const auto * expression_list = function_children[0]->as<ASTExpressionList>())
+                {
+                    const ASTs & expression_list_children = expression_list->children;
+                    if (!expression_list_children.empty())
+                    {
+                        if (const auto * literal = expression_list_children[1]->as<ASTLiteral>())
+                        {
+                            const auto & value = literal->value;
+                            database_table = toString(value);
+                        }
+                    }
+                }
+            }
+        }
     }

     Tokens tokens(database_table.c_str(), database_table.c_str() + database_table.size(), /*max_query_size*/ 2048, /*skip_insignificant*/ true);
diff --git a/tests/queries/0_stateless/02494_query_cache_system_tables.sql b/tests/queries/0_stateless/02494_query_cache_system_tables.sql
index 7c9f01c4e91..12eaec0f8bc 100644
--- a/tests/queries/0_stateless/02494_query_cache_system_tables.sql
+++ b/tests/queries/0_stateless/02494_query_cache_system_tables.sql
@@ -44,9 +44,16 @@ SELECT * SETTINGS use_query_cache = 1;
 SELECT * FROM information_schema.tables SETTINGS use_query_cache = 1; -- { serverError QUERY_CACHE_USED_WITH_SYSTEM_TABLE }
 SELECT * FROM INFORMATION_SCHEMA.TABLES SETTINGS use_query_cache = 1; -- { serverError QUERY_CACHE_USED_WITH_SYSTEM_TABLE }

+-- Issue #69010: A system table name appears as a literal. That's okay and must not throw.
+DROP TABLE IF EXISTS tab;
+CREATE TABLE tab (uid Int16, name String) ENGINE = Memory;
+SELECT * FROM tab WHERE name = 'system.one' SETTINGS use_query_cache = true;
+DROP TABLE tab;
+
 -- System tables can be "hidden" inside e.g. table functions
 SELECT * FROM clusterAllReplicas('test_shard_localhost', system.one) SETTINGS use_query_cache = 1; -- {serverError QUERY_CACHE_USED_WITH_SYSTEM_TABLE }
 SELECT * FROM clusterAllReplicas('test_shard_localhost', 'system.one') SETTINGS use_query_cache = 1; -- {serverError QUERY_CACHE_USED_WITH_SYSTEM_TABLE }
+-- Note how in the previous query ^^ 'system.one' is also a literal. ClusterAllReplicas gets special handling.

 -- Criminal edge case that a user creates a table named "system". The query cache must not reject queries against it.
DROP TABLE IF EXISTS system; @@ -60,5 +67,4 @@ CREATE TABLE system.system (c UInt64) ENGINE = Memory; SElECT * FROM system.system SETTINGS use_query_cache = 1; -- { serverError QUERY_CACHE_USED_WITH_SYSTEM_TABLE } DROP TABLE system.system; --- Cleanup SYSTEM DROP QUERY CACHE; From bbbb81f43dfa09cc1727b8596685a6acfe57ea9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 30 Oct 2024 13:23:48 +0100 Subject: [PATCH 33/80] Improvements based on review --- src/Core/BaseSettings.cpp | 2 +- src/Core/Settings.cpp | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Core/BaseSettings.cpp b/src/Core/BaseSettings.cpp index 9d55179a5db..2cce94f9d0a 100644 --- a/src/Core/BaseSettings.cpp +++ b/src/Core/BaseSettings.cpp @@ -41,7 +41,7 @@ UInt64 BaseSettingsHelpers::readFlags(ReadBuffer & in) SettingsTierType BaseSettingsHelpers::getTier(UInt64 flags) { - int8_t tier = (flags & Flags::TIER); + int8_t tier = static_cast(flags & Flags::TIER); if (tier > SettingsTierType::BETA) throw Exception(ErrorCodes::INCORRECT_DATA, "Unknown tier value: '{}'", tier); return SettingsTierType{tier}; diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 4159758fe76..aa9b7fd817b 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -38,7 +38,9 @@ namespace ErrorCodes * Note: as an alternative, we could implement settings to be completely dynamic in the form of the map: String -> Field, * but we are not going to do it, because settings are used everywhere as static struct fields. * - * `flags` can be either 0 or IMPORTANT + a Tier (PRODUCTION | BETA | EXPERIMENTAL) + * `flags` can include a Tier (BETA | EXPERIMENTAL) and an optional bitwise AND with IMPORTANT. + * The default (0) means a PRODUCTION ready setting + * * A setting is "IMPORTANT" if it affects the results of queries and can't be ignored by older versions. * Tiers: * EXPERIMENTAL: The feature is in active development stage. Mostly for developers or for ClickHouse enthusiasts. @@ -5824,8 +5826,6 @@ Experimental data deduplication for SELECT queries based on part UUIDs \ /* ####################################################### */ \ /* ############ END OF EXPERIMENTAL FEATURES ############# */ \ - /* ## ADD PRODUCTION / BETA FEATURES BEFORE THIS BLOCK ## */ \ - /* ####################################################### */ \ /* ####################################################### */ \ // End of COMMON_SETTINGS From 4364be72f1983fc8306eb5e4e209c71d64a0e71a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 30 Oct 2024 13:27:12 +0100 Subject: [PATCH 34/80] Mark merge_selector_algorithm as experimental --- src/Core/Settings.cpp | 3 ++- src/Storages/MergeTree/MergeTreeSettings.cpp | 8 +++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index aa9b7fd817b..1c392d2c547 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -5905,13 +5905,14 @@ Experimental data deduplication for SELECT queries based on part UUIDs /** The section above is for obsolete settings. Do not add anything there. 
*/ #endif /// __CLION_IDE__ - #define LIST_OF_SETTINGS(M, ALIAS) \ COMMON_SETTINGS(M, ALIAS) \ OBSOLETE_SETTINGS(M, ALIAS) \ FORMAT_FACTORY_SETTINGS(M, ALIAS) \ OBSOLETE_FORMAT_SETTINGS(M, ALIAS) \ +// clang-format on + DECLARE_SETTINGS_TRAITS_ALLOW_CUSTOM_SETTINGS(SettingsTraits, LIST_OF_SETTINGS) IMPLEMENT_SETTINGS_TRAITS(SettingsTraits, LIST_OF_SETTINGS) diff --git a/src/Storages/MergeTree/MergeTreeSettings.cpp b/src/Storages/MergeTree/MergeTreeSettings.cpp index 36e146f4624..38c8f389fbe 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.cpp +++ b/src/Storages/MergeTree/MergeTreeSettings.cpp @@ -30,10 +30,11 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +// clang-format off + /** These settings represent fine tunes for internal details of MergeTree storages * and should not be changed by the user without a reason. */ - #define MERGE_TREE_SETTINGS(DECLARE, ALIAS) \ DECLARE(UInt64, min_compress_block_size, 0, "When granule is written, compress the data in buffer if the size of pending uncompressed data is larger or equal than the specified threshold. If this setting is not set, the corresponding global setting is used.", 0) \ DECLARE(UInt64, max_compress_block_size, 0, "Compress the pending uncompressed data in buffer if its size is larger or equal than the specified threshold. Block of data will be compressed even if the current granule is not finished. If this setting is not set, the corresponding global setting is used.", 0) \ @@ -98,7 +99,7 @@ namespace ErrorCodes DECLARE(String, merge_workload, "", "Name of workload to be used to access resources for merges", 0) \ DECLARE(String, mutation_workload, "", "Name of workload to be used to access resources for mutations", 0) \ DECLARE(Milliseconds, background_task_preferred_step_execution_time_ms, 50, "Target time to execution of one step of merge or mutation. Can be exceeded if one step takes longer time", 0) \ - DECLARE(MergeSelectorAlgorithm, merge_selector_algorithm, MergeSelectorAlgorithm::SIMPLE, "The algorithm to select parts for merges assignment", 0) \ + DECLARE(MergeSelectorAlgorithm, merge_selector_algorithm, MergeSelectorAlgorithm::SIMPLE, "The algorithm to select parts for merges assignment", EXPERIMENTAL) \ \ /** Inserts settings. */ \ DECLARE(UInt64, parts_to_delay_insert, 1000, "If table contains at least that many active parts in single partition, artificially slow down insert into table. Disabled if set to 0", 0) \ @@ -276,8 +277,9 @@ namespace ErrorCodes MERGE_TREE_SETTINGS(M, ALIAS) \ OBSOLETE_MERGE_TREE_SETTINGS(M, ALIAS) -DECLARE_SETTINGS_TRAITS(MergeTreeSettingsTraits, LIST_OF_MERGE_TREE_SETTINGS) +// clang-format on +DECLARE_SETTINGS_TRAITS(MergeTreeSettingsTraits, LIST_OF_MERGE_TREE_SETTINGS) /** Settings for the MergeTree family of engines. * Could be loaded from config or from a CREATE TABLE query (SETTINGS clause). 
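For readers following the settings-tier changes in the two patches above: a minimal sketch of how a tier is packed into and extracted from the `flags` word, mirroring `BaseSettingsHelpers::getTier`. The exact bit values below are assumptions for illustration; the authoritative masks live in `BaseSettingsHelpers::Flags` and `SettingsTierType`.

```cpp
#include <cstdint>
#include <stdexcept>

// Assumed bit layout (illustrative): one bit for IMPORTANT, two bits for the tier.
enum Flags : uint64_t
{
    IMPORTANT = 0x01,
    TIER      = 0x0c, // mask covering the two tier bits
};

enum Tier : uint8_t
{
    PRODUCTION   = 0b0000, // the default: flags == 0 means PRODUCTION-ready
    OBSOLETE     = 0b0100,
    EXPERIMENTAL = 0b1000,
    BETA         = 0b1100, // largest valid tier value
};

Tier getTier(uint64_t flags)
{
    auto tier = static_cast<uint8_t>(flags & TIER); // as in BaseSettingsHelpers::getTier
    if (tier > BETA)
        throw std::invalid_argument("Unknown tier value");
    return static_cast<Tier>(tier);
}
```

Under these assumed values, `flags == 0` is exactly "PRODUCTION and not IMPORTANT", matching the updated comment in Settings.cpp; note that a tier is combined with `IMPORTANT` by bitwise OR of the two masks.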
From 9ab5f16968cb1c89a8c47b5dae07ea050380327f Mon Sep 17 00:00:00 2001 From: Pavel Kruglov <48961922+Avogar@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:10:12 +0100 Subject: [PATCH 35/80] Update test.py --- tests/integration/test_storage_kafka/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 999324b563a..336ca824a2d 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -4193,7 +4193,7 @@ def test_kafka_formats_with_broken_message(kafka_cluster, create_query_generator ], "expected": { "raw_message": "050102696405496E743634000000000000000007626C6F636B4E6F06537472696E67034241440476616C3106537472696E6702414D0476616C3207466C6F617433320000003F0476616C330555496E743801", - "error": "Cannot parse string \'BAD\' as UInt16", + "error": "Cannot parse string 'BAD' as UInt16", }, "printable": False, }, From 0e063673d5fe5c5c9a60aa82d3258c20c7816f22 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 30 Oct 2024 13:18:37 +0000 Subject: [PATCH 36/80] Fix potential out-of-bound access --- src/Interpreters/Cache/QueryCache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/Cache/QueryCache.cpp b/src/Interpreters/Cache/QueryCache.cpp index cfd7608b6c6..7dbee567c5b 100644 --- a/src/Interpreters/Cache/QueryCache.cpp +++ b/src/Interpreters/Cache/QueryCache.cpp @@ -112,7 +112,7 @@ struct HasSystemTablesMatcher if (const auto * expression_list = function_children[0]->as()) { const ASTs & expression_list_children = expression_list->children; - if (!expression_list_children.empty()) + if (expression_list_children.size() >= 2) { if (const auto * literal = expression_list_children[1]->as()) { From 3a41e79eb8fe5f9ad69cf2a65056db4e7901a09e Mon Sep 17 00:00:00 2001 From: Christoph Wurm Date: Wed, 30 Oct 2024 15:20:07 +0000 Subject: [PATCH 37/80] Fix test --- tests/queries/0_stateless/01271_show_privileges.reference | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index 17554f5c8a5..930e92cda4e 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -183,6 +183,9 @@ HDFS [] GLOBAL SOURCES S3 [] GLOBAL SOURCES HIVE [] GLOBAL SOURCES AZURE [] GLOBAL SOURCES +KAFKA [] GLOBAL SOURCES +NATS [] GLOBAL SOURCES +RABBITMQ [] GLOBAL SOURCES SOURCES [] \N ALL CLUSTER [] GLOBAL ALL ALL ['ALL PRIVILEGES'] \N \N From a819cfa709f3e100e9ae139a81f16eb99e98eec8 Mon Sep 17 00:00:00 2001 From: Konstantin Bogdanov Date: Wed, 30 Oct 2024 16:50:40 +0100 Subject: [PATCH 38/80] Read ECS token from file --- src/IO/S3/Credentials.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/IO/S3/Credentials.cpp b/src/IO/S3/Credentials.cpp index a3f671e76d9..91571432840 100644 --- a/src/IO/S3/Credentials.cpp +++ b/src/IO/S3/Credentials.cpp @@ -1,5 +1,7 @@ #include #include +#include +#include namespace DB { @@ -693,6 +695,7 @@ S3CredentialsProviderChain::S3CredentialsProviderChain( static const char AWS_ECS_CONTAINER_CREDENTIALS_RELATIVE_URI[] = "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI"; static const char AWS_ECS_CONTAINER_CREDENTIALS_FULL_URI[] = "AWS_CONTAINER_CREDENTIALS_FULL_URI"; static const char AWS_ECS_CONTAINER_AUTHORIZATION_TOKEN[] = "AWS_CONTAINER_AUTHORIZATION_TOKEN"; + static 
const char AWS_ECS_CONTAINER_AUTHORIZATION_TOKEN_PATH[] = "AWS_CONTAINER_AUTHORIZATION_TOKEN_PATH"; static const char AWS_EC2_METADATA_DISABLED[] = "AWS_EC2_METADATA_DISABLED"; /// The only difference from DefaultAWSCredentialsProviderChain::DefaultAWSCredentialsProviderChain() @@ -750,7 +753,22 @@ S3CredentialsProviderChain::S3CredentialsProviderChain( } else if (!absolute_uri.empty()) { - const auto token = Aws::Environment::GetEnv(AWS_ECS_CONTAINER_AUTHORIZATION_TOKEN); + auto token = Aws::Environment::GetEnv(AWS_ECS_CONTAINER_AUTHORIZATION_TOKEN); + const auto token_path = Aws::Environment::GetEnv(AWS_ECS_CONTAINER_AUTHORIZATION_TOKEN_PATH); + + if (!token_path.empty()) + { + LOG_INFO(logger, "The environment variable value {} is {}", AWS_ECS_CONTAINER_AUTHORIZATION_TOKEN_PATH, token_path); + + String token_from_file; + + ReadBufferFromFile in(token_path); + readStringUntilEOF(token_from_file, in); + Poco::trimInPlace(token_from_file); + + token = token_from_file; + } + AddProvider(std::make_shared(absolute_uri.c_str(), token.c_str())); /// DO NOT log the value of the authorization token for security purposes. From 12e36c39fc823986e3aecb105773d18a9b4e601e Mon Sep 17 00:00:00 2001 From: Konstantin Bogdanov Date: Wed, 30 Oct 2024 16:52:59 +0100 Subject: [PATCH 39/80] Sort headers --- src/IO/S3/Credentials.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/IO/S3/Credentials.cpp b/src/IO/S3/Credentials.cpp index 91571432840..cde9a7a3662 100644 --- a/src/IO/S3/Credentials.cpp +++ b/src/IO/S3/Credentials.cpp @@ -1,7 +1,7 @@ -#include -#include #include #include +#include +#include namespace DB { From d24b029e45f5dcd2a57af8b3609c092327250632 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 29 Oct 2024 17:50:56 +0100 Subject: [PATCH 40/80] Add support for chrono data types to the "fmt" formatter. --- base/base/chrono_io.h | 47 ++++++++++++++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 12 deletions(-) diff --git a/base/base/chrono_io.h b/base/base/chrono_io.h index 4ee8dec6634..d55aa11bc1d 100644 --- a/base/base/chrono_io.h +++ b/base/base/chrono_io.h @@ -4,6 +4,7 @@ #include #include #include +#include inline std::string to_string(const std::time_t & time) @@ -11,18 +12,6 @@ inline std::string to_string(const std::time_t & time) return cctz::format("%Y-%m-%d %H:%M:%S", std::chrono::system_clock::from_time_t(time), cctz::local_time_zone()); } -template -std::string to_string(const std::chrono::time_point & tp) -{ - // Don't use DateLUT because it shows weird characters for - // TimePoint::max(). I wish we could use C++20 format, but it's not - // there yet. - // return DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(tp)); - - auto in_time_t = std::chrono::system_clock::to_time_t(tp); - return to_string(in_time_t); -} - template > std::string to_string(const std::chrono::duration & duration) { @@ -33,6 +22,20 @@ std::string to_string(const std::chrono::duration & duration) return std::to_string(seconds_as_double.count()) + "s"; } +template +std::string to_string(const std::chrono::time_point & tp) +{ + // Don't use DateLUT because it shows weird characters for + // TimePoint::max(). I wish we could use C++20 format, but it's not + // there yet. 
+ // return DateLUT::instance().timeToString(std::chrono::system_clock::to_time_t(tp)); + + if constexpr (std::is_same_v) + return to_string(std::chrono::system_clock::to_time_t(tp)); + else + return to_string(tp.time_since_epoch()); +} + template std::ostream & operator<<(std::ostream & o, const std::chrono::time_point & tp) { @@ -44,3 +47,23 @@ std::ostream & operator<<(std::ostream & o, const std::chrono::duration +struct fmt::formatter> : fmt::formatter +{ + template + auto format(const std::chrono::time_point & tp, FormatCtx & ctx) const + { + return fmt::formatter::format(::to_string(tp), ctx); + } +}; + +template +struct fmt::formatter> : fmt::formatter +{ + template + auto format(const std::chrono::duration & duration, FormatCtx & ctx) const + { + return fmt::formatter::format(::to_string(duration), ctx); + } +}; From 31402c5840a05a156ee6c5bb1942f42e27578052 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 29 Oct 2024 17:52:36 +0100 Subject: [PATCH 41/80] Add support for a custom cancellation exception to QueryStatus::cancelQuery(). --- src/Interpreters/ProcessList.cpp | 23 ++++++++++++++++++----- src/Interpreters/ProcessList.h | 9 ++++++++- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index 177468f1c8b..7a9b8566c77 100644 --- a/src/Interpreters/ProcessList.cpp +++ b/src/Interpreters/ProcessList.cpp @@ -447,12 +447,16 @@ void QueryStatus::ExecutorHolder::remove() executor = nullptr; } -CancellationCode QueryStatus::cancelQuery(bool) +CancellationCode QueryStatus::cancelQuery(bool /* kill */, std::exception_ptr exception) { - if (is_killed.load()) + if (is_killed.exchange(true)) return CancellationCode::CancelSent; - is_killed.store(true); + { + std::lock_guard lock{cancellation_exception_mutex}; + if (!cancellation_exception) + cancellation_exception = exception; + } std::vector executors_snapshot; @@ -486,7 +490,7 @@ void QueryStatus::addPipelineExecutor(PipelineExecutor * e) /// addPipelineExecutor() from the cancelQuery() context, and this will /// lead to deadlock. if (is_killed.load()) - throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled"); + throwQueryWasCancelled(); std::lock_guard lock(executors_mutex); assert(!executors.contains(e)); @@ -512,11 +516,20 @@ void QueryStatus::removePipelineExecutor(PipelineExecutor * e) bool QueryStatus::checkTimeLimit() { if (is_killed.load()) - throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled"); + throwQueryWasCancelled(); return limits.checkTimeLimit(watch, overflow_mode); } +void QueryStatus::throwQueryWasCancelled() const +{ + std::lock_guard lock{cancellation_exception_mutex}; + if (cancellation_exception) + std::rethrow_exception(cancellation_exception); + else + throw Exception(ErrorCodes::QUERY_WAS_CANCELLED, "Query was cancelled"); +} + bool QueryStatus::checkTimeLimitSoft() { if (is_killed.load()) diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index b2583e74d9b..f171fe8f4d4 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -109,6 +109,9 @@ protected: /// KILL was send to the query std::atomic is_killed { false }; + std::exception_ptr cancellation_exception TSA_GUARDED_BY(cancellation_exception_mutex); + mutable std::mutex cancellation_exception_mutex; + /// All data to the client already had been sent. /// Including EndOfStream or Exception. 
 std::atomic<bool> is_all_data_sent { false };
@@ -127,6 +130,8 @@ protected:
     /// A weak pointer is used here because it's a ProcessListEntry which owns this QueryStatus, and not vice versa.
     void setProcessListEntry(std::weak_ptr<ProcessListEntry> process_list_entry_);

+    [[noreturn]] void throwQueryWasCancelled() const;
+
     mutable std::mutex executors_mutex;

     struct ExecutorHolder
@@ -225,7 +230,9 @@ public:

     QueryStatusInfo getInfo(bool get_thread_list = false, bool get_profile_events = false, bool get_settings = false) const;

-    CancellationCode cancelQuery(bool kill);
+    /// Cancels the current query.
+    /// Optional argument `exception` allows to set an exception which checkTimeLimit() will throw instead of "QUERY_WAS_CANCELLED".
+    CancellationCode cancelQuery(bool kill, std::exception_ptr exception = nullptr);

     bool isKilled() const { return is_killed; }

From 8fea878834ca5d715284048a820a23b56dcd4f46 Mon Sep 17 00:00:00 2001
From: Vitaly Baranov
Date: Tue, 29 Oct 2024 17:56:55 +0100
Subject: [PATCH 42/80] Make configurable the number of retries used by ZooKeeper when connecting.

---
 src/Common/ZooKeeper/ZooKeeperArgs.cpp | 4 ++++
 src/Common/ZooKeeper/ZooKeeperArgs.h | 1 +
 src/Common/ZooKeeper/ZooKeeperImpl.cpp | 4 +++-
 3 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/Common/ZooKeeper/ZooKeeperArgs.cpp b/src/Common/ZooKeeper/ZooKeeperArgs.cpp
index cdc9a1afe4c..c488d829b9d 100644
--- a/src/Common/ZooKeeper/ZooKeeperArgs.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperArgs.cpp
@@ -176,6 +176,10 @@ void ZooKeeperArgs::initFromKeeperSection(const Poco::Util::AbstractConfiguratio
     {
         connection_timeout_ms = config.getInt(config_name + "." + key);
     }
+    else if (key == "num_connection_retries")
+    {
+        num_connection_retries = config.getInt(config_name + "." + key);
+    }
     else if (key == "enable_fault_injections_during_startup")
     {
         enable_fault_injections_during_startup = config.getBool(config_name + "." + key);
diff --git a/src/Common/ZooKeeper/ZooKeeperArgs.h b/src/Common/ZooKeeper/ZooKeeperArgs.h
index 3754c2f7aac..e790e578808 100644
--- a/src/Common/ZooKeeper/ZooKeeperArgs.h
+++ b/src/Common/ZooKeeper/ZooKeeperArgs.h
@@ -39,6 +39,7 @@ struct ZooKeeperArgs
     String sessions_path = "/clickhouse/sessions";
     String client_availability_zone;
     int32_t connection_timeout_ms = Coordination::DEFAULT_CONNECTION_TIMEOUT_MS;
+    UInt64 num_connection_retries = 2;
     int32_t session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS;
     int32_t operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS;
     bool enable_fault_injections_during_startup = false;
diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp
index 173f37c3454..7b027f48d4b 100644
--- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp
@@ -440,7 +440,9 @@ void ZooKeeper::connect(
     if (nodes.empty())
         throw Exception::fromMessage(Error::ZBADARGUMENTS, "No nodes passed to ZooKeeper constructor");

-    static constexpr size_t num_tries = 3;
+    /// We always have at least one attempt to connect.
+    size_t num_tries = args.num_connection_retries + 1;
+
     bool connected = false;
     bool dns_error = false;

From 982b67fb22b0bb0a508595624096cb23da4dc357 Mon Sep 17 00:00:00 2001
From: Vitaly Baranov
Date: Tue, 29 Oct 2024 20:20:08 +0100
Subject: [PATCH 43/80] Add support for zookeeper retries to executeDDLQueryOnCluster().
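An illustrative, self-contained sketch of the retry/backoff loop that `ZooKeeperRetriesControl::retryLoop` provides and which `DDLWorker::enqueueQuery` now opts into. This is a simplification: the real control classifies errors as retriable or not, and rechecks query cancellation via `process_list_element->checkTimeLimit()` around the sleep. The defaults match the updated `ZooKeeperRetriesInfo` in this patch.

```cpp
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <functional>
#include <thread>

// "max_retries = 0" means a single attempt; after each failed attempt the
// backoff doubles, capped by max_backoff_ms (error classification omitted).
struct RetriesInfo
{
    uint64_t max_retries = 0;
    uint64_t initial_backoff_ms = 100;
    uint64_t max_backoff_ms = 5000;
};

void retryLoop(const RetriesInfo & info, const std::function<void()> & operation)
{
    uint64_t backoff_ms = info.initial_backoff_ms;
    for (uint64_t attempt = 0;; ++attempt)
    {
        try
        {
            operation(); // e.g. creating the "query-..." node in ZooKeeper
            return;
        }
        catch (...)
        {
            if (attempt >= info.max_retries)
                throw; // out of retries: rethrow the last error
            std::this_thread::sleep_for(std::chrono::milliseconds(backoff_ms));
            backoff_ms = std::min<uint64_t>(backoff_ms * 2, info.max_backoff_ms);
        }
    }
}
```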
--- src/Common/ZooKeeper/ZooKeeperRetries.h | 12 ++++++++--- src/Databases/DatabaseReplicatedWorker.cpp | 3 +-- src/Databases/DatabaseReplicatedWorker.h | 2 +- src/Interpreters/DDLWorker.cpp | 21 ++++++++++++++++++- src/Interpreters/DDLWorker.h | 8 ++++++- src/Interpreters/executeDDLQueryOnCluster.cpp | 2 +- src/Interpreters/executeDDLQueryOnCluster.h | 3 +++ 7 files changed, 42 insertions(+), 9 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeperRetries.h b/src/Common/ZooKeeper/ZooKeeperRetries.h index b5b03971385..acea521a7ce 100644 --- a/src/Common/ZooKeeper/ZooKeeperRetries.h +++ b/src/Common/ZooKeeper/ZooKeeperRetries.h @@ -15,14 +15,15 @@ namespace ErrorCodes struct ZooKeeperRetriesInfo { + ZooKeeperRetriesInfo() = default; ZooKeeperRetriesInfo(UInt64 max_retries_, UInt64 initial_backoff_ms_, UInt64 max_backoff_ms_) : max_retries(max_retries_), initial_backoff_ms(std::min(initial_backoff_ms_, max_backoff_ms_)), max_backoff_ms(max_backoff_ms_) { } - UInt64 max_retries; - UInt64 initial_backoff_ms; - UInt64 max_backoff_ms; + UInt64 max_retries = 0; /// "max_retries = 0" means only one attempt. + UInt64 initial_backoff_ms = 100; + UInt64 max_backoff_ms = 5000; }; class ZooKeeperRetriesControl @@ -220,6 +221,7 @@ private: return false; } + /// Check if the query was cancelled. if (process_list_element) process_list_element->checkTimeLimit(); @@ -228,6 +230,10 @@ private: sleepForMilliseconds(current_backoff_ms); current_backoff_ms = std::min(current_backoff_ms * 2, retries_info.max_backoff_ms); + /// Check if the query was cancelled again after sleeping. + if (process_list_element) + process_list_element->checkTimeLimit(); + return true; } diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 5d75dff391a..6a711c92332 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -199,13 +199,12 @@ void DatabaseReplicatedDDLWorker::initializeReplication() active_node_holder = zkutil::EphemeralNodeHolder::existing(active_path, *active_node_holder_zookeeper); } -String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) +String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry, const ZooKeeperRetriesInfo &, QueryStatusPtr) { auto zookeeper = getAndSetZooKeeper(); return enqueueQueryImpl(zookeeper, entry, database); } - bool DatabaseReplicatedDDLWorker::waitForReplicaToProcessAllEntries(UInt64 timeout_ms) { auto zookeeper = getAndSetZooKeeper(); diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index b690854e249..d2385cbdba3 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -24,7 +24,7 @@ class DatabaseReplicatedDDLWorker : public DDLWorker public: DatabaseReplicatedDDLWorker(DatabaseReplicated * db, ContextPtr context_); - String enqueueQuery(DDLLogEntry & entry) override; + String enqueueQuery(DDLLogEntry & entry, const ZooKeeperRetriesInfo &, QueryStatusPtr) override; String tryEnqueueAndExecuteEntry(DDLLogEntry & entry, ContextPtr query_context); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 1be1a0c9bb9..eaba46f5d48 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -1053,7 +1054,25 @@ void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperP } -String DDLWorker::enqueueQuery(DDLLogEntry & 
entry) +String DDLWorker::enqueueQuery(DDLLogEntry & entry, const ZooKeeperRetriesInfo & retries_info, QueryStatusPtr process_list_element) +{ + String node_path; + if (retries_info.max_retries > 0) + { + ZooKeeperRetriesControl retries_ctl{"DDLWorker::enqueueQuery", log, retries_info, process_list_element}; + retries_ctl.retryLoop([&]{ + node_path = enqueueQueryAttempt(entry); + }); + } + else + { + node_path = enqueueQueryAttempt(entry); + } + return node_path; +} + + +String DDLWorker::enqueueQueryAttempt(DDLLogEntry & entry) { if (entry.hosts.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty host list in a distributed DDL task"); diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index ee17714add9..a5f47a51bb3 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -48,6 +48,9 @@ struct DDLTaskBase; using DDLTaskPtr = std::unique_ptr; using ZooKeeperPtr = std::shared_ptr; class AccessRightsElements; +struct ZooKeeperRetriesInfo; +class QueryStatus; +using QueryStatusPtr = std::shared_ptr; class DDLWorker { @@ -65,7 +68,7 @@ public: virtual ~DDLWorker(); /// Pushes query into DDL queue, returns path to created node - virtual String enqueueQuery(DDLLogEntry & entry); + virtual String enqueueQuery(DDLLogEntry & entry, const ZooKeeperRetriesInfo & retries_info, QueryStatusPtr process_list_element); /// Host ID (name:port) for logging purposes /// Note that in each task hosts are identified individually by name:port from initiator server cluster config @@ -120,6 +123,9 @@ protected: mutable std::shared_mutex mtx; }; + /// Pushes query into DDL queue, returns path to created node + String enqueueQueryAttempt(DDLLogEntry & entry); + /// Iterates through queue tasks in ZooKeeper, runs execution of new tasks void scheduleTasks(bool reinitialized); diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index c0440c755ad..0b88d07148c 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -189,7 +189,7 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, ContextPtr context, entry.setSettingsIfRequired(context); entry.tracing_context = OpenTelemetry::CurrentContext(); entry.initial_query_id = context->getClientInfo().initial_query_id; - String node_path = ddl_worker.enqueueQuery(entry); + String node_path = ddl_worker.enqueueQuery(entry, params.retries_info, context->getProcessListElement()); return getDDLOnClusterStatus(node_path, ddl_worker.getReplicasDir(), entry, context); } diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h index d015e8d8694..69e0c38834e 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.h +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -37,6 +37,9 @@ struct DDLQueryOnClusterParams /// Privileges which the current user should have to execute a query. AccessRightsElements access_to_check; + + /// Use retries when creating nodes "query-0000000000", "query-0000000001", "query-0000000002" in ZooKeeper. + ZooKeeperRetriesInfo retries_info; }; /// Pushes distributed DDL query to the queue. From f6b5d27c58895f2e39fe3c6b747170f50f524ad3 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 29 Oct 2024 21:55:17 +0100 Subject: [PATCH 44/80] Rework coordination of hosts during BACKUP ON CLUSTER / RESTORE ON CLUSTER. Fix concurrency check, implement cancelling of distributed backups/restores. 
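A condensed sketch of the concurrency-check part of this rework: a RAII guard over shared counters, as implemented by the new `BackupConcurrencyCheck` below. The real class additionally tracks ON CLUSTER operations per backup/restore UUID and keeps separate counters for backups and restores; only the core idea is shown here.

```cpp
#include <cstddef>
#include <mutex>
#include <stdexcept>

struct BackupConcurrencyCounters
{
    std::mutex mutex;
    std::size_t running = 0;
};

class BackupConcurrencyCheck
{
public:
    BackupConcurrencyCheck(bool allow_concurrency, BackupConcurrencyCounters & counters_)
        : counters(counters_)
    {
        std::lock_guard lock{counters.mutex};
        if (!allow_concurrency && counters.running > 0)
            throw std::runtime_error("Concurrent backups are not allowed, turn on setting 'allow_concurrent_backups'");
        ++counters.running; // registered for the whole lifetime of the operation
    }

    ~BackupConcurrencyCheck()
    {
        std::lock_guard lock{counters.mutex};
        --counters.running;
    }

private:
    BackupConcurrencyCounters & counters;
};
```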
--- src/Backups/BackupConcurrencyCheck.cpp | 135 ++ src/Backups/BackupConcurrencyCheck.h | 55 + src/Backups/BackupCoordinationCleaner.cpp | 64 + src/Backups/BackupCoordinationCleaner.h | 40 + src/Backups/BackupCoordinationLocal.cpp | 38 +- src/Backups/BackupCoordinationLocal.h | 32 +- ...te.cpp => BackupCoordinationOnCluster.cpp} | 309 ++--- ...Remote.h => BackupCoordinationOnCluster.h} | 67 +- src/Backups/BackupCoordinationStage.h | 8 - src/Backups/BackupCoordinationStageSync.cpp | 1205 ++++++++++++++--- src/Backups/BackupCoordinationStageSync.h | 189 ++- src/Backups/BackupEntriesCollector.cpp | 17 +- src/Backups/BackupEntriesCollector.h | 4 - src/Backups/BackupIO.h | 5 + src/Backups/BackupIO_AzureBlobStorage.h | 1 + src/Backups/BackupIO_Disk.cpp | 28 +- src/Backups/BackupIO_Disk.h | 2 + src/Backups/BackupIO_File.cpp | 28 +- src/Backups/BackupIO_File.h | 2 + src/Backups/BackupIO_S3.h | 1 + src/Backups/BackupImpl.cpp | 82 +- src/Backups/BackupImpl.h | 6 +- src/Backups/BackupKeeperSettings.cpp | 58 + src/Backups/BackupKeeperSettings.h | 64 + src/Backups/BackupSettings.cpp | 11 + src/Backups/BackupSettings.h | 2 + src/Backups/BackupsWorker.cpp | 924 ++++++------- src/Backups/BackupsWorker.h | 47 +- src/Backups/IBackup.h | 9 +- src/Backups/IBackupCoordination.h | 36 +- src/Backups/IRestoreCoordination.h | 36 +- src/Backups/RestoreCoordinationLocal.cpp | 34 +- src/Backups/RestoreCoordinationLocal.h | 27 +- src/Backups/RestoreCoordinationOnCluster.cpp | 318 +++++ ...emote.h => RestoreCoordinationOnCluster.h} | 55 +- src/Backups/RestoreCoordinationRemote.cpp | 379 ------ src/Backups/RestorerFromBackup.cpp | 28 +- src/Backups/RestorerFromBackup.h | 5 +- src/Backups/WithRetries.cpp | 57 +- src/Backups/WithRetries.h | 32 +- src/Common/Exception.cpp | 4 +- src/Common/Exception.h | 2 +- src/Core/Settings.cpp | 37 +- src/Core/SettingsChangesHistory.cpp | 5 + src/Interpreters/InterpreterBackupQuery.cpp | 21 +- src/Storages/StorageKeeperMap.cpp | 14 +- tests/integration/helpers/cluster.py | 13 + tests/integration/helpers/config_manager.py | 65 + .../configs/faster_zk_disconnect_detect.xml | 12 + .../configs/lesser_timeouts.xml | 2 +- .../configs/shutdown_cancel_backups.xml | 3 + .../configs/slow_backups.xml | 7 + .../configs/zookeeper_retries.xml | 9 +- .../test_backup_restore_on_cluster/test.py | 2 +- .../test_cancel_backup.py | 780 +++++++++++ .../test_disallow_concurrency.py | 4 +- 56 files changed, 3849 insertions(+), 1571 deletions(-) create mode 100644 src/Backups/BackupConcurrencyCheck.cpp create mode 100644 src/Backups/BackupConcurrencyCheck.h create mode 100644 src/Backups/BackupCoordinationCleaner.cpp create mode 100644 src/Backups/BackupCoordinationCleaner.h rename src/Backups/{BackupCoordinationRemote.cpp => BackupCoordinationOnCluster.cpp} (73%) rename src/Backups/{BackupCoordinationRemote.h => BackupCoordinationOnCluster.h} (67%) create mode 100644 src/Backups/BackupKeeperSettings.cpp create mode 100644 src/Backups/BackupKeeperSettings.h create mode 100644 src/Backups/RestoreCoordinationOnCluster.cpp rename src/Backups/{RestoreCoordinationRemote.h => RestoreCoordinationOnCluster.h} (62%) delete mode 100644 src/Backups/RestoreCoordinationRemote.cpp create mode 100644 tests/integration/helpers/config_manager.py create mode 100644 tests/integration/test_backup_restore_on_cluster/configs/faster_zk_disconnect_detect.xml create mode 100644 tests/integration/test_backup_restore_on_cluster/configs/shutdown_cancel_backups.xml create mode 100644 
tests/integration/test_backup_restore_on_cluster/configs/slow_backups.xml create mode 100644 tests/integration/test_backup_restore_on_cluster/test_cancel_backup.py diff --git a/src/Backups/BackupConcurrencyCheck.cpp b/src/Backups/BackupConcurrencyCheck.cpp new file mode 100644 index 00000000000..8b29ae41b53 --- /dev/null +++ b/src/Backups/BackupConcurrencyCheck.cpp @@ -0,0 +1,135 @@ +#include + +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CONCURRENT_ACCESS_NOT_SUPPORTED; +} + + +BackupConcurrencyCheck::BackupConcurrencyCheck( + const UUID & backup_or_restore_uuid_, + bool is_restore_, + bool on_cluster_, + bool allow_concurrency_, + BackupConcurrencyCounters & counters_) + : is_restore(is_restore_), backup_or_restore_uuid(backup_or_restore_uuid_), on_cluster(on_cluster_), counters(counters_) +{ + std::lock_guard lock{counters.mutex}; + + if (!allow_concurrency_) + { + bool found_concurrent_operation = false; + if (is_restore) + { + size_t num_local_restores = counters.local_restores; + size_t num_on_cluster_restores = counters.on_cluster_restores.size(); + if (on_cluster) + { + if (!counters.on_cluster_restores.contains(backup_or_restore_uuid)) + ++num_on_cluster_restores; + } + else + { + ++num_local_restores; + } + found_concurrent_operation = (num_local_restores + num_on_cluster_restores > 1); + } + else + { + size_t num_local_backups = counters.local_backups; + size_t num_on_cluster_backups = counters.on_cluster_backups.size(); + if (on_cluster) + { + if (!counters.on_cluster_backups.contains(backup_or_restore_uuid)) + ++num_on_cluster_backups; + } + else + { + ++num_local_backups; + } + found_concurrent_operation = (num_local_backups + num_on_cluster_backups > 1); + } + + if (found_concurrent_operation) + throwConcurrentOperationNotAllowed(is_restore); + } + + if (on_cluster) + { + if (is_restore) + ++counters.on_cluster_restores[backup_or_restore_uuid]; + else + ++counters.on_cluster_backups[backup_or_restore_uuid]; + } + else + { + if (is_restore) + ++counters.local_restores; + else + ++counters.local_backups; + } +} + + +BackupConcurrencyCheck::~BackupConcurrencyCheck() +{ + std::lock_guard lock{counters.mutex}; + + if (on_cluster) + { + if (is_restore) + { + auto it = counters.on_cluster_restores.find(backup_or_restore_uuid); + if (it != counters.on_cluster_restores.end()) + { + if (!--it->second) + counters.on_cluster_restores.erase(it); + } + } + else + { + auto it = counters.on_cluster_backups.find(backup_or_restore_uuid); + if (it != counters.on_cluster_backups.end()) + { + if (!--it->second) + counters.on_cluster_backups.erase(it); + } + } + } + else + { + if (is_restore) + --counters.local_restores; + else + --counters.local_backups; + } +} + + +void BackupConcurrencyCheck::throwConcurrentOperationNotAllowed(bool is_restore) +{ + throw Exception( + ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, + "Concurrent {} are not allowed, turn on setting '{}'", + is_restore ? "restores" : "backups", + is_restore ? 
"allow_concurrent_restores" : "allow_concurrent_backups"); +} + + +BackupConcurrencyCounters::BackupConcurrencyCounters() = default; + + +BackupConcurrencyCounters::~BackupConcurrencyCounters() +{ + if (local_backups > 0 || local_restores > 0 || !on_cluster_backups.empty() || !on_cluster_restores.empty()) + LOG_ERROR(getLogger(__PRETTY_FUNCTION__), "Some backups or restores are processing"); +} + +} diff --git a/src/Backups/BackupConcurrencyCheck.h b/src/Backups/BackupConcurrencyCheck.h new file mode 100644 index 00000000000..048a23a716a --- /dev/null +++ b/src/Backups/BackupConcurrencyCheck.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ +class BackupConcurrencyCounters; + +/// Local checker for concurrent BACKUP or RESTORE operations. +/// This class is used by implementations of IBackupCoordination and IRestoreCoordination +/// to throw an exception if concurrent backups or restores are not allowed. +class BackupConcurrencyCheck +{ +public: + /// Checks concurrency of a BACKUP operation or a RESTORE operation. + /// Keep a constructed instance of BackupConcurrencyCheck until the operation is done. + BackupConcurrencyCheck( + const UUID & backup_or_restore_uuid_, + bool is_restore_, + bool on_cluster_, + bool allow_concurrency_, + BackupConcurrencyCounters & counters_); + + ~BackupConcurrencyCheck(); + + [[noreturn]] static void throwConcurrentOperationNotAllowed(bool is_restore); + +private: + const bool is_restore; + const UUID backup_or_restore_uuid; + const bool on_cluster; + BackupConcurrencyCounters & counters; +}; + + +class BackupConcurrencyCounters +{ +public: + BackupConcurrencyCounters(); + ~BackupConcurrencyCounters(); + +private: + friend class BackupConcurrencyCheck; + size_t local_backups TSA_GUARDED_BY(mutex) = 0; + size_t local_restores TSA_GUARDED_BY(mutex) = 0; + std::unordered_map on_cluster_backups TSA_GUARDED_BY(mutex); + std::unordered_map on_cluster_restores TSA_GUARDED_BY(mutex); + std::mutex mutex; +}; + +} diff --git a/src/Backups/BackupCoordinationCleaner.cpp b/src/Backups/BackupCoordinationCleaner.cpp new file mode 100644 index 00000000000..1f5068a94de --- /dev/null +++ b/src/Backups/BackupCoordinationCleaner.cpp @@ -0,0 +1,64 @@ +#include + + +namespace DB +{ + +BackupCoordinationCleaner::BackupCoordinationCleaner(const String & zookeeper_path_, const WithRetries & with_retries_, LoggerPtr log_) + : zookeeper_path(zookeeper_path_), with_retries(with_retries_), log(log_) +{ +} + +void BackupCoordinationCleaner::cleanup() +{ + tryRemoveAllNodes(/* throw_if_error = */ true, /* retries_kind = */ WithRetries::kNormal); +} + +bool BackupCoordinationCleaner::tryCleanupAfterError() noexcept +{ + return tryRemoveAllNodes(/* throw_if_error = */ false, /* retries_kind = */ WithRetries::kNormal); +} + +bool BackupCoordinationCleaner::tryRemoveAllNodes(bool throw_if_error, WithRetries::Kind retries_kind) +{ + { + std::lock_guard lock{mutex}; + if (cleanup_result.succeeded) + return true; + if (cleanup_result.exception) + { + if (throw_if_error) + std::rethrow_exception(cleanup_result.exception); + return false; + } + } + + try + { + LOG_TRACE(log, "Removing nodes from ZooKeeper"); + auto holder = with_retries.createRetriesControlHolder("removeAllNodes", retries_kind); + holder.retries_ctl.retryLoop([&, &zookeeper = holder.faulty_zookeeper]() + { + with_retries.renewZooKeeper(zookeeper); + zookeeper->removeRecursive(zookeeper_path); + }); + + std::lock_guard lock{mutex}; + cleanup_result.succeeded = true; + return true; + } 
+ catch (...) + { + LOG_TRACE(log, "Caught exception while removing nodes from ZooKeeper for this restore: {}", + getCurrentExceptionMessage(/* with_stacktrace= */ false, /* check_embedded_stacktrace= */ true)); + + std::lock_guard lock{mutex}; + cleanup_result.exception = std::current_exception(); + + if (throw_if_error) + throw; + return false; + } +} + +} diff --git a/src/Backups/BackupCoordinationCleaner.h b/src/Backups/BackupCoordinationCleaner.h new file mode 100644 index 00000000000..43e095d9f33 --- /dev/null +++ b/src/Backups/BackupCoordinationCleaner.h @@ -0,0 +1,40 @@ +#pragma once + +#include + + +namespace DB +{ + +/// Removes all the nodes from ZooKeeper used to coordinate a BACKUP ON CLUSTER operation or +/// a RESTORE ON CLUSTER operation (successful or not). +/// This class is used by BackupCoordinationOnCluster and RestoreCoordinationOnCluster to cleanup. +class BackupCoordinationCleaner +{ +public: + BackupCoordinationCleaner(const String & zookeeper_path_, const WithRetries & with_retries_, LoggerPtr log_); + + void cleanup(); + bool tryCleanupAfterError() noexcept; + +private: + bool tryRemoveAllNodes(bool throw_if_error, WithRetries::Kind retries_kind); + + const String zookeeper_path; + + /// A reference to a field of the parent object which is either BackupCoordinationOnCluster or RestoreCoordinationOnCluster. + const WithRetries & with_retries; + + const LoggerPtr log; + + struct CleanupResult + { + bool succeeded = false; + std::exception_ptr exception; + }; + CleanupResult cleanup_result TSA_GUARDED_BY(mutex); + + std::mutex mutex; +}; + +} diff --git a/src/Backups/BackupCoordinationLocal.cpp b/src/Backups/BackupCoordinationLocal.cpp index efdc18cc29c..8bd6b4d327d 100644 --- a/src/Backups/BackupCoordinationLocal.cpp +++ b/src/Backups/BackupCoordinationLocal.cpp @@ -1,5 +1,7 @@ #include + #include +#include #include #include #include @@ -8,27 +10,20 @@ namespace DB { -BackupCoordinationLocal::BackupCoordinationLocal(bool plain_backup_) - : log(getLogger("BackupCoordinationLocal")), file_infos(plain_backup_) +BackupCoordinationLocal::BackupCoordinationLocal( + const UUID & backup_uuid_, + bool is_plain_backup_, + bool allow_concurrent_backup_, + BackupConcurrencyCounters & concurrency_counters_) + : log(getLogger("BackupCoordinationLocal")) + , concurrency_check(backup_uuid_, /* is_restore = */ false, /* on_cluster = */ false, allow_concurrent_backup_, concurrency_counters_) + , file_infos(is_plain_backup_) { } BackupCoordinationLocal::~BackupCoordinationLocal() = default; -void BackupCoordinationLocal::setStage(const String &, const String &) -{ -} - -void BackupCoordinationLocal::setError(const Exception &) -{ -} - -Strings BackupCoordinationLocal::waitForStage(const String &) -{ - return {}; -} - -Strings BackupCoordinationLocal::waitForStage(const String &, std::chrono::milliseconds) +ZooKeeperRetriesInfo BackupCoordinationLocal::getOnClusterInitializationKeeperRetriesInfo() const { return {}; } @@ -135,15 +130,4 @@ bool BackupCoordinationLocal::startWritingFile(size_t data_file_index) return writing_files.emplace(data_file_index).second; } - -bool BackupCoordinationLocal::hasConcurrentBackups(const std::atomic & num_active_backups) const -{ - if (num_active_backups > 1) - { - LOG_WARNING(log, "Found concurrent backups: num_active_backups={}", num_active_backups); - return true; - } - return false; -} - } diff --git a/src/Backups/BackupCoordinationLocal.h b/src/Backups/BackupCoordinationLocal.h index a7f15c79649..09991c0d301 100644 --- 
a/src/Backups/BackupCoordinationLocal.h +++ b/src/Backups/BackupCoordinationLocal.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -21,13 +22,21 @@ namespace DB class BackupCoordinationLocal : public IBackupCoordination { public: - explicit BackupCoordinationLocal(bool plain_backup_); + explicit BackupCoordinationLocal( + const UUID & backup_uuid_, + bool is_plain_backup_, + bool allow_concurrent_backup_, + BackupConcurrencyCounters & concurrency_counters_); + ~BackupCoordinationLocal() override; - void setStage(const String & new_stage, const String & message) override; - void setError(const Exception & exception) override; - Strings waitForStage(const String & stage_to_wait) override; - Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; + Strings setStage(const String &, const String &, bool) override { return {}; } + void setBackupQueryWasSentToOtherHosts() override {} + bool trySetError(std::exception_ptr) override { return true; } + void finish() override {} + bool tryFinishAfterError() noexcept override { return true; } + void waitForOtherHostsToFinish() override {} + bool tryWaitForOtherHostsToFinishAfterError() noexcept override { return true; } void addReplicatedPartNames(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, const std::vector & part_names_and_checksums) override; @@ -54,17 +63,18 @@ public: BackupFileInfos getFileInfosForAllHosts() const override; bool startWritingFile(size_t data_file_index) override; - bool hasConcurrentBackups(const std::atomic & num_active_backups) const override; + ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override; private: LoggerPtr const log; + BackupConcurrencyCheck concurrency_check; - BackupCoordinationReplicatedTables TSA_GUARDED_BY(replicated_tables_mutex) replicated_tables; - BackupCoordinationReplicatedAccess TSA_GUARDED_BY(replicated_access_mutex) replicated_access; - BackupCoordinationReplicatedSQLObjects TSA_GUARDED_BY(replicated_sql_objects_mutex) replicated_sql_objects; - BackupCoordinationFileInfos TSA_GUARDED_BY(file_infos_mutex) file_infos; + BackupCoordinationReplicatedTables replicated_tables TSA_GUARDED_BY(replicated_tables_mutex); + BackupCoordinationReplicatedAccess replicated_access TSA_GUARDED_BY(replicated_access_mutex); + BackupCoordinationReplicatedSQLObjects replicated_sql_objects TSA_GUARDED_BY(replicated_sql_objects_mutex); + BackupCoordinationFileInfos file_infos TSA_GUARDED_BY(file_infos_mutex); BackupCoordinationKeeperMapTables keeper_map_tables TSA_GUARDED_BY(keeper_map_tables_mutex); - std::unordered_set TSA_GUARDED_BY(writing_files_mutex) writing_files; + std::unordered_set writing_files TSA_GUARDED_BY(writing_files_mutex); mutable std::mutex replicated_tables_mutex; mutable std::mutex replicated_access_mutex; diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationOnCluster.cpp similarity index 73% rename from src/Backups/BackupCoordinationRemote.cpp rename to src/Backups/BackupCoordinationOnCluster.cpp index a60ac0c636f..dc34939f805 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationOnCluster.cpp @@ -1,7 +1,4 @@ -#include - -#include -#include +#include #include #include @@ -26,8 +23,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -namespace Stage = BackupCoordinationStage; - namespace { using PartNameAndChecksum = IBackupCoordination::PartNameAndChecksum; @@ -149,144 +144,152 @@ 
namespace }; } -size_t BackupCoordinationRemote::findCurrentHostIndex(const Strings & all_hosts, const String & current_host) +Strings BackupCoordinationOnCluster::excludeInitiator(const Strings & all_hosts) +{ + Strings all_hosts_without_initiator = all_hosts; + bool has_initiator = (std::erase(all_hosts_without_initiator, kInitiator) > 0); + chassert(has_initiator); + return all_hosts_without_initiator; +} + +size_t BackupCoordinationOnCluster::findCurrentHostIndex(const String & current_host, const Strings & all_hosts) { auto it = std::find(all_hosts.begin(), all_hosts.end(), current_host); if (it == all_hosts.end()) - return 0; + return all_hosts.size(); return it - all_hosts.begin(); } -BackupCoordinationRemote::BackupCoordinationRemote( - zkutil::GetZooKeeper get_zookeeper_, + +BackupCoordinationOnCluster::BackupCoordinationOnCluster( + const UUID & backup_uuid_, + bool is_plain_backup_, const String & root_zookeeper_path_, + zkutil::GetZooKeeper get_zookeeper_, const BackupKeeperSettings & keeper_settings_, - const String & backup_uuid_, - const Strings & all_hosts_, const String & current_host_, - bool plain_backup_, - bool is_internal_, + const Strings & all_hosts_, + bool allow_concurrent_backup_, + BackupConcurrencyCounters & concurrency_counters_, + ThreadPoolCallbackRunnerUnsafe schedule_, QueryStatusPtr process_list_element_) : root_zookeeper_path(root_zookeeper_path_) - , zookeeper_path(root_zookeeper_path_ + "/backup-" + backup_uuid_) + , zookeeper_path(root_zookeeper_path_ + "/backup-" + toString(backup_uuid_)) , keeper_settings(keeper_settings_) , backup_uuid(backup_uuid_) , all_hosts(all_hosts_) + , all_hosts_without_initiator(excludeInitiator(all_hosts)) , current_host(current_host_) - , current_host_index(findCurrentHostIndex(all_hosts, current_host)) - , plain_backup(plain_backup_) - , is_internal(is_internal_) - , log(getLogger("BackupCoordinationRemote")) - , with_retries( - log, - get_zookeeper_, - keeper_settings, - process_list_element_, - [my_zookeeper_path = zookeeper_path, my_current_host = current_host, my_is_internal = is_internal] - (WithRetries::FaultyKeeper & zk) - { - /// Recreate this ephemeral node to signal that we are alive. - if (my_is_internal) - { - String alive_node_path = my_zookeeper_path + "/stage/alive|" + my_current_host; - - /// Delete the ephemeral node from the previous connection so we don't have to wait for keeper to do it automatically. 
- zk->tryRemove(alive_node_path); - - zk->createAncestors(alive_node_path); - zk->create(alive_node_path, "", zkutil::CreateMode::Ephemeral); - } - }) + , current_host_index(findCurrentHostIndex(current_host, all_hosts)) + , plain_backup(is_plain_backup_) + , log(getLogger("BackupCoordinationOnCluster")) + , with_retries(log, get_zookeeper_, keeper_settings, process_list_element_, [root_zookeeper_path_](Coordination::ZooKeeperWithFaultInjection::Ptr zk) { zk->sync(root_zookeeper_path_); }) + , concurrency_check(backup_uuid_, /* is_restore = */ false, /* on_cluster = */ true, allow_concurrent_backup_, concurrency_counters_) + , stage_sync(/* is_restore = */ false, fs::path{zookeeper_path} / "stage", current_host, all_hosts, allow_concurrent_backup_, with_retries, schedule_, process_list_element_, log) + , cleaner(zookeeper_path, with_retries, log) { createRootNodes(); - - stage_sync.emplace( - zookeeper_path, - with_retries, - log); } -BackupCoordinationRemote::~BackupCoordinationRemote() +BackupCoordinationOnCluster::~BackupCoordinationOnCluster() { - try - { - if (!is_internal) - removeAllNodes(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } + tryFinishImpl(); } -void BackupCoordinationRemote::createRootNodes() +void BackupCoordinationOnCluster::createRootNodes() { - auto holder = with_retries.createRetriesControlHolder("createRootNodes"); + auto holder = with_retries.createRetriesControlHolder("createRootNodes", WithRetries::kInitialization); holder.retries_ctl.retryLoop( [&, &zk = holder.faulty_zookeeper]() { with_retries.renewZooKeeper(zk); zk->createAncestors(zookeeper_path); - - Coordination::Requests ops; - Coordination::Responses responses; - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_part_names", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_mutations", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_data_paths", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_access", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_sql_objects", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/keeper_map_tables", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/file_infos", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/writing_files", "", zkutil::CreateMode::Persistent)); - zk->tryMulti(ops, responses); + zk->createIfNotExists(zookeeper_path, ""); + zk->createIfNotExists(zookeeper_path + "/repl_part_names", ""); + zk->createIfNotExists(zookeeper_path + "/repl_mutations", ""); + zk->createIfNotExists(zookeeper_path + "/repl_data_paths", ""); + zk->createIfNotExists(zookeeper_path + "/repl_access", ""); + zk->createIfNotExists(zookeeper_path + "/repl_sql_objects", ""); + zk->createIfNotExists(zookeeper_path + "/keeper_map_tables", ""); + zk->createIfNotExists(zookeeper_path + "/file_infos", ""); + zk->createIfNotExists(zookeeper_path + "/writing_files", ""); }); } -void BackupCoordinationRemote::removeAllNodes() +Strings BackupCoordinationOnCluster::setStage(const String & new_stage, const String & message, bool sync) { - auto holder = 
with_retries.createRetriesControlHolder("removeAllNodes"); - holder.retries_ctl.retryLoop( - [&, &zk = holder.faulty_zookeeper]() + stage_sync.setStage(new_stage, message); + + if (!sync) + return {}; + + return stage_sync.waitForHostsToReachStage(new_stage, all_hosts_without_initiator); +} + +void BackupCoordinationOnCluster::setBackupQueryWasSentToOtherHosts() +{ + backup_query_was_sent_to_other_hosts = true; +} + +bool BackupCoordinationOnCluster::trySetError(std::exception_ptr exception) +{ + return stage_sync.trySetError(exception); +} + +void BackupCoordinationOnCluster::finish() +{ + bool other_hosts_also_finished = false; + stage_sync.finish(other_hosts_also_finished); + + if ((current_host == kInitiator) && (other_hosts_also_finished || !backup_query_was_sent_to_other_hosts)) + cleaner.cleanup(); +} + +bool BackupCoordinationOnCluster::tryFinishAfterError() noexcept +{ + return tryFinishImpl(); +} + +bool BackupCoordinationOnCluster::tryFinishImpl() noexcept +{ + bool other_hosts_also_finished = false; + if (!stage_sync.tryFinishAfterError(other_hosts_also_finished)) + return false; + + if ((current_host == kInitiator) && (other_hosts_also_finished || !backup_query_was_sent_to_other_hosts)) { - /// Usually this function is called by the initiator when a backup is complete so we don't need the coordination anymore. - /// - /// However there can be a rare situation when this function is called after an error occurs on the initiator of a query - /// while some hosts are still making the backup. Removing all the nodes will remove the parent node of the backup coordination - /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some useless part - /// of their backup work before that. Anyway in this case backup won't be finalized (because only an initiator can do that). 
- with_retries.renewZooKeeper(zk); - zk->removeRecursive(zookeeper_path); - }); + if (!cleaner.tryCleanupAfterError()) + return false; + } + + return true; } - -void BackupCoordinationRemote::setStage(const String & new_stage, const String & message) +void BackupCoordinationOnCluster::waitForOtherHostsToFinish() { - if (is_internal) - stage_sync->set(current_host, new_stage, message); - else - stage_sync->set(current_host, new_stage, /* message */ "", /* all_hosts */ true); + if ((current_host != kInitiator) || !backup_query_was_sent_to_other_hosts) + return; + stage_sync.waitForOtherHostsToFinish(); } -void BackupCoordinationRemote::setError(const Exception & exception) +bool BackupCoordinationOnCluster::tryWaitForOtherHostsToFinishAfterError() noexcept { - stage_sync->setError(current_host, exception); + if (current_host != kInitiator) + return false; + if (!backup_query_was_sent_to_other_hosts) + return true; + return stage_sync.tryWaitForOtherHostsToFinishAfterError(); } -Strings BackupCoordinationRemote::waitForStage(const String & stage_to_wait) +ZooKeeperRetriesInfo BackupCoordinationOnCluster::getOnClusterInitializationKeeperRetriesInfo() const { - return stage_sync->wait(all_hosts, stage_to_wait); + return ZooKeeperRetriesInfo{keeper_settings.max_retries_while_initializing, + static_cast(keeper_settings.retry_initial_backoff_ms.count()), + static_cast(keeper_settings.retry_max_backoff_ms.count())}; } -Strings BackupCoordinationRemote::waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) -{ - return stage_sync->waitFor(all_hosts, stage_to_wait, timeout); -} - - -void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name) +void BackupCoordinationOnCluster::serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name) { { auto holder = with_retries.createRetriesControlHolder(logging_name + "::create"); @@ -301,7 +304,7 @@ void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String & if (value.empty()) return; - size_t max_part_size = keeper_settings.keeper_value_max_size; + size_t max_part_size = keeper_settings.value_max_size; if (!max_part_size) max_part_size = value.size(); @@ -324,7 +327,7 @@ void BackupCoordinationRemote::serializeToMultipleZooKeeperNodes(const String & } } -String BackupCoordinationRemote::deserializeFromMultipleZooKeeperNodes(const String & path, const String & logging_name) const +String BackupCoordinationOnCluster::deserializeFromMultipleZooKeeperNodes(const String & path, const String & logging_name) const { Strings part_names; @@ -357,7 +360,7 @@ String BackupCoordinationRemote::deserializeFromMultipleZooKeeperNodes(const Str } -void BackupCoordinationRemote::addReplicatedPartNames( +void BackupCoordinationOnCluster::addReplicatedPartNames( const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, @@ -381,14 +384,14 @@ void BackupCoordinationRemote::addReplicatedPartNames( }); } -Strings BackupCoordinationRemote::getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const +Strings BackupCoordinationOnCluster::getReplicatedPartNames(const String & table_zk_path, const String & replica_name) const { std::lock_guard lock{replicated_tables_mutex}; prepareReplicatedTables(); return replicated_tables->getPartNames(table_zk_path, replica_name); } -void BackupCoordinationRemote::addReplicatedMutations( +void 
BackupCoordinationOnCluster::addReplicatedMutations( const String & table_zk_path, const String & table_name_for_logs, const String & replica_name, @@ -412,7 +415,7 @@ void BackupCoordinationRemote::addReplicatedMutations( }); } -std::vector BackupCoordinationRemote::getReplicatedMutations(const String & table_zk_path, const String & replica_name) const +std::vector BackupCoordinationOnCluster::getReplicatedMutations(const String & table_zk_path, const String & replica_name) const { std::lock_guard lock{replicated_tables_mutex}; prepareReplicatedTables(); @@ -420,7 +423,7 @@ std::vector BackupCoordinationRemote::getRepl } -void BackupCoordinationRemote::addReplicatedDataPath( +void BackupCoordinationOnCluster::addReplicatedDataPath( const String & table_zk_path, const String & data_path) { { @@ -441,7 +444,7 @@ void BackupCoordinationRemote::addReplicatedDataPath( }); } -Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_zk_path) const +Strings BackupCoordinationOnCluster::getReplicatedDataPaths(const String & table_zk_path) const { std::lock_guard lock{replicated_tables_mutex}; prepareReplicatedTables(); @@ -449,7 +452,7 @@ Strings BackupCoordinationRemote::getReplicatedDataPaths(const String & table_zk } -void BackupCoordinationRemote::prepareReplicatedTables() const +void BackupCoordinationOnCluster::prepareReplicatedTables() const { if (replicated_tables) return; @@ -536,7 +539,7 @@ void BackupCoordinationRemote::prepareReplicatedTables() const replicated_tables->addDataPath(std::move(data_paths)); } -void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path) +void BackupCoordinationOnCluster::addReplicatedAccessFilePath(const String & access_zk_path, AccessEntityType access_entity_type, const String & file_path) { { std::lock_guard lock{replicated_access_mutex}; @@ -558,14 +561,14 @@ void BackupCoordinationRemote::addReplicatedAccessFilePath(const String & access }); } -Strings BackupCoordinationRemote::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type) const +Strings BackupCoordinationOnCluster::getReplicatedAccessFilePaths(const String & access_zk_path, AccessEntityType access_entity_type) const { std::lock_guard lock{replicated_access_mutex}; prepareReplicatedAccess(); return replicated_access->getFilePaths(access_zk_path, access_entity_type, current_host); } -void BackupCoordinationRemote::prepareReplicatedAccess() const +void BackupCoordinationOnCluster::prepareReplicatedAccess() const { if (replicated_access) return; @@ -601,7 +604,7 @@ void BackupCoordinationRemote::prepareReplicatedAccess() const replicated_access->addFilePath(std::move(file_path)); } -void BackupCoordinationRemote::addReplicatedSQLObjectsDir(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & dir_path) +void BackupCoordinationOnCluster::addReplicatedSQLObjectsDir(const String & loader_zk_path, UserDefinedSQLObjectType object_type, const String & dir_path) { { std::lock_guard lock{replicated_sql_objects_mutex}; @@ -631,14 +634,14 @@ void BackupCoordinationRemote::addReplicatedSQLObjectsDir(const String & loader_ }); } -Strings BackupCoordinationRemote::getReplicatedSQLObjectsDirs(const String & loader_zk_path, UserDefinedSQLObjectType object_type) const +Strings BackupCoordinationOnCluster::getReplicatedSQLObjectsDirs(const String & loader_zk_path, UserDefinedSQLObjectType object_type) const { std::lock_guard 
lock{replicated_sql_objects_mutex}; prepareReplicatedSQLObjects(); return replicated_sql_objects->getDirectories(loader_zk_path, object_type, current_host); } -void BackupCoordinationRemote::prepareReplicatedSQLObjects() const +void BackupCoordinationOnCluster::prepareReplicatedSQLObjects() const { if (replicated_sql_objects) return; @@ -674,7 +677,7 @@ void BackupCoordinationRemote::prepareReplicatedSQLObjects() const replicated_sql_objects->addDirectory(std::move(directory)); } -void BackupCoordinationRemote::addKeeperMapTable(const String & table_zookeeper_root_path, const String & table_id, const String & data_path_in_backup) +void BackupCoordinationOnCluster::addKeeperMapTable(const String & table_zookeeper_root_path, const String & table_id, const String & data_path_in_backup) { { std::lock_guard lock{keeper_map_tables_mutex}; @@ -695,7 +698,7 @@ void BackupCoordinationRemote::addKeeperMapTable(const String & table_zookeeper_ }); } -void BackupCoordinationRemote::prepareKeeperMapTables() const +void BackupCoordinationOnCluster::prepareKeeperMapTables() const { if (keeper_map_tables) return; @@ -740,7 +743,7 @@ void BackupCoordinationRemote::prepareKeeperMapTables() const } -String BackupCoordinationRemote::getKeeperMapDataPath(const String & table_zookeeper_root_path) const +String BackupCoordinationOnCluster::getKeeperMapDataPath(const String & table_zookeeper_root_path) const { std::lock_guard lock(keeper_map_tables_mutex); prepareKeeperMapTables(); @@ -748,7 +751,7 @@ String BackupCoordinationRemote::getKeeperMapDataPath(const String & table_zooke } -void BackupCoordinationRemote::addFileInfos(BackupFileInfos && file_infos_) +void BackupCoordinationOnCluster::addFileInfos(BackupFileInfos && file_infos_) { { std::lock_guard lock{file_infos_mutex}; @@ -761,21 +764,21 @@ void BackupCoordinationRemote::addFileInfos(BackupFileInfos && file_infos_) serializeToMultipleZooKeeperNodes(zookeeper_path + "/file_infos/" + current_host, file_infos_str, "addFileInfos"); } -BackupFileInfos BackupCoordinationRemote::getFileInfos() const +BackupFileInfos BackupCoordinationOnCluster::getFileInfos() const { std::lock_guard lock{file_infos_mutex}; prepareFileInfos(); return file_infos->getFileInfos(current_host); } -BackupFileInfos BackupCoordinationRemote::getFileInfosForAllHosts() const +BackupFileInfos BackupCoordinationOnCluster::getFileInfosForAllHosts() const { std::lock_guard lock{file_infos_mutex}; prepareFileInfos(); return file_infos->getFileInfosForAllHosts(); } -void BackupCoordinationRemote::prepareFileInfos() const +void BackupCoordinationOnCluster::prepareFileInfos() const { if (file_infos) return; @@ -801,7 +804,7 @@ void BackupCoordinationRemote::prepareFileInfos() const } } -bool BackupCoordinationRemote::startWritingFile(size_t data_file_index) +bool BackupCoordinationOnCluster::startWritingFile(size_t data_file_index) { { /// Check if this host is already writing this file. 
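// A minimal sketch of the local deduplication that startWritingFile() builds on:
// a mutex-guarded set of data file indices turns "is this host already writing this
// file?" into a single insert. The class and method names here are illustrative;
// only the insert-returns-false-on-duplicate idiom is the point.

#include <cstddef>
#include <mutex>
#include <unordered_set>

class WritingFilesRegistry
{
public:
    /// Returns true only for the first caller that claims `data_file_index`.
    bool tryStartWriting(std::size_t data_file_index)
    {
        std::lock_guard lock{mutex};
        return writing_files.insert(data_file_index).second;
    }

private:
    std::mutex mutex;
    std::unordered_set<std::size_t> writing_files;
};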
@@ -842,66 +845,4 @@ bool BackupCoordinationRemote::startWritingFile(size_t data_file_index) } } -bool BackupCoordinationRemote::hasConcurrentBackups(const std::atomic &) const -{ - /// If its internal concurrency will be checked for the base backup - if (is_internal) - return false; - - std::string backup_stage_path = zookeeper_path + "/stage"; - - bool result = false; - - auto holder = with_retries.createRetriesControlHolder("getAllArchiveSuffixes"); - holder.retries_ctl.retryLoop( - [&, &zk = holder.faulty_zookeeper]() - { - with_retries.renewZooKeeper(zk); - - if (!zk->exists(root_zookeeper_path)) - zk->createAncestors(root_zookeeper_path); - - for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt) - { - Coordination::Stat stat; - zk->get(root_zookeeper_path, &stat); - Strings existing_backup_paths = zk->getChildren(root_zookeeper_path); - - for (const auto & existing_backup_path : existing_backup_paths) - { - if (startsWith(existing_backup_path, "restore-")) - continue; - - String existing_backup_uuid = existing_backup_path; - existing_backup_uuid.erase(0, String("backup-").size()); - - if (existing_backup_uuid == toString(backup_uuid)) - continue; - - String status; - if (zk->tryGet(root_zookeeper_path + "/" + existing_backup_path + "/stage", status)) - { - /// Check if some other backup is in progress - if (status == Stage::SCHEDULED_TO_START) - { - LOG_WARNING(log, "Found a concurrent backup: {}, current backup: {}", existing_backup_uuid, toString(backup_uuid)); - result = true; - return; - } - } - } - - zk->createIfNotExists(backup_stage_path, ""); - auto code = zk->trySet(backup_stage_path, Stage::SCHEDULED_TO_START, stat.version); - if (code == Coordination::Error::ZOK) - break; - bool is_last_attempt = (attempt == MAX_ZOOKEEPER_ATTEMPTS - 1); - if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt) - throw zkutil::KeeperException::fromPath(code, backup_stage_path); - } - }); - - return result; -} - } diff --git a/src/Backups/BackupCoordinationRemote.h b/src/Backups/BackupCoordinationOnCluster.h similarity index 67% rename from src/Backups/BackupCoordinationRemote.h rename to src/Backups/BackupCoordinationOnCluster.h index 7a56b1a4eb8..7369c2cc746 100644 --- a/src/Backups/BackupCoordinationRemote.h +++ b/src/Backups/BackupCoordinationOnCluster.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include #include #include #include @@ -13,32 +15,35 @@ namespace DB { -/// We try to store data to zookeeper several times due to possible version conflicts. -constexpr size_t MAX_ZOOKEEPER_ATTEMPTS = 10; - /// Implementation of the IBackupCoordination interface performing coordination via ZooKeeper. It's necessary for "BACKUP ON CLUSTER". -class BackupCoordinationRemote : public IBackupCoordination +class BackupCoordinationOnCluster : public IBackupCoordination { public: - using BackupKeeperSettings = WithRetries::KeeperSettings; + /// Empty string as the current host is used to mark the initiator of a BACKUP ON CLUSTER query. 
+ static const constexpr std::string_view kInitiator; - BackupCoordinationRemote( - zkutil::GetZooKeeper get_zookeeper_, + BackupCoordinationOnCluster( + const UUID & backup_uuid_, + bool is_plain_backup_, const String & root_zookeeper_path_, + zkutil::GetZooKeeper get_zookeeper_, const BackupKeeperSettings & keeper_settings_, - const String & backup_uuid_, - const Strings & all_hosts_, const String & current_host_, - bool plain_backup_, - bool is_internal_, + const Strings & all_hosts_, + bool allow_concurrent_backup_, + BackupConcurrencyCounters & concurrency_counters_, + ThreadPoolCallbackRunnerUnsafe schedule_, QueryStatusPtr process_list_element_); - ~BackupCoordinationRemote() override; + ~BackupCoordinationOnCluster() override; - void setStage(const String & new_stage, const String & message) override; - void setError(const Exception & exception) override; - Strings waitForStage(const String & stage_to_wait) override; - Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; + Strings setStage(const String & new_stage, const String & message, bool sync) override; + void setBackupQueryWasSentToOtherHosts() override; + bool trySetError(std::exception_ptr exception) override; + void finish() override; + bool tryFinishAfterError() noexcept override; + void waitForOtherHostsToFinish() override; + bool tryWaitForOtherHostsToFinishAfterError() noexcept override; void addReplicatedPartNames( const String & table_zk_path, @@ -73,13 +78,14 @@ public: BackupFileInfos getFileInfosForAllHosts() const override; bool startWritingFile(size_t data_file_index) override; - bool hasConcurrentBackups(const std::atomic & num_active_backups) const override; + ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override; - static size_t findCurrentHostIndex(const Strings & all_hosts, const String & current_host); + static Strings excludeInitiator(const Strings & all_hosts); + static size_t findCurrentHostIndex(const String & current_host, const Strings & all_hosts); private: void createRootNodes(); - void removeAllNodes(); + bool tryFinishImpl() noexcept; void serializeToMultipleZooKeeperNodes(const String & path, const String & value, const String & logging_name); String deserializeFromMultipleZooKeeperNodes(const String & path, const String & logging_name) const; @@ -96,26 +102,27 @@ private: const String root_zookeeper_path; const String zookeeper_path; const BackupKeeperSettings keeper_settings; - const String backup_uuid; + const UUID backup_uuid; const Strings all_hosts; + const Strings all_hosts_without_initiator; const String current_host; const size_t current_host_index; const bool plain_backup; - const bool is_internal; LoggerPtr const log; - /// The order of these two fields matters, because stage_sync holds a reference to with_retries object - mutable WithRetries with_retries; - std::optional stage_sync; + const WithRetries with_retries; + BackupConcurrencyCheck concurrency_check; + BackupCoordinationStageSync stage_sync; + BackupCoordinationCleaner cleaner; + std::atomic backup_query_was_sent_to_other_hosts = false; - mutable std::optional TSA_GUARDED_BY(replicated_tables_mutex) replicated_tables; - mutable std::optional TSA_GUARDED_BY(replicated_access_mutex) replicated_access; - mutable std::optional TSA_GUARDED_BY(replicated_sql_objects_mutex) replicated_sql_objects; - mutable std::optional TSA_GUARDED_BY(file_infos_mutex) file_infos; + mutable std::optional replicated_tables TSA_GUARDED_BY(replicated_tables_mutex); + mutable 
std::optional replicated_access TSA_GUARDED_BY(replicated_access_mutex); + mutable std::optional replicated_sql_objects TSA_GUARDED_BY(replicated_sql_objects_mutex); + mutable std::optional file_infos TSA_GUARDED_BY(file_infos_mutex); mutable std::optional keeper_map_tables TSA_GUARDED_BY(keeper_map_tables_mutex); - std::unordered_set TSA_GUARDED_BY(writing_files_mutex) writing_files; + std::unordered_set writing_files TSA_GUARDED_BY(writing_files_mutex); - mutable std::mutex zookeeper_mutex; mutable std::mutex replicated_tables_mutex; mutable std::mutex replicated_access_mutex; mutable std::mutex replicated_sql_objects_mutex; diff --git a/src/Backups/BackupCoordinationStage.h b/src/Backups/BackupCoordinationStage.h index 9abdc019784..2cd1efb5404 100644 --- a/src/Backups/BackupCoordinationStage.h +++ b/src/Backups/BackupCoordinationStage.h @@ -8,10 +8,6 @@ namespace DB namespace BackupCoordinationStage { - /// This stage is set after concurrency check so ensure we dont start other backup/restores - /// when concurrent backup/restores are not allowed - constexpr const char * SCHEDULED_TO_START = "scheduled to start"; - /// Finding all tables and databases which we're going to put to the backup and collecting their metadata. constexpr const char * GATHERING_METADATA = "gathering metadata"; @@ -46,10 +42,6 @@ namespace BackupCoordinationStage /// Coordination stage meaning that a host finished its work. constexpr const char * COMPLETED = "completed"; - - /// Coordination stage meaning that backup/restore has failed due to an error - /// Check '/error' for the error message - constexpr const char * ERROR = "error"; } } diff --git a/src/Backups/BackupCoordinationStageSync.cpp b/src/Backups/BackupCoordinationStageSync.cpp index 17ef163ce35..1642cab70c7 100644 --- a/src/Backups/BackupCoordinationStageSync.cpp +++ b/src/Backups/BackupCoordinationStageSync.cpp @@ -9,267 +9,1084 @@ #include #include #include +#include +#include +#include + namespace DB { -namespace Stage = BackupCoordinationStage; - namespace ErrorCodes { extern const int FAILED_TO_SYNC_BACKUP_OR_RESTORE; + extern const int LOGICAL_ERROR; } +namespace +{ + /// The coordination version is stored in the 'start' node for each host + /// by each host when it starts working on this backup or restore. + /// The initial version didn't use nodes 'finish*' and 'num_hosts'. + constexpr const int kInitialVersion = 1; + constexpr const int kCurrentVersion = 2; +} + +bool BackupCoordinationStageSync::HostInfo::operator ==(const HostInfo & other) const +{ + /// We don't compare `last_connection_time` here. 
+ return (host == other.host) && (started == other.started) && (connected == other.connected) && (finished == other.finished) + && (stages == other.stages) && (!!exception == !!other.exception); +} + +bool BackupCoordinationStageSync::HostInfo::operator !=(const HostInfo & other) const +{ + return !(*this == other); +} + +bool BackupCoordinationStageSync::State::operator ==(const State & other) const = default; +bool BackupCoordinationStageSync::State::operator !=(const State & other) const = default; + BackupCoordinationStageSync::BackupCoordinationStageSync( - const String & root_zookeeper_path_, - WithRetries & with_retries_, - LoggerPtr log_) - : zookeeper_path(root_zookeeper_path_ + "/stage") + bool is_restore_, + const String & zookeeper_path_, + const String & current_host_, + const Strings & all_hosts_, + bool allow_concurrency_, + const WithRetries & with_retries_, + ThreadPoolCallbackRunnerUnsafe schedule_, + QueryStatusPtr process_list_element_, + LoggerPtr log_) + : is_restore(is_restore_) + , operation_name(is_restore ? "restore" : "backup") + , current_host(current_host_) + , current_host_desc(getHostDesc(current_host)) + , all_hosts(all_hosts_) + , allow_concurrency(allow_concurrency_) , with_retries(with_retries_) + , schedule(schedule_) + , process_list_element(process_list_element_) , log(log_) + , failure_after_host_disconnected_for_seconds(with_retries.getKeeperSettings().failure_after_host_disconnected_for_seconds) + , finish_timeout_after_error(with_retries.getKeeperSettings().finish_timeout_after_error) + , sync_period_ms(with_retries.getKeeperSettings().sync_period_ms) + , max_attempts_after_bad_version(with_retries.getKeeperSettings().max_attempts_after_bad_version) + , zookeeper_path(zookeeper_path_) + , root_zookeeper_path(zookeeper_path.parent_path().parent_path()) + , operation_node_path(zookeeper_path.parent_path()) + , operation_node_name(zookeeper_path.parent_path().filename()) + , stage_node_path(zookeeper_path) + , start_node_path(zookeeper_path / ("started|" + current_host)) + , finish_node_path(zookeeper_path / ("finished|" + current_host)) + , num_hosts_node_path(zookeeper_path / "num_hosts") + , alive_node_path(zookeeper_path / ("alive|" + current_host)) + , alive_tracker_node_path(fs::path{root_zookeeper_path} / "alive_tracker") + , error_node_path(zookeeper_path / "error") + , zk_nodes_changed(std::make_shared()) { + if ((zookeeper_path.filename() != "stage") || !operation_node_name.starts_with(is_restore ? "restore-" : "backup-") + || (root_zookeeper_path == operation_node_path)) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected path in ZooKeeper specified: {}", zookeeper_path); + } + + initializeState(); createRootNodes(); + + try + { + createStartAndAliveNodes(); + startWatchingThread(); + } + catch (...) 
+ { + trySetError(std::current_exception()); + tryFinishImpl(); + throw; + } } + +BackupCoordinationStageSync::~BackupCoordinationStageSync() +{ + tryFinishImpl(); +} + + +void BackupCoordinationStageSync::initializeState() +{ + std::lock_guard lock{mutex}; + auto now = std::chrono::system_clock::now(); + auto monotonic_now = std::chrono::steady_clock::now(); + + for (const String & host : all_hosts) + state.hosts.emplace(host, HostInfo{.host = host, .last_connection_time = now, .last_connection_time_monotonic = monotonic_now}); +} + + +String BackupCoordinationStageSync::getHostDesc(const String & host) +{ + String res; + if (host.empty()) + { + res = "the initiator"; + } + else + { + try + { + res = "host "; + Poco::URI::decode(host, res); /// Append the decoded host name to `res`. + } + catch (const Poco::URISyntaxException &) + { + res = "host " + host; + } + } + return res; +} + + +String BackupCoordinationStageSync::getHostsDesc(const Strings & hosts) +{ + String res = "["; + for (const String & host : hosts) + { + if (res != "[") + res += ", "; + res += getHostDesc(host); + } + res += "]"; + return res; +} + + void BackupCoordinationStageSync::createRootNodes() { - auto holder = with_retries.createRetriesControlHolder("createRootNodes"); + auto holder = with_retries.createRetriesControlHolder("BackupStageSync::createRootNodes", WithRetries::kInitialization); holder.retries_ctl.retryLoop( [&, &zookeeper = holder.faulty_zookeeper]() + { + with_retries.renewZooKeeper(zookeeper); + zookeeper->createAncestors(root_zookeeper_path); + zookeeper->createIfNotExists(root_zookeeper_path, ""); + }); +} + + +void BackupCoordinationStageSync::createStartAndAliveNodes() +{ + auto holder = with_retries.createRetriesControlHolder("BackupStageSync::createStartAndAliveNodes", WithRetries::kInitialization); + holder.retries_ctl.retryLoop([&, &zookeeper = holder.faulty_zookeeper]() { with_retries.renewZooKeeper(zookeeper); - zookeeper->createAncestors(zookeeper_path); - zookeeper->createIfNotExists(zookeeper_path, ""); + createStartAndAliveNodes(zookeeper); }); } -void BackupCoordinationStageSync::set(const String & current_host, const String & new_stage, const String & message, const bool & all_hosts) -{ - auto holder = with_retries.createRetriesControlHolder("set"); - holder.retries_ctl.retryLoop( - [&, &zookeeper = holder.faulty_zookeeper]() - { - with_retries.renewZooKeeper(zookeeper); - if (all_hosts) +void BackupCoordinationStageSync::createStartAndAliveNodes(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper) +{ + /// The "num_hosts" node keeps the number of hosts which started (created the "started" node) + /// but not yet finished (not created the "finished" node). + /// The number of alive hosts can be less than that. + + /// The "alive_tracker" node always keeps an empty string, we track its version only. + /// The "alive_tracker" node increases its version each time when any "alive" nodes are created + /// so we use it to check concurrent backups/restores. 
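// A minimal sketch of the fencing idiom described above, reduced to plain C++ with
// illustrative names: a writer remembers the version it saw while performing the
// concurrency check and commits only if that version is still current, just like
// pinning the 'alive_tracker' version in the multi-op below. A failed commit means
// another backup or restore created an 'alive' node in between, so recheck and retry.

#include <atomic>

struct VersionedFence
{
    std::atomic<int> version{0};

    int read() const { return version.load(); }

    /// Mirrors a versioned set request: succeeds only if nobody bumped the
    /// version since `seen`, otherwise the caller must recheck and retry.
    bool commitIfUnchanged(int seen)
    {
        int expected = seen;
        return version.compare_exchange_strong(expected, seen + 1);
    }
};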
+    zookeeper->createIfNotExists(alive_tracker_node_path, "");
+
+    std::optional<size_t> num_hosts;
+    int num_hosts_version = -1;
+
+    bool check_concurrency = !allow_concurrency;
+    int alive_tracker_version = -1;
+
+    for (size_t attempt_no = 1; attempt_no <= max_attempts_after_bad_version; ++attempt_no)
+    {
+        if (!num_hosts)
+        {
+            String num_hosts_str;
+            Coordination::Stat stat;
+            if (zookeeper->tryGet(num_hosts_node_path, num_hosts_str, &stat))
+            {
+                num_hosts = parseFromString<size_t>(num_hosts_str);
+                num_hosts_version = stat.version;
+            }
+        }
+
+        String serialized_error;
+        if (zookeeper->tryGet(error_node_path, serialized_error))
+        {
+            auto [exception, host] = parseErrorNode(serialized_error);
+            if (exception)
+                std::rethrow_exception(exception);
+        }
+
+        if (check_concurrency)
+        {
+            Coordination::Stat stat;
+            zookeeper->exists(alive_tracker_node_path, &stat);
+            alive_tracker_version = stat.version;
+
+            checkConcurrency(zookeeper);
+            check_concurrency = false;
+        }
+
+        Coordination::Requests requests;
+        requests.reserve(6);
+
+        size_t operation_node_path_pos = static_cast<size_t>(-1);
+        if (!zookeeper->exists(operation_node_path))
+        {
+            operation_node_path_pos = requests.size();
+            requests.emplace_back(zkutil::makeCreateRequest(operation_node_path, "", zkutil::CreateMode::Persistent));
+        }
+
+        size_t stage_node_path_pos = static_cast<size_t>(-1);
+        if (!zookeeper->exists(stage_node_path))
+        {
+            stage_node_path_pos = requests.size();
+            requests.emplace_back(zkutil::makeCreateRequest(stage_node_path, "", zkutil::CreateMode::Persistent));
+        }
+
+        size_t num_hosts_node_path_pos = requests.size();
+        if (num_hosts)
+            requests.emplace_back(zkutil::makeSetRequest(num_hosts_node_path, toString(*num_hosts + 1), num_hosts_version));
+        else
+            requests.emplace_back(zkutil::makeCreateRequest(num_hosts_node_path, "1", zkutil::CreateMode::Persistent));
+
+        size_t alive_tracker_node_path_pos = requests.size();
+        requests.emplace_back(zkutil::makeSetRequest(alive_tracker_node_path, "", alive_tracker_version));
+
+        requests.emplace_back(zkutil::makeCreateRequest(start_node_path, std::to_string(kCurrentVersion), zkutil::CreateMode::Persistent));
+        requests.emplace_back(zkutil::makeCreateRequest(alive_node_path, "", zkutil::CreateMode::Ephemeral));
+
+        Coordination::Responses responses;
+        auto code = zookeeper->tryMulti(requests, responses);
+
+        if (code == Coordination::Error::ZOK)
+        {
+            LOG_INFO(log, "Created start node #{} in ZooKeeper for {} (coordination version: {})",
+                     num_hosts.value_or(0) + 1, current_host_desc, kCurrentVersion);
+            return;
+        }
+
+        auto show_error_before_next_attempt = [&](const String & message)
+        {
+            bool will_try_again = (attempt_no < max_attempts_after_bad_version);
+            LOG_TRACE(log, "{} (attempt #{}){}", message, attempt_no, will_try_again ? ", will try again" : "");
+        };
+
+        if ((responses.size() > operation_node_path_pos) &&
+            (responses[operation_node_path_pos]->error == Coordination::Error::ZNODEEXISTS))
+        {
+            show_error_before_next_attempt(fmt::format("Node {} in ZooKeeper already exists", operation_node_path));
+            /// needs another attempt
+        }
+        else if ((responses.size() > stage_node_path_pos) &&
+                 (responses[stage_node_path_pos]->error == Coordination::Error::ZNODEEXISTS))
+        {
+            show_error_before_next_attempt(fmt::format("Node {} in ZooKeeper already exists", stage_node_path));
+            /// needs another attempt
+        }
+        else if ((responses.size() > num_hosts_node_path_pos) && num_hosts &&
+                 (responses[num_hosts_node_path_pos]->error == Coordination::Error::ZBADVERSION))
+        {
+            show_error_before_next_attempt("Other host changed the 'num_hosts' node in ZooKeeper");
+            num_hosts.reset(); /// needs to reread 'num_hosts' again
+        }
+        else if ((responses.size() > num_hosts_node_path_pos) && num_hosts &&
+                 (responses[num_hosts_node_path_pos]->error == Coordination::Error::ZNONODE))
+        {
+            show_error_before_next_attempt("Other host removed the 'num_hosts' node in ZooKeeper");
+            num_hosts.reset(); /// needs to reread 'num_hosts' again
+        }
+        else if ((responses.size() > num_hosts_node_path_pos) && !num_hosts &&
+                 (responses[num_hosts_node_path_pos]->error == Coordination::Error::ZNODEEXISTS))
+        {
+            show_error_before_next_attempt("Other host created the 'num_hosts' node in ZooKeeper");
+            /// needs another attempt
+        }
+        else if ((responses.size() > alive_tracker_node_path_pos) &&
+                 (responses[alive_tracker_node_path_pos]->error == Coordination::Error::ZBADVERSION))
+        {
+            show_error_before_next_attempt("Concurrent backup or restore changed some 'alive' nodes in ZooKeeper");
+            check_concurrency = true; /// needs to recheck for concurrency again
+        }
         else
         {
-            zookeeper->createIfNotExists(zookeeper_path + "/started|" + current_host, "");
-            zookeeper->createIfNotExists(zookeeper_path + "/current|" + current_host + "|" + new_stage, message);
+            zkutil::KeeperMultiException::check(code, requests, responses);
         }
+    }
+
+    throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE,
+                    "Couldn't create the 'start' node in ZooKeeper for {} after {} attempts",
+                    current_host_desc, max_attempts_after_bad_version);
+}
+
+
+void BackupCoordinationStageSync::checkConcurrency(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper)
+{
+    if (allow_concurrency)
+        return;
+
+    Strings found_operations;
+    auto code = zookeeper->tryGetChildren(root_zookeeper_path, found_operations);
+
+    if (!((code == Coordination::Error::ZOK) || (code == Coordination::Error::ZNONODE)))
+        throw zkutil::KeeperException::fromPath(code, root_zookeeper_path);
+
+    if (code == Coordination::Error::ZNONODE)
+        return;
+
+    for (const String & found_operation : found_operations)
+    {
+        if (found_operation.starts_with(is_restore ? "restore-" : "backup-") && (found_operation != operation_node_name))
+        {
+            Strings stages;
+            code = zookeeper->tryGetChildren(fs::path{root_zookeeper_path} / found_operation / "stage", stages);
+
+            if (!((code == Coordination::Error::ZOK) || (code == Coordination::Error::ZNONODE)))
+                throw zkutil::KeeperException::fromPath(code, fs::path{root_zookeeper_path} / found_operation / "stage");
+
+            if (code == Coordination::Error::ZOK)
+            {
+                for (const String & stage : stages)
+                {
+                    if (stage.starts_with("alive"))
+                        BackupConcurrencyCheck::throwConcurrentOperationNotAllowed(is_restore);
+                }
+            }
+        }
+    }
+}
+
+
+void BackupCoordinationStageSync::startWatchingThread()
+{
+    watching_thread_future = schedule([this]() { watchingThread(); }, Priority{});
+}
+
+
+void BackupCoordinationStageSync::stopWatchingThread()
+{
+    should_stop_watching_thread = true;
+
+    /// Wake up waiting threads.
+    if (zk_nodes_changed)
+        zk_nodes_changed->set();
+    state_changed.notify_all();
+
+    if (watching_thread_future.valid())
+        watching_thread_future.wait();
+}
+
+
+void BackupCoordinationStageSync::watchingThread()
+{
+    while (!should_stop_watching_thread)
+    {
+        try
+        {
+            /// Check if the current BACKUP or RESTORE command is already cancelled.
+            checkIfQueryCancelled();
+
+            /// Reset the `connected` flag for each host; we'll set them to true again after we find the 'alive' nodes.
+            resetConnectedFlag();
+
+            /// Recreate the 'alive' node if necessary and read a new state from ZooKeeper.
+            auto holder = with_retries.createRetriesControlHolder("BackupStageSync::watchingThread");
+            auto & zookeeper = holder.faulty_zookeeper;
+            with_retries.renewZooKeeper(zookeeper);
+
+            if (should_stop_watching_thread)
+                return;
+
+            /// Recreate the 'alive' node if it was removed.
+            createAliveNode(zookeeper);
+
+            /// Reads the current state from nodes in ZooKeeper.
+            readCurrentState(zookeeper);
+        }
+        catch (...)
+        {
+            tryLogCurrentException(log, "Caught exception while watching");
+        }
+
+        try
+        {
+            /// Cancel the query if there is an error on another host or if some host was disconnected for too long.
+            cancelQueryIfError();
+            cancelQueryIfDisconnectedTooLong();
+        }
+        catch (...)
+        {
+            tryLogCurrentException(log, "Caught exception while checking if the query should be cancelled");
+        }
+
+        zk_nodes_changed->tryWait(sync_period_ms.count());
+    }
+}
+
+
+void BackupCoordinationStageSync::createAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper)
+{
+    if (zookeeper->exists(alive_node_path))
+        return;
+
+    Coordination::Requests requests;
+    requests.emplace_back(zkutil::makeCreateRequest(alive_node_path, "", zkutil::CreateMode::Ephemeral));
+    requests.emplace_back(zkutil::makeSetRequest(alive_tracker_node_path, "", -1));
+    zookeeper->multi(requests);
+
+    LOG_INFO(log, "The alive node was recreated for {}", current_host_desc);
+}
+
+
+void BackupCoordinationStageSync::resetConnectedFlag()
+{
+    std::lock_guard lock{mutex};
+    for (auto & [_, host_info] : state.hosts)
+        host_info.connected = false;
+}
+
+
+void BackupCoordinationStageSync::readCurrentState(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper)
+{
+    zk_nodes_changed->reset();
+
+    /// Get zk nodes and subscribe on their changes.
+    Strings new_zk_nodes = zookeeper->getChildren(stage_node_path, nullptr, zk_nodes_changed);
+    std::sort(new_zk_nodes.begin(), new_zk_nodes.end()); /// Sorting is necessary because we compare the list of zk nodes with its previous versions.
+ + State new_state; + + { + std::lock_guard lock{mutex}; + + /// Log all changes in zookeeper nodes in the "stage" folder to make debugging easier. + Strings added_zk_nodes, removed_zk_nodes; + std::set_difference(new_zk_nodes.begin(), new_zk_nodes.end(), zk_nodes.begin(), zk_nodes.end(), back_inserter(added_zk_nodes)); + std::set_difference(zk_nodes.begin(), zk_nodes.end(), new_zk_nodes.begin(), new_zk_nodes.end(), back_inserter(removed_zk_nodes)); + if (!added_zk_nodes.empty()) + LOG_TRACE(log, "Detected new zookeeper nodes appeared in the stage folder: {}", boost::algorithm::join(added_zk_nodes, ", ")); + if (!removed_zk_nodes.empty()) + LOG_TRACE(log, "Detected that some zookeeper nodes disappeared from the stage folder: {}", boost::algorithm::join(removed_zk_nodes, ", ")); + + zk_nodes = new_zk_nodes; + new_state = state; + } + + auto get_host_info = [&](const String & host) -> HostInfo * + { + auto it = new_state.hosts.find(host); + if (it == new_state.hosts.end()) + return nullptr; + return &it->second; + }; + + auto now = std::chrono::system_clock::now(); + auto monotonic_now = std::chrono::steady_clock::now(); + + /// Read the current state from zookeeper nodes. + for (const auto & zk_node : new_zk_nodes) + { + if (zk_node == "error") + { + if (!new_state.host_with_error) + { + String serialized_error = zookeeper->get(error_node_path); + auto [exception, host] = parseErrorNode(serialized_error); + if (auto * host_info = get_host_info(host)) + { + host_info->exception = exception; + new_state.host_with_error = host; + } + } + } + else if (zk_node.starts_with("started|")) + { + String host = zk_node.substr(strlen("started|")); + if (auto * host_info = get_host_info(host)) + { + if (!host_info->started) + { + host_info->version = parseStartNode(zookeeper->get(zookeeper_path / zk_node), host); + host_info->started = true; + } + } + } + else if (zk_node.starts_with("finished|")) + { + String host = zk_node.substr(strlen("finished|")); + if (auto * host_info = get_host_info(host)) + host_info->finished = true; + } + else if (zk_node.starts_with("alive|")) + { + String host = zk_node.substr(strlen("alive|")); + if (auto * host_info = get_host_info(host)) + { + host_info->connected = true; + host_info->last_connection_time = now; + host_info->last_connection_time_monotonic = monotonic_now; + } + } + else if (zk_node.starts_with("current|")) + { + String host_and_stage = zk_node.substr(strlen("current|")); + size_t separator_pos = host_and_stage.find('|'); + if (separator_pos != String::npos) + { + String host = host_and_stage.substr(0, separator_pos); + String stage = host_and_stage.substr(separator_pos + 1); + if (auto * host_info = get_host_info(host)) + { + String result = zookeeper->get(fs::path{zookeeper_path} / zk_node); + host_info->stages[stage] = std::move(result); + + /// The initial version didn't create the 'finish' ZooKeeper nodes so + /// we consider that if the "completed" stage is reached by a host then the host has finished its work. + /// This assumption is not correct if an error happens, but the initial version can't handle errors quite + /// correctly anyway. + if ((host_info->version == kInitialVersion) && (stage == BackupCoordinationStage::COMPLETED)) + host_info->finished = true; + } + } + } + } + + /// Check if the state has been just changed, and if so then wake up waiting threads (see waitHostsReachStage()). 
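// A minimal sketch of the wake-up idiom implemented just below, using an illustrative
// snapshot type: compare the freshly read state with the stored one under the mutex,
// move it in, and notify waiters only when something actually changed, so blocked
// waiters re-evaluate their predicates exactly when there is new information.

#include <condition_variable>
#include <mutex>
#include <utility>

struct SnapshotExample
{
    int value = 0;
    bool operator==(const SnapshotExample &) const = default;
};

struct StatePublisherExample
{
    std::mutex mutex;
    std::condition_variable state_changed;
    SnapshotExample state;

    void publish(SnapshotExample new_state)
    {
        bool changed = false;
        {
            std::lock_guard lock{mutex};
            changed = (new_state != state);
            state = std::move(new_state);
        }
        if (changed)
            state_changed.notify_all();  /// wake every waiter; each rechecks its own predicate
    }
};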
+    bool was_state_changed = false;
+
+    {
+        std::lock_guard lock{mutex};
+        was_state_changed = (new_state != state);
+        state = std::move(new_state);
+    }
+
+    if (was_state_changed)
+        state_changed.notify_all();
+}
+
+
+int BackupCoordinationStageSync::parseStartNode(const String & start_node_contents, const String & host) const
+{
+    int version;
+    if (start_node_contents.empty())
+    {
+        version = kInitialVersion;
+    }
+    else if (!tryParse(version, start_node_contents) || (version < kInitialVersion))
+    {
+        throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE,
+                        "Coordination version {} used by {} is not supported", start_node_contents, getHostDesc(host));
+    }
+
+    if (version < kCurrentVersion)
+        LOG_WARNING(log, "Coordination version {} used by {} is outdated", version, getHostDesc(host));
+    return version;
+}
+
+
+std::pair<std::exception_ptr, String> BackupCoordinationStageSync::parseErrorNode(const String & error_node_contents)
+{
+    ReadBufferFromOwnString buf{error_node_contents};
+    String host;
+    readStringBinary(host, buf);
+    auto exception = std::make_exception_ptr(readException(buf, fmt::format("Got error from {}", getHostDesc(host))));
+    return {exception, host};
+}
+
+
+void BackupCoordinationStageSync::checkIfQueryCancelled()
+{
+    if (process_list_element->checkTimeLimitSoft())
+        return; /// Not cancelled.
+
+    std::lock_guard lock{mutex};
+    if (state.cancelled)
+        return; /// Already marked as cancelled.
+
+    state.cancelled = true;
+    state_changed.notify_all();
+}
+
+
+void BackupCoordinationStageSync::cancelQueryIfError()
+{
+    std::exception_ptr exception;
+
+    {
+        std::lock_guard lock{mutex};
+        if (state.cancelled || !state.host_with_error)
+            return;
+
+        state.cancelled = true;
+        exception = state.hosts.at(*state.host_with_error).exception;
+    }
+
+    process_list_element->cancelQuery(false, exception);
+    state_changed.notify_all();
+}
+
+
+void BackupCoordinationStageSync::cancelQueryIfDisconnectedTooLong()
+{
+    std::exception_ptr exception;
+
+    {
+        std::lock_guard lock{mutex};
+        if (state.cancelled || state.host_with_error || ((failure_after_host_disconnected_for_seconds.count() == 0)))
+            return;
+
+        auto monotonic_now = std::chrono::steady_clock::now();
+        bool info_shown = false;
+
+        for (auto & [host, host_info] : state.hosts)
+        {
+            if (!host_info.connected && !host_info.finished && (host != current_host))
+            {
+                auto disconnected_duration = std::chrono::duration_cast<std::chrono::seconds>(monotonic_now - host_info.last_connection_time_monotonic);
+                if (disconnected_duration > failure_after_host_disconnected_for_seconds)
+                {
+                    /// Host `host` was disconnected for too long.
+                    /// We can't just throw an exception here because readCurrentState() is called from a background thread.
+                    /// So here we're writing the error to the `process_list_element` and letting it be thrown later
+                    /// from `process_list_element->checkTimeLimit()`.
+                    String message = fmt::format("The 'alive' node hasn't been updated in ZooKeeper for {} for {} "
+                                                 "which is more than the specified timeout {}. 
Last time the 'alive' node was detected at {}", + getHostDesc(host), disconnected_duration, failure_after_host_disconnected_for_seconds, + host_info.last_connection_time); + LOG_WARNING(log, "Lost connection to {}: {}", getHostDesc(host), message); + exception = std::make_exception_ptr(Exception{ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "Lost connection to {}: {}", getHostDesc(host), message}); + break; + } + + if ((disconnected_duration >= std::chrono::seconds{1}) && !info_shown) + { + LOG_TRACE(log, "The 'alive' node hasn't been updated in ZooKeeper for {} for {}", getHostDesc(host), disconnected_duration); + info_shown = true; + } + } + } + + if (!exception) + return; + + state.cancelled = true; + } + + process_list_element->cancelQuery(false, exception); + state_changed.notify_all(); +} + + +void BackupCoordinationStageSync::setStage(const String & stage, const String & stage_result) +{ + LOG_INFO(log, "{} reached stage {}", current_host_desc, stage); + auto holder = with_retries.createRetriesControlHolder("BackupStageSync::setStage"); + holder.retries_ctl.retryLoop([&, &zookeeper = holder.faulty_zookeeper]() + { + with_retries.renewZooKeeper(zookeeper); + zookeeper->createIfNotExists(getStageNodePath(stage), stage_result); }); } -void BackupCoordinationStageSync::setError(const String & current_host, const Exception & exception) + +String BackupCoordinationStageSync::getStageNodePath(const String & stage) const { - auto holder = with_retries.createRetriesControlHolder("setError"); - holder.retries_ctl.retryLoop( - [&, &zookeeper = holder.faulty_zookeeper]() + return fs::path{zookeeper_path} / ("current|" + current_host + "|" + stage); +} + + +bool BackupCoordinationStageSync::trySetError(std::exception_ptr exception) noexcept +{ + try + { + std::rethrow_exception(exception); + } + catch (const Exception & e) + { + return trySetError(e); + } + catch (...) + { + return trySetError(Exception(getCurrentExceptionMessageAndPattern(true, true), getCurrentExceptionCode())); + } +} + + +bool BackupCoordinationStageSync::trySetError(const Exception & exception) +{ + try + { + setError(exception); + return true; + } + catch (...) + { + return false; + } +} + + +void BackupCoordinationStageSync::setError(const Exception & exception) +{ + /// Most likely this exception has been already logged so here we're logging it without stacktrace. + String exception_message = getExceptionMessage(exception, /* with_stacktrace= */ false, /* check_embedded_stacktrace= */ true); + LOG_INFO(log, "Sending exception from {} to other hosts: {}", current_host_desc, exception_message); + + auto holder = with_retries.createRetriesControlHolder("BackupStageSync::setError", WithRetries::kErrorHandling); + + holder.retries_ctl.retryLoop([&, &zookeeper = holder.faulty_zookeeper]() { with_retries.renewZooKeeper(zookeeper); WriteBufferFromOwnString buf; writeStringBinary(current_host, buf); writeException(exception, buf, true); - zookeeper->createIfNotExists(zookeeper_path + "/error", buf.str()); + auto code = zookeeper->tryCreate(error_node_path, buf.str(), zkutil::CreateMode::Persistent); - /// When backup/restore fails, it removes the nodes from Zookeeper. - /// Sometimes it fails to remove all nodes. It's possible that it removes /error node, but fails to remove /stage node, - /// so the following line tries to preserve the error status. 
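// A minimal sketch of the "first error wins" protocol that tryCreate() on the
// persistent 'error' node provides: exactly one host's create succeeds (ZOK), and
// every later attempt sees ZNODEEXISTS and merely logs that an error is already
// assigned. Reduced here to plain C++ with illustrative names.

#include <mutex>
#include <optional>
#include <string>
#include <utility>

struct FirstErrorWins
{
    std::mutex mutex;
    std::optional<std::pair<std::string, std::string>> error;  /// (host, message)

    /// Returns true for the host whose error got recorded (like ZOK),
    /// false when another host already reported one (like ZNODEEXISTS).
    bool trySet(std::string host, std::string message)
    {
        std::lock_guard lock{mutex};
        if (error)
            return false;
        error.emplace(std::move(host), std::move(message));
        return true;
    }
};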
- auto code = zookeeper->trySet(zookeeper_path, Stage::ERROR); - if (code != Coordination::Error::ZOK) - throw zkutil::KeeperException::fromPath(code, zookeeper_path); + if (code == Coordination::Error::ZOK) + { + LOG_TRACE(log, "Sent exception from {} to other hosts", current_host_desc); + } + else if (code == Coordination::Error::ZNODEEXISTS) + { + LOG_INFO(log, "An error has been already assigned for this {}", operation_name); + } + else + { + throw zkutil::KeeperException::fromPath(code, error_node_path); + } }); } -Strings BackupCoordinationStageSync::wait(const Strings & all_hosts, const String & stage_to_wait) + +Strings BackupCoordinationStageSync::waitForHostsToReachStage(const String & stage_to_wait, const Strings & hosts, std::optional timeout) const { - return waitImpl(all_hosts, stage_to_wait, {}); -} - -Strings BackupCoordinationStageSync::waitFor(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout) -{ - return waitImpl(all_hosts, stage_to_wait, timeout); -} - -namespace -{ - struct UnreadyHost - { - String host; - bool started = false; - }; -} - -struct BackupCoordinationStageSync::State -{ - std::optional results; - std::optional> error; - std::optional disconnected_host; - std::optional unready_host; -}; - -BackupCoordinationStageSync::State BackupCoordinationStageSync::readCurrentState( - WithRetries::RetriesControlHolder & retries_control_holder, - const Strings & zk_nodes, - const Strings & all_hosts, - const String & stage_to_wait) const -{ - auto zookeeper = retries_control_holder.faulty_zookeeper; - auto & retries_ctl = retries_control_holder.retries_ctl; - - std::unordered_set zk_nodes_set{zk_nodes.begin(), zk_nodes.end()}; - - State state; - if (zk_nodes_set.contains("error")) - { - String errors = zookeeper->get(zookeeper_path + "/error"); - ReadBufferFromOwnString buf{errors}; - String host; - readStringBinary(host, buf); - state.error = std::make_pair(host, readException(buf, fmt::format("Got error from {}", host))); - return state; - } - - std::optional unready_host; - - for (const auto & host : all_hosts) - { - if (!zk_nodes_set.contains("current|" + host + "|" + stage_to_wait)) - { - const String started_node_name = "started|" + host; - const String alive_node_name = "alive|" + host; - - bool started = zk_nodes_set.contains(started_node_name); - bool alive = zk_nodes_set.contains(alive_node_name); - - if (!alive) - { - /// If the "alive" node doesn't exist then we don't have connection to the corresponding host. - /// This node is ephemeral so probably it will be recreated soon. We use zookeeper retries to wait. - /// In worst case when we won't manage to see the alive node for a long time we will just abort the backup. - const auto * const suffix = retries_ctl.isLastRetry() ? 
"" : ", will retry"; - if (started) - retries_ctl.setUserError(Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, - "Lost connection to host {}{}", host, suffix)); - else - retries_ctl.setUserError(Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, - "No connection to host {} yet{}", host, suffix)); - - state.disconnected_host = host; - return state; - } - - if (!unready_host) - unready_host.emplace(UnreadyHost{.host = host, .started = started}); - } - } - - if (unready_host) - { - state.unready_host = std::move(unready_host); - return state; - } - Strings results; - for (const auto & host : all_hosts) - results.emplace_back(zookeeper->get(zookeeper_path + "/current|" + host + "|" + stage_to_wait)); - state.results = std::move(results); + results.resize(hosts.size()); - return state; + std::unique_lock lock{mutex}; + + /// TSA_NO_THREAD_SAFETY_ANALYSIS is here because Clang Thread Safety Analysis doesn't understand std::unique_lock. + auto check_if_hosts_ready = [&](bool time_is_out) TSA_NO_THREAD_SAFETY_ANALYSIS + { + return checkIfHostsReachStage(hosts, stage_to_wait, time_is_out, timeout, results); + }; + + if (timeout) + { + if (!state_changed.wait_for(lock, *timeout, [&] { return check_if_hosts_ready(/* time_is_out = */ false); })) + check_if_hosts_ready(/* time_is_out = */ true); + } + else + { + state_changed.wait(lock, [&] { return check_if_hosts_ready(/* time_is_out = */ false); }); + } + + return results; } -Strings BackupCoordinationStageSync::waitImpl( - const Strings & all_hosts, const String & stage_to_wait, std::optional timeout) const + +bool BackupCoordinationStageSync::checkIfHostsReachStage( + const Strings & hosts, + const String & stage_to_wait, + bool time_is_out, + std::optional timeout, + Strings & results) const { - if (all_hosts.empty()) - return {}; + if (should_stop_watching_thread) + throw Exception(ErrorCodes::LOGICAL_ERROR, "finish() was called while waiting for a stage"); - /// Wait until all hosts are ready or an error happens or time is out. + process_list_element->checkTimeLimit(); - bool use_timeout = timeout.has_value(); - std::chrono::steady_clock::time_point end_of_timeout; - if (use_timeout) - end_of_timeout = std::chrono::steady_clock::now() + std::chrono::duration_cast(*timeout); - - State state; - for (;;) + for (size_t i = 0; i != hosts.size(); ++i) { - LOG_INFO(log, "Waiting for the stage {}", stage_to_wait); - /// Set by ZooKepper when list of zk nodes have changed. - auto watch = std::make_shared(); - Strings zk_nodes; - { - auto holder = with_retries.createRetriesControlHolder("waitImpl"); - holder.retries_ctl.retryLoop( - [&, &zookeeper = holder.faulty_zookeeper]() - { - with_retries.renewZooKeeper(zookeeper); - watch->reset(); - /// Get zk nodes and subscribe on their changes. - zk_nodes = zookeeper->getChildren(zookeeper_path, nullptr, watch); + const String & host = hosts[i]; + auto it = state.hosts.find(host); - /// Read the current state of zk nodes. - state = readCurrentState(holder, zk_nodes, all_hosts, stage_to_wait); - }); + if (it == state.hosts.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "waitForHostsToReachStage() was called for unexpected {}, all hosts are {}", getHostDesc(host), getHostsDesc(all_hosts)); + + const HostInfo & host_info = it->second; + auto stage_it = host_info.stages.find(stage_to_wait); + if (stage_it != host_info.stages.end()) + { + results[i] = stage_it->second; + continue; } - /// Analyze the current state of zk nodes. 
- chassert(state.results || state.error || state.disconnected_host || state.unready_host); - - if (state.results || state.error || state.disconnected_host) - break; /// Everything is ready or error happened. - - /// Log what we will wait. - const auto & unready_host = *state.unready_host; - LOG_INFO(log, "Waiting on ZooKeeper watch for any node to be changed (currently waiting for host {}{})", - unready_host.host, - (!unready_host.started ? " which didn't start the operation yet" : "")); - - /// Wait until `watch_callback` is called by ZooKeeper meaning that zk nodes have changed. + if (host_info.finished) { - if (use_timeout) + throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, + "{} finished without coming to stage {}", getHostDesc(host), stage_to_wait); + } + + String host_status; + if (!host_info.started) + host_status = fmt::format(": the host hasn't started working on this {} yet", operation_name); + else if (!host_info.connected) + host_status = fmt::format(": the host is currently disconnected, last connection was at {}", host_info.last_connection_time); + + if (!time_is_out) + { + LOG_TRACE(log, "Waiting for {} to reach stage {}{}", getHostDesc(host), stage_to_wait, host_status); + return false; + } + else + { + throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, + "Waited longer than timeout {} for {} to reach stage {}{}", + *timeout, getHostDesc(host), stage_to_wait, host_status); + } + } + + LOG_INFO(log, "Hosts {} reached stage {}", getHostsDesc(hosts), stage_to_wait); + return true; +} + + +void BackupCoordinationStageSync::finish(bool & other_hosts_also_finished) +{ + tryFinishImpl(other_hosts_also_finished, /* throw_if_error = */ true, /* retries_kind = */ WithRetries::kNormal); +} + + +bool BackupCoordinationStageSync::tryFinishAfterError(bool & other_hosts_also_finished) noexcept +{ + return tryFinishImpl(other_hosts_also_finished, /* throw_if_error = */ false, /* retries_kind = */ WithRetries::kErrorHandling); +} + + +bool BackupCoordinationStageSync::tryFinishImpl() +{ + bool other_hosts_also_finished; + return tryFinishAfterError(other_hosts_also_finished); +} + + +bool BackupCoordinationStageSync::tryFinishImpl(bool & other_hosts_also_finished, bool throw_if_error, WithRetries::Kind retries_kind) +{ + auto get_value_other_hosts_also_finished = [&] TSA_REQUIRES(mutex) + { + other_hosts_also_finished = true; + for (const auto & [host, host_info] : state.hosts) + { + if ((host != current_host) && !host_info.finished) + other_hosts_also_finished = false; + } + }; + + { + std::lock_guard lock{mutex}; + if (finish_result.succeeded) + { + get_value_other_hosts_also_finished(); + return true; + } + if (finish_result.exception) + { + if (throw_if_error) + std::rethrow_exception(finish_result.exception); + return false; + } + } + + try + { + stopWatchingThread(); + + auto holder = with_retries.createRetriesControlHolder("BackupStageSync::finish", retries_kind); + holder.retries_ctl.retryLoop([&, &zookeeper = holder.faulty_zookeeper]() + { + with_retries.renewZooKeeper(zookeeper); + createFinishNodeAndRemoveAliveNode(zookeeper); + }); + + std::lock_guard lock{mutex}; + finish_result.succeeded = true; + get_value_other_hosts_also_finished(); + return true; + } + catch (...) 
+    {
+        LOG_TRACE(log, "Caught exception while creating the 'finish' node for {}: {}",
+                  current_host_desc,
+                  getCurrentExceptionMessage(/* with_stacktrace= */ false, /* check_embedded_stacktrace= */ true));
+
+        std::lock_guard lock{mutex};
+        finish_result.exception = std::current_exception();
+        if (throw_if_error)
+            throw;
+        return false;
+    }
+}
+
+
+void BackupCoordinationStageSync::createFinishNodeAndRemoveAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper)
+{
+    if (zookeeper->exists(finish_node_path))
+        return;
+
+    std::optional<size_t> num_hosts;
+    int num_hosts_version = -1;
+
+    for (size_t attempt_no = 1; attempt_no <= max_attempts_after_bad_version; ++attempt_no)
+    {
+        if (!num_hosts)
+        {
+            Coordination::Stat stat;
+            num_hosts = parseFromString<size_t>(zookeeper->get(num_hosts_node_path, &stat));
+            num_hosts_version = stat.version;
+        }
+
+        Coordination::Requests requests;
+        requests.reserve(3);
+
+        requests.emplace_back(zkutil::makeCreateRequest(finish_node_path, "", zkutil::CreateMode::Persistent));
+
+        size_t num_hosts_node_path_pos = requests.size();
+        requests.emplace_back(zkutil::makeSetRequest(num_hosts_node_path, toString(*num_hosts - 1), num_hosts_version));
+
+        size_t alive_node_path_pos = static_cast<size_t>(-1);
+        if (zookeeper->exists(alive_node_path))
+        {
+            alive_node_path_pos = requests.size();
+            requests.emplace_back(zkutil::makeRemoveRequest(alive_node_path, -1));
+        }
+
+        Coordination::Responses responses;
+        auto code = zookeeper->tryMulti(requests, responses);
+
+        if (code == Coordination::Error::ZOK)
+        {
+            --*num_hosts;
+            String hosts_left_desc = ((*num_hosts == 0) ? "no hosts left" : fmt::format("{} hosts left", *num_hosts));
+            LOG_INFO(log, "Created the 'finish' node in ZooKeeper for {}, {}", current_host_desc, hosts_left_desc);
+            return;
+        }
+
+        auto show_error_before_next_attempt = [&](const String & message)
+        {
+            bool will_try_again = (attempt_no < max_attempts_after_bad_version);
+            LOG_TRACE(log, "{} (attempt #{}){}", message, attempt_no, will_try_again ? ", will try again" : "");
+        };
+
+        if ((responses.size() > num_hosts_node_path_pos) &&
+            (responses[num_hosts_node_path_pos]->error == Coordination::Error::ZBADVERSION))
+        {
+            show_error_before_next_attempt("Other host changed the 'num_hosts' node in ZooKeeper");
+            num_hosts.reset(); /// needs to reread 'num_hosts' again
+        }
+        else if ((responses.size() > alive_node_path_pos) &&
+                 (responses[alive_node_path_pos]->error == Coordination::Error::ZNONODE))
+        {
+            show_error_before_next_attempt(fmt::format("Node {} in ZooKeeper doesn't exist", alive_node_path));
+            /// needs another attempt
+        }
+        else
+        {
+            zkutil::KeeperMultiException::check(code, requests, responses);
+        }
+    }
+
+    throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE,
+                    "Couldn't create the 'finish' node for {} after {} attempts",
+                    current_host_desc, max_attempts_after_bad_version);
+}
+
+
+void BackupCoordinationStageSync::waitForOtherHostsToFinish() const
+{
+    tryWaitForOtherHostsToFinishImpl(/* reason = */ "", /* throw_if_error = */ true, /* timeout = */ {});
+}
+
+
+bool BackupCoordinationStageSync::tryWaitForOtherHostsToFinishAfterError() const noexcept
+{
+    std::optional<std::chrono::seconds> timeout;
+    if (finish_timeout_after_error.count() != 0)
+        timeout = finish_timeout_after_error;
+
+    String reason = fmt::format("{} needs other hosts to finish before cleanup", current_host_desc);
+    return tryWaitForOtherHostsToFinishImpl(reason, /* throw_if_error = */ false, timeout);
+}
+
+
+bool BackupCoordinationStageSync::tryWaitForOtherHostsToFinishImpl(const String & reason, bool throw_if_error, std::optional<std::chrono::seconds> timeout) const
+{
+    std::unique_lock lock{mutex};
+
+    /// TSA_NO_THREAD_SAFETY_ANALYSIS is here because Clang Thread Safety Analysis doesn't understand std::unique_lock.
+    auto check_if_other_hosts_finish = [&](bool time_is_out) TSA_NO_THREAD_SAFETY_ANALYSIS
+    {
+        return checkIfOtherHostsFinish(reason, throw_if_error, time_is_out, timeout);
+    };
+
+    if (timeout)
+    {
+        if (state_changed.wait_for(lock, *timeout, [&] { return check_if_other_hosts_finish(/* time_is_out = */ false); }))
+            return true;
+        return check_if_other_hosts_finish(/* time_is_out = */ true);
+    }
+    else
+    {
+        state_changed.wait(lock, [&] { return check_if_other_hosts_finish(/* time_is_out = */ false); });
+        return true;
+    }
+}
+
+
+bool BackupCoordinationStageSync::checkIfOtherHostsFinish(const String & reason, bool throw_if_error, bool time_is_out, std::optional<std::chrono::seconds> timeout) const
+{
+    if (should_stop_watching_thread)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "finish() was called while waiting for other hosts to finish");
+
+    if (throw_if_error)
+        process_list_element->checkTimeLimit();
+
+    for (const auto & [host, host_info] : state.hosts)
+    {
+        if ((host == current_host) || host_info.finished)
+            continue;
+
+        String host_status;
+        if (!host_info.started)
+            host_status = fmt::format(": the host hasn't started working on this {} yet", operation_name);
+        else if (!host_info.connected)
+            host_status = fmt::format(": the host is currently disconnected, last connection was at {}", host_info.last_connection_time);
+
+        if (!time_is_out)
+        {
+            String reason_text = reason.empty() ? "" : (" because " + reason);
+            LOG_TRACE(log, "Waiting for {} to finish{}{}", getHostDesc(host), reason_text, host_status);
+            return false;
+        }
+        else
+        {
+            String reason_text = reason.empty() ? 
"" : fmt::format(" (reason of waiting: {})", reason); + if (!throw_if_error) { - auto current_time = std::chrono::steady_clock::now(); - if ((current_time > end_of_timeout) - || !watch->tryWait(std::chrono::duration_cast(end_of_timeout - current_time).count())) - break; + LOG_INFO(log, "Waited longer than timeout {} for {} to finish{}{}", + *timeout, getHostDesc(host), host_status, reason_text); + return false; } else { - watch->wait(); + throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, + "Waited longer than timeout {} for {} to finish{}{}", + *timeout, getHostDesc(host), host_status, reason_text); } } } - /// Rethrow an error raised originally on another host. - if (state.error) - state.error->second.rethrow(); - - /// Another host terminated without errors. - if (state.disconnected_host) - throw Exception(ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, "No connection to host {}", *state.disconnected_host); - - /// Something's unready, timeout is probably not enough. - if (state.unready_host) - { - const auto & unready_host = *state.unready_host; - throw Exception( - ErrorCodes::FAILED_TO_SYNC_BACKUP_OR_RESTORE, - "Waited for host {} too long (> {}){}", - unready_host.host, - to_string(*timeout), - unready_host.started ? "" : ": Operation didn't start"); - } - - LOG_TRACE(log, "Everything is Ok. All hosts achieved stage {}", stage_to_wait); - return std::move(*state.results); + LOG_TRACE(log, "Other hosts finished working on this {}", operation_name); + return true; } } diff --git a/src/Backups/BackupCoordinationStageSync.h b/src/Backups/BackupCoordinationStageSync.h index a06c5c61041..32f660af997 100644 --- a/src/Backups/BackupCoordinationStageSync.h +++ b/src/Backups/BackupCoordinationStageSync.h @@ -10,33 +10,190 @@ class BackupCoordinationStageSync { public: BackupCoordinationStageSync( - const String & root_zookeeper_path_, - WithRetries & with_retries_, + bool is_restore_, /// true if this is a RESTORE ON CLUSTER command, false if this is a BACKUP ON CLUSTER command + const String & zookeeper_path_, /// path to the "stage" folder in ZooKeeper + const String & current_host_, /// the current host, or an empty string if it's the initiator of the BACKUP/RESTORE ON CLUSTER command + const Strings & all_hosts_, /// all the hosts (including the initiator and the current host) performing the BACKUP/RESTORE ON CLUSTER command + bool allow_concurrency_, /// whether it's allowed to have concurrent backups or restores. + const WithRetries & with_retries_, + ThreadPoolCallbackRunnerUnsafe schedule_, + QueryStatusPtr process_list_element_, LoggerPtr log_); + ~BackupCoordinationStageSync(); + /// Sets the stage of the current host and signal other hosts if there were other hosts waiting for that. - void set(const String & current_host, const String & new_stage, const String & message, const bool & all_hosts = false); - void setError(const String & current_host, const Exception & exception); + void setStage(const String & stage, const String & stage_result = {}); - /// Sets the stage of the current host and waits until all hosts come to the same stage. - /// The function returns the messages all hosts set when they come to the required stage. - Strings wait(const Strings & all_hosts, const String & stage_to_wait); + /// Waits until all the specified hosts come to the specified stage. + /// The function returns the results which specified hosts set when they came to the required stage. 
+    /// If it doesn't happen before the timeout then the function will stop waiting and throw an exception.
+    Strings waitForHostsToReachStage(const String & stage_to_wait, const Strings & hosts, std::optional<std::chrono::milliseconds> timeout = {}) const;
 
-    /// Almost the same as setAndWait() but this one stops waiting and throws an exception after a specific amount of time.
-    Strings waitFor(const Strings & all_hosts, const String & stage_to_wait, std::chrono::milliseconds timeout);
+    /// Waits until all the other hosts finish their work.
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    void waitForOtherHostsToFinish() const;
+
+    /// Lets other hosts know that the current host has finished its work.
+    void finish(bool & other_hosts_also_finished);
+
+    /// Lets other hosts know that the current host has encountered an error.
+    bool trySetError(std::exception_ptr exception) noexcept;
+
+    /// Waits until all the other hosts finish their work (as a part of the error-handling process).
+    /// Doesn't stop waiting if some host encounters an error or gets cancelled.
+    bool tryWaitForOtherHostsToFinishAfterError() const noexcept;
+
+    /// Lets other hosts know that the current host has finished its work (as a part of the error-handling process).
+    bool tryFinishAfterError(bool & other_hosts_also_finished) noexcept;
+
+    /// Returns a printable name of a specific host. For an empty host the function returns "initiator".
+    static String getHostDesc(const String & host);
+    static String getHostsDesc(const Strings & hosts);
 
private:
+    /// Initializes the state. It will then be updated with readCurrentState().
+    void initializeState();
+
+    /// Creates the root node in ZooKeeper.
    void createRootNodes();
 
-    struct State;
-    State readCurrentState(WithRetries::RetriesControlHolder & retries_control_holder, const Strings & zk_nodes, const Strings & all_hosts, const String & stage_to_wait) const;
+    /// Atomically creates both 'start' and 'alive' nodes and also checks that there is no concurrent backup or restore if `allow_concurrency` is false.
+    void createStartAndAliveNodes();
+    void createStartAndAliveNodes(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
 
-    Strings waitImpl(const Strings & all_hosts, const String & stage_to_wait, std::optional<std::chrono::milliseconds> timeout) const;
+    /// Deserializes the version of a node stored in the 'start' node.
+    int parseStartNode(const String & start_node_contents, const String & host) const;
 
-    String zookeeper_path;
-    /// A reference to the field of parent object - BackupCoordinationRemote or RestoreCoordinationRemote
-    WithRetries & with_retries;
-    LoggerPtr log;
+    /// Recreates the 'alive' node if it doesn't exist. It's an ephemeral node so it's removed automatically after disconnections.
+    void createAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// Checks that there is no concurrent backup or restore if `allow_concurrency` is false.
+    void checkConcurrency(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// The watching thread periodically reads the current state from ZooKeeper and recreates the 'alive' node.
+    void startWatchingThread();
+    void stopWatchingThread();
+    void watchingThread();
+
+    /// Reads the current state from ZooKeeper without throwing exceptions.
+    void readCurrentState(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+    String getStageNodePath(const String & stage) const;
+
+    /// Lets other hosts know that the current host has encountered an error.
+    bool trySetError(const Exception & exception);
+    void setError(const Exception & exception);
+
+    /// Deserializes an error stored in the error node.
+    static std::pair<std::exception_ptr, String> parseErrorNode(const String & error_node_contents);
+
+    /// Resets the `connected` flag for each host.
+    void resetConnectedFlag();
+
+    /// Checks if the current query is cancelled, and if so then the function sets the `cancelled` flag in the current state.
+    void checkIfQueryCancelled();
+
+    /// Checks if the current state contains an error, and if so then the function passes this error to the query status
+    /// to cancel the current BACKUP or RESTORE command.
+    void cancelQueryIfError();
+
+    /// Checks if some host was disconnected for too long, and if so then the function generates an error and passes it to the query status
+    /// to cancel the current BACKUP or RESTORE command.
+    void cancelQueryIfDisconnectedTooLong();
+
+    /// Used by waitForHostsToReachStage() to check if everything is ready to return.
+    bool checkIfHostsReachStage(const Strings & hosts, const String & stage_to_wait, bool time_is_out, std::optional<std::chrono::milliseconds> timeout, Strings & results) const TSA_REQUIRES(mutex);
+
+    /// Creates the 'finish' node.
+    bool tryFinishImpl();
+    bool tryFinishImpl(bool & other_hosts_also_finished, bool throw_if_error, WithRetries::Kind retries_kind);
+    void createFinishNodeAndRemoveAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// Waits until all the other hosts finish their work.
+    bool tryWaitForOtherHostsToFinishImpl(const String & reason, bool throw_if_error, std::optional<std::chrono::milliseconds> timeout) const;
+    bool checkIfOtherHostsFinish(const String & reason, bool throw_if_error, bool time_is_out, std::optional<std::chrono::milliseconds> timeout) const TSA_REQUIRES(mutex);
+
+    const bool is_restore;
+    const String operation_name;
+    const String current_host;
+    const String current_host_desc;
+    const Strings all_hosts;
+    const bool allow_concurrency;
+
+    /// A reference to a field of the parent object which is either BackupCoordinationOnCluster or RestoreCoordinationOnCluster.
+    const WithRetries & with_retries;
+
+    const ThreadPoolCallbackRunnerUnsafe<void> schedule;
+    const QueryStatusPtr process_list_element;
+    const LoggerPtr log;
+
+    const std::chrono::seconds failure_after_host_disconnected_for_seconds;
+    const std::chrono::seconds finish_timeout_after_error;
+    const std::chrono::milliseconds sync_period_ms;
+    const size_t max_attempts_after_bad_version;
+
+    /// Paths in ZooKeeper.
+    const std::filesystem::path zookeeper_path;
+    const String root_zookeeper_path;
+    const String operation_node_path;
+    const String operation_node_name;
+    const String stage_node_path;
+    const String start_node_path;
+    const String finish_node_path;
+    const String num_hosts_node_path;
+    const String alive_node_path;
+    const String alive_tracker_node_path;
+    const String error_node_path;
+
+    std::shared_ptr<Poco::Event> zk_nodes_changed;
+
+    /// We store the list of previously found ZooKeeper nodes to show better logging messages.
+    Strings zk_nodes;
+
+    /// Information about one host read from ZooKeeper.
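+    /// The `started`, `connected` and `finished` flags below reflect the per-host 'start', 'alive' and
+    /// 'finish' nodes in ZooKeeper (see createStartAndAliveNodes() and createFinishNodeAndRemoveAliveNode() above).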
+    struct HostInfo
+    {
+        String host;
+        bool started = false;
+        bool connected = false;
+        bool finished = false;
+        int version = 0;
+        std::map<String, String> stages = {}; /// std::map because we need to compare states
+        std::exception_ptr exception = nullptr;
+
+        std::chrono::time_point<std::chrono::system_clock> last_connection_time = {};
+        std::chrono::time_point<std::chrono::steady_clock> last_connection_time_monotonic = {};
+
+        bool operator ==(const HostInfo & other) const;
+        bool operator !=(const HostInfo & other) const;
+    };
+
+    /// Information about all the hosts participating in the current BACKUP or RESTORE operation.
+    struct State
+    {
+        std::map<String, HostInfo> hosts; /// std::map because we need to compare states
+        std::optional<String> host_with_error;
+        bool cancelled = false;
+
+        bool operator ==(const State & other) const;
+        bool operator !=(const State & other) const;
+    };
+
+    State state TSA_GUARDED_BY(mutex);
+    mutable std::condition_variable state_changed;
+
+    std::future<void> watching_thread_future;
+    std::atomic<bool> should_stop_watching_thread = false;
+
+    struct FinishResult
+    {
+        bool succeeded = false;
+        std::exception_ptr exception;
+        bool other_hosts_also_finished = false;
+    };
+    FinishResult finish_result TSA_GUARDED_BY(mutex);
+
+    mutable std::mutex mutex;
 };
 
 }
diff --git a/src/Backups/BackupEntriesCollector.cpp b/src/Backups/BackupEntriesCollector.cpp
index ae73630d41c..00a4471d994 100644
--- a/src/Backups/BackupEntriesCollector.cpp
+++ b/src/Backups/BackupEntriesCollector.cpp
@@ -102,7 +102,6 @@ BackupEntriesCollector::BackupEntriesCollector(
     , read_settings(read_settings_)
     , context(context_)
     , process_list_element(context->getProcessListElement())
-    , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
     , collect_metadata_timeout(context->getConfigRef().getUInt64(
          "backups.collect_metadata_timeout", context->getConfigRef().getUInt64("backups.consistent_metadata_snapshot_timeout", 600000)))
     , attempts_to_collect_metadata_before_sleep(context->getConfigRef().getUInt("backups.attempts_to_collect_metadata_before_sleep", 2))
@@ -176,21 +175,7 @@ Strings BackupEntriesCollector::setStage(const String & new_stage, const String
     checkIsQueryCancelled();
 
     current_stage = new_stage;
-    backup_coordination->setStage(new_stage, message);
-
-    if (new_stage == Stage::formatGatheringMetadata(0))
-    {
-        return backup_coordination->waitForStage(new_stage, on_cluster_first_sync_timeout);
-    }
-    if (new_stage.starts_with(Stage::GATHERING_METADATA))
-    {
-        auto current_time = std::chrono::steady_clock::now();
-        auto end_of_timeout = std::max(current_time, collect_metadata_end_time);
-        return backup_coordination->waitForStage(
-            new_stage, std::chrono::duration_cast<std::chrono::milliseconds>(end_of_timeout - current_time));
-    }
-
-    return backup_coordination->waitForStage(new_stage);
+    return backup_coordination->setStage(new_stage, message, /* sync = */ true);
 }
 
 void BackupEntriesCollector::checkIsQueryCancelled() const
diff --git a/src/Backups/BackupEntriesCollector.h b/src/Backups/BackupEntriesCollector.h
index ae076a84c8b..504489cce6b 100644
--- a/src/Backups/BackupEntriesCollector.h
+++ b/src/Backups/BackupEntriesCollector.h
@@ -111,10 +111,6 @@ private:
     ContextPtr context;
     QueryStatusPtr process_list_element;
 
-    /// The time a BACKUP ON CLUSTER or RESTORE ON CLUSTER command will wait until all the nodes receive the BACKUP (or RESTORE) query and start working.
-    /// This setting is similar to `distributed_ddl_task_timeout`.
- const std::chrono::milliseconds on_cluster_first_sync_timeout; - /// The time a BACKUP command will try to collect the metadata of tables & databases. const std::chrono::milliseconds collect_metadata_timeout; diff --git a/src/Backups/BackupIO.h b/src/Backups/BackupIO.h index ee2f38c785b..c9e0f25f9a0 100644 --- a/src/Backups/BackupIO.h +++ b/src/Backups/BackupIO.h @@ -5,6 +5,7 @@ namespace DB { + class IDisk; using DiskPtr = std::shared_ptr; class SeekableReadBuffer; @@ -63,9 +64,13 @@ public: virtual void copyFile(const String & destination, const String & source, size_t size) = 0; + /// Removes a file written to the backup, if it still exists. virtual void removeFile(const String & file_name) = 0; virtual void removeFiles(const Strings & file_names) = 0; + /// Removes the backup folder if it's empty or contains empty subfolders. + virtual void removeEmptyDirectories() = 0; + virtual const ReadSettings & getReadSettings() const = 0; virtual const WriteSettings & getWriteSettings() const = 0; virtual size_t getWriteBufferSize() const = 0; diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index c3b88f245ab..c90a030a1e7 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -81,6 +81,7 @@ public: void removeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; + void removeEmptyDirectories() override {} private: std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; diff --git a/src/Backups/BackupIO_Disk.cpp b/src/Backups/BackupIO_Disk.cpp index aeb07b154f5..794fb5be936 100644 --- a/src/Backups/BackupIO_Disk.cpp +++ b/src/Backups/BackupIO_Disk.cpp @@ -91,16 +91,36 @@ std::unique_ptr BackupWriterDisk::writeFile(const String & file_nam void BackupWriterDisk::removeFile(const String & file_name) { disk->removeFileIfExists(root_path / file_name); - if (disk->existsDirectory(root_path) && disk->isDirectoryEmpty(root_path)) - disk->removeDirectory(root_path); } void BackupWriterDisk::removeFiles(const Strings & file_names) { for (const auto & file_name : file_names) disk->removeFileIfExists(root_path / file_name); - if (disk->existsDirectory(root_path) && disk->isDirectoryEmpty(root_path)) - disk->removeDirectory(root_path); +} + +void BackupWriterDisk::removeEmptyDirectories() +{ + removeEmptyDirectoriesImpl(root_path); +} + +void BackupWriterDisk::removeEmptyDirectoriesImpl(const fs::path & current_dir) +{ + if (!disk->existsDirectory(current_dir)) + return; + + if (disk->isDirectoryEmpty(current_dir)) + { + disk->removeDirectory(current_dir); + return; + } + + /// Backups are not too deep, so recursion is good enough here. 
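+    /// The recursion is effectively a post-order traversal: subdirectories are removed first, so a
+    /// directory that contained only empty subdirectories becomes empty itself and is removed by the
+    /// final check below.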
+ for (auto it = disk->iterateDirectory(current_dir); it->isValid(); it->next()) + removeEmptyDirectoriesImpl(current_dir / it->name()); + + if (disk->isDirectoryEmpty(current_dir)) + disk->removeDirectory(current_dir); } void BackupWriterDisk::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, diff --git a/src/Backups/BackupIO_Disk.h b/src/Backups/BackupIO_Disk.h index 3d3253877bd..c77513935a9 100644 --- a/src/Backups/BackupIO_Disk.h +++ b/src/Backups/BackupIO_Disk.h @@ -50,9 +50,11 @@ public: void removeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; + void removeEmptyDirectories() override; private: std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; + void removeEmptyDirectoriesImpl(const std::filesystem::path & current_dir); const DiskPtr disk; const std::filesystem::path root_path; diff --git a/src/Backups/BackupIO_File.cpp b/src/Backups/BackupIO_File.cpp index 681513bf7ce..80f084d241c 100644 --- a/src/Backups/BackupIO_File.cpp +++ b/src/Backups/BackupIO_File.cpp @@ -106,16 +106,36 @@ std::unique_ptr BackupWriterFile::writeFile(const String & file_nam void BackupWriterFile::removeFile(const String & file_name) { (void)fs::remove(root_path / file_name); - if (fs::is_directory(root_path) && fs::is_empty(root_path)) - (void)fs::remove(root_path); } void BackupWriterFile::removeFiles(const Strings & file_names) { for (const auto & file_name : file_names) (void)fs::remove(root_path / file_name); - if (fs::is_directory(root_path) && fs::is_empty(root_path)) - (void)fs::remove(root_path); +} + +void BackupWriterFile::removeEmptyDirectories() +{ + removeEmptyDirectoriesImpl(root_path); +} + +void BackupWriterFile::removeEmptyDirectoriesImpl(const fs::path & current_dir) +{ + if (!fs::is_directory(current_dir)) + return; + + if (fs::is_empty(current_dir)) + { + (void)fs::remove(current_dir); + return; + } + + /// Backups are not too deep, so recursion is good enough here. 
+ for (const auto & it : std::filesystem::directory_iterator{current_dir}) + removeEmptyDirectoriesImpl(it.path()); + + if (fs::is_empty(current_dir)) + (void)fs::remove(current_dir); } void BackupWriterFile::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, diff --git a/src/Backups/BackupIO_File.h b/src/Backups/BackupIO_File.h index ebe9a0f02cb..a2169ac7b4b 100644 --- a/src/Backups/BackupIO_File.h +++ b/src/Backups/BackupIO_File.h @@ -42,9 +42,11 @@ public: void removeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; + void removeEmptyDirectories() override; private: std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; + void removeEmptyDirectoriesImpl(const std::filesystem::path & current_dir); const std::filesystem::path root_path; const DataSourceDescription data_source_description; diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index a04f1c915b9..4ccf477b369 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -74,6 +74,7 @@ public: void removeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; + void removeEmptyDirectories() override {} private: std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index b95a2e10b4d..af3fa5531b8 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -147,11 +147,11 @@ BackupImpl::BackupImpl( BackupImpl::~BackupImpl() { - if ((open_mode == OpenMode::WRITE) && !is_internal_backup && !writing_finalized && !std::uncaught_exceptions() && !std::current_exception()) + if ((open_mode == OpenMode::WRITE) && !writing_finalized && !corrupted) { /// It is suspicious to destroy BackupImpl without finalization while writing a backup when there is no exception. - LOG_ERROR(log, "BackupImpl is not finalized when destructor is called. Stack trace: {}", StackTrace().toString()); - chassert(false && "BackupImpl is not finalized when destructor is called."); + LOG_ERROR(log, "BackupImpl is not finalized or marked as corrupted when destructor is called. Stack trace: {}", StackTrace().toString()); + chassert(false, "BackupImpl is not finalized or marked as corrupted when destructor is called."); } try @@ -196,9 +196,6 @@ void BackupImpl::open() if (open_mode == OpenMode::READ) readBackupMetadata(); - - if ((open_mode == OpenMode::WRITE) && base_backup_info) - base_backup_uuid = getBaseBackupUnlocked()->getUUID(); } void BackupImpl::close() @@ -280,6 +277,8 @@ std::shared_ptr BackupImpl::getBaseBackupUnlocked() const toString(base_backup->getUUID()), (base_backup_uuid ? toString(*base_backup_uuid) : "")); } + + base_backup_uuid = base_backup->getUUID(); } return base_backup; } @@ -369,7 +368,7 @@ void BackupImpl::writeBackupMetadata() if (base_backup_in_use) { *out << "" << xml << base_backup_info->toString() << ""; - *out << "" << toString(*base_backup_uuid) << ""; + *out << "" << getBaseBackupUnlocked()->getUUID() << ""; } } @@ -594,9 +593,6 @@ bool BackupImpl::checkLockFile(bool throw_if_failed) const void BackupImpl::removeLockFile() { - if (is_internal_backup) - return; /// Internal backup must not remove the lock file (it's still used by the initiator). 
- if (checkLockFile(false)) writer->removeFile(lock_file_name); } @@ -989,8 +985,11 @@ void BackupImpl::finalizeWriting() if (open_mode != OpenMode::WRITE) throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is not opened for writing"); + if (corrupted) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup can't be finalized after an error happened"); + if (writing_finalized) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is already finalized"); + return; if (!is_internal_backup) { @@ -1015,20 +1014,58 @@ void BackupImpl::setCompressedSize() } -void BackupImpl::tryRemoveAllFiles() +bool BackupImpl::setIsCorrupted() noexcept { - if (open_mode != OpenMode::WRITE) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Backup is not opened for writing"); - - if (is_internal_backup) - return; - try { - LOG_INFO(log, "Removing all files of backup {}", backup_name_for_logging); + std::lock_guard lock{mutex}; + if (open_mode != OpenMode::WRITE) + { + LOG_ERROR(log, "Backup is not opened for writing. Stack trace: {}", StackTrace().toString()); + chassert(false, "Backup is not opened for writing when setIsCorrupted() is called"); + return false; + } + + if (writing_finalized) + { + LOG_WARNING(log, "An error happened after the backup was completed successfully, the backup must be correct!"); + return false; + } + + if (corrupted) + return true; + + LOG_WARNING(log, "An error happened, the backup won't be completed"); + closeArchive(/* finalize= */ false); + corrupted = true; + return true; + } + catch (...) + { + DB::tryLogCurrentException(log, "Caught exception while setting that the backup was corrupted"); + return false; + } +} + + +bool BackupImpl::tryRemoveAllFiles() noexcept +{ + try + { + std::lock_guard lock{mutex}; + if (!corrupted) + { + LOG_ERROR(log, "Backup is not set as corrupted. Stack trace: {}", StackTrace().toString()); + chassert(false, "Backup is not set as corrupted when tryRemoveAllFiles() is called"); + return false; + } + + LOG_INFO(log, "Removing all files of backup {}", backup_name_for_logging); + Strings files_to_remove; + if (use_archive) { files_to_remove.push_back(archive_params.archive_name); @@ -1041,14 +1078,17 @@ void BackupImpl::tryRemoveAllFiles() } if (!checkLockFile(false)) - return; + return false; writer->removeFiles(files_to_remove); removeLockFile(); + writer->removeEmptyDirectories(); + return true; } catch (...) 
{ - DB::tryLogCurrentException(__PRETTY_FUNCTION__); + DB::tryLogCurrentException(log, "Caught exception while removing files of a corrupted backup"); + return false; } } diff --git a/src/Backups/BackupImpl.h b/src/Backups/BackupImpl.h index d7846104c4c..4b0f9f879ec 100644 --- a/src/Backups/BackupImpl.h +++ b/src/Backups/BackupImpl.h @@ -86,7 +86,8 @@ public: void writeFile(const BackupFileInfo & info, BackupEntryPtr entry) override; bool supportsWritingInMultipleThreads() const override { return !use_archive; } void finalizeWriting() override; - void tryRemoveAllFiles() override; + bool setIsCorrupted() noexcept override; + bool tryRemoveAllFiles() noexcept override; private: void open(); @@ -146,13 +147,14 @@ private: int version; mutable std::optional base_backup_info; mutable std::shared_ptr base_backup; - std::optional base_backup_uuid; + mutable std::optional base_backup_uuid; std::shared_ptr archive_reader; std::shared_ptr archive_writer; String lock_file_name; std::atomic lock_file_before_first_file_checked = false; bool writing_finalized = false; + bool corrupted = false; bool deduplicate_files = true; bool use_same_s3_credentials_for_base_backup = false; bool use_same_password_for_base_backup = false; diff --git a/src/Backups/BackupKeeperSettings.cpp b/src/Backups/BackupKeeperSettings.cpp new file mode 100644 index 00000000000..180633cea1f --- /dev/null +++ b/src/Backups/BackupKeeperSettings.cpp @@ -0,0 +1,58 @@ +#include + +#include +#include +#include + + +namespace DB +{ + +namespace Setting +{ + extern const SettingsUInt64 backup_restore_keeper_max_retries; + extern const SettingsUInt64 backup_restore_keeper_retry_initial_backoff_ms; + extern const SettingsUInt64 backup_restore_keeper_retry_max_backoff_ms; + extern const SettingsUInt64 backup_restore_failure_after_host_disconnected_for_seconds; + extern const SettingsUInt64 backup_restore_keeper_max_retries_while_initializing; + extern const SettingsUInt64 backup_restore_keeper_max_retries_while_handling_error; + extern const SettingsUInt64 backup_restore_finish_timeout_after_error_sec; + extern const SettingsUInt64 backup_restore_keeper_value_max_size; + extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multi; + extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multiread; + extern const SettingsFloat backup_restore_keeper_fault_injection_probability; + extern const SettingsUInt64 backup_restore_keeper_fault_injection_seed; +} + +BackupKeeperSettings BackupKeeperSettings::fromContext(const ContextPtr & context) +{ + BackupKeeperSettings keeper_settings; + + const auto & settings = context->getSettingsRef(); + const auto & config = context->getConfigRef(); + + keeper_settings.max_retries = settings[Setting::backup_restore_keeper_max_retries]; + keeper_settings.retry_initial_backoff_ms = std::chrono::milliseconds{settings[Setting::backup_restore_keeper_retry_initial_backoff_ms]}; + keeper_settings.retry_max_backoff_ms = std::chrono::milliseconds{settings[Setting::backup_restore_keeper_retry_max_backoff_ms]}; + + keeper_settings.failure_after_host_disconnected_for_seconds = std::chrono::seconds{settings[Setting::backup_restore_failure_after_host_disconnected_for_seconds]}; + keeper_settings.max_retries_while_initializing = settings[Setting::backup_restore_keeper_max_retries_while_initializing]; + keeper_settings.max_retries_while_handling_error = settings[Setting::backup_restore_keeper_max_retries_while_handling_error]; + keeper_settings.finish_timeout_after_error = 
std::chrono::seconds(settings[Setting::backup_restore_finish_timeout_after_error_sec]);
+
+    if (config.has("backups.sync_period_ms"))
+        keeper_settings.sync_period_ms = std::chrono::milliseconds{config.getUInt64("backups.sync_period_ms")};
+
+    if (config.has("backups.max_attempts_after_bad_version"))
+        keeper_settings.max_attempts_after_bad_version = config.getUInt64("backups.max_attempts_after_bad_version");
+
+    keeper_settings.value_max_size = settings[Setting::backup_restore_keeper_value_max_size];
+    keeper_settings.batch_size_for_multi = settings[Setting::backup_restore_batch_size_for_keeper_multi];
+    keeper_settings.batch_size_for_multiread = settings[Setting::backup_restore_batch_size_for_keeper_multiread];
+    keeper_settings.fault_injection_probability = settings[Setting::backup_restore_keeper_fault_injection_probability];
+    keeper_settings.fault_injection_seed = settings[Setting::backup_restore_keeper_fault_injection_seed];
+
+    return keeper_settings;
+}
+
+}
diff --git a/src/Backups/BackupKeeperSettings.h b/src/Backups/BackupKeeperSettings.h
new file mode 100644
index 00000000000..6c4b2187094
--- /dev/null
+++ b/src/Backups/BackupKeeperSettings.h
@@ -0,0 +1,64 @@
+#pragma once
+
+#include 
+
+
+namespace DB
+{
+
+/// Settings for [Zoo]Keeper-related work during BACKUP or RESTORE.
+struct BackupKeeperSettings
+{
+    /// Maximum number of retries in the middle of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+    /// Should be big enough so the whole operation won't be cancelled in the middle of it because of a temporary ZooKeeper failure.
+    UInt64 max_retries{1000};
+
+    /// Initial backoff timeout for ZooKeeper operations during backup or restore.
+    std::chrono::milliseconds retry_initial_backoff_ms{100};
+
+    /// Max backoff timeout for ZooKeeper operations during backup or restore.
+    std::chrono::milliseconds retry_max_backoff_ms{5000};
+
+    /// If a host during BACKUP ON CLUSTER or RESTORE ON CLUSTER doesn't recreate its 'alive' node in ZooKeeper
+    /// for this amount of time then the whole backup or restore is considered failed.
+    /// Should be bigger than any reasonable time for a host to reconnect to ZooKeeper after a failure.
+    /// Set to zero to disable (if it's zero and some host crashed then BACKUP ON CLUSTER or RESTORE ON CLUSTER will be waiting
+    /// for the crashed host forever until the operation is explicitly cancelled with KILL QUERY).
+    std::chrono::seconds failure_after_host_disconnected_for_seconds{3600};
+
+    /// Maximum number of retries during the initialization of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+    /// Shouldn't be too big because if the operation is going to fail then it's better if it fails faster.
+    UInt64 max_retries_while_initializing{20};
+
+    /// Maximum number of retries while handling an error of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation.
+    /// Shouldn't be too big because those retries are just for cleanup after the operation has failed already.
+    UInt64 max_retries_while_handling_error{20};
+
+    /// How long the initiator should wait for other hosts to handle the 'error' node and finish their work.
+    std::chrono::seconds finish_timeout_after_error{180};
+
+    /// How often the "stage" folder in ZooKeeper must be scanned in a background thread to track changes done by other hosts.
+    std::chrono::milliseconds sync_period_ms{5000};
+
+    /// Number of attempts after getting error ZBADVERSION from ZooKeeper.
+    size_t max_attempts_after_bad_version{10};
+
+    /// Maximum size of data of a ZooKeeper's node during backup.
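+    /// The default of 1048576 bytes (1 MiB) roughly matches ZooKeeper's own default limit on the size
+    /// of a single znode (the `jute.maxbuffer` setting on the ZooKeeper side).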
+ UInt64 value_max_size{1048576}; + + /// Maximum size of a batch for a multi request. + UInt64 batch_size_for_multi{1000}; + + /// Maximum size of a batch for a multiread request. + UInt64 batch_size_for_multiread{10000}; + + /// Approximate probability of failure for a keeper request during backup or restore. Valid value is in interval [0.0f, 1.0f]. + Float64 fault_injection_probability{0}; + + /// Seed for `fault_injection_probability`: 0 - random seed, otherwise the setting value. + UInt64 fault_injection_seed{0}; + + static BackupKeeperSettings fromContext(const ContextPtr & context); +}; + +} diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index 9b8117c6587..915989735c3 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -74,6 +74,17 @@ BackupSettings BackupSettings::fromBackupQuery(const ASTBackupQuery & query) return res; } +bool BackupSettings::isAsync(const ASTBackupQuery & query) +{ + if (query.settings) + { + const auto * field = query.settings->as().changes.tryGet("async"); + if (field) + return field->safeGet(); + } + return false; /// `async` is false by default. +} + void BackupSettings::copySettingsToQuery(ASTBackupQuery & query) const { auto query_settings = std::make_shared(); diff --git a/src/Backups/BackupSettings.h b/src/Backups/BackupSettings.h index 8c2ea21df01..fa1e5025935 100644 --- a/src/Backups/BackupSettings.h +++ b/src/Backups/BackupSettings.h @@ -101,6 +101,8 @@ struct BackupSettings static BackupSettings fromBackupQuery(const ASTBackupQuery & query); void copySettingsToQuery(ASTBackupQuery & query) const; + static bool isAsync(const ASTBackupQuery & query); + struct Util { static std::vector clusterHostIDsFromAST(const IAST & ast); diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index d3889295598..8480dc5d64d 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -1,4 +1,6 @@ #include + +#include #include #include #include @@ -6,9 +8,9 @@ #include #include #include -#include +#include #include -#include +#include #include #include #include @@ -43,21 +45,11 @@ namespace CurrentMetrics namespace DB { -namespace Setting -{ - extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multiread; - extern const SettingsUInt64 backup_restore_keeper_max_retries; - extern const SettingsUInt64 backup_restore_keeper_retry_initial_backoff_ms; - extern const SettingsUInt64 backup_restore_keeper_retry_max_backoff_ms; - extern const SettingsUInt64 backup_restore_keeper_fault_injection_seed; - extern const SettingsFloat backup_restore_keeper_fault_injection_probability; -} namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; - extern const int CONCURRENT_ACCESS_NOT_SUPPORTED; extern const int QUERY_WAS_CANCELLED; } @@ -66,102 +58,6 @@ namespace Stage = BackupCoordinationStage; namespace { - std::shared_ptr makeBackupCoordination(const ContextPtr & context, const BackupSettings & backup_settings, bool remote) - { - if (remote) - { - String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); - - auto get_zookeeper = [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); }; - - BackupCoordinationRemote::BackupKeeperSettings keeper_settings = WithRetries::KeeperSettings::fromContext(context); - - auto all_hosts = BackupSettings::Util::filterHostIDs( - backup_settings.cluster_host_ids, backup_settings.shard_num, backup_settings.replica_num); - - 
return std::make_shared( - get_zookeeper, - root_zk_path, - keeper_settings, - toString(*backup_settings.backup_uuid), - all_hosts, - backup_settings.host_id, - !backup_settings.deduplicate_files, - backup_settings.internal, - context->getProcessListElement()); - } - - return std::make_shared(!backup_settings.deduplicate_files); - } - - std::shared_ptr - makeRestoreCoordination(const ContextPtr & context, const RestoreSettings & restore_settings, bool remote) - { - if (remote) - { - String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); - - auto get_zookeeper = [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); }; - - RestoreCoordinationRemote::RestoreKeeperSettings keeper_settings - { - .keeper_max_retries = context->getSettingsRef()[Setting::backup_restore_keeper_max_retries], - .keeper_retry_initial_backoff_ms = context->getSettingsRef()[Setting::backup_restore_keeper_retry_initial_backoff_ms], - .keeper_retry_max_backoff_ms = context->getSettingsRef()[Setting::backup_restore_keeper_retry_max_backoff_ms], - .batch_size_for_keeper_multiread = context->getSettingsRef()[Setting::backup_restore_batch_size_for_keeper_multiread], - .keeper_fault_injection_probability = context->getSettingsRef()[Setting::backup_restore_keeper_fault_injection_probability], - .keeper_fault_injection_seed = context->getSettingsRef()[Setting::backup_restore_keeper_fault_injection_seed] - }; - - auto all_hosts = BackupSettings::Util::filterHostIDs( - restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num); - - return std::make_shared( - get_zookeeper, - root_zk_path, - keeper_settings, - toString(*restore_settings.restore_uuid), - all_hosts, - restore_settings.host_id, - restore_settings.internal, - context->getProcessListElement()); - } - - return std::make_shared(); - } - - /// Sends information about an exception to IBackupCoordination or IRestoreCoordination. - template - void sendExceptionToCoordination(std::shared_ptr coordination, const Exception & exception) - { - try - { - if (coordination) - coordination->setError(exception); - } - catch (...) // NOLINT(bugprone-empty-catch) - { - } - } - - /// Sends information about the current exception to IBackupCoordination or IRestoreCoordination. - template - void sendCurrentExceptionToCoordination(std::shared_ptr coordination) - { - try - { - throw; - } - catch (const Exception & e) - { - sendExceptionToCoordination(coordination, e); - } - catch (...) - { - sendExceptionToCoordination(coordination, Exception(getCurrentExceptionMessageAndPattern(true, true), getCurrentExceptionCode())); - } - } - bool isFinishedSuccessfully(BackupStatus status) { return (status == BackupStatus::BACKUP_CREATED) || (status == BackupStatus::RESTORED); @@ -262,24 +158,27 @@ namespace /// while the thread pool is still occupied with the waiting task then a scheduled task can be never executed). enum class BackupsWorker::ThreadPoolId : uint8_t { - /// "BACKUP ON CLUSTER ASYNC" waits in background while "BACKUP ASYNC" is finished on the nodes of the cluster, then finalizes the backup. - BACKUP_ASYNC_ON_CLUSTER = 0, + /// Making a list of files to copy or copying those files. + BACKUP, - /// "BACKUP ASYNC" waits in background while all file infos are built and then it copies the backup's files. - BACKUP_ASYNC = 1, + /// Creating of tables and databases during RESTORE and filling them with data. 
+    RESTORE,
 
-    /// Making a list of files to copy and copying of those files is always sequential, so those operations can share one thread pool.
-    BACKUP_MAKE_FILES_LIST = 2,
-    BACKUP_COPY_FILES = BACKUP_MAKE_FILES_LIST,
+    /// We need background threads for ASYNC backups and restores.
+    ASYNC_BACKGROUND_BACKUP,
+    ASYNC_BACKGROUND_RESTORE,
 
-    /// "RESTORE ON CLUSTER ASYNC" waits in background while "BACKUP ASYNC" is finished on the nodes of the cluster, then finalizes the backup.
-    RESTORE_ASYNC_ON_CLUSTER = 3,
+    /// We need background threads for coordination workers (see BackupCoordinationStageSync).
+    ON_CLUSTER_COORDINATION_BACKUP,
+    ON_CLUSTER_COORDINATION_RESTORE,
 
-    /// "RESTORE ASYNC" waits in background while the data of all tables are restored.
-    RESTORE_ASYNC = 4,
-
-    /// Restores from backups.
-    RESTORE = 5,
+    /// We need separate threads for internal backups and restores.
+    /// An internal backup is a helper backup invoked on some shard and replica by a BACKUP ON CLUSTER command
+    /// (see BackupSettings.internal); the same applies to restores.
+    ASYNC_BACKGROUND_INTERNAL_BACKUP,
+    ASYNC_BACKGROUND_INTERNAL_RESTORE,
+    ON_CLUSTER_COORDINATION_INTERNAL_BACKUP,
+    ON_CLUSTER_COORDINATION_INTERNAL_RESTORE,
 };
 
 
@@ -312,22 +211,26 @@ public:
 
         switch (thread_pool_id)
         {
-            case ThreadPoolId::BACKUP_ASYNC:
-            case ThreadPoolId::BACKUP_ASYNC_ON_CLUSTER:
-            case ThreadPoolId::BACKUP_COPY_FILES:
+            case ThreadPoolId::BACKUP:
+            case ThreadPoolId::ASYNC_BACKGROUND_BACKUP:
+            case ThreadPoolId::ON_CLUSTER_COORDINATION_BACKUP:
+            case ThreadPoolId::ASYNC_BACKGROUND_INTERNAL_BACKUP:
+            case ThreadPoolId::ON_CLUSTER_COORDINATION_INTERNAL_BACKUP:
             {
                 metric_threads = CurrentMetrics::BackupsThreads;
                 metric_active_threads = CurrentMetrics::BackupsThreadsActive;
                 metric_scheduled_threads = CurrentMetrics::BackupsThreadsScheduled;
                 max_threads = num_backup_threads;
                 /// We don't use thread pool queues for thread pools with a lot of tasks otherwise that queue could be memory-wasting.
-                use_queue = (thread_pool_id != ThreadPoolId::BACKUP_COPY_FILES);
+                use_queue = (thread_pool_id != ThreadPoolId::BACKUP);
                 break;
             }
 
-            case ThreadPoolId::RESTORE_ASYNC:
-            case ThreadPoolId::RESTORE_ASYNC_ON_CLUSTER:
             case ThreadPoolId::RESTORE:
+            case ThreadPoolId::ASYNC_BACKGROUND_RESTORE:
+            case ThreadPoolId::ON_CLUSTER_COORDINATION_RESTORE:
+            case ThreadPoolId::ASYNC_BACKGROUND_INTERNAL_RESTORE:
+            case ThreadPoolId::ON_CLUSTER_COORDINATION_INTERNAL_RESTORE:
             {
                 metric_threads = CurrentMetrics::RestoreThreads;
                 metric_active_threads = CurrentMetrics::RestoreThreadsActive;
@@ -352,12 +255,20 @@ public:
     void wait()
     {
         auto wait_sequence = {
-            ThreadPoolId::RESTORE_ASYNC_ON_CLUSTER,
-            ThreadPoolId::RESTORE_ASYNC,
+            /// ASYNC_BACKGROUND_BACKUP must be before ASYNC_BACKGROUND_INTERNAL_BACKUP,
+            /// ASYNC_BACKGROUND_RESTORE must be before ASYNC_BACKGROUND_INTERNAL_RESTORE,
+            /// and everything else is after those ones.
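+            /// (a task running on a background pool may still schedule work onto the internal and
+            /// coordination pools, so those pools can only be drained after the background pools are done).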
+ ThreadPoolId::ASYNC_BACKGROUND_BACKUP, + ThreadPoolId::ASYNC_BACKGROUND_RESTORE, + ThreadPoolId::ASYNC_BACKGROUND_INTERNAL_BACKUP, + ThreadPoolId::ASYNC_BACKGROUND_INTERNAL_RESTORE, + /// Others: + ThreadPoolId::BACKUP, ThreadPoolId::RESTORE, - ThreadPoolId::BACKUP_ASYNC_ON_CLUSTER, - ThreadPoolId::BACKUP_ASYNC, - ThreadPoolId::BACKUP_COPY_FILES, + ThreadPoolId::ON_CLUSTER_COORDINATION_BACKUP, + ThreadPoolId::ON_CLUSTER_COORDINATION_INTERNAL_BACKUP, + ThreadPoolId::ON_CLUSTER_COORDINATION_RESTORE, + ThreadPoolId::ON_CLUSTER_COORDINATION_INTERNAL_RESTORE, }; for (auto thread_pool_id : wait_sequence) @@ -392,6 +303,7 @@ BackupsWorker::BackupsWorker(ContextMutablePtr global_context, size_t num_backup , log(getLogger("BackupsWorker")) , backup_log(global_context->getBackupLog()) , process_list(global_context->getProcessList()) + , concurrency_counters(std::make_unique()) { } @@ -405,7 +317,7 @@ ThreadPool & BackupsWorker::getThreadPool(ThreadPoolId thread_pool_id) } -OperationID BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context) +std::pair BackupsWorker::start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context) { const ASTBackupQuery & backup_query = typeid_cast(*backup_or_restore_query); if (backup_query.kind == ASTBackupQuery::Kind::BACKUP) @@ -414,180 +326,147 @@ OperationID BackupsWorker::start(const ASTPtr & backup_or_restore_query, Context } -OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & context) +struct BackupsWorker::BackupStarter { - auto backup_query = std::static_pointer_cast(query->clone()); - auto backup_settings = BackupSettings::fromBackupQuery(*backup_query); - - auto backup_info = BackupInfo::fromAST(*backup_query->backup_name); - String backup_name_for_logging = backup_info.toStringForLogging(); - - if (!backup_settings.backup_uuid) - backup_settings.backup_uuid = UUIDHelpers::generateV4(); - - /// `backup_id` will be used as a key to the `infos` map, so it should be unique. - OperationID backup_id; - if (backup_settings.internal) - backup_id = "internal-" + toString(UUIDHelpers::generateV4()); /// Always generate `backup_id` for internal backup to avoid collision if both internal and non-internal backups are on the same host - else if (!backup_settings.id.empty()) - backup_id = backup_settings.id; - else - backup_id = toString(*backup_settings.backup_uuid); - + BackupsWorker & backups_worker; + std::shared_ptr backup_query; + ContextPtr query_context; /// We have to keep `query_context` until the end of the operation because a pointer to it is stored inside the ThreadGroup we're using. + ContextMutablePtr backup_context; + BackupSettings backup_settings; + BackupInfo backup_info; + String backup_id; + String backup_name_for_logging; + bool on_cluster; + bool is_internal_backup; std::shared_ptr backup_coordination; + ClusterPtr cluster; BackupMutablePtr backup; + std::shared_ptr process_list_element_holder; - /// Called in exception handlers below. This lambda function can be called on a separate thread, so it can't capture local variables by reference. 
- auto on_exception = [this](BackupMutablePtr & backup_, const OperationID & backup_id_, const String & backup_name_for_logging_, - const BackupSettings & backup_settings_, const std::shared_ptr & backup_coordination_) + BackupStarter(BackupsWorker & backups_worker_, const ASTPtr & query_, const ContextPtr & context_) + : backups_worker(backups_worker_) + , backup_query(std::static_pointer_cast(query_->clone())) + , query_context(context_) + , backup_context(Context::createCopy(query_context)) { - /// Something bad happened, the backup has not built. - tryLogCurrentException(log, fmt::format("Failed to make {} {}", (backup_settings_.internal ? "internal backup" : "backup"), backup_name_for_logging_)); - setStatusSafe(backup_id_, getBackupStatusFromCurrentException()); - sendCurrentExceptionToCoordination(backup_coordination_); + backup_context->makeQueryContext(); + backup_settings = BackupSettings::fromBackupQuery(*backup_query); + backup_info = BackupInfo::fromAST(*backup_query->backup_name); + backup_name_for_logging = backup_info.toStringForLogging(); + is_internal_backup = backup_settings.internal; + on_cluster = !backup_query->cluster.empty() || is_internal_backup; - if (backup_ && remove_backup_files_after_failure) - backup_->tryRemoveAllFiles(); - backup_.reset(); - }; + if (!backup_settings.backup_uuid) + backup_settings.backup_uuid = UUIDHelpers::generateV4(); + + /// `backup_id` will be used as a key to the `infos` map, so it should be unique. + if (is_internal_backup) + backup_id = "internal-" + toString(UUIDHelpers::generateV4()); /// Always generate `backup_id` for internal backup to avoid collision if both internal and non-internal backups are on the same host + else if (!backup_settings.id.empty()) + backup_id = backup_settings.id; + else + backup_id = toString(*backup_settings.backup_uuid); - try - { String base_backup_name; if (backup_settings.base_backup_info) base_backup_name = backup_settings.base_backup_info->toStringForLogging(); - addInfo(backup_id, + /// process_list_element_holder is used to make an element in ProcessList live while BACKUP is working asynchronously. + auto process_list_element = backup_context->getProcessListElement(); + if (process_list_element) + process_list_element_holder = process_list_element->getProcessListEntry(); + + backups_worker.addInfo(backup_id, backup_name_for_logging, base_backup_name, - context->getCurrentQueryId(), - backup_settings.internal, - context->getProcessListElement(), + backup_context->getCurrentQueryId(), + is_internal_backup, + process_list_element, BackupStatus::CREATING_BACKUP); + } - if (backup_settings.internal) + void doBackup() + { + chassert(!backup_coordination); + if (on_cluster && !is_internal_backup) { - /// The following call of makeBackupCoordination() is not essential because doBackup() will later create a backup coordination - /// if it's not created here. However to handle errors better it's better to make a coordination here because this way - /// if an exception will be thrown in startMakingBackup() other hosts will know about that. 
- backup_coordination = makeBackupCoordination(context, backup_settings, /* remote= */ true); + backup_query->cluster = backup_context->getMacros()->expand(backup_query->cluster); + cluster = backup_context->getCluster(backup_query->cluster); + backup_settings.cluster_host_ids = cluster->getHostIDs(); + } + backup_coordination = backups_worker.makeBackupCoordination(on_cluster, backup_settings, backup_context); + + chassert(!backup); + backup = backups_worker.openBackupForWriting(backup_info, backup_settings, backup_coordination, backup_context); + + backups_worker.doBackup( + backup, backup_query, backup_id, backup_name_for_logging, backup_settings, backup_coordination, backup_context, + on_cluster, cluster); + } + + void onException() + { + /// Something bad happened, the backup has not built. + tryLogCurrentException(backups_worker.log, fmt::format("Failed to make {} {}", + (is_internal_backup ? "internal backup" : "backup"), + backup_name_for_logging)); + + bool should_remove_files_in_backup = backup && !is_internal_backup && backups_worker.remove_backup_files_after_failure; + + if (backup && !backup->setIsCorrupted()) + should_remove_files_in_backup = false; + + if (backup_coordination && backup_coordination->trySetError(std::current_exception())) + { + bool other_hosts_finished = backup_coordination->tryWaitForOtherHostsToFinishAfterError(); + + if (should_remove_files_in_backup && other_hosts_finished) + backup->tryRemoveAllFiles(); + + backup_coordination->tryFinishAfterError(); } - /// Prepare context to use. - ContextPtr context_in_use = context; - ContextMutablePtr mutable_context; - bool on_cluster = !backup_query->cluster.empty(); - if (on_cluster || backup_settings.async) - { - /// We have to clone the query context here because: - /// if this is an "ON CLUSTER" query we need to change some settings, and - /// if this is an "ASYNC" query it's going to be executed in another thread. - context_in_use = mutable_context = Context::createCopy(context); - mutable_context->makeQueryContext(); - } + backups_worker.setStatusSafe(backup_id, getBackupStatusFromCurrentException()); + } +}; - if (backup_settings.async) - { - auto & thread_pool = getThreadPool(on_cluster ? ThreadPoolId::BACKUP_ASYNC_ON_CLUSTER : ThreadPoolId::BACKUP_ASYNC); - /// process_list_element_holder is used to make an element in ProcessList live while BACKUP is working asynchronously. - auto process_list_element = context_in_use->getProcessListElement(); +std::pair BackupsWorker::startMakingBackup(const ASTPtr & query, const ContextPtr & context) +{ + auto starter = std::make_shared(*this, query, context); - thread_pool.scheduleOrThrowOnError( - [this, - backup_query, - backup_id, - backup_name_for_logging, - backup_info, - backup_settings, - backup_coordination, - context_in_use, - mutable_context, - on_exception, - process_list_element_holder = process_list_element ? process_list_element->getProcessListEntry() : nullptr] + try + { + auto thread_pool_id = starter->is_internal_backup ? ThreadPoolId::ASYNC_BACKGROUND_INTERNAL_BACKUP: ThreadPoolId::ASYNC_BACKGROUND_BACKUP; + String thread_name = starter->is_internal_backup ? 
"BackupAsyncInt" : "BackupAsync"; + auto schedule = threadPoolCallbackRunnerUnsafe(thread_pools->getThreadPool(thread_pool_id), thread_name); + + schedule([starter] + { + try { - BackupMutablePtr backup_async; - try - { - setThreadName("BackupWorker"); - CurrentThread::QueryScope query_scope(context_in_use); - doBackup( - backup_async, - backup_query, - backup_id, - backup_name_for_logging, - backup_info, - backup_settings, - backup_coordination, - context_in_use, - mutable_context); - } - catch (...) - { - on_exception(backup_async, backup_id, backup_name_for_logging, backup_settings, backup_coordination); - } - }); - } - else - { - doBackup( - backup, - backup_query, - backup_id, - backup_name_for_logging, - backup_info, - backup_settings, - backup_coordination, - context_in_use, - mutable_context); - } + starter->doBackup(); + } + catch (...) + { + starter->onException(); + } + }, + Priority{}); - return backup_id; + return {starter->backup_id, BackupStatus::CREATING_BACKUP}; } catch (...) { - on_exception(backup, backup_id, backup_name_for_logging, backup_settings, backup_coordination); + starter->onException(); throw; } } -void BackupsWorker::doBackup( - BackupMutablePtr & backup, - const std::shared_ptr & backup_query, - const OperationID & backup_id, - const String & backup_name_for_logging, - const BackupInfo & backup_info, - BackupSettings backup_settings, - std::shared_ptr backup_coordination, - const ContextPtr & context, - ContextMutablePtr mutable_context) +BackupMutablePtr BackupsWorker::openBackupForWriting(const BackupInfo & backup_info, const BackupSettings & backup_settings, std::shared_ptr backup_coordination, const ContextPtr & context) const { - bool on_cluster = !backup_query->cluster.empty(); - assert(!on_cluster || mutable_context); - - /// Checks access rights if this is not ON CLUSTER query. - /// (If this is ON CLUSTER query executeDDLQueryOnCluster() will check access rights later.) - auto required_access = BackupUtils::getRequiredAccessToBackup(backup_query->elements); - if (!on_cluster) - context->checkAccess(required_access); - - ClusterPtr cluster; - if (on_cluster) - { - backup_query->cluster = context->getMacros()->expand(backup_query->cluster); - cluster = context->getCluster(backup_query->cluster); - backup_settings.cluster_host_ids = cluster->getHostIDs(); - } - - /// Make a backup coordination. - if (!backup_coordination) - backup_coordination = makeBackupCoordination(context, backup_settings, /* remote= */ on_cluster); - - if (!allow_concurrent_backups && backup_coordination->hasConcurrentBackups(std::ref(num_active_backups))) - throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Concurrent backups not supported, turn on setting 'allow_concurrent_backups'"); - - /// Opens a backup for writing. 
+ LOG_TRACE(log, "Opening backup for writing"); BackupFactory::CreateParams backup_create_params; backup_create_params.open_mode = IBackup::OpenMode::WRITE; backup_create_params.context = context; @@ -608,37 +487,57 @@ void BackupsWorker::doBackup( backup_create_params.azure_attempt_to_create_container = backup_settings.azure_attempt_to_create_container; backup_create_params.read_settings = getReadSettingsForBackup(context, backup_settings); backup_create_params.write_settings = getWriteSettingsForBackup(context); - backup = BackupFactory::instance().createBackup(backup_create_params); + auto backup = BackupFactory::instance().createBackup(backup_create_params); + LOG_INFO(log, "Opened backup for writing"); + return backup; +} + + +void BackupsWorker::doBackup( + BackupMutablePtr backup, + const std::shared_ptr & backup_query, + const OperationID & backup_id, + const String & backup_name_for_logging, + const BackupSettings & backup_settings, + std::shared_ptr backup_coordination, + ContextMutablePtr context, + bool on_cluster, + const ClusterPtr & cluster) +{ + bool is_internal_backup = backup_settings.internal; + + /// Checks access rights if this is not ON CLUSTER query. + /// (If this is ON CLUSTER query executeDDLQueryOnCluster() will check access rights later.) + auto required_access = BackupUtils::getRequiredAccessToBackup(backup_query->elements); + if (!on_cluster) + context->checkAccess(required_access); + + maybeSleepForTesting(); /// Write the backup. - if (on_cluster) + if (on_cluster && !is_internal_backup) { - DDLQueryOnClusterParams params; - params.cluster = cluster; - params.only_shard_num = backup_settings.shard_num; - params.only_replica_num = backup_settings.replica_num; - params.access_to_check = required_access; + /// Send the BACKUP query to other hosts. backup_settings.copySettingsToQuery(*backup_query); - - // executeDDLQueryOnCluster() will return without waiting for completion - mutable_context->setSetting("distributed_ddl_task_timeout", Field{0}); - mutable_context->setSetting("distributed_ddl_output_mode", Field{"none"}); - executeDDLQueryOnCluster(backup_query, mutable_context, params); + sendQueryToOtherHosts(*backup_query, cluster, backup_settings.shard_num, backup_settings.replica_num, + context, required_access, backup_coordination->getOnClusterInitializationKeeperRetriesInfo()); + backup_coordination->setBackupQueryWasSentToOtherHosts(); /// Wait until all the hosts have written their backup entries. - backup_coordination->waitForStage(Stage::COMPLETED); - backup_coordination->setStage(Stage::COMPLETED,""); + backup_coordination->waitForOtherHostsToFinish(); } else { backup_query->setCurrentDatabase(context->getCurrentDatabase()); + auto read_settings = getReadSettingsForBackup(context, backup_settings); + /// Prepare backup entries. 
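+        /// The per-host flow below is: collect the backup entries, build the file infos and share them
+        /// through the coordination, then write the entries to the backup.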
BackupEntries backup_entries; { BackupEntriesCollector backup_entries_collector( backup_query->elements, backup_settings, backup_coordination, - backup_create_params.read_settings, context, getThreadPool(ThreadPoolId::BACKUP_MAKE_FILES_LIST)); + read_settings, context, getThreadPool(ThreadPoolId::BACKUP)); backup_entries = backup_entries_collector.run(); } @@ -646,11 +545,11 @@ void BackupsWorker::doBackup( chassert(backup); chassert(backup_coordination); chassert(context); - buildFileInfosForBackupEntries(backup, backup_entries, backup_create_params.read_settings, backup_coordination, context->getProcessListElement()); - writeBackupEntries(backup, std::move(backup_entries), backup_id, backup_coordination, backup_settings.internal, context->getProcessListElement()); + buildFileInfosForBackupEntries(backup, backup_entries, read_settings, backup_coordination, context->getProcessListElement()); + writeBackupEntries(backup, std::move(backup_entries), backup_id, backup_coordination, is_internal_backup, context->getProcessListElement()); - /// We have written our backup entries, we need to tell other hosts (they could be waiting for it). - backup_coordination->setStage(Stage::COMPLETED,""); + /// We have written our backup entries (there is no need to sync it with other hosts because it's the last stage). + backup_coordination->setStage(Stage::COMPLETED, "", /* sync = */ false); } size_t num_files = 0; @@ -660,9 +559,9 @@ void BackupsWorker::doBackup( UInt64 compressed_size = 0; /// Finalize backup (write its metadata). - if (!backup_settings.internal) + backup->finalizeWriting(); + if (!is_internal_backup) { - backup->finalizeWriting(); num_files = backup->getNumFiles(); total_size = backup->getTotalSize(); num_entries = backup->getNumEntries(); @@ -673,19 +572,22 @@ void BackupsWorker::doBackup( /// Close the backup. backup.reset(); - LOG_INFO(log, "{} {} was created successfully", (backup_settings.internal ? "Internal backup" : "Backup"), backup_name_for_logging); + /// The backup coordination is not needed anymore. + backup_coordination->finish(); + /// NOTE: we need to update metadata again after backup->finalizeWriting(), because backup metadata is written there. setNumFilesAndSize(backup_id, num_files, total_size, num_entries, uncompressed_size, compressed_size, 0, 0); + /// NOTE: setStatus is called after setNumFilesAndSize in order to have actual information in a backup log record + LOG_INFO(log, "{} {} was created successfully", (is_internal_backup ? 
"Internal backup" : "Backup"), backup_name_for_logging); setStatus(backup_id, BackupStatus::BACKUP_CREATED); } void BackupsWorker::buildFileInfosForBackupEntries(const BackupPtr & backup, const BackupEntries & backup_entries, const ReadSettings & read_settings, std::shared_ptr backup_coordination, QueryStatusPtr process_list_element) { - backup_coordination->setStage(Stage::BUILDING_FILE_INFOS, ""); - backup_coordination->waitForStage(Stage::BUILDING_FILE_INFOS); - backup_coordination->addFileInfos(::DB::buildFileInfosForBackupEntries(backup_entries, backup->getBaseBackup(), read_settings, getThreadPool(ThreadPoolId::BACKUP_MAKE_FILES_LIST), process_list_element)); + backup_coordination->setStage(Stage::BUILDING_FILE_INFOS, "", /* sync = */ true); + backup_coordination->addFileInfos(::DB::buildFileInfosForBackupEntries(backup_entries, backup->getBaseBackup(), read_settings, getThreadPool(ThreadPoolId::BACKUP), process_list_element)); } @@ -694,12 +596,11 @@ void BackupsWorker::writeBackupEntries( BackupEntries && backup_entries, const OperationID & backup_id, std::shared_ptr backup_coordination, - bool internal, + bool is_internal_backup, QueryStatusPtr process_list_element) { LOG_TRACE(log, "{}, num backup entries={}", Stage::WRITING_BACKUP, backup_entries.size()); - backup_coordination->setStage(Stage::WRITING_BACKUP, ""); - backup_coordination->waitForStage(Stage::WRITING_BACKUP); + backup_coordination->setStage(Stage::WRITING_BACKUP, "", /* sync = */ true); auto file_infos = backup_coordination->getFileInfos(); if (file_infos.size() != backup_entries.size()) @@ -715,7 +616,7 @@ void BackupsWorker::writeBackupEntries( std::atomic_bool failed = false; bool always_single_threaded = !backup->supportsWritingInMultipleThreads(); - auto & thread_pool = getThreadPool(ThreadPoolId::BACKUP_COPY_FILES); + auto & thread_pool = getThreadPool(ThreadPoolId::BACKUP); std::vector writing_order; if (test_randomize_order) @@ -751,7 +652,7 @@ void BackupsWorker::writeBackupEntries( maybeSleepForTesting(); // Update metadata - if (!internal) + if (!is_internal_backup) { setNumFilesAndSize( backup_id, @@ -783,142 +684,139 @@ void BackupsWorker::writeBackupEntries( } -OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context) +struct BackupsWorker::RestoreStarter { - auto restore_query = std::static_pointer_cast(query->clone()); - auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query); - - auto backup_info = BackupInfo::fromAST(*restore_query->backup_name); - String backup_name_for_logging = backup_info.toStringForLogging(); - - if (!restore_settings.restore_uuid) - restore_settings.restore_uuid = UUIDHelpers::generateV4(); - - /// `restore_id` will be used as a key to the `infos` map, so it should be unique. - OperationID restore_id; - if (restore_settings.internal) - restore_id = "internal-" + toString(UUIDHelpers::generateV4()); /// Always generate `restore_id` for internal restore to avoid collision if both internal and non-internal restores are on the same host - else if (!restore_settings.id.empty()) - restore_id = restore_settings.id; - else - restore_id = toString(*restore_settings.restore_uuid); - + BackupsWorker & backups_worker; + std::shared_ptr restore_query; + ContextPtr query_context; /// We have to keep `query_context` until the end of the operation because a pointer to it is stored inside the ThreadGroup we're using. 
+    ContextMutablePtr restore_context;
+    RestoreSettings restore_settings;
+    BackupInfo backup_info;
+    String restore_id;
+    String backup_name_for_logging;
+    bool on_cluster;
+    bool is_internal_restore;
     std::shared_ptr<IRestoreCoordination> restore_coordination;
+    ClusterPtr cluster;
+    std::shared_ptr<ProcessListEntry> process_list_element_holder;
 
-    /// Called in exception handlers below. This lambda function can be called on a separate thread, so it can't capture local variables by reference.
-    auto on_exception = [this](const OperationID & restore_id_, const String & backup_name_for_logging_,
-                               const RestoreSettings & restore_settings_, const std::shared_ptr<IRestoreCoordination> & restore_coordination_)
+    RestoreStarter(BackupsWorker & backups_worker_, const ASTPtr & query_, const ContextPtr & context_)
+        : backups_worker(backups_worker_)
+        , restore_query(std::static_pointer_cast<ASTBackupQuery>(query_->clone()))
+        , query_context(context_)
+        , restore_context(Context::createCopy(query_context))
     {
-        /// Something bad happened, some data were not restored.
-        tryLogCurrentException(log, fmt::format("Failed to restore from {} {}", (restore_settings_.internal ? "internal backup" : "backup"), backup_name_for_logging_));
-        setStatusSafe(restore_id_, getRestoreStatusFromCurrentException());
-        sendCurrentExceptionToCoordination(restore_coordination_);
-    };
+        restore_context->makeQueryContext();
+        restore_settings = RestoreSettings::fromRestoreQuery(*restore_query);
+        backup_info = BackupInfo::fromAST(*restore_query->backup_name);
+        backup_name_for_logging = backup_info.toStringForLogging();
+        is_internal_restore = restore_settings.internal;
+        on_cluster = !restore_query->cluster.empty() || is_internal_restore;
+
+        if (!restore_settings.restore_uuid)
+            restore_settings.restore_uuid = UUIDHelpers::generateV4();
+
+        /// `restore_id` will be used as a key to the `infos` map, so it should be unique.
+        if (is_internal_restore)
+            restore_id = "internal-" + toString(UUIDHelpers::generateV4()); /// Always generate `restore_id` for internal restore to avoid collision if both internal and non-internal restores are on the same host
+        else if (!restore_settings.id.empty())
+            restore_id = restore_settings.id;
+        else
+            restore_id = toString(*restore_settings.restore_uuid);
 
-    try
-    {
         String base_backup_name;
         if (restore_settings.base_backup_info)
             base_backup_name = restore_settings.base_backup_info->toStringForLogging();
 
-        addInfo(restore_id,
+        /// process_list_element_holder is used to make an element in ProcessList live while RESTORE is working asynchronously.
+        auto process_list_element = restore_context->getProcessListElement();
+        if (process_list_element)
+            process_list_element_holder = process_list_element->getProcessListEntry();
+
+        backups_worker.addInfo(restore_id,
                 backup_name_for_logging,
                 base_backup_name,
-                context->getCurrentQueryId(),
-                restore_settings.internal,
-                context->getProcessListElement(),
+                restore_context->getCurrentQueryId(),
+                is_internal_restore,
+                process_list_element,
                 BackupStatus::RESTORING);
+    }
 
-        if (restore_settings.internal)
+    void doRestore()
+    {
+        chassert(!restore_coordination);
+        if (on_cluster && !is_internal_restore)
         {
-            /// The following call of makeRestoreCoordination() is not essential because doRestore() will later create a restore coordination
-            /// if it's not created here. However to handle errors better it's better to make a coordination here because this way
-            /// if an exception will be thrown in startRestoring() other hosts will know about that.
- restore_coordination = makeRestoreCoordination(context, restore_settings, /* remote= */ true); + restore_query->cluster = restore_context->getMacros()->expand(restore_query->cluster); + cluster = restore_context->getCluster(restore_query->cluster); + restore_settings.cluster_host_ids = cluster->getHostIDs(); + } + restore_coordination = backups_worker.makeRestoreCoordination(on_cluster, restore_settings, restore_context); + + backups_worker.doRestore( + restore_query, + restore_id, + backup_name_for_logging, + backup_info, + restore_settings, + restore_coordination, + restore_context, + on_cluster, + cluster); + } + + void onException() + { + /// Something bad happened, some data were not restored. + tryLogCurrentException(backups_worker.log, fmt::format("Failed to restore from {} {}", (is_internal_restore ? "internal backup" : "backup"), backup_name_for_logging)); + + if (restore_coordination && restore_coordination->trySetError(std::current_exception())) + { + restore_coordination->tryWaitForOtherHostsToFinishAfterError(); + restore_coordination->tryFinishAfterError(); } - /// Prepare context to use. - ContextMutablePtr context_in_use = context; - bool on_cluster = !restore_query->cluster.empty(); - if (restore_settings.async || on_cluster) - { - /// We have to clone the query context here because: - /// if this is an "ON CLUSTER" query we need to change some settings, and - /// if this is an "ASYNC" query it's going to be executed in another thread. - context_in_use = Context::createCopy(context); - context_in_use->makeQueryContext(); - } + backups_worker.setStatusSafe(restore_id, getRestoreStatusFromCurrentException()); + } +}; - if (restore_settings.async) - { - auto & thread_pool = getThreadPool(on_cluster ? ThreadPoolId::RESTORE_ASYNC_ON_CLUSTER : ThreadPoolId::RESTORE_ASYNC); - /// process_list_element_holder is used to make an element in ProcessList live while RESTORE is working asynchronously. - auto process_list_element = context_in_use->getProcessListElement(); +std::pair BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePtr context) +{ + auto starter = std::make_shared(*this, query, context); - thread_pool.scheduleOrThrowOnError( - [this, - restore_query, - restore_id, - backup_name_for_logging, - backup_info, - restore_settings, - restore_coordination, - context_in_use, - on_exception, - process_list_element_holder = process_list_element ? process_list_element->getProcessListEntry() : nullptr] + try + { + auto thread_pool_id = starter->is_internal_restore ? ThreadPoolId::ASYNC_BACKGROUND_INTERNAL_RESTORE : ThreadPoolId::ASYNC_BACKGROUND_RESTORE; + String thread_name = starter->is_internal_restore ? "RestoreAsyncInt" : "RestoreAsync"; + auto schedule = threadPoolCallbackRunnerUnsafe(thread_pools->getThreadPool(thread_pool_id), thread_name); + + schedule([starter] + { + try { - try - { - setThreadName("RestorerWorker"); - CurrentThread::QueryScope query_scope(context_in_use); - doRestore( - restore_query, - restore_id, - backup_name_for_logging, - backup_info, - restore_settings, - restore_coordination, - context_in_use); - } - catch (...) - { - on_exception(restore_id, backup_name_for_logging, restore_settings, restore_coordination); - } - }); - } - else - { - doRestore( - restore_query, - restore_id, - backup_name_for_logging, - backup_info, - restore_settings, - restore_coordination, - context_in_use); - } + starter->doRestore(); + } + catch (...) 
+ { + starter->onException(); + } + }, + Priority{}); - return restore_id; + return {starter->restore_id, BackupStatus::RESTORING}; } catch (...) { - on_exception(restore_id, backup_name_for_logging, restore_settings, restore_coordination); + starter->onException(); throw; } } -void BackupsWorker::doRestore( - const std::shared_ptr & restore_query, - const OperationID & restore_id, - const String & backup_name_for_logging, - const BackupInfo & backup_info, - RestoreSettings restore_settings, - std::shared_ptr restore_coordination, - ContextMutablePtr context) +BackupPtr BackupsWorker::openBackupForReading(const BackupInfo & backup_info, const RestoreSettings & restore_settings, const ContextPtr & context) const { - /// Open the backup for reading. + LOG_TRACE(log, "Opening backup for reading"); BackupFactory::CreateParams backup_open_params; backup_open_params.open_mode = IBackup::OpenMode::READ; backup_open_params.context = context; @@ -931,32 +829,35 @@ void BackupsWorker::doRestore( backup_open_params.read_settings = getReadSettingsForRestore(context); backup_open_params.write_settings = getWriteSettingsForRestore(context); backup_open_params.is_internal_backup = restore_settings.internal; - BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params); + auto backup = BackupFactory::instance().createBackup(backup_open_params); + LOG_TRACE(log, "Opened backup for reading"); + return backup; +} + + +void BackupsWorker::doRestore( + const std::shared_ptr & restore_query, + const OperationID & restore_id, + const String & backup_name_for_logging, + const BackupInfo & backup_info, + RestoreSettings restore_settings, + std::shared_ptr restore_coordination, + ContextMutablePtr context, + bool on_cluster, + const ClusterPtr & cluster) +{ + bool is_internal_restore = restore_settings.internal; + + maybeSleepForTesting(); + + /// Open the backup for reading. + BackupPtr backup = openBackupForReading(backup_info, restore_settings, context); String current_database = context->getCurrentDatabase(); + /// Checks access rights if this is ON CLUSTER query. /// (If this isn't ON CLUSTER query RestorerFromBackup will check access rights later.) - ClusterPtr cluster; - bool on_cluster = !restore_query->cluster.empty(); - - if (on_cluster) - { - restore_query->cluster = context->getMacros()->expand(restore_query->cluster); - cluster = context->getCluster(restore_query->cluster); - restore_settings.cluster_host_ids = cluster->getHostIDs(); - } - - /// Make a restore coordination. - if (!restore_coordination) - restore_coordination = makeRestoreCoordination(context, restore_settings, /* remote= */ on_cluster); - - if (!allow_concurrent_restores && restore_coordination->hasConcurrentRestores(std::ref(num_active_restores))) - throw Exception( - ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, - "Concurrent restores not supported, turn on setting 'allow_concurrent_restores'"); - - - if (on_cluster) + if (on_cluster && !is_internal_restore) { /// We cannot just use access checking provided by the function executeDDLQueryOnCluster(): it would be incorrect /// because different replicas can contain different set of tables and so the required access rights can differ too. @@ -975,27 +876,21 @@ void BackupsWorker::doRestore( } /// Do RESTORE. 
-    if (on_cluster)
+    if (on_cluster && !is_internal_restore)
     {
-
-        DDLQueryOnClusterParams params;
-        params.cluster = cluster;
-        params.only_shard_num = restore_settings.shard_num;
-        params.only_replica_num = restore_settings.replica_num;
+        /// Send the RESTORE query to other hosts.
         restore_settings.copySettingsToQuery(*restore_query);
+        sendQueryToOtherHosts(*restore_query, cluster, restore_settings.shard_num, restore_settings.replica_num,
+            context, {}, restore_coordination->getOnClusterInitializationKeeperRetriesInfo());
+        restore_coordination->setRestoreQueryWasSentToOtherHosts();
 
-        // executeDDLQueryOnCluster() will return without waiting for completion
-        context->setSetting("distributed_ddl_task_timeout", Field{0});
-        context->setSetting("distributed_ddl_output_mode", Field{"none"});
-
-        executeDDLQueryOnCluster(restore_query, context, params);
-
-        /// Wait until all the hosts have written their backup entries.
-        restore_coordination->waitForStage(Stage::COMPLETED);
-        restore_coordination->setStage(Stage::COMPLETED,"");
+        /// Wait until all the hosts are done with their restoring work.
+        restore_coordination->waitForOtherHostsToFinish();
     }
     else
     {
+        maybeSleepForTesting();
+
         restore_query->setCurrentDatabase(current_database);
 
         auto after_task_callback = [&]
@@ -1011,11 +906,115 @@ void BackupsWorker::doRestore(
         restorer.run(RestorerFromBackup::RESTORE);
     }
 
-    LOG_INFO(log, "Restored from {} {} successfully", (restore_settings.internal ? "internal backup" : "backup"), backup_name_for_logging);
+    /// The restore coordination is not needed anymore.
+    restore_coordination->finish();
+
+    LOG_INFO(log, "Restored from {} {} successfully", (is_internal_restore ? "internal backup" : "backup"), backup_name_for_logging);
     setStatus(restore_id, BackupStatus::RESTORED);
 }
 
 
+void BackupsWorker::sendQueryToOtherHosts(const ASTBackupQuery & backup_or_restore_query, const ClusterPtr & cluster,
+    size_t only_shard_num, size_t only_replica_num, ContextMutablePtr context, const AccessRightsElements & access_to_check,
+    const ZooKeeperRetriesInfo & retries_info) const
+{
+    chassert(cluster);
+
+    DDLQueryOnClusterParams params;
+    params.cluster = cluster;
+    params.only_shard_num = only_shard_num;
+    params.only_replica_num = only_replica_num;
+    params.access_to_check = access_to_check;
+    params.retries_info = retries_info;
+
+    context->setSetting("distributed_ddl_task_timeout", Field{0});
+    context->setSetting("distributed_ddl_output_mode", Field{"never_throw"});
+
+    // executeDDLQueryOnCluster() will return without waiting for completion
+    executeDDLQueryOnCluster(backup_or_restore_query.clone(), context, params);
+
+    maybeSleepForTesting();
+}
+
+
+std::shared_ptr<IBackupCoordination>
+BackupsWorker::makeBackupCoordination(bool on_cluster, const BackupSettings & backup_settings, const ContextPtr & context) const
+{
+    if (!on_cluster)
+    {
+        return std::make_shared<BackupCoordinationLocal>(
+            *backup_settings.backup_uuid, !backup_settings.deduplicate_files, allow_concurrent_backups, *concurrency_counters);
+    }
+
+    bool is_internal_backup = backup_settings.internal;
+
+    String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups");
+    auto get_zookeeper = [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); };
+    auto keeper_settings = BackupKeeperSettings::fromContext(context);
+
+    auto all_hosts = BackupSettings::Util::filterHostIDs(
+        backup_settings.cluster_host_ids, backup_settings.shard_num, backup_settings.replica_num);
all_hosts.emplace_back(BackupCoordinationOnCluster::kInitiator); + + String current_host = is_internal_backup ? backup_settings.host_id : String{BackupCoordinationOnCluster::kInitiator}; + + auto thread_pool_id = is_internal_backup ? ThreadPoolId::ON_CLUSTER_COORDINATION_INTERNAL_BACKUP : ThreadPoolId::ON_CLUSTER_COORDINATION_BACKUP; + String thread_name = is_internal_backup ? "BackupCoordInt" : "BackupCoord"; + auto schedule = threadPoolCallbackRunnerUnsafe(thread_pools->getThreadPool(thread_pool_id), thread_name); + + return std::make_shared( + *backup_settings.backup_uuid, + !backup_settings.deduplicate_files, + root_zk_path, + get_zookeeper, + keeper_settings, + current_host, + all_hosts, + allow_concurrent_backups, + *concurrency_counters, + schedule, + context->getProcessListElement()); +} + +std::shared_ptr +BackupsWorker::makeRestoreCoordination(bool on_cluster, const RestoreSettings & restore_settings, const ContextPtr & context) const +{ + if (!on_cluster) + { + return std::make_shared( + *restore_settings.restore_uuid, allow_concurrent_restores, *concurrency_counters); + } + + bool is_internal_restore = restore_settings.internal; + + String root_zk_path = context->getConfigRef().getString("backups.zookeeper_path", "/clickhouse/backups"); + auto get_zookeeper = [global_context = context->getGlobalContext()] { return global_context->getZooKeeper(); }; + auto keeper_settings = BackupKeeperSettings::fromContext(context); + + auto all_hosts = BackupSettings::Util::filterHostIDs( + restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num); + all_hosts.emplace_back(BackupCoordinationOnCluster::kInitiator); + + String current_host = is_internal_restore ? restore_settings.host_id : String{RestoreCoordinationOnCluster::kInitiator}; + + auto thread_pool_id = is_internal_restore ? ThreadPoolId::ON_CLUSTER_COORDINATION_INTERNAL_RESTORE : ThreadPoolId::ON_CLUSTER_COORDINATION_RESTORE; + String thread_name = is_internal_restore ? "RestoreCoordInt" : "RestoreCoord"; + auto schedule = threadPoolCallbackRunnerUnsafe(thread_pools->getThreadPool(thread_pool_id), thread_name); + + return std::make_shared( + *restore_settings.restore_uuid, + root_zk_path, + get_zookeeper, + keeper_settings, + current_host, + all_hosts, + allow_concurrent_restores, + *concurrency_counters, + schedule, + context->getProcessListElement()); +} + + void BackupsWorker::addInfo(const OperationID & id, const String & name, const String & base_backup_name, const String & query_id, bool internal, QueryStatusPtr process_list_element, BackupStatus status) { @@ -1135,23 +1134,25 @@ void BackupsWorker::maybeSleepForTesting() const } -void BackupsWorker::wait(const OperationID & backup_or_restore_id, bool rethrow_exception) +BackupStatus BackupsWorker::wait(const OperationID & backup_or_restore_id, bool rethrow_exception) { std::unique_lock lock{infos_mutex}; + BackupStatus current_status; status_changed.wait(lock, [&] { auto it = infos.find(backup_or_restore_id); if (it == infos.end()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown backup ID {}", backup_or_restore_id); const auto & info = it->second.info; - auto current_status = info.status; + current_status = info.status; if (rethrow_exception && isFailedOrCancelled(current_status)) std::rethrow_exception(info.exception); if (isFinalStatus(current_status)) return true; - LOG_INFO(log, "Waiting {} {}", isBackupStatus(info.status) ? "backup" : "restore", info.name); + LOG_INFO(log, "Waiting {} {} to complete", isBackupStatus(current_status) ? 
"backup" : "restore", info.name); return false; }); + return current_status; } void BackupsWorker::waitAll() @@ -1175,9 +1176,11 @@ void BackupsWorker::waitAll() LOG_INFO(log, "Backups and restores finished"); } -void BackupsWorker::cancel(const BackupOperationID & backup_or_restore_id, bool wait_) +BackupStatus BackupsWorker::cancel(const BackupOperationID & backup_or_restore_id, bool wait_) { QueryStatusPtr process_list_element; + BackupStatus current_status; + { std::unique_lock lock{infos_mutex}; auto it = infos.find(backup_or_restore_id); @@ -1186,17 +1189,20 @@ void BackupsWorker::cancel(const BackupOperationID & backup_or_restore_id, bool const auto & extended_info = it->second; const auto & info = extended_info.info; - if (isFinalStatus(info.status) || !extended_info.process_list_element) - return; + current_status = info.status; + if (isFinalStatus(current_status) || !extended_info.process_list_element) + return current_status; - LOG_INFO(log, "Cancelling {} {}", isBackupStatus(info.status) ? "backup" : "restore", info.name); + LOG_INFO(log, "Cancelling {} {}", isBackupStatus(current_status) ? "backup" : "restore", info.name); process_list_element = extended_info.process_list_element; } process_list.sendCancelToQuery(process_list_element); - if (wait_) - wait(backup_or_restore_id, /* rethrow_exception= */ false); + if (!wait_) + return current_status; + + return wait(backup_or_restore_id, /* rethrow_exception= */ false); } diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h index 946562b575f..37f91e269a9 100644 --- a/src/Backups/BackupsWorker.h +++ b/src/Backups/BackupsWorker.h @@ -23,6 +23,7 @@ using BackupMutablePtr = std::shared_ptr; using BackupPtr = std::shared_ptr; class IBackupEntry; using BackupEntries = std::vector>>; +class BackupConcurrencyCounters; using DataRestoreTasks = std::vector>; struct ReadSettings; class BackupLog; @@ -31,6 +32,10 @@ using ThreadGroupPtr = std::shared_ptr; class QueryStatus; using QueryStatusPtr = std::shared_ptr; class ProcessList; +class Cluster; +using ClusterPtr = std::shared_ptr; +class AccessRightsElements; +struct ZooKeeperRetriesInfo; /// Manager of backups and restores: executes backups and restores' threads in the background. @@ -47,18 +52,18 @@ public: /// Starts executing a BACKUP or RESTORE query. Returns ID of the operation. /// For asynchronous operations the function throws no exceptions on failure usually, /// call getInfo() on a returned operation id to check for errors. - BackupOperationID start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context); + std::pair start(const ASTPtr & backup_or_restore_query, ContextMutablePtr context); /// Waits until the specified backup or restore operation finishes or stops. /// The function returns immediately if the operation is already finished. - void wait(const BackupOperationID & backup_or_restore_id, bool rethrow_exception = true); + BackupStatus wait(const BackupOperationID & backup_or_restore_id, bool rethrow_exception = true); /// Waits until all running backup and restore operations finish or stop. void waitAll(); /// Cancels the specified backup or restore operation. /// The function does nothing if this operation has already finished. - void cancel(const BackupOperationID & backup_or_restore_id, bool wait_ = true); + BackupStatus cancel(const BackupOperationID & backup_or_restore_id, bool wait_ = true); /// Cancels all running backup and restore operations. 
void cancelAll(bool wait_ = true); @@ -67,26 +72,32 @@ public: std::vector getAllInfos() const; private: - BackupOperationID startMakingBackup(const ASTPtr & query, const ContextPtr & context); + std::pair startMakingBackup(const ASTPtr & query, const ContextPtr & context); + struct BackupStarter; + + BackupMutablePtr openBackupForWriting(const BackupInfo & backup_info, const BackupSettings & backup_settings, std::shared_ptr backup_coordination, const ContextPtr & context) const; void doBackup( - BackupMutablePtr & backup, + BackupMutablePtr backup, const std::shared_ptr & backup_query, const BackupOperationID & backup_id, const String & backup_name_for_logging, - const BackupInfo & backup_info, - BackupSettings backup_settings, + const BackupSettings & backup_settings, std::shared_ptr backup_coordination, - const ContextPtr & context, - ContextMutablePtr mutable_context); + ContextMutablePtr context, + bool on_cluster, + const ClusterPtr & cluster); /// Builds file infos for specified backup entries. void buildFileInfosForBackupEntries(const BackupPtr & backup, const BackupEntries & backup_entries, const ReadSettings & read_settings, std::shared_ptr backup_coordination, QueryStatusPtr process_list_element); /// Write backup entries to an opened backup. - void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, const BackupOperationID & backup_id, std::shared_ptr backup_coordination, bool internal, QueryStatusPtr process_list_element); + void writeBackupEntries(BackupMutablePtr backup, BackupEntries && backup_entries, const BackupOperationID & backup_id, std::shared_ptr backup_coordination, bool is_internal_backup, QueryStatusPtr process_list_element); - BackupOperationID startRestoring(const ASTPtr & query, ContextMutablePtr context); + std::pair startRestoring(const ASTPtr & query, ContextMutablePtr context); + struct RestoreStarter; + + BackupPtr openBackupForReading(const BackupInfo & backup_info, const RestoreSettings & restore_settings, const ContextPtr & context) const; void doRestore( const std::shared_ptr & restore_query, @@ -95,7 +106,17 @@ private: const BackupInfo & backup_info, RestoreSettings restore_settings, std::shared_ptr restore_coordination, - ContextMutablePtr context); + ContextMutablePtr context, + bool on_cluster, + const ClusterPtr & cluster); + + std::shared_ptr makeBackupCoordination(bool on_cluster, const BackupSettings & backup_settings, const ContextPtr & context) const; + std::shared_ptr makeRestoreCoordination(bool on_cluster, const RestoreSettings & restore_settings, const ContextPtr & context) const; + + /// Sends a BACKUP or RESTORE query to other hosts. + void sendQueryToOtherHosts(const ASTBackupQuery & backup_or_restore_query, const ClusterPtr & cluster, + size_t only_shard_num, size_t only_replica_num, ContextMutablePtr context, const AccessRightsElements & access_to_check, + const ZooKeeperRetriesInfo & retries_info) const; /// Run data restoring tasks which insert data to tables. 
     void restoreTablesData(const BackupOperationID & restore_id, BackupPtr backup, DataRestoreTasks && tasks, ThreadPool & thread_pool, QueryStatusPtr process_list_element);
 
@@ -139,6 +160,8 @@ private:
     std::shared_ptr<BackupLog> backup_log;
     ProcessList & process_list;
+
+    std::unique_ptr<BackupConcurrencyCounters> concurrency_counters;
 };
 
 }
diff --git a/src/Backups/IBackup.h b/src/Backups/IBackup.h
index 0aa2d34657f..126b4d764da 100644
--- a/src/Backups/IBackup.h
+++ b/src/Backups/IBackup.h
@@ -121,8 +121,13 @@ public:
     /// Finalizes writing the backup, should be called after all entries have been successfully written.
     virtual void finalizeWriting() = 0;
 
-    /// Try to remove all files copied to the backup. Used after an exception or it the backup was cancelled.
-    virtual void tryRemoveAllFiles() = 0;
+    /// Sets that a non-retriable error happened while the backup was being written which means that
+    /// the backup is most likely corrupted and it can't be finalized.
+    /// This function is called while handling an exception or if the backup was cancelled.
+    virtual bool setIsCorrupted() noexcept = 0;
+
+    /// Try to remove all files copied to the backup. Could be used after setIsCorrupted().
+    virtual bool tryRemoveAllFiles() noexcept = 0;
 };
 
 using BackupPtr = std::shared_ptr<const IBackup>;
diff --git a/src/Backups/IBackupCoordination.h b/src/Backups/IBackupCoordination.h
index 166a2c5bbbc..c0eb90de89b 100644
--- a/src/Backups/IBackupCoordination.h
+++ b/src/Backups/IBackupCoordination.h
@@ -5,26 +5,44 @@
 namespace DB
 {
 
-class Exception;
 struct BackupFileInfo;
 using BackupFileInfos = std::vector<BackupFileInfo>;
 enum class AccessEntityType : uint8_t;
 enum class UserDefinedSQLObjectType : uint8_t;
+struct ZooKeeperRetriesInfo;
 
 /// Replicas use this class to coordinate what they're writing to a backup while executing BACKUP ON CLUSTER.
-/// There are two implementation of this interface: BackupCoordinationLocal and BackupCoordinationRemote.
+/// There are two implementations of this interface: BackupCoordinationLocal and BackupCoordinationOnCluster.
 /// BackupCoordinationLocal is used while executing BACKUP without ON CLUSTER and performs coordination in memory.
-/// BackupCoordinationRemote is used while executing BACKUP with ON CLUSTER and performs coordination via ZooKeeper.
+/// BackupCoordinationOnCluster is used while executing BACKUP with ON CLUSTER and performs coordination via ZooKeeper.
 class IBackupCoordination
 {
 public:
     virtual ~IBackupCoordination() = default;
 
     /// Sets the current stage and waits for other hosts to come to this stage too.
-    virtual void setStage(const String & new_stage, const String & message) = 0;
-    virtual void setError(const Exception & exception) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;
+    virtual Strings setStage(const String & new_stage, const String & message, bool sync) = 0;
+
+    /// Sets that the backup query was sent to other hosts.
+    /// Function waitForOtherHostsToFinish() will check that to find out if it should really wait or not.
+    virtual void setBackupQueryWasSentToOtherHosts() = 0;
+
+    /// Lets other hosts know that the current host has encountered an error.
+    virtual bool trySetError(std::exception_ptr exception) = 0;
+
+    /// Lets other hosts know that the current host has finished its work.
+    virtual void finish() = 0;
+
+    /// Lets other hosts know that the current host has finished its work (as a part of error-handling process).
+    virtual bool tryFinishAfterError() noexcept = 0;
+
+    /// Waits until all the other hosts finish their work.
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    virtual void waitForOtherHostsToFinish() = 0;
+
+    /// Waits until all the other hosts finish their work (as a part of error-handling process).
+    /// Doesn't stop waiting if some host encounters an error or gets cancelled.
+    virtual bool tryWaitForOtherHostsToFinishAfterError() noexcept = 0;
 
     struct PartNameAndChecksum
     {
@@ -87,9 +105,7 @@ public:
     /// Starts writing a specified file, the function returns false if that file is already being written concurrently.
     virtual bool startWritingFile(size_t data_file_index) = 0;
 
-    /// This function is used to check if concurrent backups are running
-    /// other than the backup passed to the function
-    virtual bool hasConcurrentBackups(const std::atomic<size_t> & num_active_backups) const = 0;
+    virtual ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const = 0;
 };
 
 }
diff --git a/src/Backups/IRestoreCoordination.h b/src/Backups/IRestoreCoordination.h
index 37229534286..daabf1745f3 100644
--- a/src/Backups/IRestoreCoordination.h
+++ b/src/Backups/IRestoreCoordination.h
@@ -5,26 +5,42 @@
 namespace DB
 {
 
-class Exception;
 enum class UserDefinedSQLObjectType : uint8_t;
 class ASTCreateQuery;
+struct ZooKeeperRetriesInfo;
 
 /// Replicas use this class to coordinate what they're reading from a backup while executing RESTORE ON CLUSTER.
-/// There are two implementation of this interface: RestoreCoordinationLocal and RestoreCoordinationRemote.
+/// There are two implementations of this interface: RestoreCoordinationLocal and RestoreCoordinationOnCluster.
 /// RestoreCoordinationLocal is used while executing RESTORE without ON CLUSTER and performs coordination in memory.
-/// RestoreCoordinationRemote is used while executing RESTORE with ON CLUSTER and performs coordination via ZooKeeper.
+/// RestoreCoordinationOnCluster is used while executing RESTORE with ON CLUSTER and performs coordination via ZooKeeper.
 class IRestoreCoordination
 {
 public:
     virtual ~IRestoreCoordination() = default;
 
     /// Sets the current stage and waits for other hosts to come to this stage too.
-    virtual void setStage(const String & new_stage, const String & message) = 0;
-    virtual void setError(const Exception & exception) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait) = 0;
-    virtual Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) = 0;
+    virtual Strings setStage(const String & new_stage, const String & message, bool sync) = 0;
 
-    static constexpr const char * kErrorStatus = "error";
+    /// Sets that the restore query was sent to other hosts.
+    /// Function waitForOtherHostsToFinish() will check that to find out if it should really wait or not.
+    virtual void setRestoreQueryWasSentToOtherHosts() = 0;
+
+    /// Lets other hosts know that the current host has encountered an error.
+    virtual bool trySetError(std::exception_ptr exception) = 0;
+
+    /// Lets other hosts know that the current host has finished its work.
+    virtual void finish() = 0;
+
+    /// Lets other hosts know that the current host has finished its work (as a part of error-handling process).
+    virtual bool tryFinishAfterError() noexcept = 0;
+
+    /// Waits until all the other hosts finish their work.
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    virtual void waitForOtherHostsToFinish() = 0;
+
+    /// Waits until all the other hosts finish their work (as a part of error-handling process).
+    /// Doesn't stop waiting if some host encounters an error or gets cancelled.
+    virtual bool tryWaitForOtherHostsToFinishAfterError() noexcept = 0;
 
     /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
     virtual bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) = 0;
@@ -49,9 +65,7 @@ public:
     /// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly).
     virtual void generateUUIDForTable(ASTCreateQuery & create_query) = 0;
 
-    /// This function is used to check if concurrent restores are running
-    /// other than the restore passed to the function
-    virtual bool hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const = 0;
+    virtual ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const = 0;
 };
 
 }
diff --git a/src/Backups/RestoreCoordinationLocal.cpp b/src/Backups/RestoreCoordinationLocal.cpp
index 9fe22f874b4..569f58f1909 100644
--- a/src/Backups/RestoreCoordinationLocal.cpp
+++ b/src/Backups/RestoreCoordinationLocal.cpp
@@ -1,32 +1,24 @@
 #include
+
 #include
 #include
+#include
 #include
 
 namespace DB
 {
 
-RestoreCoordinationLocal::RestoreCoordinationLocal() : log(getLogger("RestoreCoordinationLocal"))
+RestoreCoordinationLocal::RestoreCoordinationLocal(
+    const UUID & restore_uuid, bool allow_concurrent_restore_, BackupConcurrencyCounters & concurrency_counters_)
+    : log(getLogger("RestoreCoordinationLocal"))
+    , concurrency_check(restore_uuid, /* is_restore = */ true, /* on_cluster = */ false, allow_concurrent_restore_, concurrency_counters_)
 {
 }
 
 RestoreCoordinationLocal::~RestoreCoordinationLocal() = default;
 
-void RestoreCoordinationLocal::setStage(const String &, const String &)
-{
-}
-
-void RestoreCoordinationLocal::setError(const Exception &)
-{
-}
-
-Strings RestoreCoordinationLocal::waitForStage(const String &)
-{
-    return {};
-}
-
-Strings RestoreCoordinationLocal::waitForStage(const String &, std::chrono::milliseconds)
+ZooKeeperRetriesInfo RestoreCoordinationLocal::getOnClusterInitializationKeeperRetriesInfo() const
 {
     return {};
 }
@@ -63,7 +55,7 @@ void RestoreCoordinationLocal::generateUUIDForTable(ASTCreateQuery & create_quer
 {
     String query_str = serializeAST(create_query);
 
-    auto find_in_map = [&]
+    auto find_in_map = [&]() TSA_REQUIRES(mutex)
     {
         auto it = create_query_uuids.find(query_str);
         if (it != create_query_uuids.end())
@@ -91,14 +83,4 @@ void RestoreCoordinationLocal::generateUUIDForTable(ASTCreateQuery & create_quer
     }
 }
 
-bool RestoreCoordinationLocal::hasConcurrentRestores(const std::atomic<size_t> & num_active_restores) const
-{
-    if (num_active_restores > 1)
-    {
-        LOG_WARNING(log, "Found concurrent backups: num_active_restores={}", num_active_restores);
-        return true;
-    }
-    return false;
-}
-
 }
diff --git a/src/Backups/RestoreCoordinationLocal.h b/src/Backups/RestoreCoordinationLocal.h
index 35f93574b68..6be357c4b7e 100644
--- a/src/Backups/RestoreCoordinationLocal.h
+++ b/src/Backups/RestoreCoordinationLocal.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include
+#include
 #include
 #include
 #include
@@ -12,19 +13,20 @@ namespace DB
 {
 
 class ASTCreateQuery;
-
 /// Implementation of the IRestoreCoordination interface performing coordination in memory.
class RestoreCoordinationLocal : public IRestoreCoordination { public: - RestoreCoordinationLocal(); + RestoreCoordinationLocal(const UUID & restore_uuid_, bool allow_concurrent_restore_, BackupConcurrencyCounters & concurrency_counters_); ~RestoreCoordinationLocal() override; - /// Sets the current stage and waits for other hosts to come to this stage too. - void setStage(const String & new_stage, const String & message) override; - void setError(const Exception & exception) override; - Strings waitForStage(const String & stage_to_wait) override; - Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; + Strings setStage(const String &, const String &, bool) override { return {}; } + void setRestoreQueryWasSentToOtherHosts() override {} + bool trySetError(std::exception_ptr) override { return true; } + void finish() override {} + bool tryFinishAfterError() noexcept override { return true; } + void waitForOtherHostsToFinish() override {} + bool tryWaitForOtherHostsToFinishAfterError() noexcept override { return true; } /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table. bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override; @@ -49,15 +51,16 @@ public: /// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly). void generateUUIDForTable(ASTCreateQuery & create_query) override; - bool hasConcurrentRestores(const std::atomic & num_active_restores) const override; + ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override; private: LoggerPtr const log; + BackupConcurrencyCheck concurrency_check; - std::set> acquired_tables_in_replicated_databases; - std::unordered_set acquired_data_in_replicated_tables; - std::unordered_map create_query_uuids; - std::unordered_set acquired_data_in_keeper_map_tables; + std::set> acquired_tables_in_replicated_databases TSA_GUARDED_BY(mutex); + std::unordered_set acquired_data_in_replicated_tables TSA_GUARDED_BY(mutex); + std::unordered_map create_query_uuids TSA_GUARDED_BY(mutex); + std::unordered_set acquired_data_in_keeper_map_tables TSA_GUARDED_BY(mutex); mutable std::mutex mutex; }; diff --git a/src/Backups/RestoreCoordinationOnCluster.cpp b/src/Backups/RestoreCoordinationOnCluster.cpp new file mode 100644 index 00000000000..2029ad8b072 --- /dev/null +++ b/src/Backups/RestoreCoordinationOnCluster.cpp @@ -0,0 +1,318 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +RestoreCoordinationOnCluster::RestoreCoordinationOnCluster( + const UUID & restore_uuid_, + const String & root_zookeeper_path_, + zkutil::GetZooKeeper get_zookeeper_, + const BackupKeeperSettings & keeper_settings_, + const String & current_host_, + const Strings & all_hosts_, + bool allow_concurrent_restore_, + BackupConcurrencyCounters & concurrency_counters_, + ThreadPoolCallbackRunnerUnsafe schedule_, + QueryStatusPtr process_list_element_) + : root_zookeeper_path(root_zookeeper_path_) + , keeper_settings(keeper_settings_) + , restore_uuid(restore_uuid_) + , zookeeper_path(root_zookeeper_path_ + "/restore-" + toString(restore_uuid_)) + , all_hosts(all_hosts_) + , all_hosts_without_initiator(BackupCoordinationOnCluster::excludeInitiator(all_hosts)) + , current_host(current_host_) + , current_host_index(BackupCoordinationOnCluster::findCurrentHostIndex(current_host, all_hosts)) + , 
log(getLogger("RestoreCoordinationOnCluster")) + , with_retries(log, get_zookeeper_, keeper_settings, process_list_element_, [root_zookeeper_path_](Coordination::ZooKeeperWithFaultInjection::Ptr zk) { zk->sync(root_zookeeper_path_); }) + , concurrency_check(restore_uuid_, /* is_restore = */ true, /* on_cluster = */ true, allow_concurrent_restore_, concurrency_counters_) + , stage_sync(/* is_restore = */ true, fs::path{zookeeper_path} / "stage", current_host, all_hosts, allow_concurrent_restore_, with_retries, schedule_, process_list_element_, log) + , cleaner(zookeeper_path, with_retries, log) +{ + createRootNodes(); +} + +RestoreCoordinationOnCluster::~RestoreCoordinationOnCluster() +{ + tryFinishImpl(); +} + +void RestoreCoordinationOnCluster::createRootNodes() +{ + auto holder = with_retries.createRetriesControlHolder("createRootNodes", WithRetries::kInitialization); + holder.retries_ctl.retryLoop( + [&, &zk = holder.faulty_zookeeper]() + { + with_retries.renewZooKeeper(zk); + + zk->createAncestors(zookeeper_path); + zk->createIfNotExists(zookeeper_path, ""); + zk->createIfNotExists(zookeeper_path + "/repl_databases_tables_acquired", ""); + zk->createIfNotExists(zookeeper_path + "/repl_tables_data_acquired", ""); + zk->createIfNotExists(zookeeper_path + "/repl_access_storages_acquired", ""); + zk->createIfNotExists(zookeeper_path + "/repl_sql_objects_acquired", ""); + zk->createIfNotExists(zookeeper_path + "/keeper_map_tables", ""); + zk->createIfNotExists(zookeeper_path + "/table_uuids", ""); + }); +} + +Strings RestoreCoordinationOnCluster::setStage(const String & new_stage, const String & message, bool sync) +{ + stage_sync.setStage(new_stage, message); + + if (!sync) + return {}; + + return stage_sync.waitForHostsToReachStage(new_stage, all_hosts_without_initiator); +} + +void RestoreCoordinationOnCluster::setRestoreQueryWasSentToOtherHosts() +{ + restore_query_was_sent_to_other_hosts = true; +} + +bool RestoreCoordinationOnCluster::trySetError(std::exception_ptr exception) +{ + return stage_sync.trySetError(exception); +} + +void RestoreCoordinationOnCluster::finish() +{ + bool other_hosts_also_finished = false; + stage_sync.finish(other_hosts_also_finished); + + if ((current_host == kInitiator) && (other_hosts_also_finished || !restore_query_was_sent_to_other_hosts)) + cleaner.cleanup(); +} + +bool RestoreCoordinationOnCluster::tryFinishAfterError() noexcept +{ + return tryFinishImpl(); +} + +bool RestoreCoordinationOnCluster::tryFinishImpl() noexcept +{ + bool other_hosts_also_finished = false; + if (!stage_sync.tryFinishAfterError(other_hosts_also_finished)) + return false; + + if ((current_host == kInitiator) && (other_hosts_also_finished || !restore_query_was_sent_to_other_hosts)) + { + if (!cleaner.tryCleanupAfterError()) + return false; + } + + return true; +} + +void RestoreCoordinationOnCluster::waitForOtherHostsToFinish() +{ + if ((current_host != kInitiator) || !restore_query_was_sent_to_other_hosts) + return; + stage_sync.waitForOtherHostsToFinish(); +} + +bool RestoreCoordinationOnCluster::tryWaitForOtherHostsToFinishAfterError() noexcept +{ + if (current_host != kInitiator) + return false; + if (!restore_query_was_sent_to_other_hosts) + return true; + return stage_sync.tryWaitForOtherHostsToFinishAfterError(); +} + +ZooKeeperRetriesInfo RestoreCoordinationOnCluster::getOnClusterInitializationKeeperRetriesInfo() const +{ + return ZooKeeperRetriesInfo{keeper_settings.max_retries_while_initializing, + static_cast(keeper_settings.retry_initial_backoff_ms.count()), + 
static_cast(keeper_settings.retry_max_backoff_ms.count())}; +} + +bool RestoreCoordinationOnCluster::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) +{ + bool result = false; + auto holder = with_retries.createRetriesControlHolder("acquireCreatingTableInReplicatedDatabase"); + holder.retries_ctl.retryLoop( + [&, &zk = holder.faulty_zookeeper]() + { + with_retries.renewZooKeeper(zk); + + String path = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path); + zk->createIfNotExists(path, ""); + + path += "/" + escapeForFileName(table_name); + auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent); + if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) + throw zkutil::KeeperException::fromPath(code, path); + + if (code == Coordination::Error::ZOK) + { + result = true; + return; + } + + /// We need to check who created that node + result = zk->get(path) == toString(current_host_index); + }); + return result; +} + +bool RestoreCoordinationOnCluster::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path) +{ + bool result = false; + auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataIntoReplicatedTable"); + holder.retries_ctl.retryLoop( + [&, &zk = holder.faulty_zookeeper]() + { + with_retries.renewZooKeeper(zk); + + String path = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path); + auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent); + if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) + throw zkutil::KeeperException::fromPath(code, path); + + if (code == Coordination::Error::ZOK) + { + result = true; + return; + } + + /// We need to check who created that node + result = zk->get(path) == toString(current_host_index); + }); + return result; +} + +bool RestoreCoordinationOnCluster::acquireReplicatedAccessStorage(const String & access_storage_zk_path) +{ + bool result = false; + auto holder = with_retries.createRetriesControlHolder("acquireReplicatedAccessStorage"); + holder.retries_ctl.retryLoop( + [&, &zk = holder.faulty_zookeeper]() + { + with_retries.renewZooKeeper(zk); + + String path = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path); + auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent); + if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) + throw zkutil::KeeperException::fromPath(code, path); + + if (code == Coordination::Error::ZOK) + { + result = true; + return; + } + + /// We need to check who created that node + result = zk->get(path) == toString(current_host_index); + }); + return result; +} + +bool RestoreCoordinationOnCluster::acquireReplicatedSQLObjects(const String & loader_zk_path, UserDefinedSQLObjectType object_type) +{ + bool result = false; + auto holder = with_retries.createRetriesControlHolder("acquireReplicatedSQLObjects"); + holder.retries_ctl.retryLoop( + [&, &zk = holder.faulty_zookeeper]() + { + with_retries.renewZooKeeper(zk); + + String path = zookeeper_path + "/repl_sql_objects_acquired/" + escapeForFileName(loader_zk_path); + zk->createIfNotExists(path, ""); + + path += "/"; + switch (object_type) + { + case UserDefinedSQLObjectType::Function: + path += "functions"; + break; + } + + auto code = zk->tryCreate(path, "", 
zkutil::CreateMode::Persistent); + if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) + throw zkutil::KeeperException::fromPath(code, path); + + if (code == Coordination::Error::ZOK) + { + result = true; + return; + } + + /// We need to check who created that node + result = zk->get(path) == toString(current_host_index); + }); + return result; +} + +bool RestoreCoordinationOnCluster::acquireInsertingDataForKeeperMap(const String & root_zk_path, const String & table_unique_id) +{ + bool lock_acquired = false; + auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataForKeeperMap"); + holder.retries_ctl.retryLoop( + [&, &zk = holder.faulty_zookeeper]() + { + with_retries.renewZooKeeper(zk); + + /// we need to remove leading '/' from root_zk_path + auto normalized_root_zk_path = root_zk_path.substr(1); + std::string restore_lock_path = fs::path(zookeeper_path) / "keeper_map_tables" / escapeForFileName(normalized_root_zk_path); + zk->createAncestors(restore_lock_path); + auto code = zk->tryCreate(restore_lock_path, table_unique_id, zkutil::CreateMode::Persistent); + + if (code == Coordination::Error::ZOK) + { + lock_acquired = true; + return; + } + + if (code == Coordination::Error::ZNODEEXISTS) + lock_acquired = table_unique_id == zk->get(restore_lock_path); + else + zkutil::KeeperException::fromPath(code, restore_lock_path); + }); + return lock_acquired; +} + +void RestoreCoordinationOnCluster::generateUUIDForTable(ASTCreateQuery & create_query) +{ + String query_str = serializeAST(create_query); + CreateQueryUUIDs new_uuids{create_query, /* generate_random= */ true, /* force_random= */ true}; + String new_uuids_str = new_uuids.toString(); + + auto holder = with_retries.createRetriesControlHolder("generateUUIDForTable"); + holder.retries_ctl.retryLoop( + [&, &zk = holder.faulty_zookeeper]() + { + with_retries.renewZooKeeper(zk); + + String path = zookeeper_path + "/table_uuids/" + escapeForFileName(query_str); + Coordination::Error res = zk->tryCreate(path, new_uuids_str, zkutil::CreateMode::Persistent); + + if (res == Coordination::Error::ZOK) + { + new_uuids.copyToQuery(create_query); + return; + } + + if (res == Coordination::Error::ZNODEEXISTS) + { + CreateQueryUUIDs::fromString(zk->get(path)).copyToQuery(create_query); + return; + } + + zkutil::KeeperException::fromPath(res, path); + }); +} + +} diff --git a/src/Backups/RestoreCoordinationRemote.h b/src/Backups/RestoreCoordinationOnCluster.h similarity index 62% rename from src/Backups/RestoreCoordinationRemote.h rename to src/Backups/RestoreCoordinationOnCluster.h index a3d57e9a4d0..87a8dd3ce83 100644 --- a/src/Backups/RestoreCoordinationRemote.h +++ b/src/Backups/RestoreCoordinationOnCluster.h @@ -1,6 +1,8 @@ #pragma once #include +#include +#include #include #include @@ -9,28 +11,33 @@ namespace DB { /// Implementation of the IRestoreCoordination interface performing coordination via ZooKeeper. It's necessary for "RESTORE ON CLUSTER". -class RestoreCoordinationRemote : public IRestoreCoordination +class RestoreCoordinationOnCluster : public IRestoreCoordination { public: - using RestoreKeeperSettings = WithRetries::KeeperSettings; + /// Empty string as the current host is used to mark the initiator of a RESTORE ON CLUSTER query. 
+ static const constexpr std::string_view kInitiator; - RestoreCoordinationRemote( - zkutil::GetZooKeeper get_zookeeper_, + RestoreCoordinationOnCluster( + const UUID & restore_uuid_, const String & root_zookeeper_path_, - const RestoreKeeperSettings & keeper_settings_, - const String & restore_uuid_, - const Strings & all_hosts_, + zkutil::GetZooKeeper get_zookeeper_, + const BackupKeeperSettings & keeper_settings_, const String & current_host_, - bool is_internal_, + const Strings & all_hosts_, + bool allow_concurrent_restore_, + BackupConcurrencyCounters & concurrency_counters_, + ThreadPoolCallbackRunnerUnsafe schedule_, QueryStatusPtr process_list_element_); - ~RestoreCoordinationRemote() override; + ~RestoreCoordinationOnCluster() override; - /// Sets the current stage and waits for other hosts to come to this stage too. - void setStage(const String & new_stage, const String & message) override; - void setError(const Exception & exception) override; - Strings waitForStage(const String & stage_to_wait) override; - Strings waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) override; + Strings setStage(const String & new_stage, const String & message, bool sync) override; + void setRestoreQueryWasSentToOtherHosts() override; + bool trySetError(std::exception_ptr exception) override; + void finish() override; + bool tryFinishAfterError() noexcept override; + void waitForOtherHostsToFinish() override; + bool tryWaitForOtherHostsToFinishAfterError() noexcept override; /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table. bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override; @@ -55,27 +62,27 @@ public: /// (because otherwise the macro "{uuid}" in the ZooKeeper path will not work correctly). 
void generateUUIDForTable(ASTCreateQuery & create_query) override; - bool hasConcurrentRestores(const std::atomic & num_active_restores) const override; + ZooKeeperRetriesInfo getOnClusterInitializationKeeperRetriesInfo() const override; private: void createRootNodes(); - void removeAllNodes(); + bool tryFinishImpl() noexcept; - /// get_zookeeper will provide a zookeeper client without any fault injection - const zkutil::GetZooKeeper get_zookeeper; const String root_zookeeper_path; - const RestoreKeeperSettings keeper_settings; - const String restore_uuid; + const BackupKeeperSettings keeper_settings; + const UUID restore_uuid; const String zookeeper_path; const Strings all_hosts; + const Strings all_hosts_without_initiator; const String current_host; const size_t current_host_index; - const bool is_internal; LoggerPtr const log; - mutable WithRetries with_retries; - std::optional stage_sync; - mutable std::mutex mutex; + const WithRetries with_retries; + BackupConcurrencyCheck concurrency_check; + BackupCoordinationStageSync stage_sync; + BackupCoordinationCleaner cleaner; + std::atomic restore_query_was_sent_to_other_hosts = false; }; } diff --git a/src/Backups/RestoreCoordinationRemote.cpp b/src/Backups/RestoreCoordinationRemote.cpp deleted file mode 100644 index 0a69bc0eafb..00000000000 --- a/src/Backups/RestoreCoordinationRemote.cpp +++ /dev/null @@ -1,379 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace Stage = BackupCoordinationStage; - -RestoreCoordinationRemote::RestoreCoordinationRemote( - zkutil::GetZooKeeper get_zookeeper_, - const String & root_zookeeper_path_, - const RestoreKeeperSettings & keeper_settings_, - const String & restore_uuid_, - const Strings & all_hosts_, - const String & current_host_, - bool is_internal_, - QueryStatusPtr process_list_element_) - : get_zookeeper(get_zookeeper_) - , root_zookeeper_path(root_zookeeper_path_) - , keeper_settings(keeper_settings_) - , restore_uuid(restore_uuid_) - , zookeeper_path(root_zookeeper_path_ + "/restore-" + restore_uuid_) - , all_hosts(all_hosts_) - , current_host(current_host_) - , current_host_index(BackupCoordinationRemote::findCurrentHostIndex(all_hosts, current_host)) - , is_internal(is_internal_) - , log(getLogger("RestoreCoordinationRemote")) - , with_retries( - log, - get_zookeeper_, - keeper_settings, - process_list_element_, - [my_zookeeper_path = zookeeper_path, my_current_host = current_host, my_is_internal = is_internal] - (WithRetries::FaultyKeeper & zk) - { - /// Recreate this ephemeral node to signal that we are alive. - if (my_is_internal) - { - String alive_node_path = my_zookeeper_path + "/stage/alive|" + my_current_host; - - /// Delete the ephemeral node from the previous connection so we don't have to wait for keeper to do it automatically. - zk->tryRemove(alive_node_path); - - zk->createAncestors(alive_node_path); - zk->create(alive_node_path, "", zkutil::CreateMode::Ephemeral); - } - }) -{ - createRootNodes(); - - stage_sync.emplace( - zookeeper_path, - with_retries, - log); -} - -RestoreCoordinationRemote::~RestoreCoordinationRemote() -{ - try - { - if (!is_internal) - removeAllNodes(); - } - catch (...) 
- { - tryLogCurrentException(__PRETTY_FUNCTION__); - } -} - -void RestoreCoordinationRemote::createRootNodes() -{ - auto holder = with_retries.createRetriesControlHolder("createRootNodes"); - holder.retries_ctl.retryLoop( - [&, &zk = holder.faulty_zookeeper]() - { - with_retries.renewZooKeeper(zk); - zk->createAncestors(zookeeper_path); - - Coordination::Requests ops; - Coordination::Responses responses; - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_databases_tables_acquired", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_tables_data_acquired", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_access_storages_acquired", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/repl_sql_objects_acquired", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/keeper_map_tables", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/table_uuids", "", zkutil::CreateMode::Persistent)); - zk->tryMulti(ops, responses); - }); -} - -void RestoreCoordinationRemote::setStage(const String & new_stage, const String & message) -{ - if (is_internal) - stage_sync->set(current_host, new_stage, message); - else - stage_sync->set(current_host, new_stage, /* message */ "", /* all_hosts */ true); -} - -void RestoreCoordinationRemote::setError(const Exception & exception) -{ - stage_sync->setError(current_host, exception); -} - -Strings RestoreCoordinationRemote::waitForStage(const String & stage_to_wait) -{ - return stage_sync->wait(all_hosts, stage_to_wait); -} - -Strings RestoreCoordinationRemote::waitForStage(const String & stage_to_wait, std::chrono::milliseconds timeout) -{ - return stage_sync->waitFor(all_hosts, stage_to_wait, timeout); -} - -bool RestoreCoordinationRemote::acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) -{ - bool result = false; - auto holder = with_retries.createRetriesControlHolder("acquireCreatingTableInReplicatedDatabase"); - holder.retries_ctl.retryLoop( - [&, &zk = holder.faulty_zookeeper]() - { - with_retries.renewZooKeeper(zk); - - String path = zookeeper_path + "/repl_databases_tables_acquired/" + escapeForFileName(database_zk_path); - zk->createIfNotExists(path, ""); - - path += "/" + escapeForFileName(table_name); - auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent); - if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) - throw zkutil::KeeperException::fromPath(code, path); - - if (code == Coordination::Error::ZOK) - { - result = true; - return; - } - - /// We need to check who created that node - result = zk->get(path) == toString(current_host_index); - }); - return result; -} - -bool RestoreCoordinationRemote::acquireInsertingDataIntoReplicatedTable(const String & table_zk_path) -{ - bool result = false; - auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataIntoReplicatedTable"); - holder.retries_ctl.retryLoop( - [&, &zk = holder.faulty_zookeeper]() - { - with_retries.renewZooKeeper(zk); - - String path = zookeeper_path + "/repl_tables_data_acquired/" + escapeForFileName(table_zk_path); - auto code = zk->tryCreate(path, toString(current_host_index), 
zkutil::CreateMode::Persistent); - if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) - throw zkutil::KeeperException::fromPath(code, path); - - if (code == Coordination::Error::ZOK) - { - result = true; - return; - } - - /// We need to check who created that node - result = zk->get(path) == toString(current_host_index); - }); - return result; -} - -bool RestoreCoordinationRemote::acquireReplicatedAccessStorage(const String & access_storage_zk_path) -{ - bool result = false; - auto holder = with_retries.createRetriesControlHolder("acquireReplicatedAccessStorage"); - holder.retries_ctl.retryLoop( - [&, &zk = holder.faulty_zookeeper]() - { - with_retries.renewZooKeeper(zk); - - String path = zookeeper_path + "/repl_access_storages_acquired/" + escapeForFileName(access_storage_zk_path); - auto code = zk->tryCreate(path, toString(current_host_index), zkutil::CreateMode::Persistent); - if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) - throw zkutil::KeeperException::fromPath(code, path); - - if (code == Coordination::Error::ZOK) - { - result = true; - return; - } - - /// We need to check who created that node - result = zk->get(path) == toString(current_host_index); - }); - return result; -} - -bool RestoreCoordinationRemote::acquireReplicatedSQLObjects(const String & loader_zk_path, UserDefinedSQLObjectType object_type) -{ - bool result = false; - auto holder = with_retries.createRetriesControlHolder("acquireReplicatedSQLObjects"); - holder.retries_ctl.retryLoop( - [&, &zk = holder.faulty_zookeeper]() - { - with_retries.renewZooKeeper(zk); - - String path = zookeeper_path + "/repl_sql_objects_acquired/" + escapeForFileName(loader_zk_path); - zk->createIfNotExists(path, ""); - - path += "/"; - switch (object_type) - { - case UserDefinedSQLObjectType::Function: - path += "functions"; - break; - } - - auto code = zk->tryCreate(path, "", zkutil::CreateMode::Persistent); - if ((code != Coordination::Error::ZOK) && (code != Coordination::Error::ZNODEEXISTS)) - throw zkutil::KeeperException::fromPath(code, path); - - if (code == Coordination::Error::ZOK) - { - result = true; - return; - } - - /// We need to check who created that node - result = zk->get(path) == toString(current_host_index); - }); - return result; -} - -bool RestoreCoordinationRemote::acquireInsertingDataForKeeperMap(const String & root_zk_path, const String & table_unique_id) -{ - bool lock_acquired = false; - auto holder = with_retries.createRetriesControlHolder("acquireInsertingDataForKeeperMap"); - holder.retries_ctl.retryLoop( - [&, &zk = holder.faulty_zookeeper]() - { - with_retries.renewZooKeeper(zk); - - /// we need to remove leading '/' from root_zk_path - auto normalized_root_zk_path = root_zk_path.substr(1); - std::string restore_lock_path = fs::path(zookeeper_path) / "keeper_map_tables" / escapeForFileName(normalized_root_zk_path); - zk->createAncestors(restore_lock_path); - auto code = zk->tryCreate(restore_lock_path, table_unique_id, zkutil::CreateMode::Persistent); - - if (code == Coordination::Error::ZOK) - { - lock_acquired = true; - return; - } - - if (code == Coordination::Error::ZNODEEXISTS) - lock_acquired = table_unique_id == zk->get(restore_lock_path); - else - zkutil::KeeperException::fromPath(code, restore_lock_path); - }); - return lock_acquired; -} - -void RestoreCoordinationRemote::generateUUIDForTable(ASTCreateQuery & create_query) -{ - String query_str = serializeAST(create_query); - CreateQueryUUIDs new_uuids{create_query, /* 
generate_random= */ true, /* force_random= */ true}; - String new_uuids_str = new_uuids.toString(); - - auto holder = with_retries.createRetriesControlHolder("generateUUIDForTable"); - holder.retries_ctl.retryLoop( - [&, &zk = holder.faulty_zookeeper]() - { - with_retries.renewZooKeeper(zk); - - String path = zookeeper_path + "/table_uuids/" + escapeForFileName(query_str); - Coordination::Error res = zk->tryCreate(path, new_uuids_str, zkutil::CreateMode::Persistent); - - if (res == Coordination::Error::ZOK) - { - new_uuids.copyToQuery(create_query); - return; - } - - if (res == Coordination::Error::ZNODEEXISTS) - { - CreateQueryUUIDs::fromString(zk->get(path)).copyToQuery(create_query); - return; - } - - zkutil::KeeperException::fromPath(res, path); - }); -} - -void RestoreCoordinationRemote::removeAllNodes() -{ - /// Usually this function is called by the initiator when a restore operation is complete so we don't need the coordination anymore. - /// - /// However there can be a rare situation when this function is called after an error occurs on the initiator of a query - /// while some hosts are still restoring something. Removing all the nodes will remove the parent node of the restore coordination - /// at `zookeeper_path` which might cause such hosts to stop with exception "ZNONODE". Or such hosts might still do some part - /// of their restore work before that. - - auto holder = with_retries.createRetriesControlHolder("removeAllNodes"); - holder.retries_ctl.retryLoop( - [&, &zk = holder.faulty_zookeeper]() - { - with_retries.renewZooKeeper(zk); - zk->removeRecursive(zookeeper_path); - }); -} - -bool RestoreCoordinationRemote::hasConcurrentRestores(const std::atomic &) const -{ - /// If its internal concurrency will be checked for the base restore - if (is_internal) - return false; - - bool result = false; - std::string path = zookeeper_path + "/stage"; - - auto holder = with_retries.createRetriesControlHolder("createRootNodes"); - holder.retries_ctl.retryLoop( - [&, &zk = holder.faulty_zookeeper]() - { - with_retries.renewZooKeeper(zk); - - if (! 
zk->exists(root_zookeeper_path))
-                zk->createAncestors(root_zookeeper_path);
-
-            for (size_t attempt = 0; attempt < MAX_ZOOKEEPER_ATTEMPTS; ++attempt)
-            {
-                Coordination::Stat stat;
-                zk->get(root_zookeeper_path, &stat);
-                Strings existing_restore_paths = zk->getChildren(root_zookeeper_path);
-                for (const auto & existing_restore_path : existing_restore_paths)
-                {
-                    if (startsWith(existing_restore_path, "backup-"))
-                        continue;
-
-                    String existing_restore_uuid = existing_restore_path;
-                    existing_restore_uuid.erase(0, String("restore-").size());
-
-                    if (existing_restore_uuid == toString(restore_uuid))
-                        continue;
-
-                    String status;
-                    if (zk->tryGet(root_zookeeper_path + "/" + existing_restore_path + "/stage", status))
-                    {
-                        /// Check if some other restore is in progress
-                        if (status == Stage::SCHEDULED_TO_START)
-                        {
-                            LOG_WARNING(log, "Found a concurrent restore: {}, current restore: {}", existing_restore_uuid, toString(restore_uuid));
-                            result = true;
-                            return;
-                        }
-                    }
-                }
-
-                zk->createIfNotExists(path, "");
-                auto code = zk->trySet(path, Stage::SCHEDULED_TO_START, stat.version);
-                if (code == Coordination::Error::ZOK)
-                    break;
-                bool is_last_attempt = (attempt == MAX_ZOOKEEPER_ATTEMPTS - 1);
-                if ((code != Coordination::Error::ZBADVERSION) || is_last_attempt)
-                    throw zkutil::KeeperException::fromPath(code, path);
-            }
-        });
-
-    return result;
-}
-
-}
diff --git a/src/Backups/RestorerFromBackup.cpp b/src/Backups/RestorerFromBackup.cpp
index eb4ba9424ff..29579aa7348 100644
--- a/src/Backups/RestorerFromBackup.cpp
+++ b/src/Backups/RestorerFromBackup.cpp
@@ -100,7 +100,6 @@ RestorerFromBackup::RestorerFromBackup(
    , context(context_)
    , process_list_element(context->getProcessListElement())
    , after_task_callback(after_task_callback_)
-    , on_cluster_first_sync_timeout(context->getConfigRef().getUInt64("backups.on_cluster_first_sync_timeout", 180000))
    , create_table_timeout(context->getConfigRef().getUInt64("backups.create_table_timeout", 300000))
    , log(getLogger("RestorerFromBackup"))
    , tables_dependencies("RestorerFromBackup")
@@ -119,12 +118,14 @@ RestorerFromBackup::~RestorerFromBackup()
    }
 }
 
-void RestorerFromBackup::run(Mode mode)
+void RestorerFromBackup::run(Mode mode_)
 {
    /// run() can be called only once.
    if (!current_stage.empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Already restoring");
 
+    mode = mode_;
+
    /// Find other hosts working along with us to execute this ON CLUSTER query.
    all_hosts = BackupSettings::Util::filterHostIDs(
        restore_settings.cluster_host_ids, restore_settings.shard_num, restore_settings.replica_num);
@@ -139,6 +140,7 @@ void RestorerFromBackup::run(Mode mode)
    setStage(Stage::FINDING_TABLES_IN_BACKUP);
    findDatabasesAndTablesInBackup();
    waitFutures();
+    logNumberOfDatabasesAndTablesToRestore();
 
    /// Check access rights.
    setStage(Stage::CHECKING_ACCESS_RIGHTS);
@@ -228,20 +230,8 @@ void RestorerFromBackup::setStage(const String & new_stage, const String & messa
 
    if (restore_coordination)
    {
-        restore_coordination->setStage(new_stage, message);
-
-        /// The initiator of a RESTORE ON CLUSTER query waits for other hosts to complete their work (see waitForStage(Stage::COMPLETED) in BackupsWorker::doRestore),
-        /// but other hosts shouldn't wait for each others' completion. (That's simply unnecessary and also
-        /// the initiator may start cleaning up (e.g. removing restore-coordination ZooKeeper nodes) once all other hosts are in Stage::COMPLETED.)
- bool need_wait = (new_stage != Stage::COMPLETED); - - if (need_wait) - { - if (new_stage == Stage::FINDING_TABLES_IN_BACKUP) - restore_coordination->waitForStage(new_stage, on_cluster_first_sync_timeout); - else - restore_coordination->waitForStage(new_stage); - } + /// There is no need to sync Stage::COMPLETED with other hosts because it's the last stage. + restore_coordination->setStage(new_stage, message, /* sync = */ (new_stage != Stage::COMPLETED)); } } @@ -384,8 +374,12 @@ void RestorerFromBackup::findDatabasesAndTablesInBackup() } } } +} - LOG_INFO(log, "Will restore {} databases and {} tables", getNumDatabases(), getNumTables()); +void RestorerFromBackup::logNumberOfDatabasesAndTablesToRestore() const +{ + std::string_view action = (mode == CHECK_ACCESS_ONLY) ? "check access rights for restoring" : "restore"; + LOG_INFO(log, "Will {} {} databases and {} tables", action, getNumDatabases(), getNumTables()); } void RestorerFromBackup::findTableInBackup(const QualifiedTableName & table_name_in_backup, bool skip_if_inner_table, const std::optional & partitions) diff --git a/src/Backups/RestorerFromBackup.h b/src/Backups/RestorerFromBackup.h index e0130ccfcb4..87290618487 100644 --- a/src/Backups/RestorerFromBackup.h +++ b/src/Backups/RestorerFromBackup.h @@ -53,7 +53,7 @@ public: using DataRestoreTasks = std::vector; /// Restores the metadata of databases and tables and returns tasks to restore the data of tables. - void run(Mode mode); + void run(Mode mode_); BackupPtr getBackup() const { return backup; } const RestoreSettings & getRestoreSettings() const { return restore_settings; } @@ -80,10 +80,10 @@ private: ContextMutablePtr context; QueryStatusPtr process_list_element; std::function after_task_callback; - std::chrono::milliseconds on_cluster_first_sync_timeout; std::chrono::milliseconds create_table_timeout; LoggerPtr log; + Mode mode = Mode::RESTORE; Strings all_hosts; DDLRenamingMap renaming_map; std::vector root_paths_in_backup; @@ -97,6 +97,7 @@ private: void findDatabaseInBackupImpl(const String & database_name_in_backup, const std::set & except_table_names); void findEverythingInBackup(const std::set & except_database_names, const std::set & except_table_names); + void logNumberOfDatabasesAndTablesToRestore() const; size_t getNumDatabases() const; size_t getNumTables() const; diff --git a/src/Backups/WithRetries.cpp b/src/Backups/WithRetries.cpp index 772f746e40a..9c18be3ca9e 100644 --- a/src/Backups/WithRetries.cpp +++ b/src/Backups/WithRetries.cpp @@ -1,57 +1,34 @@ #include -#include #include + namespace DB { -namespace Setting -{ - extern const SettingsUInt64 backup_restore_keeper_max_retries; - extern const SettingsUInt64 backup_restore_keeper_retry_initial_backoff_ms; - extern const SettingsUInt64 backup_restore_keeper_retry_max_backoff_ms; - extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multiread; - extern const SettingsFloat backup_restore_keeper_fault_injection_probability; - extern const SettingsUInt64 backup_restore_keeper_fault_injection_seed; - extern const SettingsUInt64 backup_restore_keeper_value_max_size; - extern const SettingsUInt64 backup_restore_batch_size_for_keeper_multi; -} - -WithRetries::KeeperSettings WithRetries::KeeperSettings::fromContext(ContextPtr context) -{ - return - { - .keeper_max_retries = context->getSettingsRef()[Setting::backup_restore_keeper_max_retries], - .keeper_retry_initial_backoff_ms = context->getSettingsRef()[Setting::backup_restore_keeper_retry_initial_backoff_ms], - .keeper_retry_max_backoff_ms = 
context->getSettingsRef()[Setting::backup_restore_keeper_retry_max_backoff_ms], - .batch_size_for_keeper_multiread = context->getSettingsRef()[Setting::backup_restore_batch_size_for_keeper_multiread], - .keeper_fault_injection_probability = context->getSettingsRef()[Setting::backup_restore_keeper_fault_injection_probability], - .keeper_fault_injection_seed = context->getSettingsRef()[Setting::backup_restore_keeper_fault_injection_seed], - .keeper_value_max_size = context->getSettingsRef()[Setting::backup_restore_keeper_value_max_size], - .batch_size_for_keeper_multi = context->getSettingsRef()[Setting::backup_restore_batch_size_for_keeper_multi], - }; -} WithRetries::WithRetries( - LoggerPtr log_, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings_, QueryStatusPtr process_list_element_, RenewerCallback callback_) + LoggerPtr log_, zkutil::GetZooKeeper get_zookeeper_, const BackupKeeperSettings & settings_, QueryStatusPtr process_list_element_, RenewerCallback callback_) : log(log_) , get_zookeeper(get_zookeeper_) , settings(settings_) , process_list_element(process_list_element_) , callback(callback_) - , global_zookeeper_retries_info( - settings.keeper_max_retries, settings.keeper_retry_initial_backoff_ms, settings.keeper_retry_max_backoff_ms) {} -WithRetries::RetriesControlHolder::RetriesControlHolder(const WithRetries * parent, const String & name) - : info(parent->global_zookeeper_retries_info) - , retries_ctl(name, parent->log, info, parent->process_list_element) +WithRetries::RetriesControlHolder::RetriesControlHolder(const WithRetries * parent, const String & name, Kind kind) + : info( (kind == kInitialization) ? parent->settings.max_retries_while_initializing + : (kind == kErrorHandling) ? parent->settings.max_retries_while_handling_error + : parent->settings.max_retries, + parent->settings.retry_initial_backoff_ms.count(), + parent->settings.retry_max_backoff_ms.count()) + /// We don't use process_list_element while handling an error because the error handling can't be cancellable. + , retries_ctl(name, parent->log, info, (kind == kErrorHandling) ? nullptr : parent->process_list_element) , faulty_zookeeper(parent->getFaultyZooKeeper()) {} -WithRetries::RetriesControlHolder WithRetries::createRetriesControlHolder(const String & name) +WithRetries::RetriesControlHolder WithRetries::createRetriesControlHolder(const String & name, Kind kind) const { - return RetriesControlHolder(this, name); + return RetriesControlHolder(this, name, kind); } void WithRetries::renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const @@ -62,8 +39,8 @@ void WithRetries::renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const { zookeeper = get_zookeeper(); my_faulty_zookeeper->setKeeper(zookeeper); - - callback(my_faulty_zookeeper); + if (callback) + callback(my_faulty_zookeeper); } else { @@ -71,7 +48,7 @@ void WithRetries::renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const } } -const WithRetries::KeeperSettings & WithRetries::getKeeperSettings() const +const BackupKeeperSettings & WithRetries::getKeeperSettings() const { return settings; } @@ -88,8 +65,8 @@ WithRetries::FaultyKeeper WithRetries::getFaultyZooKeeper() const /// The reason is that ZooKeeperWithFaultInjection may reset the underlying pointer and there could be a race condition /// when the same object is used from multiple threads. 
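    /// (For the same reason each RetriesControlHolder is given its own wrapper
    /// instance by getFaultyZooKeeper() rather than sharing one across threads.)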
auto faulty_zookeeper = ZooKeeperWithFaultInjection::createInstance( - settings.keeper_fault_injection_probability, - settings.keeper_fault_injection_seed, + settings.fault_injection_probability, + settings.fault_injection_seed, current_zookeeper, log->name(), log); diff --git a/src/Backups/WithRetries.h b/src/Backups/WithRetries.h index f795a963911..e465fbb1e50 100644 --- a/src/Backups/WithRetries.h +++ b/src/Backups/WithRetries.h @@ -1,9 +1,11 @@ #pragma once -#include +#include #include +#include #include + namespace DB { @@ -15,20 +17,13 @@ class WithRetries { public: using FaultyKeeper = Coordination::ZooKeeperWithFaultInjection::Ptr; - using RenewerCallback = std::function; + using RenewerCallback = std::function; - struct KeeperSettings + enum Kind { - UInt64 keeper_max_retries{0}; - UInt64 keeper_retry_initial_backoff_ms{0}; - UInt64 keeper_retry_max_backoff_ms{0}; - UInt64 batch_size_for_keeper_multiread{10000}; - Float64 keeper_fault_injection_probability{0}; - UInt64 keeper_fault_injection_seed{42}; - UInt64 keeper_value_max_size{1048576}; - UInt64 batch_size_for_keeper_multi{1000}; - - static KeeperSettings fromContext(ContextPtr context); + kNormal, + kInitialization, + kErrorHandling, }; /// For simplicity a separate ZooKeeperRetriesInfo and a faulty [Zoo]Keeper client @@ -48,23 +43,23 @@ public: private: friend class WithRetries; - RetriesControlHolder(const WithRetries * parent, const String & name); + RetriesControlHolder(const WithRetries * parent, const String & name, Kind kind); }; - RetriesControlHolder createRetriesControlHolder(const String & name); - WithRetries(LoggerPtr log, zkutil::GetZooKeeper get_zookeeper_, const KeeperSettings & settings, QueryStatusPtr process_list_element_, RenewerCallback callback); + RetriesControlHolder createRetriesControlHolder(const String & name, Kind kind = Kind::kNormal) const; + WithRetries(LoggerPtr log, zkutil::GetZooKeeper get_zookeeper_, const BackupKeeperSettings & settings, QueryStatusPtr process_list_element_, RenewerCallback callback = {}); /// Used to re-establish new connection inside a retry loop. void renewZooKeeper(FaultyKeeper my_faulty_zookeeper) const; - const KeeperSettings & getKeeperSettings() const; + const BackupKeeperSettings & getKeeperSettings() const; private: /// This will provide a special wrapper which is useful for testing FaultyKeeper getFaultyZooKeeper() const; LoggerPtr log; zkutil::GetZooKeeper get_zookeeper; - KeeperSettings settings; + BackupKeeperSettings settings; QueryStatusPtr process_list_element; /// This callback is called each time when a new [Zoo]Keeper session is created. @@ -76,7 +71,6 @@ private: /// it could lead just to a failed backup which could possibly be successful /// if there were a little bit more retries. 
RenewerCallback callback; - ZooKeeperRetriesInfo global_zookeeper_retries_info; /// This is needed only to protect zookeeper object mutable std::mutex zookeeper_mutex; diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index d68537513da..320fc06cb2f 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -627,7 +627,7 @@ PreformattedMessage getExceptionMessageAndPattern(const Exception & e, bool with return PreformattedMessage{stream.str(), e.tryGetMessageFormatString(), e.getMessageFormatStringArgs()}; } -std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace) +std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace, bool check_embedded_stacktrace) { try { @@ -635,7 +635,7 @@ std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace) } catch (...) { - return getCurrentExceptionMessage(with_stacktrace); + return getCurrentExceptionMessage(with_stacktrace, check_embedded_stacktrace); } } diff --git a/src/Common/Exception.h b/src/Common/Exception.h index a4f55f41caa..8ec640ff642 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -329,7 +329,7 @@ void tryLogException(std::exception_ptr e, const AtomicLogger & logger, const st std::string getExceptionMessage(const Exception & e, bool with_stacktrace, bool check_embedded_stacktrace = false); PreformattedMessage getExceptionMessageAndPattern(const Exception & e, bool with_stacktrace, bool check_embedded_stacktrace = false); -std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace); +std::string getExceptionMessage(std::exception_ptr e, bool with_stacktrace, bool check_embedded_stacktrace = false); template diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index 6c269e22c35..cdbade04a59 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -2660,29 +2660,44 @@ The maximum amount of data consumed by temporary files on disk in bytes for all The maximum amount of data consumed by temporary files on disk in bytes for all concurrently running queries. Zero means unlimited. )", 0)\ \ - DECLARE(UInt64, backup_restore_keeper_max_retries, 20, R"( -Max retries for keeper operations during backup or restore + DECLARE(UInt64, backup_restore_keeper_max_retries, 1000, R"( +Max retries for [Zoo]Keeper operations in the middle of a BACKUP or RESTORE operation. +Should be big enough so the whole operation won't fail because of a temporary [Zoo]Keeper failure. )", 0) \ DECLARE(UInt64, backup_restore_keeper_retry_initial_backoff_ms, 100, R"( Initial backoff timeout for [Zoo]Keeper operations during backup or restore )", 0) \ DECLARE(UInt64, backup_restore_keeper_retry_max_backoff_ms, 5000, R"( Max backoff timeout for [Zoo]Keeper operations during backup or restore +)", 0) \ + DECLARE(UInt64, backup_restore_failure_after_host_disconnected_for_seconds, 3600, R"( +If a host during a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation doesn't recreate its ephemeral 'alive' node in ZooKeeper for this amount of time then the whole backup or restore is considered as failed. +This value should be bigger than any reasonable time for a host to reconnect to ZooKeeper after a failure. +Zero means unlimited. +)", 0) \ + DECLARE(UInt64, backup_restore_keeper_max_retries_while_initializing, 20, R"( +Max retries for [Zoo]Keeper operations during the initialization of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation. 
+)", 0) \ + DECLARE(UInt64, backup_restore_keeper_max_retries_while_handling_error, 20, R"( +Max retries for [Zoo]Keeper operations while handling an error of a BACKUP ON CLUSTER or RESTORE ON CLUSTER operation. +)", 0) \ + DECLARE(UInt64, backup_restore_finish_timeout_after_error_sec, 180, R"( +How long the initiator should wait for other host to react to the 'error' node and stop their work on the current BACKUP ON CLUSTER or RESTORE ON CLUSTER operation. +)", 0) \ + DECLARE(UInt64, backup_restore_keeper_value_max_size, 1048576, R"( +Maximum size of data of a [Zoo]Keeper's node during backup +)", 0) \ + DECLARE(UInt64, backup_restore_batch_size_for_keeper_multi, 1000, R"( +Maximum size of batch for multi request to [Zoo]Keeper during backup or restore +)", 0) \ + DECLARE(UInt64, backup_restore_batch_size_for_keeper_multiread, 10000, R"( +Maximum size of batch for multiread request to [Zoo]Keeper during backup or restore )", 0) \ DECLARE(Float, backup_restore_keeper_fault_injection_probability, 0.0f, R"( Approximate probability of failure for a keeper request during backup or restore. Valid value is in interval [0.0f, 1.0f] )", 0) \ DECLARE(UInt64, backup_restore_keeper_fault_injection_seed, 0, R"( 0 - random seed, otherwise the setting value -)", 0) \ - DECLARE(UInt64, backup_restore_keeper_value_max_size, 1048576, R"( -Maximum size of data of a [Zoo]Keeper's node during backup -)", 0) \ - DECLARE(UInt64, backup_restore_batch_size_for_keeper_multiread, 10000, R"( -Maximum size of batch for multiread request to [Zoo]Keeper during backup or restore -)", 0) \ - DECLARE(UInt64, backup_restore_batch_size_for_keeper_multi, 1000, R"( -Maximum size of batch for multi request to [Zoo]Keeper during backup or restore )", 0) \ DECLARE(UInt64, backup_restore_s3_retry_attempts, 1000, R"( Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries. It takes place only for backup/restore. diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 3fe3e960dc6..b6dd68e1571 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -113,6 +113,11 @@ static std::initializer_list #include +#include +#include #include #include #include @@ -18,13 +20,13 @@ namespace DB namespace { - Block getResultRow(const BackupOperationInfo & info) + Block getResultRow(const String & id, BackupStatus status) { auto column_id = ColumnString::create(); auto column_status = ColumnInt8::create(); - column_id->insert(info.id); - column_status->insert(static_cast(info.status)); + column_id->insert(id); + column_status->insert(static_cast(status)); Block res_columns; res_columns.insert(0, {std::move(column_id), std::make_shared(), "id"}); @@ -36,15 +38,18 @@ namespace BlockIO InterpreterBackupQuery::execute() { + const ASTBackupQuery & backup_query = query_ptr->as(); auto & backups_worker = context->getBackupsWorker(); - auto id = backups_worker.start(query_ptr, context); - auto info = backups_worker.getInfo(id); - if (info.exception) - std::rethrow_exception(info.exception); + auto [id, status] = backups_worker.start(query_ptr, context); + + /// Wait if it's a synchronous operation. 
+ bool async = BackupSettings::isAsync(backup_query); + if (!async) + status = backups_worker.wait(id); BlockIO res_io; - res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(info))); + res_io.pipeline = QueryPipeline(std::make_shared(getResultRow(id, status))); return res_io; } diff --git a/src/Storages/StorageKeeperMap.cpp b/src/Storages/StorageKeeperMap.cpp index 316eced1ed6..2a4a5f3370f 100644 --- a/src/Storages/StorageKeeperMap.cpp +++ b/src/Storages/StorageKeeperMap.cpp @@ -889,7 +889,7 @@ private: } }; - auto max_multiread_size = with_retries->getKeeperSettings().batch_size_for_keeper_multiread; + auto max_multiread_size = with_retries->getKeeperSettings().batch_size_for_multiread; auto keys_it = data_children.begin(); while (keys_it != data_children.end()) @@ -941,9 +941,8 @@ void StorageKeeperMap::backupData(BackupEntriesCollector & backup_entries_collec ( getLogger(fmt::format("StorageKeeperMapBackup ({})", getStorageID().getNameForLogs())), [&] { return getClient(); }, - WithRetries::KeeperSettings::fromContext(backup_entries_collector.getContext()), - backup_entries_collector.getContext()->getProcessListElement(), - [](WithRetries::FaultyKeeper &) {} + BackupKeeperSettings::fromContext(backup_entries_collector.getContext()), + backup_entries_collector.getContext()->getProcessListElement() ); backup_entries_collector.addBackupEntries( @@ -972,9 +971,8 @@ void StorageKeeperMap::restoreDataFromBackup(RestorerFromBackup & restorer, cons ( getLogger(fmt::format("StorageKeeperMapRestore ({})", getStorageID().getNameForLogs())), [&] { return getClient(); }, - WithRetries::KeeperSettings::fromContext(restorer.getContext()), - restorer.getContext()->getProcessListElement(), - [](WithRetries::FaultyKeeper &) {} + BackupKeeperSettings::fromContext(restorer.getContext()), + restorer.getContext()->getProcessListElement() ); bool allow_non_empty_tables = restorer.isNonEmptyTableAllowed(); @@ -1037,7 +1035,7 @@ void StorageKeeperMap::restoreDataImpl( CompressedReadBufferFromFile compressed_in{std::move(in_from_file)}; fs::path data_path_fs(zk_data_path); - auto max_multi_size = with_retries->getKeeperSettings().batch_size_for_keeper_multi; + auto max_multi_size = with_retries->getKeeperSettings().batch_size_for_multi; Coordination::Requests create_requests; const auto flush_create_requests = [&] diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index bac783501e1..2ec04e74075 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -2125,6 +2125,16 @@ class ClickHouseCluster: ], ) + def remove_file_from_container(self, container_id, path): + self.exec_in_container( + container_id, + [ + "bash", + "-c", + "rm {}".format(path), + ], + ) + def wait_for_url( self, url="http://localhost:8123/ping", conn_timeout=2, interval=2, timeout=60 ): @@ -4128,6 +4138,9 @@ class ClickHouseInstance: self.docker_id, local_path, dest_path ) + def remove_file_from_container(self, path): + return self.cluster.remove_file_from_container(self.docker_id, path) + def get_process_pid(self, process_name): output = self.exec_in_container( [ diff --git a/tests/integration/helpers/config_manager.py b/tests/integration/helpers/config_manager.py new file mode 100644 index 00000000000..0a080a33477 --- /dev/null +++ b/tests/integration/helpers/config_manager.py @@ -0,0 +1,65 @@ +import os + + +class ConfigManager: + """Allows to temporarily add configuration files to the "config.d" or "users.d" directories. 
+ + Can act as a context manager: + + with ConfigManager() as cm: + cm.add_main_config("configs/test_specific_config.xml") # copy "configs/test_specific_config.xml" to "/etc/clickhouse-server/config.d" + ... + # "/etc/clickhouse-server/config.d/test_specific_config.xml" is removed automatically + + """ + + def __init__(self): + self.__added_configs = [] + + def add_main_config(self, node_or_nodes, local_path, reload_config=True): + """Temporarily adds a configuration file to the "config.d" directory.""" + self.__add_config( + node_or_nodes, local_path, dest_dir="config.d", reload_config=reload_config + ) + + def add_user_config(self, node_or_nodes, local_path, reload_config=True): + """Temporarily adds a configuration file to the "users.d" directory.""" + self.__add_config( + node_or_nodes, local_path, dest_dir="users.d", reload_config=reload_config + ) + + def reset(self, reload_config=True): + """Removes all configuration files added by this ConfigManager.""" + if not self.__added_configs: + return + for node, dest_path in self.__added_configs: + node.remove_file_from_container(dest_path) + if reload_config: + for node, _ in self.__added_configs: + node.query("SYSTEM RELOAD CONFIG") + self.__added_configs = [] + + def __add_config(self, node_or_nodes, local_path, dest_dir, reload_config): + nodes_to_add_config = ( + node_or_nodes if (type(node_or_nodes) is list) else [node_or_nodes] + ) + for node in nodes_to_add_config: + src_path = os.path.join(node.cluster.base_dir, local_path) + dest_path = os.path.join( + "/etc/clickhouse-server", dest_dir, os.path.basename(local_path) + ) + node.copy_file_to_container(src_path, dest_path) + if reload_config: + for node in nodes_to_add_config: + node.query("SYSTEM RELOAD CONFIG") + for node in nodes_to_add_config: + dest_path = os.path.join( + "/etc/clickhouse-server", dest_dir, os.path.basename(local_path) + ) + self.__added_configs.append((node, dest_path)) + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + self.reset() diff --git a/tests/integration/test_backup_restore_on_cluster/configs/faster_zk_disconnect_detect.xml b/tests/integration/test_backup_restore_on_cluster/configs/faster_zk_disconnect_detect.xml new file mode 100644 index 00000000000..cfc6672ede4 --- /dev/null +++ b/tests/integration/test_backup_restore_on_cluster/configs/faster_zk_disconnect_detect.xml @@ -0,0 +1,12 @@ + + + + zoo1 + 2181 + + 500 + 0 + 1000 + 5000 + + diff --git a/tests/integration/test_backup_restore_on_cluster/configs/lesser_timeouts.xml b/tests/integration/test_backup_restore_on_cluster/configs/lesser_timeouts.xml index 0886f4bc722..38947be6a5d 100644 --- a/tests/integration/test_backup_restore_on_cluster/configs/lesser_timeouts.xml +++ b/tests/integration/test_backup_restore_on_cluster/configs/lesser_timeouts.xml @@ -1,6 +1,6 @@ - 1000 + 1000 10000 3000 diff --git a/tests/integration/test_backup_restore_on_cluster/configs/shutdown_cancel_backups.xml b/tests/integration/test_backup_restore_on_cluster/configs/shutdown_cancel_backups.xml new file mode 100644 index 00000000000..e0c0e0b32cd --- /dev/null +++ b/tests/integration/test_backup_restore_on_cluster/configs/shutdown_cancel_backups.xml @@ -0,0 +1,3 @@ + + false + diff --git a/tests/integration/test_backup_restore_on_cluster/configs/slow_backups.xml b/tests/integration/test_backup_restore_on_cluster/configs/slow_backups.xml new file mode 100644 index 00000000000..933c3250054 --- /dev/null +++ b/tests/integration/test_backup_restore_on_cluster/configs/slow_backups.xml @@ 
-0,0 +1,7 @@
+
+
+ true
+
+ 12
+ 2
+
diff --git a/tests/integration/test_backup_restore_on_cluster/configs/zookeeper_retries.xml b/tests/integration/test_backup_restore_on_cluster/configs/zookeeper_retries.xml
index 1283f28a8cb..7af54d2dd95 100644
--- a/tests/integration/test_backup_restore_on_cluster/configs/zookeeper_retries.xml
+++ b/tests/integration/test_backup_restore_on_cluster/configs/zookeeper_retries.xml
@@ -1,9 +1,12 @@
 
 
 
- 1000
- 1
- 1
+ 50
+ 100
+ 1000
+ 10
+ 2
+ 3
 42
 0.002
diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py
index a1082c563d1..257938a75c5 100644
--- a/tests/integration/test_backup_restore_on_cluster/test.py
+++ b/tests/integration/test_backup_restore_on_cluster/test.py
@@ -1153,7 +1153,7 @@ def test_get_error_from_other_host():
    node1.query("INSERT INTO tbl VALUES (3)")
 
    backup_name = new_backup_name()
-    expected_error = "Got error from node2.*Table default.tbl was not found"
+    expected_error = "Got error from host node2.*Table default.tbl was not found"
    assert re.search(
        expected_error,
        node1.query_and_get_error(
diff --git a/tests/integration/test_backup_restore_on_cluster/test_cancel_backup.py b/tests/integration/test_backup_restore_on_cluster/test_cancel_backup.py
new file mode 100644
index 00000000000..f63dc2aef3d
--- /dev/null
+++ b/tests/integration/test_backup_restore_on_cluster/test_cancel_backup.py
@@ -0,0 +1,780 @@
+import os
+import random
+import time
+import uuid
+
+import pytest
+
+from helpers.cluster import ClickHouseCluster
+from helpers.config_manager import ConfigManager
+from helpers.network import PartitionManager
+from helpers.test_tools import TSV
+
+cluster = ClickHouseCluster(__file__)
+
+main_configs = [
+    "configs/backups_disk.xml",
+    "configs/cluster.xml",
+    "configs/lesser_timeouts.xml",  # Default timeouts are quite big (a few minutes), the tests don't need them to be that big.
+    "configs/slow_backups.xml",
+    "configs/shutdown_cancel_backups.xml",
+]
+
+user_configs = [
+    "configs/zookeeper_retries.xml",
+]
+
+node1 = cluster.add_instance(
+    "node1",
+    main_configs=main_configs,
+    user_configs=user_configs,
+    external_dirs=["/backups/"],
+    macros={"replica": "node1", "shard": "shard1"},
+    with_zookeeper=True,
+    stay_alive=True,  # Necessary for "test_shutdown_cancels_backup"
+)
+
+node2 = cluster.add_instance(
+    "node2",
+    main_configs=main_configs,
+    user_configs=user_configs,
+    external_dirs=["/backups/"],
+    macros={"replica": "node2", "shard": "shard1"},
+    with_zookeeper=True,
+    stay_alive=True,  # Necessary for "test_shutdown_cancels_backup"
+)
+
+nodes = [node1, node2]
+
+
+@pytest.fixture(scope="module", autouse=True)
+def start_cluster():
+    try:
+        cluster.start()
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+
+@pytest.fixture(autouse=True)
+def cleanup_after_test():
+    try:
+        yield
+    finally:
+        node1.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster' SYNC")
+
+
+# Utilities
+
+
+# Gets a printable version of the name of a node.
+def get_node_name(node):
+    return "node1" if (node == node1) else "node2"
+
+
+# Chooses a random instance.
+def random_node():
+    return random.choice(nodes)
+
+
+# Makes table "tbl" and fills it with data.
+def create_and_fill_table(node, num_parts=10, on_cluster=True):
+    # We use partitioning to make sure there will be more files in a backup.
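+    # (More parts mean more files to copy, so a backup stays in progress long
+    #  enough for the cancellation and disconnection tests below to catch it;
+    #  presumably this is also why configs/slow_backups.xml is enabled here.)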
+    partition_by_clause = " PARTITION BY x%" + str(num_parts) if num_parts > 1 else ""
+    node.query(
+        "CREATE TABLE tbl "
+        + ("ON CLUSTER 'cluster' " if on_cluster else "")
+        + "(x UInt64) ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}') "
+        + "ORDER BY tuple()"
+        + partition_by_clause
+    )
+    if num_parts > 0:
+        node.query(f"INSERT INTO tbl SELECT number FROM numbers({num_parts})")
+
+
+# Generates an ID suitable both as backup id or restore id.
+def random_id():
+    return uuid.uuid4().hex
+
+
+# Generates a backup name prepared for using in BACKUP and RESTORE queries.
+def get_backup_name(backup_id):
+    return f"Disk('backups', '{backup_id}')"
+
+
+# Reads the status of a backup or a restore from system.backups.
+def get_status(initiator, backup_id=None, restore_id=None):
+    id = backup_id if backup_id is not None else restore_id
+    return initiator.query(f"SELECT status FROM system.backups WHERE id='{id}'").rstrip(
+        "\n"
+    )
+
+
+# Reads the error message of a failed backup or a failed restore from system.backups.
+def get_error(initiator, backup_id=None, restore_id=None):
+    id = backup_id if backup_id is not None else restore_id
+    return initiator.query(f"SELECT error FROM system.backups WHERE id='{id}'").rstrip(
+        "\n"
+    )
+
+
+# Waits until the status of a backup or a restore becomes a desired one and asserts that it was reached.
+def wait_status(
+    initiator,
+    status="BACKUP_CREATED",
+    backup_id=None,
+    restore_id=None,
+    timeout=None,
+):
+    print(f"Waiting for status {status}")
+    id = backup_id if backup_id is not None else restore_id
+    operation_name = "backup" if backup_id is not None else "restore"
+    current_status = get_status(initiator, backup_id=backup_id, restore_id=restore_id)
+    waited = 0
+    while (
+        (current_status != status)
+        and (current_status in ["CREATING_BACKUP", "RESTORING"])
+        and ((timeout is None) or (waited < timeout))
+    ):
+        sleep_time = 1 if (timeout is None) else min(1, timeout - waited)
+        time.sleep(sleep_time)
+        waited += sleep_time
+        current_status = get_status(
+            initiator, backup_id=backup_id, restore_id=restore_id
+        )
+    start_time, end_time = (
+        initiator.query(
+            f"SELECT start_time, end_time FROM system.backups WHERE id='{id}'"
+        )
+        .splitlines()[0]
+        .split("\t")
+    )
+    print(
+        f"{get_node_name(initiator)} : Got status {current_status} for {operation_name} {id} after waiting {waited} seconds "
+        f"(start_time = {start_time}, end_time = {end_time})"
+    )
+    assert current_status == status
+
+
+# Returns how many entries are in system.processes corresponding to a specified backup or restore.
+def get_num_system_processes(
+    node_or_nodes, backup_id=None, restore_id=None, is_initial_query=None
+):
+    id = backup_id if backup_id is not None else restore_id
+    query_kind = "Backup" if backup_id is not None else "Restore"
+    total = 0
+    filter_for_is_initial_query = (
+        f" AND (is_initial_query = {is_initial_query})"
+        if is_initial_query is not None
+        else ""
+    )
+    nodes_to_consider = (
+        node_or_nodes if (type(node_or_nodes) is list) else [node_or_nodes]
+    )
+    for node in nodes_to_consider:
+        count = int(
+            node.query(
+                f"SELECT count() FROM system.processes WHERE (query_kind='{query_kind}') AND (query LIKE '%{id}%'){filter_for_is_initial_query}"
+            )
+        )
+        total += count
+    return total
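+
+# A minimal sketch of the poll-with-deadline pattern shared by wait_status() above
+# and wait_num_system_processes() below. `poll_until` is illustrative only and is
+# not used by these tests.
+def poll_until(predicate, timeout=None, step=1):
+    waited = 0
+    while not predicate() and ((timeout is None) or (waited < timeout)):
+        sleep_time = step if (timeout is None) else min(step, timeout - waited)
+        time.sleep(sleep_time)
+        waited += sleep_time
+    return waited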
+
+
+# Waits until the number of entries in system.processes corresponding to a specified backup or restore becomes a desired one.
+# Returns how many seconds the function was waiting.
+def wait_num_system_processes(
+    node_or_nodes,
+    num_system_processes=0,
+    backup_id=None,
+    restore_id=None,
+    is_initial_query=None,
+    timeout=None,
+):
+    print(f"Waiting for number of system processes = {num_system_processes}")
+    id = backup_id if backup_id is not None else restore_id
+    operation_name = "backup" if backup_id is not None else "restore"
+    current_count = get_num_system_processes(
+        node_or_nodes,
+        backup_id=backup_id,
+        restore_id=restore_id,
+        is_initial_query=is_initial_query,
+    )
+
+    def is_current_count_ok():
+        return (current_count == num_system_processes) or (
+            num_system_processes == "1+" and current_count >= 1
+        )
+
+    waited = 0
+    while not is_current_count_ok() and ((timeout is None) or (waited < timeout)):
+        sleep_time = 1 if (timeout is None) else min(1, timeout - waited)
+        time.sleep(sleep_time)
+        waited += sleep_time
+        current_count = get_num_system_processes(
+            node_or_nodes,
+            backup_id=backup_id,
+            restore_id=restore_id,
+            is_initial_query=is_initial_query,
+        )
+    if is_current_count_ok():
+        print(
+            f"Got {current_count} system processes for {operation_name} {id} after waiting {waited} seconds"
+        )
+    else:
+        nodes_to_consider = (
+            node_or_nodes if (type(node_or_nodes) is list) else [node_or_nodes]
+        )
+        for node in nodes_to_consider:
+            count = get_num_system_processes(
+                node, backup_id=backup_id, restore_id=restore_id
+            )
+            print(
+                f"{get_node_name(node)}: Got {count} system processes for {operation_name} {id} after waiting {waited} seconds"
+            )
+        assert False
+    return waited
+
+
+# Kills a BACKUP or RESTORE query.
+# Prints how many seconds the KILL QUERY was executing and, if a timeout is given,
+# asserts that the cancellation finished within it.
+def kill_query(
+    node, backup_id=None, restore_id=None, is_initial_query=None, timeout=None
+):
+    id = backup_id if backup_id is not None else restore_id
+    query_kind = "Backup" if backup_id is not None else "Restore"
+    operation_name = "backup" if backup_id is not None else "restore"
+    print(f"{get_node_name(node)}: Cancelling {operation_name} {id}")
+    filter_for_is_initial_query = (
+        f" AND (is_initial_query = {is_initial_query})"
+        if is_initial_query is not None
+        else ""
+    )
+    node.query(
+        f"KILL QUERY WHERE (query_kind='{query_kind}') AND (query LIKE '%{id}%'){filter_for_is_initial_query} SYNC"
+    )
+    node.query("SYSTEM FLUSH LOGS")
+    duration = (
+        int(
+            node.query(
+                f"SELECT query_duration_ms FROM system.query_log WHERE query_kind='KillQuery' AND query LIKE '%{id}%' AND type='QueryFinish'"
+            )
+        )
+        / 1000
+    )
+    print(
+        f"{get_node_name(node)}: Cancelled {operation_name} {id} after {duration} seconds"
+    )
+    if timeout is not None:
+        assert duration < timeout
+
+
+# Stops all ZooKeeper servers.
+def stop_zookeeper_servers(zoo_nodes):
+    print(f"Stopping ZooKeeper servers {zoo_nodes}")
+    old_time = time.monotonic()
+    cluster.stop_zookeeper_nodes(zoo_nodes)
+    print(
+        f"Stopped ZooKeeper servers {zoo_nodes} in {time.monotonic() - old_time} seconds"
+    )
+
+
+# Starts all ZooKeeper servers back.
+def start_zookeeper_servers(zoo_nodes):
+    print(f"Starting ZooKeeper servers {zoo_nodes}")
+    old_time = time.monotonic()
+    cluster.start_zookeeper_nodes(zoo_nodes)
+    print(
+        f"Started ZooKeeper servers {zoo_nodes} in {time.monotonic() - old_time} seconds"
+    )
+
+
+# Sleeps for a random amount of time (and occasionally skips the sleep entirely).
+def random_sleep(max_seconds): + if random.randint(0, 5) > 0: + sleep(random.uniform(0, max_seconds)) + + +def sleep(seconds): + print(f"Sleeping {seconds} seconds") + time.sleep(seconds) + + +# Checks that BACKUP and RESTORE cleaned up properly with no trash left in ZooKeeper, backups folder, and logs. +class NoTrashChecker: + def __init__(self): + self.expect_backups = [] + self.expect_unfinished_backups = [] + self.expect_errors = [] + self.allow_errors = [] + self.check_zookeeper = True + + # Sleep 1 second to ensure this NoTrashChecker won't collect errors from a possible previous NoTrashChecker. + time.sleep(1) + + self.__start_time_for_collecting_errors = time.gmtime() + self.__previous_list_of_backups = set( + os.listdir(os.path.join(node1.cluster.instances_dir, "backups")) + ) + + self.__previous_list_of_znodes = set( + node1.query( + "SELECT name FROM system.zookeeper WHERE path = '/clickhouse/backups' " + + "AND NOT (name == 'alive_tracker')" + ).splitlines() + ) + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + list_of_znodes = set( + node1.query( + "SELECT name FROM system.zookeeper WHERE path = '/clickhouse/backups' " + + "AND NOT (name == 'alive_tracker')" + ).splitlines() + ) + new_znodes = list_of_znodes.difference(self.__previous_list_of_znodes) + if new_znodes: + print(f"Found nodes in ZooKeeper: {new_znodes}") + for node in new_znodes: + print( + f"Nodes in '/clickhouse/backups/{node}':\n" + + node1.query( + f"SELECT name FROM system.zookeeper WHERE path = '/clickhouse/backups/{node}'" + ) + ) + print( + f"Nodes in '/clickhouse/backups/{node}/stage':\n" + + node1.query( + f"SELECT name FROM system.zookeeper WHERE path = '/clickhouse/backups/{node}/stage'" + ) + ) + if self.check_zookeeper: + assert new_znodes == set() + + list_of_backups = set( + os.listdir(os.path.join(node1.cluster.instances_dir, "backups")) + ) + new_backups = list_of_backups.difference(self.__previous_list_of_backups) + unfinished_backups = set( + backup + for backup in new_backups + if not os.path.exists( + os.path.join(node1.cluster.instances_dir, "backups", backup, ".backup") + ) + ) + new_backups = set( + backup for backup in new_backups if backup not in unfinished_backups + ) + if new_backups: + print(f"Found new backups: {new_backups}") + if unfinished_backups: + print(f"Found unfinished backups: {unfinished_backups}") + assert new_backups == set(self.expect_backups) + assert unfinished_backups == set(self.expect_unfinished_backups) + + all_errors = set() + start_time = time.strftime( + "%Y-%m-%d %H:%M:%S", self.__start_time_for_collecting_errors + ) + for node in nodes: + errors_query_result = node.query( + "SELECT name FROM system.errors WHERE last_error_time >= toDateTime('" + + start_time + + "') " + + "AND NOT ((name == 'KEEPER_EXCEPTION') AND (last_error_message LIKE '%Fault injection%')) " + + "AND NOT (name == 'NO_ELEMENTS_IN_CONFIG')" + ) + errors = errors_query_result.splitlines() + if errors: + print(f"{get_node_name(node)}: Found errors: {errors}") + print( + node.query( + "SELECT name, last_error_message FROM system.errors WHERE last_error_time >= toDateTime('" + + start_time + + "')" + ) + ) + for error in errors: + assert (error in self.expect_errors) or (error in self.allow_errors) + all_errors.update(errors) + + not_found_expected_errors = set(self.expect_errors).difference(all_errors) + if not_found_expected_errors: + print(f"Not found expected errors: {not_found_expected_errors}") + assert False + + +__backup_id_of_successful_backup = 
None


# Generates a backup which will be used to test RESTORE.
def get_backup_id_of_successful_backup():
    global __backup_id_of_successful_backup
    if __backup_id_of_successful_backup is None:
        __backup_id_of_successful_backup = random_id()
        with NoTrashChecker() as no_trash_checker:
            print("Will make backup successfully")
            backup_id = __backup_id_of_successful_backup
            create_and_fill_table(random_node())
            initiator = random_node()
            print(f"Using {get_node_name(initiator)} as initiator")
            initiator.query(
                f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {get_backup_name(backup_id)} SETTINGS id='{backup_id}' ASYNC"
            )
            wait_status(initiator, "BACKUP_CREATED", backup_id=backup_id)
            assert get_num_system_processes(nodes, backup_id=backup_id) == 0
            no_trash_checker.expect_backups = [backup_id]

        # Dropping the table before restoring.
        node1.query("DROP TABLE tbl ON CLUSTER 'cluster' SYNC")

    return __backup_id_of_successful_backup


# Actual tests


# Test that a BACKUP operation can be cancelled with KILL QUERY.
def test_cancel_backup():
    with NoTrashChecker() as no_trash_checker:
        create_and_fill_table(random_node())

        initiator = random_node()
        print(f"Using {get_node_name(initiator)} as initiator")

        backup_id = random_id()
        initiator.query(
            f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {get_backup_name(backup_id)} SETTINGS id='{backup_id}' ASYNC"
        )

        assert get_status(initiator, backup_id=backup_id) == "CREATING_BACKUP"
        assert get_num_system_processes(initiator, backup_id=backup_id) >= 1

        # We shouldn't wait too long here, because otherwise the backup might be completed before we cancel it.
        random_sleep(3)

        node_to_cancel, cancel_as_initiator = random.choice(
            [(node1, False), (node2, False), (initiator, True)]
        )

        wait_num_system_processes(
            node_to_cancel,
            "1+",
            backup_id=backup_id,
            is_initial_query=cancel_as_initiator,
        )

        print(
            f"Cancelling on {'initiator' if cancel_as_initiator else 'node'} {get_node_name(node_to_cancel)}"
        )

        # The timeout is 3 seconds here because a backup must be cancelled quickly.
        kill_query(
            node_to_cancel,
            backup_id=backup_id,
            is_initial_query=cancel_as_initiator,
            timeout=3,
        )

        if cancel_as_initiator:
            assert get_status(initiator, backup_id=backup_id) == "BACKUP_CANCELLED"
        wait_status(initiator, "BACKUP_CANCELLED", backup_id=backup_id, timeout=3)

        assert "QUERY_WAS_CANCELLED" in get_error(initiator, backup_id=backup_id)
        assert get_num_system_processes(nodes, backup_id=backup_id) == 0
        no_trash_checker.expect_errors = ["QUERY_WAS_CANCELLED"]


# Test that a RESTORE operation can be cancelled with KILL QUERY.
def test_cancel_restore():
    # Make backup.
    backup_id = get_backup_id_of_successful_backup()

    # Cancel restoring.
    with NoTrashChecker() as no_trash_checker:
        print("Will cancel restoring")
        initiator = random_node()
        print(f"Using {get_node_name(initiator)} as initiator")

        restore_id = random_id()
        initiator.query(
            f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {get_backup_name(backup_id)} SETTINGS id='{restore_id}' ASYNC"
        )

        assert get_status(initiator, restore_id=restore_id) == "RESTORING"
        assert get_num_system_processes(initiator, restore_id=restore_id) >= 1

        # We shouldn't wait too long here, because otherwise the restore might be completed before we cancel it.
+        random_sleep(3)
+
+        node_to_cancel, cancel_as_initiator = random.choice(
+            [(node1, False), (node2, False), (initiator, True)]
+        )
+
+        wait_num_system_processes(
+            node_to_cancel,
+            "1+",
+            restore_id=restore_id,
+            is_initial_query=cancel_as_initiator,
+        )
+
+        print(
+            f"Cancelling on {'initiator' if cancel_as_initiator else 'node'} {get_node_name(node_to_cancel)}"
+        )
+
+        # The timeout is 3 seconds here because a restore must be cancelled quickly.
+        kill_query(
+            node_to_cancel,
+            restore_id=restore_id,
+            is_initial_query=cancel_as_initiator,
+            timeout=3,
+        )
+
+        if cancel_as_initiator:
+            assert get_status(initiator, restore_id=restore_id) == "RESTORE_CANCELLED"
+        wait_status(initiator, "RESTORE_CANCELLED", restore_id=restore_id, timeout=3)
+
+        assert "QUERY_WAS_CANCELLED" in get_error(initiator, restore_id=restore_id)
+        assert get_num_system_processes(nodes, restore_id=restore_id) == 0
+        no_trash_checker.expect_errors = ["QUERY_WAS_CANCELLED"]
+
+    # Restore successfully.
+    with NoTrashChecker() as no_trash_checker:
+        print("Will restore from backup successfully")
+        restore_id = random_id()
+        initiator = random_node()
+        print(f"Using {get_node_name(initiator)} as initiator")
+
+        initiator.query(
+            f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {get_backup_name(backup_id)} SETTINGS id='{restore_id}' ASYNC"
+        )
+
+        wait_status(initiator, "RESTORED", restore_id=restore_id)
+        assert get_num_system_processes(nodes, restore_id=restore_id) == 0


+# Test that shutdown cancels a running backup and doesn't wait until it finishes.
+def test_shutdown_cancels_backup():
+    with NoTrashChecker() as no_trash_checker:
+        create_and_fill_table(random_node())
+
+        initiator = random_node()
+        print(f"Using {get_node_name(initiator)} as initiator")
+
+        backup_id = random_id()
+        initiator.query(
+            f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {get_backup_name(backup_id)} SETTINGS id='{backup_id}' ASYNC"
+        )
+
+        assert get_status(initiator, backup_id=backup_id) == "CREATING_BACKUP"
+        assert get_num_system_processes(initiator, backup_id=backup_id) >= 1
+
+        # We shouldn't wait too long here, because otherwise the backup might be completed before we cancel it.
+        random_sleep(3)
+
+        node_to_restart = random.choice([node1, node2])
+        wait_num_system_processes(node_to_restart, "1+", backup_id=backup_id)
+
+        print(f"{get_node_name(node_to_restart)}: Restarting...")
+        node_to_restart.restart_clickhouse()  # Must cancel the backup.
+        print(f"{get_node_name(node_to_restart)}: Restarted")
+
+        wait_num_system_processes(nodes, 0, backup_id=backup_id)
+
+        if initiator != node_to_restart:
+            assert get_status(initiator, backup_id=backup_id) == "BACKUP_CANCELLED"
+            assert "QUERY_WAS_CANCELLED" in get_error(initiator, backup_id=backup_id)
+
+        # The information about this cancelled backup must be stored in system.backup_log
+        initiator.query("SYSTEM FLUSH LOGS")
+        assert initiator.query(
+            f"SELECT status FROM system.backup_log WHERE id='{backup_id}' ORDER BY status"
+        ) == TSV(["CREATING_BACKUP", "BACKUP_CANCELLED"])
+
+        no_trash_checker.expect_errors = ["QUERY_WAS_CANCELLED"]


+# After an error, a backup should clean up the destination folder and the ZooKeeper nodes it used.
+# No unexpected errors must be generated.
+def test_error_leaves_no_trash():
+    with NoTrashChecker() as no_trash_checker:
+        # We create table "tbl" on one node only in order to make "BACKUP TABLE tbl ON CLUSTER" fail
+        # (because of the non-existing table on another node).
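+        # The expected outcome: the backup fails with UNKNOWN_TABLE, and the
+        # NoTrashChecker verifies that neither ZooKeeper znodes nor a half-written
+        # backup directory are left behind.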
+ create_and_fill_table(random_node(), on_cluster=False) + + initiator = random_node() + print(f"Using {get_node_name(initiator)} as initiator") + + backup_id = random_id() + initiator.query( + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {get_backup_name(backup_id)} SETTINGS id='{backup_id}' ASYNC" + ) + + wait_status(initiator, "BACKUP_FAILED", backup_id=backup_id) + assert "UNKNOWN_TABLE" in get_error(initiator, backup_id=backup_id) + + assert get_num_system_processes(nodes, backup_id=backup_id) == 0 + no_trash_checker.expect_errors = ["UNKNOWN_TABLE"] + + +# A backup must be stopped if Zookeeper is disconnected longer than `failure_after_host_disconnected_for_seconds`. +def test_long_disconnection_stops_backup(): + with NoTrashChecker() as no_trash_checker, ConfigManager() as config_manager: + # Config "faster_zk_disconnect_detect.xml" is used in this test to decrease number of retries when reconnecting to ZooKeeper. + # Without this config this test can take several minutes (instead of seconds) to run. + config_manager.add_main_config(nodes, "configs/faster_zk_disconnect_detect.xml") + + create_and_fill_table(random_node(), num_parts=100) + + initiator = random_node() + print(f"Using {get_node_name(initiator)} as initiator") + + backup_id = random_id() + initiator.query( + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {get_backup_name(backup_id)} SETTINGS id='{backup_id}' ASYNC", + settings={"backup_restore_failure_after_host_disconnected_for_seconds": 3}, + ) + + assert get_status(initiator, backup_id=backup_id) == "CREATING_BACKUP" + assert get_num_system_processes(initiator, backup_id=backup_id) >= 1 + + no_trash_checker.expect_unfinished_backups = [backup_id] + no_trash_checker.allow_errors = [ + "FAILED_TO_SYNC_BACKUP_OR_RESTORE", + "KEEPER_EXCEPTION", + "SOCKET_TIMEOUT", + "CANNOT_READ_ALL_DATA", + "NETWORK_ERROR", + "TABLE_IS_READ_ONLY", + ] + no_trash_checker.check_zookeeper = False + + with PartitionManager() as pm: + random_sleep(3) + + time_before_disconnection = time.monotonic() + + node_to_drop_zk_connection = random_node() + print( + f"Dropping connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper" + ) + pm.drop_instance_zk_connections(node_to_drop_zk_connection) + + # Being disconnected from ZooKeeper a backup is expected to fail. + wait_status(initiator, "BACKUP_FAILED", backup_id=backup_id) + + time_to_fail = time.monotonic() - time_before_disconnection + error = get_error(initiator, backup_id=backup_id) + print(f"error={error}") + assert "Lost connection" in error + + # A backup is expected to fail, but it isn't expected to fail too soon. + print(f"Backup failed after {time_to_fail} seconds disconnection") + assert time_to_fail > 3 + assert time_to_fail < 30 + + +# A backup must NOT be stopped if Zookeeper is disconnected shorter than `failure_after_host_disconnected_for_seconds`. 
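+# (Below, the connection is dropped for at most about 3 seconds while the threshold
+#  is set to 6 seconds, so the host re-creates its ephemeral 'alive' node in time
+#  and the operation carries on.)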
+def test_short_disconnection_doesnt_stop_backup(): + with NoTrashChecker() as no_trash_checker, ConfigManager() as config_manager: + use_faster_zk_disconnect_detect = random.choice([True, False]) + if use_faster_zk_disconnect_detect: + print("Using faster_zk_disconnect_detect.xml") + config_manager.add_main_config( + nodes, "configs/faster_zk_disconnect_detect.xml" + ) + + create_and_fill_table(random_node()) + + initiator = random_node() + print(f"Using {get_node_name(initiator)} as initiator") + + backup_id = random_id() + initiator.query( + f"BACKUP TABLE tbl ON CLUSTER 'cluster' TO {get_backup_name(backup_id)} SETTINGS id='{backup_id}' ASYNC", + settings={"backup_restore_failure_after_host_disconnected_for_seconds": 6}, + ) + + assert get_status(initiator, backup_id=backup_id) == "CREATING_BACKUP" + assert get_num_system_processes(initiator, backup_id=backup_id) >= 1 + + # Dropping connection for less than `failure_after_host_disconnected_for_seconds` + with PartitionManager() as pm: + random_sleep(3) + node_to_drop_zk_connection = random_node() + print( + f"Dropping connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper" + ) + pm.drop_instance_zk_connections(node_to_drop_zk_connection) + random_sleep(3) + print( + f"Restoring connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper" + ) + + # Backup must be successful. + wait_status(initiator, "BACKUP_CREATED", backup_id=backup_id) + assert get_num_system_processes(nodes, backup_id=backup_id) == 0 + + no_trash_checker.expect_backups = [backup_id] + no_trash_checker.allow_errors = [ + "KEEPER_EXCEPTION", + "SOCKET_TIMEOUT", + "CANNOT_READ_ALL_DATA", + "NETWORK_ERROR", + "TABLE_IS_READ_ONLY", + ] + + +# A restore must NOT be stopped if Zookeeper is disconnected shorter than `failure_after_host_disconnected_for_seconds`. +def test_short_disconnection_doesnt_stop_restore(): + # Make a backup. + backup_id = get_backup_id_of_successful_backup() + + # Restore from the backup. + with NoTrashChecker() as no_trash_checker, ConfigManager() as config_manager: + use_faster_zk_disconnect_detect = random.choice([True, False]) + if use_faster_zk_disconnect_detect: + print("Using faster_zk_disconnect_detect.xml") + config_manager.add_main_config( + nodes, "configs/faster_zk_disconnect_detect.xml" + ) + + initiator = random_node() + print(f"Using {get_node_name(initiator)} as initiator") + + restore_id = random_id() + initiator.query( + f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {get_backup_name(backup_id)} SETTINGS id='{restore_id}' ASYNC", + settings={"backup_restore_failure_after_host_disconnected_for_seconds": 6}, + ) + + assert get_status(initiator, restore_id=restore_id) == "RESTORING" + assert get_num_system_processes(initiator, restore_id=restore_id) >= 1 + + # Dropping connection for less than `failure_after_host_disconnected_for_seconds` + with PartitionManager() as pm: + random_sleep(3) + node_to_drop_zk_connection = random_node() + print( + f"Dropping connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper" + ) + pm.drop_instance_zk_connections(node_to_drop_zk_connection) + random_sleep(3) + print( + f"Restoring connection between {get_node_name(node_to_drop_zk_connection)} and ZooKeeper" + ) + + # Restore must be successful. 
+ wait_status(initiator, "RESTORED", restore_id=restore_id) + assert get_num_system_processes(nodes, restore_id=restore_id) == 0 + + no_trash_checker.allow_errors = [ + "KEEPER_EXCEPTION", + "SOCKET_TIMEOUT", + "CANNOT_READ_ALL_DATA", + "NETWORK_ERROR", + "TABLE_IS_READ_ONLY", + ] diff --git a/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py index 846c41592f7..3dea986e3d9 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py @@ -145,7 +145,7 @@ def wait_for_restore(node, restore_id): def check_backup_error(error): expected_errors = [ - "Concurrent backups not supported", + "Concurrent backups are not allowed", "BACKUP_ALREADY_EXISTS", ] assert any([expected_error in error for expected_error in expected_errors]) @@ -153,7 +153,7 @@ def check_backup_error(error): def check_restore_error(error): expected_errors = [ - "Concurrent restores not supported", + "Concurrent restores are not allowed", "Cannot restore the table default.tbl because it already contains some data", ] assert any([expected_error in error for expected_error in expected_errors]) From 7c3ba9324a76ab05ebd132b80bc358f48b135f43 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 30 Oct 2024 22:09:14 +0100 Subject: [PATCH 45/80] Correct test "test_stop_other_host_during_backup[False]" and remove test "test_stop_other_host_during_backup[True]" because it was replaced by new test "test_long_disconnection_stops_backup". --- .../test_backup_restore_on_cluster/test.py | 26 +++++-------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/tests/integration/test_backup_restore_on_cluster/test.py b/tests/integration/test_backup_restore_on_cluster/test.py index 257938a75c5..4d4fe0e665a 100644 --- a/tests/integration/test_backup_restore_on_cluster/test.py +++ b/tests/integration/test_backup_restore_on_cluster/test.py @@ -1162,8 +1162,7 @@ def test_get_error_from_other_host(): ) -@pytest.mark.parametrize("kill", [False, True]) -def test_stop_other_host_during_backup(kill): +def test_shutdown_waits_for_backup(): node1.query( "CREATE TABLE tbl ON CLUSTER 'cluster' (" "x UInt8" @@ -1182,7 +1181,7 @@ def test_stop_other_host_during_backup(kill): # If kill=False the pending backup must be completed # If kill=True the pending backup might be completed or failed - node2.stop_clickhouse(kill=kill) + node2.stop_clickhouse(kill=False) assert_eq_with_retry( node1, @@ -1192,22 +1191,11 @@ def test_stop_other_host_during_backup(kill): ) status = node1.query(f"SELECT status FROM system.backups WHERE id='{id}'").strip() - - if kill: - expected_statuses = ["BACKUP_CREATED", "BACKUP_FAILED"] - else: - expected_statuses = ["BACKUP_CREATED", "BACKUP_CANCELLED"] - - assert status in expected_statuses + assert status == "BACKUP_CREATED" node2.start_clickhouse() - if status == "BACKUP_CREATED": - node1.query("DROP TABLE tbl ON CLUSTER 'cluster' SYNC") - node1.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") - node1.query("SYSTEM SYNC REPLICA tbl") - assert node1.query("SELECT * FROM tbl ORDER BY x") == TSV([3, 5]) - elif status == "BACKUP_FAILED": - assert not os.path.exists( - os.path.join(get_path_to_backup(backup_name), ".backup") - ) + node1.query("DROP TABLE tbl ON CLUSTER 'cluster' SYNC") + node1.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster' FROM {backup_name}") + node1.query("SYSTEM SYNC 
REPLICA tbl") + assert node1.query("SELECT * FROM tbl ORDER BY x") == TSV([3, 5]) From 3184b1ef11afa500782118c9f663517ab4ebf20b Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 31 Oct 2024 12:24:03 +0000 Subject: [PATCH 46/80] Fix reading of LowCardinality dictionary in Dynamic column --- .../Serializations/ISerialization.cpp | 8 ++++++++ src/DataTypes/Serializations/ISerialization.h | 2 ++ .../SerializationLowCardinality.cpp | 2 +- .../MergeTree/MergeTreeReaderWide.cpp | 2 +- ...dynamic_low_cardinality_dict_bug.reference | 20 +++++++++++++++++++ ...03260_dynamic_low_cardinality_dict_bug.sql | 12 +++++++++++ 6 files changed, 44 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/03260_dynamic_low_cardinality_dict_bug.reference create mode 100644 tests/queries/0_stateless/03260_dynamic_low_cardinality_dict_bug.sql diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp index fdcdf9e0cda..42f1505118b 100644 --- a/src/DataTypes/Serializations/ISerialization.cpp +++ b/src/DataTypes/Serializations/ISerialization.cpp @@ -434,6 +434,14 @@ bool ISerialization::isDynamicSubcolumn(const DB::ISerialization::SubstreamPath return false; } +bool ISerialization::isLowCardinalityDictionarySubcolumn(const DB::ISerialization::SubstreamPath & path, size_t prefix_len) +{ + if (prefix_len == 0 || prefix_len > path.size()) + return false; + + return path[prefix_len - 1].type == SubstreamType::DictionaryKeys; +} + ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len) { assert(prefix_len <= path.size()); diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h index 7bd58a8a981..e8056ea9665 100644 --- a/src/DataTypes/Serializations/ISerialization.h +++ b/src/DataTypes/Serializations/ISerialization.h @@ -463,6 +463,8 @@ public: /// Returns true if stream with specified path corresponds to dynamic subcolumn. 
    static bool isDynamicSubcolumn(const SubstreamPath & path, size_t prefix_len);
 
+    static bool isLowCardinalityDictionarySubcolumn(const SubstreamPath & path, size_t prefix_len);
+
 protected:
     template <typename State>
     State * checkAndGetState(const StatePtr & state) const;
 
diff --git a/src/DataTypes/Serializations/SerializationLowCardinality.cpp b/src/DataTypes/Serializations/SerializationLowCardinality.cpp
index baaab6ba3c3..248fe2681b0 100644
--- a/src/DataTypes/Serializations/SerializationLowCardinality.cpp
+++ b/src/DataTypes/Serializations/SerializationLowCardinality.cpp
@@ -54,7 +54,7 @@ void SerializationLowCardinality::enumerateStreams(
         .withSerializationInfo(data.serialization_info);
 
     settings.path.back().data = dict_data;
-    dict_inner_serialization->enumerateStreams(settings, callback, dict_data);
+    callback(settings.path);
 
     settings.path.back() = Substream::DictionaryIndexes;
     settings.path.back().data = data;
diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp
index 898bf5a2933..9b93762a797 100644
--- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp
+++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp
@@ -262,7 +262,7 @@ MergeTreeReaderWide::FileStreams::iterator MergeTreeReaderWide::addStream(const
         /*num_columns_in_mark=*/ 1);
 
     auto stream_settings = settings;
-    stream_settings.is_low_cardinality_dictionary = substream_path.size() > 1 && substream_path[substream_path.size() - 2].type == ISerialization::Substream::Type::DictionaryKeys;
+    stream_settings.is_low_cardinality_dictionary = ISerialization::isLowCardinalityDictionarySubcolumn(substream_path, substream_path.size());
 
     auto create_stream = [&]()
     {
diff --git a/tests/queries/0_stateless/03260_dynamic_low_cardinality_dict_bug.reference b/tests/queries/0_stateless/03260_dynamic_low_cardinality_dict_bug.reference
new file mode 100644
index 00000000000..8ae0f8e9f14
--- /dev/null
+++ b/tests/queries/0_stateless/03260_dynamic_low_cardinality_dict_bug.reference
@@ -0,0 +1,20 @@
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
+12345678
diff --git a/tests/queries/0_stateless/03260_dynamic_low_cardinality_dict_bug.sql b/tests/queries/0_stateless/03260_dynamic_low_cardinality_dict_bug.sql
new file mode 100644
index 00000000000..c5b981d5965
--- /dev/null
+++ b/tests/queries/0_stateless/03260_dynamic_low_cardinality_dict_bug.sql
@@ -0,0 +1,12 @@
+set allow_experimental_dynamic_type = 1;
+set min_bytes_to_use_direct_io = 0;
+
+drop table if exists test;
+create table test (id UInt64, d Dynamic) engine=MergeTree order by id settings min_rows_for_wide_part=1, min_bytes_for_wide_part=1, index_granularity=1, use_adaptive_write_buffer_for_dynamic_subcolumns=0, max_compress_block_size=8, min_compress_block_size=8, use_compact_variant_discriminators_serialization=0;
+
+insert into test select number, '12345678'::LowCardinality(String) from numbers(20);
+
+select d.`LowCardinality(String)` from test settings max_threads=1;
+
+drop table test;
+

From 9ea9e9422e478b84e8c750ba69e005a16d8ff30f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?=
Date: Thu, 31 Oct 2024 14:45:16 +0100
Subject: [PATCH 47/80] Fix bad cleanup of output format in client when an
 exception happens

---
 src/Client/ClientBase.cpp | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp
index 73885ba522d..b6bf637ab44 100644
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@@ -1454,8 +1454,22 @@ void ClientBase::resetOutput()
 
     /// Order is important: format, compression, file
 
-    if (output_format)
-        output_format->finalize();
+    try
+    {
+        if (output_format)
+            output_format->finalize();
+    }
+    catch (...)
+    {
+        /// We need to make sure we continue resetting output_format (will stop threads on parallel output)
+        /// as well as cleaning other output related setup
+        if (!have_error)
+        {
+            client_exception
+                = std::make_unique<Exception>(getCurrentExceptionMessageAndPattern(print_stack_trace), getCurrentExceptionCode());
+            have_error = true;
+        }
+    }
 
     output_format.reset();
     logs_out_stream.reset();

From 33cbc540d523888eea630f467718ac84f723f068 Mon Sep 17 00:00:00 2001
From: Thom O'Connor
Date: Thu, 31 Oct 2024 13:49:24 +0000
Subject: [PATCH 48/80] Update kill.md - remove ON CLUSTER for KILL MUTATION

ON CLUSTER is not valid for KILL MUTATION, and will result in an exception.
Correcting the docs for this syntax
---
 docs/en/sql-reference/statements/kill.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/en/sql-reference/statements/kill.md b/docs/en/sql-reference/statements/kill.md
index 667a5b51f5c..ff6f64a97fe 100644
--- a/docs/en/sql-reference/statements/kill.md
+++ b/docs/en/sql-reference/statements/kill.md
@@ -83,7 +83,7 @@ The presence of long-running or incomplete mutations often indicates that a Clic
 - Or manually kill some of these mutations by sending a `KILL` command.
 
 ``` sql
-KILL MUTATION [ON CLUSTER cluster]
+KILL MUTATION
   WHERE <where expression to SELECT FROM system.mutations query>
   [TEST]
   [FORMAT format]
@@ -135,7 +135,6 @@ KILL MUTATION WHERE database = 'default' AND table = 'table'
 
 -- Cancel the specific mutation:
 KILL MUTATION WHERE database = 'default' AND table = 'table' AND mutation_id = 'mutation_3.txt'
 ```
-:::tip If you are killing a mutation in ClickHouse Cloud or in a self-managed cluster, then be sure to use the ```ON CLUSTER [cluster-name]``` option, in order to ensure the mutation is killed on all replicas:::
 
 The query is useful when a mutation is stuck and cannot finish (e.g. if some function in the mutation query throws an exception when applied to the data contained in the table).
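For context on the syntax corrected above, a minimal sketch of the workflow the docs describe — find a stuck mutation in `system.mutations`, then kill it by `mutation_id`. The column names (`is_done`, `latest_fail_reason`) are assumed from the current `system.mutations` layout; verify them against your server version.

``` sql
-- Find mutations that have not finished, together with their last failure reason.
SELECT database, table, mutation_id, latest_fail_reason
FROM system.mutations
WHERE is_done = 0;

-- Kill the offending mutation by its id (no ON CLUSTER clause).
KILL MUTATION WHERE database = 'default' AND table = 'table' AND mutation_id = 'mutation_3.txt';
```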
From b7907051b3eb7b2d669d48beda5dfde130d93b12 Mon Sep 17 00:00:00 2001
From: avogar
Date: Thu, 31 Oct 2024 13:52:15 +0000
Subject: [PATCH 49/80] Fix comments, update tests

---
 src/DataTypes/Serializations/ISerialization.cpp | 6 +++---
 src/DataTypes/Serializations/ISerialization.h | 2 +-
 src/Storages/MergeTree/MergeTreeReaderWide.cpp | 2 +-
 .../02240_get_type_serialization_streams.reference | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/DataTypes/Serializations/ISerialization.cpp b/src/DataTypes/Serializations/ISerialization.cpp
index 42f1505118b..90ad822e6f5 100644
--- a/src/DataTypes/Serializations/ISerialization.cpp
+++ b/src/DataTypes/Serializations/ISerialization.cpp
@@ -434,12 +434,12 @@ bool ISerialization::isDynamicSubcolumn(const DB::ISerialization::SubstreamPath
     return false;
 }
 
-bool ISerialization::isLowCardinalityDictionarySubcolumn(const DB::ISerialization::SubstreamPath & path, size_t prefix_len)
+bool ISerialization::isLowCardinalityDictionarySubcolumn(const DB::ISerialization::SubstreamPath & path)
 {
-    if (prefix_len == 0 || prefix_len > path.size())
+    if (path.empty())
         return false;
 
-    return path[prefix_len - 1].type == SubstreamType::DictionaryKeys;
+    return path[path.size() - 1].type == SubstreamType::DictionaryKeys;
 }
 
 ISerialization::SubstreamData ISerialization::createFromPath(const SubstreamPath & path, size_t prefix_len)
diff --git a/src/DataTypes/Serializations/ISerialization.h b/src/DataTypes/Serializations/ISerialization.h
index e8056ea9665..400bdbf32d3 100644
--- a/src/DataTypes/Serializations/ISerialization.h
+++ b/src/DataTypes/Serializations/ISerialization.h
@@ -463,7 +463,7 @@ public:
     /// Returns true if stream with specified path corresponds to dynamic subcolumn.
     static bool isDynamicSubcolumn(const SubstreamPath & path, size_t prefix_len);
 
-    static bool isLowCardinalityDictionarySubcolumn(const SubstreamPath & path, size_t prefix_len);
+    static bool isLowCardinalityDictionarySubcolumn(const SubstreamPath & path);
 
 protected:
     template <typename State>
     State * checkAndGetState(const StatePtr & state) const;
diff --git a/src/Storages/MergeTree/MergeTreeReaderWide.cpp b/src/Storages/MergeTree/MergeTreeReaderWide.cpp
index 9b93762a797..77231d8d392 100644
--- a/src/Storages/MergeTree/MergeTreeReaderWide.cpp
+++ b/src/Storages/MergeTree/MergeTreeReaderWide.cpp
@@ -262,7 +262,7 @@ MergeTreeReaderWide::FileStreams::iterator MergeTreeReaderWide::addStream(const
         /*num_columns_in_mark=*/ 1);
 
     auto stream_settings = settings;
-    stream_settings.is_low_cardinality_dictionary = ISerialization::isLowCardinalityDictionarySubcolumn(substream_path, substream_path.size());
+    stream_settings.is_low_cardinality_dictionary = ISerialization::isLowCardinalityDictionarySubcolumn(substream_path);
 
     auto create_stream = [&]()
     {
diff --git a/tests/queries/0_stateless/02240_get_type_serialization_streams.reference b/tests/queries/0_stateless/02240_get_type_serialization_streams.reference
index 15e9bf87562..eb16198e877 100644
--- a/tests/queries/0_stateless/02240_get_type_serialization_streams.reference
+++ b/tests/queries/0_stateless/02240_get_type_serialization_streams.reference
@@ -1,7 +1,7 @@
 ['{ArraySizes}','{ArrayElements, Regular}']
 ['{ArraySizes}','{ArrayElements, TupleElement(keys), Regular}','{ArrayElements, TupleElement(values), Regular}']
 ['{TupleElement(1), Regular}','{TupleElement(2), Regular}','{TupleElement(3), Regular}']
-['{DictionaryKeys, Regular}','{DictionaryIndexes}']
+['{DictionaryKeys}','{DictionaryIndexes}']
 ['{NullMap}','{NullableElements, Regular}']
 ['{ArraySizes}','{ArrayElements, Regular}']
 ['{ArraySizes}','{ArrayElements, TupleElement(keys), Regular}','{ArrayElements, TupleElement(values), Regular}']

From cdb479d10daeb0edd4bd1ff2c9e400b6cb77c07d Mon Sep 17 00:00:00 2001
From: Pablo Marcos
Date: Thu, 31 Oct 2024 14:37:37 +0000
Subject: [PATCH 50/80] Fix debug log timestamp

Increase the error margin for the test to avoid flakiness
in the intervals where the number of events is smaller.
---
 src/Interpreters/QueryMetricLog.cpp | 6 +++---
 tests/queries/0_stateless/03203_system_query_metric_log.sh | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Interpreters/QueryMetricLog.cpp b/src/Interpreters/QueryMetricLog.cpp
index 8a84c95a5a3..5ab3fe590e0 100644
--- a/src/Interpreters/QueryMetricLog.cpp
+++ b/src/Interpreters/QueryMetricLog.cpp
@@ -100,7 +100,7 @@ void QueryMetricLog::startQuery(const String & query_id, TimePoint start_time, U
     const auto query_info = process_list.getQueryInfo(query_id, false, true, false);
     if (!query_info)
     {
-        LOG_TRACE(logger, "Query {} is not running anymore, so we couldn't get its QueryInfo", query_id);
+        LOG_TRACE(logger, "Query {} is not running anymore, so we couldn't get its QueryStatusInfo", query_id);
         return;
     }
 
@@ -156,8 +156,8 @@ std::optional<QueryMetricLogElement> QueryMetricLog::createLogMetricElement(cons
 {
     /// fmtlib supports subsecond formatting in 10.0.0. We're in 9.1.0, so we need to add the subsecond part ourselves.
     auto seconds = std::chrono::time_point_cast<std::chrono::seconds>(query_info_time);
-    auto milliseconds = std::chrono::duration_cast<std::chrono::milliseconds>(query_info_time - seconds).count();
-    LOG_DEBUG(logger, "Collecting query_metric_log for query {} with QueryStatusInfo from {:%Y.%m.%d %H:%M:%S}.{:05}. Schedule next: {}", query_id, seconds, milliseconds, schedule_next);
+    auto microseconds = std::chrono::duration_cast<std::chrono::microseconds>(query_info_time - seconds).count();
+    LOG_DEBUG(logger, "Collecting query_metric_log for query {} with QueryStatusInfo from {:%Y.%m.%d %H:%M:%S}.{:06}.
Schedule next: {}", query_id, seconds, microseconds, schedule_next); std::unique_lock lock(queries_mutex); auto query_status_it = queries.find(query_id); diff --git a/tests/queries/0_stateless/03203_system_query_metric_log.sh b/tests/queries/0_stateless/03203_system_query_metric_log.sh index b66e274df78..bf94be79d7c 100755 --- a/tests/queries/0_stateless/03203_system_query_metric_log.sh +++ b/tests/queries/0_stateless/03203_system_query_metric_log.sh @@ -24,7 +24,7 @@ function check_log() $CLICKHOUSE_CLIENT -m -q """ SELECT '--Interval $interval: check that amount of events is correct'; SELECT - count() BETWEEN (ceil(2500 / $interval) * 0.8) AND (ceil(2500 / $interval) * 1.2) + count() BETWEEN ((ceil(2500 / $interval) - 1) * 0.8) AND ((ceil(2500 / $interval) + 1) * 1.2) FROM system.query_metric_log WHERE event_date >= yesterday() AND query_id = '${query_prefix}_${interval}' """ From a57c64e6b01dde6084e40162142bf1325f59f11c Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 31 Oct 2024 14:59:04 +0000 Subject: [PATCH 51/80] fix async inserts with empty blocks via native protocol --- src/Interpreters/AsynchronousInsertQueue.cpp | 7 +++++ ..._async_insert_native_empty_block.reference | 9 +++++++ .../03257_async_insert_native_empty_block.sh | 27 +++++++++++++++++++ 3 files changed, 43 insertions(+) create mode 100644 tests/queries/0_stateless/03257_async_insert_native_empty_block.reference create mode 100755 tests/queries/0_stateless/03257_async_insert_native_empty_block.sh diff --git a/src/Interpreters/AsynchronousInsertQueue.cpp b/src/Interpreters/AsynchronousInsertQueue.cpp index 5cc97effad6..8b8a6d4e9ef 100644 --- a/src/Interpreters/AsynchronousInsertQueue.cpp +++ b/src/Interpreters/AsynchronousInsertQueue.cpp @@ -1121,6 +1121,13 @@ Chunk AsynchronousInsertQueue::processPreprocessedEntries( "Expected entry with data kind Preprocessed. Got: {}", entry->chunk.getDataKind()); Block block_to_insert = *block; + if (block_to_insert.rows() == 0) + { + add_to_async_insert_log(entry, /*parsing_exception=*/ "", block_to_insert.rows(), block_to_insert.bytes()); + entry->resetChunk(); + continue; + } + if (!isCompatibleHeader(block_to_insert, header)) convertBlockToHeader(block_to_insert, header); diff --git a/tests/queries/0_stateless/03257_async_insert_native_empty_block.reference b/tests/queries/0_stateless/03257_async_insert_native_empty_block.reference new file mode 100644 index 00000000000..6df2a541bff --- /dev/null +++ b/tests/queries/0_stateless/03257_async_insert_native_empty_block.reference @@ -0,0 +1,9 @@ +1 name1 +2 name2 +3 +4 +5 +Ok Preprocessed 2 +Ok Preprocessed 3 +Ok Preprocessed 0 +Ok Preprocessed 0 diff --git a/tests/queries/0_stateless/03257_async_insert_native_empty_block.sh b/tests/queries/0_stateless/03257_async_insert_native_empty_block.sh new file mode 100755 index 00000000000..43a5472914d --- /dev/null +++ b/tests/queries/0_stateless/03257_async_insert_native_empty_block.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT --query " + DROP TABLE IF EXISTS json_square_brackets; + CREATE TABLE json_square_brackets (id UInt32, name String) ENGINE = MergeTree ORDER BY tuple() +" + +MY_CLICKHOUSE_CLIENT="$CLICKHOUSE_CLIENT --async_insert 1 --wait_for_async_insert 1" + +echo '[{"id": 1, "name": "name1"}, {"id": 2, "name": "name2"}]' | $MY_CLICKHOUSE_CLIENT -q "INSERT INTO json_square_brackets FORMAT JSONEachRow" + +echo '[{"id": 3}, {"id": 4}, {"id": 5}]' | $MY_CLICKHOUSE_CLIENT -q "INSERT INTO json_square_brackets FORMAT JSONEachRow" + +echo '[]' | $MY_CLICKHOUSE_CLIENT -q "INSERT INTO json_square_brackets FORMAT JSONEachRow" + +echo '' | $MY_CLICKHOUSE_CLIENT -q "INSERT INTO json_square_brackets FORMAT JSONEachRow" + +$CLICKHOUSE_CLIENT --query " + SYSTEM FLUSH LOGS; + SELECT * FROM json_square_brackets ORDER BY id; + SELECT status, data_kind, rows FROM system.asynchronous_insert_log WHERE database = currentDatabase() AND table = 'json_square_brackets' ORDER BY event_time_microseconds; + DROP TABLE json_square_brackets; +" From 2f0a8ecdcb0073f4b24a225b5a9608a31353e89e Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 31 Oct 2024 16:02:38 +0100 Subject: [PATCH 52/80] Expose one more simple merge selector setting --- src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp | 3 +++ src/Storages/MergeTree/MergeTreeSettings.cpp | 1 + 2 files changed, 4 insertions(+) diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 8b3c7bdf3fb..a39b8a7a40b 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -70,6 +70,7 @@ namespace MergeTreeSetting extern const MergeTreeSettingsBool ttl_only_drop_parts; extern const MergeTreeSettingsUInt64 parts_to_throw_insert; extern const MergeTreeSettingsMergeSelectorAlgorithm merge_selector_algorithm; + extern const MergeTreeSettingsBool merge_selector_enable_heuristic_to_remove_small_parts_at_right; } namespace ErrorCodes @@ -540,6 +541,8 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMergeFromRanges( /// Override value from table settings simple_merge_settings.window_size = (*data_settings)[MergeTreeSetting::merge_selector_window_size]; simple_merge_settings.max_parts_to_merge_at_once = (*data_settings)[MergeTreeSetting::max_parts_to_merge_at_once]; + simple_merge_settings.enable_heuristic_to_remove_small_parts_at_right = (*data_settings)[MergeTreeSettings::merge_selector_enable_heuristic_to_remove_small_parts_at_right]; + if (!(*data_settings)[MergeTreeSetting::min_age_to_force_merge_on_partition_only]) simple_merge_settings.min_age_to_force_merge = (*data_settings)[MergeTreeSetting::min_age_to_force_merge_seconds]; diff --git a/src/Storages/MergeTree/MergeTreeSettings.cpp b/src/Storages/MergeTree/MergeTreeSettings.cpp index 3d2c9c63598..5d7d9cb3c6b 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.cpp +++ b/src/Storages/MergeTree/MergeTreeSettings.cpp @@ -99,6 +99,7 @@ namespace ErrorCodes DECLARE(String, mutation_workload, "", "Name of workload to be used to access resources for mutations", 0) \ DECLARE(Milliseconds, background_task_preferred_step_execution_time_ms, 50, "Target time to execution of one step of merge or mutation. 
Can be exceeded if one step takes longer time", 0) \ DECLARE(MergeSelectorAlgorithm, merge_selector_algorithm, MergeSelectorAlgorithm::SIMPLE, "The algorithm to select parts for merges assignment", 0) \ + DECLARE(Bool, merge_selector_enable_heuristic_to_remove_small_parts_at_right, true, "Enable heuristic for selecting parts for merge which removes parts from right side of range, if their size is less than specified ratio (0.01) of sum_size. Works for Simple and StochasticSimple merge selectors", 0) \ \ /** Inserts settings. */ \ DECLARE(UInt64, parts_to_delay_insert, 1000, "If table contains at least that many active parts in single partition, artificially slow down insert into table. Disabled if set to 0", 0) \ From 53d4f2aacf722cbb2fbadabf2b7899fe1f9f6fc0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 31 Oct 2024 16:33:40 +0100 Subject: [PATCH 53/80] Followup --- src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index a39b8a7a40b..62ad9d4a52a 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -541,7 +541,7 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectPartsToMergeFromRanges( /// Override value from table settings simple_merge_settings.window_size = (*data_settings)[MergeTreeSetting::merge_selector_window_size]; simple_merge_settings.max_parts_to_merge_at_once = (*data_settings)[MergeTreeSetting::max_parts_to_merge_at_once]; - simple_merge_settings.enable_heuristic_to_remove_small_parts_at_right = (*data_settings)[MergeTreeSettings::merge_selector_enable_heuristic_to_remove_small_parts_at_right]; + simple_merge_settings.enable_heuristic_to_remove_small_parts_at_right = (*data_settings)[MergeTreeSetting::merge_selector_enable_heuristic_to_remove_small_parts_at_right]; if (!(*data_settings)[MergeTreeSetting::min_age_to_force_merge_on_partition_only]) simple_merge_settings.min_age_to_force_merge = (*data_settings)[MergeTreeSetting::min_age_to_force_merge_seconds]; From 4784c3f0a3e15d908148878270ba7695cadb22c8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 31 Oct 2024 17:12:43 +0100 Subject: [PATCH 54/80] Better style for some sever-level settings --- src/Core/ServerSettings.cpp | 7 +++++++ src/Interpreters/Context.cpp | 26 +++++++++++++++----------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/Core/ServerSettings.cpp b/src/Core/ServerSettings.cpp index ead40061493..637c3196f33 100644 --- a/src/Core/ServerSettings.cpp +++ b/src/Core/ServerSettings.cpp @@ -192,6 +192,13 @@ namespace DB DECLARE(UInt64, parts_killer_pool_size, 128, "Threads for cleanup of shared merge tree outdated threads. Only available in ClickHouse Cloud", 0) \ DECLARE(UInt64, keeper_multiread_batch_size, 10'000, "Maximum size of batch for MultiRead request to [Zoo]Keeper that support batching. If set to 0, batching is disabled. Available only in ClickHouse Cloud.", 0) \ DECLARE(Bool, use_legacy_mongodb_integration, true, "Use the legacy MongoDB integration implementation. Note: it's highly recommended to set this option to false, since legacy implementation will be removed in the future. 
Please submit any issues you encounter with the new implementation.", 0) \
+    \
+    DECLARE(UInt64, prefetch_threadpool_pool_size, 100, "Size of background pool for prefetches for remote object storages", 0) \
+    DECLARE(UInt64, prefetch_threadpool_queue_size, 1000000, "Number of tasks that can be pushed into the prefetch pool", 0) \
+    DECLARE(UInt64, load_marks_threadpool_pool_size, 50, "Size of background pool for marks loading", 0) \
+    DECLARE(UInt64, load_marks_threadpool_queue_size, 1000000, "Number of tasks that can be pushed into the marks loading pool", 0) \
+    DECLARE(UInt64, threadpool_writer_pool_size, 100, "Size of background pool for write requests to object storages", 0) \
+    DECLARE(UInt64, threadpool_writer_queue_size, 1000000, "Number of tasks that can be pushed into the background pool for write requests to object storages", 0)
 
 
 /// If you add a setting which can be updated at runtime, please update 'changeable_settings' map in dumpToSystemServerSettingsColumns below
diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp
index fbf0cbd0eb7..4f82ed7b046 100644
--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@@ -273,6 +273,13 @@ namespace ServerSetting
     extern const ServerSettingsUInt64 max_replicated_sends_network_bandwidth_for_server;
     extern const ServerSettingsUInt64 tables_loader_background_pool_size;
     extern const ServerSettingsUInt64 tables_loader_foreground_pool_size;
+    extern const ServerSettingsUInt64 prefetch_threadpool_pool_size;
+    extern const ServerSettingsUInt64 prefetch_threadpool_queue_size;
+    extern const ServerSettingsUInt64 load_marks_threadpool_pool_size;
+    extern const ServerSettingsUInt64 load_marks_threadpool_queue_size;
+    extern const ServerSettingsUInt64 threadpool_writer_pool_size;
+    extern const ServerSettingsUInt64 threadpool_writer_queue_size;
+
 }
 
 namespace ErrorCodes
@@ -3215,9 +3222,8 @@ void Context::clearMarkCache() const
 ThreadPool & Context::getLoadMarksThreadpool() const
 {
     callOnce(shared->load_marks_threadpool_initialized, [&] {
-        const auto & config = getConfigRef();
-        auto pool_size = config.getUInt(".load_marks_threadpool_pool_size", 50);
-        auto queue_size = config.getUInt(".load_marks_threadpool_queue_size", 1000000);
+        auto pool_size = shared->server_settings[ServerSetting::load_marks_threadpool_pool_size];
+        auto queue_size = shared->server_settings[ServerSetting::load_marks_threadpool_queue_size];
         shared->load_marks_threadpool = std::make_unique<ThreadPool>(
             CurrentMetrics::MarksLoaderThreads, CurrentMetrics::MarksLoaderThreadsActive, CurrentMetrics::MarksLoaderThreadsScheduled, pool_size, pool_size, queue_size);
     });
@@ -3410,9 +3416,9 @@ AsynchronousMetrics * Context::getAsynchronousMetrics() const
 ThreadPool & Context::getPrefetchThreadpool() const
 {
     callOnce(shared->prefetch_threadpool_initialized, [&] {
-        const auto & config = getConfigRef();
-        auto pool_size = config.getUInt(".prefetch_threadpool_pool_size", 100);
-        auto queue_size = config.getUInt(".prefetch_threadpool_queue_size", 1000000);
+        auto pool_size = shared->server_settings[ServerSetting::prefetch_threadpool_pool_size];
+        auto queue_size = shared->server_settings[ServerSetting::prefetch_threadpool_queue_size];
+
         shared->prefetch_threadpool = std::make_unique<ThreadPool>(
             CurrentMetrics::IOPrefetchThreads, CurrentMetrics::IOPrefetchThreadsActive, CurrentMetrics::IOPrefetchThreadsScheduled, pool_size, pool_size, queue_size);
     });
@@ -3422,8 +3428,7 @@ ThreadPool & Context::getPrefetchThreadpool() const
 
 size_t Context::getPrefetchThreadpoolSize() const
 {
-    const auto & config = getConfigRef();
-    return config.getUInt(".prefetch_threadpool_pool_size", 100);
+    return shared->server_settings[ServerSetting::prefetch_threadpool_pool_size];
 }
 
 ThreadPool & Context::getBuildVectorSimilarityIndexThreadPool() const
@@ -5696,9 +5701,8 @@ IOUringReader & Context::getIOUringReader() const
 ThreadPool & Context::getThreadPoolWriter() const
 {
     callOnce(shared->threadpool_writer_initialized, [&] {
-        const auto & config = getConfigRef();
-        auto pool_size = config.getUInt(".threadpool_writer_pool_size", 100);
-        auto queue_size = config.getUInt(".threadpool_writer_queue_size", 1000000);
+        auto pool_size = shared->server_settings[ServerSetting::threadpool_writer_pool_size];
+        auto queue_size = shared->server_settings[ServerSetting::threadpool_writer_queue_size];
 
         shared->threadpool_writer = std::make_unique<ThreadPool>(
             CurrentMetrics::IOWriterThreads, CurrentMetrics::IOWriterThreadsActive, CurrentMetrics::IOWriterThreadsScheduled, pool_size, pool_size, queue_size);

From b16a18ed66e8d93e32b2f5956614f1c0a23b40e4 Mon Sep 17 00:00:00 2001
From: Vitaly Baranov
Date: Thu, 31 Oct 2024 15:02:18 +0100
Subject: [PATCH 55/80] Add test for mixed versions on hosts doing backup or
 restore.

---
 src/Backups/BackupCoordinationStageSync.cpp | 49 +++++--
 src/Backups/BackupCoordinationStageSync.h | 5 +-
 .../configs/cluster_different_versions.xml | 16 +++
 .../test_different_versions.py | 125 ++++++++++++++++++
 4 files changed, 186 insertions(+), 9 deletions(-)
 create mode 100644 tests/integration/test_backup_restore_on_cluster/configs/cluster_different_versions.xml
 create mode 100644 tests/integration/test_backup_restore_on_cluster/test_different_versions.py

diff --git a/src/Backups/BackupCoordinationStageSync.cpp b/src/Backups/BackupCoordinationStageSync.cpp
index 1642cab70c7..9a05f9490c2 100644
--- a/src/Backups/BackupCoordinationStageSync.cpp
+++ b/src/Backups/BackupCoordinationStageSync.cpp
@@ -27,9 +27,24 @@ namespace
 {
     /// The coordination version is stored in the 'start' node for each host
     /// by each host when it starts working on this backup or restore.
-    /// The initial version didn't use nodes 'finish*' and 'num_hosts'.
-    constexpr const int kInitialVersion = 1;
-    constexpr const int kCurrentVersion = 2;
+    enum Version
+    {
+        kInitialVersion = 1,
+
+        /// This old version didn't create the 'finish' node; it used the "completed" stage to tell other hosts that the work was done.
+        /// If an error happened, this old version didn't change any nodes to tell other hosts that the error handling was done.
+        /// So while this old version was in use, hosts couldn't know when the other hosts were done with the error handling,
+        /// and that situation caused spurious errors in the logs.
+        /// Also this old version didn't create the 'start' node for the initiator.
+        kVersionWithoutFinishNode = 1,
+
+        /// Now we create the 'finish' node both when the work is done and when the error handling is done.
+
+        kCurrentVersion = 2,
+    };
+
+    /// Empty string as the current host is used to mark the initiator of a BACKUP ON CLUSTER or RESTORE ON CLUSTER query.
+    const constexpr std::string_view kInitiator;
 }
 
 bool BackupCoordinationStageSync::HostInfo::operator ==(const HostInfo & other) const
@@ -547,11 +562,9 @@ void BackupCoordinationStageSync::readCurrentState(Coordination::ZooKeeperWithFa
                 String result = zookeeper->get(fs::path{zookeeper_path} / zk_node);
                 host_info->stages[stage] = std::move(result);
 
-                /// The initial version didn't create the 'finish' ZooKeeper nodes so
-                /// we consider that if the "completed" stage is reached by a host then the host has finished its work.
-                /// This assumption is not correct if an error happens, but the initial version can't handle errors quite
-                /// correctly anyway.
-                if ((host_info->version == kInitialVersion) && (stage == BackupCoordinationStage::COMPLETED))
+                /// That old version didn't create the 'finish' node so we consider that a host finished its work
+                /// if it reached the "completed" stage.
+                if ((host_info->version == kVersionWithoutFinishNode) && (stage == BackupCoordinationStage::COMPLETED))
                     host_info->finished = true;
             }
         }
@@ -933,6 +946,15 @@ void BackupCoordinationStageSync::createFinishNodeAndRemoveAliveNode(Coordinatio
     if (zookeeper->exists(finish_node_path))
         return;
 
+    /// If the initiator of the query has that old version, it doesn't expect us to create the 'finish' node; moreover,
+    /// the initiator can start removing all the nodes immediately after all hosts report reaching the "completed" status.
+    /// So to avoid spurious errors in the logs we won't create the 'finish' node if the initiator of the query has that old version.
+    if ((getInitiatorVersion() == kVersionWithoutFinishNode) && (current_host != kInitiator))
+    {
+        LOG_INFO(log, "Skipped creating the 'finish' node because the initiator uses outdated version {}", getInitiatorVersion());
+        return;
+    }
+
     std::optional<size_t> num_hosts;
     int num_hosts_version = -1;
 
@@ -1001,6 +1023,17 @@ void BackupCoordinationStageSync::createFinishNodeAndRemoveAliveNode(Coordinatio
 }
 
 
+int BackupCoordinationStageSync::getInitiatorVersion() const
+{
+    std::lock_guard lock{mutex};
+    auto it = state.hosts.find(String{kInitiator});
+    if (it == state.hosts.end())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no initiator of this {} query, it's a bug", operation_name);
+    const HostInfo & host_info = it->second;
+    return host_info.version;
+}
+
+
 void BackupCoordinationStageSync::waitForOtherHostsToFinish() const
 {
     tryWaitForOtherHostsToFinishImpl(/* reason = */ "", /* throw_if_error = */ true, /* timeout = */ {});
diff --git a/src/Backups/BackupCoordinationStageSync.h b/src/Backups/BackupCoordinationStageSync.h
index 32f660af997..dc0d3c3c83d 100644
--- a/src/Backups/BackupCoordinationStageSync.h
+++ b/src/Backups/BackupCoordinationStageSync.h
@@ -109,6 +109,9 @@ private:
     bool tryFinishImpl(bool & other_hosts_also_finished, bool throw_if_error, WithRetries::Kind retries_kind);
     void createFinishNodeAndRemoveAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
 
+    /// Returns the version used by the initiator.
+    int getInitiatorVersion() const;
+
     /// Waits until all the other hosts finish their work.
    bool tryWaitForOtherHostsToFinishImpl(const String & reason, bool throw_if_error, std::optional<std::chrono::seconds> timeout) const;
     bool checkIfOtherHostsFinish(const String & reason, bool throw_if_error, bool time_is_out, std::optional<std::chrono::seconds> timeout) const TSA_REQUIRES(mutex);
 
@@ -157,7 +160,7 @@
         bool started = false;
         bool connected = false;
         bool finished = false;
-        int version = 0;
+        int version = 1;
         std::map<String, String> stages = {}; /// std::map because we need to compare states
         std::exception_ptr exception = nullptr;
diff --git a/tests/integration/test_backup_restore_on_cluster/configs/cluster_different_versions.xml b/tests/integration/test_backup_restore_on_cluster/configs/cluster_different_versions.xml
new file mode 100644
index 00000000000..f70b255da18
--- /dev/null
+++ b/tests/integration/test_backup_restore_on_cluster/configs/cluster_different_versions.xml
@@ -0,0 +1,16 @@
+<clickhouse>
+    <remote_servers>
+        <cluster_ver>
+            <shard>
+                <replica>
+                    <host>new_node</host>
+                    <port>9000</port>
+                </replica>
+                <replica>
+                    <host>old_node</host>
+                    <port>9000</port>
+                </replica>
+            </shard>
+        </cluster_ver>
+    </remote_servers>
+</clickhouse>
diff --git a/tests/integration/test_backup_restore_on_cluster/test_different_versions.py b/tests/integration/test_backup_restore_on_cluster/test_different_versions.py
new file mode 100644
index 00000000000..b5eea7a1902
--- /dev/null
+++ b/tests/integration/test_backup_restore_on_cluster/test_different_versions.py
@@ -0,0 +1,125 @@
+import random
+
+import pytest
+
+from helpers.cluster import ClickHouseCluster
+from helpers.test_tools import TSV
+
+cluster = ClickHouseCluster(__file__)
+
+main_configs = [
+    "configs/backups_disk.xml",
+    "configs/cluster_different_versions.xml",
+]
+
+user_configs = []
+
+new_node = cluster.add_instance(
+    "new_node",
+    main_configs=main_configs,
+    user_configs=user_configs,
+    external_dirs=["/backups/"],
+    macros={"replica": "new_node", "shard": "shard1"},
+    with_zookeeper=True,
+)
+
+old_node = cluster.add_instance(
+    "old_node",
+    image="clickhouse/clickhouse-server",
+    tag="24.9.2.42",
+    with_installed_binary=True,
+    main_configs=main_configs,
+    user_configs=user_configs,
+    external_dirs=["/backups/"],
+    macros={"replica": "old_node", "shard": "shard1"},
+    with_zookeeper=True,
+)
+
+nodes = [new_node, old_node]
+
+
+@pytest.fixture(scope="module", autouse=True)
+def start_cluster():
+    try:
+        cluster.start()
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+
+@pytest.fixture(autouse=True)
+def cleanup_after_test():
+    try:
+        yield
+    finally:
+        new_node.query("DROP TABLE IF EXISTS tbl ON CLUSTER 'cluster_ver' SYNC")
+
+
+backup_id_counter = 0
+
+
+def new_backup_name():
+    global backup_id_counter
+    backup_id_counter += 1
+    return f"Disk('backups', '{backup_id_counter}')"
+
+
+# Gets a printable version of the name of a node.
+def get_node_name(node):
+    return "new_node" if (node == new_node) else "old_node"
+
+
+# Choose a random instance.
+def random_node():
+    return random.choice(nodes)
+
+
+def test_different_versions():
+    new_node.query(
+        "CREATE TABLE tbl"
+        " ON CLUSTER 'cluster_ver'"
+        " (x UInt64) ENGINE=ReplicatedMergeTree('/clickhouse/tables/tbl/', '{replica}')"
+        " ORDER BY tuple()"
+    )
+
+    new_node.query("INSERT INTO tbl VALUES (1)")
+    old_node.query("INSERT INTO tbl VALUES (2)")
+
+    backup_name = new_backup_name()
+
+    initiator = random_node()
+    print(f"Using {get_node_name(initiator)} as initiator for BACKUP")
+    initiator.query(f"BACKUP TABLE tbl ON CLUSTER 'cluster_ver' TO {backup_name}")
+
+    new_node.query("DROP TABLE tbl ON CLUSTER 'cluster_ver' SYNC")
+
+    initiator = random_node()
+    print(f"Using {get_node_name(initiator)} as initiator for RESTORE")
+    initiator.query(f"RESTORE TABLE tbl ON CLUSTER 'cluster_ver' FROM {backup_name}")
+
+    new_node.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster_ver' tbl")
+    assert new_node.query("SELECT * FROM tbl ORDER BY x") == TSV([1, 2])
+    assert old_node.query("SELECT * FROM tbl ORDER BY x") == TSV([1, 2])
+
+    # Error NO_ELEMENTS_IN_CONFIG is unrelated.
+    assert (
+        new_node.query(
+            "SELECT name, last_error_message FROM system.errors WHERE NOT ("
+            "(name == 'NO_ELEMENTS_IN_CONFIG')"
+            ")"
+        )
+        == ""
+    )
+
+    # Error FAILED_TO_SYNC_BACKUP_OR_RESTORE: "No connection to host new_node:9000 yet, will retry" is generated by the old version
+    # when it fails to connect to another host because that other host hasn't started yet.
+    # This is not actually an error, just an exception thrown and caught. The new version doesn't throw this exception.
+    assert (
+        old_node.query(
+            "SELECT name, last_error_message FROM system.errors WHERE NOT ("
+            "(name == 'NO_ELEMENTS_IN_CONFIG') OR"
+            "((name == 'FAILED_TO_SYNC_BACKUP_OR_RESTORE') AND (last_error_message == 'No connection to host new_node:9000 yet, will retry'))"
+            ")"
+        )
+        == ""
+    )

From e5be813de559b197e020d4a474fb0bed5d0a2637 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Thu, 31 Oct 2024 18:50:43 +0100
Subject: [PATCH 56/80] Sync

---
 src/Core/Settings.cpp | 3 +++
 src/Core/SettingsChangesHistory.cpp | 1 +
 2 files changed, 4 insertions(+)

diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp
index 3b63d1231af..7ed24bb85fd 100644
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@@ -5111,6 +5111,9 @@ Only in ClickHouse Cloud. A maximum number of unacknowledged in-flight packets i
 )", 0) \
     DECLARE(UInt64, distributed_cache_data_packet_ack_window, DistributedCache::ACK_DATA_PACKET_WINDOW, R"(
 Only in ClickHouse Cloud. A window for sending ACK for DataPacket sequence in a single distributed cache read request
+)", 0) \
+    DECLARE(Bool, distributed_cache_discard_connection_if_unread_data, true, R"(
+Only in ClickHouse Cloud. Discard connection if some data is unread.
)", 0) \ \ DECLARE(Bool, parallelize_output_from_storages, true, R"( diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 3fe3e960dc6..7ea388f18dd 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -64,6 +64,7 @@ static std::initializer_list Date: Thu, 31 Oct 2024 19:46:35 +0100 Subject: [PATCH 57/80] add requirements and fix warning --- docker/test/style/Dockerfile | 2 +- docker/test/style/requirements.txt | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index fa6b087eb7d..564301f447c 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -28,7 +28,7 @@ COPY requirements.txt / RUN pip3 install --no-cache-dir -r requirements.txt RUN echo "en_US.UTF-8 UTF-8" > /etc/locale.gen && locale-gen en_US.UTF-8 -ENV LC_ALL en_US.UTF-8 +ENV LC_ALL=en_US.UTF-8 # Architecture of the image when BuildKit/buildx is used ARG TARGETARCH diff --git a/docker/test/style/requirements.txt b/docker/test/style/requirements.txt index cc87f6e548d..aab20b5bee0 100644 --- a/docker/test/style/requirements.txt +++ b/docker/test/style/requirements.txt @@ -12,6 +12,7 @@ charset-normalizer==3.3.2 click==8.1.7 codespell==2.2.1 cryptography==43.0.1 +datacompy==0.7.3 Deprecated==1.2.14 dill==0.3.8 flake8==4.0.1 @@ -23,6 +24,7 @@ mccabe==0.6.1 multidict==6.0.5 mypy==1.8.0 mypy-extensions==1.0.0 +pandas==2.2.3 packaging==24.1 pathspec==0.9.0 pip==24.1.1 From d8fd18c38e28c6a21a083610b252411a5d2dba26 Mon Sep 17 00:00:00 2001 From: serxa Date: Thu, 31 Oct 2024 19:06:51 +0000 Subject: [PATCH 58/80] Fix test: add more retries --- .../Scheduler/Workload/WorkloadEntityStorageBase.cpp | 8 ++++---- tests/integration/test_scheduler/test.py | 6 ++++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Common/Scheduler/Workload/WorkloadEntityStorageBase.cpp b/src/Common/Scheduler/Workload/WorkloadEntityStorageBase.cpp index 1b7a559698c..968dfd90796 100644 --- a/src/Common/Scheduler/Workload/WorkloadEntityStorageBase.cpp +++ b/src/Common/Scheduler/Workload/WorkloadEntityStorageBase.cpp @@ -578,15 +578,15 @@ void WorkloadEntityStorageBase::setAllEntities(const std::vectorsecond)) { changes.emplace_back(entity_name, entity, it->second); // Update entities that are present in both `new_entities` and `entities` - LOG_TRACE(log, "Entity {} was updated", entity_name); + LOG_TRACE(log, "Workload entity {} was updated", entity_name); } else - LOG_TRACE(log, "Entity {} is the same", entity_name); + LOG_TRACE(log, "Workload entity {} is the same", entity_name); } else { changes.emplace_back(entity_name, entity, ASTPtr{}); // Remove entities that are not present in `new_entities` - LOG_TRACE(log, "Entity {} was dropped", entity_name); + LOG_TRACE(log, "Workload entity {} was dropped", entity_name); } } for (const auto & [entity_name, entity] : new_entities) @@ -594,7 +594,7 @@ void WorkloadEntityStorageBase::setAllEntities(const std::vector Date: Thu, 31 Oct 2024 22:25:04 -0400 Subject: [PATCH 59/80] Fix inconsistent AST formatting when granting wrong wildcard grants --- docs/en/sql-reference/statements/grant.md | 1 + src/Access/Common/AccessRightsElement.cpp | 2 -- src/Parsers/Access/ParserGrantQuery.cpp | 3 +++ tests/queries/0_stateless/03141_wildcard_grants.sql | 2 ++ 4 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index 19305675ec8..d00d70ab578 100644 
--- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -117,6 +117,7 @@ GRANT SELECT ON db*.* TO john -- correct GRANT SELECT ON *.my_table TO john -- wrong GRANT SELECT ON foo*bar TO john -- wrong GRANT SELECT ON *suffix TO john -- wrong +GRANT SELECT(foo) ON db.table* TO john -- wrong ``` ## Privileges diff --git a/src/Access/Common/AccessRightsElement.cpp b/src/Access/Common/AccessRightsElement.cpp index 3a78420f411..3a02047e2b4 100644 --- a/src/Access/Common/AccessRightsElement.cpp +++ b/src/Access/Common/AccessRightsElement.cpp @@ -127,8 +127,6 @@ void AccessRightsElement::formatColumnNames(WriteBuffer & buffer) const if (std::exchange(need_comma, true)) buffer << ", "; buffer << backQuoteIfNeed(column); - if (wildcard) - buffer << "*"; } buffer << ")"; } diff --git a/src/Parsers/Access/ParserGrantQuery.cpp b/src/Parsers/Access/ParserGrantQuery.cpp index e29cf11273b..4a0d24559a3 100644 --- a/src/Parsers/Access/ParserGrantQuery.cpp +++ b/src/Parsers/Access/ParserGrantQuery.cpp @@ -155,6 +155,9 @@ namespace for (auto & [access_flags, columns] : access_and_columns) { + if (wildcard && !columns.empty()) + return false; + AccessRightsElement element; element.access_flags = access_flags; element.columns = std::move(columns); diff --git a/tests/queries/0_stateless/03141_wildcard_grants.sql b/tests/queries/0_stateless/03141_wildcard_grants.sql index 45962d9b929..e71fa531134 100644 --- a/tests/queries/0_stateless/03141_wildcard_grants.sql +++ b/tests/queries/0_stateless/03141_wildcard_grants.sql @@ -19,4 +19,6 @@ REVOKE SELECT ON team*.* FROM user_03141; SHOW GRANTS FOR user_03141; SELECT '---'; +GRANT SELECT(bar) ON foo.test* TO user_03141; -- { clientError SYNTAX_ERROR } + DROP USER user_03141; From 06a23c0a792642d04b5fcacb7b3e06d85ddd298e Mon Sep 17 00:00:00 2001 From: Chang Chen Date: Fri, 1 Nov 2024 11:42:22 +0800 Subject: [PATCH 60/80] fix debug build --- src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h b/src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h index 84923c49c62..d486850a9db 100644 --- a/src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h +++ b/src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h @@ -494,7 +494,7 @@ public: nodes.push_back(impl.branch.queue); for (auto & [_, branch] : impl.branch.branch.branches) { - for (auto & [_, child] : branch.children) + for (auto & [_1, child] : branch.children) child->addRawPointerNodes(nodes); } } From 7bd984ceea3a0f366dd62d2407a910a17690be09 Mon Sep 17 00:00:00 2001 From: pufit Date: Thu, 31 Oct 2024 23:52:09 -0400 Subject: [PATCH 61/80] fix tests --- src/Access/Common/AccessRightsElement.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Access/Common/AccessRightsElement.cpp b/src/Access/Common/AccessRightsElement.cpp index 3a02047e2b4..3a78420f411 100644 --- a/src/Access/Common/AccessRightsElement.cpp +++ b/src/Access/Common/AccessRightsElement.cpp @@ -127,6 +127,8 @@ void AccessRightsElement::formatColumnNames(WriteBuffer & buffer) const if (std::exchange(need_comma, true)) buffer << ", "; buffer << backQuoteIfNeed(column); + if (wildcard) + buffer << "*"; } buffer << ")"; } From 4b04604f5bed39613b0c26da1199caa9eaa5ae89 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 1 Nov 2024 08:07:32 +0000 Subject: [PATCH 62/80] Bump USearch to 2.16.0 and add more tests --- contrib/SimSIMD | 2 +- contrib/usearch | 2 +- .../02354_vector_search_queries.reference | 102 
+++++++++++- .../02354_vector_search_queries.sql | 145 ++++++++++++++---- 4 files changed, 217 insertions(+), 34 deletions(-) diff --git a/contrib/SimSIMD b/contrib/SimSIMD index ff51434d90c..935fef2964b 160000 --- a/contrib/SimSIMD +++ b/contrib/SimSIMD @@ -1 +1 @@ -Subproject commit ff51434d90c66f916e94ff05b24530b127aa4cff +Subproject commit 935fef2964bc38e995c5f465b42259a35b8cf0d3 diff --git a/contrib/usearch b/contrib/usearch index 1706420acaf..53799b84ca9 160000 --- a/contrib/usearch +++ b/contrib/usearch @@ -1 +1 @@ -Subproject commit 1706420acafbd83d852c512dcf343af0a4059e48 +Subproject commit 53799b84ca9ad708b060d0b1cfa5f039371721cd diff --git a/tests/queries/0_stateless/02354_vector_search_queries.reference b/tests/queries/0_stateless/02354_vector_search_queries.reference index 223a18b57bf..cf80f46f53c 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.reference +++ b/tests/queries/0_stateless/02354_vector_search_queries.reference @@ -67,7 +67,7 @@ Expression (Projection) Condition: true Parts: 1/1 Granules: 4/4 --- Non-default quantization +-- Test all distance metrics x all quantization 1 [2,3.2] 2.3323807824711897 4 [2.4,5.2] 3.9999999046325727 2 [4.2,3.4] 4.427188573446585 @@ -75,7 +75,7 @@ Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_f64) + ReadFromMergeTree (default.tab_l2_f64) Indexes: PrimaryKey Condition: true @@ -93,7 +93,7 @@ Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_f32) + ReadFromMergeTree (default.tab_l2_f32) Indexes: PrimaryKey Condition: true @@ -111,7 +111,7 @@ Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_f16) + ReadFromMergeTree (default.tab_l2_f16) Indexes: PrimaryKey Condition: true @@ -129,7 +129,7 @@ Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_bf16) + ReadFromMergeTree (default.tab_l2_bf16) Indexes: PrimaryKey Condition: true @@ -147,7 +147,97 @@ Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) Expression (Before ORDER BY) - ReadFromMergeTree (default.tab_i8) + ReadFromMergeTree (default.tab_l2_i8) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: idx + Description: vector_similarity GRANULARITY 2 + Parts: 1/1 + Granules: 3/4 +6 [1,9.3] 0.005731362878640178 +4 [2.4,5.2] 0.09204062768384846 +1 [2,3.2] 0.15200169244542905 +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab_cos_f64) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: idx + Description: vector_similarity GRANULARITY 2 + Parts: 1/1 + Granules: 3/4 +6 [1,9.3] 0.005731362878640178 +4 [2.4,5.2] 0.09204062768384846 +1 [2,3.2] 0.15200169244542905 +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab_cos_f32) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: idx + Description: vector_similarity GRANULARITY 2 + Parts: 1/1 + Granules: 3/4 +6 [1,9.3] 
0.005731362878640178 +4 [2.4,5.2] 0.09204062768384846 +1 [2,3.2] 0.15200169244542905 +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab_cos_f16) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: idx + Description: vector_similarity GRANULARITY 2 + Parts: 1/1 + Granules: 3/4 +6 [1,9.3] 0.005731362878640178 +4 [2.4,5.2] 0.09204062768384846 +1 [2,3.2] 0.15200169244542905 +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab_cos_bf16) + Indexes: + PrimaryKey + Condition: true + Parts: 1/1 + Granules: 4/4 + Skip + Name: idx + Description: vector_similarity GRANULARITY 2 + Parts: 1/1 + Granules: 3/4 +6 [1,9.3] 0.005731362878640178 +4 [2.4,5.2] 0.09204062768384846 +1 [2,3.2] 0.15200169244542905 +Expression (Projection) + Limit (preliminary LIMIT (without OFFSET)) + Sorting (Sorting for ORDER BY) + Expression (Before ORDER BY) + ReadFromMergeTree (default.tab_cos_i8) Indexes: PrimaryKey Condition: true diff --git a/tests/queries/0_stateless/02354_vector_search_queries.sql b/tests/queries/0_stateless/02354_vector_search_queries.sql index 71b8a1e520a..0941f9a43d6 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.sql +++ b/tests/queries/0_stateless/02354_vector_search_queries.sql @@ -81,88 +81,181 @@ SETTINGS max_limit_for_ann_queries = 2; -- LIMIT 3 > 2 --> don't use the ann ind DROP TABLE tab; -SELECT '-- Non-default quantization'; -CREATE TABLE tab_f64(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f64', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -CREATE TABLE tab_f32(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -CREATE TABLE tab_f16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -CREATE TABLE tab_bf16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'bf16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -CREATE TABLE tab_i8(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; -INSERT INTO tab_f64 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); -INSERT INTO tab_f32 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); -INSERT INTO tab_f16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); -INSERT INTO tab_bf16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 
3.4]), (11, [6.4, 3.2]); -INSERT INTO tab_i8 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +SELECT '-- Test all distance metrics x all quantization'; + +DROP TABLE IF EXISTS tab_l2_f64; +DROP TABLE IF EXISTS tab_l2_f32; +DROP TABLE IF EXISTS tab_l2_f16; +DROP TABLE IF EXISTS tab_l2_bf16; +DROP TABLE IF EXISTS tab_l2_i8; +DROP TABLE IF EXISTS tab_cos_f64; +DROP TABLE IF EXISTS tab_cos_f32; +DROP TABLE IF EXISTS tab_cos_f16; +DROP TABLE IF EXISTS tab_cos_bf16; +DROP TABLE IF EXISTS tab_cos_i8; + +CREATE TABLE tab_l2_f64(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f64', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_l2_f32(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_l2_f16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_l2_bf16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'bf16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_l2_i8(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'i8', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_cos_f64(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f64', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_cos_f32(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_cos_f16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_cos_bf16(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'bf16', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab_cos_i8(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'i8', 0, 0) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; + +INSERT INTO tab_l2_f64 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_l2_f32 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_l2_f16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_l2_bf16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), 
(4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_l2_i8 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_cos_f64 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_cos_f32 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_cos_f16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_cos_bf16 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); +INSERT INTO tab_cos_i8 VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_f64 +FROM tab_l2_f64 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_f64 +FROM tab_l2_f64 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_f32 +FROM tab_l2_f32 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_f32 +FROM tab_l2_f32 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_f16 +FROM tab_l2_f16 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_f16 +FROM tab_l2_f16 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_bf16 +FROM tab_l2_bf16 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_bf16 +FROM tab_l2_bf16 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_i8 +FROM tab_l2_i8 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) -FROM tab_i8 +FROM tab_l2_i8 ORDER BY L2Distance(vec, reference_vec) LIMIT 3; -DROP TABLE tab_f64; -DROP TABLE tab_f32; -DROP TABLE tab_f16; -DROP TABLE tab_bf16; -DROP TABLE tab_i8; +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_f64 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +EXPLAIN indexes = 1 
+WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_f64 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_f32 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +EXPLAIN indexes = 1 +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_f32 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_f16 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +EXPLAIN indexes = 1 +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_f16 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_bf16 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +EXPLAIN indexes = 1 +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_bf16 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_i8 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +EXPLAIN indexes = 1 +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, cosineDistance(vec, reference_vec) +FROM tab_cos_i8 +ORDER BY cosineDistance(vec, reference_vec) +LIMIT 3; + +DROP TABLE tab_l2_f64; +DROP TABLE tab_l2_f32; +DROP TABLE tab_l2_f16; +DROP TABLE tab_l2_bf16; +DROP TABLE tab_l2_i8; +DROP TABLE tab_cos_f64; +DROP TABLE tab_cos_f32; +DROP TABLE tab_cos_f16; +DROP TABLE tab_cos_bf16; +DROP TABLE tab_cos_i8; SELECT '-- Index on Array(Float64) column'; CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; From c6184440c4b036809de98d76efd3e177d4d8483e Mon Sep 17 00:00:00 2001 From: Vladimir Cherkasov Date: Fri, 1 Nov 2024 10:39:14 +0100 Subject: [PATCH 63/80] check-doc-aspell: Print full path to script in CI report --- utils/check-style/check-doc-aspell | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/check-style/check-doc-aspell b/utils/check-style/check-doc-aspell index b5a3958e6cf..0406b337575 100755 --- a/utils/check-style/check-doc-aspell +++ b/utils/check-style/check-doc-aspell @@ -53,7 +53,7 @@ done if (( STATUS != 0 )); then echo "====== Errors found ======" echo "To exclude some words add them to the dictionary file \"${ASPELL_IGNORE_PATH}/aspell-dict.txt\"" - echo "You can also run ${0} -i to see the errors interactively and fix them or add to the dictionary file" + echo "You can also run '$(realpath --relative-base=${ROOT_PATH} ${0}) -i' to see the errors interactively and fix them or add to the dictionary file" fi exit ${STATUS} From e4aa477c42e7a05e7de20187496e1b266b5b3187 Mon Sep 17 00:00:00 2001 From: Michael Stetsyuk Date: Thu, 31 Oct 2024 11:29:08 +0000 Subject: [PATCH 64/80] make integration tests that use hardcoded ip addresses sequential --- tests/integration/parallel_skip.json | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/integration/parallel_skip.json b/tests/integration/parallel_skip.json index 507894534d4..d293cae4dfd 100644 --- a/tests/integration/parallel_skip.json +++ b/tests/integration/parallel_skip.json @@ -170,6 +170,18 @@ 
"test_storage_kerberized_kafka/test.py::test_kafka_json_as_string", "test_storage_kerberized_kafka/test.py::test_kafka_json_as_string_request_new_ticket_after_expiration", "test_storage_kerberized_kafka/test.py::test_kafka_json_as_string_no_kdc", - "test_storage_kerberized_kafka/test.py::test_kafka_config_from_sql_named_collection" + "test_storage_kerberized_kafka/test.py::test_kafka_config_from_sql_named_collection", + "test_dns_cache/test.py::test_ip_change_drop_dns_cache", + "test_dns_cache/test.py::test_ip_change_update_dns_cache", + "test_dns_cache/test.py::test_dns_cache_update", + "test_dns_cache/test.py::test_user_access_ip_change", + "test_dns_cache/test.py::test_host_is_drop_from_cache_after_consecutive_failures", + "test_dns_cache/test.py::test_dns_resolver_filter", + + "test_https_replication/test_change_ip.py::test_replication_when_node_ip_changed", + + "test_host_regexp_multiple_ptr_records/test.py::test_host_regexp_multiple_ptr_v4_fails_with_wrong_resolution", + "test_host_regexp_multiple_ptr_records/test.py::test_host_regexp_multiple_ptr_v4", + "test_host_regexp_multiple_ptr_records/test.py::test_host_regexp_multiple_ptr_v6" ] From fad6e8869182dad498a090e4ec442b949d619acc Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 15 Oct 2024 12:38:12 +0000 Subject: [PATCH 65/80] Bump --- contrib/arrow | 2 +- contrib/arrow-cmake/CMakeLists.txt | 1 - contrib/flatbuffers | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/contrib/arrow b/contrib/arrow index 5cfccd8ea65..3264fdad38b 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 5cfccd8ea65f33d4517e7409815d761c7650b45d +Subproject commit 3264fdad38b2a1628f296cd574a9dd03f4928aea diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index 96d1f4adda7..b1c5154a0fe 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -390,7 +390,6 @@ set(ARROW_SRCS "${LIBRARY_DIR}/compute/kernels/vector_selection_internal.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection_filter_internal.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection_take_internal.cc" - "${LIBRARY_DIR}/compute/light_array.cc" "${LIBRARY_DIR}/compute/registry.cc" "${LIBRARY_DIR}/compute/expression.cc" "${LIBRARY_DIR}/compute/ordering.cc" diff --git a/contrib/flatbuffers b/contrib/flatbuffers index eb3f8279482..0100f6a5779 160000 --- a/contrib/flatbuffers +++ b/contrib/flatbuffers @@ -1 +1 @@ -Subproject commit eb3f827948241ce0e701516f16cd67324802bce9 +Subproject commit 0100f6a5779831fa7a651e4b67ef389a8752bd9b From c6f4ae696be83ea40aeb83f99e6f303051be0158 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 15 Oct 2024 13:33:08 +0000 Subject: [PATCH 66/80] Sort lines in CMake --- contrib/arrow-cmake/CMakeLists.txt | 218 ++++++++++++++--------------- 1 file changed, 108 insertions(+), 110 deletions(-) diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index b1c5154a0fe..a35a9b80cb9 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -220,6 +220,7 @@ set(ARROW_SRCS "${LIBRARY_DIR}/array/array_dict.cc" "${LIBRARY_DIR}/array/array_nested.cc" "${LIBRARY_DIR}/array/array_primitive.cc" + "${LIBRARY_DIR}/array/array_run_end.cc" "${LIBRARY_DIR}/array/builder_adaptive.cc" "${LIBRARY_DIR}/array/builder_base.cc" "${LIBRARY_DIR}/array/builder_binary.cc" @@ -227,124 +228,25 @@ set(ARROW_SRCS "${LIBRARY_DIR}/array/builder_dict.cc" "${LIBRARY_DIR}/array/builder_nested.cc" 
"${LIBRARY_DIR}/array/builder_primitive.cc" - "${LIBRARY_DIR}/array/builder_union.cc" "${LIBRARY_DIR}/array/builder_run_end.cc" - "${LIBRARY_DIR}/array/array_run_end.cc" + "${LIBRARY_DIR}/array/builder_union.cc" "${LIBRARY_DIR}/array/concatenate.cc" "${LIBRARY_DIR}/array/data.cc" "${LIBRARY_DIR}/array/diff.cc" "${LIBRARY_DIR}/array/util.cc" "${LIBRARY_DIR}/array/validate.cc" - "${LIBRARY_DIR}/builder.cc" "${LIBRARY_DIR}/buffer.cc" - "${LIBRARY_DIR}/chunked_array.cc" - "${LIBRARY_DIR}/chunk_resolver.cc" - "${LIBRARY_DIR}/compare.cc" - "${LIBRARY_DIR}/config.cc" - "${LIBRARY_DIR}/datum.cc" - "${LIBRARY_DIR}/device.cc" - "${LIBRARY_DIR}/extension_type.cc" - "${LIBRARY_DIR}/memory_pool.cc" - "${LIBRARY_DIR}/pretty_print.cc" - "${LIBRARY_DIR}/record_batch.cc" - "${LIBRARY_DIR}/result.cc" - "${LIBRARY_DIR}/scalar.cc" - "${LIBRARY_DIR}/sparse_tensor.cc" - "${LIBRARY_DIR}/status.cc" - "${LIBRARY_DIR}/table.cc" - "${LIBRARY_DIR}/table_builder.cc" - "${LIBRARY_DIR}/tensor.cc" - "${LIBRARY_DIR}/tensor/coo_converter.cc" - "${LIBRARY_DIR}/tensor/csf_converter.cc" - "${LIBRARY_DIR}/tensor/csx_converter.cc" - "${LIBRARY_DIR}/type.cc" - "${LIBRARY_DIR}/visitor.cc" + "${LIBRARY_DIR}/builder.cc" "${LIBRARY_DIR}/c/bridge.cc" - "${LIBRARY_DIR}/io/buffered.cc" - "${LIBRARY_DIR}/io/caching.cc" - "${LIBRARY_DIR}/io/compressed.cc" - "${LIBRARY_DIR}/io/file.cc" - "${LIBRARY_DIR}/io/hdfs.cc" - "${LIBRARY_DIR}/io/hdfs_internal.cc" - "${LIBRARY_DIR}/io/interfaces.cc" - "${LIBRARY_DIR}/io/memory.cc" - "${LIBRARY_DIR}/io/slow.cc" - "${LIBRARY_DIR}/io/stdio.cc" - "${LIBRARY_DIR}/io/transform.cc" - "${LIBRARY_DIR}/util/async_util.cc" - "${LIBRARY_DIR}/util/basic_decimal.cc" - "${LIBRARY_DIR}/util/bit_block_counter.cc" - "${LIBRARY_DIR}/util/bit_run_reader.cc" - "${LIBRARY_DIR}/util/bit_util.cc" - "${LIBRARY_DIR}/util/bitmap.cc" - "${LIBRARY_DIR}/util/bitmap_builders.cc" - "${LIBRARY_DIR}/util/bitmap_ops.cc" - "${LIBRARY_DIR}/util/bpacking.cc" - "${LIBRARY_DIR}/util/cancel.cc" - "${LIBRARY_DIR}/util/compression.cc" - "${LIBRARY_DIR}/util/counting_semaphore.cc" - "${LIBRARY_DIR}/util/cpu_info.cc" - "${LIBRARY_DIR}/util/decimal.cc" - "${LIBRARY_DIR}/util/delimiting.cc" - "${LIBRARY_DIR}/util/formatting.cc" - "${LIBRARY_DIR}/util/future.cc" - "${LIBRARY_DIR}/util/int_util.cc" - "${LIBRARY_DIR}/util/io_util.cc" - "${LIBRARY_DIR}/util/logging.cc" - "${LIBRARY_DIR}/util/key_value_metadata.cc" - "${LIBRARY_DIR}/util/memory.cc" - "${LIBRARY_DIR}/util/mutex.cc" - "${LIBRARY_DIR}/util/string.cc" - "${LIBRARY_DIR}/util/string_builder.cc" - "${LIBRARY_DIR}/util/task_group.cc" - "${LIBRARY_DIR}/util/tdigest.cc" - "${LIBRARY_DIR}/util/thread_pool.cc" - "${LIBRARY_DIR}/util/time.cc" - "${LIBRARY_DIR}/util/trie.cc" - "${LIBRARY_DIR}/util/unreachable.cc" - "${LIBRARY_DIR}/util/uri.cc" - "${LIBRARY_DIR}/util/utf8.cc" - "${LIBRARY_DIR}/util/value_parsing.cc" - "${LIBRARY_DIR}/util/byte_size.cc" - "${LIBRARY_DIR}/util/debug.cc" - "${LIBRARY_DIR}/util/tracing.cc" - "${LIBRARY_DIR}/util/atfork_internal.cc" - "${LIBRARY_DIR}/util/crc32.cc" - "${LIBRARY_DIR}/util/hashing.cc" - "${LIBRARY_DIR}/util/ree_util.cc" - "${LIBRARY_DIR}/util/union_util.cc" - "${LIBRARY_DIR}/vendored/base64.cpp" - "${LIBRARY_DIR}/vendored/datetime/tz.cpp" - "${LIBRARY_DIR}/vendored/musl/strptime.c" - "${LIBRARY_DIR}/vendored/uriparser/UriCommon.c" - "${LIBRARY_DIR}/vendored/uriparser/UriCompare.c" - "${LIBRARY_DIR}/vendored/uriparser/UriEscape.c" - "${LIBRARY_DIR}/vendored/uriparser/UriFile.c" - "${LIBRARY_DIR}/vendored/uriparser/UriIp4Base.c" - 
"${LIBRARY_DIR}/vendored/uriparser/UriIp4.c" - "${LIBRARY_DIR}/vendored/uriparser/UriMemory.c" - "${LIBRARY_DIR}/vendored/uriparser/UriNormalizeBase.c" - "${LIBRARY_DIR}/vendored/uriparser/UriNormalize.c" - "${LIBRARY_DIR}/vendored/uriparser/UriParseBase.c" - "${LIBRARY_DIR}/vendored/uriparser/UriParse.c" - "${LIBRARY_DIR}/vendored/uriparser/UriQuery.c" - "${LIBRARY_DIR}/vendored/uriparser/UriRecompose.c" - "${LIBRARY_DIR}/vendored/uriparser/UriResolve.c" - "${LIBRARY_DIR}/vendored/uriparser/UriShorten.c" - "${LIBRARY_DIR}/vendored/double-conversion/bignum.cc" - "${LIBRARY_DIR}/vendored/double-conversion/bignum-dtoa.cc" - "${LIBRARY_DIR}/vendored/double-conversion/cached-powers.cc" - "${LIBRARY_DIR}/vendored/double-conversion/double-to-string.cc" - "${LIBRARY_DIR}/vendored/double-conversion/fast-dtoa.cc" - "${LIBRARY_DIR}/vendored/double-conversion/fixed-dtoa.cc" - "${LIBRARY_DIR}/vendored/double-conversion/string-to-double.cc" - "${LIBRARY_DIR}/vendored/double-conversion/strtod.cc" - + "${LIBRARY_DIR}/chunk_resolver.cc" + "${LIBRARY_DIR}/chunked_array.cc" + "${LIBRARY_DIR}/compare.cc" "${LIBRARY_DIR}/compute/api_aggregate.cc" "${LIBRARY_DIR}/compute/api_scalar.cc" "${LIBRARY_DIR}/compute/api_vector.cc" "${LIBRARY_DIR}/compute/cast.cc" "${LIBRARY_DIR}/compute/exec.cc" + "${LIBRARY_DIR}/compute/expression.cc" "${LIBRARY_DIR}/compute/function.cc" "${LIBRARY_DIR}/compute/function_internal.cc" "${LIBRARY_DIR}/compute/kernel.cc" @@ -386,18 +288,31 @@ set(ARROW_SRCS "${LIBRARY_DIR}/compute/kernels/vector_replace.cc" "${LIBRARY_DIR}/compute/kernels/vector_select_k.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection.cc" - "${LIBRARY_DIR}/compute/kernels/vector_sort.cc" - "${LIBRARY_DIR}/compute/kernels/vector_selection_internal.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection_filter_internal.cc" + "${LIBRARY_DIR}/compute/kernels/vector_selection_internal.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection_take_internal.cc" - "${LIBRARY_DIR}/compute/registry.cc" - "${LIBRARY_DIR}/compute/expression.cc" + "${LIBRARY_DIR}/compute/kernels/vector_sort.cc" "${LIBRARY_DIR}/compute/ordering.cc" + "${LIBRARY_DIR}/compute/registry.cc" "${LIBRARY_DIR}/compute/row/compare_internal.cc" "${LIBRARY_DIR}/compute/row/encode_internal.cc" "${LIBRARY_DIR}/compute/row/grouper.cc" "${LIBRARY_DIR}/compute/row/row_internal.cc" - + "${LIBRARY_DIR}/config.cc" + "${LIBRARY_DIR}/datum.cc" + "${LIBRARY_DIR}/device.cc" + "${LIBRARY_DIR}/extension_type.cc" + "${LIBRARY_DIR}/io/buffered.cc" + "${LIBRARY_DIR}/io/caching.cc" + "${LIBRARY_DIR}/io/compressed.cc" + "${LIBRARY_DIR}/io/file.cc" + "${LIBRARY_DIR}/io/hdfs.cc" + "${LIBRARY_DIR}/io/hdfs_internal.cc" + "${LIBRARY_DIR}/io/interfaces.cc" + "${LIBRARY_DIR}/io/memory.cc" + "${LIBRARY_DIR}/io/slow.cc" + "${LIBRARY_DIR}/io/stdio.cc" + "${LIBRARY_DIR}/io/transform.cc" "${LIBRARY_DIR}/ipc/dictionary.cc" "${LIBRARY_DIR}/ipc/feather.cc" "${LIBRARY_DIR}/ipc/message.cc" @@ -405,6 +320,89 @@ set(ARROW_SRCS "${LIBRARY_DIR}/ipc/options.cc" "${LIBRARY_DIR}/ipc/reader.cc" "${LIBRARY_DIR}/ipc/writer.cc" + "${LIBRARY_DIR}/memory_pool.cc" + "${LIBRARY_DIR}/pretty_print.cc" + "${LIBRARY_DIR}/record_batch.cc" + "${LIBRARY_DIR}/result.cc" + "${LIBRARY_DIR}/scalar.cc" + "${LIBRARY_DIR}/sparse_tensor.cc" + "${LIBRARY_DIR}/status.cc" + "${LIBRARY_DIR}/table.cc" + "${LIBRARY_DIR}/table_builder.cc" + "${LIBRARY_DIR}/tensor.cc" + "${LIBRARY_DIR}/tensor/coo_converter.cc" + "${LIBRARY_DIR}/tensor/csf_converter.cc" + "${LIBRARY_DIR}/tensor/csx_converter.cc" + "${LIBRARY_DIR}/type.cc" + 
"${LIBRARY_DIR}/util/async_util.cc" + "${LIBRARY_DIR}/util/atfork_internal.cc" + "${LIBRARY_DIR}/util/basic_decimal.cc" + "${LIBRARY_DIR}/util/bit_block_counter.cc" + "${LIBRARY_DIR}/util/bit_run_reader.cc" + "${LIBRARY_DIR}/util/bit_util.cc" + "${LIBRARY_DIR}/util/bitmap.cc" + "${LIBRARY_DIR}/util/bitmap_builders.cc" + "${LIBRARY_DIR}/util/bitmap_ops.cc" + "${LIBRARY_DIR}/util/bpacking.cc" + "${LIBRARY_DIR}/util/byte_size.cc" + "${LIBRARY_DIR}/util/cancel.cc" + "${LIBRARY_DIR}/util/compression.cc" + "${LIBRARY_DIR}/util/counting_semaphore.cc" + "${LIBRARY_DIR}/util/cpu_info.cc" + "${LIBRARY_DIR}/util/crc32.cc" + "${LIBRARY_DIR}/util/debug.cc" + "${LIBRARY_DIR}/util/decimal.cc" + "${LIBRARY_DIR}/util/delimiting.cc" + "${LIBRARY_DIR}/util/formatting.cc" + "${LIBRARY_DIR}/util/future.cc" + "${LIBRARY_DIR}/util/hashing.cc" + "${LIBRARY_DIR}/util/int_util.cc" + "${LIBRARY_DIR}/util/io_util.cc" + "${LIBRARY_DIR}/util/key_value_metadata.cc" + "${LIBRARY_DIR}/util/logging.cc" + "${LIBRARY_DIR}/util/memory.cc" + "${LIBRARY_DIR}/util/mutex.cc" + "${LIBRARY_DIR}/util/ree_util.cc" + "${LIBRARY_DIR}/util/string.cc" + "${LIBRARY_DIR}/util/string_builder.cc" + "${LIBRARY_DIR}/util/task_group.cc" + "${LIBRARY_DIR}/util/tdigest.cc" + "${LIBRARY_DIR}/util/thread_pool.cc" + "${LIBRARY_DIR}/util/time.cc" + "${LIBRARY_DIR}/util/tracing.cc" + "${LIBRARY_DIR}/util/trie.cc" + "${LIBRARY_DIR}/util/union_util.cc" + "${LIBRARY_DIR}/util/unreachable.cc" + "${LIBRARY_DIR}/util/uri.cc" + "${LIBRARY_DIR}/util/utf8.cc" + "${LIBRARY_DIR}/util/value_parsing.cc" + "${LIBRARY_DIR}/vendored/base64.cpp" + "${LIBRARY_DIR}/vendored/datetime/tz.cpp" + "${LIBRARY_DIR}/vendored/double-conversion/bignum-dtoa.cc" + "${LIBRARY_DIR}/vendored/double-conversion/bignum.cc" + "${LIBRARY_DIR}/vendored/double-conversion/cached-powers.cc" + "${LIBRARY_DIR}/vendored/double-conversion/double-to-string.cc" + "${LIBRARY_DIR}/vendored/double-conversion/fast-dtoa.cc" + "${LIBRARY_DIR}/vendored/double-conversion/fixed-dtoa.cc" + "${LIBRARY_DIR}/vendored/double-conversion/string-to-double.cc" + "${LIBRARY_DIR}/vendored/double-conversion/strtod.cc" + "${LIBRARY_DIR}/vendored/musl/strptime.c" + "${LIBRARY_DIR}/vendored/uriparser/UriCommon.c" + "${LIBRARY_DIR}/vendored/uriparser/UriCompare.c" + "${LIBRARY_DIR}/vendored/uriparser/UriEscape.c" + "${LIBRARY_DIR}/vendored/uriparser/UriFile.c" + "${LIBRARY_DIR}/vendored/uriparser/UriIp4.c" + "${LIBRARY_DIR}/vendored/uriparser/UriIp4Base.c" + "${LIBRARY_DIR}/vendored/uriparser/UriMemory.c" + "${LIBRARY_DIR}/vendored/uriparser/UriNormalize.c" + "${LIBRARY_DIR}/vendored/uriparser/UriNormalizeBase.c" + "${LIBRARY_DIR}/vendored/uriparser/UriParse.c" + "${LIBRARY_DIR}/vendored/uriparser/UriParseBase.c" + "${LIBRARY_DIR}/vendored/uriparser/UriQuery.c" + "${LIBRARY_DIR}/vendored/uriparser/UriRecompose.c" + "${LIBRARY_DIR}/vendored/uriparser/UriResolve.c" + "${LIBRARY_DIR}/vendored/uriparser/UriShorten.c" + "${LIBRARY_DIR}/visitor.cc" "${ARROW_SRC_DIR}/arrow/adapters/orc/adapter.cc" "${ARROW_SRC_DIR}/arrow/adapters/orc/util.cc" From f38e07a027c7868a05f88d6356a3d465e7a5d87c Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 15 Oct 2024 14:43:28 +0000 Subject: [PATCH 67/80] Fix build --- contrib/arrow-cmake/CMakeLists.txt | 42 +++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index a35a9b80cb9..06de5135ad2 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ 
-213,7 +213,12 @@ target_include_directories(_orc SYSTEM PRIVATE set(LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/src/arrow") # arrow/cpp/src/arrow/CMakeLists.txt (ARROW_SRCS + ARROW_COMPUTE + ARROW_IPC) +# find . \( -iname \*.cc -o -iname \*.cpp -o -iname \*.c \) | sort | awk '{print "\"${LIBRARY_DIR}" substr($1,2) "\"" }' | grep -v 'test.cc' | grep -v 'json' | grep -v 'flight' \| +# grep -v 'csv' | grep -v 'acero' | grep -v 'dataset' | grep -v 'testing' | grep -v 'gpu' | grep -v 'engine' | grep -v 'filesystem' | grep -v 'benchmark.cc' set(ARROW_SRCS + "${LIBRARY_DIR}/adapters/orc/adapter.cc" + "${LIBRARY_DIR}/adapters/orc/options.cc" + "${LIBRARY_DIR}/adapters/orc/util.cc" "${LIBRARY_DIR}/array/array_base.cc" "${LIBRARY_DIR}/array/array_binary.cc" "${LIBRARY_DIR}/array/array_decimal.cc" @@ -238,6 +243,7 @@ set(ARROW_SRCS "${LIBRARY_DIR}/buffer.cc" "${LIBRARY_DIR}/builder.cc" "${LIBRARY_DIR}/c/bridge.cc" + "${LIBRARY_DIR}/c/dlpack.cc" "${LIBRARY_DIR}/chunk_resolver.cc" "${LIBRARY_DIR}/chunked_array.cc" "${LIBRARY_DIR}/compare.cc" @@ -257,6 +263,7 @@ set(ARROW_SRCS "${LIBRARY_DIR}/compute/kernels/aggregate_var_std.cc" "${LIBRARY_DIR}/compute/kernels/codegen_internal.cc" "${LIBRARY_DIR}/compute/kernels/hash_aggregate.cc" + "${LIBRARY_DIR}/compute/kernels/ree_util_internal.cc" "${LIBRARY_DIR}/compute/kernels/row_encoder.cc" "${LIBRARY_DIR}/compute/kernels/scalar_arithmetic.cc" "${LIBRARY_DIR}/compute/kernels/scalar_boolean.cc" @@ -284,24 +291,31 @@ set(ARROW_SRCS "${LIBRARY_DIR}/compute/kernels/vector_cumulative_ops.cc" "${LIBRARY_DIR}/compute/kernels/vector_hash.cc" "${LIBRARY_DIR}/compute/kernels/vector_nested.cc" + "${LIBRARY_DIR}/compute/kernels/vector_pairwise.cc" "${LIBRARY_DIR}/compute/kernels/vector_rank.cc" "${LIBRARY_DIR}/compute/kernels/vector_replace.cc" + "${LIBRARY_DIR}/compute/kernels/vector_run_end_encode.cc" "${LIBRARY_DIR}/compute/kernels/vector_select_k.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection_filter_internal.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection_internal.cc" "${LIBRARY_DIR}/compute/kernels/vector_selection_take_internal.cc" "${LIBRARY_DIR}/compute/kernels/vector_sort.cc" + "${LIBRARY_DIR}/compute/key_hash_internal.cc" + "${LIBRARY_DIR}/compute/key_map_internal.cc" + "${LIBRARY_DIR}/compute/light_array_internal.cc" "${LIBRARY_DIR}/compute/ordering.cc" "${LIBRARY_DIR}/compute/registry.cc" "${LIBRARY_DIR}/compute/row/compare_internal.cc" "${LIBRARY_DIR}/compute/row/encode_internal.cc" "${LIBRARY_DIR}/compute/row/grouper.cc" "${LIBRARY_DIR}/compute/row/row_internal.cc" + "${LIBRARY_DIR}/compute/util.cc" "${LIBRARY_DIR}/config.cc" "${LIBRARY_DIR}/datum.cc" "${LIBRARY_DIR}/device.cc" "${LIBRARY_DIR}/extension_type.cc" + "${LIBRARY_DIR}/integration/c_data_integration_internal.cc" "${LIBRARY_DIR}/io/buffered.cc" "${LIBRARY_DIR}/io/caching.cc" "${LIBRARY_DIR}/io/compressed.cc" @@ -315,10 +329,12 @@ set(ARROW_SRCS "${LIBRARY_DIR}/io/transform.cc" "${LIBRARY_DIR}/ipc/dictionary.cc" "${LIBRARY_DIR}/ipc/feather.cc" + "${LIBRARY_DIR}/ipc/file_to_stream.cc" "${LIBRARY_DIR}/ipc/message.cc" "${LIBRARY_DIR}/ipc/metadata_internal.cc" "${LIBRARY_DIR}/ipc/options.cc" "${LIBRARY_DIR}/ipc/reader.cc" + "${LIBRARY_DIR}/ipc/stream_to_file.cc" "${LIBRARY_DIR}/ipc/writer.cc" "${LIBRARY_DIR}/memory_pool.cc" "${LIBRARY_DIR}/pretty_print.cc" @@ -334,6 +350,8 @@ set(ARROW_SRCS "${LIBRARY_DIR}/tensor/csf_converter.cc" "${LIBRARY_DIR}/tensor/csx_converter.cc" "${LIBRARY_DIR}/type.cc" + 
"${LIBRARY_DIR}/type_traits.cc" + "${LIBRARY_DIR}/util/align_util.cc" "${LIBRARY_DIR}/util/async_util.cc" "${LIBRARY_DIR}/util/atfork_internal.cc" "${LIBRARY_DIR}/util/basic_decimal.cc" @@ -353,12 +371,15 @@ set(ARROW_SRCS "${LIBRARY_DIR}/util/debug.cc" "${LIBRARY_DIR}/util/decimal.cc" "${LIBRARY_DIR}/util/delimiting.cc" + "${LIBRARY_DIR}/util/dict_util.cc" + "${LIBRARY_DIR}/util/float16.cc" "${LIBRARY_DIR}/util/formatting.cc" "${LIBRARY_DIR}/util/future.cc" "${LIBRARY_DIR}/util/hashing.cc" "${LIBRARY_DIR}/util/int_util.cc" "${LIBRARY_DIR}/util/io_util.cc" "${LIBRARY_DIR}/util/key_value_metadata.cc" + "${LIBRARY_DIR}/util/list_util.cc" "${LIBRARY_DIR}/util/logging.cc" "${LIBRARY_DIR}/util/memory.cc" "${LIBRARY_DIR}/util/mutex.cc" @@ -462,22 +483,38 @@ set(PARQUET_SRCS "${LIBRARY_DIR}/arrow/schema.cc" "${LIBRARY_DIR}/arrow/schema_internal.cc" "${LIBRARY_DIR}/arrow/writer.cc" + "${LIBRARY_DIR}/benchmark_util.cc" "${LIBRARY_DIR}/bloom_filter.cc" + "${LIBRARY_DIR}/bloom_filter_reader.cc" "${LIBRARY_DIR}/column_reader.cc" "${LIBRARY_DIR}/column_scanner.cc" "${LIBRARY_DIR}/column_writer.cc" "${LIBRARY_DIR}/encoding.cc" + "${LIBRARY_DIR}/encryption/crypto_factory.cc" "${LIBRARY_DIR}/encryption/encryption.cc" "${LIBRARY_DIR}/encryption/encryption_internal.cc" + "${LIBRARY_DIR}/encryption/encryption_internal_nossl.cc" + "${LIBRARY_DIR}/encryption/file_key_unwrapper.cc" + "${LIBRARY_DIR}/encryption/file_key_wrapper.cc" + "${LIBRARY_DIR}/encryption/file_system_key_material_store.cc" "${LIBRARY_DIR}/encryption/internal_file_decryptor.cc" "${LIBRARY_DIR}/encryption/internal_file_encryptor.cc" + "${LIBRARY_DIR}/encryption/key_material.cc" + "${LIBRARY_DIR}/encryption/key_metadata.cc" + "${LIBRARY_DIR}/encryption/key_toolkit.cc" + "${LIBRARY_DIR}/encryption/key_toolkit_internal.cc" + "${LIBRARY_DIR}/encryption/kms_client.cc" + "${LIBRARY_DIR}/encryption/local_wrap_kms_client.cc" + "${LIBRARY_DIR}/encryption/openssl_internal.cc" "${LIBRARY_DIR}/exception.cc" "${LIBRARY_DIR}/file_reader.cc" "${LIBRARY_DIR}/file_writer.cc" - "${LIBRARY_DIR}/page_index.cc" - "${LIBRARY_DIR}/level_conversion.cc" "${LIBRARY_DIR}/level_comparison.cc" + "${LIBRARY_DIR}/level_comparison_avx2.cc" + "${LIBRARY_DIR}/level_conversion.cc" + "${LIBRARY_DIR}/level_conversion_bmi2.cc" "${LIBRARY_DIR}/metadata.cc" + "${LIBRARY_DIR}/page_index.cc" "${LIBRARY_DIR}/platform.cc" "${LIBRARY_DIR}/printer.cc" "${LIBRARY_DIR}/properties.cc" @@ -486,7 +523,6 @@ set(PARQUET_SRCS "${LIBRARY_DIR}/stream_reader.cc" "${LIBRARY_DIR}/stream_writer.cc" "${LIBRARY_DIR}/types.cc" - "${LIBRARY_DIR}/bloom_filter_reader.cc" "${LIBRARY_DIR}/xxhasher.cc" "${GEN_LIBRARY_DIR}/parquet_constants.cpp" From 6923b9ec3fff0891ceff75d2515d7d5c75de1293 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 16 Oct 2024 13:42:40 +0000 Subject: [PATCH 68/80] Update the submodule --- contrib/arrow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/arrow b/contrib/arrow index 3264fdad38b..60896c89713 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 3264fdad38b2a1628f296cd574a9dd03f4928aea +Subproject commit 60896c89713c2c1ed4bbc1e22e8eaeb6b7d7f9d5 From c895e855851972c0efd3dc742258ccd7dc234710 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 16 Oct 2024 15:28:41 +0000 Subject: [PATCH 69/80] Bump --- contrib/arrow | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/arrow b/contrib/arrow index 60896c89713..6e2574f5013 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 
60896c89713c2c1ed4bbc1e22e8eaeb6b7d7f9d5 +Subproject commit 6e2574f5013a005c050c9a7787d341aef09d0063 From fe2514955c0eb98e017c63adea1f4b4cdec57e70 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 31 Oct 2024 17:55:31 +0000 Subject: [PATCH 70/80] Enable threading --- contrib/arrow-cmake/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/contrib/arrow-cmake/CMakeLists.txt b/contrib/arrow-cmake/CMakeLists.txt index 06de5135ad2..208d48df178 100644 --- a/contrib/arrow-cmake/CMakeLists.txt +++ b/contrib/arrow-cmake/CMakeLists.txt @@ -553,6 +553,9 @@ endif () add_definitions(-DPARQUET_THRIFT_VERSION_MAJOR=0) add_definitions(-DPARQUET_THRIFT_VERSION_MINOR=16) +# As per https://github.com/apache/arrow/pull/35672 you need to enable it explicitly. +add_definitions(-DARROW_ENABLE_THREADING) + # === tools set(TOOLS_DIR "${ClickHouse_SOURCE_DIR}/contrib/arrow/cpp/tools/parquet") From 9d0da01ddbfe6648c1d4c0e5f958790d70861bc1 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Fri, 1 Nov 2024 12:43:38 +0100 Subject: [PATCH 71/80] Update src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h --- src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h b/src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h index d486850a9db..2c4b7c4f3bc 100644 --- a/src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h +++ b/src/Common/Scheduler/Nodes/UnifiedSchedulerNode.h @@ -492,7 +492,7 @@ public: nodes.push_back(impl.semaphore); if (impl.branch.queue) nodes.push_back(impl.branch.queue); - for (auto & [_, branch] : impl.branch.branch.branches) + for (auto & [_0, branch] : impl.branch.branch.branches) { for (auto & [_1, child] : branch.children) child->addRawPointerNodes(nodes); From 24e7fc2714f2ccd3391c24b77e07ecaad8608d96 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 1 Nov 2024 13:15:15 +0100 Subject: [PATCH 72/80] Add try catch to data part destructors --- src/Storages/MergeTree/MergeTreeDataPartCompact.cpp | 9 ++++++++- src/Storages/MergeTree/MergeTreeDataPartWide.cpp | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 22f3c379398..14c2da82de1 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -256,7 +256,14 @@ bool MergeTreeDataPartCompact::isStoredOnRemoteDiskWithZeroCopySupport() const MergeTreeDataPartCompact::~MergeTreeDataPartCompact() { - removeIfNeeded(); + try + { + removeIfNeeded(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } } } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index d6f213463f2..c515d645253 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -241,7 +241,14 @@ bool MergeTreeDataPartWide::isStoredOnRemoteDiskWithZeroCopySupport() const MergeTreeDataPartWide::~MergeTreeDataPartWide() { - removeIfNeeded(); + try + { + removeIfNeeded(); + } + catch (...) 
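+    /// removeIfNeeded() does filesystem work and can throw; an exception escaping a destructor would call std::terminate,
+    /// so log and swallow it here (same pattern as in ~MergeTreeDataPartCompact above).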
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } } void MergeTreeDataPartWide::doCheckConsistency(bool require_part_metadata) const From 00cd06838999b775877e2beef1550b17a399f6ca Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 1 Nov 2024 12:22:22 +0000 Subject: [PATCH 73/80] Remove upstream SimSIMD submodule --- .gitmodules | 3 --- contrib/SimSIMD | 1 - 2 files changed, 4 deletions(-) delete mode 160000 contrib/SimSIMD diff --git a/.gitmodules b/.gitmodules index bbc8fc7d06c..ac1c4d05e1a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -330,9 +330,6 @@ [submodule "contrib/usearch"] path = contrib/usearch url = https://github.com/ClickHouse/usearch.git -[submodule "contrib/SimSIMD"] - path = contrib/SimSIMD - url = https://github.com/ashvardanian/SimSIMD.git [submodule "contrib/FP16"] path = contrib/FP16 url = https://github.com/Maratyszcza/FP16.git diff --git a/contrib/SimSIMD b/contrib/SimSIMD deleted file mode 160000 index 935fef2964b..00000000000 --- a/contrib/SimSIMD +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 935fef2964bc38e995c5f465b42259a35b8cf0d3 From 3e2d5e508b4c75537dd935bf380019f534aa6351 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 1 Nov 2024 12:22:51 +0000 Subject: [PATCH 74/80] Add forked SimSIMD submodule --- .gitmodules | 3 +++ contrib/SimSIMD | 1 + 2 files changed, 4 insertions(+) create mode 160000 contrib/SimSIMD diff --git a/.gitmodules b/.gitmodules index ac1c4d05e1a..a3b6450032a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -330,6 +330,9 @@ [submodule "contrib/usearch"] path = contrib/usearch url = https://github.com/ClickHouse/usearch.git +[submodule "contrib/SimSIMD"] + path = contrib/SimSIMD + url = https://github.com/ClickHouse/SimSIMD.git [submodule "contrib/FP16"] path = contrib/FP16 url = https://github.com/Maratyszcza/FP16.git diff --git a/contrib/SimSIMD b/contrib/SimSIMD new file mode 160000 index 00000000000..935fef2964b --- /dev/null +++ b/contrib/SimSIMD @@ -0,0 +1 @@ +Subproject commit 935fef2964bc38e995c5f465b42259a35b8cf0d3 From 93acf134f68cfc091f8d5f7996e7b6ca21c17298 Mon Sep 17 00:00:00 2001 From: Max Kainov Date: Fri, 1 Nov 2024 14:02:43 +0100 Subject: [PATCH 75/80] Fix Fedora version for testing RPMs installations --- tests/ci/artifactory.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/ci/artifactory.py b/tests/ci/artifactory.py index c66659d4e93..00a7eeebb35 100644 --- a/tests/ci/artifactory.py +++ b/tests/ci/artifactory.py @@ -200,6 +200,7 @@ class RpmArtifactory: ) _PROD_REPO_URL = "https://packages.clickhouse.com/rpm/clickhouse.repo" _SIGN_KEY = "885E2BDCF96B0B45ABF058453E4AD4719DDE9A38" + FEDORA_VERSION = 40 def __init__(self, release_info: ReleaseInfo, dry_run: bool): self.release_info = release_info @@ -249,16 +250,16 @@ class RpmArtifactory: Shell.check("sync") def test_packages(self): - Shell.check("docker pull fedora:latest", strict=True) + Shell.check(f"docker pull fedora:{self.FEDORA_VERSION}", strict=True) print(f"Test package installation, version [{self.version}]") rpm_command = f"dnf config-manager --add-repo={self.repo_url} && dnf makecache && dnf -y install clickhouse-client-{self.version}-1" - cmd = f'docker run --rm fedora:latest /bin/bash -c "dnf -y install dnf-plugins-core && dnf config-manager --add-repo={self.repo_url} && {rpm_command}"' + cmd = f'docker run --rm fedora:{self.FEDORA_VERSION} /bin/bash -c "dnf -y install dnf-plugins-core && dnf config-manager --add-repo={self.repo_url} && {rpm_command}"' print("Running test command:") print(f" {cmd}") assert 
Shell.check(cmd)

        print("Test package installation, version [latest]")
        rpm_command_2 = f"dnf config-manager --add-repo={self.repo_url} && dnf makecache && dnf -y install clickhouse-client"
-        cmd = f'docker run --rm fedora:latest /bin/bash -c "dnf -y install dnf-plugins-core && dnf config-manager --add-repo={self.repo_url} && {rpm_command_2}"'
+        cmd = f'docker run --rm fedora:{self.FEDORA_VERSION} /bin/bash -c "dnf -y install dnf-plugins-core && dnf config-manager --add-repo={self.repo_url} && {rpm_command_2}"'
         print("Running test command:")
         print(f"  {cmd}")
         assert Shell.check(cmd)

From 603cb16e986a7330693be95b23c30dc98ed307e0 Mon Sep 17 00:00:00 2001
From: Nikita Mikhaylov
Date: Fri, 1 Nov 2024 15:06:52 +0100
Subject: [PATCH 76/80] Fix build

---
 src/Common/Scheduler/Workload/WorkloadEntityStorageBase.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Common/Scheduler/Workload/WorkloadEntityStorageBase.cpp b/src/Common/Scheduler/Workload/WorkloadEntityStorageBase.cpp
index 1b7a559698c..4a2e867e2e2 100644
--- a/src/Common/Scheduler/Workload/WorkloadEntityStorageBase.cpp
+++ b/src/Common/Scheduler/Workload/WorkloadEntityStorageBase.cpp
@@ -48,9 +48,9 @@ ASTPtr normalizeCreateWorkloadEntityQuery(const IAST & create_query)
 /// Returns a type of a workload entity `ptr`
 WorkloadEntityType getEntityType(const ASTPtr & ptr)
 {
-    if (auto * res = typeid_cast<ASTCreateWorkloadQuery *>(ptr.get()))
+    if (auto * res = typeid_cast<ASTCreateWorkloadQuery *>(ptr.get()); res)
         return WorkloadEntityType::Workload;
-    if (auto * res = typeid_cast<ASTCreateResourceQuery *>(ptr.get()))
+    if (auto * res = typeid_cast<ASTCreateResourceQuery *>(ptr.get()); res)
         return WorkloadEntityType::Resource;
     chassert(false);
     return WorkloadEntityType::MAX;
@@ -106,7 +106,7 @@ void forEachReference(
             for (const String & resource : resources)
                 func(resource, res->getWorkloadName(), ReferenceType::ForResource);
     }
-    if (auto * res = typeid_cast<ASTCreateResourceQuery *>(source_entity.get()))
+    if (auto * res = typeid_cast<ASTCreateResourceQuery *>(source_entity.get()); res)
     {
         // RESOURCE has no references to be validated, we allow mentioned disks to be created later
     }

From e23dc25863f49b419bc6ce28463a13bd8ad38277 Mon Sep 17 00:00:00 2001
From: Nikita Mikhaylov
Date: Fri, 1 Nov 2024 16:22:16 +0100
Subject: [PATCH 77/80] Done

---
 src/Core/SettingsChangesHistory.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp
index 7ea388f18dd..49f6acff57b 100644
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@@ -65,6 +65,7 @@ static std::initializer_list Date: Fri, 1 Nov 2024 15:37:48 -0300
Subject: [PATCH 78/80] Update anylast.md

---
 .../en/sql-reference/aggregate-functions/reference/anylast.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/en/sql-reference/aggregate-functions/reference/anylast.md b/docs/en/sql-reference/aggregate-functions/reference/anylast.md
index 202d2e9fb10..4fe21531c76 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/anylast.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/anylast.md
@@ -17,7 +17,7 @@ anyLast(column) [RESPECT NULLS]
 - `column`: The column name.
 
 :::note
-Supports the `RESPECT NULLS` modifier after the function name. Using this modifier will ensure the function selects the first value passed, regardless of whether it is `NULL` or not.
+Supports the `RESPECT NULLS` modifier after the function name. Using this modifier will ensure the function selects the last value passed, regardless of whether it is `NULL` or not.
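+
+For example, if the most recently inserted value of `city` is `NULL`, `anyLast(city)` skips it and returns the last non-`NULL` city, while `anyLast(city) RESPECT NULLS` returns `NULL`.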
::: **Returned value** @@ -40,4 +40,4 @@ SELECT anyLast(city) FROM any_last_nulls; ┌─anyLast(city)─┐ │ Valencia │ └───────────────┘ -``` \ No newline at end of file +``` From b876d52e89ba6f28a71acbb7af3d43c7879c7dc4 Mon Sep 17 00:00:00 2001 From: Plasmaion <150329062+Plasmaion@users.noreply.github.com> Date: Sat, 2 Nov 2024 15:40:18 +0300 Subject: [PATCH 79/80] Update install.md (comment) typo in word "or" :) --- docs/ru/getting-started/install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/getting-started/install.md b/docs/ru/getting-started/install.md index f8a660fbec9..083ddc8c39c 100644 --- a/docs/ru/getting-started/install.md +++ b/docs/ru/getting-started/install.md @@ -95,7 +95,7 @@ sudo yum install -y clickhouse-server clickhouse-client sudo systemctl enable clickhouse-server sudo systemctl start clickhouse-server sudo systemctl status clickhouse-server -clickhouse-client # илм "clickhouse-client --password" если установлен пароль +clickhouse-client # или "clickhouse-client --password" если установлен пароль ``` Для использования наиболее свежих версий нужно заменить `stable` на `testing` (рекомендуется для тестовых окружений). Также иногда доступен `prestable`. From 5c69cf3205c77bbb2dcf8a5d52539679507c86fe Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 3 Nov 2024 13:20:27 +0100 Subject: [PATCH 80/80] Docs: An attempt to fix the missing sidebar for TPC-H/DS and SSB benchmark docs See https://github.com/ClickHouse/clickhouse-docs/issues/2721 --- docs/en/getting-started/index.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en/getting-started/index.md b/docs/en/getting-started/index.md index b520220984c..7898ca01129 100644 --- a/docs/en/getting-started/index.md +++ b/docs/en/getting-started/index.md @@ -23,6 +23,7 @@ functions in ClickHouse. The sample datasets include: - The [NYPD Complaint Data](../getting-started/example-datasets/nypd_complaint_data.md) demonstrates how to use data inference to simplify creating tables - The ["What's on the Menu?" dataset](../getting-started/example-datasets/menus.md) has an example of denormalizing data - The [Laion dataset](../getting-started/example-datasets/laion.md) has an example of [Approximate nearest neighbor search indexes](../engines/table-engines/mergetree-family/annindexes.md) usage +- The [TPC-H](../getting-started/example-datasets/tpch.md), [TPC-DS](../getting-started/example-datasets/tpcds.md), and [Star Schema (SSB)](../getting-started/example-datasets/star-schema.md) industry benchmarks for analytics databases - [Getting Data Into ClickHouse - Part 1](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1) provides examples of defining a schema and loading a small Hacker News dataset - [Getting Data Into ClickHouse - Part 3 - Using S3](https://clickhouse.com/blog/getting-data-into-clickhouse-part-3-s3) has examples of loading data from s3 - [Generating random data in ClickHouse](https://clickhouse.com/blog/generating-random-test-distribution-data-for-clickhouse) shows how to generate random data if none of the above fit your needs.