From bfc10bd234f2791fd48d30437e76df7d4a304a44 Mon Sep 17 00:00:00 2001
From: Alexander Tokmakov
Date: Fri, 29 Dec 2023 15:16:12 +0100
Subject: [PATCH] an option to avoid waiting for inactive Replicated db replicas

---
 docs/en/operations/settings/settings.md       |  2 +
 src/Core/SettingsEnums.cpp                    |  2 +
 src/Core/SettingsEnums.h                      |  2 +
 src/Interpreters/executeDDLQueryOnCluster.cpp | 98 ++++++++++++++-----
 .../test_replicated_database/test.py          |  2 +-
 .../test.py                                   |  2 +-
 ...distributed_ddl_output_mode_long.reference |  4 +-
 .../02447_drop_database_replica.reference     |  8 ++
 .../02447_drop_database_replica.sh            |  3 +
 9 files changed, 96 insertions(+), 27 deletions(-)

diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index 6e087467bb9..d4ee8106320 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -3847,6 +3847,8 @@ Possible values:
 - `none` — Is similar to throw, but distributed DDL query returns no result set.
 - `null_status_on_timeout` — Returns `NULL` as execution status in some rows of result set instead of throwing `TIMEOUT_EXCEEDED` if query is not finished on the corresponding hosts.
 - `never_throw` — Do not throw `TIMEOUT_EXCEEDED` and do not rethrow exceptions if query has failed on some hosts.
+- `null_status_on_timeout_only_active` — Is similar to `null_status_on_timeout`, but doesn't wait for inactive replicas of the `Replicated` database.
+- `throw_only_active` — Is similar to `throw`, but doesn't wait for inactive replicas of the `Replicated` database.
 
 Default value: `throw`.
 
diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp
index c35e69977ed..2e6bb51176d 100644
--- a/src/Core/SettingsEnums.cpp
+++ b/src/Core/SettingsEnums.cpp
@@ -113,6 +113,8 @@ IMPLEMENT_SETTING_ENUM(DistributedDDLOutputMode, ErrorCodes::BAD_ARGUMENTS,
     {{"none", DistributedDDLOutputMode::NONE},
      {"throw", DistributedDDLOutputMode::THROW},
      {"null_status_on_timeout", DistributedDDLOutputMode::NULL_STATUS_ON_TIMEOUT},
+     {"throw_only_active", DistributedDDLOutputMode::THROW_ONLY_ACTIVE},
+     {"null_status_on_timeout_only_active", DistributedDDLOutputMode::NULL_STATUS_ON_TIMEOUT_ONLY_ACTIVE},
      {"never_throw", DistributedDDLOutputMode::NEVER_THROW}})
 
 IMPLEMENT_SETTING_ENUM(StreamingHandleErrorMode, ErrorCodes::BAD_ARGUMENTS,
diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h
index 2e71c96b954..0b2d47210a8 100644
--- a/src/Core/SettingsEnums.h
+++ b/src/Core/SettingsEnums.h
@@ -165,6 +165,8 @@ enum class DistributedDDLOutputMode
     THROW,
     NULL_STATUS_ON_TIMEOUT,
     NEVER_THROW,
+    THROW_ONLY_ACTIVE,
+    NULL_STATUS_ON_TIMEOUT_ONLY_ACTIVE,
 };
 
 DECLARE_SETTING_ENUM(DistributedDDLOutputMode)
diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp
index 9486350a0f6..ba7638cd83f 100644
--- a/src/Interpreters/executeDDLQueryOnCluster.cpp
+++ b/src/Interpreters/executeDDLQueryOnCluster.cpp
@@ -200,8 +200,6 @@ public:
     Status prepare() override;
 
 private:
-    static Strings getChildrenAllowNoNode(const std::shared_ptr<zkutil::ZooKeeper> & zookeeper, const String & node_path);
-
     static Block getSampleBlock(ContextPtr context_, bool hosts_to_wait);
 
     Strings getNewAndUpdate(const Strings & current_list_of_finished_hosts);
@@ -228,7 +226,8 @@ private:
     NameSet waiting_hosts;  /// hosts from task host list
     NameSet finished_hosts; /// finished hosts from host list
     NameSet ignoring_hosts; /// appeared hosts that are not in hosts list
-    Strings current_active_hosts; /// Hosts that were in active state at the last check
+    Strings current_active_hosts; /// Hosts that are currently executing the task
+    NameSet offline_hosts; /// Hosts that are not currently running (no active replica node in ZooKeeper)
     size_t num_hosts_finished = 0;
 
     /// Save the first detected error and throw it at the end of execution
@@ -237,7 +236,10 @@
     Int64 timeout_seconds = 120;
     bool is_replicated_database = false;
     bool throw_on_timeout = true;
+    bool only_running_hosts = false;
+
     bool timeout_exceeded = false;
+    bool stop_waiting_offline_hosts = false;
 };
 
@@ -316,6 +318,8 @@ DDLQueryStatusSource::DDLQueryStatusSource(
     {
         waiting_hosts = NameSet(hosts_to_wait->begin(), hosts_to_wait->end());
         is_replicated_database = true;
+        only_running_hosts = output_mode == DistributedDDLOutputMode::THROW_ONLY_ACTIVE ||
+                             output_mode == DistributedDDLOutputMode::NULL_STATUS_ON_TIMEOUT_ONLY_ACTIVE;
     }
     else
     {
@@ -377,6 +381,38 @@ Chunk DDLQueryStatusSource::generateChunkWithUnfinishedHosts() const
     return Chunk(std::move(columns), unfinished_hosts.size());
 }
 
+static NameSet getOfflineHosts(const String & node_path, const NameSet & hosts_to_wait, const ZooKeeperPtr & zookeeper, Poco::Logger * log)
+{
+    fs::path replicas_path;
+    if (node_path.ends_with('/'))
+        replicas_path = fs::path(node_path).parent_path().parent_path().parent_path() / "replicas";
+    else
+        replicas_path = fs::path(node_path).parent_path().parent_path() / "replicas";
+
+    Strings paths;
+    Strings hosts_array;
+    for (const auto & host : hosts_to_wait)
+    {
+        hosts_array.push_back(host);
+        paths.push_back(replicas_path / host / "active");
+    }
+
+    NameSet offline;
+    auto res = zookeeper->tryGet(paths);
+    for (size_t i = 0; i < res.size(); ++i)
+        if (res[i].error == Coordination::Error::ZNONODE)
+            offline.insert(hosts_array[i]);
+
+    if (offline.size() == hosts_to_wait.size())
+    {
+        /// Avoid reporting that all hosts are offline
+        LOG_WARNING(log, "Did not find active hosts, will wait for all {} hosts. This should not happen often", offline.size());
+        return {};
+    }
+
+    return offline;
+}
+
 Chunk DDLQueryStatusSource::generate()
 {
     bool all_hosts_finished = num_hosts_finished >= waiting_hosts.size();
@@ -398,7 +434,7 @@ Chunk DDLQueryStatusSource::generate()
         if (isCancelled())
            return {};
 
-        if (timeout_seconds >= 0 && watch.elapsedSeconds() > timeout_seconds)
+        if (stop_waiting_offline_hosts || (timeout_seconds >= 0 && watch.elapsedSeconds() > timeout_seconds))
        {
            timeout_exceeded = true;
 
@@ -406,7 +442,7 @@ Chunk DDLQueryStatusSource::generate()
            size_t num_active_hosts = current_active_hosts.size();
 
            constexpr auto msg_format = "Watching task {} is executing longer than distributed_ddl_task_timeout (={}) seconds. "
" - "There are {} unfinished hosts ({} of them are currently active), " + "There are {} unfinished hosts ({} of them are currently executing the task), " "they are going to execute the query in background"; if (throw_on_timeout) { @@ -425,10 +461,7 @@ Chunk DDLQueryStatusSource::generate() return generateChunkWithUnfinishedHosts(); } - if (num_hosts_finished != 0 || try_number != 0) - { - sleepForMilliseconds(std::min(1000, 50 * (try_number + 1))); - } + sleepForMilliseconds(std::min(1000, 50 * try_number)); bool node_exists = false; Strings tmp_hosts; @@ -440,9 +473,21 @@ Chunk DDLQueryStatusSource::generate() retries_ctl.retryLoop([&]() { auto zookeeper = context->getZooKeeper(); - node_exists = zookeeper->exists(node_path); - tmp_hosts = getChildrenAllowNoNode(zookeeper, fs::path(node_path) / node_to_wait); - tmp_active_hosts = getChildrenAllowNoNode(zookeeper, fs::path(node_path) / "active"); + Strings paths = {String(fs::path(node_path) / node_to_wait), String(fs::path(node_path) / "active")}; + auto res = zookeeper->tryGetChildren(paths); + for (size_t i = 0; i < res.size(); ++i) + if (res[i].error != Coordination::Error::ZOK && res[i].error != Coordination::Error::ZNONODE) + throw Coordination::Exception::fromPath(res[i].error, paths[i]); + + if (res[0].error == Coordination::Error::ZNONODE) + node_exists = zookeeper->exists(node_path); + else + node_exists = true; + tmp_hosts = res[0].names; + tmp_active_hosts = res[1].names; + + if (only_running_hosts) + offline_hosts = getOfflineHosts(node_path, waiting_hosts, zookeeper, log); }); } @@ -460,6 +505,17 @@ Chunk DDLQueryStatusSource::generate() Strings new_hosts = getNewAndUpdate(tmp_hosts); ++try_number; + + if (only_running_hosts) + { + size_t num_finished_or_offline = 0; + for (const auto & host : waiting_hosts) + num_finished_or_offline += finished_hosts.contains(host) || offline_hosts.contains(host); + + if (num_finished_or_offline == waiting_hosts.size()) + stop_waiting_offline_hosts = true; + } + if (new_hosts.empty()) continue; @@ -470,7 +526,13 @@ Chunk DDLQueryStatusSource::generate() { ExecutionStatus status(-1, "Cannot obtain error message"); - if (node_to_wait == "finished") + /// Replicated database retries in case of error, it should not write error status. +#ifdef ABORT_ON_LOGICAL_ERROR + bool need_check_status = true; +#else + bool need_check_status = !is_replicated_database; +#endif + if (need_check_status) { String status_data; bool finished_exists = false; @@ -496,7 +558,6 @@ Chunk DDLQueryStatusSource::generate() if (status.code != 0 && !first_exception && context->getSettingsRef().distributed_ddl_output_mode != DistributedDDLOutputMode::NEVER_THROW) { - /// Replicated database retries in case of error, it should not write error status. 
                if (is_replicated_database)
                    throw Exception(ErrorCodes::LOGICAL_ERROR, "There was an error on {}: {} (probably it's a bug)", host_id, status.message);
 
@@ -555,15 +616,6 @@ IProcessor::Status DDLQueryStatusSource::prepare()
     return ISource::prepare();
 }
 
-Strings DDLQueryStatusSource::getChildrenAllowNoNode(const std::shared_ptr<zkutil::ZooKeeper> & zookeeper, const String & node_path)
-{
-    Strings res;
-    Coordination::Error code = zookeeper->tryGetChildren(node_path, res);
-    if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNONODE)
-        throw Coordination::Exception::fromPath(code, node_path);
-    return res;
-}
-
 Strings DDLQueryStatusSource::getNewAndUpdate(const Strings & current_list_of_finished_hosts)
 {
     Strings diff;
diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py
index 3ced82ebb57..1fc3fe37044 100644
--- a/tests/integration/test_replicated_database/test.py
+++ b/tests/integration/test_replicated_database/test.py
@@ -507,7 +507,7 @@ def test_alters_from_different_replicas(started_cluster):
     settings = {"distributed_ddl_task_timeout": 5}
     assert (
-        "There are 1 unfinished hosts (0 of them are currently active)"
+        "There are 1 unfinished hosts (0 of them are currently executing the task)"
         in competing_node.query_and_get_error(
             "ALTER TABLE alters_from_different_replicas.concurrent_test ADD COLUMN Added0 UInt32;",
             settings=settings,
diff --git a/tests/integration/test_replicated_database_cluster_groups/test.py b/tests/integration/test_replicated_database_cluster_groups/test.py
index b14581c1fe6..647626d8014 100644
--- a/tests/integration/test_replicated_database_cluster_groups/test.py
+++ b/tests/integration/test_replicated_database_cluster_groups/test.py
@@ -96,7 +96,7 @@ def test_cluster_groups(started_cluster):
     main_node_2.stop_clickhouse()
     settings = {"distributed_ddl_task_timeout": 5}
     assert (
-        "There are 1 unfinished hosts (0 of them are currently active)"
+        "There are 1 unfinished hosts (0 of them are currently executing the task)"
         in main_node_1.query_and_get_error(
             "CREATE TABLE cluster_groups.table_2 (d Date, k UInt64) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);",
             settings=settings,
diff --git a/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.reference b/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.reference
index 39979a98bde..b9a66a1e1a9 100644
--- a/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.reference
+++ b/tests/queries/0_stateless/01175_distributed_ddl_output_mode_long.reference
@@ -3,7 +3,7 @@ Received exception from server:
 Code: 57. Error: Received from localhost:9000. Error: There was an error on [localhost:9000]: Code: 57. Error: Table default.none already exists. (TABLE_ALREADY_EXISTS)
 (query: create table none on cluster test_shard_localhost (n int) engine=Memory;)
 Received exception from server:
-Code: 159. Error: Received from localhost:9000. Error: Watching task is executing longer than distributed_ddl_task_timeout (=1) seconds. There are 1 unfinished hosts (0 of them are currently active), they are going to execute the query in background. (TIMEOUT_EXCEEDED)
+Code: 159. Error: Received from localhost:9000. Error: Watching task is executing longer than distributed_ddl_task_timeout (=1) seconds. There are 1 unfinished hosts (0 of them are currently executing the task), they are going to execute the query in background. (TIMEOUT_EXCEEDED)
 (query: drop table if exists none on cluster test_unavailable_shard;)
 throw
 localhost 9000 0 0 0
@@ -12,7 +12,7 @@ Code: 57. Error: Received from localhost:9000. Error: There was an error on [loc
 (query: create table throw on cluster test_shard_localhost (n int) engine=Memory format Null;)
 localhost 9000 0 1 0
 Received exception from server:
-Code: 159. Error: Received from localhost:9000. Error: Watching task is executing longer than distributed_ddl_task_timeout (=1) seconds. There are 1 unfinished hosts (0 of them are currently active), they are going to execute the query in background. (TIMEOUT_EXCEEDED)
+Code: 159. Error: Received from localhost:9000. Error: Watching task is executing longer than distributed_ddl_task_timeout (=1) seconds. There are 1 unfinished hosts (0 of them are currently executing the task), they are going to execute the query in background. (TIMEOUT_EXCEEDED)
 (query: drop table if exists throw on cluster test_unavailable_shard;)
 null_status_on_timeout
 localhost 9000 0 0 0
diff --git a/tests/queries/0_stateless/02447_drop_database_replica.reference b/tests/queries/0_stateless/02447_drop_database_replica.reference
index f2b41569540..8ad9008057f 100644
--- a/tests/queries/0_stateless/02447_drop_database_replica.reference
+++ b/tests/queries/0_stateless/02447_drop_database_replica.reference
@@ -12,10 +12,18 @@ t
 2
 rdb_default 1 1 s1 r1 1
 2
+s1 r1 OK 2 0
+s2 r1 QUEUED 2 0
+s1 r2 QUEUED 2 0
+s1 r1 OK 2 0
+s2 r1 QUEUED 2 0
+s1 r2 QUEUED 2 0
 2
 rdb_default 1 1 s1 r1 1
 rdb_default 1 2 s1 r2 0
 2
 2
 t
+t2
+t3
 rdb_default_4 1 1 s1 r1 1
diff --git a/tests/queries/0_stateless/02447_drop_database_replica.sh b/tests/queries/0_stateless/02447_drop_database_replica.sh
index d5b3ceef46a..388af3fad74 100755
--- a/tests/queries/0_stateless/02447_drop_database_replica.sh
+++ b/tests/queries/0_stateless/02447_drop_database_replica.sh
@@ -32,6 +32,9 @@ $CLICKHOUSE_CLIENT -q "system sync database replica $db"
 $CLICKHOUSE_CLIENT -q "select cluster, shard_num, replica_num, database_shard_name, database_replica_name, is_active from system.clusters where cluster='$db' and shard_num=1 and replica_num=1"
 $CLICKHOUSE_CLIENT -q "system drop database replica 's1|r1' from database $db2" 2>&1| grep -Fac "is active, cannot drop it"
 
+$CLICKHOUSE_CLIENT --distributed_ddl_output_mode=throw_only_active -q "create table $db.t2 (n int) engine=Log"
+$CLICKHOUSE_CLIENT --distributed_ddl_output_mode=null_status_on_timeout_only_active -q "create table $db.t3 (n int) engine=Log"
+
 $CLICKHOUSE_CLIENT -q "detach database $db3"
 $CLICKHOUSE_CLIENT -q "system drop database replica 'r1' from shard 's2' from database $db"
 $CLICKHOUSE_CLIENT -q "attach database $db3" 2>/dev/null
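
A minimal usage sketch of the two new modes, mirroring the stateless test above. The database
name `rdb_default` and tables t2/t3 are taken from the test's reference output; the scenario of
one replica being stopped is assumed for illustration and is not part of the patch itself:

    -- With the pre-existing 'throw' mode, DDL on a Replicated database waits for every replica,
    -- so a stopped replica makes the query wait until distributed_ddl_task_timeout expires and
    -- then TIMEOUT_EXCEEDED is thrown.

    -- New mode: wait only for replicas that currently have an ephemeral 'active' node in ZooKeeper.
    SET distributed_ddl_output_mode = 'throw_only_active';
    CREATE TABLE rdb_default.t2 (n int) ENGINE=Log;

    -- Same waiting rule, but hosts that time out produce NULL status rows instead of an exception.
    SET distributed_ddl_output_mode = 'null_status_on_timeout_only_active';
    CREATE TABLE rdb_default.t3 (n int) ENGINE=Log;

Either way the DDL entry is still queued for the offline replicas (the QUEUED rows in the
02447 reference file above); the new modes only stop the client from waiting for them.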