From 9bc98771f79391a8b5ab9fbb927c189e26bf4297 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 19 Apr 2021 11:21:42 +0300 Subject: [PATCH 1/3] fix race on replica creation --- src/Storages/StorageReplicatedMergeTree.cpp | 138 +++++++++--------- src/Storages/StorageReplicatedMergeTree.h | 4 + .../01305_replica_create_drop_zookeeper.sh | 3 +- 3 files changed, 71 insertions(+), 74 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 10061af22e7..e545c1d0c9e 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -585,42 +585,24 @@ bool StorageReplicatedMergeTree::createTableIfNotExists(const StorageMetadataPtr /// This is Ok because another replica is definitely going to drop the table. LOG_WARNING(log, "Removing leftovers from table {} (this might take several minutes)", zookeeper_path); + String drop_lock_path = zookeeper_path + "/dropped/lock"; + Coordination::Error code = zookeeper->tryCreate(drop_lock_path, "", zkutil::CreateMode::Ephemeral); - Strings children; - Coordination::Error code = zookeeper->tryGetChildren(zookeeper_path, children); - if (code == Coordination::Error::ZNONODE) + if (code == Coordination::Error::ZNONODE || code == Coordination::Error::ZNODEEXISTS) { - LOG_WARNING(log, "Table {} is already finished removing by another replica right now", replica_path); + LOG_WARNING(log, "The leftovers from table {} were removed by another replica", zookeeper_path); + } + else if (code != Coordination::Error::ZOK) + { + throw Coordination::Exception(code, drop_lock_path); } else { - for (const auto & child : children) - if (child != "dropped") - zookeeper->tryRemoveRecursive(zookeeper_path + "/" + child); - - Coordination::Requests ops; - Coordination::Responses responses; - ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/dropped", -1)); - ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path, -1)); - code = zookeeper->tryMulti(ops, responses); - - if (code == Coordination::Error::ZNONODE) + auto drop_lock = zkutil::EphemeralNodeHolder::existing(drop_lock_path, *zookeeper); + if (!removeTableNodesFromZooKeeper(zookeeper, zookeeper_path, drop_lock, log)) { - LOG_WARNING(log, "Table {} is already finished removing by another replica right now", replica_path); - } - else if (code == Coordination::Error::ZNOTEMPTY) - { - throw Exception(fmt::format( - "The old table was not completely removed from ZooKeeper, {} still exists and may contain some garbage. But it should never happen according to the logic of operations (it's a bug).", zookeeper_path), ErrorCodes::LOGICAL_ERROR); - } - else if (code != Coordination::Error::ZOK) - { - /// It is still possible that ZooKeeper session is expired or server is killed in the middle of the delete operation. - zkutil::KeeperMultiException::check(code, ops, responses); - } - else - { - LOG_WARNING(log, "The leftovers from table {} was successfully removed from ZooKeeper", zookeeper_path); + /// Someone is recursively removing table right now, we cannot create new table until old one is removed + continue; } } } @@ -633,10 +615,6 @@ bool StorageReplicatedMergeTree::createTableIfNotExists(const StorageMetadataPtr Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent)); - /// Check that the table is not being dropped right now. - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/dropped", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/dropped", -1)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata_str, zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", metadata_snapshot->getColumns().toString(), @@ -824,10 +802,17 @@ void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, con * because table creation is executed in single transaction that will conflict with remaining nodes. */ + /// Node /dropped works like a lock that protects from concurrent removal of old table and creation of new table. + /// But recursive removal may fail in the middle of operation leaving some garbage in zookeeper_path, so + /// we remove it on table creation if there is /dropped node. Creating thread may remove /dropped node created by + /// removing thread, and it causes race condition if removing thread is not finished yet. + /// To avoid this we also create ephemeral child before starting recursive removal. Coordination::Requests ops; Coordination::Responses responses; + String drop_lock_path = zookeeper_path + "/dropped/lock"; ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/replicas", -1)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/dropped", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(drop_lock_path, "", zkutil::CreateMode::Ephemeral)); Coordination::Error code = zookeeper->tryMulti(ops, responses); if (code == Coordination::Error::ZNONODE || code == Coordination::Error::ZNODEEXISTS) @@ -844,48 +829,57 @@ void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, con } else { + auto drop_lock = zkutil::EphemeralNodeHolder::existing(drop_lock_path, *zookeeper); LOG_INFO(logger, "Removing table {} (this might take several minutes)", zookeeper_path); - - Strings children; - code = zookeeper->tryGetChildren(zookeeper_path, children); - if (code == Coordination::Error::ZNONODE) - { - LOG_WARNING(logger, "Table {} is already finished removing by another replica right now", remote_replica_path); - } - else - { - for (const auto & child : children) - if (child != "dropped") - zookeeper->tryRemoveRecursive(zookeeper_path + "/" + child); - - ops.clear(); - responses.clear(); - ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/dropped", -1)); - ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path, -1)); - code = zookeeper->tryMulti(ops, responses); - - if (code == Coordination::Error::ZNONODE) - { - LOG_WARNING(logger, "Table {} is already finished removing by another replica right now", remote_replica_path); - } - else if (code == Coordination::Error::ZNOTEMPTY) - { - LOG_ERROR(logger, "Table was not completely removed from ZooKeeper, {} still exists and may contain some garbage.", - zookeeper_path); - } - else if (code != Coordination::Error::ZOK) - { - /// It is still possible that ZooKeeper session is expired or server is killed in the middle of the delete operation. - zkutil::KeeperMultiException::check(code, ops, responses); - } - else - { - LOG_INFO(logger, "Table {} was successfully removed from ZooKeeper", zookeeper_path); - } - } + removeTableNodesFromZooKeeper(zookeeper, zookeeper_path, drop_lock, logger); } } +bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeperPtr zookeeper, + const String & zookeeper_path, const zkutil::EphemeralNodeHolder::Ptr & drop_lock, Poco::Logger * logger) +{ + bool completely_removed = false; + Strings children; + Coordination::Error code = zookeeper->tryGetChildren(zookeeper_path, children); + if (code == Coordination::Error::ZNONODE) + throw Exception(ErrorCodes::LOGICAL_ERROR, "There is a race condition between creation and removal of replicated table. It's a bug"); + + + for (const auto & child : children) + if (child != "dropped") + zookeeper->tryRemoveRecursive(zookeeper_path + "/" + child); + + Coordination::Requests ops; + Coordination::Responses responses; + ops.emplace_back(zkutil::makeRemoveRequest(drop_lock->getPath(), -1)); + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/dropped", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path, -1)); + code = zookeeper->tryMulti(ops, responses); + + if (code == Coordination::Error::ZNONODE) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "There is a race condition between creation and removal of replicated table. It's a bug"); + } + else if (code == Coordination::Error::ZNOTEMPTY) + { + LOG_ERROR(logger, "Table was not completely removed from ZooKeeper, {} still exists and may contain some garbage," + "but someone is removing it right now.", zookeeper_path); + } + else if (code != Coordination::Error::ZOK) + { + /// It is still possible that ZooKeeper session is expired or server is killed in the middle of the delete operation. + zkutil::KeeperMultiException::check(code, ops, responses); + } + else + { + drop_lock->setAlreadyRemoved(); + completely_removed = true; + LOG_INFO(logger, "Table {} was successfully removed from ZooKeeper", zookeeper_path); + } + + return completely_removed; +} + /** Verify that list of columns and table storage_settings_ptr match those specified in ZK (/metadata). * If not, throw an exception. diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 9122bdafbf0..4fd5a129758 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -208,6 +208,10 @@ public: */ static void dropReplica(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, const String & replica, Poco::Logger * logger); + /// Removes table from ZooKeeper after the last replica was dropped + static bool removeTableNodesFromZooKeeper(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, + const zkutil::EphemeralNodeHolder::Ptr & drop_lock, Poco::Logger * logger); + /// Get job to execute in background pool (merge, mutate, drop range and so on) std::optional getDataProcessingJob() override; diff --git a/tests/queries/0_stateless/01305_replica_create_drop_zookeeper.sh b/tests/queries/0_stateless/01305_replica_create_drop_zookeeper.sh index e7b8091284a..6248813c9ba 100755 --- a/tests/queries/0_stateless/01305_replica_create_drop_zookeeper.sh +++ b/tests/queries/0_stateless/01305_replica_create_drop_zookeeper.sh @@ -11,11 +11,10 @@ function thread() while true; do $CLICKHOUSE_CLIENT -n -q "DROP TABLE IF EXISTS test_table_$1 SYNC; CREATE TABLE test_table_$1 (a UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r_$1') ORDER BY tuple();" 2>&1 | - grep -vP '(^$)|(^Received exception from server)|(^\d+\. )|because the last replica of the table was dropped right now|is already started to be removing by another replica right now|is already finished removing by another replica right now|Removing leftovers from table|Another replica was suddenly created|was successfully removed from ZooKeeper|was created by another server at the same moment|was suddenly removed|some other replicas were created at the same time' + grep -vP '(^$)|(^Received exception from server)|(^\d+\. )|because the last replica of the table was dropped right now|is already started to be removing by another replica right now| were removed by another replica|Removing leftovers from table|Another replica was suddenly created|was created by another server at the same moment|was suddenly removed|some other replicas were created at the same time' done } - # https://stackoverflow.com/questions/9954794/execute-a-shell-function-with-timeout export -f thread; From ecc625692ebb26bda48715386a603c78b68070aa Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 19 Apr 2021 13:40:20 +0300 Subject: [PATCH 2/3] fix build --- src/Storages/StorageReplicatedMergeTree.cpp | 14 +++++++------- src/Storages/StorageReplicatedMergeTree.h | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index e545c1d0c9e..90563f61376 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -598,8 +598,8 @@ bool StorageReplicatedMergeTree::createTableIfNotExists(const StorageMetadataPtr } else { - auto drop_lock = zkutil::EphemeralNodeHolder::existing(drop_lock_path, *zookeeper); - if (!removeTableNodesFromZooKeeper(zookeeper, zookeeper_path, drop_lock, log)) + auto metadata_drop_lock = zkutil::EphemeralNodeHolder::existing(drop_lock_path, *zookeeper); + if (!removeTableNodesFromZooKeeper(zookeeper, zookeeper_path, metadata_drop_lock, log)) { /// Someone is recursively removing table right now, we cannot create new table until old one is removed continue; @@ -829,14 +829,14 @@ void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, con } else { - auto drop_lock = zkutil::EphemeralNodeHolder::existing(drop_lock_path, *zookeeper); + auto metadata_drop_lock = zkutil::EphemeralNodeHolder::existing(drop_lock_path, *zookeeper); LOG_INFO(logger, "Removing table {} (this might take several minutes)", zookeeper_path); - removeTableNodesFromZooKeeper(zookeeper, zookeeper_path, drop_lock, logger); + removeTableNodesFromZooKeeper(zookeeper, zookeeper_path, metadata_drop_lock, logger); } } bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeperPtr zookeeper, - const String & zookeeper_path, const zkutil::EphemeralNodeHolder::Ptr & drop_lock, Poco::Logger * logger) + const String & zookeeper_path, const zkutil::EphemeralNodeHolder::Ptr & metadata_drop_lock, Poco::Logger * logger) { bool completely_removed = false; Strings children; @@ -851,7 +851,7 @@ bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeper Coordination::Requests ops; Coordination::Responses responses; - ops.emplace_back(zkutil::makeRemoveRequest(drop_lock->getPath(), -1)); + ops.emplace_back(zkutil::makeRemoveRequest(metadata_drop_lock->getPath(), -1)); ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/dropped", -1)); ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path, -1)); code = zookeeper->tryMulti(ops, responses); @@ -872,7 +872,7 @@ bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeper } else { - drop_lock->setAlreadyRemoved(); + metadata_drop_lock->setAlreadyRemoved(); completely_removed = true; LOG_INFO(logger, "Table {} was successfully removed from ZooKeeper", zookeeper_path); } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 4fd5a129758..c70556f40df 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -210,7 +210,7 @@ public: /// Removes table from ZooKeeper after the last replica was dropped static bool removeTableNodesFromZooKeeper(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, - const zkutil::EphemeralNodeHolder::Ptr & drop_lock, Poco::Logger * logger); + const zkutil::EphemeralNodeHolder::Ptr & metadata_drop_lock, Poco::Logger * logger); /// Get job to execute in background pool (merge, mutate, drop range and so on) std::optional getDataProcessingJob() override; From 34b30d80d4616a97d1a377f669146de3cacf6e18 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 21 Apr 2021 16:01:54 +0300 Subject: [PATCH 3/3] Update StorageReplicatedMergeTree.cpp --- src/Storages/StorageReplicatedMergeTree.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 90563f61376..3b4a1ec4e16 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -807,6 +807,7 @@ void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, con /// we remove it on table creation if there is /dropped node. Creating thread may remove /dropped node created by /// removing thread, and it causes race condition if removing thread is not finished yet. /// To avoid this we also create ephemeral child before starting recursive removal. + /// (The existence of child node does not allow to remove parent node). Coordination::Requests ops; Coordination::Responses responses; String drop_lock_path = zookeeper_path + "/dropped/lock";