Merge pull request #23294 from ClickHouse/fix_create_drop_replica_race

Fix race between DROP/CREATE ReplicatedMergeTree
This commit is contained in:
alexey-milovidov 2021-04-21 16:06:15 +03:00 committed by GitHub
commit 6c5ff3db7d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 72 additions and 74 deletions

View File

@ -585,42 +585,24 @@ bool StorageReplicatedMergeTree::createTableIfNotExists(const StorageMetadataPtr
/// This is Ok because another replica is definitely going to drop the table. /// This is Ok because another replica is definitely going to drop the table.
LOG_WARNING(log, "Removing leftovers from table {} (this might take several minutes)", zookeeper_path); LOG_WARNING(log, "Removing leftovers from table {} (this might take several minutes)", zookeeper_path);
String drop_lock_path = zookeeper_path + "/dropped/lock";
Coordination::Error code = zookeeper->tryCreate(drop_lock_path, "", zkutil::CreateMode::Ephemeral);
Strings children; if (code == Coordination::Error::ZNONODE || code == Coordination::Error::ZNODEEXISTS)
Coordination::Error code = zookeeper->tryGetChildren(zookeeper_path, children);
if (code == Coordination::Error::ZNONODE)
{ {
LOG_WARNING(log, "Table {} is already finished removing by another replica right now", replica_path); LOG_WARNING(log, "The leftovers from table {} were removed by another replica", zookeeper_path);
}
else
{
for (const auto & child : children)
if (child != "dropped")
zookeeper->tryRemoveRecursive(zookeeper_path + "/" + child);
Coordination::Requests ops;
Coordination::Responses responses;
ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/dropped", -1));
ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path, -1));
code = zookeeper->tryMulti(ops, responses);
if (code == Coordination::Error::ZNONODE)
{
LOG_WARNING(log, "Table {} is already finished removing by another replica right now", replica_path);
}
else if (code == Coordination::Error::ZNOTEMPTY)
{
throw Exception(fmt::format(
"The old table was not completely removed from ZooKeeper, {} still exists and may contain some garbage. But it should never happen according to the logic of operations (it's a bug).", zookeeper_path), ErrorCodes::LOGICAL_ERROR);
} }
else if (code != Coordination::Error::ZOK) else if (code != Coordination::Error::ZOK)
{ {
/// It is still possible that ZooKeeper session is expired or server is killed in the middle of the delete operation. throw Coordination::Exception(code, drop_lock_path);
zkutil::KeeperMultiException::check(code, ops, responses);
} }
else else
{ {
LOG_WARNING(log, "The leftovers from table {} was successfully removed from ZooKeeper", zookeeper_path); auto metadata_drop_lock = zkutil::EphemeralNodeHolder::existing(drop_lock_path, *zookeeper);
if (!removeTableNodesFromZooKeeper(zookeeper, zookeeper_path, metadata_drop_lock, log))
{
/// Someone is recursively removing table right now, we cannot create new table until old one is removed
continue;
} }
} }
} }
@ -633,10 +615,6 @@ bool StorageReplicatedMergeTree::createTableIfNotExists(const StorageMetadataPtr
Coordination::Requests ops; Coordination::Requests ops;
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent));
/// Check that the table is not being dropped right now.
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/dropped", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/dropped", -1));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata_str, ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata_str,
zkutil::CreateMode::Persistent)); zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", metadata_snapshot->getColumns().toString(), ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", metadata_snapshot->getColumns().toString(),
@ -824,10 +802,18 @@ void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, con
* because table creation is executed in single transaction that will conflict with remaining nodes. * because table creation is executed in single transaction that will conflict with remaining nodes.
*/ */
/// Node /dropped works like a lock that protects from concurrent removal of old table and creation of new table.
/// But recursive removal may fail in the middle of operation leaving some garbage in zookeeper_path, so
/// we remove it on table creation if there is /dropped node. Creating thread may remove /dropped node created by
/// removing thread, and it causes race condition if removing thread is not finished yet.
/// To avoid this we also create ephemeral child before starting recursive removal.
/// (The existence of child node does not allow to remove parent node).
Coordination::Requests ops; Coordination::Requests ops;
Coordination::Responses responses; Coordination::Responses responses;
String drop_lock_path = zookeeper_path + "/dropped/lock";
ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/replicas", -1)); ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/replicas", -1));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/dropped", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/dropped", "", zkutil::CreateMode::Persistent));
ops.emplace_back(zkutil::makeCreateRequest(drop_lock_path, "", zkutil::CreateMode::Ephemeral));
Coordination::Error code = zookeeper->tryMulti(ops, responses); Coordination::Error code = zookeeper->tryMulti(ops, responses);
if (code == Coordination::Error::ZNONODE || code == Coordination::Error::ZNODEEXISTS) if (code == Coordination::Error::ZNONODE || code == Coordination::Error::ZNODEEXISTS)
@ -844,34 +830,41 @@ void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, con
} }
else else
{ {
auto metadata_drop_lock = zkutil::EphemeralNodeHolder::existing(drop_lock_path, *zookeeper);
LOG_INFO(logger, "Removing table {} (this might take several minutes)", zookeeper_path); LOG_INFO(logger, "Removing table {} (this might take several minutes)", zookeeper_path);
removeTableNodesFromZooKeeper(zookeeper, zookeeper_path, metadata_drop_lock, logger);
Strings children;
code = zookeeper->tryGetChildren(zookeeper_path, children);
if (code == Coordination::Error::ZNONODE)
{
LOG_WARNING(logger, "Table {} is already finished removing by another replica right now", remote_replica_path);
} }
else }
bool StorageReplicatedMergeTree::removeTableNodesFromZooKeeper(zkutil::ZooKeeperPtr zookeeper,
const String & zookeeper_path, const zkutil::EphemeralNodeHolder::Ptr & metadata_drop_lock, Poco::Logger * logger)
{ {
bool completely_removed = false;
Strings children;
Coordination::Error code = zookeeper->tryGetChildren(zookeeper_path, children);
if (code == Coordination::Error::ZNONODE)
throw Exception(ErrorCodes::LOGICAL_ERROR, "There is a race condition between creation and removal of replicated table. It's a bug");
for (const auto & child : children) for (const auto & child : children)
if (child != "dropped") if (child != "dropped")
zookeeper->tryRemoveRecursive(zookeeper_path + "/" + child); zookeeper->tryRemoveRecursive(zookeeper_path + "/" + child);
ops.clear(); Coordination::Requests ops;
responses.clear(); Coordination::Responses responses;
ops.emplace_back(zkutil::makeRemoveRequest(metadata_drop_lock->getPath(), -1));
ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/dropped", -1)); ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/dropped", -1));
ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path, -1)); ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path, -1));
code = zookeeper->tryMulti(ops, responses); code = zookeeper->tryMulti(ops, responses);
if (code == Coordination::Error::ZNONODE) if (code == Coordination::Error::ZNONODE)
{ {
LOG_WARNING(logger, "Table {} is already finished removing by another replica right now", remote_replica_path); throw Exception(ErrorCodes::LOGICAL_ERROR, "There is a race condition between creation and removal of replicated table. It's a bug");
} }
else if (code == Coordination::Error::ZNOTEMPTY) else if (code == Coordination::Error::ZNOTEMPTY)
{ {
LOG_ERROR(logger, "Table was not completely removed from ZooKeeper, {} still exists and may contain some garbage.", LOG_ERROR(logger, "Table was not completely removed from ZooKeeper, {} still exists and may contain some garbage,"
zookeeper_path); "but someone is removing it right now.", zookeeper_path);
} }
else if (code != Coordination::Error::ZOK) else if (code != Coordination::Error::ZOK)
{ {
@ -880,10 +873,12 @@ void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, con
} }
else else
{ {
metadata_drop_lock->setAlreadyRemoved();
completely_removed = true;
LOG_INFO(logger, "Table {} was successfully removed from ZooKeeper", zookeeper_path); LOG_INFO(logger, "Table {} was successfully removed from ZooKeeper", zookeeper_path);
} }
}
} return completely_removed;
} }

View File

@ -208,6 +208,10 @@ public:
*/ */
static void dropReplica(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, const String & replica, Poco::Logger * logger); static void dropReplica(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path, const String & replica, Poco::Logger * logger);
/// Removes table from ZooKeeper after the last replica was dropped
static bool removeTableNodesFromZooKeeper(zkutil::ZooKeeperPtr zookeeper, const String & zookeeper_path,
const zkutil::EphemeralNodeHolder::Ptr & metadata_drop_lock, Poco::Logger * logger);
/// Get job to execute in background pool (merge, mutate, drop range and so on) /// Get job to execute in background pool (merge, mutate, drop range and so on)
std::optional<JobAndPool> getDataProcessingJob() override; std::optional<JobAndPool> getDataProcessingJob() override;

View File

@ -11,11 +11,10 @@ function thread()
while true; do while true; do
$CLICKHOUSE_CLIENT -n -q "DROP TABLE IF EXISTS test_table_$1 SYNC; $CLICKHOUSE_CLIENT -n -q "DROP TABLE IF EXISTS test_table_$1 SYNC;
CREATE TABLE test_table_$1 (a UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r_$1') ORDER BY tuple();" 2>&1 | CREATE TABLE test_table_$1 (a UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/alter_table', 'r_$1') ORDER BY tuple();" 2>&1 |
grep -vP '(^$)|(^Received exception from server)|(^\d+\. )|because the last replica of the table was dropped right now|is already started to be removing by another replica right now|is already finished removing by another replica right now|Removing leftovers from table|Another replica was suddenly created|was successfully removed from ZooKeeper|was created by another server at the same moment|was suddenly removed|some other replicas were created at the same time' grep -vP '(^$)|(^Received exception from server)|(^\d+\. )|because the last replica of the table was dropped right now|is already started to be removing by another replica right now| were removed by another replica|Removing leftovers from table|Another replica was suddenly created|was created by another server at the same moment|was suddenly removed|some other replicas were created at the same time'
done done
} }
# https://stackoverflow.com/questions/9954794/execute-a-shell-function-with-timeout # https://stackoverflow.com/questions/9954794/execute-a-shell-function-with-timeout
export -f thread; export -f thread;