Merge pull request #11523 from ClickHouse/fixing_01079_tests

Fixing test and benign race condition during table drop
2024-11-27 18:12:02 +00:00 · 2020-06-09 02:02:39 +03:00 · 2020-06-09 02:02:39 +03:00 · 91d06c7e4f
commit 91d06c7e4f
parent f0f1d4b250 ce73b30505
4 changed files with 32 additions and 14 deletions
--- a/src/Storages/StorageMergeTree.cpp
+++ b/src/Storages/StorageMergeTree.cpp
@ -141,16 +141,6 @@ void StorageMergeTree::shutdown()
        mutation_wait_event.notify_all();
    }

-    try
-    {
-        clearOldPartsFromFilesystem(true);
-    }
-    catch (...)
-    {
-        /// Example: the case of readonly filesystem, we have failure removing old parts.
-        /// Should not prevent table shutdown.
-        tryLogCurrentException(log);
-    }

    merger_mutator.merges_blocker.cancelForever();
    parts_mover.moves_blocker.cancelForever();
@ -160,6 +150,23 @@ void StorageMergeTree::shutdown()

    if (moving_task_handle)
        global_context.getBackgroundMovePool().removeTask(moving_task_handle);
+
+
+    try
+    {
+        /// We clear all old parts after stopping all background operations.
+        /// It's important, because background operations can produce temporary
+        /// parts which will remove themselves in their destructors. If so, we
+        /// may have a race condition between our remove call and a background
+        /// process.
+        clearOldPartsFromFilesystem(true);
+    }
+    catch (...)
+    {
+        /// Example: the case of readonly filesystem, we have failure removing old parts.
+        /// Should not prevent table shutdown.
+        tryLogCurrentException(log);
+    }
 }


--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@ -2961,7 +2961,6 @@ void StorageReplicatedMergeTree::startup()

 void StorageReplicatedMergeTree::shutdown()
 {
-    clearOldPartsFromFilesystem(true);
    /// Cancel fetches, merges and mutations to force the queue_task to finish ASAP.
    fetcher.blocker.cancelForever();
    merger_mutator.merges_blocker.cancelForever();
@ -2997,6 +2996,12 @@ void StorageReplicatedMergeTree::shutdown()
        std::unique_lock lock(data_parts_exchange_endpoint->rwlock);
    }
    data_parts_exchange_endpoint.reset();
+
+    /// We clear all old parts after stopping all background operations. It's
+    /// important, because background operations can produce temporary parts
+    /// which will remove themselves in their destructors. If so, we may have
+    /// a race condition between our remove call and a background process.
+    clearOldPartsFromFilesystem(true);
 }


--- a/tests/queries/0_stateless/01079_parallel_alter_modify_zookeeper.sh
+++ b/tests/queries/0_stateless/01079_parallel_alter_modify_zookeeper.sh
@ -100,8 +100,14 @@ wait

 echo "Finishing alters"

-# This alter will finish all previous, but replica 1 maybe still not up-to-date
-while [[ $(timeout 30 $CLICKHOUSE_CLIENT --query "ALTER TABLE concurrent_alter_mt_1 MODIFY COLUMN value1 String SETTINGS replication_alter_partitions_sync=2" 2>&1) ]]; do
+# This alter will finish all previous ones, but replica 1 may still not be
+# up-to-date. If the query throws something, then we sleep 1 second and retry.
+# If a timeout happens, we silently leave the loop and will probably fail the
+# test in the following for loop.
+#
+# 120 seconds is more than enough, but in rare cases for slow builds (debug,
+# thread) it may be necessary.
+while [[ $(timeout 120 $CLICKHOUSE_CLIENT --query "ALTER TABLE concurrent_alter_mt_1 MODIFY COLUMN value1 String SETTINGS replication_alter_partitions_sync=2" 2>&1) ]]; do
    sleep 1
 done