Replace part_committed_locally_but_zookeeper with retries

2024-11-28 10:31:57 +00:00 · 2023-11-20 12:34:07 +01:00 · 2023-11-20 12:34:07 +01:00 · 8217915064
commit 8217915064
parent 20eb5d3251
6 changed files with 67 additions and 58 deletions
--- a/src/Common/FailPoint.cpp
+++ b/src/Common/FailPoint.cpp
@ -28,13 +28,14 @@ static struct InitFiu
 /// We should define different types of failpoints here. There are four types of them:
 /// - ONCE: the failpoint will only be triggered once.
-/// - REGULAR: the failpoint will always be triggered util disableFailPoint is called.
+/// - REGULAR: the failpoint will always be triggered until disableFailPoint is called.
 /// - PAUSEABLE_ONCE: the failpoint will be blocked one time when pauseFailPoint is called, until disableFailPoint is called.
 /// - PAUSEABLE: the failpoint will be blocked every time when pauseFailPoint is called, until disableFailPoint is called.
 #define APPLY_FOR_FAILPOINTS(ONCE, REGULAR, PAUSEABLE_ONCE, PAUSEABLE) \
    ONCE(replicated_merge_tree_commit_zk_fail_after_op) \
    ONCE(replicated_merge_tree_insert_quorum_fail_0) \
    REGULAR(replicated_merge_tree_commit_zk_fail_when_recovering_from_hw_fault) \
    REGULAR(use_delayed_remote_source) \
    REGULAR(cluster_discovery_faults) \
    REGULAR(check_table_query_delay_for_part) \
--- a/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.cpp
+++ b/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.cpp
@ -47,7 +47,7 @@ void ZooKeeperWithFaultInjection::injectFailureBeforeOperationThrow(const char *
 void ZooKeeperWithFaultInjection::injectFailureAfterOperationThrow(const char * func_name, const String & path)
 {
-    if (unlikely(fault_policy) && fault_policy->beforeOperation())
+    if (unlikely(fault_policy) && fault_policy->afterOperation())
    {
        keeper.reset();
        if (logger)
--- a/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h
+++ b/src/Common/ZooKeeper/ZooKeeperWithFaultInjection.h
@ -30,7 +30,7 @@ public:
    bool beforeOperation()
    {
-        if (distribution(rndgen) || must_fail_before_op)
+        if (must_fail_before_op || distribution(rndgen))
        {
            must_fail_before_op = false;
            return true;
@ -40,7 +40,7 @@ public:
    bool afterOperation()
    {
-        if (distribution(rndgen) || must_fail_after_op)
+        if (must_fail_after_op || distribution(rndgen))
        {
            must_fail_after_op = false;
            return true;
--- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
@ -29,6 +29,7 @@ namespace FailPoints
 {
    extern const char replicated_merge_tree_commit_zk_fail_after_op[];
    extern const char replicated_merge_tree_insert_quorum_fail_0[];
    extern const char replicated_merge_tree_commit_zk_fail_when_recovering_from_hw_fault[];
 }
 namespace ErrorCodes
@ -568,9 +569,7 @@ std::pair<std::vector<String>, bool> ReplicatedMergeTreeSinkImpl<async_insert>::
    bool is_already_existing_part = false;
    /// for retries due to keeper error
    bool part_committed_locally_but_zookeeper = false;
    bool part_was_deduplicated = false;
    Coordination::Error write_part_info_keeper_error = Coordination::Error::ZOK;
    std::vector<String> conflict_block_ids;
    ZooKeeperRetriesControl retries_ctl("commitPart", zookeeper_retries_info, context->getProcessListElement());
@ -588,40 +587,12 @@ std::pair<std::vector<String>, bool> ReplicatedMergeTreeSinkImpl<async_insert>::
            /// For example during RESTORE REPLICA.
            if (!writing_existing_part)
            {
                /// We have lost connection to all keepers but it might be recovered, so we use setUserError to keep retrying
                retries_ctl.setUserError(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode: replica_path={}", storage.replica_path);
                return;
            }
        }
        if (retries_ctl.isRetry())
        {
            /// If we are retrying, check if last iteration was actually successful,
            /// we could get network error on committing part to zk
            /// but the operation could be completed by zk server
            /// If this flag is true, then part is in Active state, and we'll not retry anymore
            /// we only check if part was committed to zk and return success or failure correspondingly
            /// Note: if commit to zk failed then cleanup thread will mark the part as Outdated later
            if (part_committed_locally_but_zookeeper)
            {
                /// check that info about the part was actually written in zk
                if (zookeeper->exists(fs::path(storage.replica_path) / "parts" / part->name))
                {
                    LOG_DEBUG(log, "Part was successfully committed on previous iteration: part_id={}", part->name);
                }
                else
                {
                    retries_ctl.setUserError(
                        ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR,
                        "Insert failed due to zookeeper error. Please retry. Reason: {}",
                        write_part_info_keeper_error);
                }
                retries_ctl.stopRetries();
                return;
            }
        }
        /// Obtain incremental block number and lock it. The lock holds our intention to add the block to the filesystem.
        /// We remove the lock just after renaming the part. In case of exception, block number will be marked as abandoned.
        /// Also, make deduplication check. If a duplicate is detected, no nodes are created.
@ -944,32 +915,49 @@ std::pair<std::vector<String>, bool> ReplicatedMergeTreeSinkImpl<async_insert>::
        }
        else if (Coordination::isHardwareError(multi_code))
        {
-            write_part_info_keeper_error = multi_code;
+            LOG_TRACE(
-            /** If the connection is lost, and we do not know if the changes were applied, we can not delete the local part
+                    log, "Insert of part {} failed when committing to keeper (Reason: {}). Attempting to recover it", part->name, multi_code);
-             *  if the changes were applied, the inserted block appeared in `/blocks/`, and it can not be inserted again.
+            ZooKeeperRetriesControl new_retry_controller = retries_ctl;
             */
            transaction.commit();
-            /// Setting this flag is point of no return
+            /// We are going to try to verify if the transaction was written into keeper
-            /// On next retry, we'll just check if actually operation succeed or failed
+            /// If we fail to do so (keeper unavailable) then we don't know if the changes were applied or not so
-            /// and return ok or error correspondingly
+            /// we can't delete the local part, as if the changes were applied then inserted block appeared in
-            part_committed_locally_but_zookeeper = true;
+            /// `/blocks/`, and it can not be inserted again.
            new_retry_controller.actionAfterLastFailedRetry([&]
            {
                transaction.commit();
                storage.enqueuePartForCheck(part->name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);
                throw Exception(ErrorCodes::UNKNOWN_STATUS_OF_INSERT,
                        "Unknown status of part {} (Reason: {}). Data was written locally but we don't know the status in keeper. It will be verified in ~{} seconds.",
                        part->name, multi_code, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);
            });
            new_retry_controller.requestUnconditionalRetry();
-            /// if all retries will be exhausted by accessing zookeeper on fresh retry -> we'll add committed part to queue in the action
+            bool node_exists = false;
-            /// here lambda capture part name, it's ok since we'll not generate new one for this insert,
+            new_retry_controller.retryLoop([&]
-            /// see comments around 'part_committed_locally_but_zookeeper' flag
+            {
-            retries_ctl.actionAfterLastFailedRetry(
+                fiu_do_on(FailPoints::replicated_merge_tree_commit_zk_fail_when_recovering_from_hw_fault, { zookeeper->forceFailureBeforeOperation(); });
-                [&my_storage = storage, part_name = part->name]
+                zookeeper->setKeeper(storage.getZooKeeper());
-                {
+                node_exists = zookeeper->exists(fs::path(storage.replica_path) / "parts" / part->name);
-                    my_storage.enqueuePartForCheck(part_name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);
+            });
                });
-            /// We do not know whether or not data has been inserted.
+            if (node_exists)
-            retries_ctl.setUserError(
+            {
-                ErrorCodes::UNKNOWN_STATUS_OF_INSERT,
+                LOG_TRACE(log, "Insert of part {} recovered from keeper successfully. It will be committed", part->name);
-                "Unknown status, client must retry. Reason: {}",
+                part->new_part_was_committed_to_zookeeper_after_rename_on_disk = true;
-                multi_code);
+                transaction.commit();
-            return;
+                storage.merge_selecting_task->schedule();
            }
            else
            {
                /// The recovery check found no `parts/<part name>` node under the replica path,
                /// so the commit did not reach keeper. Report it through setUserError on the outer
                /// retry controller; the error text carries the original keeper error code.
                LOG_TRACE(log, "Insert of part {} was not committed to keeper. Will try again with a new block", part->name);
                rename_part_to_temporary();
                retries_ctl.setUserError(
                    ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR,
                    "Insert of part {} failed when committing to keeper (Reason: {})",
                    part->name,
                    multi_code);
            }
        }
        else if (Coordination::isUserError(multi_code))
        {
--- a/tests/queries/0_stateless/02919_insert_meet_eternal_hardware_error.reference
+++ b/tests/queries/0_stateless/02919_insert_meet_eternal_hardware_error.reference
--- a/tests/queries/0_stateless/02919_insert_meet_eternal_hardware_error.sql
+++ b/tests/queries/0_stateless/02919_insert_meet_eternal_hardware_error.sql
@ -0,0 +1,20 @@
 -- Tags: zookeeper, no-parallel
 -- Checks that an INSERT reports UNKNOWN_STATUS_OF_INSERT when the commit to keeper hits a
 -- hardware error and the follow-up check of the part in keeper keeps failing as well.
 DROP TABLE IF EXISTS t_hardware_error NO DELAY;
 CREATE TABLE t_hardware_error (
    KeyID UInt32
 ) Engine = ReplicatedMergeTree('/clickhouse/tables/{shard}/{database}/t_async_insert_dedup', '{replica}')
 ORDER BY (KeyID);
 insert into t_hardware_error values (1), (2), (3), (4), (5);
 -- Fail the commit once, and fail every attempt to verify the part in keeper afterwards.
 system enable failpoint replicated_merge_tree_commit_zk_fail_after_op;
 system enable failpoint replicated_merge_tree_commit_zk_fail_when_recovering_from_hw_fault;
 insert into t_hardware_error values (6), (7), (8), (9), (10); -- {serverError UNKNOWN_STATUS_OF_INSERT}
 -- Disable exactly the failpoints that were enabled above (names must match).
 system disable failpoint replicated_merge_tree_commit_zk_fail_after_op;
 system disable failpoint replicated_merge_tree_commit_zk_fail_when_recovering_from_hw_fault;
 DROP TABLE t_hardware_error NO DELAY;