commit e4becc01ba (parent 7efa76f08d)

make stages commit
@@ -3769,25 +3769,6 @@ void MergeTreeData::removePartsFromWorkingSet(MergeTreeTransaction * txn, const
        resetObjectColumnsFromActiveParts(acquired_lock);
}

void MergeTreeData::removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove)
{
    auto lock = lockParts();

    for (const auto & part : remove)
    {
        auto it_part = data_parts_by_info.find(part->info);
        if (it_part == data_parts_by_info.end())
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} not found in data_parts", part->getNameWithState());

        assert(part->getState() == MergeTreeDataPartState::PreActive);

        modifyPartState(part, MergeTreeDataPartState::Temporary);
        /// Erase immediately
        LOG_TEST(log, "removePartsFromWorkingSetImmediatelyAndSetTemporaryState: removing {} from data_parts_indexes", part->getNameWithState());
        data_parts_indexes.erase(it_part);
    }
}

void MergeTreeData::removePartsFromWorkingSet(
    MergeTreeTransaction * txn, const DataPartsVector & remove, bool clear_without_timeout, DataPartsLock * acquired_lock)
{
@@ -6300,24 +6281,6 @@ MergeTreeData::Transaction::Transaction(MergeTreeData & data_, MergeTreeTransact
    data.transactions_enabled.store(true);
}

void MergeTreeData::Transaction::rollbackPartsToTemporaryState()
{
    if (!isEmpty())
    {
        WriteBufferFromOwnString buf;
        buf << " Rolling back parts state to temporary and removing from working set:";
        for (const auto & part : precommitted_parts)
            buf << " " << part->getDataPartStorage().getPartDirectory();
        buf << ".";
        LOG_DEBUG(data.log, "Undoing transaction.{}", buf.str());

        data.removePartsFromWorkingSetImmediatelyAndSetTemporaryState(
            DataPartsVector(precommitted_parts.begin(), precommitted_parts.end()));
    }

    clear();
}

TransactionID MergeTreeData::Transaction::getTID() const
{
    if (txn)
@@ -260,10 +260,6 @@ public:

        void rollback(DataPartsLock * lock = nullptr);

        /// Immediately remove parts from the table's data_parts set and change the part
        /// state to temporary. Useful for new parts which are not present in the table.
        void rollbackPartsToTemporaryState();

        size_t size() const { return precommitted_parts.size(); }
        bool isEmpty() const { return precommitted_parts.empty(); }

@@ -601,11 +597,6 @@ public:
        DataPartsLock & lock,
        DataPartsVector * out_covered_parts = nullptr);

    /// Remove parts from the working set immediately (without waiting for a background
    /// process). Transfers the part state to temporary. Has very limited usage, only
    /// for new parts which aren't already present in the table.
    void removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove);

    /// Removes parts from the working set parts.
    /// Parts in add must already be in data_parts with PreActive, Active, or Outdated states.
    /// If clear_without_timeout is true, the parts will be deleted at once, or during the next call to
@@ -543,6 +543,20 @@ bool ReplicatedMergeTreeSinkImpl<false>::writeExistingPart(MergeTreeData::Mutabl
    }
}

template<bool async_insert>
std::vector<String> ReplicatedMergeTreeSinkImpl<async_insert>::detectConflictsInAsyncBlockIDs(const std::vector<String> & ids)
{
    auto conflict_block_ids = storage.async_block_ids_cache.detectConflicts(ids, cache_version);
    if (!conflict_block_ids.empty())
    {
        cache_version = 0;
    }
    return conflict_block_ids;
}

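A minimal usage sketch of the new helper, assuming the surrounding sink code where `block_id` is the vector of async block ids for the current chunk: a non-empty result means this commit attempt must be abandoned and the conflicting ids handed back to the caller (the helper has already reset `cache_version` to 0, so the cache is re-read on the next attempt).

    /// Sketch only: prefilter async block ids through the in-memory cache
    /// before taking any Keeper locks; bail out early on a cached conflict.
    std::vector<String> conflicts = detectConflictsInAsyncBlockIDs(block_id);
    if (!conflicts.empty())
        return {conflicts, /*part_was_deduplicated=*/ false};
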
namespace
{

bool contains(const std::vector<String> & block_ids, const String & path)
{
    for (const auto & local_block_id : block_ids)
@@ -556,6 +570,56 @@ bool contains(const String & block_ids, const String & path)
    return block_ids == path;
}

String getBlockIdPath(const String & zookeeper_path, const String & block_id)
{
    if (!block_id.empty())
        return zookeeper_path + "/blocks/" + block_id;
    return String();
}

std::vector<String> getBlockIdPath(const String & zookeeper_path, const std::vector<String> & block_id)
{
    std::vector<String> result;
    result.reserve(block_id.size());
    for (const auto & single_block_id : block_id)
        result.push_back(zookeeper_path + "/async_blocks/" + single_block_id);
    return result;
}

}

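A hedged illustration of the two getBlockIdPath overloads above (table path and ids are made up): a plain insert with deduplication maps its single block id under /blocks, while an async insert maps every id under /async_blocks. An empty sync block id yields an empty path, which downstream code treats as deduplication disabled.

    /// Illustrative values only.
    String sync_path = getBlockIdPath("/clickhouse/tables/t1", "id_sync");
    ///  -> "/clickhouse/tables/t1/blocks/id_sync"
    std::vector<String> async_paths = getBlockIdPath("/clickhouse/tables/t1", {"id_a", "id_b"});
    ///  -> {"/clickhouse/tables/t1/async_blocks/id_a", "/clickhouse/tables/t1/async_blocks/id_b"}
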
struct CommitRetryContext
{
    enum Stages
    {
        LOCK_BLOCK,
        COMMIT,
        DUPLICATED_PART,
        UNCERTAIN_COMMIT,
        SUCCESS,
        ERROR
    };

    /// Possible ways:
    /// LOCK_BLOCK -> COMMIT
    /// LOCK_BLOCK -> DUPLICATED_PART
    /// DUPLICATED_PART -> SUCCESS
    /// COMMIT -> SUCCESS
    /// COMMIT -> DUPLICATED_PART
    /// COMMIT -> UNCERTAIN_COMMIT
    /// UNCERTAIN_COMMIT -> SUCCESS
    /// * -> ERROR

    Stages stage = LOCK_BLOCK;

    String actual_part_name;
    std::optional<EphemeralLockInZooKeeper> block_number_lock;
    size_t block_number = 0;
    std::vector<String> conflict_block_ids;
    bool part_was_deduplicated = false;
    Coordination::Error uncertain_keeper_error = Coordination::Error::ZOK;
};

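The comment block above is effectively the edge list of the commit state machine. A hypothetical checker (not part of this commit) makes the same table executable, e.g. for asserting in tests that the retry loop never takes an undocumented transition:

    /// Hypothetical helper, mirroring the "Possible ways" table above.
    static bool isAllowedTransition(CommitRetryContext::Stages from, CommitRetryContext::Stages to)
    {
        using C = CommitRetryContext;
        if (to == C::ERROR)
            return true;    /// * -> ERROR
        switch (from)
        {
            case C::LOCK_BLOCK:       return to == C::COMMIT || to == C::DUPLICATED_PART;
            case C::COMMIT:           return to == C::SUCCESS || to == C::DUPLICATED_PART || to == C::UNCERTAIN_COMMIT;
            case C::DUPLICATED_PART:  return to == C::SUCCESS;
            case C::UNCERTAIN_COMMIT: return to == C::SUCCESS;
            default:                  return false;    /// SUCCESS and ERROR are terminal
        }
    }
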
template<bool async_insert>
std::pair<std::vector<String>, bool> ReplicatedMergeTreeSinkImpl<async_insert>::commitPart(
    const ZooKeeperWithFaultInjectionPtr & zookeeper,
@@ -570,18 +634,407 @@ std::pair<std::vector<String>, bool> ReplicatedMergeTreeSinkImpl<async_insert>::
    ///
    /// metadata_snapshot->check(part->getColumns());

    const String temporary_part_relative_path = part->getDataPartStorage().getPartDirectory();
    auto block_id_path = getBlockIdPath(storage.zookeeper_path, block_id);

    /// for retries due to keeper error
    bool part_committed_locally_but_zookeeper = false;
    bool part_was_deduplicated = false;
    Coordination::Error write_part_info_keeper_error = Coordination::Error::ZOK;
    std::vector<String> conflict_block_ids;
    CommitRetryContext retry_context;

    ZooKeeperRetriesControl retries_ctl("commitPart", zookeeper_retries_info, context->getProcessListElement());

    auto resolve_duplicate_stage = [&] () -> bool
    {
        if constexpr (async_insert)
        {
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                            "Conflict block ids and block number lock should not "
                            "be empty at the same time for async inserts");
        }
        else
        {
            /// This block was already written to some replica. Get the part name for it.
            /// Note: race condition with DROP PARTITION operation is possible. User will get "No node" exception and it is Ok.
            retry_context.actual_part_name = zookeeper->get(block_id_path);

            bool exists_locally = bool(storage.getActiveContainingPart(retry_context.actual_part_name));

            if (exists_locally)
                ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks);

            LOG_INFO(log, "Block with ID {} {} as part {}; ignoring it.",
                     block_id,
                     exists_locally ? "already exists locally" : "already exists on other replicas",
                     retry_context.actual_part_name);

            retry_context.part_was_deduplicated = true;

            retry_context.stage = CommitRetryContext::SUCCESS;
            return false;
        }
    };

    auto resolve_uncertain_commit_stage = [&] ()
    {
        /// check that info about the part was actually written in zk
        if (zookeeper->exists(fs::path(storage.replica_path) / "parts" / retry_context.actual_part_name))
        {
            LOG_DEBUG(log, "Part was successfully committed on previous iteration: part_id={}", part->name);
            retry_context.stage = CommitRetryContext::SUCCESS;
            return false;
        }
        else
        {
            /// If all retries are exhausted by accessing zookeeper on a fresh retry, we'll add the committed part to the queue in the action.
            /// The lambda captures the part name here; that is ok since we won't generate a new one for this insert.
            retries_ctl.actionAfterLastFailedRetry(
                [&] ()
                {
                    storage.enqueuePartForCheck(retry_context.actual_part_name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);
                });

            retries_ctl.setUserError(
                ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR,
                "Insert failed due to zookeeper error. Please retry. Reason: {}",
                retry_context.uncertain_keeper_error);

            return false;
        }
    };

    auto lock_part_number_stage = [&] ()
    {
        if constexpr (async_insert)
        {
            /// prefilter by cache
            retry_context.conflict_block_ids = detectConflictsInAsyncBlockIDs(block_id);
            if (!retry_context.conflict_block_ids.empty())
            {
                retry_context.stage = CommitRetryContext::ERROR;
                return false;
            }
        }

        /// Obtain incremental block number and lock it. The lock holds our intention to add the block to the filesystem.
        /// We remove the lock just after renaming the part. In case of exception, block number will be marked as abandoned.
        /// Also, make deduplication check. If a duplicate is detected, no nodes are created.

        /// Allocate new block number and check for duplicates
        retry_context.block_number_lock = storage.allocateBlockNumber(part->info.partition_id, zookeeper, block_id_path); /// 1 RTT
        ThreadFuzzer::maybeInjectSleep();

        if (!retry_context.block_number_lock)
        {
            retry_context.stage = CommitRetryContext::DUPLICATED_PART;
            return true;
        }

        if constexpr (async_insert)
        {
            /// The truth is that we always get only one path from block_number_lock.
            /// This is a restriction of Keeper. Here I would like to use a vector because
            /// I want to keep extensibility for future optimization, for instance, using
            /// the cache to resolve conflicts in advance.
            String conflict_path = retry_context.block_number_lock->getConflictPath();
            if (!conflict_path.empty())
            {
                LOG_TRACE(log, "Cannot get lock, the conflict path is {}", conflict_path);
                retry_context.conflict_block_ids.push_back(conflict_path);

                retry_context.stage = CommitRetryContext::ERROR;
                return false;
            }
        }

        retry_context.block_number = retry_context.block_number_lock->getNumber();

        /// Set part attributes according to part_number.
        part->info.min_block = retry_context.block_number;
        part->info.max_block = retry_context.block_number;
        part->info.level = 0;
        part->info.mutation = 0;

        part->setName(part->getNewName(part->info));
        retry_context.actual_part_name = part->name;

        retry_context.stage = CommitRetryContext::COMMIT;
        return true;
    };

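For orientation, a hedged sketch of the naming rule applied at the end of lock_part_number_stage: a new-style part name is "<partition_id>_<min_block>_<max_block>_<level>", so with min_block == max_block == the allocated block number and level == 0, actual_part_name is fully determined by the lock. The helper below is hypothetical; the real logic lives in part->getNewName(part->info).

    /// Sketch only; illustrates the name produced for a freshly inserted part.
    String sketchNewPartName(const String & partition_id, Int64 block_number)
    {
        return partition_id + "_" + toString(block_number)
                            + "_" + toString(block_number) + "_0";
    }
    /// sketchNewPartName("202401", 42) == "202401_42_42_0"
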
    auto get_quorum_ops = [&] (Coordination::Requests & ops)
    {
        /** If we need a quorum - create a node in which the quorum is monitored.
          * (If such a node already exists, then someone has managed to make another quorum record at the same time,
          * but for it the quorum has not yet been reached.
          * You can not do the next quorum record at this time.)
          */
        if (isQuorumEnabled())
        {
            ReplicatedMergeTreeQuorumEntry quorum_entry;
            quorum_entry.part_name = part->name;
            quorum_entry.required_number_of_replicas = getQuorumSize(replicas_num);
            quorum_entry.replicas.insert(storage.replica_name);

            /** At this point, this node will contain information that the current replica received a part.
              * When other replicas will receive this part (in the usual way, processing the replication log),
              * they will add themselves to the contents of this node.
              * When it contains information about `quorum` number of replicas, this node is deleted,
              * which indicates that the quorum has been reached.
              */

            quorum_info.status_path = storage.zookeeper_path + "/quorum/status";
            if (quorum_parallel)
                quorum_info.status_path = storage.zookeeper_path + "/quorum/parallel/" + retry_context.actual_part_name;

            ops.emplace_back(
                zkutil::makeCreateRequest(
                    quorum_info.status_path,
                    quorum_entry.toString(),
                    zkutil::CreateMode::Persistent));

            /// Make sure that during the insertion time, the replica was not reinitialized or disabled (when the server is finished).
            ops.emplace_back(
                zkutil::makeCheckRequest(
                    storage.replica_path + "/is_active",
                    quorum_info.is_active_node_version));

            /// Unfortunately, just checking the above is not enough, because `is_active`
            /// node can be deleted and reappear with the same version.
            /// But then the `host` value will change. We will check this.
            /// It's great that these two nodes change in the same transaction (see MergeTreeRestartingThread).
            ops.emplace_back(
                zkutil::makeCheckRequest(
                    storage.replica_path + "/host",
                    quorum_info.host_node_version));
        }
    };

    auto get_logs_ops = [&] (Coordination::Requests & ops)
    {
        ReplicatedMergeTreeLogEntryData log_entry;

        if (is_attach)
        {
            log_entry.type = ReplicatedMergeTreeLogEntry::ATTACH_PART;

            /// We don't need to involve ZooKeeper to obtain checksums as by the time we get
            /// MutableDataPartPtr here, we already have the data thus being able to
            /// calculate the checksums.
            log_entry.part_checksum = part->checksums.getTotalChecksumHex();
        }
        else
            log_entry.type = ReplicatedMergeTreeLogEntry::GET_PART;

        log_entry.create_time = time(nullptr);
        log_entry.source_replica = storage.replica_name;
        log_entry.new_part_name = part->name;
        /// TODO maybe add UUID here as well?
        log_entry.quorum = getQuorumSize(replicas_num);
        log_entry.new_part_format = part->getFormat();

        if constexpr (!async_insert)
            log_entry.block_id = block_id;

        /// Prepare an entry for log.
        ops.emplace_back(zkutil::makeCreateRequest(
            storage.zookeeper_path + "/log/log-",
            log_entry.toString(),
            zkutil::CreateMode::PersistentSequential));
    };

    auto commit_new_part_stage = [&] ()
    {
        chassert(retry_context.block_number_lock.has_value());

        /// Prepare transaction to ZooKeeper
        /// It will simultaneously add information about the part to all the necessary places in ZooKeeper and remove block_number_lock.
        Coordination::Requests ops;

        /// Deletes the information that the block number is used for writing.
        size_t block_unlock_op_idx = ops.size();
        retry_context.block_number_lock->getUnlockOp(ops);

        get_logs_ops(ops);
        get_quorum_ops(ops);

        size_t shared_lock_ops_id_begin = ops.size();
        storage.getLockSharedDataOps(*part, zookeeper, /*replace_zero_copy_lock*/ false, {}, ops);
        size_t shared_lock_op_id_end = ops.size();

        storage.getCommitPartOps(ops, part, block_id_path);

        /// It's important to create it outside of lock scope because
        /// otherwise it can lock parts in destructor and deadlock is possible.
        MergeTreeData::Transaction transaction(storage, NO_TRANSACTION_RAW); /// If you can not add a part to ZK, we'll remove it back from the working set.
        try
        {
            auto lock = storage.lockParts();
            storage.renameTempPartAndAdd(part, transaction, lock);
        }
        catch (const Exception & e)
        {
            if (e.code() == ErrorCodes::DUPLICATE_DATA_PART || e.code() == ErrorCodes::PART_IS_TEMPORARILY_LOCKED)
            {
                throw Exception(ErrorCodes::LOGICAL_ERROR,
                                "Part with name {} is already written by concurrent request."
                                " It should not happen for non-duplicate data parts because unique names are assigned for them. It's a bug",
                                part->name);
            }

            throw;
        }

        ThreadFuzzer::maybeInjectSleep();

        fiu_do_on(FailPoints::replicated_merge_tree_commit_zk_fail_after_op,
        {
            if (!zookeeper->fault_policy)
            {
                zookeeper->logger = log;
                zookeeper->fault_policy = std::make_unique<RandomFaultInjection>(0, 0);
            }
            zookeeper->fault_policy->must_fail_after_op = true;
        });

        Coordination::Responses responses;
        Coordination::Error multi_code = zookeeper->tryMultiNoThrow(ops, responses); /// 1 RTT

        if (multi_code == Coordination::Error::ZOK)
        {
            auto sleep_before_commit_local_part_in_replicated_table_ms = storage.getSettings()->sleep_before_commit_local_part_in_replicated_table_ms;
            if (sleep_before_commit_local_part_in_replicated_table_ms.totalMilliseconds())
            {
                LOG_INFO(log, "committing part {}, triggered sleep_before_commit_local_part_in_replicated_table_ms {}",
                         part->name, sleep_before_commit_local_part_in_replicated_table_ms.totalMilliseconds());
                sleepForMilliseconds(sleep_before_commit_local_part_in_replicated_table_ms.totalMilliseconds());
            }

            part->new_part_was_committed_to_zookeeper_after_rename_on_disk = true;
            transaction.commit();

            /// Lock nodes have been already deleted, do not delete them in destructor
            retry_context.block_number_lock->assumeUnlocked();

            retry_context.stage = CommitRetryContext::SUCCESS;
            return false;
        }

        if (Coordination::isHardwareError(multi_code))
        {
            retry_context.uncertain_keeper_error = multi_code;

            /** If the connection is lost, and we do not know if the changes were applied, we can not delete the local part:
              * if the changes were applied, the inserted block appeared in `/blocks/`, and it can not be inserted again.
              */
            transaction.commit();

            retry_context.stage = CommitRetryContext::UNCERTAIN_COMMIT;
            return true;
        }

        if (!Coordination::isUserError(multi_code))
        {
            transaction.rollback();
            throw Exception(
                ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR,
                "Unexpected ZooKeeper error while adding block {} with ID '{}': {}",
                retry_context.block_number,
                toString(block_id),
                multi_code);
        }

        auto failed_op_idx = zkutil::getFailedOpIndex(multi_code, responses);
        String failed_op_path = ops[failed_op_idx]->getPath();

        if (multi_code == Coordination::Error::ZNODEEXISTS && !block_id_path.empty() && contains(block_id_path, failed_op_path))
        {
            /// A block with the same id has just appeared in the table (or on another replica), roll back the insertion.
            LOG_INFO(log, "Block with ID {} already exists (it has just appeared) for part {}. Ignore it.",
                     toString(block_id), part->name);

            transaction.rollback();

            if constexpr (async_insert)
            {
                retry_context.conflict_block_ids = std::vector<String>({failed_op_path});
                LOG_TRACE(log, "conflict when committing, the conflict block ids are {}",
                          toString(retry_context.conflict_block_ids));
            }

            retry_context.stage = CommitRetryContext::DUPLICATED_PART;
            return true;
        }
        else if (multi_code == Coordination::Error::ZNONODE && failed_op_idx == block_unlock_op_idx)
        {
            transaction.rollback();
            throw Exception(ErrorCodes::QUERY_WAS_CANCELLED,
                            "Insert query (for block {}) was canceled by concurrent ALTER PARTITION or TRUNCATE",
                            retry_context.block_number_lock->getPath());
        }
        else if (shared_lock_ops_id_begin <= failed_op_idx && failed_op_idx < shared_lock_op_id_end)
        {
            transaction.rollback();
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                            "Creating shared lock for part {} has failed with error: {}. It's a bug.",
                            part->name, multi_code);
        }
        else if (multi_code == Coordination::Error::ZNODEEXISTS && failed_op_path == quorum_info.status_path)
        {
            transaction.rollback();
            throw Exception(ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE, "Another quorum insert has been already started");
        }
        else
        {
            transaction.rollback();
            throw Exception(
                ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR,
                "Unexpected logical error while adding block {} with ID '{}': {}, path {}",
                retry_context.block_number,
                toString(block_id),
                multi_code,
                failed_op_path);
        }
    };

    auto stage_switcher = [&] ()
    {
        try
        {
            switch (retry_context.stage)
            {
                case CommitRetryContext::LOCK_BLOCK:
                    return lock_part_number_stage();

                case CommitRetryContext::COMMIT:
                    return commit_new_part_stage();

                case CommitRetryContext::DUPLICATED_PART:
                    return resolve_duplicate_stage();

                case CommitRetryContext::UNCERTAIN_COMMIT:
                    return resolve_uncertain_commit_stage();

                case CommitRetryContext::SUCCESS:
                    throw Exception(ErrorCodes::LOGICAL_ERROR,
                                    "Operation has already succeeded.");

                case CommitRetryContext::ERROR:
                    throw Exception(ErrorCodes::LOGICAL_ERROR,
                                    "Operation is already in the error state.");
            }
        }
        catch (const zkutil::KeeperException &)
        {
            throw;
        }
        catch (DB::Exception &)
        {
            retry_context.stage = CommitRetryContext::ERROR;
            throw;
        }
    };

    retries_ctl.retryLoop([&]()
    {
        zookeeper->setKeeper(storage.getZooKeeper());

        if (storage.is_readonly)
        {
            /// stop retries if in shutdown
@@ -598,374 +1051,31 @@ std::pair<std::vector<String>, bool> ReplicatedMergeTreeSinkImpl<async_insert>::
            }
        }

        if (retries_ctl.isRetry())
        while (stage_switcher())
        {
            /// If we are retrying, check if the last iteration was actually successful:
            /// we could get a network error on committing the part to zk,
            /// but the operation could be completed by the zk server.

            /// If this flag is true, then the part is in Active state, and we'll not retry anymore;
            /// we only check if the part was committed to zk and return success or failure correspondingly.
            /// Note: if the commit to zk failed then the cleanup thread will mark the part as Outdated later.
            if (part_committed_locally_but_zookeeper)
            {
                /// check that info about the part was actually written in zk
                if (zookeeper->exists(fs::path(storage.replica_path) / "parts" / part->name))
                {
                    LOG_DEBUG(log, "Part was successfully committed on previous iteration: part_id={}", part->name);
                }
                else
                {
                    retries_ctl.setUserError(
                        ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR,
                        "Insert failed due to zookeeper error. Please retry. Reason: {}",
                        write_part_info_keeper_error);
                }

                retries_ctl.stopRetries();
                return;
            }
            // no op;
        }

        /// Obtain incremental block number and lock it. The lock holds our intention to add the block to the filesystem.
        /// We remove the lock just after renaming the part. In case of exception, block number will be marked as abandoned.
        /// Also, make deduplication check. If a duplicate is detected, no nodes are created.

        /// Allocate new block number and check for duplicates
        bool deduplicate_block = !block_id.empty();
        BlockIDsType block_id_path;
        if constexpr (async_insert)
        {
            /// prefilter by cache
            conflict_block_ids = storage.async_block_ids_cache.detectConflicts(block_id, cache_version);
            if (!conflict_block_ids.empty())
            {
                cache_version = 0;
                return;
            }
            for (const auto & single_block_id : block_id)
                block_id_path.push_back(storage.zookeeper_path + "/async_blocks/" + single_block_id);
        }
        else if (deduplicate_block)
            block_id_path = storage.zookeeper_path + "/blocks/" + block_id;
        auto block_number_lock = storage.allocateBlockNumber(part->info.partition_id, zookeeper, block_id_path);
        ThreadFuzzer::maybeInjectSleep();

        /// Prepare transaction to ZooKeeper
        /// It will simultaneously add information about the part to all the necessary places in ZooKeeper and remove block_number_lock.
        Coordination::Requests ops;

        Int64 block_number = 0;
        size_t block_unlock_op_idx = std::numeric_limits<size_t>::max();
        if (block_number_lock)
        {
            if constexpr (async_insert)
            {
                /// The truth is that we always get only one path from block_number_lock.
                /// This is a restriction of Keeper. Here I would like to use a vector because
                /// I want to keep extensibility for future optimization, for instance, using
                /// the cache to resolve conflicts in advance.
                String conflict_path = block_number_lock->getConflictPath();
                if (!conflict_path.empty())
                {
                    LOG_TRACE(log, "Cannot get lock, the conflict path is {}", conflict_path);
                    conflict_block_ids.push_back(conflict_path);
                    return;
                }
            }
            block_number = block_number_lock->getNumber();

            /// Set part attributes according to part_number. Prepare an entry for log.

            part->info.min_block = block_number;
            part->info.max_block = block_number;
            part->info.level = 0;
            part->info.mutation = 0;

            part->setName(part->getNewName(part->info));

            StorageReplicatedMergeTree::LogEntry log_entry;

            if (is_attach)
            {
                log_entry.type = StorageReplicatedMergeTree::LogEntry::ATTACH_PART;

                /// We don't need to involve ZooKeeper to obtain checksums as by the time we get
                /// MutableDataPartPtr here, we already have the data thus being able to
                /// calculate the checksums.
                log_entry.part_checksum = part->checksums.getTotalChecksumHex();
            }
            else
                log_entry.type = StorageReplicatedMergeTree::LogEntry::GET_PART;

            log_entry.create_time = time(nullptr);
            log_entry.source_replica = storage.replica_name;
            log_entry.new_part_name = part->name;
            /// TODO maybe add UUID here as well?
            log_entry.quorum = getQuorumSize(replicas_num);
            log_entry.new_part_format = part->getFormat();

            if constexpr (!async_insert)
                log_entry.block_id = block_id;

            ops.emplace_back(zkutil::makeCreateRequest(
                storage.zookeeper_path + "/log/log-",
                log_entry.toString(),
                zkutil::CreateMode::PersistentSequential));

            /// Deletes the information that the block number is used for writing.
            block_unlock_op_idx = ops.size();
            block_number_lock->getUnlockOp(ops);

            /** If we need a quorum - create a node in which the quorum is monitored.
              * (If such a node already exists, then someone has managed to make another quorum record at the same time,
              * but for it the quorum has not yet been reached.
              * You can not do the next quorum record at this time.)
              */
            if (isQuorumEnabled())
            {
                ReplicatedMergeTreeQuorumEntry quorum_entry;
                quorum_entry.part_name = part->name;
                quorum_entry.required_number_of_replicas = getQuorumSize(replicas_num);
                quorum_entry.replicas.insert(storage.replica_name);

                /** At this point, this node will contain information that the current replica received a part.
                  * When other replicas will receive this part (in the usual way, processing the replication log),
                  * they will add themselves to the contents of this node.
                  * When it contains information about `quorum` number of replicas, this node is deleted,
                  * which indicates that the quorum has been reached.
                  */

                if (quorum_parallel)
                    quorum_info.status_path = storage.zookeeper_path + "/quorum/parallel/" + part->name;

                ops.emplace_back(
                    zkutil::makeCreateRequest(
                        quorum_info.status_path,
                        quorum_entry.toString(),
                        zkutil::CreateMode::Persistent));

                /// Make sure that during the insertion time, the replica was not reinitialized or disabled (when the server is finished).
                ops.emplace_back(
                    zkutil::makeCheckRequest(
                        storage.replica_path + "/is_active",
                        quorum_info.is_active_node_version));

                /// Unfortunately, just checking the above is not enough, because `is_active`
                /// node can be deleted and reappear with the same version.
                /// But then the `host` value will change. We will check this.
                /// It's great that these two nodes change in the same transaction (see MergeTreeRestartingThread).
                ops.emplace_back(
                    zkutil::makeCheckRequest(
                        storage.replica_path + "/host",
                        quorum_info.host_node_version));
            }
        }
        else if constexpr (!async_insert)
        {
            /// This block was already written to some replica. Get the part name for it.
            /// Note: race condition with DROP PARTITION operation is possible. User will get "No node" exception and it is Ok.
            String existing_part_name = zookeeper->get(storage.zookeeper_path + "/blocks/" + block_id);

            bool exists_locally = bool(storage.getActiveContainingPart(existing_part_name));

            part_was_deduplicated = true;
            if (exists_locally)
                ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks);

            if (isQuorumEnabled())
            {
                if (quorum_parallel)
                    quorum_info.status_path = storage.zookeeper_path + "/quorum/parallel/" + existing_part_name;

                LOG_INFO(log, "Block with ID {} {} as part {}; ignoring it, but checking quorum.",
                         block_id,
                         exists_locally ? "already exists locally" : "already exists on other replicas",
                         existing_part_name);

                waitForQuorum(zookeeper, existing_part_name, quorum_info.status_path, quorum_info.is_active_node_version, replicas_num);
            }
            else
            {
                LOG_INFO(log, "Block with ID {} {} as part {}; ignoring it.",
                         block_id,
                         exists_locally ? "already exists locally" : "already exists on other replicas",
                         existing_part_name);
            }

            return;
        }
        /// async_insert will never return a null lock, because it needs the conflict path.
        else
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                            "Conflict block ids and block number lock should not "
                            "be empty at the same time for async inserts");

        chassert(!async_insert || block_number_lock);

        size_t shared_lock_op_idx_first = ops.size();
        storage.getLockSharedDataOps(*part, zookeeper, /*replace_zero_copy_lock*/ false, {}, ops);
        size_t shared_lock_op_idx_last = ops.size();

        storage.getCommitPartOps(ops, part, block_id_path);

        /// It's important to create it outside of lock scope because
        /// otherwise it can lock parts in destructor and deadlock is possible.
        MergeTreeData::Transaction transaction(storage, NO_TRANSACTION_RAW); /// If you can not add a part to ZK, we'll remove it back from the working set.

        try
        {
            auto lock = storage.lockParts();
            storage.renameTempPartAndAdd(part, transaction, lock);
        }
        catch (const Exception & e)
        {
            if (e.code() == ErrorCodes::DUPLICATE_DATA_PART
                || e.code() == ErrorCodes::PART_IS_TEMPORARILY_LOCKED)
            {
                throw Exception(ErrorCodes::LOGICAL_ERROR,
                                "Part with name {} is already written by concurrent request."
                                " It should not happen for non-duplicate data parts because unique names are assigned for them. It's a bug",
                                part->name);
            }

            throw;
        }

        ThreadFuzzer::maybeInjectSleep();

        fiu_do_on(FailPoints::replicated_merge_tree_commit_zk_fail_after_op,
        {
            if (!zookeeper->fault_policy)
            {
                zookeeper->logger = log;
                zookeeper->fault_policy = std::make_unique<RandomFaultInjection>(0, 0);
            }
            zookeeper->fault_policy->must_fail_after_op = true;
        });

        Coordination::Responses responses;
        Coordination::Error multi_code = zookeeper->tryMultiNoThrow(ops, responses); /// 1 RTT
        if (multi_code == Coordination::Error::ZOK)
        {
            auto sleep_before_commit_local_part_in_replicated_table_ms = storage.getSettings()->sleep_before_commit_local_part_in_replicated_table_ms;
            if (sleep_before_commit_local_part_in_replicated_table_ms.totalMilliseconds())
            {
                LOG_INFO(log, "committing part {}, triggered sleep_before_commit_local_part_in_replicated_table_ms {}",
                         part->name, sleep_before_commit_local_part_in_replicated_table_ms.totalMilliseconds());
                sleepForMilliseconds(sleep_before_commit_local_part_in_replicated_table_ms.totalMilliseconds());
            }

            part->new_part_was_committed_to_zookeeper_after_rename_on_disk = true;
            transaction.commit();
            storage.merge_selecting_task->schedule();

            /// Lock nodes have been already deleted, do not delete them in destructor
            block_number_lock->assumeUnlocked();

            if (isQuorumEnabled())
                waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_version, replicas_num);

            return;
        }

        if (Coordination::isHardwareError(multi_code))
        {
            write_part_info_keeper_error = multi_code;
            /** If the connection is lost, and we do not know if the changes were applied, we can not delete the local part:
              * if the changes were applied, the inserted block appeared in `/blocks/`, and it can not be inserted again.
              */
            transaction.commit();

            /// Setting this flag is a point of no return.
            /// On the next retry, we'll just check whether the operation actually succeeded or failed
            /// and return ok or error correspondingly.
            part_committed_locally_but_zookeeper = true;

            /// If all retries are exhausted by accessing zookeeper on a fresh retry, we'll add the committed part to the queue in the action.
            /// The lambda captures the part name here; that is ok since we won't generate a new one for this insert,
            /// see comments around the 'part_committed_locally_but_zookeeper' flag.
            retries_ctl.actionAfterLastFailedRetry(
                [&my_storage = storage, part_name = part->name]
                {
                    my_storage.enqueuePartForCheck(part_name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);
                });

            /// We do not know whether or not data has been inserted.
            retries_ctl.setUserError(
                ErrorCodes::UNKNOWN_STATUS_OF_INSERT,
                "Unknown status, client must retry. Reason: {}",
                multi_code);
            return;
        }

        if (!Coordination::isUserError(multi_code))
        {
            transaction.rollback();
            throw Exception(
                ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR,
                "Unexpected ZooKeeper error while adding block {} with ID '{}': {}",
                block_number,
                toString(block_id),
                multi_code);
        }

        auto failed_op_idx = zkutil::getFailedOpIndex(multi_code, responses);
        String failed_op_path = ops[zkutil::getFailedOpIndex(multi_code, responses)]->getPath();

        if (multi_code == Coordination::Error::ZNONODE && zkutil::getFailedOpIndex(multi_code, responses) == block_unlock_op_idx)
        {
            throw Exception(ErrorCodes::QUERY_WAS_CANCELLED,
                            "Insert query (for block {}) was canceled by concurrent ALTER PARTITION or TRUNCATE", block_number_lock->getPath());
        }
        else if (shared_lock_op_idx_first <= failed_op_idx && failed_op_idx < shared_lock_op_idx_last)
        {
            transaction.rollback();
            throw Exception(ErrorCodes::LOGICAL_ERROR,
                            "Creating shared lock for part {} has failed with error: {}. It's a bug.",
                            part->name, multi_code);
        }
        else if (multi_code == Coordination::Error::ZNODEEXISTS && deduplicate_block && contains(block_id_path, failed_op_path))
        {
            /// A block with the same id has just appeared in the table (or on another replica), roll back the insertion.
            LOG_INFO(log, "Block with ID {} already exists (it has just appeared) for part {}. Ignore it.",
                     toString(block_id), part->name);

            transaction.rollback();

            if constexpr (async_insert)
            {
                conflict_block_ids = std::vector<String>({failed_op_path});
                LOG_TRACE(log, "conflict when committing, the conflict block ids are {}", toString(conflict_block_ids));
            }

            return;
        }
        else if (multi_code == Coordination::Error::ZNODEEXISTS && failed_op_path == quorum_info.status_path)
        {
            transaction.rollback();
            throw Exception(ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE, "Another quorum insert has been already started");
        }
        else
        {
            transaction.rollback();
            throw Exception(
                ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR,
                "Unexpected logical error while adding block {} with ID '{}': {}, path {}",
                block_number,
                toString(block_id),
                multi_code,
                failed_op_path);
        }

    },
    [&zookeeper]() { zookeeper->cleanupEphemeralNodes(); });

    if (!conflict_block_ids.empty())
        return {conflict_block_ids, part_was_deduplicated};
    if (!retry_context.conflict_block_ids.empty())
        return {retry_context.conflict_block_ids, false};

    return {conflict_block_ids, part_was_deduplicated};
    if (retry_context.stage == CommitRetryContext::SUCCESS)
    {
        storage.merge_selecting_task->schedule();

        if (isQuorumEnabled())
        {
            quorum_info.status_path = storage.zookeeper_path + "/quorum/status";
            if (quorum_parallel)
                quorum_info.status_path = storage.zookeeper_path + "/quorum/parallel/" + retry_context.actual_part_name;

            waitForQuorum(zookeeper, retry_context.actual_part_name, quorum_info.status_path, quorum_info.is_active_node_version, replicas_num);
        }
    }

    return {retry_context.conflict_block_ids, retry_context.part_was_deduplicated};
}

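The pair returned by commitPart above is the contract the rest of the sink builds on: first the conflicting async block ids (non-empty only when an async insert must be split and retried), then whether the part was deduplicated. A hedged caller sketch; the argument names and order are assumptions based on the header fragment further below:

    /// Sketch only: argument list is assumed, not confirmed by this diff.
    auto [conflict_ids, was_deduplicated] = commitPart(zookeeper, part, block_id, replicas_num, /*writing_existing_part=*/ false);
    if (!conflict_ids.empty())
    {
        /// Async-insert conflict: split the chunk on these ids and retry.
    }
    else if (was_deduplicated)
    {
        /// The whole block was a duplicate; nothing new was committed.
    }
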
template<bool async_insert>

@@ -70,6 +70,8 @@ public:

    struct DelayedChunk;
private:
    std::vector<String> detectConflictsInAsyncBlockIDs(const std::vector<String> & ids);

    using BlockIDsType = std::conditional_t<async_insert, std::vector<String>, String>;

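A small hedged illustration of what the BlockIDsType alias resolves to (a pure type-level check, assuming <type_traits> and the String alias are in scope): async inserts carry many block ids per chunk, sync inserts carry exactly one.

    static_assert(std::is_same_v<std::conditional_t<true, std::vector<String>, String>,
                                 std::vector<String>>);
    static_assert(std::is_same_v<std::conditional_t<false, std::vector<String>, String>,
                                 String>);
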
    ZooKeeperRetriesInfo zookeeper_retries_info;
@@ -95,6 +97,7 @@ private:
        size_t replicas_num,
        bool writing_existing_part);

    /// Wait for the quorum to be satisfied on the path (quorum_path) for the part (part_name).
    /// Also checks that the replica is still alive.
    void waitForQuorum(
@@ -125,7 +128,6 @@ private:
    bool last_block_is_duplicate = false;
    UInt64 num_blocks_processed = 0;

    using Logger = Poco::Logger;
    Poco::Logger * log;

    ContextPtr context;