diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index a9b7fec41e7..4f2ad823c3a 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -919,6 +919,9 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk( new_data_part->version.setCreationTID(Tx::PrehistoricTID, nullptr); new_data_part->is_temp = true; + /// In case of a replicated merge tree with zero-copy replication, + /// ClickHouse claims here that this new part can be deleted in the temporary state without unlocking the blobs. + /// The blobs have to stay intact: this temporary part does not own them and does not share them yet. new_data_part->remove_tmp_policy = IMergeTreeDataPart::BlobsRemovalPolicyForTemporaryParts::PRESERVE_BLOBS; new_data_part->modification_time = time(nullptr); new_data_part->loadColumnsChecksumsIndexes(true, false); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 634e275bd64..ff40c1da8d1 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -218,14 +218,20 @@ public: /// FIXME Why do we need this flag? What's difference from Temporary and DeleteOnDestroy state? Can we get rid of this? bool is_temp = false; + /// This type and the field remove_tmp_policy are used as a hint + /// to help avoid communication with Keeper when a temporary part is being deleted. + /// The common procedure is to send Keeper an unlock request to release the references to the blobs + /// and then, following Keeper's answer, decide whether to remove or preserve the blobs of that part in S3. + /// However, in some special cases ClickHouse can make the decision without asking Keeper. 
enum class BlobsRemovalPolicyForTemporaryParts { + /// The decision about removing the blobs is made by Keeper; this is the common case. ASK_KEEPER, + /// Set when ClickHouse is sure that the blobs in the part belong only to it; other replicas have not seen them yet. REMOVE_BLOBS, + /// Set when ClickHouse is sure that the blobs belong to another replica and the current replica has not locked them in S3 yet. PRESERVE_BLOBS, }; - /// That field is used by replicated merge tree with zero copy replication - /// Usually the data has to bo unlocked in keeper unless explicitly otherwise stated BlobsRemovalPolicyForTemporaryParts remove_tmp_policy = BlobsRemovalPolicyForTemporaryParts::ASK_KEEPER; /// If true it means that there are no ZooKeeper node for this part, so it should be deleted only from filesystem diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 2248b08b1da..e58a9fc283c 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -209,6 +209,9 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() global_ctx->new_data_part->uuid = global_ctx->future_part->uuid; global_ctx->new_data_part->partition.assign(global_ctx->future_part->getPartition()); global_ctx->new_data_part->is_temp = global_ctx->parent_part == nullptr; + /// In case of a replicated merge tree with zero-copy replication, + /// ClickHouse claims here that this new part can be deleted in the temporary state without unlocking the blobs. + /// The blobs have to be removed along with the part: this temporary part owns them and does not share them yet. 
global_ctx->new_data_part->remove_tmp_policy = IMergeTreeDataPart::BlobsRemovalPolicyForTemporaryParts::REMOVE_BLOBS; ctx->need_remove_expired_values = false; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index eeabaa9e145..ec141f594bc 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -8282,6 +8282,9 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::createEmptyPart( new_data_part->minmax_idx = std::move(minmax_idx); new_data_part->is_temp = true; + /// In case of a replicated merge tree with zero-copy replication, + /// ClickHouse claims here that this new part can be deleted in the temporary state without unlocking the blobs. + /// The blobs have to be removed along with the part: this temporary part owns them and does not share them yet. new_data_part->remove_tmp_policy = IMergeTreeDataPart::BlobsRemovalPolicyForTemporaryParts::REMOVE_BLOBS; auto new_data_part_storage = new_data_part->getDataPartStoragePtr(); diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index b5b982948b7..db486b163eb 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -469,6 +469,9 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( new_data_part->partition = std::move(partition); new_data_part->minmax_idx = std::move(minmax_idx); new_data_part->is_temp = true; + /// In case of a replicated merge tree with zero-copy replication, + /// ClickHouse claims here that this new part can be deleted in the temporary state without unlocking the blobs. + /// The blobs have to be removed along with the part: this temporary part owns them and does not share them yet. 
new_data_part->remove_tmp_policy = IMergeTreeDataPart::BlobsRemovalPolicyForTemporaryParts::REMOVE_BLOBS; SyncGuardPtr sync_guard; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index b2c2b4a96c0..b12dbadf1ed 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1838,6 +1838,9 @@ bool MutateTask::prepare() if (!isWidePart(ctx->source_part) || !isFullPartStorage(ctx->source_part->getDataPartStorage()) || (ctx->interpreter && ctx->interpreter->isAffectingAllColumns())) { + /// In case of a replicated merge tree with zero-copy replication, + /// ClickHouse claims here that this new part can be deleted in the temporary state without unlocking the blobs. + /// The blobs have to be removed along with the part: this temporary part owns them and does not share them yet. ctx->new_data_part->remove_tmp_policy = IMergeTreeDataPart::BlobsRemovalPolicyForTemporaryParts::REMOVE_BLOBS; task = std::make_unique(ctx); @@ -1867,6 +1870,10 @@ bool MutateTask::prepare() ctx->for_file_renames, ctx->mrk_extension); + /// In case of a replicated merge tree with zero-copy replication, + /// ClickHouse has to follow the common procedure here when deleting the new part in the temporary state. + /// Some of the files within the blobs are shared with the source part, some belong only to this part. + /// Keeper has to be sent an unlock request to release the references to the blobs. ctx->new_data_part->remove_tmp_policy = IMergeTreeDataPart::BlobsRemovalPolicyForTemporaryParts::ASK_KEEPER; task = std::make_unique(ctx);