From 22da0420affcfd7854e9b5037c8de22bbe81fc2c Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Wed, 19 Oct 2022 20:26:16 +0000 Subject: [PATCH 1/7] fix possible restart errors after failed quorum insert --- .../MergeTree/ReplicatedMergeTreeSink.cpp | 10 +++++++++- ..._manual_write_to_replicas_quorum.reference | 20 +++++++++---------- .../01459_manual_write_to_replicas_quorum.sh | 20 +++++++++++++++++-- 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 0abea5977c3..feb3d9c3e9e 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -576,7 +576,15 @@ void ReplicatedMergeTreeSink::commitPart( else if (multi_code == Coordination::Error::ZNODEEXISTS && failed_op_path == quorum_info.status_path) { storage.unlockSharedData(*part); - transaction.rollback(); + + /// Part was not committed to keeper. + /// So make it temporary and remove immediately to avoid its resurrection after restart. + transaction.rollbackPartsToTemporaryState(); + + part->is_temp = true; + part->renameTo(temporary_part_relative_path, false, builder); + builder->commit(); + throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE); } else diff --git a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.reference b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.reference index 52dea650ebc..812fa4477cb 100644 --- a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.reference +++ b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.reference @@ -1,10 +1,10 @@ -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 +200 0 199 19900 +200 0 199 19900 +200 0 199 19900 +200 0 199 19900 +200 0 199 19900 +200 0 199 19900 +200 0 199 19900 +200 0 199 19900 +200 0 199 19900 +200 0 199 19900 diff --git a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.sh b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.sh index 6eabc9ae1b5..5a666454531 100755 --- a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.sh +++ b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.sh @@ -19,7 +19,7 @@ done valid_exceptions_to_retry='Quorum for previous write has not been satisfied yet|Another quorum insert has been already started|Unexpected logical error while adding block' -function thread { +function thread1 { for x in {0..99}; do while true; do $CLICKHOUSE_CLIENT --insert_quorum 5 --insert_quorum_parallel 0 --query "INSERT INTO r$1 SELECT $x" 2>&1 | grep -qE "$valid_exceptions_to_retry" || break @@ -28,7 +28,23 @@ function thread { } for i in $(seq 1 $NUM_REPLICAS); do - thread $i & + thread1 $i & +done + +wait + +function thread2 { + for x in {100..199}; do + while true; do + $CLICKHOUSE_CLIENT --query "DETACH TABLE r$1" + $CLICKHOUSE_CLIENT --query "ATTACH TABLE r$1" + $CLICKHOUSE_CLIENT --insert_quorum 5 --insert_quorum_parallel 0 --query "INSERT INTO r$1 SELECT $x" 2>&1 | grep -qE "$valid_exceptions_to_retry" || break + done + done +} + +for i in $(seq 1 $NUM_REPLICAS); do + thread2 $i & done wait From 3204f7353bfd062344d4e86c46e45c37f798a278 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Mon, 26 Dec 2022 20:49:04 +0000 Subject: [PATCH 2/7] Fix: handle exceptions from unlockSharedData() during error handling + renamePartToTemporary() lambda to reused code --- .../MergeTree/ReplicatedMergeTreeSink.cpp | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 487e80a3989..c1f29ce5b96 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -539,7 +539,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( /// /// metadata_snapshot->check(part->getColumns()); - String temporary_part_relative_path = part->getDataPartStorage().getPartDirectory(); + const String temporary_part_relative_path = part->getDataPartStorage().getPartDirectory(); /// There is one case when we need to retry transaction in a loop. /// But don't do it too many times - just as defensive measure. @@ -820,6 +820,14 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( part->name); } + auto renamePartToTemporary = [&temporary_part_relative_path, &transaction, &part]() + { + transaction.rollbackPartsToTemporaryState(); + + part->is_temp = true; + part->renameTo(temporary_part_relative_path, false); + }; + try { ThreadFuzzer::maybeInjectSleep(); @@ -828,11 +836,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( } catch (const Exception &) { - transaction.rollbackPartsToTemporaryState(); - - part->is_temp = true; - part->renameTo(temporary_part_relative_path, false); - + renamePartToTemporary(); throw; } @@ -906,10 +910,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( /// We will try to add this part again on the new iteration as it's just a new part. /// So remove it from storage parts set immediately and transfer state to temporary. - transaction.rollbackPartsToTemporaryState(); - - part->is_temp = true; - part->renameTo(temporary_part_relative_path, false); + renamePartToTemporary(); if constexpr (async_insert) { @@ -931,14 +932,19 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( } else if (multi_code == Coordination::Error::ZNODEEXISTS && failed_op_path == quorum_info.status_path) { - storage.unlockSharedData(*part); + try + { + storage.unlockSharedData(*part); + } + catch (const zkutil::KeeperException & e) + { + /// suppress this exception since need to rename part to temporary next + LOG_DEBUG(log, "Unlocking shared data failed during error hadling: code={} message={}", e.code, e.message()); + } - /// Part was not committed to keeper. - /// So make it temporary and remove immediately to avoid its resurrection after restart. - transaction.rollbackPartsToTemporaryState(); - - part->is_temp = true; - part->renameTo(temporary_part_relative_path, false); + /// Part was not committed to keeper + /// So make it temporary to avoid its resurrection on restart + renamePartToTemporary(); throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE); } From ba88bc17d4a4c3c01c3142209f494f55451a3c7c Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Mon, 26 Dec 2022 21:01:00 +0000 Subject: [PATCH 3/7] Fix typo --- src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index c1f29ce5b96..be43262e376 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -939,7 +939,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( catch (const zkutil::KeeperException & e) { /// suppress this exception since need to rename part to temporary next - LOG_DEBUG(log, "Unlocking shared data failed during error hadling: code={} message={}", e.code, e.message()); + LOG_DEBUG(log, "Unlocking shared data failed during error handling: code={} message={}", e.code, e.message()); } /// Part was not committed to keeper From 3a7f26ff3ac5b1e8652ed220d4f79193f1b4c08a Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 27 Dec 2022 16:15:23 +0100 Subject: [PATCH 4/7] Split test into two --- .../MergeTree/ReplicatedMergeTreeSink.cpp | 2 +- ..._manual_write_to_replicas_quorum.reference | 20 ++++---- .../01459_manual_write_to_replicas_quorum.sh | 20 +------- ...to_replicas_quorum_detach_attach.reference | 10 ++++ ..._write_to_replicas_quorum_detach_attach.sh | 46 +++++++++++++++++++ 5 files changed, 69 insertions(+), 29 deletions(-) create mode 100644 tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.reference create mode 100755 tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.sh diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index be43262e376..616b920b093 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -934,7 +934,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( { try { - storage.unlockSharedData(*part); + storage.unlockSharedData(*part, zookeeper); } catch (const zkutil::KeeperException & e) { diff --git a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.reference b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.reference index 812fa4477cb..52dea650ebc 100644 --- a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.reference +++ b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.reference @@ -1,10 +1,10 @@ -200 0 199 19900 -200 0 199 19900 -200 0 199 19900 -200 0 199 19900 -200 0 199 19900 -200 0 199 19900 -200 0 199 19900 -200 0 199 19900 -200 0 199 19900 -200 0 199 19900 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 diff --git a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.sh b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.sh index 555d21a781f..209e18e3329 100755 --- a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.sh +++ b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum.sh @@ -19,7 +19,7 @@ done valid_exceptions_to_retry='Quorum for previous write has not been satisfied yet|Another quorum insert has been already started|Unexpected logical error while adding block' -function thread1 { +function thread { for x in {0..99}; do while true; do $CLICKHOUSE_CLIENT --insert_quorum 5 --insert_quorum_parallel 0 --insert_keeper_fault_injection_probability=0 --query "INSERT INTO r$1 SELECT $x" 2>&1 | grep -qE "$valid_exceptions_to_retry" || break @@ -28,23 +28,7 @@ function thread1 { } for i in $(seq 1 $NUM_REPLICAS); do - thread1 $i & -done - -wait - -function thread2 { - for x in {100..199}; do - while true; do - $CLICKHOUSE_CLIENT --query "DETACH TABLE r$1" - $CLICKHOUSE_CLIENT --query "ATTACH TABLE r$1" - $CLICKHOUSE_CLIENT --insert_quorum 5 --insert_quorum_parallel 0 --query "INSERT INTO r$1 SELECT $x" 2>&1 | grep -qE "$valid_exceptions_to_retry" || break - done - done -} - -for i in $(seq 1 $NUM_REPLICAS); do - thread2 $i & + thread $i & done wait diff --git a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.reference b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.reference new file mode 100644 index 00000000000..52dea650ebc --- /dev/null +++ b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.reference @@ -0,0 +1,10 @@ +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 +100 0 99 4950 diff --git a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.sh b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.sh new file mode 100755 index 00000000000..eaa98d220a7 --- /dev/null +++ b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# Tags: replica, no-replicated-database, no-parallel +# Tag no-replicated-database: Fails due to additional replicas or shards + +set -e + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +NUM_REPLICAS=10 + +for i in $(seq 1 $NUM_REPLICAS); do + $CLICKHOUSE_CLIENT -n -q " + DROP TABLE IF EXISTS r$i SYNC; + CREATE TABLE r$i (x UInt64) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/r', 'r$i') ORDER BY x; + " +done + +valid_exceptions_to_retry='Quorum for previous write has not been satisfied yet|Another quorum insert has been already started|Unexpected logical error while adding block' + +function thread { + for x in {0..99}; do + while true; do + $CLICKHOUSE_CLIENT --query "DETACH TABLE r$1" + $CLICKHOUSE_CLIENT --query "ATTACH TABLE r$1" + $CLICKHOUSE_CLIENT --insert_quorum 5 --insert_quorum_parallel 0 --insert_keeper_fault_injection_probability=0 --query "INSERT INTO r$1 SELECT $x" 2>&1 | grep -qE "$valid_exceptions_to_retry" || break + done + done +} + +for i in $(seq 1 $NUM_REPLICAS); do + thread $i & +done + +wait + +for i in $(seq 1 $NUM_REPLICAS); do + $CLICKHOUSE_CLIENT -n -q " + SYSTEM SYNC REPLICA r$i; + SELECT count(), min(x), max(x), sum(x) FROM r$i;" +done + +for i in $(seq 1 $NUM_REPLICAS); do + $CLICKHOUSE_CLIENT -q "DROP TABLE IF EXISTS r$i SYNC;" +done From 3a1e9f4a4d7a646efca7bf0c1a79397f40bc8e53 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 28 Dec 2022 14:08:13 +0100 Subject: [PATCH 5/7] Fix style --- src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 616b920b093..7bd5df2b1dc 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -820,7 +820,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( part->name); } - auto renamePartToTemporary = [&temporary_part_relative_path, &transaction, &part]() + auto rename_part_to_temporary = [&temporary_part_relative_path, &transaction, &part]() { transaction.rollbackPartsToTemporaryState(); @@ -836,7 +836,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( } catch (const Exception &) { - renamePartToTemporary(); + rename_part_to_temporary(); throw; } @@ -910,7 +910,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( /// We will try to add this part again on the new iteration as it's just a new part. /// So remove it from storage parts set immediately and transfer state to temporary. - renamePartToTemporary(); + rename_part_to_temporary(); if constexpr (async_insert) { @@ -944,7 +944,7 @@ std::vector ReplicatedMergeTreeSinkImpl::commitPart( /// Part was not committed to keeper /// So make it temporary to avoid its resurrection on restart - renamePartToTemporary(); + rename_part_to_temporary(); throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE); } From 697617548ccb5e0868004c8fd1dd0d587f2fe9fb Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 28 Dec 2022 18:59:24 +0100 Subject: [PATCH 6/7] Tiny improvement --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 3 +++ src/Storages/MergeTree/MergeTreeData.cpp | 2 ++ 2 files changed, 5 insertions(+) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index b61cde8dbbc..afebb8992e0 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -576,6 +576,9 @@ void IMergeTreeDataPart::assertState(const std::initializer_listgetDataPartStorage().commitTransaction(); if (txn) + { for (const auto & part : precommitted_parts) { DataPartPtr covering_part; @@ -5445,6 +5446,7 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(MergeTreeData: MergeTreeTransaction::addNewPartAndRemoveCovered(data.shared_from_this(), part, covered_parts, txn); } + } MergeTreeData::WriteAheadLogPtr wal; auto get_inited_wal = [&] () From 3ac0df9d7a1730d82575cc2add9df57e9e71b005 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 28 Dec 2022 19:30:49 +0100 Subject: [PATCH 7/7] Make test less stressful --- ...te_to_replicas_quorum_detach_attach.reference | 16 ++++++---------- ...ual_write_to_replicas_quorum_detach_attach.sh | 6 +++--- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.reference b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.reference index 52dea650ebc..0d6e68f032f 100644 --- a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.reference +++ b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.reference @@ -1,10 +1,6 @@ -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 -100 0 99 4950 +10 0 9 45 +10 0 9 45 +10 0 9 45 +10 0 9 45 +10 0 9 45 +10 0 9 45 diff --git a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.sh b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.sh index eaa98d220a7..b97fcece267 100755 --- a/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.sh +++ b/tests/queries/0_stateless/01459_manual_write_to_replicas_quorum_detach_attach.sh @@ -8,7 +8,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -NUM_REPLICAS=10 +NUM_REPLICAS=6 for i in $(seq 1 $NUM_REPLICAS); do $CLICKHOUSE_CLIENT -n -q " @@ -20,11 +20,11 @@ done valid_exceptions_to_retry='Quorum for previous write has not been satisfied yet|Another quorum insert has been already started|Unexpected logical error while adding block' function thread { - for x in {0..99}; do + for x in {0..9}; do while true; do $CLICKHOUSE_CLIENT --query "DETACH TABLE r$1" $CLICKHOUSE_CLIENT --query "ATTACH TABLE r$1" - $CLICKHOUSE_CLIENT --insert_quorum 5 --insert_quorum_parallel 0 --insert_keeper_fault_injection_probability=0 --query "INSERT INTO r$1 SELECT $x" 2>&1 | grep -qE "$valid_exceptions_to_retry" || break + $CLICKHOUSE_CLIENT --insert_quorum 3 --insert_quorum_parallel 0 --insert_keeper_fault_injection_probability=0 --query "INSERT INTO r$1 SELECT $x" 2>&1 | grep -qE "$valid_exceptions_to_retry" || break done done }