From 77769e8bc5f2ae56e70e3ae0345ff3d605b15189 Mon Sep 17 00:00:00 2001 From: Mikhail Filimonov Date: Fri, 18 Dec 2020 09:34:37 +0100 Subject: [PATCH 1/6] Issue with quorum retries behaviour --- ...509_parallel_quorum_insert_no_replicas.sql | 38 +++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql b/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql index 1b680cf26c1..6980dc02bfa 100644 --- a/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql +++ b/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql @@ -15,26 +15,44 @@ ORDER BY tuple(); SET insert_quorum_parallel=1; +SET insert_quorum_timeout=1; SET insert_quorum=3; INSERT INTO r1 VALUES(1, '1'); --{serverError 285} +-- retry should still fail despite the insert_deduplicate enabled +INSERT INTO r1 VALUES(1, '1'); --{serverError 285} +INSERT INTO r1 VALUES(1, '1'); --{serverError 285} + SELECT 'insert to two replicas works'; SET insert_quorum=2, insert_quorum_parallel=1; + +-- the first 'good' retry after failure will return: +-- Unknown status, client must retry. Reason: Code: 159, Timeout while waiting for quorum +-- it's related to decreased insert_quorum_timeout=1 at the test beginning +-- it's not critical but not good +INSERT INTO r1 VALUES(1, '1'); --{serverError 319} INSERT INTO r1 VALUES(1, '1'); SELECT COUNT() FROM r1; SELECT COUNT() FROM r2; +SET insert_quorum_timeout=600000; DETACH TABLE r2; INSERT INTO r1 VALUES(2, '2'); --{serverError 285} +-- retry should fail despite the insert_deduplicate enabled +INSERT INTO r1 VALUES(2, '2'); --{serverError 285} +INSERT INTO r1 VALUES(2, '2'); --{serverError 285} + SET insert_quorum=1, insert_quorum_parallel=1; SELECT 'insert to single replica works'; INSERT INTO r1 VALUES(2, '2'); ATTACH TABLE r2; +INSERT INTO r2 VALUES(2, '2'); + SYSTEM SYNC REPLICA r2; SET insert_quorum=2, insert_quorum_parallel=1; @@ -47,6 +65,17 @@ SELECT COUNT() FROM r2; SELECT 'deduplication works'; INSERT INTO r2 VALUES(3, '3'); +-- still works if we relax quorum +SET insert_quorum=1, insert_quorum_parallel=1; +INSERT INTO r2 VALUES(3, '3'); +INSERT INTO r1 VALUES(3, '3'); +-- will start failing if we increase quorum +SET insert_quorum=3, insert_quorum_parallel=1; +INSERT INTO r1 VALUES(3, '3'); --{serverError 285} +-- work back ok when quorum=2 +SET insert_quorum=2, insert_quorum_parallel=1; +INSERT INTO r2 VALUES(3, '3'); + SELECT COUNT() FROM r1; SELECT COUNT() FROM r2; @@ -56,8 +85,17 @@ SET insert_quorum_timeout=0; INSERT INTO r1 VALUES (4, '4'); -- { serverError 319 } +-- retry should fail despite the insert_deduplicate enabled +-- TODO: BUG HERE! it returns success while the record with key=4 does not exists on r2. +INSERT INTO r1 VALUES (4, '4'); -- { serverError 319 } +INSERT INTO r1 VALUES (4, '4'); -- { serverError 319 } +SELECT * FROM r2 WHERE key=4; + SYSTEM START FETCHES r2; +-- now retry should be successful +INSERT INTO r1 VALUES (4, '4'); + SYSTEM SYNC REPLICA r2; SELECT 'insert happened'; From e17444757bf04be812895ed59218a554bc39ba79 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 8 Apr 2021 13:35:38 +0300 Subject: [PATCH 2/6] Fix rare case when quorum insert is not really quorum because of deduplication --- .../ReplicatedMergeTreeBlockOutputStream.cpp | 117 +++++++++++------- .../ReplicatedMergeTreeBlockOutputStream.h | 6 + ...509_parallel_quorum_insert_no_replicas.sql | 9 +- 3 files changed, 79 insertions(+), 53 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp index d22f94fefe3..2b09c4c0b1c 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp @@ -341,13 +341,28 @@ void ReplicatedMergeTreeBlockOutputStream::commitPart( /// If it exists on our replica, ignore it. if (storage.getActiveContainingPart(existing_part_name)) { - LOG_INFO(log, "Block with ID {} already exists locally as part {}; ignoring it.", block_id, existing_part_name); part->is_duplicate = true; last_block_is_duplicate = true; ProfileEvents::increment(ProfileEvents::DuplicatedInsertedBlocks); + if (quorum) + { + LOG_INFO(log, "Block with ID {} already exists locally as part {}; ignoring it, but checking quorum.", block_id, existing_part_name); + + std::string quorum_path; + if (quorum_parallel) + quorum_path = storage.zookeeper_path + "/quorum/parallel/" + existing_part_name; + else + quorum_path = storage.zookeeper_path + "/quorum/status"; + + waitForQuorum(zookeeper, existing_part_name, quorum_path, quorum_info.is_active_node_value); + } + else + { + LOG_INFO(log, "Block with ID {} already exists locally as part {}; ignoring it.", block_id, existing_part_name); + } + return; } - LOG_INFO(log, "Block with ID {} already exists on other replicas as part {}; will write it locally with that name.", block_id, existing_part_name); @@ -486,50 +501,7 @@ void ReplicatedMergeTreeBlockOutputStream::commitPart( storage.updateQuorum(part->name, false); } - /// We are waiting for quorum to be satisfied. - LOG_TRACE(log, "Waiting for quorum"); - - try - { - while (true) - { - zkutil::EventPtr event = std::make_shared(); - - std::string value; - /// `get` instead of `exists` so that `watch` does not leak if the node is no longer there. - if (!zookeeper->tryGet(quorum_info.status_path, value, nullptr, event)) - break; - - LOG_TRACE(log, "Quorum node {} still exists, will wait for updates", quorum_info.status_path); - - ReplicatedMergeTreeQuorumEntry quorum_entry(value); - - /// If the node has time to disappear, and then appear again for the next insert. - if (quorum_entry.part_name != part->name) - break; - - if (!event->tryWait(quorum_timeout_ms)) - throw Exception("Timeout while waiting for quorum", ErrorCodes::TIMEOUT_EXCEEDED); - - LOG_TRACE(log, "Quorum {} updated, will check quorum node still exists", quorum_info.status_path); - } - - /// And what if it is possible that the current replica at this time has ceased to be active - /// and the quorum is marked as failed and deleted? - String value; - if (!zookeeper->tryGet(storage.replica_path + "/is_active", value, nullptr) - || value != quorum_info.is_active_node_value) - throw Exception("Replica become inactive while waiting for quorum", ErrorCodes::NO_ACTIVE_REPLICAS); - } - catch (...) - { - /// We do not know whether or not data has been inserted - /// - whether other replicas have time to download the part and mark the quorum as done. - throw Exception("Unknown status, client must retry. Reason: " + getCurrentExceptionMessage(false), - ErrorCodes::UNKNOWN_STATUS_OF_INSERT); - } - - LOG_TRACE(log, "Quorum satisfied"); + waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_value); } } @@ -541,4 +513,57 @@ void ReplicatedMergeTreeBlockOutputStream::writePrefix() } +void ReplicatedMergeTreeBlockOutputStream::waitForQuorum( + zkutil::ZooKeeperPtr & zookeeper, + const std::string & part_name, + const std::string & quorum_path, + const std::string & is_active_node_value) +{ + /// We are waiting for quorum to be satisfied. + LOG_TRACE(log, "Waiting for quorum"); + + try + { + while (true) + { + zkutil::EventPtr event = std::make_shared(); + + std::string value; + /// `get` instead of `exists` so that `watch` does not leak if the node is no longer there. + if (!zookeeper->tryGet(quorum_path, value, nullptr, event)) + break; + + LOG_TRACE(log, "Quorum node {} still exists, will wait for updates", quorum_path); + + ReplicatedMergeTreeQuorumEntry quorum_entry(value); + + /// If the node has time to disappear, and then appear again for the next insert. + if (quorum_entry.part_name != part_name) + break; + + if (!event->tryWait(quorum_timeout_ms)) + throw Exception("Timeout while waiting for quorum", ErrorCodes::TIMEOUT_EXCEEDED); + + LOG_TRACE(log, "Quorum {} updated, will check quorum node still exists", quorum_path); + } + + /// And what if it is possible that the current replica at this time has ceased to be active + /// and the quorum is marked as failed and deleted? + String value; + if (!zookeeper->tryGet(storage.replica_path + "/is_active", value, nullptr) + || value != is_active_node_value) + throw Exception("Replica become inactive while waiting for quorum", ErrorCodes::NO_ACTIVE_REPLICAS); + } + catch (...) + { + /// We do not know whether or not data has been inserted + /// - whether other replicas have time to download the part and mark the quorum as done. + throw Exception("Unknown status, client must retry. Reason: " + getCurrentExceptionMessage(false), + ErrorCodes::UNKNOWN_STATUS_OF_INSERT); + } + + LOG_TRACE(log, "Quorum satisfied"); +} + + } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h index 860b0c4ed12..785da3ef72d 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h @@ -63,6 +63,12 @@ private: /// Rename temporary part and commit to ZooKeeper. void commitPart(zkutil::ZooKeeperPtr & zookeeper, MergeTreeData::MutableDataPartPtr & part, const String & block_id); + /// Wait for quorum to be satisfied on path (quorum_path) form part (part_name) + /// Also checks that replica still alive. + void waitForQuorum( + zkutil::ZooKeeperPtr & zookeeper, const std::string & part_name, + const std::string & quorum_path, const std::string & is_active_node_value); + StorageReplicatedMergeTree & storage; StorageMetadataPtr metadata_snapshot; size_t quorum; diff --git a/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql b/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql index 6980dc02bfa..6d65f2904e8 100644 --- a/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql +++ b/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql @@ -15,7 +15,6 @@ ORDER BY tuple(); SET insert_quorum_parallel=1; -SET insert_quorum_timeout=1; SET insert_quorum=3; INSERT INTO r1 VALUES(1, '1'); --{serverError 285} @@ -26,17 +25,11 @@ INSERT INTO r1 VALUES(1, '1'); --{serverError 285} SELECT 'insert to two replicas works'; SET insert_quorum=2, insert_quorum_parallel=1; --- the first 'good' retry after failure will return: --- Unknown status, client must retry. Reason: Code: 159, Timeout while waiting for quorum --- it's related to decreased insert_quorum_timeout=1 at the test beginning --- it's not critical but not good -INSERT INTO r1 VALUES(1, '1'); --{serverError 319} INSERT INTO r1 VALUES(1, '1'); SELECT COUNT() FROM r1; SELECT COUNT() FROM r2; -SET insert_quorum_timeout=600000; DETACH TABLE r2; INSERT INTO r1 VALUES(2, '2'); --{serverError 285} @@ -93,6 +86,8 @@ SELECT * FROM r2 WHERE key=4; SYSTEM START FETCHES r2; +SET insert_quorum_timeout=6000000; + -- now retry should be successful INSERT INTO r1 VALUES (4, '4'); From e762d02e375441be853f8f1dcf0166e6f88cfc11 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 8 Apr 2021 13:38:40 +0300 Subject: [PATCH 3/6] More const --- src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp | 2 +- src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp index 2b09c4c0b1c..c9b66ff3a6c 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp @@ -517,7 +517,7 @@ void ReplicatedMergeTreeBlockOutputStream::waitForQuorum( zkutil::ZooKeeperPtr & zookeeper, const std::string & part_name, const std::string & quorum_path, - const std::string & is_active_node_value) + const std::string & is_active_node_value) const { /// We are waiting for quorum to be satisfied. LOG_TRACE(log, "Waiting for quorum"); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h index 785da3ef72d..6ea16491d64 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h @@ -67,7 +67,7 @@ private: /// Also checks that replica still alive. void waitForQuorum( zkutil::ZooKeeperPtr & zookeeper, const std::string & part_name, - const std::string & quorum_path, const std::string & is_active_node_value); + const std::string & quorum_path, const std::string & is_active_node_value) const; StorageReplicatedMergeTree & storage; StorageMetadataPtr metadata_snapshot; From 95f27cb3566b0c727e9d1f0e696ffeda58507f83 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 8 Apr 2021 13:39:21 +0300 Subject: [PATCH 4/6] No bug no more --- .../0_stateless/01509_parallel_quorum_insert_no_replicas.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql b/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql index 6d65f2904e8..7b36621a152 100644 --- a/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql +++ b/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql @@ -79,7 +79,6 @@ SET insert_quorum_timeout=0; INSERT INTO r1 VALUES (4, '4'); -- { serverError 319 } -- retry should fail despite the insert_deduplicate enabled --- TODO: BUG HERE! it returns success while the record with key=4 does not exists on r2. INSERT INTO r1 VALUES (4, '4'); -- { serverError 319 } INSERT INTO r1 VALUES (4, '4'); -- { serverError 319 } SELECT * FROM r2 WHERE key=4; From 1d56e055e21363a577409e9d76a6d5a6c5f89b3b Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Thu, 8 Apr 2021 13:00:01 +0200 Subject: [PATCH 5/6] Update 01509_parallel_quorum_insert_no_replicas.sql --- .../01509_parallel_quorum_insert_no_replicas.sql | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql b/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql index 7b36621a152..16c4a4df936 100644 --- a/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql +++ b/tests/queries/0_stateless/01509_parallel_quorum_insert_no_replicas.sql @@ -1,16 +1,16 @@ -DROP TABLE IF EXISTS r1; -DROP TABLE IF EXISTS r2; +DROP TABLE IF EXISTS r1 SYNC; +DROP TABLE IF EXISTS r2 SYNC; CREATE TABLE r1 ( key UInt64, value String ) -ENGINE = ReplicatedMergeTree('/clickhouse/01509_no_repliacs', '1') +ENGINE = ReplicatedMergeTree('/clickhouse/01509_parallel_quorum_insert_no_replicas', '1') ORDER BY tuple(); CREATE TABLE r2 ( key UInt64, value String ) -ENGINE = ReplicatedMergeTree('/clickhouse/01509_no_repliacs', '2') +ENGINE = ReplicatedMergeTree('/clickhouse/01509_parallel_quorum_insert_no_replicas', '2') ORDER BY tuple(); SET insert_quorum_parallel=1; From 9294f3ca9a442ae4cc9be2256297828207a0562b Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 8 Apr 2021 15:02:03 +0300 Subject: [PATCH 6/6] Fix skip list --- tests/queries/skip_list.json | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index d256b4493f9..c1a1cd02672 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -677,6 +677,7 @@ "01760_system_dictionaries", "01761_alter_decimal_zookeeper", "01360_materialized_view_with_join_on_query_log", // creates and drops MVs on query_log, which may interrupt flushes. + "01509_parallel_quorum_insert_no_replicas", // It's ok to execute in parallel with oter tests but not several instances of the same test. "attach", "ddl_dictionaries", "dictionary",