From c0a0fb1869775a264484fe95b3514838e82ec008 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 25 Nov 2024 17:44:55 +0100 Subject: [PATCH 1/3] Better error message on bad keeper snapshots --- src/Coordination/KeeperStateMachine.cpp | 22 ++++++++++++++-------- src/Coordination/SnapshotableHashTable.h | 14 +++++++++++--- 2 files changed, 25 insertions(+), 11 deletions(-) diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index 704d3365fa2..9f9de7ae2f1 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -142,13 +142,14 @@ void KeeperStateMachine::init() } catch (...) { - tryLogCurrentException( + LOG_FATAL( log, - fmt::format( - "Aborting because of failure to load from latest snapshot with index {}. Problematic snapshot can be removed but it will " - "lead to data loss", - latest_log_index)); - std::abort(); + "Failure to load from latest snapshot with index {}: {}", + latest_log_index, + getCurrentExceptionMessage(true, true, false)); + LOG_FATAL( + log, "Manual intervention is necessary for recovery. Problematic snapshot can be removed but it will lead to data loss"); + abort(); } } @@ -427,8 +428,13 @@ bool KeeperStateMachine::preprocess(const KeeperStorageBase::RequestFor } catch (...) { - tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("Failed to preprocess stored log at index {}, aborting to avoid inconsistent state", request_for_session.log_idx)); - std::abort(); + LOG_FATAL( + log, + "Failed to preprocess stored log at index {}: {}", + request_for_session.log_idx, + getCurrentExceptionMessage(true, true, false)); + LOG_FATAL(log, "Aborting to avoid inconsistent state"); + abort(); } if (keeper_context->digestEnabled() && request_for_session.digest) diff --git a/src/Coordination/SnapshotableHashTable.h b/src/Coordination/SnapshotableHashTable.h index de089913ea7..bfe987de1cb 100644 --- a/src/Coordination/SnapshotableHashTable.h +++ b/src/Coordination/SnapshotableHashTable.h @@ -8,6 +8,11 @@ namespace DB { +namespace ErrorCodes +{ +extern const int LOGICAL_ERROR; +} + template struct ListNode { @@ -292,7 +297,8 @@ public: { size_t hash_value = map.hash(key); auto it = map.find(key, hash_value); - chassert(it != map.end()); + if (it != map.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Could not find key: '{}'", key); auto list_itr = it->getMapped(); uint64_t old_value_size = list_itr->value.sizeInBytes(); @@ -348,7 +354,8 @@ public: const V & getValue(StringRef key) const { auto it = map.find(key); - chassert(it); + if (it != map.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Could not find key: '{}'", key); return it->getMapped()->value; } @@ -356,7 +363,8 @@ public: { for (auto & itr : snapshot_invalid_iters) { - chassert(!itr->isActiveInMap()); + if (itr->isActiveInMap()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "{} is not active in map", itr->key); updateDataSize(ERASE, itr->key.size, 0, itr->value.sizeInBytes(), /*remove_old=*/true); if (itr->getFreeKey()) arena.free(const_cast(itr->key.data), itr->key.size); From bd96303e1776496cdfd63a5d06f8bceedbff3079 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 25 Nov 2024 19:39:50 +0100 Subject: [PATCH 2/3] Fix asserts --- src/Coordination/SnapshotableHashTable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Coordination/SnapshotableHashTable.h b/src/Coordination/SnapshotableHashTable.h index bfe987de1cb..462ca18c502 100644 --- a/src/Coordination/SnapshotableHashTable.h +++ b/src/Coordination/SnapshotableHashTable.h @@ -297,7 +297,7 @@ public: { size_t hash_value = map.hash(key); auto it = map.find(key, hash_value); - if (it != map.end()) + if (it == map.end()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Could not find key: '{}'", key); auto list_itr = it->getMapped(); @@ -354,7 +354,7 @@ public: const V & getValue(StringRef key) const { auto it = map.find(key); - if (it != map.end()) + if (it == map.end()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Could not find key: '{}'", key); return it->getMapped()->value; } From 575e17758d4268c6dcc668ec6b6e943fced68b45 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 26 Nov 2024 13:34:00 +0100 Subject: [PATCH 3/3] Adjust tests --- tests/integration/test_keeper_snapshots/test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_keeper_snapshots/test.py b/tests/integration/test_keeper_snapshots/test.py index 1af3318e24b..b48657f4c3d 100644 --- a/tests/integration/test_keeper_snapshots/test.py +++ b/tests/integration/test_keeper_snapshots/test.py @@ -191,8 +191,9 @@ def test_invalid_snapshot(started_cluster): ] ) node.start_clickhouse(start_wait_sec=120, expected_to_fail=True) + assert node.contains_in_log("Failure to load from latest snapshot with index") assert node.contains_in_log( - "Aborting because of failure to load from latest snapshot with index" + "Manual intervention is necessary for recovery. Problematic snapshot can be removed but it will lead to data loss" ) node.stop_clickhouse()