From 7cdad883278f9764b6888ec2d7d7dc5d7967f75a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Sat, 30 Jul 2022 11:34:17 +0000 Subject: [PATCH 1/7] Create snapshot on shutdown --- src/Coordination/KeeperContext.h | 3 ++- src/Coordination/KeeperServer.cpp | 9 ++++++++- src/Coordination/KeeperServer.h | 2 ++ src/Coordination/KeeperStateMachine.cpp | 8 +++++++- 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/Coordination/KeeperContext.h b/src/Coordination/KeeperContext.h index 84ec65cecde..64fa8cea6ec 100644 --- a/src/Coordination/KeeperContext.h +++ b/src/Coordination/KeeperContext.h @@ -8,7 +8,8 @@ struct KeeperContext enum class Phase : uint8_t { INIT, - RUNNING + RUNNING, + SHUTDOWN }; Phase server_state{Phase::INIT}; diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 587ab9c8f66..20ce7e42afc 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -107,8 +107,9 @@ KeeperServer::KeeperServer( : server_id(configuration_and_settings_->server_id) , coordination_settings(configuration_and_settings_->coordination_settings) , log(&Poco::Logger::get("KeeperServer")) - , is_recovering(config.has("keeper_server.force_recovery") && config.getBool("keeper_server.force_recovery")) + , is_recovering(config.getBool("keeper_server.force_recovery", false)) , keeper_context{std::make_shared()} + , create_snapshot_on_exit(config.getBool("keeper_server.create_snapshot_on_exit", true)) { if (coordination_settings->quorum_reads) LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); @@ -367,6 +368,12 @@ void KeeperServer::shutdownRaftServer() } raft_instance->shutdown(); + + keeper_context->server_state = KeeperContext::Phase::SHUTDOWN; + + if (create_snapshot_on_exit) + raft_instance->create_snapshot(); + raft_instance.reset(); if (asio_listener) diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index 74dd05631f0..1fb3e579214 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -64,6 +64,8 @@ private: std::shared_ptr keeper_context; + const bool create_snapshot_on_exit; + public: KeeperServer( const KeeperConfigurationAndSettingsPtr & settings_, diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index f43a3dbb319..a55acaf9b91 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -383,7 +383,13 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res }; - LOG_DEBUG(log, "In memory snapshot {} created, queueing task to flash to disk", s.get_last_log_idx()); + if (keeper_context->server_state == KeeperContext::Phase::SHUTDOWN) + { + snapshot_task.create_snapshot(std::move(snapshot_task.snapshot)); + return; + } + + LOG_DEBUG(log, "In memory snapshot {} created, queueing task to flush to disk", s.get_last_log_idx()); /// Flush snapshot to disk in a separate thread. if (!snapshots_queue.push(std::move(snapshot_task))) LOG_WARNING(log, "Cannot push snapshot task into queue"); From 16a98d8eeff0820e653f37b5e6d488c2184508f3 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Sun, 31 Jul 2022 09:11:46 +0000 Subject: [PATCH 2/7] Add test for creating snapshot on exit --- .../configs/enable_keeper1.xml | 28 +++++++++++ .../configs/enable_keeper2.xml | 28 +++++++++++ .../test_keeper_snapshot_on_exit/test.py | 50 +++++++++++++++++++ 3 files changed, 106 insertions(+) create mode 100644 tests/integration/test_keeper_snapshot_on_exit/configs/enable_keeper1.xml create mode 100644 tests/integration/test_keeper_snapshot_on_exit/configs/enable_keeper2.xml create mode 100644 tests/integration/test_keeper_snapshot_on_exit/test.py diff --git a/tests/integration/test_keeper_snapshot_on_exit/configs/enable_keeper1.xml b/tests/integration/test_keeper_snapshot_on_exit/configs/enable_keeper1.xml new file mode 100644 index 00000000000..e4d5ae9aa82 --- /dev/null +++ b/tests/integration/test_keeper_snapshot_on_exit/configs/enable_keeper1.xml @@ -0,0 +1,28 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + true + + + 5000 + 10000 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + + diff --git a/tests/integration/test_keeper_snapshot_on_exit/configs/enable_keeper2.xml b/tests/integration/test_keeper_snapshot_on_exit/configs/enable_keeper2.xml new file mode 100644 index 00000000000..216e6905a56 --- /dev/null +++ b/tests/integration/test_keeper_snapshot_on_exit/configs/enable_keeper2.xml @@ -0,0 +1,28 @@ + + + 9181 + 2 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + false + + + 5000 + 10000 + trace + + + + + 1 + node1 + 9234 + + + 2 + node2 + 9234 + + + + diff --git a/tests/integration/test_keeper_snapshot_on_exit/test.py b/tests/integration/test_keeper_snapshot_on_exit/test.py new file mode 100644 index 00000000000..ad373011136 --- /dev/null +++ b/tests/integration/test_keeper_snapshot_on_exit/test.py @@ -0,0 +1,50 @@ +import pytest +from helpers.cluster import ClickHouseCluster +import os +from kazoo.client import KazooClient + +cluster = ClickHouseCluster(__file__) +CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs") + +node1 = cluster.add_instance( + "node1", main_configs=["configs/enable_keeper1.xml"], stay_alive=True +) + +node2 = cluster.add_instance( + "node2", main_configs=["configs/enable_keeper2.xml"], stay_alive=True +) + +def get_fake_zk(node, timeout=30.0): + _fake_zk_instance = KazooClient( + hosts=cluster.get_instance_ip(node.name) + ":9181", timeout=timeout + ) + _fake_zk_instance.start() + return _fake_zk_instance + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + +def test_snapshot_on_exit(started_cluster): + zk_conn = get_fake_zk(node1); + + zk_conn.create("/some_path", b"some_data") + + node1.stop_clickhouse() + assert node1.contains_in_log("Created persistent snapshot") + + node1.start_clickhouse() + assert node1.contains_in_log("Loaded snapshot") + + node2.stop_clickhouse() + assert not node2.contains_in_log("Created persistent snapshot") + + node2.start_clickhouse() + assert node2.contains_in_log("No existing snapshots") From 0b1554c2e76a439918c535bc278848c206649350 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sun, 31 Jul 2022 09:24:53 +0000 Subject: [PATCH 3/7] Automatic style fix --- tests/integration/test_keeper_snapshot_on_exit/test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_keeper_snapshot_on_exit/test.py b/tests/integration/test_keeper_snapshot_on_exit/test.py index ad373011136..1ca5888ab4d 100644 --- a/tests/integration/test_keeper_snapshot_on_exit/test.py +++ b/tests/integration/test_keeper_snapshot_on_exit/test.py @@ -14,6 +14,7 @@ node2 = cluster.add_instance( "node2", main_configs=["configs/enable_keeper2.xml"], stay_alive=True ) + def get_fake_zk(node, timeout=30.0): _fake_zk_instance = KazooClient( hosts=cluster.get_instance_ip(node.name) + ":9181", timeout=timeout @@ -32,8 +33,9 @@ def started_cluster(): finally: cluster.shutdown() + def test_snapshot_on_exit(started_cluster): - zk_conn = get_fake_zk(node1); + zk_conn = get_fake_zk(node1) zk_conn.create("/some_path", b"some_data") From 71e1b1916c2489f8a486d7b33463b8f4e9f242ab Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 3 Aug 2022 11:43:30 +0000 Subject: [PATCH 4/7] Randomize snapshot on exit in tests --- tests/config/config.d/keeper_port.xml | 2 ++ tests/config/install.sh | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/tests/config/config.d/keeper_port.xml b/tests/config/config.d/keeper_port.xml index 8cea9044dd0..cffd325e968 100644 --- a/tests/config/config.d/keeper_port.xml +++ b/tests/config/config.d/keeper_port.xml @@ -3,6 +3,8 @@ 9181 1 + true + 10000 100000 diff --git a/tests/config/install.sh b/tests/config/install.sh index a9e66ebb633..46009ce671e 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -80,6 +80,10 @@ ln -sf $SRC_PATH/dhparam.pem $DEST_SERVER_PATH/ ln -sf --backup=simple --suffix=_original.xml \ $SRC_PATH/config.d/query_masking_rules.xml $DEST_SERVER_PATH/config.d/ +# We randomize creating the snapshot on exit for Keeper to test out using older snapshots +create_snapshot_on_exit=$(($RANDOM % 2)) +cat $DEST_SERVER_PATH/config.d/keeper_port.xml | sed "s|true|$create_snapshot_on_exit|" > $DEST_SERVER_PATH/config.d/keeper_port.xml + if [[ -n "$USE_POLYMORPHIC_PARTS" ]] && [[ "$USE_POLYMORPHIC_PARTS" -eq 1 ]]; then ln -sf $SRC_PATH/config.d/polymorphic_parts.xml $DEST_SERVER_PATH/config.d/ fi From 05a74850b31c29bea08d71f9a1604ec0fa615e38 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 3 Aug 2022 13:57:20 +0200 Subject: [PATCH 5/7] Update tests/config/install.sh Co-authored-by: Alexander Tokmakov --- tests/config/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/config/install.sh b/tests/config/install.sh index 46009ce671e..478601620e1 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -82,7 +82,7 @@ ln -sf --backup=simple --suffix=_original.xml \ # We randomize creating the snapshot on exit for Keeper to test out using older snapshots create_snapshot_on_exit=$(($RANDOM % 2)) -cat $DEST_SERVER_PATH/config.d/keeper_port.xml | sed "s|true|$create_snapshot_on_exit|" > $DEST_SERVER_PATH/config.d/keeper_port.xml +sed --follow-symlinks -i "s|true|$create_snapshot_on_exit|" $DEST_SERVER_PATH/config.d/keeper_port.xml if [[ -n "$USE_POLYMORPHIC_PARTS" ]] && [[ "$USE_POLYMORPHIC_PARTS" -eq 1 ]]; then ln -sf $SRC_PATH/config.d/polymorphic_parts.xml $DEST_SERVER_PATH/config.d/ From 8094f33078c92a7aed27377b34226e2add19a56a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 4 Aug 2022 06:48:33 +0000 Subject: [PATCH 6/7] Add init file for test --- tests/integration/test_keeper_snapshot_on_exit/__init__.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 tests/integration/test_keeper_snapshot_on_exit/__init__.py diff --git a/tests/integration/test_keeper_snapshot_on_exit/__init__.py b/tests/integration/test_keeper_snapshot_on_exit/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_keeper_snapshot_on_exit/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 From 10d7259c2b21c583abaa824507a1238eb5f26fb9 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 4 Aug 2022 13:12:24 +0000 Subject: [PATCH 7/7] Add log for snapshot on exit --- src/Coordination/KeeperStateMachine.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index a55acaf9b91..a568bb88302 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -385,6 +385,7 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res if (keeper_context->server_state == KeeperContext::Phase::SHUTDOWN) { + LOG_INFO(log, "Creating a snapshot during shutdown because 'create_snapshot_on_exit' is enabled."); snapshot_task.create_snapshot(std::move(snapshot_task.snapshot)); return; }