Merge pull request #39755 from ClickHouse/keeper-create-snapshot-on-exit

Create Keeper snapshot on exit
alesapin 2022-08-05 12:00:16 +02:00 committed by GitHub
commit 329120ca7a
10 changed files with 135 additions and 3 deletions
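
This change adds a `create_snapshot_on_exit` setting (enabled by default) that makes Keeper persist a snapshot to disk during graceful shutdown. The sketch below is not part of this PR: it is a small standalone Python check one could run on a host after stopping such a Keeper instance, confirming that a snapshot file appeared and that the shutdown-snapshot message (the same one the integration test below greps for) is in the log. The snapshot directory is taken from the test configs in this PR; the log path is an assumption.

#!/usr/bin/env python3
# Illustrative verification sketch (not part of this PR).
# Assumptions: snapshots live under the path used by the test configs in this PR,
# and the server writes to the default clickhouse-server log file. Adjust as needed.
import os

SNAPSHOT_DIR = "/var/lib/clickhouse/coordination/snapshots"    # from the test configs
LOG_FILE = "/var/log/clickhouse-server/clickhouse-server.log"  # assumed log location


def snapshot_files():
    """List snapshot files currently on disk, oldest first."""
    if not os.path.isdir(SNAPSHOT_DIR):
        return []
    paths = (os.path.join(SNAPSHOT_DIR, name) for name in os.listdir(SNAPSHOT_DIR))
    return sorted(paths, key=os.path.getmtime)


def snapshot_logged_on_exit():
    """True if the log contains the message the integration test below also checks for."""
    with open(LOG_FILE, errors="replace") as log:
        return "Created persistent snapshot" in log.read()


if __name__ == "__main__":
    print("Snapshots on disk:", snapshot_files())
    print("Snapshot persisted at shutdown:", snapshot_logged_on_exit())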


@@ -8,7 +8,8 @@ struct KeeperContext
     enum class Phase : uint8_t
     {
         INIT,
-        RUNNING
+        RUNNING,
+        SHUTDOWN
     };
     Phase server_state{Phase::INIT};


@@ -107,8 +107,9 @@ KeeperServer::KeeperServer(
     : server_id(configuration_and_settings_->server_id)
     , coordination_settings(configuration_and_settings_->coordination_settings)
     , log(&Poco::Logger::get("KeeperServer"))
-    , is_recovering(config.has("keeper_server.force_recovery") && config.getBool("keeper_server.force_recovery"))
+    , is_recovering(config.getBool("keeper_server.force_recovery", false))
     , keeper_context{std::make_shared<KeeperContext>()}
+    , create_snapshot_on_exit(config.getBool("keeper_server.create_snapshot_on_exit", true))
 {
     if (coordination_settings->quorum_reads)
         LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower.");
@@ -367,6 +368,12 @@ void KeeperServer::shutdownRaftServer()
     }
     raft_instance->shutdown();
+    keeper_context->server_state = KeeperContext::Phase::SHUTDOWN;
+    if (create_snapshot_on_exit)
+        raft_instance->create_snapshot();
     raft_instance.reset();
     if (asio_listener)


@@ -64,6 +64,8 @@ private:
     std::shared_ptr<KeeperContext> keeper_context;
+    const bool create_snapshot_on_exit;
 public:
     KeeperServer(
         const KeeperConfigurationAndSettingsPtr & settings_,


@@ -395,7 +395,14 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res
     };
-    LOG_DEBUG(log, "In memory snapshot {} created, queueing task to flash to disk", s.get_last_log_idx());
+    if (keeper_context->server_state == KeeperContext::Phase::SHUTDOWN)
+    {
+        LOG_INFO(log, "Creating a snapshot during shutdown because 'create_snapshot_on_exit' is enabled.");
+        snapshot_task.create_snapshot(std::move(snapshot_task.snapshot));
+        return;
+    }
+    LOG_DEBUG(log, "In memory snapshot {} created, queueing task to flush to disk", s.get_last_log_idx());
     /// Flush snapshot to disk in a separate thread.
     if (!snapshots_queue.push(std::move(snapshot_task)))
         LOG_WARNING(log, "Cannot push snapshot task into queue");


@@ -3,6 +3,8 @@
         <tcp_port>9181</tcp_port>
         <server_id>1</server_id>
+        <create_snapshot_on_exit>true</create_snapshot_on_exit>
         <coordination_settings>
             <operation_timeout_ms>10000</operation_timeout_ms>
             <session_timeout_ms>100000</session_timeout_ms>


@@ -80,6 +80,10 @@ ln -sf $SRC_PATH/dhparam.pem $DEST_SERVER_PATH/
 ln -sf --backup=simple --suffix=_original.xml \
     $SRC_PATH/config.d/query_masking_rules.xml $DEST_SERVER_PATH/config.d/
+# We randomize creating the snapshot on exit for Keeper to test out using older snapshots
+create_snapshot_on_exit=$(($RANDOM % 2))
+sed --follow-symlinks -i "s|<create_snapshot_on_exit>true</create_snapshot_on_exit>|<create_snapshot_on_exit>$create_snapshot_on_exit</create_snapshot_on_exit>|" $DEST_SERVER_PATH/config.d/keeper_port.xml
 if [[ -n "$USE_POLYMORPHIC_PARTS" ]] && [[ "$USE_POLYMORPHIC_PARTS" -eq 1 ]]; then
     ln -sf $SRC_PATH/config.d/polymorphic_parts.xml $DEST_SERVER_PATH/config.d/
 fi
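
The shell lines added above flip the functional-test Keeper config to a random 0/1 so that roughly half of the test runs exercise a snapshot written at shutdown while the other half start from older snapshots. For illustration only, a Python rendering of the same substitution might look like the sketch below; the config path is an assumption standing in for $DEST_SERVER_PATH/config.d/keeper_port.xml, and this script is not part of the PR.

#!/usr/bin/env python3
# Hypothetical Python equivalent of the shell snippet above (not part of this PR).
import random

KEEPER_CONFIG = "/etc/clickhouse-server/config.d/keeper_port.xml"  # assumed destination path


def randomize_create_snapshot_on_exit(path=KEEPER_CONFIG):
    """Flip <create_snapshot_on_exit> to 0 or 1 at random, mirroring $(($RANDOM % 2))."""
    value = random.randrange(2)
    with open(path) as f:
        text = f.read()
    text = text.replace(
        "<create_snapshot_on_exit>true</create_snapshot_on_exit>",
        f"<create_snapshot_on_exit>{value}</create_snapshot_on_exit>",
    )
    with open(path, "w") as f:
        f.write(text)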


@@ -0,0 +1 @@
#!/usr/bin/env python3


@@ -0,0 +1,28 @@
<clickhouse>
    <keeper_server>
        <tcp_port>9181</tcp_port>
        <server_id>1</server_id>
        <log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
        <snapshot_storage_path>/var/lib/clickhouse/coordination/snapshots</snapshot_storage_path>
        <create_snapshot_on_exit>true</create_snapshot_on_exit>

        <coordination_settings>
            <operation_timeout_ms>5000</operation_timeout_ms>
            <session_timeout_ms>10000</session_timeout_ms>
            <raft_logs_level>trace</raft_logs_level>
        </coordination_settings>

        <raft_configuration>
            <server>
                <id>1</id>
                <hostname>node1</hostname>
                <port>9234</port>
            </server>
            <server>
                <id>2</id>
                <hostname>node2</hostname>
                <port>9234</port>
            </server>
        </raft_configuration>
    </keeper_server>
</clickhouse>


@@ -0,0 +1,28 @@
<clickhouse>
    <keeper_server>
        <tcp_port>9181</tcp_port>
        <server_id>2</server_id>
        <log_storage_path>/var/lib/clickhouse/coordination/log</log_storage_path>
        <snapshot_storage_path>/var/lib/clickhouse/coordination/snapshots</snapshot_storage_path>
        <create_snapshot_on_exit>false</create_snapshot_on_exit>

        <coordination_settings>
            <operation_timeout_ms>5000</operation_timeout_ms>
            <session_timeout_ms>10000</session_timeout_ms>
            <raft_logs_level>trace</raft_logs_level>
        </coordination_settings>

        <raft_configuration>
            <server>
                <id>1</id>
                <hostname>node1</hostname>
                <port>9234</port>
            </server>
            <server>
                <id>2</id>
                <hostname>node2</hostname>
                <port>9234</port>
            </server>
        </raft_configuration>
    </keeper_server>
</clickhouse>


@@ -0,0 +1,52 @@
import pytest
from helpers.cluster import ClickHouseCluster
import os
from kazoo.client import KazooClient

cluster = ClickHouseCluster(__file__)

CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs")

node1 = cluster.add_instance(
    "node1", main_configs=["configs/enable_keeper1.xml"], stay_alive=True
)
node2 = cluster.add_instance(
    "node2", main_configs=["configs/enable_keeper2.xml"], stay_alive=True
)


def get_fake_zk(node, timeout=30.0):
    _fake_zk_instance = KazooClient(
        hosts=cluster.get_instance_ip(node.name) + ":9181", timeout=timeout
    )
    _fake_zk_instance.start()
    return _fake_zk_instance


@pytest.fixture(scope="module")
def started_cluster():
    try:
        cluster.start()

        yield cluster

    finally:
        cluster.shutdown()


def test_snapshot_on_exit(started_cluster):
    zk_conn = get_fake_zk(node1)

    zk_conn.create("/some_path", b"some_data")

    node1.stop_clickhouse()
    assert node1.contains_in_log("Created persistent snapshot")

    node1.start_clickhouse()
    assert node1.contains_in_log("Loaded snapshot")

    node2.stop_clickhouse()
    assert not node2.contains_in_log("Created persistent snapshot")

    node2.start_clickhouse()
    assert node2.contains_in_log("No existing snapshots")
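
The test above distinguishes the two behaviours purely through log messages. A possible extension, sketched below and not part of this PR, would also verify that data written before the graceful stop is readable again after node1 comes back; it reuses the module-level helpers from the test file, and "/persisted_path" is a hypothetical znode name chosen for the illustration.

def test_data_survives_restart(started_cluster):
    # Illustrative extra check (not in this PR): write a node, restart node1,
    # then read the value back through a fresh connection.
    zk_conn = get_fake_zk(node1)
    zk_conn.create("/persisted_path", b"persisted_data")

    node1.stop_clickhouse()
    node1.start_clickhouse()

    zk_conn = get_fake_zk(node1)
    data, _stat = zk_conn.get("/persisted_path")
    assert data == b"persisted_data"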