# ClickHouse/tests/integration/test_keeper_force_recovery/test.py

2022-04-14 10:30:35 +00:00
import os
import pytest
import socket
from helpers.cluster import ClickHouseCluster
2022-09-06 10:58:14 +00:00
import helpers.keeper_utils as keeper_utils
2022-04-14 10:30:35 +00:00
import time
from kazoo.client import KazooClient, KazooRetry
2022-04-20 07:23:21 +00:00
# Number of instances that get a Keeper config up front (see get_nodes).
CLUSTER_SIZE = 5
# Smallest strict majority of CLUSTER_SIZE; also the number of extra instances
# registered without a Keeper config.
QUORUM_SIZE = CLUSTER_SIZE // 2 + 1
2022-04-20 07:23:21 +00:00
# Cluster object shared by the fixture, the helpers and the test in this module.
cluster = ClickHouseCluster(__file__)
# Absolute path of the "configs" directory that sits next to this file; the
# test copies Keeper XML configs from here into the containers.
CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs")
2022-04-20 07:23:21 +00:00
def get_nodes():
    """Register every instance used by this test and return them in order.

    The first ``CLUSTER_SIZE`` instances (``node1`` .. ``node<CLUSTER_SIZE>``)
    are registered with their own ``enable_keeper<N>.xml`` plus
    ``use_keeper.xml``. The following ``QUORUM_SIZE`` instances are registered
    with an empty ``main_configs`` list. Every instance uses
    ``stay_alive=True``.

    Returns:
        list: the objects returned by ``cluster.add_instance``, in
        registration order.
    """
    configured = [
        cluster.add_instance(
            f"node{number}",
            main_configs=[
                f"configs/enable_keeper{number}.xml",
                "configs/use_keeper.xml",
            ],
            stay_alive=True,
        )
        for number in range(1, CLUSTER_SIZE + 1)
    ]

    # These instances receive no Keeper config at registration time.
    unconfigured = [
        cluster.add_instance(f"node{number}", main_configs=[], stay_alive=True)
        for number in range(CLUSTER_SIZE + 1, CLUSTER_SIZE + QUORUM_SIZE + 1)
    ]

    return configured + unconfigured
2022-04-14 10:30:35 +00:00
2022-04-20 07:23:21 +00:00
# Instances are registered at import time, i.e. before the `started_cluster`
# fixture calls cluster.start().
nodes = get_nodes()
2022-04-14 10:30:35 +00:00
2022-04-20 07:23:21 +00:00
@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture that starts the cluster and yields it to tests.

    Yields:
        The module-level ``cluster`` object after ``cluster.start()``.
    """
    try:
        cluster.start()
        yield cluster
    finally:
        # Reached after the tests are done and also when start() itself
        # raises, so the cluster is shut down in both cases.
        cluster.shutdown()
2022-04-14 10:30:35 +00:00
2022-04-20 07:23:21 +00:00
def get_fake_zk(nodename, timeout=30.0):
    """Create and start a Kazoo client pointed at port 9181 of one instance.

    Args:
        nodename: instance name, resolved to an IP through
            ``cluster.get_instance_ip``.
        timeout: forwarded to ``KazooClient`` as its ``timeout`` argument.

    Returns:
        A ``KazooClient`` on which ``start()`` has already been called. It is
        returned open; ``close_zk`` is the matching shutdown helper.
    """
    address = cluster.get_instance_ip(nodename) + ":9181"
    connection_policy = KazooRetry(max_tries=10)
    command_policy = KazooRetry(max_tries=10)

    client = KazooClient(
        hosts=address,
        timeout=timeout,
        connection_retry=connection_policy,
        command_retry=command_policy,
    )
    client.start()
    return client
2022-04-14 10:30:35 +00:00
def wait_and_assert_data(zk, path, data):
    """Wait until ``path`` exists, then assert that it stores ``data``.

    Existence is polled every 0.1 s through ``zk.retry``; there is no upper
    bound on the wait.

    Args:
        zk: client exposing ``retry``, ``exists`` and ``get``.
        path: znode path to wait for.
        data: expected content as ``str``; compared against the stored bytes
            after ``data.encode()``.

    Raises:
        AssertionError: if the stored value differs from the encoded ``data``.
    """
    while True:
        if zk.retry(zk.exists, path) is not None:
            break
        time.sleep(0.1)

    stored = zk.retry(zk.get, path)[0]
    assert stored == data.encode()
2022-04-14 10:30:35 +00:00
def close_zk(zk):
    """Shut a client down by calling ``stop()`` and then ``close()`` on it.

    Args:
        zk: object exposing ``stop`` and ``close`` (e.g. the client returned
            by ``get_fake_zk``).
    """
    zk.stop()
    zk.close()
2022-04-20 07:23:21 +00:00
def test_cluster_recovery(started_cluster):
    """Exercise Keeper force recovery after the cluster loses its quorum.

    Scenario, in order:

    1. Stop the instances beyond ``CLUSTER_SIZE``, wait for the first
       ``CLUSTER_SIZE`` nodes, write one znode through each of them and check
       that every node sees every znode.
    2. Stop node1, write another znode through node2 (retrying, since the
       stopped node may have been the leader), start node1 again and check
       that it sees the new znode.
    3. Stop the last ``QUORUM_SIZE`` nodes of the initial cluster and wait
       until node1 reports that quorum is lost.
    4. Replace node1's Keeper config with ``recovered_keeper1.xml``, reload
       the config and send the ``rcvr`` four-letter command; ``mntr`` must
       answer with the "not serving requests" message both before and after.
    5. Copy Keeper configs to the previously stopped extra instances and
       start them one by one, then check all data through every session.
    6. Stop node1 once more, write and read a znode through the two newest
       sessions, start node1 and check all data on the surviving old nodes.

    Args:
        started_cluster: fixture requested only so that the cluster is
            running; the test uses the module-level ``cluster`` and ``nodes``.
    """
    node_zks = []
    # Sessions that were superseded by a fresh one. They are not closed in the
    # middle of the test (so its timing is unchanged) but are kept here so the
    # cleanup below can close them instead of leaking them.
    replaced_zks = []

    def reconnect(index, **zk_kwargs):
        """Open a new session to nodes[index] and store it in node_zks[index]."""
        fresh = get_fake_zk(nodes[index].name, **zk_kwargs)
        replaced_zks.append(node_zks[index])
        node_zks[index] = fresh
        return fresh

    try:
        # initial cluster of `CLUSTER_SIZE` nodes
        for node in nodes[CLUSTER_SIZE:]:
            node.stop_clickhouse()

        keeper_utils.wait_nodes(cluster, nodes[:CLUSTER_SIZE])

        node_zks = [get_fake_zk(node.name) for node in nodes[:CLUSTER_SIZE]]

        # (path, data) pairs written so far; every check replays all of them.
        data_in_cluster = []

        def add_data(zk, path, data):
            zk.retry(zk.create, path, data.encode())
            data_in_cluster.append((path, data))

        def assert_all_data(zk):
            for path, data in data_in_cluster:
                wait_and_assert_data(zk, path, data)

        for i, zk in enumerate(node_zks):
            add_data(zk, f"/test_force_recovery_node{i+1}", f"somedata{i+1}")

        for zk in node_zks:
            assert_all_data(zk)

        nodes[0].stop_clickhouse()

        # we potentially killed the leader node so we give time for election
        for _ in range(100):
            try:
                reconnect(1, timeout=30.0)
                add_data(node_zks[1], "/test_force_recovery_extra", "somedataextra")
                break
            except Exception as ex:
                time.sleep(0.5)
                print(f"Retrying create on {nodes[1].name}, exception {ex}")
        else:
            raise Exception(f"Failed creating a node on {nodes[1].name}")

        for node_zk in node_zks[2:CLUSTER_SIZE]:
            wait_and_assert_data(node_zk, "/test_force_recovery_extra", "somedataextra")

        nodes[0].start_clickhouse()
        keeper_utils.wait_until_connected(cluster, nodes[0])
        reconnect(0)
        wait_and_assert_data(node_zks[0], "/test_force_recovery_extra", "somedataextra")

        # stop last quorum size nodes
        nodes_left = CLUSTER_SIZE - QUORUM_SIZE
        for node_zk in node_zks[nodes_left:CLUSTER_SIZE]:
            close_zk(node_zk)

        node_zks = node_zks[:nodes_left]

        for node in nodes[nodes_left:CLUSTER_SIZE]:
            node.stop_clickhouse()

        # wait for node1 to lose quorum
        keeper_utils.wait_until_quorum_lost(cluster, nodes[0])

        # Overwrite node1's Keeper config inside the container with the
        # recovery variant and make the server pick it up.
        nodes[0].copy_file_to_container(
            os.path.join(CONFIG_DIR, "recovered_keeper1.xml"),
            "/etc/clickhouse-server/config.d/enable_keeper1.xml",
        )

        nodes[0].query("SYSTEM RELOAD CONFIG")

        assert (
            keeper_utils.send_4lw_cmd(cluster, nodes[0], "mntr")
            == keeper_utils.NOT_SERVING_REQUESTS_ERROR_MSG
        )
        keeper_utils.send_4lw_cmd(cluster, nodes[0], "rcvr")
        # Right after `rcvr` node1 must still refuse requests.
        assert (
            keeper_utils.send_4lw_cmd(cluster, nodes[0], "mntr")
            == keeper_utils.NOT_SERVING_REQUESTS_ERROR_MSG
        )

        # add one node to restore the quorum
        nodes[CLUSTER_SIZE].copy_file_to_container(
            os.path.join(
                CONFIG_DIR,
                f"enable_keeper{CLUSTER_SIZE+1}.xml",
            ),
            f"/etc/clickhouse-server/config.d/enable_keeper{CLUSTER_SIZE+1}.xml",
        )

        nodes[CLUSTER_SIZE].start_clickhouse()
        keeper_utils.wait_until_connected(cluster, nodes[CLUSTER_SIZE])

        # node1 should have quorum now and accept requests
        keeper_utils.wait_until_connected(cluster, nodes[0])

        node_zks.append(get_fake_zk(nodes[CLUSTER_SIZE].name))

        # add rest of the nodes
        for i in range(CLUSTER_SIZE + 1, len(nodes)):
            node = nodes[i]
            node.copy_file_to_container(
                os.path.join(CONFIG_DIR, f"enable_keeper{i+1}.xml"),
                f"/etc/clickhouse-server/config.d/enable_keeper{i+1}.xml",
            )
            node.start_clickhouse()
            keeper_utils.wait_until_connected(cluster, node)
            node_zks.append(get_fake_zk(node.name))

        # refresh old zk sessions
        for i in range(len(nodes[:nodes_left])):
            reconnect(i)

        for zk in node_zks:
            assert_all_data(zk)

        # new nodes can achieve quorum without the recovery node
        # (cluster should work properly from now on)
        nodes[0].stop_clickhouse()

        add_data(node_zks[-2], "/test_force_recovery_last", "somedatalast")
        wait_and_assert_data(node_zks[-1], "/test_force_recovery_last", "somedatalast")

        nodes[0].start_clickhouse()
        keeper_utils.wait_until_connected(cluster, nodes[0])
        reconnect(0)
        for zk in node_zks[:nodes_left]:
            assert_all_data(zk)
    finally:
        # Best-effort cleanup. Each session is closed on its own so that one
        # failing close cannot leave the remaining sessions open, and only
        # Exception is swallowed so KeyboardInterrupt/SystemExit still propagate.
        for zk_conn in node_zks + replaced_zks:
            try:
                close_zk(zk_conn)
            except Exception as ex:
                print(f"Ignoring error while closing zk session: {ex}")