ClickHouse/tests/integration/test_keeper_snapshots/test.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

257 lines
7.5 KiB
Python
Raw Normal View History

2021-03-03 12:21:21 +00:00
#!/usr/bin/env python3
2024-09-27 10:19:39 +00:00
import os
2021-03-03 12:21:21 +00:00
import random
import string
2024-09-27 10:19:39 +00:00
import pytest
2024-03-01 08:39:56 +00:00
from kazoo.client import KazooClient
2021-03-03 12:21:21 +00:00
2024-09-27 10:19:39 +00:00
import helpers.keeper_utils as keeper_utils
from helpers.cluster import ClickHouseCluster
2021-03-03 12:21:21 +00:00
# Module-wide cluster object; the tests below reach the instance through the
# module-level name `node`.
cluster = ClickHouseCluster(__file__)
# clickhouse itself will use external zookeeper
node = cluster.add_instance(
    "node", main_configs=["configs/enable_keeper.xml"], stay_alive=True
)
2021-03-03 12:21:21 +00:00
def random_string(length):
    """Return a random string of ``length`` lowercase ASCII letters and digits."""
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)
2021-03-03 12:21:21 +00:00
def create_random_path(prefix="", depth=1):
    """Append ``depth`` random three-character components to ``prefix``.

    Components are produced by ``random_string(3)`` and joined with
    ``os.path.join``. A ``depth`` of zero or less returns ``prefix``
    unchanged.
    """
    path = prefix
    remaining = depth
    # Loop instead of recursing: the recursive form never reached its
    # ``depth == 0`` base case for a negative depth and was bounded by the
    # interpreter's recursion limit for a large one.
    while remaining > 0:
        path = os.path.join(path, random_string(3))
        remaining -= 1
    return path
2021-03-03 12:21:21 +00:00
@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped fixture that starts ``cluster`` and yields it to tests.

    ``cluster.shutdown()`` is called in ``finally``, so it runs both after
    the fixture is torn down and when ``cluster.start()`` itself raises.
    """
    try:
        cluster.start()
        yield cluster
    finally:
        cluster.shutdown()
2021-03-03 12:21:21 +00:00
def get_connection_zk(nodename, timeout=30.0):
    """Return a started ``KazooClient`` pointed at port 9181 of ``nodename``.

    The host address is resolved with ``cluster.get_instance_ip`` and
    ``timeout`` is forwarded to the ``KazooClient`` constructor. The client
    is returned still open; callers stop and close it themselves.
    """
    keeper_address = cluster.get_instance_ip(nodename) + ":9181"
    client = KazooClient(hosts=keeper_address, timeout=timeout)
    client.start()
    return client
2022-09-06 10:58:14 +00:00
def restart_clickhouse():
    """Restart ``node`` with ``kill=True``, then wait via ``keeper_utils``.

    The wait is delegated to ``keeper_utils.wait_until_connected`` on the
    module-level ``cluster`` and ``node``.
    """
    node.restart_clickhouse(kill=True)
    keeper_utils.wait_until_connected(cluster, node)
2021-03-03 12:21:21 +00:00
def test_state_after_restart(started_cluster):
    """Check that regular znodes and their data are intact after a restart.

    Creates a parent znode with 100 children holding random 123-byte
    payloads, deletes every child whose index is a multiple of 7, restarts
    the server, and verifies from a fresh connection that the deleted
    children are gone and the rest keep their data with
    ``ephemeralOwner == 0``.
    """
    keeper_utils.wait_until_connected(started_cluster, node)
    node_zk = None
    node_zk2 = None

    try:
        node_zk = get_connection_zk("node")

        node_zk.create("/test_state_after_restart", b"somevalue")
        strs = []
        for i in range(100):
            strs.append(random_string(123).encode())
            node_zk.create(f"/test_state_after_restart/node{i}", strs[i])

        existing_children = []
        for i in range(100):
            if i % 7 == 0:
                node_zk.delete(f"/test_state_after_restart/node{i}")
            else:
                existing_children.append(f"node{i}")

        restart_clickhouse()

        # Use a fresh connection for the checks after the restart.
        node_zk2 = get_connection_zk("node")

        assert node_zk2.get("/test_state_after_restart")[0] == b"somevalue"
        for i in range(100):
            child_path = f"/test_state_after_restart/node{i}"
            if i % 7 == 0:
                assert node_zk2.exists(child_path) is None
            else:
                data, stat = node_zk2.get(child_path)
                assert len(data) == 123
                assert data == strs[i]
                assert stat.ephemeralOwner == 0

        assert sorted(existing_children) == sorted(
            node_zk2.get_children("/test_state_after_restart")
        )
    finally:
        # Best-effort cleanup. Each client is closed on its own so a failure
        # on the first cannot leave the second open, and only ``Exception``
        # is suppressed so KeyboardInterrupt/SystemExit still propagate.
        for client in (node_zk, node_zk2):
            if client is None:
                continue
            try:
                client.stop()
                client.close()
            except Exception:
                pass
2021-03-03 12:31:21 +00:00
def test_ephemeral_after_restart(started_cluster):
    """Check that ephemeral znodes and their owner are intact after a restart.

    Creates a parent znode with 100 ephemeral children holding random
    123-byte payloads, deletes every child whose index is a multiple of 7,
    restarts the server, and verifies from a second connection that the
    deleted children are gone and the rest keep their data with
    ``ephemeralOwner`` equal to the creating client's session id.
    """
    keeper_utils.wait_until_connected(started_cluster, node)
    node_zk = None
    node_zk2 = None

    try:
        node_zk = get_connection_zk("node")

        # Session id of the creating client (read from a private KazooClient
        # attribute); compared with each node's ephemeralOwner below.
        session_id = node_zk._session_id
        node_zk.create("/test_ephemeral_after_restart", b"somevalue")
        strs = []
        for i in range(100):
            strs.append(random_string(123).encode())
            node_zk.create(
                f"/test_ephemeral_after_restart/node{i}", strs[i], ephemeral=True
            )

        existing_children = []
        for i in range(100):
            if i % 7 == 0:
                node_zk.delete(f"/test_ephemeral_after_restart/node{i}")
            else:
                existing_children.append(f"node{i}")

        restart_clickhouse()

        # The first client stays open on purpose: it is only stopped in the
        # cleanup below, after the checks have run.
        node_zk2 = get_connection_zk("node")

        assert node_zk2.get("/test_ephemeral_after_restart")[0] == b"somevalue"
        for i in range(100):
            child_path = f"/test_ephemeral_after_restart/node{i}"
            if i % 7 == 0:
                assert node_zk2.exists(child_path) is None
            else:
                data, stat = node_zk2.get(child_path)
                assert len(data) == 123
                assert data == strs[i]
                assert stat.ephemeralOwner == session_id

        assert sorted(existing_children) == sorted(
            node_zk2.get_children("/test_ephemeral_after_restart")
        )
    finally:
        # Best-effort cleanup. Each client is closed on its own so a failure
        # on the first cannot leave the second open, and only ``Exception``
        # is suppressed so KeyboardInterrupt/SystemExit still propagate.
        for client in (node_zk, node_zk2):
            if client is None:
                continue
            try:
                client.stop()
                client.close()
            except Exception:
                pass
2024-02-29 11:46:12 +00:00
def test_invalid_snapshot(started_cluster):
    """Check that startup fails loudly when the newest snapshot is truncated.

    Sends the ``csnp`` four-letter command, stops the server, truncates the
    snapshot file with the highest index to zero bytes, and expects the next
    start to fail with the two recovery messages in the log. The broken
    snapshot is then removed and the server is started again.
    """
    keeper_utils.wait_until_connected(started_cluster, node)
    node_zk = None
    try:
        node_zk = get_connection_zk("node")
        node_zk.create("/test_invalid_snapshot", b"somevalue")
        keeper_utils.send_4lw_cmd(started_cluster, node, "csnp")
        node.stop_clickhouse()

        listing = node.exec_in_container(
            ["ls", "/var/lib/clickhouse/coordination/snapshots"]
        )
        # Drop empty entries so an empty directory fails the assert below
        # instead of raising an opaque ValueError from int("").
        snapshots = [name for name in listing.strip().split("\n") if name]
        assert snapshots, "no snapshot files found in the snapshots directory"

        def snapshot_sort_key(snapshot_name):
            # File names look like "snapshot_<index>.<ext>"; order by <index>.
            snapshot_prefix_size = len("snapshot_")
            last_log_idx = snapshot_name.split(".")[0][snapshot_prefix_size:]
            return int(last_log_idx)

        snapshots.sort(key=snapshot_sort_key)
        last_snapshot = snapshots[-1]
        node.exec_in_container(
            [
                "truncate",
                "-s",
                "0",
                f"/var/lib/clickhouse/coordination/snapshots/{last_snapshot}",
            ]
        )

        node.start_clickhouse(start_wait_sec=120, expected_to_fail=True)
        assert node.contains_in_log("Failure to load from latest snapshot with index")
        assert node.contains_in_log(
            "Manual intervention is necessary for recovery. Problematic snapshot can be removed but it will lead to data loss"
        )

        # Recover the instance so later tests in the module can use it.
        node.stop_clickhouse()
        node.exec_in_container(
            [
                "rm",
                f"/var/lib/clickhouse/coordination/snapshots/{last_snapshot}",
            ]
        )
        node.start_clickhouse()
    finally:
        # Best-effort cleanup; only ``Exception`` is suppressed so
        # KeyboardInterrupt/SystemExit still propagate.
        if node_zk is not None:
            try:
                node_zk.stop()
                node_zk.close()
            except Exception:
                pass
2024-06-04 09:30:22 +00:00
def test_snapshot_size(started_cluster):
    """Check that ``zk_latest_snapshot_size`` is non-zero before and after restart.

    Creates a parent znode with 100 children, closes the client, sends the
    ``csnp`` four-letter command, waits for the "Created persistent snapshot"
    log line, and reads the metric from the ``mntr`` output before and after
    restarting the server.
    """
    keeper_utils.wait_until_connected(started_cluster, node)
    node_zk = None
    try:
        node_zk = get_connection_zk("node")

        node_zk.create("/test_state_size", b"somevalue")
        strs = []
        for i in range(100):
            strs.append(random_string(123).encode())
            node_zk.create(f"/test_state_size/node{i}", strs[i])

        node_zk.stop()
        node_zk.close()
        # Already closed; clear the reference so cleanup does not close it again.
        node_zk = None

        keeper_utils.send_4lw_cmd(started_cluster, node, "csnp")
        node.wait_for_log_line("Created persistent snapshot")

        def get_snapshot_size():
            # mntr output is scanned as tab-separated "name<TAB>value" lines.
            mntr_output = keeper_utils.send_4lw_cmd(started_cluster, node, "mntr")
            for line in mntr_output.split("\n"):
                if "zk_latest_snapshot_size" in line:
                    return int(line.split("\t")[1])
            # Fail with a readable message rather than a bare StopIteration.
            raise AssertionError("zk_latest_snapshot_size not found in mntr output")

        assert get_snapshot_size() != 0
        restart_clickhouse()
        assert get_snapshot_size() != 0
    finally:
        # Best-effort cleanup for the case where the test failed before the
        # client was closed above; only ``Exception`` is suppressed.
        if node_zk is not None:
            try:
                node_zk.stop()
                node_zk.close()
            except Exception:
                pass