2023-09-07 00:36:39 +00:00
|
|
|
import io
|
|
|
|
import subprocess
|
2022-09-06 10:58:14 +00:00
|
|
|
import socket
|
|
|
|
import time
|
2023-09-07 00:36:39 +00:00
|
|
|
import typing as tp
|
|
|
|
import contextlib
|
|
|
|
import select
|
2023-04-20 13:26:02 +00:00
|
|
|
from kazoo.client import KazooClient
|
2023-09-07 00:36:39 +00:00
|
|
|
from helpers.cluster import ClickHouseCluster, ClickHouseInstance
|
|
|
|
from helpers.client import CommandRequest
|
|
|
|
|
|
|
|
|
2023-09-07 00:44:54 +00:00
|
|
|
def execute_keeper_client_query(
    cluster: ClickHouseCluster, node: ClickHouseInstance, query: str
) -> str:
    """Run a single query via a one-shot ``keeper-client`` invocation.

    The command line is built from the cluster's server binary, the IP of
    *node* and the cluster's ZooKeeper port; *query* is passed with ``-q``.

    Returns:
        Whatever ``CommandRequest.get_answer()`` returns for that command.
    """
    # Assemble argv piece by piece, in the same order the options appear.
    command = [cluster.server_bin_path, "keeper-client"]
    command += ["--host", str(cluster.get_instance_ip(node.name))]
    command += ["--port", str(cluster.zookeeper_port)]
    command += ["-q", query]

    # An empty stdin is handed to the request, the query travels in argv.
    return CommandRequest(command, stdin="").get_answer()
|
|
|
|
|
|
|
|
|
|
|
|
class KeeperException(Exception):
    """Error raised by the Keeper test helpers in this module."""
|
|
|
|
|
|
|
|
|
|
|
|
class KeeperClient(object):
    """Wrapper around a long-running ``keeper-client`` subprocess.

    The process is started with ``--tests-mode`` and driven through its
    stdin/stdout/stderr pipes. ``execute_query`` expects every reply on
    stdout to be terminated by a line ending with ``SEPARATOR`` and treats
    output on stderr as an error message.
    """

    # End-of-reply marker that execute_query() looks for on stdout.
    SEPARATOR = b"\a\a\a\a\n"

    def __init__(self, bin_path: str, host: str, port: int):
        """Start ``<bin_path> keeper-client`` connected to *host*:*port*."""
        self.bin_path = bin_path
        self.host = host
        self.port = port

        self.proc = subprocess.Popen(
            [
                bin_path,
                "keeper-client",
                "--host",
                host,
                "--port",
                str(port),
                "--log-level",
                "error",
                "--tests-mode",
                "--no-confirmation",
            ],
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )

        # Both output pipes are polled so that an error on stderr is noticed
        # without blocking on stdout.
        self.poller = select.epoll()
        self.poller.register(self.proc.stdout)
        self.poller.register(self.proc.stderr)

        # Maps the file descriptors reported by epoll back to pipe objects.
        self._fd_nums = {
            self.proc.stdout.fileno(): self.proc.stdout,
            self.proc.stderr.fileno(): self.proc.stderr,
        }

        self.stopped = False

    def execute_query(self, query: str, timeout: float = 60.0) -> str:
        """Send *query* to the subprocess and return its stripped, decoded reply.

        Args:
            query: Command line written to the client's stdin (a newline is
                appended).
            timeout: Seconds to wait for the first output on either pipe.
                It does not bound the subsequent line reads.

        Raises:
            TimeoutError: No pipe became readable within *timeout*.
            KeeperException: The client wrote to stderr, or closed stdout
                before sending the separator.
            ValueError: epoll reported an event other than readable data.
        """
        output = io.BytesIO()

        self.proc.stdin.write(query.encode() + b"\n")
        self.proc.stdin.flush()

        events = self.poller.poll(timeout)
        if not events:
            raise TimeoutError("Keeper client returned no output")

        for fd_num, event in events:
            if event & (select.EPOLLIN | select.EPOLLPRI):
                file = self._fd_nums[fd_num]

                if file == self.proc.stdout:
                    while True:
                        chunk = file.readline()
                        if chunk.endswith(self.SEPARATOR):
                            # Keep reply bytes that share a line with the
                            # separator instead of silently dropping them.
                            output.write(chunk[: -len(self.SEPARATOR)])
                            break

                        if not chunk:
                            # readline() returns b"" at EOF forever; without
                            # this guard the loop would never terminate.
                            raise KeeperException(
                                "Keeper client closed stdout before finishing the reply"
                            )

                        output.write(chunk)

                elif file == self.proc.stderr:
                    # Consume the separator outside of the assert so the read
                    # still happens when asserts are disabled (python -O).
                    separator = self.proc.stdout.readline()
                    assert separator == self.SEPARATOR
                    raise KeeperException(self.proc.stderr.readline().strip().decode())

            else:
                raise ValueError(f"Failed to read from pipe. Flag {event}")

        data = output.getvalue().strip().decode()
        return data

    def cd(self, path: str, timeout: float = 60.0):
        """Run ``cd <path>``."""
        self.execute_query(f"cd {path}", timeout)

    def ls(self, path: str, timeout: float = 60.0) -> list[str]:
        """Run ``ls <path>`` and return the reply split on single spaces."""
        return self.execute_query(f"ls {path}", timeout).split(" ")

    def create(self, path: str, value: str, timeout: float = 60.0):
        """Run ``create <path> <value>``."""
        self.execute_query(f"create {path} {value}", timeout)

    def get(self, path: str, timeout: float = 60.0) -> str:
        """Run ``get <path>`` and return the reply."""
        return self.execute_query(f"get {path}", timeout)

    def set(
        self,
        path: str,
        value: str,
        version: tp.Optional[int] = None,
        timeout: float = 60.0,
    ) -> None:
        """Run ``set <path> <value> [version]``.

        *version* is appended only when it is not ``None``. *timeout* is
        forwarded to ``execute_query`` like in the other command helpers.
        """
        self.execute_query(
            f"set {path} {value} {version if version is not None else ''}", timeout
        )

    def rm(
        self, path: str, version: tp.Optional[int] = None, timeout: float = 60.0
    ) -> None:
        """Run ``rm <path> [version]``; *version* is appended when not ``None``."""
        self.execute_query(
            f"rm {path} {version if version is not None else ''}", timeout
        )

    def exists(self, path: str, timeout: float = 60.0) -> bool:
        """Run ``exists <path>`` and convert the integer reply to ``bool``."""
        return bool(int(self.execute_query(f"exists {path}", timeout)))

    def stop(self):
        """Ask the client to exit and release its resources (idempotent).

        Raises:
            subprocess.TimeoutExpired: The process did not exit within 10
                seconds; it is killed before the exception propagates.
        """
        if not self.stopped:
            self.stopped = True
            try:
                self.proc.communicate(b"exit\n", timeout=10.0)
            except subprocess.TimeoutExpired:
                # communicate() does not kill the child on timeout; do it
                # here so a hung client is not left behind.
                self.proc.kill()
                self.proc.communicate()
                raise
            finally:
                self.poller.close()

    def sync(self, path: str, timeout: float = 60.0):
        """Run ``sync <path>``."""
        self.execute_query(f"sync {path}", timeout)

    def touch(self, path: str, timeout: float = 60.0):
        """Run ``touch <path>``."""
        self.execute_query(f"touch {path}", timeout)

    def find_big_family(self, path: str, n: int = 10, timeout: float = 60.0) -> str:
        """Run ``find_big_family <path> <n>`` and return the reply."""
        return self.execute_query(f"find_big_family {path} {n}", timeout)

    def find_super_nodes(self, threshold: int, timeout: float = 60.0) -> str:
        """Run ``find_super_nodes <threshold>`` and return the reply."""
        return self.execute_query(f"find_super_nodes {threshold}", timeout)

    def get_direct_children_number(self, path: str, timeout: float = 60.0) -> str:
        """Run ``get_direct_children_number <path>`` and return the reply."""
        return self.execute_query(f"get_direct_children_number {path}", timeout)

    def get_all_children_number(self, path: str, timeout: float = 60.0) -> str:
        """Run ``get_all_children_number <path>`` and return the reply."""
        return self.execute_query(f"get_all_children_number {path}", timeout)

    def delete_stale_backups(self, timeout: float = 60.0) -> str:
        """Run ``delete_stale_backups`` and return the reply."""
        return self.execute_query("delete_stale_backups", timeout)

    def reconfig(
        self,
        joining: tp.Optional[str],
        leaving: tp.Optional[str],
        new_members: tp.Optional[str],
        timeout: float = 60.0,
    ) -> str:
        """Run ``reconfig add|remove|set`` with exactly one non-empty argument.

        *joining* maps to ``add``, *leaving* to ``remove`` and *new_members*
        to ``set``.

        Raises:
            ValueError: Zero or more than one of the three arguments is
                non-empty.
        """
        if bool(joining) + bool(leaving) + bool(new_members) != 1:
            raise ValueError(
                "Exactly one of joining, leaving or new_members must be specified"
            )

        # Dispatch on truthiness, the same criterion the validation above
        # uses; an empty string must not select its operation.
        if joining:
            operation, members = "add", joining
        elif leaving:
            operation, members = "remove", leaving
        else:
            operation, members = "set", new_members

        return self.execute_query(f"reconfig {operation} {members}", timeout)

    @classmethod
    @contextlib.contextmanager
    def from_cluster(
        cls, cluster: "ClickHouseCluster", keeper_node: str, port: tp.Optional[int] = None
    ) -> tp.Iterator["KeeperClient"]:
        """Context manager yielding a client for *keeper_node* of *cluster*.

        The client uses the cluster's server binary and the node's IP; *port*
        falls back to ``cluster.zookeeper_port`` when falsy. The client is
        stopped when the ``with`` block exits.
        """
        client = cls(
            cluster.server_bin_path,
            cluster.get_instance_ip(keeper_node),
            port or cluster.zookeeper_port,
        )

        try:
            yield client
        finally:
            client.stop()
|
2022-11-08 11:56:20 +00:00
|
|
|
|
2022-09-06 10:58:14 +00:00
|
|
|
|
|
|
|
def get_keeper_socket(cluster, node, port=9181):
    """Open a TCP connection to *node*'s Keeper port and return the socket.

    Args:
        cluster: Object providing ``get_instance_ip(name)``.
        node: Object with a ``name`` attribute identifying the instance.
        port: TCP port to connect to.

    Returns:
        A connected ``socket.socket`` with a 10 second timeout. The caller
        is responsible for closing it.

    Raises:
        Whatever ``socket.connect`` raises; the socket is closed first.
    """
    host = cluster.get_instance_ip(node.name)
    client = socket.socket()
    try:
        client.settimeout(10)
        client.connect((host, port))
    except BaseException:
        # Do not leave an open descriptor behind when connecting fails.
        client.close()
        raise
    return client
|
|
|
|
|
|
|
|
|
|
|
|
def send_4lw_cmd(cluster, node, cmd="ruok", port=9181):
    """Send a four-letter-word command to *node* and return the decoded reply.

    A fresh connection is opened through ``get_keeper_socket`` for every call
    and closed before returning. Only a single ``recv`` of up to 100_000
    bytes is performed, so the result is whatever arrives in that one read.
    """
    # The socket is its own context manager: it is closed on exit, and
    # nothing needs closing when get_keeper_socket() itself fails.
    with get_keeper_socket(cluster, node, port) as client:
        # sendall() retries until the whole command is written, send() may
        # return after a partial write.
        client.sendall(cmd.encode())
        return client.recv(100_000).decode()
|
|
|
|
|
|
|
|
|
|
|
|
# Reply to the "mntr" command that the wait helpers in this module compare
# against to decide whether a node is (not) serving requests.
NOT_SERVING_REQUESTS_ERROR_MSG = "This instance is not currently serving requests"
|
|
|
|
|
|
|
|
|
2023-04-20 13:26:02 +00:00
|
|
|
def wait_until_connected(cluster, node, port=9181, timeout=30.0):
    """Block until *node* stops answering ``mntr`` with the not-serving message.

    The node is polled every 0.1 seconds.

    Raises:
        Exception: The node is still not serving requests after *timeout*
            seconds of wall-clock time.
    """
    # A monotonic deadline also accounts for the time spent inside
    # send_4lw_cmd(), not only for the sleeps between attempts.
    deadline = time.monotonic() + timeout

    while send_4lw_cmd(cluster, node, "mntr", port) == NOT_SERVING_REQUESTS_ERROR_MSG:
        if time.monotonic() >= deadline:
            raise Exception(
                f"{timeout}s timeout while waiting for {node.name} to start serving requests"
            )
        time.sleep(0.1)
|
2022-09-06 10:58:14 +00:00
|
|
|
|
|
|
|
|
|
|
|
def wait_until_quorum_lost(cluster, node, port=9181):
    """Poll *node* with ``mntr`` every 0.1s until it reports it is not serving.

    There is no timeout: the call returns only once the reply equals
    ``NOT_SERVING_REQUESTS_ERROR_MSG``.
    """
    while True:
        reply = send_4lw_cmd(cluster, node, "mntr", port)
        if reply == NOT_SERVING_REQUESTS_ERROR_MSG:
            return
        time.sleep(0.1)
|
|
|
|
|
|
|
|
|
|
|
|
def wait_nodes(cluster, nodes):
    """Wait, one node after another, until every node in *nodes* is connected.

    Each node is passed to ``wait_until_connected`` with its default port
    and timeout.
    """
    for instance in nodes:
        wait_until_connected(cluster, instance)
|
2022-11-08 10:44:43 +00:00
|
|
|
|
|
|
|
|
|
|
|
def is_leader(cluster, node, port=9181):
    """Return True when the ``stat`` reply of *node* contains ``Mode: leader``."""
    return "Mode: leader" in send_4lw_cmd(cluster, node, "stat", port)
|
2022-11-16 01:33:46 +00:00
|
|
|
|
|
|
|
|
|
|
|
def get_leader(cluster, nodes):
    """Return the first node in *nodes* for which ``is_leader`` is true.

    Nodes are probed in order and probing stops at the first leader found.

    Raises:
        Exception: None of the nodes reports itself as leader.
    """
    not_found = object()
    # The generator is lazy, so nodes after the first leader are not queried.
    leader = next(
        (candidate for candidate in nodes if is_leader(cluster, candidate)),
        not_found,
    )
    if leader is not_found:
        raise Exception("No leader in Keeper cluster.")
    return leader
|
2023-04-20 13:26:02 +00:00
|
|
|
|
|
|
|
|
|
|
|
def get_fake_zk(
    cluster, node, timeout: float = 30.0, port: int = 9181
) -> KazooClient:
    """Create and start a ``KazooClient`` connected to *node*.

    Args:
        cluster: Object providing ``get_instance_ip(name)``.
        node: Object with a ``name`` attribute identifying the instance.
        timeout: Passed to ``KazooClient`` as its ``timeout`` argument.
        port: Keeper client port; defaults to 9181 like the other helpers
            in this module.

    Returns:
        The started client. The caller is responsible for stopping it.
    """
    address = f"{cluster.get_instance_ip(node.name)}:{port}"
    _fake = KazooClient(hosts=address, timeout=timeout)
    _fake.start()
    return _fake
|
|
|
|
|
|
|
|
|
2023-09-07 00:36:39 +00:00
|
|
|
def get_config_str(zk: KeeperClient) -> str:
    """
    Fetch the /keeper/config node through zk and return its contents
    """
    config_path = "/keeper/config"
    return zk.get(config_path)
|
2023-04-20 13:26:02 +00:00
|
|
|
|
|
|
|
|
2023-09-07 00:36:39 +00:00
|
|
|
def wait_configs_equal(left_config: str, right_zk: KeeperClient, timeout: float = 30.0):
    """
    Wait until /keeper/config read through right_zk matches left_config.

    Both configs are compared as sorted lists of lines, so line order does
    not matter. The right-hand config is re-read once per second.

    Raises Exception when the configs still differ after `timeout` seconds
    of wall-clock time.
    """
    expected_lines = sorted(left_config.split("\n"))
    # A monotonic deadline also accounts for the time spent querying the
    # config, not only for the sleeps between attempts.
    deadline = time.monotonic() + timeout

    while True:
        right_config = get_config_str(right_zk)
        if sorted(right_config.split("\n")) == expected_lines:
            return

        if time.monotonic() >= deadline:
            # Report the value that was actually compared instead of issuing
            # one more query just for the error message.
            raise Exception(
                f"timeout while checking nodes configs to get equal. "
                f"Left: {left_config}, right: {right_config}"
            )

        time.sleep(1)
|