ClickHouse/tests/integration/test_keeper_force_recovery/test.py

import os
import pytest
import socket
from helpers.cluster import ClickHouseCluster
import time


from kazoo.client import KazooClient, KazooRetry

CLUSTER_SIZE = 5
QUORUM_SIZE = CLUSTER_SIZE // 2 + 1


cluster = ClickHouseCluster(__file__)
CONFIG_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "configs")


def get_nodes():
    nodes = []
    for i in range(CLUSTER_SIZE):
        nodes.append(
            cluster.add_instance(
                f"node{i+1}",
                main_configs=[
                    f"configs/enable_keeper{i+1}.xml",
                    f"configs/use_keeper.xml",
                ],
                stay_alive=True,
            )
        )

    for i in range(CLUSTER_SIZE, CLUSTER_SIZE + QUORUM_SIZE):
        nodes.append(
            cluster.add_instance(f"node{i+1}", main_configs=[], stay_alive=True)
        )

    return nodes


nodes = get_nodes()


@pytest.fixture(scope="module")
def started_cluster():
    try:
        cluster.start()

        yield cluster
    finally:
        cluster.shutdown()


def get_fake_zk(nodename, timeout=30.0):
    _fake_zk_instance = KazooClient(
        hosts=cluster.get_instance_ip(nodename) + ":9181",
        timeout=timeout,
        connection_retry=KazooRetry(max_tries=10),
        command_retry=KazooRetry(max_tries=10),
    )

    _fake_zk_instance.start()
    return _fake_zk_instance


def get_keeper_socket(node_name):
    hosts = cluster.get_instance_ip(node_name)
    client = socket.socket()
    client.settimeout(10)
    client.connect((hosts, 9181))
    return client


def send_4lw_cmd(node_name, cmd="ruok"):
    client = None
    try:
        client = get_keeper_socket(node_name)
        client.send(cmd.encode())
        data = client.recv(100_000)
        data = data.decode()
        return data
    finally:
        if client is not None:
            client.close()


def wait_until_connected(node_name):
    while send_4lw_cmd(node_name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG:
        time.sleep(0.1)


def wait_nodes(nodes):
    for node in nodes:
        wait_until_connected(node.name)


def wait_and_assert_data(zk, path, data):
    while zk.retry(zk.exists, path) is None:
        time.sleep(0.1)
    assert zk.retry(zk.get, path)[0] == data.encode()


def close_zk(zk):
    zk.stop()
    zk.close()


NOT_SERVING_REQUESTS_ERROR_MSG = "This instance is not currently serving requests"


def test_cluster_recovery(started_cluster):
    node_zks = []
    try:
        # initial cluster of `cluster_size` nodes
        for node in nodes[CLUSTER_SIZE:]:
            node.stop_clickhouse()

        wait_nodes(nodes[:CLUSTER_SIZE])

        node_zks = [get_fake_zk(node.name) for node in nodes[:CLUSTER_SIZE]]

        data_in_cluster = []

        def add_data(zk, path, data):
            zk.retry(zk.create, path, data.encode())
            data_in_cluster.append((path, data))

        def assert_all_data(zk):
            for path, data in data_in_cluster:
                wait_and_assert_data(zk, path, data)

        for i, zk in enumerate(node_zks):
            add_data(zk, f"/test_force_recovery_node{i+1}", f"somedata{i+1}")

        for zk in node_zks:
            assert_all_data(zk)

        nodes[0].stop_clickhouse()

        # we potentially killed the leader node so we give time for election
        for _ in range(100):
            try:
                node_zks[1] = get_fake_zk(nodes[1].name, timeout=30.0)
                add_data(node_zks[1], "/test_force_recovery_extra", "somedataextra")
                break
            except Exception as ex:
                time.sleep(0.5)
                print(f"Retrying create on {nodes[1].name}, exception {ex}")
        else:
            raise Exception(f"Failed creating a node on {nodes[1].name}")

        for node_zk in node_zks[2:CLUSTER_SIZE]:
            wait_and_assert_data(node_zk, "/test_force_recovery_extra", "somedataextra")

        nodes[0].start_clickhouse()
        wait_until_connected(nodes[0].name)
        node_zks[0] = get_fake_zk(nodes[0].name)
        wait_and_assert_data(node_zks[0], "/test_force_recovery_extra", "somedataextra")

        # stop last quorum size nodes
        nodes_left = CLUSTER_SIZE - QUORUM_SIZE
        for node_zk in node_zks[nodes_left:CLUSTER_SIZE]:
            close_zk(node_zk)

        node_zks = node_zks[:nodes_left]

        for node in nodes[nodes_left:CLUSTER_SIZE]:
            node.stop_clickhouse()

        # wait for node1 to lose quorum
        while send_4lw_cmd(nodes[0].name, "mntr") != NOT_SERVING_REQUESTS_ERROR_MSG:
            time.sleep(0.2)

        nodes[0].copy_file_to_container(
            os.path.join(CONFIG_DIR, "recovered_keeper1.xml"),
            "/etc/clickhouse-server/config.d/enable_keeper1.xml",
        )

        nodes[0].query("SYSTEM RELOAD CONFIG")

        assert send_4lw_cmd(nodes[0].name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG
        send_4lw_cmd(nodes[0].name, "rcvr")
        assert send_4lw_cmd(nodes[0].name, "mntr") == NOT_SERVING_REQUESTS_ERROR_MSG

        # add one node to restore the quorum
        nodes[CLUSTER_SIZE].copy_file_to_container(
            os.path.join(
                CONFIG_DIR,
                f"enable_keeper{CLUSTER_SIZE+1}.xml",
            ),
            f"/etc/clickhouse-server/config.d/enable_keeper{CLUSTER_SIZE+1}.xml",
        )

        nodes[CLUSTER_SIZE].start_clickhouse()
        wait_until_connected(nodes[CLUSTER_SIZE].name)

        # node1 should have quorum now and accept requests
        wait_until_connected(nodes[0].name)

        node_zks.append(get_fake_zk(nodes[CLUSTER_SIZE].name))

        # add rest of the nodes
        for i in range(CLUSTER_SIZE + 1, len(nodes)):
            node = nodes[i]
            node.copy_file_to_container(
                os.path.join(CONFIG_DIR, f"enable_keeper{i+1}.xml"),
                f"/etc/clickhouse-server/config.d/enable_keeper{i+1}.xml",
            )
            node.start_clickhouse()
            wait_until_connected(node.name)
            node_zks.append(get_fake_zk(node.name))

        # refresh old zk sessions
        for i, node in enumerate(nodes[:nodes_left]):
            node_zks[i] = get_fake_zk(node.name)

        for zk in node_zks:
            assert_all_data(zk)

        # new nodes can achieve quorum without the recovery node (cluster should work properly from now on)
        nodes[0].stop_clickhouse()

        add_data(node_zks[-2], "/test_force_recovery_last", "somedatalast")
        wait_and_assert_data(node_zks[-1], "/test_force_recovery_last", "somedatalast")

        nodes[0].start_clickhouse()
        wait_until_connected(nodes[0].name)
        node_zks[0] = get_fake_zk(nodes[0].name)
        for zk in node_zks[:nodes_left]:
            assert_all_data(zk)
    finally:
        try:
            for zk_conn in node_zks:
                close_zk(zk_conn)
        except:
            pass