ClickHouse/tests/integration/test_keeper_s3_snapshot/test.py

from multiprocessing.dummy import Pool
from time import sleep

import pytest
from kazoo.client import KazooClient
from minio.deleteobjects import DeleteObject

from helpers import keeper_utils
from helpers.cluster import ClickHouseCluster
from helpers.retry_decorator import retry

# from kazoo.protocol.serialization import Connect, read_buffer, write_buffer

# Three-instance cluster; each instance gets its own Keeper config file and
# is created with MinIO enabled.
cluster = ClickHouseCluster(__file__)

# Keyword options shared by every instance below.
_COMMON_INSTANCE_OPTIONS = {"stay_alive": True, "with_minio": True}

node1 = cluster.add_instance(
    "node1",
    main_configs=["configs/keeper_config1.xml"],
    **_COMMON_INSTANCE_OPTIONS,
)
node2 = cluster.add_instance(
    "node2",
    main_configs=["configs/keeper_config2.xml"],
    **_COMMON_INSTANCE_OPTIONS,
)
node3 = cluster.add_instance(
    "node3",
    main_configs=["configs/keeper_config3.xml"],
    **_COMMON_INSTANCE_OPTIONS,
)


@pytest.fixture(scope="module")
def started_cluster():
    """Start the cluster once per module and create the "snapshots" bucket.

    Yields the module-level ``cluster`` object. The cluster is shut down when
    the fixture is torn down, and also when startup or bucket creation fails.
    """
    bucket_name = "snapshots"
    try:
        cluster.start()

        minio = cluster.minio_client
        minio.make_bucket(bucket_name)

        yield cluster

    finally:
        cluster.shutdown()


def get_fake_zk(nodename, timeout=30.0):
    """Create and start a KazooClient pointed at `nodename` on port 9181.

    Args:
        nodename: Name of the cluster instance whose IP is looked up.
        timeout: Value passed to ``KazooClient`` as its ``timeout`` argument.

    Returns:
        The started client. The caller is responsible for destroying it.
    """
    address = cluster.get_instance_ip(nodename) + ":9181"
    client = KazooClient(hosts=address, timeout=timeout)
    client.start()
    return client


def destroy_zk_client(zk):
    """Stop and close a zk client on a best-effort basis.

    Args:
        zk: Client exposing ``stop()`` and ``close()``; a falsy value
            (e.g. ``None``) is ignored.

    Ordinary errors raised during teardown are reported and suppressed so
    that cleanup never fails the caller. ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately not caught, so the test run can still be
    interrupted while a client is being torn down.
    """
    if not zk:
        return
    try:
        zk.stop()
        zk.close()
    except Exception as ex:
        # Best effort: report the problem instead of silently hiding it.
        print("Failed to destroy zk client, exception", ex)


def wait_node(node):
    """Wait until `node` answers a ``sync("/")`` request from a fresh client.

    Makes up to 100 attempts. Every attempt creates its own client and
    destroys it afterwards, whether or not the attempt succeeded. After a
    failed attempt the function sleeps for 0.2 seconds before trying again.

    Args:
        node: Cluster instance; only its ``name`` attribute is used.

    Raises:
        Exception: If none of the attempts succeeds.
    """
    for _ in range(100):
        zk = None
        try:
            zk = get_fake_zk(node.name, timeout=30.0)
            zk.sync("/")
            print("node", node.name, "ready")
            # The `finally` clause below still runs before returning.
            return
        except Exception as ex:
            sleep(0.2)
            print("Waiting until", node.name, "will be ready, exception", ex)
        finally:
            destroy_zk_client(zk)

    # A single formatted message; passing several positional arguments to
    # Exception would render the message as a tuple.
    raise Exception(f"Can't wait node {node.name} to become ready")


def delete_keeper_snapshots_logs(nodex):
    """Remove the coordination `log` and `snapshots` directories in the node's container.

    Args:
        nodex: Instance whose ``exec_in_container`` is used to run the command.
    """
    cleanup_script = "rm -rf /var/lib/clickhouse/coordination/log /var/lib/clickhouse/coordination/snapshots"
    shell_command = ["bash", "-c", cleanup_script]
    nodex.exec_in_container(shell_command)


def test_s3_upload(started_cluster):
    """Check that snapshots are uploaded to S3, including after node1 is killed.

    Steps:
      1. Create 210 sequential nodes through node1 and expect exactly the
         snapshots 50/100/150/200 in the "snapshots" bucket.
      2. Kill node1, create 200 more nodes through node2, and expect more
         than four snapshots in the bucket plus a "Successfully uploaded"
         line in the log of node2 or node3.
      3. Remove the local coordination logs/snapshots and the objects in the
         bucket, then restart all three nodes so the test can run again.

    The zk clients are destroyed in ``finally`` blocks, so a failed assertion
    does not leave a connected client behind.
    """

    def get_saved_snapshots():
        return [
            obj.object_name
            for obj in cluster.minio_client.list_objects("snapshots")
        ]

    def delete_s3_snapshots():
        for s in cluster.minio_client.list_objects("snapshots"):
            cluster.minio_client.remove_object("snapshots", s.object_name)

    # Keeper sends snapshots asynchronously, hence we need to retry.
    def _check_snapshots():
        assert set(get_saved_snapshots()) == {
            "snapshot_50.bin.zstd",
            "snapshot_100.bin.zstd",
            "snapshot_150.bin.zstd",
            "snapshot_200.bin.zstd",
        }

    def _check_snapshots_without_quorum():
        assert len(get_saved_snapshots()) > 4

    node1_zk = get_fake_zk(node1.name)
    try:
        # we defined in configs snapshot_distance as 50
        # so after 50 requests we should generate a snapshot
        for _ in range(210):
            node1_zk.create("/test", sequence=True)

        retry(AssertionError, retries=10, delay=2, jitter=0, backoff=1)(
            _check_snapshots
        )
    finally:
        destroy_zk_client(node1_zk)

    node1.stop_clickhouse(kill=True)

    # wait for new leader to be picked and that it continues
    # uploading snapshots
    wait_node(node2)
    node2_zk = get_fake_zk(node2.name)
    try:
        for _ in range(200):
            node2_zk.create("/test", sequence=True)

        retry(AssertionError, retries=10, delay=2, jitter=0, backoff=1)(
            _check_snapshots_without_quorum
        )

        _check_snapshots_without_quorum()

        success_upload_message = "Successfully uploaded"
        assert node2.contains_in_log(
            success_upload_message
        ) or node3.contains_in_log(success_upload_message)
    finally:
        destroy_zk_client(node2_zk)

    node2.stop_clickhouse()
    delete_keeper_snapshots_logs(node2)
    node3.stop_clickhouse()
    delete_keeper_snapshots_logs(node3)
    delete_keeper_snapshots_logs(node1)

    # Start all nodes concurrently. The pool is used as a context manager so
    # its worker threads are released; everything that depends on the pool
    # must therefore happen inside the `with` block.
    with Pool(3) as pool:
        waiters = [
            pool.apply_async(node.start_clickhouse)
            for node in (node1, node2, node3)
        ]

        delete_s3_snapshots()  # for next iteration

        # wait() does not re-raise errors from the workers; readiness is
        # verified by the wait_until_connected calls below.
        for waiter in waiters:
            waiter.wait()

    keeper_utils.wait_until_connected(cluster, node1)
    keeper_utils.wait_until_connected(cluster, node2)
    keeper_utils.wait_until_connected(cluster, node3)
Proper cleanup & restart for subsequent iterations 2024-09-05 11:37:57 +00:00			`from multiprocessing.dummy import Pool`
Automatic style fix 2024-09-27 10:19:39 +00:00			`from time import sleep`
Add tests for S3 snapshot upload 2022-09-15 13:37:17 +00:00
Automatic style fix 2024-09-27 10:19:39 +00:00			`import pytest`
Add tests for S3 snapshot upload 2022-09-15 13:37:17 +00:00			`from kazoo.client import KazooClient`
Automatic style fix 2024-09-27 10:19:39 +00:00			`from minio.deleteobjects import DeleteObject`

Simplify retry decorator, use it in tests 2024-09-30 20:13:56 +00:00			`from helpers import keeper_utils`
Automatic style fix 2024-09-27 10:19:39 +00:00			`from helpers.cluster import ClickHouseCluster`
Simplify retry decorator, use it in tests 2024-09-30 20:13:56 +00:00			`from helpers.retry_decorator import retry`
Add tests for S3 snapshot upload 2022-09-15 13:37:17 +00:00
			`# from kazoo.protocol.serialization import Connect, read_buffer, write_buffer`

			`cluster = ClickHouseCluster(__file__)`
			`node1 = cluster.add_instance(`
			`"node1",`
			`main_configs=["configs/keeper_config1.xml"],`
			`stay_alive=True,`
			`with_minio=True,`
			`)`
			`node2 = cluster.add_instance(`
			`"node2",`
			`main_configs=["configs/keeper_config2.xml"],`
			`stay_alive=True,`
			`with_minio=True,`
			`)`
			`node3 = cluster.add_instance(`
			`"node3",`
			`main_configs=["configs/keeper_config3.xml"],`
			`stay_alive=True,`
			`with_minio=True,`
			`)`


			`@pytest.fixture(scope="module")`
			`def started_cluster():`
			`try:`
			`cluster.start()`

			`cluster.minio_client.make_bucket("snapshots")`

			`yield cluster`

			`finally:`
			`cluster.shutdown()`


			`def get_fake_zk(nodename, timeout=30.0):`
			`_fake_zk_instance = KazooClient(`
			`hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout`
			`)`
			`_fake_zk_instance.start()`
			`return _fake_zk_instance`


			`def destroy_zk_client(zk):`
			`try:`
			`if zk:`
			`zk.stop()`
			`zk.close()`
			`except:`
			`pass`


			`def wait_node(node):`
			`for _ in range(100):`
			`zk = None`
			`try:`
			`zk = get_fake_zk(node.name, timeout=30.0)`
			`zk.sync("/")`
			`print("node", node.name, "ready")`
			`break`
			`except Exception as ex:`
			`sleep(0.2)`
			`print("Waiting until", node.name, "will be ready, exception", ex)`
			`finally:`
			`destroy_zk_client(zk)`
			`else:`
			`raise Exception("Can't wait node", node.name, "to become ready")`


Proper cleanup & restart for subsequent iterations 2024-09-05 11:37:57 +00:00			`def delete_keeper_snapshots_logs(nodex):`
			`nodex.exec_in_container(`
			`[`
			`"bash",`
			`"-c",`
Python formatting 2024-09-05 13:57:46 +00:00			`"rm -rf /var/lib/clickhouse/coordination/log /var/lib/clickhouse/coordination/snapshots",`
Proper cleanup & restart for subsequent iterations 2024-09-05 11:37:57 +00:00			`]`
			`)`


Add tests for S3 snapshot upload 2022-09-15 13:37:17 +00:00			`def test_s3_upload(started_cluster):`
Proper cleanup & restart for subsequent iterations 2024-09-05 11:37:57 +00:00
Add tests for S3 snapshot upload 2022-09-15 13:37:17 +00:00			`node1_zk = get_fake_zk(node1.name)`

Address PR comments 2022-09-22 13:03:27 +00:00			`# we defined in configs snapshot_distance as 50`
			`# so after 50 requests we should generate a snapshot`
Add tests for S3 snapshot upload 2022-09-15 13:37:17 +00:00			`for _ in range(210):`
			`node1_zk.create("/test", sequence=True)`

			`def get_saved_snapshots():`
			`return [`
			`obj.object_name`
			`for obj in list(cluster.minio_client.list_objects("snapshots"))`
			`]`

Proper cleanup & restart for subsequent iterations 2024-09-05 11:37:57 +00:00			`def delete_s3_snapshots():`
			`snapshots = cluster.minio_client.list_objects("snapshots")`
			`for s in snapshots:`
Python formatting 2024-09-05 13:57:46 +00:00			`cluster.minio_client.remove_object("snapshots", s.object_name)`
Proper cleanup & restart for subsequent iterations 2024-09-05 11:37:57 +00:00
Fix flakiness of test_keeper_s3_snapshot This is part of "Why my PR is not merged?" patch set series. Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> 2023-07-13 11:29:45 +00:00			`# Keeper sends snapshots asynchornously, hence we need to retry.`
			`def _check_snapshots():`
Proper cleanup & restart for subsequent iterations 2024-09-05 11:37:57 +00:00			`assert set(get_saved_snapshots()) == set(`
Fix flakiness of test_keeper_s3_snapshot This is part of "Why my PR is not merged?" patch set series. Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> 2023-07-13 11:29:45 +00:00			`[`
			`"snapshot_50.bin.zstd",`
			`"snapshot_100.bin.zstd",`
			`"snapshot_150.bin.zstd",`
			`"snapshot_200.bin.zstd",`
			`]`
			`)`

Improve the @retry API 2024-09-30 20:42:47 +00:00			`retry(AssertionError, retries=10, delay=2, jitter=0, backoff=1)(_check_snapshots)`
Add tests for S3 snapshot upload 2022-09-15 13:37:17 +00:00
			`destroy_zk_client(node1_zk)`
			`node1.stop_clickhouse(kill=True)`

Address PR comments 2022-09-22 13:03:27 +00:00			`# wait for new leader to be picked and that it continues`
			`# uploading snapshots`
Add tests for S3 snapshot upload 2022-09-15 13:37:17 +00:00			`wait_node(node2)`
			`node2_zk = get_fake_zk(node2.name)`
			`for _ in range(200):`
			`node2_zk.create("/test", sequence=True)`

Fix flakiness of test_keeper_s3_snapshot This is part of "Why my PR is not merged?" patch set series. Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> 2023-07-13 11:29:45 +00:00			`def _check_snapshots_without_quorum():`
			`assert len(get_saved_snapshots()) > 4`
Add tests for S3 snapshot upload 2022-09-15 13:37:17 +00:00
Improve the @retry API 2024-09-30 20:42:47 +00:00			`retry(AssertionError, retries=10, delay=2, jitter=0, backoff=1)(`
			`_check_snapshots_without_quorum`
			`)`

Fix flakiness of test_keeper_s3_snapshot This is part of "Why my PR is not merged?" patch set series. Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com> 2023-07-13 11:29:45 +00:00			`_check_snapshots_without_quorum()`
Add tests for S3 snapshot upload 2022-09-15 13:37:17 +00:00
			`success_upload_message = "Successfully uploaded"`
			`assert node2.contains_in_log(success_upload_message) or node3.contains_in_log(`
			`success_upload_message`
			`)`

			`destroy_zk_client(node2_zk)`
Proper cleanup & restart for subsequent iterations 2024-09-05 11:37:57 +00:00			`node2.stop_clickhouse()`
			`delete_keeper_snapshots_logs(node2)`
			`node3.stop_clickhouse()`
			`delete_keeper_snapshots_logs(node3)`
			`delete_keeper_snapshots_logs(node1)`
			`p = Pool(3)`
			`waiters = []`
Python formatting 2024-09-05 13:57:46 +00:00
Proper cleanup & restart for subsequent iterations 2024-09-05 11:37:57 +00:00			`def start_clickhouse(node):`
			`node.start_clickhouse()`

			`waiters.append(p.apply_async(start_clickhouse, args=(node1,)))`
			`waiters.append(p.apply_async(start_clickhouse, args=(node2,)))`
			`waiters.append(p.apply_async(start_clickhouse, args=(node3,)))`

			`delete_s3_snapshots() # for next iteration`

			`for waiter in waiters:`
			`waiter.wait()`

Added restart of node1 for running multiple iterations of test 2024-09-01 13:53:56 +00:00			`keeper_utils.wait_until_connected(cluster, node1)`
Proper cleanup & restart for subsequent iterations 2024-09-05 11:37:57 +00:00			`keeper_utils.wait_until_connected(cluster, node2)`
			`keeper_utils.wait_until_connected(cluster, node3)`