ClickHouse/tests/integration/test_alter_moving_garbage/test.py

import logging
import random
import threading
import time

import pytest

from helpers.client import QueryRuntimeException
from helpers.cluster import ClickHouseCluster

# two replicas in remote_servers.xml
REPLICA_COUNT = 2


@pytest.fixture(scope="module")
def cluster():
    try:
        cluster = ClickHouseCluster(__file__)
        for i in range(1, REPLICA_COUNT + 1):
            cluster.add_instance(
                f"node{i}",
                main_configs=[
                    "configs/config.d/storage_conf.xml",
                    "configs/config.d/remote_servers.xml",
                ],
                with_minio=True,
                with_zookeeper=True,
            )

        logging.info("Starting cluster...")
        cluster.start()
        logging.info("Cluster started")

        yield cluster
    finally:
        cluster.shutdown()


def create_table(node, table_name, replicated, additional_settings):
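    """
    Create a (Replicated)MergeTree table tuned for these tests: parts are placed by the
    'two_disks' storage policy (presumably defined in configs/config.d/storage_conf.xml),
    and old/temporary parts are cleaned up almost immediately so that the background
    cleanup races exercised below happen quickly.
    """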
    settings = {
        "storage_policy": "two_disks",
        "old_parts_lifetime": 0,
        "index_granularity": 512,
        "temporary_directories_lifetime": 0,
        "merge_tree_clear_old_temporary_directories_interval_seconds": 1,
    }
    settings.update(additional_settings)

    table_engine = (
        f"ReplicatedMergeTree('/clickhouse/tables/0/{table_name}', '{node.name}')"
        if replicated
        else "MergeTree()"
    )

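    # The SETTINGS clause below is rendered as comma-separated key=value pairs; repr()
    # keeps integers bare and quotes strings, e.g. storage_policy='two_disks',old_parts_lifetime=0.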
    create_table_statement = f"""
        CREATE TABLE {table_name} (
            dt Date,
            id Int64,
            data String,
            INDEX min_max (id) TYPE minmax GRANULARITY 3
        ) ENGINE = {table_engine}
        PARTITION BY dt
        ORDER BY (dt, id)
        SETTINGS {",".join((k+"="+repr(v) for k, v in settings.items()))}"""

    if replicated:
        node.query_with_retry(create_table_statement)
    else:
        node.query(create_table_statement)


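# The parameter matrix below covers plain MergeTree, ReplicatedMergeTree without
# zero-copy replication, and ReplicatedMergeTree with zero-copy replication enabled.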
@pytest.mark.parametrize(
    "allow_remote_fs_zero_copy_replication,replicated_engine",
    [(False, False), (False, True), (True, True)],
)
def test_alter_moving(
    cluster, allow_remote_fs_zero_copy_replication, replicated_engine
):
    """
    Test that parts are moved between disks correctly while ALTER TABLE queries run concurrently.
    """
    if replicated_engine:
        nodes = list(cluster.instances.values())
    else:
        nodes = [cluster.instances["node1"]]

    additional_settings = {}

    # Use different table names per parametrization for log readability
    table_name = "test_table"
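    # With zero-copy replication, replicas share the same blobs on remote storage instead
    # of keeping independent copies, which makes part moves and cleanup more delicate.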
    if allow_remote_fs_zero_copy_replication:
        table_name = "test_table_zero_copy"
        additional_settings["allow_remote_fs_zero_copy_replication"] = 1
    if replicated_engine:
        table_name = table_name + "_replicated"

    for node in nodes:
        create_table(node, table_name, replicated_engine, additional_settings)

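    # Ten daily partitions of 10_000 rows each, inserted through randomly chosen nodes;
    # `data` is a per-row sipHash64 value, which the digest check at the end relies on.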
    for i in range(1, 11):
        partition = f"2021-01-{i:02d}"
        random.choice(nodes).query(
            f"INSERT INTO {table_name} SELECT toDate('{partition}'), number as id, toString(sipHash64(number, {i})) FROM numbers(10_000)"
        )

    # Run ALTER queries in parallel with moving parts
    stop_alter = False

    def alter():
        random.choice(nodes).query(f"ALTER TABLE {table_name} ADD COLUMN col0 String")
        for d in range(1, 100):
            if stop_alter:
                break

            # Some lightweight mutation should change the moving part before it is swapped,
            # so that the part has to be cleaned up afterwards. Messages like
            # `Failed to swap {}. Active part doesn't exist` should appear in the logs.
            #
            # The issue was reproduced with DELETE (`ALTER TABLE {table_name} ADD/DROP COLUMN`
            # also works on real s3 instead of minio).
            # Note: do not delete rows with id % 100 = 0, because they are used by the final
            # digest check to verify that the data is not corrupted.
            random.choice(nodes).query(f"DELETE FROM {table_name} WHERE id % 100 = {d}")

            time.sleep(0.1)

    alter_thread = threading.Thread(target=alter)
    alter_thread.start()

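    # Move each partition to disk 's31' while the ALTER/DELETE thread is running.
    # PART_IS_TEMPORARILY_LOCKED is expected when a move races with a concurrent
    # mutation; such partitions are simply skipped.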
    for i in range(1, 11):
        partition = f"2021-01-{i:02d}"
        try:
            random.choice(nodes).query(
                f"ALTER TABLE {table_name} MOVE PARTITION '{partition}' TO DISK 's31'",
            )
        except QueryRuntimeException as e:
            if "PART_IS_TEMPORARILY_LOCKED" in str(e):
                continue
            raise e

        # The background task that clears old temporary directories wakes up every second;
        # sleep to give it a chance to run.
        time.sleep(0.5)

    stop_alter = True
    alter_thread.join()

    # Check that no data was lost
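    # Rows with id % 100 == 0 were never deleted and `data` is a per-row hash, so we
    # expect 10 partitions * 100 rows = 1000 distinct (dt, data) values.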
    data_digest = None
    if replicated_engine:
        # We don't know which rows have already reached which replica, so check all
        # replicas and count unique values across the whole cluster
        data_digest = random.choice(nodes).query_with_retry(
            f"SELECT countDistinct(dt, data) FROM clusterAllReplicas(test_cluster, default.{table_name}) WHERE id % 100 == 0"
        )
    else:
        data_digest = random.choice(nodes).query(
            f"SELECT countDistinct(dt, data) FROM {table_name} WHERE id % 100 == 0"
        )

    assert data_digest == "1000\n"


def test_delete_race_leftovers(cluster):
    """
    Test that outdated parts are deleted correctly and no leftovers are left on s3.
    """

    node = cluster.instances["node1"]

    table_name = "test_delete_race_leftovers"
    additional_settings = {
        # use a separate disk so as not to interfere with other tests
        "storage_policy": "one_disk",
        # always remove parts in parallel
        "concurrent_part_removal_threshold": 1,
    }

    create_table(
        node, table_name, replicated=True, additional_settings=additional_settings
    )

    # Stop merges to keep several small parts in the active set
    node.query(f"SYSTEM STOP MERGES {table_name}")

    # Create several small parts in one partition
    for i in range(1, 11):
        node.query(
            f"INSERT INTO {table_name} SELECT toDate('2021-01-01'), number as id, toString(sipHash64(number, {i})) FROM numbers(10_000)"
        )

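    # Record the row count and a content hash now; the very same query must return the
    # same result at the end of the test.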
    table_digest_query = f"SELECT count(), sum(sipHash64(id, data)) FROM {table_name}"
    table_digest = node.query(table_digest_query)

    # Execute several no-op deletes to produce parts with an updated mutation id but no
    # changes in data. The new parts reference the same files as the old parts.
    node.query(f"SYSTEM START MERGES {table_name}")
    for i in range(10):
        node.query(f"DELETE FROM {table_name} WHERE data = ''")

    # Make the existing parts outdated.
    # We also don't want the set of parts to keep changing, because that would make it
    # difficult to match objects on s3 against remote_data_paths when checking correctness.
    node.query(f"OPTIMIZE TABLE {table_name} FINAL")

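    # After OPTIMIZE ... FINAL merges everything into a single part, all previous parts
    # (including the no-op mutation copies) become outdated and should be removed by the
    # background cleanup; the loop below waits for that to happen.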
    inactive_parts_query = (
        f"SELECT count() FROM system.parts "
        f"WHERE not active AND table = '{table_name}' AND database = 'default'"
    )

    # Try to wait for the outdated parts to be deleted, but do not wait too long:
    # if some parts are still not deleted after several iterations, just continue.
    for i in range(20):
        inactive_parts_count = int(node.query(inactive_parts_query).strip())
        if inactive_parts_count == 0:
            print(f"Inactive parts are deleted after {i} iterations")
            break

        print(f"Inactive parts count: {inactive_parts_count}")
        time.sleep(5)

    # Check that all outdated parts were deleted correctly and there are no leftovers on s3.
    # Do it with retries, because blobs are deleted in the background and there can be a
    # race between removing entries from remote_data_paths and deleting the blobs.
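    # system.remote_data_paths lists every blob still referenced by metadata on disk 's32',
    # while the MinIO listing below ("data2/" is assumed to be that disk's bucket prefix in
    # this test's storage configuration) shows which blobs actually exist.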
    all_remote_paths = set()
    known_remote_paths = set()
    for i in range(3):
        known_remote_paths = set(
            node.query(
                f"SELECT remote_path FROM system.remote_data_paths WHERE disk_name = 's32'"
            ).splitlines()
        )

        all_remote_paths = set(
            obj.object_name
            for obj in cluster.minio_client.list_objects(
                cluster.minio_bucket, "data2/", recursive=True
            )
        )

        # Some blobs can be deleted after we list remote_data_paths. That is fine; we only
        # check that every existing remote path is known (i.e. all_remote_paths is a
        # subset of known_remote_paths).
        if all_remote_paths == {p for p in known_remote_paths if p in all_remote_paths}:
            break

        time.sleep(1)

    assert all_remote_paths == {p for p in known_remote_paths if p in all_remote_paths}

    # Check that we have all data
    assert table_digest == node.query(table_digest_query)