# ClickHouse/tests/integration/test_parallel_replicas_custom_key/test.py

import pytest
from helpers.cluster import ClickHouseCluster

# Cluster object shared by every test in this module.
cluster = ClickHouseCluster(__file__)

# Four instances named n1..n4; each one gets the same remote_servers config
# and is started with ZooKeeper enabled.
nodes = [
    cluster.add_instance(
        f"n{index}",
        main_configs=["configs/remote_servers.xml"],
        with_zookeeper=True,
    )
    for index in range(1, 5)
]


@pytest.fixture(scope="module", autouse=True)
def start_cluster():
    """Start the module-level cluster for the tests and shut it down afterwards.

    Declared with ``scope="module"`` and ``autouse=True``, so it is set up once
    per module without tests having to request it explicitly.

    Yields:
        The module-level ``cluster`` object, after ``cluster.start()`` returned.
    """
    try:
        cluster.start()
        yield cluster
    finally:
        # In ``finally`` so shutdown also runs when start-up itself raised.
        cluster.shutdown()


def create_tables(cluster):
    """Drop and recreate ``dist_table`` and ``test_table`` through the first node.

    ``test_table`` is dropped and created with ``ON CLUSTER``; ``dist_table``
    is a ``Distributed`` table over ``test_table`` sharded by ``rand()`` and is
    created without ``ON CLUSTER``.

    Args:
        cluster: Cluster name interpolated into the SQL statements. Inside this
            function it shadows the module-level ``cluster`` object.
    """
    initiator = nodes[0]

    create_local_sql = f"CREATE TABLE test_table ON CLUSTER {cluster} (key Int32, value String) Engine=MergeTree ORDER BY (key, sipHash64(value))"
    create_distributed_sql = f"""
            CREATE TABLE dist_table AS test_table
            Engine=Distributed(
                {cluster},
                currentDatabase(),
                test_table,
                rand()
            )
            """

    # Order matters: the distributed table goes first on drop and last on
    # create, because it is defined on top of test_table.
    statements = (
        "DROP TABLE IF EXISTS dist_table",
        f"DROP TABLE IF EXISTS test_table ON CLUSTER {cluster}",
        create_local_sql,
        create_distributed_sql,
    )
    for statement in statements:
        initiator.query(statement)


def insert_data(cluster, row_num):
    """Recreate the tables and load ``row_num`` rows through ``dist_table``.

    Rows are ``(number % 4, number)`` for ``number`` taken from
    ``numbers(row_num)``, so ``key`` only takes the values 0..3.

    Args:
        cluster: Cluster name forwarded to ``create_tables``.
        row_num: Number of rows to generate and insert.
    """
    create_tables(cluster)

    writer = nodes[0]
    statements = (
        f"INSERT INTO dist_table SELECT number % 4, number FROM numbers({row_num})",
        # Flush right after the insert so the data is sent out before the
        # caller queries it.
        "SYSTEM FLUSH DISTRIBUTED dist_table",
    )
    for statement in statements:
        writer.query(statement)


@pytest.mark.parametrize("custom_key", ["sipHash64(key)", "key"])
@pytest.mark.parametrize("filter_type", ["default", "range"])
@pytest.mark.parametrize(
    "cluster",
    ["test_multiple_shards_multiple_replicas", "test_single_shard_multiple_replicas"],
)
def test_parallel_replicas_custom_key(start_cluster, cluster, custom_key, filter_type):
    """Check per-key counts and server log lines with custom-key parallel replicas.

    Inserts 1000 rows, runs a ``GROUP BY key`` over ``dist_table`` with the
    parallel-replicas custom-key settings, and expects 250 rows for each of the
    keys 0..3. It then checks the server logs for the message that matches the
    cluster layout under test.

    Args:
        start_cluster: Fixture that keeps the cluster running.
        cluster: Name of the cluster under test (parametrized).
        custom_key: Value for ``parallel_replicas_custom_key``.
        filter_type: Value for ``parallel_replicas_custom_key_filter_type``.
    """
    # Presumably rotated so the log checks below only see this run's
    # messages; confirm against the helper's implementation.
    for instance in nodes:
        instance.rotate_logs()

    insert_data(cluster, 1000)

    # One "<key>\t250\n" line for each key 0..3.
    expected_rows = "".join(f"{key}\t250\n" for key in range(4))

    query_settings = {
        "prefer_localhost_replica": 0,
        "max_parallel_replicas": 4,
        "parallel_replicas_custom_key": custom_key,
        "parallel_replicas_custom_key_filter_type": filter_type,
    }
    initiator = nodes[0]
    actual_rows = initiator.query(
        "SELECT key, count() FROM dist_table GROUP BY key ORDER BY key",
        settings=query_settings,
    )
    assert actual_rows == expected_rows

    if cluster != "test_multiple_shards_multiple_replicas":
        # Single shard: the replicas are first transformed into shards, and
        # the filter is then appended for each shard.
        assert initiator.contains_in_log(
            "Single shard cluster used with custom_key, transforming replicas into virtual shards"
        )
        return

    # Multiple shards: the query is simply processed on all replicas of each
    # shard, with the filter appended on the replica.
    assert all(
        instance.contains_in_log("Processing query on a replica using custom_key")
        for instance in nodes
    )