ClickHouse/tests/integration/test_parallel_replicas_custom_key_failover/test.py

import pytest
import uuid
from helpers.cluster import ClickHouseCluster

cluster = ClickHouseCluster(__file__)

node1 = cluster.add_instance(
    "n1", main_configs=["configs/remote_servers.xml"], with_zookeeper=True
)
node3 = cluster.add_instance(
    "n3", main_configs=["configs/remote_servers.xml"], with_zookeeper=True
)

nodes = [node1, node3]


@pytest.fixture(scope="module", autouse=True)
def start_cluster():
    try:
        cluster.start()
        yield cluster
    finally:
        cluster.shutdown()


def create_tables(cluster, table_name):
    node1.query(f"DROP TABLE IF EXISTS {table_name} SYNC")
    node3.query(f"DROP TABLE IF EXISTS {table_name} SYNC")

    node1.query(
        f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r1') ORDER BY (key)"
    )
    node3.query(
        f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r3') ORDER BY (key)"
    )

    # populate data
    node1.query(
        f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(1000)"
    )
    node1.query(
        f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(1000, 1000)"
    )
    node1.query(
        f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(2000, 1000)"
    )
    node1.query(
        f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(3000, 1000)"
    )
    node3.query(f"SYSTEM SYNC REPLICA {table_name}")


@pytest.mark.parametrize("use_hedged_requests", [1, 0])
@pytest.mark.parametrize("custom_key", ["sipHash64(key)", "key"])
@pytest.mark.parametrize("filter_type", ["default", "range"])
@pytest.mark.parametrize("prefer_localhost_replica", [0, 1])
def test_parallel_replicas_custom_key_failover(
    start_cluster,
    use_hedged_requests,
    custom_key,
    filter_type,
    prefer_localhost_replica,
):
    cluster_name = "test_single_shard_multiple_replicas"
    table = "test_table"

    create_tables(cluster_name, table)

    expected_result = ""
    for i in range(4):
        expected_result += f"{i}\t1000\n"

    log_comment = uuid.uuid4()
    assert (
        node1.query(
            f"SELECT key, count() FROM cluster('{cluster_name}', currentDatabase(), test_table) GROUP BY key ORDER BY key",
            settings={
                "log_comment": log_comment,
                "prefer_localhost_replica": prefer_localhost_replica,
                "max_parallel_replicas": 4,
                "parallel_replicas_custom_key": custom_key,
                "parallel_replicas_custom_key_filter_type": filter_type,
                "use_hedged_requests": use_hedged_requests,
                # avoid considering replica delay on connection choice
                # otherwise connection can be not distributed evenly among available nodes
                # and so custom key secondary queries (we check it bellow)
                "max_replica_delay_for_distributed_queries": 0,
            },
        )
        == expected_result
    )

    for node in nodes:
        node.query("system flush logs")

    # the subqueries should be spread over available nodes
    query_id = node1.query(
        f"SELECT query_id FROM system.query_log WHERE current_database = currentDatabase() AND log_comment = '{log_comment}' AND type = 'QueryFinish' AND initial_query_id = query_id"
    )
    assert query_id != ""
    query_id = query_id[:-1]

    if prefer_localhost_replica == 0:
        assert (
            node1.query(
                f"SELECT 'subqueries', count() FROM clusterAllReplicas({cluster_name}, system.query_log) WHERE initial_query_id = '{query_id}' AND type ='QueryFinish' AND query_id != initial_query_id SETTINGS skip_unavailable_shards=1"
            )
            == "subqueries\t4\n"
        )

        # With enabled hedged requests, we can't guarantee exact query distribution among nodes
        # In case of a replica being slow in terms of responsiveness, hedged connection can change initial replicas choice
        if use_hedged_requests == 0:
            assert (
                node1.query(
                    f"SELECT h, count() FROM clusterAllReplicas({cluster_name}, system.query_log) WHERE initial_query_id = '{query_id}' AND type ='QueryFinish' GROUP BY hostname() as h ORDER BY h SETTINGS skip_unavailable_shards=1"
                )
                == "n1\t3\nn3\t2\n"
            )
Revert "Revert flaky" 2024-01-19 12:51:30 +00:00			`import pytest`
			`import uuid`
			`from helpers.cluster import ClickHouseCluster`

			`cluster = ClickHouseCluster(__file__)`

			`node1 = cluster.add_instance(`
			`"n1", main_configs=["configs/remote_servers.xml"], with_zookeeper=True`
			`)`
			`node3 = cluster.add_instance(`
			`"n3", main_configs=["configs/remote_servers.xml"], with_zookeeper=True`
			`)`

			`nodes = [node1, node3]`


			`@pytest.fixture(scope="module", autouse=True)`
			`def start_cluster():`
			`try:`
			`cluster.start()`
			`yield cluster`
			`finally:`
			`cluster.shutdown()`


			`def create_tables(cluster, table_name):`
			`node1.query(f"DROP TABLE IF EXISTS {table_name} SYNC")`
			`node3.query(f"DROP TABLE IF EXISTS {table_name} SYNC")`

			`node1.query(`
			`f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r1') ORDER BY (key)"`
			`)`
			`node3.query(`
			`f"CREATE TABLE IF NOT EXISTS {table_name} (key Int64, value String) Engine=ReplicatedMergeTree('/test_parallel_replicas/shard1/{table_name}', 'r3') ORDER BY (key)"`
			`)`

			`# populate data`
			`node1.query(`
			`f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(1000)"`
			`)`
			`node1.query(`
			`f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(1000, 1000)"`
			`)`
			`node1.query(`
			`f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(2000, 1000)"`
			`)`
			`node1.query(`
			`f"INSERT INTO {table_name} SELECT number % 4, number FROM numbers(3000, 1000)"`
			`)`
			`node3.query(f"SYSTEM SYNC REPLICA {table_name}")`


			`@pytest.mark.parametrize("use_hedged_requests", [1, 0])`
			`@pytest.mark.parametrize("custom_key", ["sipHash64(key)", "key"])`
			`@pytest.mark.parametrize("filter_type", ["default", "range"])`
			`@pytest.mark.parametrize("prefer_localhost_replica", [0, 1])`
			`def test_parallel_replicas_custom_key_failover(`
			`start_cluster,`
			`use_hedged_requests,`
			`custom_key,`
			`filter_type,`
			`prefer_localhost_replica,`
			`):`
Update tests + setting max_replica_delay_for_distributed_queries to avoid replica lag affecting load balancing + disable flaky assert for asan/tsan builds (will investigated separately) 2024-01-19 14:23:15 +00:00			`cluster_name = "test_single_shard_multiple_replicas"`
Revert "Revert flaky" 2024-01-19 12:51:30 +00:00			`table = "test_table"`

Update tests + setting max_replica_delay_for_distributed_queries to avoid replica lag affecting load balancing + disable flaky assert for asan/tsan builds (will investigated separately) 2024-01-19 14:23:15 +00:00			`create_tables(cluster_name, table)`
Revert "Revert flaky" 2024-01-19 12:51:30 +00:00
			`expected_result = ""`
			`for i in range(4):`
			`expected_result += f"{i}\t1000\n"`

			`log_comment = uuid.uuid4()`
			`assert (`
			`node1.query(`
Update tests + setting max_replica_delay_for_distributed_queries to avoid replica lag affecting load balancing + disable flaky assert for asan/tsan builds (will investigated separately) 2024-01-19 14:23:15 +00:00			`f"SELECT key, count() FROM cluster('{cluster_name}', currentDatabase(), test_table) GROUP BY key ORDER BY key",`
Revert "Revert flaky" 2024-01-19 12:51:30 +00:00			`settings={`
			`"log_comment": log_comment,`
			`"prefer_localhost_replica": prefer_localhost_replica,`
			`"max_parallel_replicas": 4,`
			`"parallel_replicas_custom_key": custom_key,`
			`"parallel_replicas_custom_key_filter_type": filter_type,`
			`"use_hedged_requests": use_hedged_requests,`
Update tests + setting max_replica_delay_for_distributed_queries to avoid replica lag affecting load balancing + disable flaky assert for asan/tsan builds (will investigated separately) 2024-01-19 14:23:15 +00:00			`# avoid considering replica delay on connection choice`
			`# otherwise connection can be not distributed evenly among available nodes`
			`# and so custom key secondary queries (we check it bellow)`
			`"max_replica_delay_for_distributed_queries": 0,`
Revert "Revert flaky" 2024-01-19 12:51:30 +00:00			`},`
			`)`
			`== expected_result`
			`)`

			`for node in nodes:`
			`node.query("system flush logs")`

			`# the subqueries should be spread over available nodes`
			`query_id = node1.query(`
			`f"SELECT query_id FROM system.query_log WHERE current_database = currentDatabase() AND log_comment = '{log_comment}' AND type = 'QueryFinish' AND initial_query_id = query_id"`
			`)`
			`assert query_id != ""`
			`query_id = query_id[:-1]`

			`if prefer_localhost_replica == 0:`
			`assert (`
			`node1.query(`
Update tests + setting max_replica_delay_for_distributed_queries to avoid replica lag affecting load balancing + disable flaky assert for asan/tsan builds (will investigated separately) 2024-01-19 14:23:15 +00:00			`f"SELECT 'subqueries', count() FROM clusterAllReplicas({cluster_name}, system.query_log) WHERE initial_query_id = '{query_id}' AND type ='QueryFinish' AND query_id != initial_query_id SETTINGS skip_unavailable_shards=1"`
Revert "Revert flaky" 2024-01-19 12:51:30 +00:00			`)`
			`== "subqueries\t4\n"`
			`)`

Fix: can't guarantee query distribution with enabled hedged requests 2024-02-20 12:09:54 +00:00			`# With enabled hedged requests, we can't guarantee exact query distribution among nodes`
			`# In case of a replica being slow in terms of responsiveness, hedged connection can change initial replicas choice`
			`if use_hedged_requests == 0:`
			`assert (`
			`node1.query(`
			`f"SELECT h, count() FROM clusterAllReplicas({cluster_name}, system.query_log) WHERE initial_query_id = '{query_id}' AND type ='QueryFinish' GROUP BY hostname() as h ORDER BY h SETTINGS skip_unavailable_shards=1"`
			`)`
			`== "n1\t3\nn3\t2\n"`
Update tests + setting max_replica_delay_for_distributed_queries to avoid replica lag affecting load balancing + disable flaky assert for asan/tsan builds (will investigated separately) 2024-01-19 14:23:15 +00:00			`)`