# ClickHouse/tests/integration/test_distributed_load_balancing/test.py

# pylint: disable=unused-argument
# pylint: disable=redefined-outer-name
# pylint: disable=line-too-long

import uuid

import pytest
from helpers.cluster import ClickHouseCluster

# Cluster under test: three instances, all using the same remote_servers config.
cluster = ClickHouseCluster(__file__)

# Instances are registered in order n1, n2, n3; each call gets its own config list.
n1, n2, n3 = (
    cluster.add_instance(name, main_configs=["configs/remote_servers.xml"])
    for name in ("n1", "n2", "n3")
)

# Number of registered instances, and how many queries the multi-query tests issue.
nodes = len(cluster.instances)
queries = nodes * 10


def bootstrap():
    """Reload the config and (re)create the test tables on every instance.

    On each instance this creates a ``data`` Memory table and three
    Distributed tables over it (``dist``, ``dist_priority`` and
    ``dist_priority_negative``), each bound to its own cluster definition.
    Every table is dropped first, so the function can be run again against an
    already bootstrapped cluster.
    """
    for n in cluster.instances.values():
        # At startup, server loads configuration files.
        #
        # However ConfigReloader does not know about already loaded files
        # (its list of files is empty), hence it will always reload the
        # configuration just after server starts (+ 2 seconds, reload timeout).
        #
        # And on configuration reload the clusters will be re-created, so some
        # internal stuff will be reset:
        # - error_count
        # - last_used (round_robin)
        #
        # And if the reload will happen during round_robin test it will start
        # querying from the beginning, so let's issue config reload just after
        # start to avoid reload in the middle of the test execution.
        n.query("SYSTEM RELOAD CONFIG")
        n.query("DROP TABLE IF EXISTS data")
        n.query("DROP TABLE IF EXISTS dist")
        # Drop the priority tables as well, otherwise the CREATE statements
        # below fail when bootstrap() runs against existing tables.
        n.query("DROP TABLE IF EXISTS dist_priority")
        n.query("DROP TABLE IF EXISTS dist_priority_negative")
        n.query("CREATE TABLE data (key Int) Engine=Memory()")
        n.query(
            """
        CREATE TABLE dist AS data
        Engine=Distributed(
            replicas_cluster,
            currentDatabase(),
            data)
        """
        )
        n.query(
            """
        CREATE TABLE dist_priority AS data
        Engine=Distributed(
            replicas_priority_cluster,
            currentDatabase(),
            data)
        """
        )
        n.query(
            """
        CREATE TABLE dist_priority_negative AS data
        Engine=Distributed(
            replicas_priority_negative_cluster,
            currentDatabase(),
            data)
        """
        )


def make_uuid():
    """Return a fresh random UUID as a 32-character hexadecimal string."""
    generated = uuid.uuid4()
    return generated.hex


@pytest.fixture(scope="module", autouse=True)
def start_cluster():
    """Start the cluster once per module, bootstrap it, and always shut it down.

    The fixture is ``autouse``, so every test in this module runs against the
    started cluster without requesting the fixture explicitly.

    Yields:
        The module-level ``cluster`` object.
    """
    try:
        cluster.start()
        bootstrap()
        yield cluster
    finally:
        # Runs even when start() or bootstrap() raises, so a partially
        # started cluster is still torn down.
        cluster.shutdown()


def get_node(query_node, table="dist", *args, **kwargs):
    """Run ``SELECT * FROM <table>`` on *query_node* and report which host served it.

    The query is tagged with a fresh query id. After flushing the logs on
    every instance, ``system.query_log`` is read through ``shards_cluster`` to
    find the most recent finished non-initial query carrying that id, and its
    ``_shard_num`` is mapped to a host name via ``system.clusters``.

    Args:
        query_node: Instance on which the distributed SELECT is issued.
        table: Name of the table to select from.
        *args: Extra positional arguments forwarded to ``query_node.query``.
        **kwargs: Extra keyword arguments forwarded to ``query_node.query``.
            A ``settings`` dict may be supplied; it is merged with the
            settings this helper needs and is NOT modified.

    Returns:
        The stripped output of the lookup query (the ``host_name`` column).
    """
    query_id = make_uuid()

    # Settings needed to find the query in system.query_log afterwards; they
    # take precedence over caller-supplied values with the same key.
    forced_settings = {
        "query_id": query_id,
        "log_queries": 1,
        "log_queries_min_type": "QUERY_START",
        "prefer_localhost_replica": 0,
    }
    # Merge into a fresh dict so the caller's settings dict is left untouched
    # (callers reuse the same dict across several calls).
    kwargs["settings"] = {**(kwargs.get("settings") or {}), **forced_settings}

    query_node.query("SELECT * FROM " + table, *args, **kwargs)

    # Make sure the query_log entries are visible on every instance.
    for n in cluster.instances.values():
        n.query("SYSTEM FLUSH LOGS")

    rows = query_node.query(
        """
    SELECT c.host_name
    FROM (
        SELECT _shard_num
        FROM cluster(shards_cluster, system.query_log)
        WHERE
            initial_query_id = '{query_id}' AND
            is_initial_query = 0 AND
            type = 'QueryFinish'
        ORDER BY event_date DESC, event_time DESC
        LIMIT 1
    ) a
    JOIN system.clusters c
    ON a._shard_num = c.shard_num WHERE cluster = 'shards_cluster'
    """.format(
            query_id=query_id
        )
    )
    return rows.strip()


# TODO: right now random distribution looks bad, but works
def test_load_balancing_default():
    """With ``random`` balancing, ``queries`` attempts must reach every node."""
    seen_hosts = {
        get_node(n1, settings={"load_balancing": "random"})
        for _ in range(queries)
    }
    assert len(seen_hosts) == nodes, seen_hosts


def test_load_balancing_nearest_hostname():
    """With ``nearest_hostname``, every query issued from n1 must be served by n1."""
    seen_hosts = {
        get_node(n1, settings={"load_balancing": "nearest_hostname"})
        for _ in range(queries)
    }
    assert len(seen_hosts) == 1, seen_hosts
    assert seen_hosts == {"n1"}


def test_load_balancing_in_order():
    """With ``in_order``, every query issued from n1 must be served by n1."""
    seen_hosts = {
        get_node(n1, settings={"load_balancing": "in_order"})
        for _ in range(queries)
    }
    assert len(seen_hosts) == 1, seen_hosts
    assert seen_hosts == {"n1"}


def test_load_balancing_first_or_random():
    """With ``first_or_random``, every query issued from n1 must be served by n1."""
    seen_hosts = {
        get_node(n1, settings={"load_balancing": "first_or_random"})
        for _ in range(queries)
    }
    assert len(seen_hosts) == 1, seen_hosts
    assert seen_hosts == {"n1"}


def test_load_balancing_round_robin():
    """With ``round_robin``, exactly ``nodes`` queries must hit each node once."""
    seen_hosts = {
        get_node(n1, settings={"load_balancing": "round_robin"})
        for _ in range(nodes)
    }
    assert len(seen_hosts) == nodes, seen_hosts
    assert seen_hosts == {"n1", "n2", "n3"}


@pytest.mark.parametrize(
    "dist_table",
    ["dist_priority", "dist_priority_negative"],
)
def test_load_balancing_priority_round_robin(dist_table):
    """Round-robin over a priority cluster must only ever use n1 and n3.

    Runs once per Distributed table listed in the parametrization and issues
    ``nodes`` queries against it from n1.
    """
    seen_hosts = {
        get_node(n1, dist_table, settings={"load_balancing": "round_robin"})
        for _ in range(nodes)
    }
    assert len(seen_hosts) == 2, seen_hosts
    # n2 has bigger priority in config
    assert seen_hosts == {"n1", "n3"}


def test_distributed_replica_max_ignored_errors():
    """Check that ``distributed_replica_max_ignored_errors`` affects replica choice.

    n1 is paused so that queries from n2 record one error for it. After n1 is
    resumed, a query with ``distributed_replica_max_ignored_errors = 0`` must
    still be served by n2, while raising the setting to 1 must send it to n1.
    """
    settings = {
        "use_hedged_requests": 0,
        "load_balancing": "in_order",
        "prefer_localhost_replica": 0,
        "connect_timeout": 2,
        "receive_timeout": 2,
        "send_timeout": 2,
        "idle_connection_timeout": 2,
        "tcp_keep_alive_timeout": 2,
        "distributed_replica_max_ignored_errors": 0,
        "distributed_replica_error_half_life": 60,
    }

    # initiate connection (if started only this test)
    n2.query("SELECT * FROM dist", settings=settings)
    cluster.pause_container("n1")
    try:
        # n1 paused -- skipping, and increment error_count for n1
        # but the query succeeds, no need in query_and_get_error()
        n2.query("SELECT * FROM dist", settings=settings)
        # XXX: due to config reloading we need second time (sigh)
        n2.query("SELECT * FROM dist", settings=settings)
        # check error_count for n1
        errors_count = int(
            n2.query(
                """
    SELECT errors_count FROM system.clusters
    WHERE cluster = 'replicas_cluster' AND host_name = 'n1'
    """,
                settings=settings,
            )
        )
        assert errors_count == 1
    finally:
        # Always resume n1, even when a query or the assertion above fails;
        # otherwise the container stays paused for the rest of the module.
        cluster.unpause_container("n1")

    # still n2
    assert get_node(n2, settings=settings) == "n2"
    # now n1
    settings["distributed_replica_max_ignored_errors"] = 1
    assert get_node(n2, settings=settings) == "n1"