ClickHouse/tests/integration/test_distributed_load_balancing/test.py

# pylint: disable=unused-argument
# pylint: disable=redefined-outer-name
# pylint: disable=line-too-long

import uuid
import pytest

from helpers.cluster import ClickHouseCluster

cluster = ClickHouseCluster(__file__)

n1 = cluster.add_instance('n1', main_configs=['configs/remote_servers.xml'])
n2 = cluster.add_instance('n2', main_configs=['configs/remote_servers.xml'])
n3 = cluster.add_instance('n3', main_configs=['configs/remote_servers.xml'])

nodes = len(cluster.instances)
queries = nodes*5

def bootstrap():
    for n in cluster.instances.values():
        # At startup, server loads configuration files.
        #
        # However ConfigReloader does not know about already loaded files
        # (files is empty()), hence it will always reload the configuration
        # just after server starts (+ 2 seconds, reload timeout).
        #
        # And on configuration reload the clusters will be re-created, so some
        # internal stuff will be reseted:
        # - error_count
        # - last_used (round_robing)
        #
        # And if the reload will happen during round_robin test it will start
        # querying from the beginning, so let's issue config reload just after
        # start to avoid reload in the middle of the test execution.
        n.query('SYSTEM RELOAD CONFIG')
        n.query('DROP TABLE IF EXISTS data')
        n.query('DROP TABLE IF EXISTS dist')
        n.query('CREATE TABLE data (key Int) Engine=Memory()')
        n.query("""
        CREATE TABLE dist AS data
        Engine=Distributed(
            replicas_cluster,
            currentDatabase(),
            data)
        """.format())
        n.query("""
        CREATE TABLE dist_priority AS data
        Engine=Distributed(
            replicas_priority_cluster,
            currentDatabase(),
            data)
        """.format())
        n.query("""
        CREATE TABLE dist_priority_negative AS data
        Engine=Distributed(
            replicas_priority_negative_cluster,
            currentDatabase(),
            data)
        """.format())

def make_uuid():
    return uuid.uuid4().hex

@pytest.fixture(scope='module', autouse=True)
def start_cluster():
    try:
        cluster.start()
        bootstrap()
        yield cluster
    finally:
        cluster.shutdown()

def get_node(query_node, table='dist', *args, **kwargs):
    query_id = make_uuid()

    settings = {
        'query_id': query_id,
        'log_queries': 1,
        'log_queries_min_type': 'QUERY_START',
        'prefer_localhost_replica': 0,
    }
    if 'settings' not in kwargs:
        kwargs['settings'] = settings
    else:
        kwargs['settings'].update(settings)

    query_node.query('SELECT * FROM ' + table, *args, **kwargs)

    for n in cluster.instances.values():
        n.query('SYSTEM FLUSH LOGS')

    rows = query_node.query("""
    SELECT c.host_name
    FROM (
        SELECT _shard_num
        FROM cluster(shards_cluster, system.query_log)
        WHERE
            initial_query_id = '{query_id}' AND
            is_initial_query = 0 AND
            type = 'QueryFinish'
        ORDER BY event_date DESC, event_time DESC
        LIMIT 1
    ) a
    JOIN system.clusters c
    ON a._shard_num = c.shard_num AND cluster = 'shards_cluster'
    """.format(query_id=query_id))
    return rows.strip()

# TODO: right now random distribution looks bad, but works
def test_load_balancing_default():
    unique_nodes = set()
    for _ in range(0, queries):
        unique_nodes.add(get_node(n1, settings={'load_balancing': 'random'}))
    assert len(unique_nodes) == nodes, unique_nodes

def test_load_balancing_nearest_hostname():
    unique_nodes = set()
    for _ in range(0, queries):
        unique_nodes.add(get_node(n1, settings={'load_balancing': 'nearest_hostname'}))
    assert len(unique_nodes) == 1, unique_nodes
    assert unique_nodes == set(['n1'])

def test_load_balancing_in_order():
    unique_nodes = set()
    for _ in range(0, queries):
        unique_nodes.add(get_node(n1, settings={'load_balancing': 'in_order'}))
    assert len(unique_nodes) == 1, unique_nodes
    assert unique_nodes == set(['n1'])

def test_load_balancing_first_or_random():
    unique_nodes = set()
    for _ in range(0, queries):
        unique_nodes.add(get_node(n1, settings={'load_balancing': 'first_or_random'}))
    assert len(unique_nodes) == 1, unique_nodes
    assert unique_nodes == set(['n1'])

def test_load_balancing_round_robin():
    unique_nodes = set()
    for _ in range(0, nodes):
        unique_nodes.add(get_node(n1, settings={'load_balancing': 'round_robin'}))
    assert len(unique_nodes) == nodes, unique_nodes
    assert unique_nodes == set(['n1', 'n2', 'n3'])

@pytest.mark.parametrize('dist_table', [
    ('dist_priority'),
    ('dist_priority_negative'),
])
def test_load_balancing_priority_round_robin(dist_table):
    unique_nodes = set()
    for _ in range(0, nodes):
        unique_nodes.add(get_node(n1, dist_table, settings={'load_balancing': 'round_robin'}))
    assert len(unique_nodes) == 2, unique_nodes
    # n2 has bigger priority in config
    assert unique_nodes == set(['n1', 'n3'])

def test_distributed_replica_max_ignored_errors():
    settings = {
        'load_balancing': 'in_order',
        'prefer_localhost_replica': 0,
        'connect_timeout': 2,
        'receive_timeout': 2,
        'send_timeout': 2,
        'idle_connection_timeout': 2,
        'tcp_keep_alive_timeout': 2,

        'distributed_replica_max_ignored_errors': 0,
        'distributed_replica_error_half_life': 60,
    }

    # initiate connection (if started only this test)
    n2.query('SELECT * FROM dist', settings=settings)
    cluster.pause_container('n1')

    # n1 paused -- skipping, and increment error_count for n1
    # but the query succeeds, no need in query_and_get_error()
    n2.query('SELECT * FROM dist', settings=settings)
    # XXX: due to config reloading we need second time (sigh)
    n2.query('SELECT * FROM dist', settings=settings)
    # check error_count for n1
    assert int(n2.query("""
    SELECT errors_count FROM system.clusters
    WHERE cluster = 'replicas_cluster' AND host_name = 'n1'
    """, settings=settings)) == 1

    cluster.unpause_container('n1')
    # still n2
    assert get_node(n2, settings=settings) == 'n2'
    # now n1
    settings['distributed_replica_max_ignored_errors'] = 1
    assert get_node(n2, settings=settings) == 'n1'
Add test for load_balancing algorithms 2020-06-13 18:33:58 +00:00			`# pylint: disable=unused-argument`
			`# pylint: disable=redefined-outer-name`
			`# pylint: disable=line-too-long`

			`import uuid`
			`import pytest`

			`from helpers.cluster import ClickHouseCluster`

			`cluster = ClickHouseCluster(__file__)`

			`n1 = cluster.add_instance('n1', main_configs=['configs/remote_servers.xml'])`
			`n2 = cluster.add_instance('n2', main_configs=['configs/remote_servers.xml'])`
			`n3 = cluster.add_instance('n3', main_configs=['configs/remote_servers.xml'])`

			`nodes = len(cluster.instances)`
			`queries = nodes*5`

Fix test_distributed_load_balancing flaps (due to config reload) At startup, server loads configuration files. However ConfigReloader does not know about already loaded files (files is empty()), hence it will always reload the configuration just after server starts (+ 2 seconds, reload timeout). And on configuration reload the clusters will be re-created, so some internal stuff will be reseted: - error_count - last_used (round_robing) And if the reload will happen during round_robin test it will start querying from the beginning, so let's issue config reload just after start to avoid reload in the middle of the test execution. 2020-06-21 08:59:54 +00:00			`def bootstrap():`
Add test for load_balancing algorithms 2020-06-13 18:33:58 +00:00			`for n in cluster.instances.values():`
Fix test_distributed_load_balancing flaps (due to config reload) At startup, server loads configuration files. However ConfigReloader does not know about already loaded files (files is empty()), hence it will always reload the configuration just after server starts (+ 2 seconds, reload timeout). And on configuration reload the clusters will be re-created, so some internal stuff will be reseted: - error_count - last_used (round_robing) And if the reload will happen during round_robin test it will start querying from the beginning, so let's issue config reload just after start to avoid reload in the middle of the test execution. 2020-06-21 08:59:54 +00:00			`# At startup, server loads configuration files.`
			`#`
			`# However ConfigReloader does not know about already loaded files`
			`# (files is empty()), hence it will always reload the configuration`
			`# just after server starts (+ 2 seconds, reload timeout).`
			`#`
			`# And on configuration reload the clusters will be re-created, so some`
			`# internal stuff will be reseted:`
			`# - error_count`
			`# - last_used (round_robing)`
			`#`
			`# And if the reload will happen during round_robin test it will start`
			`# querying from the beginning, so let's issue config reload just after`
			`# start to avoid reload in the middle of the test execution.`
			`n.query('SYSTEM RELOAD CONFIG')`
Add test for load_balancing algorithms 2020-06-13 18:33:58 +00:00			`n.query('DROP TABLE IF EXISTS data')`
			`n.query('DROP TABLE IF EXISTS dist')`
			`n.query('CREATE TABLE data (key Int) Engine=Memory()')`
			`n.query("""`
			`CREATE TABLE dist AS data`
			`Engine=Distributed(`
			`replicas_cluster,`
			`currentDatabase(),`
			`data)`
			`""".format())`
Cover <replica> <priority> in test_distributed_load_balancing 2020-06-27 07:29:47 +00:00			`n.query("""`
			`CREATE TABLE dist_priority AS data`
			`Engine=Distributed(`
			`replicas_priority_cluster,`
			`currentDatabase(),`
			`data)`
			`""".format())`
Add a test with negative priority in test_distributed_load_balancing 2020-06-30 08:09:06 +00:00			`n.query("""`
			`CREATE TABLE dist_priority_negative AS data`
			`Engine=Distributed(`
			`replicas_priority_negative_cluster,`
			`currentDatabase(),`
			`data)`
			`""".format())`
Add test for load_balancing algorithms 2020-06-13 18:33:58 +00:00
			`def make_uuid():`
			`return uuid.uuid4().hex`

			`@pytest.fixture(scope='module', autouse=True)`
			`def start_cluster():`
			`try:`
			`cluster.start()`
Fix test_distributed_load_balancing flaps (due to config reload) At startup, server loads configuration files. However ConfigReloader does not know about already loaded files (files is empty()), hence it will always reload the configuration just after server starts (+ 2 seconds, reload timeout). And on configuration reload the clusters will be re-created, so some internal stuff will be reseted: - error_count - last_used (round_robing) And if the reload will happen during round_robin test it will start querying from the beginning, so let's issue config reload just after start to avoid reload in the middle of the test execution. 2020-06-21 08:59:54 +00:00			`bootstrap()`
Add test for load_balancing algorithms 2020-06-13 18:33:58 +00:00			`yield cluster`
			`finally:`
			`cluster.shutdown()`

Cover <replica> <priority> in test_distributed_load_balancing 2020-06-27 07:29:47 +00:00			`def get_node(query_node, table='dist', args, *kwargs):`
Add test for load_balancing algorithms 2020-06-13 18:33:58 +00:00			`query_id = make_uuid()`

			`settings = {`
			`'query_id': query_id,`
			`'log_queries': 1,`
			`'log_queries_min_type': 'QUERY_START',`
			`'prefer_localhost_replica': 0,`
			`}`
			`if 'settings' not in kwargs:`
			`kwargs['settings'] = settings`
			`else:`
			`kwargs['settings'].update(settings)`

Cover <replica> <priority> in test_distributed_load_balancing 2020-06-27 07:29:47 +00:00			`query_node.query('SELECT * FROM ' + table, args, *kwargs)`
Add test for load_balancing algorithms 2020-06-13 18:33:58 +00:00
			`for n in cluster.instances.values():`
			`n.query('SYSTEM FLUSH LOGS')`

			`rows = query_node.query("""`
			`SELECT c.host_name`
			`FROM (`
			`SELECT _shard_num`
			`FROM cluster(shards_cluster, system.query_log)`
			`WHERE`
			`initial_query_id = '{query_id}' AND`
			`is_initial_query = 0 AND`
			`type = 'QueryFinish'`
			`ORDER BY event_date DESC, event_time DESC`
			`LIMIT 1`
			`) a`
			`JOIN system.clusters c`
			`ON a._shard_num = c.shard_num AND cluster = 'shards_cluster'`
			`""".format(query_id=query_id))`
			`return rows.strip()`

			`# TODO: right now random distribution looks bad, but works`
			`def test_load_balancing_default():`
			`unique_nodes = set()`
			`for _ in range(0, queries):`
			`unique_nodes.add(get_node(n1, settings={'load_balancing': 'random'}))`
			`assert len(unique_nodes) == nodes, unique_nodes`

			`def test_load_balancing_nearest_hostname():`
			`unique_nodes = set()`
			`for _ in range(0, queries):`
			`unique_nodes.add(get_node(n1, settings={'load_balancing': 'nearest_hostname'}))`
			`assert len(unique_nodes) == 1, unique_nodes`
			`assert unique_nodes == set(['n1'])`

			`def test_load_balancing_in_order():`
			`unique_nodes = set()`
			`for _ in range(0, queries):`
			`unique_nodes.add(get_node(n1, settings={'load_balancing': 'in_order'}))`
			`assert len(unique_nodes) == 1, unique_nodes`
			`assert unique_nodes == set(['n1'])`

			`def test_load_balancing_first_or_random():`
			`unique_nodes = set()`
			`for _ in range(0, queries):`
			`unique_nodes.add(get_node(n1, settings={'load_balancing': 'first_or_random'}))`
			`assert len(unique_nodes) == 1, unique_nodes`
			`assert unique_nodes == set(['n1'])`

			`def test_load_balancing_round_robin():`
			`unique_nodes = set()`
			`for _ in range(0, nodes):`
			`unique_nodes.add(get_node(n1, settings={'load_balancing': 'round_robin'}))`
			`assert len(unique_nodes) == nodes, unique_nodes`
			`assert unique_nodes == set(['n1', 'n2', 'n3'])`
Add a test for distributed_replica_max_ignored_errors 2020-06-13 20:01:17 +00:00
Add a test with negative priority in test_distributed_load_balancing 2020-06-30 08:09:06 +00:00			`@pytest.mark.parametrize('dist_table', [`
			`('dist_priority'),`
			`('dist_priority_negative'),`
			`])`
			`def test_load_balancing_priority_round_robin(dist_table):`
Cover <replica> <priority> in test_distributed_load_balancing 2020-06-27 07:29:47 +00:00			`unique_nodes = set()`
			`for _ in range(0, nodes):`
Add a test with negative priority in test_distributed_load_balancing 2020-06-30 08:09:06 +00:00			`unique_nodes.add(get_node(n1, dist_table, settings={'load_balancing': 'round_robin'}))`
Cover <replica> <priority> in test_distributed_load_balancing 2020-06-27 07:29:47 +00:00			`assert len(unique_nodes) == 2, unique_nodes`
			`# n2 has bigger priority in config`
			`assert unique_nodes == set(['n1', 'n3'])`

Add a test for distributed_replica_max_ignored_errors 2020-06-13 20:01:17 +00:00			`def test_distributed_replica_max_ignored_errors():`
			`settings = {`
			`'load_balancing': 'in_order',`
			`'prefer_localhost_replica': 0,`
			`'connect_timeout': 2,`
			`'receive_timeout': 2,`
			`'send_timeout': 2,`
			`'idle_connection_timeout': 2,`
			`'tcp_keep_alive_timeout': 2,`

			`'distributed_replica_max_ignored_errors': 0,`
			`'distributed_replica_error_half_life': 60,`
			`}`

			`# initiate connection (if started only this test)`
			`n2.query('SELECT * FROM dist', settings=settings)`
			`cluster.pause_container('n1')`

			`# n1 paused -- skipping, and increment error_count for n1`
			`# but the query succeeds, no need in query_and_get_error()`
			`n2.query('SELECT * FROM dist', settings=settings)`
			`# XXX: due to config reloading we need second time (sigh)`
			`n2.query('SELECT * FROM dist', settings=settings)`
			`# check error_count for n1`
			`assert int(n2.query("""`
			`SELECT errors_count FROM system.clusters`
			`WHERE cluster = 'replicas_cluster' AND host_name = 'n1'`
			`""", settings=settings)) == 1`

			`cluster.unpause_container('n1')`
			`# still n2`
			`assert get_node(n2, settings=settings) == 'n2'`
			`# now n1`
			`settings['distributed_replica_max_ignored_errors'] = 1`
			`assert get_node(n2, settings=settings) == 'n1'`