ClickHouse/tests/integration/test_distributed_load_balancing/test.py
Azat Khuzhin bd45592539 Fix test_distributed_load_balancing flaps (due to config reload)
At startup, server loads configuration files.

However ConfigReloader does not know about already loaded files (files
is empty()), hence it will always reload the configuration just after
server starts (+ 2 seconds, reload timeout).

And on configuration reload the clusters will be re-created, so some
internal stuff will be reseted:
- error_count
- last_used (round_robing)

And if the reload will happen during round_robin test it will start
querying from the beginning, so let's issue config reload just after
start to avoid reload in the middle of the test execution.
2020-06-21 12:01:15 +03:00

166 lines
5.5 KiB
Python

# pylint: disable=unused-argument
# pylint: disable=redefined-outer-name
# pylint: disable=line-too-long
import uuid
import pytest
from helpers.cluster import ClickHouseCluster
cluster = ClickHouseCluster(__file__)
n1 = cluster.add_instance('n1', main_configs=['configs/remote_servers.xml'])
n2 = cluster.add_instance('n2', main_configs=['configs/remote_servers.xml'])
n3 = cluster.add_instance('n3', main_configs=['configs/remote_servers.xml'])
nodes = len(cluster.instances)
queries = nodes*5
def bootstrap():
for n in cluster.instances.values():
# At startup, server loads configuration files.
#
# However ConfigReloader does not know about already loaded files
# (files is empty()), hence it will always reload the configuration
# just after server starts (+ 2 seconds, reload timeout).
#
# And on configuration reload the clusters will be re-created, so some
# internal stuff will be reseted:
# - error_count
# - last_used (round_robing)
#
# And if the reload will happen during round_robin test it will start
# querying from the beginning, so let's issue config reload just after
# start to avoid reload in the middle of the test execution.
n.query('SYSTEM RELOAD CONFIG')
n.query('DROP TABLE IF EXISTS data')
n.query('DROP TABLE IF EXISTS dist')
n.query('CREATE TABLE data (key Int) Engine=Memory()')
n.query("""
CREATE TABLE dist AS data
Engine=Distributed(
replicas_cluster,
currentDatabase(),
data)
""".format())
def make_uuid():
return uuid.uuid4().hex
@pytest.fixture(scope='module', autouse=True)
def start_cluster():
try:
cluster.start()
bootstrap()
yield cluster
finally:
cluster.shutdown()
def get_node(query_node, *args, **kwargs):
query_id = make_uuid()
settings = {
'query_id': query_id,
'log_queries': 1,
'log_queries_min_type': 'QUERY_START',
'prefer_localhost_replica': 0,
}
if 'settings' not in kwargs:
kwargs['settings'] = settings
else:
kwargs['settings'].update(settings)
query_node.query('SELECT * FROM dist', *args, **kwargs)
for n in cluster.instances.values():
n.query('SYSTEM FLUSH LOGS')
rows = query_node.query("""
SELECT c.host_name
FROM (
SELECT _shard_num
FROM cluster(shards_cluster, system.query_log)
WHERE
initial_query_id = '{query_id}' AND
is_initial_query = 0 AND
type = 'QueryFinish'
ORDER BY event_date DESC, event_time DESC
LIMIT 1
) a
JOIN system.clusters c
ON a._shard_num = c.shard_num AND cluster = 'shards_cluster'
""".format(query_id=query_id))
return rows.strip()
# TODO: right now random distribution looks bad, but works
def test_load_balancing_default():
unique_nodes = set()
for _ in range(0, queries):
unique_nodes.add(get_node(n1, settings={'load_balancing': 'random'}))
assert len(unique_nodes) == nodes, unique_nodes
def test_load_balancing_nearest_hostname():
unique_nodes = set()
for _ in range(0, queries):
unique_nodes.add(get_node(n1, settings={'load_balancing': 'nearest_hostname'}))
assert len(unique_nodes) == 1, unique_nodes
assert unique_nodes == set(['n1'])
def test_load_balancing_in_order():
unique_nodes = set()
for _ in range(0, queries):
unique_nodes.add(get_node(n1, settings={'load_balancing': 'in_order'}))
assert len(unique_nodes) == 1, unique_nodes
assert unique_nodes == set(['n1'])
def test_load_balancing_first_or_random():
unique_nodes = set()
for _ in range(0, queries):
unique_nodes.add(get_node(n1, settings={'load_balancing': 'first_or_random'}))
assert len(unique_nodes) == 1, unique_nodes
assert unique_nodes == set(['n1'])
# TODO: last_used will be reset on config reload, hence may fail
def test_load_balancing_round_robin():
unique_nodes = set()
for _ in range(0, nodes):
unique_nodes.add(get_node(n1, settings={'load_balancing': 'round_robin'}))
assert len(unique_nodes) == nodes, unique_nodes
assert unique_nodes == set(['n1', 'n2', 'n3'])
def test_distributed_replica_max_ignored_errors():
settings = {
'load_balancing': 'in_order',
'prefer_localhost_replica': 0,
'connect_timeout': 2,
'receive_timeout': 2,
'send_timeout': 2,
'idle_connection_timeout': 2,
'tcp_keep_alive_timeout': 2,
'distributed_replica_max_ignored_errors': 0,
'distributed_replica_error_half_life': 60,
}
# initiate connection (if started only this test)
n2.query('SELECT * FROM dist', settings=settings)
cluster.pause_container('n1')
# n1 paused -- skipping, and increment error_count for n1
# but the query succeeds, no need in query_and_get_error()
n2.query('SELECT * FROM dist', settings=settings)
# XXX: due to config reloading we need second time (sigh)
n2.query('SELECT * FROM dist', settings=settings)
# check error_count for n1
assert int(n2.query("""
SELECT errors_count FROM system.clusters
WHERE cluster = 'replicas_cluster' AND host_name = 'n1'
""", settings=settings)) == 1
cluster.unpause_container('n1')
# still n2
assert get_node(n2, settings=settings) == 'n2'
# now n1
settings['distributed_replica_max_ignored_errors'] = 1
assert get_node(n2, settings=settings) == 'n1'