ClickHouse/tests/integration/test_distributed_load_balancing/test.py
Azat Khuzhin 4b10b978e8 Increase number of iterations in test_distributed_load_balancing (fixes flakiness)
Before there was some flakiness, because for 15 iterations it does
generate all unique values (3 unique nodes) [1].

  [1]: https://clickhouse-test-reports.s3.yandex.net/28604/9911b3ea368a7745934517747e62db3217cddb00/integration_tests_(thread).html#fail1

I've tried other techincs (cap random value to the maximum number of
nodes, other RNG, but these does not solve it either).
2021-09-08 10:49:01 +03:00

203 lines
6.3 KiB
Python

# pylint: disable=unused-argument
# pylint: disable=redefined-outer-name
# pylint: disable=line-too-long
import uuid
import pytest
from helpers.cluster import ClickHouseCluster
cluster = ClickHouseCluster(__file__)
n1 = cluster.add_instance('n1', main_configs=['configs/remote_servers.xml'])
n2 = cluster.add_instance('n2', main_configs=['configs/remote_servers.xml'])
n3 = cluster.add_instance('n3', main_configs=['configs/remote_servers.xml'])
nodes = len(cluster.instances)
queries = nodes * 10
def bootstrap():
for n in list(cluster.instances.values()):
# At startup, server loads configuration files.
#
# However ConfigReloader does not know about already loaded files
# (files is empty()), hence it will always reload the configuration
# just after server starts (+ 2 seconds, reload timeout).
#
# And on configuration reload the clusters will be re-created, so some
# internal stuff will be reset:
# - error_count
# - last_used (round_robing)
#
# And if the reload will happen during round_robin test it will start
# querying from the beginning, so let's issue config reload just after
# start to avoid reload in the middle of the test execution.
n.query('SYSTEM RELOAD CONFIG')
n.query('DROP TABLE IF EXISTS data')
n.query('DROP TABLE IF EXISTS dist')
n.query('CREATE TABLE data (key Int) Engine=Memory()')
n.query("""
CREATE TABLE dist AS data
Engine=Distributed(
replicas_cluster,
currentDatabase(),
data)
""")
n.query("""
CREATE TABLE dist_priority AS data
Engine=Distributed(
replicas_priority_cluster,
currentDatabase(),
data)
""")
n.query("""
CREATE TABLE dist_priority_negative AS data
Engine=Distributed(
replicas_priority_negative_cluster,
currentDatabase(),
data)
""")
def make_uuid():
return uuid.uuid4().hex
@pytest.fixture(scope='module', autouse=True)
def start_cluster():
try:
cluster.start()
bootstrap()
yield cluster
finally:
cluster.shutdown()
def get_node(query_node, table='dist', *args, **kwargs):
query_id = make_uuid()
settings = {
'query_id': query_id,
'log_queries': 1,
'log_queries_min_type': 'QUERY_START',
'prefer_localhost_replica': 0,
}
if 'settings' not in kwargs:
kwargs['settings'] = settings
else:
kwargs['settings'].update(settings)
query_node.query('SELECT * FROM ' + table, *args, **kwargs)
for n in list(cluster.instances.values()):
n.query('SYSTEM FLUSH LOGS')
rows = query_node.query("""
SELECT c.host_name
FROM (
SELECT _shard_num
FROM cluster(shards_cluster, system.query_log)
WHERE
initial_query_id = '{query_id}' AND
is_initial_query = 0 AND
type = 'QueryFinish'
ORDER BY event_date DESC, event_time DESC
LIMIT 1
) a
JOIN system.clusters c
ON a._shard_num = c.shard_num WHERE cluster = 'shards_cluster'
""".format(query_id=query_id))
return rows.strip()
# TODO: right now random distribution looks bad, but works
def test_load_balancing_default():
unique_nodes = set()
for _ in range(0, queries):
unique_nodes.add(get_node(n1, settings={'load_balancing': 'random'}))
assert len(unique_nodes) == nodes, unique_nodes
def test_load_balancing_nearest_hostname():
unique_nodes = set()
for _ in range(0, queries):
unique_nodes.add(get_node(n1, settings={'load_balancing': 'nearest_hostname'}))
assert len(unique_nodes) == 1, unique_nodes
assert unique_nodes == set(['n1'])
def test_load_balancing_in_order():
unique_nodes = set()
for _ in range(0, queries):
unique_nodes.add(get_node(n1, settings={'load_balancing': 'in_order'}))
assert len(unique_nodes) == 1, unique_nodes
assert unique_nodes == set(['n1'])
def test_load_balancing_first_or_random():
unique_nodes = set()
for _ in range(0, queries):
unique_nodes.add(get_node(n1, settings={'load_balancing': 'first_or_random'}))
assert len(unique_nodes) == 1, unique_nodes
assert unique_nodes == set(['n1'])
def test_load_balancing_round_robin():
unique_nodes = set()
for _ in range(0, nodes):
unique_nodes.add(get_node(n1, settings={'load_balancing': 'round_robin'}))
assert len(unique_nodes) == nodes, unique_nodes
assert unique_nodes == set(['n1', 'n2', 'n3'])
@pytest.mark.parametrize('dist_table', [
('dist_priority'),
('dist_priority_negative'),
])
def test_load_balancing_priority_round_robin(dist_table):
unique_nodes = set()
for _ in range(0, nodes):
unique_nodes.add(get_node(n1, dist_table, settings={'load_balancing': 'round_robin'}))
assert len(unique_nodes) == 2, unique_nodes
# n2 has bigger priority in config
assert unique_nodes == set(['n1', 'n3'])
def test_distributed_replica_max_ignored_errors():
settings = {
'use_hedged_requests' : 0,
'load_balancing': 'in_order',
'prefer_localhost_replica': 0,
'connect_timeout': 2,
'receive_timeout': 2,
'send_timeout': 2,
'idle_connection_timeout': 2,
'tcp_keep_alive_timeout': 2,
'distributed_replica_max_ignored_errors': 0,
'distributed_replica_error_half_life': 60,
}
# initiate connection (if started only this test)
n2.query('SELECT * FROM dist', settings=settings)
cluster.pause_container('n1')
# n1 paused -- skipping, and increment error_count for n1
# but the query succeeds, no need in query_and_get_error()
n2.query('SELECT * FROM dist', settings=settings)
# XXX: due to config reloading we need second time (sigh)
n2.query('SELECT * FROM dist', settings=settings)
# check error_count for n1
assert int(n2.query("""
SELECT errors_count FROM system.clusters
WHERE cluster = 'replicas_cluster' AND host_name = 'n1'
""", settings=settings)) == 1
cluster.unpause_container('n1')
# still n2
assert get_node(n2, settings=settings) == 'n2'
# now n1
settings['distributed_replica_max_ignored_errors'] = 1
assert get_node(n2, settings=settings) == 'n1'