ClickHouse/tests/integration/test_distributed_load_balancing/test.py

225 lines
6.5 KiB
Python
Raw Normal View History

2020-06-13 18:33:58 +00:00
# pylint: disable=unused-argument
# pylint: disable=redefined-outer-name
# pylint: disable=line-too-long
import uuid
import pytest
2020-06-13 18:33:58 +00:00
from helpers.cluster import ClickHouseCluster
# One three-node cluster shared by every test in this module; all instances
# are started with the same remote_servers.xml cluster definitions.
cluster = ClickHouseCluster(__file__)
n1, n2, n3 = (
    cluster.add_instance(name, main_configs=["configs/remote_servers.xml"])
    for name in ("n1", "n2", "n3")
)

# Number of instances in the cluster, and how many queries the
# distribution tests issue (ten per instance).
nodes = len(cluster.instances)
queries = 10 * nodes
2020-06-13 18:33:58 +00:00
def bootstrap():
    """Prepare every cluster instance for the tests in this module.

    On each instance this forces a configuration reload and then (re)creates
    the local ``data`` table plus the three Distributed tables (``dist``,
    ``dist_priority``, ``dist_priority_negative``) built on top of it.

    Every table is dropped with ``IF EXISTS`` before it is created, so the
    function can be run again against instances that already hold the tables.
    """
    for n in cluster.instances.values():
        # At startup, server loads configuration files.
        #
        # However ConfigReloader does not know about already loaded files
        # (files is empty()), hence it will always reload the configuration
        # just after server starts (+ 2 seconds, reload timeout).
        #
        # And on configuration reload the clusters will be re-created, so some
        # internal stuff will be reset:
        # - error_count
        # - last_used (round_robin)
        #
        # And if the reload will happen during round_robin test it will start
        # querying from the beginning, so let's issue config reload just after
        # start to avoid reload in the middle of the test execution.
        n.query("SYSTEM RELOAD CONFIG")

        n.query("DROP TABLE IF EXISTS data")
        n.query("DROP TABLE IF EXISTS dist")
        # The priority tables get the same treatment as data/dist so that a
        # repeated bootstrap does not fail on "table already exists".
        n.query("DROP TABLE IF EXISTS dist_priority")
        n.query("DROP TABLE IF EXISTS dist_priority_negative")

        n.query("CREATE TABLE data (key Int) Engine=Memory()")
        n.query(
            """
        CREATE TABLE dist AS data
        Engine=Distributed(
            replicas_cluster,
            currentDatabase(),
            data)
        """
        )
        n.query(
            """
        CREATE TABLE dist_priority AS data
        Engine=Distributed(
            replicas_priority_cluster,
            currentDatabase(),
            data)
        """
        )
        n.query(
            """
        CREATE TABLE dist_priority_negative AS data
        Engine=Distributed(
            replicas_priority_negative_cluster,
            currentDatabase(),
            data)
        """
        )
2020-06-13 18:33:58 +00:00
2020-06-13 18:33:58 +00:00
def make_uuid():
    """Return a freshly generated random UUID as a 32-character hex string."""
    fresh = uuid.uuid4()
    return fresh.hex
@pytest.fixture(scope="module", autouse=True)
def start_cluster():
    """Start the cluster once per module and always shut it down afterwards.

    The fixture is ``autouse``, so tests get a started and bootstrapped
    cluster without requesting it. It yields the module-level ``cluster``.
    """
    try:
        cluster.start()
        bootstrap()

        yield cluster
    finally:
        # Runs on normal teardown and also when start()/bootstrap() raised.
        cluster.shutdown()
def get_node(query_node, table="dist", *args, **kwargs):
    """Run ``SELECT * FROM <table>`` on *query_node* and report who served it.

    The SELECT is issued with a unique ``query_id`` and with query logging
    enabled. Afterwards the logs are flushed on every instance and
    ``system.query_log`` is searched, across ``shards_cluster``, for the
    most recent finished non-initial query that belongs to that id. The shard
    number of that log entry is mapped to a host name via ``system.clusters``.

    Args:
        query_node: instance on which the SELECT (and the lookup) is executed.
        table: name of the table to select from.
        *args: forwarded to ``query_node.query`` for the SELECT.
        **kwargs: forwarded to ``query_node.query`` for the SELECT. A
            ``settings`` dict, if given, is merged with the settings this
            helper forces; the forced ones win on conflict. The caller's dict
            is not modified.

    Returns:
        The stripped output of the lookup query (a host name, or an empty
        string when no matching log entry was found).
    """
    query_id = make_uuid()
    forced_settings = {
        "query_id": query_id,
        "log_queries": 1,
        "log_queries_min_type": "QUERY_START",
        "prefer_localhost_replica": 0,
    }
    # Merge into a fresh dict: updating the caller's dict in place would leak
    # query_id and the logging settings into the caller's later queries.
    caller_settings = kwargs.get("settings") or {}
    kwargs["settings"] = {**caller_settings, **forced_settings}

    query_node.query("SELECT * FROM " + table, *args, **kwargs)

    # Make sure the query_log entries are visible on every instance before
    # looking them up.
    for n in cluster.instances.values():
        n.query("SYSTEM FLUSH LOGS")

    rows = query_node.query(
        """
    SELECT c.host_name
    FROM (
        SELECT _shard_num
        FROM cluster(shards_cluster, system.query_log)
        WHERE
            initial_query_id = '{query_id}' AND
            is_initial_query = 0 AND
            type = 'QueryFinish'
        ORDER BY event_date DESC, event_time DESC
        LIMIT 1
    ) a
    JOIN system.clusters c
    ON a._shard_num = c.shard_num WHERE cluster = 'shards_cluster'
    """.format(
            query_id=query_id
        )
    )
    return rows.strip()
2020-06-13 18:33:58 +00:00
# TODO: right now random distribution looks bad, but works
def test_load_balancing_default():
    """With ``random`` balancing, `queries` SELECTs must reach every node."""
    seen = {
        get_node(n1, settings={"load_balancing": "random"})
        for _ in range(queries)
    }
    assert len(seen) == nodes, seen
2020-06-13 18:33:58 +00:00
def test_load_balancing_nearest_hostname():
    """With ``nearest_hostname`` every query issued from n1 is served by n1."""
    seen = {
        get_node(n1, settings={"load_balancing": "nearest_hostname"})
        for _ in range(queries)
    }
    assert len(seen) == 1, seen
    assert seen == {"n1"}
2020-06-13 18:33:58 +00:00
2020-06-13 18:33:58 +00:00
def test_load_balancing_in_order():
    """With ``in_order`` every query issued from n1 is served by n1."""
    seen = {
        get_node(n1, settings={"load_balancing": "in_order"})
        for _ in range(queries)
    }
    assert len(seen) == 1, seen
    assert seen == {"n1"}
2020-06-13 18:33:58 +00:00
2020-06-13 18:33:58 +00:00
def test_load_balancing_first_or_random():
    """With ``first_or_random`` every query issued from n1 is served by n1."""
    seen = {
        get_node(n1, settings={"load_balancing": "first_or_random"})
        for _ in range(queries)
    }
    assert len(seen) == 1, seen
    assert seen == {"n1"}
2020-06-13 18:33:58 +00:00
2020-06-13 18:33:58 +00:00
def test_load_balancing_round_robin():
    """With ``round_robin``, `nodes` consecutive queries hit each node once."""
    seen = {
        get_node(n1, settings={"load_balancing": "round_robin"})
        for _ in range(nodes)
    }
    assert len(seen) == nodes, seen
    assert seen == {"n1", "n2", "n3"}
@pytest.mark.parametrize(
    "dist_table",
    ["dist_priority", "dist_priority_negative"],
)
def test_load_balancing_priority_round_robin(dist_table):
    """Round-robin over a cluster with replica priorities must skip n2.

    Runs once per Distributed table built on a priority-configured cluster and
    expects `nodes` consecutive queries to be served by n1 and n3 only.
    """
    seen = {
        get_node(n1, dist_table, settings={"load_balancing": "round_robin"})
        for _ in range(nodes)
    }
    assert len(seen) == 2, seen
    # n2 has bigger priority in config
    assert seen == {"n1", "n3"}
def test_distributed_replica_max_ignored_errors():
    """Check that ``distributed_replica_max_ignored_errors`` affects replica choice.

    n1 is paused so that queries from n2 record an error for it. After n1 is
    resumed, an ``in_order`` query with the limit at 0 must still go to n2,
    and with the limit raised to 1 it must go to n1 again.

    n1 is resumed in a ``finally`` block, so a failing query or assertion
    cannot leave the container paused for the remaining tests and teardown.
    """
    settings = {
        "use_hedged_requests": 0,
        "load_balancing": "in_order",
        "prefer_localhost_replica": 0,
        "connect_timeout": 2,
        "receive_timeout": 2,
        "send_timeout": 2,
        "idle_connection_timeout": 2,
        "tcp_keep_alive_timeout": 2,
        "distributed_replica_max_ignored_errors": 0,
        "distributed_replica_error_half_life": 60,
    }

    # initiate connection (if started only this test)
    n2.query("SELECT * FROM dist", settings=settings)

    cluster.pause_container("n1")
    try:
        # n1 paused -- skipping, and increment error_count for n1
        # but the query succeeds, no need in query_and_get_error()
        n2.query("SELECT * FROM dist", settings=settings)
        # XXX: due to config reloading we need second time (sigh)
        n2.query("SELECT * FROM dist", settings=settings)

        # check error_count for n1
        errors_count = int(
            n2.query(
                """
        SELECT errors_count FROM system.clusters
        WHERE cluster = 'replicas_cluster' AND host_name = 'n1'
        """,
                settings=settings,
            )
        )
        assert errors_count == 1
    finally:
        # Always resume n1, even when something above failed.
        cluster.unpause_container("n1")

    # still n2
    assert get_node(n2, settings=settings) == "n2"
    # now n1
    settings["distributed_replica_max_ignored_errors"] = 1
    assert get_node(n2, settings=settings) == "n1"