Test for HostResolver fail_count

This commit is contained in:
Anton Ivashkin 2024-05-02 16:10:05 +02:00 committed by Sema Checherinda
parent 87785e1c38
commit 22f1c197e5
4 changed files with 136 additions and 0 deletions

View File

@ -0,0 +1,12 @@
<clickhouse>
<!-- Overrides the network timeouts of the "default" settings profile. -->
<!-- All values are 5; presumably seconds (TODO confirm against the ClickHouse settings reference). -->
<!-- NOTE(review): presumably kept short so that attempts against an unreachable address fail quickly; confirm. -->
<profiles>
<default>
<!-- Timeouts of the native protocol connections. -->
<connect_timeout>5</connect_timeout>
<receive_timeout>5</receive_timeout>
<send_timeout>5</send_timeout>
<!-- Timeouts of HTTP connections. -->
<http_connection_timeout>5</http_connection_timeout>
<http_send_timeout>5</http_send_timeout>
<http_receive_timeout>5</http_receive_timeout>
</default>
</profiles>
</clickhouse>

View File

@ -0,0 +1,21 @@
<clickhouse>
<!-- Declares one S3 disk and one storage policy, both named "s3". -->
<storage_configuration>
<disks>
<s3>
<type>s3</type>
<!-- The endpoint host "minio1" and port 9001 are what the test resolves to two addresses. -->
<endpoint>http://minio1:9001/root/data/</endpoint>
<!-- NOTE(review): presumably the credentials of the test MinIO container; confirm. -->
<access_key_id>minio</access_key_id>
<secret_access_key>minio123</secret_access_key>
</s3>
</disks>
<policies>
<!-- Policy "s3" is the one the test table selects via storage_policy='s3'. -->
<s3>
<volumes>
<!-- Single volume "main" backed by the "s3" disk declared above. -->
<main>
<disk>s3</disk>
</main>
</volumes>
</s3>
</policies>
</storage_configuration>
</clickhouse>

View File

@ -0,0 +1,103 @@
"""Test Interserver responses on configured IP."""
import pytest
import time
from helpers.cluster import ClickHouseCluster
# Cluster object for this test module; it is started and shut down by the
# module-scoped fixture below.
cluster = ClickHouseCluster(__file__)
# The single ClickHouse instance under test. It loads the two main configs
# listed here and is created with MinIO enabled (the S3 disk endpoint used by
# the test points at the host "minio1").
node = cluster.add_instance(
"node",
main_configs=["configs/config.d/cluster.xml", "configs/config.d/s3.xml"],
with_minio=True,
)
@pytest.fixture(scope="module")
def start_cluster():
    """Start the module-level cluster once and yield it to the tests.

    The cluster is shut down when the module's tests are done, and also when
    start-up itself raises, because the shutdown sits in ``finally``.
    """
    try:
        cluster.start()
        yield cluster
    finally:
        cluster.shutdown()
# The same value as in ClickHouse; it cannot be configured via config for now.
# The test sleeps for this many seconds between its phases.
DEFAULT_RESOLVE_TIME_HISTORY_SECONDS = 2*60
def _failed_addresses_count():
    """Return the current AddressesMarkedAsFailed event counter as an int.

    By default system.events lists only events that have fired at least once,
    so an empty result is treated as zero instead of being compared with the
    literal "0\\n" (which the query never returns in that case).
    """
    raw = node.query(
        "SELECT value FROM system.events WHERE event LIKE 'AddressesMarkedAsFailed'"
    )
    return int(raw) if raw.strip() else 0


def _insert_until_fail_count_changes(partition, baseline, first_value, attempts=100):
    """Insert rows one by one until the failure counter differs from ``baseline``.

    Args:
        partition: value written to the ``key`` column (the table is
            partitioned by key, so this selects the partition).
        baseline: failure counter value to compare against.
        first_value: value written to the ``value`` column by the first insert;
            it is incremented for every following insert.
        attempts: maximum number of inserts to perform.

    Returns:
        A ``(count, next_value)`` tuple: the last observed failure counter
        (equal to ``baseline`` if it never changed) and the value the next
        insert should use.
    """
    count = baseline
    value = first_value
    for _ in range(attempts):
        node.query(f"""
            INSERT INTO test VALUES ({partition},{value})
        """)
        value += 1
        count = _failed_addresses_count()
        if count != baseline:
            break
    return count, value


def test_host_resolver(start_cluster):
    """Check that a dead IP is marked as failed once and retried only after a delay.

    The host "minio1" is made to resolve to two addresses, one of which does
    not answer on port 9001. The test then checks, via the
    AddressesMarkedAsFailed event counter, that:

    1. the dead address eventually gets marked as failed;
    2. further inserts do not increase the counter;
    3. after DEFAULT_RESOLVE_TIME_HISTORY_SECONDS the address is tried again
       (the counter grows);
    4. after one more such interval the address is not tried again yet.

    If the dead address is never picked within 100 inserts in step 1 or 3, the
    test returns early without failing: the resolver picks addresses randomly,
    so this is unexpected but not an error.

    Note: the test sleeps for 2 * DEFAULT_RESOLVE_TIME_HISTORY_SECONDS in total.
    """
    minio_ip = cluster.get_instance_ip("minio1")

    # Make "minio1" resolve to two addresses: the real MinIO server and the
    # ClickHouse node itself.
    node.set_hosts(
        [
            (minio_ip, "minio1"),
            (node.ip_address, "minio1"),  # no answer on 9001 port on this IP
        ]
    )

    # Drop the DNS cache and the cached connections
    # (presumably so that the new hosts entries are picked up).
    node.query("SYSTEM DROP DNS CACHE")
    node.query("SYSTEM DROP CONNECTIONS CACHE")

    node.query("""
        CREATE TABLE test (key UInt32, value UInt32)
        Engine=MergeTree()
        ORDER BY key PARTITION BY key
        SETTINGS storage_policy='s3'
    """)

    # Step 1. HostResolver chooses an IP randomly, so a single call may hit
    # the working IP; keep inserting until the dead one is marked as failed.
    initial_fails, k = _insert_until_fail_count_changes(
        partition=0, baseline=0, first_value=0
    )
    if initial_fails == 0:
        # The dead IP was not chosen in 100 iterations.
        # This is not expected, but it is not an error either,
        # so the test just stops here.
        return

    # initial_fails can be more than 1 if ClickHouse does something in
    # several parallel threads.

    # Step 2. The failed address must not be used again: the counter has to
    # stay the same after every batch of inserts.
    for j in range(10):
        for i in range(10):
            node.query(f"""
                INSERT INTO test VALUES ({i+1},{j+1})
            """)
        fails = _failed_addresses_count()
        assert fails == initial_fails

    # Step 3. Check that ClickHouse tries to recheck the IP after 2 minutes.
    time.sleep(DEFAULT_RESOLVE_TIME_HISTORY_SECONDS)

    intermediate_fails, k = _insert_until_fail_count_changes(
        partition=101, baseline=initial_fails, first_value=k
    )
    if intermediate_fails == initial_fails:
        # The dead IP was not chosen in 100 iterations.
        # This is not expected, but it is not an error either,
        # so the test just stops here.
        return

    # Step 4. After another 2 minutes there should be no new fails: the next
    # retry is expected after 4 minutes.
    time.sleep(DEFAULT_RESOLVE_TIME_HISTORY_SECONDS)

    final_fails, k = _insert_until_fail_count_changes(
        partition=102, baseline=intermediate_fails, first_value=k
    )
    # Compare the counter itself, so a new failure on the very last insert is
    # detected as well.
    assert final_fails == intermediate_fails