Test for HostResolver fail_count

2024-11-22 15:42:02 +00:00 · 2024-05-02 16:10:05 +02:00 · 2024-05-02 16:10:05 +02:00 · 22f1c197e5
commit 22f1c197e5
parent 87785e1c38
4 changed files with 136 additions and 0 deletions
--- a/tests/integration/test_host_resolver_fail_count/init.py
+++ b/tests/integration/test_host_resolver_fail_count/init.py
--- a/tests/integration/test_host_resolver_fail_count/configs/config.d/cluster.xml
+++ b/tests/integration/test_host_resolver_fail_count/configs/config.d/cluster.xml
@ -0,0 +1,12 @@
+<clickhouse>
+    <profiles>
+        <default>
+            <connect_timeout>5</connect_timeout>
+            <receive_timeout>5</receive_timeout>
+            <send_timeout>5</send_timeout>
+            <http_connection_timeout>5</http_connection_timeout>
+            <http_send_timeout>5</http_send_timeout>
+            <http_receive_timeout>5</http_receive_timeout>
+        </default>
+    </profiles>
+</clickhouse>
--- a/tests/integration/test_host_resolver_fail_count/configs/config.d/s3.xml
+++ b/tests/integration/test_host_resolver_fail_count/configs/config.d/s3.xml
@ -0,0 +1,21 @@
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <s3>
+                <type>s3</type>
+                <endpoint>http://minio1:9001/root/data/</endpoint>
+                <access_key_id>minio</access_key_id>
+                <secret_access_key>minio123</secret_access_key>
+            </s3>
+        </disks>
+        <policies>
+            <s3>
+                <volumes>
+                    <main>
+                        <disk>s3</disk>
+                    </main>
+                </volumes>
+            </s3>
+        </policies>
+    </storage_configuration>
+</clickhouse>
--- a/tests/integration/test_host_resolver_fail_count/test_case.py
+++ b/tests/integration/test_host_resolver_fail_count/test_case.py
@ -0,0 +1,103 @@
+"""Test for HostResolver fail count (the AddressesMarkedAsFailed event)."""
+import pytest
+import time
+from helpers.cluster import ClickHouseCluster
+
+cluster = ClickHouseCluster(__file__)
+
+node = cluster.add_instance(
+    "node",
+    main_configs=["configs/config.d/cluster.xml", "configs/config.d/s3.xml"],
+    with_minio=True,
+)
+
+@pytest.fixture(scope="module")
+def start_cluster():
+    try:
+        cluster.start()
+        yield cluster
+
+    finally:
+        cluster.shutdown()
+
+# The same value as in ClickHouse; it cannot be configured via config for now
+DEFAULT_RESOLVE_TIME_HISTORY_SECONDS = 2*60
+
+
+def test_host_resolver(start_cluster):
+    minio_ip = cluster.get_instance_ip("minio1")
+
+    # Make "minio1" resolve to two IPs (one of them dead), then drop the DNS cache
+    node.set_hosts([
+        (minio_ip, "minio1"),
+        (node.ip_address, "minio1"), # no answer on 9001 port on this IP
+        ])
+    
+    node.query("SYSTEM DROP DNS CACHE")
+    node.query("SYSTEM DROP CONNECTIONS CACHE")
+
+    node.query("""
+                CREATE TABLE test (key UInt32, value UInt32)
+                Engine=MergeTree()
+                ORDER BY key PARTITION BY key
+                SETTINGS storage_policy='s3'
+                """)
+
+    initial_fails = "0\n"
+    k = 0
+    limit = 100
+    while initial_fails == "0\n":
+        node.query(f"""
+                    INSERT INTO test VALUES (0,{k})
+                    """)
+        # HostResolver chooses an IP randomly, so a single call may pick the working IP
+        initial_fails = node.query("SELECT value FROM system.events WHERE event LIKE 'AddressesMarkedAsFailed'")
+        k += 1
+        if k >= limit:
+            # The dead IP was not chosen in 100 iterations.
+            # This is unexpected, but it is not actually an error,
+            # so the test simply stops here.
+            return
+
+    # initial_fails can be more than 1 if clickhouse does something in several parallel threads
+
+    for j in range(10):
+        for i in range(10):
+            node.query(f"""
+                        INSERT INTO test VALUES ({i+1},{j+1})
+                        """)
+            fails = node.query("SELECT value FROM system.events WHERE event LIKE 'AddressesMarkedAsFailed'")
+            assert fails == initial_fails
+
+    # Check that clickhouse tries to recheck IP after 2 minutes
+    time.sleep(DEFAULT_RESOLVE_TIME_HISTORY_SECONDS)
+
+    intermediate_fails = initial_fails
+    limit = k + 100
+    while intermediate_fails == initial_fails:
+        node.query(f"""
+                    INSERT INTO test VALUES (101,{k})
+                    """)
+        intermediate_fails = node.query("SELECT value FROM system.events WHERE event LIKE 'AddressesMarkedAsFailed'")
+        k += 1
+        if k >= limit:
+            # The dead IP was not chosen in 100 iterations.
+            # This is unexpected, but it is not actually an error,
+            # so the test simply stops here.
+            return
+
+    # After another 2 minutes there should be no new fails; the next retry comes after 4 minutes
+    time.sleep(DEFAULT_RESOLVE_TIME_HISTORY_SECONDS)
+
+    initial_fails = intermediate_fails
+    limit = k + 100
+    while intermediate_fails == initial_fails:
+        node.query(f"""
+                    INSERT INTO test VALUES (102,{k})
+                    """)
+        intermediate_fails = node.query("SELECT value FROM system.events WHERE event LIKE 'AddressesMarkedAsFailed'")
+        k += 1
+        if k >= limit:
+            break
+
+    assert k == limit