Merge pull request #33625 from ClickHouse/better_integration_tests

Fix two flaky integration tests
2024-11-22 15:42:02 +00:00 · 2022-01-15 02:32:54 +03:00 · 2022-01-15 02:32:54 +03:00 · 1e8e21570c
commit 1e8e21570c
parent c3972f000d 436ad63958
2 changed files with 20 additions and 7 deletions
--- a/tests/integration/test_cleanup_dir_after_bad_zk_conn/test.py
+++ b/tests/integration/test_cleanup_dir_after_bad_zk_conn/test.py
@@ -41,8 +41,7 @@ def test_cleanup_dir_after_bad_zk_conn(start_cluster):
        pm.drop_instance_zk_connections(node1)
        time.sleep(3)
        error = node1.query_and_get_error(query_create)
-        assert "Poco::Exception. Code: 1000" and \
-               "All connection tries failed while connecting to ZooKeeper" in error
+        time.sleep(3)
        error = node1.query_and_get_error(query_create)
        assert "Directory for table data data/replica/test/ already exists" not in error
    node1.query_with_retry(query_create)
--- a/tests/integration/test_delayed_replica_failover/test.py
+++ b/tests/integration/test_delayed_replica_failover/test.py
@@ -79,7 +79,15 @@ SELECT sum(x) FROM distributed WITH TOTALS SETTINGS
        pm.drop_instance_zk_connections(node_1_2)
        pm.drop_instance_zk_connections(node_2_2)

-        time.sleep(4)  # allow pings to zookeeper to timeout (must be greater than ZK session timeout).
+        # allow pings to zookeeper to timeout (must be greater than ZK session timeout).
+        for _ in range(30):
+            try:
+                node_2_2.query("SELECT * FROM system.zookeeper where path = '/'")
+                time.sleep(0.5)
+            except:
+                break
+        else:
+            raise Exception("Connection with zookeeper was not lost")

        # At this point all replicas are stale, but the query must still go to second replicas which are the least stale ones.
        assert instance_with_dist_table.query('''
@@ -96,14 +104,20 @@ SELECT sum(x) FROM distributed SETTINGS
    max_replica_delay_for_distributed_queries=1
 ''').strip() == '3'

-        # If we forbid stale replicas, the query must fail.
-        with pytest.raises(Exception):
-            print(instance_with_dist_table.query('''
+        # If we forbid stale replicas, the query must fail. But sometimes we must have bigger timeouts.
+        for _ in range(20):
+            try:
+                instance_with_dist_table.query('''
 SELECT count() FROM distributed SETTINGS
    load_balancing='in_order',
    max_replica_delay_for_distributed_queries=1,
    fallback_to_stale_replicas_for_distributed_queries=0
-'''))
+''')
+                time.sleep(0.5)
+            except:
+                break
+        else:
+            raise Exception("Didn't raise when stale replicas are not allowed")

        # Now partition off the remote replica of the local shard and test that failover still works.
        pm.partition_instances(node_1_1, node_1_2, port=9000)