Merge pull request #33625 from ClickHouse/better_integration_tests

Fix two flaky integration tests
This commit is contained in:
alexey-milovidov 2022-01-15 02:32:54 +03:00 committed by GitHub
commit 1e8e21570c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 7 deletions

View File

@@ -41,8 +41,7 @@ def test_cleanup_dir_after_bad_zk_conn(start_cluster):
pm.drop_instance_zk_connections(node1)
time.sleep(3)
error = node1.query_and_get_error(query_create)
assert "Poco::Exception. Code: 1000" and \
"All connection tries failed while connecting to ZooKeeper" in error
time.sleep(3)
error = node1.query_and_get_error(query_create)
assert "Directory for table data data/replica/test/ already exists" not in error
node1.query_with_retry(query_create)

View File

@@ -79,7 +79,15 @@ SELECT sum(x) FROM distributed WITH TOTALS SETTINGS
pm.drop_instance_zk_connections(node_1_2)
pm.drop_instance_zk_connections(node_2_2)
time.sleep(4) # allow pings to zookeeper to timeout (must be greater than ZK session timeout).
# allow pings to zookeeper to timeout (must be greater than ZK session timeout).
for _ in range(30):
try:
node_2_2.query("SELECT * FROM system.zookeeper where path = '/'")
time.sleep(0.5)
except:
break
else:
raise Exception("Connection with zookeeper was not lost")
# At this point all replicas are stale, but the query must still go to second replicas which are the least stale ones.
assert instance_with_dist_table.query('''
@@ -96,14 +104,20 @@ SELECT sum(x) FROM distributed SETTINGS
max_replica_delay_for_distributed_queries=1
''').strip() == '3'
# If we forbid stale replicas, the query must fail.
with pytest.raises(Exception):
print(instance_with_dist_table.query('''
# If we forbid stale replicas, the query must fail. But sometimes we must have bigger timeouts.
for _ in range(20):
try:
instance_with_dist_table.query('''
SELECT count() FROM distributed SETTINGS
load_balancing='in_order',
max_replica_delay_for_distributed_queries=1,
fallback_to_stale_replicas_for_distributed_queries=0
'''))
''')
time.sleep(0.5)
except:
break
else:
raise Exception("Didn't raise when stale replicas are not allowed")
# Now partition off the remote replica of the local shard and test that failover still works.
pm.partition_instances(node_1_1, node_1_2, port=9000)