integration tests: fix flaky tests

This commit is contained in:
Nikita Fomichev 2024-03-25 11:13:37 +01:00
parent 0eb12d2d27
commit 31cd44457f
6 changed files with 59 additions and 28 deletions

View File

@ -1,4 +1,3 @@
from time import sleep
import pytest
import re
import os.path
@ -164,8 +163,15 @@ def test_replicated_database():
node2.query("INSERT INTO mydb.tbl VALUES (2, 'count')")
node1.query("INSERT INTO mydb.tbl VALUES (3, 'your')")
node2.query("INSERT INTO mydb.tbl VALUES (4, 'chickens')")
node1.query("OPTIMIZE TABLE mydb.tbl ON CLUSTER 'cluster' FINAL")
node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")
# check data in sync
expect = TSV([[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]])
assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
assert node2.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
# Make backup.
backup_name = new_backup_name()
node1.query(
@ -179,13 +185,8 @@ def test_replicated_database():
node1.query(f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name}")
node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")
assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV(
[[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]]
)
assert node2.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV(
[[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]]
)
assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
assert node2.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
def test_different_tables_on_nodes():
@ -427,7 +428,12 @@ def test_replicated_database_async():
node1.query("INSERT INTO mydb.tbl VALUES (22)")
node2.query("INSERT INTO mydb.tbl2 VALUES ('a')")
node2.query("INSERT INTO mydb.tbl2 VALUES ('bb')")
node1.query("OPTIMIZE TABLE mydb.tbl ON CLUSTER 'cluster' FINAL")
node1.query("OPTIMIZE TABLE mydb.tbl2 ON CLUSTER 'cluster' FINAL")
node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")
node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl2")
backup_name = new_backup_name()
[id, status] = node1.query(

View File

@ -37,6 +37,16 @@ def get_status(dictionary_name):
).rstrip("\n")
def get_status_retry(dictionary_name, expect, retry_count=10, sleep_time=0.5):
    """Poll get_status(dictionary_name) until it returns *expect*.

    Makes up to *retry_count* status checks, sleeping *sleep_time* seconds
    between consecutive attempts. Sleeping only BETWEEN attempts (not after
    the last one) avoids wasting sleep_time seconds right before raising.

    Returns the matching status string on success.
    Raises Exception if the expected status never appears.
    """
    for attempt in range(retry_count):
        res = get_status(dictionary_name)
        if res == expect:
            return res
        # Only sleep when another attempt will follow; a trailing sleep
        # before the raise below would add pure dead time to the test.
        if attempt + 1 < retry_count:
            time.sleep(sleep_time)
    raise Exception(f'Expected result "{expect}" did not occur')
def get_last_exception(dictionary_name):
return (
instance.query(
@ -250,6 +260,15 @@ def test_reload_after_fail_by_timer(started_cluster):
assert expected_error in instance.query_and_get_error(
"SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))"
)
# on sanitizers builds it can return 'FAILED_AND_RELOADING' which is not quite right
# add retry for these builds
if (
instance.is_built_with_sanitizer()
and get_status("no_file_2") == "FAILED_AND_RELOADING"
):
get_status_retry("no_file_2", expect="FAILED")
assert get_status("no_file_2") == "FAILED"
# Creating the file source makes the dictionary able to load.

View File

@ -1,4 +1,4 @@
<clickhouse>
<max_server_memory_usage>2000000000</max_server_memory_usage>
<max_server_memory_usage>1500000000</max_server_memory_usage>
<allow_use_jemalloc_memory>false</allow_use_jemalloc_memory>
</clickhouse>
</clickhouse>

View File

@ -22,7 +22,7 @@ def start_cluster():
cluster.shutdown()
GLOBAL_TEST_QUERY_A = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS memory_overcommit_ratio_denominator_for_user=1"
GLOBAL_TEST_QUERY_A = "SELECT groupArray(number) FROM numbers(5000000) SETTINGS memory_overcommit_ratio_denominator_for_user=1"
GLOBAL_TEST_QUERY_B = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS memory_overcommit_ratio_denominator_for_user=80000000"
@ -42,11 +42,9 @@ def test_global_overcommit():
responses_A = list()
responses_B = list()
for i in range(100):
if i % 2 == 0:
responses_A.append(node.get_query_request(GLOBAL_TEST_QUERY_A, user="A"))
else:
responses_B.append(node.get_query_request(GLOBAL_TEST_QUERY_B, user="B"))
for i in range(50):
responses_A.append(node.get_query_request(GLOBAL_TEST_QUERY_A, user="A"))
responses_B.append(node.get_query_request(GLOBAL_TEST_QUERY_B, user="B"))
overcommited_killed = False
for response in responses_A:

View File

@ -17,18 +17,24 @@ def start_cluster():
def test_memory_limit_success():
if node.is_built_with_thread_sanitizer():
pytest.skip(
"tsan build is skipped because it slowly merges the parts, "
"rather than failing over the memory limit"
)
node.query(
"CREATE TABLE test_merge_oom ENGINE=AggregatingMergeTree ORDER BY id EMPTY AS SELECT number%1024 AS id, arrayReduce( 'groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(20000)"
"CREATE TABLE test_merge_oom ENGINE=AggregatingMergeTree ORDER BY id EMPTY AS SELECT number%1024 AS id, arrayReduce('groupArrayState', arrayMap(x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(20000)"
)
node.query("SYSTEM STOP MERGES test_merge_oom")
node.query(
"INSERT INTO test_merge_oom SELECT number%1024 AS id, arrayReduce( 'groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(3000)"
"INSERT INTO test_merge_oom SELECT number%1024 AS id, arrayReduce('groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(3000)"
)
node.query(
"INSERT INTO test_merge_oom SELECT number%1024 AS id, arrayReduce( 'groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(3000)"
"INSERT INTO test_merge_oom SELECT number%1024 AS id, arrayReduce('groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(3000)"
)
node.query(
"INSERT INTO test_merge_oom SELECT number%1024 AS id, arrayReduce( 'groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(3000)"
"INSERT INTO test_merge_oom SELECT number%1024 AS id, arrayReduce('groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(3000)"
)
_, error = node.query_and_get_answer_with_error(

View File

@ -1,7 +1,6 @@
import os
import pytest
import shutil
import time
import pytest
from helpers.cluster import ClickHouseCluster
# Tests that sizes of in-memory caches (mark / uncompressed / index mark / index uncompressed / mmapped file / query cache) can be changed
@ -101,9 +100,10 @@ def test_query_cache_size_is_runtime_configurable(start_cluster):
node.query("SELECT 2 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
node.query("SELECT 3 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
res = node.query_with_retry(
time.sleep(2)
node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
res = node.query(
"SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
check_callback=lambda result: result == "2\n",
)
assert res == "2\n"
@ -116,9 +116,10 @@ def test_query_cache_size_is_runtime_configurable(start_cluster):
node.query("SYSTEM RELOAD CONFIG")
# check that eviction worked as expected
res = node.query_with_retry(
time.sleep(2)
node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
res = node.query(
"SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
check_callback=lambda result: result == "2\n",
)
assert (
res == "2\n"
@ -132,9 +133,10 @@ def test_query_cache_size_is_runtime_configurable(start_cluster):
node.query("SELECT 4 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
node.query("SELECT 5 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
res = node.query_with_retry(
time.sleep(2)
node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
res = node.query(
"SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
check_callback=lambda result: result == "1\n",
)
assert res == "1\n"