integration tests: fix flaky tests

This commit is contained in:
Nikita Fomichev 2024-03-25 11:13:37 +01:00
parent 0eb12d2d27
commit 31cd44457f
6 changed files with 59 additions and 28 deletions

View File

@ -1,4 +1,3 @@
from time import sleep
import pytest
import re
import os.path
@ -164,8 +163,15 @@ def test_replicated_database():
node2.query("INSERT INTO mydb.tbl VALUES (2, 'count')")
node1.query("INSERT INTO mydb.tbl VALUES (3, 'your')")
node2.query("INSERT INTO mydb.tbl VALUES (4, 'chickens')")
node1.query("OPTIMIZE TABLE mydb.tbl ON CLUSTER 'cluster' FINAL")
node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")
# check data in sync
expect = TSV([[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]])
assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
assert node2.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
# Make backup.
backup_name = new_backup_name()
node1.query(
@ -179,13 +185,8 @@ def test_replicated_database():
node1.query(f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name}")
node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")
assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV(
[[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]]
)
assert node2.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV(
[[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]]
)
assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
assert node2.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
def test_different_tables_on_nodes():
@ -427,7 +428,12 @@ def test_replicated_database_async():
node1.query("INSERT INTO mydb.tbl VALUES (22)")
node2.query("INSERT INTO mydb.tbl2 VALUES ('a')")
node2.query("INSERT INTO mydb.tbl2 VALUES ('bb')")
node1.query("OPTIMIZE TABLE mydb.tbl ON CLUSTER 'cluster' FINAL")
node1.query("OPTIMIZE TABLE mydb.tbl2 ON CLUSTER 'cluster' FINAL")
node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")
node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl2")
backup_name = new_backup_name()
[id, status] = node1.query(

View File

@ -37,6 +37,16 @@ def get_status(dictionary_name):
).rstrip("\n")
def get_status_retry(dictionary_name, expect, retry_count=10, sleep_time=0.5):
    """Poll get_status(dictionary_name) until it returns *expect*.

    Makes up to *retry_count* status checks, sleeping *sleep_time* seconds
    between consecutive attempts. Sleeping only BETWEEN attempts (not after
    the last one) avoids wasting sleep_time seconds right before raising.

    Returns the matching status string on success.
    Raises Exception if the expected status never appears.
    """
    for attempt in range(retry_count):
        res = get_status(dictionary_name)
        if res == expect:
            return res
        # Only sleep when another attempt will follow; a trailing sleep
        # before the raise below would add pure dead time to the test.
        if attempt + 1 < retry_count:
            time.sleep(sleep_time)
    raise Exception(f'Expected result "{expect}" did not occur')
def get_last_exception(dictionary_name):
return (
instance.query(
@ -250,6 +260,15 @@ def test_reload_after_fail_by_timer(started_cluster):
assert expected_error in instance.query_and_get_error(
"SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))"
)
# on sanitizers builds it can return 'FAILED_AND_RELOADING' which is not quite right
# add retry for these builds
if (
instance.is_built_with_sanitizer()
and get_status("no_file_2") == "FAILED_AND_RELOADING"
):
get_status_retry("no_file_2", expect="FAILED")
assert get_status("no_file_2") == "FAILED"
# Creating the file source makes the dictionary able to load.

View File

@ -1,4 +1,4 @@
<clickhouse>
<max_server_memory_usage>2000000000</max_server_memory_usage>
<max_server_memory_usage>1500000000</max_server_memory_usage>
<allow_use_jemalloc_memory>false</allow_use_jemalloc_memory>
</clickhouse>
</clickhouse>

View File

@ -22,7 +22,7 @@ def start_cluster():
cluster.shutdown()
GLOBAL_TEST_QUERY_A = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS memory_overcommit_ratio_denominator_for_user=1"
GLOBAL_TEST_QUERY_A = "SELECT groupArray(number) FROM numbers(5000000) SETTINGS memory_overcommit_ratio_denominator_for_user=1"
GLOBAL_TEST_QUERY_B = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS memory_overcommit_ratio_denominator_for_user=80000000"
@ -42,11 +42,9 @@ def test_global_overcommit():
responses_A = list()
responses_B = list()
for i in range(100):
if i % 2 == 0:
responses_A.append(node.get_query_request(GLOBAL_TEST_QUERY_A, user="A"))
else:
responses_B.append(node.get_query_request(GLOBAL_TEST_QUERY_B, user="B"))
for i in range(50):
responses_A.append(node.get_query_request(GLOBAL_TEST_QUERY_A, user="A"))
responses_B.append(node.get_query_request(GLOBAL_TEST_QUERY_B, user="B"))
overcommited_killed = False
for response in responses_A:

View File

@ -17,18 +17,24 @@ def start_cluster():
def test_memory_limit_success():
if node.is_built_with_thread_sanitizer():
pytest.skip(
"tsan build is skipped because it slowly merges the parts, "
"rather than failing over the memory limit"
)
node.query(
"CREATE TABLE test_merge_oom ENGINE=AggregatingMergeTree ORDER BY id EMPTY AS SELECT number%1024 AS id, arrayReduce( 'groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(20000)"
"CREATE TABLE test_merge_oom ENGINE=AggregatingMergeTree ORDER BY id EMPTY AS SELECT number%1024 AS id, arrayReduce('groupArrayState', arrayMap(x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(20000)"
)
node.query("SYSTEM STOP MERGES test_merge_oom")
node.query(
"INSERT INTO test_merge_oom SELECT number%1024 AS id, arrayReduce( 'groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(3000)"
"INSERT INTO test_merge_oom SELECT number%1024 AS id, arrayReduce('groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(3000)"
)
node.query(
"INSERT INTO test_merge_oom SELECT number%1024 AS id, arrayReduce( 'groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(3000)"
"INSERT INTO test_merge_oom SELECT number%1024 AS id, arrayReduce('groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(3000)"
)
node.query(
"INSERT INTO test_merge_oom SELECT number%1024 AS id, arrayReduce( 'groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(3000)"
"INSERT INTO test_merge_oom SELECT number%1024 AS id, arrayReduce('groupArrayState', arrayMap( x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(3000)"
)
_, error = node.query_and_get_answer_with_error(

View File

@ -1,7 +1,6 @@
import os
import pytest
import shutil
import time
import pytest
from helpers.cluster import ClickHouseCluster
# Tests that sizes of in-memory caches (mark / uncompressed / index mark / index uncompressed / mmapped file / query cache) can be changed
@ -101,9 +100,10 @@ def test_query_cache_size_is_runtime_configurable(start_cluster):
node.query("SELECT 2 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
node.query("SELECT 3 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
res = node.query_with_retry(
time.sleep(2)
node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
res = node.query(
"SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
check_callback=lambda result: result == "2\n",
)
assert res == "2\n"
@ -116,9 +116,10 @@ def test_query_cache_size_is_runtime_configurable(start_cluster):
node.query("SYSTEM RELOAD CONFIG")
# check that eviction worked as expected
res = node.query_with_retry(
time.sleep(2)
node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
res = node.query(
"SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
check_callback=lambda result: result == "2\n",
)
assert (
res == "2\n"
@ -132,9 +133,10 @@ def test_query_cache_size_is_runtime_configurable(start_cluster):
node.query("SELECT 4 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
node.query("SELECT 5 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
res = node.query_with_retry(
time.sleep(2)
node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
res = node.query(
"SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
check_callback=lambda result: result == "1\n",
)
assert res == "1\n"