Backport #61869 to 24.3: Fix flaky tests 2 (stateless, integration)

2024-09-19 16:20:50 +00:00 · 2024-03-28 10:04:23 +00:00 · 2024-03-28 10:04:23 +00:00 · 969cc4e835
commit 969cc4e835
parent 2a929eaa7d
13 changed files with 153 additions and 34 deletions
--- a/.gitignore
+++ b/.gitignore
@ -164,6 +164,9 @@ tests/queries/0_stateless/*.generated-expect
 tests/queries/0_stateless/*.expect.history
 tests/integration/**/_gen

+# pytest --pdb history
+.pdb_history
+
 # rust
 /rust/**/target*
 # It is autogenerated from *.in
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@ -13,6 +13,34 @@ from helpers.network import _NetworkManager
 logging.raiseExceptions = False


+@pytest.fixture(scope="session", autouse=True)
+def pdb_history(request):
+    """
+    Fixture loads and saves pdb history to file, so it can be preserved between runs
+    """
+    if request.config.getoption("--pdb"):
+        import readline  # pylint:disable=import-outside-toplevel
+        import pdb  # pylint:disable=import-outside-toplevel
+
+        def save_history():
+            readline.write_history_file(".pdb_history")
+
+        def load_history():
+            try:
+                readline.read_history_file(".pdb_history")
+            except FileNotFoundError:
+                pass
+
+        load_history()
+        pdb.Pdb.use_rawinput = True
+
+        yield
+
+        save_history()
+    else:
+        yield
+
+
@pytest.fixture(autouse=True, scope="session")
 def tune_local_port_range():
    # Lots of services uses non privileged ports:
--- a/tests/integration/helpers/cluster.py
+++ b/tests/integration/helpers/cluster.py
@ -862,12 +862,12 @@ class ClickHouseCluster:

    def get_docker_handle(self, docker_id):
        exception = None
-        for i in range(5):
+        for i in range(20):
            try:
                return self.docker_client.containers.get(docker_id)
            except Exception as ex:
                print("Got exception getting docker handle", str(ex))
-                time.sleep(i * 2)
+                time.sleep(0.5)
                exception = ex
        raise exception

--- a/tests/integration/test_backup_restore_on_cluster/test.py
+++ b/tests/integration/test_backup_restore_on_cluster/test.py
@ -1,4 +1,3 @@
-from time import sleep
 import pytest
 import re
 import os.path
@ -164,8 +163,15 @@ def test_replicated_database():
    node2.query("INSERT INTO mydb.tbl VALUES (2, 'count')")
    node1.query("INSERT INTO mydb.tbl VALUES (3, 'your')")
    node2.query("INSERT INTO mydb.tbl VALUES (4, 'chickens')")
+    node1.query("OPTIMIZE TABLE mydb.tbl ON CLUSTER 'cluster' FINAL")
+
    node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")

+    # check data in sync
+    expect = TSV([[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]])
+    assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
+    assert node2.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
+
    # Make backup.
    backup_name = new_backup_name()
    node1.query(
@ -179,14 +185,63 @@ def test_replicated_database():
    node1.query(f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name}")
    node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")

-    assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV(
-        [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]]
+    assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
+    assert node2.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
+
+
+def test_replicated_database_compare_parts():
+    """
+    stop merges and fetches then write data to two nodes and
+    compare that parts are restored from single node (second) after backup
+    replica is selected by settings replica_num=2, replica_num_in_backup=2
+    """
+    node1.query(
+        "CREATE DATABASE mydb ON CLUSTER 'cluster' ENGINE=Replicated('/clickhouse/path/','{shard}','{replica}')"
    )

-    assert node2.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV(
-        [[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]]
+    node1.query(
+        "CREATE TABLE mydb.tbl(x UInt8, y String) ENGINE=ReplicatedMergeTree ORDER BY x"
    )

+    node2.query("SYSTEM SYNC DATABASE REPLICA mydb")
+
+    node1.query("SYSTEM STOP MERGES mydb.tbl")
+    node2.query("SYSTEM STOP MERGES mydb.tbl")
+
+    node1.query("SYSTEM STOP FETCHES mydb.tbl")
+    node2.query("SYSTEM STOP FETCHES mydb.tbl")
+
+    node1.query("INSERT INTO mydb.tbl VALUES (1, 'a')")
+    node1.query("INSERT INTO mydb.tbl VALUES (2, 'b')")
+
+    node2.query("INSERT INTO mydb.tbl VALUES (3, 'x')")
+    node2.query("INSERT INTO mydb.tbl VALUES (4, 'y')")
+
+    p2 = node2.query("SELECT * FROM mydb.tbl ORDER BY x")
+
+    # Make backup.
+    backup_name = new_backup_name()
+    node1.query(
+        f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica_num=2"
+    )
+
+    # Drop table on both nodes.
+    node1.query("DROP DATABASE mydb ON CLUSTER 'cluster' SYNC")
+
+    # Restore from backup on node2.
+    node1.query(
+        f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name} SETTINGS replica_num_in_backup=2"
+    )
+    node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")
+
+    # compare parts
+    p1_ = node1.query("SELECT _part, * FROM mydb.tbl ORDER BY x")
+    p2_ = node2.query("SELECT _part, * FROM mydb.tbl ORDER BY x")
+    assert p1_ == p2_
+
+    # compare data
+    assert p2 == node2.query("SELECT * FROM mydb.tbl ORDER BY x")
+

 def test_different_tables_on_nodes():
    node1.query(
@ -427,7 +482,12 @@ def test_replicated_database_async():
    node1.query("INSERT INTO mydb.tbl VALUES (22)")
    node2.query("INSERT INTO mydb.tbl2 VALUES ('a')")
    node2.query("INSERT INTO mydb.tbl2 VALUES ('bb')")
+
+    node1.query("OPTIMIZE TABLE mydb.tbl ON CLUSTER 'cluster' FINAL")
+    node1.query("OPTIMIZE TABLE mydb.tbl2 ON CLUSTER 'cluster' FINAL")
+
    node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")
+    node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl2")

    backup_name = new_backup_name()
    [id, status] = node1.query(
--- a/tests/integration/test_dictionaries_update_and_reload/test.py
+++ b/tests/integration/test_dictionaries_update_and_reload/test.py
@ -37,6 +37,16 @@ def get_status(dictionary_name):
    ).rstrip("\n")


+def get_status_retry(dictionary_name, expect, retry_count=10, sleep_time=0.5):
+    for _ in range(retry_count):
+        res = get_status(dictionary_name)
+        if res == expect:
+            return res
+        time.sleep(sleep_time)
+
+    raise Exception(f'Expected result "{expect}" did not occur')
+
+
 def get_last_exception(dictionary_name):
    return (
        instance.query(
@ -250,6 +260,15 @@ def test_reload_after_fail_by_timer(started_cluster):
    assert expected_error in instance.query_and_get_error(
        "SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))"
    )
+
+    # on sanitizers builds it can return 'FAILED_AND_RELOADING' which is not quite right
+    # add retry for these builds
+    if (
+        instance.is_built_with_sanitizer()
+        and get_status("no_file_2") == "FAILED_AND_RELOADING"
+    ):
+        get_status_retry("no_file_2", expect="FAILED")
+
    assert get_status("no_file_2") == "FAILED"

    # Creating the file source makes the dictionary able to load.
--- a/tests/integration/test_global_overcommit_tracker/configs/global_overcommit_tracker.xml
+++ b/tests/integration/test_global_overcommit_tracker/configs/global_overcommit_tracker.xml
@ -1,4 +1,4 @@
 <clickhouse>
-    <max_server_memory_usage>2000000000</max_server_memory_usage>
+    <max_server_memory_usage>1500000000</max_server_memory_usage>
    <allow_use_jemalloc_memory>false</allow_use_jemalloc_memory>
 </clickhouse>
--- a/tests/integration/test_global_overcommit_tracker/test.py
+++ b/tests/integration/test_global_overcommit_tracker/test.py
@ -22,7 +22,7 @@ def start_cluster():
        cluster.shutdown()


-GLOBAL_TEST_QUERY_A = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS memory_overcommit_ratio_denominator_for_user=1"
+GLOBAL_TEST_QUERY_A = "SELECT groupArray(number) FROM numbers(5000000) SETTINGS memory_overcommit_ratio_denominator_for_user=1"
 GLOBAL_TEST_QUERY_B = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS memory_overcommit_ratio_denominator_for_user=80000000"


@ -42,10 +42,8 @@ def test_global_overcommit():

    responses_A = list()
    responses_B = list()
-    for i in range(100):
-        if i % 2 == 0:
+    for i in range(50):
        responses_A.append(node.get_query_request(GLOBAL_TEST_QUERY_A, user="A"))
-        else:
        responses_B.append(node.get_query_request(GLOBAL_TEST_QUERY_B, user="B"))

    overcommited_killed = False
--- a/tests/integration/test_merges_memory_limit/test.py
+++ b/tests/integration/test_merges_memory_limit/test.py
@ -17,6 +17,12 @@ def start_cluster():


 def test_memory_limit_success():
+    if node.is_built_with_thread_sanitizer():
+        pytest.skip(
+            "tsan build is skipped because it slowly merges the parts, "
+            "rather than failing over the memory limit"
+        )
+
    node.query(
        "CREATE TABLE test_merge_oom ENGINE=AggregatingMergeTree ORDER BY id EMPTY AS SELECT number%1024 AS id, arrayReduce('groupArrayState', arrayMap(x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(20000)"
    )
--- a/tests/integration/test_runtime_configurable_cache_size/test.py
+++ b/tests/integration/test_runtime_configurable_cache_size/test.py
@ -1,7 +1,6 @@
 import os
-import pytest
-import shutil
 import time
+import pytest
 from helpers.cluster import ClickHouseCluster

 # Tests that sizes of in-memory caches (mark / uncompressed / index mark / index uncompressed / mmapped file / query cache) can be changed
@ -101,9 +100,10 @@ def test_query_cache_size_is_runtime_configurable(start_cluster):
    node.query("SELECT 2 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
    node.query("SELECT 3 SETTINGS use_query_cache = 1, query_cache_ttl = 1")

-    res = node.query_with_retry(
+    time.sleep(2)
+    node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
+    res = node.query(
        "SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
-        check_callback=lambda result: result == "2\n",
    )
    assert res == "2\n"

@ -116,9 +116,10 @@ def test_query_cache_size_is_runtime_configurable(start_cluster):
    node.query("SYSTEM RELOAD CONFIG")

    # check that eviction worked as expected
-    res = node.query_with_retry(
+    time.sleep(2)
+    node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
+    res = node.query(
        "SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
-        check_callback=lambda result: result == "2\n",
    )
    assert (
        res == "2\n"
@ -132,9 +133,10 @@ def test_query_cache_size_is_runtime_configurable(start_cluster):
    node.query("SELECT 4 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
    node.query("SELECT 5 SETTINGS use_query_cache = 1, query_cache_ttl = 1")

-    res = node.query_with_retry(
+    time.sleep(2)
+    node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
+    res = node.query(
        "SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
-        check_callback=lambda result: result == "1\n",
    )
    assert res == "1\n"

--- a/tests/queries/0_stateless/01287_max_execution_speed.sql
+++ b/tests/queries/0_stateless/01287_max_execution_speed.sql
@ -2,13 +2,13 @@

 SET min_execution_speed = 100000000000, timeout_before_checking_execution_speed = 0;
 SELECT count() FROM system.numbers; -- { serverError 160 }
-SELECT 'Ok (1)';
 SET min_execution_speed = 0;
+SELECT 'Ok (1)';

 SET min_execution_speed_bytes = 800000000000, timeout_before_checking_execution_speed = 0;
 SELECT count() FROM system.numbers; -- { serverError 160 }
-SELECT 'Ok (2)';
 SET min_execution_speed_bytes = 0;
+SELECT 'Ok (2)';

 SET max_execution_speed = 1000000;
 SET max_block_size = 100;
--- a/tests/queries/0_stateless/01676_clickhouse_client_autocomplete.sh
+++ b/tests/queries/0_stateless/01676_clickhouse_client_autocomplete.sh
@ -21,6 +21,10 @@ function test_completion_word()
    # - here and below you should escape variables of the expect.
    # - you should not use "expect <<..." since in this case timeout/eof will
    #   not work (I guess due to attached stdin)
+
+    # TODO: get build sanitizer and debug/release info to dynamically change test
+    # like here timeout 120 seconds is too big for release build
+    # but ok for sanitizer builds
    cat > "$SCRIPT_PATH" << EOF
 # NOTE: log will be appended
 exp_internal -f $CLICKHOUSE_TMP/$(basename "${BASH_SOURCE[0]}").debuglog 0
@ -30,7 +34,7 @@ exp_internal -f $CLICKHOUSE_TMP/$(basename "${BASH_SOURCE[0]}").debuglog 0
 set stdout_channel [open "/dev/stdout" w]

 log_user 0
-set timeout 60
+set timeout 120
 match_max 100000
 expect_after {
    # Do not ignore eof from expect
--- a/tests/queries/0_stateless/03002_filter_skip_virtual_columns_with_non_deterministic_functions.sql
+++ b/tests/queries/0_stateless/03002_filter_skip_virtual_columns_with_non_deterministic_functions.sql
@ -1,6 +1,5 @@
 create table test (number UInt64) engine=MergeTree order by number;
-insert into test select * from numbers(100000000);
+insert into test select * from numbers(50000000);
 select ignore(number) from test where RAND() > 4292390314 limit 10;
 select count() > 0 from test where RAND() > 4292390314;
 drop table test;
-
--- a/tests/queries/0_stateless/03006_correct_revoke_for_partial_rights.sh
+++ b/tests/queries/0_stateless/03006_correct_revoke_for_partial_rights.sh
@ -5,8 +5,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 . "$CURDIR"/../shell_config.sh

 db=${CLICKHOUSE_DATABASE}
-user1="user1_03006_$db_$RANDOM"
-user2="user2_03006_$db_$RANDOM"
+user1="user1_03006_${db}_$RANDOM"
+user2="user2_03006_${db}_$RANDOM"

 ${CLICKHOUSE_CLIENT} --multiquery <<EOF
 DROP DATABASE IF EXISTS $db;