Backport #61869 to 24.3: Fix flaky tests 2 (stateless, integration)

This commit is contained in:
robot-clickhouse 2024-03-28 10:04:23 +00:00
parent 2a929eaa7d
commit 969cc4e835
13 changed files with 153 additions and 34 deletions

3
.gitignore vendored
View File

@ -164,6 +164,9 @@ tests/queries/0_stateless/*.generated-expect
tests/queries/0_stateless/*.expect.history
tests/integration/**/_gen
# pytest --pdb history
.pdb_history
# rust
/rust/**/target*
# It is autogenerated from *.in

View File

@ -13,6 +13,34 @@ from helpers.network import _NetworkManager
logging.raiseExceptions = False
@pytest.fixture(scope="session", autouse=True)
def pdb_history(request):
    """
    Keep the pdb command history across test sessions.

    When pytest is started with ``--pdb``, the readline history is read from
    ``.pdb_history`` before the session starts and written back to the same
    file once the session is over. Without ``--pdb`` the fixture does nothing.
    """
    if not request.config.getoption("--pdb"):
        yield
        return

    import readline  # pylint:disable=import-outside-toplevel
    import pdb  # pylint:disable=import-outside-toplevel

    history_file = ".pdb_history"

    # A missing history file is not an error: there is simply nothing to load yet.
    try:
        readline.read_history_file(history_file)
    except FileNotFoundError:
        pass

    pdb.Pdb.use_rawinput = True

    yield

    readline.write_history_file(history_file)
@pytest.fixture(autouse=True, scope="session")
def tune_local_port_range():
# Lots of services use non-privileged ports:

View File

@ -862,12 +862,12 @@ class ClickHouseCluster:
def get_docker_handle(self, docker_id, retries=20, delay=0.5):
    """
    Return the docker container handle for `docker_id`, retrying on failure.

    `self.docker_client.containers.get(docker_id)` is attempted up to
    `retries` times; after every failed attempt the error is printed and the
    call pauses for `delay` seconds before trying again.

    Parameters:
        docker_id: identifier passed to `docker_client.containers.get`.
        retries: maximum number of attempts (default 20).
        delay: pause in seconds after each failed attempt (default 0.5).

    Returns:
        Whatever `docker_client.containers.get` returns.

    Raises:
        ValueError: if `retries` is smaller than 1.
        Exception: the exception raised by the last failed attempt.
    """
    if retries < 1:
        # Without at least one attempt there would be no exception to re-raise.
        raise ValueError("retries must be at least 1")

    exception = None
    for _ in range(retries):
        try:
            return self.docker_client.containers.get(docker_id)
        except Exception as ex:
            print("Got exception getting docker handle", str(ex))
            time.sleep(delay)
            # Remember the most recent failure so it can be re-raised below.
            exception = ex
    raise exception

View File

@ -1,4 +1,3 @@
from time import sleep
import pytest
import re
import os.path
@ -164,8 +163,15 @@ def test_replicated_database():
node2.query("INSERT INTO mydb.tbl VALUES (2, 'count')")
node1.query("INSERT INTO mydb.tbl VALUES (3, 'your')")
node2.query("INSERT INTO mydb.tbl VALUES (4, 'chickens')")
node1.query("OPTIMIZE TABLE mydb.tbl ON CLUSTER 'cluster' FINAL")
node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")
# check data in sync
expect = TSV([[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]])
assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
assert node2.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
# Make backup.
backup_name = new_backup_name()
node1.query(
@ -179,14 +185,63 @@ def test_replicated_database():
node1.query(f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name}")
node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")
assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == TSV(
[[1, "Don\\'t"], [2, "count"], [3, "your"], [4, "chickens"]]
assert node1.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
assert node2.query("SELECT * FROM mydb.tbl ORDER BY x") == expect
def test_replicated_database_compare_parts():
    """
    Stop merges and fetches, then write data to two nodes and check that
    after backup and restore the parts come from a single node (the second).
    The replica is selected by the settings replica_num=2 (BACKUP) and
    replica_num_in_backup=2 (RESTORE).
    """
    node1.query(
        "CREATE DATABASE mydb ON CLUSTER 'cluster' ENGINE=Replicated('/clickhouse/path/','{shard}','{replica}')"
    )

    node1.query(
        "CREATE TABLE mydb.tbl(x UInt8, y String) ENGINE=ReplicatedMergeTree ORDER BY x"
    )

    node2.query("SYSTEM SYNC DATABASE REPLICA mydb")

    # Stop merges and fetches on both nodes before inserting.
    node1.query("SYSTEM STOP MERGES mydb.tbl")
    node2.query("SYSTEM STOP MERGES mydb.tbl")

    node1.query("SYSTEM STOP FETCHES mydb.tbl")
    node2.query("SYSTEM STOP FETCHES mydb.tbl")

    node1.query("INSERT INTO mydb.tbl VALUES (1, 'a')")
    node1.query("INSERT INTO mydb.tbl VALUES (2, 'b')")

    node2.query("INSERT INTO mydb.tbl VALUES (3, 'x')")
    node2.query("INSERT INTO mydb.tbl VALUES (4, 'y')")

    # Remember what node2 sees before the backup; compared again after the restore.
    p2 = node2.query("SELECT * FROM mydb.tbl ORDER BY x")

    # Make backup.
    backup_name = new_backup_name()
    node1.query(
        f"BACKUP DATABASE mydb ON CLUSTER 'cluster' TO {backup_name} SETTINGS replica_num=2"
    )

    # Drop the database on the whole cluster.
    node1.query("DROP DATABASE mydb ON CLUSTER 'cluster' SYNC")

    # Restore on the whole cluster, using replica 2 of the backup.
    node1.query(
        f"RESTORE DATABASE mydb ON CLUSTER 'cluster' FROM {backup_name} SETTINGS replica_num_in_backup=2"
    )
    node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")

    # compare parts
    p1_ = node1.query("SELECT _part, * FROM mydb.tbl ORDER BY x")
    p2_ = node2.query("SELECT _part, * FROM mydb.tbl ORDER BY x")
    assert p1_ == p2_

    # compare data
    assert p2 == node2.query("SELECT * FROM mydb.tbl ORDER BY x")
def test_different_tables_on_nodes():
node1.query(
@ -427,7 +482,12 @@ def test_replicated_database_async():
node1.query("INSERT INTO mydb.tbl VALUES (22)")
node2.query("INSERT INTO mydb.tbl2 VALUES ('a')")
node2.query("INSERT INTO mydb.tbl2 VALUES ('bb')")
node1.query("OPTIMIZE TABLE mydb.tbl ON CLUSTER 'cluster' FINAL")
node1.query("OPTIMIZE TABLE mydb.tbl2 ON CLUSTER 'cluster' FINAL")
node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl")
node1.query("SYSTEM SYNC REPLICA ON CLUSTER 'cluster' mydb.tbl2")
backup_name = new_backup_name()
[id, status] = node1.query(

View File

@ -37,6 +37,16 @@ def get_status(dictionary_name):
).rstrip("\n")
def get_status_retry(dictionary_name, expect, retry_count=10, sleep_time=0.5):
    """
    Poll the status of `dictionary_name` until it equals `expect`.

    The status is read with `get_status` up to `retry_count` times, pausing
    `sleep_time` seconds after every attempt that did not match.

    Returns the matching status. Raises `Exception` if none of the attempts
    produced the expected status.
    """
    for _attempt in range(retry_count):
        status = get_status(dictionary_name)
        if status != expect:
            # Not there yet: wait a bit before asking again.
            time.sleep(sleep_time)
            continue
        return status

    raise Exception(f'Expected result "{expect}" did not occur')
def get_last_exception(dictionary_name):
return (
instance.query(
@ -250,6 +260,15 @@ def test_reload_after_fail_by_timer(started_cluster):
assert expected_error in instance.query_and_get_error(
"SELECT dictGetInt32('no_file_2', 'a', toUInt64(9))"
)
# On sanitizer builds the status can be 'FAILED_AND_RELOADING' here, which is not quite right,
# so retry on those builds until it becomes 'FAILED'.
if (
instance.is_built_with_sanitizer()
and get_status("no_file_2") == "FAILED_AND_RELOADING"
):
get_status_retry("no_file_2", expect="FAILED")
assert get_status("no_file_2") == "FAILED"
# Creating the file source makes the dictionary able to load.

View File

@ -1,4 +1,4 @@
<clickhouse>
<max_server_memory_usage>2000000000</max_server_memory_usage>
<max_server_memory_usage>1500000000</max_server_memory_usage>
<allow_use_jemalloc_memory>false</allow_use_jemalloc_memory>
</clickhouse>

View File

@ -22,7 +22,7 @@ def start_cluster():
cluster.shutdown()
GLOBAL_TEST_QUERY_A = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS memory_overcommit_ratio_denominator_for_user=1"
GLOBAL_TEST_QUERY_A = "SELECT groupArray(number) FROM numbers(5000000) SETTINGS memory_overcommit_ratio_denominator_for_user=1"
GLOBAL_TEST_QUERY_B = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS memory_overcommit_ratio_denominator_for_user=80000000"
@ -42,10 +42,8 @@ def test_global_overcommit():
responses_A = list()
responses_B = list()
for i in range(100):
if i % 2 == 0:
for i in range(50):
responses_A.append(node.get_query_request(GLOBAL_TEST_QUERY_A, user="A"))
else:
responses_B.append(node.get_query_request(GLOBAL_TEST_QUERY_B, user="B"))
overcommited_killed = False

View File

@ -17,6 +17,12 @@ def start_cluster():
def test_memory_limit_success():
if node.is_built_with_thread_sanitizer():
pytest.skip(
"tsan build is skipped because it slowly merges the parts, "
"rather than failing over the memory limit"
)
node.query(
"CREATE TABLE test_merge_oom ENGINE=AggregatingMergeTree ORDER BY id EMPTY AS SELECT number%1024 AS id, arrayReduce('groupArrayState', arrayMap(x-> randomPrintableASCII(100), range(8192))) fat_state FROM numbers(20000)"
)

View File

@ -1,7 +1,6 @@
import os
import pytest
import shutil
import time
import pytest
from helpers.cluster import ClickHouseCluster
# Tests that sizes of in-memory caches (mark / uncompressed / index mark / index uncompressed / mmapped file / query cache) can be changed
@ -101,9 +100,10 @@ def test_query_cache_size_is_runtime_configurable(start_cluster):
node.query("SELECT 2 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
node.query("SELECT 3 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
res = node.query_with_retry(
time.sleep(2)
node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
res = node.query(
"SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
check_callback=lambda result: result == "2\n",
)
assert res == "2\n"
@ -116,9 +116,10 @@ def test_query_cache_size_is_runtime_configurable(start_cluster):
node.query("SYSTEM RELOAD CONFIG")
# check that eviction worked as expected
res = node.query_with_retry(
time.sleep(2)
node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
res = node.query(
"SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
check_callback=lambda result: result == "2\n",
)
assert (
res == "2\n"
@ -132,9 +133,10 @@ def test_query_cache_size_is_runtime_configurable(start_cluster):
node.query("SELECT 4 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
node.query("SELECT 5 SETTINGS use_query_cache = 1, query_cache_ttl = 1")
res = node.query_with_retry(
time.sleep(2)
node.query("SYSTEM RELOAD ASYNCHRONOUS METRICS")
res = node.query(
"SELECT value FROM system.asynchronous_metrics WHERE metric = 'QueryCacheEntries'",
check_callback=lambda result: result == "1\n",
)
assert res == "1\n"

View File

@ -2,13 +2,13 @@
SET min_execution_speed = 100000000000, timeout_before_checking_execution_speed = 0;
SELECT count() FROM system.numbers; -- { serverError 160 }
SELECT 'Ok (1)';
SET min_execution_speed = 0;
SELECT 'Ok (1)';
SET min_execution_speed_bytes = 800000000000, timeout_before_checking_execution_speed = 0;
SELECT count() FROM system.numbers; -- { serverError 160 }
SELECT 'Ok (2)';
SET min_execution_speed_bytes = 0;
SELECT 'Ok (2)';
SET max_execution_speed = 1000000;
SET max_block_size = 100;

View File

@ -21,6 +21,10 @@ function test_completion_word()
# - here and below you should escape variables of the expect.
# - you should not use "expect <<..." since in this case timeout/eof will
# not work (I guess due to attached stdin)
# TODO: detect the build type (sanitizer, debug/release) and adjust the test dynamically;
# e.g. the 120-second timeout below is too long for a release build
# but fine for sanitizer builds.
cat > "$SCRIPT_PATH" << EOF
# NOTE: log will be appended
exp_internal -f $CLICKHOUSE_TMP/$(basename "${BASH_SOURCE[0]}").debuglog 0
@ -30,7 +34,7 @@ exp_internal -f $CLICKHOUSE_TMP/$(basename "${BASH_SOURCE[0]}").debuglog 0
set stdout_channel [open "/dev/stdout" w]
log_user 0
set timeout 60
set timeout 120
match_max 100000
expect_after {
# Do not ignore eof from expect

View File

@ -1,6 +1,5 @@
create table test (number UInt64) engine=MergeTree order by number;
insert into test select * from numbers(100000000);
insert into test select * from numbers(50000000);
select ignore(number) from test where RAND() > 4292390314 limit 10;
select count() > 0 from test where RAND() > 4292390314;
drop table test;

View File

@ -5,8 +5,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
. "$CURDIR"/../shell_config.sh
db=${CLICKHOUSE_DATABASE}
user1="user1_03006_$db_$RANDOM"
user2="user2_03006_$db_$RANDOM"
user1="user1_03006_${db}_$RANDOM"
user2="user2_03006_${db}_$RANDOM"
${CLICKHOUSE_CLIENT} --multiquery <<EOF
DROP DATABASE IF EXISTS $db;