ClickHouse/tests/ci/stress.py

400 lines
15 KiB
Python
Raw Normal View History

2020-10-02 16:54:07 +00:00
#!/usr/bin/env python3
2020-07-08 08:41:39 +00:00
# -*- coding: utf-8 -*-
2023-09-27 14:27:37 +00:00
"""This script is used in docker images for stress tests and upgrade tests"""
import argparse
import logging
import random
2023-09-27 14:27:37 +00:00
import time
2024-02-26 18:25:02 +00:00
from multiprocessing import cpu_count
from pathlib import Path
from subprocess import PIPE, STDOUT, Popen, call, check_output
from typing import List
2020-07-08 08:41:39 +00:00
2023-09-27 14:27:37 +00:00
def get_options(i: int, upgrade_check: bool) -> str:
2021-06-03 15:13:29 +00:00
options = []
2021-07-29 12:16:34 +00:00
client_options = []
2023-01-03 20:13:12 +00:00
if i > 0:
2021-06-03 15:13:29 +00:00
options.append("--order=random")
2020-10-15 20:48:10 +00:00
if i % 3 == 2 and not upgrade_check:
2023-09-27 14:27:37 +00:00
options.append(f'''--db-engine="Replicated('/test/db/test_{i}', 's1', 'r1')"''')
client_options.append("enable_deflate_qpl_codec=1")
client_options.append("enable_zstd_qat_codec=1")
2021-01-25 18:59:23 +00:00
2020-10-15 18:14:04 +00:00
# If database name is not specified, new database is created for each functional test.
# Run some threads with one database for all tests.
2021-01-25 18:59:23 +00:00
if i % 2 == 1:
2023-09-27 14:27:37 +00:00
options.append(f" --database=test_{i}")
2020-10-15 18:14:04 +00:00
if i % 3 == 1:
2021-07-29 12:16:34 +00:00
client_options.append("join_use_nulls=1")
if i % 2 == 1:
join_alg_num = i // 2
if join_alg_num % 5 == 0:
client_options.append("join_algorithm='parallel_hash'")
if join_alg_num % 5 == 1:
client_options.append("join_algorithm='partial_merge'")
if join_alg_num % 5 == 2:
client_options.append("join_algorithm='full_sorting_merge'")
2023-06-25 03:07:15 +00:00
if join_alg_num % 5 == 3 and not upgrade_check:
# Some crashes are not fixed in 23.2 yet, so ignore the setting in Upgrade check
client_options.append("join_algorithm='grace_hash'")
if join_alg_num % 5 == 4:
client_options.append("join_algorithm='auto'")
2023-01-03 19:32:11 +00:00
client_options.append("max_rows_in_join=1000")
2023-03-22 12:55:24 +00:00
if i > 0 and random.random() < 1 / 3:
client_options.append("use_query_cache=1")
if i % 5 == 1:
client_options.append("memory_tracker_fault_probability=0.001")
2020-08-13 17:55:12 +00:00
2024-03-06 09:06:19 +00:00
if i % 5 == 1:
2024-03-06 09:20:51 +00:00
client_options.append(
"merge_tree_read_split_ranges_into_intersecting_and_non_intersecting_injection_probability=0.05"
)
2024-03-06 09:06:19 +00:00
if i % 2 == 1 and not upgrade_check:
client_options.append("group_by_use_nulls=1")
2020-08-13 17:55:12 +00:00
2023-02-21 19:16:05 +00:00
# 12 % 3 == 0, so it's Atomic database
if i == 12 and not upgrade_check:
client_options.append("implicit_transaction=1")
client_options.append("throw_on_unsupported_query_inside_transaction=0")
if random.random() < 0.1:
client_options.append("optimize_trivial_approximate_count_query=1")
if random.random() < 0.3:
client_options.append(f"http_make_head_request={random.randint(0, 1)}")
# TODO: After release 24.3 use ignore_drop_queries_probability for both
# stress test and upgrade check
if not upgrade_check:
client_options.append("ignore_drop_queries_probability=0.5")
2024-05-15 17:48:29 +00:00
if random.random() < 0.2:
client_options.append("allow_experimental_parallel_reading_from_replicas=1")
2024-05-15 18:28:05 +00:00
client_options.append("max_parallel_replicas=3")
client_options.append("cluster_for_parallel_replicas='parallel_replicas'")
client_options.append("parallel_replicas_for_non_replicated_merge_tree=1")
2024-05-15 17:48:29 +00:00
2021-07-29 12:16:34 +00:00
if client_options:
options.append(" --client-option " + " ".join(client_options))
return " ".join(options)
def run_func_test(
2023-09-27 14:27:37 +00:00
cmd: str,
output_prefix: Path,
num_processes: int,
skip_tests_option: str,
global_time_limit: int,
upgrade_check: bool,
) -> List[Popen]:
2023-01-03 19:32:11 +00:00
upgrade_check_option = "--upgrade-check" if upgrade_check else ""
2023-09-27 14:27:37 +00:00
global_time_limit_option = (
f"--global_time_limit={global_time_limit}" if global_time_limit else ""
)
2020-08-26 17:44:03 +00:00
output_paths = [
2023-09-27 14:27:37 +00:00
output_prefix / f"stress_test_run_{i}.txt" for i in range(num_processes)
]
2020-08-13 17:55:12 +00:00
pipes = []
2023-02-24 16:58:36 +00:00
for i, path in enumerate(output_paths):
2024-02-26 18:25:02 +00:00
with open(path, "w", encoding="utf-8") as op:
2023-09-27 14:27:37 +00:00
full_command = (
f"{cmd} {get_options(i, upgrade_check)} {global_time_limit_option} "
f"{skip_tests_option} {upgrade_check_option}"
)
logging.info("Run func tests '%s'", full_command)
2024-02-26 18:25:02 +00:00
# pylint:disable-next=consider-using-with
2023-09-27 14:27:37 +00:00
pipes.append(Popen(full_command, shell=True, stdout=op, stderr=op))
time.sleep(0.5)
return pipes
2023-09-27 14:27:37 +00:00
def compress_stress_logs(output_path: Path, files_prefix: str) -> None:
cmd = (
f"cd {output_path} && tar --zstd --create --file=stress_run_logs.tar.zst "
f"{files_prefix}* && rm {files_prefix}*"
)
2021-06-01 09:45:01 +00:00
check_output(cmd, shell=True)
2023-09-27 14:27:37 +00:00
def call_with_retry(query: str, timeout: int = 30, retry_count: int = 5) -> None:
2023-10-23 15:27:38 +00:00
logging.info("Running command: %s", str(query))
2021-11-30 10:19:03 +00:00
for i in range(retry_count):
code = call(query, shell=True, stderr=STDOUT, timeout=timeout)
if code != 0:
2023-10-23 15:27:38 +00:00
logging.info("Command returend %s, retrying", str(code))
2021-11-30 10:19:03 +00:00
time.sleep(i)
else:
break
2023-09-27 14:27:37 +00:00
def make_query_command(query: str) -> str:
return (
f'clickhouse client -q "{query}" --max_untracked_memory=1Gi '
2023-11-24 20:11:23 +00:00
"--memory_profiler_step=1Gi --max_memory_usage_for_user=0 --max_memory_usage_in_client=1000000000"
2023-09-27 14:27:37 +00:00
)
2021-11-30 10:19:03 +00:00
2023-09-27 14:27:37 +00:00
def prepare_for_hung_check(drop_databases: bool) -> bool:
2021-02-24 16:41:44 +00:00
# FIXME this function should not exist, but...
# We attach gdb to clickhouse-server before running tests
# to print stacktraces of all crashes even if clickhouse cannot print it for some reason.
# However, it obstructs checking for hung queries.
2021-02-24 16:41:44 +00:00
logging.info("Will terminate gdb (if any)")
2021-11-30 10:19:03 +00:00
call_with_retry("kill -TERM $(pidof gdb)")
2023-11-06 18:31:15 +00:00
call_with_retry(
"timeout 50s tail --pid=$(pidof gdb) -f /dev/null || kill -9 $(pidof gdb) ||:",
timeout=60,
)
# Sometimes there is a message `Child process was stopped by signal 19` in logs after stopping gdb
2023-10-20 19:06:01 +00:00
call_with_retry(
"kill -CONT $(cat /var/run/clickhouse-server/clickhouse-server.pid) && clickhouse client -q 'SELECT 1 FORMAT Null'"
2023-10-20 19:06:01 +00:00
)
2021-02-24 16:41:44 +00:00
2022-04-14 11:35:46 +00:00
# ThreadFuzzer significantly slows down server and causes false-positive hung check failures
call_with_retry(make_query_command("SYSTEM STOP THREAD FUZZER"))
2021-02-24 16:41:44 +00:00
# Some tests execute SYSTEM STOP MERGES or similar queries.
# It may cause some ALTERs to hang.
# Possibly we should fix tests and forbid to use such queries without specifying table.
call_with_retry(make_query_command("SYSTEM START MERGES"))
call_with_retry(make_query_command("SYSTEM START DISTRIBUTED SENDS"))
call_with_retry(make_query_command("SYSTEM START TTL MERGES"))
call_with_retry(make_query_command("SYSTEM START MOVES"))
call_with_retry(make_query_command("SYSTEM START FETCHES"))
call_with_retry(make_query_command("SYSTEM START REPLICATED SENDS"))
call_with_retry(make_query_command("SYSTEM START REPLICATION QUEUES"))
call_with_retry(make_query_command("SYSTEM DROP MARK CACHE"))
2021-02-24 16:41:44 +00:00
2021-03-15 14:23:04 +00:00
# Issue #21004, live views are experimental, so let's just suppress it
call_with_retry(make_query_command("KILL QUERY WHERE upper(query) LIKE 'WATCH %'"))
2021-03-15 14:23:04 +00:00
2021-04-13 13:45:08 +00:00
# Kill other queries which known to be slow
2023-09-27 14:27:37 +00:00
# It's query from 01232_preparing_sets_race_condition_long,
# it may take up to 1000 seconds in slow builds
call_with_retry(
make_query_command("KILL QUERY WHERE query LIKE 'insert into tableB select %'")
)
2021-04-13 13:45:08 +00:00
# Long query from 00084_external_agregation
call_with_retry(
make_query_command(
2023-09-27 14:27:37 +00:00
"KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM "
"test.hits GROUP BY URL ORDER BY u %'"
)
)
2023-01-24 11:36:29 +00:00
# Long query from 02136_kill_scalar_queries
call_with_retry(
make_query_command(
2023-09-27 14:27:37 +00:00
"KILL QUERY WHERE query LIKE "
"'SELECT (SELECT number FROM system.numbers WHERE number = 1000000000000)%'"
2023-01-24 11:36:29 +00:00
)
)
2021-04-13 13:45:08 +00:00
2021-06-03 15:13:29 +00:00
if drop_databases:
2021-11-30 10:19:03 +00:00
for i in range(5):
try:
2023-09-27 14:27:37 +00:00
# Here we try to drop all databases in async mode.
# If some queries really hung, than drop will hung too.
# Otherwise we will get rid of queries which wait for background pool.
# It can take a long time on slow builds (more than 900 seconds).
#
2021-12-03 18:27:40 +00:00
# Also specify max_untracked_memory to allow 1GiB of memory to overcommit.
databases = (
check_output(
make_query_command("SHOW DATABASES"), shell=True, timeout=30
)
.decode("utf-8")
.strip()
.split()
)
2021-11-30 10:19:03 +00:00
for db in databases:
if db == "system":
continue
2023-01-03 19:32:11 +00:00
command = make_query_command(f"DETACH DATABASE {db}")
2021-11-30 10:19:03 +00:00
# we don't wait for drop
2024-02-26 18:25:02 +00:00
# pylint:disable-next=consider-using-with
2021-11-30 10:19:03 +00:00
Popen(command, shell=True)
break
except Exception as ex:
logging.error(
"Failed to SHOW or DROP databasese, will retry %s", str(ex)
)
2021-11-30 10:19:03 +00:00
time.sleep(i)
else:
2024-02-26 18:25:02 +00:00
raise RuntimeError(
2023-09-27 14:27:37 +00:00
"Cannot drop databases after stress tests. Probably server consumed "
"too much memory and cannot execute simple queries"
)
2021-06-03 15:13:29 +00:00
2021-04-13 13:45:08 +00:00
# Wait for last queries to finish if any, not longer than 300 seconds
call(
make_query_command(
"""
2023-08-06 11:07:48 +00:00
SELECT sleepEachRow((
SELECT maxOrDefault(300 - elapsed) + 1
FROM system.processes
WHERE query NOT LIKE '%FROM system.processes%' AND elapsed < 300
) / 300)
2023-08-06 11:07:48 +00:00
FROM numbers(300)
FORMAT Null
SETTINGS function_sleep_max_microseconds_per_block = 0
"""
),
shell=True,
stderr=STDOUT,
timeout=330,
)
2021-04-13 13:45:08 +00:00
# Even if all clickhouse-test processes are finished, there are probably some sh scripts,
# which still run some new queries. Let's ignore them.
try:
2023-09-27 14:27:37 +00:00
query = 'clickhouse client -q "SELECT count() FROM system.processes where elapsed > 300" '
output = (
check_output(query, shell=True, stderr=STDOUT, timeout=30)
.decode("utf-8")
.strip()
)
2021-04-13 13:45:08 +00:00
if int(output) == 0:
return False
except:
pass
return True
2020-07-08 08:41:39 +00:00
2023-03-10 13:53:47 +00:00
2023-09-27 14:27:37 +00:00
def is_ubsan_build() -> bool:
2023-03-10 13:36:05 +00:00
try:
2023-09-27 14:27:37 +00:00
query = (
'clickhouse client -q "SELECT value FROM system.build_options '
"WHERE name = 'CXX_FLAGS'\" "
)
2023-03-10 13:36:05 +00:00
output = (
check_output(query, shell=True, stderr=STDOUT, timeout=30)
.decode("utf-8")
.strip()
)
2023-03-10 15:09:32 +00:00
return "-fsanitize=undefined" in output
2023-03-10 13:36:05 +00:00
except Exception as e:
2023-03-10 14:22:09 +00:00
logging.info("Failed to get build flags: %s", str(e))
2023-03-10 13:36:05 +00:00
return False
2023-09-27 14:27:37 +00:00
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="ClickHouse script for running stresstest"
)
parser.add_argument("--test-cmd", default="/usr/bin/clickhouse-test")
parser.add_argument("--skip-func-tests", default="")
2023-09-27 14:27:37 +00:00
parser.add_argument(
"--server-log-folder", default="/var/log/clickhouse-server", type=Path
)
parser.add_argument("--output-folder", type=Path)
2021-11-25 14:54:50 +00:00
parser.add_argument("--global-time-limit", type=int, default=1800)
2021-06-01 09:45:01 +00:00
parser.add_argument("--num-parallel", type=int, default=cpu_count())
parser.add_argument("--upgrade-check", action="store_true")
parser.add_argument("--hung-check", action="store_true", default=False)
2021-06-03 15:13:29 +00:00
# make sense only for hung check
parser.add_argument("--drop-databases", action="store_true", default=False)
2023-09-27 14:27:37 +00:00
return parser.parse_args()
def main():
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
args = parse_args()
2021-06-03 15:13:29 +00:00
if args.drop_databases and not args.hung_check:
2024-02-26 18:25:02 +00:00
raise argparse.ArgumentTypeError(
"--drop-databases only used in hung check (--hung-check)"
)
2023-03-10 13:36:05 +00:00
2023-09-27 14:27:37 +00:00
# FIXME Hung check with ubsan is temporarily disabled due to
# https://github.com/ClickHouse/ClickHouse/issues/45372
2023-03-10 13:36:05 +00:00
suppress_hung_check = is_ubsan_build()
func_pipes = []
func_pipes = run_func_test(
args.test_cmd,
args.output_folder,
args.num_parallel,
args.skip_func_tests,
args.global_time_limit,
args.upgrade_check,
)
logging.info("Will wait functests to finish")
while True:
retcodes = []
for p in func_pipes:
if p.poll() is not None:
retcodes.append(p.returncode)
if len(retcodes) == len(func_pipes):
break
logging.info("Finished %s from %s processes", len(retcodes), len(func_pipes))
time.sleep(5)
2021-02-16 19:39:25 +00:00
logging.info("All processes finished")
2021-06-01 09:45:01 +00:00
logging.info("Compressing stress logs")
compress_stress_logs(args.output_folder, "stress_test_run_")
logging.info("Logs compressed")
2021-02-16 19:39:25 +00:00
if args.hung_check:
2022-04-12 09:22:03 +00:00
try:
have_long_running_queries = prepare_for_hung_check(args.drop_databases)
except Exception as ex:
have_long_running_queries = True
logging.error("Failed to prepare for hung check: %s", str(ex))
2021-02-16 19:39:25 +00:00
logging.info("Checking if some queries hung")
cmd = " ".join(
[
args.test_cmd,
# Do not track memory allocations up to 1Gi,
# this will allow to ignore server memory limit (max_server_memory_usage) for this query.
#
# NOTE: memory_profiler_step should be also adjusted, because:
#
# untracked_memory_limit = min(settings.max_untracked_memory, settings.memory_profiler_step)
#
# NOTE: that if there will be queries with GROUP BY, this trick
# will not work due to CurrentMemoryTracker::check() from
# Aggregator code.
# But right now it should work, since neither hung check, nor 00001_select_1 has GROUP BY.
"--client-option",
"max_untracked_memory=1Gi",
"max_memory_usage_for_user=0",
"memory_profiler_step=1Gi",
# Use system database to avoid CREATE/DROP DATABASE queries
"--database=system",
"--hung-check",
2022-12-23 14:48:26 +00:00
"--report-logs-stats",
"00001_select_1",
]
)
2023-09-27 14:27:37 +00:00
hung_check_log = args.output_folder / "hung_check.log" # type: Path
2024-02-26 18:25:02 +00:00
with Popen(["/usr/bin/tee", hung_check_log], stdin=PIPE) as tee:
res = call(cmd, shell=True, stdout=tee.stdin, stderr=STDOUT, timeout=600)
if tee.stdin is not None:
tee.stdin.close()
2023-03-10 13:36:05 +00:00
if res != 0 and have_long_running_queries and not suppress_hung_check:
2023-02-09 14:01:32 +00:00
logging.info("Hung check failed with exit code %d", res)
else:
hung_check_status = "No queries hung\tOK\t\\N\t\n"
2024-02-26 18:25:02 +00:00
with open(
args.output_folder / "test_results.tsv", "w+", encoding="utf-8"
) as results:
results.write(hung_check_status)
2023-09-27 14:27:37 +00:00
hung_check_log.unlink()
2021-02-16 19:39:25 +00:00
logging.info("Stress test finished")
2023-09-27 14:27:37 +00:00
if __name__ == "__main__":
main()