ClickHouse/tests/ci/integration_tests_runner.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

1106 lines
41 KiB
Python
Raw Normal View History

2021-02-23 07:24:51 +00:00
#!/usr/bin/env python3
import csv
import glob
import json
2021-02-23 07:24:51 +00:00
import logging
import os
import random
import re
import shlex
import shutil
2024-08-15 11:11:10 +00:00
import signal
import string
import subprocess
2023-05-29 14:34:31 +00:00
import sys
import time
from collections import defaultdict
2024-02-26 20:24:45 +00:00
from itertools import chain
from typing import Any, Dict, List, Optional
2021-02-23 07:24:51 +00:00
from ci_utils import kill_ci_runner
2024-06-10 09:18:03 +00:00
from env_helper import IS_CI
from integration_test_images import IMAGES
2024-08-15 11:11:10 +00:00
from report import JOB_TIMEOUT_TEST_NAME
from stopwatch import Stopwatch
from tee_popen import TeePopen
2021-02-23 07:24:51 +00:00
2023-01-02 17:56:24 +00:00
MAX_RETRY = 1
2021-05-14 14:15:08 +00:00
NUM_WORKERS = 5
2021-02-23 07:24:51 +00:00
SLEEP_BETWEEN_RETRIES = 5
2021-05-31 15:46:32 +00:00
PARALLEL_GROUP_SIZE = 100
2021-11-04 15:53:45 +00:00
CLICKHOUSE_BINARY_PATH = "usr/bin/clickhouse"
CLICKHOUSE_ODBC_BRIDGE_BINARY_PATH = "usr/bin/clickhouse-odbc-bridge"
CLICKHOUSE_LIBRARY_BRIDGE_BINARY_PATH = "usr/bin/clickhouse-library-bridge"
2021-02-23 07:24:51 +00:00
2024-08-28 16:18:57 +00:00
FLAKY_TRIES_COUNT = 3 # run whole pytest several times
FLAKY_REPEAT_COUNT = 5 # runs test case in single module several times
2021-02-24 16:41:44 +00:00
MAX_TIME_SECONDS = 3600
MAX_TIME_IN_SANDBOX = 20 * 60 # 20 minutes
TASK_TIMEOUT = 8 * 60 * 60 # 8 hours
2022-03-18 12:36:45 +00:00
NO_CHANGES_MSG = "Nothing to run"
2021-02-24 16:41:44 +00:00
# Search test by the common prefix.
# This is accept tests w/o parameters in skip list.
#
# Examples:
# - has_test(['foobar'], 'foobar[param]') == True
# - has_test(['foobar[param]'], 'foobar') == True
def has_test(tests, test_to_match):
for test in tests:
if len(test_to_match) < len(test):
if test[0 : len(test_to_match)] == test_to_match:
return True
else:
if test_to_match[0 : len(test)] == test:
return True
return False
2022-03-11 14:14:16 +00:00
def get_changed_tests_to_run(pr_info, repo_path):
result = set()
changed_files = pr_info["changed_files"]
2021-02-24 16:41:44 +00:00
2021-02-24 23:09:09 +00:00
if changed_files is None:
2021-02-24 16:41:44 +00:00
return []
2021-02-24 23:09:09 +00:00
for fpath in changed_files:
2024-08-12 13:48:44 +00:00
if re.search(r"tests/integration/test_.*/test.*\.py", fpath) is not None:
logging.info("File %s changed and seems like integration test", fpath)
2024-08-12 13:30:25 +00:00
result.add("/".join(fpath.split("/")[2:]))
2022-03-11 14:14:16 +00:00
return filter_existing_tests(result, repo_path)
2021-02-24 16:41:44 +00:00
def filter_existing_tests(tests_to_run, repo_path):
result = []
for relative_test_path in tests_to_run:
if os.path.exists(
os.path.join(repo_path, "tests/integration", relative_test_path)
):
2021-02-24 16:41:44 +00:00
result.append(relative_test_path)
else:
logging.info(
"Skipping test %s, seems like it was removed", relative_test_path
)
2021-02-24 16:41:44 +00:00
return result
2021-02-23 07:24:51 +00:00
def _get_deselect_option(tests):
return " ".join([f"--deselect {t}" for t in tests])
2021-02-23 07:24:51 +00:00
2021-05-25 16:53:42 +00:00
# https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(lst, n):
"""Yield successive n-sized chunks from lst."""
for i in range(0, len(lst), n):
yield lst[i : i + n]
2021-02-23 07:24:51 +00:00
def get_counters(fname):
counters = {
"ERROR": set([]),
"PASSED": set([]),
"FAILED": set([]),
"SKIPPED": set([]),
} # type: Dict[str, Any]
2024-02-26 20:24:45 +00:00
with open(fname, "r", encoding="utf-8") as out:
2021-02-23 07:24:51 +00:00
for line in out:
line = line.strip()
# Example of log:
#
# test_mysql_protocol/test.py::test_golang_client
# [gw0] [ 7%] ERROR test_mysql_protocol/test.py::test_golang_client
#
# And only the line with test status should be matched
if not (".py::" in line and " " in line):
continue
2021-02-23 07:24:51 +00:00
line = line.strip()
# [gw0] [ 7%] ERROR test_mysql_protocol/test.py::test_golang_client
# ^^^^^^^^^^^^^
if line.strip().startswith("["):
2024-02-26 20:24:45 +00:00
line = re.sub(r"^\[[^\[\]]*\] \[[^\[\]]*\] ", "", line)
line_arr = line.split(" ")
if len(line_arr) < 2:
logging.debug("Strange line %s", line)
continue
2021-02-23 07:24:51 +00:00
# Lines like:
#
# ERROR test_mysql_protocol/test.py::test_golang_client
# PASSED test_replicated_users/test.py::test_rename_replicated[QUOTA]
# PASSED test_drop_is_lock_free/test.py::test_query_is_lock_free[detach part]
#
state = line_arr.pop(0)
test_name = " ".join(line_arr)
# Normalize test names for lines like this:
#
# FAILED test_storage_s3/test.py::test_url_reconnect_in_the_middle - Exception
# FAILED test_distributed_ddl/test.py::test_default_database[configs] - AssertionError: assert ...
#
test_name = re.sub(
r"^(?P<test_name>[^\[\] ]+)(?P<test_param>\[[^\[\]]*\]|)(?P<test_error> - .*|)$",
r"\g<test_name>\g<test_param>",
test_name,
)
2021-02-23 07:24:51 +00:00
if state in counters:
counters[state].add(test_name)
else:
# will skip lines like:
# 30.76s call test_host_ip_change/test.py::test_ip_drop_cache
# 5.71s teardown test_host_ip_change/test.py::test_ip_change[node1]
# and similar
logging.debug("Strange state in line %s", line)
2021-02-23 07:24:51 +00:00
return {k: list(v) for k, v in counters.items()}
def parse_test_times(fname):
read = False
description_output = []
2024-02-26 20:24:45 +00:00
with open(fname, "r", encoding="utf-8") as out:
2021-02-23 07:24:51 +00:00
for line in out:
if read and "==" in line:
2021-02-23 07:24:51 +00:00
break
if read and line.strip():
description_output.append(line.strip())
if "slowest durations" in line:
2021-02-23 07:24:51 +00:00
read = True
return description_output
def get_test_times(output):
result = defaultdict(float)
for line in output:
if ".py" in line:
line_arr = line.strip().split(" ")
2021-02-23 07:24:51 +00:00
test_time = line_arr[0]
test_name = " ".join([elem for elem in line_arr[2:] if elem])
2021-02-23 07:24:51 +00:00
if test_name not in result:
result[test_name] = 0.0
result[test_name] += float(test_time[:-1])
return result
def clear_ip_tables_and_restart_daemons():
logging.info(
"Dump iptables after run %s",
subprocess.check_output("sudo iptables -nvL", shell=True),
)
2021-02-23 07:24:51 +00:00
try:
logging.info("Killing all alive docker containers")
subprocess.check_output(
2023-07-09 00:28:12 +00:00
"timeout --signal=KILL 10m docker ps --quiet | xargs --no-run-if-empty docker kill",
2022-09-17 18:38:12 +00:00
shell=True,
)
2021-02-23 07:24:51 +00:00
except subprocess.CalledProcessError as err:
2023-05-29 14:34:31 +00:00
logging.info("docker kill excepted: %s", str(err))
2021-02-23 07:24:51 +00:00
try:
logging.info("Removing all docker containers")
subprocess.check_output(
2023-07-09 00:28:12 +00:00
"timeout --signal=KILL 10m docker ps --all --quiet | xargs --no-run-if-empty docker rm --force",
2022-09-17 18:38:12 +00:00
shell=True,
)
2021-02-23 07:24:51 +00:00
except subprocess.CalledProcessError as err:
2023-05-29 14:34:31 +00:00
logging.info("docker rm excepted: %s", str(err))
2021-02-23 07:24:51 +00:00
2021-11-08 13:23:14 +00:00
# don't restart docker if it's disabled
if os.environ.get("CLICKHOUSE_TESTS_RUNNER_RESTART_DOCKER", "1") == "1":
2021-11-08 13:23:14 +00:00
try:
logging.info("Stopping docker daemon")
subprocess.check_output("service docker stop", shell=True)
except subprocess.CalledProcessError as err:
2023-05-29 14:34:31 +00:00
logging.info("docker stop excepted: %s", str(err))
2021-11-08 13:23:14 +00:00
try:
for i in range(200):
try:
logging.info("Restarting docker %s", i)
subprocess.check_output("service docker start", shell=True)
subprocess.check_output("docker ps", shell=True)
break
except subprocess.CalledProcessError as err:
time.sleep(0.5)
logging.info("Waiting docker to start, current %s", str(err))
else:
2024-02-26 20:24:45 +00:00
raise RuntimeError("Docker daemon doesn't responding")
2021-11-08 13:23:14 +00:00
except subprocess.CalledProcessError as err:
2023-05-29 14:34:31 +00:00
logging.info("Can't reload docker: %s", str(err))
2021-11-08 13:23:14 +00:00
2021-03-16 21:39:38 +00:00
iptables_iter = 0
2021-02-23 07:24:51 +00:00
try:
2021-03-16 23:16:03 +00:00
for i in range(1000):
2021-03-16 21:39:38 +00:00
iptables_iter = i
2021-02-23 20:15:11 +00:00
# when rules will be empty, it will raise exception
2021-11-04 20:22:13 +00:00
subprocess.check_output("sudo iptables -D DOCKER-USER 1", shell=True)
2021-03-16 21:39:38 +00:00
except subprocess.CalledProcessError as err:
logging.info(
"All iptables rules cleared, %s iterations, last error: %s",
iptables_iter,
str(err),
)
2021-02-23 07:24:51 +00:00
class ClickhouseIntegrationTestsRunner:
def __init__(self, repo_path: str, result_path: str, params: dict):
self.repo_path = repo_path
2021-02-23 07:24:51 +00:00
self.result_path = result_path
2021-02-24 16:41:44 +00:00
self.params = params
self.image_versions = self.params["docker_images_with_versions"]
self.shuffle_groups = self.params["shuffle_test_groups"]
self.flaky_check = "flaky check" in self.params["context_name"]
self.bugfix_validate_check = "bugfix" in self.params["context_name"].lower()
2021-11-16 08:59:38 +00:00
# if use_tmpfs is not set we assume it to be true, otherwise check
self.use_tmpfs = "use_tmpfs" not in self.params or self.params["use_tmpfs"]
self.disable_net_host = (
"disable_net_host" in self.params and self.params["disable_net_host"]
)
2021-03-16 17:48:45 +00:00
self.start_time = time.time()
2021-03-17 19:28:18 +00:00
self.soft_deadline_time = self.start_time + (TASK_TIMEOUT - MAX_TIME_IN_SANDBOX)
2021-02-23 07:24:51 +00:00
self.use_old_analyzer = (
os.environ.get("CLICKHOUSE_USE_OLD_ANALYZER") is not None
)
2023-06-20 13:20:21 +00:00
if "run_by_hash_total" in self.params:
self.run_by_hash_total = self.params["run_by_hash_total"]
self.run_by_hash_num = self.params["run_by_hash_num"]
else:
self.run_by_hash_total = 0
self.run_by_hash_num = 0
2021-02-23 07:24:51 +00:00
def path(self):
return self.result_path
2021-02-23 12:25:28 +00:00
def base_path(self):
return os.path.join(str(self.result_path), "../")
2021-02-23 12:25:28 +00:00
2023-05-29 14:34:31 +00:00
@staticmethod
def should_skip_tests():
2021-02-23 07:24:51 +00:00
return []
def get_image_with_version(self, name):
if name in self.image_versions:
return name + ":" + self.image_versions[name]
2023-05-29 14:34:31 +00:00
logging.warning(
"Cannot find image %s in params list %s", name, self.image_versions
)
if ":" not in name:
2021-02-23 07:24:51 +00:00
return name + ":latest"
return name
def get_image_version(self, name: str) -> Any:
2021-02-23 07:24:51 +00:00
if name in self.image_versions:
return self.image_versions[name]
2023-05-29 14:34:31 +00:00
logging.warning(
"Cannot find image %s in params list %s", name, self.image_versions
)
return "latest"
2021-02-23 07:24:51 +00:00
def shuffle_test_groups(self):
return self.shuffle_groups != 0
def _pre_pull_images(self):
image_cmd = self._get_runner_image_cmd()
2023-01-03 18:56:24 +00:00
2023-01-02 16:46:24 +00:00
cmd = (
f"cd {self.repo_path}/tests/integration && "
2024-02-26 20:24:45 +00:00
f"timeout --signal=KILL 1h ./runner {self._get_runner_opts()} {image_cmd} "
"--pre-pull --command ' echo Pre Pull finished ' "
2023-01-02 16:46:24 +00:00
)
for i in range(5):
2023-01-04 11:48:28 +00:00
logging.info("Pulling images before running tests. Attempt %s", i)
2023-01-02 16:46:24 +00:00
try:
subprocess.check_output(
cmd,
shell=True,
)
2023-01-04 13:16:31 +00:00
return
2023-01-02 16:46:24 +00:00
except subprocess.CalledProcessError as err:
2023-05-29 14:34:31 +00:00
logging.info("docker-compose pull failed: %s", str(err))
2023-01-04 13:16:31 +00:00
continue
message = "Pulling images failed for 5 attempts. Will fail the worker."
logging.error(message)
kill_ci_runner(message)
# We pass specific retcode to to ci/integration_test_check.py to skip status reporting and restart job
2023-05-29 14:34:31 +00:00
sys.exit(13)
2023-01-02 16:46:24 +00:00
2023-05-29 14:34:31 +00:00
@staticmethod
def _can_run_with(path, opt):
2024-02-26 20:24:45 +00:00
with open(path, "r", encoding="utf-8") as script:
2021-02-23 07:24:51 +00:00
for line in script:
if opt in line:
return True
return False
def _install_clickhouse(self, debs_path):
for package in (
"clickhouse-common-static_",
"clickhouse-server_",
"clickhouse-client",
2024-04-04 18:06:40 +00:00
"clickhouse-odbc-bridge_",
"clickhouse-library-bridge_",
"clickhouse-common-static-dbg_",
): # order matters
2021-02-23 07:24:51 +00:00
logging.info("Installing package %s", package)
for f in os.listdir(debs_path):
if package in f:
full_path = os.path.join(debs_path, f)
logging.info("Package found in %s", full_path)
log_name = "install_" + f + ".log"
log_path = os.path.join(str(self.path()), log_name)
cmd = f"dpkg -x {full_path} ."
logging.info("Executing installation cmd %s", cmd)
with TeePopen(cmd, log_file=log_path) as proc:
if proc.wait() == 0:
logging.info("Installation of %s successfull", full_path)
else:
raise RuntimeError(f"Installation of {full_path} failed")
2021-02-23 07:24:51 +00:00
break
else:
2024-02-26 20:24:45 +00:00
raise FileNotFoundError(f"Package with {package} not found")
# logging.info("Unstripping binary")
# logging.info(
# "Unstring %s",
# subprocess.check_output(
# "eu-unstrip /usr/bin/clickhouse {}".format(CLICKHOUSE_BINARY_PATH),
# shell=True,
# ),
# )
2021-02-23 07:24:51 +00:00
logging.info("All packages installed")
os.chmod(CLICKHOUSE_BINARY_PATH, 0o777)
os.chmod(CLICKHOUSE_ODBC_BRIDGE_BINARY_PATH, 0o777)
2021-03-15 06:37:07 +00:00
os.chmod(CLICKHOUSE_LIBRARY_BRIDGE_BINARY_PATH, 0o777)
shutil.copy(
CLICKHOUSE_BINARY_PATH, os.getenv("CLICKHOUSE_TESTS_SERVER_BIN_PATH") # type: ignore
)
shutil.copy(
CLICKHOUSE_ODBC_BRIDGE_BINARY_PATH,
os.getenv("CLICKHOUSE_TESTS_ODBC_BRIDGE_BIN_PATH"), # type: ignore
)
shutil.copy(
CLICKHOUSE_LIBRARY_BRIDGE_BINARY_PATH,
os.getenv("CLICKHOUSE_TESTS_LIBRARY_BRIDGE_BIN_PATH"), # type: ignore
)
2021-02-23 07:24:51 +00:00
2023-05-29 14:34:31 +00:00
@staticmethod
def _compress_logs(directory, relpaths, result_path):
retcode = subprocess.call(
f"sudo tar --use-compress-program='zstd --threads=0' "
f"-cf {result_path} -C {directory} {' '.join(relpaths)}",
shell=True,
)
# tar return 1 when the files are changed on compressing, we ignore it
if retcode in (0, 1):
return
# but even on the fatal errors it's better to retry
logging.error("Fatal error on compressing %s: %s", result_path, retcode)
2021-02-23 07:24:51 +00:00
2021-11-16 08:59:38 +00:00
def _get_runner_opts(self):
result = []
if self.use_tmpfs:
result.append("--tmpfs")
if self.disable_net_host:
result.append("--disable-net-host")
if self.use_old_analyzer:
result.append("--old-analyzer")
2023-06-20 13:20:21 +00:00
2021-11-16 08:59:38 +00:00
return " ".join(result)
def _get_all_tests(self) -> List[str]:
image_cmd = self._get_runner_image_cmd()
runner_opts = self._get_runner_opts()
out_file_full = os.path.join(self.result_path, "runner_get_all_tests.log")
cmd = (
f"cd {self.repo_path}/tests/integration && "
f"timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} -- --setup-plan "
)
logging.info(
"Getting all tests to the file %s with cmd: \n%s", out_file_full, cmd
)
with open(out_file_full, "wb") as ofd:
2024-07-01 15:52:35 +00:00
try:
subprocess.check_call(cmd, shell=True, stdout=ofd, stderr=ofd)
except subprocess.CalledProcessError as ex:
print("ERROR: Setting test plan failed. Output:")
2024-07-01 20:11:44 +00:00
with open(out_file_full, "r", encoding="utf-8") as file:
2024-07-01 15:52:35 +00:00
for line in file:
2024-07-01 16:02:06 +00:00
print(" " + line, end="")
2024-07-01 15:52:35 +00:00
raise ex
all_tests = set()
with open(out_file_full, "r", encoding="utf-8") as all_tests_fd:
for line in all_tests_fd:
if (
line[0] in string.whitespace # test names at the start of lines
or "::test" not in line # test names contain '::test'
or "SKIPPED" in line # pytest.mark.skip/-if
):
continue
all_tests.add(line.strip())
assert all_tests
2021-02-23 07:24:51 +00:00
return list(sorted(all_tests))
2023-05-29 14:34:31 +00:00
@staticmethod
def _get_parallel_tests_skip_list(repo_path):
2024-02-26 20:24:45 +00:00
skip_list_file_path = f"{repo_path}/tests/integration/parallel_skip.json"
if (
not os.path.isfile(skip_list_file_path)
or os.path.getsize(skip_list_file_path) == 0
):
2024-02-26 20:24:45 +00:00
raise ValueError(
"There is something wrong with getting all tests list: "
f"file '{skip_list_file_path}' is empty or does not exist."
)
2021-03-19 16:47:59 +00:00
2021-05-25 16:53:42 +00:00
skip_list_tests = []
2024-02-26 20:24:45 +00:00
with open(skip_list_file_path, "r", encoding="utf-8") as skip_list_file:
2021-05-25 16:53:42 +00:00
skip_list_tests = json.load(skip_list_file)
return list(sorted(skip_list_tests))
2021-03-19 16:47:59 +00:00
2023-05-29 14:34:31 +00:00
@staticmethod
def group_test_by_file(tests):
result = {} # type: Dict
2021-02-23 07:24:51 +00:00
for test in tests:
test_file = test.split("::")[0]
2021-02-23 07:24:51 +00:00
if test_file not in result:
result[test_file] = []
result[test_file].append(test)
return result
2023-05-29 14:34:31 +00:00
@staticmethod
2024-03-21 23:56:57 +00:00
def _update_counters(main_counters, current_counters):
2021-02-23 07:24:51 +00:00
for test in current_counters["PASSED"]:
if test not in main_counters["PASSED"]:
2021-02-23 07:24:51 +00:00
if test in main_counters["FAILED"]:
main_counters["FAILED"].remove(test)
2023-06-22 19:48:13 +00:00
if test in main_counters["BROKEN"]:
main_counters["BROKEN"].remove(test)
2024-03-21 23:56:57 +00:00
main_counters["PASSED"].append(test)
2021-02-23 07:24:51 +00:00
for state in ("ERROR", "FAILED"):
for test in current_counters[state]:
if test in main_counters["PASSED"]:
2021-03-17 19:28:18 +00:00
main_counters["PASSED"].remove(test)
2024-03-21 23:56:57 +00:00
if test not in main_counters[state]:
main_counters[state].append(test)
2021-02-23 07:24:51 +00:00
2022-04-14 12:39:36 +00:00
for state in ("SKIPPED",):
for test in current_counters[state]:
main_counters[state].append(test)
def _get_runner_image_cmd(self):
image_cmd = ""
if self._can_run_with(
os.path.join(self.repo_path, "tests/integration", "runner"),
"--docker-image-version",
):
for img in IMAGES:
2021-09-08 10:03:54 +00:00
if img == "clickhouse/integration-tests-runner":
2022-05-10 15:15:14 +00:00
runner_version = self.get_image_version(img)
logging.info(
"Can run with custom docker image version %s", runner_version
)
2024-02-26 20:24:45 +00:00
image_cmd += f" --docker-image-version={runner_version} "
2021-02-23 07:24:51 +00:00
else:
if self._can_run_with(
os.path.join(self.repo_path, "tests/integration", "runner"),
"--docker-compose-images-tags",
):
2024-02-26 20:24:45 +00:00
image_cmd += (
"--docker-compose-images-tags="
f"{self.get_image_with_version(img)} "
)
2021-02-23 07:24:51 +00:00
else:
image_cmd = ""
2021-02-23 07:24:51 +00:00
logging.info("Cannot run with custom docker image version :(")
return image_cmd
2023-05-29 14:34:31 +00:00
@staticmethod
def _find_test_data_dirs(repo_path, test_names):
2021-07-01 14:41:59 +00:00
relpaths = {}
for test_name in test_names:
if "/" in test_name:
test_dir = test_name[: test_name.find("/")]
2021-07-01 14:41:59 +00:00
else:
test_dir = test_name
if os.path.isdir(os.path.join(repo_path, "tests/integration", test_dir)):
for name in os.listdir(
os.path.join(repo_path, "tests/integration", test_dir)
):
2021-07-01 14:41:59 +00:00
relpath = os.path.join(os.path.join(test_dir, name))
mtime = os.path.getmtime(
os.path.join(repo_path, "tests/integration", relpath)
)
2021-07-01 14:41:59 +00:00
relpaths[relpath] = mtime
return relpaths
2023-05-29 14:34:31 +00:00
@staticmethod
def _get_test_data_dirs_difference(new_snapshot, old_snapshot):
2021-07-01 14:41:59 +00:00
res = set()
for path in new_snapshot:
if (path not in old_snapshot) or (old_snapshot[path] != new_snapshot[path]):
2021-07-01 14:41:59 +00:00
res.add(path)
return res
def try_run_test_group(
2023-06-22 19:58:58 +00:00
self,
test_group,
tests_in_group,
num_tries,
num_workers,
repeat_count,
):
2021-08-12 10:40:54 +00:00
try:
return self.run_test_group(
2023-06-22 19:58:58 +00:00
test_group,
tests_in_group,
num_tries,
num_workers,
repeat_count,
)
2021-08-12 10:40:54 +00:00
except Exception as e:
2024-02-26 20:24:45 +00:00
logging.info("Failed to run %s:\n%s", test_group, e)
2021-08-12 10:40:54 +00:00
counters = {
"ERROR": [],
"PASSED": [],
"FAILED": [],
"SKIPPED": [],
} # type: Dict
tests_times = defaultdict(float) # type: Dict
2021-08-12 10:40:54 +00:00
for test in tests_in_group:
counters["ERROR"].append(test)
tests_times[test] = 0
return counters, tests_times, []
def run_test_group(
2023-06-22 19:58:58 +00:00
self,
test_group,
tests_in_group,
num_tries,
num_workers,
repeat_count,
):
2021-02-23 07:24:51 +00:00
counters = {
"ERROR": [],
"PASSED": [],
"FAILED": [],
2021-03-17 19:28:18 +00:00
"SKIPPED": [],
2023-06-22 19:48:13 +00:00
"BROKEN": [],
"NOT_FAILED": [],
} # type: Dict
tests_times = defaultdict(float) # type: Dict
2021-03-16 17:48:45 +00:00
if self.soft_deadline_time < time.time():
for test in tests_in_group:
2021-03-17 19:28:18 +00:00
logging.info("Task timeout exceeded, skipping %s", test)
counters["SKIPPED"].append(test)
2021-03-16 17:48:45 +00:00
tests_times[test] = 0
2021-04-08 20:05:27 +00:00
return counters, tests_times, []
2021-03-16 17:48:45 +00:00
image_cmd = self._get_runner_image_cmd()
test_group_str = test_group.replace("/", "_").replace(".", "_")
2021-04-08 20:05:27 +00:00
log_paths = []
2021-07-01 14:41:59 +00:00
test_data_dirs = {}
2021-02-23 07:24:51 +00:00
for i in range(num_tries):
2024-08-15 11:11:10 +00:00
if timeout_expired:
print("Timeout expired - break test group execution")
break
2021-02-23 07:24:51 +00:00
logging.info("Running test group %s for the %s retry", test_group, i)
clear_ip_tables_and_restart_daemons()
test_names = set([])
for test_name in tests_in_group:
if test_name not in counters["PASSED"]:
test_names.add(test_name)
2021-02-23 07:24:51 +00:00
2021-07-01 14:41:59 +00:00
if i == 0:
test_data_dirs = self._find_test_data_dirs(self.repo_path, test_names)
2021-07-01 14:41:59 +00:00
info_basename = test_group_str + "_" + str(i) + ".nfo"
info_path = os.path.join(self.repo_path, "tests/integration", info_basename)
2021-07-01 14:41:59 +00:00
test_cmd = " ".join([shlex.quote(test) for test in sorted(test_names)])
2024-02-26 20:24:45 +00:00
parallel_cmd = f" --parallel {num_workers} " if num_workers > 0 else ""
repeat_cmd = f" --count {repeat_count} " if repeat_count > 0 else ""
# -r -- show extra test summary:
# -f -- (f)ailed
# -E -- (E)rror
# -p -- (p)assed
# -s -- (s)kipped
2024-02-26 20:24:45 +00:00
cmd = (
f"cd {self.repo_path}/tests/integration && "
2024-02-26 20:24:45 +00:00
f"timeout --signal=KILL 1h ./runner {self._get_runner_opts()} "
f"{image_cmd} -t {test_cmd} {parallel_cmd} {repeat_cmd} -- -rfEps --run-id={i} "
2024-02-26 20:24:45 +00:00
f"--color=no --durations=0 {_get_deselect_option(self.should_skip_tests())} "
f"| tee {info_path}"
)
2021-02-23 07:24:51 +00:00
2021-07-01 14:41:59 +00:00
log_basename = test_group_str + "_" + str(i) + ".log"
log_path = os.path.join(self.repo_path, "tests/integration", log_basename)
2024-02-26 20:24:45 +00:00
with open(log_path, "w", encoding="utf-8") as log:
2021-02-23 07:24:51 +00:00
logging.info("Executing cmd: %s", cmd)
# ignore retcode, since it meaningful due to pipe to tee
2024-02-26 20:24:45 +00:00
with subprocess.Popen(cmd, shell=True, stderr=log, stdout=log) as proc:
2024-09-26 13:40:40 +00:00
global runner_subprocess # pylint:disable=global-statement
2024-08-15 11:11:10 +00:00
runner_subprocess = proc
2024-02-26 20:24:45 +00:00
proc.wait()
2021-02-23 07:24:51 +00:00
2021-07-01 14:41:59 +00:00
extra_logs_names = [log_basename]
log_result_path = os.path.join(
str(self.path()), "integration_run_" + log_basename
)
2021-07-01 14:41:59 +00:00
shutil.copy(log_path, log_result_path)
log_paths.append(log_result_path)
for pytest_log_path in glob.glob(
os.path.join(self.repo_path, "tests/integration/pytest*.log")
):
new_name = (
test_group_str
+ "_"
+ str(i)
+ "_"
+ os.path.basename(pytest_log_path)
)
os.rename(
pytest_log_path,
os.path.join(self.repo_path, "tests/integration", new_name),
)
2021-07-01 14:41:59 +00:00
extra_logs_names.append(new_name)
dockerd_log_path = os.path.join(
self.repo_path, "tests/integration/dockerd.log"
)
2021-07-01 14:41:59 +00:00
if os.path.exists(dockerd_log_path):
new_name = (
test_group_str
+ "_"
+ str(i)
+ "_"
+ os.path.basename(dockerd_log_path)
)
os.rename(
dockerd_log_path,
os.path.join(self.repo_path, "tests/integration", new_name),
)
2021-07-01 14:41:59 +00:00
extra_logs_names.append(new_name)
if os.path.exists(info_path):
extra_logs_names.append(info_basename)
new_counters = get_counters(info_path)
for state, tests in new_counters.items():
logging.info(
"Tests with %s state (%s): %s", state, len(tests), tests
)
2021-07-01 14:41:59 +00:00
times_lines = parse_test_times(info_path)
2021-02-23 07:24:51 +00:00
new_tests_times = get_test_times(times_lines)
2024-03-21 23:56:57 +00:00
self._update_counters(counters, new_counters)
2021-02-23 07:24:51 +00:00
for test_name, test_time in new_tests_times.items():
tests_times[test_name] = test_time
2021-07-01 14:41:59 +00:00
test_data_dirs_new = self._find_test_data_dirs(self.repo_path, test_names)
test_data_dirs_diff = self._get_test_data_dirs_difference(
test_data_dirs_new, test_data_dirs
)
2021-07-01 14:41:59 +00:00
test_data_dirs = test_data_dirs_new
if extra_logs_names or test_data_dirs_diff:
extras_result_path = os.path.join(
str(self.path()),
"integration_run_" + test_group_str + "_" + str(i) + ".tar.zst",
)
self._compress_logs(
os.path.join(self.repo_path, "tests/integration"),
extra_logs_names + list(test_data_dirs_diff),
extras_result_path,
)
2021-07-01 14:41:59 +00:00
log_paths.append(extras_result_path)
if len(counters["PASSED"]) == len(tests_in_group):
2021-02-23 07:24:51 +00:00
logging.info("All tests from group %s passed", test_group)
break
if (
len(counters["PASSED"]) >= 0
and len(counters["FAILED"]) == 0
and len(counters["ERROR"]) == 0
):
logging.info(
"Seems like all tests passed but some of them are skipped or "
"deselected. Ignoring them and finishing group."
)
2021-02-23 07:24:51 +00:00
break
else:
2021-06-09 13:40:28 +00:00
# Mark all non tried tests as errors, with '::' in name
# (example test_partition/test.py::test_partition_simple). For flaky check
# we run whole test dirs like "test_odbc_interaction" and don't
# want to mark them as error so we filter by '::'.
2021-02-23 07:24:51 +00:00
for test in tests_in_group:
if (
2024-02-26 20:24:45 +00:00
test
not in chain(
counters["PASSED"],
counters["ERROR"],
counters["SKIPPED"],
counters["FAILED"],
counters["BROKEN"],
)
and "::" in test
):
2021-02-23 07:24:51 +00:00
counters["ERROR"].append(test)
2021-04-08 20:05:27 +00:00
return counters, tests_times, log_paths
2021-02-23 07:24:51 +00:00
def run_flaky_check(self, build_path, should_fail=False):
pr_info = self.params["pr_info"]
2021-02-24 16:41:44 +00:00
tests_to_run = get_changed_tests_to_run(pr_info, self.repo_path)
2021-02-24 16:41:44 +00:00
if not tests_to_run:
logging.info("No integration tests to run found")
2022-03-18 12:36:45 +00:00
return "success", NO_CHANGES_MSG, [(NO_CHANGES_MSG, "OK")], ""
2021-02-24 16:41:44 +00:00
self._install_clickhouse(build_path)
logging.info("Found '%s' tests to run", " ".join(tests_to_run))
2021-02-24 16:41:44 +00:00
result_state = "success"
description_prefix = "No flaky tests: "
logging.info("Starting check with retries")
final_retry = 0
counters = {
"ERROR": [],
"PASSED": [],
"FAILED": [],
"SKIPPED": [],
"BROKEN": [],
"NOT_FAILED": [],
} # type: Dict
tests_times = defaultdict(float) # type: Dict
tests_log_paths = defaultdict(list)
2024-08-23 19:09:09 +00:00
id_counter = 0
for test_to_run in tests_to_run:
tries_num = 1 if should_fail else FLAKY_TRIES_COUNT
for i in range(tries_num):
if timeout_expired:
print("Timeout expired - break flaky check execution")
2022-03-11 14:14:16 +00:00
break
final_retry += 1
logging.info("Running tests for the %s time", i)
group_counters, group_test_times, log_paths = self.try_run_test_group(
2024-08-23 19:09:09 +00:00
f"bugfix_{id_counter}" if should_fail else f"flaky{id_counter}",
[test_to_run],
1,
1,
FLAKY_REPEAT_COUNT,
)
2024-08-23 19:09:09 +00:00
id_counter = id_counter + 1
for counter, value in group_counters.items():
logging.info(
"Tests from group %s stats, %s count %s",
test_to_run,
counter,
len(value),
)
counters[counter] += value
for test_name, test_time in group_test_times.items():
tests_times[test_name] = test_time
tests_log_paths[test_name] = log_paths
if not should_fail and (
group_counters["FAILED"] or group_counters["ERROR"]
):
logging.info(
"Unexpected failure in group %s. Fail fast for current group",
test_to_run,
)
2022-03-11 14:14:16 +00:00
break
2024-09-03 08:24:57 +00:00
if counters["FAILED"]:
logging.info("Found failed tests: %s", " ".join(counters["FAILED"]))
description_prefix = "Failed tests found: "
result_state = "failure"
2024-09-03 08:24:57 +00:00
if counters["ERROR"]:
description_prefix = "Failed tests found: "
logging.info("Found error tests: %s", " ".join(counters["ERROR"]))
# NOTE "error" result state will restart the whole test task,
# so we use "failure" here
result_state = "failure"
logging.info("Try is OK, all tests passed, going to clear env")
clear_ip_tables_and_restart_daemons()
logging.info("And going to sleep for some time")
time.sleep(5)
2021-02-24 16:41:44 +00:00
test_result = []
for state in ("ERROR", "FAILED", "PASSED", "SKIPPED"):
2021-02-24 16:41:44 +00:00
if state == "PASSED":
text_state = "OK"
elif state == "FAILED":
text_state = "FAIL"
else:
text_state = state
test_result += [
(c, text_state, f"{tests_times[c]:.2f}", tests_log_paths[c])
for c in counters[state]
]
status_text = description_prefix + ", ".join(
[
str(n).lower().replace("failed", "fail") + ": " + str(len(c))
for n, c in counters.items()
]
)
2021-02-24 16:41:44 +00:00
2024-08-16 08:44:11 +00:00
return result_state, status_text, test_result, tests_log_paths
2021-02-24 16:41:44 +00:00
def run_impl(self, build_path):
2024-08-15 11:11:10 +00:00
stopwatch = Stopwatch()
2022-03-11 14:14:16 +00:00
if self.flaky_check or self.bugfix_validate_check:
2024-08-26 11:37:37 +00:00
result_state, status_text, test_result, tests_log_paths = (
self.run_flaky_check(build_path, should_fail=self.bugfix_validate_check)
2024-08-26 11:37:37 +00:00
)
else:
result_state, status_text, test_result, tests_log_paths = (
self.run_normal_check(build_path)
2022-03-11 14:14:16 +00:00
)
2021-02-24 16:41:44 +00:00
2024-08-26 11:37:37 +00:00
if self.soft_deadline_time < time.time():
status_text = "Timeout, " + status_text
result_state = "failure"
if timeout_expired:
logging.error(
"Job killed by external timeout signal - setting status to failure!"
)
status_text = "Job timeout expired, " + status_text
result_state = "failure"
# add mock test case to make timeout visible in job report and in ci db
test_result.insert(
0, (JOB_TIMEOUT_TEST_NAME, "FAIL", f"{stopwatch.duration_seconds}", "")
)
if "(memory)" in self.params["context_name"]:
result_state = "success"
2023-01-04 20:07:52 +00:00
2024-08-26 11:37:37 +00:00
return result_state, status_text, test_result, tests_log_paths
def _get_tests_by_hash(self) -> List[str]:
"Tries it's best to group the tests equally between groups"
all_tests = self._get_all_tests()
if self.run_by_hash_total == 0:
return all_tests
grouped_tests = self.group_test_by_file(all_tests)
groups_by_hash = {
g: [] for g in range(self.run_by_hash_total)
} # type: Dict[int, List[str]]
for tests_in_group in grouped_tests.values():
# It should work determenistic, because it searches groups with min tests
min_group = min(len(tests) for tests in groups_by_hash.values())
# And then it takes a group with min index
group_to_increase = min(
g for g, t in groups_by_hash.items() if len(t) == min_group
)
groups_by_hash[group_to_increase].extend(tests_in_group)
return groups_by_hash[self.run_by_hash_num]
def run_normal_check(self, build_path):
2024-08-26 11:37:37 +00:00
self._install_clickhouse(build_path)
2023-01-04 20:07:52 +00:00
logging.info("Pulling images")
self._pre_pull_images()
logging.info(
"Dump iptables before run %s",
subprocess.check_output("sudo iptables -nvL", shell=True),
)
all_tests = self._get_tests_by_hash()
parallel_skip_tests = self._get_parallel_tests_skip_list(self.repo_path)
logging.info(
"Found %s tests first 3 %s", len(all_tests), " ".join(all_tests[:3])
)
filtered_sequential_tests = list(
filter(lambda test: has_test(all_tests, test), parallel_skip_tests)
)
filtered_parallel_tests = list(
filter(
lambda test: not has_test(parallel_skip_tests, test),
all_tests,
)
)
not_found_tests = list(
filter(
lambda test: not has_test(all_tests, test),
parallel_skip_tests,
)
)
logging.info(
"Found %s tests first 3 %s, parallel %s, other %s",
len(all_tests),
" ".join(all_tests[:3]),
len(filtered_parallel_tests),
len(filtered_sequential_tests),
)
logging.info(
"Not found %s tests first 3 %s",
len(not_found_tests),
" ".join(not_found_tests[:3]),
)
2021-05-25 16:53:42 +00:00
grouped_tests = self.group_test_by_file(filtered_sequential_tests)
i = 0
2021-06-04 10:15:44 +00:00
for par_group in chunks(filtered_parallel_tests, PARALLEL_GROUP_SIZE):
grouped_tests[f"parallel{i}"] = par_group
i += 1
2021-02-23 07:24:51 +00:00
logging.info("Found %s tests groups", len(grouped_tests))
counters = {
"ERROR": [],
"PASSED": [],
"FAILED": [],
2021-03-17 19:28:18 +00:00
"SKIPPED": [],
2023-06-22 19:48:13 +00:00
"BROKEN": [],
"NOT_FAILED": [],
} # type: Dict
2021-02-23 07:24:51 +00:00
tests_times = defaultdict(float)
2021-04-08 20:05:27 +00:00
tests_log_paths = defaultdict(list)
2021-03-23 03:35:08 +00:00
items_to_run = list(grouped_tests.items())
2021-02-23 07:24:51 +00:00
logging.info("Total test groups %s", len(items_to_run))
if self.shuffle_test_groups():
logging.info("Shuffling test groups")
random.shuffle(items_to_run)
2021-02-25 07:02:45 +00:00
for group, tests in items_to_run:
2024-08-15 11:11:10 +00:00
if timeout_expired:
print("Timeout expired - break tests execution")
break
2021-12-17 12:01:57 +00:00
logging.info("Running test group %s containing %s tests", group, len(tests))
group_counters, group_test_times, log_paths = self.try_run_test_group(
group, tests, MAX_RETRY, NUM_WORKERS, 0
)
2021-02-23 07:24:51 +00:00
total_tests = 0
for counter, value in group_counters.items():
logging.info(
"Tests from group %s stats, %s count %s", group, counter, len(value)
)
2021-02-23 07:24:51 +00:00
counters[counter] += value
logging.info(
"Totally have %s with status %s", len(counters[counter]), counter
)
2021-02-23 07:24:51 +00:00
total_tests += len(counters[counter])
logging.info("Totally finished tests %s/%s", total_tests, len(all_tests))
for test_name, test_time in group_test_times.items():
tests_times[test_name] = test_time
2021-04-08 20:05:27 +00:00
tests_log_paths[test_name] = log_paths
2021-02-23 07:24:51 +00:00
if len(counters["FAILED"]) + len(counters["ERROR"]) >= 20:
logging.info("Collected more than 20 failed/error tests, stopping")
break
if counters["FAILED"] or counters["ERROR"]:
logging.info(
"Overall status failure, because we have tests in FAILED or ERROR state"
)
2021-02-23 07:24:51 +00:00
result_state = "failure"
else:
logging.info("Overall success!")
result_state = "success"
test_result = []
2023-07-12 09:26:01 +00:00
for state in (
"ERROR",
"FAILED",
"PASSED",
"SKIPPED",
"BROKEN",
"NOT_FAILED",
):
2021-02-23 07:24:51 +00:00
if state == "PASSED":
text_state = "OK"
elif state == "FAILED":
text_state = "FAIL"
else:
text_state = state
test_result += [
2024-02-26 20:24:45 +00:00
(c, text_state, f"{tests_times[c]:.2f}", tests_log_paths[c])
for c in counters[state]
]
failed_sum = len(counters["FAILED"]) + len(counters["ERROR"])
2024-02-26 20:24:45 +00:00
status_text = f"fail: {failed_sum}, passed: {len(counters['PASSED'])}"
2021-02-23 07:24:51 +00:00
if not counters or sum(len(counter) for counter in counters.values()) == 0:
status_text = "No tests found for some reason! It's a bug"
result_state = "failure"
2024-08-26 11:37:37 +00:00
return result_state, status_text, test_result, tests_log_paths
2021-02-23 07:24:51 +00:00
2021-02-23 07:24:51 +00:00
def write_results(results_file, status_file, results, status):
2024-02-26 20:24:45 +00:00
with open(results_file, "w", encoding="utf-8") as f:
out = csv.writer(f, delimiter="\t")
2021-02-23 07:24:51 +00:00
out.writerows(results)
2024-02-26 20:24:45 +00:00
with open(status_file, "w", encoding="utf-8") as f:
out = csv.writer(f, delimiter="\t")
2021-02-23 07:24:51 +00:00
out.writerow(status)
def run():
2024-08-15 11:11:10 +00:00
signal.signal(signal.SIGTERM, handle_sigterm)
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
2021-02-23 07:24:51 +00:00
repo_path = os.environ.get("CLICKHOUSE_TESTS_REPO_PATH", "")
build_path = os.environ.get("CLICKHOUSE_TESTS_BUILD_PATH", "")
result_path = os.environ.get("CLICKHOUSE_TESTS_RESULT_PATH", "")
params_path = os.environ.get("CLICKHOUSE_TESTS_JSON_PARAMS_PATH", "")
assert all((repo_path, build_path, result_path, params_path))
2021-02-23 07:24:51 +00:00
2024-02-26 20:24:45 +00:00
with open(params_path, "r", encoding="utf-8") as jfd:
params = json.loads(jfd.read())
runner = ClickhouseIntegrationTestsRunner(repo_path, result_path, params)
2021-02-23 07:24:51 +00:00
logging.info("Running tests")
2024-06-10 09:18:03 +00:00
if IS_CI:
# Avoid overlaps with previous runs
logging.info("Clearing dmesg before run")
subprocess.check_call("sudo -E dmesg --clear", shell=True)
state, description, test_results, _test_log_paths = runner.run_impl(build_path)
2021-02-23 07:24:51 +00:00
logging.info("Tests finished")
2021-02-24 16:41:44 +00:00
2024-06-10 09:18:03 +00:00
if IS_CI:
# Dump dmesg (to capture possible OOMs)
logging.info("Dumping dmesg")
subprocess.check_call("sudo -E dmesg -T", shell=True)
2021-02-23 07:24:51 +00:00
status = (state, description)
out_results_file = os.path.join(str(runner.path()), "test_results.tsv")
out_status_file = os.path.join(str(runner.path()), "check_status.tsv")
write_results(out_results_file, out_status_file, test_results, status)
logging.info("Result written")
2024-08-15 11:11:10 +00:00
timeout_expired = False
runner_subprocess = None # type:Optional[subprocess.Popen]
def handle_sigterm(signum, _frame):
2024-09-26 13:40:40 +00:00
# TODO: think on how to process it without globals?
2024-08-15 11:11:10 +00:00
print(f"WARNING: Received signal {signum}")
2024-09-26 13:40:40 +00:00
global timeout_expired # pylint:disable=global-statement
2024-08-15 11:11:10 +00:00
timeout_expired = True
2024-08-16 08:22:12 +00:00
if runner_subprocess:
runner_subprocess.send_signal(signal.SIGTERM)
2024-08-15 11:11:10 +00:00
if __name__ == "__main__":
run()