Merge pull request #63982 from ClickHouse/expired-timeout

Implement a single point for "Check timeout expired" test result
This commit is contained in:
Mikhail f. Shiryaev 2024-05-30 11:32:41 +00:00 committed by GitHub
commit 7237242576
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
12 changed files with 63 additions and 145 deletions

View File

@ -109,12 +109,12 @@ def main():
test_script = jobs_scripts[test_job]
if report_file.exists():
report_file.unlink()
extra_timeout_option = ""
if test_job == JobNames.STATELESS_TEST_RELEASE:
extra_timeout_option = str(3600)
# "bugfix" must be present in checkname, as integration test runner checks this
check_name = f"Validate bugfix: {test_job}"
command = f"python3 {test_script} '{check_name}' {extra_timeout_option} --validate-bugfix --report-to-file {report_file}"
command = (
f"python3 {test_script} '{check_name}' "
f"--validate-bugfix --report-to-file {report_file}"
)
print(f"Going to validate job [{test_job}], command [{command}]")
_ = subprocess.run(
command,

View File

@ -18,6 +18,7 @@ import docker_images_helper
import upload_result_helper
from build_check import get_release_or_pr
from ci_config import CI_CONFIG, Build, CILabels, CIStages, JobNames, StatusNames
from ci_metadata import CiMetadata
from ci_utils import GHActions, is_hex, normalize_string
from clickhouse_helper import (
CiLogsCredentials,
@ -39,22 +40,23 @@ from digest_helper import DockerDigester, JobDigester
from env_helper import (
CI,
GITHUB_JOB_API_URL,
GITHUB_REPOSITORY,
GITHUB_RUN_ID,
GITHUB_RUN_URL,
REPO_COPY,
REPORT_PATH,
S3_BUILDS_BUCKET,
TEMP_PATH,
GITHUB_RUN_ID,
GITHUB_REPOSITORY,
)
from get_robot_token import get_best_robot_token
from git_helper import GIT_PREFIX, Git
from git_helper import Runner as GitRunner
from github_helper import GitHub
from pr_info import PRInfo
from report import ERROR, SUCCESS, BuildResult, JobReport, PENDING
from report import ERROR, FAILURE, PENDING, SUCCESS, BuildResult, JobReport, TestResult
from s3_helper import S3Helper
from ci_metadata import CiMetadata
from stopwatch import Stopwatch
from tee_popen import TeePopen
from version_helper import get_version_from_repo
# pylint: disable=too-many-lines
@ -1867,8 +1869,8 @@ def _run_test(job_name: str, run_command: str) -> int:
run_command or CI_CONFIG.get_job_config(job_name).run_command
), "Run command must be provided as input argument or be configured in job config"
if CI_CONFIG.get_job_config(job_name).timeout:
os.environ["KILL_TIMEOUT"] = str(CI_CONFIG.get_job_config(job_name).timeout)
env = os.environ.copy()
timeout = CI_CONFIG.get_job_config(job_name).timeout or None
if not run_command:
run_command = "/".join(
@ -1879,26 +1881,27 @@ def _run_test(job_name: str, run_command: str) -> int:
print("Use run command from a job config")
else:
print("Use run command from the workflow")
os.environ["CHECK_NAME"] = job_name
env["CHECK_NAME"] = job_name
print(f"Going to start run command [{run_command}]")
process = subprocess.run(
run_command,
stdout=sys.stdout,
stderr=sys.stderr,
text=True,
check=False,
shell=True,
)
stopwatch = Stopwatch()
job_log = Path(TEMP_PATH) / "job_log.txt"
with TeePopen(run_command, job_log, env, timeout) as process:
retcode = process.wait()
if retcode != 0:
print(f"Run action failed for: [{job_name}] with exit code [{retcode}]")
if timeout and process.timeout_exceeded:
print(f"Timeout {timeout} exceeded, dumping the job report")
JobReport(
status=FAILURE,
description=f"Timeout {timeout} exceeded",
test_results=[TestResult.create_check_timeout_expired(timeout)],
start_time=stopwatch.start_time_str,
duration=stopwatch.duration_seconds,
additional_files=[job_log],
).dump()
if process.returncode == 0:
print(f"Run action done for: [{job_name}]")
exit_code = 0
else:
print(
f"Run action failed for: [{job_name}] with exit code [{process.returncode}]"
)
exit_code = process.returncode
return exit_code
print(f"Run action done for: [{job_name}]")
return retcode
def _get_ext_check_name(check_name: str) -> str:

View File

@ -175,8 +175,8 @@ class JobNames(metaclass=WithIter):
COMPATIBILITY_TEST = "Compatibility check (amd64)"
COMPATIBILITY_TEST_ARM = "Compatibility check (aarch64)"
CLCIKBENCH_TEST = "ClickBench (amd64)"
CLCIKBENCH_TEST_ARM = "ClickBench (aarch64)"
CLICKBENCH_TEST = "ClickBench (amd64)"
CLICKBENCH_TEST_ARM = "ClickBench (aarch64)"
LIBFUZZER_TEST = "libFuzzer tests"
@ -472,17 +472,18 @@ compatibility_test_common_params = {
}
stateless_test_common_params = {
"digest": stateless_check_digest,
"run_command": 'functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT',
"run_command": 'functional_test_check.py "$CHECK_NAME"',
"timeout": 10800,
}
stateful_test_common_params = {
"digest": stateful_check_digest,
"run_command": 'functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT',
"run_command": 'functional_test_check.py "$CHECK_NAME"',
"timeout": 3600,
}
stress_test_common_params = {
"digest": stress_check_digest,
"run_command": "stress_check.py",
"timeout": 9000,
}
upgrade_test_common_params = {
"digest": upgrade_check_digest,
@ -531,6 +532,7 @@ clickbench_test_params = {
docker=["clickhouse/clickbench"],
),
"run_command": 'clickbench.py "$CHECK_NAME"',
"timeout": 900,
}
install_test_params = JobConfig(
digest=install_check_digest,
@ -1111,6 +1113,7 @@ CI_CONFIG = CIConfig(
exclude_files=[".md"],
docker=["clickhouse/fasttest"],
),
timeout=2400,
),
),
JobNames.STYLE_CHECK: TestConfig(
@ -1123,7 +1126,9 @@ CI_CONFIG = CIConfig(
"",
# we run this check by label - no digest required
job_config=JobConfig(
run_by_label="pr-bugfix", run_command="bugfix_validate_check.py"
run_by_label="pr-bugfix",
run_command="bugfix_validate_check.py",
timeout=900,
),
),
},
@ -1357,10 +1362,10 @@ CI_CONFIG = CIConfig(
Build.PACKAGE_RELEASE, job_config=sqllogic_test_params
),
JobNames.SQLTEST: TestConfig(Build.PACKAGE_RELEASE, job_config=sql_test_params),
JobNames.CLCIKBENCH_TEST: TestConfig(
JobNames.CLICKBENCH_TEST: TestConfig(
Build.PACKAGE_RELEASE, job_config=JobConfig(**clickbench_test_params) # type: ignore
),
JobNames.CLCIKBENCH_TEST_ARM: TestConfig(
JobNames.CLICKBENCH_TEST_ARM: TestConfig(
Build.PACKAGE_AARCH64, job_config=JobConfig(**clickbench_test_params) # type: ignore
),
JobNames.LIBFUZZER_TEST: TestConfig(
@ -1368,7 +1373,7 @@ CI_CONFIG = CIConfig(
job_config=JobConfig(
run_by_label=CILabels.libFuzzer,
timeout=10800,
run_command='libfuzzer_test_check.py "$CHECK_NAME" 10800',
run_command='libfuzzer_test_check.py "$CHECK_NAME"',
),
), # type: ignore
},

View File

@ -1,8 +1,7 @@
from contextlib import contextmanager
import os
import signal
from typing import Any, List, Union, Iterator
from contextlib import contextmanager
from pathlib import Path
from typing import Any, Iterator, List, Union
class WithIter(type):
@ -49,14 +48,3 @@ class GHActions:
for line in lines:
print(line)
print("::endgroup::")
def set_job_timeout():
def timeout_handler(_signum, _frame):
print("Timeout expired")
raise TimeoutError("Job's KILL_TIMEOUT expired")
kill_timeout = int(os.getenv("KILL_TIMEOUT", "0"))
assert kill_timeout > 0, "kill timeout must be provided in KILL_TIMEOUT env"
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(kill_timeout)

View File

@ -1,5 +1,4 @@
#!/usr/bin/env python3
import argparse
import csv
import logging
import os
@ -11,15 +10,7 @@ from typing import Tuple
from docker_images_helper import DockerImage, get_docker_image, pull_image
from env_helper import REPO_COPY, S3_BUILDS_BUCKET, TEMP_PATH
from pr_info import PRInfo
from report import (
ERROR,
FAILURE,
SUCCESS,
JobReport,
TestResult,
TestResults,
read_test_results,
)
from report import ERROR, FAILURE, SUCCESS, JobReport, TestResults, read_test_results
from stopwatch import Stopwatch
from tee_popen import TeePopen
@ -80,30 +71,9 @@ def process_results(result_directory: Path) -> Tuple[str, str, TestResults]:
return state, description, test_results
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
description="FastTest script",
)
parser.add_argument(
"--timeout",
type=int,
# Fast tests in most cases done within 10 min and 40 min timout should be sufficient,
# though due to cold cache build time can be much longer
# https://pastila.nl/?146195b6/9bb99293535e3817a9ea82c3f0f7538d.link#5xtClOjkaPLEjSuZ92L2/g==
default=40,
help="Timeout in minutes",
)
args = parser.parse_args()
args.timeout = args.timeout * 60
return args
def main():
logging.basicConfig(level=logging.INFO)
stopwatch = Stopwatch()
args = parse_args()
temp_path = Path(TEMP_PATH)
temp_path.mkdir(parents=True, exist_ok=True)
@ -134,14 +104,10 @@ def main():
logs_path.mkdir(parents=True, exist_ok=True)
run_log_path = logs_path / "run.log"
timeout_expired = False
with TeePopen(run_cmd, run_log_path, timeout=args.timeout) as process:
with TeePopen(run_cmd, run_log_path) as process:
retcode = process.wait()
if process.timeout_exceeded:
logging.info("Timeout expired for command: %s", run_cmd)
timeout_expired = True
elif retcode == 0:
if retcode == 0:
logging.info("Run successfully")
else:
logging.info("Run failed")
@ -175,11 +141,6 @@ def main():
else:
state, description, test_results = process_results(output_path)
if timeout_expired:
test_results.append(TestResult.create_check_timeout_expired(args.timeout))
state = FAILURE
description = test_results[-1].name
JobReport(
description=description,
test_results=test_results,

View File

@ -68,7 +68,6 @@ def get_run_command(
repo_path: Path,
result_path: Path,
server_log_path: Path,
kill_timeout: int,
additional_envs: List[str],
ci_logs_args: str,
image: DockerImage,
@ -86,7 +85,6 @@ def get_run_command(
)
envs = [
f"-e MAX_RUN_TIME={int(0.9 * kill_timeout)}",
# a static link, don't use S3_URL or S3_DOWNLOAD
'-e S3_URL="https://s3.amazonaws.com/clickhouse-datasets"',
]
@ -192,7 +190,6 @@ def process_results(
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("check_name")
parser.add_argument("kill_timeout", type=int)
parser.add_argument(
"--validate-bugfix",
action="store_true",
@ -224,12 +221,7 @@ def main():
assert (
check_name
), "Check name must be provided as an input arg or in CHECK_NAME env"
kill_timeout = args.kill_timeout or int(os.getenv("KILL_TIMEOUT", "0"))
assert (
kill_timeout > 0
), "kill timeout must be provided as an input arg or in KILL_TIMEOUT env"
validate_bugfix_check = args.validate_bugfix
print(f"Runnin check [{check_name}] with timeout [{kill_timeout}]")
flaky_check = "flaky" in check_name.lower()
@ -288,7 +280,6 @@ def main():
repo_path,
result_path,
server_log_path,
kill_timeout,
additional_envs,
ci_logs_args,
docker_image,

View File

@ -1,25 +1,21 @@
#!/usr/bin/env python3
import argparse
import logging
import sys
import subprocess
import sys
from pathlib import Path
from shutil import copy2
from typing import Dict
from build_download_helper import download_builds_filter
from compress_files import compress_fast
from docker_images_helper import DockerImage, pull_image, get_docker_image
from env_helper import CI, REPORT_PATH, TEMP_PATH as TEMP
from report import JobReport, TestResults, TestResult, FAILURE, FAIL, OK, SUCCESS
from docker_images_helper import DockerImage, get_docker_image, pull_image
from env_helper import REPORT_PATH
from env_helper import TEMP_PATH as TEMP
from report import FAIL, FAILURE, OK, SUCCESS, JobReport, TestResult, TestResults
from stopwatch import Stopwatch
from tee_popen import TeePopen
from ci_utils import set_job_timeout
RPM_IMAGE = "clickhouse/install-rpm-test"
DEB_IMAGE = "clickhouse/install-deb-test"
@ -256,9 +252,6 @@ def main():
args = parse_args()
if CI:
set_job_timeout()
TEMP_PATH.mkdir(parents=True, exist_ok=True)
LOGS_PATH.mkdir(parents=True, exist_ok=True)

View File

@ -10,6 +10,7 @@ from typing import Any, List
import boto3 # type: ignore
import requests
from build_download_helper import (
download_build_with_progress,
get_build_name_for_check,
@ -201,7 +202,7 @@ def main():
docker_image = KEEPER_IMAGE_NAME if args.program == "keeper" else SERVER_IMAGE_NAME
if pr_info.is_scheduled or pr_info.is_dispatched:
# get latest clcikhouse by the static link for latest master buit - get its version and provide permanent url for this version to the jepsen
# get latest clickhouse by the static link for latest master built - get its version and provide permanent url for this version to the jepsen
build_url = f"{S3_URL}/{S3_BUILDS_BUCKET}/master/amd64/clickhouse"
download_build_with_progress(build_url, Path(TEMP_PATH) / "clickhouse")
git_runner.run(f"chmod +x {TEMP_PATH}/clickhouse")

View File

@ -46,7 +46,6 @@ def get_run_command(
fuzzers_path: Path,
repo_path: Path,
result_path: Path,
kill_timeout: int,
additional_envs: List[str],
ci_logs_args: str,
image: DockerImage,
@ -59,7 +58,6 @@ def get_run_command(
)
envs = [
f"-e MAX_RUN_TIME={int(0.9 * kill_timeout)}",
# a static link, don't use S3_URL or S3_DOWNLOAD
'-e S3_URL="https://s3.amazonaws.com/clickhouse-datasets"',
]
@ -83,7 +81,6 @@ def get_run_command(
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("check_name")
parser.add_argument("kill_timeout", type=int)
return parser.parse_args()
@ -99,7 +96,6 @@ def main():
args = parse_args()
check_name = args.check_name
kill_timeout = args.kill_timeout
pr_info = PRInfo()
@ -145,7 +141,6 @@ def main():
fuzzers_path,
repo_path,
result_path,
kill_timeout,
additional_envs,
ci_logs_args,
docker_image,

View File

@ -288,7 +288,7 @@ class JobReport:
start_time: str
duration: float
additional_files: Union[Sequence[str], Sequence[Path]]
# clcikhouse version, build job only
# clickhouse version, build job only
version: str = ""
# checkname to set in commit status, set if differs from job name
check_name: str = ""

View File

@ -9,8 +9,8 @@ from pathlib import Path
from typing import Tuple
from build_download_helper import download_all_deb_packages
from docker_images_helper import DockerImage, pull_image, get_docker_image
from env_helper import REPORT_PATH, TEMP_PATH, REPO_COPY
from docker_images_helper import DockerImage, get_docker_image, pull_image
from env_helper import REPO_COPY, REPORT_PATH, TEMP_PATH
from report import (
ERROR,
FAIL,
@ -72,11 +72,6 @@ def parse_args() -> argparse.Namespace:
required=False,
default="",
)
parser.add_argument(
"--kill-timeout",
required=False,
default=0,
)
return parser.parse_args()
@ -96,10 +91,6 @@ def main():
assert (
check_name
), "Check name must be provided as an input arg or in CHECK_NAME env"
kill_timeout = args.kill_timeout or int(os.getenv("KILL_TIMEOUT", "0"))
assert (
kill_timeout > 0
), "kill timeout must be provided as an input arg or in KILL_TIMEOUT env"
docker_image = pull_image(get_docker_image(IMAGE_NAME))
@ -127,7 +118,7 @@ def main():
)
logging.info("Going to run func tests: %s", run_command)
with TeePopen(run_command, run_log_path, timeout=kill_timeout) as process:
with TeePopen(run_command, run_log_path) as process:
retcode = process.wait()
if retcode == 0:
logging.info("Run successfully")

View File

@ -14,7 +14,7 @@ from docker_images_helper import DockerImage, get_docker_image, pull_image
from env_helper import REPO_COPY, REPORT_PATH, TEMP_PATH
from get_robot_token import get_parameter_from_ssm
from pr_info import PRInfo
from report import ERROR, JobReport, TestResult, TestResults, read_test_results
from report import ERROR, JobReport, TestResults, read_test_results
from stopwatch import Stopwatch
from tee_popen import TeePopen
@ -161,14 +161,9 @@ def run_stress_test(docker_image_name: str) -> None:
)
logging.info("Going to run stress test: %s", run_command)
timeout_expired = False
timeout = 60 * 150
with TeePopen(run_command, run_log_path, timeout=timeout) as process:
with TeePopen(run_command, run_log_path) as process:
retcode = process.wait()
if process.timeout_exceeded:
logging.info("Timeout expired for command: %s", run_command)
timeout_expired = True
elif retcode == 0:
if retcode == 0:
logging.info("Run successfully")
else:
logging.info("Run failed")
@ -180,11 +175,6 @@ def run_stress_test(docker_image_name: str) -> None:
result_path, server_log_path, run_log_path
)
if timeout_expired:
test_results.append(TestResult.create_check_timeout_expired(timeout))
state = "failure"
description = test_results[-1].name
JobReport(
description=description,
test_results=test_results,