Implement global timeout check in _run_test

2024-12-17 20:02:05 +00:00 · 2024-05-16 18:23:35 +02:00 · 2024-05-16 18:23:35 +02:00 · 92ee671310
commit 92ee671310
parent 87138301c4
6 changed files with 37 additions and 73 deletions
--- a/tests/ci/ci.py
+++ b/tests/ci/ci.py
@ -18,6 +18,7 @@ import docker_images_helper
 import upload_result_helper
 from build_check import get_release_or_pr
 from ci_config import CI_CONFIG, Build, CILabels, CIStages, JobNames, StatusNames
+from ci_metadata import CiMetadata
 from ci_utils import GHActions, is_hex, normalize_string
 from clickhouse_helper import (
    CiLogsCredentials,
@ -39,22 +40,23 @@ from digest_helper import DockerDigester, JobDigester
 from env_helper import (
    CI,
    GITHUB_JOB_API_URL,
+    GITHUB_REPOSITORY,
+    GITHUB_RUN_ID,
    GITHUB_RUN_URL,
    REPO_COPY,
    REPORT_PATH,
    S3_BUILDS_BUCKET,
    TEMP_PATH,
-    GITHUB_RUN_ID,
-    GITHUB_REPOSITORY,
 )
 from get_robot_token import get_best_robot_token
 from git_helper import GIT_PREFIX, Git
 from git_helper import Runner as GitRunner
 from github_helper import GitHub
 from pr_info import PRInfo
-from report import ERROR, SUCCESS, BuildResult, JobReport, PENDING
+from report import ERROR, FAILURE, PENDING, SUCCESS, BuildResult, JobReport, TestResult
 from s3_helper import S3Helper
-from ci_metadata import CiMetadata
+from stopwatch import Stopwatch
+from tee_popen import TeePopen
 from version_helper import get_version_from_repo

 # pylint: disable=too-many-lines
@ -1868,8 +1870,7 @@ def _run_test(job_name: str, run_command: str) -> int:
    ), "Run command must be provided as input argument or be configured in job config"

    env = os.environ.copy()
-    if CI_CONFIG.get_job_config(job_name).timeout:
-        env["KILL_TIMEOUT"] = str(CI_CONFIG.get_job_config(job_name).timeout)
+    timeout = CI_CONFIG.get_job_config(job_name).timeout or None

    if not run_command:
        run_command = "/".join(
@ -1882,25 +1883,25 @@ def _run_test(job_name: str, run_command: str) -> int:
        print("Use run command from the workflow")
    env["CHECK_NAME"] = job_name
    print(f"Going to start run command [{run_command}]")
-    process = subprocess.run(
-        run_command,
-        stdout=sys.stdout,
-        stderr=sys.stderr,
-        env=env,
-        text=True,
-        check=False,
-        shell=True,
-    )
+    stopwatch = Stopwatch()
+    job_log = Path(TEMP_PATH) / "job_log.txt"
+    with TeePopen(run_command, job_log, env, timeout) as process:
+        retcode = process.wait()
+        if retcode != 0:
+            print(f"Run action failed for: [{job_name}] with exit code [{retcode}]")
+            if timeout and process.timeout_exceeded:
+                print(f"Timeout {timeout} exceeded, dumping the job report")
+                JobReport(
+                    status=FAILURE,
+                    description=f"Timeout {timeout} exceeded",
+                    test_results=[TestResult.create_check_timeout_expired(timeout)],
+                    start_time=stopwatch.start_time_str,
+                    duration=stopwatch.duration_seconds,
+                    additional_files=[job_log],
+                ).dump()

-    if process.returncode == 0:
-        print(f"Run action done for: [{job_name}]")
-        exit_code = 0
-    else:
-        print(
-            f"Run action failed for: [{job_name}] with exit code [{process.returncode}]"
-        )
-        exit_code = process.returncode
-    return exit_code
+    print(f"Run action done for: [{job_name}]")
+    return retcode


 def _get_ext_check_name(check_name: str) -> str:
--- a/tests/ci/ci_config.py
+++ b/tests/ci/ci_config.py
@ -472,12 +472,12 @@ compatibility_test_common_params = {
 }
 stateless_test_common_params = {
    "digest": stateless_check_digest,
-    "run_command": 'functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT',
+    "run_command": 'functional_test_check.py "$CHECK_NAME"',
    "timeout": 10800,
 }
 stateful_test_common_params = {
    "digest": stateful_check_digest,
-    "run_command": 'functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT',
+    "run_command": 'functional_test_check.py "$CHECK_NAME"',
    "timeout": 3600,
 }
 stress_test_common_params = {
--- a/tests/ci/ci_utils.py
+++ b/tests/ci/ci_utils.py
@ -1,8 +1,7 @@
-from contextlib import contextmanager
 import os
-import signal
-from typing import Any, List, Union, Iterator
+from contextlib import contextmanager
 from pathlib import Path
+from typing import Any, Iterator, List, Union


 class WithIter(type):
@ -49,14 +48,3 @@ class GHActions:
        for line in lines:
            print(line)
        print("::endgroup::")
-
-
-def set_job_timeout():
-    def timeout_handler(_signum, _frame):
-        print("Timeout expired")
-        raise TimeoutError("Job's KILL_TIMEOUT expired")
-
-    kill_timeout = int(os.getenv("KILL_TIMEOUT", "0"))
-    assert kill_timeout > 0, "kill timeout must be provided in KILL_TIMEOUT env"
-    signal.signal(signal.SIGALRM, timeout_handler)
-    signal.alarm(kill_timeout)
--- a/tests/ci/functional_test_check.py
+++ b/tests/ci/functional_test_check.py
@ -68,7 +68,6 @@ def get_run_command(
    repo_path: Path,
    result_path: Path,
    server_log_path: Path,
-    kill_timeout: int,
    additional_envs: List[str],
    ci_logs_args: str,
    image: DockerImage,
@ -86,7 +85,6 @@ def get_run_command(
    )

    envs = [
-        f"-e MAX_RUN_TIME={int(0.9 * kill_timeout)}",
        # a static link, don't use S3_URL or S3_DOWNLOAD
        '-e S3_URL="https://s3.amazonaws.com/clickhouse-datasets"',
    ]
@ -192,7 +190,6 @@ def process_results(
 def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("check_name")
-    parser.add_argument("kill_timeout", type=int)
    parser.add_argument(
        "--validate-bugfix",
        action="store_true",
@ -224,12 +221,7 @@ def main():
    assert (
        check_name
    ), "Check name must be provided as an input arg or in CHECK_NAME env"
-    kill_timeout = args.kill_timeout or int(os.getenv("KILL_TIMEOUT", "0"))
-    assert (
-        kill_timeout > 0
-    ), "kill timeout must be provided as an input arg or in KILL_TIMEOUT env"
    validate_bugfix_check = args.validate_bugfix
-    print(f"Runnin check [{check_name}] with timeout [{kill_timeout}]")

    flaky_check = "flaky" in check_name.lower()

@ -288,7 +280,6 @@ def main():
            repo_path,
            result_path,
            server_log_path,
-            kill_timeout,
            additional_envs,
            ci_logs_args,
            docker_image,
--- a/tests/ci/install_check.py
+++ b/tests/ci/install_check.py
@ -1,25 +1,21 @@
 #!/usr/bin/env python3

 import argparse
-
 import logging
-import sys
 import subprocess
+import sys
 from pathlib import Path
 from shutil import copy2
 from typing import Dict

-
 from build_download_helper import download_builds_filter
-
 from compress_files import compress_fast
-from docker_images_helper import DockerImage, pull_image, get_docker_image
-from env_helper import CI, REPORT_PATH, TEMP_PATH as TEMP
-from report import JobReport, TestResults, TestResult, FAILURE, FAIL, OK, SUCCESS
+from docker_images_helper import DockerImage, get_docker_image, pull_image
+from env_helper import REPORT_PATH
+from env_helper import TEMP_PATH as TEMP
+from report import FAIL, FAILURE, OK, SUCCESS, JobReport, TestResult, TestResults
 from stopwatch import Stopwatch
 from tee_popen import TeePopen
-from ci_utils import set_job_timeout
-

 RPM_IMAGE = "clickhouse/install-rpm-test"
 DEB_IMAGE = "clickhouse/install-deb-test"
@ -256,9 +252,6 @@ def main():

    args = parse_args()

-    if CI:
-        set_job_timeout()
-
    TEMP_PATH.mkdir(parents=True, exist_ok=True)
    LOGS_PATH.mkdir(parents=True, exist_ok=True)

--- a/tests/ci/sqllogic_test.py
+++ b/tests/ci/sqllogic_test.py
@ -9,8 +9,8 @@ from pathlib import Path
 from typing import Tuple

 from build_download_helper import download_all_deb_packages
-from docker_images_helper import DockerImage, pull_image, get_docker_image
-from env_helper import REPORT_PATH, TEMP_PATH, REPO_COPY
+from docker_images_helper import DockerImage, get_docker_image, pull_image
+from env_helper import REPO_COPY, REPORT_PATH, TEMP_PATH
 from report import (
    ERROR,
    FAIL,
@ -72,11 +72,6 @@ def parse_args() -> argparse.Namespace:
        required=False,
        default="",
    )
-    parser.add_argument(
-        "--kill-timeout",
-        required=False,
-        default=0,
-    )
    return parser.parse_args()


@ -96,10 +91,6 @@ def main():
    assert (
        check_name
    ), "Check name must be provided as an input arg or in CHECK_NAME env"
-    kill_timeout = args.kill_timeout or int(os.getenv("KILL_TIMEOUT", "0"))
-    assert (
-        kill_timeout > 0
-    ), "kill timeout must be provided as an input arg or in KILL_TIMEOUT env"

    docker_image = pull_image(get_docker_image(IMAGE_NAME))

@ -127,7 +118,7 @@ def main():
    )
    logging.info("Going to run func tests: %s", run_command)

-    with TeePopen(run_command, run_log_path, timeout=kill_timeout) as process:
+    with TeePopen(run_command, run_log_path) as process:
        retcode = process.wait()
        if retcode == 0:
            logging.info("Run successfully")