From 99ede620bec26b9b3922abc3225794a1afb3dda7 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 12 Sep 2024 15:24:25 +0200 Subject: [PATCH 1/2] Add `kill_ci_runner` to ci_utils, will allow restarts --- tests/ci/ci_utils.py | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/tests/ci/ci_utils.py b/tests/ci/ci_utils.py index 8b60f61b006..86fa1c008c9 100644 --- a/tests/ci/ci_utils.py +++ b/tests/ci/ci_utils.py @@ -1,4 +1,5 @@ import json +import logging import os import re import subprocess @@ -6,10 +7,12 @@ import sys import time from contextlib import contextmanager from pathlib import Path -from typing import Any, Iterator, List, Union, Optional, Sequence +from typing import Any, Dict, Iterator, List, Optional, Sequence, Tuple, Union import requests +logger = logging.getLogger(__name__) + class Envs: GITHUB_REPOSITORY = os.getenv("GITHUB_REPOSITORY", "ClickHouse/ClickHouse") @@ -36,6 +39,34 @@ def cd(path: Union[Path, str]) -> Iterator[None]: os.chdir(oldpwd) +def kill_ci_runner(message: str) -> None: + """The function to kill the current process with all parents when it's possible. + Works only when run with the set `CI` environment""" + if not os.getenv("CI", ""): # cycle import env_helper + logger.info("Running outside the CI, won't kill the runner") + return + print(f"::error::{message}") + + def get_ppid_name(pid: int) -> Tuple[int, str]: + # Avoid using psutil, it's not in stdlib + stats = Path(f"/proc/{pid}/stat").read_text(encoding="utf-8").split() + return int(stats[3]), stats[1] + + pid = os.getpid() + pids = {} # type: Dict[str, str] + while pid: + ppid, name = get_ppid_name(pid) + pids[str(pid)] = name + pid = ppid + logger.error( + "Sleeping 5 seconds and killing all possible processes from following:\n %s", + "\n ".join(f"{p}: {n}" for p, n in pids.items()), + ) + time.sleep(5) + # The current process will be killed too + subprocess.run(f"kill -9 {' '.join(pids.keys())}", check=False, shell=True) + + class GH: class ActionsNames: RunConfig = "RunConfig" From 8d5babf65fe78449e1bf3e3979f3fb7d09d18708 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 12 Sep 2024 15:26:21 +0200 Subject: [PATCH 2/2] Kill the runner process if integration tests fail to pre-pull --- tests/ci/integration_tests_runner.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/ci/integration_tests_runner.py b/tests/ci/integration_tests_runner.py index 6405492cd9c..21ca1649dc9 100755 --- a/tests/ci/integration_tests_runner.py +++ b/tests/ci/integration_tests_runner.py @@ -19,11 +19,12 @@ from collections import defaultdict from itertools import chain from typing import Any, Dict, Optional +from ci_utils import kill_ci_runner from env_helper import IS_CI from integration_test_images import IMAGES -from tee_popen import TeePopen from report import JOB_TIMEOUT_TEST_NAME from stopwatch import Stopwatch +from tee_popen import TeePopen MAX_RETRY = 1 NUM_WORKERS = 5 @@ -332,7 +333,9 @@ class ClickhouseIntegrationTestsRunner: except subprocess.CalledProcessError as err: logging.info("docker-compose pull failed: %s", str(err)) continue - logging.error("Pulling images failed for 5 attempts. Will fail the worker.") + message = "Pulling images failed for 5 attempts. Will fail the worker." + logging.error(message) + kill_ci_runner(message) # We pass specific retcode to to ci/integration_test_check.py to skip status reporting and restart job sys.exit(13)