CI: Functional tests to store artifacts on timeout

2024-09-19 16:20:50 +00:00 · 2024-08-06 21:39:41 +02:00 · 2024-08-06 21:39:41 +02:00 · 427016a450
commit 427016a450
parent 8d0d870326
13 changed files with 107 additions and 116 deletions
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@ -256,22 +256,6 @@ function configure
    rm -f "$FASTTEST_DATA/config.d/secure_ports.xml"
 }

-function timeout_with_logging() {
-    local exit_code=0
-
-    timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
-
-    echo "Checking if it is a timeout. The code 124 will indicate a timeout."
-    if [[ "${exit_code}" -eq "124" ]]
-    then
-        echo "The command 'timeout ${*}' has been killed by timeout."
-    else
-        echo "No, it isn't a timeout."
-    fi
-
-    return $exit_code
-}
-
 function run_tests
 {
    clickhouse-server --version
@ -340,7 +324,7 @@ case "$stage" in
    configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt"
    ;&
 "run_tests")
-    timeout_with_logging 35m bash -c run_tests ||:
+    run_tests ||:
    /process_functional_tests_result.py --in-results-dir "$FASTTEST_OUTPUT/" \
        --out-results-file "$FASTTEST_OUTPUT/test_results.tsv" \
        --out-status-file "$FASTTEST_OUTPUT/check_status.tsv" || echo -e "failure\tCannot parse results" > "$FASTTEST_OUTPUT/check_status.tsv"
--- a/docker/test/sqllogic/Dockerfile
+++ b/docker/test/sqllogic/Dockerfile
@ -35,7 +35,6 @@ RUN mkdir -p /tmp/clickhouse-odbc-tmp \


 ENV TZ=Europe/Amsterdam
-ENV MAX_RUN_TIME=9000
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

 ARG sqllogic_test_repo="https://github.com/gregrahn/sqllogictest.git"
--- a/docker/test/sqllogic/run.sh
+++ b/docker/test/sqllogic/run.sh
@ -94,7 +94,7 @@ function run_tests()

 export -f run_tests

-timeout "${MAX_RUN_TIME:-9000}" bash -c run_tests || echo "timeout reached" >&2
+run_tests

 #/process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv

--- a/docker/test/sqltest/Dockerfile
+++ b/docker/test/sqltest/Dockerfile
@ -22,7 +22,6 @@ ARG sqltest_repo="https://github.com/elliotchance/sqltest/"
 RUN git clone ${sqltest_repo}

 ENV TZ=UTC
-ENV MAX_RUN_TIME=900
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

 COPY run.sh /
--- a/docker/test/stateful/run.sh
+++ b/docker/test/stateful/run.sh
@ -4,9 +4,6 @@
 source /setup_export_logs.sh
 set -e -x

-MAX_RUN_TIME=${MAX_RUN_TIME:-3600}
-MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 3600 : MAX_RUN_TIME))
-
 # Choose random timezone for this test run
 TZ="$(rg -v '#' /usr/share/zoneinfo/zone.tab | awk '{print $3}' | shuf | head -n1)"
 echo "Choosen random timezone $TZ"
@ -123,9 +120,6 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]

    clickhouse-client --query "DROP TABLE datasets.hits_v1"
    clickhouse-client --query "DROP TABLE datasets.visits_v1"
-
-    MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000))  # min(MAX_RUN_TIME, 2.5 hours)
-    MAX_RUN_TIME=$((MAX_RUN_TIME != 0 ? MAX_RUN_TIME : 9000))    # set to 2.5 hours if 0 (unlimited)
 else
    clickhouse-client --query "CREATE DATABASE test"
    clickhouse-client --query "SHOW TABLES FROM test"
@ -257,24 +251,7 @@ function run_tests()

 export -f run_tests

-function timeout_with_logging() {
-    local exit_code=0
-
-    timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
-
-    echo "Checking if it is a timeout. The code 124 will indicate a timeout."
-    if [[ "${exit_code}" -eq "124" ]]
-    then
-        echo "The command 'timeout ${*}' has been killed by timeout."
-    else
-        echo "No, it isn't a timeout."
-    fi
-
-    return $exit_code
-}
-
-TIMEOUT=$((MAX_RUN_TIME - 700))
-timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
+run_tests ||:

 echo "Files in current directory"
 ls -la ./
--- a/docker/test/stateless/Dockerfile
+++ b/docker/test/stateless/Dockerfile
@ -65,7 +65,6 @@ ENV TZ=Europe/Amsterdam
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

 ENV NUM_TRIES=1
-ENV MAX_RUN_TIME=0

 # Unrelated to vars in setup_minio.sh, but should be the same there
 # to have the same binaries for local running scenario
--- a/docker/test/stateless/run.sh
+++ b/docker/test/stateless/run.sh
@ -12,9 +12,6 @@ dmesg --clear
 # fail on errors, verbose and export all env variables
 set -e -x -a

-MAX_RUN_TIME=${MAX_RUN_TIME:-9000}
-MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 9000 : MAX_RUN_TIME))
-
 USE_DATABASE_REPLICATED=${USE_DATABASE_REPLICATED:=0}
 USE_SHARED_CATALOG=${USE_SHARED_CATALOG:=0}

@ -308,8 +305,6 @@ function run_tests()

    try_run_with_retry 10 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')"

-    TIMEOUT=$((MAX_RUN_TIME - 800 > 8400 ? 8400 : MAX_RUN_TIME - 800))
-    START_TIME=${SECONDS}
    set +e

    TEST_ARGS=(
@ -324,32 +319,22 @@ function run_tests()
        --test-runs "$NUM_TRIES"
        "${ADDITIONAL_OPTIONS[@]}"
    )
-    timeout --preserve-status --signal TERM --kill-after 60m ${TIMEOUT}s clickhouse-test "${TEST_ARGS[@]}" 2>&1 \
+    clickhouse-test "${TEST_ARGS[@]}" 2>&1 \
        | ts '%Y-%m-%d %H:%M:%S' \
        | tee -a test_output/test_result.txt
    set -e
-    DURATION=$((SECONDS - START_TIME))
-
-    echo "Elapsed ${DURATION} seconds."
-    if [[ $DURATION -ge $TIMEOUT ]]
-    then
-        echo "It looks like the command is terminated by the timeout, which is ${TIMEOUT} seconds."
-    fi
 }

 export -f run_tests

-
-# This should be enough to setup job and collect artifacts
-TIMEOUT=$((MAX_RUN_TIME - 700))
 if [ "$NUM_TRIES" -gt "1" ]; then
    # We don't run tests with Ordinary database in PRs, only in master.
    # So run new/changed tests with Ordinary at least once in flaky check.
-    timeout_with_logging "$TIMEOUT" bash -c 'NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests' \
+    NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests \
      | sed 's/All tests have finished/Redacted: a message about tests finish is deleted/' | sed 's/No tests were run/Redacted: a message about no tests run is deleted/' ||:
 fi

-timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
+run_tests ||:

 echo "Files in current directory"
 ls -la ./
--- a/docker/test/stateless/utils.lib
+++ b/docker/test/stateless/utils.lib
@ -40,22 +40,6 @@ function fn_exists() {
    declare -F "$1" > /dev/null;
 }

-function timeout_with_logging() {
-    local exit_code=0
-
-    timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
-
-    echo "Checking if it is a timeout. The code 124 will indicate a timeout."
-    if [[ "${exit_code}" -eq "124" ]]
-    then
-        echo "The command 'timeout ${*}' has been killed by timeout."
-    else
-        echo "No, it isn't a timeout."
-    fi
-
-    return $exit_code
-}
-
 function collect_core_dumps()
 {
  find . -type f -maxdepth 1 -name 'core.*' | while read -r core; do
--- a/tests/ci/ci.py
+++ b/tests/ci/ci.py
@ -50,7 +50,6 @@ from github_helper import GitHub
 from pr_info import PRInfo
 from report import (
    ERROR,
-    FAILURE,
    PENDING,
    SUCCESS,
    BuildResult,
@ -62,11 +61,11 @@ from report import (
    FAIL,
 )
 from s3_helper import S3Helper
-from stopwatch import Stopwatch
 from tee_popen import TeePopen
 from ci_cache import CiCache
 from ci_settings import CiSettings
 from ci_buddy import CIBuddy
+from stopwatch import Stopwatch
 from version_helper import get_version_from_repo

 # pylint: disable=too-many-lines
@ -370,8 +369,8 @@ def _pre_action(s3, job_name, batch, indata, pr_info):
                # skip_status = SUCCESS already there
                GH.print_in_group("Commit Status Data", job_status)

-    # create pre report
-    jr = JobReport.create_pre_report(status=skip_status, job_skipped=to_be_skipped)
+    # create dummy report
+    jr = JobReport.create_dummy(status=skip_status, job_skipped=to_be_skipped)
    jr.dump()

    if not to_be_skipped:
@ -990,19 +989,20 @@ def _run_test(job_name: str, run_command: str) -> int:
    stopwatch = Stopwatch()
    job_log = Path(TEMP_PATH) / "job_log.txt"
    with TeePopen(run_command, job_log, env, timeout) as process:
+        print(f"Job process started, pid [{process.process.pid}]")
        retcode = process.wait()
        if retcode != 0:
            print(f"Run action failed for: [{job_name}] with exit code [{retcode}]")
-            if timeout and process.timeout_exceeded:
-                print(f"Timeout {timeout} exceeded, dumping the job report")
-                JobReport(
-                    status=FAILURE,
-                    description=f"Timeout {timeout} exceeded",
-                    test_results=[TestResult.create_check_timeout_expired(timeout)],
-                    start_time=stopwatch.start_time_str,
-                    duration=stopwatch.duration_seconds,
-                    additional_files=[job_log],
-                ).dump()
+        if process.timeout_exceeded:
+            print(f"Job timed out: [{job_name}] exit code [{retcode}]")
+            assert JobReport.exist(), "JobReport real or dummy must be present"
+            jr = JobReport.load()
+            if jr.dummy:
+                print(
+                    f"ERROR: Run action failed with timeout and did not generate JobReport - update dummy report with execution time"
+                )
+                jr.test_results = [TestResult.create_check_timeout_expired()]
+                jr.duration = stopwatch.duration_seconds

    print(f"Run action done for: [{job_name}]")
    return retcode
@ -1205,7 +1205,7 @@ def main() -> int:
            job_report
        ), "BUG. There must be job report either real report, or pre-report if job was killed"
        error_description = ""
-        if not job_report.pre_report:
+        if not job_report.dummy:
            # it's a real job report
            ch_helper = ClickHouseHelper()
            check_url = ""
--- a/tests/ci/ci_definitions.py
+++ b/tests/ci/ci_definitions.py
@ -332,7 +332,7 @@ class JobConfig:
    # will be triggered for the job if omitted in CI workflow yml
    run_command: str = ""
    # job timeout, seconds
-    timeout: Optional[int] = None
+    timeout: int = 7200
    # sets number of batches for a multi-batch job
    num_batches: int = 1
    # label that enables job in CI, if set digest isn't used
@ -421,7 +421,6 @@ class CommonJobConfigs:
        ),
        run_command='functional_test_check.py "$CHECK_NAME"',
        runner_type=Runners.FUNC_TESTER,
-        timeout=9000,
    )
    STATEFUL_TEST = JobConfig(
        job_name_keyword="stateful",
--- a/tests/ci/functional_test_check.py
+++ b/tests/ci/functional_test_check.py
@ -5,10 +5,11 @@ import csv
 import logging
 import os
 import re
+import signal
 import subprocess
 import sys
 from pathlib import Path
-from typing import List, Tuple
+from typing import List, Tuple, Optional

 from build_download_helper import download_all_deb_packages
 from clickhouse_helper import CiLogsCredentials
@ -25,11 +26,12 @@ from report import (
    TestResults,
    read_test_results,
    FAILURE,
+    TestResult,
 )
 from stopwatch import Stopwatch
 from tee_popen import TeePopen
 from ci_config import CI
-from ci_utils import Utils
+from ci_utils import Utils, Shell

 NO_CHANGES_MSG = "Nothing to run"

@ -113,10 +115,6 @@ def get_run_command(

    if flaky_check:
        envs.append("-e NUM_TRIES=50")
-        envs.append("-e MAX_RUN_TIME=2800")
-    else:
-        max_run_time = os.getenv("MAX_RUN_TIME", "0")
-        envs.append(f"-e MAX_RUN_TIME={max_run_time}")

    envs += [f"-e {e}" for e in additional_envs]

@ -128,7 +126,7 @@ def get_run_command(
    )

    return (
-        f"docker run --volume={builds_path}:/package_folder "
+        f"docker run --rm --name func-tester --volume={builds_path}:/package_folder "
        # For dmesg and sysctl
        "--privileged "
        f"{ci_logs_args}"
@ -198,7 +196,7 @@ def process_results(
    state, description = status[0][0], status[0][1]
    if ret_code != 0:
        state = ERROR
-        description += " (but script exited with an error)"
+        description = f"Job failed, exit code: {ret_code}. " + description

    try:
        results_path = result_directory / "test_results.tsv"
@ -240,7 +238,19 @@ def parse_args():
    return parser.parse_args()


+test_process = None  # type: Optional[TeePopen]
+timeout_expired = False
+
+
+def handle_sigterm(signum, _frame):
+    print(f"WARNING: Received signal {signum}")
+    global timeout_expired
+    timeout_expired = True
+    Shell.check(f"docker exec func-tester pkill -f clickhouse-test", verbose=True)
+
+
 def main():
+    signal.signal(signal.SIGTERM, handle_sigterm)
    logging.basicConfig(level=logging.INFO)
    for handler in logging.root.handlers:
        # pylint: disable=protected-access
@ -328,11 +338,13 @@ def main():
        logging.info("Going to run func tests: %s", run_command)

        with TeePopen(run_command, run_log_path) as process:
+            global test_process
+            test_process = process
            retcode = process.wait()
            if retcode == 0:
                logging.info("Run successfully")
            else:
-                logging.info("Run failed")
+                logging.info("Run failed, exit code %s", retcode)

        try:
            subprocess.check_call(
@ -348,6 +360,13 @@ def main():
        state, description, test_results, additional_logs = process_results(
            retcode, result_path, server_log_path
        )
+        if timeout_expired:
+            description = "Timeout expired"
+            state = FAILURE
+            test_results.insert(
+                0, TestResult.create_check_timeout_expired(stopwatch.duration_seconds)
+            )
+
    else:
        print(
            "This is validate bugfix or flaky check run, but no changes test to run - skip with success"
--- a/tests/ci/report.py
+++ b/tests/ci/report.py
@ -249,6 +249,7 @@ JOB_REPORT_FILE = Path(GITHUB_WORKSPACE) / "job_report.json"

 JOB_STARTED_TEST_NAME = "STARTED"
 JOB_FINISHED_TEST_NAME = "COMPLETED"
+JOB_TIMEOUT_TEST_NAME = "Job Timeout Expired"


@dataclass
@ -277,8 +278,8 @@ class TestResult:
            self.log_files.append(log_path)

    @staticmethod
-    def create_check_timeout_expired(timeout: float) -> "TestResult":
-        return TestResult("Check timeout expired", "FAIL", timeout)
+    def create_check_timeout_expired(duration: Optional[float] = None) -> "TestResult":
+        return TestResult(JOB_TIMEOUT_TEST_NAME, "FAIL", time=duration)


 TestResults = List[TestResult]
@ -303,7 +304,7 @@ class JobReport:
    # indicates that this is not real job report but report for the job that was skipped by rerun check
    job_skipped: bool = False
    # indicates that report generated by CI script in order to check later if job was killed before real report is generated
-    pre_report: bool = False
+    dummy: bool = False
    exit_code: int = -1

    @staticmethod
@ -311,7 +312,7 @@ class JobReport:
        return datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

    @classmethod
-    def create_pre_report(cls, status: str, job_skipped: bool) -> "JobReport":
+    def create_dummy(cls, status: str, job_skipped: bool) -> "JobReport":
        return JobReport(
            status=status,
            description="",
@ -320,7 +321,7 @@ class JobReport:
            duration=0.0,
            additional_files=[],
            job_skipped=job_skipped,
-            pre_report=True,
+            dummy=True,
        )

    def update_duration(self):
--- a/tests/ci/tee_popen.py
+++ b/tests/ci/tee_popen.py
@ -2,6 +2,8 @@

 import logging
 import os
+import signal
+import subprocess
 import sys
 from io import TextIOWrapper
 from pathlib import Path
@ -30,20 +32,35 @@ class TeePopen:
        self._process = None  # type: Optional[Popen]
        self.timeout = timeout
        self.timeout_exceeded = False
+        self.terminated_by_sigterm = False
+        self.terminated_by_sigkill = False
+        self.pid = 0

    def _check_timeout(self) -> None:
        if self.timeout is None:
            return
        sleep(self.timeout)
+        logging.warning(
+            "Timeout exceeded. Send SIGTERM to process %s, timeout %s",
+            self.process.pid,
+            self.timeout,
+        )
+        self.send_signal(signal.SIGTERM)
+        time_wait = 0
+        self.terminated_by_sigterm = True
        self.timeout_exceeded = True
+        while self.process.poll() is None and time_wait < 100:
+            print("wait...")
+            wait = 5
+            sleep(wait)
+            time_wait += wait
        while self.process.poll() is None:
-            logging.warning(
-                "Killing process %s, timeout %s exceeded",
-                self.process.pid,
-                self.timeout,
+            logging.error(
+                "Process is still running. Send SIGKILL",
            )
-            os.killpg(self.process.pid, 9)
-            sleep(10)
+            self.send_signal(signal.SIGKILL)
+            self.terminated_by_sigkill = True
+            sleep(5)

    def __enter__(self) -> "TeePopen":
        self.process = Popen(
@ -57,6 +74,9 @@ class TeePopen:
            bufsize=1,
            errors="backslashreplace",
        )
+        sleep(1)
+        self.pid = self._get_child_pid()
+        print(f"Subprocess started, pid [{self.process.pid}], child pid [{self.pid}]")
        if self.timeout is not None and self.timeout > 0:
            t = Thread(target=self._check_timeout)
            t.daemon = True  # does not block the program from exit
@ -77,6 +97,22 @@ class TeePopen:

        self.log_file.close()

+    def _get_child_pid(self):
+        # linux only
+        ps_command = f"ps --ppid {self.process.pid} -o pid="
+        res = "NA"
+        try:
+            result = subprocess.run(
+                ps_command, shell=True, capture_output=True, text=True
+            )
+            res = result.stdout.strip()
+            pid = int(res)
+            return pid
+        except Exception as e:
+            print(f"Failed to get child's pid, command [{ps_command}], result [{res}]")
+            print(f"ERROR: getting Python subprocess PID: {e}")
+            return self.process.pid
+
    def wait(self) -> int:
        if self.process.stdout is not None:
            for line in self.process.stdout:
@ -85,6 +121,15 @@ class TeePopen:

        return self.process.wait()

+    def poll(self):
+        return self.process.poll()
+
+    def send_signal(self, signal_num):
+        if self.pid:
+            os.kill(self.pid, signal_num)
+        else:
+            print("ERROR: no process to send signal")
+
    @property
    def process(self) -> Popen:
        if self._process is not None: