CI: Functional tests to store artifacts on timeout

2024-09-20 00:30:49 +00:00 · 2024-08-06 21:39:41 +02:00 · 2024-08-06 21:39:41 +02:00 · 427016a450
commit 427016a450
parent 8d0d870326
13 changed files with 107 additions and 116 deletions
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@ -256,22 +256,6 @@ function configure
    rm -f "$FASTTEST_DATA/config.d/secure_ports.xml"
 }
 function timeout_with_logging() {
    local exit_code=0
    timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
    echo "Checking if it is a timeout. The code 124 will indicate a timeout."
    if [[ "${exit_code}" -eq "124" ]]
    then
        echo "The command 'timeout ${*}' has been killed by timeout."
    else
        echo "No, it isn't a timeout."
    fi
    return $exit_code
 }
 function run_tests
 {
    clickhouse-server --version
@ -340,7 +324,7 @@ case "$stage" in
    configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt"
    ;&
 "run_tests")
-    timeout_with_logging 35m bash -c run_tests ||:
+    run_tests ||:
    /process_functional_tests_result.py --in-results-dir "$FASTTEST_OUTPUT/" \
        --out-results-file "$FASTTEST_OUTPUT/test_results.tsv" \
        --out-status-file "$FASTTEST_OUTPUT/check_status.tsv" || echo -e "failure\tCannot parse results" > "$FASTTEST_OUTPUT/check_status.tsv"
--- a/docker/test/sqllogic/Dockerfile
+++ b/docker/test/sqllogic/Dockerfile
@ -35,7 +35,6 @@ RUN mkdir -p /tmp/clickhouse-odbc-tmp \
 ENV TZ=Europe/Amsterdam
 ENV MAX_RUN_TIME=9000
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 ARG sqllogic_test_repo="https://github.com/gregrahn/sqllogictest.git"
--- a/docker/test/sqllogic/run.sh
+++ b/docker/test/sqllogic/run.sh
@ -94,7 +94,7 @@ function run_tests()
 export -f run_tests
-timeout "${MAX_RUN_TIME:-9000}" bash -c run_tests || echo "timeout reached" >&2
+run_tests
 #/process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv
--- a/docker/test/sqltest/Dockerfile
+++ b/docker/test/sqltest/Dockerfile
@ -22,7 +22,6 @@ ARG sqltest_repo="https://github.com/elliotchance/sqltest/"
 RUN git clone ${sqltest_repo}
 ENV TZ=UTC
 ENV MAX_RUN_TIME=900
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 COPY run.sh /
--- a/docker/test/stateful/run.sh
+++ b/docker/test/stateful/run.sh
@ -4,9 +4,6 @@
 source /setup_export_logs.sh
 set -e -x
 MAX_RUN_TIME=${MAX_RUN_TIME:-3600}
 MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 3600 : MAX_RUN_TIME))
 # Choose random timezone for this test run
 TZ="$(rg -v '#' /usr/share/zoneinfo/zone.tab | awk '{print $3}' | shuf | head -n1)"
 echo "Choosen random timezone $TZ"
@ -123,9 +120,6 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]
    clickhouse-client --query "DROP TABLE datasets.hits_v1"
    clickhouse-client --query "DROP TABLE datasets.visits_v1"
    MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000))  # min(MAX_RUN_TIME, 2.5 hours)
    MAX_RUN_TIME=$((MAX_RUN_TIME != 0 ? MAX_RUN_TIME : 9000))    # set to 2.5 hours if 0 (unlimited)
 else
    clickhouse-client --query "CREATE DATABASE test"
    clickhouse-client --query "SHOW TABLES FROM test"
@ -257,24 +251,7 @@ function run_tests()
 export -f run_tests
-function timeout_with_logging() {
+run_tests ||:
    local exit_code=0
    timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
    echo "Checking if it is a timeout. The code 124 will indicate a timeout."
    if [[ "${exit_code}" -eq "124" ]]
    then
        echo "The command 'timeout ${*}' has been killed by timeout."
    else
        echo "No, it isn't a timeout."
    fi
    return $exit_code
 }
 TIMEOUT=$((MAX_RUN_TIME - 700))
 timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
 echo "Files in current directory"
 ls -la ./
--- a/docker/test/stateless/Dockerfile
+++ b/docker/test/stateless/Dockerfile
@ -65,7 +65,6 @@ ENV TZ=Europe/Amsterdam
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 ENV NUM_TRIES=1
 ENV MAX_RUN_TIME=0
 # Unrelated to vars in setup_minio.sh, but should be the same there
 # to have the same binaries for local running scenario
--- a/docker/test/stateless/run.sh
+++ b/docker/test/stateless/run.sh
@ -12,9 +12,6 @@ dmesg --clear
 # fail on errors, verbose and export all env variables
 set -e -x -a
 MAX_RUN_TIME=${MAX_RUN_TIME:-9000}
 MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 9000 : MAX_RUN_TIME))
 USE_DATABASE_REPLICATED=${USE_DATABASE_REPLICATED:=0}
 USE_SHARED_CATALOG=${USE_SHARED_CATALOG:=0}
@ -308,8 +305,6 @@ function run_tests()
    try_run_with_retry 10 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')"
    TIMEOUT=$((MAX_RUN_TIME - 800 > 8400 ? 8400 : MAX_RUN_TIME - 800))
    START_TIME=${SECONDS}
    set +e
    TEST_ARGS=(
@ -324,32 +319,22 @@ function run_tests()
        --test-runs "$NUM_TRIES"
        "${ADDITIONAL_OPTIONS[@]}"
    )
-    timeout --preserve-status --signal TERM --kill-after 60m ${TIMEOUT}s clickhouse-test "${TEST_ARGS[@]}" 2>&1 \
+    clickhouse-test "${TEST_ARGS[@]}" 2>&1 \
        | ts '%Y-%m-%d %H:%M:%S' \
        | tee -a test_output/test_result.txt
    set -e
    DURATION=$((SECONDS - START_TIME))
    echo "Elapsed ${DURATION} seconds."
    if [[ $DURATION -ge $TIMEOUT ]]
    then
        echo "It looks like the command is terminated by the timeout, which is ${TIMEOUT} seconds."
    fi
 }
 export -f run_tests
 # This should be enough to setup job and collect artifacts
 TIMEOUT=$((MAX_RUN_TIME - 700))
 if [ "$NUM_TRIES" -gt "1" ]; then
    # We don't run tests with Ordinary database in PRs, only in master.
    # So run new/changed tests with Ordinary at least once in flaky check.
-    timeout_with_logging "$TIMEOUT" bash -c 'NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests' \
+    NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests \
      | sed 's/All tests have finished/Redacted: a message about tests finish is deleted/' | sed 's/No tests were run/Redacted: a message about no tests run is deleted/' ||:
 fi
-timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
+run_tests ||:
 echo "Files in current directory"
 ls -la ./
--- a/docker/test/stateless/utils.lib
+++ b/docker/test/stateless/utils.lib
@ -40,22 +40,6 @@ function fn_exists() {
    declare -F "$1" > /dev/null;
 }
 function timeout_with_logging() {
    local exit_code=0
    timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
    echo "Checking if it is a timeout. The code 124 will indicate a timeout."
    if [[ "${exit_code}" -eq "124" ]]
    then
        echo "The command 'timeout ${*}' has been killed by timeout."
    else
        echo "No, it isn't a timeout."
    fi
    return $exit_code
 }
 function collect_core_dumps()
 {
  find . -type f -maxdepth 1 -name 'core.*' | while read -r core; do
--- a/tests/ci/ci.py
+++ b/tests/ci/ci.py
@ -50,7 +50,6 @@ from github_helper import GitHub
 from pr_info import PRInfo
 from report import (
    ERROR,
    FAILURE,
    PENDING,
    SUCCESS,
    BuildResult,
@ -62,11 +61,11 @@ from report import (
    FAIL,
 )
 from s3_helper import S3Helper
 from stopwatch import Stopwatch
 from tee_popen import TeePopen
 from ci_cache import CiCache
 from ci_settings import CiSettings
 from ci_buddy import CIBuddy
 from stopwatch import Stopwatch
 from version_helper import get_version_from_repo
 # pylint: disable=too-many-lines
@ -370,8 +369,8 @@ def _pre_action(s3, job_name, batch, indata, pr_info):
                # skip_status = SUCCESS already there
                GH.print_in_group("Commit Status Data", job_status)
-    # create pre report
+    # create dummy report
-    jr = JobReport.create_pre_report(status=skip_status, job_skipped=to_be_skipped)
+    jr = JobReport.create_dummy(status=skip_status, job_skipped=to_be_skipped)
    jr.dump()
    if not to_be_skipped:
@ -990,19 +989,20 @@ def _run_test(job_name: str, run_command: str) -> int:
    stopwatch = Stopwatch()
    job_log = Path(TEMP_PATH) / "job_log.txt"
    with TeePopen(run_command, job_log, env, timeout) as process:
        print(f"Job process started, pid [{process.process.pid}]")
        retcode = process.wait()
        if retcode != 0:
            print(f"Run action failed for: [{job_name}] with exit code [{retcode}]")
-            if timeout and process.timeout_exceeded:
+        if process.timeout_exceeded:
-                print(f"Timeout {timeout} exceeded, dumping the job report")
+            print(f"Job timed out: [{job_name}] exit code [{retcode}]")
-                JobReport(
+            assert JobReport.exist(), "JobReport real or dummy must be present"
-                    status=FAILURE,
+            jr = JobReport.load()
-                    description=f"Timeout {timeout} exceeded",
+            if jr.dummy:
-                    test_results=[TestResult.create_check_timeout_expired(timeout)],
+                print(
-                    start_time=stopwatch.start_time_str,
+                    f"ERROR: Run action failed with timeout and did not generate JobReport - update dummy report with execution time"
-                    duration=stopwatch.duration_seconds,
+                )
-                    additional_files=[job_log],
+                jr.test_results = [TestResult.create_check_timeout_expired()]
-                ).dump()
+                jr.duration = stopwatch.duration_seconds
    print(f"Run action done for: [{job_name}]")
    return retcode
@ -1205,7 +1205,7 @@ def main() -> int:
            job_report
        ), "BUG. There must be job report either real report, or pre-report if job was killed"
        error_description = ""
-        if not job_report.pre_report:
+        if not job_report.dummy:
            # it's a real job report
            ch_helper = ClickHouseHelper()
            check_url = ""
--- a/tests/ci/ci_definitions.py
+++ b/tests/ci/ci_definitions.py
@ -332,7 +332,7 @@ class JobConfig:
    # will be triggered for the job if omitted in CI workflow yml
    run_command: str = ""
    # job timeout, seconds
-    timeout: Optional[int] = None
+    timeout: int = 7200
    # sets number of batches for a multi-batch job
    num_batches: int = 1
    # label that enables job in CI, if set digest isn't used
@ -421,7 +421,6 @@ class CommonJobConfigs:
        ),
        run_command='functional_test_check.py "$CHECK_NAME"',
        runner_type=Runners.FUNC_TESTER,
        timeout=9000,
    )
    STATEFUL_TEST = JobConfig(
        job_name_keyword="stateful",
--- a/tests/ci/functional_test_check.py
+++ b/tests/ci/functional_test_check.py
@ -5,10 +5,11 @@ import csv
 import logging
 import os
 import re
 import signal
 import subprocess
 import sys
 from pathlib import Path
-from typing import List, Tuple
+from typing import List, Tuple, Optional
 from build_download_helper import download_all_deb_packages
 from clickhouse_helper import CiLogsCredentials
@ -25,11 +26,12 @@ from report import (
    TestResults,
    read_test_results,
    FAILURE,
    TestResult,
 )
 from stopwatch import Stopwatch
 from tee_popen import TeePopen
 from ci_config import CI
-from ci_utils import Utils
+from ci_utils import Utils, Shell
 NO_CHANGES_MSG = "Nothing to run"
@ -113,10 +115,6 @@ def get_run_command(
    if flaky_check:
        envs.append("-e NUM_TRIES=50")
        envs.append("-e MAX_RUN_TIME=2800")
    else:
        max_run_time = os.getenv("MAX_RUN_TIME", "0")
        envs.append(f"-e MAX_RUN_TIME={max_run_time}")
    envs += [f"-e {e}" for e in additional_envs]
@ -128,7 +126,7 @@ def get_run_command(
    )
    return (
-        f"docker run --volume={builds_path}:/package_folder "
+        f"docker run --rm --name func-tester --volume={builds_path}:/package_folder "
        # For dmesg and sysctl
        "--privileged "
        f"{ci_logs_args}"
@ -198,7 +196,7 @@ def process_results(
    state, description = status[0][0], status[0][1]
    if ret_code != 0:
        state = ERROR
-        description += " (but script exited with an error)"
+        description = f"Job failed, exit code: {ret_code}. " + description
    try:
        results_path = result_directory / "test_results.tsv"
@ -240,7 +238,19 @@ def parse_args():
    return parser.parse_args()
 test_process = None  # type: Optional[TeePopen]
 timeout_expired = False
 def handle_sigterm(signum, _frame):
    print(f"WARNING: Received signal {signum}")
    global timeout_expired
    timeout_expired = True
    Shell.check(f"docker exec func-tester pkill -f clickhouse-test", verbose=True)
 def main():
    signal.signal(signal.SIGTERM, handle_sigterm)
    logging.basicConfig(level=logging.INFO)
    for handler in logging.root.handlers:
        # pylint: disable=protected-access
@ -328,11 +338,13 @@ def main():
        logging.info("Going to run func tests: %s", run_command)
        with TeePopen(run_command, run_log_path) as process:
            global test_process
            test_process = process
            retcode = process.wait()
            if retcode == 0:
                logging.info("Run successfully")
            else:
-                logging.info("Run failed")
+                logging.info("Run failed, exit code %s", retcode)
        try:
            subprocess.check_call(
@ -348,6 +360,13 @@ def main():
        state, description, test_results, additional_logs = process_results(
            retcode, result_path, server_log_path
        )
        if timeout_expired:
            description = "Timeout expired"
            state = FAILURE
            test_results.insert(
                0, TestResult.create_check_timeout_expired(stopwatch.duration_seconds)
            )
    else:
        print(
            "This is validate bugfix or flaky check run, but no changes test to run - skip with success"
--- a/tests/ci/report.py
+++ b/tests/ci/report.py
@ -249,6 +249,7 @@ JOB_REPORT_FILE = Path(GITHUB_WORKSPACE) / "job_report.json"
 JOB_STARTED_TEST_NAME = "STARTED"
 JOB_FINISHED_TEST_NAME = "COMPLETED"
 JOB_TIMEOUT_TEST_NAME = "Job Timeout Expired"
@dataclass
@ -277,8 +278,8 @@ class TestResult:
            self.log_files.append(log_path)
    @staticmethod
-    def create_check_timeout_expired(timeout: float) -> "TestResult":
+    def create_check_timeout_expired(duration: Optional[float] = None) -> "TestResult":
-        return TestResult("Check timeout expired", "FAIL", timeout)
+        return TestResult(JOB_TIMEOUT_TEST_NAME, "FAIL", time=duration)
 TestResults = List[TestResult]
@ -303,7 +304,7 @@ class JobReport:
    # indicates that this is not real job report but report for the job that was skipped by rerun check
    job_skipped: bool = False
    # indicates that report generated by CI script in order to check later if job was killed before real report is generated
-    pre_report: bool = False
+    dummy: bool = False
    exit_code: int = -1
    @staticmethod
@ -311,7 +312,7 @@ class JobReport:
        return datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
    @classmethod
-    def create_pre_report(cls, status: str, job_skipped: bool) -> "JobReport":
+    def create_dummy(cls, status: str, job_skipped: bool) -> "JobReport":
        return JobReport(
            status=status,
            description="",
@ -320,7 +321,7 @@ class JobReport:
            duration=0.0,
            additional_files=[],
            job_skipped=job_skipped,
-            pre_report=True,
+            dummy=True,
        )
    def update_duration(self):
--- a/tests/ci/tee_popen.py
+++ b/tests/ci/tee_popen.py
@ -2,6 +2,8 @@
 import logging
 import os
 import signal
 import subprocess
 import sys
 from io import TextIOWrapper
 from pathlib import Path
@ -30,20 +32,35 @@ class TeePopen:
        self._process = None  # type: Optional[Popen]
        self.timeout = timeout
        self.timeout_exceeded = False
        self.terminated_by_sigterm = False
        self.terminated_by_sigkill = False
        self.pid = 0
    def _check_timeout(self) -> None:
        if self.timeout is None:
            return
        sleep(self.timeout)
        logging.warning(
            "Timeout exceeded. Send SIGTERM to process %s, timeout %s",
            self.process.pid,
            self.timeout,
        )
        self.send_signal(signal.SIGTERM)
        time_wait = 0
        self.terminated_by_sigterm = True
        self.timeout_exceeded = True
        while self.process.poll() is None and time_wait < 100:
            print("wait...")
            wait = 5
            sleep(wait)
            time_wait += wait
        while self.process.poll() is None:
-            logging.warning(
+            logging.error(
-                "Killing process %s, timeout %s exceeded",
+                "Process is still running. Send SIGKILL",
                self.process.pid,
                self.timeout,
            )
-            os.killpg(self.process.pid, 9)
+            self.send_signal(signal.SIGKILL)
-            sleep(10)
+            self.terminated_by_sigkill = True
            sleep(5)
    def __enter__(self) -> "TeePopen":
        self.process = Popen(
@ -57,6 +74,9 @@ class TeePopen:
            bufsize=1,
            errors="backslashreplace",
        )
        sleep(1)
        self.pid = self._get_child_pid()
        print(f"Subprocess started, pid [{self.process.pid}], child pid [{self.pid}]")
        if self.timeout is not None and self.timeout > 0:
            t = Thread(target=self._check_timeout)
            t.daemon = True  # does not block the program from exit
@ -77,6 +97,22 @@ class TeePopen:
        self.log_file.close()
    def _get_child_pid(self):
        # linux only
        ps_command = f"ps --ppid {self.process.pid} -o pid="
        res = "NA"
        try:
            result = subprocess.run(
                ps_command, shell=True, capture_output=True, text=True
            )
            res = result.stdout.strip()
            pid = int(res)
            return pid
        except Exception as e:
            print(f"Failed to get child's pid, command [{ps_command}], result [{res}]")
            print(f"ERROR: getting Python subprocess PID: {e}")
            return self.process.pid
    def wait(self) -> int:
        if self.process.stdout is not None:
            for line in self.process.stdout:
@ -85,6 +121,15 @@ class TeePopen:
        return self.process.wait()
    def poll(self):
        return self.process.poll()
    def send_signal(self, signal_num):
        if self.pid:
            os.kill(self.pid, signal_num)
        else:
            print("ERROR: no process to send signal")
    @property
    def process(self) -> Popen:
        if self._process is not None: