Backport #67944 to 24.8: CI: Job Timeout handling

2024-09-20 08:40:50 +00:00 · 2024-08-19 12:09:58 +00:00 · 2024-08-19 12:09:58 +00:00 · f234ccf185
commit f234ccf185
parent e9b6b93cc4
15 changed files with 159 additions and 126 deletions
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@ -256,22 +256,6 @@ function configure
    rm -f "$FASTTEST_DATA/config.d/secure_ports.xml"
 }

-function timeout_with_logging() {
-    local exit_code=0
-
-    timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
-
-    echo "Checking if it is a timeout. The code 124 will indicate a timeout."
-    if [[ "${exit_code}" -eq "124" ]]
-    then
-        echo "The command 'timeout ${*}' has been killed by timeout."
-    else
-        echo "No, it isn't a timeout."
-    fi
-
-    return $exit_code
-}
-
 function run_tests
 {
    clickhouse-server --version
@ -340,7 +324,7 @@ case "$stage" in
    configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt"
    ;&
 "run_tests")
-    timeout_with_logging 35m bash -c run_tests ||:
+    run_tests ||:
    /process_functional_tests_result.py --in-results-dir "$FASTTEST_OUTPUT/" \
        --out-results-file "$FASTTEST_OUTPUT/test_results.tsv" \
        --out-status-file "$FASTTEST_OUTPUT/check_status.tsv" || echo -e "failure\tCannot parse results" > "$FASTTEST_OUTPUT/check_status.tsv"
--- a/docker/test/sqllogic/Dockerfile
+++ b/docker/test/sqllogic/Dockerfile
@ -35,7 +35,6 @@ RUN mkdir -p /tmp/clickhouse-odbc-tmp \


 ENV TZ=Europe/Amsterdam
-ENV MAX_RUN_TIME=9000
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

 ARG sqllogic_test_repo="https://github.com/gregrahn/sqllogictest.git"
--- a/docker/test/sqllogic/run.sh
+++ b/docker/test/sqllogic/run.sh
@ -94,7 +94,7 @@ function run_tests()

 export -f run_tests

-timeout "${MAX_RUN_TIME:-9000}" bash -c run_tests || echo "timeout reached" >&2
+run_tests

 #/process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv

--- a/docker/test/sqltest/Dockerfile
+++ b/docker/test/sqltest/Dockerfile
@ -22,7 +22,6 @@ ARG sqltest_repo="https://github.com/elliotchance/sqltest/"
 RUN git clone ${sqltest_repo}

 ENV TZ=UTC
-ENV MAX_RUN_TIME=900
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

 COPY run.sh /
--- a/docker/test/stateful/run.sh
+++ b/docker/test/stateful/run.sh
@ -4,9 +4,6 @@
 source /setup_export_logs.sh
 set -e -x

-MAX_RUN_TIME=${MAX_RUN_TIME:-3600}
-MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 3600 : MAX_RUN_TIME))
-
 # Choose random timezone for this test run
 TZ="$(rg -v '#' /usr/share/zoneinfo/zone.tab | awk '{print $3}' | shuf | head -n1)"
 echo "Choosen random timezone $TZ"
@ -123,9 +120,6 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]

    clickhouse-client --query "DROP TABLE datasets.hits_v1"
    clickhouse-client --query "DROP TABLE datasets.visits_v1"
-
-    MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000))  # min(MAX_RUN_TIME, 2.5 hours)
-    MAX_RUN_TIME=$((MAX_RUN_TIME != 0 ? MAX_RUN_TIME : 9000))    # set to 2.5 hours if 0 (unlimited)
 else
    clickhouse-client --query "CREATE DATABASE test"
    clickhouse-client --query "SHOW TABLES FROM test"
@ -257,24 +251,7 @@ function run_tests()

 export -f run_tests

-function timeout_with_logging() {
-    local exit_code=0
-
-    timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
-
-    echo "Checking if it is a timeout. The code 124 will indicate a timeout."
-    if [[ "${exit_code}" -eq "124" ]]
-    then
-        echo "The command 'timeout ${*}' has been killed by timeout."
-    else
-        echo "No, it isn't a timeout."
-    fi
-
-    return $exit_code
-}
-
-TIMEOUT=$((MAX_RUN_TIME - 700))
-timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
+run_tests ||:

 echo "Files in current directory"
 ls -la ./
--- a/docker/test/stateless/Dockerfile
+++ b/docker/test/stateless/Dockerfile
@ -65,7 +65,6 @@ ENV TZ=Europe/Amsterdam
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

 ENV NUM_TRIES=1
-ENV MAX_RUN_TIME=0

 # Unrelated to vars in setup_minio.sh, but should be the same there
 # to have the same binaries for local running scenario
--- a/docker/test/stateless/run.sh
+++ b/docker/test/stateless/run.sh
@ -12,9 +12,6 @@ dmesg --clear
 # fail on errors, verbose and export all env variables
 set -e -x -a

-MAX_RUN_TIME=${MAX_RUN_TIME:-9000}
-MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 9000 : MAX_RUN_TIME))
-
 USE_DATABASE_REPLICATED=${USE_DATABASE_REPLICATED:=0}
 USE_SHARED_CATALOG=${USE_SHARED_CATALOG:=0}

@ -308,8 +305,6 @@ function run_tests()

    try_run_with_retry 10 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')"

-    TIMEOUT=$((MAX_RUN_TIME - 800 > 8400 ? 8400 : MAX_RUN_TIME - 800))
-    START_TIME=${SECONDS}
    set +e

    TEST_ARGS=(
@ -324,32 +319,22 @@ function run_tests()
        --test-runs "$NUM_TRIES"
        "${ADDITIONAL_OPTIONS[@]}"
    )
-    timeout --preserve-status --signal TERM --kill-after 60m ${TIMEOUT}s clickhouse-test "${TEST_ARGS[@]}" 2>&1 \
+    clickhouse-test "${TEST_ARGS[@]}" 2>&1 \
        | ts '%Y-%m-%d %H:%M:%S' \
        | tee -a test_output/test_result.txt
    set -e
-    DURATION=$((SECONDS - START_TIME))
-
-    echo "Elapsed ${DURATION} seconds."
-    if [[ $DURATION -ge $TIMEOUT ]]
-    then
-        echo "It looks like the command is terminated by the timeout, which is ${TIMEOUT} seconds."
-    fi
 }

 export -f run_tests

-
-# This should be enough to setup job and collect artifacts
-TIMEOUT=$((MAX_RUN_TIME - 700))
 if [ "$NUM_TRIES" -gt "1" ]; then
    # We don't run tests with Ordinary database in PRs, only in master.
    # So run new/changed tests with Ordinary at least once in flaky check.
-    timeout_with_logging "$TIMEOUT" bash -c 'NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests' \
+    NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests \
      | sed 's/All tests have finished/Redacted: a message about tests finish is deleted/' | sed 's/No tests were run/Redacted: a message about no tests run is deleted/' ||:
 fi

-timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
+run_tests ||:

 echo "Files in current directory"
 ls -la ./
--- a/docker/test/stateless/utils.lib
+++ b/docker/test/stateless/utils.lib
@ -40,22 +40,6 @@ function fn_exists() {
    declare -F "$1" > /dev/null;
 }

-function timeout_with_logging() {
-    local exit_code=0
-
-    timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
-
-    echo "Checking if it is a timeout. The code 124 will indicate a timeout."
-    if [[ "${exit_code}" -eq "124" ]]
-    then
-        echo "The command 'timeout ${*}' has been killed by timeout."
-    else
-        echo "No, it isn't a timeout."
-    fi
-
-    return $exit_code
-}
-
 function collect_core_dumps()
 {
  find . -type f -maxdepth 1 -name 'core.*' | while read -r core; do
--- a/tests/ci/ci.py
+++ b/tests/ci/ci.py
@ -50,7 +50,6 @@ from github_helper import GitHub
 from pr_info import PRInfo
 from report import (
    ERROR,
-    FAILURE,
    PENDING,
    SUCCESS,
    BuildResult,
@ -62,11 +61,11 @@ from report import (
    FAIL,
 )
 from s3_helper import S3Helper
-from stopwatch import Stopwatch
 from tee_popen import TeePopen
 from ci_cache import CiCache
 from ci_settings import CiSettings
 from ci_buddy import CIBuddy
+from stopwatch import Stopwatch
 from version_helper import get_version_from_repo

 # pylint: disable=too-many-lines
@ -370,8 +369,8 @@ def _pre_action(s3, job_name, batch, indata, pr_info):
                # skip_status = SUCCESS already there
                GH.print_in_group("Commit Status Data", job_status)

-    # create pre report
-    jr = JobReport.create_pre_report(status=skip_status, job_skipped=to_be_skipped)
+    # create dummy report
+    jr = JobReport.create_dummy(status=skip_status, job_skipped=to_be_skipped)
    jr.dump()

    if not to_be_skipped:
@ -990,19 +989,21 @@ def _run_test(job_name: str, run_command: str) -> int:
    stopwatch = Stopwatch()
    job_log = Path(TEMP_PATH) / "job_log.txt"
    with TeePopen(run_command, job_log, env, timeout) as process:
+        print(f"Job process started, pid [{process.process.pid}]")
        retcode = process.wait()
        if retcode != 0:
            print(f"Run action failed for: [{job_name}] with exit code [{retcode}]")
-            if timeout and process.timeout_exceeded:
-                print(f"Timeout {timeout} exceeded, dumping the job report")
-                JobReport(
-                    status=FAILURE,
-                    description=f"Timeout {timeout} exceeded",
-                    test_results=[TestResult.create_check_timeout_expired(timeout)],
-                    start_time=stopwatch.start_time_str,
-                    duration=stopwatch.duration_seconds,
-                    additional_files=[job_log],
-                ).dump()
+        if process.timeout_exceeded:
+            print(f"Job timed out: [{job_name}] exit code [{retcode}]")
+            assert JobReport.exist(), "JobReport real or dummy must be present"
+            jr = JobReport.load()
+            if jr.dummy:
+                print(
+                    f"ERROR: Run action failed with timeout and did not generate JobReport - update dummy report with execution time"
+                )
+                jr.test_results = [TestResult.create_check_timeout_expired()]
+                jr.duration = stopwatch.duration_seconds
+                jr.additional_files += [job_log]

    print(f"Run action done for: [{job_name}]")
    return retcode
@ -1205,7 +1206,7 @@ def main() -> int:
            job_report
        ), "BUG. There must be job report either real report, or pre-report if job was killed"
        error_description = ""
-        if not job_report.pre_report:
+        if not job_report.dummy:
            # it's a real job report
            ch_helper = ClickHouseHelper()
            check_url = ""
@ -1329,10 +1330,20 @@ def main() -> int:
            if CI.is_test_job(args.job_name):
                gh = GitHub(get_best_robot_token(), per_page=100)
                commit = get_commit(gh, pr_info.sha)
+                check_url = ""
+                if job_report.test_results or job_report.additional_files:
+                    check_url = upload_result_helper.upload_results(
+                        s3,
+                        pr_info.number,
+                        pr_info.sha,
+                        job_report.test_results,
+                        job_report.additional_files,
+                        job_report.check_name or _get_ext_check_name(args.job_name),
+                    )
                post_commit_status(
                    commit,
                    ERROR,
-                    "",
+                    check_url,
                    "Error: " + error_description,
                    _get_ext_check_name(args.job_name),
                    pr_info,
--- a/tests/ci/ci_config.py
+++ b/tests/ci/ci_config.py
@ -315,6 +315,7 @@ class CI:
        JobNames.STATEFUL_TEST_PARALLEL_REPL_TSAN: CommonJobConfigs.STATEFUL_TEST.with_properties(
            required_builds=[BuildNames.PACKAGE_TSAN],
            random_bucket="parrepl_with_sanitizer",
+            timeout=3600,
        ),
        JobNames.STATELESS_TEST_ASAN: CommonJobConfigs.STATELESS_TEST.with_properties(
            required_builds=[BuildNames.PACKAGE_ASAN], num_batches=2
@ -342,17 +343,17 @@ class CI:
            runner_type=Runners.FUNC_TESTER_ARM,
        ),
        JobNames.STATELESS_TEST_OLD_ANALYZER_S3_REPLICATED_RELEASE: CommonJobConfigs.STATELESS_TEST.with_properties(
-            required_builds=[BuildNames.PACKAGE_RELEASE], num_batches=4
+            required_builds=[BuildNames.PACKAGE_RELEASE], num_batches=2
        ),
        JobNames.STATELESS_TEST_S3_DEBUG: CommonJobConfigs.STATELESS_TEST.with_properties(
-            required_builds=[BuildNames.PACKAGE_DEBUG], num_batches=2
+            required_builds=[BuildNames.PACKAGE_DEBUG], num_batches=1
        ),
        JobNames.STATELESS_TEST_AZURE_ASAN: CommonJobConfigs.STATELESS_TEST.with_properties(
            required_builds=[BuildNames.PACKAGE_ASAN], num_batches=3, release_only=True
        ),
        JobNames.STATELESS_TEST_S3_TSAN: CommonJobConfigs.STATELESS_TEST.with_properties(
            required_builds=[BuildNames.PACKAGE_TSAN],
-            num_batches=4,
+            num_batches=3,
        ),
        JobNames.STRESS_TEST_DEBUG: CommonJobConfigs.STRESS_TEST.with_properties(
            required_builds=[BuildNames.PACKAGE_DEBUG],
@ -400,7 +401,8 @@ class CI:
            required_builds=[BuildNames.PACKAGE_ASAN], release_only=True, num_batches=4
        ),
        JobNames.INTEGRATION_TEST_ASAN_OLD_ANALYZER: CommonJobConfigs.INTEGRATION_TEST.with_properties(
-            required_builds=[BuildNames.PACKAGE_ASAN], num_batches=6
+            required_builds=[BuildNames.PACKAGE_ASAN],
+            num_batches=6,
        ),
        JobNames.INTEGRATION_TEST_TSAN: CommonJobConfigs.INTEGRATION_TEST.with_properties(
            required_builds=[BuildNames.PACKAGE_TSAN], num_batches=6
--- a/tests/ci/ci_definitions.py
+++ b/tests/ci/ci_definitions.py
@ -331,7 +331,7 @@ class JobConfig:
    # will be triggered for the job if omitted in CI workflow yml
    run_command: str = ""
    # job timeout, seconds
-    timeout: Optional[int] = None
+    timeout: int = 7200
    # sets number of batches for a multi-batch job
    num_batches: int = 1
    # label that enables job in CI, if set digest isn't used
@ -420,7 +420,6 @@ class CommonJobConfigs:
        ),
        run_command='functional_test_check.py "$CHECK_NAME"',
        runner_type=Runners.FUNC_TESTER,
-        timeout=9000,
    )
    STATEFUL_TEST = JobConfig(
        job_name_keyword="stateful",
@ -465,6 +464,7 @@ class CommonJobConfigs:
        ),
        run_command="upgrade_check.py",
        runner_type=Runners.STRESS_TESTER,
+        timeout=3600,
    )
    INTEGRATION_TEST = JobConfig(
        job_name_keyword="integration",
--- a/tests/ci/functional_test_check.py
+++ b/tests/ci/functional_test_check.py
@ -5,10 +5,11 @@ import csv
 import logging
 import os
 import re
+import signal
 import subprocess
 import sys
 from pathlib import Path
-from typing import List, Tuple
+from typing import List, Tuple, Optional

 from build_download_helper import download_all_deb_packages
 from clickhouse_helper import CiLogsCredentials
@ -25,11 +26,12 @@ from report import (
    TestResults,
    read_test_results,
    FAILURE,
+    TestResult,
 )
 from stopwatch import Stopwatch
 from tee_popen import TeePopen
 from ci_config import CI
-from ci_utils import Utils
+from ci_utils import Utils, Shell

 NO_CHANGES_MSG = "Nothing to run"

@ -113,10 +115,6 @@ def get_run_command(

    if flaky_check:
        envs.append("-e NUM_TRIES=50")
-        envs.append("-e MAX_RUN_TIME=2800")
-    else:
-        max_run_time = os.getenv("MAX_RUN_TIME", "0")
-        envs.append(f"-e MAX_RUN_TIME={max_run_time}")

    envs += [f"-e {e}" for e in additional_envs]

@ -128,7 +126,7 @@ def get_run_command(
    )

    return (
-        f"docker run --volume={builds_path}:/package_folder "
+        f"docker run --rm --name func-tester --volume={builds_path}:/package_folder "
        # For dmesg and sysctl
        "--privileged "
        f"{ci_logs_args}"
@ -198,7 +196,7 @@ def process_results(
    state, description = status[0][0], status[0][1]
    if ret_code != 0:
        state = ERROR
-        description += " (but script exited with an error)"
+        description = f"Job failed, exit code: {ret_code}. " + description

    try:
        results_path = result_directory / "test_results.tsv"
@ -240,7 +238,19 @@ def parse_args():
    return parser.parse_args()


+test_process = None  # type: Optional[TeePopen]
+timeout_expired = False
+
+
+def handle_sigterm(signum, _frame):
+    print(f"WARNING: Received signal {signum}")
+    global timeout_expired
+    timeout_expired = True
+    Shell.check(f"docker exec func-tester pkill -f clickhouse-test", verbose=True)
+
+
 def main():
+    signal.signal(signal.SIGTERM, handle_sigterm)
    logging.basicConfig(level=logging.INFO)
    for handler in logging.root.handlers:
        # pylint: disable=protected-access
@ -328,11 +338,13 @@ def main():
        logging.info("Going to run func tests: %s", run_command)

        with TeePopen(run_command, run_log_path) as process:
+            global test_process
+            test_process = process
            retcode = process.wait()
            if retcode == 0:
                logging.info("Run successfully")
            else:
-                logging.info("Run failed")
+                logging.info("Run failed, exit code %s", retcode)

        try:
            subprocess.check_call(
@ -348,6 +360,13 @@ def main():
        state, description, test_results, additional_logs = process_results(
            retcode, result_path, server_log_path
        )
+        if timeout_expired:
+            description = "Timeout expired"
+            state = FAILURE
+            test_results.insert(
+                0, TestResult.create_check_timeout_expired(stopwatch.duration_seconds)
+            )
+
    else:
        print(
            "This is validate bugfix or flaky check run, but no changes test to run - skip with success"
--- a/tests/ci/integration_tests_runner.py
+++ b/tests/ci/integration_tests_runner.py
@ -9,6 +9,7 @@ import random
 import re
 import shlex
 import shutil
+import signal
 import string
 import subprocess
 import sys
@ -16,11 +17,13 @@ import time
 import zlib  # for crc32
 from collections import defaultdict
 from itertools import chain
-from typing import Any, Dict
+from typing import Any, Dict, Optional

 from env_helper import IS_CI
 from integration_test_images import IMAGES
 from tee_popen import TeePopen
+from report import JOB_TIMEOUT_TEST_NAME
+from stopwatch import Stopwatch

 MAX_RETRY = 1
 NUM_WORKERS = 5
@ -621,6 +624,9 @@ class ClickhouseIntegrationTestsRunner:
        test_data_dirs = {}

        for i in range(num_tries):
+            if timeout_expired:
+                print("Timeout expired - break test group execution")
+                break
            logging.info("Running test group %s for the %s retry", test_group, i)
            clear_ip_tables_and_restart_daemons()

@ -657,6 +663,8 @@ class ClickhouseIntegrationTestsRunner:
                logging.info("Executing cmd: %s", cmd)
                # ignore retcode, since it meaningful due to pipe to tee
                with subprocess.Popen(cmd, shell=True, stderr=log, stdout=log) as proc:
+                    global runner_subprocess
+                    runner_subprocess = proc
                    proc.wait()

            extra_logs_names = [log_basename]
@ -780,6 +788,9 @@ class ClickhouseIntegrationTestsRunner:
        logs = []
        tries_num = 1 if should_fail else FLAKY_TRIES_COUNT
        for i in range(tries_num):
+            if timeout_expired:
+                print("Timeout expired - break flaky check execution")
+                break
            final_retry += 1
            logging.info("Running tests for the %s time", i)
            counters, tests_times, log_paths = self.try_run_test_group(
@ -839,6 +850,7 @@ class ClickhouseIntegrationTestsRunner:
        return result_state, status_text, test_result, logs

    def run_impl(self, repo_path, build_path):
+        stopwatch = Stopwatch()
        if self.flaky_check or self.bugfix_validate_check:
            return self.run_flaky_check(
                repo_path, build_path, should_fail=self.bugfix_validate_check
@ -921,6 +933,9 @@ class ClickhouseIntegrationTestsRunner:
            random.shuffle(items_to_run)

        for group, tests in items_to_run:
+            if timeout_expired:
+                print("Timeout expired - break tests execution")
+                break
            logging.info("Running test group %s containing %s tests", group, len(tests))
            group_counters, group_test_times, log_paths = self.try_run_test_group(
                repo_path, group, tests, MAX_RETRY, NUM_WORKERS, 0
@ -981,6 +996,17 @@ class ClickhouseIntegrationTestsRunner:
            status_text = "Timeout, " + status_text
            result_state = "failure"

+        if timeout_expired:
+            logging.error(
+                "Job killed by external timeout signal - setting status to failure!"
+            )
+            status_text = "Job timeout expired, " + status_text
+            result_state = "failure"
+            # add mock test case to make timeout visible in job report and in ci db
+            test_result.insert(
+                0, (JOB_TIMEOUT_TEST_NAME, "FAIL", f"{stopwatch.duration_seconds}", "")
+            )
+
        if not counters or sum(len(counter) for counter in counters.values()) == 0:
            status_text = "No tests found for some reason! It's a bug"
            result_state = "failure"
@ -1001,6 +1027,7 @@ def write_results(results_file, status_file, results, status):


 def run():
+    signal.signal(signal.SIGTERM, handle_sigterm)
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")

    repo_path = os.environ.get("CLICKHOUSE_TESTS_REPO_PATH")
@ -1035,5 +1062,17 @@ def run():
    logging.info("Result written")


+timeout_expired = False
+runner_subprocess = None  # type:Optional[subprocess.Popen]
+
+
+def handle_sigterm(signum, _frame):
+    print(f"WARNING: Received signal {signum}")
+    global timeout_expired
+    timeout_expired = True
+    if runner_subprocess:
+        runner_subprocess.send_signal(signal.SIGTERM)
+
+
 if __name__ == "__main__":
    run()
--- a/tests/ci/report.py
+++ b/tests/ci/report.py
@ -249,6 +249,7 @@ JOB_REPORT_FILE = Path(GITHUB_WORKSPACE) / "job_report.json"

 JOB_STARTED_TEST_NAME = "STARTED"
 JOB_FINISHED_TEST_NAME = "COMPLETED"
+JOB_TIMEOUT_TEST_NAME = "Job Timeout Expired"


@dataclass
@ -277,8 +278,8 @@ class TestResult:
            self.log_files.append(log_path)

    @staticmethod
-    def create_check_timeout_expired(timeout: float) -> "TestResult":
-        return TestResult("Check timeout expired", "FAIL", timeout)
+    def create_check_timeout_expired(duration: Optional[float] = None) -> "TestResult":
+        return TestResult(JOB_TIMEOUT_TEST_NAME, "FAIL", time=duration)


 TestResults = List[TestResult]
@ -303,7 +304,7 @@ class JobReport:
    # indicates that this is not real job report but report for the job that was skipped by rerun check
    job_skipped: bool = False
    # indicates that report generated by CI script in order to check later if job was killed before real report is generated
-    pre_report: bool = False
+    dummy: bool = False
    exit_code: int = -1

    @staticmethod
@ -311,7 +312,7 @@ class JobReport:
        return datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")

    @classmethod
-    def create_pre_report(cls, status: str, job_skipped: bool) -> "JobReport":
+    def create_dummy(cls, status: str, job_skipped: bool) -> "JobReport":
        return JobReport(
            status=status,
            description="",
@ -320,7 +321,7 @@ class JobReport:
            duration=0.0,
            additional_files=[],
            job_skipped=job_skipped,
-            pre_report=True,
+            dummy=True,
        )

    def update_duration(self):
@ -741,10 +742,21 @@ def create_test_html_report(
        has_test_time = any(tr.time is not None for tr in test_results)
        has_log_urls = False

-        # Display entires with logs at the top (they correspond to failed tests)
-        test_results.sort(
-            key=lambda result: result.raw_logs is None and result.log_files is None
-        )
+        def sort_key(status):
+            if "fail" in status.lower():
+                return 0
+            elif "error" in status.lower():
+                return 1
+            elif "not" in status.lower():
+                return 2
+            elif "ok" in status.lower():
+                return 10
+            elif "success" in status.lower():
+                return 9
+            else:
+                return 5
+
+        test_results.sort(key=lambda result: sort_key(result.status))

        for test_result in test_results:
            colspan = 0
--- a/tests/ci/tee_popen.py
+++ b/tests/ci/tee_popen.py
@ -2,6 +2,7 @@

 import logging
 import os
+import signal
 import sys
 from io import TextIOWrapper
 from pathlib import Path
@ -30,20 +31,34 @@ class TeePopen:
        self._process = None  # type: Optional[Popen]
        self.timeout = timeout
        self.timeout_exceeded = False
+        self.terminated_by_sigterm = False
+        self.terminated_by_sigkill = False

    def _check_timeout(self) -> None:
        if self.timeout is None:
            return
        sleep(self.timeout)
+        logging.warning(
+            "Timeout exceeded. Send SIGTERM to process %s, timeout %s",
+            self.process.pid,
+            self.timeout,
+        )
+        self.send_signal(signal.SIGTERM)
+        time_wait = 0
+        self.terminated_by_sigterm = True
        self.timeout_exceeded = True
+        while self.process.poll() is None and time_wait < 100:
+            print("wait...")
+            wait = 5
+            sleep(wait)
+            time_wait += wait
        while self.process.poll() is None:
-            logging.warning(
-                "Killing process %s, timeout %s exceeded",
-                self.process.pid,
-                self.timeout,
+            logging.error(
+                "Process is still running. Send SIGKILL",
            )
-            os.killpg(self.process.pid, 9)
-            sleep(10)
+            self.send_signal(signal.SIGKILL)
+            self.terminated_by_sigkill = True
+            sleep(5)

    def __enter__(self) -> "TeePopen":
        self.process = Popen(
@ -57,6 +72,8 @@ class TeePopen:
            bufsize=1,
            errors="backslashreplace",
        )
+        sleep(1)
+        print(f"Subprocess started, pid [{self.process.pid}]")
        if self.timeout is not None and self.timeout > 0:
            t = Thread(target=self._check_timeout)
            t.daemon = True  # does not block the program from exit
@ -85,6 +102,12 @@ class TeePopen:

        return self.process.wait()

+    def poll(self):
+        return self.process.poll()
+
+    def send_signal(self, signal_num):
+        os.killpg(self.process.pid, signal_num)
+
    @property
    def process(self) -> Popen:
        if self._process is not None: