diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 394d31addb1..9920326b11c 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -256,22 +256,6 @@ function configure rm -f "$FASTTEST_DATA/config.d/secure_ports.xml" } -function timeout_with_logging() { - local exit_code=0 - - timeout -s TERM --preserve-status "${@}" || exit_code="${?}" - - echo "Checking if it is a timeout. The code 124 will indicate a timeout." - if [[ "${exit_code}" -eq "124" ]] - then - echo "The command 'timeout ${*}' has been killed by timeout." - else - echo "No, it isn't a timeout." - fi - - return $exit_code -} - function run_tests { clickhouse-server --version @@ -340,7 +324,7 @@ case "$stage" in configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt" ;& "run_tests") - timeout_with_logging 35m bash -c run_tests ||: + run_tests ||: /process_functional_tests_result.py --in-results-dir "$FASTTEST_OUTPUT/" \ --out-results-file "$FASTTEST_OUTPUT/test_results.tsv" \ --out-status-file "$FASTTEST_OUTPUT/check_status.tsv" || echo -e "failure\tCannot parse results" > "$FASTTEST_OUTPUT/check_status.tsv" diff --git a/docker/test/sqllogic/Dockerfile b/docker/test/sqllogic/Dockerfile index 1425e12cd84..6397526388e 100644 --- a/docker/test/sqllogic/Dockerfile +++ b/docker/test/sqllogic/Dockerfile @@ -35,7 +35,6 @@ RUN mkdir -p /tmp/clickhouse-odbc-tmp \ ENV TZ=Europe/Amsterdam -ENV MAX_RUN_TIME=9000 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone ARG sqllogic_test_repo="https://github.com/gregrahn/sqllogictest.git" diff --git a/docker/test/sqllogic/run.sh b/docker/test/sqllogic/run.sh index ccba344035e..32368980f9b 100755 --- a/docker/test/sqllogic/run.sh +++ b/docker/test/sqllogic/run.sh @@ -94,7 +94,7 @@ function run_tests() export -f run_tests -timeout "${MAX_RUN_TIME:-9000}" bash -c run_tests || echo "timeout reached" >&2 +run_tests #/process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv diff --git a/docker/test/sqltest/Dockerfile b/docker/test/sqltest/Dockerfile index 71d915b0c7a..b805bb03c2b 100644 --- a/docker/test/sqltest/Dockerfile +++ b/docker/test/sqltest/Dockerfile @@ -22,7 +22,6 @@ ARG sqltest_repo="https://github.com/elliotchance/sqltest/" RUN git clone ${sqltest_repo} ENV TZ=UTC -ENV MAX_RUN_TIME=900 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone COPY run.sh / diff --git a/docker/test/stateful/run.sh b/docker/test/stateful/run.sh index 3a4f0d97993..c072eeb0fa8 100755 --- a/docker/test/stateful/run.sh +++ b/docker/test/stateful/run.sh @@ -4,9 +4,6 @@ source /setup_export_logs.sh set -e -x -MAX_RUN_TIME=${MAX_RUN_TIME:-3600} -MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 3600 : MAX_RUN_TIME)) - # Choose random timezone for this test run TZ="$(rg -v '#' /usr/share/zoneinfo/zone.tab | awk '{print $3}' | shuf | head -n1)" echo "Choosen random timezone $TZ" @@ -123,9 +120,6 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]] clickhouse-client --query "DROP TABLE datasets.hits_v1" clickhouse-client --query "DROP TABLE datasets.visits_v1" - - MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000)) # min(MAX_RUN_TIME, 2.5 hours) - MAX_RUN_TIME=$((MAX_RUN_TIME != 0 ? MAX_RUN_TIME : 9000)) # set to 2.5 hours if 0 (unlimited) else clickhouse-client --query "CREATE DATABASE test" clickhouse-client --query "SHOW TABLES FROM test" @@ -257,24 +251,7 @@ function run_tests() export -f run_tests -function timeout_with_logging() { - local exit_code=0 - - timeout -s TERM --preserve-status "${@}" || exit_code="${?}" - - echo "Checking if it is a timeout. The code 124 will indicate a timeout." - if [[ "${exit_code}" -eq "124" ]] - then - echo "The command 'timeout ${*}' has been killed by timeout." - else - echo "No, it isn't a timeout." - fi - - return $exit_code -} - -TIMEOUT=$((MAX_RUN_TIME - 700)) -timeout_with_logging "$TIMEOUT" bash -c run_tests ||: +run_tests ||: echo "Files in current directory" ls -la ./ diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index d8eb072328f..b0c4914a4e8 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -65,7 +65,6 @@ ENV TZ=Europe/Amsterdam RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone ENV NUM_TRIES=1 -ENV MAX_RUN_TIME=0 # Unrelated to vars in setup_minio.sh, but should be the same there # to have the same binaries for local running scenario diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index c70cbe1fe45..ad0cd321cc5 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -12,9 +12,6 @@ dmesg --clear # fail on errors, verbose and export all env variables set -e -x -a -MAX_RUN_TIME=${MAX_RUN_TIME:-9000} -MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 9000 : MAX_RUN_TIME)) - USE_DATABASE_REPLICATED=${USE_DATABASE_REPLICATED:=0} USE_SHARED_CATALOG=${USE_SHARED_CATALOG:=0} @@ -308,8 +305,6 @@ function run_tests() try_run_with_retry 10 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')" - TIMEOUT=$((MAX_RUN_TIME - 800 > 8400 ? 8400 : MAX_RUN_TIME - 800)) - START_TIME=${SECONDS} set +e TEST_ARGS=( @@ -324,32 +319,22 @@ function run_tests() --test-runs "$NUM_TRIES" "${ADDITIONAL_OPTIONS[@]}" ) - timeout --preserve-status --signal TERM --kill-after 60m ${TIMEOUT}s clickhouse-test "${TEST_ARGS[@]}" 2>&1 \ + clickhouse-test "${TEST_ARGS[@]}" 2>&1 \ | ts '%Y-%m-%d %H:%M:%S' \ | tee -a test_output/test_result.txt set -e - DURATION=$((SECONDS - START_TIME)) - - echo "Elapsed ${DURATION} seconds." - if [[ $DURATION -ge $TIMEOUT ]] - then - echo "It looks like the command is terminated by the timeout, which is ${TIMEOUT} seconds." - fi } export -f run_tests - -# This should be enough to setup job and collect artifacts -TIMEOUT=$((MAX_RUN_TIME - 700)) if [ "$NUM_TRIES" -gt "1" ]; then # We don't run tests with Ordinary database in PRs, only in master. # So run new/changed tests with Ordinary at least once in flaky check. - timeout_with_logging "$TIMEOUT" bash -c 'NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests' \ + NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests \ | sed 's/All tests have finished/Redacted: a message about tests finish is deleted/' | sed 's/No tests were run/Redacted: a message about no tests run is deleted/' ||: fi -timeout_with_logging "$TIMEOUT" bash -c run_tests ||: +run_tests ||: echo "Files in current directory" ls -la ./ diff --git a/docker/test/stateless/utils.lib b/docker/test/stateless/utils.lib index cb257536c36..31cd67254b4 100644 --- a/docker/test/stateless/utils.lib +++ b/docker/test/stateless/utils.lib @@ -40,22 +40,6 @@ function fn_exists() { declare -F "$1" > /dev/null; } -function timeout_with_logging() { - local exit_code=0 - - timeout -s TERM --preserve-status "${@}" || exit_code="${?}" - - echo "Checking if it is a timeout. The code 124 will indicate a timeout." - if [[ "${exit_code}" -eq "124" ]] - then - echo "The command 'timeout ${*}' has been killed by timeout." - else - echo "No, it isn't a timeout." - fi - - return $exit_code -} - function collect_core_dumps() { find . -type f -maxdepth 1 -name 'core.*' | while read -r core; do diff --git a/tests/ci/ci.py b/tests/ci/ci.py index 49b597333dc..1208d8642ad 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -50,7 +50,6 @@ from github_helper import GitHub from pr_info import PRInfo from report import ( ERROR, - FAILURE, PENDING, SUCCESS, BuildResult, @@ -62,11 +61,11 @@ from report import ( FAIL, ) from s3_helper import S3Helper -from stopwatch import Stopwatch from tee_popen import TeePopen from ci_cache import CiCache from ci_settings import CiSettings from ci_buddy import CIBuddy +from stopwatch import Stopwatch from version_helper import get_version_from_repo # pylint: disable=too-many-lines @@ -370,8 +369,8 @@ def _pre_action(s3, job_name, batch, indata, pr_info): # skip_status = SUCCESS already there GH.print_in_group("Commit Status Data", job_status) - # create pre report - jr = JobReport.create_pre_report(status=skip_status, job_skipped=to_be_skipped) + # create dummy report + jr = JobReport.create_dummy(status=skip_status, job_skipped=to_be_skipped) jr.dump() if not to_be_skipped: @@ -990,19 +989,20 @@ def _run_test(job_name: str, run_command: str) -> int: stopwatch = Stopwatch() job_log = Path(TEMP_PATH) / "job_log.txt" with TeePopen(run_command, job_log, env, timeout) as process: + print(f"Job process started, pid [{process.process.pid}]") retcode = process.wait() if retcode != 0: print(f"Run action failed for: [{job_name}] with exit code [{retcode}]") - if timeout and process.timeout_exceeded: - print(f"Timeout {timeout} exceeded, dumping the job report") - JobReport( - status=FAILURE, - description=f"Timeout {timeout} exceeded", - test_results=[TestResult.create_check_timeout_expired(timeout)], - start_time=stopwatch.start_time_str, - duration=stopwatch.duration_seconds, - additional_files=[job_log], - ).dump() + if process.timeout_exceeded: + print(f"Job timed out: [{job_name}] exit code [{retcode}]") + assert JobReport.exist(), "JobReport real or dummy must be present" + jr = JobReport.load() + if jr.dummy: + print( + f"ERROR: Run action failed with timeout and did not generate JobReport - update dummy report with execution time" + ) + jr.test_results = [TestResult.create_check_timeout_expired()] + jr.duration = stopwatch.duration_seconds print(f"Run action done for: [{job_name}]") return retcode @@ -1205,7 +1205,7 @@ def main() -> int: job_report ), "BUG. There must be job report either real report, or pre-report if job was killed" error_description = "" - if not job_report.pre_report: + if not job_report.dummy: # it's a real job report ch_helper = ClickHouseHelper() check_url = "" diff --git a/tests/ci/ci_definitions.py b/tests/ci/ci_definitions.py index 1bed9db06f2..1d1c39f913d 100644 --- a/tests/ci/ci_definitions.py +++ b/tests/ci/ci_definitions.py @@ -332,7 +332,7 @@ class JobConfig: # will be triggered for the job if omitted in CI workflow yml run_command: str = "" # job timeout, seconds - timeout: Optional[int] = None + timeout: int = 7200 # sets number of batches for a multi-batch job num_batches: int = 1 # label that enables job in CI, if set digest isn't used @@ -421,7 +421,6 @@ class CommonJobConfigs: ), run_command='functional_test_check.py "$CHECK_NAME"', runner_type=Runners.FUNC_TESTER, - timeout=9000, ) STATEFUL_TEST = JobConfig( job_name_keyword="stateful", diff --git a/tests/ci/functional_test_check.py b/tests/ci/functional_test_check.py index b7391eff01b..d08f98fa05f 100644 --- a/tests/ci/functional_test_check.py +++ b/tests/ci/functional_test_check.py @@ -5,10 +5,11 @@ import csv import logging import os import re +import signal import subprocess import sys from pathlib import Path -from typing import List, Tuple +from typing import List, Tuple, Optional from build_download_helper import download_all_deb_packages from clickhouse_helper import CiLogsCredentials @@ -25,11 +26,12 @@ from report import ( TestResults, read_test_results, FAILURE, + TestResult, ) from stopwatch import Stopwatch from tee_popen import TeePopen from ci_config import CI -from ci_utils import Utils +from ci_utils import Utils, Shell NO_CHANGES_MSG = "Nothing to run" @@ -113,10 +115,6 @@ def get_run_command( if flaky_check: envs.append("-e NUM_TRIES=50") - envs.append("-e MAX_RUN_TIME=2800") - else: - max_run_time = os.getenv("MAX_RUN_TIME", "0") - envs.append(f"-e MAX_RUN_TIME={max_run_time}") envs += [f"-e {e}" for e in additional_envs] @@ -128,7 +126,7 @@ def get_run_command( ) return ( - f"docker run --volume={builds_path}:/package_folder " + f"docker run --rm --name func-tester --volume={builds_path}:/package_folder " # For dmesg and sysctl "--privileged " f"{ci_logs_args}" @@ -198,7 +196,7 @@ def process_results( state, description = status[0][0], status[0][1] if ret_code != 0: state = ERROR - description += " (but script exited with an error)" + description = f"Job failed, exit code: {ret_code}. " + description try: results_path = result_directory / "test_results.tsv" @@ -240,7 +238,19 @@ def parse_args(): return parser.parse_args() +test_process = None # type: Optional[TeePopen] +timeout_expired = False + + +def handle_sigterm(signum, _frame): + print(f"WARNING: Received signal {signum}") + global timeout_expired + timeout_expired = True + Shell.check(f"docker exec func-tester pkill -f clickhouse-test", verbose=True) + + def main(): + signal.signal(signal.SIGTERM, handle_sigterm) logging.basicConfig(level=logging.INFO) for handler in logging.root.handlers: # pylint: disable=protected-access @@ -328,11 +338,13 @@ def main(): logging.info("Going to run func tests: %s", run_command) with TeePopen(run_command, run_log_path) as process: + global test_process + test_process = process retcode = process.wait() if retcode == 0: logging.info("Run successfully") else: - logging.info("Run failed") + logging.info("Run failed, exit code %s", retcode) try: subprocess.check_call( @@ -348,6 +360,13 @@ def main(): state, description, test_results, additional_logs = process_results( retcode, result_path, server_log_path ) + if timeout_expired: + description = "Timeout expired" + state = FAILURE + test_results.insert( + 0, TestResult.create_check_timeout_expired(stopwatch.duration_seconds) + ) + else: print( "This is validate bugfix or flaky check run, but no changes test to run - skip with success" diff --git a/tests/ci/report.py b/tests/ci/report.py index 6779a6dae96..c2632719aef 100644 --- a/tests/ci/report.py +++ b/tests/ci/report.py @@ -249,6 +249,7 @@ JOB_REPORT_FILE = Path(GITHUB_WORKSPACE) / "job_report.json" JOB_STARTED_TEST_NAME = "STARTED" JOB_FINISHED_TEST_NAME = "COMPLETED" +JOB_TIMEOUT_TEST_NAME = "Job Timeout Expired" @dataclass @@ -277,8 +278,8 @@ class TestResult: self.log_files.append(log_path) @staticmethod - def create_check_timeout_expired(timeout: float) -> "TestResult": - return TestResult("Check timeout expired", "FAIL", timeout) + def create_check_timeout_expired(duration: Optional[float] = None) -> "TestResult": + return TestResult(JOB_TIMEOUT_TEST_NAME, "FAIL", time=duration) TestResults = List[TestResult] @@ -303,7 +304,7 @@ class JobReport: # indicates that this is not real job report but report for the job that was skipped by rerun check job_skipped: bool = False # indicates that report generated by CI script in order to check later if job was killed before real report is generated - pre_report: bool = False + dummy: bool = False exit_code: int = -1 @staticmethod @@ -311,7 +312,7 @@ class JobReport: return datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") @classmethod - def create_pre_report(cls, status: str, job_skipped: bool) -> "JobReport": + def create_dummy(cls, status: str, job_skipped: bool) -> "JobReport": return JobReport( status=status, description="", @@ -320,7 +321,7 @@ class JobReport: duration=0.0, additional_files=[], job_skipped=job_skipped, - pre_report=True, + dummy=True, ) def update_duration(self): diff --git a/tests/ci/tee_popen.py b/tests/ci/tee_popen.py index 13db50df53f..ad3e62dab9c 100644 --- a/tests/ci/tee_popen.py +++ b/tests/ci/tee_popen.py @@ -2,6 +2,8 @@ import logging import os +import signal +import subprocess import sys from io import TextIOWrapper from pathlib import Path @@ -30,20 +32,35 @@ class TeePopen: self._process = None # type: Optional[Popen] self.timeout = timeout self.timeout_exceeded = False + self.terminated_by_sigterm = False + self.terminated_by_sigkill = False + self.pid = 0 def _check_timeout(self) -> None: if self.timeout is None: return sleep(self.timeout) + logging.warning( + "Timeout exceeded. Send SIGTERM to process %s, timeout %s", + self.process.pid, + self.timeout, + ) + self.send_signal(signal.SIGTERM) + time_wait = 0 + self.terminated_by_sigterm = True self.timeout_exceeded = True + while self.process.poll() is None and time_wait < 100: + print("wait...") + wait = 5 + sleep(wait) + time_wait += wait while self.process.poll() is None: - logging.warning( - "Killing process %s, timeout %s exceeded", - self.process.pid, - self.timeout, + logging.error( + "Process is still running. Send SIGKILL", ) - os.killpg(self.process.pid, 9) - sleep(10) + self.send_signal(signal.SIGKILL) + self.terminated_by_sigkill = True + sleep(5) def __enter__(self) -> "TeePopen": self.process = Popen( @@ -57,6 +74,9 @@ class TeePopen: bufsize=1, errors="backslashreplace", ) + sleep(1) + self.pid = self._get_child_pid() + print(f"Subprocess started, pid [{self.process.pid}], child pid [{self.pid}]") if self.timeout is not None and self.timeout > 0: t = Thread(target=self._check_timeout) t.daemon = True # does not block the program from exit @@ -77,6 +97,22 @@ class TeePopen: self.log_file.close() + def _get_child_pid(self): + # linux only + ps_command = f"ps --ppid {self.process.pid} -o pid=" + res = "NA" + try: + result = subprocess.run( + ps_command, shell=True, capture_output=True, text=True + ) + res = result.stdout.strip() + pid = int(res) + return pid + except Exception as e: + print(f"Failed to get child's pid, command [{ps_command}], result [{res}]") + print(f"ERROR: getting Python subprocess PID: {e}") + return self.process.pid + def wait(self) -> int: if self.process.stdout is not None: for line in self.process.stdout: @@ -85,6 +121,15 @@ class TeePopen: return self.process.wait() + def poll(self): + return self.process.poll() + + def send_signal(self, signal_num): + if self.pid: + os.kill(self.pid, signal_num) + else: + print("ERROR: no process to send signal") + @property def process(self) -> Popen: if self._process is not None: