mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
Backport #67944 to 24.8: CI: Job Timeout handling
This commit is contained in:
parent
e9b6b93cc4
commit
f234ccf185
@ -256,22 +256,6 @@ function configure
|
||||
rm -f "$FASTTEST_DATA/config.d/secure_ports.xml"
|
||||
}
|
||||
|
||||
function timeout_with_logging() {
|
||||
local exit_code=0
|
||||
|
||||
timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
|
||||
|
||||
echo "Checking if it is a timeout. The code 124 will indicate a timeout."
|
||||
if [[ "${exit_code}" -eq "124" ]]
|
||||
then
|
||||
echo "The command 'timeout ${*}' has been killed by timeout."
|
||||
else
|
||||
echo "No, it isn't a timeout."
|
||||
fi
|
||||
|
||||
return $exit_code
|
||||
}
|
||||
|
||||
function run_tests
|
||||
{
|
||||
clickhouse-server --version
|
||||
@ -340,7 +324,7 @@ case "$stage" in
|
||||
configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt"
|
||||
;&
|
||||
"run_tests")
|
||||
timeout_with_logging 35m bash -c run_tests ||:
|
||||
run_tests ||:
|
||||
/process_functional_tests_result.py --in-results-dir "$FASTTEST_OUTPUT/" \
|
||||
--out-results-file "$FASTTEST_OUTPUT/test_results.tsv" \
|
||||
--out-status-file "$FASTTEST_OUTPUT/check_status.tsv" || echo -e "failure\tCannot parse results" > "$FASTTEST_OUTPUT/check_status.tsv"
|
||||
|
@ -35,7 +35,6 @@ RUN mkdir -p /tmp/clickhouse-odbc-tmp \
|
||||
|
||||
|
||||
ENV TZ=Europe/Amsterdam
|
||||
ENV MAX_RUN_TIME=9000
|
||||
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
|
||||
|
||||
ARG sqllogic_test_repo="https://github.com/gregrahn/sqllogictest.git"
|
||||
|
@ -94,7 +94,7 @@ function run_tests()
|
||||
|
||||
export -f run_tests
|
||||
|
||||
timeout "${MAX_RUN_TIME:-9000}" bash -c run_tests || echo "timeout reached" >&2
|
||||
run_tests
|
||||
|
||||
#/process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv
|
||||
|
||||
|
@ -22,7 +22,6 @@ ARG sqltest_repo="https://github.com/elliotchance/sqltest/"
|
||||
RUN git clone ${sqltest_repo}
|
||||
|
||||
ENV TZ=UTC
|
||||
ENV MAX_RUN_TIME=900
|
||||
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
|
||||
|
||||
COPY run.sh /
|
||||
|
@ -4,9 +4,6 @@
|
||||
source /setup_export_logs.sh
|
||||
set -e -x
|
||||
|
||||
MAX_RUN_TIME=${MAX_RUN_TIME:-3600}
|
||||
MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 3600 : MAX_RUN_TIME))
|
||||
|
||||
# Choose random timezone for this test run
|
||||
TZ="$(rg -v '#' /usr/share/zoneinfo/zone.tab | awk '{print $3}' | shuf | head -n1)"
|
||||
echo "Choosen random timezone $TZ"
|
||||
@ -123,9 +120,6 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]
|
||||
|
||||
clickhouse-client --query "DROP TABLE datasets.hits_v1"
|
||||
clickhouse-client --query "DROP TABLE datasets.visits_v1"
|
||||
|
||||
MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000)) # min(MAX_RUN_TIME, 2.5 hours)
|
||||
MAX_RUN_TIME=$((MAX_RUN_TIME != 0 ? MAX_RUN_TIME : 9000)) # set to 2.5 hours if 0 (unlimited)
|
||||
else
|
||||
clickhouse-client --query "CREATE DATABASE test"
|
||||
clickhouse-client --query "SHOW TABLES FROM test"
|
||||
@ -257,24 +251,7 @@ function run_tests()
|
||||
|
||||
export -f run_tests
|
||||
|
||||
function timeout_with_logging() {
|
||||
local exit_code=0
|
||||
|
||||
timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
|
||||
|
||||
echo "Checking if it is a timeout. The code 124 will indicate a timeout."
|
||||
if [[ "${exit_code}" -eq "124" ]]
|
||||
then
|
||||
echo "The command 'timeout ${*}' has been killed by timeout."
|
||||
else
|
||||
echo "No, it isn't a timeout."
|
||||
fi
|
||||
|
||||
return $exit_code
|
||||
}
|
||||
|
||||
TIMEOUT=$((MAX_RUN_TIME - 700))
|
||||
timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
|
||||
run_tests ||:
|
||||
|
||||
echo "Files in current directory"
|
||||
ls -la ./
|
||||
|
@ -65,7 +65,6 @@ ENV TZ=Europe/Amsterdam
|
||||
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
|
||||
|
||||
ENV NUM_TRIES=1
|
||||
ENV MAX_RUN_TIME=0
|
||||
|
||||
# Unrelated to vars in setup_minio.sh, but should be the same there
|
||||
# to have the same binaries for local running scenario
|
||||
|
@ -12,9 +12,6 @@ dmesg --clear
|
||||
# fail on errors, verbose and export all env variables
|
||||
set -e -x -a
|
||||
|
||||
MAX_RUN_TIME=${MAX_RUN_TIME:-9000}
|
||||
MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 9000 : MAX_RUN_TIME))
|
||||
|
||||
USE_DATABASE_REPLICATED=${USE_DATABASE_REPLICATED:=0}
|
||||
USE_SHARED_CATALOG=${USE_SHARED_CATALOG:=0}
|
||||
|
||||
@ -308,8 +305,6 @@ function run_tests()
|
||||
|
||||
try_run_with_retry 10 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')"
|
||||
|
||||
TIMEOUT=$((MAX_RUN_TIME - 800 > 8400 ? 8400 : MAX_RUN_TIME - 800))
|
||||
START_TIME=${SECONDS}
|
||||
set +e
|
||||
|
||||
TEST_ARGS=(
|
||||
@ -324,32 +319,22 @@ function run_tests()
|
||||
--test-runs "$NUM_TRIES"
|
||||
"${ADDITIONAL_OPTIONS[@]}"
|
||||
)
|
||||
timeout --preserve-status --signal TERM --kill-after 60m ${TIMEOUT}s clickhouse-test "${TEST_ARGS[@]}" 2>&1 \
|
||||
clickhouse-test "${TEST_ARGS[@]}" 2>&1 \
|
||||
| ts '%Y-%m-%d %H:%M:%S' \
|
||||
| tee -a test_output/test_result.txt
|
||||
set -e
|
||||
DURATION=$((SECONDS - START_TIME))
|
||||
|
||||
echo "Elapsed ${DURATION} seconds."
|
||||
if [[ $DURATION -ge $TIMEOUT ]]
|
||||
then
|
||||
echo "It looks like the command is terminated by the timeout, which is ${TIMEOUT} seconds."
|
||||
fi
|
||||
}
|
||||
|
||||
export -f run_tests
|
||||
|
||||
|
||||
# This should be enough to setup job and collect artifacts
|
||||
TIMEOUT=$((MAX_RUN_TIME - 700))
|
||||
if [ "$NUM_TRIES" -gt "1" ]; then
|
||||
# We don't run tests with Ordinary database in PRs, only in master.
|
||||
# So run new/changed tests with Ordinary at least once in flaky check.
|
||||
timeout_with_logging "$TIMEOUT" bash -c 'NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests' \
|
||||
NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests \
|
||||
| sed 's/All tests have finished/Redacted: a message about tests finish is deleted/' | sed 's/No tests were run/Redacted: a message about no tests run is deleted/' ||:
|
||||
fi
|
||||
|
||||
timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
|
||||
run_tests ||:
|
||||
|
||||
echo "Files in current directory"
|
||||
ls -la ./
|
||||
|
@ -40,22 +40,6 @@ function fn_exists() {
|
||||
declare -F "$1" > /dev/null;
|
||||
}
|
||||
|
||||
function timeout_with_logging() {
|
||||
local exit_code=0
|
||||
|
||||
timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
|
||||
|
||||
echo "Checking if it is a timeout. The code 124 will indicate a timeout."
|
||||
if [[ "${exit_code}" -eq "124" ]]
|
||||
then
|
||||
echo "The command 'timeout ${*}' has been killed by timeout."
|
||||
else
|
||||
echo "No, it isn't a timeout."
|
||||
fi
|
||||
|
||||
return $exit_code
|
||||
}
|
||||
|
||||
function collect_core_dumps()
|
||||
{
|
||||
find . -type f -maxdepth 1 -name 'core.*' | while read -r core; do
|
||||
|
@ -50,7 +50,6 @@ from github_helper import GitHub
|
||||
from pr_info import PRInfo
|
||||
from report import (
|
||||
ERROR,
|
||||
FAILURE,
|
||||
PENDING,
|
||||
SUCCESS,
|
||||
BuildResult,
|
||||
@ -62,11 +61,11 @@ from report import (
|
||||
FAIL,
|
||||
)
|
||||
from s3_helper import S3Helper
|
||||
from stopwatch import Stopwatch
|
||||
from tee_popen import TeePopen
|
||||
from ci_cache import CiCache
|
||||
from ci_settings import CiSettings
|
||||
from ci_buddy import CIBuddy
|
||||
from stopwatch import Stopwatch
|
||||
from version_helper import get_version_from_repo
|
||||
|
||||
# pylint: disable=too-many-lines
|
||||
@ -370,8 +369,8 @@ def _pre_action(s3, job_name, batch, indata, pr_info):
|
||||
# skip_status = SUCCESS already there
|
||||
GH.print_in_group("Commit Status Data", job_status)
|
||||
|
||||
# create pre report
|
||||
jr = JobReport.create_pre_report(status=skip_status, job_skipped=to_be_skipped)
|
||||
# create dummy report
|
||||
jr = JobReport.create_dummy(status=skip_status, job_skipped=to_be_skipped)
|
||||
jr.dump()
|
||||
|
||||
if not to_be_skipped:
|
||||
@ -990,19 +989,21 @@ def _run_test(job_name: str, run_command: str) -> int:
|
||||
stopwatch = Stopwatch()
|
||||
job_log = Path(TEMP_PATH) / "job_log.txt"
|
||||
with TeePopen(run_command, job_log, env, timeout) as process:
|
||||
print(f"Job process started, pid [{process.process.pid}]")
|
||||
retcode = process.wait()
|
||||
if retcode != 0:
|
||||
print(f"Run action failed for: [{job_name}] with exit code [{retcode}]")
|
||||
if timeout and process.timeout_exceeded:
|
||||
print(f"Timeout {timeout} exceeded, dumping the job report")
|
||||
JobReport(
|
||||
status=FAILURE,
|
||||
description=f"Timeout {timeout} exceeded",
|
||||
test_results=[TestResult.create_check_timeout_expired(timeout)],
|
||||
start_time=stopwatch.start_time_str,
|
||||
duration=stopwatch.duration_seconds,
|
||||
additional_files=[job_log],
|
||||
).dump()
|
||||
if process.timeout_exceeded:
|
||||
print(f"Job timed out: [{job_name}] exit code [{retcode}]")
|
||||
assert JobReport.exist(), "JobReport real or dummy must be present"
|
||||
jr = JobReport.load()
|
||||
if jr.dummy:
|
||||
print(
|
||||
f"ERROR: Run action failed with timeout and did not generate JobReport - update dummy report with execution time"
|
||||
)
|
||||
jr.test_results = [TestResult.create_check_timeout_expired()]
|
||||
jr.duration = stopwatch.duration_seconds
|
||||
jr.additional_files += [job_log]
|
||||
|
||||
print(f"Run action done for: [{job_name}]")
|
||||
return retcode
|
||||
@ -1205,7 +1206,7 @@ def main() -> int:
|
||||
job_report
|
||||
), "BUG. There must be job report either real report, or pre-report if job was killed"
|
||||
error_description = ""
|
||||
if not job_report.pre_report:
|
||||
if not job_report.dummy:
|
||||
# it's a real job report
|
||||
ch_helper = ClickHouseHelper()
|
||||
check_url = ""
|
||||
@ -1329,10 +1330,20 @@ def main() -> int:
|
||||
if CI.is_test_job(args.job_name):
|
||||
gh = GitHub(get_best_robot_token(), per_page=100)
|
||||
commit = get_commit(gh, pr_info.sha)
|
||||
check_url = ""
|
||||
if job_report.test_results or job_report.additional_files:
|
||||
check_url = upload_result_helper.upload_results(
|
||||
s3,
|
||||
pr_info.number,
|
||||
pr_info.sha,
|
||||
job_report.test_results,
|
||||
job_report.additional_files,
|
||||
job_report.check_name or _get_ext_check_name(args.job_name),
|
||||
)
|
||||
post_commit_status(
|
||||
commit,
|
||||
ERROR,
|
||||
"",
|
||||
check_url,
|
||||
"Error: " + error_description,
|
||||
_get_ext_check_name(args.job_name),
|
||||
pr_info,
|
||||
|
@ -315,6 +315,7 @@ class CI:
|
||||
JobNames.STATEFUL_TEST_PARALLEL_REPL_TSAN: CommonJobConfigs.STATEFUL_TEST.with_properties(
|
||||
required_builds=[BuildNames.PACKAGE_TSAN],
|
||||
random_bucket="parrepl_with_sanitizer",
|
||||
timeout=3600,
|
||||
),
|
||||
JobNames.STATELESS_TEST_ASAN: CommonJobConfigs.STATELESS_TEST.with_properties(
|
||||
required_builds=[BuildNames.PACKAGE_ASAN], num_batches=2
|
||||
@ -342,17 +343,17 @@ class CI:
|
||||
runner_type=Runners.FUNC_TESTER_ARM,
|
||||
),
|
||||
JobNames.STATELESS_TEST_OLD_ANALYZER_S3_REPLICATED_RELEASE: CommonJobConfigs.STATELESS_TEST.with_properties(
|
||||
required_builds=[BuildNames.PACKAGE_RELEASE], num_batches=4
|
||||
required_builds=[BuildNames.PACKAGE_RELEASE], num_batches=2
|
||||
),
|
||||
JobNames.STATELESS_TEST_S3_DEBUG: CommonJobConfigs.STATELESS_TEST.with_properties(
|
||||
required_builds=[BuildNames.PACKAGE_DEBUG], num_batches=2
|
||||
required_builds=[BuildNames.PACKAGE_DEBUG], num_batches=1
|
||||
),
|
||||
JobNames.STATELESS_TEST_AZURE_ASAN: CommonJobConfigs.STATELESS_TEST.with_properties(
|
||||
required_builds=[BuildNames.PACKAGE_ASAN], num_batches=3, release_only=True
|
||||
),
|
||||
JobNames.STATELESS_TEST_S3_TSAN: CommonJobConfigs.STATELESS_TEST.with_properties(
|
||||
required_builds=[BuildNames.PACKAGE_TSAN],
|
||||
num_batches=4,
|
||||
num_batches=3,
|
||||
),
|
||||
JobNames.STRESS_TEST_DEBUG: CommonJobConfigs.STRESS_TEST.with_properties(
|
||||
required_builds=[BuildNames.PACKAGE_DEBUG],
|
||||
@ -400,7 +401,8 @@ class CI:
|
||||
required_builds=[BuildNames.PACKAGE_ASAN], release_only=True, num_batches=4
|
||||
),
|
||||
JobNames.INTEGRATION_TEST_ASAN_OLD_ANALYZER: CommonJobConfigs.INTEGRATION_TEST.with_properties(
|
||||
required_builds=[BuildNames.PACKAGE_ASAN], num_batches=6
|
||||
required_builds=[BuildNames.PACKAGE_ASAN],
|
||||
num_batches=6,
|
||||
),
|
||||
JobNames.INTEGRATION_TEST_TSAN: CommonJobConfigs.INTEGRATION_TEST.with_properties(
|
||||
required_builds=[BuildNames.PACKAGE_TSAN], num_batches=6
|
||||
|
@ -331,7 +331,7 @@ class JobConfig:
|
||||
# will be triggered for the job if omitted in CI workflow yml
|
||||
run_command: str = ""
|
||||
# job timeout, seconds
|
||||
timeout: Optional[int] = None
|
||||
timeout: int = 7200
|
||||
# sets number of batches for a multi-batch job
|
||||
num_batches: int = 1
|
||||
# label that enables job in CI, if set digest isn't used
|
||||
@ -420,7 +420,6 @@ class CommonJobConfigs:
|
||||
),
|
||||
run_command='functional_test_check.py "$CHECK_NAME"',
|
||||
runner_type=Runners.FUNC_TESTER,
|
||||
timeout=9000,
|
||||
)
|
||||
STATEFUL_TEST = JobConfig(
|
||||
job_name_keyword="stateful",
|
||||
@ -465,6 +464,7 @@ class CommonJobConfigs:
|
||||
),
|
||||
run_command="upgrade_check.py",
|
||||
runner_type=Runners.STRESS_TESTER,
|
||||
timeout=3600,
|
||||
)
|
||||
INTEGRATION_TEST = JobConfig(
|
||||
job_name_keyword="integration",
|
||||
|
@ -5,10 +5,11 @@ import csv
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
from build_download_helper import download_all_deb_packages
|
||||
from clickhouse_helper import CiLogsCredentials
|
||||
@ -25,11 +26,12 @@ from report import (
|
||||
TestResults,
|
||||
read_test_results,
|
||||
FAILURE,
|
||||
TestResult,
|
||||
)
|
||||
from stopwatch import Stopwatch
|
||||
from tee_popen import TeePopen
|
||||
from ci_config import CI
|
||||
from ci_utils import Utils
|
||||
from ci_utils import Utils, Shell
|
||||
|
||||
NO_CHANGES_MSG = "Nothing to run"
|
||||
|
||||
@ -113,10 +115,6 @@ def get_run_command(
|
||||
|
||||
if flaky_check:
|
||||
envs.append("-e NUM_TRIES=50")
|
||||
envs.append("-e MAX_RUN_TIME=2800")
|
||||
else:
|
||||
max_run_time = os.getenv("MAX_RUN_TIME", "0")
|
||||
envs.append(f"-e MAX_RUN_TIME={max_run_time}")
|
||||
|
||||
envs += [f"-e {e}" for e in additional_envs]
|
||||
|
||||
@ -128,7 +126,7 @@ def get_run_command(
|
||||
)
|
||||
|
||||
return (
|
||||
f"docker run --volume={builds_path}:/package_folder "
|
||||
f"docker run --rm --name func-tester --volume={builds_path}:/package_folder "
|
||||
# For dmesg and sysctl
|
||||
"--privileged "
|
||||
f"{ci_logs_args}"
|
||||
@ -198,7 +196,7 @@ def process_results(
|
||||
state, description = status[0][0], status[0][1]
|
||||
if ret_code != 0:
|
||||
state = ERROR
|
||||
description += " (but script exited with an error)"
|
||||
description = f"Job failed, exit code: {ret_code}. " + description
|
||||
|
||||
try:
|
||||
results_path = result_directory / "test_results.tsv"
|
||||
@ -240,7 +238,19 @@ def parse_args():
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
test_process = None # type: Optional[TeePopen]
|
||||
timeout_expired = False
|
||||
|
||||
|
||||
def handle_sigterm(signum, _frame):
|
||||
print(f"WARNING: Received signal {signum}")
|
||||
global timeout_expired
|
||||
timeout_expired = True
|
||||
Shell.check(f"docker exec func-tester pkill -f clickhouse-test", verbose=True)
|
||||
|
||||
|
||||
def main():
|
||||
signal.signal(signal.SIGTERM, handle_sigterm)
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for handler in logging.root.handlers:
|
||||
# pylint: disable=protected-access
|
||||
@ -328,11 +338,13 @@ def main():
|
||||
logging.info("Going to run func tests: %s", run_command)
|
||||
|
||||
with TeePopen(run_command, run_log_path) as process:
|
||||
global test_process
|
||||
test_process = process
|
||||
retcode = process.wait()
|
||||
if retcode == 0:
|
||||
logging.info("Run successfully")
|
||||
else:
|
||||
logging.info("Run failed")
|
||||
logging.info("Run failed, exit code %s", retcode)
|
||||
|
||||
try:
|
||||
subprocess.check_call(
|
||||
@ -348,6 +360,13 @@ def main():
|
||||
state, description, test_results, additional_logs = process_results(
|
||||
retcode, result_path, server_log_path
|
||||
)
|
||||
if timeout_expired:
|
||||
description = "Timeout expired"
|
||||
state = FAILURE
|
||||
test_results.insert(
|
||||
0, TestResult.create_check_timeout_expired(stopwatch.duration_seconds)
|
||||
)
|
||||
|
||||
else:
|
||||
print(
|
||||
"This is validate bugfix or flaky check run, but no changes test to run - skip with success"
|
||||
|
@ -9,6 +9,7 @@ import random
|
||||
import re
|
||||
import shlex
|
||||
import shutil
|
||||
import signal
|
||||
import string
|
||||
import subprocess
|
||||
import sys
|
||||
@ -16,11 +17,13 @@ import time
|
||||
import zlib # for crc32
|
||||
from collections import defaultdict
|
||||
from itertools import chain
|
||||
from typing import Any, Dict
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
from env_helper import IS_CI
|
||||
from integration_test_images import IMAGES
|
||||
from tee_popen import TeePopen
|
||||
from report import JOB_TIMEOUT_TEST_NAME
|
||||
from stopwatch import Stopwatch
|
||||
|
||||
MAX_RETRY = 1
|
||||
NUM_WORKERS = 5
|
||||
@ -621,6 +624,9 @@ class ClickhouseIntegrationTestsRunner:
|
||||
test_data_dirs = {}
|
||||
|
||||
for i in range(num_tries):
|
||||
if timeout_expired:
|
||||
print("Timeout expired - break test group execution")
|
||||
break
|
||||
logging.info("Running test group %s for the %s retry", test_group, i)
|
||||
clear_ip_tables_and_restart_daemons()
|
||||
|
||||
@ -657,6 +663,8 @@ class ClickhouseIntegrationTestsRunner:
|
||||
logging.info("Executing cmd: %s", cmd)
|
||||
# ignore retcode, since it meaningful due to pipe to tee
|
||||
with subprocess.Popen(cmd, shell=True, stderr=log, stdout=log) as proc:
|
||||
global runner_subprocess
|
||||
runner_subprocess = proc
|
||||
proc.wait()
|
||||
|
||||
extra_logs_names = [log_basename]
|
||||
@ -780,6 +788,9 @@ class ClickhouseIntegrationTestsRunner:
|
||||
logs = []
|
||||
tries_num = 1 if should_fail else FLAKY_TRIES_COUNT
|
||||
for i in range(tries_num):
|
||||
if timeout_expired:
|
||||
print("Timeout expired - break flaky check execution")
|
||||
break
|
||||
final_retry += 1
|
||||
logging.info("Running tests for the %s time", i)
|
||||
counters, tests_times, log_paths = self.try_run_test_group(
|
||||
@ -839,6 +850,7 @@ class ClickhouseIntegrationTestsRunner:
|
||||
return result_state, status_text, test_result, logs
|
||||
|
||||
def run_impl(self, repo_path, build_path):
|
||||
stopwatch = Stopwatch()
|
||||
if self.flaky_check or self.bugfix_validate_check:
|
||||
return self.run_flaky_check(
|
||||
repo_path, build_path, should_fail=self.bugfix_validate_check
|
||||
@ -921,6 +933,9 @@ class ClickhouseIntegrationTestsRunner:
|
||||
random.shuffle(items_to_run)
|
||||
|
||||
for group, tests in items_to_run:
|
||||
if timeout_expired:
|
||||
print("Timeout expired - break tests execution")
|
||||
break
|
||||
logging.info("Running test group %s containing %s tests", group, len(tests))
|
||||
group_counters, group_test_times, log_paths = self.try_run_test_group(
|
||||
repo_path, group, tests, MAX_RETRY, NUM_WORKERS, 0
|
||||
@ -981,6 +996,17 @@ class ClickhouseIntegrationTestsRunner:
|
||||
status_text = "Timeout, " + status_text
|
||||
result_state = "failure"
|
||||
|
||||
if timeout_expired:
|
||||
logging.error(
|
||||
"Job killed by external timeout signal - setting status to failure!"
|
||||
)
|
||||
status_text = "Job timeout expired, " + status_text
|
||||
result_state = "failure"
|
||||
# add mock test case to make timeout visible in job report and in ci db
|
||||
test_result.insert(
|
||||
0, (JOB_TIMEOUT_TEST_NAME, "FAIL", f"{stopwatch.duration_seconds}", "")
|
||||
)
|
||||
|
||||
if not counters or sum(len(counter) for counter in counters.values()) == 0:
|
||||
status_text = "No tests found for some reason! It's a bug"
|
||||
result_state = "failure"
|
||||
@ -1001,6 +1027,7 @@ def write_results(results_file, status_file, results, status):
|
||||
|
||||
|
||||
def run():
|
||||
signal.signal(signal.SIGTERM, handle_sigterm)
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
|
||||
|
||||
repo_path = os.environ.get("CLICKHOUSE_TESTS_REPO_PATH")
|
||||
@ -1035,5 +1062,17 @@ def run():
|
||||
logging.info("Result written")
|
||||
|
||||
|
||||
timeout_expired = False
|
||||
runner_subprocess = None # type:Optional[subprocess.Popen]
|
||||
|
||||
|
||||
def handle_sigterm(signum, _frame):
|
||||
print(f"WARNING: Received signal {signum}")
|
||||
global timeout_expired
|
||||
timeout_expired = True
|
||||
if runner_subprocess:
|
||||
runner_subprocess.send_signal(signal.SIGTERM)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run()
|
||||
|
@ -249,6 +249,7 @@ JOB_REPORT_FILE = Path(GITHUB_WORKSPACE) / "job_report.json"
|
||||
|
||||
JOB_STARTED_TEST_NAME = "STARTED"
|
||||
JOB_FINISHED_TEST_NAME = "COMPLETED"
|
||||
JOB_TIMEOUT_TEST_NAME = "Job Timeout Expired"
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -277,8 +278,8 @@ class TestResult:
|
||||
self.log_files.append(log_path)
|
||||
|
||||
@staticmethod
|
||||
def create_check_timeout_expired(timeout: float) -> "TestResult":
|
||||
return TestResult("Check timeout expired", "FAIL", timeout)
|
||||
def create_check_timeout_expired(duration: Optional[float] = None) -> "TestResult":
|
||||
return TestResult(JOB_TIMEOUT_TEST_NAME, "FAIL", time=duration)
|
||||
|
||||
|
||||
TestResults = List[TestResult]
|
||||
@ -303,7 +304,7 @@ class JobReport:
|
||||
# indicates that this is not real job report but report for the job that was skipped by rerun check
|
||||
job_skipped: bool = False
|
||||
# indicates that report generated by CI script in order to check later if job was killed before real report is generated
|
||||
pre_report: bool = False
|
||||
dummy: bool = False
|
||||
exit_code: int = -1
|
||||
|
||||
@staticmethod
|
||||
@ -311,7 +312,7 @@ class JobReport:
|
||||
return datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
@classmethod
|
||||
def create_pre_report(cls, status: str, job_skipped: bool) -> "JobReport":
|
||||
def create_dummy(cls, status: str, job_skipped: bool) -> "JobReport":
|
||||
return JobReport(
|
||||
status=status,
|
||||
description="",
|
||||
@ -320,7 +321,7 @@ class JobReport:
|
||||
duration=0.0,
|
||||
additional_files=[],
|
||||
job_skipped=job_skipped,
|
||||
pre_report=True,
|
||||
dummy=True,
|
||||
)
|
||||
|
||||
def update_duration(self):
|
||||
@ -741,10 +742,21 @@ def create_test_html_report(
|
||||
has_test_time = any(tr.time is not None for tr in test_results)
|
||||
has_log_urls = False
|
||||
|
||||
# Display entires with logs at the top (they correspond to failed tests)
|
||||
test_results.sort(
|
||||
key=lambda result: result.raw_logs is None and result.log_files is None
|
||||
)
|
||||
def sort_key(status):
|
||||
if "fail" in status.lower():
|
||||
return 0
|
||||
elif "error" in status.lower():
|
||||
return 1
|
||||
elif "not" in status.lower():
|
||||
return 2
|
||||
elif "ok" in status.lower():
|
||||
return 10
|
||||
elif "success" in status.lower():
|
||||
return 9
|
||||
else:
|
||||
return 5
|
||||
|
||||
test_results.sort(key=lambda result: sort_key(result.status))
|
||||
|
||||
for test_result in test_results:
|
||||
colspan = 0
|
||||
|
@ -2,6 +2,7 @@
|
||||
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
from io import TextIOWrapper
|
||||
from pathlib import Path
|
||||
@ -30,20 +31,34 @@ class TeePopen:
|
||||
self._process = None # type: Optional[Popen]
|
||||
self.timeout = timeout
|
||||
self.timeout_exceeded = False
|
||||
self.terminated_by_sigterm = False
|
||||
self.terminated_by_sigkill = False
|
||||
|
||||
def _check_timeout(self) -> None:
|
||||
if self.timeout is None:
|
||||
return
|
||||
sleep(self.timeout)
|
||||
logging.warning(
|
||||
"Timeout exceeded. Send SIGTERM to process %s, timeout %s",
|
||||
self.process.pid,
|
||||
self.timeout,
|
||||
)
|
||||
self.send_signal(signal.SIGTERM)
|
||||
time_wait = 0
|
||||
self.terminated_by_sigterm = True
|
||||
self.timeout_exceeded = True
|
||||
while self.process.poll() is None and time_wait < 100:
|
||||
print("wait...")
|
||||
wait = 5
|
||||
sleep(wait)
|
||||
time_wait += wait
|
||||
while self.process.poll() is None:
|
||||
logging.warning(
|
||||
"Killing process %s, timeout %s exceeded",
|
||||
self.process.pid,
|
||||
self.timeout,
|
||||
logging.error(
|
||||
"Process is still running. Send SIGKILL",
|
||||
)
|
||||
os.killpg(self.process.pid, 9)
|
||||
sleep(10)
|
||||
self.send_signal(signal.SIGKILL)
|
||||
self.terminated_by_sigkill = True
|
||||
sleep(5)
|
||||
|
||||
def __enter__(self) -> "TeePopen":
|
||||
self.process = Popen(
|
||||
@ -57,6 +72,8 @@ class TeePopen:
|
||||
bufsize=1,
|
||||
errors="backslashreplace",
|
||||
)
|
||||
sleep(1)
|
||||
print(f"Subprocess started, pid [{self.process.pid}]")
|
||||
if self.timeout is not None and self.timeout > 0:
|
||||
t = Thread(target=self._check_timeout)
|
||||
t.daemon = True # does not block the program from exit
|
||||
@ -85,6 +102,12 @@ class TeePopen:
|
||||
|
||||
return self.process.wait()
|
||||
|
||||
def poll(self):
|
||||
return self.process.poll()
|
||||
|
||||
def send_signal(self, signal_num):
|
||||
os.killpg(self.process.pid, signal_num)
|
||||
|
||||
@property
|
||||
def process(self) -> Popen:
|
||||
if self._process is not None:
|
||||
|
Loading…
Reference in New Issue
Block a user