mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-19 16:20:50 +00:00
CI: Functional tests to store artifacts on timeout
This commit is contained in:
parent
8d0d870326
commit
427016a450
@ -256,22 +256,6 @@ function configure
|
||||
rm -f "$FASTTEST_DATA/config.d/secure_ports.xml"
|
||||
}
|
||||
|
||||
function timeout_with_logging() {
|
||||
local exit_code=0
|
||||
|
||||
timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
|
||||
|
||||
echo "Checking if it is a timeout. The code 124 will indicate a timeout."
|
||||
if [[ "${exit_code}" -eq "124" ]]
|
||||
then
|
||||
echo "The command 'timeout ${*}' has been killed by timeout."
|
||||
else
|
||||
echo "No, it isn't a timeout."
|
||||
fi
|
||||
|
||||
return $exit_code
|
||||
}
|
||||
|
||||
function run_tests
|
||||
{
|
||||
clickhouse-server --version
|
||||
@ -340,7 +324,7 @@ case "$stage" in
|
||||
configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt"
|
||||
;&
|
||||
"run_tests")
|
||||
timeout_with_logging 35m bash -c run_tests ||:
|
||||
run_tests ||:
|
||||
/process_functional_tests_result.py --in-results-dir "$FASTTEST_OUTPUT/" \
|
||||
--out-results-file "$FASTTEST_OUTPUT/test_results.tsv" \
|
||||
--out-status-file "$FASTTEST_OUTPUT/check_status.tsv" || echo -e "failure\tCannot parse results" > "$FASTTEST_OUTPUT/check_status.tsv"
|
||||
|
@ -35,7 +35,6 @@ RUN mkdir -p /tmp/clickhouse-odbc-tmp \
|
||||
|
||||
|
||||
ENV TZ=Europe/Amsterdam
|
||||
ENV MAX_RUN_TIME=9000
|
||||
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
|
||||
|
||||
ARG sqllogic_test_repo="https://github.com/gregrahn/sqllogictest.git"
|
||||
|
@ -94,7 +94,7 @@ function run_tests()
|
||||
|
||||
export -f run_tests
|
||||
|
||||
timeout "${MAX_RUN_TIME:-9000}" bash -c run_tests || echo "timeout reached" >&2
|
||||
run_tests
|
||||
|
||||
#/process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv
|
||||
|
||||
|
@ -22,7 +22,6 @@ ARG sqltest_repo="https://github.com/elliotchance/sqltest/"
|
||||
RUN git clone ${sqltest_repo}
|
||||
|
||||
ENV TZ=UTC
|
||||
ENV MAX_RUN_TIME=900
|
||||
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
|
||||
|
||||
COPY run.sh /
|
||||
|
@ -4,9 +4,6 @@
|
||||
source /setup_export_logs.sh
|
||||
set -e -x
|
||||
|
||||
MAX_RUN_TIME=${MAX_RUN_TIME:-3600}
|
||||
MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 3600 : MAX_RUN_TIME))
|
||||
|
||||
# Choose random timezone for this test run
|
||||
TZ="$(rg -v '#' /usr/share/zoneinfo/zone.tab | awk '{print $3}' | shuf | head -n1)"
|
||||
echo "Choosen random timezone $TZ"
|
||||
@ -123,9 +120,6 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]
|
||||
|
||||
clickhouse-client --query "DROP TABLE datasets.hits_v1"
|
||||
clickhouse-client --query "DROP TABLE datasets.visits_v1"
|
||||
|
||||
MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000)) # min(MAX_RUN_TIME, 2.5 hours)
|
||||
MAX_RUN_TIME=$((MAX_RUN_TIME != 0 ? MAX_RUN_TIME : 9000)) # set to 2.5 hours if 0 (unlimited)
|
||||
else
|
||||
clickhouse-client --query "CREATE DATABASE test"
|
||||
clickhouse-client --query "SHOW TABLES FROM test"
|
||||
@ -257,24 +251,7 @@ function run_tests()
|
||||
|
||||
export -f run_tests
|
||||
|
||||
function timeout_with_logging() {
|
||||
local exit_code=0
|
||||
|
||||
timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
|
||||
|
||||
echo "Checking if it is a timeout. The code 124 will indicate a timeout."
|
||||
if [[ "${exit_code}" -eq "124" ]]
|
||||
then
|
||||
echo "The command 'timeout ${*}' has been killed by timeout."
|
||||
else
|
||||
echo "No, it isn't a timeout."
|
||||
fi
|
||||
|
||||
return $exit_code
|
||||
}
|
||||
|
||||
TIMEOUT=$((MAX_RUN_TIME - 700))
|
||||
timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
|
||||
run_tests ||:
|
||||
|
||||
echo "Files in current directory"
|
||||
ls -la ./
|
||||
|
@ -65,7 +65,6 @@ ENV TZ=Europe/Amsterdam
|
||||
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
|
||||
|
||||
ENV NUM_TRIES=1
|
||||
ENV MAX_RUN_TIME=0
|
||||
|
||||
# Unrelated to vars in setup_minio.sh, but should be the same there
|
||||
# to have the same binaries for local running scenario
|
||||
|
@ -12,9 +12,6 @@ dmesg --clear
|
||||
# fail on errors, verbose and export all env variables
|
||||
set -e -x -a
|
||||
|
||||
MAX_RUN_TIME=${MAX_RUN_TIME:-9000}
|
||||
MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 9000 : MAX_RUN_TIME))
|
||||
|
||||
USE_DATABASE_REPLICATED=${USE_DATABASE_REPLICATED:=0}
|
||||
USE_SHARED_CATALOG=${USE_SHARED_CATALOG:=0}
|
||||
|
||||
@ -308,8 +305,6 @@ function run_tests()
|
||||
|
||||
try_run_with_retry 10 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')"
|
||||
|
||||
TIMEOUT=$((MAX_RUN_TIME - 800 > 8400 ? 8400 : MAX_RUN_TIME - 800))
|
||||
START_TIME=${SECONDS}
|
||||
set +e
|
||||
|
||||
TEST_ARGS=(
|
||||
@ -324,32 +319,22 @@ function run_tests()
|
||||
--test-runs "$NUM_TRIES"
|
||||
"${ADDITIONAL_OPTIONS[@]}"
|
||||
)
|
||||
timeout --preserve-status --signal TERM --kill-after 60m ${TIMEOUT}s clickhouse-test "${TEST_ARGS[@]}" 2>&1 \
|
||||
clickhouse-test "${TEST_ARGS[@]}" 2>&1 \
|
||||
| ts '%Y-%m-%d %H:%M:%S' \
|
||||
| tee -a test_output/test_result.txt
|
||||
set -e
|
||||
DURATION=$((SECONDS - START_TIME))
|
||||
|
||||
echo "Elapsed ${DURATION} seconds."
|
||||
if [[ $DURATION -ge $TIMEOUT ]]
|
||||
then
|
||||
echo "It looks like the command is terminated by the timeout, which is ${TIMEOUT} seconds."
|
||||
fi
|
||||
}
|
||||
|
||||
export -f run_tests
|
||||
|
||||
|
||||
# This should be enough to setup job and collect artifacts
|
||||
TIMEOUT=$((MAX_RUN_TIME - 700))
|
||||
if [ "$NUM_TRIES" -gt "1" ]; then
|
||||
# We don't run tests with Ordinary database in PRs, only in master.
|
||||
# So run new/changed tests with Ordinary at least once in flaky check.
|
||||
timeout_with_logging "$TIMEOUT" bash -c 'NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests' \
|
||||
NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests \
|
||||
| sed 's/All tests have finished/Redacted: a message about tests finish is deleted/' | sed 's/No tests were run/Redacted: a message about no tests run is deleted/' ||:
|
||||
fi
|
||||
|
||||
timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
|
||||
run_tests ||:
|
||||
|
||||
echo "Files in current directory"
|
||||
ls -la ./
|
||||
|
@ -40,22 +40,6 @@ function fn_exists() {
|
||||
declare -F "$1" > /dev/null;
|
||||
}
|
||||
|
||||
function timeout_with_logging() {
|
||||
local exit_code=0
|
||||
|
||||
timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
|
||||
|
||||
echo "Checking if it is a timeout. The code 124 will indicate a timeout."
|
||||
if [[ "${exit_code}" -eq "124" ]]
|
||||
then
|
||||
echo "The command 'timeout ${*}' has been killed by timeout."
|
||||
else
|
||||
echo "No, it isn't a timeout."
|
||||
fi
|
||||
|
||||
return $exit_code
|
||||
}
|
||||
|
||||
function collect_core_dumps()
|
||||
{
|
||||
find . -type f -maxdepth 1 -name 'core.*' | while read -r core; do
|
||||
|
@ -50,7 +50,6 @@ from github_helper import GitHub
|
||||
from pr_info import PRInfo
|
||||
from report import (
|
||||
ERROR,
|
||||
FAILURE,
|
||||
PENDING,
|
||||
SUCCESS,
|
||||
BuildResult,
|
||||
@ -62,11 +61,11 @@ from report import (
|
||||
FAIL,
|
||||
)
|
||||
from s3_helper import S3Helper
|
||||
from stopwatch import Stopwatch
|
||||
from tee_popen import TeePopen
|
||||
from ci_cache import CiCache
|
||||
from ci_settings import CiSettings
|
||||
from ci_buddy import CIBuddy
|
||||
from stopwatch import Stopwatch
|
||||
from version_helper import get_version_from_repo
|
||||
|
||||
# pylint: disable=too-many-lines
|
||||
@ -370,8 +369,8 @@ def _pre_action(s3, job_name, batch, indata, pr_info):
|
||||
# skip_status = SUCCESS already there
|
||||
GH.print_in_group("Commit Status Data", job_status)
|
||||
|
||||
# create pre report
|
||||
jr = JobReport.create_pre_report(status=skip_status, job_skipped=to_be_skipped)
|
||||
# create dummy report
|
||||
jr = JobReport.create_dummy(status=skip_status, job_skipped=to_be_skipped)
|
||||
jr.dump()
|
||||
|
||||
if not to_be_skipped:
|
||||
@ -990,19 +989,20 @@ def _run_test(job_name: str, run_command: str) -> int:
|
||||
stopwatch = Stopwatch()
|
||||
job_log = Path(TEMP_PATH) / "job_log.txt"
|
||||
with TeePopen(run_command, job_log, env, timeout) as process:
|
||||
print(f"Job process started, pid [{process.process.pid}]")
|
||||
retcode = process.wait()
|
||||
if retcode != 0:
|
||||
print(f"Run action failed for: [{job_name}] with exit code [{retcode}]")
|
||||
if timeout and process.timeout_exceeded:
|
||||
print(f"Timeout {timeout} exceeded, dumping the job report")
|
||||
JobReport(
|
||||
status=FAILURE,
|
||||
description=f"Timeout {timeout} exceeded",
|
||||
test_results=[TestResult.create_check_timeout_expired(timeout)],
|
||||
start_time=stopwatch.start_time_str,
|
||||
duration=stopwatch.duration_seconds,
|
||||
additional_files=[job_log],
|
||||
).dump()
|
||||
if process.timeout_exceeded:
|
||||
print(f"Job timed out: [{job_name}] exit code [{retcode}]")
|
||||
assert JobReport.exist(), "JobReport real or dummy must be present"
|
||||
jr = JobReport.load()
|
||||
if jr.dummy:
|
||||
print(
|
||||
f"ERROR: Run action failed with timeout and did not generate JobReport - update dummy report with execution time"
|
||||
)
|
||||
jr.test_results = [TestResult.create_check_timeout_expired()]
|
||||
jr.duration = stopwatch.duration_seconds
|
||||
|
||||
print(f"Run action done for: [{job_name}]")
|
||||
return retcode
|
||||
@ -1205,7 +1205,7 @@ def main() -> int:
|
||||
job_report
|
||||
), "BUG. There must be job report either real report, or pre-report if job was killed"
|
||||
error_description = ""
|
||||
if not job_report.pre_report:
|
||||
if not job_report.dummy:
|
||||
# it's a real job report
|
||||
ch_helper = ClickHouseHelper()
|
||||
check_url = ""
|
||||
|
@ -332,7 +332,7 @@ class JobConfig:
|
||||
# will be triggered for the job if omitted in CI workflow yml
|
||||
run_command: str = ""
|
||||
# job timeout, seconds
|
||||
timeout: Optional[int] = None
|
||||
timeout: int = 7200
|
||||
# sets number of batches for a multi-batch job
|
||||
num_batches: int = 1
|
||||
# label that enables job in CI, if set digest isn't used
|
||||
@ -421,7 +421,6 @@ class CommonJobConfigs:
|
||||
),
|
||||
run_command='functional_test_check.py "$CHECK_NAME"',
|
||||
runner_type=Runners.FUNC_TESTER,
|
||||
timeout=9000,
|
||||
)
|
||||
STATEFUL_TEST = JobConfig(
|
||||
job_name_keyword="stateful",
|
||||
|
@ -5,10 +5,11 @@ import csv
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List, Tuple
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
from build_download_helper import download_all_deb_packages
|
||||
from clickhouse_helper import CiLogsCredentials
|
||||
@ -25,11 +26,12 @@ from report import (
|
||||
TestResults,
|
||||
read_test_results,
|
||||
FAILURE,
|
||||
TestResult,
|
||||
)
|
||||
from stopwatch import Stopwatch
|
||||
from tee_popen import TeePopen
|
||||
from ci_config import CI
|
||||
from ci_utils import Utils
|
||||
from ci_utils import Utils, Shell
|
||||
|
||||
NO_CHANGES_MSG = "Nothing to run"
|
||||
|
||||
@ -113,10 +115,6 @@ def get_run_command(
|
||||
|
||||
if flaky_check:
|
||||
envs.append("-e NUM_TRIES=50")
|
||||
envs.append("-e MAX_RUN_TIME=2800")
|
||||
else:
|
||||
max_run_time = os.getenv("MAX_RUN_TIME", "0")
|
||||
envs.append(f"-e MAX_RUN_TIME={max_run_time}")
|
||||
|
||||
envs += [f"-e {e}" for e in additional_envs]
|
||||
|
||||
@ -128,7 +126,7 @@ def get_run_command(
|
||||
)
|
||||
|
||||
return (
|
||||
f"docker run --volume={builds_path}:/package_folder "
|
||||
f"docker run --rm --name func-tester --volume={builds_path}:/package_folder "
|
||||
# For dmesg and sysctl
|
||||
"--privileged "
|
||||
f"{ci_logs_args}"
|
||||
@ -198,7 +196,7 @@ def process_results(
|
||||
state, description = status[0][0], status[0][1]
|
||||
if ret_code != 0:
|
||||
state = ERROR
|
||||
description += " (but script exited with an error)"
|
||||
description = f"Job failed, exit code: {ret_code}. " + description
|
||||
|
||||
try:
|
||||
results_path = result_directory / "test_results.tsv"
|
||||
@ -240,7 +238,19 @@ def parse_args():
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
test_process = None # type: Optional[TeePopen]
|
||||
timeout_expired = False
|
||||
|
||||
|
||||
def handle_sigterm(signum, _frame):
|
||||
print(f"WARNING: Received signal {signum}")
|
||||
global timeout_expired
|
||||
timeout_expired = True
|
||||
Shell.check(f"docker exec func-tester pkill -f clickhouse-test", verbose=True)
|
||||
|
||||
|
||||
def main():
|
||||
signal.signal(signal.SIGTERM, handle_sigterm)
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
for handler in logging.root.handlers:
|
||||
# pylint: disable=protected-access
|
||||
@ -328,11 +338,13 @@ def main():
|
||||
logging.info("Going to run func tests: %s", run_command)
|
||||
|
||||
with TeePopen(run_command, run_log_path) as process:
|
||||
global test_process
|
||||
test_process = process
|
||||
retcode = process.wait()
|
||||
if retcode == 0:
|
||||
logging.info("Run successfully")
|
||||
else:
|
||||
logging.info("Run failed")
|
||||
logging.info("Run failed, exit code %s", retcode)
|
||||
|
||||
try:
|
||||
subprocess.check_call(
|
||||
@ -348,6 +360,13 @@ def main():
|
||||
state, description, test_results, additional_logs = process_results(
|
||||
retcode, result_path, server_log_path
|
||||
)
|
||||
if timeout_expired:
|
||||
description = "Timeout expired"
|
||||
state = FAILURE
|
||||
test_results.insert(
|
||||
0, TestResult.create_check_timeout_expired(stopwatch.duration_seconds)
|
||||
)
|
||||
|
||||
else:
|
||||
print(
|
||||
"This is validate bugfix or flaky check run, but no changes test to run - skip with success"
|
||||
|
@ -249,6 +249,7 @@ JOB_REPORT_FILE = Path(GITHUB_WORKSPACE) / "job_report.json"
|
||||
|
||||
JOB_STARTED_TEST_NAME = "STARTED"
|
||||
JOB_FINISHED_TEST_NAME = "COMPLETED"
|
||||
JOB_TIMEOUT_TEST_NAME = "Job Timeout Expired"
|
||||
|
||||
|
||||
@dataclass
|
||||
@ -277,8 +278,8 @@ class TestResult:
|
||||
self.log_files.append(log_path)
|
||||
|
||||
@staticmethod
|
||||
def create_check_timeout_expired(timeout: float) -> "TestResult":
|
||||
return TestResult("Check timeout expired", "FAIL", timeout)
|
||||
def create_check_timeout_expired(duration: Optional[float] = None) -> "TestResult":
|
||||
return TestResult(JOB_TIMEOUT_TEST_NAME, "FAIL", time=duration)
|
||||
|
||||
|
||||
TestResults = List[TestResult]
|
||||
@ -303,7 +304,7 @@ class JobReport:
|
||||
# indicates that this is not real job report but report for the job that was skipped by rerun check
|
||||
job_skipped: bool = False
|
||||
# indicates that report generated by CI script in order to check later if job was killed before real report is generated
|
||||
pre_report: bool = False
|
||||
dummy: bool = False
|
||||
exit_code: int = -1
|
||||
|
||||
@staticmethod
|
||||
@ -311,7 +312,7 @@ class JobReport:
|
||||
return datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
@classmethod
|
||||
def create_pre_report(cls, status: str, job_skipped: bool) -> "JobReport":
|
||||
def create_dummy(cls, status: str, job_skipped: bool) -> "JobReport":
|
||||
return JobReport(
|
||||
status=status,
|
||||
description="",
|
||||
@ -320,7 +321,7 @@ class JobReport:
|
||||
duration=0.0,
|
||||
additional_files=[],
|
||||
job_skipped=job_skipped,
|
||||
pre_report=True,
|
||||
dummy=True,
|
||||
)
|
||||
|
||||
def update_duration(self):
|
||||
|
@ -2,6 +2,8 @@
|
||||
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
from io import TextIOWrapper
|
||||
from pathlib import Path
|
||||
@ -30,20 +32,35 @@ class TeePopen:
|
||||
self._process = None # type: Optional[Popen]
|
||||
self.timeout = timeout
|
||||
self.timeout_exceeded = False
|
||||
self.terminated_by_sigterm = False
|
||||
self.terminated_by_sigkill = False
|
||||
self.pid = 0
|
||||
|
||||
def _check_timeout(self) -> None:
|
||||
if self.timeout is None:
|
||||
return
|
||||
sleep(self.timeout)
|
||||
logging.warning(
|
||||
"Timeout exceeded. Send SIGTERM to process %s, timeout %s",
|
||||
self.process.pid,
|
||||
self.timeout,
|
||||
)
|
||||
self.send_signal(signal.SIGTERM)
|
||||
time_wait = 0
|
||||
self.terminated_by_sigterm = True
|
||||
self.timeout_exceeded = True
|
||||
while self.process.poll() is None and time_wait < 100:
|
||||
print("wait...")
|
||||
wait = 5
|
||||
sleep(wait)
|
||||
time_wait += wait
|
||||
while self.process.poll() is None:
|
||||
logging.warning(
|
||||
"Killing process %s, timeout %s exceeded",
|
||||
self.process.pid,
|
||||
self.timeout,
|
||||
logging.error(
|
||||
"Process is still running. Send SIGKILL",
|
||||
)
|
||||
os.killpg(self.process.pid, 9)
|
||||
sleep(10)
|
||||
self.send_signal(signal.SIGKILL)
|
||||
self.terminated_by_sigkill = True
|
||||
sleep(5)
|
||||
|
||||
def __enter__(self) -> "TeePopen":
|
||||
self.process = Popen(
|
||||
@ -57,6 +74,9 @@ class TeePopen:
|
||||
bufsize=1,
|
||||
errors="backslashreplace",
|
||||
)
|
||||
sleep(1)
|
||||
self.pid = self._get_child_pid()
|
||||
print(f"Subprocess started, pid [{self.process.pid}], child pid [{self.pid}]")
|
||||
if self.timeout is not None and self.timeout > 0:
|
||||
t = Thread(target=self._check_timeout)
|
||||
t.daemon = True # does not block the program from exit
|
||||
@ -77,6 +97,22 @@ class TeePopen:
|
||||
|
||||
self.log_file.close()
|
||||
|
||||
def _get_child_pid(self):
|
||||
# linux only
|
||||
ps_command = f"ps --ppid {self.process.pid} -o pid="
|
||||
res = "NA"
|
||||
try:
|
||||
result = subprocess.run(
|
||||
ps_command, shell=True, capture_output=True, text=True
|
||||
)
|
||||
res = result.stdout.strip()
|
||||
pid = int(res)
|
||||
return pid
|
||||
except Exception as e:
|
||||
print(f"Failed to get child's pid, command [{ps_command}], result [{res}]")
|
||||
print(f"ERROR: getting Python subprocess PID: {e}")
|
||||
return self.process.pid
|
||||
|
||||
def wait(self) -> int:
|
||||
if self.process.stdout is not None:
|
||||
for line in self.process.stdout:
|
||||
@ -85,6 +121,15 @@ class TeePopen:
|
||||
|
||||
return self.process.wait()
|
||||
|
||||
def poll(self):
|
||||
return self.process.poll()
|
||||
|
||||
def send_signal(self, signal_num):
|
||||
if self.pid:
|
||||
os.kill(self.pid, signal_num)
|
||||
else:
|
||||
print("ERROR: no process to send signal")
|
||||
|
||||
@property
|
||||
def process(self) -> Popen:
|
||||
if self._process is not None:
|
||||
|
Loading…
Reference in New Issue
Block a user