CI: Functional tests to store artifacts on timeout

Max Kainov 2024-08-06 21:39:41 +02:00
parent 8d0d870326
commit 427016a450
13 changed files with 107 additions and 116 deletions

View File

@@ -256,22 +256,6 @@ function configure
     rm -f "$FASTTEST_DATA/config.d/secure_ports.xml"
 }
 
-function timeout_with_logging() {
-    local exit_code=0
-
-    timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
-
-    echo "Checking if it is a timeout. The code 124 will indicate a timeout."
-    if [[ "${exit_code}" -eq "124" ]]
-    then
-        echo "The command 'timeout ${*}' has been killed by timeout."
-    else
-        echo "No, it isn't a timeout."
-    fi
-
-    return $exit_code
-}
-
 function run_tests
 {
     clickhouse-server --version
@@ -340,7 +324,7 @@ case "$stage" in
     configure 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/install_log.txt"
     ;&
 "run_tests")
-    timeout_with_logging 35m bash -c run_tests ||:
+    run_tests ||:
     /process_functional_tests_result.py --in-results-dir "$FASTTEST_OUTPUT/" \
         --out-results-file "$FASTTEST_OUTPUT/test_results.tsv" \
         --out-status-file "$FASTTEST_OUTPUT/check_status.tsv" || echo -e "failure\tCannot parse results" > "$FASTTEST_OUTPUT/check_status.tsv"

View File

@@ -35,7 +35,6 @@ RUN mkdir -p /tmp/clickhouse-odbc-tmp \
 ENV TZ=Europe/Amsterdam
-ENV MAX_RUN_TIME=9000
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 
 ARG sqllogic_test_repo="https://github.com/gregrahn/sqllogictest.git"

View File

@@ -94,7 +94,7 @@ function run_tests()
 export -f run_tests
 
-timeout "${MAX_RUN_TIME:-9000}" bash -c run_tests || echo "timeout reached" >&2
+run_tests
 #/process_functional_tests_result.py || echo -e "failure\tCannot parse results" > /test_output/check_status.tsv

View File

@@ -22,7 +22,6 @@ ARG sqltest_repo="https://github.com/elliotchance/sqltest/"
 RUN git clone ${sqltest_repo}
 
 ENV TZ=UTC
-ENV MAX_RUN_TIME=900
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 
 COPY run.sh /

View File

@@ -4,9 +4,6 @@
 source /setup_export_logs.sh
 set -e -x
 
-MAX_RUN_TIME=${MAX_RUN_TIME:-3600}
-MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 3600 : MAX_RUN_TIME))
-
 # Choose random timezone for this test run
 TZ="$(rg -v '#' /usr/share/zoneinfo/zone.tab | awk '{print $3}' | shuf | head -n1)"
 echo "Choosen random timezone $TZ"
@@ -123,9 +120,6 @@ if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]
     clickhouse-client --query "DROP TABLE datasets.hits_v1"
     clickhouse-client --query "DROP TABLE datasets.visits_v1"
-
-    MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000)) # min(MAX_RUN_TIME, 2.5 hours)
-    MAX_RUN_TIME=$((MAX_RUN_TIME != 0 ? MAX_RUN_TIME : 9000)) # set to 2.5 hours if 0 (unlimited)
 else
     clickhouse-client --query "CREATE DATABASE test"
     clickhouse-client --query "SHOW TABLES FROM test"
@@ -257,24 +251,7 @@ function run_tests()
 export -f run_tests
 
-function timeout_with_logging() {
-    local exit_code=0
-
-    timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
-
-    echo "Checking if it is a timeout. The code 124 will indicate a timeout."
-    if [[ "${exit_code}" -eq "124" ]]
-    then
-        echo "The command 'timeout ${*}' has been killed by timeout."
-    else
-        echo "No, it isn't a timeout."
-    fi
-
-    return $exit_code
-}
-
-TIMEOUT=$((MAX_RUN_TIME - 700))
-timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
+run_tests ||:
 
 echo "Files in current directory"
 ls -la ./

View File

@@ -65,7 +65,6 @@ ENV TZ=Europe/Amsterdam
 RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
 
 ENV NUM_TRIES=1
-ENV MAX_RUN_TIME=0
 
 # Unrelated to vars in setup_minio.sh, but should be the same there
 # to have the same binaries for local running scenario

View File

@@ -12,9 +12,6 @@ dmesg --clear
 # fail on errors, verbose and export all env variables
 set -e -x -a
 
-MAX_RUN_TIME=${MAX_RUN_TIME:-9000}
-MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 9000 : MAX_RUN_TIME))
-
 USE_DATABASE_REPLICATED=${USE_DATABASE_REPLICATED:=0}
 USE_SHARED_CATALOG=${USE_SHARED_CATALOG:=0}
@@ -308,8 +305,6 @@ function run_tests()
     try_run_with_retry 10 clickhouse-client -q "insert into system.zookeeper (name, path, value) values ('auxiliary_zookeeper2', '/test/chroot/', '')"
 
-    TIMEOUT=$((MAX_RUN_TIME - 800 > 8400 ? 8400 : MAX_RUN_TIME - 800))
-    START_TIME=${SECONDS}
     set +e
 
     TEST_ARGS=(
@@ -324,32 +319,22 @@
         --test-runs "$NUM_TRIES"
         "${ADDITIONAL_OPTIONS[@]}"
     )
-    timeout --preserve-status --signal TERM --kill-after 60m ${TIMEOUT}s clickhouse-test "${TEST_ARGS[@]}" 2>&1 \
+    clickhouse-test "${TEST_ARGS[@]}" 2>&1 \
         | ts '%Y-%m-%d %H:%M:%S' \
         | tee -a test_output/test_result.txt
     set -e
-    DURATION=$((SECONDS - START_TIME))
-
-    echo "Elapsed ${DURATION} seconds."
-    if [[ $DURATION -ge $TIMEOUT ]]
-    then
-        echo "It looks like the command is terminated by the timeout, which is ${TIMEOUT} seconds."
-    fi
 }
 
 export -f run_tests
 
-# This should be enough to setup job and collect artifacts
-TIMEOUT=$((MAX_RUN_TIME - 700))
 if [ "$NUM_TRIES" -gt "1" ]; then
     # We don't run tests with Ordinary database in PRs, only in master.
     # So run new/changed tests with Ordinary at least once in flaky check.
-    timeout_with_logging "$TIMEOUT" bash -c 'NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests' \
+    NUM_TRIES=1; USE_DATABASE_ORDINARY=1; run_tests \
         | sed 's/All tests have finished/Redacted: a message about tests finish is deleted/' | sed 's/No tests were run/Redacted: a message about no tests run is deleted/' ||:
 fi
 
-timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
+run_tests ||:
 
 echo "Files in current directory"
 ls -la ./

View File

@@ -40,22 +40,6 @@ function fn_exists() {
     declare -F "$1" > /dev/null;
 }
 
-function timeout_with_logging() {
-    local exit_code=0
-
-    timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
-
-    echo "Checking if it is a timeout. The code 124 will indicate a timeout."
-    if [[ "${exit_code}" -eq "124" ]]
-    then
-        echo "The command 'timeout ${*}' has been killed by timeout."
-    else
-        echo "No, it isn't a timeout."
-    fi
-
-    return $exit_code
-}
-
 function collect_core_dumps()
 {
     find . -type f -maxdepth 1 -name 'core.*' | while read -r core; do

View File

@@ -50,7 +50,6 @@ from github_helper import GitHub
 from pr_info import PRInfo
 from report import (
     ERROR,
-    FAILURE,
     PENDING,
     SUCCESS,
     BuildResult,
@@ -62,11 +61,11 @@ from report import (
     FAIL,
 )
 from s3_helper import S3Helper
-from stopwatch import Stopwatch
 from tee_popen import TeePopen
 from ci_cache import CiCache
 from ci_settings import CiSettings
 from ci_buddy import CIBuddy
+from stopwatch import Stopwatch
 from version_helper import get_version_from_repo
 
 # pylint: disable=too-many-lines
@@ -370,8 +369,8 @@ def _pre_action(s3, job_name, batch, indata, pr_info):
         # skip_status = SUCCESS already there
         GH.print_in_group("Commit Status Data", job_status)
 
-    # create pre report
-    jr = JobReport.create_pre_report(status=skip_status, job_skipped=to_be_skipped)
+    # create dummy report
+    jr = JobReport.create_dummy(status=skip_status, job_skipped=to_be_skipped)
     jr.dump()
 
     if not to_be_skipped:
@@ -990,19 +989,20 @@ def _run_test(job_name: str, run_command: str) -> int:
     stopwatch = Stopwatch()
     job_log = Path(TEMP_PATH) / "job_log.txt"
     with TeePopen(run_command, job_log, env, timeout) as process:
+        print(f"Job process started, pid [{process.process.pid}]")
         retcode = process.wait()
         if retcode != 0:
             print(f"Run action failed for: [{job_name}] with exit code [{retcode}]")
-        if timeout and process.timeout_exceeded:
-            print(f"Timeout {timeout} exceeded, dumping the job report")
-            JobReport(
-                status=FAILURE,
-                description=f"Timeout {timeout} exceeded",
-                test_results=[TestResult.create_check_timeout_expired(timeout)],
-                start_time=stopwatch.start_time_str,
-                duration=stopwatch.duration_seconds,
-                additional_files=[job_log],
-            ).dump()
+        if process.timeout_exceeded:
+            print(f"Job timed out: [{job_name}] exit code [{retcode}]")
+            assert JobReport.exist(), "JobReport real or dummy must be present"
+            jr = JobReport.load()
+            if jr.dummy:
+                print(
+                    f"ERROR: Run action failed with timeout and did not generate JobReport - update dummy report with execution time"
+                )
+                jr.test_results = [TestResult.create_check_timeout_expired()]
+                jr.duration = stopwatch.duration_seconds
 
     print(f"Run action done for: [{job_name}]")
     return retcode
@@ -1205,7 +1205,7 @@ def main() -> int:
         job_report
     ), "BUG. There must be job report either real report, or pre-report if job was killed"
     error_description = ""
-    if not job_report.pre_report:
+    if not job_report.dummy:
        # it's a real job report
        ch_helper = ClickHouseHelper()
        check_url = ""
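
Taken together with the report.py and tee_popen.py changes below, the intended flow is: a dummy JobReport is dumped before the job starts, TeePopen enforces the job-level timeout, and _run_test patches the still-dummy report with a "Job Timeout Expired" result if the job was killed. A simplified, self-contained sketch of that lifecycle (plain dicts stand in for JobReport; every name here is illustrative only, not the real API):

from typing import Optional

def create_dummy_report(status: str) -> dict:
    # Written by the driver before the job runs, so a report exists even if the job is killed.
    return {"status": status, "test_results": [], "dummy": True}

def finalize(report: dict, timed_out: bool, duration: Optional[float] = None) -> dict:
    # Mirrors the branch in _run_test above: only a report that is still a dummy
    # needs to be annotated; a real report produced by the job is left alone.
    if timed_out and report["dummy"]:
        report["test_results"].insert(0, ("Job Timeout Expired", "FAIL", duration))
        report["status"] = "failure"
    return report

report = create_dummy_report(status="pending")
print(finalize(report, timed_out=True, duration=7200.0))

The point of the design is that the CI driver always has something to upload, even when the job itself never got far enough to write its own report.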

View File

@@ -332,7 +332,7 @@ class JobConfig:
     # will be triggered for the job if omitted in CI workflow yml
     run_command: str = ""
     # job timeout, seconds
-    timeout: Optional[int] = None
+    timeout: int = 7200
     # sets number of batches for a multi-batch job
     num_batches: int = 1
     # label that enables job in CI, if set digest isn't used
@@ -421,7 +421,6 @@ class CommonJobConfigs:
         ),
         run_command='functional_test_check.py "$CHECK_NAME"',
         runner_type=Runners.FUNC_TESTER,
-        timeout=9000,
     )
     STATEFUL_TEST = JobConfig(
         job_name_keyword="stateful",

View File

@@ -5,10 +5,11 @@ import csv
 import logging
 import os
 import re
+import signal
 import subprocess
 import sys
 from pathlib import Path
-from typing import List, Tuple
+from typing import List, Tuple, Optional
 
 from build_download_helper import download_all_deb_packages
 from clickhouse_helper import CiLogsCredentials
@@ -25,11 +26,12 @@ from report import (
     TestResults,
     read_test_results,
     FAILURE,
+    TestResult,
 )
 from stopwatch import Stopwatch
 from tee_popen import TeePopen
 from ci_config import CI
-from ci_utils import Utils
+from ci_utils import Utils, Shell
 
 NO_CHANGES_MSG = "Nothing to run"
@@ -113,10 +115,6 @@ def get_run_command(
     if flaky_check:
         envs.append("-e NUM_TRIES=50")
-        envs.append("-e MAX_RUN_TIME=2800")
-    else:
-        max_run_time = os.getenv("MAX_RUN_TIME", "0")
-        envs.append(f"-e MAX_RUN_TIME={max_run_time}")
 
     envs += [f"-e {e}" for e in additional_envs]
@@ -128,7 +126,7 @@
     )
 
     return (
-        f"docker run --volume={builds_path}:/package_folder "
+        f"docker run --rm --name func-tester --volume={builds_path}:/package_folder "
         # For dmesg and sysctl
         "--privileged "
         f"{ci_logs_args}"
@@ -198,7 +196,7 @@ def process_results(
     state, description = status[0][0], status[0][1]
     if ret_code != 0:
         state = ERROR
-        description += " (but script exited with an error)"
+        description = f"Job failed, exit code: {ret_code}. " + description
 
     try:
         results_path = result_directory / "test_results.tsv"
@@ -240,7 +238,19 @@ def parse_args():
     return parser.parse_args()
 
 
+test_process = None  # type: Optional[TeePopen]
+timeout_expired = False
+
+
+def handle_sigterm(signum, _frame):
+    print(f"WARNING: Received signal {signum}")
+    global timeout_expired
+    timeout_expired = True
+    Shell.check(f"docker exec func-tester pkill -f clickhouse-test", verbose=True)
+
+
 def main():
+    signal.signal(signal.SIGTERM, handle_sigterm)
     logging.basicConfig(level=logging.INFO)
     for handler in logging.root.handlers:
         # pylint: disable=protected-access
@@ -328,11 +338,13 @@ def main():
     logging.info("Going to run func tests: %s", run_command)
 
     with TeePopen(run_command, run_log_path) as process:
+        global test_process
+        test_process = process
         retcode = process.wait()
         if retcode == 0:
             logging.info("Run successfully")
         else:
-            logging.info("Run failed")
+            logging.info("Run failed, exit code %s", retcode)
 
     try:
         subprocess.check_call(
@@ -348,6 +360,13 @@ def main():
         state, description, test_results, additional_logs = process_results(
             retcode, result_path, server_log_path
         )
+        if timeout_expired:
+            description = "Timeout expired"
+            state = FAILURE
+            test_results.insert(
+                0, TestResult.create_check_timeout_expired(stopwatch.duration_seconds)
+            )
     else:
         print(
             "This is validate bugfix or flaky check run, but no changes test to run - skip with success"

View File

@@ -249,6 +249,7 @@ JOB_REPORT_FILE = Path(GITHUB_WORKSPACE) / "job_report.json"
 
 JOB_STARTED_TEST_NAME = "STARTED"
 JOB_FINISHED_TEST_NAME = "COMPLETED"
+JOB_TIMEOUT_TEST_NAME = "Job Timeout Expired"
 
 
 @dataclass
@@ -277,8 +278,8 @@ class TestResult:
         self.log_files.append(log_path)
 
     @staticmethod
-    def create_check_timeout_expired(timeout: float) -> "TestResult":
-        return TestResult("Check timeout expired", "FAIL", timeout)
+    def create_check_timeout_expired(duration: Optional[float] = None) -> "TestResult":
+        return TestResult(JOB_TIMEOUT_TEST_NAME, "FAIL", time=duration)
 
 
 TestResults = List[TestResult]
@@ -303,7 +304,7 @@ class JobReport:
     # indicates that this is not real job report but report for the job that was skipped by rerun check
     job_skipped: bool = False
     # indicates that report generated by CI script in order to check later if job was killed before real report is generated
-    pre_report: bool = False
+    dummy: bool = False
     exit_code: int = -1
@staticmethod @staticmethod
@ -311,7 +312,7 @@ class JobReport:
return datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S") return datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
@classmethod @classmethod
def create_pre_report(cls, status: str, job_skipped: bool) -> "JobReport": def create_dummy(cls, status: str, job_skipped: bool) -> "JobReport":
return JobReport( return JobReport(
status=status, status=status,
description="", description="",
@@ -320,7 +321,7 @@
             duration=0.0,
             additional_files=[],
             job_skipped=job_skipped,
-            pre_report=True,
+            dummy=True,
         )
 
     def update_duration(self):
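
A short usage sketch of the renamed helpers, assuming the tests/ci directory is on PYTHONPATH; the values passed are illustrative:

from report import JobReport, TestResult, PENDING

# Placeholder report dumped by the CI driver before the job runs; dummy=True marks it.
jr = JobReport.create_dummy(status=PENDING, job_skipped=False)

# If the job is later killed on timeout and never wrote a real report,
# the placeholder is annotated instead of being rebuilt from scratch.
if jr.dummy:
    jr.test_results = [TestResult.create_check_timeout_expired(7200.0)]
    print(jr.test_results[0])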

View File

@@ -2,6 +2,8 @@
 import logging
 import os
+import signal
+import subprocess
 import sys
 from io import TextIOWrapper
 from pathlib import Path
@@ -30,20 +32,35 @@ class TeePopen:
         self._process = None  # type: Optional[Popen]
         self.timeout = timeout
         self.timeout_exceeded = False
+        self.terminated_by_sigterm = False
+        self.terminated_by_sigkill = False
+        self.pid = 0
 
     def _check_timeout(self) -> None:
         if self.timeout is None:
             return
         sleep(self.timeout)
-        self.timeout_exceeded = True
-        while self.process.poll() is None:
-            logging.warning(
-                "Killing process %s, timeout %s exceeded",
-                self.process.pid,
-                self.timeout,
-            )
-            os.killpg(self.process.pid, 9)
-            sleep(10)
+        logging.warning(
+            "Timeout exceeded. Send SIGTERM to process %s, timeout %s",
+            self.process.pid,
+            self.timeout,
+        )
+        self.send_signal(signal.SIGTERM)
+        time_wait = 0
+        self.terminated_by_sigterm = True
+        self.timeout_exceeded = True
+        while self.process.poll() is None and time_wait < 100:
+            print("wait...")
+            wait = 5
+            sleep(wait)
+            time_wait += wait
+        while self.process.poll() is None:
+            logging.error(
+                "Process is still running. Send SIGKILL",
+            )
+            self.send_signal(signal.SIGKILL)
+            self.terminated_by_sigkill = True
+            sleep(5)
 
     def __enter__(self) -> "TeePopen":
         self.process = Popen(
@@ -57,6 +74,9 @@
             bufsize=1,
             errors="backslashreplace",
         )
+        sleep(1)
+        self.pid = self._get_child_pid()
+        print(f"Subprocess started, pid [{self.process.pid}], child pid [{self.pid}]")
         if self.timeout is not None and self.timeout > 0:
             t = Thread(target=self._check_timeout)
             t.daemon = True  # does not block the program from exit
@@ -77,6 +97,22 @@
         self.log_file.close()
 
+    def _get_child_pid(self):
+        # linux only
+        ps_command = f"ps --ppid {self.process.pid} -o pid="
+        res = "NA"
+        try:
+            result = subprocess.run(
+                ps_command, shell=True, capture_output=True, text=True
+            )
+            res = result.stdout.strip()
+            pid = int(res)
+            return pid
+        except Exception as e:
+            print(f"Failed to get child's pid, command [{ps_command}], result [{res}]")
+            print(f"ERROR: getting Python subprocess PID: {e}")
+        return self.process.pid
+
     def wait(self) -> int:
         if self.process.stdout is not None:
             for line in self.process.stdout:
@@ -85,6 +121,15 @@
         return self.process.wait()
 
+    def poll(self):
+        return self.process.poll()
+
+    def send_signal(self, signal_num):
+        if self.pid:
+            os.kill(self.pid, signal_num)
+        else:
+            print("ERROR: no process to send signal")
+
     @property
     def process(self) -> Popen:
         if self._process is not None:
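
The new _check_timeout boils down to a bounded SIGTERM-then-SIGKILL escalation against the child process found via ps --ppid. A stripped-down, runnable sketch of that escalation (simplified: it signals the Popen object directly instead of a separately resolved child PID, and omits the terminated_by_* bookkeeping):

import signal
import subprocess
from time import sleep

def terminate_gracefully(proc: subprocess.Popen, grace_seconds: int = 100) -> None:
    # Ask the process to stop first, so it can flush logs and write its report.
    proc.send_signal(signal.SIGTERM)
    waited = 0
    while proc.poll() is None and waited < grace_seconds:
        sleep(5)
        waited += 5
    # Escalate only if the process ignored SIGTERM within the grace period.
    while proc.poll() is None:
        proc.send_signal(signal.SIGKILL)
        sleep(5)

proc = subprocess.Popen(["sleep", "600"])
terminate_gracefully(proc, grace_seconds=10)
print("exit code:", proc.wait())

Giving the job a SIGTERM window instead of an immediate kill is what lets the functional test scripts above trap the signal and still upload their artifacts.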