CI: Report job start and finish to CI DB

This commit is contained in:
Max K 2024-07-15 16:32:49 +02:00
parent 8340cec8b7
commit d4e7188750
2 changed files with 146 additions and 104 deletions

View File

@ -48,7 +48,19 @@ from git_helper import GIT_PREFIX, Git
from git_helper import Runner as GitRunner
from github_helper import GitHub
from pr_info import PRInfo
from report import ERROR, FAILURE, PENDING, SUCCESS, BuildResult, JobReport, TestResult
from report import (
ERROR,
FAILURE,
PENDING,
SUCCESS,
BuildResult,
JobReport,
TestResult,
OK,
JOB_STARTED_TEST_NAME,
JOB_FINISHED_TEST_NAME,
FAIL,
)
from s3_helper import S3Helper
from stopwatch import Stopwatch
from tee_popen import TeePopen
@ -263,7 +275,8 @@ def check_missing_images_on_dockerhub(
return result
def _pre_action(s3, indata, pr_info):
def _pre_action(s3, job_name, batch, indata, pr_info):
no_cache = CiSettings.create_from_run_config(indata).no_ci_cache
print("Clear dmesg")
Utils.clear_dmesg()
CommitStatusData.cleanup()
@ -282,6 +295,90 @@ def _pre_action(s3, indata, pr_info):
ci_cache.dump_run_config(indata)
to_be_skipped = False
skip_status = SUCCESS
# check if job was run already
if CI.is_build_job(job_name):
# this is a build job - check if a build report is present
build_result = (
BuildResult.load_any(job_name, pr_info.number, pr_info.head_ref)
if not no_cache
else None
)
if build_result:
if build_result.status == SUCCESS:
to_be_skipped = True
else:
print(
"Build report found but status is unsuccessful - will try to rerun"
)
print("::group::Build Report")
print(build_result.as_json())
print("::endgroup::")
else:
# this is a test job - check if GH commit status or cache record is present
commit = get_commit(GitHub(get_best_robot_token(), per_page=100), pr_info.sha)
# rerun helper check
# FIXME: Find a way to identify if job restarted manually (by developer) or by automatic workflow restart (died spot-instance)
# disable rerun check for the former
if job_name not in (
CI.JobNames.BUILD_CHECK,
): # we might want to rerun build report job
rerun_helper = RerunHelper(commit, _get_ext_check_name(job_name))
if rerun_helper.is_already_finished_by_status():
print("WARNING: Rerunning job with GH status ")
status = rerun_helper.get_finished_status()
assert status
print("::group::Commit Status")
print(status)
print("::endgroup::")
to_be_skipped = True
skip_status = status.state
# ci cache check
if not to_be_skipped and not no_cache:
ci_cache = CiCache(s3, indata["jobs_data"]["digests"]).update()
job_config = CI.get_job_config(job_name)
if ci_cache.is_successful(
job_name,
batch,
job_config.num_batches,
job_config.required_on_release_branch,
):
print("CICache record has be found - job will be skipped")
job_status = ci_cache.get_successful(
job_name, batch, job_config.num_batches
)
assert job_status, "BUG"
_create_gh_status(
commit,
job_name,
batch,
job_config.num_batches,
job_status,
)
to_be_skipped = True
# skip_status = SUCCESS already there
GHActions.print_in_group("Commit Status Data", job_status)
# create pre report
jr = JobReport.create_pre_report(status=skip_status, job_skipped=to_be_skipped)
jr.dump()
if not to_be_skipped:
print("push start record to ci db")
prepared_events = prepare_tests_results_for_clickhouse(
pr_info,
[TestResult(JOB_STARTED_TEST_NAME, OK)],
SUCCESS,
0.0,
JobReport.get_start_time_from_current(),
"",
_get_ext_check_name(job_name),
)
ClickHouseHelper().insert_events_into(
db="default", table="checks", events=prepared_events
)
print(f"Pre action done. Report files [{reports_files}] have been downloaded")
@ -1045,108 +1142,23 @@ def main() -> int:
### PRE action: start
elif args.pre:
assert indata, "Run config must be provided via --infile"
_pre_action(s3, indata, pr_info)
JobReport.create_pre_report().dump()
_pre_action(s3, args.job_name, args.batch, indata, pr_info)
### RUN action: start
elif args.run:
assert indata
ci_settings = CiSettings.create_from_run_config(indata)
job_report = JobReport.load()
check_name = args.job_name
check_name_with_group = _get_ext_check_name(check_name)
print(
f"Check if rerun for name: [{check_name}], extended name [{check_name_with_group}]"
)
previous_status = None
if CI.is_build_job(check_name):
# this is a build job - check if a build report is present
build_result = (
BuildResult.load_any(check_name, pr_info.number, pr_info.head_ref)
if not ci_settings.no_ci_cache
else None
)
if build_result:
if build_result.status == SUCCESS:
previous_status = build_result.status
JobReport(
status=SUCCESS,
description="",
test_results=[],
start_time="",
duration=0.0,
additional_files=[],
job_skipped=True,
).dump()
else:
# FIXME: Consider reusing failures for build jobs.
# Just remove this if/else - that makes build job starting and failing immediately
if job_report.job_skipped and not args.force:
print(
"Build report found but status is unsuccessful - will try to rerun"
f"Commit status or Build Report is already present - job will be skipped with status: [{job_report.status}]"
)
print("::group::Build Report")
print(build_result.as_json())
print("::endgroup::")
else:
# this is a test job - check if GH commit status or cache record is present
commit = get_commit(
GitHub(get_best_robot_token(), per_page=100), pr_info.sha
)
# rerun helper check
# FIXME: Find a way to identify if job restarted manually (by developer) or by automatic workflow restart (died spot-instance)
# disable rerun check for the former
if check_name not in (
CI.JobNames.BUILD_CHECK,
): # we might want to rerun build report job
rerun_helper = RerunHelper(commit, check_name_with_group)
if rerun_helper.is_already_finished_by_status():
print("WARNING: Rerunning job with GH status ")
status = rerun_helper.get_finished_status()
assert status
print("::group::Commit Status")
print(status)
print("::endgroup::")
previous_status = status.state
print("Create dummy job report with job_skipped flag")
JobReport(
status=status.state,
description="",
test_results=[],
start_time="",
duration=0.0,
additional_files=[],
job_skipped=True,
).dump()
# ci cache check
if not previous_status and not ci_settings.no_ci_cache:
ci_cache = CiCache(s3, indata["jobs_data"]["digests"]).update()
job_config = CI.get_job_config(check_name)
if ci_cache.is_successful(
check_name,
args.batch,
job_config.num_batches,
job_config.required_on_release_branch,
):
job_status = ci_cache.get_successful(
check_name, args.batch, job_config.num_batches
)
assert job_status, "BUG"
_create_gh_status(
commit,
check_name,
args.batch,
job_config.num_batches,
job_status,
)
previous_status = job_status.status
GHActions.print_in_group("Commit Status Data", job_status)
if previous_status and not args.force:
print(
f"Commit status or Build Report is already present - job will be skipped with status: [{previous_status}]"
)
if previous_status == SUCCESS:
if job_report.status == SUCCESS:
exit_code = 0
else:
exit_code = 1
@ -1166,7 +1178,8 @@ def main() -> int:
assert (
job_report
), "BUG. There must be job report either real report, or pre-report if job was killed"
if not job_report.job_skipped and not job_report.pre_report:
error_description = ""
if not job_report.pre_report:
# it's a real job report
ch_helper = ClickHouseHelper()
check_url = ""
@ -1244,7 +1257,6 @@ def main() -> int:
pr_info,
dump_to_file=True,
)
print(f"Job report url: [{check_url}]")
prepared_events = prepare_tests_results_for_clickhouse(
pr_info,
@ -1269,9 +1281,7 @@ def main() -> int:
)
elif job_report.job_skipped:
print(f"Skipped after rerun check {[args.job_name]} - do nothing")
elif job_report.job_skipped:
print(f"Job was skipped {[args.job_name]} - do nothing")
elif job_report.pre_report:
else:
print(f"ERROR: Job was killed - generate evidence")
job_report.update_duration()
ret_code = os.getenv("JOB_EXIT_CODE", "")
@ -1282,10 +1292,13 @@ def main() -> int:
pass
if Utils.is_killed_with_oom():
print("WARNING: OOM while job execution")
error = f"Out Of Memory, exit_code {job_report.exit_code}, after {int(job_report.duration)}s"
error_description = f"Out Of Memory, exit_code {job_report.exit_code}"
else:
error = f"Unknown, exit_code {job_report.exit_code}, after {int(job_report.duration)}s"
CIBuddy().post_error(error, job_name=_get_ext_check_name(args.job_name))
error_description = f"Unknown, exit_code {job_report.exit_code}"
CIBuddy().post_error(
error_description + f" after {int(job_report.duration)}s",
job_name=_get_ext_check_name(args.job_name),
)
if CI.is_test_job(args.job_name):
gh = GitHub(get_best_robot_token(), per_page=100)
commit = get_commit(gh, pr_info.sha)
@ -1293,11 +1306,32 @@ def main() -> int:
commit,
ERROR,
"",
"Error: " + error,
"Error: " + error_description,
_get_ext_check_name(args.job_name),
pr_info,
dump_to_file=True,
)
if not job_report.job_skipped:
print("push finish record to ci db")
prepared_events = prepare_tests_results_for_clickhouse(
pr_info,
[
TestResult(
JOB_FINISHED_TEST_NAME,
FAIL if error_description else OK,
raw_logs=error_description or None,
)
],
SUCCESS if not error_description else ERROR,
0.0,
JobReport.get_start_time_from_current(),
"",
_get_ext_check_name(args.job_name),
)
ClickHouseHelper().insert_events_into(
db="default", table="checks", events=prepared_events
)
### POST action: end
### MARK SUCCESS action: start

View File

@ -247,6 +247,9 @@ BASE_HEADERS = ["Test name", "Test status"]
# should not be in TEMP directory or any directory that may be cleaned during the job execution
JOB_REPORT_FILE = Path(GITHUB_WORKSPACE) / "job_report.json"
JOB_STARTED_TEST_NAME = "STARTED"
JOB_FINISHED_TEST_NAME = "COMPLETED"
@dataclass
class TestResult:
@ -304,14 +307,19 @@ class JobReport:
exit_code: int = -1
@staticmethod
def create_pre_report() -> "JobReport":
def get_start_time_from_current():
return datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
@classmethod
def create_pre_report(cls, status: str, job_skipped: bool) -> "JobReport":
return JobReport(
status=ERROR,
status=status,
description="",
test_results=[],
start_time=datetime.datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S"),
start_time=cls.get_start_time_from_current(),
duration=0.0,
additional_files=[],
job_skipped=job_skipped,
pre_report=True,
)