ClickHouse/tests/ci/workflow_jobs_lambda/app.py
2023-12-12 14:05:59 +01:00

203 lines
6.3 KiB
Python

#!/usr/bin/env python
"""
Lambda gets the workflow_job events, see
https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#workflow_job
Then it either posts it as is to the play.clickhouse.com, or anonymizes the sensitive
fields for private repositories
"""
import json
import logging
from base64 import b64decode
from dataclasses import dataclass
from typing import Any, List, Optional
from lambda_shared import ClickHouseHelper, InsertException, get_parameter_from_ssm
logging.getLogger().setLevel(logging.INFO)
@dataclass
class WorkflowJob:
id: int
run_id: int
workflow_name: str
head_branch: str
run_url: str
run_attempt: int
node_id: str
head_sha: str
url: str
html_url: str
status: str
conclusion: str
started_at: str
completed_at: str
name: str
steps: int # just number of steps, we don't keep steps
check_run_url: str
labels: List[str]
runner_id: int
runner_name: str
runner_group_id: int
runner_group_name: str
repository: str
def anonimyze_url(self, url: str) -> str:
return url.replace(self.repository, "ANONYMIZED_REPO")
def anonimyze(self):
anm = "ANONYMIZED"
self.workflow_name = anm
self.head_branch = anm
self.run_url = self.anonimyze_url(self.run_url)
self.node_id = anm
self.url = self.anonimyze_url(self.url)
self.html_url = self.anonimyze_url(self.html_url)
self.name = anm
self.check_run_url = self.anonimyze_url(self.check_run_url)
self.repository = anm
def as_dict(self) -> dict:
return self.__dict__
CH_CLIENT = None # type: Optional[ClickHouseHelper]
def send_event_workflow_job(workflow_job: WorkflowJob) -> None:
# # SHOW CREATE TABLE default.workflow_jobs
# CREATE TABLE default.workflow_jobs UUID 'c0351924-8ccd-47a6-9db0-e28a9eee2fdf'
# (
# `id` UInt64,
# `run_id` UInt64,
# `workflow_name` LowCardinality(String),
# `head_branch` LowCardinality(String),
# `run_url` String,
# `run_attempt` UInt16,
# `node_id` String,
# `head_sha` String,
# `url` String,
# `html_url` String,
# `status` Enum8('waiting' = 1, 'queued' = 2, 'in_progress' = 3, 'completed' = 4),
# `conclusion` LowCardinality(String),
# `started_at` DateTime,
# `completed_at` DateTime,
# `name` LowCardinality(String),
# `steps` UInt16,
# `check_run_url` String,
# `labels` Array(LowCardinality(String)),
# `runner_id` UInt64,
# `runner_name` String,
# `runner_group_id` UInt64,
# `runner_group_name` LowCardinality(String),
# `repository` LowCardinality(String),
# `updated_at` DateTime DEFAULT now()
# )
# ENGINE = ReplicatedMergeTree('/clickhouse/tables/c0351924-8ccd-47a6-9db0-e28a9eee2fdf/{shard}', '{replica}')
# PARTITION BY toStartOfMonth(started_at)
# ORDER BY (id, updated_at)
# SETTINGS index_granularity = 8192
global CH_CLIENT
CH_CLIENT = CH_CLIENT or ClickHouseHelper(
get_parameter_from_ssm("clickhouse-test-stat-url"),
get_parameter_from_ssm("clickhouse-test-stat-login"),
get_parameter_from_ssm("clickhouse-test-stat-password"),
)
try:
CH_CLIENT.insert_event_into(
"default", "workflow_jobs", workflow_job.as_dict(), False
)
except InsertException as ex:
logging.exception(
"Got an exception on insert, tryuing to update the client "
"credentials and repeat",
exc_info=ex,
)
CH_CLIENT = ClickHouseHelper(
get_parameter_from_ssm("clickhouse-test-stat-url"),
get_parameter_from_ssm("clickhouse-test-stat-login"),
get_parameter_from_ssm("clickhouse-test-stat-password"),
)
CH_CLIENT.insert_event_into(
"default", "workflow_jobs", workflow_job.as_dict(), False
)
def killed_job(wf_job: dict) -> bool:
"""a hack to identify the killed runner if "Complete job" is omit"""
if (
wf_job.get("status", "") != "completed"
or wf_job.get("conclusion", "") != "failure"
):
# The task either success or in progress
return False
return not any(
step["name"] == "Complete job" and step["conclusion"] is not None
for step in wf_job["steps"]
)
def handler(event: dict, context: Any) -> dict:
if event["isBase64Encoded"]:
event_data = json.loads(b64decode(event["body"]))
else:
event_data = json.loads(event["body"])
logging.info("Got the next raw event from the github hook: %s", event_data)
repo = event_data["repository"]
try:
wf_job = event_data["workflow_job"]
except KeyError:
logging.error("The event does not contain valid workflow_jobs data")
logging.error("The event data: %s", event)
logging.error("The context data: %s", context)
if killed_job(wf_job):
# for killed job we record 0
steps = 0
else:
# We record only finished steps
steps = sum(1 for st in wf_job["steps"] if st["conclusion"] is not None)
workflow_job = WorkflowJob(
wf_job["id"],
wf_job["run_id"],
wf_job["workflow_name"] or "", # nullable
wf_job["head_branch"],
wf_job["run_url"],
wf_job["run_attempt"],
wf_job["node_id"],
wf_job["head_sha"],
wf_job["url"],
wf_job["html_url"],
wf_job["status"],
wf_job["conclusion"] or "", # nullable
wf_job["started_at"],
wf_job["completed_at"] or "1970-01-01T00:00:00", # nullable date
wf_job["name"],
steps,
wf_job["check_run_url"],
wf_job["labels"],
wf_job["runner_id"] or 0, # nullable
wf_job["runner_name"] or "", # nullable
wf_job["runner_group_id"] or 0, # nullable
wf_job["runner_group_name"] or "", # nullable
repo["full_name"],
)
logging.info(
"Got the next event (private_repo=%s): %s", repo["private"], workflow_job
)
if repo["private"]:
workflow_job.anonimyze()
send_event_workflow_job(workflow_job)
return {
"statusCode": 200,
"headers": {"Content-Type": "application/json"},
"body": '{"status": "OK"}',
}