#!/usr/bin/env python3

"""
A trivial stateless Slack bot that notifies about new broken tests in ClickHouse CI.
It checks what happened to our CI during the last check_period hours (1 hour) and
notifies us in Slack if necessary.
This script should be executed once every check_period hours (1 hour).
It will post duplicate messages if you run it more often; it will lose some messages
if you run it less often.

You can run it locally with no arguments; it will work in dry-run mode.
Or you can set your own SLACK_URL_DEFAULT.
Feel free to add more checks, more details to messages, or better heuristics.

It's deployed to slack-bot-ci-lambda in the CI/CD account.

See also: https://aretestsgreenyet.com/
"""

import base64
import json
import os
import random

import requests

DRY_RUN_MARK = "<no url, dry run>"

MAX_FAILURES_DEFAULT = 30
SLACK_URL_DEFAULT = DRY_RUN_MARK

FLAKY_ALERT_PROBABILITY = 0.50
REPORT_NO_FAILURES_PROBABILITY = 0.99

MAX_TESTS_TO_REPORT = 4

# Slack has a stupid limitation on message size: it splits long messages into multiple
# ones, breaking formatting.
MESSAGE_LENGTH_LIMIT = 4000

# Find tests that failed in master during the last check_period * 24 hours,
# but did not fail during the last 2 weeks. We assume these tests were broken recently.
# Count the number of failures in the check_period and check_period * 24 time windows
# to distinguish rare flaky tests from completely broken tests.
NEW_BROKEN_TESTS_QUERY = """
WITH
    1 AS check_period,
    check_period * 24 AS extended_check_period,
    now() as now
SELECT
    test_name,
    any(report_url),
    countIf((check_start_time + check_duration_ms / 1000) < now - INTERVAL check_period HOUR) AS count_prev_periods,
    countIf((check_start_time + check_duration_ms / 1000) >= now - INTERVAL check_period HOUR) AS count
FROM checks
WHERE 1
    AND check_start_time BETWEEN now - INTERVAL 1 WEEK AND now
    AND (check_start_time + check_duration_ms / 1000) >= now - INTERVAL extended_check_period HOUR
    AND pull_request_number = 0
    AND test_status LIKE 'F%'
    AND check_status != 'success'
    AND test_name NOT IN (
        SELECT test_name FROM checks WHERE 1
        AND check_start_time >= now - INTERVAL 1 MONTH
        AND (check_start_time + check_duration_ms / 1000) BETWEEN now - INTERVAL 2 WEEK AND now - INTERVAL extended_check_period HOUR
        AND pull_request_number = 0
        AND check_status != 'success'
        AND test_status LIKE 'F%')
    AND test_context_raw NOT LIKE '%CannotSendRequest%' and test_context_raw NOT LIKE '%Server does not respond to health check%'
GROUP BY test_name
ORDER BY (count_prev_periods + count) DESC
"""

# Returns total number of failed checks during the last 24 hours
# and previous value of that metric (check_period hours ago)
COUNT_FAILURES_QUERY = """
WITH
    1 AS check_period,
    '%' AS check_name_pattern,
    now() as now
SELECT
    countIf((check_start_time + check_duration_ms / 1000) >= now - INTERVAL 24 HOUR) AS new_val,
    countIf((check_start_time + check_duration_ms / 1000) <= now - INTERVAL check_period HOUR) AS prev_val
FROM checks
WHERE 1
    AND check_start_time >= now - INTERVAL 1 WEEK
    AND (check_start_time + check_duration_ms / 1000) >= now - INTERVAL 24 + check_period HOUR
    AND pull_request_number = 0
    AND test_status LIKE 'F%'
    AND check_status != 'success'
    AND check_name ILIKE check_name_pattern
"""

# Returns the percentage of failed checks (reported once per day, at noon Europe/Amsterdam)
FAILED_CHECKS_PERCENTAGE_QUERY = """
SELECT if(toHour(now('Europe/Amsterdam')) = 12, v, 0)
FROM
(
    SELECT
        countDistinctIf((commit_sha, check_name), (test_status LIKE 'F%') AND (check_status != 'success'))
            / countDistinct((commit_sha, check_name)) AS v
    FROM checks
    WHERE 1
        AND (pull_request_number = 0)
        AND (test_status != 'SKIPPED')
        AND (check_start_time > (now() - toIntervalDay(1)))
)
"""

# Shows all recent failures of the specified test (helps to find out when it started failing)
ALL_RECENT_FAILURES_QUERY = """
WITH
    '{}' AS name_substr,
    90 AS interval_days,
    ('Stateless tests (asan)', 'Stateless tests (address)', 'Stateless tests (address, actions)', 'Integration tests (asan) [1/3]', 'Stateless tests (tsan) [1/3]') AS backport_and_release_specific_checks
SELECT
    toStartOfDay(check_start_time) AS d,
    count(),
    groupUniqArray(pull_request_number) AS prs,
    any(report_url)
FROM checks
WHERE ((now() - toIntervalDay(interval_days)) <= check_start_time) AND (pull_request_number NOT IN (
    SELECT pull_request_number AS prn
    FROM checks
    WHERE (prn != 0) AND ((now() - toIntervalDay(interval_days)) <= check_start_time) AND (check_name IN (backport_and_release_specific_checks))
)) AND (position(test_name, name_substr) > 0) AND (test_status IN ('FAIL', 'ERROR', 'FLAKY'))
GROUP BY d
ORDER BY d DESC
"""

SLACK_MESSAGE_JSON = {"type": "mrkdwn", "text": None}


def get_play_url(query):
    return (
        "https://play.clickhouse.com/play?user=play#"
        + base64.b64encode(query.encode()).decode()
    )
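
# For example (illustrative), get_play_url("SELECT 1") returns
# "https://play.clickhouse.com/play?user=play#U0VMRUNUIDE=".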


def run_clickhouse_query(query):
    url = "https://play.clickhouse.com/?user=play&query=" + requests.compat.quote(query)
    res = requests.get(url, timeout=30)
    if res.status_code != 200:
        print("Failed to execute query: ", res.status_code, res.content)
        res.raise_for_status()

    lines = res.text.strip().splitlines()
    return [x.split("\t") for x in lines]


def split_broken_and_flaky_tests(failed_tests):
    if not failed_tests:
        return None

    broken_tests = []
    flaky_tests = []
    for name, report, count_prev_str, count_str in failed_tests:
        count_prev, count = int(count_prev_str), int(count_str)
        if (count_prev < 2 <= count) or (count_prev == count == 1):
            # It failed 2 times or more within the extended time window; it's definitely broken.
            # 2 <= count means that it was not reported as broken on previous runs.
            broken_tests.append([name, report])
        elif 0 < count and count_prev == 0:
            # It failed only once; it can be a rare flaky test.
            flaky_tests.append([name, report])

    return broken_tests, flaky_tests
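
# A worked example with hypothetical rows: ("test_a", "<report_url>", "3", "1") lands in
# neither list (it was already failing in earlier periods), ("test_b", "<report_url>", "0", "2")
# is classified as broken, and ("test_c", "<report_url>", "0", "1") is classified as flaky.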


def format_failed_tests_list(failed_tests, failure_type):
    if len(failed_tests) == 1:
        res = f"There is a new {failure_type} test:\n"
    else:
        res = f"There are {len(failed_tests)} new {failure_type} tests:\n"

    for name, report in failed_tests[:MAX_TESTS_TO_REPORT]:
        cidb_url = get_play_url(ALL_RECENT_FAILURES_QUERY.format(name))
        res += f"- *{name}* - <{report}|Report> - <{cidb_url}|CI DB> \n"

    if MAX_TESTS_TO_REPORT < len(failed_tests):
        res += (
            f"- and {len(failed_tests) - MAX_TESTS_TO_REPORT} other "
            "tests... :this-is-fine-fire:"
        )

    return res


def get_new_broken_tests_message(failed_tests):
    if not failed_tests:
        return None

    broken_tests, flaky_tests = split_broken_and_flaky_tests(failed_tests)
    if len(broken_tests) == 0 and len(flaky_tests) == 0:
        return None

    msg = ""
    if len(broken_tests) > 0:
        msg += format_failed_tests_list(broken_tests, "*BROKEN*")
    elif random.random() > FLAKY_ALERT_PROBABILITY:
        looks_like_fuzzer = [x[0].count(" ") > 2 for x in flaky_tests]
        if not any(looks_like_fuzzer):
            print("Will not report flaky tests to avoid noise: ", flaky_tests)
            return None

    if len(flaky_tests) > 0:
        if len(msg) > 0:
            msg += "\n"
        msg += format_failed_tests_list(flaky_tests, "flaky")

    return msg


def get_too_many_failures_message_impl(failures_count):
    MAX_FAILURES = int(os.environ.get("MAX_FAILURES", MAX_FAILURES_DEFAULT))
    curr_failures = int(failures_count[0][0])
    prev_failures = int(failures_count[0][1])
    if curr_failures == 0 and prev_failures != 0:
        if random.random() < REPORT_NO_FAILURES_PROBABILITY:
            return None
        return "Wow, there are *no failures* at all... 0_o"
    return_none = (
        curr_failures < MAX_FAILURES
        or curr_failures < prev_failures
        or (curr_failures - prev_failures) / prev_failures < 0.2
    )
    if return_none:
        return None
    if prev_failures < MAX_FAILURES:
        return f":alert: *CI is broken: there are {curr_failures} failures during the last 24 hours*"
    return f"CI is broken and it's getting worse: there are {curr_failures} failures during the last 24 hours"


def get_too_many_failures_message(failures_count):
    msg = get_too_many_failures_message_impl(failures_count)
    if msg:
        msg += "\nSee https://aretestsgreenyet.com/"
    return msg


def get_failed_checks_percentage_message(percentage):
    p = float(percentage[0][0]) * 100

    # Always report more than 1% of failed checks.
    # For <= 1%: a higher percentage of failures means a higher reporting probability.
    if p <= random.random():
        return None

    msg = ":alert: " if p > 1 else "Only " if p < 0.5 else ""
    msg += f"*{p:.2f}%* of all checks in master have failed yesterday"
    return msg
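
# Hypothetical examples: a query result of 0.035 gives p = 3.5, which is always reported
# and prefixed with ":alert: "; 0.004 gives p = 0.4, which is reported only ~40% of the
# time and prefixed with "Only ".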


def split_slack_message(long_message):
    lines = long_message.split("\n")
    messages = []
    curr_msg = ""
    for line in lines:
        if len(curr_msg) + len(line) < MESSAGE_LENGTH_LIMIT:
            curr_msg += "\n"
            curr_msg += line
        else:
            messages.append(curr_msg)
            curr_msg = line
    messages.append(curr_msg)
    return messages
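
# Illustration (with a hypothetical MESSAGE_LENGTH_LIMIT of 10): "aaaa\nbbbb\ncccc"
# is split into ["\naaaa\nbbbb", "cccc"]; note that the first chunk keeps a leading newline.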


def send_to_slack_impl(message):
    SLACK_URL = os.environ.get("SLACK_URL", SLACK_URL_DEFAULT)
    if SLACK_URL == DRY_RUN_MARK:
        return

    payload = SLACK_MESSAGE_JSON.copy()
    payload["text"] = message
    res = requests.post(SLACK_URL, json.dumps(payload), timeout=30)
    if res.status_code != 200:
        print("Failed to send a message to Slack: ", res.status_code, res.content)
        res.raise_for_status()


def send_to_slack(message):
    messages = split_slack_message(message)
    for msg in messages:
        send_to_slack_impl(msg)


def query_and_alert_if_needed(query, get_message_func):
    query_res = run_clickhouse_query(query)
    print(f"Got result {query_res} for query {query}")
    msg = get_message_func(query_res)
    if msg is None:
        return

    msg += f"\nCI DB query: <{get_play_url(query)}|link>"
    print("Sending message to Slack:", msg)
    send_to_slack(msg)


def check_and_alert():
    query_and_alert_if_needed(NEW_BROKEN_TESTS_QUERY, get_new_broken_tests_message)
    query_and_alert_if_needed(COUNT_FAILURES_QUERY, get_too_many_failures_message)
    query_and_alert_if_needed(
        FAILED_CHECKS_PERCENTAGE_QUERY, get_failed_checks_percentage_message
    )
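
# To add a new check (as the module docstring suggests), define another query and a
# message function that returns either a string or None, and call
# query_and_alert_if_needed(new_query, new_message_func) here (names are placeholders).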


def handler(event, context):
    _, _ = event, context
    try:
        check_and_alert()
        return {"statusCode": 200, "body": "OK"}
    except Exception as e:
        send_to_slack(
            "I failed, please help me "
            f"(see ClickHouse/ClickHouse/tests/ci/slack_bot_ci_lambda/app.py): {e}"
        )
        return {"statusCode": 200, "body": "FAIL"}


if __name__ == "__main__":
    check_and_alert()