ClickHouse/utils/ci-slack-bot/ci-slack-bot.py

#!/usr/bin/env python3

# A trivial stateless slack bot that notifies about new broken tests in ClickHouse CI.
# It checks what happened to our CI during the last check_period hours (1 hour) and notifies us in slack if necessary.
# This script should be executed once each check_period hours (1 hour).
# It will post duplicate messages if you run it more often; it will lose some messages if you run it less often.
#
# You can run it locally with no arguments, it will work in a dry-run mode. Or you can set your own SLACK_URL_DEFAULT.
# Feel free to add more checks, more details to messages, or better heuristics.
# NOTE There's no deployment automation for now,
# an AWS Lambda (slack-ci-bot-test lambda in CI-CD) has to be updated manually after changing this script.
#
# See also: https://aretestsgreenyet.com/

import os
import json
import base64
import random

if os.environ.get("AWS_LAMBDA_ENV", "0") == "1":
    # For AWS labmda (python 3.7)
    from botocore.vendored import requests
else:
    # For running locally
    import requests

DRY_RUN_MARK = "<no url, dry run>"

MAX_FAILURES_DEFAULT = 30
SLACK_URL_DEFAULT = DRY_RUN_MARK

FLAKY_ALERT_PROBABILITY = 0.50
REPORT_NO_FAILURES_PROBABILITY = 0.99

MAX_TESTS_TO_REPORT = 4

# Slack has a stupid limitation on message size, it splits long messages into multiple ones breaking formatting
MESSAGE_LENGTH_LIMIT = 4000

# Find tests that failed in master during the last check_period * 24 hours,
# but did not fail during the last 2 weeks. Assuming these tests were broken recently.
# Counts number of failures in check_period and check_period * 24 time windows
# to distinguish rare flaky tests from completely broken tests
NEW_BROKEN_TESTS_QUERY = """
WITH
    1 AS check_period,
    check_period * 24 AS extended_check_period,
    now() as now
SELECT
    test_name,
    any(report_url),
    countIf((check_start_time + check_duration_ms / 1000) < now - INTERVAL check_period HOUR) AS count_prev_periods,
    countIf((check_start_time + check_duration_ms / 1000) >= now - INTERVAL check_period HOUR) AS count
FROM checks
WHERE 1
    AND check_start_time BETWEEN now - INTERVAL 1 WEEK AND now
    AND (check_start_time + check_duration_ms / 1000) >= now - INTERVAL extended_check_period HOUR
    AND pull_request_number = 0
    AND test_status LIKE 'F%'
    AND check_status != 'success'
    AND test_name NOT IN (
        SELECT test_name FROM checks WHERE 1
        AND check_start_time >= now - INTERVAL 1 MONTH
        AND (check_start_time + check_duration_ms / 1000) BETWEEN now - INTERVAL 2 WEEK AND now - INTERVAL extended_check_period HOUR 
        AND pull_request_number = 0
        AND check_status != 'success'
        AND test_status LIKE 'F%')
    AND test_context_raw NOT LIKE '%CannotSendRequest%' and test_context_raw NOT LIKE '%Server does not respond to health check%'
GROUP BY test_name
ORDER BY (count_prev_periods + count) DESC
"""

# Returns total number of failed checks during the last 24 hours
# and previous value of that metric (check_period hours ago)
COUNT_FAILURES_QUERY = """
WITH
    1 AS check_period,
    '%' AS check_name_pattern,
    now() as now
SELECT
    countIf((check_start_time + check_duration_ms / 1000) >= now - INTERVAL 24 HOUR) AS new_val,
    countIf((check_start_time + check_duration_ms / 1000) <= now - INTERVAL check_period HOUR) AS prev_val
FROM checks
WHERE 1
    AND check_start_time >= now - INTERVAL 1 WEEK
    AND (check_start_time + check_duration_ms / 1000) >= now - INTERVAL 24 + check_period HOUR
    AND pull_request_number = 0
    AND test_status LIKE 'F%'
    AND check_status != 'success'
    AND check_name ILIKE check_name_pattern
"""

# Returns percentage of failed checks (once per day, at noon)
FAILED_CHECKS_PERCENTAGE_QUERY = """
SELECT if(toHour(now('Europe/Amsterdam')) = 12, v, 0)
FROM
(
    SELECT 
        countDistinctIf((commit_sha, check_name), (test_status LIKE 'F%') AND (check_status != 'success')) 
            / countDistinct((commit_sha, check_name)) AS v
    FROM checks
    WHERE 1 
        AND (pull_request_number = 0)
        AND (test_status != 'SKIPPED')
        AND (check_start_time > (now() - toIntervalDay(1)))
)
"""

# It shows all recent failures of the specified test (helps to find when it started)
ALL_RECENT_FAILURES_QUERY = """
WITH
    '{}' AS name_substr,
    90 AS interval_days,
    ('Stateless tests (asan)', 'Stateless tests (address)', 'Stateless tests (address, actions)') AS backport_and_release_specific_checks
SELECT
    toStartOfDay(check_start_time) AS d,
    count(),
    groupUniqArray(pull_request_number) AS prs,
    any(report_url)
FROM checks
WHERE ((now() - toIntervalDay(interval_days)) <= check_start_time) AND (pull_request_number NOT IN (
    SELECT pull_request_number AS prn
    FROM checks
    WHERE (prn != 0) AND ((now() - toIntervalDay(interval_days)) <= check_start_time) AND (check_name IN (backport_and_release_specific_checks))
)) AND (position(test_name, name_substr) > 0) AND (test_status IN ('FAIL', 'ERROR', 'FLAKY'))
GROUP BY d
ORDER BY d DESC
"""

SLACK_MESSAGE_JSON = {"type": "mrkdwn", "text": None}


def get_play_url(query):
    return (
        "https://play.clickhouse.com/play?user=play#"
        + base64.b64encode(query.encode()).decode()
    )


def run_clickhouse_query(query):
    url = "https://play.clickhouse.com/?user=play&query=" + requests.utils.quote(query)
    res = requests.get(url)
    if res.status_code != 200:
        print("Failed to execute query: ", res.status_code, res.content)
        raise Exception(
            "Failed to execute query: {}: {}".format(res.status_code, res.content)
        )

    lines = res.text.strip().splitlines()
    return [x.split("\t") for x in lines]


def split_broken_and_flaky_tests(failed_tests):
    if not failed_tests:
        return None

    broken_tests = []
    flaky_tests = []
    for name, report, count_prev_str, count_str in failed_tests:
        count_prev, count = int(count_prev_str), int(count_str)
        if (2 <= count and count_prev < 2) or (count_prev == 1 and count == 1):
            # It failed 2 times or more within extended time window, it's definitely broken.
            # 2 <= count_prev means that it was not reported as broken on previous runs
            broken_tests.append([name, report])
        elif 0 < count and count_prev == 0:
            # It failed only once, can be a rare flaky test
            flaky_tests.append([name, report])

    return broken_tests, flaky_tests


def format_failed_tests_list(failed_tests, failure_type):
    if len(failed_tests) == 1:
        res = "There is a new {} test:\n".format(failure_type)
    else:
        res = "There are {} new {} tests:\n".format(len(failed_tests), failure_type)

    for name, report in failed_tests[:MAX_TESTS_TO_REPORT]:
        cidb_url = get_play_url(ALL_RECENT_FAILURES_QUERY.format(name))
        res += "-   *{}*  -  <{}|Report>  -  <{}|CI DB> \n".format(
            name, report, cidb_url
        )

    if MAX_TESTS_TO_REPORT < len(failed_tests):
        res += "-   and {} other tests... :this-is-fine-fire:".format(
            len(failed_tests) - MAX_TESTS_TO_REPORT
        )

    return res


def get_new_broken_tests_message(failed_tests):
    if not failed_tests:
        return None

    broken_tests, flaky_tests = split_broken_and_flaky_tests(failed_tests)
    if len(broken_tests) == 0 and len(flaky_tests) == 0:
        return None

    msg = ""
    if len(broken_tests) > 0:
        msg += format_failed_tests_list(broken_tests, "*BROKEN*")
    elif random.random() > FLAKY_ALERT_PROBABILITY:
        looks_like_fuzzer = [x[0].count(" ") > 2 for x in flaky_tests]
        if not any(looks_like_fuzzer):
            print("Will not report flaky tests to avoid noise: ", flaky_tests)
            return None

    if len(flaky_tests) > 0:
        if len(msg) > 0:
            msg += "\n"
        msg += format_failed_tests_list(flaky_tests, "flaky")

    return msg


def get_too_many_failures_message_impl(failures_count):
    MAX_FAILURES = int(os.environ.get("MAX_FAILURES", MAX_FAILURES_DEFAULT))
    curr_failures = int(failures_count[0][0])
    prev_failures = int(failures_count[0][1])
    if curr_failures == 0 and prev_failures != 0:
        if random.random() < REPORT_NO_FAILURES_PROBABILITY:
            return None
        return "Wow, there are *no failures* at all... 0_o"
    if curr_failures < MAX_FAILURES:
        return None
    if prev_failures < MAX_FAILURES:
        return ":alert: *CI is broken: there are {} failures during the last 24 hours*".format(
            curr_failures
        )
    if curr_failures < prev_failures:
        return None
    if (curr_failures - prev_failures) / prev_failures < 0.2:
        return None
    return "CI is broken and it's getting worse: there are {} failures during the last 24 hours".format(
        curr_failures
    )


def get_too_many_failures_message(failures_count):
    msg = get_too_many_failures_message_impl(failures_count)
    if msg:
        msg += "\nSee https://aretestsgreenyet.com/"
    return msg


def get_failed_checks_percentage_message(percentage):
    p = float(percentage[0][0]) * 100

    # Always report more than 1% of failed checks
    # For <= 1%: higher percentage of failures == higher probability
    if p <= random.random():
        return None

    msg = ":alert: " if p > 1 else "Only " if p < 0.5 else ""
    msg += "*{0:.2f}%* of all checks in master have failed yesterday".format(p)
    return msg


def split_slack_message(long_message):
    lines = long_message.split("\n")
    messages = []
    curr_msg = ""
    for line in lines:
        if len(curr_msg) + len(line) < MESSAGE_LENGTH_LIMIT:
            curr_msg += "\n"
            curr_msg += line
        else:
            messages.append(curr_msg)
            curr_msg = line
    messages.append(curr_msg)
    return messages


def send_to_slack_impl(message):
    SLACK_URL = os.environ.get("SLACK_URL", SLACK_URL_DEFAULT)
    if SLACK_URL == DRY_RUN_MARK:
        return

    payload = SLACK_MESSAGE_JSON.copy()
    payload["text"] = message
    res = requests.post(SLACK_URL, json.dumps(payload))
    if res.status_code != 200:
        print("Failed to send a message to Slack: ", res.status_code, res.content)
        raise Exception(
            "Failed to send a message to Slack: {}: {}".format(
                res.status_code, res.content
            )
        )


def send_to_slack(message):
    messages = split_slack_message(message)
    for msg in messages:
        send_to_slack_impl(msg)


def query_and_alert_if_needed(query, get_message_func):
    query_res = run_clickhouse_query(query)
    print("Got result {} for query {}", query_res, query)
    msg = get_message_func(query_res)
    if msg is None:
        return

    msg += "\nCI DB query: <{}|link>".format(get_play_url(query))
    print("Sending message to slack:", msg)
    send_to_slack(msg)


def check_and_alert():
    query_and_alert_if_needed(NEW_BROKEN_TESTS_QUERY, get_new_broken_tests_message)
    query_and_alert_if_needed(COUNT_FAILURES_QUERY, get_too_many_failures_message)
    query_and_alert_if_needed(
        FAILED_CHECKS_PERCENTAGE_QUERY, get_failed_checks_percentage_message
    )


def lambda_handler(event, context):
    try:
        check_and_alert()
        return {"statusCode": 200, "body": "OK"}
    except Exception as e:
        send_to_slack(
            "I failed, please help me (see ClickHouse/utils/ci-slack-bot/ci-slack-bot.py): "
            + str(e)
        )
        return {"statusCode": 200, "body": "FAIL"}


if __name__ == "__main__":
    check_and_alert()
add script for a slack bot 2023-04-04 12:27:58 +00:00			`#!/usr/bin/env python3`

			`# A trivial stateless slack bot that notifies about new broken tests in ClickHouse CI.`
			`# It checks what happened to our CI during the last check_period hours (1 hour) and notifies us in slack if necessary.`
			`# This script should be executed once each check_period hours (1 hour).`
			`# It will post duplicate messages if you run it more often; it will lose some messages if you run it less often.`
			`#`
			`# You can run it locally with no arguments, it will work in a dry-run mode. Or you can set your own SLACK_URL_DEFAULT.`
			`# Feel free to add more checks, more details to messages, or better heuristics.`
			`# NOTE There's no deployment automation for now,`
			`# an AWS Lambda (slack-ci-bot-test lambda in CI-CD) has to be updated manually after changing this script.`
			`#`
			`# See also: https://aretestsgreenyet.com/`

			`import os`
			`import json`
			`import base64`
make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`import random`
add script for a slack bot 2023-04-04 12:27:58 +00:00
			`if os.environ.get("AWS_LAMBDA_ENV", "0") == "1":`
			`# For AWS labmda (python 3.7)`
			`from botocore.vendored import requests`
			`else:`
			`# For running locally`
			`import requests`

			`DRY_RUN_MARK = "<no url, dry run>"`

Update ci-slack-bot.py 2023-07-20 13:19:12 +00:00			`MAX_FAILURES_DEFAULT = 30`
add script for a slack bot 2023-04-04 12:27:58 +00:00			`SLACK_URL_DEFAULT = DRY_RUN_MARK`

Update ci-slack-bot.py 2023-07-20 13:19:12 +00:00			`FLAKY_ALERT_PROBABILITY = 0.50`
			`REPORT_NO_FAILURES_PROBABILITY = 0.99`
make CI slack bot less noisy 2023-04-04 23:57:26 +00:00
limit size of messages from the CI slack bot 2023-04-13 21:23:34 +00:00			`MAX_TESTS_TO_REPORT = 4`

better messages formatting 2023-04-12 13:50:06 +00:00			`# Slack has a stupid limitation on message size, it splits long messages into multiple ones breaking formatting`
			`MESSAGE_LENGTH_LIMIT = 4000`

			`# Find tests that failed in master during the last check_period * 24 hours,`
add script for a slack bot 2023-04-04 12:27:58 +00:00			`# but did not fail during the last 2 weeks. Assuming these tests were broken recently.`
better messages formatting 2023-04-12 13:50:06 +00:00			`# Counts number of failures in check_period and check_period * 24 time windows`
make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`# to distinguish rare flaky tests from completely broken tests`
add script for a slack bot 2023-04-04 12:27:58 +00:00			`NEW_BROKEN_TESTS_QUERY = """`
			`WITH`
			`1 AS check_period,`
better messages formatting 2023-04-12 13:50:06 +00:00			`check_period * 24 AS extended_check_period,`
add script for a slack bot 2023-04-04 12:27:58 +00:00			`now() as now`
make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`SELECT`
			`test_name,`
			`any(report_url),`
			`countIf((check_start_time + check_duration_ms / 1000) < now - INTERVAL check_period HOUR) AS count_prev_periods,`
			`countIf((check_start_time + check_duration_ms / 1000) >= now - INTERVAL check_period HOUR) AS count`
add script for a slack bot 2023-04-04 12:27:58 +00:00			`FROM checks`
			`WHERE 1`
make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`AND check_start_time BETWEEN now - INTERVAL 1 WEEK AND now`
			`AND (check_start_time + check_duration_ms / 1000) >= now - INTERVAL extended_check_period HOUR`
add script for a slack bot 2023-04-04 12:27:58 +00:00			`AND pull_request_number = 0`
			`AND test_status LIKE 'F%'`
			`AND check_status != 'success'`
			`AND test_name NOT IN (`
			`SELECT test_name FROM checks WHERE 1`
			`AND check_start_time >= now - INTERVAL 1 MONTH`
make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`AND (check_start_time + check_duration_ms / 1000) BETWEEN now - INTERVAL 2 WEEK AND now - INTERVAL extended_check_period HOUR`
add script for a slack bot 2023-04-04 12:27:58 +00:00			`AND pull_request_number = 0`
			`AND check_status != 'success'`
			`AND test_status LIKE 'F%')`
			`AND test_context_raw NOT LIKE '%CannotSendRequest%' and test_context_raw NOT LIKE '%Server does not respond to health check%'`
			`GROUP BY test_name`
limit size of messages from the CI slack bot 2023-04-13 21:23:34 +00:00			`ORDER BY (count_prev_periods + count) DESC`
add script for a slack bot 2023-04-04 12:27:58 +00:00			`"""`

			`# Returns total number of failed checks during the last 24 hours`
			`# and previous value of that metric (check_period hours ago)`
			`COUNT_FAILURES_QUERY = """`
			`WITH`
			`1 AS check_period,`
			`'%' AS check_name_pattern,`
			`now() as now`
			`SELECT`
			`countIf((check_start_time + check_duration_ms / 1000) >= now - INTERVAL 24 HOUR) AS new_val,`
			`countIf((check_start_time + check_duration_ms / 1000) <= now - INTERVAL check_period HOUR) AS prev_val`
			`FROM checks`
			`WHERE 1`
			`AND check_start_time >= now - INTERVAL 1 WEEK`
			`AND (check_start_time + check_duration_ms / 1000) >= now - INTERVAL 24 + check_period HOUR`
			`AND pull_request_number = 0`
			`AND test_status LIKE 'F%'`
			`AND check_status != 'success'`
			`AND check_name ILIKE check_name_pattern`
			`"""`

Update ci-slack-bot.py 2023-07-20 13:19:12 +00:00			`# Returns percentage of failed checks (once per day, at noon)`
			`FAILED_CHECKS_PERCENTAGE_QUERY = """`
			`SELECT if(toHour(now('Europe/Amsterdam')) = 12, v, 0)`
			`FROM`
			`(`
			`SELECT`
			`countDistinctIf((commit_sha, check_name), (test_status LIKE 'F%') AND (check_status != 'success'))`
			`/ countDistinct((commit_sha, check_name)) AS v`
			`FROM checks`
			`WHERE 1`
			`AND (pull_request_number = 0)`
			`AND (test_status != 'SKIPPED')`
			`AND (check_start_time > (now() - toIntervalDay(1)))`
			`)`
			`"""`

make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`# It shows all recent failures of the specified test (helps to find when it started)`
			`ALL_RECENT_FAILURES_QUERY = """`
			`WITH`
			`'{}' AS name_substr,`
			`90 AS interval_days,`
			`('Stateless tests (asan)', 'Stateless tests (address)', 'Stateless tests (address, actions)') AS backport_and_release_specific_checks`
			`SELECT`
			`toStartOfDay(check_start_time) AS d,`
			`count(),`
			`groupUniqArray(pull_request_number) AS prs,`
			`any(report_url)`
			`FROM checks`
			`WHERE ((now() - toIntervalDay(interval_days)) <= check_start_time) AND (pull_request_number NOT IN (`
			`SELECT pull_request_number AS prn`
			`FROM checks`
			`WHERE (prn != 0) AND ((now() - toIntervalDay(interval_days)) <= check_start_time) AND (check_name IN (backport_and_release_specific_checks))`
			`)) AND (position(test_name, name_substr) > 0) AND (test_status IN ('FAIL', 'ERROR', 'FLAKY'))`
			`GROUP BY d`
			`ORDER BY d DESC`
			`"""`

add script for a slack bot 2023-04-04 12:27:58 +00:00			`SLACK_MESSAGE_JSON = {"type": "mrkdwn", "text": None}`


			`def get_play_url(query):`
			`return (`
			`"https://play.clickhouse.com/play?user=play#"`
			`+ base64.b64encode(query.encode()).decode()`
			`)`


			`def run_clickhouse_query(query):`
			`url = "https://play.clickhouse.com/?user=play&query=" + requests.utils.quote(query)`
			`res = requests.get(url)`
			`if res.status_code != 200:`
			`print("Failed to execute query: ", res.status_code, res.content)`
			`raise Exception(`
			`"Failed to execute query: {}: {}".format(res.status_code, res.content)`
			`)`

			`lines = res.text.strip().splitlines()`
			`return [x.split("\t") for x in lines]`


make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`def split_broken_and_flaky_tests(failed_tests):`
			`if not failed_tests:`
			`return None`

			`broken_tests = []`
			`flaky_tests = []`
			`for name, report, count_prev_str, count_str in failed_tests:`
			`count_prev, count = int(count_prev_str), int(count_str)`
			`if (2 <= count and count_prev < 2) or (count_prev == 1 and count == 1):`
			`# It failed 2 times or more within extended time window, it's definitely broken.`
			`# 2 <= count_prev means that it was not reported as broken on previous runs`
			`broken_tests.append([name, report])`
			`elif 0 < count and count_prev == 0:`
			`# It failed only once, can be a rare flaky test`
			`flaky_tests.append([name, report])`

			`return broken_tests, flaky_tests`


			`def format_failed_tests_list(failed_tests, failure_type):`
			`if len(failed_tests) == 1:`
			`res = "There is a new {} test:\n".format(failure_type)`
			`else:`
			`res = "There are {} new {} tests:\n".format(len(failed_tests), failure_type)`

limit size of messages from the CI slack bot 2023-04-13 21:23:34 +00:00			`for name, report in failed_tests[:MAX_TESTS_TO_REPORT]:`
make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`cidb_url = get_play_url(ALL_RECENT_FAILURES_QUERY.format(name))`
better messages formatting 2023-04-12 13:50:06 +00:00			`res += "- {} - <{}\|Report> - <{}\|CI DB> \n".format(`
make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`name, report, cidb_url`
			`)`
limit size of messages from the CI slack bot 2023-04-13 21:23:34 +00:00
			`if MAX_TESTS_TO_REPORT < len(failed_tests):`
			`res += "- and {} other tests... :this-is-fine-fire:".format(`
			`len(failed_tests) - MAX_TESTS_TO_REPORT`
			`)`

make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`return res`


			`def get_new_broken_tests_message(failed_tests):`
			`if not failed_tests:`
			`return None`

			`broken_tests, flaky_tests = split_broken_and_flaky_tests(failed_tests)`
			`if len(broken_tests) == 0 and len(flaky_tests) == 0:`
			`return None`

			`msg = ""`
			`if len(broken_tests) > 0:`
			`msg += format_failed_tests_list(broken_tests, "BROKEN")`
			`elif random.random() > FLAKY_ALERT_PROBABILITY:`
better messages formatting 2023-04-12 13:50:06 +00:00			`looks_like_fuzzer = [x[0].count(" ") > 2 for x in flaky_tests]`
			`if not any(looks_like_fuzzer):`
			`print("Will not report flaky tests to avoid noise: ", flaky_tests)`
			`return None`
make CI slack bot less noisy 2023-04-04 23:57:26 +00:00
			`if len(flaky_tests) > 0:`
better messages formatting 2023-04-12 13:50:06 +00:00			`if len(msg) > 0:`
			`msg += "\n"`
make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`msg += format_failed_tests_list(flaky_tests, "flaky")`

add script for a slack bot 2023-04-04 12:27:58 +00:00			`return msg`


make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`def get_too_many_failures_message_impl(failures_count):`
add script for a slack bot 2023-04-04 12:27:58 +00:00			`MAX_FAILURES = int(os.environ.get("MAX_FAILURES", MAX_FAILURES_DEFAULT))`
			`curr_failures = int(failures_count[0][0])`
			`prev_failures = int(failures_count[0][1])`
better messages formatting 2023-04-12 13:50:06 +00:00			`if curr_failures == 0 and prev_failures != 0:`
Update ci-slack-bot.py 2023-07-20 13:19:12 +00:00			`if random.random() < REPORT_NO_FAILURES_PROBABILITY:`
			`return None`
			`return "Wow, there are no failures at all... 0_o"`
add script for a slack bot 2023-04-04 12:27:58 +00:00			`if curr_failures < MAX_FAILURES:`
			`return None`
			`if prev_failures < MAX_FAILURES:`
limit size of messages from the CI slack bot 2023-04-13 21:23:34 +00:00			`return ":alert: CI is broken: there are {} failures during the last 24 hours".format(`
add script for a slack bot 2023-04-04 12:27:58 +00:00			`curr_failures`
			`)`
			`if curr_failures < prev_failures:`
			`return None`
			`if (curr_failures - prev_failures) / prev_failures < 0.2:`
			`return None`
			`return "CI is broken and it's getting worse: there are {} failures during the last 24 hours".format(`
			`curr_failures`
			`)`


make CI slack bot less noisy 2023-04-04 23:57:26 +00:00			`def get_too_many_failures_message(failures_count):`
			`msg = get_too_many_failures_message_impl(failures_count)`
			`if msg:`
			`msg += "\nSee https://aretestsgreenyet.com/"`
			`return msg`


Update ci-slack-bot.py 2023-07-20 13:19:12 +00:00			`def get_failed_checks_percentage_message(percentage):`
Update ci-slack-bot.py 2023-07-20 14:59:43 +00:00			`p = float(percentage[0][0]) * 100`
Update ci-slack-bot.py 2023-07-20 13:19:12 +00:00
			`# Always report more than 1% of failed checks`
			`# For <= 1%: higher percentage of failures == higher probability`
			`if p <= random.random():`
			`return None`

			`msg = ":alert: " if p > 1 else "Only " if p < 0.5 else ""`
			`msg += "{0:.2f}% of all checks in master have failed yesterday".format(p)`
			`return msg`


better messages formatting 2023-04-12 13:50:06 +00:00			`def split_slack_message(long_message):`
			`lines = long_message.split("\n")`
			`messages = []`
			`curr_msg = ""`
			`for line in lines:`
			`if len(curr_msg) + len(line) < MESSAGE_LENGTH_LIMIT:`
			`curr_msg += "\n"`
			`curr_msg += line`
			`else:`
			`messages.append(curr_msg)`
			`curr_msg = line`
			`messages.append(curr_msg)`
			`return messages`


			`def send_to_slack_impl(message):`
add script for a slack bot 2023-04-04 12:27:58 +00:00			`SLACK_URL = os.environ.get("SLACK_URL", SLACK_URL_DEFAULT)`
			`if SLACK_URL == DRY_RUN_MARK:`
			`return`

			`payload = SLACK_MESSAGE_JSON.copy()`
			`payload["text"] = message`
			`res = requests.post(SLACK_URL, json.dumps(payload))`
			`if res.status_code != 200:`
			`print("Failed to send a message to Slack: ", res.status_code, res.content)`
			`raise Exception(`
			`"Failed to send a message to Slack: {}: {}".format(`
			`res.status_code, res.content`
			`)`
			`)`


better messages formatting 2023-04-12 13:50:06 +00:00			`def send_to_slack(message):`
			`messages = split_slack_message(message)`
			`for msg in messages:`
			`send_to_slack_impl(msg)`


add script for a slack bot 2023-04-04 12:27:58 +00:00			`def query_and_alert_if_needed(query, get_message_func):`
			`query_res = run_clickhouse_query(query)`
			`print("Got result {} for query {}", query_res, query)`
			`msg = get_message_func(query_res)`
			`if msg is None:`
			`return`

			`msg += "\nCI DB query: <{}\|link>".format(get_play_url(query))`
			`print("Sending message to slack:", msg)`
			`send_to_slack(msg)`


			`def check_and_alert():`
			`query_and_alert_if_needed(NEW_BROKEN_TESTS_QUERY, get_new_broken_tests_message)`
			`query_and_alert_if_needed(COUNT_FAILURES_QUERY, get_too_many_failures_message)`
Update ci-slack-bot.py 2023-07-20 13:19:12 +00:00			`query_and_alert_if_needed(`
			`FAILED_CHECKS_PERCENTAGE_QUERY, get_failed_checks_percentage_message`
			`)`
add script for a slack bot 2023-04-04 12:27:58 +00:00

			`def lambda_handler(event, context):`
			`try:`
			`check_and_alert()`
			`return {"statusCode": 200, "body": "OK"}`
			`except Exception as e:`
			`send_to_slack(`
			`"I failed, please help me (see ClickHouse/utils/ci-slack-bot/ci-slack-bot.py): "`
			`+ str(e)`
			`)`
			`return {"statusCode": 200, "body": "FAIL"}`


			`if __name__ == "__main__":`
			`check_and_alert()`