ClickHouse/tests/ci/autoscale_runners_lambda/app.py

#!/usr/bin/env python3

"""The lambda to decrease/increase ASG desired capacity based on current queue"""

import logging
from dataclasses import dataclass
from pprint import pformat
from typing import Any, List, Literal, Optional, Tuple

import boto3  # type: ignore

from lambda_shared import (
    CHException,
    ClickHouseHelper,
    RUNNER_TYPE_LABELS,
    get_parameter_from_ssm,
)

### Update comment on the change ###
# 4 HOUR - is a balance to get the most precise values
#   - Our longest possible running check is around 5h on the worst scenario
#   - The long queue won't be wiped out and replaced, so the measurmenet is fine
#   - If the data is spoiled by something, we are from the bills perspective
# Changed it to 3 HOUR: in average we have 1h tasks, but p90 is around 2h.
# With 4h we have too much wasted computing time in case of issues with DB
QUEUE_QUERY = f"""SELECT
    last_status AS status,
    toUInt32(count()) AS length,
    labels
FROM
(
    SELECT
        arraySort(groupArray(status))[-1] AS last_status,
        labels,
        id,
        html_url
    FROM default.workflow_jobs
    WHERE has(labels, 'self-hosted')
        AND hasAny({RUNNER_TYPE_LABELS}, labels)
        AND started_at > now() - INTERVAL 3 HOUR
    GROUP BY ALL
    HAVING last_status IN ('in_progress', 'queued')
)
GROUP BY ALL
ORDER BY labels, last_status"""


@dataclass
class Queue:
    status: Literal["in_progress", "queued"]
    lentgh: int
    label: str


def get_scales(runner_type: str) -> Tuple[int, int]:
    "returns the multipliers for scaling down and up ASG by types"
    # Scaling down is quicker on the lack of running jobs than scaling up on
    # queue
    scale_down = 2
    scale_up = 5
    if runner_type == "style-checker":
        # The ASG should deflate almost instantly
        scale_down = 1
        # the style checkers have so many noise, so it scales up too quickly
        # The 5 was too quick, there are complainings regarding too slow with
        # 10. I am trying 7 now.
        # 7 still looks a bit slow, so I try 6
        # UPDATE THE COMMENT ON CHANGES
        scale_up = 6
    elif runner_type == "limited-tester":
        # The limited runners should inflate and deflate faster
        scale_down = 1
        scale_up = 2
    return scale_down, scale_up


CH_CLIENT = None  # type: Optional[ClickHouseHelper]


def set_capacity(
    runner_type: str, queues: List[Queue], client: Any, dry_run: bool = True
) -> None:
    assert len(queues) in (1, 2)
    assert all(q.label == runner_type for q in queues)
    as_groups = client.describe_auto_scaling_groups(
        Filters=[
            {"Name": "tag-key", "Values": ["github:runner-type"]},
            {"Name": "tag-value", "Values": [runner_type]},
        ]
    )["AutoScalingGroups"]
    assert len(as_groups) == 1
    asg = as_groups[0]
    running = 0
    queued = 0
    for q in queues:
        if q.status == "in_progress":
            running = q.lentgh
            continue
        if q.status == "queued":
            queued = q.lentgh
            continue
        raise ValueError("Queue status is not in ['in_progress', 'queued']")

    scale_down, scale_up = get_scales(runner_type)
    # With lyfecycle hooks some instances are actually free because some of
    # them are in 'Terminating:Wait' state
    effective_capacity = max(
        asg["DesiredCapacity"],
        len([ins for ins in asg["Instances"] if ins["HealthStatus"] == "Healthy"]),
    )

    # How much nodes are free (positive) or need to be added (negative)
    capacity_reserve = effective_capacity - running - queued
    stop = False
    if capacity_reserve < 0:
        # This part is about scaling up
        capacity_deficit = -capacity_reserve
        # It looks that we are still OK, since no queued jobs exist
        stop = stop or queued == 0
        # Are we already at the capacity limits
        stop = stop or asg["MaxSize"] <= asg["DesiredCapacity"]
        # Let's calculate a new desired capacity
        desired_capacity = asg["DesiredCapacity"] + (capacity_deficit // scale_up)
        desired_capacity = max(desired_capacity, asg["MinSize"])
        desired_capacity = min(desired_capacity, asg["MaxSize"])
        # Finally, should the capacity be even changed
        stop = stop or asg["DesiredCapacity"] == desired_capacity
        if stop:
            logging.info(
                "Do not increase ASG %s capacity, current capacity=%s, effective "
                "capacity=%s, maximum capacity=%s, running jobs=%s, queue size=%s",
                asg["AutoScalingGroupName"],
                asg["DesiredCapacity"],
                effective_capacity,
                asg["MaxSize"],
                running,
                queued,
            )
            return

        logging.info(
            "The ASG %s capacity will be increased to %s, current capacity=%s, "
            "effective capacity=%sm maximum capacity=%s, running jobs=%s, queue size=%s",
            asg["AutoScalingGroupName"],
            desired_capacity,
            effective_capacity,
            asg["DesiredCapacity"],
            asg["MaxSize"],
            running,
            queued,
        )
        if not dry_run:
            client.set_desired_capacity(
                AutoScalingGroupName=asg["AutoScalingGroupName"],
                DesiredCapacity=desired_capacity,
            )
        return

    # Now we will calculate if we need to scale down
    stop = stop or asg["DesiredCapacity"] == asg["MinSize"]
    desired_capacity = asg["DesiredCapacity"] - (capacity_reserve // scale_down)
    desired_capacity = max(desired_capacity, asg["MinSize"])
    desired_capacity = min(desired_capacity, asg["MaxSize"])
    stop = stop or asg["DesiredCapacity"] == desired_capacity
    if stop:
        logging.info(
            "Do not decrease ASG %s capacity, current capacity=%s, effective "
            "capacity=%s, minimum capacity=%s, running jobs=%s, queue size=%s",
            asg["AutoScalingGroupName"],
            asg["DesiredCapacity"],
            effective_capacity,
            asg["MinSize"],
            running,
            queued,
        )
        return

    logging.info(
        "The ASG %s capacity will be decreased to %s, current capacity=%s, effective "
        "capacity=%s, minimum capacity=%s, running jobs=%s, queue size=%s",
        asg["AutoScalingGroupName"],
        desired_capacity,
        asg["DesiredCapacity"],
        effective_capacity,
        asg["MinSize"],
        running,
        queued,
    )
    if not dry_run:
        client.set_desired_capacity(
            AutoScalingGroupName=asg["AutoScalingGroupName"],
            DesiredCapacity=desired_capacity,
        )


def main(dry_run: bool = True) -> None:
    logging.getLogger().setLevel(logging.INFO)
    asg_client = boto3.client("autoscaling")
    try:
        global CH_CLIENT
        CH_CLIENT = CH_CLIENT or ClickHouseHelper(
            get_parameter_from_ssm("clickhouse-test-stat-url"), "play"
        )
        queues = CH_CLIENT.select_json_each_row("default", QUEUE_QUERY)
    except CHException as ex:
        logging.exception(
            "Got an exception on insert, tryuing to update the client "
            "credentials and repeat",
            exc_info=ex,
        )
        CH_CLIENT = ClickHouseHelper(
            get_parameter_from_ssm("clickhouse-test-stat-url"), "play"
        )
        queues = CH_CLIENT.select_json_each_row("default", QUEUE_QUERY)

    logging.info("Received queue data:\n%s", pformat(queues, width=120))
    for runner_type in RUNNER_TYPE_LABELS:
        runner_queues = [
            Queue(queue["status"], queue["length"], runner_type)
            for queue in queues
            if runner_type in queue["labels"]
        ]
        runner_queues = runner_queues or [Queue("in_progress", 0, runner_type)]
        set_capacity(runner_type, runner_queues, asg_client, dry_run)


def handler(event: dict, context: Any) -> None:
    _ = event
    _ = context
    return main(False)
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`#!/usr/bin/env python3`

			`"""The lambda to decrease/increase ASG desired capacity based on current queue"""`

			`import logging`
			`from dataclasses import dataclass`
			`from pprint import pformat`
Tune style-checker scaling up and down by modifiers 2023-03-03 08:53:21 +00:00			`from typing import Any, List, Literal, Optional, Tuple`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00
			`import boto3 # type: ignore`
Create lambda_shared package for lambdas 2023-05-22 21:07:35 +00:00
			`from lambda_shared import (`
			`CHException,`
			`ClickHouseHelper,`
			`RUNNER_TYPE_LABELS,`
			`get_parameter_from_ssm,`
			`)`
Adjust the aggregation interval 2023-03-06 13:38:54 +00:00
Decrease the time window for autoscale_runners_lambda 2023-05-22 15:57:12 +00:00			`### Update comment on the change ###`
Adjust the aggregation interval 2023-03-06 13:38:54 +00:00			`# 4 HOUR - is a balance to get the most precise values`
			`# - Our longest possible running check is around 5h on the worst scenario`
			`# - The long queue won't be wiped out and replaced, so the measurmenet is fine`
			`# - If the data is spoiled by something, we are from the bills perspective`
Decrease the time window for autoscale_runners_lambda 2023-05-22 15:57:12 +00:00			`# Changed it to 3 HOUR: in average we have 1h tasks, but p90 is around 2h.`
			`# With 4h we have too much wasted computing time in case of issues with DB`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`QUEUE_QUERY = f"""SELECT`
			`last_status AS status,`
			`toUInt32(count()) AS length,`
			`labels`
			`FROM`
			`(`
			`SELECT`
			`arraySort(groupArray(status))[-1] AS last_status,`
			`labels,`
			`id,`
			`html_url`
			`FROM default.workflow_jobs`
			`WHERE has(labels, 'self-hosted')`
			`AND hasAny({RUNNER_TYPE_LABELS}, labels)`
Decrease the time window for autoscale_runners_lambda 2023-05-22 15:57:12 +00:00			`AND started_at > now() - INTERVAL 3 HOUR`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`GROUP BY ALL`
			`HAVING last_status IN ('in_progress', 'queued')`
			`)`
			`GROUP BY ALL`
			`ORDER BY labels, last_status"""`


			`@dataclass`
			`class Queue:`
			`status: Literal["in_progress", "queued"]`
			`lentgh: int`
			`label: str`


Tune style-checker scaling up and down by modifiers 2023-03-03 08:53:21 +00:00			`def get_scales(runner_type: str) -> Tuple[int, int]:`
			`"returns the multipliers for scaling down and up ASG by types"`
			`# Scaling down is quicker on the lack of running jobs than scaling up on`
			`# queue`
Decrease scale_down ratio for faster deflation 2023-03-20 19:56:11 +00:00			`scale_down = 2`
Tune style-checker scaling up and down by modifiers 2023-03-03 08:53:21 +00:00			`scale_up = 5`
			`if runner_type == "style-checker":`
Experiment with decreasing scale_up for style-checkers 2023-06-19 16:43:38 +00:00			`# The ASG should deflate almost instantly`
Decrease scale_down ratio for faster deflation 2023-03-20 19:56:11 +00:00			`scale_down = 1`
Experiment with decreasing scale_up for style-checkers 2023-06-19 16:43:38 +00:00			`# the style checkers have so many noise, so it scales up too quickly`
Update the style_checker upscale speed 2023-04-25 10:42:06 +00:00			`# The 5 was too quick, there are complainings regarding too slow with`
			`# 10. I am trying 7 now.`
Experiment with decreasing scale_up for style-checkers 2023-06-19 16:43:38 +00:00			`# 7 still looks a bit slow, so I try 6`
Update the style_checker upscale speed 2023-04-25 10:42:06 +00:00			`# UPDATE THE COMMENT ON CHANGES`
Experiment with decreasing scale_up for style-checkers 2023-06-19 16:43:38 +00:00			`scale_up = 6`
Add a new runner type for ci metrics and autoscaling 2023-05-31 21:17:41 +00:00			`elif runner_type == "limited-tester":`
			`# The limited runners should inflate and deflate faster`
			`scale_down = 1`
			`scale_up = 2`
Tune style-checker scaling up and down by modifiers 2023-03-03 08:53:21 +00:00			`return scale_down, scale_up`


Create lambda_shared package for lambdas 2023-05-22 21:07:35 +00:00			`CH_CLIENT = None # type: Optional[ClickHouseHelper]`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00

			`def set_capacity(`
			`runner_type: str, queues: List[Queue], client: Any, dry_run: bool = True`
			`) -> None:`
			`assert len(queues) in (1, 2)`
			`assert all(q.label == runner_type for q in queues)`
			`as_groups = client.describe_auto_scaling_groups(`
			`Filters=[`
			`{"Name": "tag-key", "Values": ["github:runner-type"]},`
			`{"Name": "tag-value", "Values": [runner_type]},`
			`]`
			`)["AutoScalingGroups"]`
			`assert len(as_groups) == 1`
			`asg = as_groups[0]`
			`running = 0`
			`queued = 0`
			`for q in queues:`
			`if q.status == "in_progress":`
			`running = q.lentgh`
			`continue`
			`if q.status == "queued":`
			`queued = q.lentgh`
			`continue`
			`raise ValueError("Queue status is not in ['in_progress', 'queued']")`

Tune style-checker scaling up and down by modifiers 2023-03-03 08:53:21 +00:00			`scale_down, scale_up = get_scales(runner_type)`
Calculate current capacity according to HealthStatus 2023-06-06 18:39:05 +00:00			`# With lyfecycle hooks some instances are actually free because some of`
			`# them are in 'Terminating:Wait' state`
			`effective_capacity = max(`
			`asg["DesiredCapacity"],`
			`len([ins for ins in asg["Instances"] if ins["HealthStatus"] == "Healthy"]),`
			`)`

Make smarter scaler based on reserve/deficit 2023-03-03 11:09:09 +00:00			`# How much nodes are free (positive) or need to be added (negative)`
Calculate current capacity according to HealthStatus 2023-06-06 18:39:05 +00:00			`capacity_reserve = effective_capacity - running - queued`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`stop = False`
Make smarter scaler based on reserve/deficit 2023-03-03 11:09:09 +00:00			`if capacity_reserve < 0:`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`# This part is about scaling up`
Make smarter scaler based on reserve/deficit 2023-03-03 11:09:09 +00:00			`capacity_deficit = -capacity_reserve`
			`# It looks that we are still OK, since no queued jobs exist`
			`stop = stop or queued == 0`
			`# Are we already at the capacity limits`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`stop = stop or asg["MaxSize"] <= asg["DesiredCapacity"]`
Tune style-checker scaling up and down by modifiers 2023-03-03 08:53:21 +00:00			`# Let's calculate a new desired capacity`
Make smarter scaler based on reserve/deficit 2023-03-03 11:09:09 +00:00			`desired_capacity = asg["DesiredCapacity"] + (capacity_deficit // scale_up)`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`desired_capacity = max(desired_capacity, asg["MinSize"])`
			`desired_capacity = min(desired_capacity, asg["MaxSize"])`
			`# Finally, should the capacity be even changed`
			`stop = stop or asg["DesiredCapacity"] == desired_capacity`
			`if stop:`
Additional logging in autoscale_runners_lambda 2023-06-01 15:12:19 +00:00			`logging.info(`
Calculate current capacity according to HealthStatus 2023-06-06 18:39:05 +00:00			`"Do not increase ASG %s capacity, current capacity=%s, effective "`
			`"capacity=%s, maximum capacity=%s, running jobs=%s, queue size=%s",`
Additional logging in autoscale_runners_lambda 2023-06-01 15:12:19 +00:00			`asg["AutoScalingGroupName"],`
Calculate current capacity according to HealthStatus 2023-06-06 18:39:05 +00:00			`asg["DesiredCapacity"],`
			`effective_capacity,`
Additional logging in autoscale_runners_lambda 2023-06-01 15:12:19 +00:00			`asg["MaxSize"],`
			`running,`
			`queued,`
			`)`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`return`
Additional logging in autoscale_runners_lambda 2023-06-01 15:12:19 +00:00
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`logging.info(`
			`"The ASG %s capacity will be increased to %s, current capacity=%s, "`
Calculate current capacity according to HealthStatus 2023-06-06 18:39:05 +00:00			`"effective capacity=%sm maximum capacity=%s, running jobs=%s, queue size=%s",`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`asg["AutoScalingGroupName"],`
			`desired_capacity,`
Calculate current capacity according to HealthStatus 2023-06-06 18:39:05 +00:00			`effective_capacity,`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`asg["DesiredCapacity"],`
			`asg["MaxSize"],`
			`running,`
			`queued,`
			`)`
			`if not dry_run:`
			`client.set_desired_capacity(`
			`AutoScalingGroupName=asg["AutoScalingGroupName"],`
			`DesiredCapacity=desired_capacity,`
			`)`
			`return`

			`# Now we will calculate if we need to scale down`
Make smarter scaler based on reserve/deficit 2023-03-03 11:09:09 +00:00			`stop = stop or asg["DesiredCapacity"] == asg["MinSize"]`
Tune style-checker scaling up and down by modifiers 2023-03-03 08:53:21 +00:00			`desired_capacity = asg["DesiredCapacity"] - (capacity_reserve // scale_down)`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`desired_capacity = max(desired_capacity, asg["MinSize"])`
			`desired_capacity = min(desired_capacity, asg["MaxSize"])`
			`stop = stop or asg["DesiredCapacity"] == desired_capacity`
			`if stop:`
Additional logging in autoscale_runners_lambda 2023-06-01 15:12:19 +00:00			`logging.info(`
Calculate current capacity according to HealthStatus 2023-06-06 18:39:05 +00:00			`"Do not decrease ASG %s capacity, current capacity=%s, effective "`
			`"capacity=%s, minimum capacity=%s, running jobs=%s, queue size=%s",`
Additional logging in autoscale_runners_lambda 2023-06-01 15:12:19 +00:00			`asg["AutoScalingGroupName"],`
Calculate current capacity according to HealthStatus 2023-06-06 18:39:05 +00:00			`asg["DesiredCapacity"],`
			`effective_capacity,`
Additional logging in autoscale_runners_lambda 2023-06-01 15:12:19 +00:00			`asg["MinSize"],`
			`running,`
			`queued,`
			`)`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`return`

			`logging.info(`
Calculate current capacity according to HealthStatus 2023-06-06 18:39:05 +00:00			`"The ASG %s capacity will be decreased to %s, current capacity=%s, effective "`
			`"capacity=%s, minimum capacity=%s, running jobs=%s, queue size=%s",`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`asg["AutoScalingGroupName"],`
			`desired_capacity,`
			`asg["DesiredCapacity"],`
Calculate current capacity according to HealthStatus 2023-06-06 18:39:05 +00:00			`effective_capacity,`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`asg["MinSize"],`
			`running,`
			`queued,`
			`)`
			`if not dry_run:`
			`client.set_desired_capacity(`
			`AutoScalingGroupName=asg["AutoScalingGroupName"],`
			`DesiredCapacity=desired_capacity,`
			`)`


			`def main(dry_run: bool = True) -> None:`
			`logging.getLogger().setLevel(logging.INFO)`
			`asg_client = boto3.client("autoscaling")`
			`try:`
			`global CH_CLIENT`
Create lambda_shared package for lambdas 2023-05-22 21:07:35 +00:00			`CH_CLIENT = CH_CLIENT or ClickHouseHelper(`
			`get_parameter_from_ssm("clickhouse-test-stat-url"), "play"`
			`)`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`queues = CH_CLIENT.select_json_each_row("default", QUEUE_QUERY)`
			`except CHException as ex:`
			`logging.exception(`
			`"Got an exception on insert, tryuing to update the client "`
			`"credentials and repeat",`
			`exc_info=ex,`
			`)`
			`CH_CLIENT = ClickHouseHelper(`
			`get_parameter_from_ssm("clickhouse-test-stat-url"), "play"`
			`)`
			`queues = CH_CLIENT.select_json_each_row("default", QUEUE_QUERY)`

			`logging.info("Received queue data:\n%s", pformat(queues, width=120))`
			`for runner_type in RUNNER_TYPE_LABELS:`
			`runner_queues = [`
			`Queue(queue["status"], queue["length"], runner_type)`
			`for queue in queues`
			`if runner_type in queue["labels"]`
			`]`
Reduce statisctics interval, process empty response for runners 2023-03-03 11:01:05 +00:00			`runner_queues = runner_queues or [Queue("in_progress", 0, runner_type)]`
Add code for autoscaling lambda 2023-03-01 19:59:56 +00:00			`set_capacity(runner_type, runner_queues, asg_client, dry_run)`


			`def handler(event: dict, context: Any) -> None:`
			`_ = event`
			`_ = context`
			`return main(False)`