2023-03-01 19:59:56 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
"""The lambda to decrease/increase ASG desired capacity based on current queue"""
|
|
|
|
|
|
|
|
import logging
|
|
|
|
from dataclasses import dataclass
|
|
|
|
from pprint import pformat
|
2023-03-03 08:53:21 +00:00
|
|
|
from typing import Any, List, Literal, Optional, Tuple
|
2023-03-01 19:59:56 +00:00
|
|
|
|
|
|
|
import boto3 # type: ignore
|
2023-05-22 21:07:35 +00:00
|
|
|
from lambda_shared import (
|
2024-02-07 12:49:35 +00:00
|
|
|
RUNNER_TYPE_LABELS,
|
2023-05-22 21:07:35 +00:00
|
|
|
CHException,
|
|
|
|
ClickHouseHelper,
|
|
|
|
get_parameter_from_ssm,
|
|
|
|
)
|
2023-03-06 13:38:54 +00:00
|
|
|
|
2023-05-22 15:57:12 +00:00
|
|
|
### Update comment on the change ###
|
2023-03-06 13:38:54 +00:00
|
|
|
# 4 HOUR - is a balance to get the most precise values
|
|
|
|
# - Our longest possible running check is around 5h on the worst scenario
|
|
|
|
# - The long queue won't be wiped out and replaced, so the measurement is fine
|
|
|
|
# - If the data is spoiled by something, we are from the bills perspective
|
2023-05-22 15:57:12 +00:00
|
|
|
# Changed it to 3 HOUR: on average we have 1h tasks, but p90 is around 2h.
|
|
|
|
# With 4h we have too much wasted computing time in case of issues with DB
|
2023-03-01 19:59:56 +00:00
|
|
|
QUEUE_QUERY = f"""SELECT
|
|
|
|
last_status AS status,
|
|
|
|
toUInt32(count()) AS length,
|
|
|
|
labels
|
|
|
|
FROM
|
|
|
|
(
|
|
|
|
SELECT
|
|
|
|
arraySort(groupArray(status))[-1] AS last_status,
|
|
|
|
labels,
|
|
|
|
id,
|
|
|
|
html_url
|
|
|
|
FROM default.workflow_jobs
|
|
|
|
WHERE has(labels, 'self-hosted')
|
|
|
|
AND hasAny({RUNNER_TYPE_LABELS}, labels)
|
2023-05-22 15:57:12 +00:00
|
|
|
AND started_at > now() - INTERVAL 3 HOUR
|
2023-03-01 19:59:56 +00:00
|
|
|
GROUP BY ALL
|
|
|
|
HAVING last_status IN ('in_progress', 'queued')
|
|
|
|
)
|
|
|
|
GROUP BY ALL
|
|
|
|
ORDER BY labels, last_status"""
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass
class Queue:
    """The number of jobs of one runner type that are in one given status"""

    # The status shared by all the counted jobs
    status: Literal["in_progress", "queued"]
    # The number of jobs in that status. The name is a misspelling of
    # "length"; it is kept because the rest of the file reads `q.lentgh`
    lentgh: int
    # The runner type label the jobs are attributed to
    label: str
|
|
|
|
|
|
|
|
|
2023-03-03 08:53:21 +00:00
|
|
|
def get_scales(runner_type: str) -> Tuple[int, int]:
    """Return the (scale_down, scale_up) divisors used to resize an ASG.

    `runner_type` is accepted so the values can differ per runner type, but
    currently every type gets the same pair.
    """
    # Scaling down on the lack of running jobs is quicker than scaling up on
    # the queue: the ASG should deflate almost instantly, hence divisor 1.
    #
    # History of the scale-up divisor (UPDATE THIS COMMENT ON CHANGES):
    # - the style checkers are very noisy, so they used to scale up too quickly
    # - 5 was too quick, and there were complaints about 10 being too slow
    # - 7 was tried next and still looked a bit slow, then 6
    # - finally it was made the same as for the other ASGs
    divisors = (1, 3)
    return divisors
|
|
|
|
|
|
|
|
|
2023-05-22 21:07:35 +00:00
|
|
|
# Module-level ClickHouse client. main() creates it on first use, reuses it on
# later calls within the same process, and replaces it with a fresh one when a
# query raises CHException.
CH_CLIENT = None  # type: Optional[ClickHouseHelper]
|
2023-03-01 19:59:56 +00:00
|
|
|
|
|
|
|
|
|
|
|
def set_capacity(
    runner_type: str, queues: List[Queue], client: Any, dry_run: bool = True
) -> None:
    """Scale the ASG tagged with `runner_type` up or down based on the queue.

    Args:
        runner_type: value of the `github:runner-type` tag identifying the ASG.
        queues: one or two Queue items for this runner type, at most one per
            status ("in_progress" and/or "queued").
        client: object exposing `describe_auto_scaling_groups` and
            `set_desired_capacity` (a boto3 "autoscaling" client in main()).
        dry_run: when True the decision is only logged and
            `set_desired_capacity` is never called.

    Raises:
        AssertionError: if `queues` does not contain 1 or 2 items, if an item
            has a label other than `runner_type`, or if the tag filter does
            not match exactly one ASG.
        ValueError: if a queue has a status other than "in_progress"/"queued".
    """
    assert len(queues) in (1, 2)
    assert all(q.label == runner_type for q in queues)
    as_groups = client.describe_auto_scaling_groups(
        Filters=[
            {"Name": "tag-key", "Values": ["github:runner-type"]},
            {"Name": "tag-value", "Values": [runner_type]},
        ]
    )["AutoScalingGroups"]
    assert len(as_groups) == 1
    asg = as_groups[0]
    running = 0
    queued = 0
    for q in queues:
        if q.status == "in_progress":
            running = q.lentgh
            continue
        if q.status == "queued":
            queued = q.lentgh
            continue
        raise ValueError("Queue status is not in ['in_progress', 'queued']")

    scale_down, scale_up = get_scales(runner_type)
    # With lifecycle hooks some instances are actually free because some of
    # them are in 'Terminating:Wait' state
    effective_capacity = max(
        asg["DesiredCapacity"],
        sum(1 for ins in asg["Instances"] if ins["HealthStatus"] == "Healthy"),
    )

    # How much nodes are free (positive) or need to be added (negative)
    capacity_reserve = effective_capacity - running - queued
    stop = False
    if capacity_reserve < 0:
        # This part is about scaling up
        capacity_deficit = -capacity_reserve
        # It looks that we are still OK, since no queued jobs exist
        stop = stop or queued == 0
        # Are we already at the capacity limits
        stop = stop or asg["MaxSize"] <= asg["DesiredCapacity"]
        # Let's calculate a new desired capacity
        # (capacity_deficit + scale_up - 1) // scale_up is a ceiling division:
        # it adds at least 1 if there is any capacity_deficit
        desired_capacity = (
            asg["DesiredCapacity"] + (capacity_deficit + scale_up - 1) // scale_up
        )
        desired_capacity = max(desired_capacity, asg["MinSize"])
        desired_capacity = min(desired_capacity, asg["MaxSize"])
        # Finally, should the capacity be even changed
        stop = stop or asg["DesiredCapacity"] == desired_capacity
        if stop:
            logging.info(
                "Do not increase ASG %s capacity, current capacity=%s, effective "
                "capacity=%s, maximum capacity=%s, running jobs=%s, queue size=%s",
                asg["AutoScalingGroupName"],
                asg["DesiredCapacity"],
                effective_capacity,
                asg["MaxSize"],
                running,
                queued,
            )
            return

        # The arguments follow the order of the placeholders: the current
        # (desired) capacity goes before the effective one
        logging.info(
            "The ASG %s capacity will be increased to %s, current capacity=%s, "
            "effective capacity=%s, maximum capacity=%s, running jobs=%s, "
            "queue size=%s",
            asg["AutoScalingGroupName"],
            desired_capacity,
            asg["DesiredCapacity"],
            effective_capacity,
            asg["MaxSize"],
            running,
            queued,
        )
        if not dry_run:
            client.set_desired_capacity(
                AutoScalingGroupName=asg["AutoScalingGroupName"],
                DesiredCapacity=desired_capacity,
            )
        return

    # Now we will calculate if we need to scale down
    stop = stop or asg["DesiredCapacity"] == asg["MinSize"]
    desired_capacity = asg["DesiredCapacity"] - (capacity_reserve // scale_down)
    desired_capacity = max(desired_capacity, asg["MinSize"])
    desired_capacity = min(desired_capacity, asg["MaxSize"])
    stop = stop or asg["DesiredCapacity"] == desired_capacity
    if stop:
        logging.info(
            "Do not decrease ASG %s capacity, current capacity=%s, effective "
            "capacity=%s, minimum capacity=%s, running jobs=%s, queue size=%s",
            asg["AutoScalingGroupName"],
            asg["DesiredCapacity"],
            effective_capacity,
            asg["MinSize"],
            running,
            queued,
        )
        return

    logging.info(
        "The ASG %s capacity will be decreased to %s, current capacity=%s, effective "
        "capacity=%s, minimum capacity=%s, running jobs=%s, queue size=%s",
        asg["AutoScalingGroupName"],
        desired_capacity,
        asg["DesiredCapacity"],
        effective_capacity,
        asg["MinSize"],
        running,
        queued,
    )
    if not dry_run:
        client.set_desired_capacity(
            AutoScalingGroupName=asg["AutoScalingGroupName"],
            DesiredCapacity=desired_capacity,
        )
|
|
|
|
|
|
|
|
|
|
|
|
def main(dry_run: bool = True) -> None:
    """Fetch the job queue from ClickHouse and resize every runner ASG.

    The queue is read with QUEUE_QUERY through the module-level CH_CLIENT,
    which is created on first use. If the query raises CHException, the
    client is recreated with a freshly read SSM parameter and the query is
    repeated once; a second failure propagates to the caller.

    Args:
        dry_run: passed through to set_capacity(); when True the capacity
            changes are only logged.
    """
    global CH_CLIENT
    logging.getLogger().setLevel(logging.INFO)
    asg_client = boto3.client("autoscaling")
    try:
        CH_CLIENT = CH_CLIENT or ClickHouseHelper(
            get_parameter_from_ssm("clickhouse-test-stat-url"), "play"
        )
        queues = CH_CLIENT.select_json_each_row("default", QUEUE_QUERY)
    except CHException:
        # logging.exception() attaches the exception being handled itself
        logging.exception(
            "Got an exception on select, trying to update the client "
            "credentials and repeat"
        )
        CH_CLIENT = ClickHouseHelper(
            get_parameter_from_ssm("clickhouse-test-stat-url"), "play"
        )
        queues = CH_CLIENT.select_json_each_row("default", QUEUE_QUERY)

    logging.info("Received queue data:\n%s", pformat(queues, width=120))
    for runner_type in RUNNER_TYPE_LABELS:
        runner_queues = [
            Queue(queue["status"], queue["length"], runner_type)
            for queue in queues
            if runner_type in queue["labels"]
        ]
        # No rows for the runner type means nothing is running or queued, so
        # an empty "in_progress" queue is passed to let the ASG scale down
        runner_queues = runner_queues or [Queue("in_progress", 0, runner_type)]
        set_capacity(runner_type, runner_queues, asg_client, dry_run)
|
|
|
|
|
|
|
|
|
|
|
|
def handler(event: dict, context: Any) -> None:
    """The lambda entry point: runs main() with dry_run disabled.

    Both `event` and `context` are ignored.
    """
    del event, context  # neither argument is used
    return main(dry_run=False)
|