ClickHouse/tests/ci/termination_lambda/app.py

284 lines
10 KiB
Python
Raw Normal View History

2021-09-30 10:12:58 +00:00
#!/usr/bin/env python3
import requests
import argparse
import jwt
import sys
import json
import time
from collections import namedtuple
def get_key_and_app_from_aws():
    """Load the GitHub App credentials from AWS Secrets Manager.

    Reads the secret named ``clickhouse_github_secret_key``, parses its
    ``SecretString`` as JSON and returns a ``(private_key, app_id)`` tuple,
    where ``app_id`` is converted to ``int``.
    """
    # Imported lazily so the module can be loaded without boto3 installed.
    import boto3

    secrets_client = boto3.session.Session().client(service_name='secretsmanager')
    secret_response = secrets_client.get_secret_value(
        SecretId="clickhouse_github_secret_key",
    )
    secret = json.loads(secret_response['SecretString'])
    app_key = secret['clickhouse-app-key']
    app_id = int(secret['clickhouse-app-id'])
    return app_key, app_id
def get_installation_id(jwt_token):
    """Return the id of the first installation of the GitHub App.

    Queries ``/app/installations`` authenticated with the given JWT and
    raises ``requests.HTTPError`` on a non-success response.
    """
    response = requests.get(
        "https://api.github.com/app/installations",
        headers={
            "Authorization": f"Bearer {jwt_token}",
            "Accept": "application/vnd.github.v3+json",
        },
    )
    response.raise_for_status()
    installations = response.json()
    # Only the first installation in the response is used.
    first_installation = installations[0]
    return first_installation['id']
def get_access_token(jwt_token, installation_id):
    """Exchange the App JWT for an installation access token.

    POSTs to ``/app/installations/{installation_id}/access_tokens`` and
    returns the ``token`` field of the JSON response. Raises
    ``requests.HTTPError`` on a non-success response.
    """
    url = f"https://api.github.com/app/installations/{installation_id}/access_tokens"
    response = requests.post(
        url,
        headers={
            "Authorization": f"Bearer {jwt_token}",
            "Accept": "application/vnd.github.v3+json",
        },
    )
    response.raise_for_status()
    return response.json()['token']
# Lightweight record describing one self-hosted runner as built by list_runners():
# its id, name, list of label names, and the offline / busy flags.
RunnerDescription = namedtuple('RunnerDescription', 'id name tags offline busy')
def list_runners(access_token):
    """Fetch every self-hosted runner of the ClickHouse organization.

    Requests the runners list page by page (100 per page) and converts
    each entry into a ``RunnerDescription``.

    Args:
        access_token: installation access token used for authorization.

    Returns:
        A list of ``RunnerDescription`` tuples, one per runner returned
        by the API.

    Raises:
        requests.HTTPError: if any page request returns an error status.
    """
    headers = {
        "Authorization": f"token {access_token}",
        "Accept": "application/vnd.github.v3+json",
    }
    per_page = 100
    base_url = "https://api.github.com/orgs/ClickHouse/actions/runners"

    response = requests.get(f"{base_url}?per_page={per_page}", headers=headers)
    response.raise_for_status()
    data = response.json()
    total_runners = data['total_count']
    runners = data['runners']

    # Integer ceiling division: an exact multiple of per_page must not
    # trigger a request for an extra, empty page.
    total_pages = (total_runners + per_page - 1) // per_page
    for page in range(2, total_pages + 1):
        response = requests.get(f"{base_url}?page={page}&per_page={per_page}", headers=headers)
        response.raise_for_status()
        data = response.json()
        runners += data['runners']

    print("Total runners", len(runners))

    return [
        RunnerDescription(
            id=runner['id'],
            name=runner['name'],
            tags=[tag['name'] for tag in runner['labels']],
            offline=runner['status'] == 'offline',
            busy=runner['busy'],
        )
        for runner in runners
    ]
2021-09-30 10:39:15 +00:00
def push_metrics_to_cloudwatch(listed_runners, namespace):
    """Publish runner counters to CloudWatch.

    Sends four metrics in a single ``put_metric_data`` call:
    ``BusyRunners``, ``ActiveRunners`` (runners that are not offline),
    ``TotalRunners`` and ``BusyRunnersRatio`` (percent of active runners
    that are busy).

    Args:
        listed_runners: iterable-with-length of objects exposing ``busy``
            and ``offline`` attributes (e.g. ``RunnerDescription``).
        namespace: CloudWatch namespace the metrics are written to.
    """
    # Imported lazily so the module can be loaded without boto3 installed.
    import boto3

    client = boto3.client('cloudwatch')

    busy_runners = sum(1 for runner in listed_runners if runner.busy)
    total_active_runners = sum(1 for runner in listed_runners if not runner.offline)
    total_runners = len(listed_runners)

    if total_active_runners == 0:
        # No active runners at all is reported as fully busy.
        busy_ratio = 100
    else:
        busy_ratio = busy_runners / total_active_runners * 100

    metrics_data = [
        {
            'MetricName': 'BusyRunners',
            'Value': busy_runners,
            'Unit': 'Count',
        },
        {
            'MetricName': 'ActiveRunners',
            'Value': total_active_runners,
            'Unit': 'Count',
        },
        {
            'MetricName': 'TotalRunners',
            'Value': total_runners,
            'Unit': 'Count',
        },
        {
            'MetricName': 'BusyRunnersRatio',
            'Value': busy_ratio,
            'Unit': 'Percent',
        },
    ]

    # Use the caller-supplied namespace instead of a hard-coded one.
    client.put_metric_data(Namespace=namespace, MetricData=metrics_data)
2021-09-30 10:12:58 +00:00
def how_many_instances_to_kill(event_data):
    """Sum the requested termination capacity per availability zone.

    Walks ``event_data['CapacityToTerminate']`` and returns a dict that
    maps each ``AvailabilityZone`` to the total of its ``Capacity``
    values.
    """
    capacity_per_zone = {}
    for entry in event_data['CapacityToTerminate']:
        az = entry['AvailabilityZone']
        requested = entry['Capacity']
        capacity_per_zone[az] = capacity_per_zone.get(az, 0) + requested
    return capacity_per_zone
def get_candidates_to_be_killed(event_data):
    """Group the candidate instance ids by availability zone.

    Walks ``event_data['Instances']`` and returns a dict that maps each
    ``AvailabilityZone`` to the list of its ``InstanceId`` values, in
    the order they appear in the event.
    """
    ids_per_zone = {}
    for item in event_data['Instances']:
        az = item['AvailabilityZone']
        instance_id = item['InstanceId']
        ids_per_zone.setdefault(az, []).append(instance_id)
    return ids_per_zone
def delete_runner(access_token, runner):
    """Remove a runner from the ClickHouse organization on GitHub.

    Issues a DELETE for ``runner.id``, logs the response code and
    returns ``True`` only when the response status is 204. Raises
    ``requests.HTTPError`` on an error status.
    """
    url = f"https://api.github.com/orgs/ClickHouse/actions/runners/{runner.id}"
    response = requests.delete(
        url,
        headers={
            "Authorization": f"token {access_token}",
            "Accept": "application/vnd.github.v3+json",
        },
    )
    response.raise_for_status()
    status = response.status_code
    print(f"Response code deleting {runner.name} with id {runner.id} is {status}")
    return status == 204
def main(github_secret_key, github_app_id, event):
    """Choose which instances may be terminated for a scale-in event.

    For every availability zone in the event, candidates that are not
    registered as runners are selected unconditionally; idle runners are
    then selected (and deleted from GitHub) until the capacity requested
    for that zone is reached. Busy runners are never selected.

    Args:
        github_secret_key: private key used to sign the GitHub App JWT.
        github_app_id: GitHub App id, used as the JWT issuer.
        event: dict with ``CapacityToTerminate`` and ``Instances`` lists.

    Returns:
        ``{"InstanceIDs": [...]}`` with the names of the selected instances.

    Raises:
        Exception: if a zone asks to terminate more instances than it has
            candidates for (including a zone with no candidates at all).
        requests.HTTPError: if a GitHub API call fails.
    """
    print("Got event", json.dumps(event, sort_keys=True, indent=4))
    to_kill_by_zone = how_many_instances_to_kill(event)
    instances_by_zone = get_candidates_to_be_killed(event)

    # The token is back-dated by a minute and is valid for ten minutes.
    now = int(time.time())
    payload = {
        "iat": now - 60,
        "exp": now + (10 * 60),
        "iss": github_app_id,
    }

    encoded_jwt = jwt.encode(payload, github_secret_key, algorithm="RS256")
    installation_id = get_installation_id(encoded_jwt)
    access_token = get_access_token(encoded_jwt, installation_id)

    runners = list_runners(access_token)
    # Index runners by name once instead of rebuilding a set per candidate.
    # setdefault keeps the first runner for a name, like a linear scan would.
    runners_by_name = {}
    for runner in runners:
        runners_by_name.setdefault(runner.name, runner)

    to_delete_runners = []
    instances_to_kill = []
    for zone, num_to_kill in to_kill_by_zone.items():
        # A zone without candidates falls through to the descriptive error
        # below rather than a bare KeyError. Duplicated ids are collapsed
        # (order preserved) so one runner is never deleted twice.
        candidates = list(dict.fromkeys(instances_by_zone.get(zone, [])))
        if num_to_kill > len(candidates):
            raise Exception(f"Required to kill {num_to_kill}, but have only {len(candidates)} candidates in AV {zone}")

        # Candidates unknown to GitHub are terminated unconditionally.
        orphaned = [candidate for candidate in candidates if candidate not in runners_by_name]
        for candidate in orphaned:
            print(f"Candidate {candidate} was not in runners list, simply delete it")
        instances_to_kill += orphaned

        delete_for_av = []
        for candidate in candidates:
            # Count only this zone's selections: instances picked for other
            # zones must not affect when this zone stops.
            if len(delete_for_av) + len(orphaned) >= num_to_kill:
                break
            runner = runners_by_name.get(candidate)
            if runner is None:
                # Already selected above as an unregistered instance.
                continue
            if not runner.busy:
                print(f"Runner {runner.name} is not busy and can be deleted from AV {zone}")
                delete_for_av.append(runner)
            else:
                print(f"Runner {runner.name} is busy, not going to delete it")

        if len(delete_for_av) + len(orphaned) < num_to_kill:
            print(f"Checked all candidates for av {zone}, get to delete {len(delete_for_av)}, but still cannot get required {num_to_kill}")

        to_delete_runners += delete_for_av

    print("Got instances to kill: ", ', '.join(instances_to_kill))
    print("Going to delete runners:", ', '.join([runner.name for runner in to_delete_runners]))
    for runner in to_delete_runners:
        # Only report an instance for termination once GitHub confirmed
        # that the runner was removed.
        if delete_runner(access_token, runner):
            print(f"Runner with name {runner.name} and id {runner.id} successfuly deleted from github")
            instances_to_kill.append(runner.name)
        else:
            print(f"Cannot delete {runner.name} from github")

    # NOTE: pushing metrics from here is currently disabled:
    #   runners = list_runners(access_token)
    #   push_metrics_to_cloudwatch(runners, 'RunnersMetrics')

    response = {
        "InstanceIDs": instances_to_kill
    }
    print(response)
    return response
def handler(event, context):
    """Lambda-style entry point: load credentials and delegate to ``main``.

    ``context`` is accepted but not used.
    """
    credentials = get_key_and_app_from_aws()
    github_key, github_app_id = credentials
    return main(github_key, github_app_id, event)
if __name__ == "__main__":
    # Manual run: feed a canned scale-in event through main() using a
    # private key given inline or read from a file.
    parser = argparse.ArgumentParser(description='Get list of runners and their states')
    # Exactly one key source is required; argparse reports the error and
    # exits instead of continuing with an unusable configuration.
    key_source = parser.add_mutually_exclusive_group(required=True)
    key_source.add_argument('-p', '--private-key-path', help='Path to file with private key')
    key_source.add_argument('-k', '--private-key', help='Private key')
    parser.add_argument('-a', '--app-id', type=int, help='GitHub application ID', required=True)

    args = parser.parse_args()

    if args.private_key is not None:
        private_key = args.private_key
    else:
        with open(args.private_key_path, 'r') as key_file:
            private_key = key_file.read()

    sample_event = {
        "AutoScalingGroupARN": "arn:aws:autoscaling:us-east-1:<account-id>:autoScalingGroup:d4738357-2d40-4038-ae7e-b00ae0227003:autoScalingGroupName/my-asg",
        "AutoScalingGroupName": "my-asg",
        "CapacityToTerminate": [
            {
                "AvailabilityZone": "us-east-1b",
                "Capacity": 1,
                "InstanceMarketOption": "OnDemand"
            },
            {
                "AvailabilityZone": "us-east-1c",
                "Capacity": 2,
                "InstanceMarketOption": "OnDemand"
            }
        ],
        "Instances": [
            {
                "AvailabilityZone": "us-east-1b",
                "InstanceId": "i-08d0b3c1a137e02a5",
                "InstanceType": "t2.nano",
                "InstanceMarketOption": "OnDemand"
            },
            {
                "AvailabilityZone": "us-east-1c",
                "InstanceId": "ip-172-31-45-253.eu-west-1.compute.internal",
                "InstanceType": "t2.nano",
                "InstanceMarketOption": "OnDemand"
            },
            {
                "AvailabilityZone": "us-east-1c",
                "InstanceId": "ip-172-31-27-227.eu-west-1.compute.internal",
                "InstanceType": "t2.nano",
                "InstanceMarketOption": "OnDemand"
            },
            {
                "AvailabilityZone": "us-east-1c",
                "InstanceId": "ip-172-31-45-253.eu-west-1.compute.internal",
                "InstanceType": "t2.nano",
                "InstanceMarketOption": "OnDemand"
            }
        ],
        "Cause": "SCALE_IN"
    }

    main(private_key, args.app_id, sample_event)