def _alive_instance_ids(client, instance_ids):
    """Return the subset of *instance_ids* that EC2 reports as pending or running.

    Args:
        client: a boto3 EC2 client.
        instance_ids: list of at most 100 instance-id strings (the EC2 limit
            for explicitly listed IDs in one request).

    Returns:
        A set with the IDs whose instance state is ``pending`` or ``running``.

    Raises:
        botocore ClientError: for any API failure other than
            ``InvalidInstanceID.NotFound``.
    """
    try:
        # IncludeAllInstances=True is required to see non-running instances;
        # by default only running instances are returned, so a 'pending'
        # state would never show up in the response.
        response = client.describe_instance_status(
            InstanceIds=instance_ids, IncludeAllInstances=True
        )
    except client.exceptions.ClientError as err:
        error_code = err.response.get("Error", {}).get("Code")
        if error_code != "InvalidInstanceID.NotFound":
            raise
        if len(instance_ids) == 1:
            # EC2 does not know this instance at all, so it is certainly not alive.
            print("Instance", instance_ids[0], "does not exist in EC2")
            return set()
        # One unknown ID fails the whole batch. Probe the IDs one by one so
        # the instances that do exist are still classified correctly.
        alive = set()
        for instance_id in instance_ids:
            alive |= _alive_instance_ids(client, [instance_id])
        return alive

    print("Response", response)
    return {
        status["InstanceId"]
        for status in response["InstanceStatuses"]
        if status["InstanceState"]["Name"] in ("pending", "running")
    }


def get_dead_runners_in_ec2(runners):
    """Select offline, idle runners whose EC2 instance is no longer alive.

    A runner is a candidate when it is offline and not busy. Its ``name`` is
    used as the EC2 instance ID. A candidate is reported as dead when EC2
    does not list its instance as ``pending`` or ``running`` (this includes
    instances EC2 no longer knows about).

    Args:
        runners: iterable of objects with ``name``, ``offline`` and ``busy``
            attributes.

    Returns:
        A list of the runner objects that are safe to remove; empty when
        there are no candidates.

    Raises:
        botocore ClientError: when the EC2 API call fails for a reason other
            than an unknown instance ID.
    """
    candidates = {
        runner.name: runner
        for runner in runners
        if runner.offline and not runner.busy
    }
    if not candidates:
        return []

    # Imported locally (as get_key_and_app_from_aws does) so the function
    # does not depend on a module-level import being present.
    import boto3

    client = boto3.client('ec2')

    instance_ids = list(candidates)
    print("Checking ids", instance_ids)

    # EC2 accepts at most 100 explicitly specified instance IDs per request.
    batch_size = 100
    found_instances = set()
    for start in range(0, len(instance_ids), batch_size):
        found_instances |= _alive_instance_ids(
            client, instance_ids[start:start + batch_size]
        )

    print("Found instances", found_instances)
    result_to_delete = []
    for instance_id, runner in candidates.items():
        if instance_id not in found_instances:
            print("Instance", instance_id, "is not alive, going to remove it")
            result_to_delete.append(runner)
    return result_to_delete
@@ def main(github_secret_key, github_app_id, push_to_cloudwatch, delete_offline_ru if delete_offline_runners: print("Going to delete offline runners") - for runner in runners: - if runner.offline and not runner.busy: - print("Deleting runner", runner) - delete_runner(access_token, runner) - - + dead_runners = get_dead_runners_in_ec2(runners) + for runner in dead_runners: + print("Deleting runner", runner) + delete_runner(access_token, runner) if __name__ == "__main__": parser = argparse.ArgumentParser(description='Get list of runners and their states') diff --git a/tests/ci/termination_lambda/app.py b/tests/ci/termination_lambda/app.py index cd7d51ae8eb..5de3d1531f2 100644 --- a/tests/ci/termination_lambda/app.py +++ b/tests/ci/termination_lambda/app.py @@ -139,7 +139,7 @@ def delete_runner(access_token, runner): response = requests.delete(f"https://api.github.com/orgs/ClickHouse/actions/runners/{runner.id}", headers=headers) response.raise_for_status() - print(f"Response code deleting {runner.name} is {response.status_code}") + print(f"Response code deleting {runner.name} with id {runner.id} is {response.status_code}") return response.status_code == 204 @@ -197,7 +197,7 @@ def main(github_secret_key, github_app_id, event): print("Going to delete runners:", ', '.join([runner.name for runner in to_delete_runners])) for runner in to_delete_runners: if delete_runner(access_token, runner): - print(f"Runner {runner.name} successfuly deleted from github") + print(f"Runner with name {runner.name} and id {runner.id} successfuly deleted from github") instances_to_kill.append(runner.name) else: print(f"Cannot delete {runner.name} from github")