Merge pull request #44015 from ClickHouse/try-to-keep-runners

Try to keep runners alive for longer
This commit is contained in:
Mikhail f. Shiryaev 2022-12-09 20:24:03 +01:00 committed by GitHub
commit 658bd348ce
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 35 deletions

View File

@ -43,13 +43,14 @@ def get_dead_runners_in_ec2(runners: RunnerDescriptions) -> RunnerDescriptions:
# Only `i-deadbead123` are valid names for an instance ID # Only `i-deadbead123` are valid names for an instance ID
if runner.offline and not runner.busy and runner.name.startswith("i-") if runner.offline and not runner.busy and runner.name.startswith("i-")
} }
if not ids:
return []
result_to_delete = [ result_to_delete = [
runner runner
for runner in runners for runner in runners
if not ids.get(runner.name) and runner.offline and not runner.busy if not ids.get(runner.name) and runner.offline and not runner.busy
] ]
if not ids:
return []
client = boto3.client("ec2") client = boto3.client("ec2")
@ -65,6 +66,7 @@ def get_dead_runners_in_ec2(runners: RunnerDescriptions) -> RunnerDescriptions:
InstanceIds=list(ids.keys())[i : i + inc] InstanceIds=list(ids.keys())[i : i + inc]
) )
) )
# It is applied only if all ids exist in EC2
i += inc i += inc
except ClientError as e: except ClientError as e:
# The list of non-existent instances is in the message: # The list of non-existent instances is in the message:
@ -93,7 +95,7 @@ def get_dead_runners_in_ec2(runners: RunnerDescriptions) -> RunnerDescriptions:
print("Instance", runner.name, "is not alive, going to remove it") print("Instance", runner.name, "is not alive, going to remove it")
for instance_id, runner in ids.items(): for instance_id, runner in ids.items():
if instance_id not in found_instances: if instance_id not in found_instances:
print("Instance", instance_id, "is not alive, going to remove it") print("Instance", instance_id, "is not found in EC2, going to remove it")
result_to_delete.append(runner) result_to_delete.append(runner)
return result_to_delete return result_to_delete
@ -334,7 +336,7 @@ def main(
grouped_runners = group_runners_by_tag(gh_runners) grouped_runners = group_runners_by_tag(gh_runners)
for group, group_runners in grouped_runners.items(): for group, group_runners in grouped_runners.items():
if push_to_cloudwatch: if push_to_cloudwatch:
print(group) print(f"Pushing metrics for group '{group}'")
push_metrics_to_cloudwatch(group_runners, "RunnersMetrics/" + group) push_metrics_to_cloudwatch(group_runners, "RunnersMetrics/" + group)
else: else:
print(group, f"({len(group_runners)})") print(group, f"({len(group_runners)})")

View File

@ -125,23 +125,6 @@ def get_candidates_to_be_killed(event_data: dict) -> Dict[str, List[str]]:
return instances_by_zone return instances_by_zone
def delete_runner(access_token: str, runner: RunnerDescription) -> bool:
    """Remove a self-hosted runner from the ClickHouse GitHub organization.

    Sends a DELETE request to the GitHub Actions runners endpoint for
    ``runner.id`` and prints the status code of the response.

    Args:
        access_token: Token placed in the ``Authorization`` header.
        runner: Runner to remove; its ``id`` and ``name`` are read.

    Returns:
        True when the response status code is 204, False for any other
        non-error status code.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` when the
            response carries a 4xx/5xx status.
        requests.Timeout: When GitHub does not answer within the timeout.
    """
    headers = {
        "Authorization": f"token {access_token}",
        "Accept": "application/vnd.github.v3+json",
    }
    response = requests.delete(
        f"https://api.github.com/orgs/ClickHouse/actions/runners/{runner.id}",
        headers=headers,
        # Without a timeout `requests` waits forever on a stalled connection.
        timeout=30,
    )
    response.raise_for_status()
    print(
        f"Response code deleting {runner.name} with id {runner.id} is {response.status_code}"
    )
    return bool(response.status_code == 204)
def main( def main(
github_secret_key: str, github_app_id: int, event: dict github_secret_key: str, github_app_id: int, event: dict
) -> Dict[str, List[str]]: ) -> Dict[str, List[str]]:
@ -160,8 +143,11 @@ def main(
access_token = get_access_token(encoded_jwt, installation_id) access_token = get_access_token(encoded_jwt, installation_id)
runners = list_runners(access_token) runners = list_runners(access_token)
# We used to delete the hosts selected for termination from the GitHub runners pool,
# but the documentation states:
# --- Returning an instance first in the response data does not guarantee its termination
# so they will be cleaned out by ci_runners_metrics_lambda eventually
to_delete_runners = []
instances_to_kill = [] instances_to_kill = []
for zone, num_to_kill in to_kill_by_zone.items(): for zone, num_to_kill in to_kill_by_zone.items():
candidates = instances_by_zone[zone] candidates = instances_by_zone[zone]
@ -199,21 +185,10 @@ def main(
print( print(
f"Checked all candidates for av {zone}, get to delete {len(delete_for_av)}, but still cannot get required {num_to_kill}" f"Checked all candidates for av {zone}, get to delete {len(delete_for_av)}, but still cannot get required {num_to_kill}"
) )
to_delete_runners += delete_for_av
instances_to_kill += [runner.name for runner in delete_for_av]
print("Got instances to kill: ", ", ".join(instances_to_kill)) print("Got instances to kill: ", ", ".join(instances_to_kill))
print(
"Going to delete runners:",
", ".join([runner.name for runner in to_delete_runners]),
)
for runner in to_delete_runners:
if delete_runner(access_token, runner):
print(
f"Runner with name {runner.name} and id {runner.id} successfuly deleted from github"
)
instances_to_kill.append(runner.name)
else:
print(f"Cannot delete {runner.name} from github")
response = {"InstanceIDs": instances_to_kill} response = {"InstanceIDs": instances_to_kill}
print(response) print(response)