mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-23 08:02:02 +00:00
Merge pull request #44015 from ClickHouse/try-to-keep-runners
Try to keep runners alive for longer
This commit is contained in:
commit
658bd348ce
@ -43,13 +43,14 @@ def get_dead_runners_in_ec2(runners: RunnerDescriptions) -> RunnerDescriptions:
|
||||
# Only names like `i-deadbead123` are valid instance IDs
|
||||
if runner.offline and not runner.busy and runner.name.startswith("i-")
|
||||
}
|
||||
if not ids:
|
||||
return []
|
||||
|
||||
result_to_delete = [
|
||||
runner
|
||||
for runner in runners
|
||||
if not ids.get(runner.name) and runner.offline and not runner.busy
|
||||
]
|
||||
if not ids:
|
||||
return []
|
||||
|
||||
client = boto3.client("ec2")
|
||||
|
||||
@ -65,6 +66,7 @@ def get_dead_runners_in_ec2(runners: RunnerDescriptions) -> RunnerDescriptions:
|
||||
InstanceIds=list(ids.keys())[i : i + inc]
|
||||
)
|
||||
)
|
||||
# It is applied only if all ids exist in EC2
|
||||
i += inc
|
||||
except ClientError as e:
|
||||
# The list of non-existent instances is in the message:
|
||||
@ -93,7 +95,7 @@ def get_dead_runners_in_ec2(runners: RunnerDescriptions) -> RunnerDescriptions:
|
||||
print("Instance", runner.name, "is not alive, going to remove it")
|
||||
for instance_id, runner in ids.items():
|
||||
if instance_id not in found_instances:
|
||||
print("Instance", instance_id, "is not alive, going to remove it")
|
||||
print("Instance", instance_id, "is not found in EC2, going to remove it")
|
||||
result_to_delete.append(runner)
|
||||
return result_to_delete
|
||||
|
||||
@ -334,7 +336,7 @@ def main(
|
||||
grouped_runners = group_runners_by_tag(gh_runners)
|
||||
for group, group_runners in grouped_runners.items():
|
||||
if push_to_cloudwatch:
|
||||
print(group)
|
||||
print(f"Pushing metrics for group '{group}'")
|
||||
push_metrics_to_cloudwatch(group_runners, "RunnersMetrics/" + group)
|
||||
else:
|
||||
print(group, f"({len(group_runners)})")
|
||||
|
@ -125,23 +125,6 @@ def get_candidates_to_be_killed(event_data: dict) -> Dict[str, List[str]]:
|
||||
return instances_by_zone
|
||||
|
||||
|
||||
def delete_runner(
    access_token: str, runner: RunnerDescription, timeout: float = 30.0
) -> bool:
    """Delete a self-hosted runner from the ClickHouse GitHub organization.

    Args:
        access_token: Token sent in the ``Authorization`` request header.
        runner: Runner to remove. Its ``id`` is placed in the request URL and
            its ``name`` is used only for the log line.
        timeout: Seconds to wait for the GitHub API before giving up.

    Returns:
        True when the response status code is 204, False for any other
        non-error status.

    Raises:
        requests.HTTPError: Raised by ``raise_for_status()`` when the API
            answers with an error status.
        requests.Timeout: When the API does not answer within ``timeout``.
    """
    headers = {
        "Authorization": f"token {access_token}",
        "Accept": "application/vnd.github.v3+json",
    }

    # requests applies no timeout by default; without one a stalled
    # connection would block the caller indefinitely.
    response = requests.delete(
        f"https://api.github.com/orgs/ClickHouse/actions/runners/{runner.id}",
        headers=headers,
        timeout=timeout,
    )
    response.raise_for_status()
    print(
        f"Response code deleting {runner.name} with id {runner.id} is {response.status_code}"
    )
    # The comparison already yields a bool, so no extra conversion is needed.
    return response.status_code == 204
|
||||
|
||||
|
||||
def main(
|
||||
github_secret_key: str, github_app_id: int, event: dict
|
||||
) -> Dict[str, List[str]]:
|
||||
@ -160,8 +143,11 @@ def main(
|
||||
access_token = get_access_token(encoded_jwt, installation_id)
|
||||
|
||||
runners = list_runners(access_token)
|
||||
# We used to delete potential hosts to terminate from GitHub runners pool,
|
||||
# but the documentation states:
|
||||
# --- Returning an instance first in the response data does not guarantee its termination
|
||||
# so they will be cleaned out by ci_runners_metrics_lambda eventually
|
||||
|
||||
to_delete_runners = []
|
||||
instances_to_kill = []
|
||||
for zone, num_to_kill in to_kill_by_zone.items():
|
||||
candidates = instances_by_zone[zone]
|
||||
@ -199,21 +185,10 @@ def main(
|
||||
print(
|
||||
f"Checked all candidates for av {zone}, get to delete {len(delete_for_av)}, but still cannot get required {num_to_kill}"
|
||||
)
|
||||
to_delete_runners += delete_for_av
|
||||
|
||||
instances_to_kill += [runner.name for runner in delete_for_av]
|
||||
|
||||
print("Got instances to kill: ", ", ".join(instances_to_kill))
|
||||
print(
|
||||
"Going to delete runners:",
|
||||
", ".join([runner.name for runner in to_delete_runners]),
|
||||
)
|
||||
for runner in to_delete_runners:
|
||||
if delete_runner(access_token, runner):
|
||||
print(
|
||||
f"Runner with name {runner.name} and id {runner.id} successfuly deleted from github"
|
||||
)
|
||||
instances_to_kill.append(runner.name)
|
||||
else:
|
||||
print(f"Cannot delete {runner.name} from github")
|
||||
|
||||
response = {"InstanceIDs": instances_to_kill}
|
||||
print(response)
|
||||
|
Loading…
Reference in New Issue
Block a user