mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 00:30:49 +00:00
Merge pull request #44015 from ClickHouse/try-to-keep-runners
Try to keep runners alive for longer
This commit is contained in:
commit
658bd348ce
@ -43,13 +43,14 @@ def get_dead_runners_in_ec2(runners: RunnerDescriptions) -> RunnerDescriptions:
|
|||||||
# Only `i-deadbead123` are valid names for an instance ID
|
# Only `i-deadbead123` are valid names for an instance ID
|
||||||
if runner.offline and not runner.busy and runner.name.startswith("i-")
|
if runner.offline and not runner.busy and runner.name.startswith("i-")
|
||||||
}
|
}
|
||||||
|
if not ids:
|
||||||
|
return []
|
||||||
|
|
||||||
result_to_delete = [
|
result_to_delete = [
|
||||||
runner
|
runner
|
||||||
for runner in runners
|
for runner in runners
|
||||||
if not ids.get(runner.name) and runner.offline and not runner.busy
|
if not ids.get(runner.name) and runner.offline and not runner.busy
|
||||||
]
|
]
|
||||||
if not ids:
|
|
||||||
return []
|
|
||||||
|
|
||||||
client = boto3.client("ec2")
|
client = boto3.client("ec2")
|
||||||
|
|
||||||
@ -65,6 +66,7 @@ def get_dead_runners_in_ec2(runners: RunnerDescriptions) -> RunnerDescriptions:
|
|||||||
InstanceIds=list(ids.keys())[i : i + inc]
|
InstanceIds=list(ids.keys())[i : i + inc]
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
# It is applied only if all ids exist in EC2
|
||||||
i += inc
|
i += inc
|
||||||
except ClientError as e:
|
except ClientError as e:
|
||||||
# The list of non-existent instances is in the message:
|
# The list of non-existent instances is in the message:
|
||||||
@ -93,7 +95,7 @@ def get_dead_runners_in_ec2(runners: RunnerDescriptions) -> RunnerDescriptions:
|
|||||||
print("Instance", runner.name, "is not alive, going to remove it")
|
print("Instance", runner.name, "is not alive, going to remove it")
|
||||||
for instance_id, runner in ids.items():
|
for instance_id, runner in ids.items():
|
||||||
if instance_id not in found_instances:
|
if instance_id not in found_instances:
|
||||||
print("Instance", instance_id, "is not alive, going to remove it")
|
print("Instance", instance_id, "is not found in EC2, going to remove it")
|
||||||
result_to_delete.append(runner)
|
result_to_delete.append(runner)
|
||||||
return result_to_delete
|
return result_to_delete
|
||||||
|
|
||||||
@ -334,7 +336,7 @@ def main(
|
|||||||
grouped_runners = group_runners_by_tag(gh_runners)
|
grouped_runners = group_runners_by_tag(gh_runners)
|
||||||
for group, group_runners in grouped_runners.items():
|
for group, group_runners in grouped_runners.items():
|
||||||
if push_to_cloudwatch:
|
if push_to_cloudwatch:
|
||||||
print(group)
|
print(f"Pushing metrics for group '{group}'")
|
||||||
push_metrics_to_cloudwatch(group_runners, "RunnersMetrics/" + group)
|
push_metrics_to_cloudwatch(group_runners, "RunnersMetrics/" + group)
|
||||||
else:
|
else:
|
||||||
print(group, f"({len(group_runners)})")
|
print(group, f"({len(group_runners)})")
|
||||||
|
@ -125,23 +125,6 @@ def get_candidates_to_be_killed(event_data: dict) -> Dict[str, List[str]]:
|
|||||||
return instances_by_zone
|
return instances_by_zone
|
||||||
|
|
||||||
|
|
||||||
def delete_runner(access_token: str, runner: RunnerDescription) -> bool:
|
|
||||||
headers = {
|
|
||||||
"Authorization": f"token {access_token}",
|
|
||||||
"Accept": "application/vnd.github.v3+json",
|
|
||||||
}
|
|
||||||
|
|
||||||
response = requests.delete(
|
|
||||||
f"https://api.github.com/orgs/ClickHouse/actions/runners/{runner.id}",
|
|
||||||
headers=headers,
|
|
||||||
)
|
|
||||||
response.raise_for_status()
|
|
||||||
print(
|
|
||||||
f"Response code deleting {runner.name} with id {runner.id} is {response.status_code}"
|
|
||||||
)
|
|
||||||
return bool(response.status_code == 204)
|
|
||||||
|
|
||||||
|
|
||||||
def main(
|
def main(
|
||||||
github_secret_key: str, github_app_id: int, event: dict
|
github_secret_key: str, github_app_id: int, event: dict
|
||||||
) -> Dict[str, List[str]]:
|
) -> Dict[str, List[str]]:
|
||||||
@ -160,8 +143,11 @@ def main(
|
|||||||
access_token = get_access_token(encoded_jwt, installation_id)
|
access_token = get_access_token(encoded_jwt, installation_id)
|
||||||
|
|
||||||
runners = list_runners(access_token)
|
runners = list_runners(access_token)
|
||||||
|
# We used to delete potential hosts to terminate from the GitHub runners pool,
|
||||||
|
# but the documentation states:
|
||||||
|
# --- Returning an instance first in the response data does not guarantee its termination
|
||||||
|
# so they will be cleaned out by ci_runners_metrics_lambda eventually
|
||||||
|
|
||||||
to_delete_runners = []
|
|
||||||
instances_to_kill = []
|
instances_to_kill = []
|
||||||
for zone, num_to_kill in to_kill_by_zone.items():
|
for zone, num_to_kill in to_kill_by_zone.items():
|
||||||
candidates = instances_by_zone[zone]
|
candidates = instances_by_zone[zone]
|
||||||
@ -199,21 +185,10 @@ def main(
|
|||||||
print(
|
print(
|
||||||
f"Checked all candidates for av {zone}, get to delete {len(delete_for_av)}, but still cannot get required {num_to_kill}"
|
f"Checked all candidates for av {zone}, get to delete {len(delete_for_av)}, but still cannot get required {num_to_kill}"
|
||||||
)
|
)
|
||||||
to_delete_runners += delete_for_av
|
|
||||||
|
instances_to_kill += [runner.name for runner in delete_for_av]
|
||||||
|
|
||||||
print("Got instances to kill: ", ", ".join(instances_to_kill))
|
print("Got instances to kill: ", ", ".join(instances_to_kill))
|
||||||
print(
|
|
||||||
"Going to delete runners:",
|
|
||||||
", ".join([runner.name for runner in to_delete_runners]),
|
|
||||||
)
|
|
||||||
for runner in to_delete_runners:
|
|
||||||
if delete_runner(access_token, runner):
|
|
||||||
print(
|
|
||||||
f"Runner with name {runner.name} and id {runner.id} successfuly deleted from github"
|
|
||||||
)
|
|
||||||
instances_to_kill.append(runner.name)
|
|
||||||
else:
|
|
||||||
print(f"Cannot delete {runner.name} from github")
|
|
||||||
|
|
||||||
response = {"InstanceIDs": instances_to_kill}
|
response = {"InstanceIDs": instances_to_kill}
|
||||||
print(response)
|
print(response)
|
||||||
|
Loading…
Reference in New Issue
Block a user