ClickHouse/ci/praktika/execution/machine_init.py

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

339 lines
13 KiB
Python
Raw Normal View History

2024-10-01 19:19:35 +00:00
import os
import platform
import signal
import time
import traceback
import requests
from praktika.execution.execution_settings import ExecutionSettings, ScalingType
from praktika.utils import ContextManager, Shell
class StateMachine:
    """Lifecycle loop of a GitHub Actions runner instance.

    The machine cycles through three states:

    * INIT - the runner is configured and its process is started,
    * WAIT - the runner is up; poll until a job gets assigned to it,
    * RUN  - a job is in progress; poll until the runner process finishes,
      then go back to INIT.

    Depending on the configured scaling type the instance may increase the
    auto scaling group capacity once a job is assigned ("fork") and may
    terminate itself after waiting too long for a job.
    """

    class StateNames:
        INIT = "init"
        WAIT = "wait"
        RUN = "run"

    def __init__(self):
        """Start in INIT and collect instance info for the configured scaling type."""
        self.state = self.StateNames.INIT
        self.scale_type = ExecutionSettings.RUNNER_SCALING_TYPE
        self.machine = Machine(scaling_type=self.scale_type).update_instance_info()
        # Unix timestamp (seconds) of the last state transition.
        self.state_updated_at = int(time.time())
        # True once this instance has requested an extra instance (see check_scale_up).
        self.forked = False

    def _set_state(self, new_state):
        """Switch to *new_state* and remember when the switch happened."""
        self.state = new_state
        self.state_updated_at = int(time.time())

    def kick(self):
        """Perform one step of the state machine for the current state."""
        if self.state == self.StateNames.INIT:
            self.machine.config_actions().run_actions_async()
            print("State Machine: INIT -> WAIT")
            self._set_state(self.StateNames.WAIT)
            # TODO: add monitoring
            if not self.machine.is_actions_process_healthy():
                print("ERROR: GH runner process unexpectedly died")
                self.machine.self_terminate(decrease_capacity=False)
        elif self.state == self.StateNames.WAIT:
            if self.machine.check_job_assigned():
                print("State Machine: WAIT -> RUN")
                self._set_state(self.StateNames.RUN)
                self.check_scale_up()
            else:
                self.check_scale_down()
        elif self.state == self.StateNames.RUN:
            if not self.machine.check_job_running():
                print("State Machine: RUN -> INIT")
                self._set_state(self.StateNames.INIT)

    def check_scale_down(self):
        """Terminate the instance if it has been waiting for a job for too long.

        Does nothing unless the scaling type allows scaling down. With
        AUTOMATIC_SCALE_UP_DOWN an instance that has not forked yet is kept
        alive regardless of the waiting time.
        """
        if self.scale_type not in (
            ScalingType.AUTOMATIC_SCALE_DOWN,
            ScalingType.AUTOMATIC_SCALE_UP_DOWN,
        ):
            return
        # Compare the configured type: testing the enum member itself would
        # apply this guard to AUTOMATIC_SCALE_DOWN as well, where `forked`
        # never becomes True, so such instances would never scale down.
        if (
            self.scale_type == ScalingType.AUTOMATIC_SCALE_UP_DOWN
            and not self.forked
        ):
            print(
                "Scaling type is AUTOMATIC_SCALE_UP_DOWN and machine has not run a job - do not scale down"
            )
            return
        waited_sec = int(time.time()) - self.state_updated_at
        if waited_sec > ExecutionSettings.MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC:
            print(
                f"No job assigned for more than MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC [{ExecutionSettings.MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC}] - scale down the instance"
            )
            if not ExecutionSettings.LOCAL_EXECUTION:
                self.machine.self_terminate(decrease_capacity=True)
            else:
                print("Local execution - skip scaling operation")

    def check_scale_up(self):
        """Request one extra instance, at most once per instance lifetime.

        Only active for the AUTOMATIC_SCALE_UP_DOWN scaling type.
        """
        if self.scale_type not in (ScalingType.AUTOMATIC_SCALE_UP_DOWN,):
            return
        if self.forked:
            print("This instance already forked once - do not scale up")
            return
        self.machine.self_fork()
        self.forked = True

    def run(self):
        """Remove the runner registration, then step the state machine every 5 seconds, forever."""
        self.machine.unconfig_actions()
        while True:
            self.kick()
            time.sleep(5)

    def terminate(self):
        """Unregister the runner and shut the instance down.

        Unregistering is best effort. Unless running locally, termination is
        first requested through the machine; if the process is still alive
        10 seconds later, fall back to an OS level shutdown.
        """
        try:
            self.machine.unconfig_actions()
        except Exception:
            # Best effort. The exception is deliberately not printed: the
            # failing command line contains the registration token.
            print("WARNING: failed to unconfig runner")
        if not ExecutionSettings.LOCAL_EXECUTION:
            if self.machine is not None:
                try:
                    self.machine.self_terminate(decrease_capacity=False)
                except Exception as e:
                    # Must not prevent the OS level fallback below.
                    print(f"WARNING: self terminate raised an exception [{e}]")
                # wait termination
                time.sleep(10)
            print("ERROR: failed to terminate instance via aws cli - try os call")
            os.system("sudo shutdown now")
        else:
            print("NOTE: Local execution - machine won't be terminated")
class Machine:
    """Operations on the instance that hosts a GitHub Actions runner.

    Covers installing, configuring, starting and unregistering the runner,
    and changing the capacity of the auto scaling group the instance belongs
    to. Only Linux on x86_64 or aarch64 is supported.
    """

    @staticmethod
    def get_latest_gh_actions_release():
        """Return the latest actions/runner release version without the leading "v".

        Returns None (after printing the status code) when the GitHub API
        does not answer with HTTP 200.
        """
        url = "https://api.github.com/repos/actions/runner/releases/latest"
        response = requests.get(url, timeout=5)
        if response.status_code == 200:
            latest_release = response.json()
            return latest_release["tag_name"].removeprefix("v")
        print(f"Failed to get the latest release: {response.status_code}")
        return None

    @staticmethod
    def _parse_capacity(raw):
        """Convert command line text output into an int; None if it is not a number."""
        try:
            return int(str(raw).strip())
        except (TypeError, ValueError):
            return None

    def __init__(self, scaling_type):
        """Detect OS and architecture and initialise empty instance info.

        Raises AssertionError for a non-Linux OS, an unsupported architecture
        or a *scaling_type* that is not a ScalingType member.
        """
        self.os_name = platform.system().lower()
        assert self.os_name == "linux", f"Unsupported OS [{self.os_name}]"
        if platform.machine() == "x86_64":
            self.arch = "x64"
        elif "aarch64" in platform.machine().lower():
            self.arch = "arm64"
        else:
            # Raised explicitly so the check also works with `python -O`.
            raise AssertionError(f"Unsupported arch [{platform.machine()}]")
        self.instance_id = None
        self.asg_name = None
        self.runner_api_endpoint = None
        self.runner_type = None
        self.labels = []
        # Handle of the runner process started by run_actions_async().
        self.proc = None
        assert scaling_type in ScalingType
        self.scaling_type = scaling_type

    def install_gh_actions_runner(self):
        """Download and unpack the latest runner release into a fresh GH_ACTIONS_DIRECTORY."""
        gh_actions_version = self.get_latest_gh_actions_release()
        assert self.os_name and gh_actions_version and self.arch
        Shell.check(
            f"rm -rf {ExecutionSettings.GH_ACTIONS_DIRECTORY}",
            strict=True,
            verbose=True,
        )
        Shell.check(
            f"mkdir {ExecutionSettings.GH_ACTIONS_DIRECTORY}", strict=True, verbose=True
        )
        with ContextManager.cd(ExecutionSettings.GH_ACTIONS_DIRECTORY):
            Shell.check(
                f"curl -O -L https://github.com/actions/runner/releases/download/v{gh_actions_version}/actions-runner-{self.os_name}-{self.arch}-{gh_actions_version}.tar.gz",
                strict=True,
                verbose=True,
            )
            Shell.check("tar xzf *tar.gz", strict=True, verbose=True)
            Shell.check("rm -f *tar.gz", strict=True, verbose=True)
            Shell.check("sudo ./bin/installdependencies.sh", strict=True, verbose=True)
            Shell.check(
                f"chown -R ubuntu:ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}",
                strict=True,
                verbose=True,
            )

    def _get_gh_token_from_ssm(self):
        """Read the `github_runner_registration_token` parameter via the aws cli."""
        gh_token = Shell.get_output_or_raise(
            "/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value"
        )
        return gh_token

    def update_instance_info(self):
        """Fill in instance id, ASG name, runner type, API endpoint and labels.

        Requires the MY_ORG environment variable. The ASG name is mandatory
        unless scaling is disabled or execution is local. Returns self.
        """
        self.instance_id = Shell.get_output_or_raise("ec2metadata --instance-id")
        assert self.instance_id
        self.asg_name = Shell.get_output(
            f"aws ec2 describe-instances --instance-id {self.instance_id} --query \"Reservations[].Instances[].Tags[?Key=='aws:autoscaling:groupName'].Value\" --output text"
        )
        # The runner type is the ASG name (an earlier variant read the
        # 'github:runner-type' instance tag instead).
        self.runner_type = self.asg_name
        if (
            self.scaling_type != ScalingType.DISABLED
            and not ExecutionSettings.LOCAL_EXECUTION
        ):
            assert (
                self.asg_name and self.runner_type
            ), f"Failed to retrieve ASG name, which is required for scaling_type [{self.scaling_type}]"
        org = os.getenv("MY_ORG", "")
        assert (
            org
        ), "MY_ORG env variable must be set to use init script for runner machine"
        self.runner_api_endpoint = f"https://github.com/{org}"
        self.labels = ["self-hosted", self.runner_type]
        return self

    @classmethod
    def check_job_assigned(cls):
        """Check the runner listener's diag log for a "Running job:" line.

        Returns False when the listener pid or its log file cannot be found,
        otherwise the result of grepping the log.
        """
        runner_pid = Shell.get_output_or_raise("pgrep Runner.Listener")
        if not runner_pid:
            print("check_job_assigned: No runner pid")
            return False
        log_file = Shell.get_output_or_raise(
            f"lsof -p {runner_pid} | grep -o {ExecutionSettings.GH_ACTIONS_DIRECTORY}/_diag/Runner.*log"
        )
        if not log_file:
            print("check_job_assigned: No log file")
            return False
        return Shell.check(f"grep -q 'Terminal] .* Running job:' {log_file}")

    def check_job_running(self):
        """Return True while the runner process is alive.

        Once the process has exited its handle is dropped and False is returned.
        """
        if self.proc is None:
            print("WARNING: No job started")
            return False
        exit_code = self.proc.poll()
        if exit_code is None:
            return True
        print(f"Job runner finished with exit code [{exit_code}]")
        self.proc = None
        return False

    def _build_config_command(self, token):
        """Compose the `config.sh` registration command for *token*."""
        return (
            f"sudo -u ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}/config.sh --token {token} "
            f"--url {self.runner_api_endpoint} --ephemeral --unattended --replace "
            f"--runnergroup Default --labels {','.join(self.labels)} --work wd --name {self.instance_id}"
        )

    def config_actions(self):
        """Register the runner, retrying up to 10 times with a 10 second pause.

        A fresh registration token is fetched before every retry and is
        actually used for it. Raises AssertionError if all attempts fail.
        Returns self.
        """
        if not self.instance_id:
            self.update_instance_info()
        token = self._get_gh_token_from_ssm()
        assert token and self.instance_id and self.runner_api_endpoint and self.labels
        res = 1
        i = 0
        while i < 10 and res != 0:
            res = Shell.run(self._build_config_command(token))
            i += 1
            if res != 0:
                print(
                    f"ERROR: failed to configure GH actions runner after [{i}] attempts, exit code [{res}], retry after 10s"
                )
                time.sleep(10)
                # Keep the refreshed token: the command is rebuilt from it on
                # the next iteration.
                token = self._get_gh_token_from_ssm()
        if res != 0:
            raise AssertionError("GH actions runner configuration failed")
        print("GH action runner has been configured")
        return self

    def unconfig_actions(self):
        """Remove the runner registration via `config.sh remove`. Returns self."""
        token = self._get_gh_token_from_ssm()
        command = f"sudo -u ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}/config.sh remove --token {token}"
        Shell.check(command, strict=True)
        return self

    def run_actions_async(self):
        """Start `run.sh` in the background and keep its process handle. Returns self."""
        command = f"sudo -u ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}/run.sh"
        self.proc = Shell.run_async(command)
        assert self.proc is not None
        return self

    def is_actions_process_healthy(self):
        """Return True only while the runner process is still running.

        Any other situation (not started, exited, error while inspecting the
        process) is reported on stdout and yields False.
        """
        if self.proc is None:
            print("GH Action process is not running")
            return False
        try:
            if self.proc.poll() is None:
                return True
            _, stderr = self.proc.communicate()
            if self.proc.returncode != 0:
                # Handle failure
                print(
                    f"GH Action process failed with return code {self.proc.returncode}"
                )
                print(f"Error output: {stderr}")
            else:
                print("GH Action process is not running")
        except Exception as e:
            print(f"GH Action process exception: {e}")
        return False

    def self_terminate(self, decrease_capacity):
        """Ask the auto scaling group to terminate this instance.

        *decrease_capacity* selects whether the desired capacity of the group
        is decremented together with the termination.
        """
        print(
            f"WARNING: Self terminate is called, decrease_capacity [{decrease_capacity}]"
        )
        # Show who requested the termination.
        traceback.print_stack()
        if not self.instance_id:
            self.update_instance_info()
        assert self.instance_id
        command = f"aws autoscaling terminate-instance-in-auto-scaling-group --instance-id {self.instance_id}"
        if decrease_capacity:
            command += " --should-decrement-desired-capacity"
        else:
            command += " --no-should-decrement-desired-capacity"
        Shell.check(
            command=command,
            verbose=True,
        )

    def self_fork(self):
        """Increase the desired capacity of the auto scaling group by one.

        Returns True when the capacity was increased, False when the current
        capacity could not be determined or the update command failed.
        """
        current_capacity = self._parse_capacity(
            Shell.get_output(
                f"aws autoscaling describe-auto-scaling-groups --auto-scaling-group-name {self.asg_name} "
                '--query "AutoScalingGroups[0].DesiredCapacity" --output text'
            )
        )
        # Empty or non-numeric output yields None; a capacity of zero is
        # treated as a failure too, as it always has been.
        if not current_capacity:
            print("ERROR: failed to get current capacity - cannot scale up")
            return False
        desired_capacity = current_capacity + 1
        command = f"aws autoscaling set-desired-capacity --auto-scaling-group-name {self.asg_name} --desired-capacity {desired_capacity}"
        print(f"Increase capacity [{current_capacity} -> {desired_capacity}]")
        res = Shell.check(
            command=command,
            verbose=True,
        )
        if not res:
            print("ERROR: failed to increase capacity - cannot scale up")
            return False
        return True
def handle_signal(signum, _frame):
    """Signal handler: report the received signal and abort with RuntimeError."""
    print(f"FATAL: Received signal {signum}")
    reason = f"killed by signal {signum}"
    raise RuntimeError(reason)
def run():
    """Entry point: run the state machine until it fails, then terminate.

    SIGINT and SIGTERM are routed to handle_signal, so they surface here as
    exceptions. On any exception the state machine (if it was created) is
    asked to terminate the instance and the exception is re-raised.
    """
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)
    m = None
    try:
        m = StateMachine()
        m.run()
    except Exception as e:
        print(f"FATAL: Exception [{e}] - terminate instance")
        time.sleep(10)
        if m:
            m.terminate()
        # Bare raise re-raises the active exception with its original traceback.
        raise


if __name__ == "__main__":
    run()