# ClickHouse/ci/praktika/execution/machine_init.py

import os
import platform
import signal
import time
import traceback

import requests
from praktika.execution.execution_settings import ExecutionSettings, ScalingType
from praktika.utils import ContextManager, Shell


class StateMachine:
    """Drive the lifecycle of the GitHub Actions runner on this machine.

    The machine cycles through three states:

    * INIT - configure the runner and start its process, then move to WAIT;
    * WAIT - poll until a job is assigned (then move to RUN and possibly
      scale up), otherwise check whether the idle machine should scale down;
    * RUN  - poll until the runner process exits, then go back to INIT.
    """

    class StateNames:
        INIT = "init"
        WAIT = "wait"
        RUN = "run"

    def __init__(self):
        self.state = self.StateNames.INIT
        self.scale_type = ExecutionSettings.RUNNER_SCALING_TYPE
        self.machine = Machine(scaling_type=self.scale_type).update_instance_info()
        # Epoch seconds of the last state transition; check_scale_down() uses
        # it to measure how long the machine has been waiting for a job.
        self.state_updated_at = int(time.time())
        # Becomes True after check_scale_up() has requested an extra instance.
        self.forked = False

    def _set_state(self, state):
        """Switch to *state* and remember when the transition happened."""
        self.state = state
        self.state_updated_at = int(time.time())

    def kick(self):
        """Perform a single step of the state machine for the current state."""
        if self.state == self.StateNames.INIT:
            self.machine.config_actions().run_actions_async()
            print("State Machine: INIT -> WAIT")
            self._set_state(self.StateNames.WAIT)
            # TODO: add monitoring
            if not self.machine.is_actions_process_healthy():
                print("ERROR: GH runner process unexpectedly died")
                self.machine.self_terminate(decrease_capacity=False)
        elif self.state == self.StateNames.WAIT:
            if self.machine.check_job_assigned():
                print("State Machine: WAIT -> RUN")
                self._set_state(self.StateNames.RUN)
                self.check_scale_up()
            else:
                self.check_scale_down()
        elif self.state == self.StateNames.RUN:
            if not self.machine.check_job_running():
                print("State Machine: RUN -> INIT")
                self._set_state(self.StateNames.INIT)

    def check_scale_down(self):
        """Terminate this instance (decreasing capacity) if it idled too long.

        Does nothing unless the scaling type allows scaling down. With
        AUTOMATIC_SCALE_UP_DOWN an instance that has not forked yet is kept
        alive regardless of the idle time.
        """
        if self.scale_type not in (
            ScalingType.AUTOMATIC_SCALE_DOWN,
            ScalingType.AUTOMATIC_SCALE_UP_DOWN,
        ):
            return
        # Compare the configured type explicitly: a bare enum member is always
        # truthy, which would apply this guard to AUTOMATIC_SCALE_DOWN as well
        # (where `forked` is never set) and disable scale down entirely.
        if (
            self.scale_type == ScalingType.AUTOMATIC_SCALE_UP_DOWN
            and not self.forked
        ):
            print(
                "Scaling type is AUTOMATIC_SCALE_UP_DOWN and machine has not run a job - do not scale down"
            )
            return
        idle_seconds = int(time.time()) - self.state_updated_at
        if idle_seconds <= ExecutionSettings.MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC:
            return
        print(
            f"No job assigned for more than MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC [{ExecutionSettings.MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC}] - scale down the instance"
        )
        if not ExecutionSettings.LOCAL_EXECUTION:
            self.machine.self_terminate(decrease_capacity=True)
        else:
            print("Local execution - skip scaling operation")

    def check_scale_up(self):
        """Request one extra instance, at most once per machine lifetime.

        Only active for the AUTOMATIC_SCALE_UP_DOWN scaling type.
        """
        if self.scale_type not in (ScalingType.AUTOMATIC_SCALE_UP_DOWN,):
            return
        if self.forked:
            print("This instance already forked once - do not scale up")
            return
        self.machine.self_fork()
        self.forked = True

    def run(self):
        """Unconfigure the runner once, then step the state machine forever."""
        self.machine.unconfig_actions()
        while True:
            self.kick()
            time.sleep(5)

    def terminate(self):
        """Best-effort shutdown: unconfigure the runner and terminate the host.

        Failures of the individual steps are reported and do not prevent the
        following steps, so the final `shutdown` fallback is always reached
        for non-local execution.
        """
        try:
            self.machine.unconfig_actions()
        except Exception as e:
            print(f"WARNING: failed to unconfig runner: {e}")
        if ExecutionSettings.LOCAL_EXECUTION:
            print("NOTE: Local execution - machine won't be terminated")
            return
        if self.machine is not None:
            try:
                self.machine.self_terminate(decrease_capacity=False)
                # wait termination
                time.sleep(10)
            except Exception as e:
                print(f"WARNING: self terminate raised an exception: {e}")
        # Still running at this point, so fall back to an OS level shutdown.
        print("ERROR: failed to terminate instance via aws cli - try os call")
        os.system("sudo shutdown now")


class Machine:
    """The runner host: wraps the shell and AWS CLI calls used to install,
    configure, run and scale the GitHub Actions runner on this instance."""

    @staticmethod
    def get_latest_gh_actions_release():
        """Return the latest actions/runner release tag without the "v" prefix.

        Returns None when the GitHub API does not answer with HTTP 200.
        """
        url = "https://api.github.com/repos/actions/runner/releases/latest"
        response = requests.get(url, timeout=5)
        if response.status_code != 200:
            print(f"Failed to get the latest release: {response.status_code}")
            return None
        latest_release = response.json()
        return latest_release["tag_name"].removeprefix("v")

    @staticmethod
    def _parse_capacity(raw):
        """Parse the AWS CLI text output into a positive capacity, else None.

        Empty or non-numeric output (for example "None") and zero all count
        as "capacity unknown".
        """
        try:
            capacity = int(raw)
        except (TypeError, ValueError):
            return None
        return capacity if capacity > 0 else None

    def __init__(self, scaling_type):
        """Detect OS and architecture; only linux on x86_64/aarch64 is accepted.

        Raises AssertionError for an unsupported platform or scaling type.
        """
        self.os_name = platform.system().lower()
        assert self.os_name == "linux", f"Unsupported OS [{self.os_name}]"
        machine = platform.machine()
        if machine == "x86_64":
            self.arch = "x64"
        elif "aarch64" in machine.lower():
            self.arch = "arm64"
        else:
            # Raised explicitly so the check is not stripped under `python -O`.
            raise AssertionError(f"Unsupported arch [{machine}]")
        self.instance_id = None
        self.asg_name = None
        self.runner_api_endpoint = None
        self.runner_type = None
        self.labels = []
        # Handle of the runner process started by run_actions_async().
        self.proc = None
        assert scaling_type in ScalingType
        self.scaling_type = scaling_type

    def install_gh_actions_runner(self):
        """Download and unpack the latest runner release into a fresh
        GH_ACTIONS_DIRECTORY and install its dependencies."""
        gh_actions_version = self.get_latest_gh_actions_release()
        assert self.os_name and gh_actions_version and self.arch
        Shell.check(
            f"rm -rf {ExecutionSettings.GH_ACTIONS_DIRECTORY}",
            strict=True,
            verbose=True,
        )
        Shell.check(
            f"mkdir {ExecutionSettings.GH_ACTIONS_DIRECTORY}", strict=True, verbose=True
        )
        with ContextManager.cd(ExecutionSettings.GH_ACTIONS_DIRECTORY):
            Shell.check(
                f"curl -O -L https://github.com/actions/runner/releases/download/v{gh_actions_version}/actions-runner-{self.os_name}-{self.arch}-{gh_actions_version}.tar.gz",
                strict=True,
                verbose=True,
            )
            Shell.check("tar xzf *tar.gz", strict=True, verbose=True)
            Shell.check("rm -f *tar.gz", strict=True, verbose=True)
            Shell.check("sudo ./bin/installdependencies.sh", strict=True, verbose=True)
            Shell.check(
                f"chown -R ubuntu:ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}",
                strict=True,
                verbose=True,
            )

    def _get_gh_token_from_ssm(self):
        """Read the runner registration token from the SSM parameter store."""
        gh_token = Shell.get_output_or_raise(
            "/usr/local/bin/aws ssm  get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value"
        )
        return gh_token

    def update_instance_info(self):
        """Populate instance id, ASG name, runner type, API endpoint and labels.

        The runner type is taken from the ASG name. Raises AssertionError when
        the instance id is missing, when the ASG name is missing although
        scaling is enabled for non-local execution, or when the MY_ORG
        environment variable is not set. Returns self for chaining.
        """
        self.instance_id = Shell.get_output_or_raise("ec2metadata --instance-id")
        assert self.instance_id
        self.asg_name = Shell.get_output(
            f"aws ec2 describe-instances --instance-id {self.instance_id} --query \"Reservations[].Instances[].Tags[?Key=='aws:autoscaling:groupName'].Value\" --output text"
        )
        # self.runner_type = Shell.get_output_or_raise(
        #     f'/usr/local/bin/aws ec2 describe-tags --filters "Name=resource-id,Values={self.instance_id}" --query "Tags[?Key==\'github:runner-type\'].Value" --output text'
        # )
        self.runner_type = self.asg_name
        if (
            self.scaling_type != ScalingType.DISABLED
            and not ExecutionSettings.LOCAL_EXECUTION
        ):
            assert (
                self.asg_name and self.runner_type
            ), f"Failed to retrieve ASG name, which is required for scaling_type [{self.scaling_type}]"
        org = os.getenv("MY_ORG", "")
        assert (
            org
        ), "MY_ORG env variable must be set to use init script for runner machine"
        self.runner_api_endpoint = f"https://github.com/{org}"

        # Skip an empty runner type (no ASG) so that the comma-joined label
        # list passed to config.sh never contains an empty entry.
        self.labels = [
            label for label in ("self-hosted", self.runner_type) if label
        ]
        return self

    @classmethod
    def check_job_assigned(cls):
        """Return whether the runner log reports a running job.

        Looks up the Runner.Listener process, finds the diagnostic log it has
        open and greps it for the "Running job:" marker.
        """
        # NOTE(review): get_output_or_raise presumably raises when the command
        # fails, making the falsy checks below purely defensive - confirm.
        runner_pid = Shell.get_output_or_raise("pgrep Runner.Listener")
        if not runner_pid:
            print("check_job_assigned: No runner pid")
            return False
        log_file = Shell.get_output_or_raise(
            f"lsof -p {runner_pid} | grep -o {ExecutionSettings.GH_ACTIONS_DIRECTORY}/_diag/Runner.*log"
        )
        if not log_file:
            print("check_job_assigned: No log file")
            return False
        return Shell.check(f"grep -q 'Terminal] .* Running job:' {log_file}")

    def check_job_running(self):
        """Return True while the runner process is alive.

        Once the process has exited its handle is dropped and False is
        returned; False is also returned when no process was started.
        """
        if self.proc is None:
            print("WARNING: No job started")
            return False
        exit_code = self.proc.poll()
        if exit_code is None:
            return True
        print(f"Job runner finished with exit code [{exit_code}]")
        self.proc = None
        return False

    def _build_config_command(self, token):
        """Return the config.sh command line that registers the runner with *token*."""
        return (
            f"sudo -u ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}/config.sh --token {token} "
            f"--url {self.runner_api_endpoint} --ephemeral --unattended --replace "
            f"--runnergroup Default --labels {','.join(self.labels)} --work wd --name {self.instance_id}"
        )

    def config_actions(self):
        """Configure the runner, retrying up to 10 times with a fresh token.

        Returns self for chaining; raises AssertionError when required data is
        missing or when every attempt failed.
        """
        if not self.instance_id:
            self.update_instance_info()
        token = self._get_gh_token_from_ssm()
        assert token and self.instance_id and self.runner_api_endpoint and self.labels
        max_attempts = 10
        for attempt in range(1, max_attempts + 1):
            # Build the command per attempt so a refreshed token is really used.
            res = Shell.run(self._build_config_command(token))
            if res == 0:
                print("GH action runner has been configured")
                return self
            if attempt == max_attempts:
                print(
                    f"ERROR: failed to configure GH actions runner after [{attempt}] attempts, exit code [{res}]"
                )
                break
            print(
                f"ERROR: failed to configure GH actions runner after [{attempt}] attempts, exit code [{res}], retry after 10s"
            )
            time.sleep(10)
            token = self._get_gh_token_from_ssm()
        # Raised explicitly so the failure is not stripped under `python -O`.
        raise AssertionError("GH actions runner configuration failed")

    def unconfig_actions(self):
        """Remove the runner configuration via `config.sh remove`; returns self."""
        token = self._get_gh_token_from_ssm()
        command = f"sudo -u ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}/config.sh remove --token {token}"
        Shell.check(command, strict=True)
        return self

    def run_actions_async(self):
        """Start the runner's run.sh in the background and keep its handle."""
        command = f"sudo -u ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}/run.sh"
        self.proc = Shell.run_async(command)
        assert self.proc is not None
        return self

    def is_actions_process_healthy(self):
        """Return True only if the runner process is currently running.

        Any other outcome (process exited, never started, or an error while
        inspecting it) is reported and yields False.
        """
        try:
            if self.proc.poll() is None:
                return True

            _, stderr = self.proc.communicate()

            if self.proc.returncode != 0:
                # Handle failure
                print(
                    f"GH Action process failed with return code {self.proc.returncode}"
                )
                print(f"Error output: {stderr}")
            else:
                print("GH Action process is not running")
            return False
        except Exception as e:
            print(f"GH Action process exception: {e}")
            return False

    def self_terminate(self, decrease_capacity):
        """Ask the autoscaling service to terminate this instance.

        decrease_capacity selects whether the desired capacity of the group
        is decremented as well. The call stack is printed to show who
        requested the termination.
        """
        print(
            f"WARNING: Self terminate is called, decrease_capacity [{decrease_capacity}]"
        )
        traceback.print_stack()
        if not self.instance_id:
            self.update_instance_info()
        assert self.instance_id
        command = f"aws autoscaling terminate-instance-in-auto-scaling-group --instance-id {self.instance_id}"
        if decrease_capacity:
            command += " --should-decrement-desired-capacity"
        else:
            command += " --no-should-decrement-desired-capacity"
        Shell.check(
            command=command,
            verbose=True,
        )

    def self_fork(self):
        """Increase the desired capacity of this machine's ASG by one.

        Failures (unknown current capacity, failed CLI call) are reported and
        otherwise ignored.
        """
        raw_capacity = Shell.get_output(
            f"aws autoscaling describe-auto-scaling-groups --auto-scaling-group-name {self.asg_name} "
            '--query "AutoScalingGroups[0].DesiredCapacity" --output text'
        )
        # Validate before converting: empty or non-numeric CLI output must not
        # raise out of the scale-up path.
        current_capacity = self._parse_capacity(raw_capacity)
        if current_capacity is None:
            print("ERROR: failed to get current capacity - cannot scale up")
            return
        desired_capacity = current_capacity + 1
        command = f"aws autoscaling set-desired-capacity --auto-scaling-group-name {self.asg_name} --desired-capacity {desired_capacity}"
        print(f"Increase capacity [{current_capacity} -> {desired_capacity}]")
        res = Shell.check(
            command=command,
            verbose=True,
        )
        if not res:
            print("ERROR: failed to increase capacity - cannot scale up")


def handle_signal(signum, _frame):
    """Signal handler: report the received signal and abort with RuntimeError.

    The frame argument is required by the handler signature and is unused.
    """
    print(f"FATAL: Received signal {signum}")
    reason = f"killed by signal {signum}"
    raise RuntimeError(reason)


def run():
    """Install the signal handlers and run the state machine until it fails.

    On any exception the error is reported, the state machine (if it was
    created) is asked to terminate the machine, and the exception is
    re-raised.
    """
    for sig in (signal.SIGINT, signal.SIGTERM):
        signal.signal(sig, handle_signal)
    state_machine = None
    try:
        state_machine = StateMachine()
        state_machine.run()
    except Exception as exc:
        print(f"FATAL: Exception [{exc}] - terminate instance")
        time.sleep(10)
        # The constructor itself may have failed, leaving nothing to terminate.
        if state_machine:
            state_machine.terminate()
        raise exc


# Script entry point: start the runner state machine only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    run()
CI: FastTest with praktika 2024-10-01 19:19:35 +00:00			`import os`
			`import platform`
			`import signal`
			`import time`
			`import traceback`

			`import requests`
			`from praktika.execution.execution_settings import ExecutionSettings, ScalingType`
			`from praktika.utils import ContextManager, Shell`


			`class StateMachine:`
			`class StateNames:`
			`INIT = "init"`
			`WAIT = "wait"`
			`RUN = "run"`

			`def __init__(self):`
			`self.state = self.StateNames.INIT`
			`self.scale_type = ExecutionSettings.RUNNER_SCALING_TYPE`
			`self.machine = Machine(scaling_type=self.scale_type).update_instance_info()`
			`self.state_updated_at = int(time.time())`
			`self.forked = False`

			`def kick(self):`
			`if self.state == self.StateNames.INIT:`
			`self.machine.config_actions().run_actions_async()`
			`print("State Machine: INIT -> WAIT")`
			`self.state = self.StateNames.WAIT`
			`self.state_updated_at = int(time.time())`
			`# TODO: add monitoring`
			`if not self.machine.is_actions_process_healthy():`
			`print(f"ERROR: GH runner process unexpectedly died")`
			`self.machine.self_terminate(decrease_capacity=False)`
			`elif self.state == self.StateNames.WAIT:`
			`res = self.machine.check_job_assigned()`
			`if res:`
			`print("State Machine: WAIT -> RUN")`
			`self.state = self.StateNames.RUN`
			`self.state_updated_at = int(time.time())`
			`self.check_scale_up()`
			`else:`
			`self.check_scale_down()`
			`elif self.state == self.StateNames.RUN:`
			`res = self.machine.check_job_running()`
			`if res:`
			`pass`
			`else:`
			`print("State Machine: RUN -> INIT")`
			`self.state = self.StateNames.INIT`
			`self.state_updated_at = int(time.time())`

			`def check_scale_down(self):`
			`if self.scale_type not in (`
			`ScalingType.AUTOMATIC_SCALE_DOWN,`
			`ScalingType.AUTOMATIC_SCALE_UP_DOWN,`
			`):`
			`return`
			`if ScalingType.AUTOMATIC_SCALE_UP_DOWN and not self.forked:`
			`print(`
			`f"Scaling type is AUTOMATIC_SCALE_UP_DOWN and machine has not run a job - do not scale down"`
			`)`
			`return`
			`if (`
			`int(time.time()) - self.state_updated_at`
			`> ExecutionSettings.MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC`
			`):`
			`print(`
			`f"No job assigned for more than MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC [{ExecutionSettings.MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC}] - scale down the instance"`
			`)`
			`if not ExecutionSettings.LOCAL_EXECUTION:`
			`self.machine.self_terminate(decrease_capacity=True)`
			`else:`
			`print("Local execution - skip scaling operation")`

			`def check_scale_up(self):`
			`if self.scale_type not in (ScalingType.AUTOMATIC_SCALE_UP_DOWN,):`
			`return`
			`if self.forked:`
			`print("This instance already forked once - do not scale up")`
			`return`
			`self.machine.self_fork()`
			`self.forked = True`

			`def run(self):`
			`self.machine.unconfig_actions()`
			`while True:`
			`self.kick()`
			`time.sleep(5)`

			`def terminate(self):`
			`try:`
			`self.machine.unconfig_actions()`
			`except:`
			`print("WARNING: failed to unconfig runner")`
			`if not ExecutionSettings.LOCAL_EXECUTION:`
			`if self.machine is not None:`
			`self.machine.self_terminate(decrease_capacity=False)`
			`time.sleep(10)`
			`# wait termination`
			`print("ERROR: failed to terminate instance via aws cli - try os call")`
			`os.system("sudo shutdown now")`
			`else:`
			`print("NOTE: Local execution - machine won't be terminated")`


			`class Machine:`
			`@staticmethod`
			`def get_latest_gh_actions_release():`
			`url = f"https://api.github.com/repos/actions/runner/releases/latest"`
			`response = requests.get(url, timeout=5)`
			`if response.status_code == 200:`
			`latest_release = response.json()`
			`return latest_release["tag_name"].removeprefix("v")`
			`else:`
			`print(f"Failed to get the latest release: {response.status_code}")`
			`return None`

			`def __init__(self, scaling_type):`
			`self.os_name = platform.system().lower()`
			`assert self.os_name == "linux", f"Unsupported OS [{self.os_name}]"`
			`if platform.machine() == "x86_64":`
			`self.arch = "x64"`
			`elif "aarch64" in platform.machine().lower():`
			`self.arch = "arm64"`
			`else:`
			`assert False, f"Unsupported arch [{platform.machine()}]"`
			`self.instance_id = None`
			`self.asg_name = None`
			`self.runner_api_endpoint = None`
			`self.runner_type = None`
			`self.labels = []`
			`self.proc = None`
			`assert scaling_type in ScalingType`
			`self.scaling_type = scaling_type`

			`def install_gh_actions_runner(self):`
			`gh_actions_version = self.get_latest_gh_actions_release()`
			`assert self.os_name and gh_actions_version and self.arch`
			`Shell.check(`
			`f"rm -rf {ExecutionSettings.GH_ACTIONS_DIRECTORY}",`
			`strict=True,`
			`verbose=True,`
			`)`
			`Shell.check(`
			`f"mkdir {ExecutionSettings.GH_ACTIONS_DIRECTORY}", strict=True, verbose=True`
			`)`
			`with ContextManager.cd(ExecutionSettings.GH_ACTIONS_DIRECTORY):`
			`Shell.check(`
			`f"curl -O -L https://github.com/actions/runner/releases/download/v{gh_actions_version}/actions-runner-{self.os_name}-{self.arch}-{gh_actions_version}.tar.gz",`
			`strict=True,`
			`verbose=True,`
			`)`
			`Shell.check(f"tar xzf *tar.gz", strict=True, verbose=True)`
			`Shell.check(f"rm -f *tar.gz", strict=True, verbose=True)`
			`Shell.check(f"sudo ./bin/installdependencies.sh", strict=True, verbose=True)`
			`Shell.check(`
			`f"chown -R ubuntu:ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}",`
			`strict=True,`
			`verbose=True,`
			`)`

			`def _get_gh_token_from_ssm(self):`
			`gh_token = Shell.get_output_or_raise(`
			`"/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value"`
			`)`
			`return gh_token`

			`def update_instance_info(self):`
			`self.instance_id = Shell.get_output_or_raise("ec2metadata --instance-id")`
			`assert self.instance_id`
			`self.asg_name = Shell.get_output(`
			`f"aws ec2 describe-instances --instance-id {self.instance_id} --query \"Reservations[].Instances[].Tags[?Key=='aws:autoscaling:groupName'].Value\" --output text"`
			`)`
			`# self.runner_type = Shell.get_output_or_raise(`
			`# f'/usr/local/bin/aws ec2 describe-tags --filters "Name=resource-id,Values={self.instance_id}" --query "Tags[?Key==\'github:runner-type\'].Value" --output text'`
			`# )`
			`self.runner_type = self.asg_name`
			`if (`
			`self.scaling_type != ScalingType.DISABLED`
			`and not ExecutionSettings.LOCAL_EXECUTION`
			`):`
			`assert (`
			`self.asg_name and self.runner_type`
			`), f"Failed to retrieve ASG name, which is required for scaling_type [{self.scaling_type}]"`
			`org = os.getenv("MY_ORG", "")`
			`assert (`
			`org`
			`), "MY_ORG env variable myst be set to use init script for runner machine"`
			`self.runner_api_endpoint = f"https://github.com/{org}"`

			`self.labels = ["self-hosted", self.runner_type]`
			`return self`

			`@classmethod`
			`def check_job_assigned(cls):`
			`runner_pid = Shell.get_output_or_raise("pgrep Runner.Listener")`
			`if not runner_pid:`
			`print("check_job_assigned: No runner pid")`
			`return False`
			`log_file = Shell.get_output_or_raise(`
			`f"lsof -p {runner_pid} \| grep -o {ExecutionSettings.GH_ACTIONS_DIRECTORY}/_diag/Runner.*log"`
			`)`
			`if not log_file:`
			`print("check_job_assigned: No log file")`
			`return False`
			`return Shell.check(f"grep -q 'Terminal] .* Running job:' {log_file}")`

			`def check_job_running(self):`
			`if self.proc is None:`
			`print(f"WARNING: No job started")`
			`return False`
			`exit_code = self.proc.poll()`
			`if exit_code is None:`
			`return True`
			`else:`
			`print(f"Job runner finished with exit code [{exit_code}]")`
			`self.proc = None`
			`return False`

			`def config_actions(self):`
			`if not self.instance_id:`
			`self.update_instance_info()`
			`token = self._get_gh_token_from_ssm()`
			`assert token and self.instance_id and self.runner_api_endpoint and self.labels`
			`command = f"sudo -u ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}/config.sh --token {token} \`
			`--url {self.runner_api_endpoint} --ephemeral --unattended --replace \`
			`--runnergroup Default --labels {','.join(self.labels)} --work wd --name {self.instance_id}"`
			`res = 1`
			`i = 0`
			`while i < 10 and res != 0:`
			`res = Shell.run(command)`
			`i += 1`
			`if res != 0:`
			`print(`
			`f"ERROR: failed to configure GH actions runner after [{i}] attempts, exit code [{res}], retry after 10s"`
			`)`
			`time.sleep(10)`
			`self._get_gh_token_from_ssm()`
			`if res == 0:`
			`print("GH action runner has been configured")`
			`else:`
			`assert False, "GH actions runner configuration failed"`
			`return self`

			`def unconfig_actions(self):`
			`token = self._get_gh_token_from_ssm()`
			`command = f"sudo -u ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}/config.sh remove --token {token}"`
			`Shell.check(command, strict=True)`
			`return self`

			`def run_actions_async(self):`
			`command = f"sudo -u ubuntu {ExecutionSettings.GH_ACTIONS_DIRECTORY}/run.sh"`
			`self.proc = Shell.run_async(command)`
			`assert self.proc is not None`
			`return self`

			`def is_actions_process_healthy(self):`
			`try:`
			`if self.proc.poll() is None:`
			`return True`

			`stdout, stderr = self.proc.communicate()`

			`if self.proc.returncode != 0:`
			`# Handle failure`
			`print(`
			`f"GH Action process failed with return code {self.proc.returncode}"`
			`)`
			`print(f"Error output: {stderr}")`
			`return False`
			`else:`
			`print(f"GH Action process is not running")`
			`return False`
			`except Exception as e:`
			`print(f"GH Action process exception: {e}")`
			`return False`

			`def self_terminate(self, decrease_capacity):`
			`print(`
			`f"WARNING: Self terminate is called, decrease_capacity [{decrease_capacity}]"`
			`)`
			`traceback.print_stack()`
			`if not self.instance_id:`
			`self.update_instance_info()`
			`assert self.instance_id`
			`command = f"aws autoscaling terminate-instance-in-auto-scaling-group --instance-id {self.instance_id}"`
			`if decrease_capacity:`
			`command += " --should-decrement-desired-capacity"`
			`else:`
			`command += " --no-should-decrement-desired-capacity"`
			`Shell.check(`
			`command=command,`
			`verbose=True,`
			`)`

			`def self_fork(self):`
			`current_capacity = Shell.get_output(`
			`f'aws autoscaling describe-auto-scaling-groups --auto-scaling-group-name {self.asg_name} \`
			`--query "AutoScalingGroups[0].DesiredCapacity" --output text'`
			`)`
			`current_capacity = int(current_capacity)`
			`if not current_capacity:`
			`print("ERROR: failed to get current capacity - cannot scale up")`
			`return`
			`desired_capacity = current_capacity + 1`
			`command = f"aws autoscaling set-desired-capacity --auto-scaling-group-name {self.asg_name} --desired-capacity {desired_capacity}"`
			`print(f"Increase capacity [{current_capacity} -> {desired_capacity}]")`
			`res = Shell.check(`
			`command=command,`
			`verbose=True,`
			`)`
			`if not res:`
			`print("ERROR: failed to increase capacity - cannot scale up")`


			`def handle_signal(signum, _frame):`
			`print(f"FATAL: Received signal {signum}")`
			`raise RuntimeError(f"killed by signal {signum}")`


			`def run():`
			`signal.signal(signal.SIGINT, handle_signal)`
			`signal.signal(signal.SIGTERM, handle_signal)`
			`m = None`
			`try:`
			`m = StateMachine()`
			`m.run()`
			`except Exception as e:`
			`print(f"FATAL: Exception [{e}] - terminate instance")`
			`time.sleep(10)`
			`if m:`
			`m.terminate()`
			`raise e`


			`if __name__ == "__main__":`
			`run()`