2024-10-01 19:19:35 +00:00
import os
import platform
import signal
import time
import traceback
import requests
2024-11-27 12:50:58 +00:00
from . . utils import ContextManager , Shell
from . execution_settings import ExecutionSettings , ScalingType
2024-10-01 19:19:35 +00:00
class StateMachine :
class StateNames :
INIT = " init "
WAIT = " wait "
RUN = " run "
def __init__ ( self ) :
self . state = self . StateNames . INIT
self . scale_type = ExecutionSettings . RUNNER_SCALING_TYPE
self . machine = Machine ( scaling_type = self . scale_type ) . update_instance_info ( )
self . state_updated_at = int ( time . time ( ) )
self . forked = False
def kick ( self ) :
if self . state == self . StateNames . INIT :
self . machine . config_actions ( ) . run_actions_async ( )
print ( " State Machine: INIT -> WAIT " )
self . state = self . StateNames . WAIT
self . state_updated_at = int ( time . time ( ) )
# TODO: add monitoring
if not self . machine . is_actions_process_healthy ( ) :
print ( f " ERROR: GH runner process unexpectedly died " )
self . machine . self_terminate ( decrease_capacity = False )
elif self . state == self . StateNames . WAIT :
res = self . machine . check_job_assigned ( )
if res :
print ( " State Machine: WAIT -> RUN " )
self . state = self . StateNames . RUN
self . state_updated_at = int ( time . time ( ) )
self . check_scale_up ( )
else :
self . check_scale_down ( )
elif self . state == self . StateNames . RUN :
res = self . machine . check_job_running ( )
if res :
pass
else :
print ( " State Machine: RUN -> INIT " )
self . state = self . StateNames . INIT
self . state_updated_at = int ( time . time ( ) )
def check_scale_down ( self ) :
if self . scale_type not in (
ScalingType . AUTOMATIC_SCALE_DOWN ,
ScalingType . AUTOMATIC_SCALE_UP_DOWN ,
) :
return
if ScalingType . AUTOMATIC_SCALE_UP_DOWN and not self . forked :
print (
f " Scaling type is AUTOMATIC_SCALE_UP_DOWN and machine has not run a job - do not scale down "
)
return
if (
int ( time . time ( ) ) - self . state_updated_at
> ExecutionSettings . MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC
) :
print (
f " No job assigned for more than MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC [ { ExecutionSettings . MAX_WAIT_TIME_BEFORE_SCALE_DOWN_SEC } ] - scale down the instance "
)
if not ExecutionSettings . LOCAL_EXECUTION :
self . machine . self_terminate ( decrease_capacity = True )
else :
print ( " Local execution - skip scaling operation " )
def check_scale_up ( self ) :
if self . scale_type not in ( ScalingType . AUTOMATIC_SCALE_UP_DOWN , ) :
return
if self . forked :
print ( " This instance already forked once - do not scale up " )
return
self . machine . self_fork ( )
self . forked = True
def run ( self ) :
self . machine . unconfig_actions ( )
while True :
self . kick ( )
time . sleep ( 5 )
def terminate ( self ) :
try :
self . machine . unconfig_actions ( )
except :
print ( " WARNING: failed to unconfig runner " )
if not ExecutionSettings . LOCAL_EXECUTION :
if self . machine is not None :
self . machine . self_terminate ( decrease_capacity = False )
time . sleep ( 10 )
# wait termination
print ( " ERROR: failed to terminate instance via aws cli - try os call " )
os . system ( " sudo shutdown now " )
else :
print ( " NOTE: Local execution - machine won ' t be terminated " )
class Machine :
@staticmethod
def get_latest_gh_actions_release ( ) :
url = f " https://api.github.com/repos/actions/runner/releases/latest "
response = requests . get ( url , timeout = 5 )
if response . status_code == 200 :
latest_release = response . json ( )
return latest_release [ " tag_name " ] . removeprefix ( " v " )
else :
print ( f " Failed to get the latest release: { response . status_code } " )
return None
def __init__ ( self , scaling_type ) :
self . os_name = platform . system ( ) . lower ( )
assert self . os_name == " linux " , f " Unsupported OS [ { self . os_name } ] "
if platform . machine ( ) == " x86_64 " :
self . arch = " x64 "
elif " aarch64 " in platform . machine ( ) . lower ( ) :
self . arch = " arm64 "
else :
assert False , f " Unsupported arch [ { platform . machine ( ) } ] "
self . instance_id = None
self . asg_name = None
self . runner_api_endpoint = None
self . runner_type = None
self . labels = [ ]
self . proc = None
assert scaling_type in ScalingType
self . scaling_type = scaling_type
def install_gh_actions_runner ( self ) :
gh_actions_version = self . get_latest_gh_actions_release ( )
assert self . os_name and gh_actions_version and self . arch
Shell . check (
f " rm -rf { ExecutionSettings . GH_ACTIONS_DIRECTORY } " ,
strict = True ,
verbose = True ,
)
Shell . check (
f " mkdir { ExecutionSettings . GH_ACTIONS_DIRECTORY } " , strict = True , verbose = True
)
with ContextManager . cd ( ExecutionSettings . GH_ACTIONS_DIRECTORY ) :
Shell . check (
f " curl -O -L https://github.com/actions/runner/releases/download/v { gh_actions_version } /actions-runner- { self . os_name } - { self . arch } - { gh_actions_version } .tar.gz " ,
strict = True ,
verbose = True ,
)
Shell . check ( f " tar xzf *tar.gz " , strict = True , verbose = True )
Shell . check ( f " rm -f *tar.gz " , strict = True , verbose = True )
Shell . check ( f " sudo ./bin/installdependencies.sh " , strict = True , verbose = True )
Shell . check (
f " chown -R ubuntu:ubuntu { ExecutionSettings . GH_ACTIONS_DIRECTORY } " ,
strict = True ,
verbose = True ,
)
def _get_gh_token_from_ssm ( self ) :
gh_token = Shell . get_output_or_raise (
" /usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value "
)
return gh_token
def update_instance_info ( self ) :
self . instance_id = Shell . get_output_or_raise ( " ec2metadata --instance-id " )
assert self . instance_id
self . asg_name = Shell . get_output (
f " aws ec2 describe-instances --instance-id { self . instance_id } --query \" Reservations[].Instances[].Tags[?Key== ' aws:autoscaling:groupName ' ].Value \" --output text "
)
# self.runner_type = Shell.get_output_or_raise(
# f'/usr/local/bin/aws ec2 describe-tags --filters "Name=resource-id,Values={self.instance_id}" --query "Tags[?Key==\'github:runner-type\'].Value" --output text'
# )
self . runner_type = self . asg_name
if (
self . scaling_type != ScalingType . DISABLED
and not ExecutionSettings . LOCAL_EXECUTION
) :
assert (
self . asg_name and self . runner_type
) , f " Failed to retrieve ASG name, which is required for scaling_type [ { self . scaling_type } ] "
org = os . getenv ( " MY_ORG " , " " )
assert (
org
) , " MY_ORG env variable myst be set to use init script for runner machine "
self . runner_api_endpoint = f " https://github.com/ { org } "
self . labels = [ " self-hosted " , self . runner_type ]
return self
@classmethod
def check_job_assigned ( cls ) :
runner_pid = Shell . get_output_or_raise ( " pgrep Runner.Listener " )
if not runner_pid :
print ( " check_job_assigned: No runner pid " )
return False
log_file = Shell . get_output_or_raise (
f " lsof -p { runner_pid } | grep -o { ExecutionSettings . GH_ACTIONS_DIRECTORY } /_diag/Runner.*log "
)
if not log_file :
print ( " check_job_assigned: No log file " )
return False
return Shell . check ( f " grep -q ' Terminal] .* Running job: ' { log_file } " )
def check_job_running ( self ) :
if self . proc is None :
print ( f " WARNING: No job started " )
return False
exit_code = self . proc . poll ( )
if exit_code is None :
return True
else :
print ( f " Job runner finished with exit code [ { exit_code } ] " )
self . proc = None
return False
def config_actions ( self ) :
if not self . instance_id :
self . update_instance_info ( )
token = self . _get_gh_token_from_ssm ( )
assert token and self . instance_id and self . runner_api_endpoint and self . labels
command = f " sudo -u ubuntu { ExecutionSettings . GH_ACTIONS_DIRECTORY } /config.sh --token { token } \
- - url { self . runner_api_endpoint } - - ephemeral - - unattended - - replace \
- - runnergroup Default - - labels { ' , ' . join ( self . labels ) } - - work wd - - name { self . instance_id } "
res = 1
i = 0
while i < 10 and res != 0 :
res = Shell . run ( command )
i + = 1
if res != 0 :
print (
f " ERROR: failed to configure GH actions runner after [ { i } ] attempts, exit code [ { res } ], retry after 10s "
)
time . sleep ( 10 )
self . _get_gh_token_from_ssm ( )
if res == 0 :
print ( " GH action runner has been configured " )
else :
assert False , " GH actions runner configuration failed "
return self
def unconfig_actions ( self ) :
token = self . _get_gh_token_from_ssm ( )
command = f " sudo -u ubuntu { ExecutionSettings . GH_ACTIONS_DIRECTORY } /config.sh remove --token { token } "
Shell . check ( command , strict = True )
return self
def run_actions_async ( self ) :
command = f " sudo -u ubuntu { ExecutionSettings . GH_ACTIONS_DIRECTORY } /run.sh "
self . proc = Shell . run_async ( command )
assert self . proc is not None
return self
def is_actions_process_healthy ( self ) :
try :
if self . proc . poll ( ) is None :
return True
stdout , stderr = self . proc . communicate ( )
if self . proc . returncode != 0 :
# Handle failure
print (
f " GH Action process failed with return code { self . proc . returncode } "
)
print ( f " Error output: { stderr } " )
return False
else :
print ( f " GH Action process is not running " )
return False
except Exception as e :
print ( f " GH Action process exception: { e } " )
return False
def self_terminate ( self , decrease_capacity ) :
print (
f " WARNING: Self terminate is called, decrease_capacity [ { decrease_capacity } ] "
)
traceback . print_stack ( )
if not self . instance_id :
self . update_instance_info ( )
assert self . instance_id
command = f " aws autoscaling terminate-instance-in-auto-scaling-group --instance-id { self . instance_id } "
if decrease_capacity :
command + = " --should-decrement-desired-capacity "
else :
command + = " --no-should-decrement-desired-capacity "
Shell . check (
command = command ,
verbose = True ,
)
def self_fork ( self ) :
current_capacity = Shell . get_output (
f ' aws autoscaling describe-auto-scaling-groups --auto-scaling-group-name { self . asg_name } \
- - query " AutoScalingGroups[0].DesiredCapacity " - - output text '
)
current_capacity = int ( current_capacity )
if not current_capacity :
print ( " ERROR: failed to get current capacity - cannot scale up " )
return
desired_capacity = current_capacity + 1
command = f " aws autoscaling set-desired-capacity --auto-scaling-group-name { self . asg_name } --desired-capacity { desired_capacity } "
print ( f " Increase capacity [ { current_capacity } -> { desired_capacity } ] " )
res = Shell . check (
command = command ,
verbose = True ,
)
if not res :
print ( " ERROR: failed to increase capacity - cannot scale up " )
def handle_signal ( signum , _frame ) :
print ( f " FATAL: Received signal { signum } " )
raise RuntimeError ( f " killed by signal { signum } " )
def run ( ) :
signal . signal ( signal . SIGINT , handle_signal )
signal . signal ( signal . SIGTERM , handle_signal )
m = None
try :
m = StateMachine ( )
m . run ( )
except Exception as e :
print ( f " FATAL: Exception [ { e } ] - terminate instance " )
time . sleep ( 10 )
if m :
m . terminate ( )
raise e
if __name__ == " __main__ " :
run ( )