#!/usr/bin/env bash

cat > /dev/null << 'EOF'
The following content is embedded into the s3 object via the script
deploy-runner-init.sh {staging,production}
with additional helping information

In the `user data` you should define as the following text
between `### COPY BELOW` and `### COPY ABOVE`

### COPY BELOW
Content-Type: multipart/mixed; boundary="//"
MIME-Version: 1.0

--//
Content-Type: text/cloud-config; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="cloud-config.txt"

#cloud-config
cloud_final_modules:
- [scripts-user, always]

--//
Content-Type: text/x-shellscript; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
Content-Disposition: attachment; filename="userdata.txt"

#!/bin/bash
INSTANCE_ID=$(ec2metadata --instance-id)
INIT_ENVIRONMENT=$(/usr/local/bin/aws ec2 describe-tags --filters "Name=resource-id,Values=$INSTANCE_ID" --query "Tags[?Key=='github:init-environment'].Value" --output text)
echo "Downloading and using $INIT_ENVIRONMENT cloud-init.sh"
aws s3 cp "s3://github-runners-data/cloud-init/${INIT_ENVIRONMENT:-production}.sh" /tmp/cloud-init.sh
chmod 0700 /tmp/cloud-init.sh
exec bash /tmp/cloud-init.sh
--//
### COPY ABOVE
EOF

# THE SCRIPT START
#
# What the script does, in order:
#   1. refreshes the CloudWatch agent config and the team's ssh keys;
#   2. writes the runner job hooks into /tmp/actions-hooks;
#   3. loops forever: (re)configures and launches the GitHub Actions runner,
#      watches the EC2 metadata for termination events and terminates the
#      instance when it should go away.

set -uo pipefail

####################################
#            IMPORTANT!            #
# EC2 instance should have         #
# `github:runner-type` tag         #
# set accordingly to a runner role #
####################################

echo "Running init script"
export DEBIAN_FRONTEND=noninteractive
export RUNNER_HOME=/home/ubuntu/actions-runner

export RUNNER_ORG="ClickHouse"
export RUNNER_URL="https://github.com/${RUNNER_ORG}"
# Funny fact, but metadata service has fixed IP
INSTANCE_ID=$(ec2metadata --instance-id)
export INSTANCE_ID

bash /usr/local/share/scripts/init-network.sh

# combine labels
RUNNER_TYPE=$(/usr/local/bin/aws ec2 describe-tags --filters "Name=resource-id,Values=$INSTANCE_ID" --query "Tags[?Key=='github:runner-type'].Value" --output text)
LABELS="self-hosted,Linux,$(uname -m),$RUNNER_TYPE"
export LABELS

# Refresh CloudWatch agent config
aws ssm get-parameter --region us-east-1 --name AmazonCloudWatch-github-runners --query 'Parameter.Value' --output text > /opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json
systemctl restart amazon-cloudwatch-agent.service

# Refresh teams ssh keys
TEAM_KEYS_URL=$(aws ssm get-parameter --region us-east-1 --name team-keys-url --query 'Parameter.Value' --output=text)
curl -s "${TEAM_KEYS_URL}" > /home/ubuntu/.ssh/authorized_keys2
chown ubuntu: /home/ubuntu/.ssh -R


# Create a pre-run script that will provide diagnostics info
mkdir -p /tmp/actions-hooks
# common.sh starts with the shebang only; the helper functions are appended
# to it below with `declare -f`, so the post-run hook can source them
cat > /tmp/actions-hooks/common.sh << 'EOF'
#!/bin/bash
EOF

terminate_delayed() {
    # The function for post hook to gracefully finish the job and then tear down
    # The very specific sleep time is used later to determine in the main loop if
    # the instance is tearing down
    # IF `sleep` IS CHANGED, CHANGE ANOTHER VALUE IN `pgrep`
    sleep=13.14159265358979323846
    echo "Going to terminate the runner's instance in $sleep seconds"
    INSTANCE_ID=$(ec2metadata --instance-id)
    # We execute it with `at` to not have it as an orphan process, but launched independently
    # GH Runners kill all remaining processes
    echo "sleep '$sleep'; aws ec2 terminate-instances --instance-ids $INSTANCE_ID" | at now || \
        aws ec2 terminate-instances --instance-ids "$INSTANCE_ID"  # workaround for complete out of space or non-installed `at`
    exit 0
}

detect_delayed_termination() {
    # The function looks for the very specific sleep with pi that is scheduled
    # by `terminate_delayed`, and waits for the instance to go down if found.
    # `-f` is mandatory here: without it pgrep matches the pattern against the
    # process name only (`sleep`), so a pattern that includes the argument
    # would never match anything.
    if pgrep -f 'sleep 13.14159265358979323846'; then
        echo 'The instance has delayed termination, sleep the same time to wait if it goes down'
        sleep 14
    fi
}

declare -f terminate_delayed >> /tmp/actions-hooks/common.sh

terminate_and_exit() {
    # Terminate instance and exit from the script instantly
    echo "Going to terminate the runner's instance"
    INSTANCE_ID=$(ec2metadata --instance-id)
    aws ec2 terminate-instances --instance-ids "$INSTANCE_ID"
    exit 0
}

declare -f terminate_and_exit >> /tmp/actions-hooks/common.sh

check_spot_instance_is_old() {
    # This function should be executed ONLY BETWEEN runnings.
    # It's unsafe to execute while the runner is working!
    # Returns 0 if the instance life cycle is `spot` and the uptime is more
    # than 3600 seconds, 1 otherwise
    local LIFE_CYCLE
    LIFE_CYCLE=$(curl -s --fail http://169.254.169.254/latest/meta-data/instance-life-cycle)
    if [ "$LIFE_CYCLE" == "spot" ]; then
        local UPTIME
        UPTIME=$(< /proc/uptime)
        # keep only the integer part of the first field
        UPTIME=${UPTIME%%.*}
        if (( 3600 < UPTIME )); then
            echo "The spot instance has uptime $UPTIME, it's time to shut it down"
            return 0
        fi
    fi
    return 1
}

check_proceed_spot_termination() {
    # The function checks and proceeds spot instance termination if exists
    # The event for spot instance termination
    # Arguments: $1 - optional, `force` to act on the event regardless of the
    #                 time left before it becomes effective
    local FORCE
    FORCE=${1:-}
    local TERMINATION_DATA _action _time _until_action
    if TERMINATION_DATA=$(curl -s --fail http://169.254.169.254/latest/meta-data/spot/instance-action); then
        # https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/spot-instance-termination-notices.html#instance-action-metadata
        _action=$(jq '.action' -r <<< "$TERMINATION_DATA")
        _time=$(jq '.time | fromdate' <<< "$TERMINATION_DATA")
        _until_action=$((_time - $(date +%s)))
        echo "Received the '$_action' event that will be effective in $_until_action seconds"
        if (( _until_action <= 30 )) || [ "$FORCE" == "force" ]; then
            echo "The action $_action will be done in $_until_action, killing the runner and exit"
            local runner_pid
            runner_pid=$(pgrep Runner.Listener)
            if [ -n "$runner_pid" ]; then
                # Kill the runner to not allow it cancelling the job
                # shellcheck disable=SC2046
                kill -9 "$runner_pid" $(list_children "$runner_pid")
            fi
            sudo -u ubuntu ./config.sh remove --token "$(get_runner_token)"
            terminate_and_exit
        fi
    fi
}

no_terminating_metadata() {
    # The function check that instance could continue work
    # Returns 1 if any of termination events are received

    # The event for rebalance recommendation. Not strict, so we have some room to make a decision here
    if curl -s --fail http://169.254.169.254/latest/meta-data/events/recommendations/rebalance; then
        echo 'Received recommendation to rebalance, checking the uptime'
        local UPTIME
        UPTIME=$(< /proc/uptime)
        UPTIME=${UPTIME%%.*}
        # We don't shutdown the instances younger than 30m
        if (( 1800 < UPTIME )); then
            # To not shutdown everything at once, use the 66% to survive
            if (( $((RANDOM % 3)) == 0 )); then
                echo 'The instance is older than 30m and won the roulette'
                return 1
            fi
            echo 'The instance is older than 30m, but is not chosen for rebalance'
        else
            echo 'The instance is younger than 30m, do not shut it down'
        fi
    fi

    # Checks if the ASG in a lifecycle hook state
    local ASG_STATUS
    ASG_STATUS=$(curl -s http://169.254.169.254/latest/meta-data/autoscaling/target-lifecycle-state)
    if [ "$ASG_STATUS" == "Terminated" ]; then
        echo 'The instance in ASG status Terminating:Wait'
        return 1
    fi
}

terminate_on_event() {
    # If there is a rebalance event, then the instance could die soon
    # Let's not wait for it and terminate proactively
    if curl -s --fail http://169.254.169.254/latest/meta-data/events/recommendations/rebalance; then
        terminate_and_exit
    fi

    # Here we check if the autoscaling group marked the instance for termination, and it waits for the job to finish
    ASG_STATUS=$(curl -s http://169.254.169.254/latest/meta-data/autoscaling/target-lifecycle-state)
    if [ "$ASG_STATUS" == "Terminated" ]; then
        INSTANCE_ID=$(ec2metadata --instance-id)
        ASG_NAME=$(aws ec2 describe-tags --filters "Name=resource-id,Values=$INSTANCE_ID" --query "Tags[?Key=='aws:autoscaling:groupName'].Value" --output text)
        LIFECYCLE_HOOKS=$(aws autoscaling describe-lifecycle-hooks --auto-scaling-group-name "$ASG_NAME" --query "LifecycleHooks[].LifecycleHookName" --output text)
        # Complete every lifecycle hook of the group for this instance
        for LCH in $LIFECYCLE_HOOKS; do
            aws autoscaling complete-lifecycle-action --lifecycle-action-result CONTINUE \
                --lifecycle-hook-name "$LCH" --auto-scaling-group-name "$ASG_NAME" \
                --instance-id "$INSTANCE_ID"
            true  # autoformat issue
        done
        echo 'The runner is marked as "Terminated" by the autoscaling group, we are terminating'
        terminate_and_exit
    fi
}

# The heredoc delimiter is NOT quoted: the command substitutions and ${LABELS}
# are expanded right now, so the hook contains the already resolved values
cat > /tmp/actions-hooks/pre-run.sh << EOF
#!/bin/bash
set -uo pipefail

echo "Runner's public DNS: $(ec2metadata --public-hostname)"
echo "Runner's labels: ${LABELS}"
echo "Runner's instance type: $(ec2metadata --instance-type)"
EOF

# Create a post-run script that will restart docker daemon before the job started
cat > /tmp/actions-hooks/post-run.sh << 'EOF'
#!/bin/bash
set -xuo pipefail

source /tmp/actions-hooks/common.sh

# Free KiB, free percents
ROOT_STAT=($(df / | awk '/\// {print $4 " " int($4/$2 * 100)}'))
if [[ ${ROOT_STAT[0]} -lt 3000000 ]] || [[ ${ROOT_STAT[1]} -lt 5 ]]; then
    echo "The runner has ${ROOT_STAT[0]}KiB and ${ROOT_STAT[1]}% of free space on /"
    terminate_delayed
fi

# shellcheck disable=SC2046
docker ps --quiet | xargs --no-run-if-empty docker kill ||:
# shellcheck disable=SC2046
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:

# If we have hanged containers after the previous commands, than we have a hanged one
# and should restart the daemon
if [ "$(docker ps --all --quiet)" ]; then
    # Systemd service of docker has StartLimitBurst=3 and StartLimitInterval=60s,
    # that's why we try restarting it for long
    for i in {1..25};
    do
        sudo systemctl restart docker && break || sleep 5
    done

    for i in {1..10}
    do
        docker info && break || sleep 2
    done
    # Last chance, otherwise we have to terminate poor instance
    docker info 1>/dev/null || { echo Docker unable to start; terminate_delayed ; }
fi
EOF

get_runner_token() {
    # Prints the runner registration token stored in the SSM parameter store
    /usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value
}

is_job_assigned() {
    # Returns 0 if the runner is not running, if its log file cannot be found,
    # or if the log contains the record about a running job; 1 otherwise
    local runner_pid
    runner_pid=$(pgrep Runner.Listener)
    if [ -z "$runner_pid" ]; then
        # if runner has finished, it's fine
        return 0
    fi
    local log_file
    log_file=$(lsof -p "$runner_pid" 2>/dev/null | grep -o "$RUNNER_HOME/_diag/Runner.*log")
    if [ -z "$log_file" ]; then
        # assume, the process is over or just started
        return 0
    fi
    # So far it's the only solid way to determine that the job is starting
    if grep -q 'Terminal] .* Running job:' "$log_file"; then
        return 0
    fi
    return 1
}

list_children () {
    # Prints PIDs of all descendants of the process $1, the deepest ones first
    local children pid
    children=$(ps --ppid "$1" -o pid=)
    if [ -z "$children" ]; then
        return
    fi

    for pid in $children; do
        list_children "$pid"
    done
    echo "$children"
}

# There's possibility that it fails because the runner's version is outdated,
# so once the failure counter is above 1 (`ATTEMPT > 1`) we remove the runner
# and configure it again with enabled autoupdate.
#
# We'll give up and terminate the instance once the counter of consecutive
# failures exceeds 10.
ATTEMPT=0
# In `kill` 0 means "all processes in process group", -1 is "all but PID 1"
# We use `-2` to get an error
RUNNER_PID=-2

while true; do
    # Does not send signal, but checks that the process $RUNNER_PID is running
    if kill -0 -- "$RUNNER_PID"; then
        ATTEMPT=0
        echo "Runner is working with pid $RUNNER_PID, checking the metadata in background"
        check_proceed_spot_termination

        if ! is_job_assigned; then
            # The age is calculated from the modification time of /proc/PID;
            # if `stat` fails, the fallback to the current time gives age 0
            RUNNER_AGE=$(( $(date +%s) - $(stat -c +%Y /proc/"$RUNNER_PID" 2>/dev/null || date +%s) ))
            echo "The runner is launched $RUNNER_AGE seconds ago and still hasn't received a job"
            if (( 60 < RUNNER_AGE )); then
                echo "Attempt to delete the runner for a graceful shutdown"
                # If the removal fails, go to the next iteration instead of terminating
                sudo -u ubuntu ./config.sh remove --token "$(get_runner_token)" \
                    || continue
                echo "Runner didn't launch or have assigned jobs after ${RUNNER_AGE} seconds, shutting down"
                terminate_and_exit
            fi
        fi
    else
        if [ "$RUNNER_PID" != "-2" ]; then
            # Collect the exit code of the finished runner; a failure increases the counter
            wait "$RUNNER_PID" \
                && echo "Runner with PID $RUNNER_PID successfully finished" \
                || echo "Attempt $((++ATTEMPT)) to start the runner"
        fi
        if (( ATTEMPT > 10 )); then
            echo "The runner has failed to start after $ATTEMPT attempt. Give up and terminate it"
            terminate_and_exit
        fi

        cd "$RUNNER_HOME" || terminate_and_exit
        detect_delayed_termination
        # If runner is not active, check that it needs to terminate itself
        echo "Checking if the instance suppose to terminate"
        no_terminating_metadata || terminate_on_event
        check_spot_instance_is_old && terminate_and_exit
        check_proceed_spot_termination force

        echo "Going to configure runner"
        token_args=(--token "$(get_runner_token)")
        config_args=(
            "${token_args[@]}" --url "$RUNNER_URL"
            --ephemeral --unattended --replace --runnergroup Default
            --labels "$LABELS" --work _work --name "$INSTANCE_ID"
        )
        if (( ATTEMPT > 1 )); then
            echo 'The runner failed to start at least once. Removing it and then configuring with autoupdate enabled.'
            sudo -u ubuntu ./config.sh remove "${token_args[@]}"
            sudo -u ubuntu ./config.sh "${config_args[@]}"
        else
            echo "Configure runner with disabled autoupdate"
            config_args+=("--disableupdate")
            sudo -u ubuntu ./config.sh "${config_args[@]}"
        fi

        echo "Another one check to avoid race between runner and infrastructure"
        no_terminating_metadata || terminate_on_event
        check_spot_instance_is_old && terminate_and_exit
        check_proceed_spot_termination force

        # There were some failures to start the Job because of trash in _work
        rm -rf _work

        # https://github.com/actions/runner/issues/3266
        # We're unable to know if the runner is failed to start.
        echo 'Monkey-patching run helpers to get genuine exit code of the runner'
        for script in run.sh run-helper.sh.template; do
            # shellcheck disable=SC2016
            grep -q 'exit 0$' "$script" && \
                sed 's/exit 0/exit $returnCode/' -i "$script" && \
                echo "Script $script is patched"
        done

        echo "Run"
        sudo -u ubuntu \
            ACTIONS_RUNNER_HOOK_JOB_STARTED=/tmp/actions-hooks/pre-run.sh \
            ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/tmp/actions-hooks/post-run.sh \
            ./run.sh &
        RUNNER_PID=$!

        sleep 10
    fi

    sleep 5
done

# vim:ts=4:sw=4