ClickHouse/tests/ci/worker/init_runner.sh
2023-06-19 15:40:28 +02:00

200 lines
7.7 KiB
Bash

#!/usr/bin/env bash
# Unset variables are errors and a pipeline fails if any of its stages fails.
# `errexit` is intentionally not enabled: the script keeps going after failures.
set -u
set -o pipefail

####################################
#            IMPORTANT!            #
# The EC2 instance must carry the  #
# `github:runner-type` tag, set    #
# according to the runner's role   #
####################################

printf '%s\n' "Running init script"

# Settings exported to everything launched from this script
export DEBIAN_FRONTEND=noninteractive \
    RUNNER_HOME=/home/ubuntu/actions-runner \
    RUNNER_URL="https://github.com/ClickHouse"

# The instance id comes from the instance metadata, read through `ec2metadata`.
# Assignment and export are kept separate so the command's status is not masked.
INSTANCE_ID=$(ec2metadata --instance-id)
export INSTANCE_ID
# Add the Cloudflare resolver as a fallback DNS server on the default-route interface
CLOUDFLARE_NS=1.1.1.1
# Device name of the route whose destination is "default"
IFACE=$(ip --json route list | jq --raw-output '.[] | select(.dst == "default").dev')
# resolvectl answers like `Link 2 (eth0): 172.31.0.2`; a failure here is tolerated
ETH_DNS=$(resolvectl dns "$IFACE") || :
if [[ -n "$ETH_DNS" ]]; then
    # Everything after the leading `Link N (iface): ` legend is the server list
    current_servers=${ETH_DNS#*: }
    if [[ "$current_servers" != *"$CLOUDFLARE_NS"* ]]; then
        ETH_DNS=$current_servers
        # Word splitting is intended: one array element per configured server
        # shellcheck disable=SC2206
        new_dns=(${ETH_DNS})
        new_dns+=("$CLOUDFLARE_NS")
        resolvectl dns "$IFACE" "${new_dns[@]}"
    fi
fi
# Build the runner labels: fixed prefix, machine architecture and the value of
# the instance's `github:runner-type` tag
RUNNER_TYPE=$(
    /usr/local/bin/aws ec2 describe-tags \
        --filters "Name=resource-id,Values=$INSTANCE_ID" \
        --query "Tags[?Key=='github:runner-type'].Value" \
        --output text
)
printf -v LABELS 'self-hosted,Linux,%s,%s' "$(uname -m)" "$RUNNER_TYPE"
export LABELS
# Refresh CloudWatch agent config
# The parameter is fetched into a variable first: redirecting `aws` straight into
# the config file truncates it before the call runs, so any SSM failure would
# leave the agent with an empty config.
CW_AGENT_CONFIG=/opt/aws/amazon-cloudwatch-agent/etc/amazon-cloudwatch-agent.json
if CW_AGENT_CONFIG_BODY=$(aws ssm get-parameter --region us-east-1 \
        --name AmazonCloudWatch-github-runners \
        --query 'Parameter.Value' --output text) \
    && [[ -n "$CW_AGENT_CONFIG_BODY" ]]; then
    printf '%s\n' "$CW_AGENT_CONFIG_BODY" > "$CW_AGENT_CONFIG"
else
    echo "Failed to fetch the CloudWatch agent config, keeping the existing $CW_AGENT_CONFIG" >&2
fi
# The agent is restarted in both cases, as before; on a failed fetch it simply
# comes back with the config that is already on disk.
systemctl restart amazon-cloudwatch-agent.service
# Refresh teams ssh keys
TEAM_KEYS_URL=$(aws ssm get-parameter --region us-east-1 --name team-keys-url --query 'Parameter.Value' --output=text)
# Download into a variable and only then replace the file:
#  * `--fail` makes curl return non-zero on HTTP errors instead of handing back
#    the error page, which would otherwise end up in authorized_keys2;
#  * the existing keys survive a failed or empty download.
if TEAM_KEYS=$(curl -s --fail "${TEAM_KEYS_URL}") && [[ -n "$TEAM_KEYS" ]]; then
    printf '%s\n' "$TEAM_KEYS" > /home/ubuntu/.ssh/authorized_keys2
else
    echo "Failed to download the team ssh keys, keeping the existing authorized_keys2" >&2
fi
# Everything under ~/.ssh must belong to the `ubuntu` user and its login group
chown -R ubuntu: /home/ubuntu/.ssh
# Create a pre-run script that will provide diagnostics info
mkdir -p /tmp/actions-hooks
cat > /tmp/actions-hooks/common.sh << 'EOF'
#!/bin/bash
terminate-delayed() {
sleep=7
echo "Going to terminate the runner's instance in $sleep seconds"
INSTANCE_ID=$(ec2metadata --instance-id)
# We execute it with `at` to not have it as an orphan process, but launched independently
# GH Runners kill all remain processes
echo "sleep '$sleep'; aws ec2 terminate-instances --instance-ids $INSTANCE_ID" | at now || \
aws ec2 terminate-instances --instance-ids "$INSTANCE_ID" # workaround for complete out of space or non-installed `at`
exit 0
}
terminate-and-exit() {
echo "Going to terminate the runner's instance"
INSTANCE_ID=$(ec2metadata --instance-id)
aws ec2 terminate-instances --instance-ids "$INSTANCE_ID"
}
check-terminating-metadata() {
# If there is a rebalance event, then the instance could die soon
# Let's don't wait for it and terminate proactively
if curl -s --fail http://169.254.169.254/latest/meta-data/events/recommendations/rebalance; then
echo 'The received recommendation to rebalance, checking the uptime'
UPTIME=$(< /proc/uptime)
UPTIME=${UPTIME%%.*}
# We don't shutdown the instances younger than 30m
if (( 1800 < UPTIME )); then
# To not shutdown everything at once, use the 66% to survive
if (( $((RANDOM % 3)) == 0 )); then
echo 'The instance is older than 30m and won the roulette'
terminate-and-exit
fi
echo 'The instance is older than 30m, but is not chosen for rebalance'
else
echo 'The instance is younger than 30m, do not shut it down'
fi
fi
# Here we check if the autoscaling group marked the instance for termination, and it's wait for the job to finish
ASG_STATUS=$(curl -s http://169.254.169.254/latest/meta-data/autoscaling/target-lifecycle-state)
if [ "$ASG_STATUS" == "Terminated" ]; then
INSTANCE_ID=$(ec2metadata --instance-id)
ASG_NAME=$(aws ec2 describe-tags --filters "Name=resource-id,Values=$INSTANCE_ID" --query "Tags[?Key=='aws:autoscaling:groupName'].Value" --output text)
LIFECYCLE_HOOKS=$(aws autoscaling describe-lifecycle-hooks --auto-scaling-group-name "$ASG_NAME" --query "LifecycleHooks[].LifecycleHookName" --output text)
for LCH in $LIFECYCLE_HOOKS; do
aws autoscaling complete-lifecycle-action --lifecycle-action-result CONTINUE \
--lifecycle-hook-name "$LCH" --auto-scaling-group-name "$ASG_NAME" \
--instance-id "$INSTANCE_ID"
done
echo 'The runner is marked as "Terminated" by the autoscaling group, we are terminating'
terminate-and-exit
fi
}
EOF
# Generate the pre-run hook, which prints diagnostics about the runner.
# The delimiter is unquoted on purpose: the hostname and the labels are expanded
# now, so the generated script contains their literal values.
runner_public_dns=$(ec2metadata --public-hostname)
cat > /tmp/actions-hooks/pre-run.sh << EOF
#!/bin/bash
set -uo pipefail
echo "Runner's public DNS: ${runner_public_dns}"
echo "Runner's labels: ${LABELS}"
EOF
# Generate the post-run hook. It terminates the instance when the root filesystem
# is almost full, removes all containers, and restarts the docker daemon (or gives
# the instance up) if some containers cannot be removed.
cat > /tmp/actions-hooks/post-run.sh << 'EOF'
#!/bin/bash
set -xuo pipefail

source /tmp/actions-hooks/common.sh

# ROOT_STAT[0]: free KiB on /, ROOT_STAT[1]: free space on / in percents
# shellcheck disable=SC2207
ROOT_STAT=($(df / | awk '/\// {print $4 " " int($4/$2 * 100)}'))
if [[ ${ROOT_STAT[0]} -lt 3000000 || ${ROOT_STAT[1]} -lt 5 ]]; then
    echo "The runner has ${ROOT_STAT[0]}KiB and ${ROOT_STAT[1]}% of free space on /"
    terminate-delayed
fi

# Kill whatever is still running, then remove every container; failures are ignored
docker ps --quiet | xargs --no-run-if-empty docker kill ||:
docker ps --all --quiet | xargs --no-run-if-empty docker rm -f ||:

# Containers that survived the cleanup above are hung, so the daemon itself
# has to be restarted
if [[ -n "$(docker ps --all --quiet)" ]]; then
    # Systemd service of docker has StartLimitBurst=3 and StartLimitInterval=60s,
    # that's why the restart is retried for so long
    for attempt in {1..25}; do
        if sudo systemctl restart docker; then
            break
        fi
        sleep 5
    done
    # Give the daemon some time to start answering
    for attempt in {1..10}; do
        if docker info; then
            break
        fi
        sleep 2
    done
    # Last chance, otherwise we have to terminate poor instance
    if ! docker info > /dev/null; then
        echo Docker unable to start
        terminate-delayed
    fi
fi
EOF
# Load the termination helpers (check-terminating-metadata and friends) into this shell
source /tmp/actions-hooks/common.sh

# Supervise the runner forever:
#  * no Runner.Listener process -> check the termination signals, register an
#    ephemeral runner and start it in the background;
#  * a listener that has been idle (no Runner.Worker) for more than a minute ->
#    check whether the instance should be torn down.
while true; do
    # -o picks the oldest match, so $runner_pid is always a single PID and can
    # be used as a /proc path below even if several listeners are around
    runner_pid=$(pgrep -o Runner.Listener)
    echo "Got runner pid $runner_pid"
    if [[ -z "$runner_pid" ]]; then
        cd "$RUNNER_HOME" || exit 1
        # If runner is not active, check that it needs to terminate itself
        echo "Checking if the instance suppose to terminate"
        check-terminating-metadata

        echo "Receiving token"
        RUNNER_TOKEN=$(/usr/local/bin/aws ssm get-parameter --name github_runner_registration_token --with-decryption --output text --query Parameter.Value)

        echo "Going to configure runner"
        sudo -u ubuntu ./config.sh --url "$RUNNER_URL" --token "$RUNNER_TOKEN" --ephemeral \
            --runnergroup Default --labels "$LABELS" --work _work --name "$INSTANCE_ID"

        echo "Another one check to avoid race between runner and infrastructure"
        check-terminating-metadata

        echo "Run"
        # The hook scripts are handed to the runner through its environment
        sudo -u ubuntu \
            ACTIONS_RUNNER_HOOK_JOB_STARTED=/tmp/actions-hooks/pre-run.sh \
            ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/tmp/actions-hooks/post-run.sh \
            ./run.sh &
        sleep 15
    else
        echo "Runner is working with pid $runner_pid, nothing to do"
        # The runner does not provide a way to determine if it runs a job,
        # nor a way to determine if it just listens. But there should be a
        # process for Runner.Worker. So if the runner just hangs around for long,
        # we check if it's fine to let it go
        if ! pgrep Runner.Worker > /dev/null; then
            # Age is measured from the mtime of /proc/<pid>; if the process is
            # already gone the current time is used, giving an age of 0
            RUNNER_AGE=$(( $(date +%s) - $(stat -c %Y /proc/"$runner_pid" 2>/dev/null || date +%s) ))
            echo "The runner is launched $RUNNER_AGE seconds ago and still doesn't have launched Runner.Worker"
            if (( 60 < RUNNER_AGE )); then
                echo "Check if the instance should tear down"
                check-terminating-metadata
            fi
        fi
        sleep 5
    fi
done
# vim:ts=4:sw=4