Merge pull request #63195 from ClickHouse/make-gh-runner-resilient

Fallback action-runner to autoupdate when it's unable to start
This commit is contained in:
Mikhail f. Shiryaev 2024-05-16 13:23:39 +00:00 committed by GitHub
commit 1fba7c372a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 115 additions and 44 deletions

View File

@ -300,11 +300,44 @@ list_children () {
echo "$children"
}
# Supervision loop for the GitHub Actions runner: checks whether a runner
# process is alive, (re)configures and launches it when it is not, and
# terminates the instance when the runner stays idle or keeps failing.
# NOTE(review): this listing appears to interleave two revisions of the loop
# (one keyed on $runner_pid from pgrep, one on $RUNNER_PID from $!), and the
# if/else/fi and while/done keywords below do not pair up as shown. Confirm
# against the real script before changing any code here.
while true; do
# PID(s) reported by pgrep for "Runner.Listener"; empty when nothing matches
runner_pid=$(pgrep Runner.Listener)
echo "Got runner pid '$runner_pid'"
# There's a possibility that the runner fails to start because its version is
# outdated, so after the first failure we'll try to launch it with autoupdate
# enabled.
# NOTE(review): the autoupdate branch below is guarded by `ATTEMPT > 1`, which
# only becomes true on the second counted failure - confirm the intent.
#
# We'll fail and terminate after 10 consecutive failures.
# NOTE(review): the give-up check below is `ATTEMPT > 10` - confirm whether the
# limit is meant to be 10 or 11 failed attempts.
ATTEMPT=0
# In `kill`, 0 means "all processes in the process group" and -1 means "all
# but PID 1". We use `-2` as the initial value so that the `kill -0` check
# below reports an error until a runner has actually been launched.
RUNNER_PID=-2
while true; do
# Does not send a signal, only checks that the process $RUNNER_PID is running
if kill -0 -- $RUNNER_PID; then
ATTEMPT=0
echo "Runner is working with pid $RUNNER_PID, checking the metadata in background"
check_proceed_spot_termination
if ! is_job_assigned; then
# Age = now minus the modification time of /proc/$RUNNER_PID; when stat fails
# the fallback `date +%s` makes the computed age 0
RUNNER_AGE=$(( $(date +%s) - $(stat -c +%Y /proc/"$RUNNER_PID" 2>/dev/null || date +%s) ))
echo "The runner is launched $RUNNER_AGE seconds ago and still hasn't received a job"
if (( 60 < RUNNER_AGE )); then
echo "Attempt to delete the runner for a graceful shutdown"
# If the removal fails, skip the termination and go to the next loop iteration
sudo -u ubuntu ./config.sh remove --token "$(get_runner_token)" \
|| continue
echo "Runner didn't launch or have assigned jobs after ${RUNNER_AGE} seconds, shutting down"
terminate_and_exit
fi
fi
else
# "-2" is the initial sentinel: there is no background runner to reap yet
if [ "$RUNNER_PID" != "-2" ]; then
# `wait` yields the exit status of the background runner; a non-zero status
# is counted as one more failed attempt
wait $RUNNER_PID \
&& echo "Runner with PID $RUNNER_PID successfully finished" \
|| echo "Attempt $((++ATTEMPT)) to start the runner"
fi
if (( ATTEMPT > 10 )); then
echo "The runner has failed to start after $ATTEMPT attempt. Give up and terminate it"
terminate_and_exit
fi
if [ -z "$runner_pid" ]; then
cd $RUNNER_HOME || terminate_and_exit
detect_delayed_termination
# If runner is not active, check that it needs to terminate itself
@ -314,37 +347,50 @@ while true; do
check_proceed_spot_termination force
echo "Going to configure runner"
sudo -u ubuntu ./config.sh --url $RUNNER_URL --token "$(get_runner_token)" \
--ephemeral --disableupdate --unattended \
--runnergroup Default --labels "$LABELS" --work _work --name "$INSTANCE_ID"
# The token is kept in its own array because it is passed both to
# `config.sh remove` and to the configure call below
token_args=(--token "$(get_runner_token)")
config_args=(
"${token_args[@]}" --url "$RUNNER_URL"
--ephemeral --unattended --replace --runnergroup Default
--labels "$LABELS" --work _work --name "$INSTANCE_ID"
)
if (( ATTEMPT > 1 )); then
echo 'The runner failed to start at least once. Removing it and then configuring with autoupdate enabled.'
sudo -u ubuntu ./config.sh remove "${token_args[@]}"
sudo -u ubuntu ./config.sh "${config_args[@]}"
else
echo "Configure runner with disabled autoupdate"
config_args+=("--disableupdate")
sudo -u ubuntu ./config.sh "${config_args[@]}"
fi
echo "Another one check to avoid race between runner and infrastructure"
no_terminating_metadata || terminate_on_event
check_spot_instance_is_old && terminate_and_exit
check_proceed_spot_termination force
# There were some failures to start the Job because of trash in _work
rm -rf _work
# https://github.com/actions/runner/issues/3266
# Without the patch below we're unable to tell whether the runner failed to start.
echo 'Monkey-patching run helpers to get genuine exit code of the runner'
# Only a script that has a line ending in `exit 0` is rewritten; sed then
# replaces the first `exit 0` on every line with `exit $returnCode`
for script in run.sh run-helper.sh.template; do
# shellcheck disable=SC2016
grep -q 'exit 0$' "$script" && \
sed 's/exit 0/exit $returnCode/' -i "$script" && \
echo "Script $script is patched"
done
echo "Run"
# Launch the runner in the background with the job start/completion hooks set
sudo -u ubuntu \
ACTIONS_RUNNER_HOOK_JOB_STARTED=/tmp/actions-hooks/pre-run.sh \
ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/tmp/actions-hooks/post-run.sh \
./run.sh &
sleep 10
else
echo "Runner is working with pid $runner_pid, checking the metadata in background"
check_proceed_spot_termination
# $! is the PID of the most recently started background command
RUNNER_PID=$!
if ! is_job_assigned; then
RUNNER_AGE=$(( $(date +%s) - $(stat -c +%Y /proc/"$runner_pid" 2>/dev/null || date +%s) ))
echo "The runner is launched $RUNNER_AGE seconds ago and still has hot received the job"
if (( 60 < RUNNER_AGE )); then
echo "Attempt to delete the runner for a graceful shutdown"
sudo -u ubuntu ./config.sh remove --token "$(get_runner_token)" \
|| continue
echo "Runner didn't launch or have assigned jobs after ${RUNNER_AGE} seconds, shutting down"
terminate_and_exit
fi
fi
sleep 10
fi
sleep 5
done

View File

@ -9,7 +9,7 @@ set -xeuo pipefail
# Announce the start of the prepare script and export the settings it uses.
echo "Running prepare script"
# Keep apt/dpkg from prompting during package installation
export DEBIAN_FRONTEND=noninteractive
# Runner release to use (presumably the actions-runner package version
# downloaded elsewhere in this script - confirm). Exported exactly once: an
# earlier export of 2.315.0 was immediately overwritten and has been dropped.
export RUNNER_VERSION=2.316.1
# Directory of the runner installation
export RUNNER_HOME=/home/ubuntu/actions-runner
deb_arch() {
@ -155,31 +155,56 @@ apt-get install tailscale --yes --no-install-recommends
# Create a common script for the instances
mkdir /usr/local/share/scripts -p
cat > /usr/local/share/scripts/init-network.sh << 'EOF'
#!/usr/bin/env bash
setup_cloudflare_dns() {
  # Append the Cloudflare resolver (1.1.1.1) to the DNS servers of the
  # interface that carries the default route, so it serves as a fallback.
  # Does nothing when the current servers cannot be queried or when the
  # Cloudflare address is already present.
  # Returns: 0 when nothing has to change, otherwise the status of the
  #          `resolvectl dns` call that applies the new server list.
  local default_dev current_dns servers
  local fallback_ns=1.1.1.1
  local -a dns_list

  # Device of the default route
  default_dev=$(ip --json route list | jq '.[]|select(.dst == "default").dev' --raw-output)

  # The output looks like `Link 2 (eth0): 172.31.0.2`; a failed query is
  # tolerated and simply leaves the value empty
  current_dns=$(resolvectl dns "$default_dev") || :
  if [[ -z "$current_dns" ]]; then
    return 0
  fi

  # Cut the leading legend, keeping only the list of servers
  servers=${current_dns#*: }
  if [[ "$servers" == *"$fallback_ns"* ]]; then
    return 0
  fi

  # Word splitting is intentional: one array element per configured server
  # shellcheck disable=SC2206
  dns_list=(${servers} "$fallback_ns")
  resolvectl dns "$default_dev" "${dns_list[@]}"
}
# Add cloudflare DNS as a fallback
# Get default gateway interface
IFACE=$(ip --json route list | jq '.[]|select(.dst == "default").dev' --raw-output)
# `Link 2 (eth0): 172.31.0.2`
ETH_DNS=$(resolvectl dns "$IFACE") || :
CLOUDFLARE_NS=1.1.1.1
if [[ "$ETH_DNS" ]] && [[ "${ETH_DNS#*: }" != *"$CLOUDFLARE_NS"* ]]; then
# Cut the leading legend
ETH_DNS=${ETH_DNS#*: }
# shellcheck disable=SC2206
new_dns=(${ETH_DNS} "$CLOUDFLARE_NS")
resolvectl dns "$IFACE" "${new_dns[@]}"
fi
setup_tailscale() {
  # Setup tailscale, the very first action.
  # Fetches the Tailscale API client credentials from SSM, requests an auth
  # key with them and brings tailscale up exactly once, using a hostname of
  # the form ci-runner-<runner type>-<instance id>.
  # Globals: INSTANCE_ID (read) - must be set by the caller.
  local TS_API_CLIENT_ID TS_API_CLIENT_SECRET TS_AUTHKEY RUNNER_TYPE
  TS_API_CLIENT_ID=$(aws ssm get-parameter --region us-east-1 --name /tailscale/api-client-id --query 'Parameter.Value' --output text --with-decryption)
  TS_API_CLIENT_SECRET=$(aws ssm get-parameter --region us-east-1 --name /tailscale/api-client-secret --query 'Parameter.Value' --output text --with-decryption)

  # The runner type comes from the instance tag `github:runner-type`
  RUNNER_TYPE=$(/usr/local/bin/aws ec2 describe-tags --filters "Name=resource-id,Values=$INSTANCE_ID" --query "Tags[?Key=='github:runner-type'].Value" --output text)
  RUNNER_TYPE=${RUNNER_TYPE:-unknown}
  # Clean possible garbage from the runner type
  RUNNER_TYPE=${RUNNER_TYPE//[^0-9a-z]/-}

  # The credentials are handed to get-authkey only, through its environment,
  # instead of being exported for every later command
  TS_AUTHKEY=$(TS_API_CLIENT_ID="$TS_API_CLIENT_ID" TS_API_CLIENT_SECRET="$TS_API_CLIENT_SECRET" \
    get-authkey -tags tag:svc-core-ci-github -reusable -ephemeral)
  tailscale up --ssh --auth-key="$TS_AUTHKEY" --hostname="ci-runner-$RUNNER_TYPE-$INSTANCE_ID"
}
# Generate init-network.sh. The heredoc delimiter is unquoted on purpose:
# the `$(declare -f ...)` substitutions are expanded right now, so the
# generated script carries the definitions of setup_cloudflare_dns and
# setup_tailscale. The first generated line must be a valid `#!` interpreter
# line, otherwise it would be run as a command when the script is executed.
cat > /usr/local/share/scripts/init-network.sh << EOF
#!/usr/bin/env bash
$(declare -f setup_cloudflare_dns)
$(declare -f setup_tailscale)
# If the script is sourced, it will return now and won't execute functions
return 0 &>/dev/null || :
echo Setup Cloudflare DNS
setup_cloudflare_dns
echo Setup Tailscale VPN
setup_tailscale
EOF
chmod +x /usr/local/share/scripts/init-network.sh
# The following line is used in aws TOE check.
touch /var/tmp/clickhouse-ci-ami.success