mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-26 17:41:59 +00:00
Merge pull request #63195 from ClickHouse/make-gh-runner-resilient
Fallback action-runner to autoupdate when it's unable to start
This commit is contained in:
commit
1fba7c372a
@ -300,11 +300,44 @@ list_children () {
|
||||
echo "$children"
|
||||
}
|
||||
|
||||
while true; do
|
||||
runner_pid=$(pgrep Runner.Listener)
|
||||
echo "Got runner pid '$runner_pid'"
|
||||
# There's a possibility that it fails because the runner's version is outdated,
|
||||
# so after the first failure we'll try to launch it with enabled autoupdate.
|
||||
#
|
||||
# We'll fail and terminate after 10 consecutive failures.
|
||||
ATTEMPT=0
|
||||
# In `kill` 0 means "all processes in process group", -1 is "all but PID 1"
|
||||
# We use `-2` to get an error
|
||||
RUNNER_PID=-2
|
||||
|
||||
while true; do
|
||||
# Does not send signal, but checks that the process $RUNNER_PID is running
|
||||
if kill -0 -- $RUNNER_PID; then
|
||||
ATTEMPT=0
|
||||
echo "Runner is working with pid $RUNNER_PID, checking the metadata in background"
|
||||
check_proceed_spot_termination
|
||||
|
||||
if ! is_job_assigned; then
|
||||
RUNNER_AGE=$(( $(date +%s) - $(stat -c +%Y /proc/"$RUNNER_PID" 2>/dev/null || date +%s) ))
|
||||
echo "The runner is launched $RUNNER_AGE seconds ago and still hasn't received a job"
|
||||
if (( 60 < RUNNER_AGE )); then
|
||||
echo "Attempt to delete the runner for a graceful shutdown"
|
||||
sudo -u ubuntu ./config.sh remove --token "$(get_runner_token)" \
|
||||
|| continue
|
||||
echo "Runner didn't launch or have assigned jobs after ${RUNNER_AGE} seconds, shutting down"
|
||||
terminate_and_exit
|
||||
fi
|
||||
fi
|
||||
else
|
||||
if [ "$RUNNER_PID" != "-2" ]; then
|
||||
wait $RUNNER_PID \
|
||||
&& echo "Runner with PID $RUNNER_PID successfully finished" \
|
||||
|| echo "Attempt $((++ATTEMPT)) to start the runner"
|
||||
fi
|
||||
if (( ATTEMPT > 10 )); then
|
||||
echo "The runner has failed to start after $ATTEMPT attempt. Give up and terminate it"
|
||||
terminate_and_exit
|
||||
fi
|
||||
|
||||
if [ -z "$runner_pid" ]; then
|
||||
cd $RUNNER_HOME || terminate_and_exit
|
||||
detect_delayed_termination
|
||||
# If runner is not active, check that it needs to terminate itself
|
||||
@ -314,37 +347,50 @@ while true; do
|
||||
check_proceed_spot_termination force
|
||||
|
||||
echo "Going to configure runner"
|
||||
sudo -u ubuntu ./config.sh --url $RUNNER_URL --token "$(get_runner_token)" \
|
||||
--ephemeral --disableupdate --unattended \
|
||||
--runnergroup Default --labels "$LABELS" --work _work --name "$INSTANCE_ID"
|
||||
token_args=(--token "$(get_runner_token)")
|
||||
config_args=(
|
||||
"${token_args[@]}" --url "$RUNNER_URL"
|
||||
--ephemeral --unattended --replace --runnergroup Default
|
||||
--labels "$LABELS" --work _work --name "$INSTANCE_ID"
|
||||
)
|
||||
if (( ATTEMPT > 1 )); then
|
||||
echo 'The runner failed to start at least once. Removing it and then configuring with autoupdate enabled.'
|
||||
sudo -u ubuntu ./config.sh remove "${token_args[@]}"
|
||||
sudo -u ubuntu ./config.sh "${config_args[@]}"
|
||||
else
|
||||
echo "Configure runner with disabled autoupdate"
|
||||
config_args+=("--disableupdate")
|
||||
sudo -u ubuntu ./config.sh "${config_args[@]}"
|
||||
fi
|
||||
|
||||
echo "Another one check to avoid race between runner and infrastructure"
|
||||
no_terminating_metadata || terminate_on_event
|
||||
check_spot_instance_is_old && terminate_and_exit
|
||||
check_proceed_spot_termination force
|
||||
|
||||
# There were some failures to start the Job because of trash in _work
|
||||
rm -rf _work
|
||||
|
||||
# https://github.com/actions/runner/issues/3266
|
||||
# We're unable to know if the runner failed to start.
|
||||
echo 'Monkey-patching run helpers to get genuine exit code of the runner'
|
||||
for script in run.sh run-helper.sh.template; do
|
||||
# shellcheck disable=SC2016
|
||||
grep -q 'exit 0$' "$script" && \
|
||||
sed 's/exit 0/exit $returnCode/' -i "$script" && \
|
||||
echo "Script $script is patched"
|
||||
done
|
||||
|
||||
echo "Run"
|
||||
sudo -u ubuntu \
|
||||
ACTIONS_RUNNER_HOOK_JOB_STARTED=/tmp/actions-hooks/pre-run.sh \
|
||||
ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/tmp/actions-hooks/post-run.sh \
|
||||
./run.sh &
|
||||
sleep 10
|
||||
else
|
||||
echo "Runner is working with pid $runner_pid, checking the metadata in background"
|
||||
check_proceed_spot_termination
|
||||
RUNNER_PID=$!
|
||||
|
||||
if ! is_job_assigned; then
|
||||
RUNNER_AGE=$(( $(date +%s) - $(stat -c +%Y /proc/"$runner_pid" 2>/dev/null || date +%s) ))
|
||||
echo "The runner is launched $RUNNER_AGE seconds ago and still has hot received the job"
|
||||
if (( 60 < RUNNER_AGE )); then
|
||||
echo "Attempt to delete the runner for a graceful shutdown"
|
||||
sudo -u ubuntu ./config.sh remove --token "$(get_runner_token)" \
|
||||
|| continue
|
||||
echo "Runner didn't launch or have assigned jobs after ${RUNNER_AGE} seconds, shutting down"
|
||||
terminate_and_exit
|
||||
fi
|
||||
fi
|
||||
sleep 10
|
||||
fi
|
||||
|
||||
sleep 5
|
||||
done
|
||||
|
||||
|
@ -9,7 +9,7 @@ set -xeuo pipefail
|
||||
|
||||
echo "Running prepare script"
|
||||
export DEBIAN_FRONTEND=noninteractive
|
||||
export RUNNER_VERSION=2.315.0
|
||||
export RUNNER_VERSION=2.316.1
|
||||
export RUNNER_HOME=/home/ubuntu/actions-runner
|
||||
|
||||
deb_arch() {
|
||||
@ -155,31 +155,56 @@ apt-get install tailscale --yes --no-install-recommends
|
||||
|
||||
# Create a common script for the instances
|
||||
mkdir /usr/local/share/scripts -p
|
||||
cat > /usr/local/share/scripts/init-network.sh << 'EOF'
|
||||
#!/usr/bin/env bash
|
||||
#######################################
# Add Cloudflare DNS (1.1.1.1) as a fallback resolver on the interface that
# carries the default route.
#
# The function text is embedded into init-network.sh via `declare -f`, so it
# must stay self-contained: no helpers or variables from the enclosing script.
#
# Externals: ip, jq, resolvectl
# Returns:   0 when there is nothing to do (no DNS reported for the link, or
#            Cloudflare is already listed); otherwise the status of the
#            `resolvectl dns` call that sets the servers.
#######################################
setup_cloudflare_dns() {
  local IFACE ETH_DNS CLOUDFLARE_NS new_dns server
  CLOUDFLARE_NS=1.1.1.1

  # Get default gateway interface
  IFACE=$(ip --json route list | jq '.[]|select(.dst == "default").dev' --raw-output)

  # Expected output looks like `Link 2 (eth0): 172.31.0.2`.
  # A failing query is tolerated: it simply leaves ETH_DNS empty.
  ETH_DNS=$(resolvectl dns "$IFACE") || :
  [[ "$ETH_DNS" ]] || return 0

  # Cut the leading legend
  ETH_DNS=${ETH_DNS#*: }

  # Word-splitting is intended: one array element per configured server
  # shellcheck disable=SC2206
  new_dns=(${ETH_DNS})

  # Compare per server rather than by substring, so that look-alikes such as
  # 11.1.1.1 or 1.1.1.10 are not mistaken for Cloudflare. Entries decorated
  # with a port, interface or server name (1.1.1.1:53, 1.1.1.1%eth0,
  # 1.1.1.1#name) still count as already present.
  for server in "${new_dns[@]}"; do
    if [[ "$server" == "$CLOUDFLARE_NS" || "$server" == "$CLOUDFLARE_NS"[:%#]* ]]; then
      return 0
    fi
  done

  new_dns+=("$CLOUDFLARE_NS")
  resolvectl dns "$IFACE" "${new_dns[@]}"
}
|
||||
|
||||
# Add cloudflare DNS as a fallback
|
||||
# Get default gateway interface
|
||||
IFACE=$(ip --json route list | jq '.[]|select(.dst == "default").dev' --raw-output)
|
||||
# `Link 2 (eth0): 172.31.0.2`
|
||||
ETH_DNS=$(resolvectl dns "$IFACE") || :
|
||||
CLOUDFLARE_NS=1.1.1.1
|
||||
if [[ "$ETH_DNS" ]] && [[ "${ETH_DNS#*: }" != *"$CLOUDFLARE_NS"* ]]; then
|
||||
# Cut the leading legend
|
||||
ETH_DNS=${ETH_DNS#*: }
|
||||
# shellcheck disable=SC2206
|
||||
new_dns=(${ETH_DNS} "$CLOUDFLARE_NS")
|
||||
resolvectl dns "$IFACE" "${new_dns[@]}"
|
||||
fi
|
||||
#######################################
# Join the instance to Tailscale; meant to be the very first network action.
#
# The function text is embedded into init-network.sh via `declare -f`, so it
# must stay self-contained: no helpers or variables from the enclosing script.
#
# Globals:   INSTANCE_ID (read) - not set here; must be provided by the caller
# Externals: aws, /usr/local/bin/aws, get-authkey, tailscale
# Returns:   status of `tailscale up`
#######################################
setup_tailscale() {
  # Setup tailscale, the very first action
  local TS_API_CLIENT_ID TS_API_CLIENT_SECRET TS_AUTHKEY RUNNER_TYPE

  # API client credentials are read from SSM Parameter Store
  TS_API_CLIENT_ID=$(aws ssm get-parameter --region us-east-1 --name /tailscale/api-client-id --query 'Parameter.Value' --output text --with-decryption)
  TS_API_CLIENT_SECRET=$(aws ssm get-parameter --region us-east-1 --name /tailscale/api-client-secret --query 'Parameter.Value' --output text --with-decryption)

  # The runner type comes from the instance's `github:runner-type` tag and
  # becomes part of the hostname; fall back to `unknown` when it is empty.
  RUNNER_TYPE=$(/usr/local/bin/aws ec2 describe-tags --filters "Name=resource-id,Values=$INSTANCE_ID" --query "Tags[?Key=='github:runner-type'].Value" --output text)
  RUNNER_TYPE=${RUNNER_TYPE:-unknown}
  # Clean possible garbage from the runner type
  RUNNER_TYPE=${RUNNER_TYPE//[^0-9a-z]/-}

  # The credentials are handed to get-authkey through its own environment
  # only, so the locals never need to be exported.
  # NOTE(review): get-authkey is presumably the Tailscale helper that turns
  # API client credentials into an auth key - confirm.
  TS_AUTHKEY=$(TS_API_CLIENT_ID="$TS_API_CLIENT_ID" TS_API_CLIENT_SECRET="$TS_API_CLIENT_SECRET" \
    get-authkey -tags tag:svc-core-ci-github -reusable -ephemeral)

  # Bring the node up exactly once, with the runner type in the hostname
  tailscale up --ssh --auth-key="$TS_AUTHKEY" --hostname="ci-runner-$RUNNER_TYPE-$INSTANCE_ID"
}
|
||||
|
||||
# Generate the standalone network-init script.
# The here-doc delimiter is unquoted on purpose: the `$(declare -f ...)`
# substitutions are expanded now, embedding the text of the two functions
# into the generated file. Everything else is written literally.
# The first line must be a real `#!` shebang so the file always runs under
# bash; the embedded functions use bash-only syntax (`[[ ]]`, arrays, `local`).
cat > /usr/local/share/scripts/init-network.sh << EOF
#!/usr/bin/env bash
$(declare -f setup_cloudflare_dns)

$(declare -f setup_tailscale)

# If the script is sourced, it will return now and won't execute functions
return 0 &>/dev/null || :

echo Setup Cloudflare DNS
setup_cloudflare_dns

echo Setup Tailscale VPN
setup_tailscale
EOF
|
||||
|
||||
chmod +x /usr/local/share/scripts/init-network.sh
|
||||
|
||||
|
||||
# The following line is used in aws TOE check.
|
||||
touch /var/tmp/clickhouse-ci-ami.success
|
||||
|
Loading…
Reference in New Issue
Block a user