ClickHouse/docker/test/fuzzer/run-fuzzer.sh

409 lines
13 KiB
Bash
Raw Normal View History

2020-07-09 11:21:23 +00:00
#!/bin/bash
# shellcheck disable=SC2086,SC2001,SC2046,SC2030,SC2031

# Trace every command into the log.
set -x

# Core files are named core.COMM.PID-PID_IN_INITIAL_NAMESPACE
# (%e = command name, %p = PID in the process's own PID namespace,
#  %P = PID in the initial PID namespace).
# This runs before `set -e`, so a failing sysctl does not abort the script.
sysctl kernel.core_pattern='core.%e.%p-%P'

set -e
set -u
set -o pipefail

trap "exit" INT TERM
# The watchdog is in the separate process group, so we have to kill it separately
# if the script terminates earlier.
trap 'kill $(jobs -pr) ${watchdog_pid:-} ||:' EXIT

# Stage to start from; empty means "run everything" (see the case statement at
# the bottom of the file).
stage=${stage:-}
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
echo "$script_dir"
# Directory (relative to the working directory) the repository is cloned into.
repo_dir=ch
# Both values can be overridden from the environment. The default URL is built
# from PR_TO_TEST and SHA_TO_TEST, which therefore must be set (set -u) unless
# BINARY_URL_TO_DOWNLOAD is provided.
BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-14_debug_none_unsplitted_disable_False_binary"}
BINARY_URL_TO_DOWNLOAD=${BINARY_URL_TO_DOWNLOAD:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse"}
2020-07-09 11:21:23 +00:00
# Clone the ClickHouse repository into $repo_dir, check out the revision under
# test and record the list of changed files in $repo_dir/ci-changed-files.txt.
# Globals read: repo_dir, PR_TO_TEST, SHA_TO_TEST (optional when PR_TO_TEST is "0").
function clone
{
    # For local runs, start directly from the "fuzz" stage.
    rm -rf "$repo_dir" ||:
    mkdir "$repo_dir" ||:

    git clone --depth 1 https://github.com/ClickHouse/ClickHouse.git -- "$repo_dir" 2>&1 | ts '%Y-%m-%d %H:%M:%S'
    (
        # Runs in a subshell, so the caller's working directory is untouched
        # and no `cd -` is needed at the end.
        cd "$repo_dir"
        if [ "$PR_TO_TEST" != "0" ]; then
            # Prefer the merge ref of the pull request; fall back to the
            # nominal SHA from the PR head if the merge ref cannot be fetched.
            if git fetch --depth 1 origin "+refs/pull/$PR_TO_TEST/merge"; then
                git checkout FETCH_HEAD
                echo "Checked out pull/$PR_TO_TEST/merge ($(git rev-parse FETCH_HEAD))"
            else
                git fetch --depth 1 origin "+refs/pull/$PR_TO_TEST/head"
                git checkout "$SHA_TO_TEST"
                echo "Checked out nominal SHA $SHA_TO_TEST for PR $PR_TO_TEST"
            fi
            git diff --name-only master HEAD | tee ci-changed-files.txt
        else
            # PR number "0": test a commit of master, or the default head if
            # no SHA is given.
            if [ -v SHA_TO_TEST ]; then
                git fetch --depth 2 origin "$SHA_TO_TEST"
                git checkout "$SHA_TO_TEST"
                echo "Checked out nominal SHA $SHA_TO_TEST for master"
            else
                git fetch --depth 2 origin
                echo "Using default repository head $(git rev-parse HEAD)"
            fi
            git diff --name-only HEAD~1 HEAD | tee ci-changed-files.txt
        fi
    )

    ls -lath ||:
}
2021-12-28 13:12:51 +00:00
# Download the URL given as $1 with wget (resuming a partial download),
# making at most four attempts with a 0.5 second pause after each failure.
# Returns 0 as soon as one attempt succeeds, 1 if all of them fail.
function wget_with_retry
{
    local attempt=0
    while (( attempt < 4 )); do
        attempt=$((attempt + 1))
        wget -nv -nd -c "$1" && return 0
        sleep 0.5
    done
    return 1
}
2020-07-09 11:21:23 +00:00
# Fetch the clickhouse binary from $BINARY_URL_TO_DOWNLOAD into the current
# directory, create the clickhouse-server / clickhouse-client symlinks next to
# it and prepend the current directory to PATH.
function download
{
    wget_with_retry "$BINARY_URL_TO_DOWNLOAD"

    chmod +x clickhouse
    # clickhouse may be compressed - run once to decompress
    ./clickhouse ||:
    # -f: replace links left over from a previous run instead of failing
    # (which would abort the script under `set -e`).
    ln -sf ./clickhouse ./clickhouse-server
    ln -sf ./clickhouse ./clickhouse-client

    # clickhouse-server is in the current dir
    export PATH="$PWD:$PATH"
}
# Create a fresh server configuration directory ./db from the configs in the
# cloned repository ($repo_dir) plus the fuzzer-specific users tweaks that live
# next to this script ($script_dir), and add a core dump configuration.
function configure
{
    rm -rf db ||:
    mkdir db ||:

    # The globs also pick up the config.d / users.d directories, which the
    # following copies rely on.
    cp -av --dereference "$repo_dir"/programs/server/config* db
    cp -av --dereference "$repo_dir"/programs/server/user* db
    # TODO figure out which ones are needed
    cp -av --dereference "$repo_dir"/tests/config/config.d/listen.xml db/config.d
    cp -av --dereference "$script_dir"/query-fuzzer-tweaks-users.xml db/users.d

    # Unquoted heredoc delimiter: $PWD is expanded when the file is written.
    cat > db/config.d/core.xml <<EOL
<clickhouse>
<core_dump>
<!-- 100GiB -->
<size_limit>107374182400</size_limit>
</core_dump>
<!-- NOTE: no need to configure core_path,
since clickhouse is not started as daemon (via clickhouse start)
-->
<core_path>$PWD</core_path>
</clickhouse>
EOL
}
2020-07-10 11:11:31 +00:00
# Kill the fuzzer when the run takes too long.
# Arguments: $1 - optional timeout in seconds before the fuzzer is terminated
#                 (defaults to 1800).
# Globals read: fuzzer_pid.
function watchdog
{
    local timeout_seconds=${1:-1800}
    sleep "$timeout_seconds"

    echo "Fuzzing run has timed out"
    # Send SIGTERM once a second, up to ten times, until `kill` reports that
    # the process can no longer be signalled.
    for _ in {1..10}
    do
        # Only kill by pid the particular client that runs the fuzzing, or else
        # we can kill some clickhouse-client processes this script starts later,
        # e.g. for checking server liveness.
        if ! kill "$fuzzer_pid"
        then
            break
        fi
        sleep 1
    done

    # Last resort; an error here is ignored.
    kill -9 -- "$fuzzer_pid" ||:
}
2021-08-24 12:27:50 +00:00
# Print, one per line, those of the given paths that exist. A path ending in
# ".sql.j2" is printed with that suffix replaced by ".gen.sql". Paths that do
# not exist are reported on stderr and skipped.
# Arguments: $@ - candidate paths.
function filter_exists_and_template
{
    local path
    for path in "$@"; do
        if [ ! -e "$path" ]; then
            echo "'$path' does not exists" >&2
            continue
        fi
        # The `%` expansion only strips a trailing match, which is the same
        # anchoring as the regex \.sql\.j2$ and needs no external process.
        # printf prints the path verbatim even if it looks like an echo option.
        case "$path" in
            *.sql.j2) printf '%s\n' "${path%.sql.j2}.gen.sql" ;;
            *)        printf '%s\n' "$path" ;;
        esac
    done
}
# Ask the server to stop, then print some diagnostics about what is still
# running.
function stop_server
{
    # Show the queries that are still in flight; a failure here is ignored.
    clickhouse-client --query "select elapsed, query from system.processes" || true

    clickhouse stop

    # Debug.
    {
        date
        sleep 10
        jobs
        pstree -aspgT
    }
}
2020-07-09 11:21:23 +00:00
# Run one fuzzing session: start clickhouse-server, attach gdb to it, run the
# query fuzzer (clickhouse-client) under a watchdog, stop the server and write
# the outcome to status.txt / description.txt.
# Globals read:    repo_dir, NEW_TESTS_OPT (optional, from the environment)
# Globals written: NEW_TESTS, NEW_TESTS_OPT, server_pid, fuzzer_pid,
#                  watchdog_pid, fuzzer_exit_code, server_died,
#                  server_exit_code, task_exit_code
# Files written:   server.log, fuzzer.log, script.gdb, status.txt,
#                  description.txt and, if a core file is found, core.gz
function fuzz
{
    /generate-test-j2.py --path ch/tests/queries/0_stateless

    # Obtain the list of newly added tests. They will be fuzzed in more extreme way than other tests.
    # Don't overwrite the NEW_TESTS_OPT so that it can be set from the environment.
    NEW_TESTS="$(sed -n 's!\(^tests/queries/0_stateless/.*\.sql\(\.j2\)\?\)$!ch/\1!p' "$repo_dir/ci-changed-files.txt" | sort -R)"
    # ci-changed-files.txt contains also files that has been deleted/renamed, filter them out.
    # $NEW_TESTS is deliberately unquoted: one argument per path.
    NEW_TESTS="$(filter_exists_and_template $NEW_TESTS)"
    if [[ -n "$NEW_TESTS" ]]
    then
        NEW_TESTS_OPT="${NEW_TESTS_OPT:---interleave-queries-file ${NEW_TESTS}}"
    else
        NEW_TESTS_OPT="${NEW_TESTS_OPT:-}"
    fi

    mkdir -p /var/run/clickhouse-server

    # interferes with gdb
    export CLICKHOUSE_WATCHDOG_ENABLE=0
    # NOTE: we use process substitution here to keep $! as the pid of clickhouse-server
    clickhouse-server --config-file db/config.xml --pid-file /var/run/clickhouse-server/clickhouse-server.pid -- --path db > >(tail -100000 > server.log) 2>&1 &
    server_pid=$!

    kill -0 $server_pid

    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
    # and clickhouse-server can do fork-exec, for example, to run some bridge.
    # Do not set nostop noprint for all signals, because some it may cause gdb to hang,
    # explicitly ignore non-fatal signals that are used by server.
    # Number of SIGRTMIN can be determined only in runtime.
    RTMIN=$(kill -l SIGRTMIN)
    echo "
set follow-fork-mode parent
handle SIGHUP nostop noprint pass
handle SIGINT nostop noprint pass
handle SIGQUIT nostop noprint pass
handle SIGPIPE nostop noprint pass
handle SIGTERM nostop noprint pass
handle SIGUSR1 nostop noprint pass
handle SIGUSR2 nostop noprint pass
handle SIG$RTMIN nostop noprint pass
info signals
continue
backtrace full
thread apply all backtrace full
info registers
disassemble /s
up
disassemble /s
up
disassemble /s
p \"done\"
detach
quit
" > script.gdb

    gdb -batch -command script.gdb -p $server_pid &
    sleep 5
    # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:

    # Check connectivity after we attach gdb, because it might cause the server
    # to freeze and the fuzzer will fail. In debug build it can take a lot of time.
    for _ in {1..180}
    do
        sleep 1
        if clickhouse-client --query "select 1"
        then
            break
        fi
    done
    clickhouse-client --query "select 1" # This checks that the server is responding
    kill -0 $server_pid # This checks that it is our server that is started and not some other one
    echo Server started and responded

    # SC2012: Use find instead of ls to better handle non-alphanumeric filenames. They are all alphanumeric.
    # SC2046: Quote this to prevent word splitting. Actually I need word splitting.
    # shellcheck disable=SC2012,SC2046
    clickhouse-client \
        --receive_timeout=10 \
        --receive_data_timeout_ms=10000 \
        --stacktrace \
        --query-fuzzer-runs=1000 \
        --queries-file $(ls -1 ch/tests/queries/0_stateless/*.sql | sort -R) \
        $NEW_TESTS_OPT \
        > >(tail -n 100000 > fuzzer.log) \
        2>&1 &
    fuzzer_pid=$!
    echo "Fuzzer pid is $fuzzer_pid"

    # Start a watchdog that should kill the fuzzer on timeout.
    # The shell won't kill the child sleep when we kill it, so we have to put it
    # into a separate process group so that we can kill them all.
    set -m
    watchdog &
    watchdog_pid=$!
    set +m
    # Check that the watchdog has started.
    kill -0 $watchdog_pid

    # Wait for the fuzzer to complete.
    # Note that the 'wait || ...' thing is required so that the script doesn't
    # exit because of 'set -e' when 'wait' returns nonzero code.
    fuzzer_exit_code=0
    wait "$fuzzer_pid" || fuzzer_exit_code=$?
    echo "Fuzzer exit code is $fuzzer_exit_code"

    # Negative pid: signal the whole process group of the watchdog.
    kill -- -$watchdog_pid ||:

    # If the server dies, most often the fuzzer returns code 210: connection
    # refused, and sometimes also code 32: attempt to read after eof. For
    # simplicity, check again whether the server is accepting connections, using
    # clickhouse-client. We don't check for existence of server process, because
    # the process is still present while the server is terminating and not
    # accepting the connections anymore.
    if clickhouse-client --query "select 1 format Null"
    then
        server_died=0
    else
        echo "Server live check returns $?"
        server_died=1
    fi

    # wait in background to call wait in foreground and ensure that the
    # process is alive, since w/o job control this is the only way to obtain
    # the exit code
    stop_server &
    server_exit_code=0
    wait $server_pid || server_exit_code=$?
    echo "Server exit code is $server_exit_code"

    # Make files with status and description we'll show for this check on Github.
    task_exit_code=$fuzzer_exit_code
    if [ "$server_died" == 1 ]
    then
        # The server has died.
        task_exit_code=210
        echo "failure" > status.txt
        if ! grep --text -ao "Received signal.*\|Logical error.*\|Assertion.*failed\|Failed assertion.*\|.*runtime error: .*\|.*is located.*\|SUMMARY: AddressSanitizer:.*\|SUMMARY: MemorySanitizer:.*\|SUMMARY: ThreadSanitizer:.*\|.*_LIBCPP_ASSERT.*" server.log > description.txt
        then
            echo "Lost connection to server. See the logs." > description.txt
        fi
    elif [ "$fuzzer_exit_code" == "143" ] || [ "$fuzzer_exit_code" == "0" ]
    then
        # Variants of a normal run:
        # 0 -- fuzzing ended earlier than timeout.
        # 143 -- SIGTERM -- the fuzzer was killed by timeout.
        task_exit_code=0
        echo "success" > status.txt
        echo "OK" > description.txt
    elif [ "$fuzzer_exit_code" == "137" ]
    then
        # Killed.
        task_exit_code=$fuzzer_exit_code
        echo "failure" > status.txt
        echo "Killed" > description.txt
    else
        # The server was alive, but the fuzzer returned some error. This might
        # be some client-side error detected by fuzzing, or a problem in the
        # fuzzer itself. Don't grep the server log in this case, because we will
        # find a message about normal server termination (Received signal 15),
        # which is confusing.
        task_exit_code=$fuzzer_exit_code
        echo "failure" > status.txt
        { grep --text -o "Found error:.*" fuzzer.log \
            || grep --text -ao "Exception:.*" fuzzer.log \
            || echo "Fuzzer failed ($fuzzer_exit_code). See the logs." ; } \
            | tail -1 > description.txt
    fi

    # Compress the first core file found into core.gz. Iterating over the glob
    # (instead of `test -f core.*`) keeps this working when several core files
    # exist, and a core.gz left over from an earlier run is not mistaken for a
    # core file. If nothing matches, the glob stays literal and fails the -f test.
    local core_file
    for core_file in core.*; do
        if [[ -f "$core_file" && "$core_file" != *.gz ]]; then
            pigz "$core_file"
            mv "$core_file.gz" core.gz
            break
        fi
    done
}
# Run the stages starting from the one named in $stage. Every arm ends with
# `;&`, so execution falls through to all following stages.
case "$stage" in
"")
    ;&  # Did you know? This is "fallthrough" in bash. https://stackoverflow.com/questions/12010686/case-statement-fallthrough
"clone")
    time clone
    if [ -v FUZZ_LOCAL_SCRIPT ]
    then
        # just fall through
        echo Using the testing script from docker container
        :
    else
        # Run the testing script from the repository
        echo Using the testing script from the repository
        export stage=download
        time ch/docker/test/fuzzer/run-fuzzer.sh
        # Keep the error code
        exit $?
    fi
    ;&
"download")
    time download
    ;&
"configure")
    time configure
    ;&
"fuzz")
    time fuzz
    ;&
"report")
    # Link to the compressed core dump only if one was produced.
    CORE_LINK=''
    if [ -f core.gz ]; then
        CORE_LINK='<a href="core.gz">core.gz</a>'
    fi
    # Unquoted heredoc delimiter: variables and $(cat ...) are expanded here.
    cat > report.html <<EOF ||:
<!DOCTYPE html>
<html lang="en">
<head>
<style>
body { font-family: "DejaVu Sans", "Noto Sans", Arial, sans-serif; background: #EEE; }
h1 { margin-left: 10px; }
th, td { border: 0; padding: 5px 10px 5px 10px; text-align: left; vertical-align: top; line-height: 1.5; background-color: #FFF;
border: 0; box-shadow: 0 0 0 1px rgba(0, 0, 0, 0.05), 0 8px 25px -5px rgba(0, 0, 0, 0.1); }
td { white-space: pre; font-family: Monospace, Courier New; }
a { color: #06F; text-decoration: none; }
a:hover, a:active { color: #F40; text-decoration: underline; }
table { border: 0; }
p.links a { padding: 5px; margin: 3px; background: #FFF; line-height: 2; white-space: nowrap; box-shadow: 0 0 0 1px rgba(0, 0, 0, 0.05), 0 8px 25px -5px rgba(0, 0, 0, 0.1); }
th { cursor: pointer; }
</style>
<title>AST Fuzzer for PR #${PR_TO_TEST} @ ${SHA_TO_TEST}</title>
</head>
<body>
<div class="main">
<h1>AST Fuzzer for PR #${PR_TO_TEST} @ ${SHA_TO_TEST}</h1>
<p class="links">
<a href="fuzzer.log">fuzzer.log</a>
<a href="server.log">server.log</a>
<a href="main.log">main.log</a>
${CORE_LINK}
</p>
<table>
<tr><th>Test name</th><th>Test status</th><th>Description</th></tr>
<tr><td>AST Fuzzer</td><td>$(cat status.txt)</td><td>$(cat description.txt)</td></tr>
</table>
</div>
</body>
</html>
EOF
    ;&
esac

# task_exit_code is only set by fuzz; when the script is started at the
# "report" stage it is unset, which would trip `set -u`, so default to 0.
exit "${task_exit_code:-0}"