ClickHouse/docker/test/fuzzer/run-fuzzer.sh

450 lines
15 KiB
Bash
Raw Normal View History

2020-07-09 11:21:23 +00:00
#!/bin/bash
2022-11-25 12:06:52 +00:00
# shellcheck disable=SC2086,SC2001,SC2046,SC2030,SC2031,SC2010,SC2015
2021-01-16 20:47:01 +00:00
2023-08-16 20:53:51 +00:00
# shellcheck disable=SC1091
source /setup_export_logs.sh
set -x
# core.COMM.PID-TID
sysctl kernel.core_pattern='core.%e.%p-%P'
2023-01-13 21:53:17 +00:00
dmesg --clear ||:
set -e
set -u
2020-07-09 11:21:23 +00:00
set -o pipefail
2020-07-09 11:21:23 +00:00
stage=${stage:-}
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
2020-07-10 11:11:31 +00:00
echo "$script_dir"
repo_dir=ch
BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-17_debug_none_unsplitted_disable_False_binary"}
2022-02-19 17:43:02 +00:00
BINARY_URL_TO_DOWNLOAD=${BINARY_URL_TO_DOWNLOAD:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse"}
2020-07-09 11:21:23 +00:00
2023-01-20 16:46:24 +00:00
function git_clone_with_retry
{
for _ in 1 2 3 4; do
if git clone --depth 1 https://github.com/ClickHouse/ClickHouse.git -- "$1" 2>&1 | ts '%Y-%m-%d %H:%M:%S';then
return 0
else
sleep 0.5
fi
done
return 1
}
2020-07-09 11:21:23 +00:00
function clone
{
# For local runs, start directly from the "fuzz" stage.
rm -rf "$repo_dir" ||:
mkdir "$repo_dir" ||:
2023-01-20 16:46:24 +00:00
git_clone_with_retry "$repo_dir"
(
cd "$repo_dir"
if [ "$PR_TO_TEST" != "0" ]; then
if git fetch --depth 1 origin "+refs/pull/$PR_TO_TEST/merge"; then
git checkout FETCH_HEAD
echo "Checked out pull/$PR_TO_TEST/merge ($(git rev-parse FETCH_HEAD))"
else
git fetch --depth 1 origin "+refs/pull/$PR_TO_TEST/head"
git checkout "$SHA_TO_TEST"
echo "Checked out nominal SHA $SHA_TO_TEST for PR $PR_TO_TEST"
fi
2021-10-28 20:06:04 +00:00
git diff --name-only master HEAD | tee ci-changed-files.txt
else
2021-12-09 10:51:39 +00:00
if [ -v SHA_TO_TEST ]; then
2021-10-29 10:47:29 +00:00
git fetch --depth 2 origin "$SHA_TO_TEST"
git checkout "$SHA_TO_TEST"
echo "Checked out nominal SHA $SHA_TO_TEST for master"
else
2021-10-29 10:47:29 +00:00
git fetch --depth 2 origin
echo "Using default repository head $(git rev-parse HEAD)"
fi
git diff --name-only HEAD~1 HEAD | tee ci-changed-files.txt
fi
cd -
)
ls -lath ||:
2020-07-09 11:21:23 +00:00
}
2021-12-28 13:12:51 +00:00
function wget_with_retry
{
for _ in 1 2 3 4; do
if wget -nv -nd -c "$1";then
return 0
else
sleep 0.5
fi
done
return 1
}
2020-07-09 11:21:23 +00:00
function download
{
2021-12-28 13:12:51 +00:00
wget_with_retry "$BINARY_URL_TO_DOWNLOAD"
2020-07-09 18:30:22 +00:00
chmod +x clickhouse
2022-08-09 13:16:03 +00:00
# clickhouse may be compressed - run once to decompress
./clickhouse ||:
2020-07-16 18:04:16 +00:00
ln -s ./clickhouse ./clickhouse-server
ln -s ./clickhouse ./clickhouse-client
2022-12-25 19:14:25 +00:00
ln -s ./clickhouse ./clickhouse-local
# clickhouse-server is in the current dir
export PATH="$PWD:$PATH"
2020-07-09 11:21:23 +00:00
}
function configure
{
2020-07-10 11:11:31 +00:00
rm -rf db ||:
2020-07-09 11:21:23 +00:00
mkdir db ||:
cp -av --dereference "$repo_dir"/programs/server/config* db
cp -av --dereference "$repo_dir"/programs/server/user* db
2020-07-17 18:41:33 +00:00
# TODO figure out which ones are needed
cp -av --dereference "$repo_dir"/tests/config/config.d/listen.xml db/config.d
cp -av --dereference "$script_dir"/query-fuzzer-tweaks-users.xml db/users.d
2022-08-11 02:34:10 +00:00
cp -av --dereference "$script_dir"/allow-nullable-key.xml db/config.d
2022-12-20 18:56:29 +00:00
cat > db/config.d/max_server_memory_usage_to_ram_ratio.xml <<EOL
2022-12-15 17:37:04 +00:00
<clickhouse>
2022-12-20 18:56:29 +00:00
<max_server_memory_usage_to_ram_ratio>0.75</max_server_memory_usage_to_ram_ratio>
2022-12-15 17:37:04 +00:00
</clickhouse>
EOL
cat > db/config.d/core.xml <<EOL
<clickhouse>
<core_dump>
<!-- 100GiB -->
<size_limit>107374182400</size_limit>
</core_dump>
<!-- NOTE: no need to configure core_path,
since clickhouse is not started as daemon (via clickhouse start)
-->
<core_path>$PWD</core_path>
</clickhouse>
EOL
2023-08-12 20:41:56 +00:00
2023-08-16 20:53:51 +00:00
config_logs_export_cluster db/config.d/system_logs_export.yaml
2020-07-09 11:21:23 +00:00
}
2021-08-24 12:27:50 +00:00
function filter_exists_and_template
{
local path
for path in "$@"; do
if [ -e "$path" ]; then
# SC2001 shellcheck suggests:
# echo ${path//.sql.j2/.gen.sql}
# but it doesn't allow to use regex
echo "$path" | sed 's/\.sql\.j2$/.gen.sql/'
else
echo "'$path' does not exists" >&2
fi
done
}
function stop_server
{
clickhouse-client --query "select elapsed, query from system.processes" ||:
clickhouse stop
# Debug.
date
sleep 10
jobs
pstree -aspgT
}
2020-07-09 11:21:23 +00:00
function fuzz
{
/generate-test-j2.py --path ch/tests/queries/0_stateless
2021-01-16 15:03:32 +00:00
# Obtain the list of newly added tests. They will be fuzzed in more extreme way than other tests.
# Don't overwrite the NEW_TESTS_OPT so that it can be set from the environment.
NEW_TESTS="$(sed -n 's!\(^tests/queries/0_stateless/.*\.sql\(\.j2\)\?\)$!ch/\1!p' $repo_dir/ci-changed-files.txt | sort -R)"
# ci-changed-files.txt contains also files that has been deleted/renamed, filter them out.
2021-08-24 12:27:50 +00:00
NEW_TESTS="$(filter_exists_and_template $NEW_TESTS)"
2021-01-16 16:36:08 +00:00
if [[ -n "$NEW_TESTS" ]]
then
NEW_TESTS_OPT="${NEW_TESTS_OPT:---interleave-queries-file ${NEW_TESTS}}"
2021-01-18 15:24:05 +00:00
else
2021-03-15 18:45:57 +00:00
NEW_TESTS_OPT="${NEW_TESTS_OPT:-}"
2021-01-16 19:01:29 +00:00
fi
2021-01-16 15:03:32 +00:00
mkdir -p /var/run/clickhouse-server
# NOTE: we use process substitution here to preserve keep $! as a pid of clickhouse-server
2022-12-31 13:23:53 +00:00
clickhouse-server --config-file db/config.xml --pid-file /var/run/clickhouse-server/clickhouse-server.pid -- --path db > server.log 2>&1 &
server_pid=$!
2020-07-09 11:21:23 +00:00
kill -0 $server_pid
2021-12-10 15:03:57 +00:00
# Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
# and clickhouse-server can do fork-exec, for example, to run some bridge.
# Do not set nostop noprint for all signals, because some it may cause gdb to hang,
# explicitly ignore non-fatal signals that are used by server.
# Number of SIGRTMIN can be determined only in runtime.
2021-12-10 17:58:09 +00:00
RTMIN=$(kill -l SIGRTMIN)
2021-01-23 21:45:17 +00:00
echo "
2021-12-10 15:03:57 +00:00
set follow-fork-mode parent
handle SIGHUP nostop noprint pass
handle SIGINT nostop noprint pass
handle SIGQUIT nostop noprint pass
handle SIGPIPE nostop noprint pass
handle SIGTERM nostop noprint pass
handle SIGUSR1 nostop noprint pass
handle SIGUSR2 nostop noprint pass
handle SIG$RTMIN nostop noprint pass
info signals
2021-01-23 21:45:17 +00:00
continue
2021-12-10 15:03:57 +00:00
backtrace full
thread apply all backtrace full
2021-12-15 10:21:21 +00:00
info registers
disassemble /s
up
disassemble /s
up
disassemble /s
p \"done\"
2021-12-10 15:03:57 +00:00
detach
quit
2021-01-23 21:45:17 +00:00
" > script.gdb
gdb -batch -command script.gdb -p $server_pid &
2021-12-10 15:03:57 +00:00
sleep 5
2023-11-24 20:10:48 +00:00
# gdb will send SIGSTOP, spend some time loading debug info, and then send SIGCONT, wait for it (up to send_timeout, 300s)
2021-12-10 15:03:57 +00:00
time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
2021-06-02 15:57:01 +00:00
# Check connectivity after we attach gdb, because it might cause the server
2023-11-24 20:10:48 +00:00
# to freeze, and the fuzzer will fail. In debug build, it can take a lot of time.
for _ in {1..180}
2021-06-02 15:57:01 +00:00
do
if clickhouse-client --query "select 1"
then
break
fi
sleep 1
2021-06-02 15:57:01 +00:00
done
kill -0 $server_pid # This checks that it is our server that is started and not some other one
2023-11-24 20:10:48 +00:00
echo 'Server started and responded.'
2021-01-23 21:45:17 +00:00
2023-08-16 20:53:51 +00:00
setup_logs_replication
2023-08-12 20:41:56 +00:00
2021-01-06 01:56:10 +00:00
# SC2012: Use find instead of ls to better handle non-alphanumeric filenames. They are all alphanumeric.
2023-11-24 20:10:48 +00:00
# SC2046: Quote this to prevent word splitting. Actually, I need word splitting.
2021-01-06 01:56:10 +00:00
# shellcheck disable=SC2012,SC2046
2022-11-26 17:45:50 +00:00
timeout -s TERM --preserve-status 30m clickhouse-client \
2023-11-24 20:10:48 +00:00
--max_memory_usage_in_client=1000000000 \
2021-05-29 13:12:18 +00:00
--receive_timeout=10 \
--receive_data_timeout_ms=10000 \
--stacktrace \
2021-05-29 13:12:18 +00:00
--query-fuzzer-runs=1000 \
2022-09-28 14:52:10 +00:00
--create-query-fuzzer-runs=50 \
2022-11-25 18:01:24 +00:00
--queries-file $(ls -1 ch/tests/queries/0_stateless/*.sql | sort -R) \
2021-05-29 13:12:18 +00:00
$NEW_TESTS_OPT \
2024-01-15 13:06:43 +00:00
> fuzzer.log \
2021-05-28 12:55:58 +00:00
2>&1 &
fuzzer_pid=$!
echo "Fuzzer pid is $fuzzer_pid"
2021-05-28 12:55:58 +00:00
# Wait for the fuzzer to complete.
2021-05-28 22:57:16 +00:00
# Note that the 'wait || ...' thing is required so that the script doesn't
# exit because of 'set -e' when 'wait' returns nonzero code.
fuzzer_exit_code=0
wait "$fuzzer_pid" || fuzzer_exit_code=$?
2020-07-10 11:11:31 +00:00
echo "Fuzzer exit code is $fuzzer_exit_code"
2020-07-20 11:56:53 +00:00
2023-11-24 20:10:48 +00:00
# If the server dies, most often the fuzzer returns Code 210: Connetion
# refused, and sometimes also code 32: attempt to read after eof. For
2023-11-24 20:10:48 +00:00
# simplicity, check again whether the server is accepting connections using
# clickhouse-client. We don't check for the existence of the server process, because
# the process is still present while the server is terminating and not
# accepting the connections anymore.
2023-01-08 00:14:46 +00:00
for _ in {1..100}
do
if clickhouse-client --query "SELECT 1" 2> err
then
server_died=0
break
else
# There are legitimate queries leading to this error, example:
# SELECT * FROM remote('127.0.0.{1..255}', system, one)
if grep -F 'TOO_MANY_SIMULTANEOUS_QUERIES' err
then
# Give it some time to cool down
clickhouse-client --query "SHOW PROCESSLIST"
sleep 1
else
echo "Server live check returns $?"
cat err
server_died=1
break
fi
fi
done
# wait in background to call wait in foreground and ensure that the
# process is alive, since w/o job control this is the only way to obtain
# the exit code
stop_server &
2021-07-22 14:04:48 +00:00
server_exit_code=0
wait $server_pid || server_exit_code=$?
echo "Server exit code is $server_exit_code"
2021-05-28 12:55:58 +00:00
# Make files with status and description we'll show for this check on Github.
2020-07-29 17:04:56 +00:00
task_exit_code=$fuzzer_exit_code
if [ "$server_died" == 1 ]
2020-07-30 13:35:24 +00:00
then
# The server has died.
if ! rg --text -o 'Received signal.*|Logical error.*|Assertion.*failed|Failed assertion.*|.*runtime error: .*|.*is located.*|(SUMMARY|ERROR): [a-zA-Z]+Sanitizer:.*|.*_LIBCPP_ASSERT.*|.*Child process was terminated by signal 9.*' server.log > description.txt
2020-07-16 18:04:16 +00:00
then
2021-01-28 01:09:47 +00:00
echo "Lost connection to server. See the logs." > description.txt
2020-07-16 18:04:16 +00:00
fi
2023-01-07 01:23:15 +00:00
IS_SANITIZED=$(clickhouse-local --query "SELECT value LIKE '%-fsanitize=%' FROM system.build_options WHERE name = 'CXX_FLAGS'")
2023-01-10 01:59:55 +00:00
if [ "${IS_SANITIZED}" -eq "1" ] && rg --text 'Sanitizer:? (out-of-memory|out of memory|failed to allocate)|Child process was terminated by signal 9' description.txt
then
# OOM of sanitizer is not a problem we can handle - treat it as success, but preserve the description.
2023-01-07 01:23:15 +00:00
# Why? Because sanitizers have the memory overhead, that is not controllable from inside clickhouse-server.
task_exit_code=0
echo "success" > status.txt
else
task_exit_code=210
echo "failure" > status.txt
fi
elif [ "$fuzzer_exit_code" == "143" ] || [ "$fuzzer_exit_code" == "0" ]
then
# Variants of a normal run:
# 0 -- fuzzing ended earlier than timeout.
# 143 -- SIGTERM -- the fuzzer was killed by timeout.
task_exit_code=0
echo "success" > status.txt
echo "OK" > description.txt
elif [ "$fuzzer_exit_code" == "137" ]
then
# Killed.
task_exit_code=$fuzzer_exit_code
echo "failure" > status.txt
echo "Killed" > description.txt
2020-07-29 17:04:56 +00:00
else
2021-06-02 12:06:12 +00:00
# The server was alive, but the fuzzer returned some error. This might
# be some client-side error detected by fuzzing, or a problem in the
# fuzzer itself. Don't grep the server log in this case, because we will
# find a message about normal server termination (Received signal 15),
# which is confusing.
task_exit_code=$fuzzer_exit_code
2020-07-29 17:04:56 +00:00
echo "failure" > status.txt
2023-10-18 12:09:22 +00:00
{ rg -ao "Found error:.*" fuzzer.log \
|| rg -ao "Exception:.*" fuzzer.log \
2021-06-02 12:06:12 +00:00
|| echo "Fuzzer failed ($fuzzer_exit_code). See the logs." ; } \
| tail -1 > description.txt
2020-07-16 18:04:16 +00:00
fi
if test -f core.*; then
2023-01-01 20:17:43 +00:00
zstd --threads=0 core.*
mv core.*.zst core.zst
fi
2022-11-24 20:36:42 +00:00
2023-01-01 19:50:51 +00:00
dmesg -T | rg -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' && echo "OOM in dmesg" ||:
2020-07-09 11:21:23 +00:00
}
case "$stage" in
"")
2021-01-16 15:03:32 +00:00
;& # Did you know? This is "fallthrough" in bash. https://stackoverflow.com/questions/12010686/case-statement-fallthrough
2020-07-09 11:21:23 +00:00
"clone")
time clone
2020-07-10 11:11:31 +00:00
if [ -v FUZZ_LOCAL_SCRIPT ]
then
# just fall through
echo Using the testing script from docker container
:
else
2020-07-10 14:16:16 +00:00
# Run the testing script from the repository
2020-07-10 11:11:31 +00:00
echo Using the testing script from the repository
export stage=download
2020-07-30 19:04:15 +00:00
time ch/docker/test/fuzzer/run-fuzzer.sh
2020-07-10 14:16:16 +00:00
# Keep the error code
2020-07-30 19:04:15 +00:00
exit $?
2020-07-10 11:11:31 +00:00
fi
;&
2020-07-09 11:21:23 +00:00
"download")
time download
;&
"configure")
time configure
;&
"fuzz")
2021-05-28 12:55:58 +00:00
time fuzz
2020-08-31 23:33:42 +00:00
;&
"report")
2022-12-30 21:13:47 +00:00
CORE_LINK=''
2023-01-01 20:17:43 +00:00
if [ -f core.zst ]; then
CORE_LINK='<a href="core.zst">core.zst</a>'
fi
2022-12-30 21:13:47 +00:00
rg --text -F '<Fatal>' server.log > fatal.log ||:
2023-01-07 00:18:17 +00:00
dmesg -T > dmesg.log ||:
2022-12-30 21:13:47 +00:00
2024-01-30 19:25:26 +00:00
zstd --threads=0 --rm server.log
zstd --threads=0 --rm fuzzer.log
2022-12-30 21:13:47 +00:00
2020-08-31 23:33:42 +00:00
cat > report.html <<EOF ||:
<!DOCTYPE html>
<html lang="en">
<style>
2022-05-28 18:25:20 +00:00
body { font-family: "DejaVu Sans", "Noto Sans", Arial, sans-serif; background: #EEE; }
2020-08-31 23:33:42 +00:00
h1 { margin-left: 10px; }
2023-01-02 00:51:15 +00:00
th, td { border: 0; padding: 5px 10px 5px 10px; text-align: left; vertical-align: top; line-height: 1.5; background-color: #FFF; }
td { white-space: pre; font-family: Monospace, Courier New; box-shadow: 0 0 0 1px rgba(0, 0, 0, 0.05), 0 8px 25px -5px rgba(0, 0, 0, 0.1); }
2020-08-31 23:33:42 +00:00
a { color: #06F; text-decoration: none; }
a:hover, a:active { color: #F40; text-decoration: underline; }
table { border: 0; }
p.links a { padding: 5px; margin: 3px; background: #FFF; line-height: 2; white-space: nowrap; box-shadow: 0 0 0 1px rgba(0, 0, 0, 0.05), 0 8px 25px -5px rgba(0, 0, 0, 0.1); }
</style>
<title>AST Fuzzer for PR #${PR_TO_TEST} @ ${SHA_TO_TEST}</title>
</head>
<body>
<div class="main">
2022-12-25 15:19:42 +00:00
<h1>AST Fuzzer for PR <a href="https://github.com/ClickHouse/ClickHouse/pull/${PR_TO_TEST}">#${PR_TO_TEST}</a> @ ${SHA_TO_TEST}</h1>
2020-08-31 23:33:42 +00:00
<p class="links">
<a href="run.log">run.log</a>
2024-01-15 13:06:43 +00:00
<a href="fuzzer.log.zst">fuzzer.log.zst</a>
2023-01-01 20:17:43 +00:00
<a href="server.log.zst">server.log.zst</a>
2022-12-25 15:19:42 +00:00
<a href="main.log">main.log</a>
2023-01-12 04:16:10 +00:00
<a href="dmesg.log">dmesg.log</a>
2022-12-25 15:19:42 +00:00
${CORE_LINK}
2020-08-31 23:33:42 +00:00
</p>
<table>
2022-12-25 15:19:42 +00:00
<tr>
<th>Test name</th>
<th>Test status</th>
<th>Description</th>
</tr>
<tr>
<td>AST Fuzzer</td>
<td>$(cat status.txt)</td>
2023-01-02 00:28:05 +00:00
<td>$(
2022-12-30 21:16:30 +00:00
clickhouse-local --input-format RawBLOB --output-format RawBLOB --query "SELECT encodeXMLComponent(*) FROM table" < description.txt || cat description.txt
)</td>
2022-12-25 15:19:42 +00:00
</tr>
2022-12-30 21:13:47 +00:00
<tr>
2023-01-02 00:28:05 +00:00
<td colspan="3" style="white-space: pre-wrap;">$(
2022-12-30 21:16:30 +00:00
clickhouse-local --input-format RawBLOB --output-format RawBLOB --query "SELECT encodeXMLComponent(*) FROM table" < fatal.log || cat fatal.log
)</td>
2022-12-30 21:13:47 +00:00
</tr>
2020-08-31 23:33:42 +00:00
</table>
</body>
</html>
EOF
2020-07-09 11:21:23 +00:00
;&
esac
2020-09-16 07:46:38 +00:00
exit $task_exit_code