#!/bin/bash
#
# Driver for the ClickHouse AST fuzzer check.
#
# The script runs as a chain of stages; every stage falls through to the next:
#   clone -> download -> configure -> fuzz -> report
#
# Environment:
#   stage                   Stage to start from (empty means "clone").
#   PR_TO_TEST              Pull request number; "0" means "no PR" (master).
#   SHA_TO_TEST             Commit to test.
#   BINARY_TO_DOWNLOAD      Build name used to compose the default binary URL.
#   BINARY_URL_TO_DOWNLOAD  Full URL of the clickhouse binary (overrides the default).
#   NEW_TESTS_OPT           Extra fuzzer options; computed from changed tests when unset/empty.
#   FUZZ_LOCAL_SCRIPT       If set, keep running this copy of the script after "clone"
#                           instead of re-executing the one from the repository.
#
# Outputs (current directory): status.txt, description.txt, report.html,
# fatal.log, dmesg.log, server.log.zst, fuzzer.log.zst and, when a core dump
# was produced, core.zst.
#
# NOTE(review): the reviewed copy of this file had lost its line structure and
# all heredoc/XML/HTML markup. Formatting and markup below are reconstructed;
# the spots where markup had to be inferred are marked individually.
#
# shellcheck disable=SC2086,SC2001,SC2046,SC2030,SC2031,SC2010,SC2015

# shellcheck disable=SC1091
source /setup_export_logs.sh
set -x

# core.COMM.PID-TID
sysctl kernel.core_pattern='core.%e.%p-%P'
dmesg --clear ||:

set -e
set -u
set -o pipefail

stage=${stage:-}
script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
echo "$script_dir"
repo_dir=ch
BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-17_debug_none_unsplitted_disable_False_binary"}
BINARY_URL_TO_DOWNLOAD=${BINARY_URL_TO_DOWNLOAD:="https://clickhouse-builds.s3.amazonaws.com/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/$BINARY_TO_DOWNLOAD/clickhouse"}

# Shallow-clone the ClickHouse repository into directory $1.
# Makes up to four attempts; returns 0 on the first success, 1 otherwise.
function git_clone_with_retry
{
    for _ in 1 2 3 4; do
        # 'set -o pipefail' makes the pipeline fail when git fails, even though ts succeeds.
        if git clone --depth 1 https://github.com/ClickHouse/ClickHouse.git -- "$1" 2>&1 | ts '%Y-%m-%d %H:%M:%S';then
            return 0
        else
            sleep 0.5
        fi
    done

    return 1
}

# Clone the repository into $repo_dir, check out the revision under test and
# write the list of files changed by it to $repo_dir/ci-changed-files.txt.
function clone
{
    # For local runs, start directly from the "fuzz" stage.
    rm -rf "$repo_dir" ||:
    mkdir "$repo_dir" ||:

    git_clone_with_retry "$repo_dir"
    (
        cd "$repo_dir"
        if [ "$PR_TO_TEST" != "0" ]; then
            # Prefer the merge ref of the PR; fall back to the PR head at the given SHA.
            if git fetch --depth 1 origin "+refs/pull/$PR_TO_TEST/merge"; then
                git checkout FETCH_HEAD
                echo "Checked out pull/$PR_TO_TEST/merge ($(git rev-parse FETCH_HEAD))"
            else
                git fetch --depth 1 origin "+refs/pull/$PR_TO_TEST/head"
                git checkout "$SHA_TO_TEST"
                echo "Checked out nominal SHA $SHA_TO_TEST for PR $PR_TO_TEST"
            fi
            git diff --name-only master HEAD | tee ci-changed-files.txt
        else
            if [ -v SHA_TO_TEST ]; then
                git fetch --depth 2 origin "$SHA_TO_TEST"
                git checkout "$SHA_TO_TEST"
                echo "Checked out nominal SHA $SHA_TO_TEST for master"
            else
                git fetch --depth 2 origin
                echo "Using default repository head $(git rev-parse HEAD)"
            fi
            # Depth 2 is fetched above so that HEAD~1 is available for the diff.
            git diff --name-only HEAD~1 HEAD | tee ci-changed-files.txt
        fi
        cd -
    )

    ls -lath ||:
}

# Download URL $1 into the current directory (resuming partial downloads).
# Makes up to four attempts; returns 0 on the first success, 1 otherwise.
function wget_with_retry
{
    for _ in 1 2 3 4; do
        if wget -nv -nd -c "$1";then
            return 0
        else
            sleep 0.5
        fi
    done

    return 1
}

# Fetch the clickhouse binary, create the server/client/local symlinks next to
# it and put the current directory on PATH.
function download
{
    wget_with_retry "$BINARY_URL_TO_DOWNLOAD"

    chmod +x clickhouse
    # clickhouse may be compressed - run once to decompress
    ./clickhouse --query "SELECT 1" ||:
    ln -s ./clickhouse ./clickhouse-server
    ln -s ./clickhouse ./clickhouse-client
    ln -s ./clickhouse ./clickhouse-local

    # clickhouse-server is in the current dir
    export PATH="$PWD:$PATH"
}

# Build the server configuration directory ./db from the repository defaults
# plus fuzzer-specific overrides.
function configure
{
    rm -rf db ||:
    mkdir db ||:
    cp -av --dereference "$repo_dir"/programs/server/config* db
    cp -av --dereference "$repo_dir"/programs/server/user* db
    # TODO figure out which ones are needed
    cp -av --dereference "$repo_dir"/tests/config/config.d/listen.xml db/config.d
    cp -av --dereference "$script_dir"/query-fuzzer-tweaks-users.xml db/users.d
    cp -av --dereference "$script_dir"/allow-nullable-key.xml db/config.d

    # NOTE(review): only the values 0.75, 107374182400 and $PWD survived in the
    # reviewed copy; the element names of both XML snippets below are
    # reconstructed - confirm against version control.
    cat > db/config.d/max_server_memory_usage_to_ram_ratio.xml <<EOL
<clickhouse>
    <max_server_memory_usage_to_ram_ratio>0.75</max_server_memory_usage_to_ram_ratio>
</clickhouse>
EOL

    cat > db/config.d/core.xml <<EOL
<clickhouse>
    <core_dump>
        <!-- 100GiB -->
        <size_limit>107374182400</size_limit>
    </core_dump>
    <core_path>$PWD</core_path>
</clickhouse>
EOL

    # Not defined in this file; presumably provided by /setup_export_logs.sh.
    config_logs_export_cluster db/config.d/system_logs_export.yaml
}

# Print, one per line, those of the given paths that exist, with a trailing
# ".sql.j2" replaced by ".gen.sql". Missing paths are reported on stderr.
function filter_exists_and_template
{
    local path
    for path in "$@"; do
        if [ -e "$path" ]; then
            # SC2001 shellcheck suggests:
            # echo ${path//.sql.j2/.gen.sql}
            # but it doesn't allow to use regex
            echo "$path" | sed 's/\.sql\.j2$/.gen.sql/'
        else
            echo "'$path' does not exists" >&2
        fi
    done
}

# Dump the running queries, stop the server and print some process diagnostics.
function stop_server
{
    clickhouse-client --query "select elapsed, query from system.processes" ||:
    clickhouse stop

    # Debug.
    date
    sleep 10
    jobs
    pstree -aspgT
}

# Start the server under gdb, run the query fuzzer against it and write
# status.txt / description.txt. Sets the global task_exit_code.
function fuzz
{
    /generate-test-j2.py --path ch/tests/queries/0_stateless

    # Obtain the list of newly added tests. They will be fuzzed in more extreme way than other tests.
    # Don't overwrite the NEW_TESTS_OPT so that it can be set from the environment.
    NEW_TESTS="$(sed -n 's!\(^tests/queries/0_stateless/.*\.sql\(\.j2\)\?\)$!ch/\1!p' $repo_dir/ci-changed-files.txt | sort -R)"
    # ci-changed-files.txt also contains files that have been deleted/renamed, filter them out.
    NEW_TESTS="$(filter_exists_and_template $NEW_TESTS)"
    if [[ -n "$NEW_TESTS" ]]
    then
        NEW_TESTS_OPT="${NEW_TESTS_OPT:---interleave-queries-file ${NEW_TESTS}}"
    else
        NEW_TESTS_OPT="${NEW_TESTS_OPT:-}"
    fi

    mkdir -p /var/run/clickhouse-server

    # NOTE: the server is started directly in the background (not through a
    # pipeline), so that $! is the pid of clickhouse-server itself.
    clickhouse-server --config-file db/config.xml --pid-file /var/run/clickhouse-server/clickhouse-server.pid -- --path db > server.log 2>&1 &
    server_pid=$!

    kill -0 $server_pid

    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
    # and clickhouse-server can do fork-exec, for example, to run some bridge.
    # Do not set nostop noprint for all signals, because some it may cause gdb to hang,
    # explicitly ignore non-fatal signals that are used by server.
    # Number of SIGRTMIN can be determined only in runtime.
    RTMIN=$(kill -l SIGRTMIN)
    echo "
set follow-fork-mode parent
handle SIGHUP nostop noprint pass
handle SIGINT nostop noprint pass
handle SIGQUIT nostop noprint pass
handle SIGPIPE nostop noprint pass
handle SIGTERM nostop noprint pass
handle SIGUSR1 nostop noprint pass
handle SIGUSR2 nostop noprint pass
handle SIG$RTMIN nostop noprint pass
info signals
continue
backtrace full
thread apply all backtrace full
info registers
disassemble /s
up
disassemble /s
up
disassemble /s
p \"done\"
detach
quit
" > script.gdb

    gdb -batch -command script.gdb -p $server_pid &
    sleep 5
    # gdb will send SIGSTOP, spend some time loading debug info, and then send SIGCONT, wait for it (up to send_timeout, 300s)
    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:

    # Check connectivity after we attach gdb, because it might cause the server
    # to freeze, and the fuzzer will fail. In debug build, it can take a lot of time.
    for _ in {1..180}
    do
        if clickhouse-client --query "select 1"
        then
            break
        fi
        sleep 1
    done
    kill -0 $server_pid # This checks that it is our server that is started and not some other one
    echo 'Server started and responded.'

    # Not defined in this file; presumably provided by /setup_export_logs.sh.
    setup_logs_replication

    # SC2012: Use find instead of ls to better handle non-alphanumeric filenames. They are all alphanumeric.
    # SC2046: Quote this to prevent word splitting. Actually, I need word splitting.
    # shellcheck disable=SC2012,SC2046
    timeout -s TERM --preserve-status 30m clickhouse-client \
        --max_memory_usage_in_client=1000000000 \
        --receive_timeout=10 \
        --receive_data_timeout_ms=10000 \
        --stacktrace \
        --query-fuzzer-runs=1000 \
        --create-query-fuzzer-runs=50 \
        --queries-file $(ls -1 ch/tests/queries/0_stateless/*.sql | sort -R) \
        $NEW_TESTS_OPT \
        > fuzzer.log \
        2>&1 &
    fuzzer_pid=$!
    echo "Fuzzer pid is $fuzzer_pid"

    # The fuzzer_pid belongs to the timeout process.
    # ps exits non-zero when it finds no child (not forked yet, or already
    # gone); that must not abort the whole run under 'set -e'.
    actual_fuzzer_pid=$(ps -o pid= --ppid "$fuzzer_pid") ||:

    echo "Attaching gdb to the fuzzer itself"
    if [ -n "$actual_fuzzer_pid" ]
    then
        gdb -batch -command script.gdb -p $actual_fuzzer_pid &
    else
        echo "Cannot find the fuzzer process (child of $fuzzer_pid), not attaching gdb" >&2
    fi

    # Wait for the fuzzer to complete.
    # Note that the 'wait || ...' thing is required so that the script doesn't
    # exit because of 'set -e' when 'wait' returns nonzero code.
    fuzzer_exit_code=0
    wait "$fuzzer_pid" || fuzzer_exit_code=$?
    echo "Fuzzer exit code is $fuzzer_exit_code"

    # If the server dies, most often the fuzzer returns Code 210: Connection
    # refused, and sometimes also code 32: attempt to read after eof. For
    # simplicity, check again whether the server is accepting connections using
    # clickhouse-client. We don't check for the existence of the server process, because
    # the process is still present while the server is terminating and not
    # accepting the connections anymore.

    # Default for the case when every probe below is answered with
    # TOO_MANY_SIMULTANEOUS_QUERIES: the server is overloaded, but alive.
    server_died=0
    local live_check_code
    for _ in {1..100}
    do
        live_check_code=0
        clickhouse-client --query "SELECT 1" 2> err || live_check_code=$?
        if [ "$live_check_code" -eq 0 ]
        then
            server_died=0
            break
        fi

        # There are legitimate queries leading to this error, example:
        # SELECT * FROM remote('127.0.0.{1..255}', system, one)
        if grep -F 'TOO_MANY_SIMULTANEOUS_QUERIES' err
        then
            # Give it some time to cool down.
            # This diagnostic query can be rejected for the very same reason.
            clickhouse-client --query "SHOW PROCESSLIST" ||:
            sleep 1
        else
            # Report the client's exit code, not the one of the grep above.
            echo "Server live check returns $live_check_code"
            cat err
            server_died=1
            break
        fi
    done

    # wait in background to call wait in foreground and ensure that the
    # process is alive, since w/o job control this is the only way to obtain
    # the exit code
    stop_server &
    server_exit_code=0
    wait $server_pid || server_exit_code=$?
    echo "Server exit code is $server_exit_code"

    # Make files with status and description we'll show for this check on Github.
    task_exit_code=$fuzzer_exit_code
    if [ "$server_died" == 1 ]
    then
        # The server has died.
        if ! rg --text -o 'Received signal.*|Logical error.*|Assertion.*failed|Failed assertion.*|.*runtime error: .*|.*is located.*|(SUMMARY|ERROR): [a-zA-Z]+Sanitizer:.*|.*_LIBCPP_ASSERT.*|.*Child process was terminated by signal 9.*' server.log > description.txt
        then
            echo "Lost connection to server. See the logs." > description.txt
        fi

        IS_SANITIZED=$(clickhouse-local --query "SELECT value LIKE '%-fsanitize=%' FROM system.build_options WHERE name = 'CXX_FLAGS'")

        if [ "${IS_SANITIZED}" -eq "1" ] && rg --text 'Sanitizer:? (out-of-memory|out of memory|failed to allocate)|Child process was terminated by signal 9' description.txt
        then
            # OOM of sanitizer is not a problem we can handle - treat it as success, but preserve the description.
            # Why? Because sanitizers have the memory overhead, that is not controllable from inside clickhouse-server.
            task_exit_code=0
            echo "success" > status.txt
        else
            task_exit_code=210
            echo "failure" > status.txt
        fi

    elif [ "$fuzzer_exit_code" == "143" ] || [ "$fuzzer_exit_code" == "0" ]
    then
        # Variants of a normal run:
        # 0 -- fuzzing ended earlier than timeout.
        # 143 -- SIGTERM -- the fuzzer was killed by timeout.
        task_exit_code=0
        echo "success" > status.txt
        echo "OK" > description.txt
    elif [ "$fuzzer_exit_code" == "137" ]
    then
        # Killed.
        task_exit_code=$fuzzer_exit_code
        echo "failure" > status.txt
        echo "Killed" > description.txt
    else
        # The server was alive, but the fuzzer returned some error. This might
        # be some client-side error detected by fuzzing, or a problem in the
        # fuzzer itself. Don't grep the server log in this case, because we will
        # find a message about normal server termination (Received signal 15),
        # which is confusing.
        task_exit_code=$fuzzer_exit_code
        echo "failure" > status.txt
        echo "Let op!" > description.txt
        echo "Fuzzer went wrong with error code: ($fuzzer_exit_code). Its process died somehow when the server stayed alive. The server log probably won't tell you much so try to find information in other files." >>description.txt
        { rg -ao "Found error:.*" fuzzer.log || rg -ao "Exception:.*" fuzzer.log; } | tail -1 >>description.txt
    fi

    # Several processes may have dumped core (the pattern is core.COMM.PID-TID),
    # and 'test -f core.*' / 'mv core.*.zst core.zst' only work for exactly one
    # match. Collect the matches explicitly instead.
    local core_files=()
    local core_file
    for core_file in core.*
    do
        # Skips the literal pattern of an unmatched glob and already compressed dumps.
        if [ -f "$core_file" ] && [[ "$core_file" != *.zst ]]
        then
            core_files+=("$core_file")
        fi
    done
    if [ "${#core_files[@]}" -gt 0 ]
    then
        # The report links a single core.zst; publish the first dump.
        if [ "${#core_files[@]}" -gt 1 ]
        then
            echo "Found ${#core_files[@]} core files, compressing only ${core_files[0]}" >&2
        fi
        zstd --threads=0 -f "${core_files[0]}" -o core.zst
    fi

    dmesg -T | rg -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' && echo "OOM in dmesg" ||:
}

case "$stage" in
"")
    ;&  # Did you know? This is "fallthrough" in bash. https://stackoverflow.com/questions/12010686/case-statement-fallthrough
"clone")
    time clone
    if [ -v FUZZ_LOCAL_SCRIPT ]
    then
        # just fall through
        echo Using the testing script from docker container
        :
    else
        # Run the testing script from the repository
        echo Using the testing script from the repository
        export stage=download
        time ch/docker/test/fuzzer/run-fuzzer.sh
        # Keep the error code
        exit $?
    fi
    ;&
"download")
    time download
    ;&
"configure")
    time configure
    ;&
"fuzz")
    time fuzz
    ;&
"report")

    # NOTE(review): the anchor markup of CORE_LINK/FATAL_LINK, the '<Fatal>'
    # patterns and the whole HTML skeleton of report.html were stripped from the
    # reviewed copy (only the visible text and the embedded commands survived).
    # They are reconstructed here - confirm against version control.
    CORE_LINK=''
    if [ -f core.zst ]; then
        CORE_LINK='<a href="core.zst">core.zst</a>'
    fi

    # Keep all the lines in the paragraphs containing <Fatal> that either contain <Fatal> or don't start with 20... (year)
    sed -n '/<Fatal>/,/^$/p' server.log | awk '/<Fatal>/ || !/^20/' > fatal.log ||:
    FATAL_LINK=''
    if [ -s fatal.log ]; then
        FATAL_LINK='<a href="fatal.log">fatal.log</a>'
    fi

    dmesg -T > dmesg.log ||:

    zstd --threads=0 --rm server.log
    zstd --threads=0 --rm fuzzer.log

    # Best effort: a failure to write the report must not replace the exit code
    # of the check itself.
    cat > report.html <<EOF ||:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<style>
body { font-family: sans-serif; }
th, td { padding: 5px 10px; text-align: left; vertical-align: top; }
td { white-space: pre; font-family: monospace; }
p.links a { padding: 5px; margin: 3px; white-space: nowrap; }
</style>
<title>AST Fuzzer for PR #${PR_TO_TEST} @ ${SHA_TO_TEST}</title>
</head>
<body>
<h1>AST Fuzzer for PR <a href="https://github.com/ClickHouse/ClickHouse/pull/${PR_TO_TEST}">#${PR_TO_TEST}</a> @ ${SHA_TO_TEST}</h1>
<p class="links">
  <a href="fuzzer.log.zst">fuzzer.log.zst</a>
  <a href="server.log.zst">server.log.zst</a>
  <a href="dmesg.log">dmesg.log</a>
  ${CORE_LINK}
  ${FATAL_LINK}
</p>
<table>
<tr>
  <th>Test name</th>
  <th>Test status</th>
  <th>Description</th>
</tr>
<tr>
  <td>AST Fuzzer</td>
  <td>$(cat status.txt)</td>
  <td>$(
    clickhouse-local --input-format RawBLOB --output-format RawBLOB --query "SELECT encodeXMLComponent(*) FROM table" < description.txt || cat description.txt
  )</td>
</tr>
<tr>
  <td colspan="3" style="white-space: pre-wrap;">$(
    clickhouse-local --input-format RawBLOB --output-format RawBLOB --query "SELECT encodeXMLComponent(*) FROM table" < fatal.log || cat fatal.log
  )</td>
</tr>
</table>
</body>
</html>
EOF
    ;&
esac

# task_exit_code is only assigned by the "fuzz" stage; when the script is
# started directly at "report" there is no task result, so exit with 0 instead
# of tripping over 'set -u'.
exit "${task_exit_code:-0}"