#!/bin/bash

# core.COMM.PID-TID
sysctl kernel.core_pattern='core.%e.%p-%P'

OK="\tOK\t\\N\t"
FAIL="\tFAIL\t\\N\t"

FAILURE_CONTEXT_LINES=100
FAILURE_CONTEXT_MAX_LINE_WIDTH=300

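# OK/FAIL are row suffixes for /test_output/test_results.tsv: appending one of them to a check name
# produces a row of the form "<check name>\t<status>\t\N\t<context>", where \N appears to stand for
# an empty (NULL) third column and the last column holds log context trimmed to FAILURE_CONTEXT_LINES
# lines of at most FAILURE_CONTEXT_MAX_LINE_WIDTH characters.
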
function escaped()
{
    # That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language.
    # Also limit line width, just in case (overly long lines are usually not useful).
    clickhouse local -S 's String' --input-format=LineAsString -q "select substr(s, 1, $FAILURE_CONTEXT_MAX_LINE_WIDTH)
        from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'"
}

function head_escaped()
{
    head -n "$FAILURE_CONTEXT_LINES" "$1" | escaped
}

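# Illustrative example (not part of the test flow), assuming `clickhouse local` is on PATH:
#   printf 'first line\nsecond line\n' | escaped
# should print both lines joined by a literal "\n", each truncated to FAILURE_CONTEXT_MAX_LINE_WIDTH
# characters, so the result can be embedded in a single TSV field; head_escaped applies the same
# escaping to the first FAILURE_CONTEXT_LINES lines of a file.
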
function unts()
{
    # Drop the leading timestamp (as added by `ts`), keeping only the text after it.
    grep -Po "[0-9][0-9]:[0-9][0-9] \K.*"
}

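# Server log lines look roughly like "2024.01.01 00:00:00.000000 [ 12345 ] {query_id} <Error> ...";
# trim_server_logs below keeps only the part starting at the " [ thread_id ] {...}" fragment
# (dropping the timestamp) and escapes it for test_results.tsv.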
function trim_server_logs()
{
    head -n "$FAILURE_CONTEXT_LINES" "/test_output/$1" | grep -Eo " \[ [0-9]+ \] \{.*" | escaped
}

function install_packages()
{
    dpkg -i $1/clickhouse-common-static_*.deb
    dpkg -i $1/clickhouse-common-static-dbg_*.deb
    dpkg -i $1/clickhouse-server_*.deb
    dpkg -i $1/clickhouse-client_*.deb
}

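# Typical usage (the path is illustrative): `install_packages /package_folder`, where the directory
# contains the .deb files produced by the build job.
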
function configure()
{
    # install test configs
    export USE_DATABASE_ORDINARY=1
    export EXPORT_S3_STORAGE_POLICIES=1
    /usr/share/clickhouse-test/config/install.sh

    # avoid too slow startup: take Keeper snapshots more frequently
    sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \
        | sed "s|<snapshot_distance>100000</snapshot_distance>|<snapshot_distance>10000</snapshot_distance>|" \
        > /etc/clickhouse-server/config.d/keeper_port.xml.tmp
    sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
    sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
    sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml

    # for clickhouse-server (via service)
    echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
    # for clickhouse-client
    export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000'
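    # These are standard ASan runtime flags: malloc_context_size caps the number of stack frames
    # recorded per allocation, verbosity=1 makes ASan log its startup configuration, and
    # allocator_release_to_os_interval_ms controls how often freed memory is returned to the OS.
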
    # since we run clickhouse as root
    sudo chown root: /var/lib/clickhouse

    # Update asynchronous metrics more frequently, so that information about real memory usage is refreshed more often (less chance of OOM).
    echo "<clickhouse><asynchronous_metrics_update_period_s>1</asynchronous_metrics_update_period_s></clickhouse>" \
        > /etc/clickhouse-server/config.d/asynchronous_metrics_update_period_s.xml

    local total_mem
    total_mem=$(awk '/MemTotal/ { print $(NF-1) }' /proc/meminfo) # KiB
    total_mem=$(( total_mem*1024 )) # bytes

    # Set maximum memory usage as half of total memory (less chance of OOM).
    #
    # But not via max_server_memory_usage but via max_memory_usage_for_user,
    # so that we can override this setting and execute service queries, like:
    # - hung check
    # - show/drop database
    # - ...
    #
    # So max_memory_usage_for_user will be a soft limit, and
    # max_server_memory_usage will be a hard limit, and queries that should be
    # executed regardless of memory limits will use max_memory_usage_for_user=0
    # instead of relying on max_untracked_memory

    max_server_memory_usage_to_ram_ratio=0.5
    echo "Setting max_server_memory_usage_to_ram_ratio to ${max_server_memory_usage_to_ram_ratio}"
    cat > /etc/clickhouse-server/config.d/max_server_memory_usage.xml <<EOL
<clickhouse>
    <max_server_memory_usage_to_ram_ratio>${max_server_memory_usage_to_ram_ratio}</max_server_memory_usage_to_ram_ratio>
</clickhouse>
EOL

    local max_users_mem
    max_users_mem=$((total_mem*30/100)) # 30%
    echo "Setting max_memory_usage_for_user=$max_users_mem and max_memory_usage for queries to 10G"
    cat > /etc/clickhouse-server/users.d/max_memory_usage_for_user.xml <<EOL
<clickhouse>
    <profiles>
        <default>
            <max_memory_usage>10G</max_memory_usage>
            <max_memory_usage_for_user>${max_users_mem}</max_memory_usage_for_user>
        </default>
    </profiles>
</clickhouse>
EOL

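    # A worked example, assuming 64 GiB of RAM: total_mem = 68719476736 bytes, so the per-user
    # soft limit above is 68719476736*30/100 = 20615843020 bytes (~19.2 GiB), while the server-wide
    # hard limit from max_server_memory_usage_to_ram_ratio=0.5 is ~32 GiB.
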
    cat > /etc/clickhouse-server/config.d/core.xml <<EOL
<clickhouse>
    <core_dump>
        <!-- 100GiB -->
        <size_limit>107374182400</size_limit>
    </core_dump>
    <!-- NOTE: no need to configure core_path,
         since clickhouse is not started as a daemon (via clickhouse start)
    -->
    <core_path>$PWD</core_path>
</clickhouse>
EOL

    # Analyzer is not yet ready for testing.
    # The <readonly/> constraint keeps allow_experimental_analyzer at the profile default, so queries cannot enable it.
    cat > /etc/clickhouse-server/users.d/no_analyzer.xml <<EOL
<clickhouse>
    <profiles>
        <default>
            <constraints>
                <allow_experimental_analyzer>
                    <readonly/>
                </allow_experimental_analyzer>
            </constraints>
        </default>
    </profiles>
</clickhouse>
EOL

}

function stop()
{
    local max_tries="${1:-90}"
    local check_hang="${2:-true}"
    local pid
    # Preserve the pid, since the server can hang after the PID file has been deleted.
    pid="$(cat /var/run/clickhouse-server/clickhouse-server.pid)"

    clickhouse stop --max-tries "$max_tries" --do-not-kill && return

    if [ "$check_hang" == true ]
    then
        # We failed to stop the server with SIGTERM. Maybe it hung; let's collect stacktraces.
        echo -e "Possible deadlock on shutdown (see gdb.log)$FAIL" >> /test_output/test_results.tsv
        kill -TERM "$(pidof gdb)" ||:
        sleep 5
        echo "thread apply all backtrace (on stop)" >> /test_output/gdb.log
        timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$pid" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
        clickhouse stop --force
    fi
}

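# Callers may invoke `stop` with no arguments (90 tries, hang check enabled); something like
# `stop 300 false` would wait longer and skip the hang check (the exact values are illustrative).
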
function start()
{
    counter=0
    until clickhouse-client --query "SELECT 1"
    do
        if [ "$counter" -gt "${1:-120}" ]
        then
            echo "Cannot start clickhouse-server"
            rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||:
            echo -e "Cannot start clickhouse-server$FAIL$(trim_server_logs application_errors.txt)" >> /test_output/test_results.tsv
            cat /var/log/clickhouse-server/stdout.log
            tail -n100 /var/log/clickhouse-server/stderr.log
            tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | rg -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' | tail -n100
            break
        fi
        # use root to match the current uid
        clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>>/var/log/clickhouse-server/stderr.log
        sleep 0.5
        counter=$((counter + 1))
    done

    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to the watchdog,
    # and clickhouse-server can do fork-exec, for example, to run some bridge.
    # Do not set nostop noprint for all signals, because for some of them it may cause gdb to hang;
    # explicitly ignore the non-fatal signals that are used by the server.
    # The number of SIGRTMIN can be determined only at runtime.
    RTMIN=$(kill -l SIGRTMIN)
    echo "
set follow-fork-mode parent
handle SIGHUP nostop noprint pass
handle SIGINT nostop noprint pass
handle SIGQUIT nostop noprint pass
handle SIGPIPE nostop noprint pass
handle SIGTERM nostop noprint pass
handle SIGUSR1 nostop noprint pass
handle SIGUSR2 nostop noprint pass
handle SIG$RTMIN nostop noprint pass
info signals
continue
backtrace full
thread apply all backtrace full
info registers
disassemble /s
up
disassemble /s
up
disassemble /s
p \"done\"
detach
quit
" > script.gdb

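    # The gdb script above passes non-fatal signals through and then runs `continue`; if the server
    # later receives a fatal signal, gdb regains control and the remaining commands dump full
    # backtraces, registers, and disassembly into gdb.log before detaching.
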
    # FIXME Hung check may work incorrectly because of attached gdb
    # 1. False positives are possible
    # 2. We cannot attach another gdb to get stacktraces if some queries hang
    gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
    sleep 5
    # gdb will send SIGSTOP, spend some time loading debug info, and then send SIGCONT; wait for it (up to send_timeout, 300s)
    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
}

function check_server_start()
{
    clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
        || (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \
        && echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \
        >> /test_output/test_results.tsv)

    # Remove file application_errors.txt if it's empty
    [ -s /test_output/application_errors.txt ] || rm /test_output/application_errors.txt
}

function check_logs_for_critical_errors()
{
    # Sanitizer asserts
    rg -Fa "==================" /var/log/clickhouse-server/stderr.log | rg -v "in query:" >> /test_output/tmp
    rg -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
    rg -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null \
        && echo -e "Sanitizer assert (in stderr.log)$FAIL$(head_escaped /test_output/tmp)" >> /test_output/test_results.tsv \
        || echo -e "No sanitizer asserts$OK" >> /test_output/test_results.tsv
    rm -f /test_output/tmp

    # OOM
    rg -Fa " <Fatal> Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \
        && echo -e "Signal 9 in clickhouse-server.log$FAIL" >> /test_output/test_results.tsv \
        || echo -e "No OOM messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv

    # Logical errors
    rg -Fa "Code: 49. DB::Exception: " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/logical_errors.txt \
        && echo -e "Logical error thrown (see clickhouse-server.log or logical_errors.txt)$FAIL$(head_escaped /test_output/logical_errors.txt)" >> /test_output/test_results.tsv \
        || echo -e "No logical errors$OK" >> /test_output/test_results.tsv
    # Remove file logical_errors.txt if it's empty
    [ -s /test_output/logical_errors.txt ] || rm /test_output/logical_errors.txt

    # No such key errors
    rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log > /test_output/no_such_key_errors.txt \
        && echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(trim_server_logs no_such_key_errors.txt)" >> /test_output/test_results.tsv \
        || echo -e "No lost s3 keys$OK" >> /test_output/test_results.tsv

    # Remove file no_such_key_errors.txt if it's empty
    [ -s /test_output/no_such_key_errors.txt ] || rm /test_output/no_such_key_errors.txt

    # Crash
    rg -Fa "########################################" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \
        && echo -e "Killed by signal (in clickhouse-server.log)$FAIL" >> /test_output/test_results.tsv \
        || echo -e "Not crashed$OK" >> /test_output/test_results.tsv

    # Also check for a crash without a stacktrace (printed by the watchdog)
    rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/fatal_messages.txt \
        && echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(trim_server_logs fatal_messages.txt)" >> /test_output/test_results.tsv \
        || echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv

    # Remove file fatal_messages.txt if it's empty
    [ -s /test_output/fatal_messages.txt ] || rm /test_output/fatal_messages.txt

    rg -Fa "########################################" /test_output/* > /dev/null \
        && echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv

    function get_gdb_log_context()
    {
        rg -A50 -Fa " received signal " /test_output/gdb.log | head_escaped
    }

    rg -Fa " received signal " /test_output/gdb.log > /dev/null \
        && echo -e "Found signal in gdb.log$FAIL$(get_gdb_log_context)" >> /test_output/test_results.tsv

    dmesg -T > /test_output/dmesg.log

    # OOM in dmesg -- those are real
    grep -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' /test_output/dmesg.log \
        && echo -e "OOM in dmesg$FAIL$(head_escaped /test_output/dmesg.log)" >> /test_output/test_results.tsv \
        || echo -e "No OOM in dmesg$OK" >> /test_output/test_results.tsv
}

function collect_query_and_trace_logs()
{
    for table in query_log trace_log
    do
        clickhouse-local --path /var/lib/clickhouse/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
    done
}

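# To peek at a collected log afterwards, something like `zstdcat /test_output/query_log.tsv.zst | head`
# should work (the command is shown for illustration only).
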
function collect_core_dumps()
{
    find . -maxdepth 1 -type f -name 'core.*' | while read -r core; do
        zstd --threads=0 "$core"
        mv "$core.zst" /test_output/
    done
}

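# Note: the 'core.*' glob matches the kernel.core_pattern ('core.%e.%p-%P') configured at the top
# of this script, so any core dumps written into the current directory are picked up here.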