mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 23:21:59 +00:00
Make better
This commit is contained in:
parent
6b7d3bf07b
commit
8c6cf28c01
@ -8,230 +8,13 @@ dmesg --clear
|
||||
|
||||
set -x
|
||||
|
||||
# core.COMM.PID-TID
|
||||
sysctl kernel.core_pattern='core.%e.%p-%P'
|
||||
# we mount tests folder from repo to /usr/share
|
||||
ln -s /usr/share/clickhouse-test/ci/stress.py /usr/bin/stress
|
||||
ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
|
||||
|
||||
OK="\tOK\t\\N\t"
|
||||
FAIL="\tFAIL\t\\N\t"
|
||||
|
||||
FAILURE_CONTEXT_LINES=50
|
||||
FAILURE_CONTEXT_MAX_LINE_WIDTH=400
|
||||
|
||||
function escaped()
|
||||
{
|
||||
# That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language.
|
||||
# Also limit lines width just in case (too long lines are not really useful usually)
|
||||
clickhouse local -S 's String' --input-format=LineAsString -q "select substr(s, 1, $FAILURE_CONTEXT_MAX_LINE_WIDTH)
|
||||
from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'"
|
||||
}
|
||||
function head_escaped()
|
||||
{
|
||||
head -n $FAILURE_CONTEXT_LINES $1 | escaped
|
||||
}
|
||||
function unts()
|
||||
{
|
||||
grep -Po "[0-9][0-9]:[0-9][0-9] \K.*"
|
||||
}
|
||||
function trim_server_logs()
|
||||
{
|
||||
head -n $FAILURE_CONTEXT_LINES "/test_output/$1" | grep -Eo " \[ [0-9]+ \] \{.*" | escaped
|
||||
}
|
||||
|
||||
function install_packages()
|
||||
{
|
||||
dpkg -i $1/clickhouse-common-static_*.deb
|
||||
dpkg -i $1/clickhouse-common-static-dbg_*.deb
|
||||
dpkg -i $1/clickhouse-server_*.deb
|
||||
dpkg -i $1/clickhouse-client_*.deb
|
||||
}
|
||||
|
||||
function configure()
|
||||
{
|
||||
# install test configs
|
||||
export USE_DATABASE_ORDINARY=1
|
||||
export EXPORT_S3_STORAGE_POLICIES=1
|
||||
/usr/share/clickhouse-test/config/install.sh
|
||||
|
||||
# we mount tests folder from repo to /usr/share
|
||||
ln -s /usr/share/clickhouse-test/ci/stress.py /usr/bin/stress
|
||||
ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
|
||||
ln -s /usr/share/clickhouse-test/ci/download_release_packages.py /usr/bin/download_release_packages
|
||||
ln -s /usr/share/clickhouse-test/ci/get_previous_release_tag.py /usr/bin/get_previous_release_tag
|
||||
|
||||
# avoid too slow startup
|
||||
sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \
|
||||
| sed "s|<snapshot_distance>100000</snapshot_distance>|<snapshot_distance>10000</snapshot_distance>|" \
|
||||
> /etc/clickhouse-server/config.d/keeper_port.xml.tmp
|
||||
sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
|
||||
sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
|
||||
sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
|
||||
|
||||
# for clickhouse-server (via service)
|
||||
echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
|
||||
# for clickhouse-client
|
||||
export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000'
|
||||
|
||||
# since we run clickhouse from root
|
||||
sudo chown root: /var/lib/clickhouse
|
||||
|
||||
# Set more frequent update period of asynchronous metrics to more frequently update information about real memory usage (less chance of OOM).
|
||||
echo "<clickhouse><asynchronous_metrics_update_period_s>1</asynchronous_metrics_update_period_s></clickhouse>" \
|
||||
> /etc/clickhouse-server/config.d/asynchronous_metrics_update_period_s.xml
|
||||
|
||||
local total_mem
|
||||
total_mem=$(awk '/MemTotal/ { print $(NF-1) }' /proc/meminfo) # KiB
|
||||
total_mem=$(( total_mem*1024 )) # bytes
|
||||
|
||||
# Set maximum memory usage as half of total memory (less chance of OOM).
|
||||
#
|
||||
# But not via max_server_memory_usage but via max_memory_usage_for_user,
|
||||
# so that we can override this setting and execute service queries, like:
|
||||
# - hung check
|
||||
# - show/drop database
|
||||
# - ...
|
||||
#
|
||||
# So max_memory_usage_for_user will be a soft limit, and
|
||||
# max_server_memory_usage will be hard limit, and queries that should be
|
||||
# executed regardless memory limits will use max_memory_usage_for_user=0,
|
||||
# instead of relying on max_untracked_memory
|
||||
|
||||
max_server_memory_usage_to_ram_ratio=0.5
|
||||
echo "Setting max_server_memory_usage_to_ram_ratio to ${max_server_memory_usage_to_ram_ratio}"
|
||||
cat > /etc/clickhouse-server/config.d/max_server_memory_usage.xml <<EOL
|
||||
<clickhouse>
|
||||
<max_server_memory_usage_to_ram_ratio>${max_server_memory_usage_to_ram_ratio}</max_server_memory_usage_to_ram_ratio>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
local max_users_mem
|
||||
max_users_mem=$((total_mem*30/100)) # 30%
|
||||
echo "Setting max_memory_usage_for_user=$max_users_mem and max_memory_usage for queries to 10G"
|
||||
cat > /etc/clickhouse-server/users.d/max_memory_usage_for_user.xml <<EOL
|
||||
<clickhouse>
|
||||
<profiles>
|
||||
<default>
|
||||
<max_memory_usage>10G</max_memory_usage>
|
||||
<max_memory_usage_for_user>${max_users_mem}</max_memory_usage_for_user>
|
||||
</default>
|
||||
</profiles>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
cat > /etc/clickhouse-server/config.d/core.xml <<EOL
|
||||
<clickhouse>
|
||||
<core_dump>
|
||||
<!-- 100GiB -->
|
||||
<size_limit>107374182400</size_limit>
|
||||
</core_dump>
|
||||
<!-- NOTE: no need to configure core_path,
|
||||
since clickhouse is not started as daemon (via clickhouse start)
|
||||
-->
|
||||
<core_path>$PWD</core_path>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
# Let OOM killer terminate other processes before clickhouse-server:
|
||||
cat > /etc/clickhouse-server/config.d/oom_score.xml <<EOL
|
||||
<clickhouse>
|
||||
<oom_score>-1000</oom_score>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
# Analyzer is not yet ready for testing
|
||||
cat > /etc/clickhouse-server/users.d/no_analyzer.xml <<EOL
|
||||
<clickhouse>
|
||||
<profiles>
|
||||
<default>
|
||||
<constraints>
|
||||
<allow_experimental_analyzer>
|
||||
<readonly/>
|
||||
</allow_experimental_analyzer>
|
||||
</constraints>
|
||||
</default>
|
||||
</profiles>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
}
|
||||
|
||||
function stop()
|
||||
{
|
||||
local max_tries="${1:-90}"
|
||||
local pid
|
||||
# Preserve the pid, since the server can hung after the PID will be deleted.
|
||||
pid="$(cat /var/run/clickhouse-server/clickhouse-server.pid)"
|
||||
|
||||
clickhouse stop --max-tries "$max_tries" --do-not-kill && return
|
||||
|
||||
# We failed to stop the server with SIGTERM. Maybe it hang, let's collect stacktraces.
|
||||
echo -e "Possible deadlock on shutdown (see gdb.log)$FAIL" >> /test_output/test_results.tsv
|
||||
kill -TERM "$(pidof gdb)" ||:
|
||||
sleep 5
|
||||
echo "thread apply all backtrace (on stop)" >> /test_output/gdb.log
|
||||
timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$pid" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
|
||||
clickhouse stop --force
|
||||
}
|
||||
|
||||
function start()
|
||||
{
|
||||
counter=0
|
||||
until clickhouse-client --query "SELECT 1"
|
||||
do
|
||||
if [ "$counter" -gt ${1:-120} ]
|
||||
then
|
||||
echo "Cannot start clickhouse-server"
|
||||
rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||:
|
||||
echo -e "Cannot start clickhouse-server$FAIL$(trim_server_logs application_errors.txt)" >> /test_output/test_results.tsv
|
||||
cat /var/log/clickhouse-server/stdout.log
|
||||
tail -n100 /var/log/clickhouse-server/stderr.log
|
||||
tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | rg -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' | tail -n100
|
||||
break
|
||||
fi
|
||||
# use root to match with current uid
|
||||
clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>>/var/log/clickhouse-server/stderr.log
|
||||
sleep 0.5
|
||||
counter=$((counter + 1))
|
||||
done
|
||||
|
||||
# Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
|
||||
# and clickhouse-server can do fork-exec, for example, to run some bridge.
|
||||
# Do not set nostop noprint for all signals, because some it may cause gdb to hang,
|
||||
# explicitly ignore non-fatal signals that are used by server.
|
||||
# Number of SIGRTMIN can be determined only in runtime.
|
||||
RTMIN=$(kill -l SIGRTMIN)
|
||||
echo "
|
||||
set follow-fork-mode parent
|
||||
handle SIGHUP nostop noprint pass
|
||||
handle SIGINT nostop noprint pass
|
||||
handle SIGQUIT nostop noprint pass
|
||||
handle SIGPIPE nostop noprint pass
|
||||
handle SIGTERM nostop noprint pass
|
||||
handle SIGUSR1 nostop noprint pass
|
||||
handle SIGUSR2 nostop noprint pass
|
||||
handle SIG$RTMIN nostop noprint pass
|
||||
info signals
|
||||
continue
|
||||
backtrace full
|
||||
thread apply all backtrace full
|
||||
info registers
|
||||
disassemble /s
|
||||
up
|
||||
disassemble /s
|
||||
up
|
||||
disassemble /s
|
||||
p \"done\"
|
||||
detach
|
||||
quit
|
||||
" > script.gdb
|
||||
|
||||
# FIXME Hung check may work incorrectly because of attached gdb
|
||||
# 1. False positives are possible
|
||||
# 2. We cannot attach another gdb to get stacktraces if some queries hung
|
||||
gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
|
||||
sleep 5
|
||||
# gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
|
||||
time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
|
||||
}
|
||||
# Stress tests and upgrade check uses similar code that was placed
|
||||
# in a separate bash library. See tests/ci/stress_tests.lib
|
||||
source /usr/share/clickhouse-test/ci/stress_tests.lib
|
||||
|
||||
install_packages package_folder
|
||||
|
||||
@ -414,13 +197,7 @@ unset "${!THREAD_@}"
|
||||
|
||||
start
|
||||
|
||||
clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
|
||||
|| (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \
|
||||
&& echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \
|
||||
>> /test_output/test_results.tsv)
|
||||
|
||||
# Remove file application_errors.txt if it's empty
|
||||
[ -s /test_output/application_errors.txt ] || rm /test_output/application_errors.txt
|
||||
check_server_start
|
||||
|
||||
stop
|
||||
|
||||
@ -430,71 +207,11 @@ stop
|
||||
mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.final.log
|
||||
|
||||
# Grep logs for sanitizer asserts, crashes and other critical errors
|
||||
check_logs_for_critical_errors
|
||||
|
||||
# Sanitizer asserts
|
||||
rg -Fa "==================" /var/log/clickhouse-server/stderr.log | rg -v "in query:" >> /test_output/tmp
|
||||
rg -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
|
||||
rg -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null \
|
||||
&& echo -e "Sanitizer assert (in stderr.log)$FAIL$(head_escaped /test_output/tmp)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No sanitizer asserts$OK" >> /test_output/test_results.tsv
|
||||
rm -f /test_output/tmp
|
||||
tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||:
|
||||
|
||||
# OOM
|
||||
rg -Fa " <Fatal> Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \
|
||||
&& echo -e "Signal 9 in clickhouse-server.log$FAIL" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No OOM messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# Logical errors
|
||||
rg -Fa "Code: 49. DB::Exception: " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/logical_errors.txt \
|
||||
&& echo -e "Logical error thrown (see clickhouse-server.log or logical_errors.txt)$FAIL$(head_escaped /test_output/logical_errors.txt)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No logical errors$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# Remove file logical_errors.txt if it's empty
|
||||
[ -s /test_output/logical_errors.txt ] || rm /test_output/logical_errors.txt
|
||||
|
||||
# No such key errors
|
||||
rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log > /test_output/no_such_key_errors.txt \
|
||||
&& echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(trim_server_logs no_such_key_errors.txt)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No lost s3 keys$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# Remove file no_such_key_errors.txt if it's empty
|
||||
[ -s /test_output/no_such_key_errors.txt ] || rm /test_output/no_such_key_errors.txt
|
||||
|
||||
# Crash
|
||||
rg -Fa "########################################" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \
|
||||
&& echo -e "Killed by signal (in clickhouse-server.log)$FAIL" >> /test_output/test_results.tsv \
|
||||
|| echo -e "Not crashed$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# It also checks for crash without stacktrace (printed by watchdog)
|
||||
rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/fatal_messages.txt \
|
||||
&& echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(trim_server_logs fatal_messages.txt)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# Remove file fatal_messages.txt if it's empty
|
||||
[ -s /test_output/fatal_messages.txt ] || rm /test_output/fatal_messages.txt
|
||||
|
||||
rg -Fa "########################################" /test_output/* > /dev/null \
|
||||
&& echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv
|
||||
|
||||
function get_gdb_log_context()
|
||||
{
|
||||
rg -A50 -Fa " received signal " /test_output/gdb.log | head_escaped
|
||||
}
|
||||
|
||||
rg -Fa " received signal " /test_output/gdb.log > /dev/null \
|
||||
&& echo -e "Found signal in gdb.log$FAIL$(get_gdb_log_context)" >> /test_output/test_results.tsv
|
||||
|
||||
for table in query_log trace_log
|
||||
do
|
||||
clickhouse-local --path /var/lib/clickhouse/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
|
||||
done
|
||||
|
||||
dmesg -T > /test_output/dmesg.log
|
||||
|
||||
# OOM in dmesg -- those are real
|
||||
grep -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' /test_output/dmesg.log \
|
||||
&& echo -e "OOM in dmesg$FAIL$(head_escaped /test_output/dmesg.log)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No OOM in dmesg$OK" >> /test_output/test_results.tsv
|
||||
collect_query_and_trace_logs
|
||||
|
||||
mv /var/log/clickhouse-server/stderr.log /test_output/
|
||||
|
||||
@ -514,8 +231,4 @@ rowNumberInAllBlocks()
|
||||
LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv || echo "failure\tCannot parse test_results.tsv" > /test_output/check_status.tsv
|
||||
[ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv
|
||||
|
||||
# Core dumps
|
||||
find . -type f -maxdepth 1 -name 'core.*' | while read core; do
|
||||
zstd --threads=0 $core
|
||||
mv $core.zst /test_output/
|
||||
done
|
||||
collect_core_dumps
|
||||
|
@ -3,267 +3,22 @@
|
||||
# shellcheck disable=SC2086
|
||||
# shellcheck disable=SC2024
|
||||
|
||||
# This script is similar to script for common stress test
|
||||
|
||||
# Avoid overlaps with previous runs
|
||||
dmesg --clear
|
||||
|
||||
set -x
|
||||
|
||||
# core.COMM.PID-TID
|
||||
sysctl kernel.core_pattern='core.%e.%p-%P'
|
||||
|
||||
OK="\tOK\t\\N\t"
|
||||
FAIL="\tFAIL\t\\N\t"
|
||||
|
||||
FAILURE_CONTEXT_LINES=50
|
||||
FAILURE_CONTEXT_MAX_LINE_WIDTH=400
|
||||
|
||||
function escaped()
|
||||
{
|
||||
# That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language.
|
||||
# Also limit lines width just in case (too long lines are not really useful usually)
|
||||
clickhouse local -S 's String' --input-format=LineAsString -q "select substr(s, 1, $FAILURE_CONTEXT_MAX_LINE_WIDTH)
|
||||
from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'"
|
||||
}
|
||||
function head_escaped()
|
||||
{
|
||||
head -n $FAILURE_CONTEXT_LINES $1 | escaped
|
||||
}
|
||||
function unts()
|
||||
{
|
||||
grep -Po "[0-9][0-9]:[0-9][0-9] \K.*"
|
||||
}
|
||||
function trim_server_logs()
|
||||
{
|
||||
head -n $FAILURE_CONTEXT_LINES "/test_output/$1" | grep -Eo " \[ [0-9]+ \] \{.*" | escaped
|
||||
}
|
||||
|
||||
function install_packages()
|
||||
{
|
||||
dpkg -i $1/clickhouse-common-static_*.deb
|
||||
dpkg -i $1/clickhouse-common-static-dbg_*.deb
|
||||
dpkg -i $1/clickhouse-server_*.deb
|
||||
dpkg -i $1/clickhouse-client_*.deb
|
||||
}
|
||||
|
||||
function configure()
|
||||
{
|
||||
# install test configs
|
||||
export USE_DATABASE_ORDINARY=1
|
||||
export EXPORT_S3_STORAGE_POLICIES=1
|
||||
/usr/share/clickhouse-test/config/install.sh
|
||||
|
||||
# avoid too slow startup
|
||||
sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \
|
||||
| sed "s|<snapshot_distance>100000</snapshot_distance>|<snapshot_distance>10000</snapshot_distance>|" \
|
||||
> /etc/clickhouse-server/config.d/keeper_port.xml.tmp
|
||||
sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
|
||||
sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
|
||||
sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
|
||||
|
||||
# for clickhouse-server (via service)
|
||||
echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
|
||||
# for clickhouse-client
|
||||
export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000'
|
||||
|
||||
# since we run clickhouse from root
|
||||
sudo chown root: /var/lib/clickhouse
|
||||
|
||||
# Set more frequent update period of asynchronous metrics to more frequently update information about real memory usage (less chance of OOM).
|
||||
echo "<clickhouse><asynchronous_metrics_update_period_s>1</asynchronous_metrics_update_period_s></clickhouse>" \
|
||||
> /etc/clickhouse-server/config.d/asynchronous_metrics_update_period_s.xml
|
||||
|
||||
|
||||
local total_mem
|
||||
total_mem=$(awk '/MemTotal/ { print $(NF-1) }' /proc/meminfo) # KiB
|
||||
total_mem=$(( total_mem*1024 )) # bytes
|
||||
|
||||
# Set maximum memory usage as half of total memory (less chance of OOM).
|
||||
#
|
||||
# But not via max_server_memory_usage but via max_memory_usage_for_user,
|
||||
# so that we can override this setting and execute service queries, like:
|
||||
# - hung check
|
||||
# - show/drop database
|
||||
# - ...
|
||||
#
|
||||
# So max_memory_usage_for_user will be a soft limit, and
|
||||
# max_server_memory_usage will be hard limit, and queries that should be
|
||||
# executed regardless memory limits will use max_memory_usage_for_user=0,
|
||||
# instead of relying on max_untracked_memory
|
||||
|
||||
max_server_memory_usage_to_ram_ratio=0.5
|
||||
echo "Setting max_server_memory_usage_to_ram_ratio to ${max_server_memory_usage_to_ram_ratio}"
|
||||
cat > /etc/clickhouse-server/config.d/max_server_memory_usage.xml <<EOL
|
||||
<clickhouse>
|
||||
<max_server_memory_usage_to_ram_ratio>${max_server_memory_usage_to_ram_ratio}</max_server_memory_usage_to_ram_ratio>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
local max_users_mem
|
||||
max_users_mem=$((total_mem*30/100)) # 30%
|
||||
echo "Setting max_memory_usage_for_user=$max_users_mem and max_memory_usage for queries to 10G"
|
||||
cat > /etc/clickhouse-server/users.d/max_memory_usage_for_user.xml <<EOL
|
||||
<clickhouse>
|
||||
<profiles>
|
||||
<default>
|
||||
<max_memory_usage>10G</max_memory_usage>
|
||||
<max_memory_usage_for_user>${max_users_mem}</max_memory_usage_for_user>
|
||||
</default>
|
||||
</profiles>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
cat > /etc/clickhouse-server/config.d/core.xml <<EOL
|
||||
<clickhouse>
|
||||
<core_dump>
|
||||
<!-- 100GiB -->
|
||||
<size_limit>107374182400</size_limit>
|
||||
</core_dump>
|
||||
<!-- NOTE: no need to configure core_path,
|
||||
since clickhouse is not started as daemon (via clickhouse start)
|
||||
-->
|
||||
<core_path>$PWD</core_path>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
# Let OOM killer terminate other processes before clickhouse-server:
|
||||
cat > /etc/clickhouse-server/config.d/oom_score.xml <<EOL
|
||||
<clickhouse>
|
||||
<oom_score>-1000</oom_score>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
# Analyzer is not yet ready for testing
|
||||
cat > /etc/clickhouse-server/users.d/no_analyzer.xml <<EOL
|
||||
<clickhouse>
|
||||
<profiles>
|
||||
<default>
|
||||
<constraints>
|
||||
<allow_experimental_analyzer>
|
||||
<readonly/>
|
||||
</allow_experimental_analyzer>
|
||||
</constraints>
|
||||
</default>
|
||||
</profiles>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
}
|
||||
|
||||
function stop()
|
||||
{
|
||||
local max_tries="${1:-90}"
|
||||
local pid
|
||||
# Preserve the pid, since the server can hung after the PID will be deleted.
|
||||
pid="$(cat /var/run/clickhouse-server/clickhouse-server.pid)"
|
||||
|
||||
clickhouse stop --max-tries "$max_tries" --do-not-kill && return
|
||||
|
||||
# We failed to stop the server with SIGTERM. Maybe it hang, let's collect stacktraces.
|
||||
echo -e "Possible deadlock on shutdown (see gdb.log)$FAIL" >> /test_output/test_results.tsv
|
||||
kill -TERM "$(pidof gdb)" ||:
|
||||
sleep 5
|
||||
echo "thread apply all backtrace (on stop)" >> /test_output/gdb.log
|
||||
timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$pid" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
|
||||
clickhouse stop --force
|
||||
}
|
||||
|
||||
function start()
|
||||
{
|
||||
counter=0
|
||||
until clickhouse-client --query "SELECT 1"
|
||||
do
|
||||
if [ "$counter" -gt ${1:-120} ]
|
||||
then
|
||||
echo "Cannot start clickhouse-server"
|
||||
rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||:
|
||||
echo -e "Cannot start clickhouse-server$FAIL$(trim_server_logs application_errors.txt)" >> /test_output/test_results.tsv
|
||||
cat /var/log/clickhouse-server/stdout.log
|
||||
tail -n100 /var/log/clickhouse-server/stderr.log
|
||||
tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | rg -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' | tail -n100
|
||||
break
|
||||
fi
|
||||
# use root to match with current uid
|
||||
clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>>/var/log/clickhouse-server/stderr.log
|
||||
sleep 0.5
|
||||
counter=$((counter + 1))
|
||||
done
|
||||
|
||||
# Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
|
||||
# and clickhouse-server can do fork-exec, for example, to run some bridge.
|
||||
# Do not set nostop noprint for all signals, because some it may cause gdb to hang,
|
||||
# explicitly ignore non-fatal signals that are used by server.
|
||||
# Number of SIGRTMIN can be determined only in runtime.
|
||||
RTMIN=$(kill -l SIGRTMIN)
|
||||
echo "
|
||||
set follow-fork-mode parent
|
||||
handle SIGHUP nostop noprint pass
|
||||
handle SIGINT nostop noprint pass
|
||||
handle SIGQUIT nostop noprint pass
|
||||
handle SIGPIPE nostop noprint pass
|
||||
handle SIGTERM nostop noprint pass
|
||||
handle SIGUSR1 nostop noprint pass
|
||||
handle SIGUSR2 nostop noprint pass
|
||||
handle SIG$RTMIN nostop noprint pass
|
||||
info signals
|
||||
continue
|
||||
backtrace full
|
||||
thread apply all backtrace full
|
||||
info registers
|
||||
disassemble /s
|
||||
up
|
||||
disassemble /s
|
||||
up
|
||||
disassemble /s
|
||||
p \"done\"
|
||||
detach
|
||||
quit
|
||||
" > script.gdb
|
||||
|
||||
# FIXME Hung check may work incorrectly because of attached gdb
|
||||
# 1. False positives are possible
|
||||
# 2. We cannot attach another gdb to get stacktraces if some queries hung
|
||||
gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
|
||||
sleep 5
|
||||
# gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
|
||||
time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
|
||||
}
|
||||
|
||||
# Thread Fuzzer allows to check more permutations of possible thread scheduling
|
||||
# and find more potential issues.
|
||||
# Temporarily disable ThreadFuzzer with tsan because of https://github.com/google/sanitizers/issues/1540
|
||||
is_tsan_build=$(clickhouse local -q "select value like '% -fsanitize=thread %' from system.build_options where name='CXX_FLAGS'")
|
||||
if [ "$is_tsan_build" -eq "0" ]; then
|
||||
export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
|
||||
export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
|
||||
export THREAD_FUZZER_SLEEP_TIME_US=100000
|
||||
|
||||
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
|
||||
export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
|
||||
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1
|
||||
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1
|
||||
|
||||
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001
|
||||
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
|
||||
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
|
||||
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
|
||||
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000
|
||||
|
||||
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
|
||||
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
|
||||
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
|
||||
fi
|
||||
|
||||
azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log &
|
||||
./setup_minio.sh stateless # to have a proper environment
|
||||
|
||||
# we mount tests folder from repo to /usr/share
|
||||
ln -s /usr/share/clickhouse-test/ci/stress.py /usr/bin/stress
|
||||
ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
|
||||
ln -s /usr/share/clickhouse-test/ci/download_release_packages.py /usr/bin/download_release_packages
|
||||
ln -s /usr/share/clickhouse-test/ci/get_previous_release_tag.py /usr/bin/get_previous_release_tag
|
||||
|
||||
source /usr/share/clickhouse-test/ci/stress_tests.lib
|
||||
|
||||
azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log &
|
||||
./setup_minio.sh stateless # to have a proper environment
|
||||
|
||||
echo "Get previous release tag"
|
||||
previous_release_tag=$(dpkg --info package_folder/clickhouse-client*.deb | grep "Version: " | awk '{print $2}' | cut -f1 -d'+' | get_previous_release_tag)
|
||||
echo $previous_release_tag
|
||||
@ -274,224 +29,161 @@ git clone https://github.com/ClickHouse/ClickHouse.git --no-tags --progress --br
|
||||
echo "Download clickhouse-server from the previous release"
|
||||
mkdir previous_release_package_folder
|
||||
|
||||
echo $previous_release_tag | download_release_packages && echo -e 'Download script exit code\tOK' >> /test_output/test_results.tsv \
|
||||
|| echo -e 'Download script failed\tFAIL' >> /test_output/test_results.tsv
|
||||
echo $previous_release_tag | download_release_packages && echo -e "Download script exit code$OK" >> /test_output/test_results.tsv \
|
||||
|| echo -e "Download script failed$FAIL" >> /test_output/test_results.tsv
|
||||
|
||||
# Check if we cloned previous release repository successfully
|
||||
if ! [ "$(ls -A previous_release_repository/tests/queries)" ]
|
||||
then
|
||||
echo -e "Failed to clone previous release tests\tFAIL" >> /test_output/test_results.tsv
|
||||
echo -e 'failure\tFailed to clone previous release tests' > /test_output/check_status.tsv
|
||||
exit
|
||||
elif ! [ "$(ls -A previous_release_package_folder/clickhouse-common-static_*.deb && ls -A previous_release_package_folder/clickhouse-server_*.deb)" ]
|
||||
then
|
||||
echo -e "Failed to download previous release packages\tFAIL" >> /test_output/test_results.tsv
|
||||
else
|
||||
echo -e "Successfully cloned previous release tests\tOK" >> /test_output/test_results.tsv
|
||||
echo -e "Successfully downloaded previous release packages\tOK" >> /test_output/test_results.tsv
|
||||
|
||||
# Make upgrade check more funny by forcing Ordinary engine for system database
|
||||
mkdir /var/lib/clickhouse/metadata
|
||||
echo "ATTACH DATABASE system ENGINE=Ordinary" > /var/lib/clickhouse/metadata/system.sql
|
||||
|
||||
# Install previous release packages
|
||||
install_packages previous_release_package_folder
|
||||
|
||||
# Start server from previous release
|
||||
# Let's enable S3 storage by default
|
||||
export USE_S3_STORAGE_FOR_MERGE_TREE=1
|
||||
# Previous version may not be ready for fault injections
|
||||
export ZOOKEEPER_FAULT_INJECTION=0
|
||||
configure
|
||||
|
||||
# But we still need default disk because some tables loaded only into it
|
||||
sudo cat /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml \
|
||||
| sed "s|<main><disk>s3</disk></main>|<main><disk>s3</disk></main><default><disk>default</disk></default>|" \
|
||||
> /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp mv /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
|
||||
sudo chown clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
|
||||
sudo chgrp clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
|
||||
|
||||
# Avoid "Setting s3_check_objects_after_upload is neither a builtin setting..."
|
||||
rm -f /etc/clickhouse-server/users.d/enable_blobs_check.xml ||:
|
||||
rm -f /etc/clickhouse-server/users.d/marks.xml ||:
|
||||
|
||||
# Remove s3 related configs to avoid "there is no disk type `cache`"
|
||||
rm -f /etc/clickhouse-server/config.d/storage_conf.xml ||:
|
||||
rm -f /etc/clickhouse-server/config.d/azure_storage_conf.xml ||:
|
||||
|
||||
# Turn on after 22.12
|
||||
rm -f /etc/clickhouse-server/config.d/compressed_marks_and_index.xml ||:
|
||||
# it uses recently introduced settings which previous versions may not have
|
||||
rm -f /etc/clickhouse-server/users.d/insert_keeper_retries.xml ||:
|
||||
|
||||
start
|
||||
|
||||
clickhouse-client --query="SELECT 'Server version: ', version()"
|
||||
|
||||
mkdir tmp_stress_output
|
||||
|
||||
stress --test-cmd="/usr/bin/clickhouse-test --queries=\"previous_release_repository/tests/queries\"" --upgrade-check --output-folder tmp_stress_output --global-time-limit=1200 \
|
||||
&& echo -e "Test script exit code$OK" >> /test_output/test_results.tsv \
|
||||
|| echo -e "Test script failed$FAIL script exit code: $?" >> /test_output/test_results.tsv
|
||||
|
||||
rm -rf tmp_stress_output
|
||||
|
||||
# We experienced deadlocks in this command in very rare cases. Let's debug it:
|
||||
timeout 10m clickhouse-client --query="SELECT 'Tables count:', count() FROM system.tables" ||
|
||||
(
|
||||
echo "thread apply all backtrace (on select tables count)" >> /test_output/gdb.log
|
||||
timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
|
||||
clickhouse stop --force
|
||||
)
|
||||
|
||||
# Use bigger timeout for previous version
|
||||
stop 300
|
||||
mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.stress.log
|
||||
|
||||
# Install and start new server
|
||||
install_packages package_folder
|
||||
# Disable fault injections on start (we don't test them here, and it can lead to tons of requests in case of huge number of tables).
|
||||
export ZOOKEEPER_FAULT_INJECTION=0
|
||||
configure
|
||||
start 500
|
||||
clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
|
||||
|| (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \
|
||||
&& echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \
|
||||
>> /test_output/test_results.tsv)
|
||||
|
||||
# Remove file application_errors.txt if it's empty
|
||||
[ -s /test_output/application_errors.txt ] || rm /test_output/application_errors.txt
|
||||
|
||||
clickhouse-client --query="SELECT 'Server version: ', version()"
|
||||
|
||||
# Let the server run for a while before checking log.
|
||||
sleep 60
|
||||
|
||||
stop
|
||||
mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.upgrade.log
|
||||
|
||||
# Error messages (we should ignore some errors)
|
||||
# FIXME https://github.com/ClickHouse/ClickHouse/issues/38643 ("Unknown index: idx.")
|
||||
# FIXME https://github.com/ClickHouse/ClickHouse/issues/39174 ("Cannot parse string 'Hello' as UInt64")
|
||||
# FIXME Not sure if it's expected, but some tests from stress test may not be finished yet when we restarting server.
|
||||
# Let's just ignore all errors from queries ("} <Error> TCPHandler: Code:", "} <Error> executeQuery: Code:")
|
||||
# FIXME https://github.com/ClickHouse/ClickHouse/issues/39197 ("Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'")
|
||||
# NOTE Incompatibility was introduced in https://github.com/ClickHouse/ClickHouse/pull/39263, it's expected
|
||||
# ("This engine is deprecated and is not supported in transactions", "[Queue = DB::MergeMutateRuntimeQueue]: Code: 235. DB::Exception: Part")
|
||||
# FIXME https://github.com/ClickHouse/ClickHouse/issues/39174 - bad mutation does not indicate backward incompatibility
|
||||
echo "Check for Error messages in server log:"
|
||||
rg -Fav -e "Code: 236. DB::Exception: Cancelled merging parts" \
|
||||
-e "Code: 236. DB::Exception: Cancelled mutating parts" \
|
||||
-e "REPLICA_IS_ALREADY_ACTIVE" \
|
||||
-e "REPLICA_ALREADY_EXISTS" \
|
||||
-e "ALL_REPLICAS_LOST" \
|
||||
-e "DDLWorker: Cannot parse DDL task query" \
|
||||
-e "RaftInstance: failed to accept a rpc connection due to error 125" \
|
||||
-e "UNKNOWN_DATABASE" \
|
||||
-e "NETWORK_ERROR" \
|
||||
-e "UNKNOWN_TABLE" \
|
||||
-e "ZooKeeperClient" \
|
||||
-e "KEEPER_EXCEPTION" \
|
||||
-e "DirectoryMonitor" \
|
||||
-e "TABLE_IS_READ_ONLY" \
|
||||
-e "Code: 1000, e.code() = 111, Connection refused" \
|
||||
-e "UNFINISHED" \
|
||||
-e "NETLINK_ERROR" \
|
||||
-e "Renaming unexpected part" \
|
||||
-e "PART_IS_TEMPORARILY_LOCKED" \
|
||||
-e "and a merge is impossible: we didn't find" \
|
||||
-e "found in queue and some source parts for it was lost" \
|
||||
-e "is lost forever." \
|
||||
-e "Unknown index: idx." \
|
||||
-e "Cannot parse string 'Hello' as UInt64" \
|
||||
-e "} <Error> TCPHandler: Code:" \
|
||||
-e "} <Error> executeQuery: Code:" \
|
||||
-e "Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'" \
|
||||
-e "This engine is deprecated and is not supported in transactions" \
|
||||
-e "[Queue = DB::MergeMutateRuntimeQueue]: Code: 235. DB::Exception: Part" \
|
||||
-e "The set of parts restored in place of" \
|
||||
-e "(ReplicatedMergeTreeAttachThread): Initialization failed. Error" \
|
||||
-e "Code: 269. DB::Exception: Destination table is myself" \
|
||||
-e "Coordination::Exception: Connection loss" \
|
||||
-e "MutateFromLogEntryTask" \
|
||||
-e "No connection to ZooKeeper, cannot get shared table ID" \
|
||||
-e "Session expired" \
|
||||
-e "TOO_MANY_PARTS" \
|
||||
-e "Authentication failed" \
|
||||
-e "Container already exists" \
|
||||
/var/log/clickhouse-server/clickhouse-server.upgrade.log | zgrep -Fa "<Error>" > /test_output/upgrade_error_messages.txt \
|
||||
&& echo -e "Error message in clickhouse-server.log (see upgrade_error_messages.txt)$FAIL$(head_escaped /test_output/bc_check_error_messages.txt)" \
|
||||
>> /test_output/test_results.tsv \
|
||||
|| echo -e "No Error messages after server upgrade$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# Remove file bc_check_error_messages.txt if it's empty
|
||||
[ -s /test_output/upgrade_error_messages.txt ] || rm /test_output/upgrade_error_messages.txt
|
||||
|
||||
# Sanitizer asserts
|
||||
rg -Fa "==================" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
|
||||
rg -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
|
||||
rg -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null \
|
||||
&& echo -e "Sanitizer assert (in stderr.log)$FAIL$(head_escaped /test_output/tmp)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No sanitizer asserts$OK" >> /test_output/test_results.tsv
|
||||
rm -f /test_output/tmp
|
||||
|
||||
# OOM
|
||||
rg -Fa " <Fatal> Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server.*.log > /dev/null \
|
||||
&& echo -e "Signal 9 in clickhouse-server.log$FAIL" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No OOM messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# Logical errors
|
||||
echo "Check for Logical errors in server log:"
|
||||
rg -Fa -A20 "Code: 49, e.displayText() = DB::Exception:" /var/log/clickhouse-server/clickhouse-server.*.log > /test_output/logical_errors.txt \
|
||||
&& echo -e "Logical error thrown (see clickhouse-server.log or logical_errors.txt)$FAIL$(head_escaped /test_output/logical_errors.txt)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No logical errors$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# Remove file logical_errors.txt if it's empty
|
||||
[ -s /test_output/logical_errors.txt ] || rm /test_output/logical_errors.txt
|
||||
|
||||
# Crash
|
||||
rg -Fa "########################################" /var/log/clickhouse-server/clickhouse-server.*.log > /dev/null \
|
||||
&& echo -e "Killed by signal (in clickhouse-server.log)$FAIL" >> /test_output/test_results.tsv \
|
||||
|| echo -e "Not crashed$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# It also checks for crash without stacktrace (printed by watchdog)
|
||||
echo "Check for Fatal message in server log:"
|
||||
rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server.*.log > /test_output/fatal_messages.txt \
|
||||
&& echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(trim_server_logs fatal_messages.txt)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# Remove file fatal_messages.txt if it's empty
|
||||
[ -s /test_output/fatal_messages.txt ] || rm /test_output/fatal_messages.txt
|
||||
|
||||
rg -Fa "########################################" /test_output/* > /dev/null \
|
||||
&& echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv
|
||||
|
||||
tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||:
|
||||
for table in query_log trace_log
|
||||
do
|
||||
clickhouse-local --path /var/lib/clickhouse/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" \
|
||||
| zstd --threads=0 > /test_output/$table.tsv.zst ||:
|
||||
done
|
||||
echo -e 'failure\tFailed to download previous release packages' > /test_output/check_status.tsv
|
||||
exit
|
||||
fi
|
||||
|
||||
dmesg -T > /test_output/dmesg.log
|
||||
echo -e "Successfully cloned previous release tests$OK" >> /test_output/test_results.tsv
|
||||
echo -e "Successfully downloaded previous release packages$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# OOM in dmesg -- those are real
|
||||
grep -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' /test_output/dmesg.log \
|
||||
&& echo -e "OOM in dmesg$FAIL$(head_escaped /test_output/dmesg.log)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No OOM in dmesg$OK" >> /test_output/test_results.tsv
|
||||
# Make upgrade check more funny by forcing Ordinary engine for system database
|
||||
mkdir /var/lib/clickhouse/metadata
|
||||
echo "ATTACH DATABASE system ENGINE=Ordinary" > /var/lib/clickhouse/metadata/system.sql
|
||||
|
||||
# Install previous release packages
|
||||
install_packages previous_release_package_folder
|
||||
|
||||
# Start server from previous release
|
||||
# Let's enable S3 storage by default
|
||||
export USE_S3_STORAGE_FOR_MERGE_TREE=1
|
||||
# Previous version may not be ready for fault injections
|
||||
export ZOOKEEPER_FAULT_INJECTION=0
|
||||
configure
|
||||
|
||||
# But we still need default disk because some tables loaded only into it
|
||||
sudo cat /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml \
|
||||
| sed "s|<main><disk>s3</disk></main>|<main><disk>s3</disk></main><default><disk>default</disk></default>|" \
|
||||
> /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp mv /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
|
||||
sudo chown clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
|
||||
sudo chgrp clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
|
||||
|
||||
start
|
||||
|
||||
clickhouse-client --query="SELECT 'Server version: ', version()"
|
||||
|
||||
mkdir tmp_stress_output
|
||||
|
||||
stress --test-cmd="/usr/bin/clickhouse-test --queries=\"previous_release_repository/tests/queries\"" --upgrade-check --output-folder tmp_stress_output --global-time-limit=1200 \
|
||||
&& echo -e "Test script exit code$OK" >> /test_output/test_results.tsv \
|
||||
|| echo -e "Test script failed$FAIL script exit code: $?" >> /test_output/test_results.tsv
|
||||
|
||||
rm -rf tmp_stress_output
|
||||
|
||||
# We experienced deadlocks in this command in very rare cases. Let's debug it:
|
||||
timeout 10m clickhouse-client --query="SELECT 'Tables count:', count() FROM system.tables" ||
|
||||
(
|
||||
echo "thread apply all backtrace (on select tables count)" >> /test_output/gdb.log
|
||||
timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
|
||||
clickhouse stop --force
|
||||
)
|
||||
|
||||
# Use bigger timeout for previous version and disable additional hang check
|
||||
stop 300 false
|
||||
mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.stress.log
|
||||
|
||||
# Install and start new server
|
||||
install_packages package_folder
|
||||
# Disable fault injections on start (we don't test them here, and it can lead to tons of requests in case of huge number of tables).
|
||||
export ZOOKEEPER_FAULT_INJECTION=0
|
||||
configure
|
||||
start 500
|
||||
clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
|
||||
|| (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \
|
||||
&& echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \
|
||||
>> /test_output/test_results.tsv)
|
||||
|
||||
# Remove file application_errors.txt if it's empty
|
||||
[ -s /test_output/application_errors.txt ] || rm /test_output/application_errors.txt
|
||||
|
||||
clickhouse-client --query="SELECT 'Server version: ', version()"
|
||||
|
||||
# Let the server run for a while before checking log.
|
||||
sleep 60
|
||||
|
||||
stop
|
||||
mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.upgrade.log
|
||||
|
||||
# Error messages (we should ignore some errors)
|
||||
# FIXME https://github.com/ClickHouse/ClickHouse/issues/38643 ("Unknown index: idx.")
|
||||
# FIXME https://github.com/ClickHouse/ClickHouse/issues/39174 ("Cannot parse string 'Hello' as UInt64")
|
||||
# FIXME Not sure if it's expected, but some tests from stress test may not be finished yet when we restarting server.
|
||||
# Let's just ignore all errors from queries ("} <Error> TCPHandler: Code:", "} <Error> executeQuery: Code:")
|
||||
# FIXME https://github.com/ClickHouse/ClickHouse/issues/39197 ("Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'")
|
||||
# NOTE Incompatibility was introduced in https://github.com/ClickHouse/ClickHouse/pull/39263, it's expected
|
||||
# ("This engine is deprecated and is not supported in transactions", "[Queue = DB::MergeMutateRuntimeQueue]: Code: 235. DB::Exception: Part")
|
||||
# FIXME https://github.com/ClickHouse/ClickHouse/issues/39174 - bad mutation does not indicate backward incompatibility
|
||||
echo "Check for Error messages in server log:"
|
||||
rg -Fav -e "Code: 236. DB::Exception: Cancelled merging parts" \
|
||||
-e "Code: 236. DB::Exception: Cancelled mutating parts" \
|
||||
-e "REPLICA_IS_ALREADY_ACTIVE" \
|
||||
-e "REPLICA_ALREADY_EXISTS" \
|
||||
-e "ALL_REPLICAS_LOST" \
|
||||
-e "DDLWorker: Cannot parse DDL task query" \
|
||||
-e "RaftInstance: failed to accept a rpc connection due to error 125" \
|
||||
-e "UNKNOWN_DATABASE" \
|
||||
-e "NETWORK_ERROR" \
|
||||
-e "UNKNOWN_TABLE" \
|
||||
-e "ZooKeeperClient" \
|
||||
-e "KEEPER_EXCEPTION" \
|
||||
-e "DirectoryMonitor" \
|
||||
-e "TABLE_IS_READ_ONLY" \
|
||||
-e "Code: 1000, e.code() = 111, Connection refused" \
|
||||
-e "UNFINISHED" \
|
||||
-e "NETLINK_ERROR" \
|
||||
-e "Renaming unexpected part" \
|
||||
-e "PART_IS_TEMPORARILY_LOCKED" \
|
||||
-e "and a merge is impossible: we didn't find" \
|
||||
-e "found in queue and some source parts for it was lost" \
|
||||
-e "is lost forever." \
|
||||
-e "Unknown index: idx." \
|
||||
-e "Cannot parse string 'Hello' as UInt64" \
|
||||
-e "} <Error> TCPHandler: Code:" \
|
||||
-e "} <Error> executeQuery: Code:" \
|
||||
-e "Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'" \
|
||||
-e "This engine is deprecated and is not supported in transactions" \
|
||||
-e "[Queue = DB::MergeMutateRuntimeQueue]: Code: 235. DB::Exception: Part" \
|
||||
-e "The set of parts restored in place of" \
|
||||
-e "(ReplicatedMergeTreeAttachThread): Initialization failed. Error" \
|
||||
-e "Code: 269. DB::Exception: Destination table is myself" \
|
||||
-e "Coordination::Exception: Connection loss" \
|
||||
-e "MutateFromLogEntryTask" \
|
||||
-e "No connection to ZooKeeper, cannot get shared table ID" \
|
||||
-e "Session expired" \
|
||||
-e "TOO_MANY_PARTS" \
|
||||
-e "Authentication failed" \
|
||||
-e "Container already exists" \
|
||||
/var/log/clickhouse-server/clickhouse-server.upgrade.log | zgrep -Fa "<Error>" > /test_output/upgrade_error_messages.txt \
|
||||
&& echo -e "Error message in clickhouse-server.log (see upgrade_error_messages.txt)$FAIL$(head_escaped /test_output/bc_check_error_messages.txt)" \
|
||||
>> /test_output/test_results.tsv \
|
||||
|| echo -e "No Error messages after server upgrade$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# Remove file upgrade_error_messages.txt if it's empty
|
||||
[ -s /test_output/upgrade_error_messages.txt ] || rm /test_output/upgrade_error_messages.txt
|
||||
|
||||
# Grep logs for sanitizer asserts, crashes and other critical errors
|
||||
check_logs_for_critical_errors
|
||||
|
||||
tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||:
|
||||
|
||||
collect_query_and_trace_logs
|
||||
|
||||
check_oom_in_dmesg
|
||||
|
||||
mv /var/log/clickhouse-server/stderr.log /test_output/
|
||||
|
||||
# If we failed to clone repo or download previous release packages,
|
||||
# we don't have any packages installed, but we need clickhouse-local
|
||||
# to be installed to create check_status.tsv.
|
||||
if ! command -v clickhouse-local &> /dev/null
|
||||
then
|
||||
install_packages package_folder
|
||||
fi
|
||||
|
||||
# Write check result into check_status.tsv
|
||||
# Try to choose most specific error for the whole check status
|
||||
clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' order by
|
||||
clickhouse-local --structure "test String, res String, time Nullable(Float32), desc String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' order by
|
||||
(test like '%Sanitizer%') DESC,
|
||||
(test like '%Killed by signal%') DESC,
|
||||
(test like '%gdb.log%') DESC,
|
||||
@ -504,7 +196,7 @@ clickhouse-local --structure "test String, res String" -q "SELECT 'failure', tes
|
||||
(test like '%Error message%') DESC,
|
||||
(test like '%previous release%') DESC,
|
||||
rowNumberInAllBlocks()
|
||||
LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv
|
||||
LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv || echo "failure\tCannot parse test_results.tsv" > /test_output/check_status.tsv
|
||||
[ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv
|
||||
|
||||
# Core dumps
|
||||
|
305
tests/ci/stress_tests.lib
Normal file
305
tests/ci/stress_tests.lib
Normal file
@ -0,0 +1,305 @@
|
||||
#!/bin/bash
|
||||
|
||||
# core.COMM.PID-TID
|
||||
sysctl kernel.core_pattern='core.%e.%p-%P'
|
||||
|
||||
OK="\tOK\t\\N\t"
|
||||
FAIL="\tFAIL\t\\N\t"
|
||||
|
||||
FAILURE_CONTEXT_LINES=50
|
||||
FAILURE_CONTEXT_MAX_LINE_WIDTH=400
|
||||
|
||||
function escaped()
|
||||
{
|
||||
# That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language.
|
||||
# Also limit lines width just in case (too long lines are not really useful usually)
|
||||
clickhouse local -S 's String' --input-format=LineAsString -q "select substr(s, 1, $FAILURE_CONTEXT_MAX_LINE_WIDTH)
|
||||
from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'"
|
||||
}
|
||||
function head_escaped()
|
||||
{
|
||||
head -n $FAILURE_CONTEXT_LINES $1 | escaped
|
||||
}
|
||||
function unts()
|
||||
{
|
||||
grep -Po "[0-9][0-9]:[0-9][0-9] \K.*"
|
||||
}
|
||||
function trim_server_logs()
|
||||
{
|
||||
head -n $FAILURE_CONTEXT_LINES "/test_output/$1" | grep -Eo " \[ [0-9]+ \] \{.*" | escaped
|
||||
}
|
||||
|
||||
function install_packages()
|
||||
{
|
||||
dpkg -i $1/clickhouse-common-static_*.deb
|
||||
dpkg -i $1/clickhouse-common-static-dbg_*.deb
|
||||
dpkg -i $1/clickhouse-server_*.deb
|
||||
dpkg -i $1/clickhouse-client_*.deb
|
||||
}
|
||||
|
||||
function configure()
|
||||
{
|
||||
# install test configs
|
||||
export USE_DATABASE_ORDINARY=1
|
||||
export EXPORT_S3_STORAGE_POLICIES=1
|
||||
/usr/share/clickhouse-test/config/install.sh
|
||||
|
||||
# avoid too slow startup
|
||||
sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \
|
||||
| sed "s|<snapshot_distance>100000</snapshot_distance>|<snapshot_distance>10000</snapshot_distance>|" \
|
||||
> /etc/clickhouse-server/config.d/keeper_port.xml.tmp
|
||||
sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
|
||||
sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
|
||||
sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
|
||||
|
||||
# for clickhouse-server (via service)
|
||||
echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
|
||||
# for clickhouse-client
|
||||
export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000'
|
||||
|
||||
# since we run clickhouse from root
|
||||
sudo chown root: /var/lib/clickhouse
|
||||
|
||||
# Set more frequent update period of asynchronous metrics to more frequently update information about real memory usage (less chance of OOM).
|
||||
echo "<clickhouse><asynchronous_metrics_update_period_s>1</asynchronous_metrics_update_period_s></clickhouse>" \
|
||||
> /etc/clickhouse-server/config.d/asynchronous_metrics_update_period_s.xml
|
||||
|
||||
local total_mem
|
||||
total_mem=$(awk '/MemTotal/ { print $(NF-1) }' /proc/meminfo) # KiB
|
||||
total_mem=$(( total_mem*1024 )) # bytes
|
||||
|
||||
# Set maximum memory usage as half of total memory (less chance of OOM).
|
||||
#
|
||||
# But not via max_server_memory_usage but via max_memory_usage_for_user,
|
||||
# so that we can override this setting and execute service queries, like:
|
||||
# - hung check
|
||||
# - show/drop database
|
||||
# - ...
|
||||
#
|
||||
# So max_memory_usage_for_user will be a soft limit, and
|
||||
# max_server_memory_usage will be hard limit, and queries that should be
|
||||
# executed regardless memory limits will use max_memory_usage_for_user=0,
|
||||
# instead of relying on max_untracked_memory
|
||||
|
||||
max_server_memory_usage_to_ram_ratio=0.5
|
||||
echo "Setting max_server_memory_usage_to_ram_ratio to ${max_server_memory_usage_to_ram_ratio}"
|
||||
cat > /etc/clickhouse-server/config.d/max_server_memory_usage.xml <<EOL
|
||||
<clickhouse>
|
||||
<max_server_memory_usage_to_ram_ratio>${max_server_memory_usage_to_ram_ratio}</max_server_memory_usage_to_ram_ratio>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
local max_users_mem
|
||||
max_users_mem=$((total_mem*30/100)) # 30%
|
||||
echo "Setting max_memory_usage_for_user=$max_users_mem and max_memory_usage for queries to 10G"
|
||||
cat > /etc/clickhouse-server/users.d/max_memory_usage_for_user.xml <<EOL
|
||||
<clickhouse>
|
||||
<profiles>
|
||||
<default>
|
||||
<max_memory_usage>10G</max_memory_usage>
|
||||
<max_memory_usage_for_user>${max_users_mem}</max_memory_usage_for_user>
|
||||
</default>
|
||||
</profiles>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
cat > /etc/clickhouse-server/config.d/core.xml <<EOL
|
||||
<clickhouse>
|
||||
<core_dump>
|
||||
<!-- 100GiB -->
|
||||
<size_limit>107374182400</size_limit>
|
||||
</core_dump>
|
||||
<!-- NOTE: no need to configure core_path,
|
||||
since clickhouse is not started as daemon (via clickhouse start)
|
||||
-->
|
||||
<core_path>$PWD</core_path>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
# Analyzer is not yet ready for testing
|
||||
cat > /etc/clickhouse-server/users.d/no_analyzer.xml <<EOL
|
||||
<clickhouse>
|
||||
<profiles>
|
||||
<default>
|
||||
<constraints>
|
||||
<allow_experimental_analyzer>
|
||||
<readonly/>
|
||||
</allow_experimental_analyzer>
|
||||
</constraints>
|
||||
</default>
|
||||
</profiles>
|
||||
</clickhouse>
|
||||
EOL
|
||||
|
||||
}
|
||||
|
||||
function stop()
|
||||
{
|
||||
local max_tries="${1:-90}"
|
||||
local check_hang="${2:-true}"
|
||||
local pid
|
||||
# Preserve the pid, since the server can hung after the PID will be deleted.
|
||||
pid="$(cat /var/run/clickhouse-server/clickhouse-server.pid)"
|
||||
|
||||
clickhouse stop --max-tries "$max_tries" --do-not-kill && return
|
||||
|
||||
if [ $check_hang == true ] then
|
||||
# We failed to stop the server with SIGTERM. Maybe it hang, let's collect stacktraces.
|
||||
echo -e "Possible deadlock on shutdown (see gdb.log)$FAIL" >> /test_output/test_results.tsv
|
||||
kill -TERM "$(pidof gdb)" ||:
|
||||
sleep 5
|
||||
echo "thread apply all backtrace (on stop)" >> /test_output/gdb.log
|
||||
timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$pid" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
|
||||
clickhouse stop --force
|
||||
fi
|
||||
}
|
||||
|
||||
function start()
|
||||
{
|
||||
counter=0
|
||||
until clickhouse-client --query "SELECT 1"
|
||||
do
|
||||
if [ "$counter" -gt ${1:-120} ]
|
||||
then
|
||||
echo "Cannot start clickhouse-server"
|
||||
rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||:
|
||||
echo -e "Cannot start clickhouse-server$FAIL$(trim_server_logs application_errors.txt)" >> /test_output/test_results.tsv
|
||||
cat /var/log/clickhouse-server/stdout.log
|
||||
tail -n100 /var/log/clickhouse-server/stderr.log
|
||||
tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | rg -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' | tail -n100
|
||||
break
|
||||
fi
|
||||
# use root to match with current uid
|
||||
clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>>/var/log/clickhouse-server/stderr.log
|
||||
sleep 0.5
|
||||
counter=$((counter + 1))
|
||||
done
|
||||
|
||||
# Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
|
||||
# and clickhouse-server can do fork-exec, for example, to run some bridge.
|
||||
# Do not set nostop noprint for all signals, because some it may cause gdb to hang,
|
||||
# explicitly ignore non-fatal signals that are used by server.
|
||||
# Number of SIGRTMIN can be determined only in runtime.
|
||||
RTMIN=$(kill -l SIGRTMIN)
|
||||
echo "
|
||||
set follow-fork-mode parent
|
||||
handle SIGHUP nostop noprint pass
|
||||
handle SIGINT nostop noprint pass
|
||||
handle SIGQUIT nostop noprint pass
|
||||
handle SIGPIPE nostop noprint pass
|
||||
handle SIGTERM nostop noprint pass
|
||||
handle SIGUSR1 nostop noprint pass
|
||||
handle SIGUSR2 nostop noprint pass
|
||||
handle SIG$RTMIN nostop noprint pass
|
||||
info signals
|
||||
continue
|
||||
backtrace full
|
||||
thread apply all backtrace full
|
||||
info registers
|
||||
disassemble /s
|
||||
up
|
||||
disassemble /s
|
||||
up
|
||||
disassemble /s
|
||||
p \"done\"
|
||||
detach
|
||||
quit
|
||||
" > script.gdb
|
||||
|
||||
# FIXME Hung check may work incorrectly because of attached gdb
|
||||
# 1. False positives are possible
|
||||
# 2. We cannot attach another gdb to get stacktraces if some queries hung
|
||||
gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
|
||||
sleep 5
|
||||
# gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
|
||||
time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
|
||||
}
|
||||
|
||||
function check_server_start()
|
||||
{
|
||||
clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
|
||||
|| (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \
|
||||
&& echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \
|
||||
>> /test_output/test_results.tsv)
|
||||
|
||||
# Remove file application_errors.txt if it's empty
|
||||
[ -s /test_output/application_errors.txt ] || rm /test_output/application_errors.txt
|
||||
}
|
||||
|
||||
function check_logs_for_critical_errors()
|
||||
{
|
||||
# Sanitizer asserts
|
||||
rg -Fa "==================" /var/log/clickhouse-server/stderr.log | rg -v "in query:" >> /test_output/tmp
|
||||
rg -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
|
||||
rg -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null \
|
||||
&& echo -e "Sanitizer assert (in stderr.log)$FAIL$(head_escaped /test_output/tmp)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No sanitizer asserts$OK" >> /test_output/test_results.tsv
|
||||
rm -f /test_output/tmp
|
||||
|
||||
# OOM
|
||||
rg -Fa " <Fatal> Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \
|
||||
&& echo -e "Signal 9 in clickhouse-server.log$FAIL" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No OOM messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# Logical errors
|
||||
rg -Fa "Code: 49. DB::Exception: " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/logical_errors.txt \
|
||||
&& echo -e "Logical error thrown (see clickhouse-server.log or logical_errors.txt)$FAIL$(head_escaped /test_output/logical_errors.txt)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No logical errors$OK" >> /test_output/test_results.tsv
|
||||
# Remove file logical_errors.txt if it's empty
|
||||
[ -s /test_output/logical_errors.txt ] || rm /test_output/logical_errors.txt
|
||||
|
||||
# No such key errors
|
||||
rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log > /test_output/no_such_key_errors.txt \
|
||||
&& echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(trim_server_logs no_such_key_errors.txt)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No lost s3 keys$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# Remove file no_such_key_errors.txt if it's empty
|
||||
[ -s /test_output/no_such_key_errors.txt ] || rm /test_output/no_such_key_errors.txt
|
||||
|
||||
# Crash
|
||||
rg -Fa "########################################" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \
|
||||
&& echo -e "Killed by signal (in clickhouse-server.log)$FAIL" >> /test_output/test_results.tsv \
|
||||
|| echo -e "Not crashed$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# It also checks for crash without stacktrace (printed by watchdog)
|
||||
rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/fatal_messages.txt \
|
||||
&& echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(trim_server_logs fatal_messages.txt)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
|
||||
|
||||
# Remove file fatal_messages.txt if it's empty
|
||||
[ -s /test_output/fatal_messages.txt ] || rm /test_output/fatal_messages.txt
|
||||
|
||||
rg -Fa "########################################" /test_output/* > /dev/null \
|
||||
&& echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv
|
||||
|
||||
function get_gdb_log_context()
|
||||
{
|
||||
rg -A50 -Fa " received signal " /test_output/gdb.log | head_escaped
|
||||
}
|
||||
|
||||
rg -Fa " received signal " /test_output/gdb.log > /dev/null \
|
||||
&& echo -e "Found signal in gdb.log$FAIL$(get_gdb_log_context)" >> /test_output/test_results.tsv
|
||||
|
||||
dmesg -T > /test_output/dmesg.log
|
||||
|
||||
# OOM in dmesg -- those are real
|
||||
grep -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' /test_output/dmesg.log \
|
||||
&& echo -e "OOM in dmesg$FAIL$(head_escaped /test_output/dmesg.log)" >> /test_output/test_results.tsv \
|
||||
|| echo -e "No OOM in dmesg$OK" >> /test_output/test_results.tsv
|
||||
}
|
||||
|
||||
function collect_query_and_trace_logs()
|
||||
{
|
||||
for table in query_log trace_log
|
||||
do
|
||||
clickhouse-local --path /var/lib/clickhouse/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
|
||||
done
|
||||
}
|
||||
|
||||
function collect_core_dumps()
|
||||
{
|
||||
find . -type f -maxdepth 1 -name 'core.*' | while read core; do
|
||||
zstd --threads=0 $core
|
||||
mv $core.zst /test_output/
|
||||
done
|
||||
}
|
Loading…
Reference in New Issue
Block a user