#!/bin/bash
# shellcheck disable=SC2094
# shellcheck disable=SC2086
# shellcheck disable=SC2024

# Trace every command as it is executed.
set -x

# Thread Fuzzer allows to check more permutations of possible thread scheduling
# and find more potential issues.
# NOTE: there must be no whitespace around '=' - "export NAME = value" does not
# assign anything, it tries to export three separate (invalid) names.
export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
export THREAD_FUZZER_SLEEP_TIME_US=100000

export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1

export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000

export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
# Install the ClickHouse .deb packages located in the given folder.
#
# Arguments:
#   $1 - folder that contains the clickhouse-*.deb files
#
# The four packages are installed one by one with `dpkg -i`, in the order
# listed below. The exit status of the individual dpkg calls is not checked;
# the function returns the status of the last one.
function install_packages()
{
    # The folder and the file glob must form a single path: "$1"/name_*.deb
    dpkg -i "$1"/clickhouse-common-static_*.deb
    dpkg -i "$1"/clickhouse-common-static-dbg_*.deb
    dpkg -i "$1"/clickhouse-server_*.deb
    dpkg -i "$1"/clickhouse-client_*.deb
}
# Prepare the installed server for a stress run.
#
# What it does, in order:
#   - runs the test config installer and links clickhouse-test into /usr/bin;
#   - lowers <snapshot_distance> in keeper_port.xml from 100000 to 10000;
#   - sets ASAN_OPTIONS for the server (via /etc/environment) and for the
#     client (exported into the current shell);
#   - makes root the owner of /var/lib/clickhouse;
#   - writes config.d/users.d snippets for the asynchronous metrics period and
#     for the memory limits derived from MemTotal in /proc/meminfo.
#
# Takes no arguments. Exit statuses of the individual steps are not checked.
function configure()
{
    # install test configs
    /usr/share/clickhouse-test/config/install.sh

    # we mount tests folder from repo to /usr/share
    # (-f because configure is called more than once in this script and the link
    # already exists on the second call)
    ln -sf /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test

    # avoid too slow startup
    sudo cat /etc/clickhouse-server/config.d/keeper_port.xml | sed "s|<snapshot_distance>100000</snapshot_distance>|<snapshot_distance>10000</snapshot_distance>|" > /etc/clickhouse-server/config.d/keeper_port.xml.tmp
    sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
    sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
    sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml

    # for clickhouse-server (via service)
    echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
    # for clickhouse-client
    export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000'

    # since we run clickhouse from root
    sudo chown root: /var/lib/clickhouse

    # Set more frequent update period of asynchronous metrics to more frequently update information about real memory usage (less chance of OOM).
    echo "<clickhouse><asynchronous_metrics_update_period_s>1</asynchronous_metrics_update_period_s></clickhouse>" \
        > /etc/clickhouse-server/config.d/asynchronous_metrics_update_period_s.xml

    local total_mem
    total_mem=$(awk '/MemTotal/ { print $(NF-1) }' /proc/meminfo) # KiB
    total_mem=$(( total_mem*1024 )) # bytes

    # Set maximum memory usage as half of total memory (less chance of OOM).
    #
    # But not via max_server_memory_usage but via max_memory_usage_for_user,
    # so that we can override this setting and execute service queries, like:
    # - hung check
    # - show/drop database
    # - ...
    #
    # So max_memory_usage_for_user will be a soft limit, and
    # max_server_memory_usage will be hard limit, and queries that should be
    # executed regardless of memory limits will use max_memory_usage_for_user=0,
    # instead of relying on max_untracked_memory
    local max_server_mem
    max_server_mem=$(( total_mem*75/100 )) # 75%
    echo " Setting max_server_memory_usage= $max_server_mem "
    cat > /etc/clickhouse-server/config.d/max_server_memory_usage.xml <<EOL
<clickhouse>
    <max_server_memory_usage>${max_server_mem}</max_server_memory_usage>
</clickhouse>
EOL

    local max_users_mem
    max_users_mem=$(( total_mem*50/100 )) # 50%
    echo " Setting max_memory_usage_for_user= $max_users_mem "
    cat > /etc/clickhouse-server/users.d/max_memory_usage_for_user.xml <<EOL
<clickhouse>
    <profiles>
        <default>
            <max_memory_usage_for_user>${max_users_mem}</max_memory_usage_for_user>
        </default>
    </profiles>
</clickhouse>
EOL
}
# Stop the server by running `clickhouse stop`.
# Takes no arguments; returns the exit status of that command.
function stop()
{
    clickhouse stop
}
# Start clickhouse-server, wait until it answers queries and attach gdb to it.
#
# Steps:
#   1. If clickhouse-server.log exists, rename it to clickhouse-server.log.N
#      where N is the first number for which no such file exists yet.
#   2. Run `clickhouse start --user root` in a loop (0.5 s pause per attempt)
#      until `clickhouse-client --query "SELECT 1"` succeeds. Once the attempt
#      counter exceeds 240 the log tails are printed and the loop is left;
#      the function does NOT abort in that case and goes on to step 3.
#   3. Write script.gdb into the current directory and run gdb in the
#      background against the pid from clickhouse-server.pid; gdb output is
#      timestamped with `ts` and appended to /test_output/gdb.log.
#   4. Sleep 5 s and run one more client query to wait for the server to
#      respond again after gdb has attached.
#
# Globals written: log_file_counter, counter, RTMIN.
function start()
{
    # Rename existing log file - it will be more convenient to read separate files for separate server runs.
    if [ -f '/var/log/clickhouse-server/clickhouse-server.log' ]
    then
        log_file_counter=1
        while [ -f "/var/log/clickhouse-server/clickhouse-server.log.${log_file_counter}" ]
        do
            log_file_counter=$((log_file_counter + 1))
        done
        mv '/var/log/clickhouse-server/clickhouse-server.log' "/var/log/clickhouse-server/clickhouse-server.log.${log_file_counter}"
    fi

    counter=0
    until clickhouse-client --query "SELECT 1"
    do
        if [ "$counter" -gt 240 ]
        then
            echo "Cannot start clickhouse-server"
            cat /var/log/clickhouse-server/stdout.log
            tail -n1000 /var/log/clickhouse-server/stderr.log
            tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | grep -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' | tail -n1000
            break
        fi
        # use root to match with current uid
        clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>>/var/log/clickhouse-server/stderr.log
        sleep 0.5
        cat /var/log/clickhouse-server/stdout.log
        tail -n200 /var/log/clickhouse-server/stderr.log
        tail -n200 /var/log/clickhouse-server/clickhouse-server.log
        counter=$((counter + 1))
    done

    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
    # and clickhouse-server can do fork-exec, for example, to run some bridge.
    # Do not set nostop noprint for all signals, because it may cause gdb to hang,
    # explicitly ignore non-fatal signals that are used by server.
    # Number of SIGRTMIN can be determined only in runtime.
    RTMIN=$(kill -l SIGRTMIN)
    # Everything between the quotes below is written verbatim into script.gdb,
    # so no shell comments or other stray lines may appear inside it.
    echo "
set follow-fork-mode parent
handle SIGHUP nostop noprint pass
handle SIGINT nostop noprint pass
handle SIGQUIT nostop noprint pass
handle SIGPIPE nostop noprint pass
handle SIGTERM nostop noprint pass
handle SIGUSR1 nostop noprint pass
handle SIGUSR2 nostop noprint pass
handle SIG$RTMIN nostop noprint pass
info signals
continue
gcore
backtrace full
thread apply all backtrace full
info registers
disassemble /s
up
disassemble /s
up
disassemble /s
p \" done \"
detach
quit
" > script.gdb

    # FIXME Hung check may work incorrectly because of attached gdb
    # 1. False positives are possible
    # 2. We cannot attach another gdb to get stacktraces if some queries hung
    gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
    sleep 5
    # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" || :
}
# Install the packages under test, configure the server and start it.
install_packages package_folder

configure

start

# Download the datasets listed in $DATASETS from $S3_URL and attach them.
# shellcheck disable=SC2086 # No quotes because I want to split it into words.
/s3downloader --url-prefix "$S3_URL" --dataset-names $DATASETS
chmod 777 -R /var/lib/clickhouse
clickhouse-client --query "ATTACH DATABASE IF NOT EXISTS datasets ENGINE = Ordinary"
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS test"

stop
start

# Move the dataset tables into the `test` database under the names hits/visits.
clickhouse-client --query "SHOW TABLES FROM datasets"
clickhouse-client --query "SHOW TABLES FROM test"
clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits"
clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits"
clickhouse-client --query "SHOW TABLES FROM test"
# Run the stress test and record its exit status as a row in test_results.tsv.
# The continuation lines must follow each other directly: anything between a
# trailing backslash and the next && / || would break the chain.
./stress --hung-check --drop-databases --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" \
    && echo -e 'Test script exit code\tOK' >> /test_output/test_results.tsv \
    || echo -e 'Test script failed\tFAIL' >> /test_output/test_results.tsv

stop
start

# After the restart the server must answer a query; the query output itself is
# the OK row, the FAIL row is written only if the client call fails.
clickhouse-client --query "SELECT 'Server successfully started', 'OK'" >> /test_output/test_results.tsv \
    || echo -e 'Server failed to start\tFAIL' >> /test_output/test_results.tsv
# NOTE(review): these two messages go to stdout, not to test_results.tsv -
# confirm whether that is intended.
[ -f /var/log/clickhouse-server/clickhouse-server.log ] || echo -e "Server log does not exist\tFAIL"
[ -f /var/log/clickhouse-server/stderr.log ] || echo -e "Stderr log does not exist\tFAIL"

# Print Fatal log messages to stdout
zgrep -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server.log*

# Grep logs for sanitizer asserts, crashes and other critical errors

# Sanitizer asserts
# Candidate lines are collected into /test_output/tmp; the check fails if the
# file contains any line other than the known makecontext/swapcontext warning.
grep -Fa "==================" /var/log/clickhouse-server/stderr.log | grep -v "in query:" >> /test_output/tmp
grep -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
zgrep -Fav "ASan doesn't fully support makecontext/swapcontext functions" /test_output/tmp > /dev/null \
    && echo -e 'Sanitizer assert (in stderr.log)\tFAIL' >> /test_output/test_results.tsv \
    || echo -e 'No sanitizer asserts\tOK' >> /test_output/test_results.tsv
rm -f /test_output/tmp

# OOM
zgrep -Fa " <Fatal> Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server.log* > /dev/null \
    && echo -e 'OOM killer (or signal 9) in clickhouse-server.log\tFAIL' >> /test_output/test_results.tsv \
    || echo -e 'No OOM messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv

# Logical errors
zgrep -Fa "Code: 49, e.displayText() = DB::Exception:" /var/log/clickhouse-server/clickhouse-server.log* > /dev/null \
    && echo -e 'Logical error thrown (see clickhouse-server.log)\tFAIL' >> /test_output/test_results.tsv \
    || echo -e 'No logical errors\tOK' >> /test_output/test_results.tsv

# Crash
zgrep -Fa "########################################" /var/log/clickhouse-server/clickhouse-server.log* > /dev/null \
    && echo -e 'Killed by signal (in clickhouse-server.log)\tFAIL' >> /test_output/test_results.tsv \
    || echo -e 'Not crashed\tOK' >> /test_output/test_results.tsv

# It also checks for crash without stacktrace (printed by watchdog)
zgrep -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server.log* > /dev/null \
    && echo -e 'Fatal message in clickhouse-server.log\tFAIL' >> /test_output/test_results.tsv \
    || echo -e 'No fatal messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv

# The two checks below only add a FAIL row; nothing is written when they pass.
zgrep -Fa "########################################" /test_output/* > /dev/null \
    && echo -e 'Killed by signal (output files)\tFAIL' >> /test_output/test_results.tsv

zgrep -Fa " received signal " /test_output/gdb.log > /dev/null \
    && echo -e 'Found signal in gdb.log\tFAIL' >> /test_output/test_results.tsv
# Backward compatibility check: install the previous release, run a stress
# test against it, then start the new server on the same data and inspect its
# log. Rows are collected in backward_compatibility_check_results.tsv and
# summarised as a single row in test_results.tsv at the end.
echo -e "Backward compatibility check\n"

echo "Download previous release server"
mkdir previous_release_package_folder

# The current server version is piped into the download script on stdin.
clickhouse-client --query="SELECT version()" | ./download_previous_release && echo -e 'Download script exit code\tOK' >> /test_output/backward_compatibility_check_results.tsv \
    || echo -e 'Download script failed\tFAIL' >> /test_output/backward_compatibility_check_results.tsv

# The test is on the (possibly empty) output of ls, so the command substitution
# must not be padded with spaces inside the quotes - otherwise it is always true.
if [ "$(ls -A previous_release_package_folder/clickhouse-common-static_*.deb && ls -A previous_release_package_folder/clickhouse-server_*.deb)" ]
then
    echo -e "Successfully downloaded previous release packets\tOK" >> /test_output/backward_compatibility_check_results.tsv
    stop

    # Uninstall current packages
    dpkg --remove clickhouse-client
    dpkg --remove clickhouse-server
    dpkg --remove clickhouse-common-static-dbg
    dpkg --remove clickhouse-common-static

    rm -rf /var/lib/clickhouse/*

    # Install previous release packages
    install_packages previous_release_package_folder

    # Start server from previous release
    configure
    start

    clickhouse-client --query="SELECT 'Server version: ', version()"

    # Install new package before running stress test because we should use new clickhouse-client and new clickhouse-test
    install_packages package_folder

    mkdir tmp_stress_output

    ./stress --backward-compatibility-check --output-folder tmp_stress_output --global-time-limit=1800 \
        && echo -e 'Test script exit code\tOK' >> /test_output/backward_compatibility_check_results.tsv \
        || echo -e 'Test script failed\tFAIL' >> /test_output/backward_compatibility_check_results.tsv
    rm -rf tmp_stress_output

    clickhouse-client --query="SELECT 'Tables count:', count() FROM system.tables"

    stop

    # Start new server
    configure
    start
    clickhouse-client --query "SELECT 'Server successfully started', 'OK'" >> /test_output/backward_compatibility_check_results.tsv \
        || echo -e 'Server failed to start\tFAIL' >> /test_output/backward_compatibility_check_results.tsv

    clickhouse-client --query="SELECT 'Server version: ', version()"

    # Let the server run for a while before checking log.
    sleep 60

    stop

    # Error messages (we should ignore some errors)
    # First drop the lines matching any of the ignored patterns, then look for
    # "<Error>" in what is left.
    zgrep -Fav -e "Code: 236. DB::Exception: Cancelled merging parts" \
               -e "REPLICA_IS_ALREADY_ACTIVE" \
               -e "DDLWorker: Cannot parse DDL task query" \
               -e "RaftInstance: failed to accept a rpc connection due to error 125" \
               -e "UNKNOWN_DATABASE" \
               -e "NETWORK_ERROR" \
               -e "UNKNOWN_TABLE" \
               -e "ZooKeeperClient" \
               -e "KEEPER_EXCEPTION" \
               -e "DirectoryMonitor" \
               -e "Code: 1000, e.code() = 111, Connection refused" \
        /var/log/clickhouse-server/clickhouse-server.log | zgrep -Fa "<Error>" > /dev/null \
        && echo -e 'Error message in clickhouse-server.log\tFAIL' >> /test_output/backward_compatibility_check_results.tsv \
        || echo -e 'No Error messages in clickhouse-server.log\tOK' >> /test_output/backward_compatibility_check_results.tsv

    # Sanitizer asserts
    zgrep -Fa "==================" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
    zgrep -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
    zgrep -Fav "ASan doesn't fully support makecontext/swapcontext functions" /test_output/tmp > /dev/null \
        && echo -e 'Sanitizer assert (in stderr.log)\tFAIL' >> /test_output/backward_compatibility_check_results.tsv \
        || echo -e 'No sanitizer asserts\tOK' >> /test_output/backward_compatibility_check_results.tsv
    rm -f /test_output/tmp

    # OOM
    zgrep -Fa " <Fatal> Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server.log > /dev/null \
        && echo -e 'OOM killer (or signal 9) in clickhouse-server.log\tFAIL' >> /test_output/backward_compatibility_check_results.tsv \
        || echo -e 'No OOM messages in clickhouse-server.log\tOK' >> /test_output/backward_compatibility_check_results.tsv

    # Logical errors
    zgrep -Fa "Code: 49, e.displayText() = DB::Exception:" /var/log/clickhouse-server/clickhouse-server.log > /dev/null \
        && echo -e 'Logical error thrown (see clickhouse-server.log)\tFAIL' >> /test_output/backward_compatibility_check_results.tsv \
        || echo -e 'No logical errors\tOK' >> /test_output/backward_compatibility_check_results.tsv

    # Crash
    zgrep -Fa "########################################" /var/log/clickhouse-server/clickhouse-server.log > /dev/null \
        && echo -e 'Killed by signal (in clickhouse-server.log)\tFAIL' >> /test_output/backward_compatibility_check_results.tsv \
        || echo -e 'Not crashed\tOK' >> /test_output/backward_compatibility_check_results.tsv

    # It also checks for crash without stacktrace (printed by watchdog)
    zgrep -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server.log > /dev/null \
        && echo -e 'Fatal message in clickhouse-server.log\tFAIL' >> /test_output/backward_compatibility_check_results.tsv \
        || echo -e 'No fatal messages in clickhouse-server.log\tOK' >> /test_output/backward_compatibility_check_results.tsv

else
    echo -e "Failed to download previous release packets\tFAIL" >> /test_output/backward_compatibility_check_results.tsv
fi

# Any FAIL row in the detailed results fails the whole check.
zgrep -Fa "FAIL" /test_output/backward_compatibility_check_results.tsv > /dev/null \
    && echo -e 'Backward compatibility check\tFAIL' >> /test_output/test_results.tsv \
    || echo -e 'Backward compatibility check\tOK' >> /test_output/test_results.tsv
# Put logs into /test_output/
# Every server log (current and rotated) is compressed into /test_output and
# then removed from the log directory.
for log_file in /var/log/clickhouse-server/clickhouse-server.log*
do
    pigz < "${log_file}" > /test_output/"$(basename "${log_file}")".gz
    # FIXME: remove once only github actions will be left
    rm "${log_file}"
done

# Best effort: a failure to archive the coordination directory is ignored.
tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination || :
mv /var/log/clickhouse-server/stderr.log /test_output/

# Replace the engine with Ordinary to avoid extra symlinks stuff in artifacts.
# (so that clickhouse-local --path can read it w/o extra care).
sed -i -e "s/ATTACH DATABASE _ UUID '[^']*'/ATTACH DATABASE system/" -e "s/Atomic/Ordinary/" /var/lib/clickhouse/metadata/system.sql
for table in query_log trace_log; do
    sed -i "s/ATTACH TABLE _ UUID '[^']*'/ATTACH TABLE $table/" /var/lib/clickhouse/metadata/system/${table}.sql
    tar -chf /test_output/${table}_dump.tar /var/lib/clickhouse/metadata/system.sql /var/lib/clickhouse/metadata/system/${table}.sql /var/lib/clickhouse/data/system/${table} || :
done
# Write check result into check_status.tsv
# The query emits at most one row: 'failure' plus the name of one test whose
# result is not 'OK'. Rows whose name contains "hung" sort last, so another
# failing test is reported in preference to them.
clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' order by (lower(test) like '%hung%') LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv
# An empty file means no failing row was found.
[ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv
# Core dumps (see gcore)
# Default filename is 'core.PROCESS_ID'
for core in core.*; do
    # If nothing matches, the pattern stays literal ("core.*"); skip it instead
    # of handing a non-existent file to pigz and mv.
    [ -e "$core" ] || continue
    pigz "$core"
    # pigz replaces the file with "<name>.gz"; that single path is what is moved.
    mv "$core.gz" /test_output/
done