#!/bin/bash
# shellcheck disable=SC2094
# shellcheck disable=SC2086
# shellcheck disable=SC2024

set -x

# Thread Fuzzer allows checking more permutations of possible thread scheduling
# and finding more potential issues.

export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
export THREAD_FUZZER_SLEEP_TIME_US=100000

export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1

export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000

dpkg -i package_folder/clickhouse-common-static_*.deb
dpkg -i package_folder/clickhouse-common-static-dbg_*.deb
dpkg -i package_folder/clickhouse-server_*.deb
dpkg -i package_folder/clickhouse-client_*.deb

function configure()
{
    # install test configs
    /usr/share/clickhouse-test/config/install.sh

    # we mount the tests folder from the repo to /usr/share
    ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test

    # avoid too slow startup
    sudo cat /etc/clickhouse-server/config.d/keeper_port.xml | sed "s|100000|10000|" > /etc/clickhouse-server/config.d/keeper_port.xml.tmp
    sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
    sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
    sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml

    # for clickhouse-server (via service)
    echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
    # for clickhouse-client
    export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000'

    # since we run clickhouse from root
    sudo chown root: /var/lib/clickhouse

    # Set a more frequent update period of asynchronous metrics, so that information about real memory usage is refreshed more often (less chance of OOM).
    echo "<clickhouse><asynchronous_metrics_update_period_s>1</asynchronous_metrics_update_period_s></clickhouse>" \
        > /etc/clickhouse-server/config.d/asynchronous_metrics_update_period_s.xml

    local total_mem
    total_mem=$(awk '/MemTotal/ { print $(NF-1) }' /proc/meminfo) # KiB
    total_mem=$(( total_mem*1024 )) # bytes
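
    # For example (illustrative numbers only): if MemTotal reports exactly 16 GiB,
    # total_mem becomes 17179869184 bytes, so the limits configured below come out to
    # max_server_memory_usage=12884901888 (75%) and max_memory_usage_for_user=8589934592 (50%).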

    # Set maximum memory usage as half of total memory (less chance of OOM).
    #
    # But not via max_server_memory_usage but via max_memory_usage_for_user,
    # so that we can override this setting and execute service queries, like:
    # - hung check
    # - show/drop database
    # - ...
    #
    # So max_memory_usage_for_user will be a soft limit, and
    # max_server_memory_usage will be a hard limit, and queries that should be
    # executed regardless of memory limits will use max_memory_usage_for_user=0
    # instead of relying on max_untracked_memory

    local max_server_mem
    max_server_mem=$((total_mem*75/100)) # 75%
    echo "Setting max_server_memory_usage=$max_server_mem"
    cat > /etc/clickhouse-server/config.d/max_server_memory_usage.xml <<EOL
<clickhouse>
    <max_server_memory_usage>${max_server_mem}</max_server_memory_usage>
</clickhouse>
EOL

    local max_users_mem
    max_users_mem=$((total_mem*50/100)) # 50%
    echo "Setting max_memory_usage_for_user=$max_users_mem"
    cat > /etc/clickhouse-server/users.d/max_memory_usage_for_user.xml <<EOL
<clickhouse>
    <profiles>
        <default>
            <max_memory_usage_for_user>${max_users_mem}</max_memory_usage_for_user>
        </default>
    </profiles>
</clickhouse>
EOL
}

function stop()
{
    clickhouse stop
}

function start()
{
    # Rename the existing log file - it will be more convenient to read separate files for separate server runs.
    if [ -f '/var/log/clickhouse-server/clickhouse-server.log' ]
    then
        log_file_counter=1
        while [ -f "/var/log/clickhouse-server/clickhouse-server.log.${log_file_counter}" ]
        do
            log_file_counter=$((log_file_counter + 1))
        done
        mv '/var/log/clickhouse-server/clickhouse-server.log' "/var/log/clickhouse-server/clickhouse-server.log.${log_file_counter}"
    fi

    counter=0
    until clickhouse-client --query "SELECT 1"
    do
        if [ "$counter" -gt 240 ]
        then
            echo "Cannot start clickhouse-server"
            cat /var/log/clickhouse-server/stdout.log
            tail -n1000 /var/log/clickhouse-server/stderr.log
            tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | grep -F -v -e ' RaftInstance:' -e ' RaftInstance' | tail -n1000
            break
        fi
        # use root to match the current uid
        clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>>/var/log/clickhouse-server/stderr.log
        sleep 0.5
        counter=$((counter + 1))
    done

    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to the watchdog,
    # and clickhouse-server can do fork-exec, for example, to run some bridge.
    # Do not set nostop noprint for all signals, because for some of them it may cause gdb to hang;
    # explicitly ignore the non-fatal signals that are used by the server.
    # The number of SIGRTMIN can be determined only at runtime.
    RTMIN=$(kill -l SIGRTMIN)
    echo "
set follow-fork-mode parent
handle SIGHUP nostop noprint pass
handle SIGINT nostop noprint pass
handle SIGQUIT nostop noprint pass
handle SIGPIPE nostop noprint pass
handle SIGTERM nostop noprint pass
handle SIGUSR1 nostop noprint pass
handle SIGUSR2 nostop noprint pass
handle SIG$RTMIN nostop noprint pass
info signals
continue
gcore
backtrace full
thread apply all backtrace full
info registers
disassemble /s
up
disassemble /s
up
disassemble /s
p \"done\"
detach
quit
" > script.gdb

    # FIXME Hung check may work incorrectly because of the attached gdb:
    # 1. False positives are possible
    # 2. We cannot attach another gdb to get stacktraces if some queries hung
    gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &

    sleep 5

    # gdb will send SIGSTOP, spend some time loading debug info, and then send SIGCONT; wait for it (up to send_timeout, 300s)
    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
}

configure

start
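
# $S3_URL and $DATASETS are expected to be provided by the environment (set by the CI image).
# As an illustration only (values are not defined in this script), DATASETS might be
# something like "hits visits", matching the datasets.hits_v1 / datasets.visits_v1 tables
# that are renamed below.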
# shellcheck disable=SC2086 # No quotes because I want to split it into words.
/s3downloader --url-prefix "$S3_URL" --dataset-names $DATASETS
chmod 777 -R /var/lib/clickhouse
clickhouse-client --query "ATTACH DATABASE IF NOT EXISTS datasets ENGINE = Ordinary"
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS test"

stop
start

clickhouse-client --query "SHOW TABLES FROM datasets"
clickhouse-client --query "SHOW TABLES FROM test"
clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits"
clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits"
clickhouse-client --query "SHOW TABLES FROM test"

./stress --hung-check --drop-databases --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" \
    && echo -e 'Test script exit code\tOK' >> /test_output/test_results.tsv \
    || echo -e 'Test script failed\tFAIL' >> /test_output/test_results.tsv

stop
start

clickhouse-client --query "SELECT 'Server successfully started', 'OK'" >> /test_output/test_results.tsv \
    || echo -e 'Server failed to start\tFAIL' >> /test_output/test_results.tsv

[ -f /var/log/clickhouse-server/clickhouse-server.log ] || echo -e "Server log does not exist\tFAIL"
[ -f /var/log/clickhouse-server/stderr.log ] || echo -e "Stderr log does not exist\tFAIL"

# Print Fatal log messages to stdout
zgrep -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server.log*

# Grep logs for sanitizer asserts, crashes and other critical errors

# Sanitizer asserts
grep -Fa "==================" /var/log/clickhouse-server/stderr.log | grep -v "in query:" >> /test_output/tmp
grep -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
zgrep -Fav "ASan doesn't fully support makecontext/swapcontext functions" /test_output/tmp > /dev/null \
    && echo -e 'Sanitizer assert (in stderr.log)\tFAIL' >> /test_output/test_results.tsv \
    || echo -e 'No sanitizer asserts\tOK' >> /test_output/test_results.tsv
rm -f /test_output/tmp

# OOM
zgrep -Fa " Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server.log* > /dev/null \
    && echo -e 'OOM killer (or signal 9) in clickhouse-server.log\tFAIL' >> /test_output/test_results.tsv \
    || echo -e 'No OOM messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv

# Logical errors
zgrep -Fa "Code: 49, e.displayText() = DB::Exception:" /var/log/clickhouse-server/clickhouse-server.log* > /dev/null \
    && echo -e 'Logical error thrown (see clickhouse-server.log)\tFAIL' >> /test_output/test_results.tsv \
    || echo -e 'No logical errors\tOK' >> /test_output/test_results.tsv

# Crash
zgrep -Fa "########################################" /var/log/clickhouse-server/clickhouse-server.log* > /dev/null \
    && echo -e 'Killed by signal (in clickhouse-server.log)\tFAIL' >> /test_output/test_results.tsv \
    || echo -e 'Not crashed\tOK' >> /test_output/test_results.tsv

# This also catches a crash without a stacktrace (printed by the watchdog)
zgrep -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server.log* > /dev/null \
    && echo -e 'Fatal message in clickhouse-server.log\tFAIL' >> /test_output/test_results.tsv \
    || echo -e 'No fatal messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv

zgrep -Fa "########################################" /test_output/* > /dev/null \
    && echo -e 'Killed by signal (output files)\tFAIL' >> /test_output/test_results.tsv

zgrep -Fa " received signal " /test_output/gdb.log > /dev/null \
    && echo -e 'Found signal in gdb.log\tFAIL' >> /test_output/test_results.tsv
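
# Note: /test_output/gdb.log is produced by the gdb session attached in start();
# if the server received a fatal signal, the " received signal " line plus the
# backtraces requested in script.gdb end up there.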
/test_output/"$(basename ${log_file})".gz # FIXME: remove once only github actions will be left rm "${log_file}" done tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||: mv /var/log/clickhouse-server/stderr.log /test_output/ # Replace the engine with Ordinary to avoid extra symlinks stuff in artifacts. # (so that clickhouse-local --path can read it w/o extra care). sed -i -e "s/ATTACH DATABASE _ UUID '[^']*'/ATTACH DATABASE system/" -e "s/Atomic/Ordinary/" /var/lib/clickhouse/metadata/system.sql for table in query_log trace_log; do sed -i "s/ATTACH TABLE _ UUID '[^']*'/ATTACH TABLE $table/" /var/lib/clickhouse/metadata/system/${table}.sql tar -chf /test_output/${table}_dump.tar /var/lib/clickhouse/metadata/system.sql /var/lib/clickhouse/metadata/system/${table}.sql /var/lib/clickhouse/data/system/${table} ||: done # Write check result into check_status.tsv clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' order by (lower(test) like '%hung%') LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv [ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv # Core dumps (see gcore) # Default filename is 'core.PROCESS_ID' for core in core.*; do pigz $core mv $core.gz /test_output/ done