#!/bin/bash # core.COMM.PID-TID sysctl kernel.core_pattern='core.%e.%p-%P' OK="\tOK\t\\N\t" FAIL="\tFAIL\t\\N\t" FAILURE_CONTEXT_LINES=100 FAILURE_CONTEXT_MAX_LINE_WIDTH=300 function escaped() { # That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language. # Also limit lines width just in case (too long lines are not really useful usually) clickhouse local -S 's String' --input-format=LineAsString -q "select substr(s, 1, $FAILURE_CONTEXT_MAX_LINE_WIDTH) from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'" } function head_escaped() { head -n $FAILURE_CONTEXT_LINES $1 | escaped } function unts() { grep -Po "[0-9][0-9]:[0-9][0-9] \K.*" } function trim_server_logs() { head -n $FAILURE_CONTEXT_LINES "/test_output/$1" | grep -Eo " \[ [0-9]+ \] \{.*" | escaped } function install_packages() { dpkg -i $1/clickhouse-common-static_*.deb dpkg -i $1/clickhouse-common-static-dbg_*.deb dpkg -i $1/clickhouse-server_*.deb dpkg -i $1/clickhouse-client_*.deb } function configure() { # install test configs export USE_DATABASE_ORDINARY=1 export EXPORT_S3_STORAGE_POLICIES=1 /usr/share/clickhouse-test/config/install.sh # avoid too slow startup sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \ | sed "s|100000|10000|" \ > /etc/clickhouse-server/config.d/keeper_port.xml.tmp sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml function randomize_config_boolean_value { value=$(($RANDOM % 2)) sudo cat /etc/clickhouse-server/config.d/$2.xml \ | sed "s|<$1>[01]|<$1>$value|" \ > /etc/clickhouse-server/config.d/$2.xml.tmp sudo mv /etc/clickhouse-server/config.d/$2.xml.tmp /etc/clickhouse-server/config.d/$2.xml } if [[ -n "$RANDOMIZE_KEEPER_FEATURE_FLAGS" ]] && [[ "$RANDOMIZE_KEEPER_FEATURE_FLAGS" -eq 1 ]]; then # Randomize all Keeper feature flags randomize_config_boolean_value filtered_list keeper_port randomize_config_boolean_value multi_read keeper_port randomize_config_boolean_value check_not_exists keeper_port randomize_config_boolean_value create_if_not_exists keeper_port fi sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml if [[ -n "$ZOOKEEPER_FAULT_INJECTION" ]] && [[ "$ZOOKEEPER_FAULT_INJECTION" -eq 1 ]]; then randomize_config_boolean_value use_compression zookeeper_fault_injection else randomize_config_boolean_value use_compression zookeeper fi # for clickhouse-server (via service) echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment # for clickhouse-client export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000' # since we run clickhouse from root sudo chown root: /var/lib/clickhouse # Set more frequent update period of asynchronous metrics to more frequently update information about real memory usage (less chance of OOM). echo "1" \ > /etc/clickhouse-server/config.d/asynchronous_metrics_update_period_s.xml local total_mem total_mem=$(awk '/MemTotal/ { print $(NF-1) }' /proc/meminfo) # KiB total_mem=$(( total_mem*1024 )) # bytes # Set maximum memory usage as half of total memory (less chance of OOM). # # But not via max_server_memory_usage but via max_memory_usage_for_user, # so that we can override this setting and execute service queries, like: # - hung check # - show/drop database # - ... # # So max_memory_usage_for_user will be a soft limit, and # max_server_memory_usage will be hard limit, and queries that should be # executed regardless memory limits will use max_memory_usage_for_user=0, # instead of relying on max_untracked_memory max_server_memory_usage_to_ram_ratio=0.5 echo "Setting max_server_memory_usage_to_ram_ratio to ${max_server_memory_usage_to_ram_ratio}" cat > /etc/clickhouse-server/config.d/max_server_memory_usage.xml < ${max_server_memory_usage_to_ram_ratio} EOL local max_users_mem max_users_mem=$((total_mem*30/100)) # 30% echo "Setting max_memory_usage_for_user=$max_users_mem and max_memory_usage for queries to 10G" cat > /etc/clickhouse-server/users.d/max_memory_usage_for_user.xml < 10G ${max_users_mem} EOL cat > /etc/clickhouse-server/config.d/core.xml < 107374182400 $PWD EOL } function stop() { local max_tries="${1:-90}" local check_hang="${2:-true}" local pid # Preserve the pid, since the server can hung after the PID will be deleted. pid="$(cat /var/run/clickhouse-server/clickhouse-server.pid)" clickhouse stop --max-tries "$max_tries" --do-not-kill && return if [ $check_hang == true ] then # We failed to stop the server with SIGTERM. Maybe it hang, let's collect stacktraces. # Add a special status just in case, so it will be possible to find in the CI DB echo -e "Warning: server did not stop yet$OK" >> /test_output/test_results.tsv kill -TERM "$(pidof gdb)" ||: sleep 5 # The server could finally stop while we were terminating gdb, let's recheck if it's still running kill -s 0 $pid || return echo -e "Possible deadlock on shutdown (see gdb.log)$FAIL" >> /test_output/test_results.tsv echo "thread apply all backtrace (on stop)" >> /test_output/gdb.log timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$pid" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log clickhouse stop --force else echo -e "Warning: server did not stop yet$OK" >> /test_output/test_results.tsv clickhouse stop --force fi } function start() { counter=0 until clickhouse-client --query "SELECT 1" do if [ "$counter" -gt ${1:-120} ] then echo "Cannot start clickhouse-server" rg --text ".*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||: echo -e "Cannot start clickhouse-server$FAIL$(trim_server_logs application_errors.txt)" >> /test_output/test_results.tsv cat /var/log/clickhouse-server/stdout.log tail -n100 /var/log/clickhouse-server/stderr.log tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | rg -F -v -e ' RaftInstance:' -e ' RaftInstance' | tail -n100 break fi # use root to match with current uid clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>>/var/log/clickhouse-server/stderr.log sleep 0.5 counter=$((counter + 1)) done attach_gdb_to_clickhouse } function check_server_start() { clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \ || (rg --text ".*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \ && echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \ >> /test_output/test_results.tsv) # Remove file application_errors.txt if it's empty [ -s /test_output/application_errors.txt ] || rm /test_output/application_errors.txt } function check_logs_for_critical_errors() { # Sanitizer asserts rg -Fa "==================" /var/log/clickhouse-server/stderr.log | rg -v "in query:" >> /test_output/tmp rg -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp rg -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null \ && echo -e "Sanitizer assert (in stderr.log)$FAIL$(head_escaped /test_output/tmp)" >> /test_output/test_results.tsv \ || echo -e "No sanitizer asserts$OK" >> /test_output/test_results.tsv rm -f /test_output/tmp # OOM rg -Fa " Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \ && echo -e "Signal 9 in clickhouse-server.log$FAIL" >> /test_output/test_results.tsv \ || echo -e "No OOM messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv # Logical errors rg -Fa "Code: 49. DB::Exception: " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/logical_errors.txt \ && echo -e "Logical error thrown (see clickhouse-server.log or logical_errors.txt)$FAIL$(head_escaped /test_output/logical_errors.txt)" >> /test_output/test_results.tsv \ || echo -e "No logical errors$OK" >> /test_output/test_results.tsv # Remove file logical_errors.txt if it's empty [ -s /test_output/logical_errors.txt ] || rm /test_output/logical_errors.txt # No such key errors rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log > /test_output/no_such_key_errors.txt \ && echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(trim_server_logs no_such_key_errors.txt)" >> /test_output/test_results.tsv \ || echo -e "No lost s3 keys$OK" >> /test_output/test_results.tsv rg -Fa "it is lost forever" /var/log/clickhouse-server/clickhouse-server*.log | grep 'SharedMergeTreePartCheckThread' > /dev/null \ && echo -e "Lost forever for SharedMergeTree$FAIL" >> /test_output/test_results.tsv \ || echo -e "No SharedMergeTree lost forever in clickhouse-server.log$OK" >> /test_output/test_results.tsv # Remove file no_such_key_errors.txt if it's empty [ -s /test_output/no_such_key_errors.txt ] || rm /test_output/no_such_key_errors.txt # Crash rg -Fa "########################################" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \ && echo -e "Killed by signal (in clickhouse-server.log)$FAIL" >> /test_output/test_results.tsv \ || echo -e "Not crashed$OK" >> /test_output/test_results.tsv # It also checks for crash without stacktrace (printed by watchdog) rg -Fa " " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/fatal_messages.txt \ && echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(trim_server_logs fatal_messages.txt)" >> /test_output/test_results.tsv \ || echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv # Remove file fatal_messages.txt if it's empty [ -s /test_output/fatal_messages.txt ] || rm /test_output/fatal_messages.txt rg -Faz "########################################" /test_output/* > /dev/null \ && echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv function get_gdb_log_context() { rg -A50 -Fa " received signal " /test_output/gdb.log | head_escaped } rg -Fa " received signal " /test_output/gdb.log > /dev/null \ && echo -e "Found signal in gdb.log$FAIL$(get_gdb_log_context)" >> /test_output/test_results.tsv dmesg -T > /test_output/dmesg.log # OOM in dmesg -- those are real grep -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' /test_output/dmesg.log \ && echo -e "OOM in dmesg$FAIL$(head_escaped /test_output/dmesg.log)" >> /test_output/test_results.tsv \ || echo -e "No OOM in dmesg$OK" >> /test_output/test_results.tsv } function collect_query_and_trace_logs() { for table in query_log trace_log do clickhouse-local --config-file=/etc/clickhouse-server/config.xml --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||: done } function collect_core_dumps() { find . -type f -maxdepth 1 -name 'core.*' | while read core; do zstd --threads=0 $core mv $core.zst /test_output/ done } # vi: ft=bash