#!/bin/bash

# core.COMM.PID-TID
sysctl kernel.core_pattern='core.%e.%p-%P'

OK="\tOK\t\\N\t"
FAIL="\tFAIL\t\\N\t"

FAILURE_CONTEXT_LINES=100
FAILURE_CONTEXT_MAX_LINE_WIDTH=300

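# OK/FAIL are row suffixes for /test_output/test_results.tsv: appending one of them to a check name
# produces a row of the form "<check name>\t<status>\t\N\t<context>", where \N appears to stand for
# an empty (NULL) third column and the last column holds log context trimmed to FAILURE_CONTEXT_LINES
# lines of at most FAILURE_CONTEXT_MAX_LINE_WIDTH characters.
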
function escaped()
{
    # That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language.
    # Also limit line width, just in case (overly long lines are usually not useful).
    clickhouse local -S 's String' --input-format=LineAsString -q "select substr(s, 1, $FAILURE_CONTEXT_MAX_LINE_WIDTH)
        from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'"
}

function head_escaped()
{
    head -n "$FAILURE_CONTEXT_LINES" "$1" | escaped
}

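# Illustrative example (not part of the test flow), assuming `clickhouse local` is on PATH:
#   printf 'first line\nsecond line\n' | escaped
# should print both lines joined by a literal "\n", each truncated to FAILURE_CONTEXT_MAX_LINE_WIDTH
# characters, so the result can be embedded in a single TSV field; head_escaped applies the same
# escaping to the first FAILURE_CONTEXT_LINES lines of a file.
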
function unts()
{
    # Drop the leading timestamp (as added by `ts`), keeping only the text after it.
    grep -Po "[0-9][0-9]:[0-9][0-9] \K.*"
}

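# Server log lines look roughly like "2024.01.01 00:00:00.000000 [ 12345 ] {query_id} <Error> ...";
# trim_server_logs below keeps only the part starting at the " [ thread_id ] {...}" fragment
# (dropping the timestamp) and escapes it for test_results.tsv.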
function trim_server_logs()
{
    head -n "$FAILURE_CONTEXT_LINES" "/test_output/$1" | grep -Eo " \[ [0-9]+ \] \{.*" | escaped
}

function install_packages()
{
    dpkg -i $1/clickhouse-common-static_*.deb
    dpkg -i $1/clickhouse-common-static-dbg_*.deb
    dpkg -i $1/clickhouse-server_*.deb
    dpkg -i $1/clickhouse-client_*.deb
}

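# Typical usage (the path is illustrative): `install_packages /package_folder`, where the directory
# contains the .deb files produced by the build job.
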
function configure()
{
    # install test configs
    export USE_DATABASE_ORDINARY=1
    export EXPORT_S3_STORAGE_POLICIES=1
    /usr/share/clickhouse-test/config/install.sh

    # avoid too slow startup: take Keeper snapshots more frequently
    sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \
        | sed "s|<snapshot_distance>100000</snapshot_distance>|<snapshot_distance>10000</snapshot_distance>|" \
        > /etc/clickhouse-server/config.d/keeper_port.xml.tmp
    sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
    sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
    sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml

    # for clickhouse-server (via service)
    echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
    # for clickhouse-client
    export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000'
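    # These are standard ASan runtime flags: malloc_context_size caps the number of stack frames
    # recorded per allocation, verbosity=1 makes ASan log its startup configuration, and
    # allocator_release_to_os_interval_ms controls how often freed memory is returned to the OS.
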
    # since we run clickhouse as root
    sudo chown root: /var/lib/clickhouse

    # Update asynchronous metrics more frequently, so that information about real memory usage is refreshed more often (less chance of OOM).
    echo "<clickhouse><asynchronous_metrics_update_period_s>1</asynchronous_metrics_update_period_s></clickhouse>" \
        > /etc/clickhouse-server/config.d/asynchronous_metrics_update_period_s.xml

    local total_mem
    total_mem=$(awk '/MemTotal/ { print $(NF-1) }' /proc/meminfo) # KiB
    total_mem=$(( total_mem*1024 )) # bytes

    # Set maximum memory usage as half of total memory (less chance of OOM).
    #
    # But not via max_server_memory_usage but via max_memory_usage_for_user,
    # so that we can override this setting and execute service queries, like:
    # - hung check
    # - show/drop database
    # - ...
    #
    # So max_memory_usage_for_user will be a soft limit, and
    # max_server_memory_usage will be a hard limit, and queries that should be
    # executed regardless of memory limits will use max_memory_usage_for_user=0
    # instead of relying on max_untracked_memory

    max_server_memory_usage_to_ram_ratio=0.5
    echo "Setting max_server_memory_usage_to_ram_ratio to ${max_server_memory_usage_to_ram_ratio}"
    cat > /etc/clickhouse-server/config.d/max_server_memory_usage.xml <<EOL
<clickhouse>
    <max_server_memory_usage_to_ram_ratio>${max_server_memory_usage_to_ram_ratio}</max_server_memory_usage_to_ram_ratio>
</clickhouse>
EOL

    local max_users_mem
    max_users_mem=$((total_mem*30/100)) # 30%
    echo "Setting max_memory_usage_for_user=$max_users_mem and max_memory_usage for queries to 10G"
    cat > /etc/clickhouse-server/users.d/max_memory_usage_for_user.xml <<EOL
<clickhouse>
    <profiles>
        <default>
            <max_memory_usage>10G</max_memory_usage>
            <max_memory_usage_for_user>${max_users_mem}</max_memory_usage_for_user>
        </default>
    </profiles>
</clickhouse>
EOL

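    # A worked example, assuming 64 GiB of RAM: total_mem = 68719476736 bytes, so the per-user
    # soft limit above is 68719476736*30/100 = 20615843020 bytes (~19.2 GiB), while the server-wide
    # hard limit from max_server_memory_usage_to_ram_ratio=0.5 is ~32 GiB.
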
    cat > /etc/clickhouse-server/config.d/core.xml <<EOL
<clickhouse>
    <core_dump>
        <!-- 100GiB -->
        <size_limit>107374182400</size_limit>
    </core_dump>
    <!-- NOTE: no need to configure core_path,
         since clickhouse is not started as a daemon (via clickhouse start)
    -->
    <core_path>$PWD</core_path>
</clickhouse>
EOL

    # Analyzer is not yet ready for testing.
    # The <readonly/> constraint keeps allow_experimental_analyzer at the profile default, so queries cannot enable it.
    cat > /etc/clickhouse-server/users.d/no_analyzer.xml <<EOL
<clickhouse>
    <profiles>
        <default>
            <constraints>
                <allow_experimental_analyzer>
                    <readonly/>
                </allow_experimental_analyzer>
            </constraints>
        </default>
    </profiles>
</clickhouse>
EOL

}

function stop()
{
    local max_tries="${1:-90}"
    local check_hang="${2:-true}"
    local pid
    # Preserve the pid, since the server can hang after the PID file has been deleted.
    pid="$(cat /var/run/clickhouse-server/clickhouse-server.pid)"

    clickhouse stop --max-tries "$max_tries" --do-not-kill && return

    if [ "$check_hang" == true ]
    then
        # We failed to stop the server with SIGTERM. Maybe it hung; let's collect stacktraces.
        echo -e "Possible deadlock on shutdown (see gdb.log)$FAIL" >> /test_output/test_results.tsv
        kill -TERM "$(pidof gdb)" ||:
        sleep 5
        echo "thread apply all backtrace (on stop)" >> /test_output/gdb.log
        timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$pid" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
        clickhouse stop --force
    fi
}

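# Callers may invoke `stop` with no arguments (90 tries, hang check enabled); something like
# `stop 300 false` would wait longer and skip the hang check (the exact values are illustrative).
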
function start()
{
    counter=0
    until clickhouse-client --query "SELECT 1"
    do
        if [ "$counter" -gt "${1:-120}" ]
        then
            echo "Cannot start clickhouse-server"
            rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||:
            echo -e "Cannot start clickhouse-server$FAIL$(trim_server_logs application_errors.txt)" >> /test_output/test_results.tsv
            cat /var/log/clickhouse-server/stdout.log
            tail -n100 /var/log/clickhouse-server/stderr.log
            tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | rg -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' | tail -n100
            break
        fi
        # use root to match the current uid
        clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>>/var/log/clickhouse-server/stderr.log
        sleep 0.5
        counter=$((counter + 1))
    done

    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to the watchdog,
    # and clickhouse-server can do fork-exec, for example, to run some bridge.
    # Do not set nostop noprint for all signals, because for some of them it may cause gdb to hang;
    # explicitly ignore the non-fatal signals that are used by the server.
    # The number of SIGRTMIN can be determined only at runtime.
    RTMIN=$(kill -l SIGRTMIN)
    echo "
set follow-fork-mode parent
handle SIGHUP nostop noprint pass
handle SIGINT nostop noprint pass
handle SIGQUIT nostop noprint pass
handle SIGPIPE nostop noprint pass
handle SIGTERM nostop noprint pass
handle SIGUSR1 nostop noprint pass
handle SIGUSR2 nostop noprint pass
handle SIG$RTMIN nostop noprint pass
info signals
continue
backtrace full
thread apply all backtrace full
info registers
disassemble /s
up
disassemble /s
up
disassemble /s
p \"done\"
detach
quit
" > script.gdb

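    # The gdb script above passes non-fatal signals through and then runs `continue`; if the server
    # later receives a fatal signal, gdb regains control and the remaining commands dump full
    # backtraces, registers, and disassembly into gdb.log before detaching.
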
    # FIXME Hung check may work incorrectly because of attached gdb
    # 1. False positives are possible
    # 2. We cannot attach another gdb to get stacktraces if some queries hang
    gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
    sleep 5
    # gdb will send SIGSTOP, spend some time loading debug info, and then send SIGCONT; wait for it (up to send_timeout, 300s)
    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
}

function check_server_start()
{
    clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
        || (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \
        && echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \
        >> /test_output/test_results.tsv)

    # Remove file application_errors.txt if it's empty
    [ -s /test_output/application_errors.txt ] || rm /test_output/application_errors.txt
}

function check_logs_for_critical_errors()
{
    # Sanitizer asserts
    rg -Fa "==================" /var/log/clickhouse-server/stderr.log | rg -v "in query:" >> /test_output/tmp
    rg -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
    rg -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null \
        && echo -e "Sanitizer assert (in stderr.log)$FAIL$(head_escaped /test_output/tmp)" >> /test_output/test_results.tsv \
        || echo -e "No sanitizer asserts$OK" >> /test_output/test_results.tsv
    rm -f /test_output/tmp

    # OOM
    rg -Fa " <Fatal> Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \
        && echo -e "Signal 9 in clickhouse-server.log$FAIL" >> /test_output/test_results.tsv \
        || echo -e "No OOM messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv

    # Logical errors
    rg -Fa "Code: 49. DB::Exception: " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/logical_errors.txt \
        && echo -e "Logical error thrown (see clickhouse-server.log or logical_errors.txt)$FAIL$(head_escaped /test_output/logical_errors.txt)" >> /test_output/test_results.tsv \
        || echo -e "No logical errors$OK" >> /test_output/test_results.tsv
    # Remove file logical_errors.txt if it's empty
    [ -s /test_output/logical_errors.txt ] || rm /test_output/logical_errors.txt

    # No such key errors
    rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log > /test_output/no_such_key_errors.txt \
        && echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(trim_server_logs no_such_key_errors.txt)" >> /test_output/test_results.tsv \
        || echo -e "No lost s3 keys$OK" >> /test_output/test_results.tsv

    # Remove file no_such_key_errors.txt if it's empty
    [ -s /test_output/no_such_key_errors.txt ] || rm /test_output/no_such_key_errors.txt

    # Crash
    rg -Fa "########################################" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \
        && echo -e "Killed by signal (in clickhouse-server.log)$FAIL" >> /test_output/test_results.tsv \
        || echo -e "Not crashed$OK" >> /test_output/test_results.tsv

    # Also check for a crash without a stacktrace (printed by the watchdog)
    rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/fatal_messages.txt \
        && echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(trim_server_logs fatal_messages.txt)" >> /test_output/test_results.tsv \
        || echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv

    # Remove file fatal_messages.txt if it's empty
    [ -s /test_output/fatal_messages.txt ] || rm /test_output/fatal_messages.txt

    rg -Fa "########################################" /test_output/* > /dev/null \
        && echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv

    function get_gdb_log_context()
    {
        rg -A50 -Fa " received signal " /test_output/gdb.log | head_escaped
    }

    rg -Fa " received signal " /test_output/gdb.log > /dev/null \
        && echo -e "Found signal in gdb.log$FAIL$(get_gdb_log_context)" >> /test_output/test_results.tsv

    dmesg -T > /test_output/dmesg.log

    # OOM in dmesg -- those are real
    grep -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' /test_output/dmesg.log \
        && echo -e "OOM in dmesg$FAIL$(head_escaped /test_output/dmesg.log)" >> /test_output/test_results.tsv \
        || echo -e "No OOM in dmesg$OK" >> /test_output/test_results.tsv
}

function collect_query_and_trace_logs()
{
    for table in query_log trace_log
    do
        clickhouse-local --path /var/lib/clickhouse/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
    done
}

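# To peek at a collected log afterwards, something like `zstdcat /test_output/query_log.tsv.zst | head`
# should work (the command is shown for illustration only).
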
function collect_core_dumps()
{
    find . -maxdepth 1 -type f -name 'core.*' | while read -r core; do
        zstd --threads=0 "$core"
        mv "$core.zst" /test_output/
    done
}

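# Note: the 'core.*' glob matches the kernel.core_pattern ('core.%e.%p-%P') configured at the top
# of this script, so any core dumps written into the current directory are picked up here.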