ClickHouse/tests/docker_scripts/stress_tests.lib

#!/bin/bash

# Status suffixes appended to a check name when a row is written to
# /test_output/test_results.tsv with `echo -e` (which turns the \t into tabs).
# The resulting row is: <name> TAB <status> TAB \N TAB <optional context>.
# NOTE(review): \N is presumably the TSV NULL marker for an unused column - confirm.
OK="\tOK\t\\N\t"
FAIL="\tFAIL\t\\N\t"

# Number of leading lines kept by head_escaped / trim_server_logs as failure context.
FAILURE_CONTEXT_LINES=100
# Length passed to substr() in escaped() to truncate every context line.
FAILURE_CONTEXT_MAX_LINE_WIDTH=300

function escaped()
{
    # Filter: reads raw lines on stdin and lets clickhouse-local re-emit them.
    # Each line is loaded as a single String column `s`, cut to
    # $FAILURE_CONTEXT_MAX_LINE_WIDTH with substr(), and printed in the
    # CustomSeparated format with a custom row delimiter, so that the result
    # can be embedded into one test_results.tsv cell.
    local truncate_query
    truncate_query="select substr(s, 1, $FAILURE_CONTEXT_MAX_LINE_WIDTH)
      from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'"

    clickhouse local \
        -S 's String' \
        --input-format=LineAsString \
        -q "$truncate_query"
}

function head_escaped()
{
    # Print the first $FAILURE_CONTEXT_LINES lines of the input, passed through escaped().
    #
    # Arguments: $1 - (optional) file to read; when it is omitted or empty the
    #                 input is taken from stdin, so the function can be used
    #                 both as `head_escaped FILE` and as `cmd | head_escaped`.
    #
    # "$1" must not be passed unconditionally: `head -n N ""` tries to open a
    # file with an empty name and fails instead of reading stdin.
    head -n "$FAILURE_CONTEXT_LINES" ${1:+"$1"} | escaped
}

function unts()
{
    # Filter: for every stdin line that contains "NN:NN " (two digits, a colon,
    # two digits, a space) print only the text that follows the first such
    # occurrence (\K discards the matched prefix). Lines without it are dropped.
    local -r after_timestamp="[0-9][0-9]:[0-9][0-9] \K.*"

    grep --perl-regexp --only-matching "$after_timestamp"
}

function trim_server_logs()
{
    # Print escaped failure context taken from a file in /test_output.
    #
    # Arguments: $1 - file name relative to /test_output
    #
    # Only the first $FAILURE_CONTEXT_LINES lines are considered, and from each
    # of them only the part starting at " [ <number> ] {" is kept (whatever
    # precedes it is cut off; lines without that marker are skipped).
    local log_file="/test_output/$1"
    local -r from_bracketed_number=" \[ [0-9]+ \] \{.*"

    head -n "$FAILURE_CONTEXT_LINES" "$log_file" \
        | grep -E -o "$from_bracketed_number" \
        | escaped
}

function install_packages()
{
    # Install the ClickHouse .deb packages found in a directory.
    #
    # Arguments: $1 - directory containing the packages
    #
    # The packages are installed one by one with `dpkg -i`, in the order listed
    # below; the version part of each file name is matched with a glob.
    local package
    for package in clickhouse-common-static clickhouse-common-static-dbg clickhouse-server clickhouse-client
    do
        dpkg -i "$1"/"${package}"_*.deb
    done
}

# Prepare the server configuration for a stress run.
#
# Steps, in order:
#   1. run /repo/tests/config/install.sh with USE_DATABASE_ORDINARY=1 and
#      EXPORT_S3_STORAGE_POLICIES=1 exported;
#   2. patch and (optionally) randomize boolean flags in files under
#      /etc/clickhouse-server/config.d;
#   3. set ASAN_OPTIONS for the server (via /etc/environment) and for the
#      current shell;
#   4. write memory-limit, user-profile and core-dump config overrides.
#
# Environment read: RANDOMIZE_KEEPER_FEATURE_FLAGS, ZOOKEEPER_FAULT_INJECTION
#                   (a branch is taken only when the variable equals 1), PWD.
# Side effects:     exports USE_DATABASE_ORDINARY, EXPORT_S3_STORAGE_POLICIES and
#                   ASAN_OPTIONS; defines the global helper
#                   randomize_config_boolean_value; sets the non-local variables
#                   `value` and `max_server_memory_usage_to_ram_ratio`.
function configure()
{
    # install test configs
    export USE_DATABASE_ORDINARY=1
    export EXPORT_S3_STORAGE_POLICIES=1
    /repo/tests/config/install.sh

    # avoid too slow startup
    # (snapshot_distance is lowered from 100000 to 10000; the result is written
    # to a temporary file first and then moved over the original)
    sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \
      | sed "s|<snapshot_distance>100000</snapshot_distance>|<snapshot_distance>10000</snapshot_distance>|" \
      > /etc/clickhouse-server/config.d/keeper_port.xml.tmp
    sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml

    # Replace <TAG>0</TAG> or <TAG>1</TAG> with a randomly chosen 0/1.
    #   $1 - TAG name
    #   $2 - config file name under /etc/clickhouse-server/config.d, without ".xml"
    # Being declared with `function` inside configure, the helper becomes a
    # global function once configure has run; `value` is not declared local.
    function randomize_config_boolean_value {
        value=$((RANDOM % 2))
        sudo cat "/etc/clickhouse-server/config.d/$2.xml" \
        | sed "s|<$1>[01]</$1>|<$1>$value</$1>|" \
        > "/etc/clickhouse-server/config.d/$2.xml.tmp"
        sudo mv "/etc/clickhouse-server/config.d/$2.xml.tmp" "/etc/clickhouse-server/config.d/$2.xml"
    }

    if [[ -n "$RANDOMIZE_KEEPER_FEATURE_FLAGS" ]] && [[ "$RANDOMIZE_KEEPER_FEATURE_FLAGS" -eq 1 ]]; then
        # Randomize all Keeper feature flags
        randomize_config_boolean_value filtered_list keeper_port
        randomize_config_boolean_value multi_read keeper_port
        randomize_config_boolean_value check_not_exists keeper_port
        randomize_config_boolean_value create_if_not_exists keeper_port
        randomize_config_boolean_value remove_recursive keeper_port
    fi

    # keeper_port.xml was re-created through a temporary file above, so hand it
    # (back) to the clickhouse user and group.
    # NOTE(review): presumably the temporary file ends up owned by the invoking user - confirm.
    sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
    sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml

    # Randomize use_compression in whichever zookeeper config is selected by
    # ZOOKEEPER_FAULT_INJECTION.
    if [[ -n "$ZOOKEEPER_FAULT_INJECTION" ]] && [[ "$ZOOKEEPER_FAULT_INJECTION" -eq 1 ]]; then
        randomize_config_boolean_value use_compression zookeeper_fault_injection
    else
        randomize_config_boolean_value use_compression zookeeper
    fi

    randomize_config_boolean_value enable_block_number_column block_number
    randomize_config_boolean_value enable_block_offset_column block_number

    # for clickhouse-server (via service)
    echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
    # for clickhouse-client
    export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000'

    # since we run clickhouse from root
    sudo chown root: /var/lib/clickhouse

    # Total RAM: the next-to-last field of the MemTotal line of /proc/meminfo.
    local total_mem
    total_mem=$(awk '/MemTotal/ { print $(NF-1) }' /proc/meminfo) # KiB
    total_mem=$(( total_mem*1024 )) # bytes

    # Set maximum memory usage as half of total memory (less chance of OOM).
    #
    # But not via max_server_memory_usage but via max_memory_usage_for_user,
    # so that we can override this setting and execute service queries, like:
    # - hung check
    # - show/drop database
    # - ...
    #
    # So max_memory_usage_for_user will be a soft limit, and
    # max_server_memory_usage will be hard limit, and queries that should be
    # executed regardless memory limits will use max_memory_usage_for_user=0,
    # instead of relying on max_untracked_memory

    max_server_memory_usage_to_ram_ratio=0.5
    echo "Setting max_server_memory_usage_to_ram_ratio to ${max_server_memory_usage_to_ram_ratio}"
    # The heredoc delimiter is unquoted, so ${...} is expanded by the shell.
    cat > /etc/clickhouse-server/config.d/max_server_memory_usage.xml <<EOL
<clickhouse>
    <max_server_memory_usage_to_ram_ratio>${max_server_memory_usage_to_ram_ratio}</max_server_memory_usage_to_ram_ratio>
</clickhouse>
EOL

    # Per-user limit: 30% of total RAM, in bytes (integer arithmetic).
    local max_users_mem
    max_users_mem=$((total_mem*30/100)) # 30%
    # Similar to docker/test/fuzzer/query-fuzzer-tweaks-users.xml
    echo "Setting max_memory_usage_for_user=$max_users_mem and max_memory_usage for queries to 10G"
    # Overrides for the `default` profile: execution time / memory limits plus
    # constraints that cap them or make some settings read-only.
    cat > /etc/clickhouse-server/users.d/stress_test_tweaks-users.xml <<EOL
<clickhouse>
    <profiles>
        <default>
            <max_execution_time>60</max_execution_time>
            <max_memory_usage>10G</max_memory_usage>
            <max_memory_usage_for_user>${max_users_mem}</max_memory_usage_for_user>

            <table_function_remote_max_addresses>200</table_function_remote_max_addresses>

            <constraints>
                <max_execution_time>
                    <max>60</max>
                </max_execution_time>

                <max_memory_usage>
                    <max>10G</max>
                </max_memory_usage>

                <table_function_remote_max_addresses>
                    <max>200</max>
                </table_function_remote_max_addresses>

                <!-- Don't waste cycles testing the old interpreter. Spend time in the new analyzer instead -->
                <enable_analyzer>
                    <readonly/>
                </enable_analyzer>

                <!-- This feature is broken, deprecated and will be removed. We don't want more reports about it -->
                <allow_experimental_object_type>
                    <readonly/>
                </allow_experimental_object_type>

                <!-- Prevent stack overflow -->
                <max_ast_depth>
                    <readonly/>
                </max_ast_depth>
            </constraints>
        </default>
    </profiles>
</clickhouse>
EOL

    # Core dump settings; $PWD is expanded when the file is written.
    # NOTE(review): the XML note below says core_path does not need to be
    # configured, yet <core_path> is set right after it - confirm which one is intended.
    cat > /etc/clickhouse-server/config.d/core.xml <<EOL
<clickhouse>
    <core_dump>
        <!-- 100GiB -->
        <size_limit>107374182400</size_limit>
    </core_dump>
    <!-- NOTE: no need to configure core_path,
         since clickhouse is not started as daemon (via clickhouse start)
    -->
    <core_path>$PWD</core_path>
</clickhouse>
EOL

}

function stop_server()
{
    # Stop clickhouse-server gracefully and fall back to a forced stop.
    #
    # First `clickhouse stop --do-not-kill` is tried. If that fails, a warning
    # row is written to test_results.tsv; then (when hang checking is on) gdb is
    # terminated, and if the server is still alive its backtraces are dumped to
    # /test_output/gdb.log before `clickhouse stop --force`.
    local stop_attempts=90
    local collect_stacks_on_hang=true
    local server_pid
    # Remember the pid up front: the pid file may be removed while the server
    # process is still hanging around.
    server_pid="$(cat /var/run/clickhouse-server/clickhouse-server.pid)"

    clickhouse stop --max-tries "$stop_attempts" --do-not-kill && return

    # SIGTERM did not stop the server in time. Leave a special status row, so
    # that such runs can be found in the CI DB.
    echo -e "Warning: server did not stop yet$OK" >> /test_output/test_results.tsv

    if [[ "$collect_stacks_on_hang" != true ]]
    then
        clickhouse stop --force
        return
    fi

    # Maybe the server hangs, so get ready to collect stacktraces:
    # terminate the gdb that is currently running (best effort).
    kill -TERM "$(pidof gdb)" ||:
    sleep 5

    # The server may have finally stopped while gdb was being terminated;
    # nothing else to do in that case.
    kill -s 0 "$server_pid" || return

    echo -e "Possible deadlock on shutdown (see gdb.log)$FAIL" >> /test_output/test_results.tsv
    echo "thread apply all backtrace (on stop)" >> /test_output/gdb.log
    timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$server_pid" \
        | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
    clickhouse stop --force
}

function start_server()
{
    # Start clickhouse-server and wait until it answers "SELECT 1".
    #
    # `clickhouse start` is re-issued every 0.5s while the probe query fails.
    # Once more than $max_attempt retries were made, a FAIL row plus the
    # "<Error>...Application" log lines are recorded, the tails of the server
    # logs are printed and the loop is abandoned. gdb is attached afterwards in
    # either case.
    #
    # `counter` and `max_attempt` are intentionally left as global variables.
    counter=0
    max_attempt=120
    while ! clickhouse-client --query "SELECT 1"
    do
        if (( counter <= max_attempt ))
        then
            # use root to match with current uid
            clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>>/var/log/clickhouse-server/stderr.log
            sleep 0.5
            counter=$(( counter + 1 ))
            continue
        fi

        # Out of attempts: report the failure with as much context as possible.
        echo "Cannot start clickhouse-server"
        rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||:
        echo -e "Cannot start clickhouse-server$FAIL$(trim_server_logs application_errors.txt)" >> /test_output/test_results.tsv
        cat /var/log/clickhouse-server/stdout.log
        tail -n100 /var/log/clickhouse-server/stderr.log
        tail -n100000 /var/log/clickhouse-server/clickhouse-server.log \
            | rg -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' \
            | tail -n100
        break
    done

    attach_gdb_to_clickhouse
}

function check_server_start()
{
    # Record in /test_output/test_results.tsv whether the server is up.
    #
    # When the probe query succeeds, its own output is the "OK" result row.
    # Otherwise the "<Error>...Application" lines of the server log are saved to
    # /test_output/application_errors.txt and a FAIL row (with that context) is
    # written.
    if ! clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv
    then
        # rg exits non-zero when nothing matches; the failure must be reported
        # anyway (start_server handles this the same way), hence `||:`.
        rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||:
        echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \
            >> /test_output/test_results.tsv
    fi

    # Remove file application_errors.txt if it's empty.
    # -f: on the success path the file may not exist at all, which must neither
    # print an error nor make this function return non-zero.
    [ -s /test_output/application_errors.txt ] || rm -f /test_output/application_errors.txt
}

function check_logs_for_critical_errors()
{
    # Scan the server logs, gdb.log, the files in /test_output and dmesg for
    # signs of critical problems and append one result row per check to
    # /test_output/test_results.tsv. Matching lines are additionally kept in
    # dedicated files under /test_output (removed again when empty).
    #
    # Every check is an explicit if/else, so that a failure to write the FAIL
    # row can never result in an OK row being written instead.

    # Sanitizer asserts
    sed -n '/WARNING:.*anitizer/,/^$/p' /var/log/clickhouse-server/stderr*.log >> /test_output/tmp
    if rg -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null
    then
        echo -e "Sanitizer assert (in stderr.log)$FAIL$(head_escaped /test_output/tmp)" >> /test_output/test_results.tsv
    else
        echo -e "No sanitizer asserts$OK" >> /test_output/test_results.tsv
    fi
    rm -f /test_output/tmp

    # OOM
    if rg -Fa " <Fatal> Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null
    then
        echo -e "Signal 9 in clickhouse-server.log$FAIL" >> /test_output/test_results.tsv
    else
        echo -e "No OOM messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
    fi

    # Logical errors
    if rg -Fa "Code: 49. DB::Exception: " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/logical_errors.txt
    then
        echo -e "Logical error thrown (see clickhouse-server.log or logical_errors.txt)$FAIL$(head_escaped /test_output/logical_errors.txt)" >> /test_output/test_results.tsv
    else
        echo -e "No logical errors$OK" >> /test_output/test_results.tsv
    fi
    # Remove file logical_errors.txt if it's empty
    [ -s /test_output/logical_errors.txt ] || rm -f /test_output/logical_errors.txt

    # No such key errors (ignore a.myext which is used in 02724_database_s3.sh and does not exist)
    if rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log | grep -v "a.myext" > /test_output/no_such_key_errors.txt
    then
        echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(trim_server_logs no_such_key_errors.txt)" >> /test_output/test_results.tsv
    else
        echo -e "No lost s3 keys$OK" >> /test_output/test_results.tsv
    fi

    if rg -Fa "it is lost forever" /var/log/clickhouse-server/clickhouse-server*.log | grep 'SharedMergeTreePartCheckThread' > /dev/null
    then
        echo -e "Lost forever for SharedMergeTree$FAIL" >> /test_output/test_results.tsv
    else
        echo -e "No SharedMergeTree lost forever in clickhouse-server.log$OK" >> /test_output/test_results.tsv
    fi

    # Remove file no_such_key_errors.txt if it's empty
    [ -s /test_output/no_such_key_errors.txt ] || rm -f /test_output/no_such_key_errors.txt

    # Crash. This must have fewer '#'s than the command below, otherwise the command below will match
    # the echo of this command (if set -x is enabled, and this script's stdout is sent
    # to /test_output/run.log).
    if rg -Fa "#######################################" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null
    then
        echo -e "Killed by signal (in clickhouse-server.log)$FAIL" >> /test_output/test_results.tsv
    else
        echo -e "Not crashed$OK" >> /test_output/test_results.tsv
    fi

    # It also checks for crash without stacktrace (printed by watchdog)
    if rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/fatal_messages.txt
    then
        echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(trim_server_logs fatal_messages.txt)" >> /test_output/test_results.tsv
    else
        echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
    fi

    # Remove file fatal_messages.txt if it's empty
    [ -s /test_output/fatal_messages.txt ] || rm -f /test_output/fatal_messages.txt

    # Only a failure is reported for this check, there is no OK row.
    if rg -Faz "########################################" /test_output/* > /dev/null
    then
        echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv
    fi

    # Print the escaped context (50 lines after each match) of signals seen by gdb.
    function get_gdb_log_context()
    {
        # head_escaped is fed through a pipe here, so it is pointed at
        # /dev/stdin explicitly: called without a file argument it would try to
        # open a file with an empty name and the context would be lost.
        rg -A50 -Fa " received signal " /test_output/gdb.log | head_escaped /dev/stdin
    }

    # Only a failure is reported for this check, there is no OK row.
    if rg -Fa " received signal " /test_output/gdb.log > /dev/null
    then
        echo -e "Found signal in gdb.log$FAIL$(get_gdb_log_context)" >> /test_output/test_results.tsv
    fi

    dmesg -T > /test_output/dmesg.log

    # OOM in dmesg -- those are real
    if grep -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' /test_output/dmesg.log
    then
        echo -e "OOM in dmesg$FAIL$(head_escaped /test_output/dmesg.log)" >> /test_output/test_results.tsv
    else
        echo -e "No OOM in dmesg$OK" >> /test_output/test_results.tsv
    fi
}

function collect_query_and_trace_logs()
{
    # Dump system.query_log, system.trace_log and system.metric_log, one after
    # another, into /test_output/<table>.tsv.zst (TSVWithNamesAndTypes compressed
    # with zstd). The tables are read by clickhouse-local using the server's
    # config file.
    #
    # The loop variable `table` is deliberately not local (it stays set to the
    # last table name after the call).
    for table in query_log trace_log metric_log
    do
        # Errors are not suppressed on purpose: ignoring them would also hide
        # sanitizer reports produced while clickhouse-local is running.
        clickhouse-local \
            --config-file=/etc/clickhouse-server/config.xml \
            --only-system-tables \
            -q "select * from system.$table format TSVWithNamesAndTypes" \
            | zstd --threads=0 > "/test_output/${table}.tsv.zst"
    done
}

# vi: ft=bash