Make better

2024-11-21 23:21:59 +00:00 · 2023-02-24 16:43:28 +00:00 · 2023-02-24 16:43:28 +00:00 · 8c6cf28c01
commit 8c6cf28c01
parent 6b7d3bf07b
3 changed files with 463 additions and 753 deletions
--- a/docker/test/stress/run.sh
+++ b/docker/test/stress/run.sh
@ -8,230 +8,13 @@ dmesg --clear

 set -x

-# core.COMM.PID-TID
-sysctl kernel.core_pattern='core.%e.%p-%P'
+# we mount tests folder from repo to /usr/share
+ln -s /usr/share/clickhouse-test/ci/stress.py /usr/bin/stress
+ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test

-OK="\tOK\t\\N\t"
-FAIL="\tFAIL\t\\N\t"
-
-FAILURE_CONTEXT_LINES=50
-FAILURE_CONTEXT_MAX_LINE_WIDTH=400
-
-function escaped()
-{
-    # That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language.
-    # Also limit lines width just in case (too long lines are not really useful usually)
-    clickhouse local -S 's String' --input-format=LineAsString -q "select substr(s, 1, $FAILURE_CONTEXT_MAX_LINE_WIDTH)
-      from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'"
-}
-function head_escaped()
-{
-    head -n $FAILURE_CONTEXT_LINES $1 | escaped
-}
-function unts()
-{
-    grep -Po "[0-9][0-9]:[0-9][0-9] \K.*"
-}
-function trim_server_logs()
-{
-    head -n $FAILURE_CONTEXT_LINES "/test_output/$1" | grep -Eo " \[ [0-9]+ \] \{.*" | escaped
-}
-
-function install_packages()
-{
-    dpkg -i $1/clickhouse-common-static_*.deb
-    dpkg -i $1/clickhouse-common-static-dbg_*.deb
-    dpkg -i $1/clickhouse-server_*.deb
-    dpkg -i $1/clickhouse-client_*.deb
-}
-
-function configure()
-{
-    # install test configs
-    export USE_DATABASE_ORDINARY=1
-    export EXPORT_S3_STORAGE_POLICIES=1
-    /usr/share/clickhouse-test/config/install.sh
-
-    # we mount tests folder from repo to /usr/share
-    ln -s /usr/share/clickhouse-test/ci/stress.py /usr/bin/stress
-    ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
-    ln -s /usr/share/clickhouse-test/ci/download_release_packages.py /usr/bin/download_release_packages
-    ln -s /usr/share/clickhouse-test/ci/get_previous_release_tag.py /usr/bin/get_previous_release_tag
-
-    # avoid too slow startup
-    sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \
-      | sed "s|<snapshot_distance>100000</snapshot_distance>|<snapshot_distance>10000</snapshot_distance>|" \
-      > /etc/clickhouse-server/config.d/keeper_port.xml.tmp
-    sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
-    sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
-    sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
-
-    # for clickhouse-server (via service)
-    echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
-    # for clickhouse-client
-    export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000'
-
-    # since we run clickhouse from root
-    sudo chown root: /var/lib/clickhouse
-
-    # Set more frequent update period of asynchronous metrics to more frequently update information about real memory usage (less chance of OOM).
-    echo "<clickhouse><asynchronous_metrics_update_period_s>1</asynchronous_metrics_update_period_s></clickhouse>" \
-        > /etc/clickhouse-server/config.d/asynchronous_metrics_update_period_s.xml
-
-    local total_mem
-    total_mem=$(awk '/MemTotal/ { print $(NF-1) }' /proc/meminfo) # KiB
-    total_mem=$(( total_mem*1024 )) # bytes
-
-    # Set maximum memory usage as half of total memory (less chance of OOM).
-    #
-    # But not via max_server_memory_usage but via max_memory_usage_for_user,
-    # so that we can override this setting and execute service queries, like:
-    # - hung check
-    # - show/drop database
-    # - ...
-    #
-    # So max_memory_usage_for_user will be a soft limit, and
-    # max_server_memory_usage will be hard limit, and queries that should be
-    # executed regardless memory limits will use max_memory_usage_for_user=0,
-    # instead of relying on max_untracked_memory
-
-    max_server_memory_usage_to_ram_ratio=0.5
-    echo "Setting max_server_memory_usage_to_ram_ratio to ${max_server_memory_usage_to_ram_ratio}"
-    cat > /etc/clickhouse-server/config.d/max_server_memory_usage.xml <<EOL
-<clickhouse>
-    <max_server_memory_usage_to_ram_ratio>${max_server_memory_usage_to_ram_ratio}</max_server_memory_usage_to_ram_ratio>
-</clickhouse>
-EOL
-
-    local max_users_mem
-    max_users_mem=$((total_mem*30/100)) # 30%
-    echo "Setting max_memory_usage_for_user=$max_users_mem and max_memory_usage for queries to 10G"
-    cat > /etc/clickhouse-server/users.d/max_memory_usage_for_user.xml <<EOL
-<clickhouse>
-    <profiles>
-        <default>
-            <max_memory_usage>10G</max_memory_usage>
-            <max_memory_usage_for_user>${max_users_mem}</max_memory_usage_for_user>
-        </default>
-    </profiles>
-</clickhouse>
-EOL
-
-    cat > /etc/clickhouse-server/config.d/core.xml <<EOL
-<clickhouse>
-    <core_dump>
-        <!-- 100GiB -->
-        <size_limit>107374182400</size_limit>
-    </core_dump>
-    <!-- NOTE: no need to configure core_path,
-         since clickhouse is not started as daemon (via clickhouse start)
-    -->
-    <core_path>$PWD</core_path>
-</clickhouse>
-EOL
-
-    # Let OOM killer terminate other processes before clickhouse-server:
-    cat > /etc/clickhouse-server/config.d/oom_score.xml <<EOL
-<clickhouse>
-    <oom_score>-1000</oom_score>
-</clickhouse>
-EOL
-
-    # Analyzer is not yet ready for testing
-    cat > /etc/clickhouse-server/users.d/no_analyzer.xml <<EOL
-<clickhouse>
-    <profiles>
-        <default>
-            <constraints>
-                <allow_experimental_analyzer>
-                    <readonly/>
-                </allow_experimental_analyzer>
-            </constraints>
-        </default>
-    </profiles>
-</clickhouse>
-EOL
-
-}
-
-function stop()
-{
-    local max_tries="${1:-90}"
-    local pid
-    # Preserve the pid, since the server can hung after the PID will be deleted.
-    pid="$(cat /var/run/clickhouse-server/clickhouse-server.pid)"
-
-    clickhouse stop --max-tries "$max_tries" --do-not-kill && return
-
-    # We failed to stop the server with SIGTERM. Maybe it hang, let's collect stacktraces.
-    echo -e "Possible deadlock on shutdown (see gdb.log)$FAIL" >> /test_output/test_results.tsv
-    kill -TERM "$(pidof gdb)" ||:
-    sleep 5
-    echo "thread apply all backtrace (on stop)" >> /test_output/gdb.log
-    timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$pid" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
-    clickhouse stop --force
-}
-
-function start()
-{
-    counter=0
-    until clickhouse-client --query "SELECT 1"
-    do
-        if [ "$counter" -gt ${1:-120} ]
-        then
-            echo "Cannot start clickhouse-server"
-            rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||:
-            echo -e "Cannot start clickhouse-server$FAIL$(trim_server_logs application_errors.txt)" >> /test_output/test_results.tsv
-            cat /var/log/clickhouse-server/stdout.log
-            tail -n100 /var/log/clickhouse-server/stderr.log
-            tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | rg -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' | tail -n100
-            break
-        fi
-        # use root to match with current uid
-        clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>>/var/log/clickhouse-server/stderr.log
-        sleep 0.5
-        counter=$((counter + 1))
-    done
-
-    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
-    # and clickhouse-server can do fork-exec, for example, to run some bridge.
-    # Do not set nostop noprint for all signals, because some it may cause gdb to hang,
-    # explicitly ignore non-fatal signals that are used by server.
-    # Number of SIGRTMIN can be determined only in runtime.
-    RTMIN=$(kill -l SIGRTMIN)
-    echo "
-set follow-fork-mode parent
-handle SIGHUP nostop noprint pass
-handle SIGINT nostop noprint pass
-handle SIGQUIT nostop noprint pass
-handle SIGPIPE nostop noprint pass
-handle SIGTERM nostop noprint pass
-handle SIGUSR1 nostop noprint pass
-handle SIGUSR2 nostop noprint pass
-handle SIG$RTMIN nostop noprint pass
-info signals
-continue
-backtrace full
-thread apply all backtrace full
-info registers
-disassemble /s
-up
-disassemble /s
-up
-disassemble /s
-p \"done\"
-detach
-quit
-" > script.gdb
-
-    # FIXME Hung check may work incorrectly because of attached gdb
-    # 1. False positives are possible
-    # 2. We cannot attach another gdb to get stacktraces if some queries hung
-    gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
-    sleep 5
-    # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
-    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
-}
+# Stress tests and the upgrade check share code that lives
+# in a separate bash library. See tests/ci/stress_tests.lib
+source /usr/share/clickhouse-test/ci/stress_tests.lib

 install_packages package_folder

@ -414,13 +197,7 @@ unset "${!THREAD_@}"

 start

-clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
-    || (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \
-    && echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \
-    >> /test_output/test_results.tsv)
-
-# Remove file application_errors.txt if it's empty
-[ -s /test_output/application_errors.txt ] || rm /test_output/application_errors.txt
+check_server_start

 stop

@ -430,71 +207,11 @@ stop
 mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.final.log

 # Grep logs for sanitizer asserts, crashes and other critical errors
+check_logs_for_critical_errors

-# Sanitizer asserts
-rg -Fa "==================" /var/log/clickhouse-server/stderr.log | rg -v "in query:" >> /test_output/tmp
-rg -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
-rg -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null \
-    && echo -e "Sanitizer assert (in stderr.log)$FAIL$(head_escaped /test_output/tmp)" >> /test_output/test_results.tsv \
-    || echo -e "No sanitizer asserts$OK" >> /test_output/test_results.tsv
-rm -f /test_output/tmp
+tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||:

-# OOM
-rg -Fa " <Fatal> Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \
-    && echo -e "Signal 9 in clickhouse-server.log$FAIL" >> /test_output/test_results.tsv \
-    || echo -e "No OOM messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
-
-# Logical errors
-rg -Fa "Code: 49. DB::Exception: " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/logical_errors.txt \
-    && echo -e "Logical error thrown (see clickhouse-server.log or logical_errors.txt)$FAIL$(head_escaped /test_output/logical_errors.txt)" >> /test_output/test_results.tsv \
-    || echo -e "No logical errors$OK" >> /test_output/test_results.tsv
-
-# Remove file logical_errors.txt if it's empty
-[ -s /test_output/logical_errors.txt ] || rm /test_output/logical_errors.txt
-
-# No such key errors
-rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log > /test_output/no_such_key_errors.txt \
-    && echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(trim_server_logs no_such_key_errors.txt)" >> /test_output/test_results.tsv \
-    || echo -e "No lost s3 keys$OK" >> /test_output/test_results.tsv
-
-# Remove file no_such_key_errors.txt if it's empty
-[ -s /test_output/no_such_key_errors.txt ] || rm /test_output/no_such_key_errors.txt
-
-# Crash
-rg -Fa "########################################" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null \
-    && echo -e "Killed by signal (in clickhouse-server.log)$FAIL" >> /test_output/test_results.tsv \
-    || echo -e "Not crashed$OK" >> /test_output/test_results.tsv
-
-# It also checks for crash without stacktrace (printed by watchdog)
-rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/fatal_messages.txt \
-    && echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(trim_server_logs fatal_messages.txt)" >> /test_output/test_results.tsv \
-    || echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
-
-# Remove file fatal_messages.txt if it's empty
-[ -s /test_output/fatal_messages.txt ] || rm /test_output/fatal_messages.txt
-
-rg -Fa "########################################" /test_output/* > /dev/null \
-    && echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv
-
-function get_gdb_log_context()
-{
-    rg -A50 -Fa " received signal " /test_output/gdb.log | head_escaped
-}
-
-rg -Fa " received signal " /test_output/gdb.log > /dev/null \
-    && echo -e "Found signal in gdb.log$FAIL$(get_gdb_log_context)" >> /test_output/test_results.tsv
-
-for table in query_log trace_log
-do
-    clickhouse-local --path /var/lib/clickhouse/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
-done
-
-dmesg -T > /test_output/dmesg.log
-
-# OOM in dmesg -- those are real
-grep -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' /test_output/dmesg.log \
-    && echo -e "OOM in dmesg$FAIL$(head_escaped /test_output/dmesg.log)" >> /test_output/test_results.tsv \
-    || echo -e "No OOM in dmesg$OK" >> /test_output/test_results.tsv
+collect_query_and_trace_logs

 mv /var/log/clickhouse-server/stderr.log /test_output/

@ -514,8 +231,4 @@ rowNumberInAllBlocks()
 LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv || echo "failure\tCannot parse test_results.tsv" > /test_output/check_status.tsv
 [ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv

-# Core dumps
-find . -type f -maxdepth 1 -name 'core.*' | while read core; do
-    zstd --threads=0 $core
-    mv $core.zst /test_output/
-done
+collect_core_dumps
--- a/docker/test/upgrade/run.sh
+++ b/docker/test/upgrade/run.sh
@ -3,267 +3,22 @@
 # shellcheck disable=SC2086
 # shellcheck disable=SC2024

-# This script is similar to script for common stress test
-
 # Avoid overlaps with previous runs
 dmesg --clear

 set -x

-# core.COMM.PID-TID
-sysctl kernel.core_pattern='core.%e.%p-%P'
-
-OK="\tOK\t\\N\t"
-FAIL="\tFAIL\t\\N\t"
-
-FAILURE_CONTEXT_LINES=50
-FAILURE_CONTEXT_MAX_LINE_WIDTH=400
-
-function escaped()
-{
-    # That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language.
-    # Also limit lines width just in case (too long lines are not really useful usually)
-    clickhouse local -S 's String' --input-format=LineAsString -q "select substr(s, 1, $FAILURE_CONTEXT_MAX_LINE_WIDTH)
-      from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'"
-}
-function head_escaped()
-{
-    head -n $FAILURE_CONTEXT_LINES $1 | escaped
-}
-function unts()
-{
-    grep -Po "[0-9][0-9]:[0-9][0-9] \K.*"
-}
-function trim_server_logs()
-{
-    head -n $FAILURE_CONTEXT_LINES "/test_output/$1" | grep -Eo " \[ [0-9]+ \] \{.*" | escaped
-}
-
-function install_packages()
-{
-    dpkg -i $1/clickhouse-common-static_*.deb
-    dpkg -i $1/clickhouse-common-static-dbg_*.deb
-    dpkg -i $1/clickhouse-server_*.deb
-    dpkg -i $1/clickhouse-client_*.deb
-}
-
-function configure()
-{
-    # install test configs
-    export USE_DATABASE_ORDINARY=1
-    export EXPORT_S3_STORAGE_POLICIES=1
-    /usr/share/clickhouse-test/config/install.sh
-
-    # avoid too slow startup
-    sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \
-      | sed "s|<snapshot_distance>100000</snapshot_distance>|<snapshot_distance>10000</snapshot_distance>|" \
-      > /etc/clickhouse-server/config.d/keeper_port.xml.tmp
-    sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
-    sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
-    sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
-
-    # for clickhouse-server (via service)
-    echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
-    # for clickhouse-client
-    export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000'
-
-    # since we run clickhouse from root
-    sudo chown root: /var/lib/clickhouse
-
-    # Set more frequent update period of asynchronous metrics to more frequently update information about real memory usage (less chance of OOM).
-    echo "<clickhouse><asynchronous_metrics_update_period_s>1</asynchronous_metrics_update_period_s></clickhouse>" \
-        > /etc/clickhouse-server/config.d/asynchronous_metrics_update_period_s.xml
-
-
-    local total_mem
-    total_mem=$(awk '/MemTotal/ { print $(NF-1) }' /proc/meminfo) # KiB
-    total_mem=$(( total_mem*1024 )) # bytes
-
-    # Set maximum memory usage as half of total memory (less chance of OOM).
-    #
-    # But not via max_server_memory_usage but via max_memory_usage_for_user,
-    # so that we can override this setting and execute service queries, like:
-    # - hung check
-    # - show/drop database
-    # - ...
-    #
-    # So max_memory_usage_for_user will be a soft limit, and
-    # max_server_memory_usage will be hard limit, and queries that should be
-    # executed regardless memory limits will use max_memory_usage_for_user=0,
-    # instead of relying on max_untracked_memory
-
-    max_server_memory_usage_to_ram_ratio=0.5
-    echo "Setting max_server_memory_usage_to_ram_ratio to ${max_server_memory_usage_to_ram_ratio}"
-    cat > /etc/clickhouse-server/config.d/max_server_memory_usage.xml <<EOL
-<clickhouse>
-    <max_server_memory_usage_to_ram_ratio>${max_server_memory_usage_to_ram_ratio}</max_server_memory_usage_to_ram_ratio>
-</clickhouse>
-EOL
-
-    local max_users_mem
-    max_users_mem=$((total_mem*30/100)) # 30%
-    echo "Setting max_memory_usage_for_user=$max_users_mem and max_memory_usage for queries to 10G"
-    cat > /etc/clickhouse-server/users.d/max_memory_usage_for_user.xml <<EOL
-<clickhouse>
-    <profiles>
-        <default>
-            <max_memory_usage>10G</max_memory_usage>
-            <max_memory_usage_for_user>${max_users_mem}</max_memory_usage_for_user>
-        </default>
-    </profiles>
-</clickhouse>
-EOL
-
-    cat > /etc/clickhouse-server/config.d/core.xml <<EOL
-<clickhouse>
-    <core_dump>
-        <!-- 100GiB -->
-        <size_limit>107374182400</size_limit>
-    </core_dump>
-    <!-- NOTE: no need to configure core_path,
-         since clickhouse is not started as daemon (via clickhouse start)
-    -->
-    <core_path>$PWD</core_path>
-</clickhouse>
-EOL
-
-    # Let OOM killer terminate other processes before clickhouse-server:
-    cat > /etc/clickhouse-server/config.d/oom_score.xml <<EOL
-<clickhouse>
-    <oom_score>-1000</oom_score>
-</clickhouse>
-EOL
-
-    # Analyzer is not yet ready for testing
-    cat > /etc/clickhouse-server/users.d/no_analyzer.xml <<EOL
-<clickhouse>
-    <profiles>
-        <default>
-            <constraints>
-                <allow_experimental_analyzer>
-                    <readonly/>
-                </allow_experimental_analyzer>
-            </constraints>
-        </default>
-    </profiles>
-</clickhouse>
-EOL
-
-}
-
-function stop()
-{
-    local max_tries="${1:-90}"
-    local pid
-    # Preserve the pid, since the server can hung after the PID will be deleted.
-    pid="$(cat /var/run/clickhouse-server/clickhouse-server.pid)"
-
-    clickhouse stop --max-tries "$max_tries" --do-not-kill && return
-
-    # We failed to stop the server with SIGTERM. Maybe it hang, let's collect stacktraces.
-    echo -e "Possible deadlock on shutdown (see gdb.log)$FAIL" >> /test_output/test_results.tsv
-    kill -TERM "$(pidof gdb)" ||:
-    sleep 5
-    echo "thread apply all backtrace (on stop)" >> /test_output/gdb.log
-    timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$pid" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
-    clickhouse stop --force
-}
-
-function start()
-{
-    counter=0
-    until clickhouse-client --query "SELECT 1"
-    do
-        if [ "$counter" -gt ${1:-120} ]
-        then
-            echo "Cannot start clickhouse-server"
-            rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||:
-            echo -e "Cannot start clickhouse-server$FAIL$(trim_server_logs application_errors.txt)" >> /test_output/test_results.tsv
-            cat /var/log/clickhouse-server/stdout.log
-            tail -n100 /var/log/clickhouse-server/stderr.log
-            tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | rg -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' | tail -n100
-            break
-        fi
-        # use root to match with current uid
-        clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>>/var/log/clickhouse-server/stderr.log
-        sleep 0.5
-        counter=$((counter + 1))
-    done
-
-    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
-    # and clickhouse-server can do fork-exec, for example, to run some bridge.
-    # Do not set nostop noprint for all signals, because some it may cause gdb to hang,
-    # explicitly ignore non-fatal signals that are used by server.
-    # Number of SIGRTMIN can be determined only in runtime.
-    RTMIN=$(kill -l SIGRTMIN)
-    echo "
-set follow-fork-mode parent
-handle SIGHUP nostop noprint pass
-handle SIGINT nostop noprint pass
-handle SIGQUIT nostop noprint pass
-handle SIGPIPE nostop noprint pass
-handle SIGTERM nostop noprint pass
-handle SIGUSR1 nostop noprint pass
-handle SIGUSR2 nostop noprint pass
-handle SIG$RTMIN nostop noprint pass
-info signals
-continue
-backtrace full
-thread apply all backtrace full
-info registers
-disassemble /s
-up
-disassemble /s
-up
-disassemble /s
-p \"done\"
-detach
-quit
-" > script.gdb
-
-    # FIXME Hung check may work incorrectly because of attached gdb
-    # 1. False positives are possible
-    # 2. We cannot attach another gdb to get stacktraces if some queries hung
-    gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
-    sleep 5
-    # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
-    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
-}
-
-# Thread Fuzzer allows to check more permutations of possible thread scheduling
-# and find more potential issues.
-# Temporarily disable ThreadFuzzer with tsan because of https://github.com/google/sanitizers/issues/1540
-is_tsan_build=$(clickhouse local -q "select value like '% -fsanitize=thread %' from system.build_options where name='CXX_FLAGS'")
-if [ "$is_tsan_build" -eq "0" ]; then
-    export THREAD_FUZZER_CPU_TIME_PERIOD_US=1000
-    export THREAD_FUZZER_SLEEP_PROBABILITY=0.1
-    export THREAD_FUZZER_SLEEP_TIME_US=100000
-
-    export THREAD_FUZZER_pthread_mutex_lock_BEFORE_MIGRATE_PROBABILITY=1
-    export THREAD_FUZZER_pthread_mutex_lock_AFTER_MIGRATE_PROBABILITY=1
-    export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_MIGRATE_PROBABILITY=1
-    export THREAD_FUZZER_pthread_mutex_unlock_AFTER_MIGRATE_PROBABILITY=1
-
-    export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_PROBABILITY=0.001
-    export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_PROBABILITY=0.001
-    export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_PROBABILITY=0.001
-    export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_PROBABILITY=0.001
-    export THREAD_FUZZER_pthread_mutex_lock_BEFORE_SLEEP_TIME_US=10000
-
-    export THREAD_FUZZER_pthread_mutex_lock_AFTER_SLEEP_TIME_US=10000
-    export THREAD_FUZZER_pthread_mutex_unlock_BEFORE_SLEEP_TIME_US=10000
-    export THREAD_FUZZER_pthread_mutex_unlock_AFTER_SLEEP_TIME_US=10000
-fi
-
-azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log &
-./setup_minio.sh stateless # to have a proper environment
-
 # we mount tests folder from repo to /usr/share
 ln -s /usr/share/clickhouse-test/ci/stress.py /usr/bin/stress
 ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
 ln -s /usr/share/clickhouse-test/ci/download_release_packages.py /usr/bin/download_release_packages
 ln -s /usr/share/clickhouse-test/ci/get_previous_release_tag.py /usr/bin/get_previous_release_tag

+source /usr/share/clickhouse-test/ci/stress_tests.lib
+
+azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log &
+./setup_minio.sh stateless # to have a proper environment
+
 echo "Get previous release tag"
 previous_release_tag=$(dpkg --info package_folder/clickhouse-client*.deb | grep "Version: " | awk '{print $2}' | cut -f1 -d'+' | get_previous_release_tag)
 echo $previous_release_tag
@ -274,224 +29,161 @@ git clone https://github.com/ClickHouse/ClickHouse.git --no-tags --progress --br
 echo "Download clickhouse-server from the previous release"
 mkdir previous_release_package_folder

-echo $previous_release_tag | download_release_packages && echo -e 'Download script exit code\tOK' >> /test_output/test_results.tsv \
-    || echo -e 'Download script failed\tFAIL' >> /test_output/test_results.tsv
+echo $previous_release_tag | download_release_packages && echo -e "Download script exit code$OK" >> /test_output/test_results.tsv \
+    || echo -e "Download script failed$FAIL" >> /test_output/test_results.tsv

 # Check if we cloned previous release repository successfully
 if ! [ "$(ls -A previous_release_repository/tests/queries)" ]
 then
-    echo -e "Failed to clone previous release tests\tFAIL" >> /test_output/test_results.tsv
+    echo -e 'failure\tFailed to clone previous release tests' > /test_output/check_status.tsv
+    exit
 elif ! [ "$(ls -A previous_release_package_folder/clickhouse-common-static_*.deb && ls -A previous_release_package_folder/clickhouse-server_*.deb)" ]
 then
-    echo -e "Failed to download previous release packages\tFAIL" >> /test_output/test_results.tsv
-else
-    echo -e "Successfully cloned previous release tests\tOK" >> /test_output/test_results.tsv
-    echo -e "Successfully downloaded previous release packages\tOK" >> /test_output/test_results.tsv
-
-    # Make upgrade check more funny by forcing Ordinary engine for system database
-    mkdir /var/lib/clickhouse/metadata
-    echo "ATTACH DATABASE system ENGINE=Ordinary" > /var/lib/clickhouse/metadata/system.sql
-
-    # Install previous release packages
-    install_packages previous_release_package_folder
-
-    # Start server from previous release
-    # Let's enable S3 storage by default
-    export USE_S3_STORAGE_FOR_MERGE_TREE=1
-    # Previous version may not be ready for fault injections
-    export ZOOKEEPER_FAULT_INJECTION=0
-    configure
-
-    # But we still need default disk because some tables loaded only into it
-    sudo cat /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml \
-      | sed "s|<main><disk>s3</disk></main>|<main><disk>s3</disk></main><default><disk>default</disk></default>|" \
-      > /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp    mv /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
-    sudo chown clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
-    sudo chgrp clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
-
-    # Avoid "Setting s3_check_objects_after_upload is neither a builtin setting..."
-    rm -f /etc/clickhouse-server/users.d/enable_blobs_check.xml ||:
-    rm -f /etc/clickhouse-server/users.d/marks.xml ||:
-
-    # Remove s3 related configs to avoid "there is no disk type `cache`"
-    rm -f /etc/clickhouse-server/config.d/storage_conf.xml ||:
-    rm -f /etc/clickhouse-server/config.d/azure_storage_conf.xml ||:
-
-    # Turn on after 22.12
-    rm -f /etc/clickhouse-server/config.d/compressed_marks_and_index.xml ||:
-    # it uses recently introduced settings which previous versions may not have
-    rm -f /etc/clickhouse-server/users.d/insert_keeper_retries.xml ||:
-
-    start
-
-    clickhouse-client --query="SELECT 'Server version: ', version()"
-    
-    mkdir tmp_stress_output
-
-    stress --test-cmd="/usr/bin/clickhouse-test --queries=\"previous_release_repository/tests/queries\""  --upgrade-check --output-folder tmp_stress_output --global-time-limit=1200 \
-        && echo -e "Test script exit code$OK" >> /test_output/test_results.tsv \
-        || echo -e "Test script failed$FAIL script exit code: $?" >> /test_output/test_results.tsv
-
-    rm -rf tmp_stress_output
-
-    # We experienced deadlocks in this command in very rare cases. Let's debug it:
-    timeout 10m clickhouse-client --query="SELECT 'Tables count:', count() FROM system.tables" ||
-    (
-        echo "thread apply all backtrace (on select tables count)" >> /test_output/gdb.log
-        timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
-        clickhouse stop --force
-    )
-
-    # Use bigger timeout for previous version
-    stop 300
-    mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.stress.log
-
-    # Install and start new server
-    install_packages package_folder
-    # Disable fault injections on start (we don't test them here, and it can lead to tons of requests in case of huge number of tables).
-    export ZOOKEEPER_FAULT_INJECTION=0
-    configure
-    start 500
-    clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
-        || (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \
-        && echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \
-        >> /test_output/test_results.tsv)
-
-    # Remove file application_errors.txt if it's empty
-    [ -s /test_output/application_errors.txt ] || rm /test_output/application_errors.txt
-
-    clickhouse-client --query="SELECT 'Server version: ', version()"
-
-    # Let the server run for a while before checking log.
-    sleep 60
-
-    stop
-    mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.upgrade.log
-
-    # Error messages (we should ignore some errors)
-    # FIXME https://github.com/ClickHouse/ClickHouse/issues/38643 ("Unknown index: idx.")
-    # FIXME https://github.com/ClickHouse/ClickHouse/issues/39174 ("Cannot parse string 'Hello' as UInt64")
-    # FIXME Not sure if it's expected, but some tests from stress test may not be finished yet when we restarting server.
-    #       Let's just ignore all errors from queries ("} <Error> TCPHandler: Code:", "} <Error> executeQuery: Code:")
-    # FIXME https://github.com/ClickHouse/ClickHouse/issues/39197 ("Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'")
-    # NOTE  Incompatibility was introduced in https://github.com/ClickHouse/ClickHouse/pull/39263, it's expected
-    #       ("This engine is deprecated and is not supported in transactions", "[Queue = DB::MergeMutateRuntimeQueue]: Code: 235. DB::Exception: Part")
-    # FIXME https://github.com/ClickHouse/ClickHouse/issues/39174 - bad mutation does not indicate backward incompatibility
-    echo "Check for Error messages in server log:"
-    rg -Fav -e "Code: 236. DB::Exception: Cancelled merging parts" \
-               -e "Code: 236. DB::Exception: Cancelled mutating parts" \
-               -e "REPLICA_IS_ALREADY_ACTIVE" \
-               -e "REPLICA_ALREADY_EXISTS" \
-               -e "ALL_REPLICAS_LOST" \
-               -e "DDLWorker: Cannot parse DDL task query" \
-               -e "RaftInstance: failed to accept a rpc connection due to error 125" \
-               -e "UNKNOWN_DATABASE" \
-               -e "NETWORK_ERROR" \
-               -e "UNKNOWN_TABLE" \
-               -e "ZooKeeperClient" \
-               -e "KEEPER_EXCEPTION" \
-               -e "DirectoryMonitor" \
-               -e "TABLE_IS_READ_ONLY" \
-               -e "Code: 1000, e.code() = 111, Connection refused" \
-               -e "UNFINISHED" \
-               -e "NETLINK_ERROR" \
-               -e "Renaming unexpected part" \
-               -e "PART_IS_TEMPORARILY_LOCKED" \
-               -e "and a merge is impossible: we didn't find" \
-               -e "found in queue and some source parts for it was lost" \
-               -e "is lost forever." \
-               -e "Unknown index: idx." \
-               -e "Cannot parse string 'Hello' as UInt64" \
-               -e "} <Error> TCPHandler: Code:" \
-               -e "} <Error> executeQuery: Code:" \
-               -e "Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'" \
-               -e "This engine is deprecated and is not supported in transactions" \
-               -e "[Queue = DB::MergeMutateRuntimeQueue]: Code: 235. DB::Exception: Part" \
-               -e "The set of parts restored in place of" \
-               -e "(ReplicatedMergeTreeAttachThread): Initialization failed. Error" \
-               -e "Code: 269. DB::Exception: Destination table is myself" \
-               -e "Coordination::Exception: Connection loss" \
-               -e "MutateFromLogEntryTask" \
-               -e "No connection to ZooKeeper, cannot get shared table ID" \
-               -e "Session expired" \
-               -e "TOO_MANY_PARTS" \
-               -e "Authentication failed" \
-               -e "Container already exists" \
-        /var/log/clickhouse-server/clickhouse-server.upgrade.log | zgrep -Fa "<Error>" > /test_output/upgrade_error_messages.txt \
-        && echo -e "Error message in clickhouse-server.log (see upgrade_error_messages.txt)$FAIL$(head_escaped /test_output/bc_check_error_messages.txt)" \
-            >> /test_output/test_results.tsv \
-        || echo -e "No Error messages after server upgrade$OK" >> /test_output/test_results.tsv
-
-    # Remove file bc_check_error_messages.txt if it's empty
-    [ -s /test_output/upgrade_error_messages.txt ] || rm /test_output/upgrade_error_messages.txt
-
-    # Sanitizer asserts
-    rg -Fa "==================" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
-    rg -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
-    rg -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null \
-        && echo -e "Sanitizer assert (in stderr.log)$FAIL$(head_escaped /test_output/tmp)" >> /test_output/test_results.tsv \
-        || echo -e "No sanitizer asserts$OK" >> /test_output/test_results.tsv
-    rm -f /test_output/tmp
-
-    # OOM
-    rg -Fa " <Fatal> Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server.*.log > /dev/null \
-        && echo -e "Signal 9 in clickhouse-server.log$FAIL" >> /test_output/test_results.tsv \
-        || echo -e "No OOM messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
-
-    # Logical errors
-    echo "Check for Logical errors in server log:"
-    rg -Fa -A20 "Code: 49, e.displayText() = DB::Exception:" /var/log/clickhouse-server/clickhouse-server.*.log > /test_output/logical_errors.txt \
-        && echo -e "Logical error thrown (see clickhouse-server.log or logical_errors.txt)$FAIL$(head_escaped /test_output/logical_errors.txt)" >> /test_output/test_results.tsv \
-        || echo -e "No logical errors$OK" >> /test_output/test_results.tsv
-
-    # Remove file logical_errors.txt if it's empty
-    [ -s /test_output/logical_errors.txt ] || rm /test_output/logical_errors.txt
-
-    # Crash
-    rg -Fa "########################################" /var/log/clickhouse-server/clickhouse-server.*.log > /dev/null \
-        && echo -e "Killed by signal (in clickhouse-server.log)$FAIL" >> /test_output/test_results.tsv \
-        || echo -e "Not crashed$OK" >> /test_output/test_results.tsv
-
-    # It also checks for crash without stacktrace (printed by watchdog)
-    echo "Check for Fatal message in server log:"
-    rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server.*.log > /test_output/fatal_messages.txt \
-        && echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(trim_server_logs fatal_messages.txt)" >> /test_output/test_results.tsv \
-        || echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
-
-    # Remove file fatal_messages.txt if it's empty
-    [ -s /test_output/fatal_messages.txt ] || rm /test_output/fatal_messages.txt
-
-    rg -Fa "########################################" /test_output/* > /dev/null \
-        && echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv
-
-    tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||:
-    for table in query_log trace_log
-    do
-        clickhouse-local --path /var/lib/clickhouse/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" \
-          | zstd --threads=0 > /test_output/$table.tsv.zst ||:
-    done
+    echo -e 'failure\tFailed to download previous release packages' > /test_output/check_status.tsv
+    exit
 fi

-dmesg -T > /test_output/dmesg.log
+echo -e "Successfully cloned previous release tests$OK" >> /test_output/test_results.tsv
+echo -e "Successfully downloaded previous release packages$OK" >> /test_output/test_results.tsv

-# OOM in dmesg -- those are real
-grep -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' /test_output/dmesg.log \
-    && echo -e "OOM in dmesg$FAIL$(head_escaped /test_output/dmesg.log)" >> /test_output/test_results.tsv \
-    || echo -e "No OOM in dmesg$OK" >> /test_output/test_results.tsv
+# Make upgrade check more funny by forcing Ordinary engine for system database
+mkdir /var/lib/clickhouse/metadata
+echo "ATTACH DATABASE system ENGINE=Ordinary" > /var/lib/clickhouse/metadata/system.sql
+
+# Install previous release packages
+install_packages previous_release_package_folder
+
+# Start server from previous release
+# Let's enable S3 storage by default
+export USE_S3_STORAGE_FOR_MERGE_TREE=1
+# Previous version may not be ready for fault injections
+export ZOOKEEPER_FAULT_INJECTION=0
+configure
+
+# But we still need default disk because some tables loaded only into it.
+# The rewritten config is written to a temporary file and moved over the original by a separate command
+# (if "mv ..." shares the line with the redirection, its words become extra file arguments of sed
+# and the original config is never replaced).
+sudo cat /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml \
+  | sed "s|<main><disk>s3</disk></main>|<main><disk>s3</disk></main><default><disk>default</disk></default>|" \
+  > /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp
+sudo mv /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml.tmp /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
+sudo chown clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
+sudo chgrp clickhouse /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml
+
+start
+
+clickhouse-client --query="SELECT 'Server version: ', version()"
+
+mkdir tmp_stress_output
+
+stress --test-cmd="/usr/bin/clickhouse-test --queries=\"previous_release_repository/tests/queries\""  --upgrade-check --output-folder tmp_stress_output --global-time-limit=1200 \
+    && echo -e "Test script exit code$OK" >> /test_output/test_results.tsv \
+    || echo -e "Test script failed$FAIL script exit code: $?" >> /test_output/test_results.tsv
+
+rm -rf tmp_stress_output
+
+# We experienced deadlocks in this command in very rare cases. Let's debug it:
+timeout 10m clickhouse-client --query="SELECT 'Tables count:', count() FROM system.tables" ||
+(
+    echo "thread apply all backtrace (on select tables count)" >> /test_output/gdb.log
+    timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
+    clickhouse stop --force
+)
+
+# Use bigger timeout for previous version and disable additional hang check
+stop 300 false
+mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.stress.log
+
+# Install and start new server
+install_packages package_folder
+# Disable fault injections on start (we don't test them here, and it can lead to tons of requests in case of huge number of tables).
+export ZOOKEEPER_FAULT_INJECTION=0
+configure
+start 500
+# Record whether the upgraded server answers queries; on failure the Application errors from the
+# server log are saved to application_errors.txt (helper defined in stress_tests.lib, sourced above).
+check_server_start
+
+clickhouse-client --query="SELECT 'Server version: ', version()"
+
+# Let the server run for a while before checking log.
+sleep 60
+
+stop
+mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.upgrade.log
+
+# Error messages (we should ignore some errors)
+# FIXME https://github.com/ClickHouse/ClickHouse/issues/38643 ("Unknown index: idx.")
+# FIXME https://github.com/ClickHouse/ClickHouse/issues/39174 ("Cannot parse string 'Hello' as UInt64")
+# FIXME Not sure if it's expected, but some tests from stress test may not be finished yet when we restarting server.
+#       Let's just ignore all errors from queries ("} <Error> TCPHandler: Code:", "} <Error> executeQuery: Code:")
+# FIXME https://github.com/ClickHouse/ClickHouse/issues/39197 ("Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'")
+# NOTE  Incompatibility was introduced in https://github.com/ClickHouse/ClickHouse/pull/39263, it's expected
+#       ("This engine is deprecated and is not supported in transactions", "[Queue = DB::MergeMutateRuntimeQueue]: Code: 235. DB::Exception: Part")
+# FIXME https://github.com/ClickHouse/ClickHouse/issues/39174 - bad mutation does not indicate backward incompatibility
+echo "Check for Error messages in server log:"
+# Lines of the upgrade log that are not on the ignore list below and still contain "<Error>" are saved to
+# upgrade_error_messages.txt; the head of that same file is attached to the FAIL row as context.
+rg -Fav -e "Code: 236. DB::Exception: Cancelled merging parts" \
+           -e "Code: 236. DB::Exception: Cancelled mutating parts" \
+           -e "REPLICA_IS_ALREADY_ACTIVE" \
+           -e "REPLICA_ALREADY_EXISTS" \
+           -e "ALL_REPLICAS_LOST" \
+           -e "DDLWorker: Cannot parse DDL task query" \
+           -e "RaftInstance: failed to accept a rpc connection due to error 125" \
+           -e "UNKNOWN_DATABASE" \
+           -e "NETWORK_ERROR" \
+           -e "UNKNOWN_TABLE" \
+           -e "ZooKeeperClient" \
+           -e "KEEPER_EXCEPTION" \
+           -e "DirectoryMonitor" \
+           -e "TABLE_IS_READ_ONLY" \
+           -e "Code: 1000, e.code() = 111, Connection refused" \
+           -e "UNFINISHED" \
+           -e "NETLINK_ERROR" \
+           -e "Renaming unexpected part" \
+           -e "PART_IS_TEMPORARILY_LOCKED" \
+           -e "and a merge is impossible: we didn't find" \
+           -e "found in queue and some source parts for it was lost" \
+           -e "is lost forever." \
+           -e "Unknown index: idx." \
+           -e "Cannot parse string 'Hello' as UInt64" \
+           -e "} <Error> TCPHandler: Code:" \
+           -e "} <Error> executeQuery: Code:" \
+           -e "Missing columns: 'v3' while processing query: 'v3, k, v1, v2, p'" \
+           -e "This engine is deprecated and is not supported in transactions" \
+           -e "[Queue = DB::MergeMutateRuntimeQueue]: Code: 235. DB::Exception: Part" \
+           -e "The set of parts restored in place of" \
+           -e "(ReplicatedMergeTreeAttachThread): Initialization failed. Error" \
+           -e "Code: 269. DB::Exception: Destination table is myself" \
+           -e "Coordination::Exception: Connection loss" \
+           -e "MutateFromLogEntryTask" \
+           -e "No connection to ZooKeeper, cannot get shared table ID" \
+           -e "Session expired" \
+           -e "TOO_MANY_PARTS" \
+           -e "Authentication failed" \
+           -e "Container already exists" \
+    /var/log/clickhouse-server/clickhouse-server.upgrade.log | zgrep -Fa "<Error>" > /test_output/upgrade_error_messages.txt \
+    && echo -e "Error message in clickhouse-server.log (see upgrade_error_messages.txt)$FAIL$(head_escaped /test_output/upgrade_error_messages.txt)" \
+        >> /test_output/test_results.tsv \
+    || echo -e "No Error messages after server upgrade$OK" >> /test_output/test_results.tsv
+
+# Remove file upgrade_error_messages.txt if it's empty
+[ -s /test_output/upgrade_error_messages.txt ] || rm /test_output/upgrade_error_messages.txt
+
+# Grep logs for sanitizer asserts, crashes and other critical errors
+check_logs_for_critical_errors
+
+tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||:
+
+collect_query_and_trace_logs
+
+check_oom_in_dmesg

 mv /var/log/clickhouse-server/stderr.log /test_output/

-# If we failed to clone repo or download previous release packages,
-# we don't have any packages installed, but we need clickhouse-local
-# to be installed to create check_status.tsv.
-if ! command -v clickhouse-local &> /dev/null
-then
-    install_packages package_folder
-fi
-
 # Write check result into check_status.tsv
 # Try to choose most specific error for the whole check status
-clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' order by
+clickhouse-local --structure "test String, res String, time Nullable(Float32), desc String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' order by
 (test like '%Sanitizer%') DESC,
 (test like '%Killed by signal%') DESC,
 (test like '%gdb.log%') DESC,
@ -504,7 +196,7 @@ clickhouse-local --structure "test String, res String" -q "SELECT 'failure', tes
 (test like '%Error message%') DESC,
 (test like '%previous release%') DESC,
 rowNumberInAllBlocks()
-LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv
+LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv || echo -e "failure\tCannot parse test_results.tsv" > /test_output/check_status.tsv
+# NOTE: "echo -e" above is required: check_status.tsv is tab-separated, so \t must be expanded into a real tab.
 [ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv

 # Core dumps
--- a/tests/ci/stress_tests.lib
+++ b/tests/ci/stress_tests.lib
@ -0,0 +1,305 @@
+#!/bin/bash
+
+# core.COMM.PID-TID
+# NOTE: this is a top-level statement, so the kernel core pattern is changed as soon as this file is sourced.
+sysctl kernel.core_pattern='core.%e.%p-%P'
+
+# Suffixes appended to a test name to build a row of test_results.tsv.
+# They are written with "echo -e", which turns them into: <TAB>status<TAB>\N<TAB>
+# (an optional description may follow the last tab).
+OK="\tOK\t\\N\t"
+FAIL="\tFAIL\t\\N\t"
+
+# Number of leading lines of a file that head_escaped/trim_server_logs pass on as failure context
+FAILURE_CONTEXT_LINES=50
+# Length passed to substr() in escaped(): every line of failure context is cut to this width
+FAILURE_CONTEXT_MAX_LINE_WIDTH=400
+
+# Filter: reads lines from stdin via clickhouse local and prints each of them cut to
+# FAILURE_CONTEXT_MAX_LINE_WIDTH, using a custom row delimiter instead of a plain line break.
+# NOTE(review): presumably the delimiter is chosen so that the output can be embedded into a single
+# test_results.tsv field that is later written with "echo -e" -- confirm before changing the backslashes.
+function escaped()
+{
+    # That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language.
+    # Also limit lines width just in case (too long lines are not really useful usually)
+    clickhouse local -S 's String' --input-format=LineAsString -q "select substr(s, 1, $FAILURE_CONTEXT_MAX_LINE_WIDTH)
+      from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'"
+}
+# Print the first FAILURE_CONTEXT_LINES lines of a file, passed through escaped().
+# Arguments:
+#   $1 - path of the file to read
+function head_escaped()
+{
+    # Quoted so that the path is passed to head as a single word (no word splitting or globbing)
+    head -n "$FAILURE_CONTEXT_LINES" "$1" | escaped
+}
+# Filter: for every stdin line containing "DD:DD " (two digits, colon, two digits, space) prints only
+# the text that follows the first such match; lines without a match are dropped.
+# NOTE(review): presumably used to strip a leading timestamp -- confirm against callers.
+function unts()
+{
+    grep -Po "[0-9][0-9]:[0-9][0-9] \K.*"
+}
+# Print failure context taken from a file in /test_output.
+# Arguments:
+#   $1 - file name relative to /test_output
+# Takes the first FAILURE_CONTEXT_LINES lines, keeps only the part of each line starting at
+# " [ <digits> ] {" (lines without that pattern are dropped) and passes the result through escaped().
+function trim_server_logs()
+{
+    head -n $FAILURE_CONTEXT_LINES "/test_output/$1" | grep -Eo " \[ [0-9]+ \] \{.*" | escaped
+}
+
+# Install the ClickHouse .deb packages found in a directory with dpkg.
+# Arguments:
+#   $1 - directory containing clickhouse-common-static, -common-static-dbg, -server and -client packages
+function install_packages()
+{
+    # The directory is quoted (no word splitting); the glob part stays outside the quotes so it still expands
+    dpkg -i "$1"/clickhouse-common-static_*.deb
+    dpkg -i "$1"/clickhouse-common-static-dbg_*.deb
+    dpkg -i "$1"/clickhouse-server_*.deb
+    dpkg -i "$1"/clickhouse-client_*.deb
+}
+
+# Install the test configs and adjust the server configuration for the stress run.
+# Takes no arguments. Writes files under /etc/clickhouse-server, appends to /etc/environment,
+# changes the owner of /var/lib/clickhouse and exports several environment variables.
+# NOTE(review): max_server_memory_usage_to_ram_ratio is not declared local, so it stays set after the call.
+function configure()
+{
+    # install test configs
+    export USE_DATABASE_ORDINARY=1
+    export EXPORT_S3_STORAGE_POLICIES=1
+    /usr/share/clickhouse-test/config/install.sh
+
+    # avoid too slow startup
+    # (the edited config goes to a temporary file first and is then moved over the original)
+    sudo cat /etc/clickhouse-server/config.d/keeper_port.xml \
+      | sed "s|<snapshot_distance>100000</snapshot_distance>|<snapshot_distance>10000</snapshot_distance>|" \
+      > /etc/clickhouse-server/config.d/keeper_port.xml.tmp
+    sudo mv /etc/clickhouse-server/config.d/keeper_port.xml.tmp /etc/clickhouse-server/config.d/keeper_port.xml
+    sudo chown clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
+    sudo chgrp clickhouse /etc/clickhouse-server/config.d/keeper_port.xml
+
+    # for clickhouse-server (via service)
+    # NOTE: this appends (>>), so every call of configure adds one more line to /etc/environment
+    echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
+    # for clickhouse-client
+    export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000'
+
+    # since we run clickhouse from root
+    sudo chown root: /var/lib/clickhouse
+
+    # Set more frequent update period of asynchronous metrics to more frequently update information about real memory usage (less chance of OOM).
+    echo "<clickhouse><asynchronous_metrics_update_period_s>1</asynchronous_metrics_update_period_s></clickhouse>" \
+        > /etc/clickhouse-server/config.d/asynchronous_metrics_update_period_s.xml
+
+    local total_mem
+    total_mem=$(awk '/MemTotal/ { print $(NF-1) }' /proc/meminfo) # KiB
+    total_mem=$(( total_mem*1024 )) # bytes
+
+    # Set maximum memory usage as half of total memory (less chance of OOM).
+    #
+    # But not via max_server_memory_usage but via max_memory_usage_for_user,
+    # so that we can override this setting and execute service queries, like:
+    # - hung check
+    # - show/drop database
+    # - ...
+    #
+    # So max_memory_usage_for_user will be a soft limit, and
+    # max_server_memory_usage will be hard limit, and queries that should be
+    # executed regardless memory limits will use max_memory_usage_for_user=0,
+    # instead of relying on max_untracked_memory
+
+    max_server_memory_usage_to_ram_ratio=0.5
+    echo "Setting max_server_memory_usage_to_ram_ratio to ${max_server_memory_usage_to_ram_ratio}"
+    cat > /etc/clickhouse-server/config.d/max_server_memory_usage.xml <<EOL
+<clickhouse>
+    <max_server_memory_usage_to_ram_ratio>${max_server_memory_usage_to_ram_ratio}</max_server_memory_usage_to_ram_ratio>
+</clickhouse>
+EOL
+
+    local max_users_mem
+    max_users_mem=$((total_mem*30/100)) # 30%
+    echo "Setting max_memory_usage_for_user=$max_users_mem and max_memory_usage for queries to 10G"
+    cat > /etc/clickhouse-server/users.d/max_memory_usage_for_user.xml <<EOL
+<clickhouse>
+    <profiles>
+        <default>
+            <max_memory_usage>10G</max_memory_usage>
+            <max_memory_usage_for_user>${max_users_mem}</max_memory_usage_for_user>
+        </default>
+    </profiles>
+</clickhouse>
+EOL
+
+    # The heredoc delimiter is unquoted, so $PWD below is expanded when the file is written
+    cat > /etc/clickhouse-server/config.d/core.xml <<EOL
+<clickhouse>
+    <core_dump>
+        <!-- 100GiB -->
+        <size_limit>107374182400</size_limit>
+    </core_dump>
+    <!-- NOTE: no need to configure core_path,
+         since clickhouse is not started as daemon (via clickhouse start)
+    -->
+    <core_path>$PWD</core_path>
+</clickhouse>
+EOL
+
+    # Analyzer is not yet ready for testing
+    cat > /etc/clickhouse-server/users.d/no_analyzer.xml <<EOL
+<clickhouse>
+    <profiles>
+        <default>
+            <constraints>
+                <allow_experimental_analyzer>
+                    <readonly/>
+                </allow_experimental_analyzer>
+            </constraints>
+        </default>
+    </profiles>
+</clickhouse>
+EOL
+
+}
+
+# Stop clickhouse-server gracefully.
+# Arguments:
+#   $1 - value passed to "clickhouse stop --max-tries" (default: 90)
+#   $2 - "true" (default): if the graceful stop fails, treat it as a possible deadlock, i.e. record
+#        a FAIL row in test_results.tsv, dump backtraces of all threads to gdb.log and force-stop
+#        the server; any other value skips that handling
+# Returns right after a successful graceful stop.
+function stop()
+{
+    local max_tries="${1:-90}"
+    local check_hang="${2:-true}"
+    local pid
+    # Preserve the pid, since the server can hang after the PID file is deleted.
+    pid="$(cat /var/run/clickhouse-server/clickhouse-server.pid)"
+
+    clickhouse stop --max-tries "$max_tries" --do-not-kill && return
+
+    # "then" must be separated from the test by a newline or ";", otherwise the whole file fails to parse
+    if [ "$check_hang" == true ]
+    then
+      # We failed to stop the server with SIGTERM. Maybe it hang, let's collect stacktraces.
+      echo -e "Possible deadlock on shutdown (see gdb.log)$FAIL" >> /test_output/test_results.tsv
+      # Terminate the gdb that is already attached (if any), so that a new one can attach
+      kill -TERM "$(pidof gdb)" ||:
+      sleep 5
+      echo "thread apply all backtrace (on stop)" >> /test_output/gdb.log
+      timeout 30m gdb -batch -ex 'thread apply all backtrace' -p "$pid" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log
+      clickhouse stop --force
+    fi
+}
+
+# Start clickhouse-server, wait until it answers "SELECT 1", then attach gdb to it in the background.
+# Arguments:
+#   $1 - limit for the number of start attempts (default: 120); there is a 0.5s sleep after each attempt
+# NOTE(review): "counter" and "RTMIN" are not declared local, so they stay set in the calling shell.
+function start()
+{
+    counter=0
+    until clickhouse-client --query "SELECT 1"
+    do
+        if [ "$counter" -gt ${1:-120} ]
+        then
+            # Give up: record a FAIL row with the Application errors and print the tails of the logs.
+            # NOTE: "break" only leaves the retry loop, the gdb attach below is still executed.
+            echo "Cannot start clickhouse-server"
+            rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||:
+            echo -e "Cannot start clickhouse-server$FAIL$(trim_server_logs application_errors.txt)" >> /test_output/test_results.tsv
+            cat /var/log/clickhouse-server/stdout.log
+            tail -n100 /var/log/clickhouse-server/stderr.log
+            tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | rg -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' | tail -n100
+            break
+        fi
+        # use root to match with current uid
+        clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>>/var/log/clickhouse-server/stderr.log
+        sleep 0.5
+        counter=$((counter + 1))
+    done
+
+    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
+    # and clickhouse-server can do fork-exec, for example, to run some bridge.
+    # Do not set nostop noprint for all signals, because some it may cause gdb to hang,
+    # explicitly ignore non-fatal signals that are used by server.
+    # Number of SIGRTMIN can be determined only in runtime.
+    RTMIN=$(kill -l SIGRTMIN)
+    # The gdb command file is written to script.gdb in the current directory
+    echo "
+set follow-fork-mode parent
+handle SIGHUP nostop noprint pass
+handle SIGINT nostop noprint pass
+handle SIGQUIT nostop noprint pass
+handle SIGPIPE nostop noprint pass
+handle SIGTERM nostop noprint pass
+handle SIGUSR1 nostop noprint pass
+handle SIGUSR2 nostop noprint pass
+handle SIG$RTMIN nostop noprint pass
+info signals
+continue
+backtrace full
+thread apply all backtrace full
+info registers
+disassemble /s
+up
+disassemble /s
+up
+disassemble /s
+p \"done\"
+detach
+quit
+" > script.gdb
+
+    # FIXME Hung check may work incorrectly because of attached gdb
+    # 1. False positives are possible
+    # 2. We cannot attach another gdb to get stacktraces if some queries hung
+    gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
+    sleep 5
+    # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
+    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
+}
+
+# Record in test_results.tsv whether the server answers queries.
+# On success the row is produced by the query itself. If the query fails, lines matching
+# "<Error>.*Application" from the server log are saved to /test_output/application_errors.txt
+# and, only when such lines were found, a FAIL row with that context is appended.
+function check_server_start()
+{
+    clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
+        || (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \
+        && echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \
+        >> /test_output/test_results.tsv)
+
+    # Remove file application_errors.txt if it's empty
+    [ -s /test_output/application_errors.txt ] || rm /test_output/application_errors.txt
+}
+
+# Scan the server logs, the collected output files, gdb.log and dmesg for signs
+# of critical failures and append result rows to /test_output/test_results.tsv.
+#   Globals: OK, FAIL (read; defined elsewhere in this script)
+#   Uses:    head_escaped, trim_server_logs (defined elsewhere in this script)
+#   Outputs: logical_errors.txt, no_such_key_errors.txt and fatal_messages.txt
+#            (each kept only when non-empty) plus dmesg.log, all under /test_output
+# Every check is an explicit if/else rather than `cmd && fail || ok`, so the OK
+# row can never be emitted just because writing the FAIL row failed.
+function check_logs_for_critical_errors()
+{
+    # Sanitizer asserts
+    rg -Fa "==================" /var/log/clickhouse-server/stderr.log | rg -v "in query:" >> /test_output/tmp
+    rg -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp
+    # Fail only if at least one collected line is not matched by the exclusions below
+    if rg -Fav -e "ASan doesn't fully support makecontext/swapcontext functions" -e "DB::Exception" /test_output/tmp > /dev/null
+    then
+        echo -e "Sanitizer assert (in stderr.log)$FAIL$(head_escaped /test_output/tmp)" >> /test_output/test_results.tsv
+    else
+        echo -e "No sanitizer asserts$OK" >> /test_output/test_results.tsv
+    fi
+    rm -f /test_output/tmp
+
+    # OOM
+    if rg -Fa " <Fatal> Application: Child process was terminated by signal 9" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null
+    then
+        echo -e "Signal 9 in clickhouse-server.log$FAIL" >> /test_output/test_results.tsv
+    else
+        echo -e "No OOM messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
+    fi
+
+    # Logical errors
+    if rg -Fa "Code: 49. DB::Exception: " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/logical_errors.txt
+    then
+        echo -e "Logical error thrown (see clickhouse-server.log or logical_errors.txt)$FAIL$(head_escaped /test_output/logical_errors.txt)" >> /test_output/test_results.tsv
+    else
+        echo -e "No logical errors$OK" >> /test_output/test_results.tsv
+    fi
+    # Remove file logical_errors.txt if it's empty
+    [ -s /test_output/logical_errors.txt ] || rm /test_output/logical_errors.txt
+
+    # No such key errors
+    if rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log > /test_output/no_such_key_errors.txt
+    then
+        echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(trim_server_logs no_such_key_errors.txt)" >> /test_output/test_results.tsv
+    else
+        echo -e "No lost s3 keys$OK" >> /test_output/test_results.tsv
+    fi
+
+    # Remove file no_such_key_errors.txt if it's empty
+    [ -s /test_output/no_such_key_errors.txt ] || rm /test_output/no_such_key_errors.txt
+
+    # Crash
+    if rg -Fa "########################################" /var/log/clickhouse-server/clickhouse-server*.log > /dev/null
+    then
+        echo -e "Killed by signal (in clickhouse-server.log)$FAIL" >> /test_output/test_results.tsv
+    else
+        echo -e "Not crashed$OK" >> /test_output/test_results.tsv
+    fi
+
+    # It also checks for crash without stacktrace (printed by watchdog)
+    if rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/fatal_messages.txt
+    then
+        echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(trim_server_logs fatal_messages.txt)" >> /test_output/test_results.tsv
+    else
+        echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
+    fi
+
+    # Remove file fatal_messages.txt if it's empty
+    [ -s /test_output/fatal_messages.txt ] || rm /test_output/fatal_messages.txt
+
+    # Same crash marker, searched in every collected output file; reported only when found
+    if rg -Fa "########################################" /test_output/* > /dev/null
+    then
+        echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv
+    fi
+
+    # Prints the signal line from gdb.log with up to 50 lines after it, passed through head_escaped
+    function get_gdb_log_context()
+    {
+        rg -A50 -Fa " received signal " /test_output/gdb.log | head_escaped
+    }
+
+    # Signals seen by gdb; reported only when found
+    if rg -Fa " received signal " /test_output/gdb.log > /dev/null
+    then
+        echo -e "Found signal in gdb.log$FAIL$(get_gdb_log_context)" >> /test_output/test_results.tsv
+    fi
+
+    dmesg -T > /test_output/dmesg.log
+
+    # OOM in dmesg -- those are real
+    if grep -q -F -e 'Out of memory: Killed process' -e 'oom_reaper: reaped process' -e 'oom-kill:constraint=CONSTRAINT_NONE' /test_output/dmesg.log
+    then
+        echo -e "OOM in dmesg$FAIL$(head_escaped /test_output/dmesg.log)" >> /test_output/test_results.tsv
+    else
+        echo -e "No OOM in dmesg$OK" >> /test_output/test_results.tsv
+    fi
+}
+
+# Export system.query_log and system.trace_log with clickhouse-local, reading the
+# data directory /var/lib/clickhouse/, into zstd-compressed TSVWithNamesAndTypes
+# files /test_output/<table>.tsv.zst.
+# Best-effort: a failed export is ignored (`||:`) and the next table is still tried.
+function collect_query_and_trace_logs()
+{
+    # Keep the loop variable out of the global scope
+    local table
+    for table in query_log trace_log
+    do
+        clickhouse-local --path /var/lib/clickhouse/ --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" \
+            | zstd --threads=0 > "/test_output/$table.tsv.zst" ||:
+    done
+}
+
+# Compress every regular file named core.* in the current directory (no recursion)
+# with zstd and move the resulting .zst archive into /test_output/.
+function collect_core_dumps()
+{
+    # -maxdepth is a global find option and goes before the tests (GNU find warns otherwise).
+    # NUL-delimited names with `IFS= read -r -d ''` keep backslashes and whitespace intact.
+    find . -maxdepth 1 -type f -name 'core.*' -print0 | while IFS= read -r -d '' core; do
+        # Move the archive only if compression succeeded
+        zstd --threads=0 "$core" && mv "$core.zst" /test_output/
+    done
+}