Merge pull request #32511 from ClickHouse/fix_attaching_gdb2

Try to fix attaching gdb 2
2024-11-10 01:25:21 +00:00 · 2021-12-14 21:10:00 +03:00 · 2021-12-14 21:10:00 +03:00 · 2cf5456423
commit 2cf5456423
parent 6b5d09be57 dfa31a6577
6 changed files with 57 additions and 33 deletions
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@ -111,19 +111,6 @@ function start_server
    fi

    echo "ClickHouse server pid '$server_pid' started and responded"
-
-    echo "
-set follow-fork-mode child
-handle all noprint
-handle SIGSEGV stop print
-handle SIGBUS stop print
-handle SIGABRT stop print
-continue
-thread apply all backtrace
-continue
-" > script.gdb
-
-    gdb -batch -command script.gdb -p "$server_pid" &
 }

 function clone_root
--- a/docker/test/fuzzer/run-fuzzer.sh
+++ b/docker/test/fuzzer/run-fuzzer.sh
@ -155,17 +155,34 @@ function fuzz

    kill -0 $server_pid

+    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
+    # and clickhouse-server can do fork-exec, for example, to run some bridge.
+    # Do not set nostop noprint for all signals, because it may cause gdb to hang;
+    # instead, explicitly ignore the non-fatal signals that are used by the server.
+    # The number of SIGRTMIN can be determined only at runtime.
+    RTMIN=$(kill -l SIGRTMIN)
    echo "
-set follow-fork-mode child
-handle all noprint
-handle SIGSEGV stop print
-handle SIGBUS stop print
-continue
-thread apply all backtrace
+set follow-fork-mode parent
+handle SIGHUP nostop noprint pass
+handle SIGINT nostop noprint pass
+handle SIGQUIT nostop noprint pass
+handle SIGPIPE nostop noprint pass
+handle SIGTERM nostop noprint pass
+handle SIGUSR1 nostop noprint pass
+handle SIGUSR2 nostop noprint pass
+handle SIG$RTMIN nostop noprint pass
+info signals
 continue
+backtrace full
+info locals
+detach
+quit
 " > script.gdb

-    sudo gdb -batch -command script.gdb -p $server_pid &
+    gdb -batch -command script.gdb -p $server_pid  &
+    sleep 5
+    # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
+    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:

    # Check connectivity after we attach gdb, because it might cause the server
    # to freeze and the fuzzer will fail.
--- a/docker/test/stress/run.sh
+++ b/docker/test/stress/run.sh
@ -128,14 +128,26 @@ function start()
        counter=$((counter + 1))
    done

+    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
+    # and clickhouse-server can do fork-exec, for example, to run some bridge.
+    # Do not set nostop noprint for all signals, because it may cause gdb to hang;
+    # instead, explicitly ignore the non-fatal signals that are used by the server.
+    # The number of SIGRTMIN can be determined only at runtime.
+    RTMIN=$(kill -l SIGRTMIN)
    echo "
-set follow-fork-mode child
-handle all noprint
-handle SIGSEGV stop print
-handle SIGBUS stop print
-handle SIGABRT stop print
+set follow-fork-mode parent
+handle SIGHUP nostop noprint pass
+handle SIGINT nostop noprint pass
+handle SIGQUIT nostop noprint pass
+handle SIGPIPE nostop noprint pass
+handle SIGTERM nostop noprint pass
+handle SIGUSR1 nostop noprint pass
+handle SIGUSR2 nostop noprint pass
+handle SIG$RTMIN nostop noprint pass
+info signals
 continue
-thread apply all backtrace
+backtrace full
+info locals
 detach
 quit
 " > script.gdb
@ -143,7 +155,10 @@ quit
    # FIXME Hung check may work incorrectly because of attached gdb
    # 1. False positives are possible
    # 2. We cannot attach another gdb to get stacktraces if some queries hung
-    sudo gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" >> /test_output/gdb.log &
+    gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
+    sleep 5
+    # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
+    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
 }

 configure
@ -214,6 +229,9 @@ zgrep -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server.log* > /dev/n
 zgrep -Fa "########################################" /test_output/* > /dev/null \
    && echo -e 'Killed by signal (output files)\tFAIL' >> /test_output/test_results.tsv

+zgrep -Fa " received signal " /test_output/gdb.log > /dev/null \
+    && echo -e 'Found signal in gdb.log\tFAIL' >> /test_output/test_results.tsv
+
 # Put logs into /test_output/
 for log_file in /var/log/clickhouse-server/clickhouse-server.log*
 do
--- a/tests/integration/test_system_flush_logs/test.py
+++ b/tests/integration/test_system_flush_logs/test.py
@ -54,3 +54,11 @@ def test_system_logs_non_empty_queue():
        'log_queries_min_type': 'QUERY_START',
    })
    node.query('SYSTEM FLUSH LOGS')
+
+
+def test_system_suspend():
+    node.query("CREATE TABLE t (x DateTime) ENGINE=Memory;")
+    node.query("INSERT INTO t VALUES (now());")
+    node.query("SYSTEM SUSPEND FOR 1 SECOND;")
+    node.query("INSERT INTO t VALUES (now());")
+    assert "1\n" == node.query("SELECT max(x) - min(x) >= 1 FROM t;")
--- a/tests/queries/0_stateless/01643_system_suspend.reference
+++ b/tests/queries/0_stateless/01643_system_suspend.reference
@ -1 +0,0 @@
-1
--- a/tests/queries/0_stateless/01643_system_suspend.sql
+++ b/tests/queries/0_stateless/01643_system_suspend.sql
@ -1,5 +0,0 @@
-CREATE TEMPORARY TABLE t (x DateTime);
-INSERT INTO t VALUES (now());
-SYSTEM SUSPEND FOR 1 SECOND;
-INSERT INTO t VALUES (now());
-SELECT max(x) - min(x) >= 1 FROM t;