Merge pull request #67622 from Algunenano/unit_test_asan

Don't run ASAN unit tests under gdb
2024-09-19 16:20:50 +00:00 · 2024-08-07 10:48:00 +00:00 · 2024-08-07 10:48:00 +00:00 · c9340cba32
commit c9340cba32
parent 47111eb5af 25fa63f7e6
13 changed files with 177 additions and 121 deletions
--- a/docker/test/base/Dockerfile
+++ b/docker/test/base/Dockerfile
@ -28,12 +28,14 @@ RUN echo "TSAN_OPTIONS='verbosity=1000 halt_on_error=1 abort_on_error=1 history_
 RUN echo "UBSAN_OPTIONS='print_stacktrace=1 max_allocation_size_mb=32768'" >> /etc/environment
 RUN echo "MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1 max_allocation_size_mb=32768'" >> /etc/environment
 RUN echo "LSAN_OPTIONS='suppressions=/usr/share/clickhouse-test/config/lsan_suppressions.txt max_allocation_size_mb=32768'" >> /etc/environment
 RUN echo "ASAN_OPTIONS='halt_on_error=1 abort_on_error=1'" >> /etc/environment
 # Sanitizer options for current shell (not current, but the one that will be spawned on "docker run")
 # (but w/o verbosity for TSAN, otherwise test.reference will not match)
 ENV TSAN_OPTIONS='halt_on_error=1 abort_on_error=1 history_size=7 memory_limit_mb=46080 second_deadlock_stack=1 max_allocation_size_mb=32768'
 ENV UBSAN_OPTIONS='print_stacktrace=1 max_allocation_size_mb=32768'
 ENV MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1 max_allocation_size_mb=32768'
 ENV LSAN_OPTIONS='max_allocation_size_mb=32768'
 ENV ASAN_OPTIONS='halt_on_error=1 abort_on_error=1'
 # for external_symbolizer_path, and also ensure that llvm-symbolizer really
 # exists (since you don't want to fallback to addr2line, it is very slow)
--- a/docker/test/fuzzer/run-fuzzer.sh
+++ b/docker/test/fuzzer/run-fuzzer.sh
@ -193,53 +193,60 @@ function fuzz
    kill -0 $server_pid
-    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
+    IS_ASAN=$(clickhouse-client --query "SELECT count() FROM system.build_options WHERE name = 'CXX_FLAGS' AND position('sanitize=address' IN value)")
-    # and clickhouse-server can do fork-exec, for example, to run some bridge.
+    if [[ "$IS_ASAN" = "1" ]];
-    # Do not set nostop noprint for all signals, because some it may cause gdb to hang,
+    then
-    # explicitly ignore non-fatal signals that are used by server.
+        echo "ASAN build detected. Not using gdb since it disables LeakSanitizer detections"
-    # Number of SIGRTMIN can be determined only in runtime.
+    else
-    RTMIN=$(kill -l SIGRTMIN)
+        # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
-    echo "
+        # and clickhouse-server can do fork-exec, for example, to run some bridge.
-set follow-fork-mode parent
+        # Do not set nostop noprint for all signals, because some it may cause gdb to hang,
-handle SIGHUP nostop noprint pass
+        # explicitly ignore non-fatal signals that are used by server.
-handle SIGINT nostop noprint pass
+        # Number of SIGRTMIN can be determined only in runtime.
-handle SIGQUIT nostop noprint pass
+        RTMIN=$(kill -l SIGRTMIN)
-handle SIGPIPE nostop noprint pass
+        echo "
-handle SIGTERM nostop noprint pass
+    set follow-fork-mode parent
-handle SIGUSR1 nostop noprint pass
+    handle SIGHUP nostop noprint pass
-handle SIGUSR2 nostop noprint pass
+    handle SIGINT nostop noprint pass
-handle SIG$RTMIN nostop noprint pass
+    handle SIGQUIT nostop noprint pass
-info signals
+    handle SIGPIPE nostop noprint pass
-continue
+    handle SIGTERM nostop noprint pass
-backtrace full
+    handle SIGUSR1 nostop noprint pass
-thread apply all backtrace full
+    handle SIGUSR2 nostop noprint pass
-info registers
+    handle SIG$RTMIN nostop noprint pass
-disassemble /s
+    info signals
-up
+    continue
-disassemble /s
+    backtrace full
-up
+    thread apply all backtrace full
-disassemble /s
+    info registers
-p \"done\"
+    disassemble /s
-detach
+    up
-quit
+    disassemble /s
-" > script.gdb
+    up
    disassemble /s
    p \"done\"
    detach
    quit
    " > script.gdb
-    gdb -batch -command script.gdb -p $server_pid &
+        gdb -batch -command script.gdb -p $server_pid &
-    sleep 5
+        sleep 5
-    # gdb will send SIGSTOP, spend some time loading debug info, and then send SIGCONT, wait for it (up to send_timeout, 300s)
+        # gdb will send SIGSTOP, spend some time loading debug info, and then send SIGCONT, wait for it (up to send_timeout, 300s)
-    time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
+        time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
        # Check connectivity after we attach gdb, because it might cause the server
        # to freeze, and the fuzzer will fail. In debug build, it can take a lot of time.
        for _ in {1..180}
        do
            if clickhouse-client --query "select 1"
            then
                break
            fi
            sleep 1
        done
        kill -0 $server_pid # This checks that it is our server that is started and not some other one
    fi
    # Check connectivity after we attach gdb, because it might cause the server
    # to freeze, and the fuzzer will fail. In debug build, it can take a lot of time.
    for _ in {1..180}
    do
        if clickhouse-client --query "select 1"
        then
            break
        fi
        sleep 1
    done
    kill -0 $server_pid # This checks that it is our server that is started and not some other one
    echo 'Server started and responded.'
    setup_logs_replication
@ -264,8 +271,13 @@ quit
    # The fuzzer_pid belongs to the timeout process.
    actual_fuzzer_pid=$(ps -o pid= --ppid "$fuzzer_pid")
-    echo "Attaching gdb to the fuzzer itself"
+    if [[ "$IS_ASAN" = "1" ]];
-    gdb -batch -command script.gdb -p $actual_fuzzer_pid &
+    then
        echo "ASAN build detected. Not using gdb since it disables LeakSanitizer detections"
    else
        echo "Attaching gdb to the fuzzer itself"
        gdb -batch -command script.gdb -p $actual_fuzzer_pid &
    fi
    # Wait for the fuzzer to complete.
    # Note that the 'wait || ...' thing is required so that the script doesn't
--- a/docker/test/stateless/attach_gdb.lib
+++ b/docker/test/stateless/attach_gdb.lib
@ -5,47 +5,53 @@ source /utils.lib
 function attach_gdb_to_clickhouse()
 {
-    # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
+    IS_ASAN=$(clickhouse-client --query "SELECT count() FROM system.build_options WHERE name = 'CXX_FLAGS' AND position('sanitize=address' IN value)")
-    # and clickhouse-server can do fork-exec, for example, to run some bridge.
+    if [[ "$IS_ASAN" = "1" ]];
-    # Do not set nostop noprint for all signals, because some it may cause gdb to hang,
+    then
-    # explicitly ignore non-fatal signals that are used by server.
+        echo "ASAN build detected. Not using gdb since it disables LeakSanitizer detections"
-    # Number of SIGRTMIN can be determined only in runtime.
+    else
-    RTMIN=$(kill -l SIGRTMIN)
+            # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
-    # shellcheck disable=SC2016
+            # and clickhouse-server can do fork-exec, for example, to run some bridge.
-    echo "
+            # Do not set nostop noprint for all signals, because some it may cause gdb to hang,
-set follow-fork-mode parent
+            # explicitly ignore non-fatal signals that are used by server.
-handle SIGHUP nostop noprint pass
+            # Number of SIGRTMIN can be determined only in runtime.
-handle SIGINT nostop noprint pass
+            RTMIN=$(kill -l SIGRTMIN)
-handle SIGQUIT nostop noprint pass
+            # shellcheck disable=SC2016
-handle SIGPIPE nostop noprint pass
+            echo "
-handle SIGTERM nostop noprint pass
+        set follow-fork-mode parent
-handle SIGUSR1 nostop noprint pass
+        handle SIGHUP nostop noprint pass
-handle SIGUSR2 nostop noprint pass
+        handle SIGINT nostop noprint pass
-handle SIG$RTMIN nostop noprint pass
+        handle SIGQUIT nostop noprint pass
-info signals
+        handle SIGPIPE nostop noprint pass
-continue
+        handle SIGTERM nostop noprint pass
-backtrace full
+        handle SIGUSR1 nostop noprint pass
-info registers
+        handle SIGUSR2 nostop noprint pass
-p "top 1 KiB of the stack:"
+        handle SIG$RTMIN nostop noprint pass
-p/x *(uint64_t[128]*)"'$sp'"
+        info signals
-maintenance info sections
+        continue
-thread apply all backtrace full
+        backtrace full
-disassemble /s
+        info registers
-up
+        p "top 1 KiB of the stack:"
-disassemble /s
+        p/x *(uint64_t[128]*)"'$sp'"
-up
+        maintenance info sections
-disassemble /s
+        thread apply all backtrace full
-p \"done\"
+        disassemble /s
-detach
+        up
-quit
+        disassemble /s
-" > script.gdb
+        up
        disassemble /s
        p \"done\"
        detach
        quit
        " > script.gdb
-    # FIXME Hung check may work incorrectly because of attached gdb
+            # FIXME Hung check may work incorrectly because of attached gdb
-    # We cannot attach another gdb to get stacktraces if some queries hung
+            # We cannot attach another gdb to get stacktraces if some queries hung
-    gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
+            gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
-    sleep 5
+            sleep 5
-    # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
+            # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
-    run_with_retry 60 clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'"
+            run_with_retry 60 clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'"
    fi
 }
 # vi: ft=bash
--- a/docker/test/stateless/run.sh
+++ b/docker/test/stateless/run.sh
@ -174,7 +174,7 @@ do
 done
 setup_logs_replication
-attach_gdb_to_clickhouse || true  # FIXME: to not break old builds, clean on 2023-09-01
+attach_gdb_to_clickhouse
 function fn_exists() {
    declare -F "$1" > /dev/null;
--- a/docker/test/stateless/stress_tests.lib
+++ b/docker/test/stateless/stress_tests.lib
@ -308,7 +308,8 @@ function collect_query_and_trace_logs()
 {
    for table in query_log trace_log metric_log
    do
-        clickhouse-local --config-file=/etc/clickhouse-server/config.xml --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
+        # Don't ignore errors here: ignoring them would hide sanitizer reports produced when running clickhouse-local
        clickhouse-local --config-file=/etc/clickhouse-server/config.xml --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst
    done
 }
--- a/docker/test/unit/Dockerfile
+++ b/docker/test/unit/Dockerfile
@ -4,4 +4,5 @@ ARG FROM_TAG=latest
 FROM clickhouse/test-base:$FROM_TAG
 COPY run.sh /
-CMD ["/bin/bash", "/run.sh"]
+RUN chmod +x run.sh
 ENTRYPOINT ["/run.sh"]
--- a/docker/test/unit/run.sh
+++ b/docker/test/unit/run.sh
@ -1,5 +1,27 @@
 #!/bin/bash
 set -x
 # Need to keep error from tests after `tee`. Otherwise we don't alert on asan errors
 set -o pipefail
 set -e
-timeout 40m gdb -q  -ex 'set print inferior-events off' -ex 'set confirm off' -ex 'set print thread-events off' -ex run -ex bt -ex quit --args ./unit_tests_dbms --gtest_output='json:test_output/test_result.json' | tee test_output/test_result.txt
+if [ "$#" -ne 1 ]; then
    echo "Expected exactly one argument"
    exit 1
 fi
 if [ "$1" = "GDB" ];
 then
  timeout 40m \
    gdb -q  -ex "set print inferior-events off" -ex "set confirm off" -ex "set print thread-events off" -ex run -ex bt -ex quit --args \
    ./unit_tests_dbms --gtest_output='json:test_output/test_result.json' \
    | tee test_output/test_result.txt
 elif [ "$1" = "NO_GDB" ];
 then
  timeout 40m \
    ./unit_tests_dbms --gtest_output='json:test_output/test_result.json' \
    | tee test_output/test_result.txt
 else
    echo "Unknown argument: $1"
    exit 1
 fi
--- a/src/Common/Scheduler/Nodes/tests/gtest_resource_class_fair.cpp
+++ b/src/Common/Scheduler/Nodes/tests/gtest_resource_class_fair.cpp
@ -8,7 +8,9 @@ using namespace DB;
 using ResourceTest = ResourceTestClass;
-TEST(SchedulerFairPolicy, Factory)
+/// Tests disabled because of leaks in the tests themselves: https://github.com/ClickHouse/ClickHouse/issues/67678
 TEST(DISABLED_SchedulerFairPolicy, Factory)
 {
    ResourceTest t;
@ -17,7 +19,7 @@ TEST(SchedulerFairPolicy, Factory)
    EXPECT_TRUE(dynamic_cast<FairPolicy *>(fair.get()) != nullptr);
 }
-TEST(SchedulerFairPolicy, FairnessWeights)
+TEST(DISABLED_SchedulerFairPolicy, FairnessWeights)
 {
    ResourceTest t;
@ -41,7 +43,7 @@ TEST(SchedulerFairPolicy, FairnessWeights)
    t.consumed("B", 20);
 }
-TEST(SchedulerFairPolicy, Activation)
+TEST(DISABLED_SchedulerFairPolicy, Activation)
 {
    ResourceTest t;
@ -77,7 +79,7 @@ TEST(SchedulerFairPolicy, Activation)
    t.consumed("B", 10);
 }
-TEST(SchedulerFairPolicy, FairnessMaxMin)
+TEST(DISABLED_SchedulerFairPolicy, FairnessMaxMin)
 {
    ResourceTest t;
@ -101,7 +103,7 @@ TEST(SchedulerFairPolicy, FairnessMaxMin)
    t.consumed("A", 20);
 }
-TEST(SchedulerFairPolicy, HierarchicalFairness)
+TEST(DISABLED_SchedulerFairPolicy, HierarchicalFairness)
 {
    ResourceTest t;
--- a/src/Common/Scheduler/Nodes/tests/gtest_resource_class_priority.cpp
+++ b/src/Common/Scheduler/Nodes/tests/gtest_resource_class_priority.cpp
@ -8,7 +8,9 @@ using namespace DB;
 using ResourceTest = ResourceTestClass;
-TEST(SchedulerPriorityPolicy, Factory)
+/// Tests disabled because of leaks in the tests themselves: https://github.com/ClickHouse/ClickHouse/issues/67678
 TEST(DISABLED_SchedulerPriorityPolicy, Factory)
 {
    ResourceTest t;
@ -17,7 +19,7 @@ TEST(SchedulerPriorityPolicy, Factory)
    EXPECT_TRUE(dynamic_cast<PriorityPolicy *>(prio.get()) != nullptr);
 }
-TEST(SchedulerPriorityPolicy, Priorities)
+TEST(DISABLED_SchedulerPriorityPolicy, Priorities)
 {
    ResourceTest t;
@ -51,7 +53,7 @@ TEST(SchedulerPriorityPolicy, Priorities)
    t.consumed("C", 0);
 }
-TEST(SchedulerPriorityPolicy, Activation)
+TEST(DISABLED_SchedulerPriorityPolicy, Activation)
 {
    ResourceTest t;
@ -92,7 +94,7 @@ TEST(SchedulerPriorityPolicy, Activation)
    t.consumed("C", 0);
 }
-TEST(SchedulerPriorityPolicy, SinglePriority)
+TEST(DISABLED_SchedulerPriorityPolicy, SinglePriority)
 {
    ResourceTest t;
--- a/src/Common/Scheduler/Nodes/tests/gtest_throttler_constraint.cpp
+++ b/src/Common/Scheduler/Nodes/tests/gtest_throttler_constraint.cpp
@ -10,7 +10,9 @@ using namespace DB;
 using ResourceTest = ResourceTestClass;
-TEST(SchedulerThrottlerConstraint, LeakyBucketConstraint)
+/// Tests disabled because of leaks in the tests themselves: https://github.com/ClickHouse/ClickHouse/issues/67678
 TEST(DISABLED_SchedulerThrottlerConstraint, LeakyBucketConstraint)
 {
    ResourceTest t;
    EventQueue::TimePoint start = std::chrono::system_clock::now();
@ -40,7 +42,7 @@ TEST(SchedulerThrottlerConstraint, LeakyBucketConstraint)
    t.consumed("A", 10);
 }
-TEST(SchedulerThrottlerConstraint, Unlimited)
+TEST(DISABLED_SchedulerThrottlerConstraint, Unlimited)
 {
    ResourceTest t;
    EventQueue::TimePoint start = std::chrono::system_clock::now();
@ -57,7 +59,7 @@ TEST(SchedulerThrottlerConstraint, Unlimited)
    }
 }
-TEST(SchedulerThrottlerConstraint, Pacing)
+TEST(DISABLED_SchedulerThrottlerConstraint, Pacing)
 {
    ResourceTest t;
    EventQueue::TimePoint start = std::chrono::system_clock::now();
@ -77,7 +79,7 @@ TEST(SchedulerThrottlerConstraint, Pacing)
    }
 }
-TEST(SchedulerThrottlerConstraint, BucketFilling)
+TEST(DISABLED_SchedulerThrottlerConstraint, BucketFilling)
 {
    ResourceTest t;
    EventQueue::TimePoint start = std::chrono::system_clock::now();
@ -111,7 +113,7 @@ TEST(SchedulerThrottlerConstraint, BucketFilling)
    t.consumed("A", 3);
 }
-TEST(SchedulerThrottlerConstraint, PeekAndAvgLimits)
+TEST(DISABLED_SchedulerThrottlerConstraint, PeekAndAvgLimits)
 {
    ResourceTest t;
    EventQueue::TimePoint start = std::chrono::system_clock::now();
@ -139,7 +141,7 @@ TEST(SchedulerThrottlerConstraint, PeekAndAvgLimits)
    }
 }
-TEST(SchedulerThrottlerConstraint, ThrottlerAndFairness)
+TEST(DISABLED_SchedulerThrottlerConstraint, ThrottlerAndFairness)
 {
    ResourceTest t;
    EventQueue::TimePoint start = std::chrono::system_clock::now();
--- a/src/Common/tests/gtest_lsan.cpp
+++ b/src/Common/tests/gtest_lsan.cpp
@ -14,20 +14,21 @@
 /// because of broken getauxval() [1].
 ///
 ///   [1]: https://github.com/ClickHouse/ClickHouse/pull/33957
-TEST(Common, LSan)
+TEST(SanitizerDeathTest, LSan)
 {
-    int sanitizers_exit_code = 1;
+    EXPECT_DEATH(
    ASSERT_EXIT({
        std::thread leak_in_thread([]()
        {
-            void * leak = malloc(4096);
+            std::thread leak_in_thread(
-            ASSERT_NE(leak, nullptr);
+                []()
-        });
+                {
-        leak_in_thread.join();
+                    void * leak = malloc(4096);
                    ASSERT_NE(leak, nullptr);
                });
            leak_in_thread.join();
-        __lsan_do_leak_check();
+            __lsan_do_leak_check();
-    }, ::testing::ExitedWithCode(sanitizers_exit_code), ".*LeakSanitizer: detected memory leaks.*");
+        },
        ".*LeakSanitizer: detected memory leaks.*");
 }
 #endif
--- a/tests/ci/unit_tests_check.py
+++ b/tests/ci/unit_tests_check.py
@ -174,10 +174,13 @@ def main():
    test_output = temp_path / "test_output"
    test_output.mkdir(parents=True, exist_ok=True)
    # Don't run ASAN under gdb since that breaks leak detection
    gdb_enabled = "NO_GDB" if "asan" in check_name else "GDB"
    run_command = (
        f"docker run --cap-add=SYS_PTRACE --volume={tests_binary}:/unit_tests_dbms "
        "--security-opt seccomp=unconfined "  # required to issue io_uring sys-calls
-        f"--volume={test_output}:/test_output {docker_image}"
+        f"--volume={test_output}:/test_output {docker_image} {gdb_enabled}"
    )
    run_log_path = test_output / "run.log"
@ -194,6 +197,11 @@ def main():
    subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {TEMP_PATH}", shell=True)
    state, description, test_results = process_results(test_output)
    if retcode != 0 and state == SUCCESS:
        # The process might have failed without reporting it in the test_output (e.g. LeakSanitizer)
        state = FAILURE
        description = "Invalid return code. Check run.log"
    additional_files = [run_log_path] + [
        p for p in test_output.iterdir() if not p.is_dir()
    ]
--- a/tests/queries/shell_config.sh
+++ b/tests/queries/shell_config.sh
@ -1,9 +1,6 @@
 #!/usr/bin/env bash
 # shellcheck disable=SC2120
 # Don't check for ODR violation, since we may test shared build with ASAN
 export ASAN_OPTIONS=detect_odr_violation=0
 # If ClickHouse was built with coverage - dump the coverage information at exit
 # (in other cases this environment variable has no effect)
 export CLICKHOUSE_WRITE_COVERAGE="coverage"