Merge pull request #67622 from Algunenano/unit_test_asan

Don't run ASAN unit tests under gdb
Raúl Marín 2024-08-07 10:48:00 +00:00 committed by GitHub
commit c9340cba32
13 changed files with 177 additions and 121 deletions
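For context on the motivation: LeakSanitizer performs its end-of-process leak check by stopping all threads through ptrace, and that check does not work when the process is already being traced by a debugger, so an ASAN binary run under gdb effectively loses its leak reports. The sketch below is illustrative only (not part of this PR) and assumes a Linux host with clang and gdb installed:

    # Illustrative sketch, not from this PR: a deliberate leak reported by ASAN/LSan,
    # and the same binary run under gdb, where LSan's ptrace-based check cannot run.
    printf '#include <stdlib.h>\nint main(void) { malloc(4096); return 0; } /* leak */\n' > leak.c
    clang -fsanitize=address -g leak.c -o leak
    ./leak                      # exits non-zero and prints "LeakSanitizer: detected memory leaks"
    gdb -batch -ex run ./leak   # leak report is lost: LSan does not work under ptrace (gdb, strace)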

View File

@@ -28,12 +28,14 @@ RUN echo "TSAN_OPTIONS='verbosity=1000 halt_on_error=1 abort_on_error=1 history_
RUN echo "UBSAN_OPTIONS='print_stacktrace=1 max_allocation_size_mb=32768'" >> /etc/environment
RUN echo "MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1 max_allocation_size_mb=32768'" >> /etc/environment
RUN echo "LSAN_OPTIONS='suppressions=/usr/share/clickhouse-test/config/lsan_suppressions.txt max_allocation_size_mb=32768'" >> /etc/environment
+RUN echo "ASAN_OPTIONS='halt_on_error=1 abort_on_error=1'" >> /etc/environment
# Sanitizer options for current shell (not current, but the one that will be spawned on "docker run")
# (but w/o verbosity for TSAN, otherwise test.reference will not match)
ENV TSAN_OPTIONS='halt_on_error=1 abort_on_error=1 history_size=7 memory_limit_mb=46080 second_deadlock_stack=1 max_allocation_size_mb=32768'
ENV UBSAN_OPTIONS='print_stacktrace=1 max_allocation_size_mb=32768'
ENV MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1 max_allocation_size_mb=32768'
ENV LSAN_OPTIONS='max_allocation_size_mb=32768'
+ENV ASAN_OPTIONS='halt_on_error=1 abort_on_error=1'
# for external_symbolizer_path, and also ensure that llvm-symbolizer really
# exists (since you don't want to fallback to addr2line, it is very slow)
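As a rough note on the options being added here (flag semantics as documented for the sanitizers; the example is not part of the diff): halt_on_error=1 stops the run at the first reported error, and abort_on_error=1 makes ASAN terminate via abort() rather than a plain exit, so a sanitizer failure surfaces as a hard non-zero status that CI cannot silently swallow:

    # Illustrative local run with the same options the image now exports.
    ASAN_OPTIONS='halt_on_error=1 abort_on_error=1' ./unit_tests_dbms
    echo "exit status: $?"   # abort() => terminated by SIGABRT, non-zero status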

View File

@@ -193,6 +193,11 @@ function fuzz
kill -0 $server_pid
+IS_ASAN=$(clickhouse-client --query "SELECT count() FROM system.build_options WHERE name = 'CXX_FLAGS' AND position('sanitize=address' IN value)")
+if [[ "$IS_ASAN" = "1" ]];
+then
+echo "ASAN build detected. Not using gdb since it disables LeakSanitizer detections"
+else
# Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
# and clickhouse-server can do fork-exec, for example, to run some bridge.
# Do not set nostop noprint for all signals, because some it may cause gdb to hang,
@@ -240,6 +245,8 @@ quit
sleep 1
done
kill -0 $server_pid # This checks that it is our server that is started and not some other one
+fi
echo 'Server started and responded.'
setup_logs_replication
@@ -264,8 +271,13 @@ quit
# The fuzzer_pid belongs to the timeout process.
actual_fuzzer_pid=$(ps -o pid= --ppid "$fuzzer_pid")
+if [[ "$IS_ASAN" = "1" ]];
+then
+echo "ASAN build detected. Not using gdb since it disables LeakSanitizer detections"
+else
echo "Attaching gdb to the fuzzer itself"
gdb -batch -command script.gdb -p $actual_fuzzer_pid &
+fi
# Wait for the fuzzer to complete.
# Note that the 'wait || ...' thing is required so that the script doesn't

View File

@@ -5,6 +5,11 @@ source /utils.lib
function attach_gdb_to_clickhouse()
{
+IS_ASAN=$(clickhouse-client --query "SELECT count() FROM system.build_options WHERE name = 'CXX_FLAGS' AND position('sanitize=address' IN value)")
+if [[ "$IS_ASAN" = "1" ]];
+then
+echo "ASAN build detected. Not using gdb since it disables LeakSanitizer detections"
+else
# Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
# and clickhouse-server can do fork-exec, for example, to run some bridge.
# Do not set nostop noprint for all signals, because some it may cause gdb to hang,
@@ -46,6 +51,7 @@ quit
sleep 5
# gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
run_with_retry 60 clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'"
+fi
}
# vi: ft=bash
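For reference, the detection relies on ASAN builds carrying -fsanitize=address in the CXX_FLAGS row of system.build_options; the outputs shown below are illustrative, not captured from a real run:

    clickhouse-client --query "SELECT count() FROM system.build_options WHERE name = 'CXX_FLAGS' AND position('sanitize=address' IN value)"
    # 1  -> ASAN build: skip gdb so LeakSanitizer keeps working
    # 0  -> any other build: attach gdb as before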

View File

@@ -174,7 +174,7 @@ do
done
setup_logs_replication
-attach_gdb_to_clickhouse || true # FIXME: to not break old builds, clean on 2023-09-01
+attach_gdb_to_clickhouse
function fn_exists() {
declare -F "$1" > /dev/null;

View File

@@ -308,7 +308,8 @@ function collect_query_and_trace_logs()
{
for table in query_log trace_log metric_log
do
-clickhouse-local --config-file=/etc/clickhouse-server/config.xml --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
+# Don't ignore errors here, it leads to ignore sanitizer reports when running clickhouse-local
+clickhouse-local --config-file=/etc/clickhouse-server/config.xml --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst
done
}

View File

@@ -4,4 +4,5 @@ ARG FROM_TAG=latest
FROM clickhouse/test-base:$FROM_TAG
COPY run.sh /
-CMD ["/bin/bash", "/run.sh"]
+RUN chmod +x run.sh
+ENTRYPOINT ["/run.sh"]

View File

@@ -1,5 +1,27 @@
#!/bin/bash
set -x
+# Need to keep error from tests after `tee`. Otherwise we don't alert on asan errors
+set -o pipefail
+set -e
-timeout 40m gdb -q -ex 'set print inferior-events off' -ex 'set confirm off' -ex 'set print thread-events off' -ex run -ex bt -ex quit --args ./unit_tests_dbms --gtest_output='json:test_output/test_result.json' | tee test_output/test_result.txt
+if [ "$#" -ne 1 ]; then
+echo "Expected exactly one argument"
+exit 1
+fi
+if [ "$1" = "GDB" ];
+then
+timeout 40m \
+gdb -q -ex "set print inferior-events off" -ex "set confirm off" -ex "set print thread-events off" -ex run -ex bt -ex quit --args \
+./unit_tests_dbms --gtest_output='json:test_output/test_result.json' \
+| tee test_output/test_result.txt
+elif [ "$1" = "NO_GDB" ];
+then
+timeout 40m \
+./unit_tests_dbms --gtest_output='json:test_output/test_result.json' \
+| tee test_output/test_result.txt
+else
+echo "Unknown argument: $1"
+exit 1
+fi
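With the image switched to an ENTRYPOINT, the mode is passed as the single container argument. The invocations below are hypothetical (image name, tag and host paths are placeholders) and mirror the command the CI script assembles later in this PR:

    # Hypothetical invocations; image tag and host paths are placeholders.
    docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
        --volume=$PWD/unit_tests_dbms:/unit_tests_dbms \
        --volume=$PWD/test_output:/test_output \
        clickhouse/unit-test:latest GDB      # regular builds: run the tests under gdb
    docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined \
        --volume=$PWD/unit_tests_dbms:/unit_tests_dbms \
        --volume=$PWD/test_output:/test_output \
        clickhouse/unit-test:latest NO_GDB   # ASAN builds: plain run, LeakSanitizer stays active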

View File

@@ -8,7 +8,9 @@ using namespace DB;
using ResourceTest = ResourceTestClass;
-TEST(SchedulerFairPolicy, Factory)
+/// Tests disabled because of leaks in the test themselves: https://github.com/ClickHouse/ClickHouse/issues/67678
+TEST(DISABLED_SchedulerFairPolicy, Factory)
{
ResourceTest t;
@@ -17,7 +19,7 @@ TEST(SchedulerFairPolicy, Factory)
EXPECT_TRUE(dynamic_cast<FairPolicy *>(fair.get()) != nullptr);
}
-TEST(SchedulerFairPolicy, FairnessWeights)
+TEST(DISABLED_SchedulerFairPolicy, FairnessWeights)
{
ResourceTest t;
@@ -41,7 +43,7 @@ TEST(SchedulerFairPolicy, FairnessWeights)
t.consumed("B", 20);
}
-TEST(SchedulerFairPolicy, Activation)
+TEST(DISABLED_SchedulerFairPolicy, Activation)
{
ResourceTest t;
@@ -77,7 +79,7 @@ TEST(SchedulerFairPolicy, Activation)
t.consumed("B", 10);
}
-TEST(SchedulerFairPolicy, FairnessMaxMin)
+TEST(DISABLED_SchedulerFairPolicy, FairnessMaxMin)
{
ResourceTest t;
@@ -101,7 +103,7 @@ TEST(SchedulerFairPolicy, FairnessMaxMin)
t.consumed("A", 20);
}
-TEST(SchedulerFairPolicy, HierarchicalFairness)
+TEST(DISABLED_SchedulerFairPolicy, HierarchicalFairness)
{
ResourceTest t;
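A note on the DISABLED_ prefix used in this and the following two files: it is googletest's standard way to keep a test compiled but skipped by default, so while the leak tracked in issue #67678 is open the cases can still be exercised locally on demand, for example:

    # Not part of this PR; standard googletest flags to opt back in locally.
    ./unit_tests_dbms --gtest_also_run_disabled_tests --gtest_filter='*SchedulerFairPolicy*'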

View File

@@ -8,7 +8,9 @@ using namespace DB;
using ResourceTest = ResourceTestClass;
-TEST(SchedulerPriorityPolicy, Factory)
+/// Tests disabled because of leaks in the test themselves: https://github.com/ClickHouse/ClickHouse/issues/67678
+TEST(DISABLED_SchedulerPriorityPolicy, Factory)
{
ResourceTest t;
@@ -17,7 +19,7 @@ TEST(SchedulerPriorityPolicy, Factory)
EXPECT_TRUE(dynamic_cast<PriorityPolicy *>(prio.get()) != nullptr);
}
-TEST(SchedulerPriorityPolicy, Priorities)
+TEST(DISABLED_SchedulerPriorityPolicy, Priorities)
{
ResourceTest t;
@@ -51,7 +53,7 @@ TEST(SchedulerPriorityPolicy, Priorities)
t.consumed("C", 0);
}
-TEST(SchedulerPriorityPolicy, Activation)
+TEST(DISABLED_SchedulerPriorityPolicy, Activation)
{
ResourceTest t;
@@ -92,7 +94,7 @@ TEST(SchedulerPriorityPolicy, Activation)
t.consumed("C", 0);
}
-TEST(SchedulerPriorityPolicy, SinglePriority)
+TEST(DISABLED_SchedulerPriorityPolicy, SinglePriority)
{
ResourceTest t;

View File

@@ -10,7 +10,9 @@ using namespace DB;
using ResourceTest = ResourceTestClass;
-TEST(SchedulerThrottlerConstraint, LeakyBucketConstraint)
+/// Tests disabled because of leaks in the test themselves: https://github.com/ClickHouse/ClickHouse/issues/67678
+TEST(DISABLED_SchedulerThrottlerConstraint, LeakyBucketConstraint)
{
ResourceTest t;
EventQueue::TimePoint start = std::chrono::system_clock::now();
@@ -40,7 +42,7 @@ TEST(SchedulerThrottlerConstraint, LeakyBucketConstraint)
t.consumed("A", 10);
}
-TEST(SchedulerThrottlerConstraint, Unlimited)
+TEST(DISABLED_SchedulerThrottlerConstraint, Unlimited)
{
ResourceTest t;
EventQueue::TimePoint start = std::chrono::system_clock::now();
@@ -57,7 +59,7 @@ TEST(SchedulerThrottlerConstraint, Unlimited)
}
}
-TEST(SchedulerThrottlerConstraint, Pacing)
+TEST(DISABLED_SchedulerThrottlerConstraint, Pacing)
{
ResourceTest t;
EventQueue::TimePoint start = std::chrono::system_clock::now();
@@ -77,7 +79,7 @@ TEST(SchedulerThrottlerConstraint, Pacing)
}
}
-TEST(SchedulerThrottlerConstraint, BucketFilling)
+TEST(DISABLED_SchedulerThrottlerConstraint, BucketFilling)
{
ResourceTest t;
EventQueue::TimePoint start = std::chrono::system_clock::now();
@@ -111,7 +113,7 @@ TEST(SchedulerThrottlerConstraint, BucketFilling)
t.consumed("A", 3);
}
-TEST(SchedulerThrottlerConstraint, PeekAndAvgLimits)
+TEST(DISABLED_SchedulerThrottlerConstraint, PeekAndAvgLimits)
{
ResourceTest t;
EventQueue::TimePoint start = std::chrono::system_clock::now();
@@ -139,7 +141,7 @@ TEST(SchedulerThrottlerConstraint, PeekAndAvgLimits)
}
}
-TEST(SchedulerThrottlerConstraint, ThrottlerAndFairness)
+TEST(DISABLED_SchedulerThrottlerConstraint, ThrottlerAndFairness)
{
ResourceTest t;
EventQueue::TimePoint start = std::chrono::system_clock::now();

View File

@@ -14,12 +14,12 @@
/// because of broken getauxval() [1].
///
/// [1]: https://github.com/ClickHouse/ClickHouse/pull/33957
-TEST(Common, LSan)
+TEST(SanitizerDeathTest, LSan)
{
-int sanitizers_exit_code = 1;
+EXPECT_DEATH(
+{
-ASSERT_EXIT({
+std::thread leak_in_thread(
-std::thread leak_in_thread([]()
+[]()
{
void * leak = malloc(4096);
ASSERT_NE(leak, nullptr);
@@ -27,7 +27,8 @@ TEST(Common, LSan)
leak_in_thread.join();
__lsan_do_leak_check();
-}, ::testing::ExitedWithCode(sanitizers_exit_code), ".*LeakSanitizer: detected memory leaks.*");
+},
+".*LeakSanitizer: detected memory leaks.*");
}
#endif
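Background on the rewrite above: EXPECT_DEATH runs the guarded statement in a forked child and checks that the child terminates unsuccessfully with stderr matching the given pattern, and the *DeathTest suite-name convention lets googletest schedule such tests early; this replaces the previous ASSERT_EXIT check against a hard-coded exit code. An illustrative way to run just this check:

    # Illustrative; runs only the renamed death test.
    ./unit_tests_dbms --gtest_filter='SanitizerDeathTest.*'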

View File

@@ -174,10 +174,13 @@ def main():
test_output = temp_path / "test_output"
test_output.mkdir(parents=True, exist_ok=True)
+# Don't run ASAN under gdb since that breaks leak detection
+gdb_enabled = "NO_GDB" if "asan" in check_name else "GDB"
run_command = (
f"docker run --cap-add=SYS_PTRACE --volume={tests_binary}:/unit_tests_dbms "
"--security-opt seccomp=unconfined "  # required to issue io_uring sys-calls
-f"--volume={test_output}:/test_output {docker_image}"
+f"--volume={test_output}:/test_output {docker_image} {gdb_enabled}"
)
run_log_path = test_output / "run.log"
@@ -194,6 +197,11 @@ def main():
subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {TEMP_PATH}", shell=True)
state, description, test_results = process_results(test_output)
+if retcode != 0 and state == SUCCESS:
+# The process might have failed without reporting it in the test_output (e.g. LeakSanitizer)
+state = FAILURE
+description = "Invalid return code. Check run.log"
additional_files = [run_log_path] + [
p for p in test_output.iterdir() if not p.is_dir()
]

View File

@@ -1,9 +1,6 @@
#!/usr/bin/env bash
# shellcheck disable=SC2120
-# Don't check for ODR violation, since we may test shared build with ASAN
-export ASAN_OPTIONS=detect_odr_violation=0
# If ClickHouse was built with coverage - dump the coverage information at exit
# (in other cases this environment variable has no effect)
export CLICKHOUSE_WRITE_COVERAGE="coverage"