Merge pull request #67622 from Algunenano/unit_test_asan

Don't run ASAN unit tests under gdb
This commit is contained in:
Raúl Marín 2024-08-07 10:48:00 +00:00 committed by GitHub
commit c9340cba32
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 177 additions and 121 deletions

View File

@ -28,12 +28,14 @@ RUN echo "TSAN_OPTIONS='verbosity=1000 halt_on_error=1 abort_on_error=1 history_
RUN echo "UBSAN_OPTIONS='print_stacktrace=1 max_allocation_size_mb=32768'" >> /etc/environment RUN echo "UBSAN_OPTIONS='print_stacktrace=1 max_allocation_size_mb=32768'" >> /etc/environment
RUN echo "MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1 max_allocation_size_mb=32768'" >> /etc/environment RUN echo "MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1 max_allocation_size_mb=32768'" >> /etc/environment
RUN echo "LSAN_OPTIONS='suppressions=/usr/share/clickhouse-test/config/lsan_suppressions.txt max_allocation_size_mb=32768'" >> /etc/environment RUN echo "LSAN_OPTIONS='suppressions=/usr/share/clickhouse-test/config/lsan_suppressions.txt max_allocation_size_mb=32768'" >> /etc/environment
RUN echo "ASAN_OPTIONS='halt_on_error=1 abort_on_error=1'" >> /etc/environment
# Sanitizer options for current shell (not current, but the one that will be spawned on "docker run") # Sanitizer options for current shell (not current, but the one that will be spawned on "docker run")
# (but w/o verbosity for TSAN, otherwise test.reference will not match) # (but w/o verbosity for TSAN, otherwise test.reference will not match)
ENV TSAN_OPTIONS='halt_on_error=1 abort_on_error=1 history_size=7 memory_limit_mb=46080 second_deadlock_stack=1 max_allocation_size_mb=32768' ENV TSAN_OPTIONS='halt_on_error=1 abort_on_error=1 history_size=7 memory_limit_mb=46080 second_deadlock_stack=1 max_allocation_size_mb=32768'
ENV UBSAN_OPTIONS='print_stacktrace=1 max_allocation_size_mb=32768' ENV UBSAN_OPTIONS='print_stacktrace=1 max_allocation_size_mb=32768'
ENV MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1 max_allocation_size_mb=32768' ENV MSAN_OPTIONS='abort_on_error=1 poison_in_dtor=1 max_allocation_size_mb=32768'
ENV LSAN_OPTIONS='max_allocation_size_mb=32768' ENV LSAN_OPTIONS='max_allocation_size_mb=32768'
ENV ASAN_OPTIONS='halt_on_error=1 abort_on_error=1'
# for external_symbolizer_path, and also ensure that llvm-symbolizer really # for external_symbolizer_path, and also ensure that llvm-symbolizer really
# exists (since you don't want to fallback to addr2line, it is very slow) # exists (since you don't want to fallback to addr2line, it is very slow)

View File

@ -193,53 +193,60 @@ function fuzz
kill -0 $server_pid kill -0 $server_pid
# Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog IS_ASAN=$(clickhouse-client --query "SELECT count() FROM system.build_options WHERE name = 'CXX_FLAGS' AND position('sanitize=address' IN value)")
# and clickhouse-server can do fork-exec, for example, to run some bridge. if [[ "$IS_ASAN" = "1" ]];
# Do not set nostop noprint for all signals, because for some of them it may cause gdb to hang, then
# explicitly ignore non-fatal signals that are used by server. echo "ASAN build detected. Not using gdb since it disables LeakSanitizer detections"
# Number of SIGRTMIN can be determined only in runtime. else
RTMIN=$(kill -l SIGRTMIN) # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
echo " # and clickhouse-server can do fork-exec, for example, to run some bridge.
set follow-fork-mode parent # Do not set nostop noprint for all signals, because for some of them it may cause gdb to hang,
handle SIGHUP nostop noprint pass # explicitly ignore non-fatal signals that are used by server.
handle SIGINT nostop noprint pass # Number of SIGRTMIN can be determined only in runtime.
handle SIGQUIT nostop noprint pass RTMIN=$(kill -l SIGRTMIN)
handle SIGPIPE nostop noprint pass echo "
handle SIGTERM nostop noprint pass set follow-fork-mode parent
handle SIGUSR1 nostop noprint pass handle SIGHUP nostop noprint pass
handle SIGUSR2 nostop noprint pass handle SIGINT nostop noprint pass
handle SIG$RTMIN nostop noprint pass handle SIGQUIT nostop noprint pass
info signals handle SIGPIPE nostop noprint pass
continue handle SIGTERM nostop noprint pass
backtrace full handle SIGUSR1 nostop noprint pass
thread apply all backtrace full handle SIGUSR2 nostop noprint pass
info registers handle SIG$RTMIN nostop noprint pass
disassemble /s info signals
up continue
disassemble /s backtrace full
up thread apply all backtrace full
disassemble /s info registers
p \"done\" disassemble /s
detach up
quit disassemble /s
" > script.gdb up
disassemble /s
p \"done\"
detach
quit
" > script.gdb
gdb -batch -command script.gdb -p $server_pid & gdb -batch -command script.gdb -p $server_pid &
sleep 5 sleep 5
# gdb will send SIGSTOP, spend some time loading debug info, and then send SIGCONT, wait for it (up to send_timeout, 300s) # gdb will send SIGSTOP, spend some time loading debug info, and then send SIGCONT, wait for it (up to send_timeout, 300s)
time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||: time clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" ||:
# Check connectivity after we attach gdb, because it might cause the server
# to freeze, and the fuzzer will fail. In debug build, it can take a lot of time.
for _ in {1..180}
do
if clickhouse-client --query "select 1"
then
break
fi
sleep 1
done
kill -0 $server_pid # This checks that it is our server that is started and not some other one
fi
# Check connectivity after we attach gdb, because it might cause the server
# to freeze, and the fuzzer will fail. In debug build, it can take a lot of time.
for _ in {1..180}
do
if clickhouse-client --query "select 1"
then
break
fi
sleep 1
done
kill -0 $server_pid # This checks that it is our server that is started and not some other one
echo 'Server started and responded.' echo 'Server started and responded.'
setup_logs_replication setup_logs_replication
@ -264,8 +271,13 @@ quit
# The fuzzer_pid belongs to the timeout process. # The fuzzer_pid belongs to the timeout process.
actual_fuzzer_pid=$(ps -o pid= --ppid "$fuzzer_pid") actual_fuzzer_pid=$(ps -o pid= --ppid "$fuzzer_pid")
echo "Attaching gdb to the fuzzer itself" if [[ "$IS_ASAN" = "1" ]];
gdb -batch -command script.gdb -p $actual_fuzzer_pid & then
echo "ASAN build detected. Not using gdb since it disables LeakSanitizer detections"
else
echo "Attaching gdb to the fuzzer itself"
gdb -batch -command script.gdb -p $actual_fuzzer_pid &
fi
# Wait for the fuzzer to complete. # Wait for the fuzzer to complete.
# Note that the 'wait || ...' thing is required so that the script doesn't # Note that the 'wait || ...' thing is required so that the script doesn't

View File

@ -5,47 +5,53 @@ source /utils.lib
function attach_gdb_to_clickhouse() function attach_gdb_to_clickhouse()
{ {
# Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog IS_ASAN=$(clickhouse-client --query "SELECT count() FROM system.build_options WHERE name = 'CXX_FLAGS' AND position('sanitize=address' IN value)")
# and clickhouse-server can do fork-exec, for example, to run some bridge. if [[ "$IS_ASAN" = "1" ]];
# Do not set nostop noprint for all signals, because for some of them it may cause gdb to hang, then
# explicitly ignore non-fatal signals that are used by server. echo "ASAN build detected. Not using gdb since it disables LeakSanitizer detections"
# Number of SIGRTMIN can be determined only in runtime. else
RTMIN=$(kill -l SIGRTMIN) # Set follow-fork-mode to parent, because we attach to clickhouse-server, not to watchdog
# shellcheck disable=SC2016 # and clickhouse-server can do fork-exec, for example, to run some bridge.
echo " # Do not set nostop noprint for all signals, because for some of them it may cause gdb to hang,
set follow-fork-mode parent # explicitly ignore non-fatal signals that are used by server.
handle SIGHUP nostop noprint pass # Number of SIGRTMIN can be determined only in runtime.
handle SIGINT nostop noprint pass RTMIN=$(kill -l SIGRTMIN)
handle SIGQUIT nostop noprint pass # shellcheck disable=SC2016
handle SIGPIPE nostop noprint pass echo "
handle SIGTERM nostop noprint pass set follow-fork-mode parent
handle SIGUSR1 nostop noprint pass handle SIGHUP nostop noprint pass
handle SIGUSR2 nostop noprint pass handle SIGINT nostop noprint pass
handle SIG$RTMIN nostop noprint pass handle SIGQUIT nostop noprint pass
info signals handle SIGPIPE nostop noprint pass
continue handle SIGTERM nostop noprint pass
backtrace full handle SIGUSR1 nostop noprint pass
info registers handle SIGUSR2 nostop noprint pass
p "top 1 KiB of the stack:" handle SIG$RTMIN nostop noprint pass
p/x *(uint64_t[128]*)"'$sp'" info signals
maintenance info sections continue
thread apply all backtrace full backtrace full
disassemble /s info registers
up p "top 1 KiB of the stack:"
disassemble /s p/x *(uint64_t[128]*)"'$sp'"
up maintenance info sections
disassemble /s thread apply all backtrace full
p \"done\" disassemble /s
detach up
quit disassemble /s
" > script.gdb up
disassemble /s
p \"done\"
detach
quit
" > script.gdb
# FIXME Hung check may work incorrectly because of attached gdb # FIXME Hung check may work incorrectly because of attached gdb
# We cannot attach another gdb to get stacktraces if some queries hung # We cannot attach another gdb to get stacktraces if some queries hung
gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log & gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" | ts '%Y-%m-%d %H:%M:%S' >> /test_output/gdb.log &
sleep 5 sleep 5
# gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s) # gdb will send SIGSTOP, spend some time loading debug info and then send SIGCONT, wait for it (up to send_timeout, 300s)
run_with_retry 60 clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'" run_with_retry 60 clickhouse-client --query "SELECT 'Connected to clickhouse-server after attaching gdb'"
fi
} }
# vi: ft=bash # vi: ft=bash

View File

@ -174,7 +174,7 @@ do
done done
setup_logs_replication setup_logs_replication
attach_gdb_to_clickhouse || true # FIXME: to not break old builds, clean on 2023-09-01 attach_gdb_to_clickhouse
function fn_exists() { function fn_exists() {
declare -F "$1" > /dev/null; declare -F "$1" > /dev/null;

View File

@ -308,7 +308,8 @@ function collect_query_and_trace_logs()
{ {
for table in query_log trace_log metric_log for table in query_log trace_log metric_log
do do
clickhouse-local --config-file=/etc/clickhouse-server/config.xml --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||: # Don't ignore errors here: ignoring them would hide sanitizer reports when running clickhouse-local
clickhouse-local --config-file=/etc/clickhouse-server/config.xml --only-system-tables -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst
done done
} }

View File

@ -4,4 +4,5 @@ ARG FROM_TAG=latest
FROM clickhouse/test-base:$FROM_TAG FROM clickhouse/test-base:$FROM_TAG
COPY run.sh / COPY run.sh /
CMD ["/bin/bash", "/run.sh"] RUN chmod +x run.sh
ENTRYPOINT ["/run.sh"]

View File

@ -1,5 +1,27 @@
#!/bin/bash #!/bin/bash
set -x set -x
# Propagate the tests' exit status through the `tee` pipeline; otherwise we don't alert on ASAN errors
set -o pipefail
set -e
timeout 40m gdb -q -ex 'set print inferior-events off' -ex 'set confirm off' -ex 'set print thread-events off' -ex run -ex bt -ex quit --args ./unit_tests_dbms --gtest_output='json:test_output/test_result.json' | tee test_output/test_result.txt if [ "$#" -ne 1 ]; then
echo "Expected exactly one argument"
exit 1
fi
if [ "$1" = "GDB" ];
then
timeout 40m \
gdb -q -ex "set print inferior-events off" -ex "set confirm off" -ex "set print thread-events off" -ex run -ex bt -ex quit --args \
./unit_tests_dbms --gtest_output='json:test_output/test_result.json' \
| tee test_output/test_result.txt
elif [ "$1" = "NO_GDB" ];
then
timeout 40m \
./unit_tests_dbms --gtest_output='json:test_output/test_result.json' \
| tee test_output/test_result.txt
else
echo "Unknown argument: $1"
exit 1
fi

View File

@ -8,7 +8,9 @@ using namespace DB;
using ResourceTest = ResourceTestClass; using ResourceTest = ResourceTestClass;
TEST(SchedulerFairPolicy, Factory) /// Tests disabled because of leaks in the tests themselves: https://github.com/ClickHouse/ClickHouse/issues/67678
TEST(DISABLED_SchedulerFairPolicy, Factory)
{ {
ResourceTest t; ResourceTest t;
@ -17,7 +19,7 @@ TEST(SchedulerFairPolicy, Factory)
EXPECT_TRUE(dynamic_cast<FairPolicy *>(fair.get()) != nullptr); EXPECT_TRUE(dynamic_cast<FairPolicy *>(fair.get()) != nullptr);
} }
TEST(SchedulerFairPolicy, FairnessWeights) TEST(DISABLED_SchedulerFairPolicy, FairnessWeights)
{ {
ResourceTest t; ResourceTest t;
@ -41,7 +43,7 @@ TEST(SchedulerFairPolicy, FairnessWeights)
t.consumed("B", 20); t.consumed("B", 20);
} }
TEST(SchedulerFairPolicy, Activation) TEST(DISABLED_SchedulerFairPolicy, Activation)
{ {
ResourceTest t; ResourceTest t;
@ -77,7 +79,7 @@ TEST(SchedulerFairPolicy, Activation)
t.consumed("B", 10); t.consumed("B", 10);
} }
TEST(SchedulerFairPolicy, FairnessMaxMin) TEST(DISABLED_SchedulerFairPolicy, FairnessMaxMin)
{ {
ResourceTest t; ResourceTest t;
@ -101,7 +103,7 @@ TEST(SchedulerFairPolicy, FairnessMaxMin)
t.consumed("A", 20); t.consumed("A", 20);
} }
TEST(SchedulerFairPolicy, HierarchicalFairness) TEST(DISABLED_SchedulerFairPolicy, HierarchicalFairness)
{ {
ResourceTest t; ResourceTest t;

View File

@ -8,7 +8,9 @@ using namespace DB;
using ResourceTest = ResourceTestClass; using ResourceTest = ResourceTestClass;
TEST(SchedulerPriorityPolicy, Factory) /// Tests disabled because of leaks in the tests themselves: https://github.com/ClickHouse/ClickHouse/issues/67678
TEST(DISABLED_SchedulerPriorityPolicy, Factory)
{ {
ResourceTest t; ResourceTest t;
@ -17,7 +19,7 @@ TEST(SchedulerPriorityPolicy, Factory)
EXPECT_TRUE(dynamic_cast<PriorityPolicy *>(prio.get()) != nullptr); EXPECT_TRUE(dynamic_cast<PriorityPolicy *>(prio.get()) != nullptr);
} }
TEST(SchedulerPriorityPolicy, Priorities) TEST(DISABLED_SchedulerPriorityPolicy, Priorities)
{ {
ResourceTest t; ResourceTest t;
@ -51,7 +53,7 @@ TEST(SchedulerPriorityPolicy, Priorities)
t.consumed("C", 0); t.consumed("C", 0);
} }
TEST(SchedulerPriorityPolicy, Activation) TEST(DISABLED_SchedulerPriorityPolicy, Activation)
{ {
ResourceTest t; ResourceTest t;
@ -92,7 +94,7 @@ TEST(SchedulerPriorityPolicy, Activation)
t.consumed("C", 0); t.consumed("C", 0);
} }
TEST(SchedulerPriorityPolicy, SinglePriority) TEST(DISABLED_SchedulerPriorityPolicy, SinglePriority)
{ {
ResourceTest t; ResourceTest t;

View File

@ -10,7 +10,9 @@ using namespace DB;
using ResourceTest = ResourceTestClass; using ResourceTest = ResourceTestClass;
TEST(SchedulerThrottlerConstraint, LeakyBucketConstraint) /// Tests disabled because of leaks in the tests themselves: https://github.com/ClickHouse/ClickHouse/issues/67678
TEST(DISABLED_SchedulerThrottlerConstraint, LeakyBucketConstraint)
{ {
ResourceTest t; ResourceTest t;
EventQueue::TimePoint start = std::chrono::system_clock::now(); EventQueue::TimePoint start = std::chrono::system_clock::now();
@ -40,7 +42,7 @@ TEST(SchedulerThrottlerConstraint, LeakyBucketConstraint)
t.consumed("A", 10); t.consumed("A", 10);
} }
TEST(SchedulerThrottlerConstraint, Unlimited) TEST(DISABLED_SchedulerThrottlerConstraint, Unlimited)
{ {
ResourceTest t; ResourceTest t;
EventQueue::TimePoint start = std::chrono::system_clock::now(); EventQueue::TimePoint start = std::chrono::system_clock::now();
@ -57,7 +59,7 @@ TEST(SchedulerThrottlerConstraint, Unlimited)
} }
} }
TEST(SchedulerThrottlerConstraint, Pacing) TEST(DISABLED_SchedulerThrottlerConstraint, Pacing)
{ {
ResourceTest t; ResourceTest t;
EventQueue::TimePoint start = std::chrono::system_clock::now(); EventQueue::TimePoint start = std::chrono::system_clock::now();
@ -77,7 +79,7 @@ TEST(SchedulerThrottlerConstraint, Pacing)
} }
} }
TEST(SchedulerThrottlerConstraint, BucketFilling) TEST(DISABLED_SchedulerThrottlerConstraint, BucketFilling)
{ {
ResourceTest t; ResourceTest t;
EventQueue::TimePoint start = std::chrono::system_clock::now(); EventQueue::TimePoint start = std::chrono::system_clock::now();
@ -111,7 +113,7 @@ TEST(SchedulerThrottlerConstraint, BucketFilling)
t.consumed("A", 3); t.consumed("A", 3);
} }
TEST(SchedulerThrottlerConstraint, PeekAndAvgLimits) TEST(DISABLED_SchedulerThrottlerConstraint, PeekAndAvgLimits)
{ {
ResourceTest t; ResourceTest t;
EventQueue::TimePoint start = std::chrono::system_clock::now(); EventQueue::TimePoint start = std::chrono::system_clock::now();
@ -139,7 +141,7 @@ TEST(SchedulerThrottlerConstraint, PeekAndAvgLimits)
} }
} }
TEST(SchedulerThrottlerConstraint, ThrottlerAndFairness) TEST(DISABLED_SchedulerThrottlerConstraint, ThrottlerAndFairness)
{ {
ResourceTest t; ResourceTest t;
EventQueue::TimePoint start = std::chrono::system_clock::now(); EventQueue::TimePoint start = std::chrono::system_clock::now();

View File

@ -14,20 +14,21 @@
/// because of broken getauxval() [1]. /// because of broken getauxval() [1].
/// ///
/// [1]: https://github.com/ClickHouse/ClickHouse/pull/33957 /// [1]: https://github.com/ClickHouse/ClickHouse/pull/33957
TEST(Common, LSan) TEST(SanitizerDeathTest, LSan)
{ {
int sanitizers_exit_code = 1; EXPECT_DEATH(
ASSERT_EXIT({
std::thread leak_in_thread([]()
{ {
void * leak = malloc(4096); std::thread leak_in_thread(
ASSERT_NE(leak, nullptr); []()
}); {
leak_in_thread.join(); void * leak = malloc(4096);
ASSERT_NE(leak, nullptr);
});
leak_in_thread.join();
__lsan_do_leak_check(); __lsan_do_leak_check();
}, ::testing::ExitedWithCode(sanitizers_exit_code), ".*LeakSanitizer: detected memory leaks.*"); },
".*LeakSanitizer: detected memory leaks.*");
} }
#endif #endif

View File

@ -174,10 +174,13 @@ def main():
test_output = temp_path / "test_output" test_output = temp_path / "test_output"
test_output.mkdir(parents=True, exist_ok=True) test_output.mkdir(parents=True, exist_ok=True)
# Don't run ASAN under gdb since that breaks leak detection
gdb_enabled = "NO_GDB" if "asan" in check_name else "GDB"
run_command = ( run_command = (
f"docker run --cap-add=SYS_PTRACE --volume={tests_binary}:/unit_tests_dbms " f"docker run --cap-add=SYS_PTRACE --volume={tests_binary}:/unit_tests_dbms "
"--security-opt seccomp=unconfined " # required to issue io_uring sys-calls "--security-opt seccomp=unconfined " # required to issue io_uring sys-calls
f"--volume={test_output}:/test_output {docker_image}" f"--volume={test_output}:/test_output {docker_image} {gdb_enabled}"
) )
run_log_path = test_output / "run.log" run_log_path = test_output / "run.log"
@ -194,6 +197,11 @@ def main():
subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {TEMP_PATH}", shell=True) subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {TEMP_PATH}", shell=True)
state, description, test_results = process_results(test_output) state, description, test_results = process_results(test_output)
if retcode != 0 and state == SUCCESS:
# The process might have failed without reporting it in the test_output (e.g. LeakSanitizer)
state = FAILURE
description = "Invalid return code. Check run.log"
additional_files = [run_log_path] + [ additional_files = [run_log_path] + [
p for p in test_output.iterdir() if not p.is_dir() p for p in test_output.iterdir() if not p.is_dir()
] ]

View File

@ -1,9 +1,6 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# shellcheck disable=SC2120 # shellcheck disable=SC2120
# Don't check for ODR violation, since we may test shared build with ASAN
export ASAN_OPTIONS=detect_odr_violation=0
# If ClickHouse was built with coverage - dump the coverage information at exit # If ClickHouse was built with coverage - dump the coverage information at exit
# (in other cases this environment variable has no effect) # (in other cases this environment variable has no effect)
export CLICKHOUSE_WRITE_COVERAGE="coverage" export CLICKHOUSE_WRITE_COVERAGE="coverage"