Merge pull request #40775 from azat/ci/core-dumps-rework

Rework core collecting on CI (eliminate gcore usage)
This commit is contained in:
Alexander Tokmakov 2022-09-09 20:20:10 +03:00 committed by GitHub
commit e77b9e4d0c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 44 additions and 7 deletions

View File

@ -1,8 +1,15 @@
#!/bin/bash
# shellcheck disable=SC2086,SC2001,SC2046,SC2030,SC2031
set -eux
set -x
# core.COMM.PID-TID
sysctl kernel.core_pattern='core.%e.%p-%P'
set -e
set -u
set -o pipefail
trap "exit" INT TERM
# The watchdog is in the separate process group, so we have to kill it separately
# if the script terminates earlier.
@ -87,6 +94,19 @@ function configure
# TODO figure out which ones are needed
cp -av --dereference "$repo_dir"/tests/config/config.d/listen.xml db/config.d
cp -av --dereference "$script_dir"/query-fuzzer-tweaks-users.xml db/users.d
cat > db/config.d/core.xml <<EOL
<clickhouse>
<core_dump>
<!-- 100GiB -->
<size_limit>107374182400</size_limit>
</core_dump>
<!-- NOTE: no need to configure core_path,
since clickhouse is not started as daemon (via clickhouse start)
-->
<core_path>$PWD</core_path>
</clickhouse>
EOL
}
function watchdog
@ -180,7 +200,6 @@ handle SIGUSR2 nostop noprint pass
handle SIG$RTMIN nostop noprint pass
info signals
continue
gcore
backtrace full
thread apply all backtrace full
info registers

View File

@ -8,6 +8,9 @@ dmesg --clear
set -x
# core.COMM.PID-TID
sysctl kernel.core_pattern='core.%e.%p-%P'
# Thread Fuzzer allows to check more permutations of possible thread scheduling
# and find more potential issues.
@ -104,6 +107,19 @@ EOL
</default>
</profiles>
</clickhouse>
EOL
cat > /etc/clickhouse-server/config.d/core.xml <<EOL
<clickhouse>
<core_dump>
<!-- 100GiB -->
<size_limit>107374182400</size_limit>
</core_dump>
<!-- NOTE: no need to configure core_path,
since clickhouse is not started as daemon (via clickhouse start)
-->
<core_path>$PWD</core_path>
</clickhouse>
EOL
}
@ -160,7 +176,6 @@ handle SIGUSR2 nostop noprint pass
handle SIG$RTMIN nostop noprint pass
info signals
continue
gcore
backtrace full
thread apply all backtrace full
info registers
@ -504,8 +519,7 @@ done
clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' order by (lower(test) like '%hung%'), rowNumberInAllBlocks() LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv
[ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv
# Core dumps (see gcore)
# Default filename is 'core.PROCESS_ID'
# Core dumps
for core in core.*; do
pigz $core
mv $core.gz /test_output/

View File

@ -29,7 +29,11 @@ IMAGE_NAME = "clickhouse/fuzzer"
def get_run_command(pr_number, sha, download_url, workspace_path, image):
return (
f"docker run --network=host --volume={workspace_path}:/workspace "
f"docker run "
# For sysctl
"--privileged "
"--network=host "
f"--volume={workspace_path}:/workspace "
"--cap-add syslog --cap-add sys_admin --cap-add=SYS_PTRACE "
f'-e PR_TO_TEST={pr_number} -e SHA_TO_TEST={sha} -e BINARY_URL_TO_DOWNLOAD="{download_url}" '
f"{image}"

View File

@ -33,7 +33,7 @@ def get_run_command(
"docker run --cap-add=SYS_PTRACE "
# a static link, don't use S3_URL or S3_DOWNLOAD
"-e S3_URL='https://s3.amazonaws.com/clickhouse-datasets' "
# For dmesg
# For dmesg and sysctl
"--privileged "
f"--volume={build_path}:/package_folder "
f"--volume={result_folder}:/test_output "