better context for stress tests failures

This commit is contained in:
Alexander Tokmakov 2023-02-02 01:02:12 +01:00
parent 1682078f1a
commit 03109f942d
4 changed files with 42 additions and 16 deletions

View File

@ -13,15 +13,28 @@ sysctl kernel.core_pattern='core.%e.%p-%P'
OK="\tOK\t\\N\t" OK="\tOK\t\\N\t"
FAIL="\tFAIL\t\\N\t" FAIL="\tFAIL\t\\N\t"
FAILURE_CONTEXT_LINES=50
FAILURE_CONTEXT_MAX_LINE_WIDTH=400
function escaped() function escaped()
{ {
# That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language. # That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language.
clickhouse local -S 's String' --input-format=LineAsString -q "select * from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'" # Also limit lines width just in case (too long lines are not really useful usually)
clickhouse local -S 's String' --input-format=LineAsString -q "select substr(s, 1, $FAILURE_CONTEXT_MAX_LINE_WIDTH)
from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'"
} }
function head_escaped() function head_escaped()
{ {
head -50 $1 | escaped head -n $FAILURE_CONTEXT_LINES $1 | escaped
}
function unts()
{
grep -Po "[0-9][0-9]:[0-9][0-9] \K.*"
}
function trim_server_logs()
{
head -n $FAILURE_CONTEXT_LINES "/test_output/$1" | grep -Eo " \[ [0-9]+ \] \{.*" | escaped
} }
function install_packages() function install_packages()
@ -167,7 +180,7 @@ function start()
then then
echo "Cannot start clickhouse-server" echo "Cannot start clickhouse-server"
rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||: rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||:
echo -e "Cannot start clickhouse-server$FAIL$(head_escaped /test_output/application_errors.txt)" >> /test_output/test_results.tsv echo -e "Cannot start clickhouse-server$FAIL$(trim_server_logs application_errors.txt)" >> /test_output/test_results.tsv
cat /var/log/clickhouse-server/stdout.log cat /var/log/clickhouse-server/stdout.log
tail -n100 /var/log/clickhouse-server/stderr.log tail -n100 /var/log/clickhouse-server/stderr.log
tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | rg -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' | tail -n100 tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | rg -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' | tail -n100
@ -389,7 +402,7 @@ start
# NOTE Hung check is implemented in docker/tests/stress/stress # NOTE Hung check is implemented in docker/tests/stress/stress
rg -Fa "No queries hung" /test_output/test_results.tsv | grep -Fa "OK" \ rg -Fa "No queries hung" /test_output/test_results.tsv | grep -Fa "OK" \
|| echo -e "Hung check failed, possible deadlock found (see hung_check.log)$FAIL$(head_escaped /test_output/hung_check.log)" || echo -e "Hung check failed, possible deadlock found (see hung_check.log)$FAIL$(head_escaped /test_output/hung_check.log | unts)"
stop stop
mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.stress.log mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.stress.log
@ -402,7 +415,7 @@ start
clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \ clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
|| (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \ || (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \
&& echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(head_escaped /test_output/application_errors.txt)" \ && echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \
>> /test_output/test_results.tsv) >> /test_output/test_results.tsv)
stop stop
@ -435,7 +448,7 @@ rg -Fa "Code: 49. DB::Exception: " /var/log/clickhouse-server/clickhouse-server*
# No such key errors # No such key errors
rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log > /test_output/no_such_key_errors.txt \ rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log > /test_output/no_such_key_errors.txt \
&& echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(head_escaped /test_output/no_such_key_errors.txt)" >> /test_output/test_results.tsv \ && echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(trim_server_logs no_such_key_errors.txt)" >> /test_output/test_results.tsv \
|| echo -e "No lost s3 keys$OK" >> /test_output/test_results.tsv || echo -e "No lost s3 keys$OK" >> /test_output/test_results.tsv
# Remove file no_such_key_errors.txt if it's empty # Remove file no_such_key_errors.txt if it's empty
@ -448,7 +461,7 @@ rg -Fa "########################################" /var/log/clickhouse-server/cli
# It also checks for crash without stacktrace (printed by watchdog) # It also checks for crash without stacktrace (printed by watchdog)
rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/fatal_messages.txt \ rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/fatal_messages.txt \
&& echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(head_escaped /test_output/fatal_messages.txt)" >> /test_output/test_results.tsv \ && echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(trim_server_logs fatal_messages.txt)" >> /test_output/test_results.tsv \
|| echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv || echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
# Remove file fatal_messages.txt if it's empty # Remove file fatal_messages.txt if it's empty
@ -457,8 +470,13 @@ rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server*.log > /test_out
rg -Fa "########################################" /test_output/* > /dev/null \ rg -Fa "########################################" /test_output/* > /dev/null \
&& echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv && echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv
function get_gdb_log_context()
{
rg -A50 -Fa " received signal " /test_output/gdb.log | head_escaped
}
rg -Fa " received signal " /test_output/gdb.log > /dev/null \ rg -Fa " received signal " /test_output/gdb.log > /dev/null \
&& echo -e "Found signal in gdb.log$FAIL$(rg -A50 -Fa " received signal " /test_output/gdb.log | escaped)" >> /test_output/test_results.tsv && echo -e "Found signal in gdb.log$FAIL$(get_gdb_log_context)" >> /test_output/test_results.tsv
if [ "$DISABLE_BC_CHECK" -ne "1" ]; then if [ "$DISABLE_BC_CHECK" -ne "1" ]; then
echo -e "Backward compatibility check\n" echo -e "Backward compatibility check\n"
@ -579,7 +597,7 @@ if [ "$DISABLE_BC_CHECK" -ne "1" ]; then
start 500 start 500
clickhouse-client --query "SELECT 'Backward compatibility check: Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \ clickhouse-client --query "SELECT 'Backward compatibility check: Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
|| (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log >> /test_output/bc_check_application_errors.txt \ || (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log >> /test_output/bc_check_application_errors.txt \
&& echo -e "Backward compatibility check: Server failed to start$FAIL$(head_escaped /test_output/bc_check_application_errors.txt)" >> /test_output/test_results.tsv) && echo -e "Backward compatibility check: Server failed to start$FAIL$(trim_server_logs bc_check_application_errors.txt)" >> /test_output/test_results.tsv)
clickhouse-client --query="SELECT 'Server version: ', version()" clickhouse-client --query="SELECT 'Server version: ', version()"
@ -634,7 +652,7 @@ if [ "$DISABLE_BC_CHECK" -ne "1" ]; then
-e "Session expired" \ -e "Session expired" \
-e "TOO_MANY_PARTS" \ -e "TOO_MANY_PARTS" \
/var/log/clickhouse-server/clickhouse-server.backward.dirty.log | rg -Fa "<Error>" > /test_output/bc_check_error_messages.txt \ /var/log/clickhouse-server/clickhouse-server.backward.dirty.log | rg -Fa "<Error>" > /test_output/bc_check_error_messages.txt \
&& echo -e "Backward compatibility check: Error message in clickhouse-server.log (see bc_check_error_messages.txt)$FAIL$(head_escaped /test_output/bc_check_error_messages.txt)" \ && echo -e "Backward compatibility check: Error message in clickhouse-server.log (see bc_check_error_messages.txt)$FAIL$(trim_server_logs bc_check_error_messages.txt)" \
>> /test_output/test_results.tsv \ >> /test_output/test_results.tsv \
|| echo -e "Backward compatibility check: No Error messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv || echo -e "Backward compatibility check: No Error messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
@ -657,7 +675,7 @@ if [ "$DISABLE_BC_CHECK" -ne "1" ]; then
# Logical errors # Logical errors
echo "Check for Logical errors in server log:" echo "Check for Logical errors in server log:"
rg -Fa -A20 "Code: 49. DB::Exception:" /var/log/clickhouse-server/clickhouse-server.backward.*.log > /test_output/bc_check_logical_errors.txt \ rg -Fa -A20 "Code: 49. DB::Exception:" /var/log/clickhouse-server/clickhouse-server.backward.*.log > /test_output/bc_check_logical_errors.txt \
&& echo -e "Backward compatibility check: Logical error thrown (see clickhouse-server.log or bc_check_logical_errors.txt)$FAIL$(head_escaped /test_output/bc_check_logical_errors.txt)" \ && echo -e "Backward compatibility check: Logical error thrown (see clickhouse-server.log or bc_check_logical_errors.txt)$FAIL$(trim_server_logs bc_check_logical_errors.txt)" \
>> /test_output/test_results.tsv \ >> /test_output/test_results.tsv \
|| echo -e "Backward compatibility check: No logical errors$OK" >> /test_output/test_results.tsv || echo -e "Backward compatibility check: No logical errors$OK" >> /test_output/test_results.tsv
@ -672,7 +690,7 @@ if [ "$DISABLE_BC_CHECK" -ne "1" ]; then
# It also checks for crash without stacktrace (printed by watchdog) # It also checks for crash without stacktrace (printed by watchdog)
echo "Check for Fatal message in server log:" echo "Check for Fatal message in server log:"
rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server.backward.*.log > /test_output/bc_check_fatal_messages.txt \ rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server.backward.*.log > /test_output/bc_check_fatal_messages.txt \
&& echo -e "Backward compatibility check: Fatal message in clickhouse-server.log (see bc_check_fatal_messages.txt)$FAIL$(head_escaped /test_output/bc_check_fatal_messages.txt)" \ && echo -e "Backward compatibility check: Fatal message in clickhouse-server.log (see bc_check_fatal_messages.txt)$FAIL$(trim_server_logs bc_check_fatal_messages.txt)" \
>> /test_output/test_results.tsv \ >> /test_output/test_results.tsv \
|| echo -e "Backward compatibility check: No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv || echo -e "Backward compatibility check: No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv

View File

@ -85,8 +85,11 @@ def process_test_log(log_path):
if DATABASE_SIGN in line: if DATABASE_SIGN in line:
test_end = True test_end = True
# Python does not support TSV, so we have to escape '\t' and '\n' manually
# and hope that complex escape sequences will not break anything
test_results = [ test_results = [
(test[0], test[1], test[2], "".join(test[3])[:4096]) for test in test_results (test[0], test[1], test[2],
"".join(test[3])[:4096].replace('\t', '\\t').replace('\n', '\\n')) for test in test_results
] ]
return ( return (

View File

@ -183,7 +183,11 @@ def prepare_tests_results_for_clickhouse(
current_row["test_duration_ms"] = int(test_time * 1000) current_row["test_duration_ms"] = int(test_time * 1000)
current_row["test_name"] = test_name current_row["test_name"] = test_name
current_row["test_status"] = test_status current_row["test_status"] = test_status
current_row["test_context_raw"] = test_result.raw_logs or "" if test_result.raw_logs:
# Protect from too big blobs that contain garbage
current_row["test_context_raw"] = test_result.raw_logs[:32 * 1024]
else:
current_row["test_context_raw"] = ""
result.append(current_row) result.append(current_row)
return result return result

View File

@ -224,7 +224,8 @@ def read_test_results(results_path: Path, with_raw_logs: bool = True) -> TestRes
# The value can be emtpy, but when it's not, # The value can be emtpy, but when it's not,
# the 4th value is a pythonic list, e.g. ['file1', 'file2'] # the 4th value is a pythonic list, e.g. ['file1', 'file2']
if with_raw_logs: if with_raw_logs:
result.set_raw_logs(line[3]) # Python does not support TSV, so we unescape manually
result.set_raw_logs(line[3]).replace('\\t', '\t').replace('\\n', '\n')
else: else:
result.set_log_files(line[3]) result.set_log_files(line[3])