Merge pull request #19558 from ClickHouse/better_hung_check

Check for hung queries or server hung in fast test
This commit is contained in:
alesapin 2021-01-25 18:04:07 +03:00 committed by GitHub
commit b3726c1c8e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 23 additions and 13 deletions

View File

@ -337,7 +337,7 @@ function run_tests
01666_blns
)
time clickhouse-test -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt"
time clickhouse-test --hung-check -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt"
# substr is to remove semicolon after test name
readarray -t FAILED_TESTS < <(awk '/\[ FAIL|TIMEOUT|ERROR \]/ { print substr($3, 1, length($3)-1) }' "$FASTTEST_OUTPUT/test_log.txt" | tee "$FASTTEST_OUTPUT/failed-parallel-tests.txt")
@ -360,7 +360,7 @@ function run_tests
echo "Going to run again: ${FAILED_TESTS[*]}"
clickhouse-test --order=random --no-long --testname --shard --zookeeper "${FAILED_TESTS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee -a "$FASTTEST_OUTPUT/test_log.txt"
clickhouse-test --hung-check --order=random --no-long --testname --shard --zookeeper "${FAILED_TESTS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee -a "$FASTTEST_OUTPUT/test_log.txt"
else
echo "No failed tests"
fi

View File

@ -197,11 +197,14 @@ def need_retry(stderr):
return any(msg in stderr for msg in MESSAGES_TO_RETRY)
def get_processlist(client_cmd):
def get_processlist(args):
try:
return subprocess.check_output("{} --query 'SHOW PROCESSLIST FORMAT Vertical'".format(client_cmd), shell=True).decode('utf-8')
except:
return "" # server seems dead
clickhouse_proc = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE)
(stdout, _) = clickhouse_proc.communicate((b"SHOW PROCESSLIST FORMAT Vertical"), timeout=10)
return False, stdout.decode('utf-8')
except Exception as ex:
print("Exception", ex)
return True, ""
# collect server stacktraces using gdb
@ -334,9 +337,13 @@ def run_tests_array(all_tests_with_params):
if args.testname:
clickhouse_proc = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True)
clickhouse_proc.communicate(("SELECT 'Running test {suite}/{case} from pid={pid}';".format(pid = os.getpid(), case = case, suite = suite)), timeout=10)
failed_to_check = False
try:
clickhouse_proc.communicate(("SELECT 'Running test {suite}/{case} from pid={pid}';".format(pid = os.getpid(), case = case, suite = suite)), timeout=10)
except:
failed_to_check = True
if clickhouse_proc.returncode != 0:
if failed_to_check or clickhouse_proc.returncode != 0:
failures += 1
print("Server does not respond to health check")
SERVER_DIED = True
@ -760,14 +767,17 @@ def main(args):
# Some queries may execute in background for some time after test was finished. This is normal.
for n in range(1, 60):
processlist = get_processlist(args.client)
if not processlist:
timeout, processlist = get_processlist(args)
if timeout or not processlist:
break
sleep(1)
if processlist:
print(colored("\nFound hung queries in processlist:", args, "red", attrs=["bold"]))
print(processlist)
if timeout or processlist:
if processlist:
print(colored("\nFound hung queries in processlist:", args, "red", attrs=["bold"]))
print(processlist)
else:
print(colored("Seems like server hung and cannot respond to queries", "red", attrs=["bold"]))
clickhouse_tcp_port = os.getenv("CLICKHOUSE_PORT_TCP", '9000')
server_pid = get_server_pid(clickhouse_tcp_port)