mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 17:12:03 +00:00
Merge pull request #19558 from ClickHouse/better_hung_check
Check for hung queries or a hung server in fast test
This commit is contained in:
commit
b3726c1c8e
@ -337,7 +337,7 @@ function run_tests
|
|||||||
01666_blns
|
01666_blns
|
||||||
)
|
)
|
||||||
|
|
||||||
time clickhouse-test -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt"
|
time clickhouse-test --hung-check -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt"
|
||||||
|
|
||||||
# substr is to remove the colon after the test name
|
# substr is to remove the colon after the test name
|
||||||
readarray -t FAILED_TESTS < <(awk '/\[ FAIL|TIMEOUT|ERROR \]/ { print substr($3, 1, length($3)-1) }' "$FASTTEST_OUTPUT/test_log.txt" | tee "$FASTTEST_OUTPUT/failed-parallel-tests.txt")
|
readarray -t FAILED_TESTS < <(awk '/\[ FAIL|TIMEOUT|ERROR \]/ { print substr($3, 1, length($3)-1) }' "$FASTTEST_OUTPUT/test_log.txt" | tee "$FASTTEST_OUTPUT/failed-parallel-tests.txt")
|
||||||
@ -360,7 +360,7 @@ function run_tests
|
|||||||
|
|
||||||
echo "Going to run again: ${FAILED_TESTS[*]}"
|
echo "Going to run again: ${FAILED_TESTS[*]}"
|
||||||
|
|
||||||
clickhouse-test --order=random --no-long --testname --shard --zookeeper "${FAILED_TESTS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee -a "$FASTTEST_OUTPUT/test_log.txt"
|
clickhouse-test --hung-check --order=random --no-long --testname --shard --zookeeper "${FAILED_TESTS[@]}" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee -a "$FASTTEST_OUTPUT/test_log.txt"
|
||||||
else
|
else
|
||||||
echo "No failed tests"
|
echo "No failed tests"
|
||||||
fi
|
fi
|
||||||
|
@ -197,11 +197,14 @@ def need_retry(stderr):
|
|||||||
return any(msg in stderr for msg in MESSAGES_TO_RETRY)
|
return any(msg in stderr for msg in MESSAGES_TO_RETRY)
|
||||||
|
|
||||||
|
|
||||||
def get_processlist(client_cmd):
|
def get_processlist(args):
|
||||||
try:
|
try:
|
||||||
return subprocess.check_output("{} --query 'SHOW PROCESSLIST FORMAT Vertical'".format(client_cmd), shell=True).decode('utf-8')
|
clickhouse_proc = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE)
|
||||||
except:
|
(stdout, _) = clickhouse_proc.communicate((b"SHOW PROCESSLIST FORMAT Vertical"), timeout=10)
|
||||||
return "" # server seems dead
|
return False, stdout.decode('utf-8')
|
||||||
|
except Exception as ex:
|
||||||
|
print("Exception", ex)
|
||||||
|
return True, ""
|
||||||
|
|
||||||
|
|
||||||
# collect server stacktraces using gdb
|
# collect server stacktraces using gdb
|
||||||
@ -334,9 +337,13 @@ def run_tests_array(all_tests_with_params):
|
|||||||
|
|
||||||
if args.testname:
|
if args.testname:
|
||||||
clickhouse_proc = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True)
|
clickhouse_proc = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True)
|
||||||
clickhouse_proc.communicate(("SELECT 'Running test {suite}/{case} from pid={pid}';".format(pid = os.getpid(), case = case, suite = suite)), timeout=10)
|
failed_to_check = False
|
||||||
|
try:
|
||||||
|
clickhouse_proc.communicate(("SELECT 'Running test {suite}/{case} from pid={pid}';".format(pid = os.getpid(), case = case, suite = suite)), timeout=10)
|
||||||
|
except:
|
||||||
|
failed_to_check = True
|
||||||
|
|
||||||
if clickhouse_proc.returncode != 0:
|
if failed_to_check or clickhouse_proc.returncode != 0:
|
||||||
failures += 1
|
failures += 1
|
||||||
print("Server does not respond to health check")
|
print("Server does not respond to health check")
|
||||||
SERVER_DIED = True
|
SERVER_DIED = True
|
||||||
@ -760,14 +767,17 @@ def main(args):
|
|||||||
|
|
||||||
# Some queries may execute in background for some time after test was finished. This is normal.
|
# Some queries may execute in background for some time after test was finished. This is normal.
|
||||||
for n in range(1, 60):
|
for n in range(1, 60):
|
||||||
processlist = get_processlist(args.client)
|
timeout, processlist = get_processlist(args)
|
||||||
if not processlist:
|
if timeout or not processlist:
|
||||||
break
|
break
|
||||||
sleep(1)
|
sleep(1)
|
||||||
|
|
||||||
if processlist:
|
if timeout or processlist:
|
||||||
print(colored("\nFound hung queries in processlist:", args, "red", attrs=["bold"]))
|
if processlist:
|
||||||
print(processlist)
|
print(colored("\nFound hung queries in processlist:", args, "red", attrs=["bold"]))
|
||||||
|
print(processlist)
|
||||||
|
else:
|
||||||
|
print(colored("Seems like server hung and cannot respond to queries", "red", attrs=["bold"]))
|
||||||
|
|
||||||
clickhouse_tcp_port = os.getenv("CLICKHOUSE_PORT_TCP", '9000')
|
clickhouse_tcp_port = os.getenv("CLICKHOUSE_PORT_TCP", '9000')
|
||||||
server_pid = get_server_pid(clickhouse_tcp_port)
|
server_pid = get_server_pid(clickhouse_tcp_port)
|
||||||
|
Loading…
Reference in New Issue
Block a user