clickhouse-test: send TERM to all childs (to avoid hung check triggering)

This is another try of not leaving child processes in clickhouse-test,
first one was in [1] by @akuzm:

  "I tried to do this earlier with a separate process group + atexit callbacks:
  573983d407 (diff-3a359de18cacf146f406a7ae332fb47196aa5e0aa430eb4b157a202a3cb8e6e3R578)

  But that commit was later reverted because it also tried to switch to
  multithreading instead of multiprocessing, and that didn't go good.
  SIG_IGN and SIG_DFL were broken then
  (https://bugs.python.org/issue23395), now they are fixed but not quite
  but maybe it's not relevant for us."

I looked (only briefly) through that bug report in python, but I don't
see any issues with killing child processes during testing this patch.

Plus to me it is better to get some unknown python error (and fix it
somehow) instead of leaving child processes.

v2: correctly catch INT/TERM/HUP too
This commit is contained in:
Azat Khuzhin 2021-04-29 10:43:56 +03:00
parent 711cc5f62b
commit 0b58a149d2

View File

@ -4,6 +4,7 @@ import shutil
import sys import sys
import os import os
import os.path import os.path
import signal
import re import re
import json import json
import copy import copy
@ -36,6 +37,17 @@ MESSAGES_TO_RETRY = [
"ConnectionPoolWithFailover: Connection failed at try", "ConnectionPoolWithFailover: Connection failed at try",
] ]
class Terminated(KeyboardInterrupt):
pass
def signal_handler(sig, frame):
raise Terminated(f'Terminated with {sig} signal')
def stop_tests():
# send signal to all processes in group to avoid hung check triggering
# (to avoid terminating clickhouse-test itself, the signal should be ignored)
signal.signal(signal.SIGTERM, signal.SIG_IGN)
os.killpg(os.getpgid(os.getpid()), signal.SIGTERM)
signal.signal(signal.SIGTERM, signal.SIG_DFL)
def json_minify(string): def json_minify(string):
""" """
@ -334,10 +346,12 @@ def run_tests_array(all_tests_with_params):
for case in all_tests: for case in all_tests:
if SERVER_DIED: if SERVER_DIED:
stop_tests()
break break
if stop_time and time() > stop_time: if stop_time and time() > stop_time:
print("\nStop tests run because global time limit is exceeded.\n") print("\nStop tests run because global time limit is exceeded.\n")
stop_tests()
break break
case_file = os.path.join(suite_dir, case) case_file = os.path.join(suite_dir, case)
@ -394,6 +408,7 @@ def run_tests_array(all_tests_with_params):
failures += 1 failures += 1
print("Server does not respond to health check") print("Server does not respond to health check")
SERVER_DIED = True SERVER_DIED = True
stop_tests()
break break
file_suffix = ('.' + str(os.getpid())) if is_concurrent and args.test_runs > 1 else '' file_suffix = ('.' + str(os.getpid())) if is_concurrent and args.test_runs > 1 else ''
@ -512,6 +527,7 @@ def run_tests_array(all_tests_with_params):
sys.stdout.flush() sys.stdout.flush()
except KeyboardInterrupt as e: except KeyboardInterrupt as e:
print(colored("Break tests execution", args, "red")) print(colored("Break tests execution", args, "red"))
stop_tests()
raise e raise e
except: except:
exc_type, exc_value, tb = sys.exc_info() exc_type, exc_value, tb = sys.exc_info()
@ -519,6 +535,7 @@ def run_tests_array(all_tests_with_params):
print("{0} - Test internal error: {1}\n{2}\n{3}".format(MSG_FAIL, exc_type.__name__, exc_value, "\n".join(traceback.format_tb(tb, 10)))) print("{0} - Test internal error: {1}\n{2}\n{3}".format(MSG_FAIL, exc_type.__name__, exc_value, "\n".join(traceback.format_tb(tb, 10))))
if failures_chain >= 20: if failures_chain >= 20:
stop_tests()
break break
failures_total = failures_total + failures failures_total = failures_total + failures
@ -959,6 +976,14 @@ def collect_sequential_list(skip_list_path):
if __name__ == '__main__': if __name__ == '__main__':
# Move to a new process group and kill it at exit so that we don't have any
# infinite tests processes left
# (new process group is required to avoid killing some parent processes)
os.setpgid(0, 0)
signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGHUP, signal_handler)
parser=ArgumentParser(description='ClickHouse functional tests') parser=ArgumentParser(description='ClickHouse functional tests')
parser.add_argument('-q', '--queries', help='Path to queries dir') parser.add_argument('-q', '--queries', help='Path to queries dir')
parser.add_argument('--tmp', help='Path to tmp dir') parser.add_argument('--tmp', help='Path to tmp dir')