From 1fdee81ad503a666eea04eb438e369a4c429c109 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 23 Dec 2023 15:42:49 +0100 Subject: [PATCH 1/2] Fix leftover processes/hangs in tests One of such cases is 02479_race_condition_between_insert_and_droppin_mv [1], yes it can be fixed (by using fixed number of iterations, or with some bash trickery), but it is better to fix them completelly, eventually such tests will be submitted and pass review anyway. By allocating process group for each test we can kill all the processes in this process group, and this what this patch does. This will also fix some test hangs (like in [1]) as well as some possible issues in stress tests. [1]: https://s3.amazonaws.com/clickhouse-test-reports/0/e2c1230b00386c4d0096a245396ab3be7ce60950/stateless_tests__release__analyzer_/run.log Signed-off-by: Azat Khuzhin (cherry picked from commit 72fa58e19217cb38abe799db793c21b113fd0b77) --- tests/clickhouse-test | 4 ++-- tests/queries/shell_config.sh | 8 ++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 9f66f49837f..7bf1fcee390 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1031,7 +1031,7 @@ class TestCase: if proc: if proc.returncode is None: try: - proc.kill() + os.killpg(os.getpgid(proc.pid), signal.SIGKILL) except OSError as e: if e.errno != ESRCH: raise @@ -1307,7 +1307,7 @@ class TestCase: command = pattern.format(**params) - proc = Popen(command, shell=True, env=os.environ) + proc = Popen(command, shell=True, env=os.environ, start_new_session=True) while ( datetime.now() - start_time diff --git a/tests/queries/shell_config.sh b/tests/queries/shell_config.sh index c687a63623f..63f6ba9a914 100644 --- a/tests/queries/shell_config.sh +++ b/tests/queries/shell_config.sh @@ -185,3 +185,11 @@ function query_with_retry done echo "Query '$query' failed with '$result'" } + +# Add --foreground to avoid running setpgid() in timeout, otherwise +# clickhouse-test will not kill those processes in case of timeout +function timeout() +{ + command timeout --foreground "$@" +} +export -f timeout From c5dbde8407d8e2fb2b1a9145ae3e0aec8a293fd5 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 24 Dec 2023 13:14:13 +0100 Subject: [PATCH 2/2] Replace timeout --foreground with one workaround The problem with --foreground option is that it send the signal only to the process that had been spawned by timeout(1), while it can create lots of children, and when you killing parent you are closing pipes and childrens will get EPIPE, like in [1]. [1]: https://s3.amazonaws.com/clickhouse-test-reports/0/069f8bbb2f48541cc736903e1da5459fa2c27da0/stateless_tests__debug__%5B2_5%5D.html Another problem is that now child process will finish correctly, which may also print some errors like QUERY_WAS_CANCELLED (see [2]). [2]: https://s3.amazonaws.com/clickhouse-test-reports/0/ef66714bf20042ba9cb5d59b7839befe26110b93/stateless_tests__release__analyzer_.html In general this is not required actually, since all timeout invocations uses timeout value less then the default test limit (10min). But it may leave some processes in case of overriding this limit, i.e. `clickhouse-test --timeout 1` So to workaround this at least somehow, let's send SIGTERM and only after some timeout (here I use 0.1), SIGKILL. This will give at least some ability to terminate all childrens that had been spawned by timeout(1). Signed-off-by: Azat Khuzhin --- tests/clickhouse-test | 19 ++++++++++++++++++- tests/queries/shell_config.sh | 8 -------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 7bf1fcee390..1175d8342b1 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1031,7 +1031,24 @@ class TestCase: if proc: if proc.returncode is None: try: - os.killpg(os.getpgid(proc.pid), signal.SIGKILL) + pgid = os.getpgid(proc.pid) + # NOTE: this still may leave some processes, that had been + # created by timeout(1), since it also creates new process + # group. But this should not be a problem with default + # options, since the default time for each test is 10min, + # and this is way more bigger then the timeout for each + # timeout(1) invocation. + # + # But as a workaround we are sending SIGTERM first, and + # only after SIGKILL, that way timeout(1) will have an + # ability to terminate childrens (though not always since + # signals are asynchronous). + os.killpg(pgid, signal.SIGTERM) + # This may not be enough, but this is at least something + # (and anyway it is OK to spend 0.1 second more in case of + # test timeout). + sleep(0.1) + os.killpg(pgid, signal.SIGKILL) except OSError as e: if e.errno != ESRCH: raise diff --git a/tests/queries/shell_config.sh b/tests/queries/shell_config.sh index 63f6ba9a914..c687a63623f 100644 --- a/tests/queries/shell_config.sh +++ b/tests/queries/shell_config.sh @@ -185,11 +185,3 @@ function query_with_retry done echo "Query '$query' failed with '$result'" } - -# Add --foreground to avoid running setpgid() in timeout, otherwise -# clickhouse-test will not kill those processes in case of timeout -function timeout() -{ - command timeout --foreground "$@" -} -export -f timeout