From 9ad919d91a8da739b1aa8a3d708b608f34dbf583 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Mon, 11 Oct 2021 13:25:54 +0300
Subject: [PATCH] More timeouts in stress test

---
 docker/test/stress/stress | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/docker/test/stress/stress b/docker/test/stress/stress
index 8fc4ade2da6..1559b084565 100755
--- a/docker/test/stress/stress
+++ b/docker/test/stress/stress
@@ -71,42 +71,42 @@ def prepare_for_hung_check(drop_databases):
     # FIXME this function should not exist, but...
 
     # ThreadFuzzer significantly slows down the server and causes false-positive hung check failures
-    call("clickhouse client -q 'SYSTEM STOP THREAD FUZZER'", shell=True, stderr=STDOUT)
+    call("clickhouse client -q 'SYSTEM STOP THREAD FUZZER'", shell=True, stderr=STDOUT, timeout=30)
 
     # We attach gdb to clickhouse-server before running tests
     # to print stacktraces of all crashes even if clickhouse cannot print them for some reason.
     # However, it obstructs checking for hung queries.
     logging.info("Will terminate gdb (if any)")
-    call("kill -TERM $(pidof gdb)", shell=True, stderr=STDOUT)
+    call("kill -TERM $(pidof gdb)", shell=True, stderr=STDOUT, timeout=30)
 
     # Some tests set too low a memory limit for the default user and forget to reset it back.
     # It may cause SYSTEM queries to fail, so let's disable the memory limit.
-    call("clickhouse client --max_memory_usage_for_user=0 -q 'SELECT 1 FORMAT Null'", shell=True, stderr=STDOUT)
+    call("clickhouse client --max_memory_usage_for_user=0 -q 'SELECT 1 FORMAT Null'", shell=True, stderr=STDOUT, timeout=30)
 
     # Some tests execute SYSTEM STOP MERGES or similar queries.
     # It may cause some ALTERs to hang.
     # Possibly we should fix the tests and forbid using such queries without specifying a table.
- call("clickhouse client -q 'SYSTEM START MERGES'", shell=True, stderr=STDOUT) - call("clickhouse client -q 'SYSTEM START DISTRIBUTED SENDS'", shell=True, stderr=STDOUT) - call("clickhouse client -q 'SYSTEM START TTL MERGES'", shell=True, stderr=STDOUT) - call("clickhouse client -q 'SYSTEM START MOVES'", shell=True, stderr=STDOUT) - call("clickhouse client -q 'SYSTEM START FETCHES'", shell=True, stderr=STDOUT) - call("clickhouse client -q 'SYSTEM START REPLICATED SENDS'", shell=True, stderr=STDOUT) - call("clickhouse client -q 'SYSTEM START REPLICATION QUEUES'", shell=True, stderr=STDOUT) + call("clickhouse client -q 'SYSTEM START MERGES'", shell=True, stderr=STDOUT, timeout=30) + call("clickhouse client -q 'SYSTEM START DISTRIBUTED SENDS'", shell=True, stderr=STDOUT, timeout=30) + call("clickhouse client -q 'SYSTEM START TTL MERGES'", shell=True, stderr=STDOUT, timeout=30) + call("clickhouse client -q 'SYSTEM START MOVES'", shell=True, stderr=STDOUT, timeout=30) + call("clickhouse client -q 'SYSTEM START FETCHES'", shell=True, stderr=STDOUT, timeout=30) + call("clickhouse client -q 'SYSTEM START REPLICATED SENDS'", shell=True, stderr=STDOUT, timeout=30) + call("clickhouse client -q 'SYSTEM START REPLICATION QUEUES'", shell=True, stderr=STDOUT, timeout=30) # Issue #21004, live views are experimental, so let's just suppress it - call("""clickhouse client -q "KILL QUERY WHERE upper(query) LIKE 'WATCH %'" """, shell=True, stderr=STDOUT) + call("""clickhouse client -q "KILL QUERY WHERE upper(query) LIKE 'WATCH %'" """, shell=True, stderr=STDOUT, timeout=30) # Kill other queries which known to be slow # It's query from 01232_preparing_sets_race_condition_long, it may take up to 1000 seconds in slow builds - call("""clickhouse client -q "KILL QUERY WHERE query LIKE 'insert into tableB select %'" """, shell=True, stderr=STDOUT) + call("""clickhouse client -q "KILL QUERY WHERE query LIKE 'insert into tableB select %'" """, shell=True, stderr=STDOUT, timeout=30) # Long query from 00084_external_agregation - call("""clickhouse client -q "KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM test.hits GROUP BY URL ORDER BY u %'" """, shell=True, stderr=STDOUT) + call("""clickhouse client -q "KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM test.hits GROUP BY URL ORDER BY u %'" """, shell=True, stderr=STDOUT, timeout=30) if drop_databases: # Here we try to drop all databases in async mode. If some queries really hung, than drop will hung too. # Otherwise we will get rid of queries which wait for background pool. It can take a long time on slow builds (more than 900 seconds). - databases = check_output('clickhouse client -q "SHOW DATABASES"', shell=True).decode('utf-8').strip().split() + databases = check_output('clickhouse client -q "SHOW DATABASES"', shell=True, timeout=30).decode('utf-8').strip().split() for db in databases: if db == "system": continue @@ -117,13 +117,13 @@ def prepare_for_hung_check(drop_databases): # Wait for last queries to finish if any, not longer than 300 seconds call("""clickhouse client -q "select sleepEachRow(( select maxOrDefault(300 - elapsed) + 1 from system.processes where query not like '%from system.processes%' and elapsed < 300 - ) / 300) from numbers(300) format Null" """, shell=True, stderr=STDOUT) + ) / 300) from numbers(300) format Null" """, shell=True, stderr=STDOUT, timeout=30) # Even if all clickhouse-test processes are finished, there are probably some sh scripts, # which still run some new queries. Let's ignore them. 
try: query = """clickhouse client -q "SELECT count() FROM system.processes where where elapsed > 300" """ - output = check_output(query, shell=True, stderr=STDOUT).decode('utf-8').strip() + output = check_output(query, shell=True, stderr=STDOUT, timeout=30).decode('utf-8').strip() if int(output) == 0: return False except: @@ -176,6 +176,7 @@ if __name__ == "__main__": if res != 0 and have_long_running_queries: logging.info("Hung check failed with exit code {}".format(res)) hung_check_status = "Hung check failed\tFAIL\n" - open(os.path.join(args.output_folder, "test_results.tsv"), 'w+').write(hung_check_status) + with open(os.path.join(args.output_folder, "test_results.tsv"), 'w+') as results: + results.write(hung_check_status) logging.info("Stress test finished")