From 9ad919d91a8da739b1aa8a3d708b608f34dbf583 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 11 Oct 2021 13:25:54 +0300 Subject: [PATCH 1/4] More timeouts in stress test --- docker/test/stress/stress | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/docker/test/stress/stress b/docker/test/stress/stress index 8fc4ade2da6..1559b084565 100755 --- a/docker/test/stress/stress +++ b/docker/test/stress/stress @@ -71,42 +71,42 @@ def prepare_for_hung_check(drop_databases): # FIXME this function should not exist, but... # ThreadFuzzer significantly slows down server and causes false-positive hung check failures - call("clickhouse client -q 'SYSTEM STOP THREAD FUZZER'", shell=True, stderr=STDOUT) + call("clickhouse client -q 'SYSTEM STOP THREAD FUZZER'", shell=True, stderr=STDOUT, timeout=30) # We attach gdb to clickhouse-server before running tests # to print stacktraces of all crashes even if clickhouse cannot print it for some reason. # However, it obstruct checking for hung queries. logging.info("Will terminate gdb (if any)") - call("kill -TERM $(pidof gdb)", shell=True, stderr=STDOUT) + call("kill -TERM $(pidof gdb)", shell=True, stderr=STDOUT, timeout=30) # Some tests set too low memory limit for default user and forget to reset in back. # It may cause SYSTEM queries to fail, let's disable memory limit. - call("clickhouse client --max_memory_usage_for_user=0 -q 'SELECT 1 FORMAT Null'", shell=True, stderr=STDOUT) + call("clickhouse client --max_memory_usage_for_user=0 -q 'SELECT 1 FORMAT Null'", shell=True, stderr=STDOUT, timeout=30) # Some tests execute SYSTEM STOP MERGES or similar queries. # It may cause some ALTERs to hang. # Possibly we should fix tests and forbid to use such queries without specifying table. - call("clickhouse client -q 'SYSTEM START MERGES'", shell=True, stderr=STDOUT) - call("clickhouse client -q 'SYSTEM START DISTRIBUTED SENDS'", shell=True, stderr=STDOUT) - call("clickhouse client -q 'SYSTEM START TTL MERGES'", shell=True, stderr=STDOUT) - call("clickhouse client -q 'SYSTEM START MOVES'", shell=True, stderr=STDOUT) - call("clickhouse client -q 'SYSTEM START FETCHES'", shell=True, stderr=STDOUT) - call("clickhouse client -q 'SYSTEM START REPLICATED SENDS'", shell=True, stderr=STDOUT) - call("clickhouse client -q 'SYSTEM START REPLICATION QUEUES'", shell=True, stderr=STDOUT) + call("clickhouse client -q 'SYSTEM START MERGES'", shell=True, stderr=STDOUT, timeout=30) + call("clickhouse client -q 'SYSTEM START DISTRIBUTED SENDS'", shell=True, stderr=STDOUT, timeout=30) + call("clickhouse client -q 'SYSTEM START TTL MERGES'", shell=True, stderr=STDOUT, timeout=30) + call("clickhouse client -q 'SYSTEM START MOVES'", shell=True, stderr=STDOUT, timeout=30) + call("clickhouse client -q 'SYSTEM START FETCHES'", shell=True, stderr=STDOUT, timeout=30) + call("clickhouse client -q 'SYSTEM START REPLICATED SENDS'", shell=True, stderr=STDOUT, timeout=30) + call("clickhouse client -q 'SYSTEM START REPLICATION QUEUES'", shell=True, stderr=STDOUT, timeout=30) # Issue #21004, live views are experimental, so let's just suppress it - call("""clickhouse client -q "KILL QUERY WHERE upper(query) LIKE 'WATCH %'" """, shell=True, stderr=STDOUT) + call("""clickhouse client -q "KILL QUERY WHERE upper(query) LIKE 'WATCH %'" """, shell=True, stderr=STDOUT, timeout=30) # Kill other queries which known to be slow # It's query from 01232_preparing_sets_race_condition_long, it may take up to 1000 seconds in slow builds - call("""clickhouse client -q "KILL QUERY WHERE query LIKE 'insert into tableB select %'" """, shell=True, stderr=STDOUT) + call("""clickhouse client -q "KILL QUERY WHERE query LIKE 'insert into tableB select %'" """, shell=True, stderr=STDOUT, timeout=30) # Long query from 00084_external_agregation - call("""clickhouse client -q "KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM test.hits GROUP BY URL ORDER BY u %'" """, shell=True, stderr=STDOUT) + call("""clickhouse client -q "KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM test.hits GROUP BY URL ORDER BY u %'" """, shell=True, stderr=STDOUT, timeout=30) if drop_databases: # Here we try to drop all databases in async mode. If some queries really hung, than drop will hung too. # Otherwise we will get rid of queries which wait for background pool. It can take a long time on slow builds (more than 900 seconds). - databases = check_output('clickhouse client -q "SHOW DATABASES"', shell=True).decode('utf-8').strip().split() + databases = check_output('clickhouse client -q "SHOW DATABASES"', shell=True, timeout=30).decode('utf-8').strip().split() for db in databases: if db == "system": continue @@ -117,13 +117,13 @@ def prepare_for_hung_check(drop_databases): # Wait for last queries to finish if any, not longer than 300 seconds call("""clickhouse client -q "select sleepEachRow(( select maxOrDefault(300 - elapsed) + 1 from system.processes where query not like '%from system.processes%' and elapsed < 300 - ) / 300) from numbers(300) format Null" """, shell=True, stderr=STDOUT) + ) / 300) from numbers(300) format Null" """, shell=True, stderr=STDOUT, timeout=30) # Even if all clickhouse-test processes are finished, there are probably some sh scripts, # which still run some new queries. Let's ignore them. try: query = """clickhouse client -q "SELECT count() FROM system.processes where where elapsed > 300" """ - output = check_output(query, shell=True, stderr=STDOUT).decode('utf-8').strip() + output = check_output(query, shell=True, stderr=STDOUT, timeout=30).decode('utf-8').strip() if int(output) == 0: return False except: @@ -176,6 +176,7 @@ if __name__ == "__main__": if res != 0 and have_long_running_queries: logging.info("Hung check failed with exit code {}".format(res)) hung_check_status = "Hung check failed\tFAIL\n" - open(os.path.join(args.output_folder, "test_results.tsv"), 'w+').write(hung_check_status) + with open(os.path.join(args.output_folder, "test_results.tsv"), 'w+') as results: + results.write(hung_check_status) logging.info("Stress test finished") From 95a69b9f4b6478f902c7a970296a8c5b2c193a9c Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 11 Oct 2021 14:05:52 +0300 Subject: [PATCH 2/4] Make test non endless --- .../01509_check_many_parallel_quorum_inserts_long.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01509_check_many_parallel_quorum_inserts_long.sh b/tests/queries/0_stateless/01509_check_many_parallel_quorum_inserts_long.sh index c2682cd8cfa..6533eeb12f5 100755 --- a/tests/queries/0_stateless/01509_check_many_parallel_quorum_inserts_long.sh +++ b/tests/queries/0_stateless/01509_check_many_parallel_quorum_inserts_long.sh @@ -18,9 +18,10 @@ for i in $(seq 1 $NUM_REPLICAS); do done function thread { - while true - do + i=0 retries=300 + while [[ $i -lt $retries ]]; do # server can be dead $CLICKHOUSE_CLIENT --insert_quorum 5 --insert_quorum_parallel 1 --query "INSERT INTO r$1 SELECT $2" && break + ((++i)) sleep 0.1 done } From 59a78830f9b73f9ff366449874047828019ad58e Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 11 Oct 2021 14:46:01 +0300 Subject: [PATCH 3/4] Better timeouts in clickhouse-test --- tests/clickhouse-test | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index f10e38b87e5..061333297e2 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -89,10 +89,13 @@ def make_clickhouse_client(base_args): # hence we should use 'system'. database='system', settings=get_additional_client_options_dict(base_args)) + def clickhouse_execute_one(base_args, *args, **kwargs): return make_clickhouse_client(base_args).execute_one(*args, **kwargs) + def clickhouse_execute(base_args, *args, **kwargs): return make_clickhouse_client(base_args).execute(*args, **kwargs) + def clickhouse_execute_pandas(base_args, *args, **kwargs): return make_clickhouse_client(base_args).execute_pandas(*args, **kwargs) @@ -109,6 +112,7 @@ def stop_tests(): global restarted_tests with stop_tests_triggered_lock: + print("Stopping tests") if not stop_tests_triggered.is_set(): stop_tests_triggered.set() @@ -875,7 +879,7 @@ def run_tests_array(all_tests_with_params): while True: if is_concurrent: - case = queue.get() + case = queue.get(timeout=args.timeout) if not case: break else: @@ -1076,10 +1080,10 @@ def do_run_tests(jobs, test_suite: TestSuite, parallel): pool.map_async(run_tests_array, parallel_tests_array) for suit in test_suite.parallel_tests: - queue.put(suit) + queue.put(suit, timeout=args.timeout) for _ in range(jobs): - queue.put(None) + queue.put(None, timeout=args.timeout) queue.close() From 3ae960e04b7675b54d5e386573c387c72ad1e5cd Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 11 Oct 2021 16:40:12 +0300 Subject: [PATCH 4/4] Review fixes --- docker/test/stress/stress | 2 +- tests/clickhouse-test | 6 +++--- ...4_cancel_http_readonly_queries_on_client_close.sh | 5 +++-- .../0_stateless/01085_max_distributed_connections.sh | 5 +++-- .../01085_max_distributed_connections_http.sh | 5 +++-- .../0_stateless/01600_quota_by_forwarded_ip.sh | 12 +++++++++--- .../0_stateless/01602_max_distributed_connections.sh | 8 ++++++-- .../queries/0_stateless/01675_data_type_coroutine.sh | 6 ++++-- .../0_stateless/01681_hyperscan_debug_assertion.sh | 5 +++-- .../01834_alias_columns_laziness_filimonov.sh | 5 +++-- tests/queries/0_stateless/02044_url_glob_parallel.sh | 6 ++++-- 11 files changed, 42 insertions(+), 23 deletions(-) diff --git a/docker/test/stress/stress b/docker/test/stress/stress index 1559b084565..5e98c67d8e1 100755 --- a/docker/test/stress/stress +++ b/docker/test/stress/stress @@ -117,7 +117,7 @@ def prepare_for_hung_check(drop_databases): # Wait for last queries to finish if any, not longer than 300 seconds call("""clickhouse client -q "select sleepEachRow(( select maxOrDefault(300 - elapsed) + 1 from system.processes where query not like '%from system.processes%' and elapsed < 300 - ) / 300) from numbers(300) format Null" """, shell=True, stderr=STDOUT, timeout=30) + ) / 300) from numbers(300) format Null" """, shell=True, stderr=STDOUT, timeout=330) # Even if all clickhouse-test processes are finished, there are probably some sh scripts, # which still run some new queries. Let's ignore them. diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 061333297e2..62860a36fc7 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -879,7 +879,7 @@ def run_tests_array(all_tests_with_params): while True: if is_concurrent: - case = queue.get(timeout=args.timeout) + case = queue.get(timeout=args.timeout * 1.1) if not case: break else: @@ -1080,10 +1080,10 @@ def do_run_tests(jobs, test_suite: TestSuite, parallel): pool.map_async(run_tests_array, parallel_tests_array) for suit in test_suite.parallel_tests: - queue.put(suit, timeout=args.timeout) + queue.put(suit, timeout=args.timeout * 1.1) for _ in range(jobs): - queue.put(None, timeout=args.timeout) + queue.put(None, timeout=args.timeout * 1.1) queue.close() diff --git a/tests/queries/0_stateless/00834_cancel_http_readonly_queries_on_client_close.sh b/tests/queries/0_stateless/00834_cancel_http_readonly_queries_on_client_close.sh index 340df58e473..74b4c4052f8 100755 --- a/tests/queries/0_stateless/00834_cancel_http_readonly_queries_on_client_close.sh +++ b/tests/queries/0_stateless/00834_cancel_http_readonly_queries_on_client_close.sh @@ -7,9 +7,10 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ${CLICKHOUSE_CURL} --max-time 1 -sS "${CLICKHOUSE_URL}&query_id=cancel_http_readonly_queries_on_client_close&cancel_http_readonly_queries_on_client_close=1&query=SELECT+count()+FROM+system.numbers" 2>&1 | grep -cF 'curl: (28)' -while true -do +i=0 retries=300 +while [[ $i -lt $retries ]]; do ${CLICKHOUSE_CURL} -sS --data "SELECT count() FROM system.processes WHERE query_id = 'cancel_http_readonly_queries_on_client_close'" "${CLICKHOUSE_URL}" | grep '0' && break + ((++i)) sleep 0.2 done diff --git a/tests/queries/0_stateless/01085_max_distributed_connections.sh b/tests/queries/0_stateless/01085_max_distributed_connections.sh index 4ffcd980956..34862289d1e 100755 --- a/tests/queries/0_stateless/01085_max_distributed_connections.sh +++ b/tests/queries/0_stateless/01085_max_distributed_connections.sh @@ -5,10 +5,10 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh +i=0 retries=300 # Sometimes five seconds are not enough due to system overload. # But if it can run in less than five seconds at least sometimes - it is enough for the test. -while true -do +while [[ $i -lt $retries ]]; do opts=( --max_distributed_connections 20 --max_threads 1 @@ -19,4 +19,5 @@ do # "$@" left to pass manual options (like --experimental_use_processors 0) during manual testing timeout 10s ${CLICKHOUSE_CLIENT} "${opts[@]}" "$@" && break + ((++i)) done diff --git a/tests/queries/0_stateless/01085_max_distributed_connections_http.sh b/tests/queries/0_stateless/01085_max_distributed_connections_http.sh index 3edf70f31b8..0e40918257d 100755 --- a/tests/queries/0_stateless/01085_max_distributed_connections_http.sh +++ b/tests/queries/0_stateless/01085_max_distributed_connections_http.sh @@ -8,9 +8,10 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # Sometimes 1.8 seconds are not enough due to system overload. # But if it can run in less than five seconds at least sometimes - it is enough for the test. -while true -do +i=0 retries=100 +while [[ $i -lt $retries ]]; do query="SELECT sleepEachRow(1) FROM remote('127.{2,3}', system.one) FORMAT Null" # 1.8 less then 2 seconds, but long enough to cover possible load peaks timeout 1.8s ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&max_distributed_connections=2&max_threads=1" -d "$query" && break + ((++i)) done diff --git a/tests/queries/0_stateless/01600_quota_by_forwarded_ip.sh b/tests/queries/0_stateless/01600_quota_by_forwarded_ip.sh index 97e4da5f9e3..1d768c8b027 100755 --- a/tests/queries/0_stateless/01600_quota_by_forwarded_ip.sh +++ b/tests/queries/0_stateless/01600_quota_by_forwarded_ip.sh @@ -21,7 +21,9 @@ CREATE QUOTA quota_by_forwarded_ip_${CLICKHOUSE_DATABASE} KEYED BY forwarded_ip_ echo '--- Test with quota by immediate IP ---' -while true; do +i=0 retries=300 +while [[ $i -lt $retries ]]; do + ((++i)) ${CLICKHOUSE_CURL} --fail -sS "${CLICKHOUSE_URL}&user=quoted_by_ip_${CLICKHOUSE_DATABASE}" -d "SELECT count() FROM numbers(10)" 2>/dev/null || break done | uniq @@ -33,14 +35,18 @@ ${CLICKHOUSE_CURL} -H 'X-Forwarded-For: 1.2.3.4' -sS "${CLICKHOUSE_URL}&user=quo echo '--- Test with quota by forwarded IP ---' -while true; do +i=0 retries=300 +while [[ $i -lt $retries ]]; do + ((++i)) ${CLICKHOUSE_CURL} --fail -sS "${CLICKHOUSE_URL}&user=quoted_by_forwarded_ip_${CLICKHOUSE_DATABASE}" -d "SELECT count() FROM numbers(10)" 2>/dev/null || break done | uniq ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&user=quoted_by_forwarded_ip_${CLICKHOUSE_DATABASE}" -d "SELECT count() FROM numbers(10)" | grep -oF 'exceeded' +i=0 retries=300 # X-Forwarded-For is respected for quota by forwarded IP address -while true; do +while [[ $i -lt $retries ]]; do + ((++i)) ${CLICKHOUSE_CURL} -H 'X-Forwarded-For: 1.2.3.4' -sS "${CLICKHOUSE_URL}&user=quoted_by_forwarded_ip_${CLICKHOUSE_DATABASE}" -d "SELECT count() FROM numbers(10)" | grep -oP '^10$' || break done | uniq diff --git a/tests/queries/0_stateless/01602_max_distributed_connections.sh b/tests/queries/0_stateless/01602_max_distributed_connections.sh index 51ff803ad5e..ed835a8768f 100755 --- a/tests/queries/0_stateless/01602_max_distributed_connections.sh +++ b/tests/queries/0_stateless/01602_max_distributed_connections.sh @@ -13,14 +13,18 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # If concurrency is 10 (good), the query may take less than 10 second with non-zero probability # and the following loops will finish with probability 1 assuming independent random variables. -while true; do +i=0 retries=30 +while [[ $i -lt $retries ]]; do timeout 10 ${CLICKHOUSE_CLIENT} --max_threads 1 --max_distributed_connections 10 --query " SELECT sleep(1.5) FROM remote('127.{1..10}', system.one) FORMAT Null" --prefer_localhost_replica=0 && break + ((++i)) done -while true; do +i=0 retries=30 +while [[ $i -lt $retries ]]; do timeout 10 ${CLICKHOUSE_CLIENT} --max_threads 1 --max_distributed_connections 10 --query " SELECT sleep(1.5) FROM remote('127.{1..10}', system.one) FORMAT Null" --prefer_localhost_replica=1 && break + ((++i)) done # If max_distributed_connections is low and async_socket_for_remote is disabled, diff --git a/tests/queries/0_stateless/01675_data_type_coroutine.sh b/tests/queries/0_stateless/01675_data_type_coroutine.sh index 781e43e4134..8e80d722a4c 100755 --- a/tests/queries/0_stateless/01675_data_type_coroutine.sh +++ b/tests/queries/0_stateless/01675_data_type_coroutine.sh @@ -4,12 +4,14 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh + +counter=0 retries=60 I=0 -while true -do +while [[ $counter -lt $retries ]]; do I=$((I + 1)) TYPE=$(perl -e "print 'Array(' x $I; print 'UInt8'; print ')' x $I") ${CLICKHOUSE_CLIENT} --max_parser_depth 1000000 --query "SELECT * FROM remote('127.0.0.{1,2}', generateRandom('x $TYPE', 1, 1, 1)) LIMIT 1 FORMAT Null" 2>&1 | grep -q -F 'Maximum parse depth' && break; + ((++counter)) done #echo "I = ${I}" diff --git a/tests/queries/0_stateless/01681_hyperscan_debug_assertion.sh b/tests/queries/0_stateless/01681_hyperscan_debug_assertion.sh index 2b4cd1a5f01..62469da0b3e 100755 --- a/tests/queries/0_stateless/01681_hyperscan_debug_assertion.sh +++ b/tests/queries/0_stateless/01681_hyperscan_debug_assertion.sh @@ -13,13 +13,14 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) M=1000000 -while true -do +i=0 retries=300 +while [[ $i -lt $retries ]]; do $CLICKHOUSE_CLIENT --allow_hyperscan 1 --max_memory_usage $M --format Null --query " SELECT [1, 2, 3, 11] = arraySort(multiMatchAllIndices('фабрикант', ['', 'рикан', 'а', 'f[a${RANDOM}e]b[ei]rl', 'ф[иа${RANDOM}эе]б[еэи][рпл]', 'афиукд', 'a[f${RANDOM}t],th', '^ф[аие${RANDOM}э]?б?[еэи]?$', 'бе${RANDOM}рлик', 'fa${RANDOM}b', 'фа[беьв]+е?[рл${RANDOM}ко]'])) " 2>&1 | grep -q 'Memory limit' || break; M=$((M + 100000)) + ((++i)) done echo 'Ok' diff --git a/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.sh b/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.sh index 793f477b3cb..1d70ba1df7c 100755 --- a/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.sh +++ b/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.sh @@ -16,9 +16,10 @@ insert into aliases_lazyness(x) select * from numbers(40); # The exact time is not guaranteed, so we check in a loop that at least once # the query will process in less than one second, that proves that the behaviour is not like it was long time ago. -while true -do +i=0 retries=300 +while [[ $i -lt $retries ]]; do timeout 1 ${CLICKHOUSE_CLIENT} --query "SELECT x, y FROM aliases_lazyness WHERE x = 1 FORMAT Null" && break + ((++i)) done ${CLICKHOUSE_CLIENT} --multiquery --query " diff --git a/tests/queries/0_stateless/02044_url_glob_parallel.sh b/tests/queries/0_stateless/02044_url_glob_parallel.sh index 6491a661201..c9c779a9ddb 100755 --- a/tests/queries/0_stateless/02044_url_glob_parallel.sh +++ b/tests/queries/0_stateless/02044_url_glob_parallel.sh @@ -5,9 +5,11 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh + +i=0 retries=60 # Sometimes five seconds are not enough due to system overload. # But if it can run in less than five seconds at least sometimes - it is enough for the test. -while true -do +while [[ $i -lt $retries ]]; do timeout 5s ${CLICKHOUSE_CLIENT} --max_threads 10 --query "SELECT * FROM url('http://127.0.0.{1..10}:${CLICKHOUSE_PORT_HTTP}/?query=SELECT+sleep(1)', TSV, 'x UInt8')" --format Null && break + ((++i)) done