From c23553962024f9ae7315599fcd5e398d5375a878 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 17 Jun 2021 00:20:35 +0300 Subject: [PATCH] minor fixes --- .../stateless/process_functional_tests_result.py | 14 ++++++++++++-- src/Interpreters/DDLWorker.cpp | 4 +++- tests/clickhouse-test | 6 +++++- 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/docker/test/stateless/process_functional_tests_result.py b/docker/test/stateless/process_functional_tests_result.py index 5d183250357..b3c8fa96144 100755 --- a/docker/test/stateless/process_functional_tests_result.py +++ b/docker/test/stateless/process_functional_tests_result.py @@ -14,6 +14,8 @@ HUNG_SIGN = "Found hung queries in processlist" NO_TASK_TIMEOUT_SIGN = "All tests have finished" +RETRIES_SIGN = "Some tests were restarted" + def process_test_log(log_path): total = 0 skipped = 0 @@ -21,6 +23,7 @@ def process_test_log(log_path): failed = 0 success = 0 hung = False + retries = False task_timeout = True test_results = [] with open(log_path, 'r') as test_file: @@ -30,6 +33,8 @@ def process_test_log(log_path): task_timeout = False if HUNG_SIGN in line: hung = True + if RETRIES_SIGN in line: + retries = True if any(sign in line for sign in (OK_SIGN, FAIL_SIGN, UNKNOWN_SIGN, SKIPPED_SIGN)): test_name = line.split(' ')[2].split(':')[0] @@ -57,7 +62,7 @@ def process_test_log(log_path): else: success += int(OK_SIGN in line) test_results.append((test_name, "OK", test_time)) - return total, skipped, unknown, failed, success, hung, task_timeout, test_results + return total, skipped, unknown, failed, success, hung, task_timeout, retries, test_results def process_result(result_path): test_results = [] @@ -73,7 +78,7 @@ def process_result(result_path): state = "error" if result_path and os.path.exists(result_path): - total, skipped, unknown, failed, success, hung, task_timeout, test_results = process_test_log(result_path) + total, skipped, unknown, failed, success, hung, task_timeout, retries, test_results = process_test_log(result_path) is_flacky_check = 1 < int(os.environ.get('NUM_TRIES', 1)) # If no tests were run (success == 0) it indicates an error (e.g. server did not start or crashed immediately) # But it's Ok for "flaky checks" - they can contain just one test for check which is marked as skipped. @@ -83,9 +88,14 @@ def process_result(result_path): if hung: description = "Some queries hung, " state = "failure" + test_results.append(("Some queries hung", "FAIL", "0")) elif task_timeout: description = "Timeout, " state = "failure" + test_results.append(("Timeout", "FAIL", "0")) + elif retries: + description = "Some tests restarted, " + test_results.append(("Some tests restarted", "SKIPPED", "0")) else: description = "" diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 1c023f757f8..735c51e397e 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -1137,7 +1137,9 @@ void DDLWorker::runMainThread() scheduleTasks(reinitialized); LOG_DEBUG(log, "Waiting for queue updates"); - queue_updated_event->wait(); + /// FIXME It may hang for unknown reason. Timeout is just a hotfix. + constexpr int queue_wait_timeout_ms = 10000; + queue_updated_event->tryWait(queue_wait_timeout_ms); } catch (const Coordination::Exception & e) { diff --git a/tests/clickhouse-test b/tests/clickhouse-test index fe9f010456b..e508abab70c 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -29,6 +29,7 @@ import string import multiprocessing from contextlib import closing +DISTRIBUTED_DDL_TIMEOUT_MSG = "is executing longer than distributed_ddl_task_timeout (=120)" MESSAGES_TO_RETRY = [ "DB::Exception: ZooKeeper session has been expired", @@ -40,7 +41,7 @@ MESSAGES_TO_RETRY = [ "Operation timed out", "ConnectionPoolWithFailover: Connection failed at try", "DB::Exception: New table appeared in database being dropped or detached. Try again", - "is executing longer than distributed_ddl_task_timeout (=120)" # FIXME + DISTRIBUTED_DDL_TIMEOUT_MSG # FIXME ] MAX_RETRIES = 5 @@ -476,6 +477,9 @@ def run_tests_array(all_tests_with_params): sleep(2**counter) counter += 1 if MAX_RETRIES < counter: + if args.replicated_database: + if DISTRIBUTED_DDL_TIMEOUT_MSG in stderr: + SERVER_DIED = True break if proc.returncode != 0: