minor fixes

This commit is contained in:
Alexander Tokmakov 2021-06-17 00:20:35 +03:00
parent 60d220bd14
commit c235539620
3 changed files with 20 additions and 4 deletions

View File

@ -14,6 +14,8 @@ HUNG_SIGN = "Found hung queries in processlist"
NO_TASK_TIMEOUT_SIGN = "All tests have finished"
RETRIES_SIGN = "Some tests were restarted"
def process_test_log(log_path):
total = 0
skipped = 0
@ -21,6 +23,7 @@ def process_test_log(log_path):
failed = 0
success = 0
hung = False
retries = False
task_timeout = True
test_results = []
with open(log_path, 'r') as test_file:
@ -30,6 +33,8 @@ def process_test_log(log_path):
task_timeout = False
if HUNG_SIGN in line:
hung = True
if RETRIES_SIGN in line:
retries = True
if any(sign in line for sign in (OK_SIGN, FAIL_SIGN, UNKNOWN_SIGN, SKIPPED_SIGN)):
test_name = line.split(' ')[2].split(':')[0]
@ -57,7 +62,7 @@ def process_test_log(log_path):
else:
success += int(OK_SIGN in line)
test_results.append((test_name, "OK", test_time))
return total, skipped, unknown, failed, success, hung, task_timeout, test_results
return total, skipped, unknown, failed, success, hung, task_timeout, retries, test_results
def process_result(result_path):
test_results = []
@ -73,7 +78,7 @@ def process_result(result_path):
state = "error"
if result_path and os.path.exists(result_path):
total, skipped, unknown, failed, success, hung, task_timeout, test_results = process_test_log(result_path)
total, skipped, unknown, failed, success, hung, task_timeout, retries, test_results = process_test_log(result_path)
is_flacky_check = 1 < int(os.environ.get('NUM_TRIES', 1))
# If no tests were run (success == 0), it indicates an error (e.g. the server did not start or crashed immediately).
# But that is OK for "flaky checks" - they can contain just one test to check, and it may be marked as skipped.
@ -83,9 +88,14 @@ def process_result(result_path):
if hung:
description = "Some queries hung, "
state = "failure"
test_results.append(("Some queries hung", "FAIL", "0"))
elif task_timeout:
description = "Timeout, "
state = "failure"
test_results.append(("Timeout", "FAIL", "0"))
elif retries:
description = "Some tests restarted, "
test_results.append(("Some tests restarted", "SKIPPED", "0"))
else:
description = ""

View File

@ -1137,7 +1137,9 @@ void DDLWorker::runMainThread()
scheduleTasks(reinitialized);
LOG_DEBUG(log, "Waiting for queue updates");
queue_updated_event->wait();
/// FIXME: It may hang for an unknown reason. The timeout is just a hotfix.
constexpr int queue_wait_timeout_ms = 10000;
queue_updated_event->tryWait(queue_wait_timeout_ms);
}
catch (const Coordination::Exception & e)
{

View File

@ -29,6 +29,7 @@ import string
import multiprocessing
from contextlib import closing
DISTRIBUTED_DDL_TIMEOUT_MSG = "is executing longer than distributed_ddl_task_timeout (=120)"
MESSAGES_TO_RETRY = [
"DB::Exception: ZooKeeper session has been expired",
@ -40,7 +41,7 @@ MESSAGES_TO_RETRY = [
"Operation timed out",
"ConnectionPoolWithFailover: Connection failed at try",
"DB::Exception: New table appeared in database being dropped or detached. Try again",
"is executing longer than distributed_ddl_task_timeout (=120)" # FIXME
DISTRIBUTED_DDL_TIMEOUT_MSG # FIXME
]
MAX_RETRIES = 5
@ -476,6 +477,9 @@ def run_tests_array(all_tests_with_params):
sleep(2**counter)
counter += 1
if MAX_RETRIES < counter:
if args.replicated_database:
if DISTRIBUTED_DDL_TIMEOUT_MSG in stderr:
SERVER_DIED = True
break
if proc.returncode != 0: