remove query timeouts

This commit is contained in:
Alexander Tokmakov 2021-03-17 22:28:18 +03:00
parent d42b442b83
commit f0c930bf9d
4 changed files with 35 additions and 16 deletions

View File

@ -19,8 +19,8 @@ CLICKHOUSE_ODBC_BRIDGE_BINARY_PATH = "/usr/bin/clickhouse-odbc-bridge"
TRIES_COUNT = 10
MAX_TIME_SECONDS = 3600
# NOTE it must be less than the timeout in Sandbox
TASK_TIMEOUT = 7.5 * 60 * 60
MAX_TIME_IN_SANDBOX = 20 * 60 # 20 minutes
TASK_TIMEOUT = 8 * 60 * 60 # 8 hours
def get_tests_to_run(pr_info):
result = set([])
@ -167,7 +167,7 @@ class ClickhouseIntegrationTestsRunner:
self.shuffle_groups = self.params['shuffle_test_groups']
self.flaky_check = 'flaky check' in self.params['context_name']
self.start_time = time.time()
self.soft_deadline_time = self.start_time + TASK_TIMEOUT
self.soft_deadline_time = self.start_time + (TASK_TIMEOUT - MAX_TIME_IN_SANDBOX)
def path(self):
return self.result_path
@ -274,16 +274,27 @@ class ClickhouseIntegrationTestsRunner:
def _update_counters(self, main_counters, current_counters):
for test in current_counters["PASSED"]:
if test not in main_counters["PASSED"]:
if test not in main_counters["PASSED"] and test not in main_counters["FLAKY"]:
is_flaky = False
if test in main_counters["FAILED"]:
main_counters["FAILED"].remove(test)
is_flaky = True
if test in main_counters["ERROR"]:
main_counters["ERROR"].remove(test)
main_counters["PASSED"].append(test)
is_flaky = True
if is_flaky:
main_counters["FLAKY"].append(test)
else:
main_counters["PASSED"].append(test)
for state in ("ERROR", "FAILED"):
for test in current_counters[state]:
if test in main_counters["FLAKY"]:
continue
if test in main_counters["PASSED"]:
main_counters["PASSED"].remove(test)
main_counters["FLAKY"].append(test)
continue
if test not in main_counters[state]:
main_counters[state].append(test)
@ -309,12 +320,15 @@ class ClickhouseIntegrationTestsRunner:
"ERROR": [],
"PASSED": [],
"FAILED": [],
"SKIPPED": [],
"FLAKY": [],
}
tests_times = defaultdict(float)
if self.soft_deadline_time < time.time():
for test in tests_in_group:
counters["ERROR"].append(test)
logging.info("Task timeout exceeded, skipping %s", test)
counters["SKIPPED"].append(test)
tests_times[test] = 0
log_name = None
log_path = None
@ -361,10 +375,10 @@ class ClickhouseIntegrationTestsRunner:
for test_name, test_time in new_tests_times.items():
tests_times[test_name] = test_time
os.remove(output_path)
if len(counters["PASSED"]) == len(tests_in_group):
if len(counters["PASSED"]) + len(counters["FLAKY"]) == len(tests_in_group):
logging.info("All tests from group %s passed", test_group)
break
if len(counters["PASSED"]) >= 0 and len(counters["FAILED"]) == 0 and len(counters["ERROR"]) == 0:
if len(counters["PASSED"]) + len(counters["FLAKY"]) >= 0 and len(counters["FAILED"]) == 0 and len(counters["ERROR"]) == 0:
logging.info("Seems like all tests passed but some of them are skipped or deselected. Ignoring them and finishing group.")
break
else:
@ -407,6 +421,7 @@ class ClickhouseIntegrationTestsRunner:
# NOTE "error" result state will restart the whole test task, so we use "failure" here
result_state = "failure"
break
assert len(counters["FLAKY"]) == 0
logging.info("Try is OK, all tests passed, going to clear env")
clear_ip_tables_and_restart_daemons()
logging.info("And going to sleep for some time")
@ -448,6 +463,8 @@ class ClickhouseIntegrationTestsRunner:
"ERROR": [],
"PASSED": [],
"FAILED": [],
"SKIPPED": [],
"FLAKY": [],
}
tests_times = defaultdict(float)
@ -499,12 +516,14 @@ class ClickhouseIntegrationTestsRunner:
text_state = state
test_result += [(c, text_state, "{:.2f}".format(tests_times[c])) for c in counters[state]]
status_text = "fail: {}, passed: {}, error: {}".format(len(counters['FAILED']), len(counters['PASSED']), len(counters['ERROR']))
failed_sum = len(counters['FAILED']) + len(counters['ERROR'])
status_text = "fail: {}, passed: {}, flaky: {}".format(failed_sum, len(counters['PASSED']), len(counters['FLAKY']))
if self.soft_deadline_time < time.time():
status_text = "Timeout, " + status_text
result_state = "failure"
counters['FLAKY'] = []
if not counters or sum(len(counter) for counter in counters.values()) == 0:
status_text = "No tests found for some reason! It's a bug"
result_state = "failure"

View File

@ -969,12 +969,12 @@ class ClickHouseInstance:
return "-fsanitize=address" in build_opts
# Connects to the instance via clickhouse-client, sends a query (1st argument) and returns the answer
def query(self, sql, stdin=None, timeout=60, settings=None, user=None, password=None, database=None,
def query(self, sql, stdin=None, timeout=None, settings=None, user=None, password=None, database=None,
ignore_error=False):
return self.client.query(sql, stdin=stdin, timeout=timeout, settings=settings, user=user, password=password,
database=database, ignore_error=ignore_error)
def query_with_retry(self, sql, stdin=None, timeout=10, settings=None, user=None, password=None, database=None,
def query_with_retry(self, sql, stdin=None, timeout=None, settings=None, user=None, password=None, database=None,
ignore_error=False,
retry_count=20, sleep_time=0.5, check_callback=lambda x: True):
result = None
@ -998,13 +998,13 @@ class ClickHouseInstance:
return self.client.get_query_request(*args, **kwargs)
# Connects to the instance via clickhouse-client, sends a query (1st argument), expects an error and returns its code
def query_and_get_error(self, sql, stdin=None, timeout=60, settings=None, user=None, password=None,
def query_and_get_error(self, sql, stdin=None, timeout=None, settings=None, user=None, password=None,
database=None):
return self.client.query_and_get_error(sql, stdin=stdin, timeout=timeout, settings=settings, user=user,
password=password, database=database)
# The same as query_and_get_error but ignores successful query.
def query_and_get_answer_with_error(self, sql, stdin=None, timeout=60, settings=None, user=None, password=None,
def query_and_get_answer_with_error(self, sql, stdin=None, timeout=None, settings=None, user=None, password=None,
database=None):
return self.client.query_and_get_answer_with_error(sql, stdin=stdin, timeout=timeout, settings=settings,
user=user, password=password, database=database)

View File

@ -82,11 +82,11 @@ def test_load_dictionaries(started_cluster):
def test_invalidate_query(started_cluster):
conn = get_postgres_conn(True)
cursor = conn.cursor()
table_name = 'test0'
table_name = 'test1'
create_and_fill_postgres_table(table_name)
# invalidate query: SELECT value FROM test0 WHERE id = 0
dict_name = 'dict0'
dict_name = 'dict1'
create_dict(table_name)
node1.query("SYSTEM RELOAD DICTIONARY {}".format(dict_name))
assert node1.query("SELECT dictGetUInt32('{}', 'value', toUInt64(0))".format(dict_name)) == "0\n"
@ -111,6 +111,7 @@ def test_invalidate_query(started_cluster):
time.sleep(5)
assert node1.query("SELECT dictGetUInt32('{}', 'value', toUInt64(0))".format(dict_name)) == '2\n'
assert node1.query("SELECT dictGetUInt32('{}', 'value', toUInt64(1))".format(dict_name)) == '2\n'
cursor.execute("DROP TABLE IF EXISTS {}".format(table_name))
if __name__ == '__main__':

View File

@ -105,7 +105,6 @@ def test_drop_replica(start_cluster):
with PartitionManager() as pm:
## make node_1_1 dead
pm.drop_instance_zk_connections(node_1_1)
time.sleep(10)
assert "doesn't exist" in node_1_3.query_and_get_error(
"SYSTEM DROP REPLICA 'node_1_1' FROM TABLE test.test_table")