From f0c930bf9dd7a980cbbb83f78c3f5288d11c2985 Mon Sep 17 00:00:00 2001
From: Alexander Tokmakov <avtokmakov@yandex-team.ru>
Date: Wed, 17 Mar 2021 22:28:18 +0300
Subject: [PATCH] Remove default query timeouts in integration test helpers

---
 tests/integration/ci-runner.py                | 37 ++++++++++++++-----
 tests/integration/helpers/cluster.py          |  8 ++--
 .../test_dictionaries_postgresql/test.py      |  5 ++-
 tests/integration/test_drop_replica/test.py   |  1 -
 4 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/tests/integration/ci-runner.py b/tests/integration/ci-runner.py
index b2d60f7dc19..eb82d86b38b 100755
--- a/tests/integration/ci-runner.py
+++ b/tests/integration/ci-runner.py
@@ -19,8 +19,8 @@ CLICKHOUSE_ODBC_BRIDGE_BINARY_PATH = "/usr/bin/clickhouse-odbc-bridge"
 TRIES_COUNT = 10
 MAX_TIME_SECONDS = 3600
 
-# NOTE it must be less then timeout in Sandbox
-TASK_TIMEOUT = 7.5 * 60 * 60
+MAX_TIME_IN_SANDBOX = 20 * 60   # 20 minutes; subtracted from TASK_TIMEOUT to get the soft deadline
+TASK_TIMEOUT = 8 * 60 * 60      # 8 hours
 
 def get_tests_to_run(pr_info):
     result = set([])
@@ -167,7 +167,7 @@ class ClickhouseIntegrationTestsRunner:
         self.shuffle_groups = self.params['shuffle_test_groups']
         self.flaky_check = 'flaky check' in self.params['context_name']
         self.start_time = time.time()
-        self.soft_deadline_time = self.start_time + TASK_TIMEOUT
+        self.soft_deadline_time = self.start_time + (TASK_TIMEOUT - MAX_TIME_IN_SANDBOX)
 
     def path(self):
         return self.result_path
@@ -274,16 +274,27 @@ class ClickhouseIntegrationTestsRunner:
 
     def _update_counters(self, main_counters, current_counters):
         for test in current_counters["PASSED"]:
-            if test not in main_counters["PASSED"]:
+            if test not in main_counters["PASSED"] and test not in main_counters["FLAKY"]:
+                is_flaky = False  # set when main_counters already lists the test as FAILED or ERROR
                 if test in main_counters["FAILED"]:
                     main_counters["FAILED"].remove(test)
+                    is_flaky = True
                 if test in main_counters["ERROR"]:
                     main_counters["ERROR"].remove(test)
-                main_counters["PASSED"].append(test)
+                    is_flaky = True
+
+                if is_flaky:
+                    main_counters["FLAKY"].append(test)  # PASSED in current_counters, FAILED/ERROR in main_counters
+                else:
+                    main_counters["PASSED"].append(test)
 
         for state in ("ERROR", "FAILED"):
             for test in current_counters[state]:
+                if test in main_counters["FLAKY"]:
+                    continue  # already recorded as flaky; leave it there
                 if test in main_counters["PASSED"]:
+                    main_counters["PASSED"].remove(test)
+                    main_counters["FLAKY"].append(test)  # PASSED in main_counters, ERROR/FAILED in current_counters
                     continue
                 if test not in main_counters[state]:
                     main_counters[state].append(test)
@@ -309,12 +320,15 @@ class ClickhouseIntegrationTestsRunner:
             "ERROR": [],
             "PASSED": [],
             "FAILED": [],
+            "SKIPPED": [],
+            "FLAKY": [],
         }
         tests_times = defaultdict(float)
 
         if self.soft_deadline_time < time.time():
             for test in tests_in_group:
-                counters["ERROR"].append(test)
+                logging.info("Task timeout exceeded, skipping %s", test)
+                counters["SKIPPED"].append(test)
                 tests_times[test] = 0
             log_name = None
             log_path = None
@@ -361,10 +375,10 @@ class ClickhouseIntegrationTestsRunner:
                 for test_name, test_time in new_tests_times.items():
                     tests_times[test_name] = test_time
                 os.remove(output_path)
-            if len(counters["PASSED"]) == len(tests_in_group):
+            if len(counters["PASSED"]) + len(counters["FLAKY"]) == len(tests_in_group):
                 logging.info("All tests from group %s passed", test_group)
                 break
-            if len(counters["PASSED"]) >= 0 and len(counters["FAILED"]) == 0 and len(counters["ERROR"]) == 0:
+            if len(counters["PASSED"]) + len(counters["FLAKY"]) >= 0 and len(counters["FAILED"]) == 0 and len(counters["ERROR"]) == 0:
                 logging.info("Seems like all tests passed but some of them are skipped or deselected. Ignoring them and finishing group.")
                 break
         else:
@@ -407,6 +421,7 @@ class ClickhouseIntegrationTestsRunner:
                 # NOTE "error" result state will restart the whole test task, so we use "failure" here
                 result_state = "failure"
                 break
+            assert len(counters["FLAKY"]) == 0
             logging.info("Try is OK, all tests passed, going to clear env")
             clear_ip_tables_and_restart_daemons()
             logging.info("And going to sleep for some time")
@@ -448,6 +463,8 @@ class ClickhouseIntegrationTestsRunner:
             "ERROR": [],
             "PASSED": [],
             "FAILED": [],
+            "SKIPPED": [],
+            "FLAKY": [],
         }
         tests_times = defaultdict(float)
 
@@ -499,12 +516,14 @@ class ClickhouseIntegrationTestsRunner:
                 text_state = state
             test_result += [(c, text_state, "{:.2f}".format(tests_times[c])) for c in counters[state]]
 
-        status_text = "fail: {}, passed: {}, error: {}".format(len(counters['FAILED']), len(counters['PASSED']), len(counters['ERROR']))
+        failed_sum = len(counters['FAILED']) + len(counters['ERROR'])
+        status_text = "fail: {}, passed: {}, flaky: {}".format(failed_sum, len(counters['PASSED']), len(counters['FLAKY']))
 
         if self.soft_deadline_time < time.time():
             status_text = "Timeout, " + status_text
             result_state = "failure"
 
+        counters['FLAKY'] = []
         if not counters or sum(len(counter) for counter in counters.values()) == 0:
             status_text = "No tests found for some reason! It's a bug"
             result_state = "failure"
diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py
index 4adde53b6b8..3872234d36c 100644
--- a/tests/integration/helpers/cluster.py
+++ b/tests/integration/helpers/cluster.py
@@ -969,12 +969,12 @@ class ClickHouseInstance:
         return "-fsanitize=address" in build_opts
 
     # Connects to the instance via clickhouse-client, sends a query (1st argument) and returns the answer
-    def query(self, sql, stdin=None, timeout=60, settings=None, user=None, password=None, database=None,
+    def query(self, sql, stdin=None, timeout=None, settings=None, user=None, password=None, database=None,
               ignore_error=False):
         return self.client.query(sql, stdin=stdin, timeout=timeout, settings=settings, user=user, password=password,
                                  database=database, ignore_error=ignore_error)
 
-    def query_with_retry(self, sql, stdin=None, timeout=10, settings=None, user=None, password=None, database=None,
+    def query_with_retry(self, sql, stdin=None, timeout=None, settings=None, user=None, password=None, database=None,
                          ignore_error=False,
                          retry_count=20, sleep_time=0.5, check_callback=lambda x: True):
         result = None
@@ -998,13 +998,13 @@ class ClickHouseInstance:
         return self.client.get_query_request(*args, **kwargs)
 
     # Connects to the instance via clickhouse-client, sends a query (1st argument), expects an error and return its code
-    def query_and_get_error(self, sql, stdin=None, timeout=60, settings=None, user=None, password=None,
+    def query_and_get_error(self, sql, stdin=None, timeout=None, settings=None, user=None, password=None,
                             database=None):
         return self.client.query_and_get_error(sql, stdin=stdin, timeout=timeout, settings=settings, user=user,
                                                password=password, database=database)
 
     # The same as query_and_get_error but ignores successful query.
-    def query_and_get_answer_with_error(self, sql, stdin=None, timeout=60, settings=None, user=None, password=None,
+    def query_and_get_answer_with_error(self, sql, stdin=None, timeout=None, settings=None, user=None, password=None,
                                         database=None):
         return self.client.query_and_get_answer_with_error(sql, stdin=stdin, timeout=timeout, settings=settings,
                                                            user=user, password=password, database=database)
diff --git a/tests/integration/test_dictionaries_postgresql/test.py b/tests/integration/test_dictionaries_postgresql/test.py
index b83c00409af..0e83cc28085 100644
--- a/tests/integration/test_dictionaries_postgresql/test.py
+++ b/tests/integration/test_dictionaries_postgresql/test.py
@@ -82,11 +82,11 @@ def test_load_dictionaries(started_cluster):
 def test_invalidate_query(started_cluster):
     conn = get_postgres_conn(True)
     cursor = conn.cursor()
-    table_name = 'test0'
+    table_name = 'test1'
     create_and_fill_postgres_table(table_name)
 
     # invalidate query: SELECT value FROM test0 WHERE id = 0
-    dict_name = 'dict0'
+    dict_name = 'dict1'
     create_dict(table_name)
     node1.query("SYSTEM RELOAD DICTIONARY {}".format(dict_name))
     assert node1.query("SELECT dictGetUInt32('{}', 'value', toUInt64(0))".format(dict_name)) ==  "0\n"
@@ -111,6 +111,7 @@ def test_invalidate_query(started_cluster):
     time.sleep(5)
     assert node1.query("SELECT dictGetUInt32('{}', 'value', toUInt64(0))".format(dict_name)) == '2\n'
     assert node1.query("SELECT dictGetUInt32('{}', 'value', toUInt64(1))".format(dict_name)) == '2\n'
+    cursor.execute("DROP TABLE IF EXISTS {}".format(table_name))
 
 
 if __name__ == '__main__':
diff --git a/tests/integration/test_drop_replica/test.py b/tests/integration/test_drop_replica/test.py
index 7d7ad784166..947eaa2dfa1 100644
--- a/tests/integration/test_drop_replica/test.py
+++ b/tests/integration/test_drop_replica/test.py
@@ -105,7 +105,6 @@ def test_drop_replica(start_cluster):
     with PartitionManager() as pm:
         ## make node_1_1 dead
         pm.drop_instance_zk_connections(node_1_1)
-        time.sleep(10)
 
         assert "doesn't exist" in node_1_3.query_and_get_error(
             "SYSTEM DROP REPLICA 'node_1_1' FROM TABLE test.test_table")