add flaky check runner

2024-11-21 23:21:59 +00:00 · 2021-02-24 19:41:44 +03:00 · 2021-02-24 19:41:44 +03:00 · 3632c1d879
commit 3632c1d879
parent f41e68d118
3 changed files with 123 additions and 11 deletions
--- a/docker/test/stress/stress
+++ b/docker/test/stress/stress
@ -55,6 +55,27 @@ def run_func_test(cmd, output_prefix, num_processes, skip_tests_option, global_t
        time.sleep(0.5)
    return pipes

+def prepare_for_hung_check():
+    """Prepare the server for the hung-query check.
+
+    Sends SIGTERM to gdb (if any is running), issues a series of
+    SYSTEM START ... queries through clickhouse client and then
+    sleeps for 30 seconds.
+    """
+    # FIXME this function should not exist, but...
+
+    # gdb is attached to clickhouse-server before the tests start so that
+    # stacktraces of all crashes are printed even if clickhouse cannot print
+    # them for some reason. However, it obstructs checking for hung queries.
+    logging.info("Will terminate gdb (if any)")
+    call("kill -TERM $(pidof gdb)", shell=True, stderr=STDOUT)
+
+    # Some tests execute SYSTEM STOP MERGES or similar queries, which may
+    # cause some ALTERs to hang. Possibly we should fix the tests and forbid
+    # using such queries without specifying a table.
+    operations_to_start = (
+        "MERGES",
+        "DISTRIBUTED SENDS",
+        "TTL MERGES",
+        "MOVES",
+        "FETCHES",
+        "REPLICATED SENDS",
+        "REPLICATION QUEUES",
+    )
+    for operation in operations_to_start:
+        query_cmd = "clickhouse client -q 'SYSTEM START {}'".format(operation)
+        call(query_cmd, shell=True, stderr=STDOUT)
+
+    time.sleep(30)

 if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
@ -85,8 +106,7 @@ if __name__ == "__main__":

    logging.info("All processes finished")
    if args.hung_check:
-        logging.info("Will terminate gdb (if any)")
-        res = call("kill -TERM $(pidof gdb)", shell=True, stderr=STDOUT)
+        prepare_for_hung_check()
        logging.info("Checking if some queries hung")
        cmd = "{} {} {}".format(args.test_cmd, "--hung-check", "00001_select_1")
        res = call(cmd, shell=True, stderr=STDOUT)
--- a/tests/integration/ci-runner.py
+++ b/tests/integration/ci-runner.py
@ -16,6 +16,32 @@ SLEEP_BETWEEN_RETRIES = 5
 CLICKHOUSE_BINARY_PATH = "/usr/bin/clickhouse"
 CLICKHOUSE_ODBC_BRIDGE_BINARY_PATH = "/usr/bin/clickhouse-odbc-bridge"

+TRIES_COUNT = 10
+MAX_TIME_SECONDS = 3600
+
+
+def get_tests_to_run(pr_info):
+    """Return the integration tests touched by the pull request.
+
+    A changed file counts when its path contains 'tests/integration/test_';
+    the third '/'-separated component of that path is taken as the test
+    name. The result is a list without duplicates, in no particular order.
+    An empty list is returned when `pr_info.changed_files` is None.
+    """
+    if pr_info.changed_files is None:
+        return []
+
+    test_names = set()
+    for changed_path in pr_info.changed_files:
+        if 'tests/integration/test_' not in changed_path:
+            continue
+        logging.info('File %s changed and seems like integration test', changed_path)
+        test_names.add(changed_path.split('/')[2])
+    return list(test_names)
+
+
+def filter_existing_tests(tests_to_run, repo_path):
+    """Drop tests whose path no longer exists in the repository.
+
+    Each entry of `tests_to_run` is looked up relative to
+    `<repo_path>/tests/integration`. Entries that exist are returned in
+    their original order; missing ones are logged and skipped.
+    """
+    integration_dir = os.path.join(repo_path, 'tests/integration')
+    existing_tests = []
+    for test_path in tests_to_run:
+        if not os.path.exists(os.path.join(integration_dir, test_path)):
+            logging.info("Skipping test %s, seems like it was removed", test_path)
+            continue
+        existing_tests.append(test_path)
+    return existing_tests
+

 def _get_deselect_option(tests):
    return ' '.join(['--deselect {}'.format(t) for t in tests])
@ -128,10 +154,13 @@ def clear_ip_tables_and_restart_daemons():

 class ClickhouseIntegrationTestsRunner:

-    def __init__(self, result_path, image_versions, shuffle_groups):
+    def __init__(self, result_path, params):
+        # result_path: value later returned by path().
+        # params: dict of run parameters; the keys
+        # 'docker_images_with_versions', 'shuffle_test_groups' and
+        # 'context_name' are read below, so a missing key raises KeyError.
        self.result_path = result_path
-        self.image_versions = image_versions
-        self.shuffle_groups = shuffle_groups
+        self.params = params
+
+        self.image_versions = self.params['docker_images_with_versions']
+        self.shuffle_groups = self.params['shuffle_test_groups']
+        # Boolean flag: true when the context name contains the literal
+        # substring 'flacky check' (spelled exactly like that).
+        self.flacky_check = 'flacky check' in self.params['context_name']

    def path(self):
        return self.result_path
@ -328,7 +357,68 @@ class ClickhouseIntegrationTestsRunner:

        return counters, tests_times, log_name, log_path

-    def run_impl(self, commit, repo, pull_request, repo_path, build_path):
+    def run_flacky_check(self, repo_path, build_path):
+        """Run the tests changed in the pull request several times in a row.
+
+        The tests come from `self.params['pr_info']`, restricted to those
+        that still exist under `repo_path`. They are run up to TRIES_COUNT
+        times; the loop stops at the first attempt with FAILED or ERROR
+        tests, or once MAX_TIME_SECONDS have elapsed after a passing attempt.
+
+        Returns a tuple (state, description, test_results, logs). When there
+        is nothing to run, `logs` is an empty string; otherwise it is a list
+        holding the compressed test dir followed by the per-attempt logs.
+        """
+        pr_info = self.params['pr_info']
+
+        # pytest complains if asked to run tests which were renamed or deleted
+        tests_to_run = filter_existing_tests(get_tests_to_run(pr_info), repo_path)
+        if not tests_to_run:
+            logging.info("No tests to run found")
+            return 'success', 'Nothing to run', [('Nothing to run', 'OK')], ''
+
+        self._install_clickhouse(build_path)
+        logging.info("Found '%s' tests to run", ' '.join(tests_to_run))
+        result_state = "success"
+        description_prefix = "No flaky tests: "
+        start = time.time()
+        logging.info("Starting check with retries")
+        final_retry = 0
+        log_paths = []
+        for attempt in range(1, TRIES_COUNT + 1):
+            final_retry = attempt
+            # Attempts are numbered from 1 so the log does not say "0 time".
+            logging.info("Running tests for the %s time", attempt)
+            counters, tests_times, log_name, log_path = self.run_test_group(repo_path, "flaky", tests_to_run, 1)
+            log_paths.append(log_path)
+            if counters["FAILED"]:
+                logging.info("Found failed tests: %s", ' '.join(counters["FAILED"]))
+                description_prefix = "Flaky tests found: "
+                result_state = "failure"
+                break
+            if counters["ERROR"]:
+                description_prefix = "Flaky tests found: "
+                logging.info("Found error tests: %s", ' '.join(counters["ERROR"]))
+                result_state = "error"
+                break
+            logging.info("Try is OK, all tests passed, going to clear env")
+            clear_ip_tables_and_restart_daemons()
+            logging.info("And going to sleep for some time")
+            if time.time() - start > MAX_TIME_SECONDS:
+                logging.info("Timeout reached, going to finish flaky check")
+                break
+            time.sleep(5)
+
+        logging.info("Finally all tests done, going to compress test dir")
+        test_logs = os.path.join(str(self.path()), "./test_dir.tar")
+        self._compress_logs("{}/tests/integration".format(repo_path), test_logs)
+        logging.info("Compression finished")
+
+        # Counter key -> status text used in the report, in reporting order.
+        # Results come from the counters of the last attempt that was run.
+        state_to_text = (("ERROR", "ERROR"), ("FAILED", "FAIL"), ("PASSED", "OK"))
+        test_result = []
+        for state, text_state in state_to_text:
+            test_result += [(c + ' (✕' + str(final_retry) + ')', text_state, str(tests_times[c])) for c in counters[state]]
+        status_text = description_prefix + ', '.join([str(n).lower().replace('failed', 'fail') + ': ' + str(len(c)) for n, c in counters.items()])
+
+        return result_state, status_text, test_result, [test_logs] + log_paths
+
+    def run_impl(self, repo_path, build_path):
+        # `self.flacky_check` is the boolean flag set in __init__, not a
+        # method: calling it raises TypeError. Dispatch to the
+        # `run_flacky_check` method instead.
+        if self.flacky_check:
+            return self.run_flacky_check(repo_path, build_path)
+
        self._install_clickhouse(build_path)
        logging.info("Dump iptables before run %s", subprocess.check_output("iptables -L", shell=True))
        all_tests = self._get_all_tests(repo_path)
@ -351,7 +441,7 @@ class ClickhouseIntegrationTestsRunner:
            logging.info("Shuffling test groups")
            random.shuffle(items_to_run)

-        for group, tests in items_to_run:
+        for group, tests in items_to_run[:10]:  #FIXME
            logging.info("Running test group %s countaining %s tests", group, len(tests))
            group_counters, group_test_times, log_name, log_path = self.run_test_group(repo_path, group, tests, MAX_RETRY)
            total_tests = 0
@ -413,14 +503,15 @@ if __name__ == "__main__":
    repo_path = os.environ.get("CLICKHOUSE_TESTS_REPO_PATH")
    build_path = os.environ.get("CLICKHOUSE_TESTS_BUILD_PATH")
    result_path = os.environ.get("CLICKHOUSE_TESTS_RESULT_PATH")
-    image_versions = os.environ.get("CLICKHOUSE_TESTS_IMAGE_VERSIONS", '{}')
-    shuffle_groups = int(os.environ.get("SHUFFLE_TEST_GROUPS", '0'))
+    params_path = os.environ.get("CLICKHOUSE_TESTS_JSON_PARAMS_PATH")

-    runner = ClickhouseIntegrationTestsRunner(result_path, json.loads(image_versions), shuffle_groups)
+    params = json.loads(open(params_path, 'r').read())
+    runner = ClickhouseIntegrationTestsRunner(result_path, params)

    logging.info("Running tests")
-    state, description, test_results, logs = runner.run_impl(None, None, None, repo_path, build_path)
+    state, description, test_results, logs = runner.run_impl(repo_path, build_path)
    logging.info("Tests finished")
+
    status = (state, description)
    out_results_file = os.path.join(str(runner.path()), "test_results.tsv")
    out_status_file = os.path.join(str(runner.path()), "check_status.tsv")
--- a/tests/integration/test_drop_replica/test.py
+++ b/tests/integration/test_drop_replica/test.py
@ -54,6 +54,7 @@ node_1_1 = cluster.add_instance('node_1_1', with_zookeeper=True, main_configs=['
 node_1_2 = cluster.add_instance('node_1_2', with_zookeeper=True, main_configs=['configs/remote_servers.xml'])
 node_1_3 = cluster.add_instance('node_1_3', with_zookeeper=True, main_configs=['configs/remote_servers.xml'])

+#FIXME it's just to run flacky check

@pytest.fixture(scope="module")
 def start_cluster():