Make better

2024-11-29 11:02:08 +00:00 · 2023-01-04 15:06:16 +00:00 · 2023-01-04 15:06:16 +00:00 · f1ca7e54d5
commit f1ca7e54d5
parent 399c9aa235
4 changed files with 5 additions and 307 deletions
--- a/docker/test/ci
+++ b/docker/test/ci
@ -1,305 +0,0 @@
-#!/usr/bin/env python3
-# -*- coding: utf-8 -*-
-from multiprocessing import cpu_count
-from subprocess import Popen, call, check_output, STDOUT
-import os
-import argparse
-import logging
-import time
-
-
-def get_options(i, backward_compatibility_check):
-    options = []
-    client_options = []
-    if 0 < i:
-        options.append("--order=random")
-
-    if i % 3 == 2 and not backward_compatibility_check:
-        options.append(
-            '''--db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i)
-        )
-        client_options.append("allow_experimental_database_replicated=1")
-
-    # If database name is not specified, new database is created for each functional test.
-    # Run some threads with one database for all tests.
-    if i % 2 == 1:
-        options.append(" --database=test_{}".format(i))
-
-    if i % 3 == 1:
-        client_options.append("join_use_nulls=1")
-
-    if i % 2 == 1:
-        join_alg_num = i // 2
-        if join_alg_num % 4 == 0:
-            client_options.append("join_algorithm='parallel_hash'")
-        if join_alg_num % 4 == 1:
-            client_options.append("join_algorithm='partial_merge'")
-        if join_alg_num % 4 == 2:
-            client_options.append("join_algorithm='full_sorting_merge'")
-        if join_alg_num % 4 == 3:
-            client_options.append("join_algorithm='auto'")
-            client_options.append('max_rows_in_join=1000')
-
-    if i == 13:
-        client_options.append("memory_tracker_fault_probability=0.001")
-
-    if i % 2 == 1 and not backward_compatibility_check:
-        client_options.append("group_by_use_nulls=1")
-
-    if client_options:
-        options.append(" --client-option " + " ".join(client_options))
-
-    return " ".join(options)
-
-
-def run_func_test(
-    cmd,
-    output_prefix,
-    num_processes,
-    skip_tests_option,
-    global_time_limit,
-    backward_compatibility_check,
-):
-    backward_compatibility_check_option = (
-        "--backward-compatibility-check" if backward_compatibility_check else ""
-    )
-    global_time_limit_option = ""
-    if global_time_limit:
-        global_time_limit_option = "--global_time_limit={}".format(global_time_limit)
-
-    output_paths = [
-        os.path.join(output_prefix, "stress_test_run_{}.txt".format(i))
-        for i in range(num_processes)
-    ]
-    pipes = []
-    for i in range(0, len(output_paths)):
-        f = open(output_paths[i], "w")
-        full_command = "{} {} {} {} {} --stress".format(
-            cmd,
-            get_options(i, backward_compatibility_check),
-            global_time_limit_option,
-            skip_tests_option,
-            backward_compatibility_check_option,
-        )
-        logging.info("Run func tests '%s'", full_command)
-        p = Popen(full_command, shell=True, stdout=f, stderr=f)
-        pipes.append(p)
-        time.sleep(0.5)
-    return pipes
-
-
-def compress_stress_logs(output_path, files_prefix):
-    cmd = f"cd {output_path} && tar -zcf stress_run_logs.tar.gz {files_prefix}* && rm {files_prefix}*"
-    check_output(cmd, shell=True)
-
-
-def call_with_retry(query, timeout=30, retry_count=5):
-    for i in range(retry_count):
-        code = call(query, shell=True, stderr=STDOUT, timeout=timeout)
-        if code != 0:
-            time.sleep(i)
-        else:
-            break
-
-
-def make_query_command(query):
-    return f"""clickhouse client -q "{query}" --max_untracked_memory=1Gi --memory_profiler_step=1Gi --max_memory_usage_for_user=0"""
-
-
-def prepare_for_hung_check(drop_databases):
-    # FIXME this function should not exist, but...
-
-    # We attach gdb to clickhouse-server before running tests
-    # to print stacktraces of all crashes even if clickhouse cannot print it for some reason.
-    # However, it obstruct checking for hung queries.
-    logging.info("Will terminate gdb (if any)")
-    call_with_retry("kill -TERM $(pidof gdb)")
-
-    # ThreadFuzzer significantly slows down server and causes false-positive hung check failures
-    call_with_retry("clickhouse client -q 'SYSTEM STOP THREAD FUZZER'")
-
-    call_with_retry(make_query_command("SELECT 1 FORMAT Null"))
-
-    # Some tests execute SYSTEM STOP MERGES or similar queries.
-    # It may cause some ALTERs to hang.
-    # Possibly we should fix tests and forbid to use such queries without specifying table.
-    call_with_retry(make_query_command("SYSTEM START MERGES"))
-    call_with_retry(make_query_command("SYSTEM START DISTRIBUTED SENDS"))
-    call_with_retry(make_query_command("SYSTEM START TTL MERGES"))
-    call_with_retry(make_query_command("SYSTEM START MOVES"))
-    call_with_retry(make_query_command("SYSTEM START FETCHES"))
-    call_with_retry(make_query_command("SYSTEM START REPLICATED SENDS"))
-    call_with_retry(make_query_command("SYSTEM START REPLICATION QUEUES"))
-    call_with_retry(make_query_command("SYSTEM DROP MARK CACHE"))
-
-    # Issue #21004, live views are experimental, so let's just suppress it
-    call_with_retry(make_query_command("KILL QUERY WHERE upper(query) LIKE 'WATCH %'"))
-
-    # Kill other queries which known to be slow
-    # It's query from 01232_preparing_sets_race_condition_long, it may take up to 1000 seconds in slow builds
-    call_with_retry(
-        make_query_command("KILL QUERY WHERE query LIKE 'insert into tableB select %'")
-    )
-    # Long query from 00084_external_agregation
-    call_with_retry(
-        make_query_command(
-            "KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM test.hits GROUP BY URL ORDER BY u %'"
-        )
-    )
-
-    if drop_databases:
-        for i in range(5):
-            try:
-                # Here we try to drop all databases in async mode. If some queries really hung, than drop will hung too.
-                # Otherwise we will get rid of queries which wait for background pool. It can take a long time on slow builds (more than 900 seconds).
-                #
-                # Also specify max_untracked_memory to allow 1GiB of memory to overcommit.
-                databases = (
-                    check_output(
-                        make_query_command("SHOW DATABASES"), shell=True, timeout=30
-                    )
-                    .decode("utf-8")
-                    .strip()
-                    .split()
-                )
-                for db in databases:
-                    if db == "system":
-                        continue
-                    command = make_query_command(f'DETACH DATABASE {db}')
-                    # we don't wait for drop
-                    Popen(command, shell=True)
-                break
-            except Exception as ex:
-                logging.error(
-                    "Failed to SHOW or DROP databasese, will retry %s", str(ex)
-                )
-                time.sleep(i)
-        else:
-            raise Exception(
-                "Cannot drop databases after stress tests. Probably server consumed too much memory and cannot execute simple queries"
-            )
-
-    # Wait for last queries to finish if any, not longer than 300 seconds
-    call(
-        make_query_command(
-            """
-    select sleepEachRow((
-        select maxOrDefault(300 - elapsed) + 1
-        from system.processes
-        where query not like '%from system.processes%' and elapsed < 300
-    ) / 300)
-    from numbers(300)
-    format Null
-    """
-        ),
-        shell=True,
-        stderr=STDOUT,
-        timeout=330,
-    )
-
-    # Even if all clickhouse-test processes are finished, there are probably some sh scripts,
-    # which still run some new queries. Let's ignore them.
-    try:
-        query = """clickhouse client -q "SELECT count() FROM system.processes where where elapsed > 300" """
-        output = (
-            check_output(query, shell=True, stderr=STDOUT, timeout=30)
-            .decode("utf-8")
-            .strip()
-        )
-        if int(output) == 0:
-            return False
-    except:
-        pass
-    return True
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
-    parser = argparse.ArgumentParser(
-        description="ClickHouse script for running stresstest"
-    )
-    parser.add_argument("--test-cmd", default="/usr/bin/clickhouse-test")
-    parser.add_argument("--skip-func-tests", default="")
-    parser.add_argument("--client-cmd", default="clickhouse-client")
-    parser.add_argument("--server-log-folder", default="/var/log/clickhouse-server")
-    parser.add_argument("--output-folder")
-    parser.add_argument("--global-time-limit", type=int, default=1800)
-    parser.add_argument("--num-parallel", type=int, default=cpu_count())
-    parser.add_argument("--backward-compatibility-check", action="store_true")
-    parser.add_argument("--hung-check", action="store_true", default=False)
-    # make sense only for hung check
-    parser.add_argument("--drop-databases", action="store_true", default=False)
-
-    args = parser.parse_args()
-    if args.drop_databases and not args.hung_check:
-        raise Exception("--drop-databases only used in hung check (--hung-check)")
-    func_pipes = []
-    func_pipes = run_func_test(
-        args.test_cmd,
-        args.output_folder,
-        args.num_parallel,
-        args.skip_func_tests,
-        args.global_time_limit,
-        args.backward_compatibility_check,
-    )
-
-    logging.info("Will wait functests to finish")
-    while True:
-        retcodes = []
-        for p in func_pipes:
-            if p.poll() is not None:
-                retcodes.append(p.returncode)
-        if len(retcodes) == len(func_pipes):
-            break
-        logging.info("Finished %s from %s processes", len(retcodes), len(func_pipes))
-        time.sleep(5)
-
-    logging.info("All processes finished")
-
-    logging.info("Compressing stress logs")
-    compress_stress_logs(args.output_folder, "stress_test_run_")
-    logging.info("Logs compressed")
-
-    if args.hung_check:
-        try:
-            have_long_running_queries = prepare_for_hung_check(args.drop_databases)
-        except Exception as ex:
-            have_long_running_queries = True
-            logging.error("Failed to prepare for hung check %s", str(ex))
-        logging.info("Checking if some queries hung")
-        cmd = " ".join(
-            [
-                args.test_cmd,
-                # Do not track memory allocations up to 1Gi,
-                # this will allow to ignore server memory limit (max_server_memory_usage) for this query.
-                #
-                # NOTE: memory_profiler_step should be also adjusted, because:
-                #
-                #     untracked_memory_limit = min(settings.max_untracked_memory, settings.memory_profiler_step)
-                #
-                # NOTE: that if there will be queries with GROUP BY, this trick
-                # will not work due to CurrentMemoryTracker::check() from
-                # Aggregator code.
-                # But right now it should work, since neither hung check, nor 00001_select_1 has GROUP BY.
-                "--client-option",
-                "max_untracked_memory=1Gi",
-                "max_memory_usage_for_user=0",
-                "memory_profiler_step=1Gi",
-                # Use system database to avoid CREATE/DROP DATABASE queries
-                "--database=system",
-                "--hung-check",
-                "--stress",
-                "00001_select_1",
-            ]
-        )
-        res = call(cmd, shell=True, stderr=STDOUT)
-        hung_check_status = "No queries hung\tOK\n"
-        if res != 0 and have_long_running_queries:
-            logging.info("Hung check failed with exit code {}".format(res))
-            hung_check_status = "Hung check failed\tFAIL\n"
-        with open(
-            os.path.join(args.output_folder, "test_results.tsv"), "w+"
-        ) as results:
-            results.write(hung_check_status)
-
-    logging.info("Stress test finished")
--- a/docker/test/stress/Dockerfile
+++ b/docker/test/stress/Dockerfile
@ -22,7 +22,8 @@ RUN apt-get update -y \
            netcat-openbsd \
            telnet \
            llvm-9 \
-            brotli
+            brotli \
+    && apt-get clean

 COPY run.sh /

--- a/docker/test/upgrade/Dockerfile
+++ b/docker/test/upgrade/Dockerfile
@ -22,7 +22,8 @@ RUN apt-get update -y \
            netcat-openbsd \
            telnet \
            llvm-9 \
-            brotli
+            brotli \
+    && apt-get clean

 COPY run.sh /

--- a/docker/test/upgrade/run.sh
+++ b/docker/test/upgrade/run.sh
@ -30,6 +30,7 @@ function configure()
    /usr/share/clickhouse-test/config/install.sh

    # we mount tests folder from repo to /usr/share
+    ln -s /usr/share/clickhouse-test/ci/stress.py /usr/bin/stress
    ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test
    ln -s /usr/share/clickhouse-test/ci/download_release_packages.py /usr/bin/download_release_packages
    ln -s /usr/share/clickhouse-test/ci/get_previous_release_tag.py /usr/bin/get_previous_release_tag