From f1ca7e54d567ca9e229a041ff152a66b722c3715 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 4 Jan 2023 15:06:16 +0000 Subject: [PATCH] Make better --- docker/test/ci | 305 --------------------------------- docker/test/stress/Dockerfile | 3 +- docker/test/upgrade/Dockerfile | 3 +- docker/test/upgrade/run.sh | 1 + 4 files changed, 5 insertions(+), 307 deletions(-) delete mode 100755 docker/test/ci diff --git a/docker/test/ci b/docker/test/ci deleted file mode 100755 index d1860e9e14b..00000000000 --- a/docker/test/ci +++ /dev/null @@ -1,305 +0,0 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- -from multiprocessing import cpu_count -from subprocess import Popen, call, check_output, STDOUT -import os -import argparse -import logging -import time - - -def get_options(i, backward_compatibility_check): - options = [] - client_options = [] - if 0 < i: - options.append("--order=random") - - if i % 3 == 2 and not backward_compatibility_check: - options.append( - '''--db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i) - ) - client_options.append("allow_experimental_database_replicated=1") - - # If database name is not specified, new database is created for each functional test. - # Run some threads with one database for all tests. - if i % 2 == 1: - options.append(" --database=test_{}".format(i)) - - if i % 3 == 1: - client_options.append("join_use_nulls=1") - - if i % 2 == 1: - join_alg_num = i // 2 - if join_alg_num % 4 == 0: - client_options.append("join_algorithm='parallel_hash'") - if join_alg_num % 4 == 1: - client_options.append("join_algorithm='partial_merge'") - if join_alg_num % 4 == 2: - client_options.append("join_algorithm='full_sorting_merge'") - if join_alg_num % 4 == 3: - client_options.append("join_algorithm='auto'") - client_options.append('max_rows_in_join=1000') - - if i == 13: - client_options.append("memory_tracker_fault_probability=0.001") - - if i % 2 == 1 and not backward_compatibility_check: - client_options.append("group_by_use_nulls=1") - - if client_options: - options.append(" --client-option " + " ".join(client_options)) - - return " ".join(options) - - -def run_func_test( - cmd, - output_prefix, - num_processes, - skip_tests_option, - global_time_limit, - backward_compatibility_check, -): - backward_compatibility_check_option = ( - "--backward-compatibility-check" if backward_compatibility_check else "" - ) - global_time_limit_option = "" - if global_time_limit: - global_time_limit_option = "--global_time_limit={}".format(global_time_limit) - - output_paths = [ - os.path.join(output_prefix, "stress_test_run_{}.txt".format(i)) - for i in range(num_processes) - ] - pipes = [] - for i in range(0, len(output_paths)): - f = open(output_paths[i], "w") - full_command = "{} {} {} {} {} --stress".format( - cmd, - get_options(i, backward_compatibility_check), - global_time_limit_option, - skip_tests_option, - backward_compatibility_check_option, - ) - logging.info("Run func tests '%s'", full_command) - p = Popen(full_command, shell=True, stdout=f, stderr=f) - pipes.append(p) - time.sleep(0.5) - return pipes - - -def compress_stress_logs(output_path, files_prefix): - cmd = f"cd {output_path} && tar -zcf stress_run_logs.tar.gz {files_prefix}* && rm {files_prefix}*" - check_output(cmd, shell=True) - - -def call_with_retry(query, timeout=30, retry_count=5): - for i in range(retry_count): - code = call(query, shell=True, stderr=STDOUT, timeout=timeout) - if code != 0: - time.sleep(i) - else: - break - - -def make_query_command(query): - return f"""clickhouse client -q "{query}" --max_untracked_memory=1Gi --memory_profiler_step=1Gi --max_memory_usage_for_user=0""" - - -def prepare_for_hung_check(drop_databases): - # FIXME this function should not exist, but... - - # We attach gdb to clickhouse-server before running tests - # to print stacktraces of all crashes even if clickhouse cannot print it for some reason. - # However, it obstruct checking for hung queries. - logging.info("Will terminate gdb (if any)") - call_with_retry("kill -TERM $(pidof gdb)") - - # ThreadFuzzer significantly slows down server and causes false-positive hung check failures - call_with_retry("clickhouse client -q 'SYSTEM STOP THREAD FUZZER'") - - call_with_retry(make_query_command("SELECT 1 FORMAT Null")) - - # Some tests execute SYSTEM STOP MERGES or similar queries. - # It may cause some ALTERs to hang. - # Possibly we should fix tests and forbid to use such queries without specifying table. - call_with_retry(make_query_command("SYSTEM START MERGES")) - call_with_retry(make_query_command("SYSTEM START DISTRIBUTED SENDS")) - call_with_retry(make_query_command("SYSTEM START TTL MERGES")) - call_with_retry(make_query_command("SYSTEM START MOVES")) - call_with_retry(make_query_command("SYSTEM START FETCHES")) - call_with_retry(make_query_command("SYSTEM START REPLICATED SENDS")) - call_with_retry(make_query_command("SYSTEM START REPLICATION QUEUES")) - call_with_retry(make_query_command("SYSTEM DROP MARK CACHE")) - - # Issue #21004, live views are experimental, so let's just suppress it - call_with_retry(make_query_command("KILL QUERY WHERE upper(query) LIKE 'WATCH %'")) - - # Kill other queries which known to be slow - # It's query from 01232_preparing_sets_race_condition_long, it may take up to 1000 seconds in slow builds - call_with_retry( - make_query_command("KILL QUERY WHERE query LIKE 'insert into tableB select %'") - ) - # Long query from 00084_external_agregation - call_with_retry( - make_query_command( - "KILL QUERY WHERE query LIKE 'SELECT URL, uniq(SearchPhrase) AS u FROM test.hits GROUP BY URL ORDER BY u %'" - ) - ) - - if drop_databases: - for i in range(5): - try: - # Here we try to drop all databases in async mode. If some queries really hung, than drop will hung too. - # Otherwise we will get rid of queries which wait for background pool. It can take a long time on slow builds (more than 900 seconds). - # - # Also specify max_untracked_memory to allow 1GiB of memory to overcommit. - databases = ( - check_output( - make_query_command("SHOW DATABASES"), shell=True, timeout=30 - ) - .decode("utf-8") - .strip() - .split() - ) - for db in databases: - if db == "system": - continue - command = make_query_command(f'DETACH DATABASE {db}') - # we don't wait for drop - Popen(command, shell=True) - break - except Exception as ex: - logging.error( - "Failed to SHOW or DROP databasese, will retry %s", str(ex) - ) - time.sleep(i) - else: - raise Exception( - "Cannot drop databases after stress tests. Probably server consumed too much memory and cannot execute simple queries" - ) - - # Wait for last queries to finish if any, not longer than 300 seconds - call( - make_query_command( - """ - select sleepEachRow(( - select maxOrDefault(300 - elapsed) + 1 - from system.processes - where query not like '%from system.processes%' and elapsed < 300 - ) / 300) - from numbers(300) - format Null - """ - ), - shell=True, - stderr=STDOUT, - timeout=330, - ) - - # Even if all clickhouse-test processes are finished, there are probably some sh scripts, - # which still run some new queries. Let's ignore them. - try: - query = """clickhouse client -q "SELECT count() FROM system.processes where where elapsed > 300" """ - output = ( - check_output(query, shell=True, stderr=STDOUT, timeout=30) - .decode("utf-8") - .strip() - ) - if int(output) == 0: - return False - except: - pass - return True - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") - parser = argparse.ArgumentParser( - description="ClickHouse script for running stresstest" - ) - parser.add_argument("--test-cmd", default="/usr/bin/clickhouse-test") - parser.add_argument("--skip-func-tests", default="") - parser.add_argument("--client-cmd", default="clickhouse-client") - parser.add_argument("--server-log-folder", default="/var/log/clickhouse-server") - parser.add_argument("--output-folder") - parser.add_argument("--global-time-limit", type=int, default=1800) - parser.add_argument("--num-parallel", type=int, default=cpu_count()) - parser.add_argument("--backward-compatibility-check", action="store_true") - parser.add_argument("--hung-check", action="store_true", default=False) - # make sense only for hung check - parser.add_argument("--drop-databases", action="store_true", default=False) - - args = parser.parse_args() - if args.drop_databases and not args.hung_check: - raise Exception("--drop-databases only used in hung check (--hung-check)") - func_pipes = [] - func_pipes = run_func_test( - args.test_cmd, - args.output_folder, - args.num_parallel, - args.skip_func_tests, - args.global_time_limit, - args.backward_compatibility_check, - ) - - logging.info("Will wait functests to finish") - while True: - retcodes = [] - for p in func_pipes: - if p.poll() is not None: - retcodes.append(p.returncode) - if len(retcodes) == len(func_pipes): - break - logging.info("Finished %s from %s processes", len(retcodes), len(func_pipes)) - time.sleep(5) - - logging.info("All processes finished") - - logging.info("Compressing stress logs") - compress_stress_logs(args.output_folder, "stress_test_run_") - logging.info("Logs compressed") - - if args.hung_check: - try: - have_long_running_queries = prepare_for_hung_check(args.drop_databases) - except Exception as ex: - have_long_running_queries = True - logging.error("Failed to prepare for hung check %s", str(ex)) - logging.info("Checking if some queries hung") - cmd = " ".join( - [ - args.test_cmd, - # Do not track memory allocations up to 1Gi, - # this will allow to ignore server memory limit (max_server_memory_usage) for this query. - # - # NOTE: memory_profiler_step should be also adjusted, because: - # - # untracked_memory_limit = min(settings.max_untracked_memory, settings.memory_profiler_step) - # - # NOTE: that if there will be queries with GROUP BY, this trick - # will not work due to CurrentMemoryTracker::check() from - # Aggregator code. - # But right now it should work, since neither hung check, nor 00001_select_1 has GROUP BY. - "--client-option", - "max_untracked_memory=1Gi", - "max_memory_usage_for_user=0", - "memory_profiler_step=1Gi", - # Use system database to avoid CREATE/DROP DATABASE queries - "--database=system", - "--hung-check", - "--stress", - "00001_select_1", - ] - ) - res = call(cmd, shell=True, stderr=STDOUT) - hung_check_status = "No queries hung\tOK\n" - if res != 0 and have_long_running_queries: - logging.info("Hung check failed with exit code {}".format(res)) - hung_check_status = "Hung check failed\tFAIL\n" - with open( - os.path.join(args.output_folder, "test_results.tsv"), "w+" - ) as results: - results.write(hung_check_status) - - logging.info("Stress test finished") diff --git a/docker/test/stress/Dockerfile b/docker/test/stress/Dockerfile index 2778b63774d..1cabea58a65 100644 --- a/docker/test/stress/Dockerfile +++ b/docker/test/stress/Dockerfile @@ -22,7 +22,8 @@ RUN apt-get update -y \ netcat-openbsd \ telnet \ llvm-9 \ - brotli + brotli \ + && apt-get clean COPY run.sh / diff --git a/docker/test/upgrade/Dockerfile b/docker/test/upgrade/Dockerfile index c98220b3403..a91088fb01e 100644 --- a/docker/test/upgrade/Dockerfile +++ b/docker/test/upgrade/Dockerfile @@ -22,7 +22,8 @@ RUN apt-get update -y \ netcat-openbsd \ telnet \ llvm-9 \ - brotli + brotli \ + && apt-get clean COPY run.sh / diff --git a/docker/test/upgrade/run.sh b/docker/test/upgrade/run.sh index 1a107b6df2a..c0a7f279588 100644 --- a/docker/test/upgrade/run.sh +++ b/docker/test/upgrade/run.sh @@ -30,6 +30,7 @@ function configure() /usr/share/clickhouse-test/config/install.sh # we mount tests folder from repo to /usr/share + ln -s /usr/share/clickhouse-test/ci/stress.py /usr/bin/stress ln -s /usr/share/clickhouse-test/clickhouse-test /usr/bin/clickhouse-test ln -s /usr/share/clickhouse-test/ci/download_release_packages.py /usr/bin/download_release_packages ln -s /usr/share/clickhouse-test/ci/get_previous_release_tag.py /usr/bin/get_previous_release_tag