#!/usr/bin/env python from __future__ import print_function import atexit import os import os.path import re import signal import sys from argparse import ArgumentParser from argparse import FileType from pprint import pprint import shlex import subprocess from subprocess import check_call from subprocess import Popen from subprocess import PIPE from subprocess import CalledProcessError from datetime import datetime from time import sleep from errno import ESRCH try: import termcolor except ImportError: termcolor = None from random import random import commands import concurrent.futures import functools import multiprocessing from contextlib import closing MESSAGES_TO_RETRY = [ "DB::Exception: ZooKeeper session has been expired", "Coordination::Exception: Connection loss", ] def remove_control_characters(s): """ https://github.com/html5lib/html5lib-python/issues/96#issuecomment-43438438 """ def str_to_int(s, default, base=10): if int(s, base) < 0x10000: return unichr(int(s, base)) return default s = re.sub(r"&#(\d+);?", lambda c: str_to_int(c.group(1), c.group(0)), s) s = re.sub(r"&#[xX]([0-9a-fA-F]+);?", lambda c: str_to_int(c.group(1), c.group(0), base=16), s) s = re.sub(r"[\x00-\x08\x0b\x0e-\x1f\x7f]", "", s) return s def run_single_test(args, ext, server_logs_level, client_options, case_file, stdout_file, stderr_file): # print(client_options) params = { 'client': args.client_with_database, 'logs_level': server_logs_level, 'options': client_options, 'test': case_file, 'stdout': stdout_file, 'stderr': stderr_file, } pattern = '{test} > {stdout} 2> {stderr}' if ext == '.sql': pattern = "{client} --send_logs_level={logs_level} --testmode --multiquery {options} < " + pattern command = pattern.format(**params) #print(command) proc = Popen(command, shell=True, env=os.environ) start_time = datetime.now() while (datetime.now() - start_time).total_seconds() < args.timeout and proc.poll() is None: sleep(0.01) total_time = (datetime.now() - start_time).total_seconds() # Normalize randomized database names in stdout, stderr files. os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=args.database, file=stdout_file)) os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=args.database, file=stderr_file)) stdout = open(stdout_file, 'r').read() if os.path.exists(stdout_file) else '' stdout = unicode(stdout, errors='replace', encoding='utf-8') stderr = open(stderr_file, 'r').read() if os.path.exists(stderr_file) else '' stderr = unicode(stderr, errors='replace', encoding='utf-8') return proc, stdout, stderr, total_time def need_retry(stderr): return any(msg in stderr for msg in MESSAGES_TO_RETRY) def get_processlist(client_cmd): try: return subprocess.check_output("{} --query 'SHOW PROCESSLIST FORMAT Vertical'".format(client_cmd), shell=True) except: return "" # server seems dead # collect server stacktraces using gdb def get_stacktraces_from_gdb(server_pid): cmd = "gdb -batch -ex 'thread apply all backtrace' -p {}".format(server_pid) try: return subprocess.check_output(cmd, shell=True) except Exception as ex: return "Error occured while receiving stack traces from gdb: {}".format(str(ex)) # collect server stacktraces from system.stack_trace table def get_stacktraces_from_clickhouse(client): try: return subprocess.check_call("{} --allow_introspection_functions=1 --query \"SELECT arrayStringConcat(arrayMap(x, y -> concat(x, ': ', y), arrayMap(x -> addressToLine(x), trace), arrayMap(x -> demangle(addressToSymbol(x)), trace)), '\n') as trace FROM system.stack_trace format Vertical\"".format(client), shell=True) except Exception as ex: return "Error occured while receiving stack traces from client: {}".format(str(ex)) def get_server_pid(server_tcp_port): cmd = "lsof -i tcp:{port} -s tcp:LISTEN -Fp | awk '/^p[0-9]+$/{{print substr($0, 2)}}'".format(port=server_tcp_port) try: output = subprocess.check_output(cmd, shell=True) if output: return int(output) else: return None # server dead except Exception as ex: return None def colored(text, args, color=None, on_color=None, attrs=None): if termcolor and (sys.stdout.isatty() or args.force_color): return termcolor.colored(text, color, on_color, attrs) else: return text SERVER_DIED = False exit_code = 0 # def run_tests_array(all_tests, suite, suite_dir, suite_tmp_dir, run_total): def run_tests_array(tests, suite, suite_dir, suite_tmp_dir, run_total): global exit_code global SERVER_DIED OP_SQUARE_BRACKET = colored("[", args, attrs=['bold']) CL_SQUARE_BRACKET = colored("]", args, attrs=['bold']) MSG_FAIL = OP_SQUARE_BRACKET + colored(" FAIL ", args, "red", attrs=['bold']) + CL_SQUARE_BRACKET MSG_UNKNOWN = OP_SQUARE_BRACKET + colored(" UNKNOWN ", args, "yellow", attrs=['bold']) + CL_SQUARE_BRACKET MSG_OK = OP_SQUARE_BRACKET + colored(" OK ", args, "green", attrs=['bold']) + CL_SQUARE_BRACKET MSG_SKIPPED = OP_SQUARE_BRACKET + colored(" SKIPPED ", args, "cyan", attrs=['bold']) + CL_SQUARE_BRACKET passed_total = 0 skipped_total = 0 failures_total = 0 failures = 0 failures_chain = 0 client_options = get_additional_client_options(args) def print_test_time(test_time): if args.print_time: print(" {0:.2f} sec.".format(test_time), end='') if len(tests): print("\nRunning {} {} tests.".format(len(tests), suite) + "\n") for case in tests: if SERVER_DIED: break case_file = os.path.join(suite_dir, case) (name, ext) = os.path.splitext(case) try: sys.stdout.write("{0:72}".format(name + ": ")) if run_total == 1: sys.stdout.flush() if args.skip and any(s in name for s in args.skip): print(MSG_SKIPPED + " - skip") skipped_total += 1 elif not args.zookeeper and 'zookeeper' in name: print(MSG_SKIPPED + " - no zookeeper") skipped_total += 1 elif not args.shard and ('shard' in name or 'distributed' in name or 'global' in name): print(MSG_SKIPPED + " - no shard") skipped_total += 1 elif not args.no_long and ('long' in name # Tests for races and deadlocks usually are runned in loop # for significant amount of time or 'deadlock' in name or 'race' in name): print(MSG_SKIPPED + " - no long") skipped_total += 1 else: disabled_file = os.path.join(suite_dir, name) + '.disabled' if os.path.exists(disabled_file) and not args.disabled: message = open(disabled_file, 'r').read() print(MSG_SKIPPED + " - " + message) else: if args.testname: clickhouse_proc = Popen(shlex.split(args.client_with_database), stdin=PIPE, stdout=PIPE, stderr=PIPE) clickhouse_proc.communicate("SELECT 'Running test {suite}/{case} from pid={pid}';".format(pid = os.getpid(), case = case, suite = suite)) reference_file = os.path.join(suite_dir, name) + '.reference' stdout_file = os.path.join(suite_tmp_dir, name) + '.stdout' stderr_file = os.path.join(suite_tmp_dir, name) + '.stderr' proc, stdout, stderr, total_time = run_single_test(args, ext, server_logs_level, client_options, case_file, stdout_file, stderr_file) if proc.returncode is None: try: proc.kill() except OSError as e: if e.errno != ESRCH: raise failures += 1 print(MSG_FAIL, end='') print_test_time(total_time) print(" - Timeout!") else: counter = 1 while proc.returncode != 0 and need_retry(stderr): proc, stdout, stderr, total_time = run_single_test(args, ext, server_logs_level, client_options, case_file, stdout_file, stderr_file) sleep(2**counter) counter += 1 if counter > 6: break if proc.returncode != 0: failures += 1 failures_chain += 1 print(MSG_FAIL, end='') print_test_time(total_time) print(" - return code {}".format(proc.returncode)) if stderr: print(stderr.encode('utf-8')) if args.stop and ('Connection refused' in stderr or 'Attempt to read after eof' in stderr) and not 'Received exception from server' in stderr: SERVER_DIED = True elif stderr: failures += 1 failures_chain += 1 print(MSG_FAIL, end='') print_test_time(total_time) print(" - having stderror:\n{}".format(stderr.encode('utf-8'))) elif 'Exception' in stdout: failures += 1 failures_chain += 1 print(MSG_FAIL, end='') print_test_time(total_time) print(" - having exception:\n{}".format(stdout.encode('utf-8'))) elif not os.path.isfile(reference_file): print(MSG_UNKNOWN, end='') print_test_time(total_time) print(" - no reference file") else: result_is_different = subprocess.call(['diff', '-q', reference_file, stdout_file], stdout = PIPE) if result_is_different: diff = Popen(['diff', '-U', str(args.unified), reference_file, stdout_file], stdout = PIPE).communicate()[0] failures += 1 print(MSG_FAIL, end='') print_test_time(total_time) print(" - result differs with reference:\n{}".format(diff)) else: passed_total += 1 failures_chain = 0 print(MSG_OK, end='') print_test_time(total_time) print() if os.path.exists(stdout_file): os.remove(stdout_file) if os.path.exists(stderr_file): os.remove(stderr_file) except KeyboardInterrupt: print(colored("Break tests execution", args, "red")) SERVER_DIED = 1 break except: import traceback exc_type, exc_value, tb = sys.exc_info() failures += 1 print("{0} - Test internal error: {1}\n{2}\n{3}".format(MSG_FAIL, exc_type.__name__, exc_value, "\n".join(traceback.format_tb(tb, 10)))) if failures_chain >= 20: break failures_total = failures_total + failures if failures_total > 0: print(colored("\nHaving {failures_total} errors! {passed_total} tests passed. {skipped_total} tests skipped.".format(passed_total = passed_total, skipped_total = skipped_total, failures_total = failures_total), args, "red", attrs=["bold"])) exit_code = 1 else: print(colored("\n{passed_total} tests passed. {skipped_total} tests skipped.".format(passed_total = passed_total, skipped_total = skipped_total), args, "green", attrs=["bold"])) server_logs_level = "warning" def atexit_kill_process_group(): # Ignore the terminate signal in this process, so that the exit code is # preserved. Don't use SIG_IGN because is broken: # https://bugs.python.org/issue23395 def ignore_signal(signal, frame): pass signal.signal(signal.SIGTERM, ignore_signal) os.killpg(0, signal.SIGTERM) # SIG_DFL is broken: https://bugs.python.org/issue23395 original_signal_handlers = {} def signal_kill_process_group(signal_number, frame): handler = signal.signal(signal_number, original_signal_handlers[signal_number]) os.killpg(0, signal_number) # Restore the handler back in case the signal was not deadly. signal.signal(signal_number, handler) def register_killgroup_callbacks(): atexit.register(atexit_kill_process_group) original_signal_handlers[signal.SIGHUP] = signal.signal(signal.SIGHUP, signal_kill_process_group) original_signal_handlers[signal.SIGINT] = signal.signal(signal.SIGINT, signal_kill_process_group) original_signal_handlers[signal.SIGTERM] = signal.signal(signal.SIGTERM, signal_kill_process_group) def unregister_killgroup_callbacks(): # Subprocesses will inherit the atexit callback and the signal handlers, so # we have to unregister them there. # GODHELPME atexit.unregister is somehow unavailable in my python 3.6.9 so # the subprocess fails silently. Commented out for now. # atexit.unregister(atexit_kill_process_group) signal.signal(signal.SIGHUP, original_signal_handlers[signal.SIGHUP]) signal.signal(signal.SIGINT, original_signal_handlers[signal.SIGINT]) signal.signal(signal.SIGTERM, original_signal_handlers[signal.SIGTERM]) def run_tests_array_subprocess(params): unregister_killgroup_callbacks() run_tests_array(**params) def main(args): global SERVER_DIED global exit_code global server_logs_level def is_data_present(): clickhouse_proc = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE) (stdout, stderr) = clickhouse_proc.communicate("EXISTS TABLE test.hits") if clickhouse_proc.returncode != 0: raise CalledProcessError(clickhouse_proc.returncode, args.client, stderr) return stdout.startswith('1') base_dir = os.path.abspath(args.queries) tmp_dir = os.path.abspath(args.tmp) # Keep same default values as in queries/shell_config.sh os.environ.setdefault("CLICKHOUSE_BINARY", args.binary) #os.environ.setdefault("CLICKHOUSE_CLIENT", args.client) os.environ.setdefault("CLICKHOUSE_CONFIG", args.configserver) if args.configclient: os.environ.setdefault("CLICKHOUSE_CONFIG_CLIENT", args.configclient) os.environ.setdefault("CLICKHOUSE_TMP", tmp_dir) os.environ.setdefault("CLICKHOUSE_DATABASE", args.database) # Force to print server warnings in stderr # Shell scripts could change logging level os.environ.setdefault("CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL", server_logs_level) if args.zookeeper is None: code, out = commands.getstatusoutput(args.extract_from_config +" --try --config " + args.configserver + ' --key zookeeper | grep . | wc -l') try: if int(out) > 0: args.zookeeper = True else: args.zookeeper = False except ValueError: args.zookeeper = False if args.shard is None: code, out = commands.getstatusoutput(args.extract_from_config + " --try --config " + args.configserver + ' --key listen_host | grep -E "127.0.0.2|::"') if out: args.shard = True else: args.shard = False clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE) clickhouse_proc_create.communicate("CREATE DATABASE IF NOT EXISTS " + args.database) if args.database != "test": clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE) clickhouse_proc_create.communicate("CREATE DATABASE IF NOT EXISTS test") def is_test_from_dir(suite_dir, case): case_file = os.path.join(suite_dir, case) (name, ext) = os.path.splitext(case) return os.path.isfile(case_file) and (ext == '.sql' or ext == '.sh' or ext == '.py') def sute_key_func(item): if args.order == 'random': return random() if -1 == item.find('_'): return 99998 prefix, suffix = item.split('_', 1) try: return int(prefix), suffix except ValueError: return 99997 total_tests_run = 0 for suite in sorted(os.listdir(base_dir), key=sute_key_func): if SERVER_DIED: break suite_dir = os.path.join(base_dir, suite) suite_re_obj = re.search('^[0-9]+_(.*)$', suite) if not suite_re_obj: #skip .gitignore and so on continue suite_tmp_dir = os.path.join(tmp_dir, suite) if not os.path.exists(suite_tmp_dir): os.makedirs(suite_tmp_dir) suite = suite_re_obj.group(1) if os.path.isdir(suite_dir): if 'stateful' in suite and not args.no_stateful and not is_data_present(): print("Won't run stateful tests because test data wasn't loaded.") continue if 'stateless' in suite and args.no_stateless: print("Won't run stateless tests because they were manually disabled.") continue if 'stateful' in suite and args.no_stateful: print("Won't run stateful tests because they were manually disabled.") continue # Reverse sort order: we want run newest test first. # And not reverse subtests def key_func(item): if args.order == 'random': return random() reverse = 1 if args.order == 'asc' else -1 if -1 == item.find('_'): return 99998 prefix, suffix = item.split('_', 1) try: return reverse * int(prefix), suffix except ValueError: return 99997 all_tests = os.listdir(suite_dir) all_tests = filter(lambda case: is_test_from_dir(suite_dir, case), all_tests) if args.test: all_tests = [t for t in all_tests if any([re.search(r, t) for r in args.test])] all_tests.sort(key=key_func) run_n, run_total = [int(x) for x in args.parallel.split('/')] tests_n = len(all_tests) if run_total > tests_n: run_total = tests_n if run_n > run_total: continue jobs = args.jobs if jobs > tests_n: jobs = tests_n if jobs > run_total: run_total = jobs tests_per_group = tests_n / run_total all_tests_array = [] for n in range(0, run_total): start = tests_per_group * n end = tests_per_group * (n + 1) if n == run_total - 1: end = len(all_tests) all_tests_array.append(dict( tests=all_tests[start : end], suite=suite, suite_dir=suite_dir, suite_tmp_dir=suite_tmp_dir, run_total=run_total)) assert(functools.reduce(lambda x, y: x + len(y['tests']), all_tests_array, 0) == len(all_tests)) if jobs > 1: # Handling of KeyboardInterrupt in multiprocessing.Pool # is broken: https://bugs.python.org/msg221549 with concurrent.futures.ProcessPoolExecutor() as pool: pool.map(run_tests_array_subprocess, all_tests_array) else: run_tests_array(**all_tests_array[int(run_n)-1]) total_tests_run += tests_n if args.hung_check: processlist = get_processlist(args.client_with_database) if processlist: print(colored("\nFound hung queries in processlist:", args, "red", attrs=["bold"])) print(processlist) clickhouse_tcp_port = os.getenv("CLICKHOUSE_PORT_TCP", '9000') server_pid = get_server_pid(clickhouse_tcp_port) if server_pid: print("\nLocated ClickHouse server process {} listening at TCP port {}".format(server_pid, clickhouse_tcp_port)) print("\nCollecting stacktraces from system.stacktraces table:") print(get_stacktraces_from_clickhouse(args.client)) print("\nCollecting stacktraces from all running threads with gdb:") print(get_stacktraces_from_gdb(server_pid)) else: print( colored( "\nUnable to locate ClickHouse server process listening at TCP port {}. " "It must have crashed or exited prematurely!".format(clickhouse_tcp_port), args, "red", attrs=["bold"])) exit_code = 1 else: print(colored("\nNo queries hung.", args, "green", attrs=["bold"])) if total_tests_run == 0: print("No tests were run.") sys.exit(1) sys.exit(exit_code) def find_binary(name): if os.path.exists(name) and os.access(name, os.X_OK): return True paths = os.environ.get("PATH").split(':') for path in paths: if os.access(os.path.join(path, name), os.X_OK): return True # maybe it wasn't in PATH if os.access(os.path.join('/usr/local/bin', name), os.X_OK): return True if os.access(os.path.join('/usr/bin', name), os.X_OK): return True return False def get_additional_client_options(args): if args.client_option: return ' '.join('--' + option for option in args.client_option) return '' def get_additional_client_options_url(args): if args.client_option: return '&'.join(args.client_option) return '' if __name__ == '__main__': # Move to a new process group and kill it at exit so that we don't have any # infinite tests processes left. os.setpgid(0, 0) register_killgroup_callbacks() parser=ArgumentParser(description='ClickHouse functional tests') parser.add_argument('-q', '--queries', help='Path to queries dir') parser.add_argument('--tmp', help='Path to tmp dir') parser.add_argument('-b', '--binary', default='clickhouse', help='Path to clickhouse binary or name of binary in PATH') parser.add_argument('-c', '--client', help='Client program') parser.add_argument('--extract_from_config', help='extract-from-config program') parser.add_argument('--configclient', help='Client config (if you use not default ports)') parser.add_argument('--configserver', default= '/etc/clickhouse-server/config.xml', help='Preprocessed server config') parser.add_argument('-o', '--output', help='Output xUnit compliant test report directory') parser.add_argument('-t', '--timeout', type=int, default=600, help='Timeout for each test case in seconds') parser.add_argument('test', nargs='*', help='Optional test case name regex') parser.add_argument('-d', '--disabled', action='store_true', default=False, help='Also run disabled tests') parser.add_argument('--stop', action='store_true', default=None, dest='stop', help='Stop on network errors') parser.add_argument('--order', default='desc', choices=['asc', 'desc', 'random'], help='Run order') parser.add_argument('--testname', action='store_true', default=None, dest='testname', help='Make query with test name before test run') parser.add_argument('--hung-check', action='store_true', default=False) parser.add_argument('--force-color', action='store_true', default=False) parser.add_argument('--database', help='Database for tests (random name test_XXXXXX by default)') parser.add_argument('--parallel', default='1/1', help='One parallel test run number/total') parser.add_argument('-j', '--jobs', default=1, nargs='?', type=int, help='Run all tests in parallel') parser.add_argument('-U', '--unified', default=3, type=int, help='output NUM lines of unified context') parser.add_argument('--no-stateless', action='store_true', help='Disable all stateless tests') parser.add_argument('--no-stateful', action='store_true', help='Disable all stateful tests') parser.add_argument('--skip', nargs='+', help="Skip these tests") parser.add_argument('--no-long', action='store_false', dest='no_long', help='Do not run long tests') parser.add_argument('--client-option', nargs='+', help='Specify additional client argument') parser.add_argument('--print-time', action='store_true', dest='print_time', help='Print test time') group=parser.add_mutually_exclusive_group(required=False) group.add_argument('--zookeeper', action='store_true', default=None, dest='zookeeper', help='Run zookeeper related tests') group.add_argument('--no-zookeeper', action='store_false', default=None, dest='zookeeper', help='Do not run zookeeper related tests') group=parser.add_mutually_exclusive_group(required=False) group.add_argument('--shard', action='store_true', default=None, dest='shard', help='Run sharding related tests (required to clickhouse-server listen 127.0.0.2 127.0.0.3)') group.add_argument('--no-shard', action='store_false', default=None, dest='shard', help='Do not run shard related tests') args = parser.parse_args() if args.queries is None and os.path.isdir('queries'): args.queries = 'queries' elif args.queries is None: if (os.path.isdir('/usr/local/share/clickhouse-test/queries')): args.queries = '/usr/local/share/clickhouse-test/queries' if (args.queries is None and os.path.isdir('/usr/share/clickhouse-test/queries')): args.queries = '/usr/share/clickhouse-test/queries' if args.tmp is None: args.tmp = '/tmp/clickhouse-test' if args.queries is None: print_err("Failed to detect path to the queries directory. Please specify it with '--queries' option.") exit(1) if args.tmp is None: args.tmp = args.queries if args.client is None: if find_binary(args.binary + '-client'): args.client = args.binary + '-client' elif find_binary(args.binary): args.client = args.binary + ' client' else: print("No 'clickhouse' binary found in PATH") parser.print_help() exit(1) if args.configclient: args.client += ' --config-file=' + args.configclient if os.getenv("CLICKHOUSE_HOST"): args.client += ' --host=' + os.getenv("CLICKHOUSE_HOST") if os.getenv("CLICKHOUSE_PORT_TCP"): args.client += ' --port=' + os.getenv("CLICKHOUSE_PORT_TCP") if os.getenv("CLICKHOUSE_DATABASE"): args.client += ' --database=' + os.getenv("CLICKHOUSE_DATABASE") if args.client_option: # Set options for client if 'CLICKHOUSE_CLIENT_OPT' in os.environ: os.environ['CLICKHOUSE_CLIENT_OPT'] += ' ' else: os.environ['CLICKHOUSE_CLIENT_OPT'] = '' os.environ['CLICKHOUSE_CLIENT_OPT'] += get_additional_client_options(args) # Set options for curl if 'CLICKHOUSE_URL_PARAMS' in os.environ: os.environ['CLICKHOUSE_URL_PARAMS'] += '&' else: os.environ['CLICKHOUSE_URL_PARAMS'] = '' os.environ['CLICKHOUSE_URL_PARAMS'] += get_additional_client_options_url(args) args.client_with_database = args.client if not args.database: def random_str(length=6): import random import string alphabet = string.ascii_lowercase + string.digits return ''.join(random.choice(alphabet) for _ in range(length)) args.database = 'test_{suffix}'.format(suffix=random_str()) args.client_with_database += ' --database=' + args.database if args.extract_from_config is None: if os.access(args.binary + '-extract-from-config', os.X_OK): args.extract_from_config = args.binary + '-extract-from-config' else: args.extract_from_config = args.binary + ' extract-from-config' if args.jobs is None: args.jobs = multiprocessing.cpu_count() main(args)