diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh
new file mode 100755
index 00000000000..4cd3e39522e
--- /dev/null
+++ b/docker/test/performance-comparison/compare.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+set -ex
+set -o pipefail
+trap "exit" INT TERM
+trap "kill 0" EXIT
+
+mkdir left ||:
+mkdir right ||:
+mkdir db0 ||:
+
+left_pr=$1
+left_sha=$2
+
+right_pr=$3
+right_sha=$4
+
+function download
+{
+    la="$left_pr-$left_sha.tgz"
+    ra="$right_pr-$right_sha.tgz"
+    wget -nd -c "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/performance/performance.tgz" -O "$la" && tar -C left --strip-components=1 -zxvf "$la" &
+    wget -nd -c "https://clickhouse-builds.s3.yandex.net/$right_pr/$right_sha/performance/performance.tgz" -O "$ra" && tar -C right --strip-components=1 -zxvf "$ra" &
+    cd db0 && wget -nd -c "https://s3.mds.yandex.net/clickhouse-private-datasets/hits_10m_single/partitions/hits_10m_single.tar" && tar -xvf hits_10m_single.tar &
+    cd db0 && wget -nd -c "https://s3.mds.yandex.net/clickhouse-private-datasets/hits_100m_single/partitions/hits_100m_single.tar" && tar -xvf hits_100m_single.tar &
+    #cd db0 && wget -nd -c "https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar" && tar -xvf hits_v1.tar &
+    wait
+
+    # Use hardlinks instead of copying
+    rm -r left/db ||:
+    rm -r right/db ||:
+    cp -al db0/ left/db/
+    cp -al db0/ right/db/
+}
+
+#download
+
+function configure
+{
+    sed -i 's/9000/9001/g' right/config/config.xml
+
+    cat > right/config/config.d/perf-test-tweaks.xml <<EOF
+<yandex>
+    <logger>
+        <console>true</console>
+    </logger>
+</yandex>
+
+EOF
+
+    cp right/config/config.d/perf-test-tweaks.xml left/config/config.d/perf-test-tweaks.xml
+}
+
+configure
+
+
+function restart
+{
+    while killall clickhouse ; do echo . ; sleep 1 ; done
+    echo all killed
+
+    # Spawn servers in their own process groups
+    set -m
+
+    left/clickhouse server --config-file=left/config/config.xml -- --path left/db &> left/log.txt &
+    left_pid=$!
+    kill -0 $left_pid
+    disown $left_pid
+
+    right/clickhouse server --config-file=right/config/config.xml -- --path right/db &> right/log.txt &
+    right_pid=$!
+    kill -0 $right_pid
+    disown $right_pid
+
+    set +m
+
+    while ! left/clickhouse client --query "select 1" ; do kill -0 $left_pid ; echo . ; sleep 1 ; done
+    echo left ok
+
+    while ! right/clickhouse client --port 9001 --query "select 1" ; do kill -0 $right_pid ; echo . ; sleep 1 ; done
+    echo right ok
+}
+
+restart
+
+for test in ch/dbms/tests/performance/*.xml
+do
+    test_name=$(basename $test ".xml")
+    ./perf.py "$test" > "$test_name-raw.tsv" || continue
+    right/clickhouse local --file "$test_name-raw.tsv" --structure 'query text, run int, version UInt32, time float' --query "$(cat eqmed.sql)" > "$test_name-report.tsv"
+done
+
+#while killall clickhouse ; do echo . ; sleep 1 ; done
+#echo ok
+
+
+
+
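compare.sh takes four positional arguments: the PR number and commit SHA of the baseline ("left") build, then the PR number and SHA of the tested ("right") build. A hypothetical invocation, with made-up PR numbers and SHAs, would be ./compare.sh 12345 0123abcd 67890 4567cdef. The configure step moves the right server to port 9001, both servers run on hardlinked copies of db0, and each test's raw per-run timings are reduced to a report by eqmed.sql below.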
diff --git a/docker/test/performance-comparison/eqmed.sql b/docker/test/performance-comparison/eqmed.sql
new file mode 100644
index 00000000000..dd677b750a0
--- /dev/null
+++ b/docker/test/performance-comparison/eqmed.sql
@@ -0,0 +1,41 @@
+-- input is table(query text, run int, version UInt32, time float)
+select
+    abs(diff_percent) > rd_quantiles_percent[3] fail,
+    floor(original_medians_array.time_by_version[1], 4) m1,
+    floor(original_medians_array.time_by_version[2], 4) m2,
+    floor((m1 - m2) / m1, 3) diff_percent,
+    arrayMap(x -> floor(x / m1, 3), rd.rd_quantiles) rd_quantiles_percent,
+    query
+from
+    (
+        select query, quantiles(0.05, 0.5, 0.95)(abs(time_by_label[1] - time_by_label[2])) rd_quantiles -- quantiles of randomization distribution
+        from
+            (
+                select query, virtual_run, groupArrayInsertAt(median_time, random_label) time_by_label -- make array 'random label' -> 'median time'
+                from (
+                    select query, medianExact(time) median_time, virtual_run, random_label -- get median times, grouping by random label
+                    from (
+                        select *, toUInt32(rowNumberInBlock() % 2) random_label -- randomly relabel measurements
+                        from (
+                            select query, time, number virtual_run
+                            from table, numbers(1, 10000) -- duplicate input measurements into many virtual runs
+                            order by query, virtual_run, rand() -- for each virtual run, randomly reorder measurements
+                        ) virtual_runs
+                    ) relabeled
+                    group by query, virtual_run, random_label
+                ) virtual_medians
+                group by query, virtual_run -- aggregate by random_label
+            ) virtual_medians_array
+        group by query -- aggregate by virtual_run
+    ) rd,
+    (
+        select groupArrayInsertAt(median_time, version) time_by_version, query
+        from
+            (
+                select medianExact(time) median_time, query, version
+                from table group by query, version
+            ) original_medians
+        group by query
+    ) original_medians_array
+where rd.query = original_medians_array.query
+order by fail desc, rd_quantiles_percent[3] asc;
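The statistical idea behind eqmed.sql is a randomization (permutation) test: for each query, the left and right timings are pooled, repeatedly relabeled at random into two fake "versions" (10000 virtual runs), and the observed difference of median times is flagged as significant (fail = 1) only if it exceeds the 0.95 quantile of those randomized differences. The following is a minimal standalone sketch of the same idea in Python with made-up timing values; it only illustrates the statistics and is not code used by the comparison tool.

import random
import statistics

# Hypothetical per-run timings for one query, in seconds; not real data.
left_times = [0.101, 0.098, 0.105, 0.099, 0.102]   # old server
right_times = [0.120, 0.118, 0.125, 0.119, 0.121]  # new server

observed_diff = abs(statistics.median(left_times) - statistics.median(right_times))

# Randomization distribution: pool all measurements, split them into two
# random halves many times, and record the median difference each time
# (eqmed.sql does the relabeling with rowNumberInBlock() % 2 after a random
# reorder, once per virtual run).
pooled = left_times + right_times
random_diffs = []
for _ in range(10000):
    random.shuffle(pooled)
    a, b = pooled[:len(left_times)], pooled[len(left_times):]
    random_diffs.append(abs(statistics.median(a) - statistics.median(b)))

# 0.95 quantile of the randomization distribution, analogous to rd_quantiles[3].
threshold = statistics.quantiles(random_diffs, n=20)[18]
print('fail' if observed_diff > threshold else 'ok', observed_diff, threshold)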
diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py
new file mode 100755
index 00000000000..508d46b6261
--- /dev/null
+++ b/docker/test/performance-comparison/perf.py
@@ -0,0 +1,88 @@
+#!/usr/bin/python3
+
+import itertools
+import sys
+import clickhouse_driver
+import xml.etree.ElementTree as et
+import argparse
+import pprint
+
+parser = argparse.ArgumentParser(description='Run performance test.')
+parser.add_argument('file', metavar='FILE', type=argparse.FileType('r'), nargs=1, help='test description file')
+args = parser.parse_args()
+
+tree = et.parse(args.file[0])
+root = tree.getroot()
+
+# Check main metric
+main_metric = root.find('main_metric/*').tag
+if main_metric != 'min_time':
+    raise Exception('Only the min_time main metric is supported. This test uses \'{}\''.format(main_metric))
+
+# Open connections
+servers = [{'host': 'localhost', 'port': 9000, 'client_name': 'left'}, {'host': 'localhost', 'port': 9001, 'client_name': 'right'}]
+connections = [clickhouse_driver.Client(**server) for server in servers]
+
+# Check tables that should exist
+tables = [e.text for e in root.findall('preconditions/table_exists')]
+if tables:
+    for c in connections:
+        tables_list = ", ".join("'{}'".format(t) for t in tables)
+        res = c.execute("select t from values('t text', {}) anti join system.tables on database = currentDatabase() and name = t".format(tables_list))
+        if res:
+            raise Exception('Some tables are not found: {}'.format(res))
+
+# Process substitutions
+subst_elems = root.findall('substitutions/substitution')
+
+parameter_keys = [] # ['table', 'limit' ]
+parameter_value_arrays = [] # [['hits_100m', 'hits_10m'], ['1', '10']]
+parameter_combinations = [] # [{table: hits_100m, limit: 1}, ...]
+for se in subst_elems:
+    parameter_keys.append(se.find('name').text)
+    parameter_value_arrays.append([v.text for v in se.findall('values/value')])
+parameter_combinations = [dict(zip(parameter_keys, parameter_combination)) for parameter_combination in itertools.product(*parameter_value_arrays)]
+
+def substitute_parameters(query_templates, parameter_combinations):
+    return list(set([template.format(**parameters) for template, parameters in itertools.product(query_templates, parameter_combinations)]))
+
+# Run drop queries, ignoring errors
+drop_query_templates = [q.text for q in root.findall('drop_query')]
+drop_queries = substitute_parameters(drop_query_templates, parameter_combinations)
+for c in connections:
+    for q in drop_queries:
+        try:
+            c.execute(q)
+        except:
+            print("Error:", sys.exc_info()[0], file=sys.stderr)
+
+# Run create queries
+create_query_templates = [q.text for q in root.findall('create_query')]
+create_queries = substitute_parameters(create_query_templates, parameter_combinations)
+for c in connections:
+    for q in create_queries:
+        c.execute(q)
+
+# Run fill queries
+fill_query_templates = [q.text for q in root.findall('fill_query')]
+fill_queries = substitute_parameters(fill_query_templates, parameter_combinations)
+for c in connections:
+    for q in fill_queries:
+        c.execute(q)
+
+# Run test queries
+test_query_templates = [q.text for q in root.findall('query')]
+test_queries = substitute_parameters(test_query_templates, parameter_combinations)
+
+for q in test_queries:
+    for run in range(0, 7):
+        for conn_index, c in enumerate(connections):
+            res = c.execute(q)
+            print(q + '\t' + str(run) + '\t' + str(conn_index) + '\t' + str(c.last_query.elapsed))
+
+# Run drop queries
+drop_query_templates = [q.text for q in root.findall('drop_query')]
+drop_queries = substitute_parameters(drop_query_templates, parameter_combinations)
+for c in connections:
+    for q in drop_queries:
+        c.execute(q)
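For reference, the substitution handling above expands every combination of <substitution> values into concrete query strings via itertools.product and str.format. A small self-contained sketch of that expansion, using a made-up query template and value lists (the dataset names are only examples), could look like this:

import itertools

# Hypothetical substitutions, mirroring what perf.py reads from the test XML.
parameter_keys = ['table', 'limit']
parameter_value_arrays = [['hits_10m_single', 'hits_100m_single'], ['1', '10']]
parameter_combinations = [dict(zip(parameter_keys, combination))
                          for combination in itertools.product(*parameter_value_arrays)]

query_templates = ['SELECT * FROM {table} LIMIT {limit}']
queries = list(set(template.format(**parameters)
                   for template, parameters in itertools.product(query_templates, parameter_combinations)))

# Four queries: each table crossed with each limit, duplicates removed.
for q in sorted(queries):
    print(q)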