#!/bin/bash
# ClickHouse/docker/test/performance-comparison/compare.sh
#
# Performance comparison harness: downloads two ClickHouse builds ("left" and
# "right"), starts both servers side by side, runs the performance tests
# against each, and produces a comparison report.
#
# Usage: compare.sh <left_pr> <left_sha> <right_pr> <right_sha>

set -ex
set -o pipefail

# Make sure all spawned servers and background jobs die with the script.
trap "exit" INT TERM
trap "kill 0" EXIT

script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"

# db0 holds the pristine dataset; both servers get hardlinked copies of it.
mkdir db0 ||:

# Positional arguments: PR number and commit SHA for each side.
left_pr=$1
left_sha=$2
right_pr=$3
right_sha=$4
# Download the two builds and the benchmark datasets into left/, right/ and db0/.
# Reads the globals $left_pr/$left_sha/$right_pr/$right_sha set at top level.
function download
{
    rm -r left ||:
    mkdir left ||:
    rm -r right ||:
    mkdir right ||:

    # Might have the same version on left and right; in that case download
    # once and hardlink the tree instead of fetching it twice.
    if ! [ "$left_sha" = "$right_sha" ]
    then
        wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/performance/performance.tgz" -O- | tar -C left --strip-components=1 -zxv &
        wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$right_pr/$right_sha/performance/performance.tgz" -O- | tar -C right --strip-components=1 -zxv &
    else
        wget -nv -nd -c "https://clickhouse-builds.s3.yandex.net/$left_pr/$left_sha/performance/performance.tgz" -O- | tar -C left --strip-components=1 -zxv && cp -al left right
    fi

    # Each "cd db0 && ..." pipeline is backgrounded as a whole, so the cd
    # happens in that background subshell and does not affect this shell.
    cd db0 && wget -nv -nd -c "https://s3.mds.yandex.net/clickhouse-private-datasets/hits_10m_single/partitions/hits_10m_single.tar" -O- | tar -xv &
    cd db0 && wget -nv -nd -c "https://s3.mds.yandex.net/clickhouse-private-datasets/hits_100m_single/partitions/hits_100m_single.tar" -O- | tar -xv &
    cd db0 && wget -nv -nd -c "https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_v1.tar" -O- | tar -xv &
    cd db0 && wget -nv -nd -c "https://clickhouse-datasets.s3.yandex.net/values_with_expressions/partitions/test_values.tar" -O- | tar -xv &
    wait
}
# Patch the per-side server configs (distinct TCP ports, perf-test tweaks),
# then start a temporary server on db0 to rename the dataset tables.
function configure
{
    # Distinct ports so the left and right servers can run simultaneously.
    sed -i 's/<tcp_port>9000/<tcp_port>9001/g' left/config/config.xml
    sed -i 's/<tcp_port>9000/<tcp_port>9002/g' right/config/config.xml

    mkdir right/config/users.d ||:
    mkdir left/config/users.d ||:

    # Log to console and drop text_log/metric_log tables -- they only add
    # noise and unpredictable I/O during measurements.
    cat > right/config/config.d/zz-perf-test-tweaks-config.xml <<EOF
<yandex>
<logger>
<console>true</console>
</logger>
<text_log remove="remove">
<table remove="remove"/>
</text_log>
<metric_log remove="remove">
<table remove="remove"/>
</metric_log>
</yandex>
EOF

    # Enable the real-time query profiler and introspection functions needed
    # to symbolize the collected traces later.
    cat > right/config/users.d/zz-perf-test-tweaks-users.xml <<EOF
<yandex>
<profiles>
<default>
<query_profiler_real_time_period_ns>10000000</query_profiler_real_time_period_ns>
<query_profiler_cpu_time_period_ns>0</query_profiler_cpu_time_period_ns>
<allow_introspection_functions>1</allow_introspection_functions>
</default>
</profiles>
</yandex>
EOF
    cp right/config/config.d/zz-perf-test-tweaks-config.xml left/config/config.d/zz-perf-test-tweaks-config.xml
    cp right/config/users.d/zz-perf-test-tweaks-users.xml left/config/users.d/zz-perf-test-tweaks-users.xml

    rm left/config/config.d/metric_log.xml ||:
    rm left/config/config.d/text_log.xml ||:
    rm right/config/config.d/metric_log.xml ||:
    rm right/config/config.d/text_log.xml ||:

    # Start a temporary server to rename the tables
    while killall clickhouse ; do echo . ; sleep 1 ; done
    echo all killed

    set -m # Spawn temporary in its own process groups
    left/clickhouse server --config-file=left/config/config.xml -- --path db0 &> setup-server-log.log &
    left_pid=$!
    kill -0 $left_pid
    disown $left_pid
    set +m

    while ! left/clickhouse client --port 9001 --query "select 1" ; do kill -0 $left_pid ; echo . ; sleep 1 ; done
    echo server for setup started

    left/clickhouse client --port 9001 --query "create database test" ||:
    left/clickhouse client --port 9001 --query "rename table datasets.hits_v1 to test.hits" ||:

    while killall clickhouse ; do echo . ; sleep 1 ; done
    echo all killed

    # Remove logs etc, because they will be updated, and sharing them between
    # servers with hardlink might cause unpredictable behavior.
    rm db0/data/system/* -rf ||:
}
# Kill any running servers, clone db0 into per-side data dirs via hardlinks,
# start the left (port 9001) and right (port 9002) servers and wait for both.
function restart
{
    while killall clickhouse ; do echo . ; sleep 1 ; done
    echo all killed

    # Make copies of the original db for both servers. Use hardlinks instead
    # of copying.
    rm -r left/db ||:
    rm -r right/db ||:
    cp -al db0/ left/db/
    cp -al db0/ right/db/

    set -m # Spawn servers in their own process groups

    left/clickhouse server --config-file=left/config/config.xml -- --path left/db &> left-server-log.log &
    left_pid=$!
    kill -0 $left_pid
    disown $left_pid

    right/clickhouse server --config-file=right/config/config.xml -- --path right/db &> right-server-log.log &
    right_pid=$!
    kill -0 $right_pid
    disown $right_pid

    set +m

    # Poll until each server answers; kill -0 aborts the loop (via set -e)
    # if the server process has died in the meantime.
    while ! left/clickhouse client --port 9001 --query "select 1" ; do kill -0 $left_pid ; echo . ; sleep 1 ; done
    echo left ok
    while ! right/clickhouse client --port 9002 --query "select 1" ; do kill -0 $right_pid ; echo . ; sleep 1 ; done
    echo right ok

    left/clickhouse client --port 9001 --query "select * from system.tables where database != 'system'"
    right/clickhouse client --port 9002 --query "select * from system.tables where database != 'system'"
}
function run_tests
{
# Just check that the script runs at all
"$script_dir/perf.py" --help > /dev/null
2020-02-06 12:46:57 +00:00
rm -v test-times.tsv ||:
2020-02-14 12:55:47 +00:00
# Why the ugly cut:
# 1) can't make --out-format='%n' work for deleted files, it outputs things
# like "deleted 1.xml";
# 2) the output is not tab separated, but at least it's fixed width, so I
# cut by characters.
changed_files=$(rsync --dry-run --dirs --checksum --delete --itemize-changes left/performance/ right/performance/ | cut -c13-)
2020-01-16 19:39:07 +00:00
# FIXME remove some broken long tests
rm right/performance/{IPv4,IPv6,modulo,parse_engine_file,number_formatting_formats,select_format}.xml ||:
2020-01-14 19:05:58 +00:00
2020-02-14 12:55:47 +00:00
test_files=$(ls right/performance/*)
# FIXME a quick crutch to bring the run time down for the flappy tests --
# run only those that have changed. Only on my prs for now.
2020-02-14 19:11:46 +00:00
if grep Kuzmenkov right-commit.txt && [ "PR_TO_TEST" != "0" ]
2020-02-14 12:55:47 +00:00
then
2020-02-14 19:11:46 +00:00
test_files_override=$(cd right/performance && readlink -e $changed_files)
if [ "test_files_override" != "" ]
2020-02-14 12:55:47 +00:00
then
2020-02-14 19:11:46 +00:00
test_files=$test_files_override
2020-02-14 12:55:47 +00:00
fi
fi
# Run only explicitly specified tests, if any
if [ -v CHPC_TEST_GLOB ]
then
test_files=$(ls right/performance/${CHPC_TEST_GLOB}.xml)
fi
# Run the tests
2020-02-14 12:55:47 +00:00
for test in $test_files
do
test_name=$(basename $test ".xml")
2020-01-14 19:05:58 +00:00
echo test $test_name
2020-02-14 19:11:46 +00:00
2020-02-06 12:46:57 +00:00
TIMEFORMAT=$(printf "$test_name\t%%3R\t%%3U\t%%3S\n")
2020-02-11 15:01:16 +00:00
# the grep is to filter out set -x output and keep only time output
{ time "$script_dir/perf.py" "$test" > "$test_name-raw.tsv" 2> "$test_name-err.log" ; } 2>&1 >/dev/null | grep -v ^+ >> "wall-clock-times.tsv" || continue
2020-02-14 19:11:46 +00:00
2020-01-16 19:39:07 +00:00
grep ^query "$test_name-raw.tsv" | cut -f2- > "$test_name-queries.tsv"
grep ^client-time "$test_name-raw.tsv" | cut -f2- > "$test_name-client-time.tsv"
2020-02-14 12:55:47 +00:00
# this may be slow, run it in background
right/clickhouse local --file "$test_name-queries.tsv" --structure 'query text, run int, version UInt32, time float' --query "$(cat $script_dir/eqmed.sql)" > "$test_name-report.tsv" &
done
2020-02-14 12:55:47 +00:00
2020-02-14 19:11:46 +00:00
unset TIMEFORMAT
2020-02-14 12:55:47 +00:00
wait
2020-02-14 19:11:46 +00:00
}
# Dump the profiler trace logs and symbolized addresses from both servers.
function get_profiles
{
    # Collect the profiles.
    # BUG FIX: the two "right" SET queries below used --port 9001 (the left
    # server); the right server listens on 9002.
    # NOTE(review): these SET statements are per-session and likely do not
    # affect other connections' profiling -- confirm the intent.
    left/clickhouse client --port 9001 --query "set query_profiler_cpu_time_period_ns = 0"
    left/clickhouse client --port 9001 --query "set query_profiler_real_time_period_ns = 0"
    right/clickhouse client --port 9002 --query "set query_profiler_cpu_time_period_ns = 0"
    right/clickhouse client --port 9002 --query "set query_profiler_real_time_period_ns = 0"

    left/clickhouse client --port 9001 --query "select * from system.trace_log format TSVWithNamesAndTypes" > left-trace-log.tsv ||: &
    left/clickhouse client --port 9001 --query "select arrayJoin(trace) addr, concat(splitByChar('/', addressToLine(addr))[-1], '#', demangle(addressToSymbol(addr)) ) name from system.trace_log group by addr format TSVWithNamesAndTypes" > left-addresses.tsv ||: &
    right/clickhouse client --port 9002 --query "select * from system.trace_log format TSVWithNamesAndTypes" > right-trace-log.tsv ||: &
    right/clickhouse client --port 9002 --query "select arrayJoin(trace) addr, concat(splitByChar('/', addressToLine(addr))[-1], '#', demangle(addressToSymbol(addr)) ) name from system.trace_log group by addr format TSVWithNamesAndTypes" > right-addresses.tsv ||: &
    wait
}
# Analyze results
# Crunch the per-test TSV outputs with clickhouse local into the summary
# tables (changed-perf, unstable-queries, test-times, ...), collect run
# errors, and render the final HTML report.
function report
{
    # NOTE(review): result_structure is assigned but never referenced below
    # (the file() calls spell the structure out inline) -- candidate for
    # removal, kept for now.
    result_structure="left float, right float, diff float, rd Array(float), query text"

    rm test-times.tsv test-dump.tsv unstable.tsv changed-perf.tsv unstable-tests.tsv unstable-queries.tsv bad-tests.tsv slow-on-client.tsv all-queries.tsv ||:

    right/clickhouse local --query "
create table queries engine Memory as select
    replaceAll(_file, '-report.tsv', '') test,
    left + right < 0.01 as short,
    -- FIXME Comparison mode doesn't make sense for queries that complete
    -- immediately, so for now we pretend they don't exist. We don't want to
    -- remove them altogether because we want to be able to detect regressions,
    -- but the right way to do this is not yet clear.
    not short and abs(diff) < 0.05 and rd[3] > 0.05 as unstable,
    not short and abs(diff) > 0.10 and abs(diff) > rd[3] as changed,
    *
    from file('*-report.tsv', TSV, 'left float, right float, diff float, rd Array(float), query text');

create table changed_perf_tsv engine File(TSV, 'changed-perf.tsv') as
    select left, right, diff, rd, test, query from queries where changed
    order by rd[3] desc;

create table unstable_queries_tsv engine File(TSV, 'unstable-queries.tsv') as
    select left, right, diff, rd, test, query from queries where unstable
    order by rd[3] desc;

create table unstable_tests_tsv engine File(TSV, 'bad-tests.tsv') as
    select test, sum(unstable) u, sum(changed) c, u + c s from queries
    group by test having s > 0 order by s desc;

create table query_time engine Memory as select *, replaceAll(_file, '-client-time.tsv', '') test
    from file('*-client-time.tsv', TSV, 'query text, client float, server float');

create table wall_clock engine Memory as select *
    from file('wall-clock-times.tsv', TSV, 'test text, real float, user float, system float');

create table slow_on_client_tsv engine File(TSV, 'slow-on-client.tsv') as
    select client, server, floor(client/server, 3) p, query
    from query_time where p > 1.02 order by p desc;

create table test_time engine Memory as
    select test, sum(client) total_client_time,
        maxIf(client, not short) query_max,
        minIf(client, not short) query_min,
        count(*) queries,
        sum(short) short_queries
    from query_time, queries
    where query_time.query = queries.query
    group by test;

create table test_times_tsv engine File(TSV, 'test-times.tsv') as
    select wall_clock.test, real,
        floor(total_client_time, 3),
        queries,
        short_queries,
        floor(query_max, 3),
        floor(real / queries, 3) avg_real_per_query,
        floor(query_min, 3)
    from test_time join wall_clock using test
    order by avg_real_per_query desc;

create table all_queries_tsv engine File(TSV, 'all-queries.tsv') as
    select left, right, diff, rd, test, query
    from queries order by rd[3] desc;
"

    # Remember that grep sets error code when nothing is found, hence the bayan
    # operator
    grep Exception:[^:] *-err.log > run-errors.log ||:

    "$script_dir/report.py" > report.html
}
# Stage dispatcher: $stage selects where to (re)start; each arm falls through
# (';&') so every later stage runs too. An empty/unset $stage runs everything.
case "$stage" in
"")
    ;&
"download")
    time download
    ;&
"configure")
    time configure
    ;&
"restart")
    time restart
    ;&
"run_tests")
    time run_tests
    ;&
"get_profiles")
    time get_profiles
    ;&
"report")
    time report
    ;&
esac