2019-12-26 17:35:41 +00:00
#!/bin/bash
set -ex
set -o pipefail
trap "exit" INT TERM
2020-04-17 15:47:01 +00:00
trap 'kill $(jobs -pr) ||:' EXIT
2019-12-26 17:35:41 +00:00
2020-02-27 20:02:50 +00:00
stage = ${ stage :- }
2020-03-24 17:33:18 +00:00
script_dir = " $( cd " $( dirname " ${ BASH_SOURCE [0] } " ) " >/dev/null 2>& 1 && pwd ) "
2019-12-26 17:35:41 +00:00
function configure
{
2020-02-25 19:51:09 +00:00
# Use the new config for both servers, so that we can change it in a PR.
2019-12-26 21:33:10 +00:00
rm right/config/config.d/text_log.xml || :
2020-02-25 19:51:09 +00:00
cp -rv right/config left || :
2020-04-22 21:39:51 +00:00
sed -i 's/<tcp_port>900./<tcp_port>9001/g' left/config/config.xml
sed -i 's/<tcp_port>900./<tcp_port>9002/g' right/config/config.xml
2020-02-25 19:51:09 +00:00
2020-01-14 19:05:58 +00:00
# Start a temporary server to rename the tables
2020-04-17 15:47:01 +00:00
while killall clickhouse-server; do echo . ; sleep 1 ; done
2020-01-14 19:05:58 +00:00
echo all killed
set -m # Spawn temporary in its own process groups
2020-04-17 15:47:01 +00:00
left/clickhouse-server --config-file= left/config/config.xml -- --path db0 & > setup-server-log.log &
2020-01-14 19:05:58 +00:00
left_pid = $!
kill -0 $left_pid
disown $left_pid
set +m
2020-05-19 14:51:51 +00:00
while ! clickhouse-client --port 9001 --query "select 1" && kill -0 $left_pid ; do echo . ; sleep 1 ; done
2020-01-14 19:05:58 +00:00
echo server for setup started
2020-04-17 15:47:01 +00:00
clickhouse-client --port 9001 --query "create database test" || :
clickhouse-client --port 9001 --query "rename table datasets.hits_v1 to test.hits" || :
2020-02-14 12:55:47 +00:00
2020-04-17 15:47:01 +00:00
while killall clickhouse-server; do echo . ; sleep 1 ; done
2020-02-14 12:55:47 +00:00
echo all killed
# Remove logs etc, because they will be updated, and sharing them between
# servers with hardlink might cause unpredictable behavior.
rm db0/data/system/* -rf || :
2020-02-21 19:58:52 +00:00
rm db0/metadata/system/* -rf || :
2019-12-26 17:35:41 +00:00
2020-01-14 19:05:58 +00:00
# Make copies of the original db for both servers. Use hardlinks instead
2020-04-17 15:47:01 +00:00
# of copying. Be careful to remove preprocessed configs and system tables,or
# it can lead to weird effects.
2020-01-14 19:05:58 +00:00
rm -r left/db || :
rm -r right/db || :
2020-04-13 19:06:07 +00:00
rm -r db0/preprocessed_configs || :
2020-04-17 15:47:01 +00:00
rm -r db/{ data,metadata} /system || :
2020-01-14 19:05:58 +00:00
cp -al db0/ left/db/
cp -al db0/ right/db/
2020-02-25 19:51:09 +00:00
}
function restart
{
2020-04-17 15:47:01 +00:00
while killall clickhouse-server; do echo . ; sleep 1 ; done
2020-02-25 19:51:09 +00:00
echo all killed
2020-01-14 19:05:58 +00:00
set -m # Spawn servers in their own process groups
2019-12-26 17:35:41 +00:00
2020-04-17 15:47:01 +00:00
left/clickhouse-server --config-file= left/config/config.xml -- --path left/db & >> left-server-log.log &
2019-12-26 17:35:41 +00:00
left_pid = $!
kill -0 $left_pid
disown $left_pid
2020-04-17 15:47:01 +00:00
right/clickhouse-server --config-file= right/config/config.xml -- --path right/db & >> right-server-log.log &
2019-12-26 17:35:41 +00:00
right_pid = $!
kill -0 $right_pid
disown $right_pid
set +m
2020-05-19 14:51:51 +00:00
while ! clickhouse-client --port 9001 --query "select 1" && kill -0 $left_pid ; do echo . ; sleep 1 ; done
2019-12-26 17:35:41 +00:00
echo left ok
2020-05-19 14:51:51 +00:00
while ! clickhouse-client --port 9002 --query "select 1" && kill -0 $right_pid ; do echo . ; sleep 1 ; done
2019-12-26 17:35:41 +00:00
echo right ok
2019-12-26 21:33:10 +00:00
2020-04-17 15:47:01 +00:00
clickhouse-client --port 9001 --query "select * from system.tables where database != 'system'"
clickhouse-client --port 9001 --query "select * from system.build_options"
clickhouse-client --port 9002 --query "select * from system.tables where database != 'system'"
clickhouse-client --port 9002 --query "select * from system.build_options"
# Check again that both servers we started are running -- this is important
# for running locally, when there might be some other servers started and we
# will connect to them instead.
kill -0 $left_pid
kill -0 $right_pid
2019-12-26 17:35:41 +00:00
}
2019-12-26 19:16:36 +00:00
function run_tests
{
# Just check that the script runs at all
" $script_dir /perf.py " --help > /dev/null
2020-04-30 08:36:33 +00:00
# Find the directory with test files.
if [ -v CHPC_TEST_PATH ]
2020-02-14 12:55:47 +00:00
then
2020-04-30 08:36:33 +00:00
# Use the explicitly set path to directory with test files.
test_prefix = " $CHPC_TEST_PATH "
elif [ " $PR_TO_TEST " = "0" ]
then
# When testing commits from master, use the older test files. This
# allows the tests to pass even when we add new functions and tests for
# them, that are not supported in the old revision.
test_prefix = left/performance
elif [ " $PR_TO_TEST " != "" ] && [ " $PR_TO_TEST " != "0" ]
then
# For PRs, use newer test files so we can test these changes.
test_prefix = right/performance
# If some tests were changed in the PR, we may want to run only these
# ones. The list of changed tests in changed-test.txt is prepared in
# entrypoint.sh from git diffs, because it has the cloned repo. Used
# to use rsync for that but it was really ugly and not always correct
# (e.g. when the reference SHA is really old and has some other
# differences to the tested SHA, besides the one introduced by the PR).
changed_test_files = $( sed " s/tests\/performance/ ${ test_prefix // \/ / \\ / } / " changed-tests.txt)
2020-02-14 12:55:47 +00:00
fi
2020-04-30 08:36:33 +00:00
# Determine which tests to run.
2020-04-23 20:18:46 +00:00
if [ -v CHPC_TEST_GREP ]
2020-02-14 12:55:47 +00:00
then
2020-04-30 08:36:33 +00:00
# Run only explicitly specified tests, if any.
2020-04-24 03:23:38 +00:00
test_files = $( ls " $test_prefix " | grep " $CHPC_TEST_GREP " | xargs -I{ } -n1 readlink -f " $test_prefix /{} " )
2020-04-30 08:36:33 +00:00
elif [ " $changed_test_files " != "" ]
2020-04-02 18:44:58 +00:00
then
2020-04-30 08:36:33 +00:00
# Use test files that changed in the PR.
test_files = " $changed_test_files "
else
# The default -- run all tests found in the test dir.
2020-04-02 18:44:58 +00:00
test_files = $( ls " $test_prefix " /*.xml)
fi
2020-04-30 08:36:33 +00:00
# Delete old report files.
2020-05-20 02:19:19 +00:00
for x in { test-times,wall-clock-times} .tsv
2020-04-30 08:36:33 +00:00
do
rm -v " $x " || :
touch " $x "
done
2020-02-25 19:51:09 +00:00
# Run the tests.
test_name = "<none>"
2020-02-14 12:55:47 +00:00
for test in $test_files
2019-12-26 19:16:36 +00:00
do
2020-02-25 19:51:09 +00:00
# Check that both servers are alive, to fail faster if they die.
2020-04-17 15:47:01 +00:00
clickhouse-client --port 9001 --query "select 1 format Null" \
2020-02-25 19:51:09 +00:00
|| { echo $test_name >> left-server-died.log ; restart ; continue ; }
2020-04-17 15:47:01 +00:00
clickhouse-client --port 9002 --query "select 1 format Null" \
2020-02-25 19:51:09 +00:00
|| { echo $test_name >> right-server-died.log ; restart ; continue ; }
2020-02-27 20:02:50 +00:00
test_name = $( basename " $test " ".xml" )
echo test " $test_name "
2020-02-14 19:11:46 +00:00
2020-02-06 12:46:57 +00:00
TIMEFORMAT = $( printf " $test_name \t%%3R\t%%3U\t%%3S\n " )
2020-02-11 15:01:16 +00:00
# the grep is to filter out set -x output and keep only time output
2020-04-07 17:55:39 +00:00
{ time " $script_dir /perf.py " --host localhost localhost --port 9001 9002 -- " $test " > " $test_name -raw.tsv " 2> " $test_name -err.log " ; } 2>& 1 >/dev/null | grep -v ^+ >> "wall-clock-times.tsv" || continue
2020-02-14 19:11:46 +00:00
2020-03-02 17:05:53 +00:00
# The test completed with zero status, so we treat stderr as warnings
mv " $test_name -err.log " " $test_name -warn.log "
2019-12-26 19:16:36 +00:00
done
2020-02-14 12:55:47 +00:00
2020-02-14 19:11:46 +00:00
unset TIMEFORMAT
2020-02-14 12:55:47 +00:00
wait
2020-02-14 19:11:46 +00:00
}
2020-02-14 12:55:47 +00:00
2020-04-21 21:53:13 +00:00
function get_profiles_watchdog
{
2020-04-23 20:18:46 +00:00
sleep 6000
2020-04-21 21:53:13 +00:00
2020-04-22 19:41:40 +00:00
echo "The trace collection did not finish in time." >> profile-errors.log
2020-04-21 21:53:13 +00:00
for pid in $( pgrep -f clickhouse)
do
2020-04-29 09:47:17 +00:00
gdb -p " $pid " --batch --ex "info proc all" --ex "thread apply all bt" --ex quit & > " $pid .gdb.log " &
2020-04-21 21:53:13 +00:00
done
wait
2020-04-29 09:47:17 +00:00
for _ in { 1..10}
2020-04-21 21:53:13 +00:00
do
if ! pkill -f clickhouse
then
break
fi
sleep 1
done
}
2020-02-14 19:11:46 +00:00
function get_profiles
{
2020-02-14 12:55:47 +00:00
# Collect the profiles
2020-04-17 15:47:01 +00:00
clickhouse-client --port 9001 --query "set query_profiler_cpu_time_period_ns = 0"
clickhouse-client --port 9001 --query "set query_profiler_real_time_period_ns = 0"
clickhouse-client --port 9001 --query "set query_profiler_cpu_time_period_ns = 0"
clickhouse-client --port 9001 --query "set query_profiler_real_time_period_ns = 0"
clickhouse-client --port 9001 --query "system flush logs"
clickhouse-client --port 9002 --query "system flush logs"
clickhouse-client --port 9001 --query "select * from system.query_log where type = 2 format TSVWithNamesAndTypes" > left-query-log.tsv || : &
clickhouse-client --port 9001 --query "select * from system.query_thread_log format TSVWithNamesAndTypes" > left-query-thread-log.tsv || : &
clickhouse-client --port 9001 --query "select * from system.trace_log format TSVWithNamesAndTypes" > left-trace-log.tsv || : &
clickhouse-client --port 9001 --query "select arrayJoin(trace) addr, concat(splitByChar('/', addressToLine(addr))[-1], '#', demangle(addressToSymbol(addr)) ) name from system.trace_log group by addr format TSVWithNamesAndTypes" > left-addresses.tsv || : &
clickhouse-client --port 9001 --query "select * from system.metric_log format TSVWithNamesAndTypes" > left-metric-log.tsv || : &
clickhouse-client --port 9002 --query "select * from system.query_log where type = 2 format TSVWithNamesAndTypes" > right-query-log.tsv || : &
clickhouse-client --port 9002 --query "select * from system.query_thread_log format TSVWithNamesAndTypes" > right-query-thread-log.tsv || : &
clickhouse-client --port 9002 --query "select * from system.trace_log format TSVWithNamesAndTypes" > right-trace-log.tsv || : &
clickhouse-client --port 9002 --query "select arrayJoin(trace) addr, concat(splitByChar('/', addressToLine(addr))[-1], '#', demangle(addressToSymbol(addr)) ) name from system.trace_log group by addr format TSVWithNamesAndTypes" > right-addresses.tsv || : &
clickhouse-client --port 9002 --query "select * from system.metric_log format TSVWithNamesAndTypes" > right-metric-log.tsv || : &
2020-02-14 12:55:47 +00:00
wait
2020-04-24 07:19:23 +00:00
# Just check that the servers are alive so that we return a proper exit code.
# We don't consistently check the return codes of the above background jobs.
clickhouse-client --port 9001 --query "select 1"
clickhouse-client --port 9002 --query "select 1"
2019-12-26 19:16:36 +00:00
}
2019-12-26 17:35:41 +00:00
2020-05-21 23:03:41 +00:00
function build_log_column_definitions
2020-03-02 18:47:37 +00:00
{
2020-05-20 02:19:19 +00:00
# FIXME This loop builds column definitons from TSVWithNamesAndTypes in an
# absolutely atrocious way. This should be done by the file() function itself.
for x in { right,left} -{ addresses,{ query,query-thread,trace,metric} -log} .tsv
do
paste -d' ' \
<( sed -n '1{s/\t/\n/g;p;q}' " $x " | sed 's/\(^.*$\)/"\1"/' ) \
<( sed -n '2{s/\t/\n/g;p;q}' " $x " ) \
| tr '\n' ', ' | sed 's/,$//' > " $x .columns "
done
2020-05-21 23:03:41 +00:00
}
# Build and analyze randomization distribution for all queries.
function analyze_queries
{
rm -v analyze-commands.txt analyze-errors.log all-queries.tsv unstable-queries.tsv ./*-report.tsv raw-queries.tsv || :
rm -rfv analyze || :
mkdir analyze || :
build_log_column_definitions
2020-04-29 17:26:28 +00:00
# Split the raw test output into files suitable for analysis.
IFS = $'\n'
for test_file in $( find . -maxdepth 1 -name "*-raw.tsv" -print)
do
test_name = $( basename " $test_file " "-raw.tsv" )
2020-05-20 02:19:19 +00:00
sed -n " s/^query\t/ $test_name \t/p " < " $test_file " >> "analyze/query-runs.tsv"
sed -n " s/^client-time/ $test_name /p " < " $test_file " >> "analyze/client-times.tsv"
sed -n " s/^report-threshold/ $test_name /p " < " $test_file " >> "analyze/report-thresholds.tsv"
sed -n " s/^skipped/ $test_name /p " < " $test_file " >> "analyze/skipped-tests.tsv"
sed -n " s/^display-name/ $test_name /p " < " $test_file " >> "analyze/query-display-names.tsv"
2020-04-29 17:26:28 +00:00
done
unset IFS
2020-04-27 12:47:59 +00:00
2020-05-20 02:19:19 +00:00
# for each query run, prepare array of metrics from query log
clickhouse-local --query "
create view query_runs as select * from file( 'analyze/query-runs.tsv' , TSV,
'test text, query_index int, query_id text, version UInt8, time float' ) ;
create view left_query_log as select *
from file( 'left-query-log.tsv' , TSVWithNamesAndTypes,
'$(cat "left-query-log.tsv.columns")' ) ;
create view right_query_log as select *
from file( 'right-query-log.tsv' , TSVWithNamesAndTypes,
'$(cat "right-query-log.tsv.columns")' ) ;
create table query_metrics engine File( TSV, -- do not add header -- will parse with grep
'analyze/query-run-metrics.tsv' )
as select
test, query_index, 0 run, version,
[
-- server-reported time
query_duration_ms / toFloat64( 1000)
, toFloat64( memory_usage)
-- client-reported time
, query_runs.time
] metrics
from (
select *, 0 version from left_query_log
union all
select *, 1 version from right_query_log
) query_logs
right join query_runs
using ( query_id, version)
2020-05-21 23:03:41 +00:00
order by test, query_index
2020-05-20 02:19:19 +00:00
;
"
2020-04-27 12:47:59 +00:00
# This is a lateral join in bash... please forgive me.
2020-05-20 02:19:19 +00:00
# We don't have arrayPermute(), so I have to make random permutations with
2020-04-27 12:47:59 +00:00
# `order by rand`, and it becomes really slow if I do it for more than one
# query. We also don't have lateral joins. So I just put all runs of each
# query into a separate file, and then compute randomization distribution
# for each file. I do this in parallel using GNU parallel.
2020-05-20 02:19:19 +00:00
query_index = 1
2020-04-27 12:47:59 +00:00
IFS = $'\n'
2020-05-20 02:19:19 +00:00
for prefix in $( cut -f1,2 "analyze/query-run-metrics.tsv" | sort | uniq)
2020-04-27 12:47:59 +00:00
do
2020-05-21 23:03:41 +00:00
file = " analyze/ $( echo " $prefix " | sed 's/\t/_/g' ) .tmp "
grep " ^ $prefix " "analyze/query-run-metrics.tsv" > " $file " &
2020-05-20 02:19:19 +00:00
printf "%s\0\n" \
" clickhouse-local \
--file \" $file \" \
--structure 'test text, query text, run int, version UInt8, metrics Array(float)' \
--query \" $( cat " $script_dir /eqmed.sql " ) \" \
>> \" analyze/query-reports.tsv\" " \
2>> analyze/errors.log \
>> analyze/commands.txt
2020-04-27 12:47:59 +00:00
done
wait
unset IFS
2020-05-21 23:03:41 +00:00
parallel --joblog analyze/parallel-log.txt --null < analyze/commands.txt 2>> analyze/errors.log
2020-03-02 18:47:37 +00:00
}
2019-12-26 19:16:36 +00:00
# Analyze results
2020-02-07 18:34:24 +00:00
function report
{
2020-04-28 07:45:35 +00:00
rm -r report || :
mkdir report || :
2020-02-27 20:02:50 +00:00
rm ./*.{ rep,svg} test-times.tsv test-dump.tsv unstable.tsv unstable-query-ids.tsv unstable-query-metrics.tsv changed-perf.tsv unstable-tests.tsv unstable-queries.tsv bad-tests.tsv slow-on-client.tsv all-queries.tsv || :
2020-02-17 19:32:40 +00:00
2020-05-21 23:03:41 +00:00
build_log_column_definitions
2020-05-20 02:19:19 +00:00
cat analyze/errors.log >> report/errors.log || :
2020-04-28 07:45:35 +00:00
cat profile-errors.log >> report/errors.log || :
2020-04-22 19:41:40 +00:00
2020-04-17 15:47:01 +00:00
clickhouse-local --query "
2020-05-20 02:19:19 +00:00
create view query_display_names as select * from
file( 'analyze/query-display-names.tsv' , TSV,
'test text, query_index int, query_display_name text' )
;
create table query_metric_stats engine File( TSVWithNamesAndTypes,
'report/query-metric-stats.tsv' ) as
select *, metric_name
from file ( 'analyze/query-reports.tsv' , TSV, ' left Array( float) ,
right Array( float) , diff Array( float) , stat_threshold Array( float) ,
test text, query_index int' ) reports
left array join [ 'server_time' , 'memory' , 'client_time' ] as metric_name,
left, right, diff, stat_threshold
left join query_display_names
on reports.test = query_display_names.test
and reports.query_index = query_display_names.query_index
;
-- Main statistics for queries -- query time as reported in query log.
2020-04-28 07:45:35 +00:00
create table queries engine File( TSVWithNamesAndTypes, 'report/queries.tsv' )
2020-03-23 09:39:12 +00:00
as select
2020-02-11 20:00:53 +00:00
-- FIXME Comparison mode doesn' t make sense for queries that complete
-- immediately, so for now we pretend they don't exist. We don' t want to
-- remove them altogether because we want to be able to detect regressions,
-- but the right way to do this is not yet clear.
2020-04-21 18:46:45 +00:00
( left + right) / 2 < 0.02 as short,
2020-02-28 19:59:59 +00:00
2020-04-28 07:45:35 +00:00
not short and abs( diff) > report_threshold and abs( diff) > stat_threshold as changed_fail,
not short and abs( diff) > report_threshold - 0.05 and abs( diff) > stat_threshold as changed_show,
2020-04-17 13:32:59 +00:00
2020-04-29 17:26:28 +00:00
not short and not changed_fail and stat_threshold > report_threshold + 0.10 as unstable_fail,
2020-04-28 07:45:35 +00:00
not short and not changed_show and stat_threshold > report_threshold - 0.05 as unstable_show,
2020-04-17 13:32:59 +00:00
2020-04-28 07:45:35 +00:00
left, right, diff, stat_threshold,
if ( report_threshold > 0, report_threshold, 0.10) as report_threshold,
2020-05-20 02:19:19 +00:00
test, query_index, query_display_name
from query_metric_stats
left join file( 'analyze/report-thresholds.tsv' , TSV,
'test text, report_threshold float' ) thresholds
on query_metric_stats.test = thresholds.test
where metric_name = 'server_time'
order by test, query_index, metric_name
;
2020-04-28 07:45:35 +00:00
-- keep the table in old format so that we can analyze new and old data together
create table queries_old_format engine File( TSVWithNamesAndTypes, 'queries.rep' )
2020-05-20 02:19:19 +00:00
as select short, changed_fail, unstable_fail, left, right, diff,
stat_threshold, test, query_display_name query
2020-04-28 07:45:35 +00:00
from queries
;
2020-02-07 18:34:24 +00:00
2020-05-18 21:14:43 +00:00
-- save all test runs as JSON for the new comparison page
2020-05-20 02:19:19 +00:00
create table all_query_runs_json engine File( JSON, 'report/all-query-runs.json' ) as
2020-05-22 19:01:55 +00:00
select test, query_index, query_display_name query,
left, right, diff, stat_threshold, report_threshold,
2020-05-20 02:19:19 +00:00
versions_runs[ 1] runs_left, versions_runs[ 2] runs_right
2020-05-18 21:14:43 +00:00
from (
select
2020-05-20 02:19:19 +00:00
test, query_index,
2020-05-18 21:14:43 +00:00
groupArrayInsertAt( runs, version) versions_runs
from (
select
2020-05-20 02:19:19 +00:00
test, query_index, version,
groupArray( metrics[ 1] ) runs
from file( 'analyze/query-run-metrics.tsv' , TSV,
'test text, query_index int, run int, version UInt8, metrics Array(float)' )
group by test, query_index, version
2020-05-18 21:14:43 +00:00
)
2020-05-20 02:19:19 +00:00
group by test, query_index
) runs
2020-05-22 19:01:55 +00:00
left join query_display_names
on runs.test = query_display_names.test
and runs.query_index = query_display_names.query_index
left join file( 'analyze/report-thresholds.tsv' ,
TSV, 'test text, report_threshold float' ) thresholds
on runs.test = thresholds.test
left join query_metric_stats
on runs.test = query_metric_stats.test
and runs.query_index = query_metric_stats.query_index
where
query_metric_stats.metric_name = 'server_time'
2020-05-18 21:14:43 +00:00
;
2020-04-28 07:45:35 +00:00
create table changed_perf_tsv engine File( TSV, 'report/changed-perf.tsv' ) as
2020-05-20 02:19:19 +00:00
select left, right, diff, stat_threshold, changed_fail, test, query_display_name
from queries where changed_show order by abs( diff) desc;
2020-02-11 20:00:53 +00:00
2020-04-28 07:45:35 +00:00
create table unstable_queries_tsv engine File( TSV, 'report/unstable-queries.tsv' ) as
2020-05-20 02:19:19 +00:00
select left, right, diff, stat_threshold, unstable_fail, test, query_display_name
from queries where unstable_show order by stat_threshold desc;
2020-02-11 20:00:53 +00:00
2020-05-20 02:19:19 +00:00
create table queries_for_flamegraph engine File( TSVWithNamesAndTypes,
'report/queries-for-flamegraph.tsv' ) as
select test, query_index from queries where unstable_show or changed_show
2020-04-28 07:45:35 +00:00
;
create table unstable_tests_tsv engine File( TSV, 'report/bad-tests.tsv' ) as
select test, sum( unstable_fail) u, sum( changed_fail) c, u + c s from queries
2020-02-07 18:34:24 +00:00
group by test having s > 0 order by s desc;
2020-04-28 07:45:35 +00:00
create table query_time engine Memory as select *
2020-05-20 02:19:19 +00:00
from file( 'analyze/client-times.tsv' , TSV,
'test text, query_index int, client float, server float' ) ;
2020-02-07 18:34:24 +00:00
2020-02-10 16:34:07 +00:00
create table wall_clock engine Memory as select *
from file( 'wall-clock-times.tsv' , TSV, 'test text, real float, user float, system float' ) ;
2020-04-28 07:45:35 +00:00
create table slow_on_client_tsv engine File( TSV, 'report/slow-on-client.tsv' ) as
2020-05-20 02:19:19 +00:00
select client, server, floor( client/server, 3) p, query_display_name
from query_time left join query_display_names using ( test, query_index)
where p > 1.02 order by p desc;
2020-02-10 16:34:07 +00:00
create table test_time engine Memory as
2020-02-11 15:01:16 +00:00
select test, sum( client) total_client_time,
2020-02-11 20:00:53 +00:00
maxIf( client, not short) query_max,
minIf( client, not short) query_min,
2020-05-20 02:19:19 +00:00
count( *) queries, sum( short) short_queries
from query_time full join queries using ( test, query_index)
2020-02-11 15:01:16 +00:00
group by test;
2020-02-07 18:34:24 +00:00
2020-04-28 07:45:35 +00:00
create table test_times_tsv engine File( TSV, 'report/test-times.tsv' ) as
2020-02-11 15:01:16 +00:00
select wall_clock.test, real,
floor( total_client_time, 3) ,
queries,
2020-02-11 20:00:53 +00:00
short_queries,
2020-02-11 15:01:16 +00:00
floor( query_max, 3) ,
floor( real / queries, 3) avg_real_per_query,
floor( query_min, 3)
2020-04-29 16:27:40 +00:00
from test_time
2020-05-20 02:19:19 +00:00
-- wall clock times are also measured for skipped tests, so don' t
-- do full join
left join wall_clock using test
2020-02-14 12:55:47 +00:00
order by avg_real_per_query desc;
2020-02-10 16:34:07 +00:00
2020-05-20 02:19:19 +00:00
-- report for all queries page, only main metric
2020-04-28 07:45:35 +00:00
create table all_tests_tsv engine File( TSV, 'report/all-queries.tsv' ) as
select changed_fail, unstable_fail,
left, right, diff,
2020-04-02 18:44:58 +00:00
floor( left > right ? left / right : right / left, 3) ,
2020-05-20 02:19:19 +00:00
stat_threshold, test, query_display_name
from queries order by test, query_display_name;
2020-04-28 07:45:35 +00:00
2020-05-20 02:19:19 +00:00
-- new report for all queries with all metrics ( no page yet)
create table all_query_metrics_tsv engine File( TSV, 'report/all-query-metrics.tsv' ) as
select metric_name, left, right, diff,
floor( left > right ? left / right : right / left, 3) ,
stat_threshold, test, query_index, query_display_name
from query_metric_stats
order by test, query_index;
" 2> >(tee -a report/errors.log 1>&2)
2020-03-24 17:33:18 +00:00
2020-05-20 02:19:19 +00:00
# Prepare source data for metrics and flamegraphs for unstable queries.
2020-03-24 17:33:18 +00:00
for version in { right,left}
2020-05-20 02:19:19 +00:00
do
rm -rf data
clickhouse-local --query "
2020-04-28 07:45:35 +00:00
create view queries_for_flamegraph as
select * from file( 'report/queries-for-flamegraph.tsv' , TSVWithNamesAndTypes,
2020-05-20 02:19:19 +00:00
'test text, query_index int' ) ;
2020-02-17 19:32:40 +00:00
2020-05-20 02:19:19 +00:00
create view query_runs as
with 0 as left, 1 as right
select * from file( 'analyze/query-runs.tsv' , TSV,
'test text, query_index int, query_id text, version UInt8, time float' )
where version = $version
;
2020-02-17 19:32:40 +00:00
2020-05-20 02:19:19 +00:00
create view query_display_names as select * from
file( 'analyze/query-display-names.tsv' , TSV,
'test text, query_index int, query_display_name text' )
;
2020-02-20 16:28:21 +00:00
2020-03-24 17:33:18 +00:00
create table unstable_query_runs engine File( TSVWithNamesAndTypes,
'unstable-query-runs.$version.rep' ) as
2020-05-20 02:19:19 +00:00
select test, query_index, query_display_name, query_id
from query_runs
join queries_for_flamegraph on
query_runs.test = queries_for_flamegraph.test
and query_runs.query_index = queries_for_flamegraph.query_index
left join query_display_names on
query_runs.test = query_display_names.test
and query_runs.query_index = query_display_names.query_index
2020-02-17 19:32:40 +00:00
;
2020-05-20 02:19:19 +00:00
create view query_log as select *
from file( '$version-query-log.tsv' , TSVWithNamesAndTypes,
'$(cat "$version-query-log.tsv.columns")' ) ;
2020-02-20 16:28:21 +00:00
2020-03-24 17:33:18 +00:00
create table unstable_run_metrics engine File( TSVWithNamesAndTypes,
'unstable-run-metrics.$version.rep' ) as
2020-05-20 02:19:19 +00:00
select
test, query_index, query_id,
ProfileEvents.Values value, ProfileEvents.Names metric
2020-03-24 17:33:18 +00:00
from query_log array join ProfileEvents
2020-05-20 02:19:19 +00:00
join unstable_query_runs using ( query_id)
2020-02-17 19:32:40 +00:00
;
2020-03-24 17:33:18 +00:00
create table unstable_run_metrics_2 engine File( TSVWithNamesAndTypes,
'unstable-run-metrics-2.$version.rep' ) as
2020-05-20 02:19:19 +00:00
select
test, query_index, query_id,
v, n
from (
select
test, query_index, query_id,
2020-03-17 02:41:47 +00:00
[ 'memory_usage' , 'read_bytes' , 'written_bytes' , 'query_duration_ms' ] n,
2020-05-20 02:19:19 +00:00
[ memory_usage, read_bytes, written_bytes, query_duration_ms] v
2020-03-24 17:33:18 +00:00
from query_log
2020-05-20 02:19:19 +00:00
join unstable_query_runs using ( query_id)
)
array join v, n;
create view trace_log as select *
from file( '$version-trace-log.tsv' , TSVWithNamesAndTypes,
'$(cat "$version-trace-log.tsv.columns")' ) ;
create view addresses_src as select *
from file( '$version-addresses.tsv' , TSVWithNamesAndTypes,
'$(cat "$version-addresses.tsv.columns")' ) ;
create table addresses_join_$version engine Join( any, left, address) as
select addr address, name from addresses_src;
2020-02-20 16:28:21 +00:00
2020-03-24 17:33:18 +00:00
create table unstable_run_traces engine File( TSVWithNamesAndTypes,
'unstable-run-traces.$version.rep' ) as
select
2020-05-20 02:19:19 +00:00
test, query_index, query_id,
2020-03-24 17:33:18 +00:00
count( ) value,
2020-05-20 02:19:19 +00:00
joinGet( addresses_join_$version , 'name' , arrayJoin( trace) ) metric
from trace_log
join unstable_query_runs using query_id
group by test, query_index, query_id, metric
2020-02-17 19:32:40 +00:00
order by count( ) desc
;
2020-03-24 17:33:18 +00:00
create table metric_devation engine File( TSVWithNamesAndTypes,
'metric-deviation.$version.rep' ) as
2020-05-20 02:19:19 +00:00
-- first goes the key used to split the file with grep
select test, query_index, query_display_name,
d, q, metric
from (
select
test, query_index,
floor( ( q[ 3] - q[ 1] ) /q[ 2] , 3) d,
quantilesExact( 0, 0.5, 1) ( value) q, metric
from ( select * from unstable_run_metrics
union all select * from unstable_run_traces
union all select * from unstable_run_metrics_2) mm
group by test, query_index, metric
having d > 0.5
) metrics
left join unstable_query_runs using ( test, query_index)
order by test, query_index, d desc
2020-02-17 19:32:40 +00:00
;
2020-02-20 16:28:21 +00:00
2020-03-24 17:33:18 +00:00
create table stacks engine File( TSV, 'stacks.$version.rep' ) as
2020-02-20 16:28:21 +00:00
select
2020-05-20 02:19:19 +00:00
-- first goes the key used to split the file with grep
test, query_index, any( query_display_name) ,
2020-05-21 23:03:41 +00:00
-- next go the stacks in flamegraph format: 'func1;...;funcN count'
2020-02-20 16:28:21 +00:00
arrayStringConcat(
2020-05-21 23:03:41 +00:00
arrayMap( addr -> replaceRegexpOne(
joinGet( addresses_join_$version , 'name' , addr) ,
-- This function is at the base of the stack, and its name changes
-- surprisingly often between builds, e.g. '__clone' or 'clone' or
-- even '__GI__clone' . This breaks differential flame graphs, so
-- filter it out here.
'^clone\\.S.*' , 'clone.S (name filtered by comparison script)'
) ,
2020-02-20 16:28:21 +00:00
arrayReverse( trace)
) ,
';'
) readable_trace,
2020-05-20 02:19:19 +00:00
count( ) c
2020-03-24 17:33:18 +00:00
from trace_log
2020-02-20 16:28:21 +00:00
join unstable_query_runs using query_id
2020-05-20 02:19:19 +00:00
group by test, query_index, trace
2020-05-21 23:03:41 +00:00
order by test, query_index, trace
2020-02-20 16:28:21 +00:00
;
2020-04-28 07:45:35 +00:00
" 2> >(tee -a report/errors.log 1>&2) # do not run in parallel because they use the same data dir for StorageJoins which leads to weird errors.
2020-03-24 17:33:18 +00:00
done
wait
2020-02-11 15:01:16 +00:00
2020-05-20 02:19:19 +00:00
# Create per-query flamegraphs and files with metrics
2020-02-20 16:28:21 +00:00
IFS = $'\n'
2020-03-24 17:33:18 +00:00
for version in { right,left}
2020-02-20 16:28:21 +00:00
do
2020-05-20 02:19:19 +00:00
for query in $( cut -d' ' -f1,2,3 " stacks. $version .rep " | sort | uniq)
2020-03-24 17:33:18 +00:00
do
2020-05-20 02:19:19 +00:00
query_file = $( echo " $query " | cut -c-120 | sed 's/[/ ]/_/g' )
2020-05-21 23:03:41 +00:00
echo " $query_file " >> report/query-files.txt
2020-04-21 18:46:45 +00:00
# Build separate .svg flamegraph for each query.
2020-05-21 23:03:41 +00:00
# -F is somewhat unsafe because it might match not the beginning of the
# string, but this is unlikely and escaping the query for grep is a pain.
2020-03-24 17:33:18 +00:00
grep -F " $query " " stacks. $version .rep " \
2020-05-21 23:03:41 +00:00
| cut -f 4- \
2020-03-24 17:33:18 +00:00
| sed 's/\t/ /g' \
| tee " $query_file .stacks. $version .rep " \
2020-05-21 23:03:41 +00:00
| ~/fg/flamegraph.pl --hash > " $query_file . $version .svg " &
2020-04-21 18:46:45 +00:00
# Copy metric stats into separate files as well.
2020-05-21 23:03:41 +00:00
# Ditto the above comment about -F.
2020-04-21 18:46:45 +00:00
grep -F " $query " " metric-deviation. $version .rep " \
2020-05-21 23:03:41 +00:00
| cut -f4- > " $query_file . $version .metrics.rep " &
2020-03-24 17:33:18 +00:00
done
2020-02-20 16:28:21 +00:00
done
wait
unset IFS
2020-05-21 23:03:41 +00:00
# Create differential flamegraphs.
IFS = $'\n'
for query_file in $( cat report/query-files.txt)
do
~/fg/difffolded.pl " $query_file .stacks.left.rep " " $query_file .stacks.right.rep " \
| tee " $query_file .stacks.diff.rep " \
| ~/fg/flamegraph.pl > " $query_file .diff.svg " &
done
unset IFS
wait
2020-02-11 15:01:16 +00:00
# Remember that grep sets error code when nothing is found, hence the bayan
2020-03-02 18:47:37 +00:00
# operator.
2020-04-22 19:41:40 +00:00
grep -H -m2 -i '\(Exception\|Error\):[^:]' ./*-err.log | sed 's/:/\t/' >> run-errors.tsv || :
2020-02-07 18:34:24 +00:00
}
2020-02-10 18:37:46 +00:00
2020-04-17 15:47:01 +00:00
# Check that local and client are in PATH
clickhouse-local --version > /dev/null
clickhouse-client --version > /dev/null
2020-02-10 18:37:46 +00:00
case " $stage " in
"" )
; &
2020-02-14 19:11:46 +00:00
"configure" )
time configure
; &
"restart" )
time restart
; &
"run_tests" )
2020-03-17 16:37:09 +00:00
# Ignore the errors to collect the log and build at least some report, anyway
2020-02-25 19:51:09 +00:00
time run_tests || :
2020-02-14 19:11:46 +00:00
; &
"get_profiles" )
2020-04-21 21:53:13 +00:00
# Getting profiles inexplicably hangs sometimes, so try to save some logs if
2020-04-23 20:18:46 +00:00
# this happens again. Give the servers some time to collect all info, then
2020-04-21 21:53:13 +00:00
# trace and kill. Start in a subshell, so that both function don't interfere
# with each other's jobs through `wait`. Also make the subshell have its own
# process group, so that we can then kill it with all its child processes.
# Somehow it doesn't kill the children by itself when dying.
set -m
( get_profiles_watchdog ) &
watchdog_pid = $!
set +m
# Check that the watchdog started OK.
kill -0 $watchdog_pid
2020-02-28 16:22:07 +00:00
# If the tests fail with OOM or something, still try to restart the servers
# to collect the logs. Prefer not to restart, because addresses might change
2020-04-21 21:53:13 +00:00
# and we won't be able to process trace_log data. Start in a subshell, so that
# it doesn't interfere with the watchdog through `wait`.
2020-05-19 14:51:51 +00:00
( get_profiles || restart || get_profiles || : )
2020-04-21 21:53:13 +00:00
# Kill the whole process group, because somehow when the subshell is killed,
# the sleep inside remains alive and orphaned.
while env kill -- -$watchdog_pid ; do sleep 1; done
2020-03-03 10:47:32 +00:00
# Stop the servers to free memory for the subsequent query analysis.
while killall clickhouse; do echo . ; sleep 1 ; done
echo Servers stopped.
2020-02-10 18:37:46 +00:00
; &
2020-02-27 19:43:43 +00:00
"analyze_queries" )
2020-04-28 14:26:49 +00:00
time analyze_queries || :
2020-02-27 19:43:43 +00:00
; &
2020-02-10 18:37:46 +00:00
"report" )
2020-04-30 08:36:33 +00:00
time report || :
2020-03-17 16:37:09 +00:00
2020-04-28 07:45:35 +00:00
time " $script_dir /report.py " --report= all-queries > all-queries.html 2> >( tee -a report/errors.log 1>& 2) || :
2020-03-17 16:37:09 +00:00
time " $script_dir /report.py " > report.html
2020-02-10 18:37:46 +00:00
; &
esac
2020-04-21 21:53:13 +00:00
# Print some final debug info to help debug Weirdness, of which there is plenty.
jobs
pstree -apgT