2019-12-26 17:35:41 +00:00
#!/bin/bash
2020-08-19 15:34:23 +00:00
set -exu
2019-12-26 17:35:41 +00:00
set -o pipefail
trap "exit" INT TERM
2021-03-15 18:45:57 +00:00
# The watchdog is in the separate process group, so we have to kill it separately
# if the script terminates earlier.
trap 'kill $(jobs -pr) ${watchdog_pid:-} ||:' EXIT
2019-12-26 17:35:41 +00:00
2020-02-27 20:02:50 +00:00
stage = ${ stage :- }
2020-03-24 17:33:18 +00:00
script_dir = " $( cd " $( dirname " ${ BASH_SOURCE [0] } " ) " >/dev/null 2>& 1 && pwd ) "
2020-12-08 18:18:48 +00:00
# upstream/master
LEFT_SERVER_PORT = 9001
# patched version
RIGHT_SERVER_PORT = 9002
2020-08-19 15:08:23 +00:00
function wait_for_server # port, pid
{
for _ in { 1..60}
do
if clickhouse-client --port " $1 " --query "select 1" || ! kill -0 " $2 "
then
break
fi
sleep 1
done
if ! clickhouse-client --port " $1 " --query "select 1"
then
echo " Cannot connect to ClickHouse server at $1 "
return 1
fi
if ! kill -0 " $2 "
then
echo " Server pid ' $2 ' is not running "
return 1
fi
}
2019-12-26 17:35:41 +00:00
2021-01-02 14:27:03 +00:00
function left_or_right( )
{
local from = $1 && shift
local basename = $1 && shift
if [ -e " $from / $basename " ] ; then
echo " $from / $basename "
return
fi
case " $from " in
left) echo " right/ $basename " ; ;
right) echo " left/ $basename " ; ;
esac
}
2019-12-26 17:35:41 +00:00
function configure
{
2020-02-25 19:51:09 +00:00
# Use the new config for both servers, so that we can change it in a PR.
2019-12-26 21:33:10 +00:00
rm right/config/config.d/text_log.xml || :
2020-02-25 19:51:09 +00:00
cp -rv right/config left || :
2020-01-14 19:05:58 +00:00
# Start a temporary server to rename the tables
2022-09-15 10:27:08 +00:00
while pkill -f clickhouse-serv ; do echo . ; sleep 1 ; done
2020-01-14 19:05:58 +00:00
echo all killed
set -m # Spawn temporary in its own process groups
2020-12-08 18:18:48 +00:00
local setup_left_server_opts = (
# server options
--config-file= left/config/config.xml
--
# server *config* directives overrides
--path db0
--user_files_path db0/user_files
2021-01-02 14:27:03 +00:00
--top_level_domains_path " $( left_or_right right top_level_domains) "
2020-12-08 18:18:48 +00:00
--tcp_port $LEFT_SERVER_PORT
)
left/clickhouse-server " ${ setup_left_server_opts [@] } " & > setup-server-log.log &
2020-01-14 19:05:58 +00:00
left_pid = $!
kill -0 $left_pid
disown $left_pid
set +m
2020-08-19 15:08:23 +00:00
2020-12-08 18:18:48 +00:00
wait_for_server $LEFT_SERVER_PORT $left_pid
2020-08-19 15:08:23 +00:00
echo Server for setup started
2020-01-14 19:05:58 +00:00
2020-12-08 18:18:48 +00:00
clickhouse-client --port $LEFT_SERVER_PORT --query "create database test" || :
clickhouse-client --port $LEFT_SERVER_PORT --query "rename table datasets.hits_v1 to test.hits" || :
2020-02-14 12:55:47 +00:00
2022-09-15 10:27:08 +00:00
while pkill -f clickhouse-serv ; do echo . ; sleep 1 ; done
2020-02-14 12:55:47 +00:00
echo all killed
2020-01-14 19:05:58 +00:00
# Make copies of the original db for both servers. Use hardlinks instead
2020-06-25 20:19:27 +00:00
# of copying to save space. Before that, remove preprocessed configs and
# system tables, because sharing them between servers with hardlinks may
2020-10-27 11:04:03 +00:00
# lead to weird effects.
2020-01-14 19:05:58 +00:00
rm -r left/db || :
rm -r right/db || :
2020-04-13 19:06:07 +00:00
rm -r db0/preprocessed_configs || :
2020-06-25 20:19:27 +00:00
rm -r db0/{ data,metadata} /system || :
2021-02-21 07:56:48 +00:00
rm db0/status || :
2020-01-14 19:05:58 +00:00
cp -al db0/ left/db/
cp -al db0/ right/db/
2020-02-25 19:51:09 +00:00
}
function restart
{
2022-09-15 10:27:08 +00:00
while pkill -f clickhouse-serv ; do echo . ; sleep 1 ; done
2020-02-25 19:51:09 +00:00
echo all killed
2020-01-14 19:05:58 +00:00
2020-10-26 16:35:37 +00:00
# Change the jemalloc settings here.
2020-09-25 10:19:37 +00:00
# https://github.com/jemalloc/jemalloc/wiki/Getting-Started
2020-10-26 16:35:37 +00:00
export MALLOC_CONF = "confirm_conf:true"
2019-12-26 17:35:41 +00:00
2020-09-25 10:19:37 +00:00
set -m # Spawn servers in their own process groups
2020-10-27 11:04:03 +00:00
2020-12-08 18:18:48 +00:00
local left_server_opts = (
# server options
--config-file= left/config/config.xml
--
# server *config* directives overrides
--path left/db
--user_files_path left/db/user_files
2021-01-02 14:27:03 +00:00
--top_level_domains_path " $( left_or_right left top_level_domains) "
2020-12-08 18:18:48 +00:00
--tcp_port $LEFT_SERVER_PORT
)
left/clickhouse-server " ${ left_server_opts [@] } " & >> left-server-log.log &
2019-12-26 17:35:41 +00:00
left_pid = $!
kill -0 $left_pid
disown $left_pid
2020-12-08 18:18:48 +00:00
local right_server_opts = (
# server options
--config-file= right/config/config.xml
--
# server *config* directives overrides
--path right/db
--user_files_path right/db/user_files
2021-01-02 14:27:03 +00:00
--top_level_domains_path " $( left_or_right right top_level_domains) "
2020-12-08 18:18:48 +00:00
--tcp_port $RIGHT_SERVER_PORT
)
right/clickhouse-server " ${ right_server_opts [@] } " & >> right-server-log.log &
2019-12-26 17:35:41 +00:00
right_pid = $!
kill -0 $right_pid
disown $right_pid
set +m
2020-09-25 10:19:37 +00:00
unset MALLOC_CONF
2020-12-08 18:18:48 +00:00
wait_for_server $LEFT_SERVER_PORT $left_pid
2019-12-26 17:35:41 +00:00
echo left ok
2020-08-19 15:08:23 +00:00
2020-12-08 18:18:48 +00:00
wait_for_server $RIGHT_SERVER_PORT $right_pid
2019-12-26 17:35:41 +00:00
echo right ok
2019-12-26 21:33:10 +00:00
2020-12-08 18:18:48 +00:00
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.tables where database != 'system'"
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.build_options"
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.tables where database != 'system'"
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.build_options"
2020-04-17 15:47:01 +00:00
# Check again that both servers we started are running -- this is important
# for running locally, when there might be some other servers started and we
# will connect to them instead.
kill -0 $left_pid
kill -0 $right_pid
2019-12-26 17:35:41 +00:00
}
2019-12-26 19:16:36 +00:00
function run_tests
{
# Just check that the script runs at all
" $script_dir /perf.py " --help > /dev/null
2020-04-30 08:36:33 +00:00
# Find the directory with test files.
if [ -v CHPC_TEST_PATH ]
2020-02-14 12:55:47 +00:00
then
2020-04-30 08:36:33 +00:00
# Use the explicitly set path to directory with test files.
test_prefix = " $CHPC_TEST_PATH "
2020-09-10 16:57:26 +00:00
elif [ " $PR_TO_TEST " = = "0" ]
2020-04-30 08:36:33 +00:00
then
# When testing commits from master, use the older test files. This
# allows the tests to pass even when we add new functions and tests for
# them, that are not supported in the old revision.
test_prefix = left/performance
2020-06-05 14:30:26 +00:00
else
2020-04-30 08:36:33 +00:00
# For PRs, use newer test files so we can test these changes.
test_prefix = right/performance
2020-02-14 12:55:47 +00:00
fi
2020-04-30 08:36:33 +00:00
# Determine which tests to run.
2020-04-23 20:18:46 +00:00
if [ -v CHPC_TEST_GREP ]
2020-02-14 12:55:47 +00:00
then
2020-04-30 08:36:33 +00:00
# Run only explicitly specified tests, if any.
2020-08-19 15:31:13 +00:00
# shellcheck disable=SC2010
2023-01-01 19:53:06 +00:00
test_files = ( $( ls " $test_prefix " | rg " $CHPC_TEST_GREP " | xargs -I{ } -n1 readlink -f " $test_prefix /{} " ) )
2020-09-18 13:27:50 +00:00
elif [ " $PR_TO_TEST " -ne 0 ] \
&& [ " $( wc -l < changed-test-definitions.txt) " -gt 0 ] \
&& [ " $( wc -l < other-changed-files.txt) " -eq 0 ]
2020-04-02 18:44:58 +00:00
then
2020-09-18 13:27:50 +00:00
# If only the perf tests were changed in the PR, we will run only these
# tests. The lists of changed files are prepared in entrypoint.sh because
# it has the repository.
2021-12-13 09:58:34 +00:00
test_files = ( $( sed " s/tests\/performance/ ${ test_prefix // \/ / \\ / } / " changed-test-definitions.txt) )
2020-04-30 08:36:33 +00:00
else
# The default -- run all tests found in the test dir.
2021-12-13 09:58:34 +00:00
test_files = ( $( ls " $test_prefix " /*.xml) )
2020-04-02 18:44:58 +00:00
fi
2022-06-02 12:29:24 +00:00
# We can filter out certain tests
if [ -v CHPC_TEST_GREP_EXCLUDE ] ; then
# filter tests array in bash https://stackoverflow.com/a/40375567
2023-01-01 19:53:06 +00:00
filtered_test_files = ( $( for i in ${ test_files [@] } ; do echo $i ; done | rg -v ${ CHPC_TEST_GREP_EXCLUDE } ) )
2022-06-02 12:29:24 +00:00
test_files = ( " ${ filtered_test_files [@] } " )
fi
2021-12-13 08:56:54 +00:00
# We split perf tests into multiple checks to make them faster
if [ -v CHPC_TEST_RUN_BY_HASH_TOTAL ] ; then
# filter tests array in bash https://stackoverflow.com/a/40375567
for index in " ${ !test_files[@] } " ; do
2022-06-27 13:19:13 +00:00
[ $(( index % CHPC_TEST_RUN_BY_HASH_TOTAL )) != " $CHPC_TEST_RUN_BY_HASH_NUM " ] && \
2021-12-13 08:56:54 +00:00
unset -v 'test_files[$index]'
done
# to have sequential indexes...
test_files = ( " ${ test_files [@] } " )
fi
2021-10-27 19:26:10 +00:00
# For PRs w/o changes in test definitons, test only a subset of queries,
# and run them less times. If the corresponding environment variables are
# already set, keep those values.
#
# NOTE: too high CHPC_RUNS/CHPC_MAX_QUERIES may hit internal CI timeout.
2021-11-24 20:05:27 +00:00
# NOTE: Currently we disabled complete run even for master branch
#if [ "$PR_TO_TEST" -ne 0 ] && [ "$(wc -l < changed-test-definitions.txt)" -eq 0 ]
#then
# CHPC_RUNS=${CHPC_RUNS:-7}
# CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-10}
#else
# CHPC_RUNS=${CHPC_RUNS:-13}
# CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-0}
#fi
CHPC_RUNS = ${ CHPC_RUNS :- 7 }
CHPC_MAX_QUERIES = ${ CHPC_MAX_QUERIES :- 10 }
2020-09-15 10:44:21 +00:00
export CHPC_RUNS
2020-09-10 16:57:26 +00:00
export CHPC_MAX_QUERIES
2020-06-23 12:30:45 +00:00
# Determine which concurrent benchmarks to run. For now, the only test
# we run as a concurrent benchmark is 'website'. Run it as benchmark if we
# are also going to run it as a normal test.
2021-12-13 09:58:34 +00:00
for test in ${ test_files [@] } ; do echo " $test " ; done | sed -n '/website/p' > benchmarks-to-run.txt
2020-06-23 12:30:45 +00:00
2020-04-30 08:36:33 +00:00
# Delete old report files.
2020-05-20 02:19:19 +00:00
for x in { test-times,wall-clock-times} .tsv
2020-04-30 08:36:33 +00:00
do
rm -v " $x " || :
touch " $x "
done
2021-12-13 09:58:34 +00:00
# Randomize test order. BTW, it's not an array no more.
test_files = $( for f in ${ test_files [@] } ; do echo " $f " ; done | sort -R)
2020-06-25 20:19:27 +00:00
2020-09-25 09:52:09 +00:00
# Limit profiling time to 10 minutes, not to run for too long.
profile_seconds_left = 600
2020-02-25 19:51:09 +00:00
# Run the tests.
2021-04-23 17:32:30 +00:00
total_tests = $( echo " $test_files " | wc -w)
2021-04-23 13:47:13 +00:00
current_test = 0
2020-02-25 19:51:09 +00:00
test_name = "<none>"
2020-02-14 12:55:47 +00:00
for test in $test_files
2019-12-26 19:16:36 +00:00
do
2021-04-23 19:01:24 +00:00
echo " $current_test of $total_tests tests complete " > status.txt
2020-06-25 20:19:27 +00:00
# Check that both servers are alive, and restart them if they die.
2020-12-08 18:18:48 +00:00
clickhouse-client --port $LEFT_SERVER_PORT --query "select 1 format Null" \
2020-06-25 20:19:27 +00:00
|| { echo $test_name >> left-server-died.log ; restart ; }
2020-12-08 18:18:48 +00:00
clickhouse-client --port $RIGHT_SERVER_PORT --query "select 1 format Null" \
2020-06-25 20:19:27 +00:00
|| { echo $test_name >> right-server-died.log ; restart ; }
2020-02-25 19:51:09 +00:00
2020-02-27 20:02:50 +00:00
test_name = $( basename " $test " ".xml" )
echo test " $test_name "
2020-02-14 19:11:46 +00:00
2020-09-25 09:52:09 +00:00
# Don't profile if we're past the time limit.
2020-10-27 11:04:03 +00:00
# Use awk because bash doesn't support floating point arithmetic.
2020-09-29 11:07:31 +00:00
profile_seconds = $( awk " BEGIN { print ( $profile_seconds_left > 0 ? 10 : 0) } " )
2020-09-25 09:52:09 +00:00
2023-01-01 19:53:06 +00:00
if [ " $( rg -c $( basename $test ) changed-test-definitions.txt) " -gt 0 ]
2022-08-19 17:46:07 +00:00
then
# Run all queries from changed test files to ensure that all new queries will be tested.
max_queries = 0
else
max_queries = $CHPC_MAX_QUERIES
fi
2021-12-11 21:08:26 +00:00
(
set +x
argv = (
--host localhost localhost
--port " $LEFT_SERVER_PORT " " $RIGHT_SERVER_PORT "
--runs " $CHPC_RUNS "
2022-08-19 17:46:07 +00:00
--max-queries " $max_queries "
2021-12-11 21:08:26 +00:00
--profile-seconds " $profile_seconds "
" $test "
)
TIMEFORMAT = $( printf " $test_name \t%%3R\t%%3U\t%%3S\n " )
# one more subshell to suppress trace output for "set +x"
(
time " $script_dir /perf.py " " ${ argv [@] } " > " $test_name -raw.tsv " 2> " $test_name -err.log "
) 2>>wall-clock-times.tsv >/dev/null \
|| echo " Test $test_name failed with error code $? " >> " $test_name -err.log "
) 2>/dev/null
2020-09-25 09:52:09 +00:00
profile_seconds_left = $( awk -F' ' \
2020-09-28 11:44:49 +00:00
'BEGIN { s = ' $profile_seconds_left '; } /^profile-total/ { s -= $2 } END { print s }' \
2020-09-25 09:52:09 +00:00
" $test_name -raw.tsv " )
2021-04-23 13:47:13 +00:00
current_test = $(( current_test + 1 ))
2019-12-26 19:16:36 +00:00
done
2020-02-14 12:55:47 +00:00
wait
2020-02-14 19:11:46 +00:00
}
2020-02-14 12:55:47 +00:00
2020-04-21 21:53:13 +00:00
function get_profiles_watchdog
{
2020-09-01 23:43:23 +00:00
sleep 600
2020-04-21 21:53:13 +00:00
2020-04-22 19:41:40 +00:00
echo "The trace collection did not finish in time." >> profile-errors.log
2020-04-21 21:53:13 +00:00
for pid in $( pgrep -f clickhouse)
do
2021-12-09 21:12:45 +00:00
sudo gdb -p " $pid " --batch --ex "info proc all" --ex "thread apply all bt" --ex quit & > " $pid .gdb.log " &
2020-04-21 21:53:13 +00:00
done
wait
2020-04-29 09:47:17 +00:00
for _ in { 1..10}
2020-04-21 21:53:13 +00:00
do
if ! pkill -f clickhouse
then
break
fi
sleep 1
done
}
2020-02-14 19:11:46 +00:00
function get_profiles
{
2020-02-14 12:55:47 +00:00
# Collect the profiles
2020-12-08 18:18:48 +00:00
clickhouse-client --port $LEFT_SERVER_PORT --query "system flush logs" &
clickhouse-client --port $RIGHT_SERVER_PORT --query "system flush logs" &
2020-06-25 20:19:27 +00:00
wait
2020-04-17 15:47:01 +00:00
2021-10-24 22:28:44 +00:00
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.query_log where type in ('QueryFinish', 'ExceptionWhileProcessing') format TSVWithNamesAndTypes" > left-query-log.tsv || : &
2020-12-08 18:18:48 +00:00
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.query_thread_log format TSVWithNamesAndTypes" > left-query-thread-log.tsv || : &
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.trace_log format TSVWithNamesAndTypes" > left-trace-log.tsv || : &
clickhouse-client --port $LEFT_SERVER_PORT --query "select arrayJoin(trace) addr, concat(splitByChar('/', addressToLine(addr))[-1], '#', demangle(addressToSymbol(addr)) ) name from system.trace_log group by addr format TSVWithNamesAndTypes" > left-addresses.tsv || : &
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.metric_log format TSVWithNamesAndTypes" > left-metric-log.tsv || : &
clickhouse-client --port $LEFT_SERVER_PORT --query "select * from system.asynchronous_metric_log format TSVWithNamesAndTypes" > left-async-metric-log.tsv || : &
2020-04-17 15:47:01 +00:00
2021-10-24 22:28:44 +00:00
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.query_log where type in ('QueryFinish', 'ExceptionWhileProcessing') format TSVWithNamesAndTypes" > right-query-log.tsv || : &
2020-12-08 18:18:48 +00:00
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.query_thread_log format TSVWithNamesAndTypes" > right-query-thread-log.tsv || : &
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.trace_log format TSVWithNamesAndTypes" > right-trace-log.tsv || : &
clickhouse-client --port $RIGHT_SERVER_PORT --query "select arrayJoin(trace) addr, concat(splitByChar('/', addressToLine(addr))[-1], '#', demangle(addressToSymbol(addr)) ) name from system.trace_log group by addr format TSVWithNamesAndTypes" > right-addresses.tsv || : &
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.metric_log format TSVWithNamesAndTypes" > right-metric-log.tsv || : &
clickhouse-client --port $RIGHT_SERVER_PORT --query "select * from system.asynchronous_metric_log format TSVWithNamesAndTypes" > right-async-metric-log.tsv || : &
2020-02-14 12:55:47 +00:00
wait
2020-04-24 07:19:23 +00:00
# Just check that the servers are alive so that we return a proper exit code.
# We don't consistently check the return codes of the above background jobs.
2020-12-08 18:18:48 +00:00
clickhouse-client --port $LEFT_SERVER_PORT --query "select 1"
clickhouse-client --port $RIGHT_SERVER_PORT --query "select 1"
2019-12-26 19:16:36 +00:00
}
2019-12-26 17:35:41 +00:00
2020-05-21 23:03:41 +00:00
# Build and analyze randomization distribution for all queries.
function analyze_queries
{
rm -v analyze-commands.txt analyze-errors.log all-queries.tsv unstable-queries.tsv ./*-report.tsv raw-queries.tsv || :
2020-05-25 01:42:56 +00:00
rm -rf analyze || :
mkdir analyze analyze/tmp || :
2020-05-21 23:03:41 +00:00
2020-04-29 17:26:28 +00:00
# Split the raw test output into files suitable for analysis.
2021-03-02 16:21:30 +00:00
# To debug calculations only for a particular test, substitute a suitable
# wildcard here, e.g. `for test_file in modulo-raw.tsv`.
2020-08-19 15:31:13 +00:00
for test_file in *-raw.tsv
2020-04-29 17:26:28 +00:00
do
test_name = $( basename " $test_file " "-raw.tsv" )
2020-05-20 02:19:19 +00:00
sed -n " s/^query\t/ $test_name \t/p " < " $test_file " >> "analyze/query-runs.tsv"
2020-09-25 09:52:09 +00:00
sed -n " s/^profile\t/ $test_name \t/p " < " $test_file " >> "analyze/query-profiles.tsv"
2020-06-27 00:45:00 +00:00
sed -n " s/^client-time\t/ $test_name \t/p " < " $test_file " >> "analyze/client-times.tsv"
sed -n " s/^report-threshold\t/ $test_name \t/p " < " $test_file " >> "analyze/report-thresholds.tsv"
sed -n " s/^skipped\t/ $test_name \t/p " < " $test_file " >> "analyze/skipped-tests.tsv"
sed -n " s/^display-name\t/ $test_name \t/p " < " $test_file " >> "analyze/query-display-names.tsv"
sed -n " s/^partial\t/ $test_name \t/p " < " $test_file " >> "analyze/partial-queries.tsv"
2020-04-29 17:26:28 +00:00
done
2020-04-27 12:47:59 +00:00
2020-05-20 02:19:19 +00:00
# for each query run, prepare array of metrics from query log
2021-06-28 11:42:21 +00:00
clickhouse-local --query "
2020-05-20 02:19:19 +00:00
create view query_runs as select * from file( 'analyze/query-runs.tsv' , TSV,
'test text, query_index int, query_id text, version UInt8, time float' ) ;
2020-07-03 08:39:43 +00:00
-- Separately process 'partial' queries which we could only run on the new server
-- because they use new functions. We can' t make normal stats for them, but still
-- have to show some stats so that the PR author can tweak them.
2020-06-27 00:45:00 +00:00
create view partial_queries as select test, query_index
from file( 'analyze/partial-queries.tsv' , TSV,
'test text, query_index int, servers Array(int)' ) ;
create table partial_query_times engine File( TSVWithNamesAndTypes,
'analyze/partial-query-times.tsv' )
as select test, query_index, stddevPop( time ) time_stddev, median( time ) time_median
from query_runs
where ( test, query_index) in partial_queries
group by test, query_index
;
2020-07-03 08:39:43 +00:00
-- Process queries that were run normally, on both servers.
2020-05-20 02:19:19 +00:00
create view left_query_log as select *
2022-04-17 06:01:15 +00:00
from file( 'left-query-log.tsv' , TSVWithNamesAndTypes) ;
2020-05-20 02:19:19 +00:00
create view right_query_log as select *
2022-04-17 06:01:15 +00:00
from file( 'right-query-log.tsv' , TSVWithNamesAndTypes) ;
2020-05-20 02:19:19 +00:00
2020-06-25 20:19:27 +00:00
create view query_logs as
2021-06-29 18:51:21 +00:00
select 0 version, query_id, ProfileEvents,
2020-08-11 11:53:25 +00:00
query_duration_ms, memory_usage from left_query_log
2020-06-25 20:19:27 +00:00
union all
2021-06-29 18:51:21 +00:00
select 1 version, query_id, ProfileEvents,
2020-08-11 11:53:25 +00:00
query_duration_ms, memory_usage from right_query_log
2020-06-25 20:19:27 +00:00
;
2020-07-09 14:45:24 +00:00
-- This is a single source of truth on all metrics we have for query runs. The
-- metrics include ProfileEvents from system.query_log, and query run times
-- reported by the perf.py test runner.
create table query_run_metric_arrays engine File( TSV, 'analyze/query-run-metric-arrays.tsv' )
2020-06-25 20:19:27 +00:00
as
with (
-- sumMapState with the list of all keys with '-0.' values. Negative zero is because
-- sumMap removes keys with positive zeros.
2021-06-29 18:51:21 +00:00
with ( select groupUniqArrayArray( mapKeys( ProfileEvents) ) from query_logs) as all_names
2020-06-25 20:19:27 +00:00
select arrayReduce( 'sumMapState' , [ ( all_names, arrayMap( x->-0., all_names) ) ] )
) as all_metrics
select test, query_index, version, query_id,
( finalizeAggregation(
arrayReduce( 'sumMapMergeState' ,
[
all_metrics,
arrayReduce( 'sumMapState' ,
2021-06-29 18:51:21 +00:00
[ ( mapKeys( ProfileEvents) ,
arrayMap( x->toFloat64( x) , mapValues( ProfileEvents) ) ) ]
2020-06-25 20:19:27 +00:00
) ,
arrayReduce( 'sumMapState' , [ (
2020-08-11 11:53:25 +00:00
[ 'client_time' , 'server_time' , 'memory_usage' ] ,
2020-06-25 20:19:27 +00:00
arrayMap( x->if( x != 0., x, -0.) , [
toFloat64( query_runs.time) ,
2020-08-11 11:53:25 +00:00
toFloat64( query_duration_ms / 1000.) ,
toFloat64( memory_usage) ] ) ) ] )
2020-06-25 20:19:27 +00:00
]
) ) as metrics_tuple) .1 metric_names,
metrics_tuple.2 metric_values
from query_logs
2020-05-20 02:19:19 +00:00
right join query_runs
2020-06-25 20:19:27 +00:00
on query_logs.query_id = query_runs.query_id
and query_logs.version = query_runs.version
2020-06-27 00:45:00 +00:00
where ( test, query_index) not in partial_queries
2020-06-25 20:19:27 +00:00
;
2020-07-09 14:45:24 +00:00
-- This is just for convenience -- human-readable + easy to make plots.
create table query_run_metrics_denorm engine File( TSV, 'analyze/query-run-metrics-denorm.tsv' )
as select test, query_index, metric_names, version, query_id, metric_values
from query_run_metric_arrays
array join metric_names, metric_values
order by test, query_index, metric_names, version, query_id
;
2020-09-14 07:59:45 +00:00
-- Filter out tests that don' t have an even number of runs, to avoid breaking
-- the further calculations. This may happen if there was an error during the
-- test runs, e.g. the server died. It will be reported in test errors, so we
-- don' t have to report it again.
create view broken_queries as
select test, query_index
from query_runs
group by test, query_index
having count( *) % 2 != 0
;
2020-07-09 14:45:24 +00:00
-- This is for statistical processing with eqmed.sql
create table query_run_metrics_for_stats engine File(
2020-06-25 20:19:27 +00:00
TSV, -- do not add header -- will parse with grep
2020-07-09 14:45:24 +00:00
'analyze/query-run-metrics-for-stats.tsv' )
2021-03-02 16:21:30 +00:00
as select test, query_index, 0 run, version,
-- For debugging, add a filter for a particular metric like this:
-- arrayFilter( m, n -> n = 'client_time' , metric_values, metric_names)
-- metric_values
-- Note that further reporting may break, because the metric names are
-- not filtered.
metric_values
2020-07-09 14:45:24 +00:00
from query_run_metric_arrays
2020-09-14 07:59:45 +00:00
where ( test, query_index) not in broken_queries
2020-06-25 20:19:27 +00:00
order by test, query_index, run, version
;
2020-07-09 14:45:24 +00:00
-- This is the list of metric names, so that we can join them back after
-- statistical processing.
2020-06-25 20:19:27 +00:00
create table query_run_metric_names engine File( TSV, 'analyze/query-run-metric-names.tsv' )
2020-07-09 14:45:24 +00:00
as select metric_names from query_run_metric_arrays limit 1
2020-05-20 02:19:19 +00:00
;
2020-07-09 14:45:24 +00:00
" 2> >(tee -a analyze/errors.log 1>&2)
2020-05-20 02:19:19 +00:00
2020-04-27 12:47:59 +00:00
# This is a lateral join in bash... please forgive me.
2020-05-20 02:19:19 +00:00
# We don't have arrayPermute(), so I have to make random permutations with
2020-04-27 12:47:59 +00:00
# `order by rand`, and it becomes really slow if I do it for more than one
# query. We also don't have lateral joins. So I just put all runs of each
# query into a separate file, and then compute randomization distribution
# for each file. I do this in parallel using GNU parallel.
2020-06-27 00:45:00 +00:00
( set +x # do not bloat the log
2020-04-27 12:47:59 +00:00
IFS = $'\n'
2020-07-09 14:45:24 +00:00
for prefix in $( cut -f1,2 "analyze/query-run-metrics-for-stats.tsv" | sort | uniq)
2020-04-27 12:47:59 +00:00
do
2020-08-19 15:31:13 +00:00
file = " analyze/tmp/ ${ prefix // /_ } .tsv "
2023-01-01 19:53:06 +00:00
rg " ^ $prefix " "analyze/query-run-metrics-for-stats.tsv" > " $file " &
2020-05-20 02:19:19 +00:00
printf "%s\0\n" \
2021-06-28 11:42:21 +00:00
" clickhouse-local \
2020-05-20 02:19:19 +00:00
--file \" $file \" \
--structure 'test text, query text, run int, version UInt8, metrics Array(float)' \
--query \" $( cat " $script_dir /eqmed.sql " ) \" \
2020-07-09 14:45:24 +00:00
>> \" analyze/query-metric-stats.tsv\" " \
2020-05-20 02:19:19 +00:00
2>> analyze/errors.log \
>> analyze/commands.txt
2020-04-27 12:47:59 +00:00
done
wait
unset IFS
2020-06-27 00:45:00 +00:00
)
2020-04-27 12:47:59 +00:00
2020-10-01 10:56:56 +00:00
# The comparison script might be bound to one NUMA node for better test
# stability, and the calculation runs out of memory because of this. Use
# all nodes.
2020-10-16 14:44:59 +00:00
numactl --show
2020-10-19 14:31:02 +00:00
numactl --cpunodebind= all --membind= all numactl --show
2021-11-23 08:34:10 +00:00
# Use less jobs to avoid OOM. Some queries can consume 8+ GB of memory.
jobs_count = $(( $( grep -c ^processor /proc/cpuinfo) / 3 ))
numactl --cpunodebind= all --membind= all parallel --jobs $jobs_count --joblog analyze/parallel-log.txt --null < analyze/commands.txt 2>> analyze/errors.log
2020-07-09 14:45:24 +00:00
2021-06-28 11:42:21 +00:00
clickhouse-local --query "
2020-07-09 14:45:24 +00:00
-- Join the metric names back to the metric statistics we' ve calculated, and make
-- a denormalized table of them -- statistics for all metrics for all queries.
-- The WITH, ARRAY JOIN and CROSS JOIN do not like each other:
-- https://github.com/ClickHouse/ClickHouse/issues/11868
-- https://github.com/ClickHouse/ClickHouse/issues/11757
-- Because of this, we make a view with arrays first, and then apply all the
-- array joins.
create view query_metric_stat_arrays as
with ( select * from file( 'analyze/query-run-metric-names.tsv' ,
TSV, 'n Array(String)' ) ) as metric_name
select test, query_index, metric_name, left, right, diff, stat_threshold
from file( 'analyze/query-metric-stats.tsv' , TSV, ' left Array( float) ,
right Array( float) , diff Array( float) , stat_threshold Array( float) ,
test text, query_index int' ) reports
order by test, query_index, metric_name
;
create table query_metric_stats_denorm engine File( TSVWithNamesAndTypes,
'analyze/query-metric-stats-denorm.tsv' )
as select test, query_index, metric_name, left, right, diff, stat_threshold
from query_metric_stat_arrays
left array join metric_name, left, right, diff, stat_threshold
order by test, query_index, metric_name
;
" 2> >(tee -a analyze/errors.log 1>&2)
2021-05-26 13:30:43 +00:00
# Fetch historical query variability thresholds from the CI database
2022-06-07 10:00:31 +00:00
if [ -v CLICKHOUSE_PERFORMANCE_COMPARISON_DATABASE_URL ]
2021-05-26 13:30:43 +00:00
then
set +x # Don't show password in the log
client = ( clickhouse-client
# Surprisingly, clickhouse-client doesn't understand --host 127.0.0.1:9000
# so I have to extract host and port with clickhouse-local. I tried to use
# Poco URI parser to support this in the client, but it's broken and can't
# parse host:port.
2022-06-07 10:00:31 +00:00
$( clickhouse-local --query " with ' ${ CLICKHOUSE_PERFORMANCE_COMPARISON_DATABASE_URL } ' as url select '--host ' || domain(url) || ' --port ' || toString(port(url)) format TSV " )
2021-05-26 13:30:43 +00:00
--secure
2022-06-07 10:00:31 +00:00
--user " ${ CLICKHOUSE_PERFORMANCE_COMPARISON_DATABASE_USER } "
--password " ${ CLICKHOUSE_PERFORMANCE_COMPARISON_DATABASE_USER_PASSWORD } "
2021-05-26 13:30:43 +00:00
--config "right/config/client_config.xml"
--date_time_input_format= best_effort)
2021-06-03 17:40:05 +00:00
# Precision is going to be 1.5 times worse for PRs, because we run the queries
# less times. How do I know it? I ran this:
2021-05-26 13:30:43 +00:00
# SELECT quantilesExact(0., 0.1, 0.5, 0.75, 0.95, 1.)(p / m)
# FROM
# (
# SELECT
# quantileIf(0.95)(stat_threshold, pr_number = 0) AS m,
# quantileIf(0.95)(stat_threshold, (pr_number != 0) AND (abs(diff) < stat_threshold)) AS p
# FROM query_metrics_v2
# WHERE (event_date > (today() - toIntervalMonth(1))) AND (metric = 'client_time')
# GROUP BY
# test,
# query_index,
# query_display_name
# HAVING count(*) > 100
# )
2021-06-03 17:40:05 +00:00
#
# The file can be empty if the server is inaccessible, so we can't use
# TSVWithNamesAndTypes.
#
2021-05-26 13:30:43 +00:00
" ${ client [@] } " --query "
select test, query_index,
2021-06-06 09:57:18 +00:00
quantileExact( 0.99) ( abs( diff) ) * 1.5 AS max_diff,
quantileExactIf( 0.99) ( stat_threshold, abs( diff) < stat_threshold) * 1.5 AS max_stat_threshold,
2021-05-26 13:30:43 +00:00
query_display_name
from query_metrics_v2
2021-06-03 17:40:05 +00:00
-- We use results at least one week in the past, so that the current
-- changes do not immediately influence the statistics, and we have
-- some time to notice that something is wrong.
where event_date between now( ) - interval 1 month - interval 1 week
and now( ) - interval 1 week
2021-05-26 13:30:43 +00:00
and metric = 'client_time'
and pr_number = 0
group by test, query_index, query_display_name
having count( *) > 100
" > analyze/historical-thresholds.tsv
2021-06-04 15:27:21 +00:00
set -x
2021-05-26 13:30:43 +00:00
else
touch analyze/historical-thresholds.tsv
fi
2020-03-02 18:47:37 +00:00
}
2019-12-26 19:16:36 +00:00
# Analyze results
2020-02-07 18:34:24 +00:00
function report
{
2020-04-28 07:45:35 +00:00
rm -r report || :
2020-05-25 01:42:56 +00:00
mkdir report report/tmp || :
2020-04-28 07:45:35 +00:00
2020-05-28 07:45:03 +00:00
rm ./*.{ rep,svg} test-times.tsv test-dump.tsv unstable.tsv unstable-query-ids.tsv unstable-query-metrics.tsv changed-perf.tsv unstable-tests.tsv unstable-queries.tsv bad-tests.tsv slow-on-client.tsv all-queries.tsv run-errors.tsv || :
2020-02-17 19:32:40 +00:00
2020-05-20 02:19:19 +00:00
cat analyze/errors.log >> report/errors.log || :
2020-04-28 07:45:35 +00:00
cat profile-errors.log >> report/errors.log || :
2020-04-22 19:41:40 +00:00
2021-06-28 11:42:21 +00:00
clickhouse-local --query "
2020-05-20 02:19:19 +00:00
create view query_display_names as select * from
file( 'analyze/query-display-names.tsv' , TSV,
'test text, query_index int, query_display_name text' )
;
2020-07-03 08:39:43 +00:00
create view partial_query_times as select * from
file( 'analyze/partial-query-times.tsv' , TSVWithNamesAndTypes,
2021-11-01 09:13:09 +00:00
'test text, query_index int, time_stddev float, time_median double' )
2020-07-03 08:39:43 +00:00
;
2020-07-09 14:45:24 +00:00
-- Report for partial queries that we could only run on the new server ( e.g.
-- queries with new functions added in the tested PR) .
2020-06-27 00:45:00 +00:00
create table partial_queries_report engine File( TSV, 'report/partial-queries-report.tsv' )
2021-08-19 18:43:43 +00:00
settings output_format_decimal_trailing_zeros = 1
2020-07-31 19:58:18 +00:00
as select toDecimal64( time_median, 3) time,
toDecimal64( time_stddev / time_median, 3) relative_time_stddev,
2020-06-27 00:45:00 +00:00
test, query_index, query_display_name
2020-07-03 08:39:43 +00:00
from partial_query_times
2020-06-27 00:45:00 +00:00
join query_display_names using ( test, query_index)
order by test, query_index
;
2020-07-09 14:45:24 +00:00
create view query_metric_stats as
select * from file( 'analyze/query-metric-stats-denorm.tsv' ,
TSVWithNamesAndTypes,
' test text, query_index int, metric_name text, left float, right float,
diff float, stat_threshold float' )
2020-06-25 20:19:27 +00:00
;
2021-05-26 13:30:43 +00:00
create table report_thresholds engine File( TSVWithNamesAndTypes, 'report/thresholds.tsv' )
as select
query_display_names.test test, query_display_names.query_index query_index,
ceil( greatest( 0.1, historical_thresholds.max_diff,
test_thresholds.report_threshold) , 2) changed_threshold,
ceil( greatest( 0.2, historical_thresholds.max_stat_threshold,
test_thresholds.report_threshold + 0.1) , 2) unstable_threshold,
query_display_names.query_display_name query_display_name
from query_display_names
left join file( 'analyze/historical-thresholds.tsv' , TSV,
' test text, query_index int, max_diff float, max_stat_threshold float,
query_display_name text' ) historical_thresholds
on query_display_names.test = historical_thresholds.test
and query_display_names.query_index = historical_thresholds.query_index
and query_display_names.query_display_name = historical_thresholds.query_display_name
left join file( 'analyze/report-thresholds.tsv' , TSV,
'test text, report_threshold float' ) test_thresholds
on query_display_names.test = test_thresholds.test
;
2020-05-20 02:19:19 +00:00
-- Main statistics for queries -- query time as reported in query log.
2020-04-28 07:45:35 +00:00
create table queries engine File( TSVWithNamesAndTypes, 'report/queries.tsv' )
2020-03-23 09:39:12 +00:00
as select
2021-03-02 16:21:30 +00:00
-- It is important to have a non-strict inequality with stat_threshold
-- here. The randomization distribution is actually discrete, and when
-- the number of runs is small, the quantile we need ( e.g. 0.99) turns
-- out to be the maximum value of the distribution. We can also hit this
-- maximum possible value with our test run, and this obviously means
-- that we have observed the difference to the best precision possible
-- for the given number of runs. If we use a strict equality here, we
-- will miss such cases. This happened in the wild and lead to some
-- uncaught regressions, because for the default 7 runs we do for PRs,
-- the randomization distribution has only 16 values, so the max quantile
-- is actually 0.9375.
2021-05-26 13:30:43 +00:00
abs( diff) > changed_threshold and abs( diff) >= stat_threshold as changed_fail,
abs( diff) > changed_threshold - 0.05 and abs( diff) >= stat_threshold as changed_show,
2020-10-27 11:04:03 +00:00
2021-05-26 13:30:43 +00:00
not changed_fail and stat_threshold > unstable_threshold as unstable_fail,
not changed_show and stat_threshold > unstable_threshold - 0.05 as unstable_show,
2020-10-27 11:04:03 +00:00
2020-04-28 07:45:35 +00:00
left, right, diff, stat_threshold,
2020-07-15 13:29:00 +00:00
query_metric_stats.test test, query_metric_stats.query_index query_index,
2021-05-26 13:30:43 +00:00
query_display_names.query_display_name query_display_name
2020-05-20 02:19:19 +00:00
from query_metric_stats
2020-07-03 08:39:43 +00:00
left join query_display_names
on query_metric_stats.test = query_display_names.test
and query_metric_stats.query_index = query_display_names.query_index
2021-05-26 13:30:43 +00:00
left join report_thresholds
on query_display_names.test = report_thresholds.test
and query_display_names.query_index = report_thresholds.query_index
and query_display_names.query_display_name = report_thresholds.query_display_name
2020-09-01 19:05:57 +00:00
-- 'server_time' is rounded down to ms, which might be bad for very short queries.
-- Use 'client_time' instead.
where metric_name = 'client_time'
2020-05-20 02:19:19 +00:00
order by test, query_index, metric_name
;
2020-04-28 07:45:35 +00:00
2021-08-19 18:43:43 +00:00
create table changed_perf_report engine File( TSV, 'report/changed-perf.tsv' )
settings output_format_decimal_trailing_zeros = 1
as with
2020-07-31 19:58:18 +00:00
-- server_time is sometimes reported as zero ( if it' s less than 1 ms) ,
-- so we have to work around this to not get an error about conversion
-- of NaN to decimal.
2020-08-05 23:02:18 +00:00
( left > right ? left / right : right / left) as times_change_float,
isFinite( times_change_float) as times_change_finite,
toDecimal64( times_change_finite ? times_change_float : 1., 3) as times_change_decimal,
times_change_finite
? ( left > right ? '-' : '+' ) || toString( times_change_decimal) || 'x'
: '--' as times_change_str
select
toDecimal64( left, 3) , toDecimal64( right, 3) , times_change_str,
toDecimal64( diff, 3) , toDecimal64( stat_threshold, 3) ,
changed_fail, test, query_index, query_display_name
2020-05-20 02:19:19 +00:00
from queries where changed_show order by abs( diff) desc;
2020-02-11 20:00:53 +00:00
2021-08-19 18:43:43 +00:00
create table unstable_queries_report engine File( TSV, 'report/unstable-queries.tsv' )
settings output_format_decimal_trailing_zeros = 1
as select
2020-07-31 19:58:18 +00:00
toDecimal64( left, 3) , toDecimal64( right, 3) , toDecimal64( diff, 3) ,
toDecimal64( stat_threshold, 3) , unstable_fail, test, query_index, query_display_name
2020-05-20 02:19:19 +00:00
from queries where unstable_show order by stat_threshold desc;
2020-02-11 20:00:53 +00:00
2020-06-09 13:29:07 +00:00
2020-09-03 00:57:25 +00:00
create view test_speedup as
select
test,
exp2( avg( log2( left / right) ) ) times_speedup,
count( *) queries,
unstable + changed bad,
sum( changed_show) changed,
sum( unstable_show) unstable
2020-06-09 13:29:07 +00:00
from queries
group by test
2020-09-03 00:57:25 +00:00
order by times_speedup desc
;
create view total_speedup as
select
'Total' test,
exp2( avg( log2( times_speedup) ) ) times_speedup,
sum( queries) queries,
unstable + changed bad,
sum( changed) changed,
sum( unstable) unstable
from test_speedup
2020-06-09 13:29:07 +00:00
;
2021-08-19 18:43:43 +00:00
create table test_perf_changes_report engine File( TSV, 'report/test-perf-changes.tsv' )
settings output_format_decimal_trailing_zeros = 1
as with
2020-09-03 00:57:25 +00:00
( times_speedup >= 1
? '-' || toString( toDecimal64( times_speedup, 3) ) || 'x'
: '+' || toString( toDecimal64( 1 / times_speedup, 3) ) || 'x' )
as times_speedup_str
select test, times_speedup_str, queries, bad, changed, unstable
-- Not sure what' s the precedence of UNION ALL vs WHERE & ORDER BY, hence all
-- the braces.
from (
(
select * from total_speedup
) union all (
select * from test_speedup
where
( times_speedup >= 1 ? times_speedup : ( 1 / times_speedup) ) >= 1.005
or bad
)
)
2020-09-06 23:49:07 +00:00
order by test = 'Total' desc, times_speedup desc
2020-06-09 13:29:07 +00:00
;
2020-02-07 18:34:24 +00:00
2020-09-03 00:57:25 +00:00
2020-07-03 08:39:43 +00:00
create view total_client_time_per_query as select *
2020-05-20 02:19:19 +00:00
from file( 'analyze/client-times.tsv' , TSV,
'test text, query_index int, client float, server float' ) ;
2020-02-07 18:34:24 +00:00
2021-08-19 18:43:43 +00:00
create table slow_on_client_report engine File( TSV, 'report/slow-on-client.tsv' )
settings output_format_decimal_trailing_zeros = 1
as select client, server, toDecimal64( client/server, 3) p,
2020-07-31 19:58:18 +00:00
test, query_display_name
2020-07-03 08:39:43 +00:00
from total_client_time_per_query left join query_display_names using ( test, query_index)
2020-07-31 19:58:18 +00:00
where p > toDecimal64( 1.02, 3) order by p desc;
2020-02-10 16:34:07 +00:00
2020-07-03 08:39:43 +00:00
create table wall_clock_time_per_test engine Memory as select *
from file( 'wall-clock-times.tsv' , TSV, 'test text, real float, user float, system float' ) ;
2020-02-10 16:34:07 +00:00
create table test_time engine Memory as
2020-02-11 15:01:16 +00:00
select test, sum( client) total_client_time,
2020-09-03 01:42:25 +00:00
max( client) query_max,
min( client) query_min,
count( *) queries
2020-07-03 08:39:43 +00:00
from total_client_time_per_query full join queries using ( test, query_index)
2020-02-11 15:01:16 +00:00
group by test;
2020-02-07 18:34:24 +00:00
2020-09-18 13:27:50 +00:00
create view query_runs as select * from file( 'analyze/query-runs.tsv' , TSV,
'test text, query_index int, query_id text, version UInt8, time float' ) ;
--
-- Guess the number of query runs used for this test. The number is required to
-- calculate and check the average query run time in the report.
-- We have to be careful, because we will encounter:
-- 1) partial queries which run only on one server
-- 3) some errors that make query run for a different number of times on a
-- particular server.
--
create view test_runs as
select test,
-- Default to 7 runs if there are only 'short' queries in the test, and
-- we can' t determine the number of runs.
2022-04-29 02:27:57 +00:00
if ( ( ceil( median( t.runs) , 0) as r) != 0, r, 7) runs
2020-09-18 13:27:50 +00:00
from (
select
-- The query id is the same for both servers, so no need to divide here.
uniqExact( query_id) runs,
test, query_index
from query_runs
group by test, query_index
) t
group by test
;
2020-09-24 11:46:03 +00:00
create view test_times_view as
select
wall_clock_time_per_test.test test,
real,
total_client_time,
2020-02-11 15:01:16 +00:00
queries,
2020-09-24 11:46:03 +00:00
query_max,
2021-03-18 14:59:49 +00:00
real / if ( queries > 0, queries, 1) avg_real_per_query,
2020-09-24 11:46:03 +00:00
query_min,
2020-09-18 13:27:50 +00:00
runs
2020-04-29 16:27:40 +00:00
from test_time
2020-09-18 13:27:50 +00:00
-- wall clock times are also measured for skipped tests, so don' t
-- do full join
left join wall_clock_time_per_test
on wall_clock_time_per_test.test = test_time.test
full join test_runs
on test_runs.test = test_time.test
2020-09-24 11:46:03 +00:00
;
-- WITH TOTALS doesn' t work with INSERT SELECT, so we have to jump through these
-- hoops: https://github.com/ClickHouse/ClickHouse/issues/15227
create view test_times_view_total as
select
'Total' test,
sum( real) ,
sum( total_client_time) ,
sum( queries) ,
max( query_max) ,
2021-03-18 14:59:49 +00:00
sum( real) / if ( sum( queries) > 0, sum( queries) , 1) avg_real_per_query,
2020-09-24 11:46:03 +00:00
min( query_min) ,
-- Totaling the number of runs doesn' t make sense, but use the max so
-- that the reporting script doesn' t complain about queries being too
-- long.
max( runs)
from test_times_view
;
2021-08-19 18:43:43 +00:00
create table test_times_report engine File( TSV, 'report/test-times.tsv' )
settings output_format_decimal_trailing_zeros = 1
as select
2020-09-24 11:46:03 +00:00
test,
toDecimal64( real, 3) ,
toDecimal64( total_client_time, 3) ,
queries,
toDecimal64( query_max, 3) ,
toDecimal64( avg_real_per_query, 3) ,
toDecimal64( query_min, 3) ,
runs
from (
select * from test_times_view
union all
select * from test_times_view_total
)
order by test = 'Total' desc, avg_real_per_query desc
;
2020-02-10 16:34:07 +00:00
2020-05-20 02:19:19 +00:00
-- report for all queries page, only main metric
2021-08-19 18:43:43 +00:00
create table all_tests_report engine File( TSV, 'report/all-queries.tsv' )
settings output_format_decimal_trailing_zeros = 1
as with
2020-08-05 23:02:18 +00:00
-- server_time is sometimes reported as zero ( if it' s less than 1 ms) ,
-- so we have to work around this to not get an error about conversion
-- of NaN to decimal.
( left > right ? left / right : right / left) as times_change_float,
isFinite( times_change_float) as times_change_finite,
toDecimal64( times_change_finite ? times_change_float : 1., 3) as times_change_decimal,
times_change_finite
? ( left > right ? '-' : '+' ) || toString( times_change_decimal) || 'x'
: '--' as times_change_str
2020-04-28 07:45:35 +00:00
select changed_fail, unstable_fail,
2020-08-05 23:02:18 +00:00
toDecimal64( left, 3) , toDecimal64( right, 3) , times_change_str,
2020-07-31 19:58:18 +00:00
toDecimal64( isFinite( diff) ? diff : 0, 3) ,
toDecimal64( isFinite( stat_threshold) ? stat_threshold : 0, 3) ,
test, query_index, query_display_name
2020-06-08 13:57:33 +00:00
from queries order by test, query_index;
2020-04-28 07:45:35 +00:00
2020-09-17 16:21:59 +00:00
2020-07-03 08:39:43 +00:00
--------------------------------------------------------------------------------
-- various compatibility data formats follow, not related to the main report
-- keep the table in old format so that we can analyze new and old data together
create table queries_old_format engine File( TSVWithNamesAndTypes, 'queries.rep' )
2020-09-03 01:42:25 +00:00
as select 0 short, changed_fail, unstable_fail, left, right, diff,
2020-07-03 08:39:43 +00:00
stat_threshold, test, query_display_name query
from queries
;
2020-05-20 02:19:19 +00:00
-- new report for all queries with all metrics ( no page yet)
create table all_query_metrics_tsv engine File( TSV, 'report/all-query-metrics.tsv' ) as
select metric_name, left, right, diff,
floor( left > right ? left / right : right / left, 3) ,
stat_threshold, test, query_index, query_display_name
from query_metric_stats
2020-07-03 08:39:43 +00:00
left join query_display_names
on query_metric_stats.test = query_display_names.test
and query_metric_stats.query_index = query_display_names.query_index
2020-05-20 02:19:19 +00:00
order by test, query_index;
" 2> >(tee -a report/errors.log 1>&2)
2020-03-24 17:33:18 +00:00
2020-09-25 09:52:09 +00:00
# Prepare source data for metrics and flamegraphs for queries that were profiled
# by perf.py.
2020-03-24 17:33:18 +00:00
for version in { right,left}
2020-05-25 01:03:21 +00:00
do
2020-05-20 02:19:19 +00:00
rm -rf data
2021-06-28 11:42:21 +00:00
clickhouse-local --query "
2020-09-25 09:52:09 +00:00
create view query_profiles as
2020-05-20 02:19:19 +00:00
with 0 as left, 1 as right
2020-09-25 09:52:09 +00:00
select * from file( 'analyze/query-profiles.tsv' , TSV,
2020-05-20 02:19:19 +00:00
'test text, query_index int, query_id text, version UInt8, time float' )
where version = $version
;
2020-02-17 19:32:40 +00:00
2020-05-20 02:19:19 +00:00
create view query_display_names as select * from
file( 'analyze/query-display-names.tsv' , TSV,
'test text, query_index int, query_display_name text' )
;
2020-02-20 16:28:21 +00:00
2020-03-24 17:33:18 +00:00
create table unstable_query_runs engine File( TSVWithNamesAndTypes,
'unstable-query-runs.$version.rep' ) as
2020-09-25 09:52:09 +00:00
select query_profiles.test test, query_profiles.query_index query_index,
2020-07-15 13:29:00 +00:00
query_display_name, query_id
2020-09-25 09:52:09 +00:00
from query_profiles
2020-05-20 02:19:19 +00:00
left join query_display_names on
2020-09-25 09:52:09 +00:00
query_profiles.test = query_display_names.test
and query_profiles.query_index = query_display_names.query_index
2020-02-17 19:32:40 +00:00
;
2020-05-20 02:19:19 +00:00
create view query_log as select *
2022-04-17 06:01:15 +00:00
from file( '$version-query-log.tsv' , TSVWithNamesAndTypes) ;
2020-02-20 16:28:21 +00:00
2020-03-24 17:33:18 +00:00
create table unstable_run_metrics engine File( TSVWithNamesAndTypes,
'unstable-run-metrics.$version.rep' ) as
2021-06-29 18:51:21 +00:00
select test, query_index, query_id, value, metric
from query_log
array join
mapValues( ProfileEvents) as value,
mapKeys( ProfileEvents) as metric
2020-05-20 02:19:19 +00:00
join unstable_query_runs using ( query_id)
2020-02-17 19:32:40 +00:00
;
2020-03-24 17:33:18 +00:00
create table unstable_run_metrics_2 engine File( TSVWithNamesAndTypes,
'unstable-run-metrics-2.$version.rep' ) as
2020-05-20 02:19:19 +00:00
select
test, query_index, query_id,
v, n
from (
select
test, query_index, query_id,
2020-03-17 02:41:47 +00:00
[ 'memory_usage' , 'read_bytes' , 'written_bytes' , 'query_duration_ms' ] n,
2020-05-20 02:19:19 +00:00
[ memory_usage, read_bytes, written_bytes, query_duration_ms] v
2020-03-24 17:33:18 +00:00
from query_log
2020-05-20 02:19:19 +00:00
join unstable_query_runs using ( query_id)
)
array join v, n;
create view trace_log as select *
2022-04-17 06:01:15 +00:00
from file( '$version-trace-log.tsv' , TSVWithNamesAndTypes) ;
2020-05-20 02:19:19 +00:00
2020-05-25 01:03:21 +00:00
create view addresses_src as select addr,
-- Some functions change name between builds, e.g. '__clone' or 'clone' or
-- even '__GI__clone@@GLIBC_2.32' . This breaks differential flame graphs, so
-- filter them out here.
[ name, 'clone.S (filtered by script)' , 'pthread_cond_timedwait (filtered by script)' ]
-- this line is a subscript operator of the above array
[ 1 + multiSearchFirstIndex( name, [ 'clone.S' , 'pthread_cond_timedwait' ] ) ] name
2022-04-17 06:01:15 +00:00
from file( '$version-addresses.tsv' , TSVWithNamesAndTypes) ;
2020-05-20 02:19:19 +00:00
create table addresses_join_$version engine Join( any, left, address) as
select addr address, name from addresses_src;
2020-02-20 16:28:21 +00:00
2020-03-24 17:33:18 +00:00
create table unstable_run_traces engine File( TSVWithNamesAndTypes,
'unstable-run-traces.$version.rep' ) as
select
2020-05-20 02:19:19 +00:00
test, query_index, query_id,
2020-03-24 17:33:18 +00:00
count( ) value,
2020-05-25 01:03:21 +00:00
joinGet( addresses_join_$version , 'name' , arrayJoin( trace) )
|| '(' || toString( trace_type) || ')' metric
2020-05-20 02:19:19 +00:00
from trace_log
join unstable_query_runs using query_id
group by test, query_index, query_id, metric
2020-02-17 19:32:40 +00:00
order by count( ) desc
;
2020-03-24 17:33:18 +00:00
create table metric_devation engine File( TSVWithNamesAndTypes,
2021-08-19 18:43:43 +00:00
'report/metric-deviation.$version.tsv' )
settings output_format_decimal_trailing_zeros = 1
2020-05-20 02:19:19 +00:00
-- first goes the key used to split the file with grep
2021-08-19 18:43:43 +00:00
as select test, query_index, query_display_name,
2020-07-31 19:58:18 +00:00
toDecimal64( d, 3) d, q, metric
2020-05-20 02:19:19 +00:00
from (
select
test, query_index,
2020-07-31 19:58:18 +00:00
( q[ 3] - q[ 1] ) /q[ 2] d,
2020-05-20 02:19:19 +00:00
quantilesExact( 0, 0.5, 1) ( value) q, metric
from ( select * from unstable_run_metrics
union all select * from unstable_run_traces
union all select * from unstable_run_metrics_2) mm
group by test, query_index, metric
2020-07-31 19:58:18 +00:00
having isFinite( d) and d > 0.5 and q[ 3] > 5
2020-05-20 02:19:19 +00:00
) metrics
2020-05-25 01:42:56 +00:00
left join query_display_names using ( test, query_index)
2020-05-20 02:19:19 +00:00
order by test, query_index, d desc
2020-02-17 19:32:40 +00:00
;
2020-02-20 16:28:21 +00:00
2020-05-25 01:03:21 +00:00
create table stacks engine File( TSV, 'report/stacks.$version.tsv' ) as
2020-02-20 16:28:21 +00:00
select
2020-05-20 02:19:19 +00:00
-- first goes the key used to split the file with grep
2020-05-25 01:03:21 +00:00
test, query_index, trace_type, any( query_display_name) ,
2020-05-21 23:03:41 +00:00
-- next go the stacks in flamegraph format: 'func1;...;funcN count'
2020-02-20 16:28:21 +00:00
arrayStringConcat(
2020-05-25 01:03:21 +00:00
arrayMap(
addr -> joinGet( addresses_join_$version , 'name' , addr) ,
2020-02-20 16:28:21 +00:00
arrayReverse( trace)
) ,
';'
) readable_trace,
2020-05-20 02:19:19 +00:00
count( ) c
2020-03-24 17:33:18 +00:00
from trace_log
2020-02-20 16:28:21 +00:00
join unstable_query_runs using query_id
2020-05-25 01:03:21 +00:00
group by test, query_index, trace_type, trace
order by test, query_index, trace_type, trace
2020-02-20 16:28:21 +00:00
;
2020-07-31 19:58:18 +00:00
" 2> >(tee -a report/errors.log 1>&2) &
2020-03-24 17:33:18 +00:00
done
wait
2020-02-11 15:01:16 +00:00
2020-05-25 01:42:56 +00:00
# Create per-query flamegraphs
2021-04-26 09:39:54 +00:00
touch report/query-files.txt
2020-02-20 16:28:21 +00:00
IFS = $'\n'
2020-03-24 17:33:18 +00:00
for version in { right,left}
2020-02-20 16:28:21 +00:00
do
2020-05-25 01:03:21 +00:00
for query in $( cut -d' ' -f1-4 " report/stacks. $version .tsv " | sort | uniq)
2020-03-24 17:33:18 +00:00
do
2020-05-20 02:19:19 +00:00
query_file = $( echo " $query " | cut -c-120 | sed 's/[/ ]/_/g' )
2020-05-21 23:03:41 +00:00
echo " $query_file " >> report/query-files.txt
2020-04-21 18:46:45 +00:00
# Build separate .svg flamegraph for each query.
2020-05-21 23:03:41 +00:00
# -F is somewhat unsafe because it might match not the beginning of the
# string, but this is unlikely and escaping the query for grep is a pain.
2023-01-01 19:53:06 +00:00
rg -F " $query " " report/stacks. $version .tsv " \
2020-05-25 01:03:21 +00:00
| cut -f 5- \
2020-03-24 17:33:18 +00:00
| sed 's/\t/ /g' \
2020-05-25 01:42:56 +00:00
| tee " report/tmp/ $query_file .stacks. $version .tsv " \
2020-05-21 23:03:41 +00:00
| ~/fg/flamegraph.pl --hash > " $query_file . $version .svg " &
2020-03-24 17:33:18 +00:00
done
2020-02-20 16:28:21 +00:00
done
wait
unset IFS
2020-05-21 23:03:41 +00:00
# Create differential flamegraphs.
2020-08-19 15:31:13 +00:00
while IFS = read -r query_file
2020-05-21 23:03:41 +00:00
do
2020-05-25 01:42:56 +00:00
~/fg/difffolded.pl " report/tmp/ $query_file .stacks.left.tsv " \
" report/tmp/ $query_file .stacks.right.tsv " \
| tee " report/tmp/ $query_file .stacks.diff.tsv " \
2020-05-21 23:03:41 +00:00
| ~/fg/flamegraph.pl > " $query_file .diff.svg " &
2020-08-19 15:31:13 +00:00
done < report/query-files.txt
2020-05-21 23:03:41 +00:00
wait
2020-05-25 01:42:56 +00:00
# Create per-query files with metrics. Note that the key is different from flamegraphs.
IFS = $'\n'
for version in { right,left}
do
for query in $( cut -d' ' -f1-3 " report/metric-deviation. $version .tsv " | sort | uniq)
do
query_file = $( echo " $query " | cut -c-120 | sed 's/[/ ]/_/g' )
# Ditto the above comment about -F.
2023-01-01 19:53:06 +00:00
rg -F " $query " " report/metric-deviation. $version .tsv " \
2020-05-25 01:42:56 +00:00
| cut -f4- > " $query_file . $version .metrics.rep " &
done
done
wait
unset IFS
2020-07-02 07:13:02 +00:00
# Prefer to grep for clickhouse_driver exception messages, but if there are none,
# just show a couple of lines from the log.
for log in *-err.log
do
test = $( basename " $log " "-err.log" )
{
2020-09-01 19:05:57 +00:00
# The second grep is a heuristic for error messages like
# "socket.timeout: timed out".
2023-01-06 21:57:47 +00:00
rg --no-filename --max-count= 2 -i '\(Exception\|Error\):[^:]' " $log " \
|| rg --no-filename --max-count= 2 -i '^[^ ]\+: ' " $log " \
2020-07-02 07:13:02 +00:00
|| head -2 " $log "
} | sed " s/^/ $test \t/ " >> run-errors.tsv || :
done
2020-02-07 18:34:24 +00:00
}
2020-02-10 18:37:46 +00:00
2020-06-25 20:19:27 +00:00
function report_metrics
{
rm -rf metrics || :
mkdir metrics
2021-06-28 11:42:21 +00:00
clickhouse-local --query "
2020-06-25 20:19:27 +00:00
create view right_async_metric_log as
2022-04-17 06:01:15 +00:00
select * from file( 'right-async-metric-log.tsv' , TSVWithNamesAndTypes)
2020-06-25 20:19:27 +00:00
;
-- Use the right log as time reference because it may have higher precision.
create table metrics engine File( TSV, 'metrics/metrics.tsv' ) as
with ( select min( event_time) from right_async_metric_log) as min_time
2021-07-05 19:08:29 +00:00
select metric, r.event_time - min_time event_time, l.value as left, r.value as right
2020-06-25 20:19:27 +00:00
from right_async_metric_log r
2022-04-17 06:01:15 +00:00
asof join file( 'left-async-metric-log.tsv' , TSVWithNamesAndTypes) l
2021-07-05 19:08:29 +00:00
on l.metric = r.metric and r.event_time <= l.event_time
2020-06-25 20:19:27 +00:00
order by metric, event_time
;
-- Show metrics that have changed
2021-08-19 18:43:43 +00:00
create table changes engine File( TSV, 'metrics/changes.tsv' )
settings output_format_decimal_trailing_zeros = 1
as select metric, left, right,
2020-07-31 19:58:18 +00:00
toDecimal64( diff, 3) , toDecimal64( times_diff, 3)
from (
select metric, median( left) as left, median( right) as right,
( right - left) / left diff,
if ( left > right, left / right, right / left) times_diff
from metrics
group by metric
2021-07-22 15:37:20 +00:00
having abs( diff) > 0.05 and isFinite( diff) and isFinite( times_diff)
2020-07-31 19:58:18 +00:00
)
2020-06-25 20:19:27 +00:00
order by diff desc
;
2020-08-19 15:31:13 +00:00
" 2> >(tee -a metrics/errors.log 1>&2)
2020-06-25 20:19:27 +00:00
IFS = $'\n'
for prefix in $( cut -f1 "metrics/metrics.tsv" | sort | uniq)
do
file = " metrics/ $prefix .tsv "
2023-01-01 19:53:06 +00:00
rg " ^ $prefix " "metrics/metrics.tsv" | cut -f2- > " $file "
2020-06-25 20:19:27 +00:00
gnuplot -e "
set datafile separator '\t' ;
set terminal png size 960,540;
set xtics time format '%tH:%tM' ;
set title '$prefix' noenhanced offset 0,-3;
set key left top;
plot
'$file' using 1:2 with lines title 'Left'
, '$file' using 1:3 with lines title 'Right'
;
" \
| convert - -filter point -resize "200%" " metrics/ $prefix .png " &
done
wait
unset IFS
}
2020-11-02 23:10:19 +00:00
function upload_results
{
2021-06-03 17:40:05 +00:00
# Prepare info for the CI checks table.
2022-06-15 20:25:36 +00:00
rm -f ci-checks.tsv
2021-06-03 17:40:05 +00:00
clickhouse-local --query "
2022-06-15 20:25:36 +00:00
create view queries as select * from file( 'report/queries.tsv' , TSVWithNamesAndTypes) ;
2021-06-03 17:40:05 +00:00
create table ci_checks engine File( TSVWithNamesAndTypes, 'ci-checks.tsv' )
as select
2022-06-15 20:25:36 +00:00
$PR_TO_TEST :: UInt32 AS pull_request_number,
2022-06-17 14:14:18 +00:00
'$SHA_TO_TEST' :: LowCardinality( String) AS commit_sha,
2022-07-05 12:24:58 +00:00
'${CLICKHOUSE_PERFORMANCE_COMPARISON_CHECK_NAME:-Performance}' :: LowCardinality( String) AS check_name,
2022-06-17 14:14:18 +00:00
'$(sed -n ' s/.*<!--status: \( .*\) -->/\1 /p' report.html)' :: LowCardinality( String) AS check_status,
2021-06-03 17:40:05 +00:00
-- TODO toDateTime( ) can't parse output of ' date' , so no time for now.
2022-06-17 14:14:18 +00:00
( ( $( date +%s) - $CHPC_CHECK_START_TIMESTAMP ) * 1000) :: UInt64 AS check_duration_ms,
2021-06-08 14:12:47 +00:00
fromUnixTimestamp( $CHPC_CHECK_START_TIMESTAMP ) check_start_time,
2022-06-17 14:14:18 +00:00
test_name :: LowCardinality( String) AS test_name ,
test_status :: LowCardinality( String) AS test_status,
test_duration_ms :: UInt64 AS test_duration_ms,
2021-06-04 15:27:21 +00:00
report_url,
2021-06-03 17:40:05 +00:00
$PR_TO_TEST = 0
? 'https://github.com/ClickHouse/ClickHouse/commit/$SHA_TO_TEST'
: 'https://github.com/ClickHouse/ClickHouse/pull/$PR_TO_TEST' pull_request_url,
'' commit_url,
'' task_url,
'' base_ref,
'' base_repo,
'' head_ref,
'' head_repo
from (
select '' test_name,
'$(sed -n ' s/.*<!--message: \( .*\) -->/\1 /p' report.html)' test_status,
2021-06-04 15:27:21 +00:00
0 test_duration_ms,
2022-07-05 12:24:58 +00:00
'https://s3.amazonaws.com/clickhouse-test-reports/$PR_TO_TEST/$SHA_TO_TEST/${CLICKHOUSE_PERFORMANCE_COMPARISON_CHECK_NAME_PREFIX}/report.html#fail1' report_url
2021-06-03 17:40:05 +00:00
union all
2021-06-04 15:27:21 +00:00
select test || ' #' || toString( query_index) , 'slower' test_status, 0 test_duration_ms,
2022-07-05 12:24:58 +00:00
'https://s3.amazonaws.com/clickhouse-test-reports/$PR_TO_TEST/$SHA_TO_TEST/${CLICKHOUSE_PERFORMANCE_COMPARISON_CHECK_NAME_PREFIX}/report.html#changes-in-performance.'
2021-06-04 15:27:21 +00:00
|| test || '.' || toString( query_index) report_url
2021-06-03 17:40:05 +00:00
from queries where changed_fail != 0 and diff > 0
union all
2021-06-04 15:27:21 +00:00
select test || ' #' || toString( query_index) , 'unstable' test_status, 0 test_duration_ms,
2022-07-05 12:24:58 +00:00
'https://s3.amazonaws.com/clickhouse-test-reports/$PR_TO_TEST/$SHA_TO_TEST/${CLICKHOUSE_PERFORMANCE_COMPARISON_CHECK_NAME_PREFIX}/report.html#unstable-queries.'
2021-06-04 15:27:21 +00:00
|| test || '.' || toString( query_index) report_url
2021-06-03 17:40:05 +00:00
from queries where unstable_fail != 0
)
;
"
2022-06-07 10:00:31 +00:00
if ! [ -v CLICKHOUSE_PERFORMANCE_COMPARISON_DATABASE_URL ]
2020-11-02 23:10:19 +00:00
then
echo Database for test results is not specified, will not upload them.
return 0
2021-01-22 05:16:13 +00:00
fi
2020-11-02 23:10:19 +00:00
2020-11-04 01:19:53 +00:00
set +x # Don't show password in the log
2021-04-22 16:46:54 +00:00
client = ( clickhouse-client
# Surprisingly, clickhouse-client doesn't understand --host 127.0.0.1:9000
2021-06-28 11:42:21 +00:00
# so I have to extract host and port with clickhouse-local. I tried to use
2021-04-22 16:46:54 +00:00
# Poco URI parser to support this in the client, but it's broken and can't
# parse host:port.
2022-06-07 10:00:31 +00:00
$( clickhouse-local --query " with ' ${ CLICKHOUSE_PERFORMANCE_COMPARISON_DATABASE_URL } ' as url select '--host ' || domain(url) || ' --port ' || toString(port(url)) format TSV " )
2021-04-22 16:46:54 +00:00
--secure
2022-06-07 10:00:31 +00:00
--user " ${ CLICKHOUSE_PERFORMANCE_COMPARISON_DATABASE_USER } "
--password " ${ CLICKHOUSE_PERFORMANCE_COMPARISON_DATABASE_USER_PASSWORD } "
2021-04-22 16:46:54 +00:00
--config "right/config/client_config.xml"
--date_time_input_format= best_effort)
2022-06-14 17:39:16 +00:00
# CREATE TABLE IF NOT EXISTS query_metrics_v2 (
# `event_date` Date,
# `event_time` DateTime,
# `pr_number` UInt32,
# `old_sha` String,
# `new_sha` String,
# `test` LowCardinality(String),
# `query_index` UInt32,
# `query_display_name` String,
2022-06-15 20:25:36 +00:00
# `metric` LowCardinality(String),
2022-06-14 17:39:16 +00:00
# `old_value` Float64,
# `new_value` Float64,
# `diff` Float64,
# `stat_threshold` Float64
# ) ENGINE = ReplicatedMergeTree
# ORDER BY event_date
# CREATE TABLE IF NOT EXISTS run_attributes_v1 (
# `old_sha` String,
# `new_sha` String,
2022-06-15 20:25:36 +00:00
# `metric` LowCardinality(String),
2022-06-14 17:39:16 +00:00
# `metric_value` String
# ) ENGINE = ReplicatedMergeTree
# ORDER BY (old_sha, new_sha)
2022-06-10 14:04:31 +00:00
2021-04-22 16:46:54 +00:00
" ${ client [@] } " --query "
2020-11-06 10:45:42 +00:00
insert into query_metrics_v2
2020-11-04 01:19:53 +00:00
select
toDate( event_time) event_date,
toDateTime( '$(cd right/ch && git show -s --format=%ci "$SHA_TO_TEST" | cut -d' ' -f-2)' ) event_time,
$PR_TO_TEST pr_number,
'$REF_SHA' old_sha,
'$SHA_TO_TEST' new_sha,
test,
query_index,
query_display_name,
2022-06-15 20:25:36 +00:00
metric_name as metric,
2020-11-04 01:19:53 +00:00
old_value,
new_value,
diff,
stat_threshold
from input( ' metric_name text, old_value float, new_value float, diff float,
ratio_display_text text, stat_threshold float,
test text, query_index int, query_display_name text' )
format TSV
" < report/all-query-metrics.tsv # Don't leave whitespace after INSERT: https://github.com/ClickHouse/ClickHouse/issues/16652
2021-04-22 16:46:54 +00:00
# Upload some run attributes. I use this weird form because it is the same
# form that can be used for historical data when you only have compare.log.
cat compare.log \
| sed -n '
2021-04-22 17:18:21 +00:00
s/.*Model name:[ [ :space:] ] \+ \( .*\) $/metric lscpu-model-name \1 /p;
s/.*L1d cache:[ [ :space:] ] \+ \( .*\) $/metric lscpu-l1d-cache \1 /p;
s/.*L1i cache:[ [ :space:] ] \+ \( .*\) $/metric lscpu-l1i-cache \1 /p;
s/.*L2 cache:[ [ :space:] ] \+ \( .*\) $/metric lscpu-l2-cache \1 /p;
s/.*L3 cache:[ [ :space:] ] \+ \( .*\) $/metric lscpu-l3-cache \1 /p;
2021-04-22 16:46:54 +00:00
s/.*left_sha= \( .*\) $/old-sha \1 /p;
s/.*right_sha= \( .*\) /new-sha \1 /p' \
| awk '
BEGIN { FS = "\t" ; OFS = "\t" }
/^old-sha/ { old_sha = $2 }
/^new-sha/ { new_sha = $2 }
/^metric/ { print old_sha, new_sha, $2 , $3 } ' \
2022-06-10 14:04:31 +00:00
| " ${ client [@] } " --query "INSERT INTO run_attributes_v1 FORMAT TSV"
2021-04-22 16:46:54 +00:00
2021-04-23 20:51:55 +00:00
# Grepping numactl results from log is too crazy, I'll just call it again.
2022-06-10 14:04:31 +00:00
" ${ client [@] } " --query "INSERT INTO run_attributes_v1 FORMAT TSV" <<EOF
$REF_SHA $SHA_TO_TEST $( numactl --show | sed -n 's/^cpubind:[[:space:]]\+/numactl-cpubind /p' )
$REF_SHA $SHA_TO_TEST $( numactl --hardware | sed -n 's/^available:[[:space:]]\+/numactl-available /p' )
EOF
2021-04-23 20:51:55 +00:00
2021-06-03 17:40:05 +00:00
# Also insert some data about the check into the CI checks table.
2022-03-29 19:06:50 +00:00
" ${ client [@] } " --query "INSERT INTO " '"' "default" '"' ".checks FORMAT TSVWithNamesAndTypes" \
2021-06-03 17:40:05 +00:00
< ci-checks.tsv
2020-11-02 23:10:19 +00:00
set -x
}
2020-04-17 15:47:01 +00:00
# Check that local and client are in PATH
2021-06-28 11:42:21 +00:00
clickhouse-local --version > /dev/null
2020-04-17 15:47:01 +00:00
clickhouse-client --version > /dev/null
2020-02-10 18:37:46 +00:00
case " $stage " in
"" )
; &
2020-02-14 19:11:46 +00:00
"configure" )
time configure
; &
"restart" )
2020-10-22 12:52:43 +00:00
numactl --show || :
2020-07-21 12:41:14 +00:00
numactl --hardware || :
lscpu || :
2020-10-21 15:06:42 +00:00
dmidecode -t 4 || :
2020-02-14 19:11:46 +00:00
time restart
; &
"run_tests" )
2020-03-17 16:37:09 +00:00
# Ignore the errors to collect the log and build at least some report, anyway
2020-02-25 19:51:09 +00:00
time run_tests || :
2020-02-14 19:11:46 +00:00
; &
"get_profiles" )
2020-06-25 20:19:27 +00:00
# Check for huge pages.
cat /sys/kernel/mm/transparent_hugepage/enabled > thp-enabled.txt || :
cat /proc/meminfo > meminfo.txt || :
for pid in $( pgrep -f clickhouse-server)
do
cat " /proc/ $pid /smaps " > " $pid -smaps.txt " || :
done
# We had a bug where getting profiles froze sometimes, so try to save some
# logs if this happens again. Give the servers some time to collect all info,
# then trace and kill. Start in a subshell, so that both function don't
# interfere with each other's jobs through `wait`. Also make the subshell
# have its own process group, so that we can then kill it with all its child
# processes. Somehow it doesn't kill the children by itself when dying.
2020-04-21 21:53:13 +00:00
set -m
( get_profiles_watchdog ) &
watchdog_pid = $!
set +m
# Check that the watchdog started OK.
kill -0 $watchdog_pid
2020-02-28 16:22:07 +00:00
# If the tests fail with OOM or something, still try to restart the servers
# to collect the logs. Prefer not to restart, because addresses might change
2020-04-21 21:53:13 +00:00
# and we won't be able to process trace_log data. Start in a subshell, so that
# it doesn't interfere with the watchdog through `wait`.
2020-09-23 08:21:55 +00:00
( get_profiles || { restart && get_profiles ; } ) || :
2020-04-21 21:53:13 +00:00
# Kill the whole process group, because somehow when the subshell is killed,
# the sleep inside remains alive and orphaned.
while env kill -- -$watchdog_pid ; do sleep 1; done
2020-03-03 10:47:32 +00:00
# Stop the servers to free memory for the subsequent query analysis.
2022-09-15 10:27:08 +00:00
while pkill -f clickhouse-serv ; do echo . ; sleep 1 ; done
2020-03-03 10:47:32 +00:00
echo Servers stopped.
2020-02-10 18:37:46 +00:00
; &
2020-02-27 19:43:43 +00:00
"analyze_queries" )
2020-04-28 14:26:49 +00:00
time analyze_queries || :
2020-02-27 19:43:43 +00:00
; &
2020-02-10 18:37:46 +00:00
"report" )
2020-04-30 08:36:33 +00:00
time report || :
2020-06-25 20:19:27 +00:00
; &
"report_metrics" )
time report_metrics || :
2020-07-31 19:58:18 +00:00
cat metrics/errors.log >> report/errors.log || :
2020-06-25 20:19:27 +00:00
; &
"report_html" )
2020-04-28 07:45:35 +00:00
time " $script_dir /report.py " --report= all-queries > all-queries.html 2> >( tee -a report/errors.log 1>& 2) || :
2020-03-17 16:37:09 +00:00
time " $script_dir /report.py " > report.html
2020-02-10 18:37:46 +00:00
; &
2020-11-02 23:10:19 +00:00
"upload_results" )
time upload_results || :
; &
2020-02-10 18:37:46 +00:00
esac
2020-04-21 21:53:13 +00:00
# Print some final debug info to help debug Weirdness, of which there is plenty.
jobs
pstree -apgT