Commit e521d616fe ("performance comparison"), parent 875369676f
Repository: https://github.com/ClickHouse/ClickHouse.git
@@ -12,6 +12,8 @@ RUN apt-get update \
        g++ \
        gdb \
        git \
        gnuplot \
        imagemagick \
        libc6-dbg \
        moreutils \
        ncdu \
@@ -36,18 +36,14 @@ function configure
    while killall clickhouse-server; do echo . ; sleep 1 ; done
    echo all killed

    # Remove logs etc, because they will be updated, and sharing them between
    # servers with hardlink might cause unpredictable behavior.
    rm db0/data/system/* -rf ||:
    rm db0/metadata/system/* -rf ||:

    # Make copies of the original db for both servers. Use hardlinks instead
    # of copying. Be careful to remove preprocessed configs and system tables,or
    # it can lead to weird effects.
    # of copying to save space. Before that, remove preprocessed configs and
    # system tables, because sharing them between servers with hardlinks may
    # lead to weird effects.
    rm -r left/db ||:
    rm -r right/db ||:
    rm -r db0/preprocessed_configs ||:
    rm -r db/{data,metadata}/system ||:
    rm -r db0/{data,metadata}/system ||:
    cp -al db0/ left/db/
    cp -al db0/ right/db/
}
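Why the shared system tables have to go before the copies are made: cp -al populates left/db and right/db with hardlinks, so both trees point at the same inodes and a write through one is visible through the other. A minimal standalone sketch of that effect (made-up paths, not part of the commit):

    mkdir -p db0/data/system left right
    echo old > db0/data/system/query_log.bin
    cp -al db0/ left/db/
    cp -al db0/ right/db/
    echo new >> left/db/data/system/query_log.bin
    cat right/db/data/system/query_log.bin    # prints both lines: the two "copies" share the file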
@@ -143,15 +139,18 @@ function run_tests
        touch "$x"
    done

    # Randomize test order.
    test_files=$(for f in $test_files; do echo "$f"; done | sort -R)

    # Run the tests.
    test_name="<none>"
    for test in $test_files
    do
        # Check that both servers are alive, to fail faster if they die.
        # Check that both servers are alive, and restart them if they die.
        clickhouse-client --port 9001 --query "select 1 format Null" \
            || { echo $test_name >> left-server-died.log ; restart ; continue ; }
            || { echo $test_name >> left-server-died.log ; restart ; }
        clickhouse-client --port 9002 --query "select 1 format Null" \
            || { echo $test_name >> right-server-died.log ; restart ; continue ; }
            || { echo $test_name >> right-server-died.log ; restart ; }

        test_name=$(basename "$test" ".xml")
        echo test "$test_name"
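The liveness probe is the same for both servers, so here is the idiom as a standalone sketch (the ports, log names and the restart helper are the ones compare.sh already assumes; the check_alive wrapper itself is only illustrative):

    check_alive() {
        local port=$1 log=$2
        clickhouse-client --port "$port" --query "select 1 format Null" \
            || { echo "$test_name" >> "$log" ; restart ; }
    }
    check_alive 9001 left-server-died.log
    check_alive 9002 right-server-died.log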
@@ -217,10 +216,13 @@ function get_profiles
    # Collect the profiles
    clickhouse-client --port 9001 --query "set query_profiler_cpu_time_period_ns = 0"
    clickhouse-client --port 9001 --query "set query_profiler_real_time_period_ns = 0"
    clickhouse-client --port 9001 --query "set query_profiler_cpu_time_period_ns = 0"
    clickhouse-client --port 9001 --query "set query_profiler_real_time_period_ns = 0"
    clickhouse-client --port 9001 --query "system flush logs"
    clickhouse-client --port 9002 --query "system flush logs"
    clickhouse-client --port 9001 --query "system flush logs" &

    clickhouse-client --port 9002 --query "set query_profiler_cpu_time_period_ns = 0"
    clickhouse-client --port 9002 --query "set query_profiler_real_time_period_ns = 0"
    clickhouse-client --port 9002 --query "system flush logs" &

    wait

    clickhouse-client --port 9001 --query "select * from system.query_log where type = 2 format TSVWithNamesAndTypes" > left-query-log.tsv ||: &
    clickhouse-client --port 9001 --query "select * from system.query_thread_log format TSVWithNamesAndTypes" > left-query-thread-log.tsv ||: &
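The two "system flush logs" calls are independent of each other, so the new version backgrounds them and joins with wait before the query_log exports begin. The same pattern in isolation (same ports as above):

    for port in 9001 9002
    do
        clickhouse-client --port "$port" --query "system flush logs" &
    done
    wait    # both flushes are done; the TSV exports can rely on complete logs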
@@ -292,25 +294,54 @@ create view right_query_log as select *
    from file('right-query-log.tsv', TSVWithNamesAndTypes,
        '$(cat "right-query-log.tsv.columns")');

create table query_metrics engine File(TSV, -- do not add header -- will parse with grep
        'analyze/query-run-metrics.tsv')
    as select
        test, query_index, 0 run, version,
        [
            -- server-reported time
            query_duration_ms / toFloat64(1000)
            , toFloat64(memory_usage)
            -- client-reported time
            , query_runs.time
        ] metrics
    from (
        select query_duration_ms, memory_usage, query_id, 0 version from left_query_log
        union all
        select query_duration_ms, memory_usage, query_id, 1 version from right_query_log
    ) query_logs
create view query_logs as
    select *, 0 version from left_query_log
    union all
    select *, 1 version from right_query_log
    ;

create table query_run_metrics_full engine File(TSV, 'analyze/query-run-metrics-full.tsv')
    as
    with (
        -- sumMapState with the list of all keys with '-0.' values. Negative zero is because
        -- sumMap removes keys with positive zeros.
        with (select groupUniqArrayArray(ProfileEvents.Names) from query_logs) as all_names
        select arrayReduce('sumMapState', [(all_names, arrayMap(x->-0., all_names))])
        ) as all_metrics
    select test, query_index, version, query_id,
        (finalizeAggregation(
            arrayReduce('sumMapMergeState',
                [
                    all_metrics,
                    arrayReduce('sumMapState',
                        [(ProfileEvents.Names,
                            arrayMap(x->toFloat64(x), ProfileEvents.Values))]
                    ),
                    arrayReduce('sumMapState', [(
                        ['client_time', 'server_time'],
                        arrayMap(x->if(x != 0., x, -0.), [
                            toFloat64(query_runs.time),
                            toFloat64(query_duration_ms / 1000.)]))])
                ]
            )) as metrics_tuple).1 metric_names,
        metrics_tuple.2 metric_values
    from query_logs
    right join query_runs
    using (query_id, version)
    order by test, query_index
        on query_logs.query_id = query_runs.query_id
        and query_logs.version = query_runs.version
    ;

create table query_run_metrics engine File(
        TSV, -- do not add header -- will parse with grep
        'analyze/query-run-metrics.tsv')
    as select test, query_index, 0 run, version, metric_values
    from query_run_metrics_full
    where test = 'arithmetic'
    order by test, query_index, run, version
    ;

create table query_run_metric_names engine File(TSV, 'analyze/query-run-metric-names.tsv')
    as select metric_names from query_run_metrics_full limit 1
    ;
"
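The negative-zero seeding is what keeps the per-query metric arrays aligned: per the comment above, sumMap drops keys whose total is a positive zero, so every known ProfileEvents name is pre-seeded with -0. and survives the merge even for queries that never incremented it. A toy clickhouse-local run of the same construction (hypothetical event names, not from the commit):

    clickhouse-local --query "
        with ['Event1', 'Event2'] as all_names
        select finalizeAggregation(arrayReduce('sumMapMergeState', [
            arrayReduce('sumMapState', [(all_names, arrayMap(x -> -0., all_names))]),
            arrayReduce('sumMapState', [(['Event1'], [42.])])
        ]))
    "
    # Expected: both Event1 and Event2 appear in the resulting tuple of names,
    # Event2 with a zero value, so every row exposes the same set of metrics.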
@@ -320,7 +351,6 @@ create table query_metrics engine File(TSV, -- do not add header -- will parse with grep
# query. We also don't have lateral joins. So I just put all runs of each
# query into a separate file, and then compute randomization distribution
# for each file. I do this in parallel using GNU parallel.
query_index=1
IFS=$'\n'
for prefix in $(cut -f1,2 "analyze/query-run-metrics.tsv" | sort | uniq)
do
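The comment describes a split-then-parallelize approach: one TSV per (test, query_index) pair, each processed independently. A hedged sketch of that shape (compare-stats.py is a made-up stand-in for the statistics script compare.sh actually runs on each file):

    query_index=1
    IFS=$'\n'
    for prefix in $(cut -f1,2 "analyze/query-run-metrics.tsv" | sort | uniq)
    do
        file="analyze/q$query_index.tsv"
        grep -F "$prefix" "analyze/query-run-metrics.tsv" > "$file"
        query_index=$((query_index + 1))
    done
    unset IFS
    ls analyze/q*.tsv | parallel --verbose 'compare-stats.py {} > {}.report'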
@@ -360,20 +390,34 @@ create view query_display_names as select * from
        'test text, query_index int, query_display_name text')
    ;

create table query_metric_stats engine File(TSVWithNamesAndTypes,
        'report/query-metric-stats.tsv') as
-- WITH, ARRAY JOIN and CROSS JOIN do not like each other:
--  https://github.com/ClickHouse/ClickHouse/issues/11868
--  https://github.com/ClickHouse/ClickHouse/issues/11757
-- Because of this, we make a view with arrays first, and then apply all the
-- array joins.

create view query_metric_stat_arrays as
    with (select * from file('analyze/query-run-metric-names.tsv',
        TSV, 'n Array(String)')) as metric_name
    select metric_name, left, right, diff, stat_threshold, test, query_index,
        query_display_name
    from file ('analyze/query-reports.tsv', TSV, 'left Array(float),
        right Array(float), diff Array(float), stat_threshold Array(float),
        test text, query_index int') reports
    left array join ['server_time', 'memory', 'client_time'] as metric_name,
        left, right, diff, stat_threshold
    left join query_display_names
        on reports.test = query_display_names.test
        and reports.query_index = query_display_names.query_index
    ;

create table query_metric_stats engine File(TSVWithNamesAndTypes,
        'report/query-metric-stats.tsv')
    as
    select metric_name, left, right, diff, stat_threshold, test, query_index,
        query_display_name
    from query_metric_stat_arrays
    left array join metric_name, left, right, diff, stat_threshold
    ;

-- Main statistics for queries -- query time as reported in query log.
create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv')
    as select
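The two-step structure above is the workaround the comment points at (issues 11868 and 11757): materialize the array-valued columns in a view first, then run the ARRAY JOINs against that view instead of combining WITH, ARRAY JOIN and a regular JOIN in one statement. A toy clickhouse-local illustration of the shape (made-up data, not from the commit):

    clickhouse-local --query "
        create view stat_arrays as
            select ['server_time', 'memory', 'client_time'] as metric_name, [0.12, 4096., 0.15] as vals;
        select metric_name, vals from stat_arrays array join metric_name, vals;
    "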
@@ -392,7 +436,8 @@ create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv')
        not short and not changed_show and stat_threshold > report_threshold - 0.05 as unstable_show,

        left, right, diff, stat_threshold,
        if(report_threshold > 0, report_threshold, 0.10) as report_threshold,
        --if(report_threshold > 0, report_threshold, 0.10) as report_threshold,
        0.10 as report_threshold,
        test, query_index, query_display_name
    from query_metric_stats
    left join file('analyze/report-thresholds.tsv', TSV,
@@ -728,6 +773,64 @@ unset IFS
    grep -H -m2 -i '\(Exception\|Error\):[^:]' ./*-err.log | sed 's/:/\t/' >> run-errors.tsv ||:
}

function report_metrics
{
    rm -rf metrics ||:
    mkdir metrics

    clickhouse-local --stacktrace --verbose --query "
create view right_async_metric_log as
    select * from file('right-async-metric-log.tsv', TSVWithNamesAndTypes,
        'event_date Date, event_time DateTime, name String, value Float64')
    ;

-- Use the right log as time reference because it may have higher precision.
create table metrics engine File(TSV, 'metrics/metrics.tsv') as
    with (select min(event_time) from right_async_metric_log) as min_time
    select name metric, r.event_time - min_time event_time, l.value as left, r.value as right
    from right_async_metric_log r
    asof join file('left-async-metric-log.tsv', TSVWithNamesAndTypes,
        'event_date Date, event_time DateTime, name String, value Float64') l
    on l.name = r.name and r.event_time <= l.event_time
    order by metric, event_time
    ;
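The ASOF JOIN above merges two asynchronously written logs: for every right-server sample it picks, per metric name, the earliest left-server sample whose timestamp is not older. A toy clickhouse-local run of the matching rule (made-up timestamps):

    clickhouse-local --query "
        select r.t as right_t, l.t as left_t
        from (select arrayJoin([10, 20, 30]) as t, 1 as k) as r
        asof join (select arrayJoin([12, 19, 31]) as t, 1 as k) as l
        on r.k = l.k and r.t <= l.t
    "
    # Expected pairing: 10 -> 12, 20 -> 31, 30 -> 31 (19 is older than 20, so it is skipped).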
-- Show metrics that have changed
create table changes engine File(TSV, 'metrics/changes.tsv') as
    select metric, median(left) as left, median(right) as right,
        floor((right - left) / left, 3) diff,
        floor(if(left > right, left / right, right / left), 3) times_diff
    from metrics
    group by metric
    having abs(diff) > 0.05 and isFinite(diff)
    order by diff desc
    ;
"

    IFS=$'\n'
    for prefix in $(cut -f1 "metrics/metrics.tsv" | sort | uniq)
    do
        file="metrics/$prefix.tsv"
        grep "^$prefix " "metrics/metrics.tsv" | cut -f2- > "$file"

        gnuplot -e "
            set datafile separator '\t';
            set terminal png size 960,540;
            set xtics time format '%tH:%tM';
            set title '$prefix' noenhanced offset 0,-3;
            set key left top;
            plot
                '$file' using 1:2 with lines title 'Left'
                , '$file' using 1:3 with lines title 'Right'
            ;
        " \
            | convert - -filter point -resize "200%" "metrics/$prefix.png" &

    done
    wait
    unset IFS
}
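The per-metric loop renders without temporary image files: gnuplot writes the PNG to stdout and ImageMagick's convert upscales it with point filtering so the small chart stays readable. A self-contained version of the same pipeline with synthetic data (file names here are made up):

    printf '0\t1\t2\n1\t2\t3\n2\t4\t3\n' > /tmp/demo.tsv
    gnuplot -e "
        set datafile separator '\t';
        set terminal png size 960,540;
        plot '/tmp/demo.tsv' using 1:2 with lines title 'Left', '/tmp/demo.tsv' using 1:3 with lines title 'Right';
    " | convert - -filter point -resize '200%' /tmp/demo.png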

# Check that local and client are in PATH
clickhouse-local --version > /dev/null
clickhouse-client --version > /dev/null
@@ -749,12 +852,24 @@ case "$stage" in
    time run_benchmark 2> >(tee -a run-errors.tsv 1>&2) ||:
    ;&
"get_profiles")
    # Getting profiles inexplicably hangs sometimes, so try to save some logs if
    # this happens again. Give the servers some time to collect all info, then
    # trace and kill. Start in a subshell, so that both function don't interfere
    # with each other's jobs through `wait`. Also make the subshell have its own
    # process group, so that we can then kill it with all its child processes.
    # Somehow it doesn't kill the children by itself when dying.
    # Check for huge pages.
    cat /sys/kernel/mm/transparent_hugepage/enabled > thp-enabled.txt ||:
    cat /proc/meminfo > meminfo.txt ||:
    for pid in $(pgrep -f clickhouse-server)
    do
        cat "/proc/$pid/smaps" > "$pid-smaps.txt" ||:
    done

    # Sleep for five minutes to see how the servers enter a quiescent state (e.g.
    # how fast the memory usage drops).
    sleep 300

    # We had a bug where getting profiles froze sometimes, so try to save some
    # logs if this happens again. Give the servers some time to collect all info,
    # then trace and kill. Start in a subshell, so that both function don't
    # interfere with each other's jobs through `wait`. Also make the subshell
    # have its own process group, so that we can then kill it with all its child
    # processes. Somehow it doesn't kill the children by itself when dying.
    set -m
    ( get_profiles_watchdog ) &
    watchdog_pid=$!
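The set -m / subshell combination is what makes the watchdog killable as a unit: with job control enabled, a backgrounded subshell becomes its own process group, so signalling the negated PID later reaches the subshell and everything it spawned. A minimal sketch of the mechanism (the sleep stands in for get_profiles_watchdog; the kill form is an assumption about how the group is torn down later):

    set -m                       # job control: background jobs get their own process group
    ( sleep 600 ; echo 'watchdog fired' ) &
    watchdog_pid=$!
    # ... the real work would happen here ...
    kill -- "-$watchdog_pid"     # negative PID signals the whole group, children included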
@@ -781,7 +896,11 @@ case "$stage" in
    ;&
"report")
    time report ||:

    ;&
"report_metrics")
    time report_metrics ||:
    ;&
"report_html")
    time "$script_dir/report.py" --report=all-queries > all-queries.html 2> >(tee -a report/errors.log 1>&2) ||:
    time "$script_dir/report.py" > report.html
    ;&
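The ;& terminators are what make the stages resumable: unlike ;;, ;& falls through into the next branch, so entering the case statement at any named stage also runs every later stage. A tiny demonstration of the behavior:

    stage=report_metrics
    case "$stage" in
    "report")
        echo report
        ;&
    "report_metrics")
        echo report_metrics
        ;&
    "report_html")
        echo report_html
        ;;
    esac
    # prints report_metrics and then report_html, but not report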
@@ -133,6 +133,6 @@ dmesg -T > dmesg.log

7z a '-x!*/tmp' /output/output.7z ./*.{log,tsv,html,txt,rep,svg,columns} \
    {right,left}/{performance,scripts} {{right,left}/db,db0}/preprocessed_configs \
    report analyze benchmark
    report analyze benchmark metrics

cp compare.log /output
@@ -191,7 +191,7 @@ if args.report == 'main':
    slow_on_client_rows = tsvRows('report/slow-on-client.tsv')
    error_tests += len(slow_on_client_rows)
    printSimpleTable('Slow on client',
        ['Client time, s', 'Server time, s', 'Ratio', 'Test', 'Query'],
        ['Client time, s', 'Server time, s', 'Ratio', 'Test', 'Query'],
        slow_on_client_rows)

    def print_changes():
@@ -203,8 +203,8 @@ if args.report == 'main':

        print(tableStart('Changes in performance'))
        columns = [
            'Old, s', # 0
            'New, s', # 1
            'Old, s', # 0
            'New, s', # 1
            'Relative difference (new − old) / old', # 2
            'p < 0.001 threshold', # 3
            # Failed # 4
@@ -245,8 +245,8 @@ if args.report == 'main':
        unstable_queries += len(unstable_rows)

        columns = [
            'Old, s', #0
            'New, s', #1
            'Old, s', #0
            'New, s', #1
            'Relative difference (new - old)/old', #2
            'p < 0.001 threshold', #3
            # Failed #4
@@ -288,13 +288,13 @@ if args.report == 'main':

        columns = [
            'Test', #0
            'Wall clock time, s', #1
            'Total client time, s', #2
            'Wall clock time, s', #1
            'Total client time, s', #2
            'Total queries', #3
            'Ignored short queries', #4
            'Longest query<br>(sum for all runs), s', #5
            'Avg wall clock time<br>(sum for all runs), s', #6
            'Shortest query<br>(sum for all runs), s', #7
            'Longest query<br>(sum for all runs), s', #5
            'Avg wall clock time<br>(sum for all runs), s', #6
            'Shortest query<br>(sum for all runs), s', #7
        ]

        print(tableStart('Test times'))
@@ -384,6 +384,11 @@ if args.report == 'main':
            *sys.exc_info()[:2])[-1])
        pass

    printSimpleTable('Metric changes',
        ['Metric', 'Old median value', 'New median value',
            'Relative difference', 'Times difference'],
        tsvRows('metrics/changes.tsv'))

    print_report_errors()

    print("""
@@ -450,8 +455,8 @@ elif args.report == 'all-queries':
    columns = [
        # Changed #0
        # Unstable #1
        'Old, s', #2
        'New, s', #3
        'Old, s', #2
        'New, s', #3
        'Relative difference (new − old) / old', #4
        'Times speedup / slowdown', #5
        'p < 0.001 threshold', #6