performance comparison

2024-11-21 15:12:02 +00:00 · 2020-06-25 23:19:27 +03:00 · 2020-06-25 23:19:27 +03:00 · e521d616fe
commit e521d616fe
parent 875369676f
4 changed files with 185 additions and 59 deletions
--- a/docker/test/performance-comparison/Dockerfile
+++ b/docker/test/performance-comparison/Dockerfile
@ -12,6 +12,8 @@ RUN apt-get update \
            g++ \
            gdb \
            git \
+            gnuplot \
+            imagemagick \
            libc6-dbg \
            moreutils \
            ncdu \
--- a/docker/test/performance-comparison/compare.sh
+++ b/docker/test/performance-comparison/compare.sh
@ -36,18 +36,14 @@ function configure
    while killall clickhouse-server; do echo . ; sleep 1 ; done
    echo all killed

-    # Remove logs etc, because they will be updated, and sharing them between
-    # servers with hardlink might cause unpredictable behavior.
-    rm db0/data/system/* -rf ||:
-    rm db0/metadata/system/* -rf ||:
-
    # Make copies of the original db for both servers. Use hardlinks instead
-    # of copying. Be careful to remove preprocessed configs and system tables,or
-    # it can lead to weird effects.
+    # of copying to save space. Before that, remove preprocessed configs and
+    # system tables, because sharing them between servers with hardlinks may
+    # lead to weird effects.
    rm -r left/db ||:
    rm -r right/db ||:
    rm -r db0/preprocessed_configs ||:
-    rm -r db/{data,metadata}/system ||:
+    rm -r db0/{data,metadata}/system ||:
    cp -al db0/ left/db/
    cp -al db0/ right/db/
 }
@ -143,15 +139,18 @@ function run_tests
        touch "$x"
    done

+    # Randomize test order.
+    test_files=$(for f in $test_files; do echo "$f"; done | sort -R)
+
    # Run the tests.
    test_name="<none>"
    for test in $test_files
    do
-        # Check that both servers are alive, to fail faster if they die.
+        # Check that both servers are alive, and restart them if they die.
        clickhouse-client --port 9001 --query "select 1 format Null" \
-            || { echo $test_name >> left-server-died.log ; restart ; continue ; }
+            || { echo $test_name >> left-server-died.log ; restart ; }
        clickhouse-client --port 9002 --query "select 1 format Null" \
-            || { echo $test_name >> right-server-died.log ; restart ; continue ; }
+            || { echo $test_name >> right-server-died.log ; restart ; }

        test_name=$(basename "$test" ".xml")
        echo test "$test_name"
@ -217,10 +216,13 @@ function get_profiles
    # Collect the profiles
    clickhouse-client --port 9001 --query "set query_profiler_cpu_time_period_ns = 0"
    clickhouse-client --port 9001 --query "set query_profiler_real_time_period_ns = 0"
-    clickhouse-client --port 9001 --query "set query_profiler_cpu_time_period_ns = 0"
-    clickhouse-client --port 9001 --query "set query_profiler_real_time_period_ns = 0"
-    clickhouse-client --port 9001 --query "system flush logs"
-    clickhouse-client --port 9002 --query "system flush logs"
+    clickhouse-client --port 9001 --query "system flush logs" &
+
+    clickhouse-client --port 9002 --query "set query_profiler_cpu_time_period_ns = 0"
+    clickhouse-client --port 9002 --query "set query_profiler_real_time_period_ns = 0"
+    clickhouse-client --port 9002 --query "system flush logs" &
+
+    wait

    clickhouse-client --port 9001 --query "select * from system.query_log where type = 2 format TSVWithNamesAndTypes" > left-query-log.tsv ||: &
    clickhouse-client --port 9001 --query "select * from system.query_thread_log format TSVWithNamesAndTypes" > left-query-thread-log.tsv ||: &
@ -292,25 +294,54 @@ create view right_query_log as select *
    from file('right-query-log.tsv', TSVWithNamesAndTypes,
        '$(cat "right-query-log.tsv.columns")');

-create table query_metrics engine File(TSV, -- do not add header -- will parse with grep
-        'analyze/query-run-metrics.tsv')
-    as select
-        test, query_index, 0 run, version,
-        [
-            -- server-reported time
-            query_duration_ms / toFloat64(1000)
-            , toFloat64(memory_usage)
-            -- client-reported time
-            , query_runs.time
-        ] metrics
-    from (
-        select query_duration_ms, memory_usage, query_id, 0 version from left_query_log
-        union all
-        select query_duration_ms, memory_usage, query_id, 1 version from right_query_log
-    ) query_logs
+create view query_logs as
+    select *, 0 version from left_query_log
+    union all
+    select *, 1 version from right_query_log
+    ;
+
+create table query_run_metrics_full engine File(TSV, 'analyze/query-run-metrics-full.tsv')
+    as
+    with (
+        -- sumMapState with the list of all keys with '-0.' values. Negative zero is because
+        -- sumMap removes keys with positive zeros.
+        with (select groupUniqArrayArray(ProfileEvents.Names) from query_logs) as all_names
+            select arrayReduce('sumMapState', [(all_names, arrayMap(x->-0., all_names))])
+        ) as all_metrics
+    select test, query_index, version, query_id,
+        (finalizeAggregation(
+            arrayReduce('sumMapMergeState',
+                [
+                    all_metrics,
+                    arrayReduce('sumMapState',
+                        [(ProfileEvents.Names,
+                            arrayMap(x->toFloat64(x), ProfileEvents.Values))]
+                    ),
+                    arrayReduce('sumMapState', [(
+                        ['client_time', 'server_time'],
+                        arrayMap(x->if(x != 0., x, -0.), [
+                            toFloat64(query_runs.time),
+                            toFloat64(query_duration_ms / 1000.)]))])
+                ]
+            )) as metrics_tuple).1 metric_names,
+        metrics_tuple.2 metric_values
+    from query_logs
    right join query_runs
-    using (query_id, version)
-    order by test, query_index
+        on query_logs.query_id = query_runs.query_id
+            and query_logs.version = query_runs.version
+    ;
+
+create table query_run_metrics engine File(
+        TSV, -- do not add header -- will parse with grep
+        'analyze/query-run-metrics.tsv')
+    as select test, query_index, 0 run, version, metric_values
+    from query_run_metrics_full
+    where test = 'arithmetic'
+    order by test, query_index, run, version
+    ;
+
+create table query_run_metric_names engine File(TSV, 'analyze/query-run-metric-names.tsv')
+    as select metric_names from query_run_metrics_full limit 1
    ;
 "

@ -320,7 +351,6 @@ create table query_metrics engine File(TSV, -- do not add header -- will parse w
 # query. We also don't have lateral joins. So I just put all runs of each
 # query into a separate file, and then compute randomization distribution
 # for each file. I do this in parallel using GNU parallel.
-query_index=1
 IFS=$'\n'
 for prefix in $(cut -f1,2 "analyze/query-run-metrics.tsv" | sort | uniq)
 do
@ -360,20 +390,34 @@ create view query_display_names as select * from
        'test text, query_index int, query_display_name text')
    ;

-create table query_metric_stats engine File(TSVWithNamesAndTypes,
-        'report/query-metric-stats.tsv') as
+-- WITH, ARRAY JOIN and CROSS JOIN do not like each other:
+--  https://github.com/ClickHouse/ClickHouse/issues/11868
+--  https://github.com/ClickHouse/ClickHouse/issues/11757
+-- Because of this, we make a view with arrays first, and then apply all the
+-- array joins.
+
+create view query_metric_stat_arrays as
+    with (select * from file('analyze/query-run-metric-names.tsv',
+        TSV, 'n Array(String)')) as metric_name
    select metric_name, left, right, diff, stat_threshold, test, query_index,
        query_display_name
    from file ('analyze/query-reports.tsv', TSV, 'left Array(float),
        right Array(float), diff Array(float), stat_threshold Array(float),
        test text, query_index int') reports
-    left array join ['server_time', 'memory', 'client_time'] as metric_name,
-        left, right, diff, stat_threshold
    left join query_display_names
        on reports.test = query_display_names.test
            and reports.query_index = query_display_names.query_index
    ;

+create table query_metric_stats engine File(TSVWithNamesAndTypes,
+        'report/query-metric-stats.tsv')
+    as
+    select metric_name, left, right, diff, stat_threshold, test, query_index,
+        query_display_name
+    from query_metric_stat_arrays
+    left array join metric_name, left, right, diff, stat_threshold
+    ;
+
 -- Main statistics for queries -- query time as reported in query log.
 create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv')
    as select
@ -392,7 +436,8 @@ create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv')
        not short and not changed_show and stat_threshold > report_threshold - 0.05 as unstable_show,
        
        left, right, diff, stat_threshold,
-        if(report_threshold > 0, report_threshold, 0.10) as report_threshold,
+        --if(report_threshold > 0, report_threshold, 0.10) as report_threshold,
+        0.10 as report_threshold,
        test, query_index, query_display_name
    from query_metric_stats
    left join file('analyze/report-thresholds.tsv', TSV,
@ -728,6 +773,64 @@ unset IFS
 grep -H -m2 -i '\(Exception\|Error\):[^:]' ./*-err.log | sed 's/:/\t/' >> run-errors.tsv ||:
 }

+function report_metrics
+{
+rm -rf metrics ||:
+mkdir metrics
+
+clickhouse-local --stacktrace --verbose --query "
+create view right_async_metric_log as
+    select * from file('right-async-metric-log.tsv', TSVWithNamesAndTypes,
+        'event_date Date, event_time DateTime, name String, value Float64')
+    ;
+
+-- Use the right log as time reference because it may have higher precision.
+create table metrics engine File(TSV, 'metrics/metrics.tsv') as
+    with (select min(event_time) from right_async_metric_log) as min_time
+    select name metric, r.event_time - min_time event_time, l.value as left, r.value as right
+    from right_async_metric_log r
+    asof join file('left-async-metric-log.tsv', TSVWithNamesAndTypes,
+        'event_date Date, event_time DateTime, name String, value Float64') l
+    on l.name = r.name and r.event_time <= l.event_time
+    order by metric, event_time
+    ;
+
+-- Show metrics that have changed
+create table changes engine File(TSV, 'metrics/changes.tsv') as
+    select metric, median(left) as left, median(right) as right,
+        floor((right - left) / left, 3) diff,
+        floor(if(left > right, left / right, right / left), 3) times_diff
+    from metrics
+    group by metric
+    having abs(diff) > 0.05 and isFinite(diff)
+    order by diff desc
+    ;
+"
+
+IFS=$'\n'
+for prefix in $(cut -f1 "metrics/metrics.tsv" | sort | uniq)
+do
+    file="metrics/$prefix.tsv"
+    grep "^$prefix	" "metrics/metrics.tsv" | cut -f2- > "$file"
+
+    gnuplot -e "
+        set datafile separator '\t';
+        set terminal png size 960,540;
+        set xtics time format '%tH:%tM';
+        set title '$prefix' noenhanced offset 0,-3;
+        set key left top;
+        plot
+            '$file' using 1:2 with lines title 'Left'
+            , '$file' using 1:3 with lines title 'Right'
+            ;
+    " \
+        | convert - -filter point -resize "200%" "metrics/$prefix.png" &
+
+done
+wait
+unset IFS
+}
+
 # Check that local and client are in PATH
 clickhouse-local --version > /dev/null
 clickhouse-client --version > /dev/null
@ -749,12 +852,24 @@ case "$stage" in
    time run_benchmark 2> >(tee -a run-errors.tsv 1>&2) ||:
    ;&
 "get_profiles")
-    # Getting profiles inexplicably hangs sometimes, so try to save some logs if
-    # this happens again. Give the servers some time to collect all info, then
-    # trace and kill. Start in a subshell, so that both function don't interfere
-    # with each other's jobs through `wait`. Also make the subshell have its own
-    # process group, so that we can then kill it with all its child processes.
-    # Somehow it doesn't kill the children by itself when dying.
+    # Check for huge pages.
+    cat /sys/kernel/mm/transparent_hugepage/enabled > thp-enabled.txt ||:
+    cat /proc/meminfo > meminfo.txt ||:
+    for pid in $(pgrep -f clickhouse-server)
+    do
+        cat "/proc/$pid/smaps" > "$pid-smaps.txt" ||:
+    done
+
+    # Sleep for five minutes to see how the servers enter a quiescent state (e.g.
+    # how fast the memory usage drops).
+    sleep 300
+
+    # We had a bug where getting profiles froze sometimes, so try to save some
+    # logs if this happens again. Give the servers some time to collect all info,
+    # then trace and kill. Start in a subshell, so that both function don't
+    # interfere with each other's jobs through `wait`. Also make the subshell
+    # have its own process group, so that we can then kill it with all its child
+    # processes. Somehow it doesn't kill the children by itself when dying.
    set -m
    ( get_profiles_watchdog ) &
    watchdog_pid=$!
@ -781,7 +896,11 @@ case "$stage" in
    ;&
 "report")
    time report ||:
-
+    ;&
+"report_metrics")
+    time report_metrics ||:
+    ;&
+"report_html")
    time "$script_dir/report.py" --report=all-queries > all-queries.html 2> >(tee -a report/errors.log 1>&2) ||:
    time "$script_dir/report.py" > report.html
    ;&
--- a/docker/test/performance-comparison/entrypoint.sh
+++ b/docker/test/performance-comparison/entrypoint.sh
@ -133,6 +133,6 @@ dmesg -T > dmesg.log

 7z a '-x!*/tmp' /output/output.7z ./*.{log,tsv,html,txt,rep,svg,columns} \
    {right,left}/{performance,scripts} {{right,left}/db,db0}/preprocessed_configs \
-    report analyze benchmark
+    report analyze benchmark metrics

 cp compare.log /output
--- a/docker/test/performance-comparison/report.py
+++ b/docker/test/performance-comparison/report.py
@ -191,7 +191,7 @@ if args.report == 'main':
    slow_on_client_rows = tsvRows('report/slow-on-client.tsv')
    error_tests += len(slow_on_client_rows)
    printSimpleTable('Slow on client',
-                     ['Client time, s', 'Server time, s', 'Ratio', 'Test', 'Query'],
+                     ['Client time,&nbsp;s', 'Server time,&nbsp;s', 'Ratio', 'Test', 'Query'],
                     slow_on_client_rows)

    def print_changes():
@ -203,8 +203,8 @@ if args.report == 'main':

        print(tableStart('Changes in performance'))
        columns = [
-            'Old, s',                                          # 0
-            'New, s',                                          # 1
+            'Old,&nbsp;s',                                          # 0
+            'New,&nbsp;s',                                          # 1
            'Relative difference (new&nbsp;&minus;&nbsp;old) / old',   # 2
            'p&nbsp;<&nbsp;0.001 threshold',                   # 3
            # Failed                                           # 4
@ -245,8 +245,8 @@ if args.report == 'main':
        unstable_queries += len(unstable_rows)

        columns = [
-            'Old, s', #0
-            'New, s', #1
+            'Old,&nbsp;s', #0
+            'New,&nbsp;s', #1
            'Relative difference (new&nbsp;-&nbsp;old)/old', #2
            'p&nbsp;<&nbsp;0.001 threshold', #3
            # Failed #4
@ -288,13 +288,13 @@ if args.report == 'main':

        columns = [
            'Test',                                          #0
-            'Wall clock time, s',                            #1
-            'Total client time, s',                          #2
+            'Wall clock time,&nbsp;s',                            #1
+            'Total client time,&nbsp;s',                          #2
            'Total queries',                                 #3
            'Ignored short queries',                         #4
-            'Longest query<br>(sum for all runs), s',        #5
-            'Avg wall clock time<br>(sum for all runs), s',  #6
-            'Shortest query<br>(sum for all runs), s',       #7
+            'Longest query<br>(sum for all runs),&nbsp;s',        #5
+            'Avg wall clock time<br>(sum for all runs),&nbsp;s',  #6
+            'Shortest query<br>(sum for all runs),&nbsp;s',       #7
            ]

        print(tableStart('Test times'))
@ -384,6 +384,11 @@ if args.report == 'main':
                *sys.exc_info()[:2])[-1])
        pass

+    printSimpleTable('Metric changes',
+        ['Metric', 'Old median value', 'New median value',
+            'Relative difference', 'Times difference'],
+        tsvRows('metrics/changes.tsv'))
+
    print_report_errors()

    print("""
@ -450,8 +455,8 @@ elif args.report == 'all-queries':
        columns = [
            # Changed #0
            # Unstable #1
-            'Old, s', #2
-            'New, s', #3
+            'Old,&nbsp;s', #2
+            'New,&nbsp;s', #3
            'Relative difference (new&nbsp;&minus;&nbsp;old) / old', #4
            'Times speedup / slowdown',                 #5
            'p&nbsp;<&nbsp;0.001 threshold',          #6