diff --git a/docker/test/performance-comparison/Dockerfile b/docker/test/performance-comparison/Dockerfile index b733a127af1..2710c386ac7 100644 --- a/docker/test/performance-comparison/Dockerfile +++ b/docker/test/performance-comparison/Dockerfile @@ -18,6 +18,7 @@ RUN apt-get update \ python3 \ python3-dev \ python3-pip \ + rsync \ tree \ tzdata \ vim \ diff --git a/docker/test/performance-comparison/compare.sh b/docker/test/performance-comparison/compare.sh index c8eeec16962..58707be68d3 100755 --- a/docker/test/performance-comparison/compare.sh +++ b/docker/test/performance-comparison/compare.sh @@ -45,7 +45,10 @@ function configure sed -i 's/9000/9001/g' left/config/config.xml sed -i 's/9000/9002/g' right/config/config.xml - cat > right/config/config.d/zz-perf-test-tweaks.xml < right/config/config.d/zz-perf-test-tweaks-config.xml < true @@ -59,7 +62,20 @@ function configure EOF - cp right/config/config.d/zz-perf-test-tweaks.xml left/config/config.d/zz-perf-test-tweaks.xml + cat > right/config/users.d/zz-perf-test-tweaks-users.xml < + + + 10000000 + 0 + 1 + + + +EOF + + cp right/config/config.d/zz-perf-test-tweaks-config.xml left/config/config.d/zz-perf-test-tweaks-config.xml + cp right/config/users.d/zz-perf-test-tweaks-users.xml left/config/users.d/zz-perf-test-tweaks-users.xml rm left/config/config.d/metric_log.xml ||: rm left/config/config.d/text_log.xml ||: @@ -81,6 +97,13 @@ EOF left/clickhouse client --port 9001 --query "create database test" ||: left/clickhouse client --port 9001 --query "rename table datasets.hits_v1 to test.hits" ||: + + while killall clickhouse ; do echo . ; sleep 1 ; done + echo all killed + + # Remove logs etc, because they will be updated, and sharing them between + # servers with hardlink might cause unpredictable behavior. 
+ rm db0/data/system/* -rf ||: } function restart @@ -125,11 +148,36 @@ function run_tests rm -v test-times.tsv ||: + # Why the ugly cut: + # 1) can't make --out-format='%n' work for deleted files, it outputs things + # like "deleted 1.xml"; + # 2) the output is not tab separated, but at least it's fixed width, so I + # cut by characters. + changed_files=$(rsync --dry-run --dirs --checksum --delete --itemize-changes left/performance/ right/performance/ | cut -c13-) + # FIXME remove some broken long tests rm right/performance/{IPv4,IPv6,modulo,parse_engine_file,number_formatting_formats,select_format}.xml ||: + test_files=$(ls right/performance/*) + + # FIXME a quick crutch to bring the run time down for the flappy tests -- + # run only those that have changed. Only on my prs for now. + if grep Kuzmenkov right-commit.txt + then + if [ "$PR_TO_TEST" != "0" ] + then + test_files=$(cd right/performance && readlink -e $changed_files) + fi + fi + + # Run only explicitly specified tests, if any + if [ -v CHPC_TEST_GLOB ] + then + test_files=$(ls right/performance/${CHPC_TEST_GLOB}.xml) + fi + # Run the tests - for test in right/performance/${CHPC_TEST_GLOB:-*}.xml + for test in $test_files do test_name=$(basename $test ".xml") echo test $test_name @@ -138,8 +186,19 @@ function run_tests { time "$script_dir/perf.py" "$test" > "$test_name-raw.tsv" 2> "$test_name-err.log" ; } 2>&1 >/dev/null | grep -v ^+ >> "wall-clock-times.tsv" || continue grep ^query "$test_name-raw.tsv" | cut -f2- > "$test_name-queries.tsv" grep ^client-time "$test_name-raw.tsv" | cut -f2- > "$test_name-client-time.tsv" - right/clickhouse local --file "$test_name-queries.tsv" --structure 'query text, run int, version UInt32, time float' --query "$(cat $script_dir/eqmed.sql)" > "$test_name-report.tsv" + # this may be slow, run it in background + right/clickhouse local --file "$test_name-queries.tsv" --structure 'query text, run int, version UInt32, time float' --query "$(cat $script_dir/eqmed.sql)" > 
"$test_name-report.tsv" & done + + wait + + # Collect the profiles + left/clickhouse client --port 9001 --query "select * from system.trace_log format TSVWithNamesAndTypes" > left-trace-log.tsv ||: & + left/clickhouse client --port 9001 --query "select arrayJoin(trace) addr, concat(splitByChar('/', addressToLine(addr))[-1], '#', demangle(addressToSymbol(addr)) ) name from system.trace_log group by addr format TSVWithNamesAndTypes" > left-addresses.tsv ||: & + right/clickhouse client --port 9002 --query "select * from system.trace_log format TSVWithNamesAndTypes" > right-trace-log.tsv ||: & + right/clickhouse client --port 9002 --query "select arrayJoin(trace) addr, concat(splitByChar('/', addressToLine(addr))[-1], '#', demangle(addressToSymbol(addr)) ) name from system.trace_log group by addr format TSVWithNamesAndTypes" > right-addresses.tsv ||: & + + wait } # Analyze results @@ -156,7 +215,7 @@ create table queries engine Memory as select -- remove them altogether because we want to be able to detect regressions, -- but the right way to do this is not yet clear. 
not short and abs(diff) < 0.05 and rd[3] > 0.05 as unstable, - not short and abs(diff) > 0.05 and abs(diff) > rd[3] as changed, + not short and abs(diff) > 0.10 and abs(diff) > rd[3] as changed, * from file('*-report.tsv', TSV, 'left float, right float, diff float, rd Array(float), query text'); @@ -201,7 +260,7 @@ create table test_times_tsv engine File(TSV, 'test-times.tsv') as floor(real / queries, 3) avg_real_per_query, floor(query_min, 3) from test_time join wall_clock using test - order by query_max / query_min desc; + order by avg_real_per_query desc; create table all_queries_tsv engine File(TSV, 'all-queries.tsv') as select left, right, diff, rd, test, query diff --git a/docker/test/performance-comparison/eqmed.sql b/docker/test/performance-comparison/eqmed.sql index d7265533208..07f72963067 100644 --- a/docker/test/performance-comparison/eqmed.sql +++ b/docker/test/performance-comparison/eqmed.sql @@ -8,7 +8,7 @@ select query from ( - select query, quantiles(0.05, 0.5, 0.95)(abs(time_by_label[1] - time_by_label[2])) rd_quantiles -- quantiles of randomization distribution + select query, quantiles(0.05, 0.5, 0.95, 0.99)(abs(time_by_label[1] - time_by_label[2])) rd_quantiles -- quantiles of randomization distribution from ( select query, virtual_run, groupArrayInsertAt(median_time, random_label) time_by_label -- make array 'random label' -> 'median time' diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py index c45b9f93827..1205fc97ffd 100755 --- a/docker/test/performance-comparison/perf.py +++ b/docker/test/performance-comparison/perf.py @@ -10,6 +10,15 @@ import pprint import time import traceback +stage_start_seconds = time.perf_counter() + +def report_stage_end(stage_name): + global stage_start_seconds + print('{}\t{}'.format(stage_name, time.perf_counter() - stage_start_seconds)) + stage_start_seconds = time.perf_counter() + +report_stage_end('start') + parser = argparse.ArgumentParser(description='Run 
performance test.') # Explicitly decode files as UTF-8 because sometimes we have Russian characters in queries, and LANG=C is set. parser.add_argument('file', metavar='FILE', type=argparse.FileType('r', encoding='utf-8'), nargs=1, help='test description file') @@ -35,6 +44,8 @@ if infinite_sign is not None: servers = [{'host': host, 'port': port} for (host, port) in zip(args.host, args.port)] connections = [clickhouse_driver.Client(**server) for server in servers] +report_stage_end('connect') + # Check tables that should exist tables = [e.text for e in root.findall('preconditions/table_exists')] for t in tables: @@ -47,6 +58,8 @@ for c in connections: for s in settings: c.execute("set {} = '{}'".format(s.tag, s.text)) +report_stage_end('preconditions') + # Process substitutions subst_elems = root.findall('substitutions/substitution') @@ -61,6 +74,8 @@ parameter_combinations = [dict(zip(parameter_keys, parameter_combination)) for p def substitute_parameters(query_templates, parameter_combinations): return list(set([template.format(**parameters) for template, parameters in itertools.product(query_templates, parameter_combinations)])) +report_stage_end('substitute') + # Run drop queries, ignoring errors drop_query_templates = [q.text for q in root.findall('drop_query')] drop_queries = substitute_parameters(drop_query_templates, parameter_combinations) @@ -86,6 +101,8 @@ for c in connections: for q in fill_queries: c.execute(q) +report_stage_end('fill') + # Run test queries def tsv_escape(s): return s.replace('\\', '\\\\').replace('\t', '\\t').replace('\n', '\\n').replace('\r','') @@ -93,6 +110,8 @@ def tsv_escape(s): test_query_templates = [q.text for q in root.findall('query')] test_queries = substitute_parameters(test_query_templates, parameter_combinations) +report_stage_end('substitute2') + for q in test_queries: # Prewarm: run once on both servers. Helps to bring the data into memory, # precompile the queries, etc. 
@@ -115,9 +134,13 @@ for q in test_queries: client_seconds = time.perf_counter() - start_seconds print('client-time\t{}\t{}\t{}'.format(tsv_escape(q), client_seconds, server_seconds)) +report_stage_end('benchmark') + # Run drop queries drop_query_templates = [q.text for q in root.findall('drop_query')] drop_queries = substitute_parameters(drop_query_templates, parameter_combinations) for c in connections: for q in drop_queries: c.execute(q) + +report_stage_end('drop')