Fix a rare false negative in perf tests

2024-09-19 16:20:50 +00:00 · 2021-03-02 19:21:30 +03:00 · 2021-03-02 19:21:30 +03:00 · 08148e062f
commit 08148e062f
parent a8f0fd1b26
3 changed files with 33 additions and 10 deletions
--- a/docker/test/performance-comparison/compare.sh
+++ b/docker/test/performance-comparison/compare.sh
@ -358,6 +358,8 @@ mkdir analyze analyze/tmp ||:
 build_log_column_definitions

 # Split the raw test output into files suitable for analysis.
+# To debug calculations only for a particular test, substitute a suitable
+# wildcard here, e.g. `for test_file in modulo-raw.tsv`.
 for test_file in *-raw.tsv
 do
    test_name=$(basename "$test_file" "-raw.tsv")
@ -467,7 +469,13 @@ create view broken_queries as
 create table query_run_metrics_for_stats engine File(
        TSV, -- do not add header -- will parse with grep
        'analyze/query-run-metrics-for-stats.tsv')
-    as select test, query_index, 0 run, version, metric_values
+    as select test, query_index, 0 run, version,
+        -- For debugging, add a filter for a particular metric like this:
+        -- arrayFilter(m, n -> n = 'client_time', metric_values, metric_names)
+        --     metric_values
+        -- Note that further reporting may break, because the metric names are
+        -- not filtered.
+        metric_values
    from query_run_metric_arrays
    where (test, query_index) not in broken_queries
    order by test, query_index, run, version
@ -585,8 +593,19 @@ create view query_metric_stats as
 -- Main statistics for queries -- query time as reported in query log.
 create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv')
    as select
-        abs(diff) > report_threshold        and abs(diff) > stat_threshold as changed_fail,
-        abs(diff) > report_threshold - 0.05 and abs(diff) > stat_threshold as changed_show,
+        -- It is important to have a non-strict inequality with stat_threshold
+        -- here. The randomization distribution is actually discrete, and when
+        -- the number of runs is small, the quantile we need (e.g. 0.99) turns
+        -- out to be the maximum value of the distribution. We can also hit this
+        -- maximum possible value with our test run, and this obviously means
+        -- that we have observed the difference to the best precision possible
+        -- for the given number of runs. If we use a strict inequality here, we
+        -- will miss such cases. This happened in the wild and led to some
+        -- uncaught regressions, because for the default 7 runs we do for PRs,
+        -- the randomization distribution has only 16 values, so the max quantile
+        -- is actually 0.9375.
+        abs(diff) > report_threshold        and abs(diff) >= stat_threshold as changed_fail,
+        abs(diff) > report_threshold - 0.05 and abs(diff) >= stat_threshold as changed_show,

        not changed_fail and stat_threshold > report_threshold + 0.10 as unstable_fail,
        not changed_show and stat_threshold > report_threshold - 0.05 as unstable_show,
--- a/docker/test/performance-comparison/eqmed.sql
+++ b/docker/test/performance-comparison/eqmed.sql
@ -1,4 +1,6 @@
-- input is table(test text, query text, run UInt32, version int, metrics Array(float))
+-- The input is table(test text, query text, run UInt32, version UInt8, metrics Array(float)).
+-- Run like this:
+-- clickhouse-local --queries-file eqmed.sql -S 'test text, query text, run UInt32, version UInt8, metrics Array(float)' --file analyze/tmp/modulo_0.tsv
 select
   arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[1] as l) l_rounded,
   arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[2] as r) r_rounded,
@ -8,14 +10,19 @@ select
 from
   (
      -- quantiles of randomization distributions
+      -- note that for a small number of runs, the exact quantile might not make
+      -- sense, because the last possible value of the randomization distribution
+      -- might take a larger percentage of the distribution (i.e. the distribution
+      -- actually has discrete values, and the last step can be large).
      select quantileExactForEach(0.99)(
        arrayMap(x, y -> abs(x - y), metrics_by_label[1], metrics_by_label[2]) as d
      ) threshold
-      ---- uncomment to see what the distribution is really like
-      --, uniqExact(d.1) u
+      ---- Uncomment to see what the distribution is really like. This debug
+      ---- code only works for a single (the first) metric.
+      --, uniqExact(d[1]) u
      --, arraySort(x->x.1,
      --      arrayZip(
-      --          (sumMap([d.1], [1]) as f).1,
+      --          (sumMap([d[1]], [1]) as f).1,
      --          f.2)) full_histogram
      from
         (
--- a/tests/performance/modulo.xml
+++ b/tests/performance/modulo.xml
@ -1,7 +1,4 @@
 <test>
-
-
-
    <query>SELECT number % 128 FROM numbers(300000000) FORMAT Null</query>
    <query>SELECT number % 255 FROM numbers(300000000) FORMAT Null</query>
    <query>SELECT number % 256 FROM numbers(300000000) FORMAT Null</query>