ClickHouse/tests/performance/scripts/eqmed.sql

-- The input is table(test text, query text, run UInt32, version UInt8, metrics Array(float)).
-- Run like this:
-- clickhouse-local --queries-file eqmed.sql -S 'test text, query text, run UInt32, version UInt8, metrics Array(float)' --file analyze/tmp/modulo_0.tsv
-- Purpose: for a single (test, query) pair, compare the per-version medians of
-- every metric and compute a significance threshold for their difference using
-- a randomization test: the measurements are shuffled and split into two
-- arbitrary halves many times, and the 0.99 quantile of the absolute difference
-- between the medians of the halves becomes the threshold.
--
-- Output: every comma-joined subquery below is an aggregate without GROUP BY,
-- so the cross join yields a single row. All array columns have one element
-- per metric.
--   l_rounded, r_rounded -- medians at array positions 1 and 2 of
--                           medians_by_version, rounded down to 4 decimals
--                           (presumably version 0 = left/old and
--                           version 1 = right/new server -- confirm with caller)
--   diff_percent         -- (r - l) / l rounded down to 3 decimals; despite the
--                           name this is a fraction, not multiplied by 100
--   threshold_percent    -- threshold / l rounded down to 3 decimals; also a
--                           fraction
--   test, query          -- identification of the measured query
select
   -- `l` and `r` are aliases declared inline and reused by the two expressions
   -- that follow.
   arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[1] as l) l_rounded,
   arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[2] as r) r_rounded,
   -- relative change of the median from l to r, per metric
   arrayMap(x, y -> floor((y - x) / x, 3), l, r) diff_percent,
   -- randomization threshold relative to the l median, per metric
   arrayMap(x, y -> floor(x / y, 3), threshold, l) threshold_percent,
   test, query
from
   (
      -- quantiles of randomization distributions
      -- note that for small number of runs, the exact quantile might not make
      -- sense, because the last possible value of randomization distribution
      -- might take a larger percentage of distribution (i.e. the distribution
      -- actually has discrete values, and the last step can be large).
      --
      -- `d` is the per-metric absolute difference between the medians of the
      -- two randomly labeled groups within one virtual run; the -ForEach
      -- combinator takes the 0.99 quantile separately for every metric.
      select quantileExactForEach(0.99)(
        arrayMap(x, y -> abs(x - y), metrics_by_label[1], metrics_by_label[2]) as d
      ) threshold
      ---- Uncomment to see what the distribution is really like. This debug
      ---- code only works for single (the first) metric.
      --, uniqExact(d[1]) u
      --, arraySort(x->x.1,
      --      arrayZip(
      --          (sumMap([d[1]], [1]) as f).1,
      --          f.2)) full_histogram
      from
         (
            -- make array 'random label' -> '[median metric]'
            -- (labels are 0 and 1, groupArrayInsertAt positions are zero-based,
            -- so they end up at array indexes 1 and 2)
            select virtual_run, groupArrayInsertAt(median_metrics, random_label) metrics_by_label
            from (
                  -- get [median metric] arrays among virtual runs, grouping by random label
                  select medianExactForEach(metrics) median_metrics, virtual_run, random_label
                  from (
                        -- randomly relabel measurements: the rows arrive in random
                        -- order within each virtual run, so alternating 0/1 by row
                        -- number assigns them to two random groups
                        select *, toUInt32(rowNumberInAllBlocks() % 2) random_label
                        from (
                              select metrics, number virtual_run
                              from
                                -- strip the query away before the join -- it might be several kB long;
                                (select metrics, run, version from table) no_query,
                                -- duplicate input measurements into many virtual runs
                                -- (10000 copies, virtual_run numbered from 1)
                                numbers(1, 10000) nn
                              -- for each virtual run, randomly reorder measurements
                              order by virtual_run, rand()
                           ) virtual_runs
                     ) relabeled
                  group by virtual_run, random_label
               ) virtual_medians
            group by virtual_run -- collapse the two per-label rows of a virtual run into one array
         ) virtual_medians_array
      -- no GROUP BY here: this select aggregates across all virtual runs
   ) rd,
   (
        -- make array 'version' -> '[median metric]' from the real (unshuffled)
        -- measurements; one row in total
        select groupArrayInsertAt(median_metrics, version) medians_by_version
        from
        (
            -- per-metric median of the original measurements for each version
            select medianExactForEach(metrics) median_metrics, version
            from table
            group by version
        ) original_medians
   ) original_medians_array,
   (
        -- the input is expected to hold a single (test, query) pair (enforced by
        -- the check below), so any() just picks that pair
        select any(test) test, any(query) query from table
   ) any_query,
   (
       select throwIf(uniq((test, query)) != 1) from table
   ) check_single_query -- this subselect checks that there is only one query in the input table;
                        -- written this way so that it is not optimized away (#10523)
-- Run with the new query analyzer disabled.
-- NOTE(review): the reason is not stated in this file; presumably the inline
-- alias reuse or the cross join above behaves differently under it -- confirm.
SETTINGS allow_experimental_analyzer = 0
;
Fix a rare false negative in perf tests 2021-03-02 16:21:30 +00:00			`-- The input is table(test text, query text, run UInt32, version UInt8, metrics Array(float)).`
			`-- Run like this:`
			`-- clickhouse-local --queries-file eqmed.sql -S 'test text, query text, run UInt32, version UInt8, metrics Array(float)' --file analyze/tmp/modulo_0.tsv`
[wip] some experimental scripts for peformance comparison 2019-12-26 17:35:41 +00:00			`select`
[wip] Add memory stats to performance test 2020-05-20 02:19:19 +00:00			`arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[1] as l) l_rounded,`
			`arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[2] as r) r_rounded,`
			`arrayMap(x, y -> floor((y - x) / x, 3), l, r) diff_percent,`
			`arrayMap(x, y -> floor(x / y, 3), threshold, l) threshold_percent,`
			`test, query`
[wip] some experimental scripts for peformance comparison 2019-12-26 17:35:41 +00:00			`from`
			`(`
performance comparison 2020-04-27 12:47:59 +00:00			`-- quantiles of randomization distributions`
Fix a rare false negative in perf tests 2021-03-02 16:21:30 +00:00			`-- note that for small number of runs, the exact quantile might not make`
			`-- sense, because the last possible value of randomization distribution`
Reapply "improve CI with digest for docker, build and test jobs" (#57904) * Revert "Revert "improve CI with digest for docker, build and test jobs"" * fix: docker manifest merge for missing images only 2023-12-18 08:07:22 +00:00			`-- might take a larger percentage of distribution (i.e. the distribution`
Fix a rare false negative in perf tests 2021-03-02 16:21:30 +00:00			`-- actually has discrete values, and the last step can be large).`
faster 2020-09-02 16:42:24 +00:00			`select quantileExactForEach(0.99)(`
[wip] Add memory stats to performance test 2020-05-20 02:19:19 +00:00			`arrayMap(x, y -> abs(x - y), metrics_by_label[1], metrics_by_label[2]) as d`
			`) threshold`
Fix a rare false negative in perf tests 2021-03-02 16:21:30 +00:00			`---- Uncomment to see what the distribution is really like. This debug`
			`---- code only works for single (the first) metric.`
			`--, uniqExact(d[1]) u`
performance comparison 2020-04-28 07:45:35 +00:00			`--, arraySort(x->x.1,`
			`-- arrayZip(`
Fix a rare false negative in perf tests 2021-03-02 16:21:30 +00:00			`-- (sumMap([d[1]], [1]) as f).1,`
performance comparison 2020-04-28 07:45:35 +00:00			`-- f.2)) full_histogram`
[wip] some experimental scripts for peformance comparison 2019-12-26 17:35:41 +00:00			`from`
			`(`
[wip] Add memory stats to performance test 2020-05-20 02:19:19 +00:00			`-- make array 'random label' -> '[median metric]'`
			`select virtual_run, groupArrayInsertAt(median_metrics, random_label) metrics_by_label`
[wip] some experimental scripts for peformance comparison 2019-12-26 17:35:41 +00:00			`from (`
[wip] Add memory stats to performance test 2020-05-20 02:19:19 +00:00			`-- get [median metric] arrays among virtual runs, grouping by random label`
			`select medianExactForEach(metrics) median_metrics, virtual_run, random_label`
[wip] some experimental scripts for peformance comparison 2019-12-26 17:35:41 +00:00			`from (`
[wip] Add memory stats to performance test 2020-05-20 02:19:19 +00:00			`-- randomly relabel measurements`
			`select *, toUInt32(rowNumberInAllBlocks() % 2) random_label`
[wip] some experimental scripts for peformance comparison 2019-12-26 17:35:41 +00:00			`from (`
[wip] Add memory stats to performance test 2020-05-20 02:19:19 +00:00			`select metrics, number virtual_run`
performance comparison 2020-04-27 12:47:59 +00:00			`from`
			`-- strip the query away before the join -- it might be several kB long;`
[wip] Add memory stats to performance test 2020-05-20 02:19:19 +00:00			`(select metrics, run, version from table) no_query,`
performance comparison 2020-04-27 12:47:59 +00:00			`-- duplicate input measurements into many virtual runs`
faster 2020-09-02 16:42:24 +00:00			`numbers(1, 10000) nn`
performance comparison 2020-04-27 12:47:59 +00:00			`-- for each virtual run, randomly reorder measurements`
			`order by virtual_run, rand()`
[wip] some experimental scripts for peformance comparison 2019-12-26 17:35:41 +00:00			`) virtual_runs`
Reapply "improve CI with digest for docker, build and test jobs" (#57904) * Revert "Revert "improve CI with digest for docker, build and test jobs"" * fix: docker manifest merge for missing images only 2023-12-18 08:07:22 +00:00			`) relabeled`
performance comparison 2020-04-27 12:47:59 +00:00			`group by virtual_run, random_label`
[wip] some experimental scripts for peformance comparison 2019-12-26 17:35:41 +00:00			`) virtual_medians`
performance comparison 2020-04-27 12:47:59 +00:00			`group by virtual_run -- aggregate by random_label`
[wip] some experimental scripts for peformance comparison 2019-12-26 17:35:41 +00:00			`) virtual_medians_array`
performance comparison 2020-04-27 12:47:59 +00:00			`-- this select aggregates by virtual_run`
[wip] some experimental scripts for peformance comparison 2019-12-26 17:35:41 +00:00			`) rd,`
			`(`
[wip] Add memory stats to performance test 2020-05-20 02:19:19 +00:00			`select groupArrayInsertAt(median_metrics, version) medians_by_version`
[wip] some experimental scripts for peformance comparison 2019-12-26 17:35:41 +00:00			`from`
performance comparison 2020-04-27 12:47:59 +00:00			`(`
[wip] Add memory stats to performance test 2020-05-20 02:19:19 +00:00			`select medianExactForEach(metrics) median_metrics, version`
performance comparison 2020-04-27 12:47:59 +00:00			`from table`
			`group by version`
			`) original_medians`
			`) original_medians_array,`
			`(`
[wip] Add memory stats to performance test 2020-05-20 02:19:19 +00:00			`select any(test) test, any(query) query from table`
performance comparison 2020-04-27 12:47:59 +00:00			`) any_query,`
			`(`
[wip] Add memory stats to performance test 2020-05-20 02:19:19 +00:00			`select throwIf(uniq((test, query)) != 1) from table`
performance comparison 2020-04-27 12:47:59 +00:00			`) check_single_query -- this subselect checks that there is only one query in the input table;`
			`-- written this way so that it is not optimized away (#10523)`
Try to fix perf tests 2024-03-25 16:06:42 +00:00			`SETTINGS allow_experimental_analyzer = 0`
performance comparison 2020-04-27 12:47:59 +00:00			`;`