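-- Source path: ClickHouse/docker/test/performance-comparison/eqmed.sql
-- (code-viewer metadata at capture time: 64 lines, 2.9 KiB, syntax label "MySQL";
--  the query itself uses ClickHouse-specific functions)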

-- input is table(test text, query text, run UInt32, version int, metrics Array(float))
--
-- Randomization test for the difference of medians between two versions of a
-- single (test, query) pair. For every element of `metrics` the result holds:
--   l_rounded, r_rounded  medians taken from medians_by_version[1] and [2],
--                         floored to 4 decimal places
--   diff_percent          (r - l) / l, floored to 3 decimal places
--                         (a ratio; it is not multiplied by 100)
--   threshold_percent     threshold / l, floored to 3 decimal places
-- where `threshold` is the 0.99 quantile, over 10000 virtual runs, of the
-- absolute difference of medians between two randomly labeled groups of the
-- same measurements.
--
-- NOTE(review): groupArrayInsertAt positions are 0-based while array indexes
-- are 1-based, so [1] / [2] presumably correspond to version 0 / 1 -- confirm
-- against whatever produces `table`.
select
    arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[1] as l) l_rounded,
    arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[2] as r) r_rounded,
    arrayMap(x, y -> floor((y - x) / x, 3), l, r) diff_percent,
    arrayMap(x, y -> floor(x / y, 3), threshold, l) threshold_percent,
    test, query
from
    (
        -- quantiles of randomization distributions
        select quantileExactForEach(0.99)(
            arrayMap(x, y -> abs(x - y), metrics_by_label[1], metrics_by_label[2]) as d
        ) threshold
        ---- uncomment to see what the distribution is really like
        --, uniqExact(d.1) u
        --, arraySort(x->x.1,
        --      arrayZip(
        --          (sumMap([d.1], [1]) as f).1,
        --          f.2)) full_histogram
        from
            (
                -- make array 'random label' -> '[median metric]'
                -- (labels 0 / 1 end up at array indexes [1] / [2])
                select virtual_run, groupArrayInsertAt(median_metrics, random_label) metrics_by_label
                from (
                    -- get [median metric] arrays among virtual runs, grouping by random label
                    select medianExactForEach(metrics) median_metrics, virtual_run, random_label
                    from (
                        -- randomly relabel measurements: alternating 0 / 1 over the
                        -- shuffled rows; this relies on the rows arriving in the
                        -- order produced by the inner `order by`
                        select *, toUInt32(rowNumberInAllBlocks() % 2) random_label
                        from (
                            select metrics, number virtual_run
                            from
                                -- strip the query away before the join -- it might be several kB long;
                                (select metrics, run, version from table) no_query,
                                -- duplicate input measurements into many virtual runs
                                numbers(1, 10000) nn
                            -- for each virtual run, randomly reorder measurements
                            order by virtual_run, rand()
                        ) virtual_runs
                    ) relabeled
                    group by virtual_run, random_label
                ) virtual_medians
                group by virtual_run -- aggregate by random_label
            ) virtual_medians_array
        -- this select aggregates by virtual_run
    ) rd,
    (
        -- one-row table: array of per-version median metrics, placed by version
        select groupArrayInsertAt(median_metrics, version) medians_by_version
        from
            (
                select medianExactForEach(metrics) median_metrics, version
                from table
                group by version
            ) original_medians
    ) original_medians_array,
    (
        -- the input is checked below to contain a single (test, query) pair,
        -- so any() just picks that pair
        select any(test) test, any(query) query from table
    ) any_query,
    (
        select throwIf(uniq((test, query)) != 1) from table
    ) check_single_query -- this subselect checks that there is only one query in the input table;
                         -- written this way so that it is not optimized away (#10523)
;