-- ClickHouse/tests/performance/scripts/eqmed.sql

-- The input is table(test text, query text, run UInt32, version UInt8, metrics Array(float)).
-- Run like this:
-- clickhouse-local --queries-file eqmed.sql -S 'test text, query text, run UInt32, version UInt8, metrics Array(float)' --file analyze/tmp/modulo_0.tsv
--
-- Output: a single row; every array has one element per metric.
--   l_rounded, r_rounded -- per-version medians of `metrics`, taken from medians_by_version[1]
--                           and medians_by_version[2], floored to 4 decimal places;
--   diff_percent         -- (r - l) / l, floored to 3 decimal places. Despite the name, this is
--                           a fraction of l: it is not multiplied by 100;
--   threshold_percent    -- threshold / l, floored to 3 decimal places (also a fraction), where
--                           threshold is the 0.99 quantile, over 10000 virtual runs, of the
--                           absolute difference between the medians of two randomly labeled
--                           groups of measurements;
--   test, query          -- the (test, query) pair the input belongs to.
-- The query throws unless the input contains exactly one distinct (test, query) pair.
-- The measurements are shuffled with rand(), so the threshold may differ between invocations.
select
    arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[1] as l) l_rounded,
    arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[2] as r) r_rounded,
    arrayMap(x, y -> floor((y - x) / x, 3), l, r) diff_percent,
    arrayMap(x, y -> floor(x / y, 3), threshold, l) threshold_percent,
    test, query
from
    (
        -- quantiles of randomization distributions
        -- note that for small number of runs, the exact quantile might not make
        -- sense, because the last possible value of randomization distribution
        -- might take a larger percentage of distribution (i.e. the distribution
        -- actually has discrete values, and the last step can be large).
        select quantileExactForEach(0.99)(
            arrayMap(x, y -> abs(x - y), metrics_by_label[1], metrics_by_label[2]) as d
        ) threshold
        ---- Uncomment to see what the distribution is really like. This debug
        ---- code only works for single (the first) metric.
        --, uniqExact(d[1]) u
        --, arraySort(x->x.1,
        --      arrayZip(
        --          (sumMap([d[1]], [1]) as f).1,
        --          f.2)) full_histogram
        from
            (
                -- make array 'random label' -> '[median metric]'
                -- groupArrayInsertAt positions are zero-based, so random_label 0 ends up in
                -- metrics_by_label[1] and random_label 1 in metrics_by_label[2].
                select virtual_run, groupArrayInsertAt(median_metrics, random_label) metrics_by_label
                from (
                    -- get [median metric] arrays among virtual runs, grouping by random label
                    select medianExactForEach(metrics) median_metrics, virtual_run, random_label
                    from (
                        -- randomly relabel measurements: the rows arrive shuffled within each
                        -- virtual run, and the label alternates between 0 and 1 by row number
                        select *, toUInt32(rowNumberInAllBlocks() % 2) random_label
                        from (
                            select metrics, number virtual_run
                            from
                                -- strip the query away before the join -- it might be several kB long;
                                (select metrics, run, version from table) no_query,
                                -- duplicate input measurements into many virtual runs
                                numbers(1, 10000) nn
                            -- for each virtual run, randomly reorder measurements
                            order by virtual_run, rand()
                        ) virtual_runs
                    ) relabeled
                    group by virtual_run, random_label
                ) virtual_medians
                group by virtual_run -- aggregate by random_label
            ) virtual_medians_array
        -- this select aggregates by virtual_run
    ) rd,
    (
        -- make array 'version' -> '[median metric]' for the original, unshuffled measurements
        -- NOTE(review): the [1] / [2] lookups in the outer select presumably rely on version
        -- taking the values 0 and 1 (zero-based insert positions) -- confirm against the caller.
        select groupArrayInsertAt(median_metrics, version) medians_by_version
        from
            (
                select medianExactForEach(metrics) median_metrics, version
                from table
                group by version
            ) original_medians
    ) original_medians_array,
    (
        select any(test) test, any(query) query from table
    ) any_query,
    (
        select throwIf(uniq((test, query)) != 1) from table
    ) check_single_query -- this subselect checks that there is only one query in the input table;
                         -- written this way so that it is not optimized away (#10523)
-- NOTE(review): this pins the old (non-experimental) analyzer; presumably the query depends on
-- its behavior -- confirm before removing the setting.
SETTINGS allow_experimental_analyzer = 0
;