2021-03-02 16:21:30 +00:00
|
|
|
-- The input is table(test text, query text, run UInt32, version UInt8, metrics Array(float)).
|
|
|
|
-- Run like this:
|
|
|
|
-- clickhouse-local --queries-file eqmed.sql -S 'test text, query text, run UInt32, version UInt8, metrics Array(float)' --file analyze/tmp/modulo_0.tsv
|
2019-12-26 17:35:41 +00:00
|
|
|
-- Randomization test on per-metric medians of one (test, query) pair measured
-- on two versions. Produces, per metric (all outputs are arrays):
--   l_rounded / r_rounded -- medians read from medians_by_version[1] / [2],
--                            passed through floor(x, 4);
--   diff_percent          -- (r - l) / l passed through floor(x, 3); this is a
--                            ratio, it is not multiplied by 100 despite the name;
--   threshold_percent     -- threshold / l passed through floor(x, 3), where
--                            threshold is the 0.99 quantile computed in `rd`.
-- NOTE(review): the bare timestamp lines and the `|` lines interleaved with the
-- statement look like blame-view residue rather than SQL -- confirm and strip
-- them before running this file.
select
|
2020-05-20 02:19:19 +00:00
|
|
|
-- `l` and `r` are aliases defined inline here and reused by the next columns.
arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[1] as l) l_rounded,
|
|
|
|
arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[2] as r) r_rounded,
|
|
|
|
arrayMap(x, y -> floor((y - x) / x, 3), l, r) diff_percent,
|
|
|
|
arrayMap(x, y -> floor(x / y, 3), threshold, l) threshold_percent,
|
|
|
|
test, query
|
2019-12-26 17:35:41 +00:00
|
|
|
-- The four subqueries below are comma-joined. Each one is an aggregation
-- without GROUP BY at its top level, so presumably each yields a single row
-- and the join produces one output row -- TODO confirm for empty input.
from
|
|
|
|
(
|
2020-04-27 12:47:59 +00:00
|
|
|
-- quantiles of randomization distributions
|
2021-03-02 16:21:30 +00:00
|
|
|
-- note that for small number of runs, the exact quantile might not make
|
|
|
|
-- sense, because the last possible value of randomization distribution
|
2023-12-15 14:48:01 +00:00
|
|
|
-- might take a larger percentage of distribution (i.e. the distribution
|
2021-03-02 16:21:30 +00:00
|
|
|
-- actually has discrete values, and the last step can be large).
|
2020-09-02 16:42:24 +00:00
|
|
|
-- `d` is the per-metric absolute difference between the medians of the two
-- random labels within one virtual run; the quantile is taken element-wise
-- over all virtual runs.
select quantileExactForEach(0.99)(
|
2020-05-20 02:19:19 +00:00
|
|
|
arrayMap(x, y -> abs(x - y), metrics_by_label[1], metrics_by_label[2]) as d
|
|
|
|
) threshold
|
2021-03-02 16:21:30 +00:00
|
|
|
---- Uncomment to see what the distribution is really like. This debug
|
|
|
|
---- code only works for single (the first) metric.
|
|
|
|
--, uniqExact(d[1]) u
|
2020-04-28 07:45:35 +00:00
|
|
|
--, arraySort(x->x.1,
|
|
|
|
-- arrayZip(
|
2021-03-02 16:21:30 +00:00
|
|
|
|
|
|
-- (sumMap([d[1]], [1]) as f).1,
|
2020-04-28 07:45:35 +00:00
|
|
|
-- f.2)) full_histogram
|
2019-12-26 17:35:41 +00:00
|
|
|
from
|
|
|
|
(
|
2020-05-20 02:19:19 +00:00
|
|
|
-- make array 'random label' -> '[median metric]'
|
|
|
|
-- random_label (0 or 1, see `% 2` below) is used as the insert position;
-- the enclosing select reads the result back as [1] and [2].
select virtual_run, groupArrayInsertAt(median_metrics, random_label) metrics_by_label
|
2019-12-26 17:35:41 +00:00
|
|
|
from (
|
2020-05-20 02:19:19 +00:00
|
|
|
-- get [median metric] arrays among virtual runs, grouping by random label
|
|
|
|
select medianExactForEach(metrics) median_metrics, virtual_run, random_label
|
2019-12-26 17:35:41 +00:00
|
|
|
from (
|
2020-05-20 02:19:19 +00:00
|
|
|
-- randomly relabel measurements
|
|
|
|
-- Labels alternate 0/1 by row number.
-- NOTE(review): the randomness comes from the `order by ..., rand()` of the
-- inner select, so this relies on that row order being preserved -- confirm.
select *, toUInt32(rowNumberInAllBlocks() % 2) random_label
|
2019-12-26 17:35:41 +00:00
|
|
|
from (
|
2020-05-20 02:19:19 +00:00
|
|
|
-- only `metrics` and the virtual run number leave this subquery
select metrics, number virtual_run
|
2020-04-27 12:47:59 +00:00
|
|
|
from
|
|
|
|
-- strip the query away before the join -- it might be several kB long;
|
2020-05-20 02:19:19 +00:00
|
|
|
(select metrics, run, version from table) no_query,
|
2020-04-27 12:47:59 +00:00
|
|
|
-- duplicate input measurements into many virtual runs
|
2020-09-02 16:42:24 +00:00
|
|
|
-- numbers(1, 10000): one `number` value per virtual run
numbers(1, 10000) nn
|
2020-04-27 12:47:59 +00:00
|
|
|
-- for each virtual run, randomly reorder measurements
|
|
|
|
order by virtual_run, rand()
|
2019-12-26 17:35:41 +00:00
|
|
|
) virtual_runs
|
2023-12-15 14:48:01 +00:00
|
|
|
) relabeled
|
2020-04-27 12:47:59 +00:00
|
|
|
group by virtual_run, random_label
|
2019-12-26 17:35:41 +00:00
|
|
|
) virtual_medians
|
2020-04-27 12:47:59 +00:00
|
|
|
group by virtual_run -- aggregate by random_label
|
2019-12-26 17:35:41 +00:00
|
|
|
) virtual_medians_array
|
2020-04-27 12:47:59 +00:00
|
|
|
-- this select aggregates by virtual_run
|
2019-12-26 17:35:41 +00:00
|
|
|
) rd,
|
|
|
|
-- Medians of the real (non-randomized) measurements, one array per version.
-- `version` is used as the insert position and the outer select reads [1] and
-- [2]; assumes the two versions are encoded as 0 and 1 -- TODO confirm.
(
|
2020-05-20 02:19:19 +00:00
|
|
|
select groupArrayInsertAt(median_metrics, version) medians_by_version
|
2019-12-26 17:35:41 +00:00
|
|
|
from
|
2020-04-27 12:47:59 +00:00
|
|
|
(
|
2020-05-20 02:19:19 +00:00
|
|
|
select medianExactForEach(metrics) median_metrics, version
|
2020-04-27 12:47:59 +00:00
|
|
|
from table
|
|
|
|
group by version
|
|
|
|
) original_medians
|
|
|
|
) original_medians_array,
|
|
|
|
-- Picks the test and query values to report; the check below requires the
-- input to contain exactly one (test, query) pair.
(
|
2020-05-20 02:19:19 +00:00
|
|
|
select any(test) test, any(query) query from table
|
2020-04-27 12:47:59 +00:00
|
|
|
) any_query,
|
|
|
|
-- Fails the whole query when uniq((test, query)) over the input is not 1.
(
|
2020-05-20 02:19:19 +00:00
|
|
|
select throwIf(uniq((test, query)) != 1) from table
|
2020-04-27 12:47:59 +00:00
|
|
|
) check_single_query -- this subselect checks that there is only one query in the input table;
|
|
|
|
-- written this way so that it is not optimized away (#10523)
|
|
|
|
;
|