mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-16 12:44:42 +00:00
71 lines
3.4 KiB
SQL
71 lines
3.4 KiB
SQL
-- Randomization test on per-version medians of the metrics of a single query.
--
-- Input: table(test text, query text, run UInt32, version UInt8, metrics Array(float)).
--
-- Usage:
-- clickhouse-local --queries-file eqmed.sql -S 'test text, query text, run UInt32, version UInt8, metrics Array(float)' --file analyze/tmp/modulo_0.tsv
--
-- Output columns (arrays hold one element per metric):
--   l_rounded         -- medians stored at index 1 of medians_by_version, floored to 4 decimals
--   r_rounded         -- medians stored at index 2 of medians_by_version, floored to 4 decimals
--   diff_percent      -- (second - first) / first, floored to 3 decimals
--   threshold_percent -- threshold / first, floored to 3 decimals
--   test, query       -- identification of the (single) query in the input
select
    arrayMap(m -> floor(m, 4), original_medians_array.medians_by_version[1] as medians_first) as l_rounded,
    arrayMap(m -> floor(m, 4), original_medians_array.medians_by_version[2] as medians_second) as r_rounded,
    arrayMap((a, b) -> floor((b - a) / a, 3), medians_first, medians_second) as diff_percent,
    arrayMap((t, m) -> floor(t / m, 3), threshold, medians_first) as threshold_percent,
    test,
    query
from
    -- The four subqueries below are aggregates without `group by`; they are
    -- combined with comma joins.
    (
        -- Per-metric 0.99 quantile of the randomization distribution, i.e. of
        -- the absolute difference between the medians of the two randomly
        -- labeled groups, taken over all virtual runs.
        -- With a small number of runs the exact quantile might not make
        -- sense: the distribution is discrete, and its last step can cover a
        -- larger share of the distribution than the requested tail.
        select
            quantileExactForEach(0.99)(
                arrayMap((a, b) -> abs(a - b), metrics_by_label[1], metrics_by_label[2]) as d
            ) as threshold
            ---- Uncomment to see what the distribution is really like. This debug
            ---- code only works for single (the first) metric.
            --, uniqExact(d[1]) u
            --, arraySort(x->x.1,
            --     arrayZip(
            --         (sumMap([d[1]], [1]) as f).1,
            --         f.2)) full_histogram
        from
            (
                -- One row per virtual run: array 'random label' -> '[median metric]'.
                -- groupArrayInsertAt positions are zero-based, so label 0 is
                -- read back as metrics_by_label[1] and label 1 as [2].
                select
                    virtual_run,
                    groupArrayInsertAt(median_metrics, random_label) as metrics_by_label
                from
                    (
                        -- Per-metric medians inside every (virtual run, random label) group.
                        select
                            medianExactForEach(metrics) as median_metrics,
                            virtual_run,
                            random_label
                        from
                            (
                                -- Randomly relabel measurements: after the shuffle
                                -- below, consecutive rows get labels by row-number parity.
                                select
                                    metrics,
                                    virtual_run,
                                    toUInt32(rowNumberInAllBlocks() % 2) as random_label
                                from
                                    (
                                        select
                                            metrics,
                                            number as virtual_run
                                        from
                                            -- Drop the query text before the join -- it
                                            -- might be several kB long.
                                            (select metrics, run, version from table) as measurements,
                                            -- Duplicate input measurements into many virtual runs.
                                            numbers(1, 10000) as run_numbers
                                        -- Within each virtual run, put measurements in random order.
                                        order by virtual_run, rand()
                                    ) as shuffled_runs
                            ) as labeled_runs
                        group by virtual_run, random_label
                    ) as label_medians
                group by virtual_run -- collapses the random_label dimension into the array
            ) as label_medians_by_run
        -- no `group by` here: this select aggregates over all virtual runs
    ) as randomization,
    (
        -- Observed per-metric medians, stored at the array position given by `version`.
        select
            groupArrayInsertAt(median_metrics, version) as medians_by_version
        from
            (
                select
                    medianExactForEach(metrics) as median_metrics,
                    version
                from table
                group by version
            ) as version_medians
    ) as original_medians_array,
    (
        -- Test name and query text to show in the output.
        select
            any(test) as test,
            any(query) as query
        from table
    ) as query_info,
    (
        -- Checks that there is only one query in the input table; written as a
        -- joined subselect so that it is not optimized away (#10523).
        select throwIf(uniq((test, query)) != 1) from table
    ) as single_query_check
;