-- The input is table(test text, query text, run UInt32, version UInt8, metrics Array(float)). -- Run like this: -- clickhouse-local --queries-file eqmed.sql -S 'test text, query text, run UInt32, version UInt8, metrics Array(float)' --file analyze/tmp/modulo_0.tsv select arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[1] as l) l_rounded, arrayMap(x -> floor(x, 4), original_medians_array.medians_by_version[2] as r) r_rounded, arrayMap(x, y -> floor((y - x) / x, 3), l, r) diff_percent, arrayMap(x, y -> floor(x / y, 3), threshold, l) threshold_percent, test, query from ( -- quantiles of randomization distributions -- note that for small number of runs, the exact quantile might not make -- sense, because the last possible value of randomization distribution -- might take a larger percentage of distribution (i.e. the distribution -- actually has discrete values, and the last step can be large). select quantileExactForEach(0.99)( arrayMap(x, y -> abs(x - y), metrics_by_label[1], metrics_by_label[2]) as d ) threshold ---- Uncomment to see what the distribution is really like. This debug ---- code only works for single (the first) metric. --, uniqExact(d[1]) u --, arraySort(x->x.1, -- arrayZip( -- (sumMap([d[1]], [1]) as f).1, -- f.2)) full_histogram from ( -- make array 'random label' -> '[median metric]' select virtual_run, groupArrayInsertAt(median_metrics, random_label) metrics_by_label from ( -- get [median metric] arrays among virtual runs, grouping by random label select medianExactForEach(metrics) median_metrics, virtual_run, random_label from ( -- randomly relabel measurements select *, toUInt32(rowNumberInAllBlocks() % 2) random_label from ( select metrics, number virtual_run from -- strip the query away before the join -- it might be several kB long; (select metrics, run, version from table) no_query, -- duplicate input measurements into many virtual runs numbers(1, 10000) nn -- for each virtual run, randomly reorder measurements order by virtual_run, rand() ) virtual_runs ) relabeled group by virtual_run, random_label ) virtual_medians group by virtual_run -- aggregate by random_label ) virtual_medians_array -- this select aggregates by virtual_run ) rd, ( select groupArrayInsertAt(median_metrics, version) medians_by_version from ( select medianExactForEach(metrics) median_metrics, version from table group by version ) original_medians ) original_medians_array, ( select any(test) test, any(query) query from table ) any_query, ( select throwIf(uniq((test, query)) != 1) from table ) check_single_query -- this subselect checks that there is only one query in the input table; -- written this way so that it is not optimized away (#10523) ;