From 3b2a84264ece7301add58b128d9600644a53da6a Mon Sep 17 00:00:00 2001 From: Alexey Arno Date: Fri, 15 May 2015 19:54:17 +0300 Subject: [PATCH] dbms: Server: Feature implementation. [#METR-16188] --- .../AggregateFunctionsStatistics.h | 47 +++++++++++++++++-- 1 file changed, 43 insertions(+), 4 deletions(-) diff --git a/dbms/include/DB/AggregateFunctions/AggregateFunctionsStatistics.h b/dbms/include/DB/AggregateFunctions/AggregateFunctionsStatistics.h index 0ab8daf582c..b6f36964dcf 100644 --- a/dbms/include/DB/AggregateFunctions/AggregateFunctionsStatistics.h +++ b/dbms/include/DB/AggregateFunctions/AggregateFunctionsStatistics.h @@ -11,6 +11,19 @@ namespace DB { +/// XXX Implement correlation and covariance. + +/** Statistical aggregate functions: + * varSamp - sample variance + * stddevSamp - sample standard deviation + * varPop - population variance + * stddevPop - population standard deviation + */ + +/** Parallel and incremental algorithm for computing the variance. + * Source: "Updating formulae and a pairwise algorithm for computing sample variances" + * (Chan et al., Stanford University, 12/1979) + */ template class AggregateFunctionVarianceData { @@ -31,12 +44,23 @@ public: void mergeWith(const AggregateFunctionVarianceData & source) { UInt64 total_count = count + source.count; + if (total_count == 0) + return; + Float64 factor = static_cast<Float64>(count * source.count) / total_count; Float64 delta = mean - source.mean; - count = total_count; - mean += delta * (source.count / count); + auto res = std::minmax(count, source.count); + if (((1 - static_cast<Float64>(res.first) / res.second) < 0.001) && (res.first > 10000)) + { + /// This formula is more stable when both sources are large and of comparable size. 
+ mean = (source.count * source.mean + count * mean) / total_count; + } + else + mean = source.mean + delta * (static_cast<Float64>(count) / total_count); + m2 += source.m2 + delta * delta * factor; + count = total_count; } void serialize(WriteBuffer & buf) const @@ -64,6 +88,8 @@ private: Float64 m2 = 0.0; }; +/** Main code implementing the functions varSamp, stddevSamp, varPop, stddevPop. + */ template class AggregateFunctionVariance final : public IUnaryAggregateFunction, AggregateFunctionVariance > { @@ -111,19 +137,26 @@ public: } }; +/** Implementation of the varSamp function. + */ struct VarSampImpl { static constexpr auto name = "varSamp"; static inline Float64 apply(Float64 m2, UInt64 count) { - if (count == 1) + if (count < 2) return 0.0; else return m2 / (count - 1); } }; +namespace +{ + +/** Implementation of the stddevSamp function. + */ struct StdDevSampImpl { static constexpr auto name = "stddevSamp"; @@ -134,19 +167,23 @@ struct StdDevSampImpl } }; +/** Implementation of the varPop function. + */ struct VarPopImpl { static constexpr auto name = "varPop"; static inline Float64 apply(Float64 m2, UInt64 count) { - if (count == 1) + if (count < 2) return 0.0; else return m2 / count; } }; +/** Implementation of the stddevPop function. + */ struct StdDevPopImpl { static constexpr auto name = "stddevPop"; @@ -157,6 +194,8 @@ struct StdDevPopImpl } }; +} + template using AggregateFunctionVarSamp = AggregateFunctionVariance;