#pragma once #include #include #include #include #include #include /** This is simple, not numerically stable * implementations of variance/covariance/correlation functions. * * It is about two times faster than stable variants. * Numerical errors may occur during summation. * * This implementation is selected as default, * because "you don't pay for what you don't need" principle. * * For more sophisticated implementation, look at AggregateFunctionStatistics.h */ namespace DB { enum class VarianceMode { Population, Sample }; enum class VariancePower { Original, Sqrt }; template struct VarMoments { T m0{}; T m1{}; T m2{}; void add(T x) { ++m0; m1 += x; m2 += x * x; } void merge(const VarMoments & rhs) { m0 += rhs.m0; m1 += rhs.m1; m2 += rhs.m2; } void write(WriteBuffer & buf) const { writePODBinary(*this, buf); } void read(ReadBuffer & buf) { readPODBinary(*this, buf); } template T get() const { if (m0 == 0 && mode == VarianceMode::Sample) return std::numeric_limits::quiet_NaN(); T res = (m2 - m1 * m1 / m0) / (m0 - (mode == VarianceMode::Sample)); return power == VariancePower::Original ? res : sqrt(res); } }; template struct CovarMoments { T m0{}; T x1{}; T y1{}; T xy{}; void add(T x, T y) { ++m0; x1 += x; y1 += y; xy += x * y; } void merge(const CovarMoments & rhs) { m0 += rhs.m0; x1 += rhs.x1; y1 += rhs.y1; xy += rhs.xy; } void write(WriteBuffer & buf) const { writePODBinary(*this, buf); } void read(ReadBuffer & buf) { readPODBinary(*this, buf); } template T get() const { if (m0 == 0 && mode == VarianceMode::Sample) return std::numeric_limits::quiet_NaN(); return (xy - x1 * y1 / m0) / (m0 - (mode == VarianceMode::Sample)); } }; template struct CorrMoments { T m0{}; T x1{}; T y1{}; T xy{}; T x2{}; T y2{}; void add(T x, T y) { ++m0; x1 += x; y1 += y; xy += x * y; x2 += x * x; y2 += y * y; } void merge(const CorrMoments & rhs) { m0 += rhs.m0; x1 += rhs.x1; y1 += rhs.y1; xy += rhs.xy; x2 += rhs.x2; y2 += rhs.y2; } void write(WriteBuffer & buf) const { writePODBinary(*this, buf); } void read(ReadBuffer & buf) { readPODBinary(*this, buf); } T get() const { return (m0 * xy - x1 * y1) / sqrt((m0 * x2 - x1 * x1) * (m0 * y2 - y1 * y1)); } }; enum class StatisticsFunctionKind { varPop, varSamp, stddevPop, stddevSamp, covarPop, covarSamp, corr }; template using VarianceCalcType = std::conditional_t && std::is_same_v, Float32, Float64>; template class AggregateFunctionVarianceSimple final : public IAggregateFunctionDataHelper> { using ResultType = VarianceCalcType; public: String getName() const override { switch (Kind) { case StatisticsFunctionKind::varPop: return "varPop"; case StatisticsFunctionKind::varSamp: return "varSamp"; case StatisticsFunctionKind::stddevPop: return "stddevPop"; case StatisticsFunctionKind::stddevSamp: return "stddevSamp"; case StatisticsFunctionKind::covarPop: return "covarPop"; case StatisticsFunctionKind::covarSamp: return "covarSamp"; case StatisticsFunctionKind::corr: return "corr"; } } DataTypePtr getReturnType() const override { return std::make_shared>(); } void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override { if constexpr (Kind == StatisticsFunctionKind::covarPop || Kind == StatisticsFunctionKind::covarSamp || Kind == StatisticsFunctionKind::corr) this->data(place).add( static_cast &>(*columns[0]).getData()[row_num], static_cast &>(*columns[1]).getData()[row_num]); else this->data(place).add( static_cast &>(*columns[0]).getData()[row_num]); } void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override { this->data(place).merge(this->data(rhs)); } void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override { this->data(place).write(buf); } void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override { this->data(place).read(buf); } void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override { const auto & data = this->data(place); auto & dst = static_cast &>(to).getData(); if constexpr (Kind == StatisticsFunctionKind::varPop) dst.push_back(data.template get()); else if constexpr (Kind == StatisticsFunctionKind::varSamp) dst.push_back(data.template get()); else if constexpr (Kind == StatisticsFunctionKind::stddevPop) dst.push_back(data.template get()); else if constexpr (Kind == StatisticsFunctionKind::stddevSamp) dst.push_back(data.template get()); else if constexpr (Kind == StatisticsFunctionKind::covarPop) dst.push_back(data.template get()); else if constexpr (Kind == StatisticsFunctionKind::covarSamp) dst.push_back(data.template get()); else if constexpr (Kind == StatisticsFunctionKind::corr) dst.push_back(data.get()); } const char * getHeaderFilePath() const override { return __FILE__; } }; template using AggregateFunctionVarPopSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::varPop>; template using AggregateFunctionVarSampSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::varSamp>; template using AggregateFunctionStddevPopSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::stddevPop>; template using AggregateFunctionStddevSampSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::stddevSamp>; template using AggregateFunctionCovarPopSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::covarPop>; template using AggregateFunctionCovarSampSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::covarSamp>; template using AggregateFunctionCorrSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::corr>; }