From 7adf8d29cf82cfb4f3d778df0e68ffef7f17876e Mon Sep 17 00:00:00 2001 From: chertus Date: Thu, 13 Sep 2018 21:36:47 +0300 Subject: [PATCH] var/stddev for decimal [CLICKHOUSE-3765] --- .../AggregateFunctionStatisticsSimple.h | 216 ++++++++++++++---- .../AggregateFunctionsStatisticsSimple.cpp | 14 +- .../00700_decimal_aggregates.reference | 8 + .../0_stateless/00700_decimal_aggregates.sql | 15 ++ 4 files changed, 201 insertions(+), 52 deletions(-) diff --git a/dbms/src/AggregateFunctions/AggregateFunctionStatisticsSimple.h b/dbms/src/AggregateFunctions/AggregateFunctionStatisticsSimple.h index e864d1cf64a..2aa119e56b6 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionStatisticsSimple.h +++ b/dbms/src/AggregateFunctions/AggregateFunctionStatisticsSimple.h @@ -2,13 +2,17 @@ #include +#include + #include #include #include #include +#include #include +#include /** This is simple, not numerically stable @@ -26,17 +30,11 @@ namespace DB { -enum class VarianceMode +namespace ErrorCodes { - Population, - Sample -}; - -enum class VariancePower -{ - Original, - Sqrt -}; + extern const int LOGICAL_ERROR; + extern const int DECIMAL_OVERFLOW; +} template @@ -70,15 +68,74 @@ struct VarMoments readPODBinary(*this, buf); } - template - T get() const + T getPopulation() const { - if (m0 == 0 && mode == VarianceMode::Sample) + return (m2 - m1 * m1 / m0) / m0; + } + + T getSample() const + { + if (m0 == 0) + return std::numeric_limits::quiet_NaN(); + return (m2 - m1 * m1 / m0) / (m0 - 1); + } + + T get() const { throw Exception("Unexpected call", ErrorCodes::LOGICAL_ERROR); } +}; + +template +struct VarMomentsDecimal +{ + using NativeType = typename T::NativeType; + + UInt64 m0{}; + NativeType m1{}; + NativeType m2{}; + + void add(NativeType x) + { + ++m0; + m1 += x; + + NativeType tmp; /// scale' = 2 * scale + if (common::mulOverflow(x, x, tmp) || common::addOverflow(m2, tmp, m2)) + throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW); + } + + void merge(const VarMomentsDecimal & rhs) + { + m0 += rhs.m0; + m1 += rhs.m1; + + if (common::addOverflow(m2, rhs.m2, m2)) + throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW); + } + + void write(WriteBuffer & buf) const { writePODBinary(*this, buf); } + void read(ReadBuffer & buf) { readPODBinary(*this, buf); } + + Float64 getPopulation(UInt32 scale) const + { + NativeType tmp; + if (common::mulOverflow(m1, m1, tmp) || + common::subOverflow(m2, NativeType(tmp/m0), tmp)) + throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW); + return convertFromDecimal, DataTypeNumber>(tmp / m0, scale); + } + + Float64 getSample(UInt32 scale) const + { + if (m0 == 0) return std::numeric_limits::quiet_NaN(); - T res = (m2 - m1 * m1 / m0) / (m0 - (mode == VarianceMode::Sample)); - return power == VariancePower::Original ? res : sqrt(res); + NativeType tmp; + if (common::mulOverflow(m1, m1, tmp) || + common::subOverflow(m2, NativeType(tmp/m0), tmp)) + throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW); + return convertFromDecimal, DataTypeNumber>(tmp / (m0 - 1), scale); } + + Float64 get() const { throw Exception("Unexpected call", ErrorCodes::LOGICAL_ERROR); } }; template @@ -115,14 +172,19 @@ struct CovarMoments readPODBinary(*this, buf); } - template - T get() const + T getPopulation() const { - if (m0 == 0 && mode == VarianceMode::Sample) - return std::numeric_limits::quiet_NaN(); - - return (xy - x1 * y1 / m0) / (m0 - (mode == VarianceMode::Sample)); + return (xy - x1 * y1 / m0) / m0; } + + T getSample() const + { + if (m0 == 0) + return std::numeric_limits::quiet_NaN(); + return (xy - x1 * y1 / m0) / (m0 - 1); + } + + T get() const { throw Exception("Unexpected call", ErrorCodes::LOGICAL_ERROR); } }; template @@ -169,6 +231,9 @@ struct CorrMoments { return (m0 * xy - x1 * y1) / sqrt((m0 * x2 - x1 * x1) * (m0 * y2 - y1 * y1)); } + + T getPopulation() const { throw Exception("Unexpected call", ErrorCodes::LOGICAL_ERROR); } + T getSample() const { throw Exception("Unexpected call", ErrorCodes::LOGICAL_ERROR); } }; @@ -181,20 +246,54 @@ enum class StatisticsFunctionKind }; -template -using VarianceCalcType = std::conditional_t && std::is_same_v, Float32, Float64>; - - -template -class AggregateFunctionVarianceSimple final - : public IAggregateFunctionDataHelper> +template +struct StatFuncOneArg { - using ResultType = VarianceCalcType; + using Type1 = T; + using Type2 = T; + using ResultType = std::conditional_t, Float32, Float64>; + using Data = std::conditional_t, VarMomentsDecimal, VarMoments>; + static constexpr StatisticsFunctionKind kind = _kind; + static constexpr UInt32 num_args = 1; +}; + +template +struct StatFuncTwoArg +{ + using Type1 = T1; + using Type2 = T2; + using ResultType = std::conditional_t && std::is_same_v, Float32, Float64>; + using Data = std::conditional_t<_kind == StatisticsFunctionKind::corr, CorrMoments, CovarMoments>; + + static constexpr StatisticsFunctionKind kind = _kind; + static constexpr UInt32 num_args = 2; +}; + + +template +class AggregateFunctionVarianceSimple final + : public IAggregateFunctionDataHelper> +{ public: + using T1 = typename StatFunc::Type1; + using T2 = typename StatFunc::Type2; + using ColVecT1 = std::conditional_t, ColumnDecimal, ColumnVector>; + using ColVecT2 = std::conditional_t, ColumnDecimal, ColumnVector>; + using ResultType = typename StatFunc::ResultType; + using ColVecResult = ColumnVector; + + AggregateFunctionVarianceSimple() + : src_scale(0) + {} + + AggregateFunctionVarianceSimple(const IDataType & data_type) + : src_scale(getDecimalScale(data_type)) + {} + String getName() const override { - switch (Kind) + switch (StatFunc::kind) { case StatisticsFunctionKind::varPop: return "varPop"; case StatisticsFunctionKind::varSamp: return "varSamp"; @@ -214,13 +313,13 @@ public: void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override { - if constexpr (Kind == StatisticsFunctionKind::covarPop || Kind == StatisticsFunctionKind::covarSamp || Kind == StatisticsFunctionKind::corr) + if constexpr (StatFunc::num_args == 2) this->data(place).add( - static_cast &>(*columns[0]).getData()[row_num], - static_cast &>(*columns[1]).getData()[row_num]); + static_cast(*columns[0]).getData()[row_num], + static_cast(*columns[1]).getData()[row_num]); else this->data(place).add( - static_cast &>(*columns[0]).getData()[row_num]); + static_cast(*columns[0]).getData()[row_num]); } void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override @@ -241,27 +340,46 @@ public: void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override { const auto & data = this->data(place); - auto & dst = static_cast &>(to).getData(); + auto & dst = static_cast(to).getData(); - if constexpr (Kind == StatisticsFunctionKind::varPop) dst.push_back(data.template get()); - else if constexpr (Kind == StatisticsFunctionKind::varSamp) dst.push_back(data.template get()); - else if constexpr (Kind == StatisticsFunctionKind::stddevPop) dst.push_back(data.template get()); - else if constexpr (Kind == StatisticsFunctionKind::stddevSamp) dst.push_back(data.template get()); - else if constexpr (Kind == StatisticsFunctionKind::covarPop) dst.push_back(data.template get()); - else if constexpr (Kind == StatisticsFunctionKind::covarSamp) dst.push_back(data.template get()); - else if constexpr (Kind == StatisticsFunctionKind::corr) dst.push_back(data.get()); + if constexpr (IsDecimalNumber) + { + switch (StatFunc::kind) + { + case StatisticsFunctionKind::varPop: dst.push_back(data.getPopulation(src_scale * 2)); break; + case StatisticsFunctionKind::varSamp: dst.push_back(data.getSample(src_scale * 2)); break; + case StatisticsFunctionKind::stddevPop: dst.push_back(sqrt(data.getPopulation(src_scale * 2))); break; + case StatisticsFunctionKind::stddevSamp: dst.push_back(sqrt(data.getSample(src_scale * 2))); break; + } + } + else + { + switch (StatFunc::kind) + { + case StatisticsFunctionKind::varPop: dst.push_back(data.getPopulation()); break; + case StatisticsFunctionKind::varSamp: dst.push_back(data.getSample()); break; + case StatisticsFunctionKind::stddevPop: dst.push_back(sqrt(data.getPopulation())); break; + case StatisticsFunctionKind::stddevSamp: dst.push_back(sqrt(data.getSample())); break; + case StatisticsFunctionKind::covarPop: dst.push_back(data.getPopulation()); break; + case StatisticsFunctionKind::covarSamp: dst.push_back(data.getSample()); break; + case StatisticsFunctionKind::corr: dst.push_back(data.get()); break; + } + } } const char * getHeaderFilePath() const override { return __FILE__; } + +private: + UInt32 src_scale; }; -template using AggregateFunctionVarPopSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::varPop>; -template using AggregateFunctionVarSampSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::varSamp>; -template using AggregateFunctionStddevPopSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::stddevPop>; -template using AggregateFunctionStddevSampSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::stddevSamp>; -template using AggregateFunctionCovarPopSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::covarPop>; -template using AggregateFunctionCovarSampSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::covarSamp>; -template using AggregateFunctionCorrSimple = AggregateFunctionVarianceSimple>, StatisticsFunctionKind::corr>; +template using AggregateFunctionVarPopSimple = AggregateFunctionVarianceSimple>; +template using AggregateFunctionVarSampSimple = AggregateFunctionVarianceSimple>; +template using AggregateFunctionStddevPopSimple = AggregateFunctionVarianceSimple>; +template using AggregateFunctionStddevSampSimple = AggregateFunctionVarianceSimple>; +template using AggregateFunctionCovarPopSimple = AggregateFunctionVarianceSimple>; +template using AggregateFunctionCovarSampSimple = AggregateFunctionVarianceSimple>; +template using AggregateFunctionCorrSimple = AggregateFunctionVarianceSimple>; } diff --git a/dbms/src/AggregateFunctions/AggregateFunctionsStatisticsSimple.cpp b/dbms/src/AggregateFunctions/AggregateFunctionsStatisticsSimple.cpp index c42372187bc..35bee73c9d8 100644 --- a/dbms/src/AggregateFunctions/AggregateFunctionsStatisticsSimple.cpp +++ b/dbms/src/AggregateFunctions/AggregateFunctionsStatisticsSimple.cpp @@ -21,11 +21,18 @@ AggregateFunctionPtr createAggregateFunctionStatisticsUnary(const std::string & assertNoParameters(name, parameters); assertUnary(name, argument_types); - AggregateFunctionPtr res(createWithNumericType(*argument_types[0])); + AggregateFunctionPtr res; + DataTypePtr data_type = argument_types[0]; + if (isDecimal(data_type)) + { + res.reset(createWithDecimalType(*data_type)); + } + else + res.reset(createWithNumericType(*data_type)); if (!res) - throw Exception("Illegal type " + argument_types[0]->getName() + " of argument for aggregate function " + name, ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); - + throw Exception("Illegal type " + argument_types[0]->getName() + " of argument for aggregate function " + name, + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); return res; } @@ -51,6 +58,7 @@ void registerAggregateFunctionsStatisticsSimple(AggregateFunctionFactory & facto factory.registerFunction("varPop", createAggregateFunctionStatisticsUnary); factory.registerFunction("stddevSamp", createAggregateFunctionStatisticsUnary); factory.registerFunction("stddevPop", createAggregateFunctionStatisticsUnary); + factory.registerFunction("covarSamp", createAggregateFunctionStatisticsBinary); factory.registerFunction("covarPop", createAggregateFunctionStatisticsBinary); factory.registerFunction("corr", createAggregateFunctionStatisticsBinary, AggregateFunctionFactory::CaseInsensitive); diff --git a/dbms/tests/queries/0_stateless/00700_decimal_aggregates.reference b/dbms/tests/queries/0_stateless/00700_decimal_aggregates.reference index d958ad4f8f7..3b1c6f9099d 100644 --- a/dbms/tests/queries/0_stateless/00700_decimal_aggregates.reference +++ b/dbms/tests/queries/0_stateless/00700_decimal_aggregates.reference @@ -26,3 +26,11 @@ [-50.0000,-40.0000,-30.0000,-20.0000,-10.0000,0.0000,10.0000,20.0000,30.0000,40.0000,50.0000] [-16.66666666,-13.33333333,-10.00000000,-6.66666666,-3.33333333,0.00000000,3.33333333,6.66666666,10.00000000,13.33333333,16.66666666] [-10.00000000,-8.00000000,-6.00000000,-4.00000000,-2.00000000,0.00000000,2.00000000,4.00000000,6.00000000,8.00000000,10.00000000] +850 94.44444438684269 34 Float64 Float64 Float64 +850 94.4444443868427 34.00000000000001 +858.5 95.38888883071111 34.34 Float64 Float64 Float64 +858.5 95.38888883071112 34.34 +29.154759474226502 9.718253155111915 5.830951894845301 Float64 Float64 Float64 +29.154759474226502 9.718253155111915 5.830951894845301 +29.300170647967224 9.766723546344041 5.860034129593445 Float64 Float64 Float64 +29.300170647967224 9.766723546344041 5.860034129593445 diff --git a/dbms/tests/queries/0_stateless/00700_decimal_aggregates.sql b/dbms/tests/queries/0_stateless/00700_decimal_aggregates.sql index 968ede183ca..9206e63172b 100644 --- a/dbms/tests/queries/0_stateless/00700_decimal_aggregates.sql +++ b/dbms/tests/queries/0_stateless/00700_decimal_aggregates.sql @@ -54,6 +54,21 @@ SELECT quantilesExact(0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)(a) SELECT quantilesExact(0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)(b) FROM test.decimal; SELECT quantilesExact(0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)(c) FROM test.decimal; +SELECT varPop(a) AS va, varPop(b) AS vb, varPop(c) AS vc, toTypeName(va), toTypeName(vb), toTypeName(vc) FROM test.decimal; +SELECT varPop(toFloat64(a)), varPop(toFloat64(b)), varPop(toFloat64(c)) FROM test.decimal; +SELECT varSamp(a) AS va, varSamp(b) AS vb, varSamp(c) AS vc, toTypeName(va), toTypeName(vb), toTypeName(vc) FROM test.decimal; +SELECT varSamp(toFloat64(a)), varSamp(toFloat64(b)), varSamp(toFloat64(c)) FROM test.decimal; + +SELECT stddevPop(a) AS da, stddevPop(b) AS db, stddevPop(c) AS dc, toTypeName(da), toTypeName(db), toTypeName(dc) FROM test.decimal; +SELECT stddevPop(toFloat64(a)), stddevPop(toFloat64(b)), stddevPop(toFloat64(c)) FROM test.decimal; +SELECT stddevSamp(a) AS da, stddevSamp(b) AS db, stddevSamp(c) AS dc, toTypeName(da), toTypeName(db), toTypeName(dc) FROM test.decimal; +SELECT stddevSamp(toFloat64(a)), stddevSamp(toFloat64(b)), stddevSamp(toFloat64(c)) FROM test.decimal; + +SELECT covarPop(a, a), covarPop(b, b), covarPop(c, c) FROM test.decimal; -- { serverError 43 } +SELECT covarSamp(a, a), covarSamp(b, b), covarSamp(c, c) FROM test.decimal; -- { serverError 43 } +SELECT corr(a, a), corr(b, b), corr(c, c) FROM test.decimal; -- { serverError 43 } +SELECT 1 LIMIT 0; + -- TODO: sumMap -- TODO: other quantile(s) -- TODO: groupArray, groupArrayInsertAt, groupUniqArray