Do not calculate integrals in statistical tests (#36953)

2024-11-25 17:12:03 +00:00 · 2022-06-07 15:39:39 +02:00 · 2022-06-07 15:39:39 +02:00 · 85a1204e95
commit 85a1204e95
parent dcad154105
8 changed files with 85 additions and 75 deletions
--- a/src/AggregateFunctions/AggregateFunctionMannWhitney.h
+++ b/src/AggregateFunctions/AggregateFunctionMannWhitney.h
@ -6,8 +6,10 @@
 #include <Columns/ColumnVector.h>
 #include <Columns/ColumnTuple.h>
 #include <Common/assert_cast.h>
+#include <Common/ArenaAllocator.h>
 #include <Common/PODArray_fwd.h>
 #include <base/types.h>
+#include <DataTypes/DataTypeArray.h>
 #include <DataTypes/DataTypesDecimal.h>
 #include <DataTypes/DataTypeNullable.h>
 #include <DataTypes/DataTypesNumber.h>
@ -16,10 +18,7 @@
 #include <IO/WriteHelpers.h>
 #include <limits>

-#include <DataTypes/DataTypeArray.h>
-
-#include <Common/ArenaAllocator.h>
-
+#include <boost/math/distributions/normal.hpp>

 namespace DB
 {
@ -84,15 +83,14 @@ struct MannWhitneyData : public StatisticalSample<Float64, Float64>
        if (alternative == Alternative::TwoSided)
            z = std::abs(z);

-        /// In fact cdf is a probability function, so it is intergral of density from (-inf, z].
-        /// But since standard normal distribution is symmetric, cdf(0) = 0.5 and we have to compute integral from [0, z].
-        const Float64 cdf = integrateSimpson(0, z, [] (Float64 t) { return std::pow(M_E, -0.5 * t * t) / std::sqrt(2 * M_PI);});
+        auto standart_normal_distribution = boost::math::normal_distribution<Float64>();
+        auto cdf = boost::math::cdf(standart_normal_distribution, z);

        Float64 p_value = 0;
        if (alternative == Alternative::TwoSided)
-            p_value = 1 - 2 * cdf;
+            p_value = 2 - 2 * cdf;
        else
-            p_value = 0.5 - cdf;
+            p_value = 1 - cdf;

        return {u2, p_value};
    }
--- a/src/AggregateFunctions/AggregateFunctionStudentTTest.cpp
+++ b/src/AggregateFunctions/AggregateFunctionStudentTTest.cpp
@ -62,7 +62,17 @@ struct StudentTTestData : public TTestMoments<Float64>
        /// t-statistic
        Float64 t_stat = (mean_x - mean_y) / sqrt(std_err2);

-        return {t_stat, getPValue(degrees_of_freedom, t_stat * t_stat)};
+        if (isNaN(t_stat))
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Resulted t-statistics is NaN");
+
+        auto student = boost::math::students_t_distribution<Float64>(getDegreesOfFreedom());
+        Float64 pvalue = 0;
+        if (t_stat > 0)
+            pvalue = 2 * boost::math::cdf<Float64>(student, -t_stat);
+        else
+            pvalue = 2 * boost::math::cdf<Float64>(student, t_stat);
+
+        return {t_stat, pvalue};
    }
 };

--- a/src/AggregateFunctions/AggregateFunctionTTest.h
+++ b/src/AggregateFunctions/AggregateFunctionTTest.h
@ -34,50 +34,6 @@ namespace ErrorCodes
    extern const int BAD_ARGUMENTS;
 }

-/**
- * If you have a cumulative distribution function F, then calculating the p-value for given statistic T is simply 1−F(T)
- * In our case p-value is two-sided, so we multiply it by 2.
- * So cumulative distribution function F equals to
- * \[ F(t) = \int_{-\infty}^{t} f(u)du = 1 - \frac{1}{2} I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \]
- * where \[ x(t) = \frac{v}{t^2 + v} \]: https://en.wikipedia.org/wiki/Student%27s_t-distribution#Cumulative_distribution_function
- *
- * so our resulting \[ p-value = I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \].
- *
- * And I is regularized incomplete beta function: https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function
- *
- * Keepenig in mind that \[ \mathrm {B} (x;a,b)=\int _{0}^{x}r^{a-1}\,(1-r)^{b-1}\,\mathrm {d} r.\! \]
- * and
- * \[ \mathrm {B} (x,y)={\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}}=\
- * \exp(\ln {\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}})=\exp((\ln(\Gamma (x))+\ln(\Gamma (y))-\ln(\Gamma (x+y))) \]
- *
- * p-value can be calculated in terms of gamma functions and integrals more simply:
- * \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}r^{{\frac {\nu }{2}}-1}\,(1-r)^{-0.5}\,\mathrm {d} r}\
- * {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
- *
- * which simplifies to:
- *
- * \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}{\frac {r^{{\frac {\nu }{2}}-1}}{\sqrt {1-r}}}\,\mathrm {d} r}\
- * {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
- *
- * Read here for details https://rosettacode.org/wiki/Welch%27s_t-test#
- *
- * Both WelchTTest and StudentTTest have t-statistric with Student distribution but with different degrees of freedom.
- * So the procedure of computing p-value is the same.
-*/
-static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2) /// NOLINT
-{
-    Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom),
-        [degrees_of_freedom](double x) { return std::pow(x, degrees_of_freedom / 2 - 1) / std::sqrt(1 - x); });
-
-    int unused;
-    Float64 denominator = std::exp(
-        lgamma_r(degrees_of_freedom / 2, &unused)
-        + lgamma_r(0.5, &unused)
-        - lgamma_r(degrees_of_freedom / 2 + 0.5, &unused));
-
-    return std::min(1.0, std::max(0.0, numerator / denominator));
-}
-

 /// Returns tuple of (t-statistic, p-value)
 /// https://cpb-us-w2.wpmucdn.com/voices.uchicago.edu/dist/9/1193/files/2016/01/05b-TandP.pdf
--- a/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp
+++ b/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp
@ -10,7 +10,6 @@ namespace ErrorCodes
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
 }

-
 namespace DB
 {
 struct Settings;
@ -53,7 +52,14 @@ struct WelchTTestData : public TTestMoments<Float64>
        Float64 se = getStandardError();
        Float64 t_stat = (mean_x - mean_y) / se;

-        return {t_stat, getPValue(getDegreesOfFreedom(), t_stat * t_stat)};
+        auto students_t_distribution = boost::math::students_t_distribution<Float64>(getDegreesOfFreedom());
+        Float64 pvalue = 0;
+        if (t_stat > 0)
+            pvalue = 2 * boost::math::cdf<Float64>(students_t_distribution, -t_stat);
+        else
+            pvalue = 2 * boost::math::cdf<Float64>(students_t_distribution, t_stat);
+
+        return {t_stat, pvalue};
    }
 };

--- a/src/AggregateFunctions/StatCommon.h
+++ b/src/AggregateFunctions/StatCommon.h
@ -21,20 +21,6 @@ namespace ErrorCodes
    extern const int BAD_ARGUMENTS;
 }

-template <typename F>
-static Float64 integrateSimpson(Float64 a, Float64 b, F && func)
-{
-    const size_t iterations = std::max(1e6, 1e4 * std::abs(std::round(b) - std::round(a)));
-    const long double h = (b - a) / iterations;
-    Float64 sum_odds = 0.0;
-    for (size_t i = 1; i < iterations; i += 2)
-        sum_odds += func(a + i * h);
-    Float64 sum_evens = 0.0;
-    for (size_t i = 2; i < iterations; i += 2)
-        sum_evens += func(a + i * h);
-    return (func(a) + func(b) + 2 * sum_evens + 4 * sum_odds) * h / 3;
-}
-
 /// Because ranks are adjusted, we have to store each of them in Float type.
 using RanksArray = std::vector<Float64>;

@ -126,4 +112,3 @@ struct StatisticalSample
 };

 }
-
--- a/tests/queries/0_stateless/01560_mann_whitney.reference
+++ b/tests/queries/0_stateless/01560_mann_whitney.reference
@ -1,5 +1,5 @@
-(223,0.5426959774289524)
+(223,0.5426959774289482)
 223.0	0.5426959774289482
-223	0.5426959774289524
-223	0.5426959774289524
-223	0.5426959774289524
+223	0.5426959774289482
+223	0.5426959774289482
+223	0.5426959774289482
--- a/tests/queries/0_stateless/02293_ttest_large_samples.reference
+++ b/tests/queries/0_stateless/02293_ttest_large_samples.reference
@ -0,0 +1,2 @@
+-0.12709	0.89887
+-1.27088	0.20377
--- a/tests/queries/0_stateless/02293_ttest_large_samples.sql
+++ b/tests/queries/0_stateless/02293_ttest_large_samples.sql
@ -0,0 +1,53 @@
+SELECT roundBankers(result.1, 5), roundBankers(result.2, 5) FROM (
+SELECT
+     studentTTest(sample, variant) as result
+FROM (
+SELECT
+    toFloat64(number) % 30 AS sample,
+    0 AS variant
+FROM system.numbers limit 500000
+
+UNION ALL
+
+SELECT
+    toFloat64(number) % 30 + 0.0022 AS sample,
+    1 AS variant
+FROM system.numbers limit 500000));
+
+
+SELECT roundBankers(result.1, 5), roundBankers(result.2, 5 ) FROM (
+SELECT
+     studentTTest(sample, variant) as result
+FROM (
+SELECT
+    toFloat64(number) % 30 AS sample,
+    0 AS variant
+FROM system.numbers limit 50000000
+
+UNION ALL
+
+SELECT
+    toFloat64(number) % 30 + 0.0022 AS sample,
+    1 AS variant
+FROM system.numbers limit 50000000));
+
+
+SELECT roundBankers(result.2, 1025)
+FROM
+(
+    SELECT studentTTest(sample, variant) AS result
+    FROM
+    (
+        SELECT
+            toFloat64(number) % 30 AS sample,
+            1048576 AS variant
+        FROM system.numbers
+        LIMIT 1
+        UNION ALL
+        SELECT
+            (toFloat64(number) % 7) + inf AS sample,
+            255 AS variant
+        FROM system.numbers
+        LIMIT 1023
+    )
+); -- { serverError 36 }