Do not calculate integrals in statistical tests (#36953)

This commit is contained in:
Nikita Mikhaylov 2022-06-07 15:39:39 +02:00 committed by GitHub
parent dcad154105
commit 85a1204e95
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 85 additions and 75 deletions

View File

@ -6,8 +6,10 @@
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <Common/ArenaAllocator.h>
#include <Common/PODArray_fwd.h>
#include <base/types.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
@ -16,10 +18,7 @@
#include <IO/WriteHelpers.h>
#include <limits>
#include <DataTypes/DataTypeArray.h>
#include <Common/ArenaAllocator.h>
#include <boost/math/distributions/normal.hpp>
namespace DB
{
@ -84,15 +83,14 @@ struct MannWhitneyData : public StatisticalSample<Float64, Float64>
if (alternative == Alternative::TwoSided)
z = std::abs(z);
/// In fact cdf is a probability function, so it is intergral of density from (-inf, z].
/// But since standard normal distribution is symmetric, cdf(0) = 0.5 and we have to compute integral from [0, z].
const Float64 cdf = integrateSimpson(0, z, [] (Float64 t) { return std::pow(M_E, -0.5 * t * t) / std::sqrt(2 * M_PI);});
auto standart_normal_distribution = boost::math::normal_distribution<Float64>();
auto cdf = boost::math::cdf(standart_normal_distribution, z);
Float64 p_value = 0;
if (alternative == Alternative::TwoSided)
p_value = 1 - 2 * cdf;
p_value = 2 - 2 * cdf;
else
p_value = 0.5 - cdf;
p_value = 1 - cdf;
return {u2, p_value};
}

View File

@ -62,7 +62,17 @@ struct StudentTTestData : public TTestMoments<Float64>
/// t-statistic
Float64 t_stat = (mean_x - mean_y) / sqrt(std_err2);
return {t_stat, getPValue(degrees_of_freedom, t_stat * t_stat)};
if (isNaN(t_stat))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Resulted t-statistics is NaN");
auto student = boost::math::students_t_distribution<Float64>(getDegreesOfFreedom());
Float64 pvalue = 0;
if (t_stat > 0)
pvalue = 2 * boost::math::cdf<Float64>(student, -t_stat);
else
pvalue = 2 * boost::math::cdf<Float64>(student, t_stat);
return {t_stat, pvalue};
}
};

View File

@ -34,50 +34,6 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
}
/**
* If you have a cumulative distribution function F, then calculating the p-value for given statistic T is simply 1F(T)
* In our case p-value is two-sided, so we multiply it by 2.
* So cumulative distribution function F equals to
* \[ F(t) = \int_{-\infty}^{t} f(u)du = 1 - \frac{1}{2} I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \]
* where \[ x(t) = \frac{v}{t^2 + v} \]: https://en.wikipedia.org/wiki/Student%27s_t-distribution#Cumulative_distribution_function
*
* so our resulting \[ p-value = I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \].
*
* And I is regularized incomplete beta function: https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function
*
* Keepenig in mind that \[ \mathrm {B} (x;a,b)=\int _{0}^{x}r^{a-1}\,(1-r)^{b-1}\,\mathrm {d} r.\! \]
* and
* \[ \mathrm {B} (x,y)={\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}}=\
* \exp(\ln {\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}})=\exp((\ln(\Gamma (x))+\ln(\Gamma (y))-\ln(\Gamma (x+y))) \]
*
* p-value can be calculated in terms of gamma functions and integrals more simply:
* \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}r^{{\frac {\nu }{2}}-1}\,(1-r)^{-0.5}\,\mathrm {d} r}\
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
*
* which simplifies to:
*
* \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}{\frac {r^{{\frac {\nu }{2}}-1}}{\sqrt {1-r}}}\,\mathrm {d} r}\
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
*
* Read here for details https://rosettacode.org/wiki/Welch%27s_t-test#
*
* Both WelchTTest and StudentTTest have t-statistric with Student distribution but with different degrees of freedom.
* So the procedure of computing p-value is the same.
*/
static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2) /// NOLINT
{
Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom),
[degrees_of_freedom](double x) { return std::pow(x, degrees_of_freedom / 2 - 1) / std::sqrt(1 - x); });
int unused;
Float64 denominator = std::exp(
lgamma_r(degrees_of_freedom / 2, &unused)
+ lgamma_r(0.5, &unused)
- lgamma_r(degrees_of_freedom / 2 + 0.5, &unused));
return std::min(1.0, std::max(0.0, numerator / denominator));
}
/// Returns tuple of (t-statistic, p-value)
/// https://cpb-us-w2.wpmucdn.com/voices.uchicago.edu/dist/9/1193/files/2016/01/05b-TandP.pdf

View File

@ -10,7 +10,6 @@ namespace ErrorCodes
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
namespace DB
{
struct Settings;
@ -53,7 +52,14 @@ struct WelchTTestData : public TTestMoments<Float64>
Float64 se = getStandardError();
Float64 t_stat = (mean_x - mean_y) / se;
return {t_stat, getPValue(getDegreesOfFreedom(), t_stat * t_stat)};
auto students_t_distribution = boost::math::students_t_distribution<Float64>(getDegreesOfFreedom());
Float64 pvalue = 0;
if (t_stat > 0)
pvalue = 2 * boost::math::cdf<Float64>(students_t_distribution, -t_stat);
else
pvalue = 2 * boost::math::cdf<Float64>(students_t_distribution, t_stat);
return {t_stat, pvalue};
}
};

View File

@ -21,20 +21,6 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
}
template <typename F>
static Float64 integrateSimpson(Float64 a, Float64 b, F && func)
{
const size_t iterations = std::max(1e6, 1e4 * std::abs(std::round(b) - std::round(a)));
const long double h = (b - a) / iterations;
Float64 sum_odds = 0.0;
for (size_t i = 1; i < iterations; i += 2)
sum_odds += func(a + i * h);
Float64 sum_evens = 0.0;
for (size_t i = 2; i < iterations; i += 2)
sum_evens += func(a + i * h);
return (func(a) + func(b) + 2 * sum_evens + 4 * sum_odds) * h / 3;
}
/// Because ranks are adjusted, we have to store each of them in Float type.
using RanksArray = std::vector<Float64>;
@ -126,4 +112,3 @@ struct StatisticalSample
};
}

View File

@ -1,5 +1,5 @@
(223,0.5426959774289524)
(223,0.5426959774289482)
223.0 0.5426959774289482
223 0.5426959774289524
223 0.5426959774289524
223 0.5426959774289524
223 0.5426959774289482
223 0.5426959774289482
223 0.5426959774289482

View File

@ -0,0 +1,2 @@
-0.12709 0.89887
-1.27088 0.20377

View File

@ -0,0 +1,53 @@
SELECT roundBankers(result.1, 5), roundBankers(result.2, 5) FROM (
SELECT
studentTTest(sample, variant) as result
FROM (
SELECT
toFloat64(number) % 30 AS sample,
0 AS variant
FROM system.numbers limit 500000
UNION ALL
SELECT
toFloat64(number) % 30 + 0.0022 AS sample,
1 AS variant
FROM system.numbers limit 500000));
SELECT roundBankers(result.1, 5), roundBankers(result.2, 5 ) FROM (
SELECT
studentTTest(sample, variant) as result
FROM (
SELECT
toFloat64(number) % 30 AS sample,
0 AS variant
FROM system.numbers limit 50000000
UNION ALL
SELECT
toFloat64(number) % 30 + 0.0022 AS sample,
1 AS variant
FROM system.numbers limit 50000000));
SELECT roundBankers(result.2, 1025)
FROM
(
SELECT studentTTest(sample, variant) AS result
FROM
(
SELECT
toFloat64(number) % 30 AS sample,
1048576 AS variant
FROM system.numbers
LIMIT 1
UNION ALL
SELECT
(toFloat64(number) % 7) + inf AS sample,
255 AS variant
FROM system.numbers
LIMIT 1023
)
); -- { serverError 36 }