mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-25 17:12:03 +00:00
Do not calculate integrals in statistical tests (#36953)
This commit is contained in:
parent
dcad154105
commit
85a1204e95
@ -6,8 +6,10 @@
|
||||
#include <Columns/ColumnVector.h>
|
||||
#include <Columns/ColumnTuple.h>
|
||||
#include <Common/assert_cast.h>
|
||||
#include <Common/ArenaAllocator.h>
|
||||
#include <Common/PODArray_fwd.h>
|
||||
#include <base/types.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypesDecimal.h>
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
@ -16,10 +18,7 @@
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <limits>
|
||||
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
|
||||
#include <Common/ArenaAllocator.h>
|
||||
|
||||
#include <boost/math/distributions/normal.hpp>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -84,15 +83,14 @@ struct MannWhitneyData : public StatisticalSample<Float64, Float64>
|
||||
if (alternative == Alternative::TwoSided)
|
||||
z = std::abs(z);
|
||||
|
||||
/// In fact cdf is a probability function, so it is intergral of density from (-inf, z].
|
||||
/// But since standard normal distribution is symmetric, cdf(0) = 0.5 and we have to compute integral from [0, z].
|
||||
const Float64 cdf = integrateSimpson(0, z, [] (Float64 t) { return std::pow(M_E, -0.5 * t * t) / std::sqrt(2 * M_PI);});
|
||||
auto standart_normal_distribution = boost::math::normal_distribution<Float64>();
|
||||
auto cdf = boost::math::cdf(standart_normal_distribution, z);
|
||||
|
||||
Float64 p_value = 0;
|
||||
if (alternative == Alternative::TwoSided)
|
||||
p_value = 1 - 2 * cdf;
|
||||
p_value = 2 - 2 * cdf;
|
||||
else
|
||||
p_value = 0.5 - cdf;
|
||||
p_value = 1 - cdf;
|
||||
|
||||
return {u2, p_value};
|
||||
}
|
||||
|
@ -62,7 +62,17 @@ struct StudentTTestData : public TTestMoments<Float64>
|
||||
/// t-statistic
|
||||
Float64 t_stat = (mean_x - mean_y) / sqrt(std_err2);
|
||||
|
||||
return {t_stat, getPValue(degrees_of_freedom, t_stat * t_stat)};
|
||||
if (isNaN(t_stat))
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Resulted t-statistics is NaN");
|
||||
|
||||
auto student = boost::math::students_t_distribution<Float64>(getDegreesOfFreedom());
|
||||
Float64 pvalue = 0;
|
||||
if (t_stat > 0)
|
||||
pvalue = 2 * boost::math::cdf<Float64>(student, -t_stat);
|
||||
else
|
||||
pvalue = 2 * boost::math::cdf<Float64>(student, t_stat);
|
||||
|
||||
return {t_stat, pvalue};
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -34,50 +34,6 @@ namespace ErrorCodes
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
/**
|
||||
* If you have a cumulative distribution function F, then calculating the p-value for given statistic T is simply 1−F(T)
|
||||
* In our case p-value is two-sided, so we multiply it by 2.
|
||||
* So cumulative distribution function F equals to
|
||||
* \[ F(t) = \int_{-\infty}^{t} f(u)du = 1 - \frac{1}{2} I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \]
|
||||
* where \[ x(t) = \frac{v}{t^2 + v} \]: https://en.wikipedia.org/wiki/Student%27s_t-distribution#Cumulative_distribution_function
|
||||
*
|
||||
* so our resulting \[ p-value = I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \].
|
||||
*
|
||||
* And I is regularized incomplete beta function: https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function
|
||||
*
|
||||
* Keepenig in mind that \[ \mathrm {B} (x;a,b)=\int _{0}^{x}r^{a-1}\,(1-r)^{b-1}\,\mathrm {d} r.\! \]
|
||||
* and
|
||||
* \[ \mathrm {B} (x,y)={\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}}=\
|
||||
* \exp(\ln {\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}})=\exp((\ln(\Gamma (x))+\ln(\Gamma (y))-\ln(\Gamma (x+y))) \]
|
||||
*
|
||||
* p-value can be calculated in terms of gamma functions and integrals more simply:
|
||||
* \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}r^{{\frac {\nu }{2}}-1}\,(1-r)^{-0.5}\,\mathrm {d} r}\
|
||||
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
|
||||
*
|
||||
* which simplifies to:
|
||||
*
|
||||
* \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}{\frac {r^{{\frac {\nu }{2}}-1}}{\sqrt {1-r}}}\,\mathrm {d} r}\
|
||||
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
|
||||
*
|
||||
* Read here for details https://rosettacode.org/wiki/Welch%27s_t-test#
|
||||
*
|
||||
* Both WelchTTest and StudentTTest have t-statistric with Student distribution but with different degrees of freedom.
|
||||
* So the procedure of computing p-value is the same.
|
||||
*/
|
||||
static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2) /// NOLINT
|
||||
{
|
||||
Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom),
|
||||
[degrees_of_freedom](double x) { return std::pow(x, degrees_of_freedom / 2 - 1) / std::sqrt(1 - x); });
|
||||
|
||||
int unused;
|
||||
Float64 denominator = std::exp(
|
||||
lgamma_r(degrees_of_freedom / 2, &unused)
|
||||
+ lgamma_r(0.5, &unused)
|
||||
- lgamma_r(degrees_of_freedom / 2 + 0.5, &unused));
|
||||
|
||||
return std::min(1.0, std::max(0.0, numerator / denominator));
|
||||
}
|
||||
|
||||
|
||||
/// Returns tuple of (t-statistic, p-value)
|
||||
/// https://cpb-us-w2.wpmucdn.com/voices.uchicago.edu/dist/9/1193/files/2016/01/05b-TandP.pdf
|
||||
|
@ -10,7 +10,6 @@ namespace ErrorCodes
|
||||
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
|
||||
}
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
struct Settings;
|
||||
@ -53,7 +52,14 @@ struct WelchTTestData : public TTestMoments<Float64>
|
||||
Float64 se = getStandardError();
|
||||
Float64 t_stat = (mean_x - mean_y) / se;
|
||||
|
||||
return {t_stat, getPValue(getDegreesOfFreedom(), t_stat * t_stat)};
|
||||
auto students_t_distribution = boost::math::students_t_distribution<Float64>(getDegreesOfFreedom());
|
||||
Float64 pvalue = 0;
|
||||
if (t_stat > 0)
|
||||
pvalue = 2 * boost::math::cdf<Float64>(students_t_distribution, -t_stat);
|
||||
else
|
||||
pvalue = 2 * boost::math::cdf<Float64>(students_t_distribution, t_stat);
|
||||
|
||||
return {t_stat, pvalue};
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -21,20 +21,6 @@ namespace ErrorCodes
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
template <typename F>
|
||||
static Float64 integrateSimpson(Float64 a, Float64 b, F && func)
|
||||
{
|
||||
const size_t iterations = std::max(1e6, 1e4 * std::abs(std::round(b) - std::round(a)));
|
||||
const long double h = (b - a) / iterations;
|
||||
Float64 sum_odds = 0.0;
|
||||
for (size_t i = 1; i < iterations; i += 2)
|
||||
sum_odds += func(a + i * h);
|
||||
Float64 sum_evens = 0.0;
|
||||
for (size_t i = 2; i < iterations; i += 2)
|
||||
sum_evens += func(a + i * h);
|
||||
return (func(a) + func(b) + 2 * sum_evens + 4 * sum_odds) * h / 3;
|
||||
}
|
||||
|
||||
/// Because ranks are adjusted, we have to store each of them in Float type.
|
||||
using RanksArray = std::vector<Float64>;
|
||||
|
||||
@ -126,4 +112,3 @@ struct StatisticalSample
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
(223,0.5426959774289524)
|
||||
(223,0.5426959774289482)
|
||||
223.0 0.5426959774289482
|
||||
223 0.5426959774289524
|
||||
223 0.5426959774289524
|
||||
223 0.5426959774289524
|
||||
223 0.5426959774289482
|
||||
223 0.5426959774289482
|
||||
223 0.5426959774289482
|
||||
|
@ -0,0 +1,2 @@
|
||||
-0.12709 0.89887
|
||||
-1.27088 0.20377
|
53
tests/queries/0_stateless/02293_ttest_large_samples.sql
Normal file
53
tests/queries/0_stateless/02293_ttest_large_samples.sql
Normal file
@ -0,0 +1,53 @@
|
||||
SELECT roundBankers(result.1, 5), roundBankers(result.2, 5) FROM (
|
||||
SELECT
|
||||
studentTTest(sample, variant) as result
|
||||
FROM (
|
||||
SELECT
|
||||
toFloat64(number) % 30 AS sample,
|
||||
0 AS variant
|
||||
FROM system.numbers limit 500000
|
||||
|
||||
UNION ALL
|
||||
|
||||
SELECT
|
||||
toFloat64(number) % 30 + 0.0022 AS sample,
|
||||
1 AS variant
|
||||
FROM system.numbers limit 500000));
|
||||
|
||||
|
||||
SELECT roundBankers(result.1, 5), roundBankers(result.2, 5 ) FROM (
|
||||
SELECT
|
||||
studentTTest(sample, variant) as result
|
||||
FROM (
|
||||
SELECT
|
||||
toFloat64(number) % 30 AS sample,
|
||||
0 AS variant
|
||||
FROM system.numbers limit 50000000
|
||||
|
||||
UNION ALL
|
||||
|
||||
SELECT
|
||||
toFloat64(number) % 30 + 0.0022 AS sample,
|
||||
1 AS variant
|
||||
FROM system.numbers limit 50000000));
|
||||
|
||||
|
||||
SELECT roundBankers(result.2, 1025)
|
||||
FROM
|
||||
(
|
||||
SELECT studentTTest(sample, variant) AS result
|
||||
FROM
|
||||
(
|
||||
SELECT
|
||||
toFloat64(number) % 30 AS sample,
|
||||
1048576 AS variant
|
||||
FROM system.numbers
|
||||
LIMIT 1
|
||||
UNION ALL
|
||||
SELECT
|
||||
(toFloat64(number) % 7) + inf AS sample,
|
||||
255 AS variant
|
||||
FROM system.numbers
|
||||
LIMIT 1023
|
||||
)
|
||||
); -- { serverError 36 }
|
Loading…
Reference in New Issue
Block a user