diff --git a/src/AggregateFunctions/AggregateFunctionMannWhitney.h b/src/AggregateFunctions/AggregateFunctionMannWhitney.h index 089f70cd26b..d861eef10ab 100644 --- a/src/AggregateFunctions/AggregateFunctionMannWhitney.h +++ b/src/AggregateFunctions/AggregateFunctionMannWhitney.h @@ -6,8 +6,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -16,10 +18,7 @@ #include #include -#include - -#include - +#include namespace DB { @@ -84,15 +83,14 @@ struct MannWhitneyData : public StatisticalSample if (alternative == Alternative::TwoSided) z = std::abs(z); - /// In fact cdf is a probability function, so it is intergral of density from (-inf, z]. - /// But since standard normal distribution is symmetric, cdf(0) = 0.5 and we have to compute integral from [0, z]. - const Float64 cdf = integrateSimpson(0, z, [] (Float64 t) { return std::pow(M_E, -0.5 * t * t) / std::sqrt(2 * M_PI);}); + auto standart_normal_distribution = boost::math::normal_distribution(); + auto cdf = boost::math::cdf(standart_normal_distribution, z); Float64 p_value = 0; if (alternative == Alternative::TwoSided) - p_value = 1 - 2 * cdf; + p_value = 2 - 2 * cdf; else - p_value = 0.5 - cdf; + p_value = 1 - cdf; return {u2, p_value}; } diff --git a/src/AggregateFunctions/AggregateFunctionStudentTTest.cpp b/src/AggregateFunctions/AggregateFunctionStudentTTest.cpp index 83a91ef06fc..f1beff806e2 100644 --- a/src/AggregateFunctions/AggregateFunctionStudentTTest.cpp +++ b/src/AggregateFunctions/AggregateFunctionStudentTTest.cpp @@ -62,7 +62,17 @@ struct StudentTTestData : public TTestMoments /// t-statistic Float64 t_stat = (mean_x - mean_y) / sqrt(std_err2); - return {t_stat, getPValue(degrees_of_freedom, t_stat * t_stat)}; + if (isNaN(t_stat)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Resulted t-statistics is NaN"); + + auto student = boost::math::students_t_distribution(getDegreesOfFreedom()); + Float64 pvalue = 0; + if (t_stat > 0) + pvalue = 2 * boost::math::cdf(student, -t_stat); + else + pvalue = 2 * boost::math::cdf(student, t_stat); + + return {t_stat, pvalue}; } }; diff --git a/src/AggregateFunctions/AggregateFunctionTTest.h b/src/AggregateFunctions/AggregateFunctionTTest.h index 7ef5cfce9c9..b72e7a3cdcb 100644 --- a/src/AggregateFunctions/AggregateFunctionTTest.h +++ b/src/AggregateFunctions/AggregateFunctionTTest.h @@ -34,50 +34,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -/** - * If you have a cumulative distribution function F, then calculating the p-value for given statistic T is simply 1−F(T) - * In our case p-value is two-sided, so we multiply it by 2. - * So cumulative distribution function F equals to - * \[ F(t) = \int_{-\infty}^{t} f(u)du = 1 - \frac{1}{2} I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \] - * where \[ x(t) = \frac{v}{t^2 + v} \]: https://en.wikipedia.org/wiki/Student%27s_t-distribution#Cumulative_distribution_function - * - * so our resulting \[ p-value = I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \]. - * - * And I is regularized incomplete beta function: https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function - * - * Keepenig in mind that \[ \mathrm {B} (x;a,b)=\int _{0}^{x}r^{a-1}\,(1-r)^{b-1}\,\mathrm {d} r.\! \] - * and - * \[ \mathrm {B} (x,y)={\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}}=\ - * \exp(\ln {\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}})=\exp((\ln(\Gamma (x))+\ln(\Gamma (y))-\ln(\Gamma (x+y))) \] - * - * p-value can be calculated in terms of gamma functions and integrals more simply: - * \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}r^{{\frac {\nu }{2}}-1}\,(1-r)^{-0.5}\,\mathrm {d} r}\ - * {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \] - * - * which simplifies to: - * - * \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}{\frac {r^{{\frac {\nu }{2}}-1}}{\sqrt {1-r}}}\,\mathrm {d} r}\ - * {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \] - * - * Read here for details https://rosettacode.org/wiki/Welch%27s_t-test# - * - * Both WelchTTest and StudentTTest have t-statistric with Student distribution but with different degrees of freedom. - * So the procedure of computing p-value is the same. -*/ -static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2) /// NOLINT -{ - Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom), - [degrees_of_freedom](double x) { return std::pow(x, degrees_of_freedom / 2 - 1) / std::sqrt(1 - x); }); - - int unused; - Float64 denominator = std::exp( - lgamma_r(degrees_of_freedom / 2, &unused) - + lgamma_r(0.5, &unused) - - lgamma_r(degrees_of_freedom / 2 + 0.5, &unused)); - - return std::min(1.0, std::max(0.0, numerator / denominator)); -} - /// Returns tuple of (t-statistic, p-value) /// https://cpb-us-w2.wpmucdn.com/voices.uchicago.edu/dist/9/1193/files/2016/01/05b-TandP.pdf diff --git a/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp b/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp index fe5cf83c509..74000296a2d 100644 --- a/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp +++ b/src/AggregateFunctions/AggregateFunctionWelchTTest.cpp @@ -10,7 +10,6 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } - namespace DB { struct Settings; @@ -53,7 +52,14 @@ struct WelchTTestData : public TTestMoments Float64 se = getStandardError(); Float64 t_stat = (mean_x - mean_y) / se; - return {t_stat, getPValue(getDegreesOfFreedom(), t_stat * t_stat)}; + auto students_t_distribution = boost::math::students_t_distribution(getDegreesOfFreedom()); + Float64 pvalue = 0; + if (t_stat > 0) + pvalue = 2 * boost::math::cdf(students_t_distribution, -t_stat); + else + pvalue = 2 * boost::math::cdf(students_t_distribution, t_stat); + + return {t_stat, pvalue}; } }; diff --git a/src/AggregateFunctions/StatCommon.h b/src/AggregateFunctions/StatCommon.h index d670e646f4b..29163b63f77 100644 --- a/src/AggregateFunctions/StatCommon.h +++ b/src/AggregateFunctions/StatCommon.h @@ -21,20 +21,6 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } -template -static Float64 integrateSimpson(Float64 a, Float64 b, F && func) -{ - const size_t iterations = std::max(1e6, 1e4 * std::abs(std::round(b) - std::round(a))); - const long double h = (b - a) / iterations; - Float64 sum_odds = 0.0; - for (size_t i = 1; i < iterations; i += 2) - sum_odds += func(a + i * h); - Float64 sum_evens = 0.0; - for (size_t i = 2; i < iterations; i += 2) - sum_evens += func(a + i * h); - return (func(a) + func(b) + 2 * sum_evens + 4 * sum_odds) * h / 3; -} - /// Because ranks are adjusted, we have to store each of them in Float type. using RanksArray = std::vector; @@ -126,4 +112,3 @@ struct StatisticalSample }; } - diff --git a/tests/queries/0_stateless/01560_mann_whitney.reference b/tests/queries/0_stateless/01560_mann_whitney.reference index 5a718fdca97..dcd1882b096 100644 --- a/tests/queries/0_stateless/01560_mann_whitney.reference +++ b/tests/queries/0_stateless/01560_mann_whitney.reference @@ -1,5 +1,5 @@ -(223,0.5426959774289524) +(223,0.5426959774289482) 223.0 0.5426959774289482 -223 0.5426959774289524 -223 0.5426959774289524 -223 0.5426959774289524 +223 0.5426959774289482 +223 0.5426959774289482 +223 0.5426959774289482 diff --git a/tests/queries/0_stateless/02293_ttest_large_samples.reference b/tests/queries/0_stateless/02293_ttest_large_samples.reference new file mode 100644 index 00000000000..2e90a4a98b2 --- /dev/null +++ b/tests/queries/0_stateless/02293_ttest_large_samples.reference @@ -0,0 +1,2 @@ +-0.12709 0.89887 +-1.27088 0.20377 diff --git a/tests/queries/0_stateless/02293_ttest_large_samples.sql b/tests/queries/0_stateless/02293_ttest_large_samples.sql new file mode 100644 index 00000000000..d4d919d6e7c --- /dev/null +++ b/tests/queries/0_stateless/02293_ttest_large_samples.sql @@ -0,0 +1,53 @@ +SELECT roundBankers(result.1, 5), roundBankers(result.2, 5) FROM ( +SELECT + studentTTest(sample, variant) as result +FROM ( +SELECT + toFloat64(number) % 30 AS sample, + 0 AS variant +FROM system.numbers limit 500000 + +UNION ALL + +SELECT + toFloat64(number) % 30 + 0.0022 AS sample, + 1 AS variant +FROM system.numbers limit 500000)); + + +SELECT roundBankers(result.1, 5), roundBankers(result.2, 5 ) FROM ( +SELECT + studentTTest(sample, variant) as result +FROM ( +SELECT + toFloat64(number) % 30 AS sample, + 0 AS variant +FROM system.numbers limit 50000000 + +UNION ALL + +SELECT + toFloat64(number) % 30 + 0.0022 AS sample, + 1 AS variant +FROM system.numbers limit 50000000)); + + +SELECT roundBankers(result.2, 1025) +FROM +( + SELECT studentTTest(sample, variant) AS result + FROM + ( + SELECT + toFloat64(number) % 30 AS sample, + 1048576 AS variant + FROM system.numbers + LIMIT 1 + UNION ALL + SELECT + (toFloat64(number) % 7) + inf AS sample, + 255 AS variant + FROM system.numbers + LIMIT 1023 + ) +); -- { serverError 36 }