This commit is contained in:
nikitamikhaylov 2020-11-19 16:16:36 +03:00
parent de9dc3a43f
commit d1eb9f02dd
3 changed files with 20 additions and 20 deletions

View File

@ -35,10 +35,10 @@ namespace ErrorCodes
struct MannWhitneyData : public StatisticalSample<Float64, Float64>
{
/*Since null hypotesis is "for randomly selected values X and Y from two populations,
/*Since null hypothesis is "for randomly selected values X and Y from two populations,
*the probability of X being greater than Y is equal to the probability of Y being greater than X".
*Or "the distribution F of first sample equals to the distribution G of second sample".
*Then alternative for this hypotesis (H1) is "two-sided"(F != G), "less"(F < G), "greater" (F > G). */
*Or "the distribution F of first sample equals to the distribution G of second sample".
*Then alternative for this hypothesis (H1) is "two-sided"(F != G), "less"(F < G), "greater" (F > G). */
enum class Alternative
{
TwoSided,
@ -67,7 +67,7 @@ struct MannWhitneyData : public StatisticalSample<Float64, Float64>
const Float64 u1 = n1 * n2 + (n1 * (n1 + 1.)) / 2. - r1;
const Float64 u2 = n1 * n2 - u1;
/// The distribution of U-statistic under null hypotesis H0 is symmetric with respect to meanrank.
/// The distribution of U-statistic under null hypothesis H0 is symmetric with respect to meanrank.
const Float64 meanrank = n1 * n2 /2. + 0.5 * continuity_correction;
const Float64 sd = std::sqrt(tie_correction * n1 * n2 * (n1 + n2 + 1) / 12.0);
@ -82,7 +82,7 @@ struct MannWhitneyData : public StatisticalSample<Float64, Float64>
const Float64 z = std::abs((u - meanrank) / sd);
/// In fact cdf is a probability function, so it is intergral of density from (-inf, z].
/// But since standart normal distribution is symmetric, cdf(0) = 0.5 and we have to compute integral from [0, z].
/// But since standard normal distribution is symmetric, cdf(0) = 0.5 and we have to compute integral from [0, z].
const Float64 cdf = integrateSimpson(0, z, [] (Float64 t) { return std::pow(M_E, -0.5 * t * t) / std::sqrt(2 * M_PI);});
Float64 p_value = 0;

View File

@ -30,30 +30,30 @@ class WriteBuffer;
/**
* If you have a cumulative distribution function F, then calculating the p-value for given statistic T is simply 1F(T)
* In our case p-value is two-sided, so we multiply it by 2.
* So cumulative distribution function F equals to
* So cumulative distribution function F equals to
* \[ F(t) = \int_{-\infty}^{t} f(u)du = 1 - \frac{1}{2} I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \]
* where \[ x(t) = \frac{v}{t^2 + v} \]: https://en.wikipedia.org/wiki/Student%27s_t-distribution#Cumulative_distribution_function
*
*
* so our resulting \[ p-value = I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \].
*
*
* And I is regularized incomplete beta function: https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function
*
* Keepenig in mind that \[ \mathrm {B} (x;a,b)=\int _{0}^{x}r^{a-1}\,(1-r)^{b-1}\,\mathrm {d} r.\! \]
*
* Keepenig in mind that \[ \mathrm {B} (x;a,b)=\int _{0}^{x}r^{a-1}\,(1-r)^{b-1}\,\mathrm {d} r.\! \]
* and
* \[ \mathrm {B} (x,y)={\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}}=\
* \exp(\ln {\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}})=\exp((\ln(\Gamma (x))+\ln(\Gamma (y))-\ln(\Gamma (x+y))) \]
*
* \exp(\ln {\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}})=\exp((\ln(\Gamma (x))+\ln(\Gamma (y))-\ln(\Gamma (x+y))) \]
*
* p-value can be calculated in terms of gamma functions and integrals more simply:
* \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}r^{{\frac {\nu }{2}}-1}\,(1-r)^{-0.5}\,\mathrm {d} r}\
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
*
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
*
* which simplifies to:
*
*
* \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}{\frac {r^{{\frac {\nu }{2}}-1}}{\sqrt {1-r}}}\,\mathrm {d} r}\
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
*
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
*
* Read here for details https://rosettacode.org/wiki/Welch%27s_t-test#
*
*
* Both WelchTTest and StudentTTest have t-statistric with Student distribution but with different degrees of freedom.
* So the procedure of computing p-value is the same.
*/

View File

@ -82,7 +82,7 @@ struct VarMoments
// to avoid accuracy problem
if (m[0] == 1)
return 0;
/// \[ \frac{1}{m_0} (m_3 - (3 * m_2 - \frac{2 * {m_1}^2}{m_0}) * \frac{m_1}{m_0});\]
/// \[ \frac{1}{m_0} (m_3 - (3 * m_2 - \frac{2 * {m_1}^2}{m_0}) * \frac{m_1}{m_0});\]
return (m[3]
- (3 * m[2]
- 2 * m[1] * m[1] / m[0]
@ -97,7 +97,7 @@ struct VarMoments
// to avoid accuracy problem
if (m[0] == 1)
return 0;
/// \[ \frac{1}{m_0}(m_4 - (4 * m_3 - (6 * m_2 - \frac{3 * m_1^2}{m_0} ) \frac{m_1}{m_0})\frac{m_1}{m_0})\]
/// \[ \frac{1}{m_0}(m_4 - (4 * m_3 - (6 * m_2 - \frac{3 * m_1^2}{m_0} ) \frac{m_1}{m_0})\frac{m_1}{m_0})\]
return (m[4]
- (4 * m[3]
- (6 * m[2]