mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-21 17:20:50 +00:00
style
This commit is contained in:
parent
de9dc3a43f
commit
d1eb9f02dd
@ -35,10 +35,10 @@ namespace ErrorCodes
|
||||
|
||||
struct MannWhitneyData : public StatisticalSample<Float64, Float64>
|
||||
{
|
||||
/*Since null hypotesis is "for randomly selected values X and Y from two populations,
|
||||
/*Since null hypothesis is "for randomly selected values X and Y from two populations,
|
||||
*the probability of X being greater than Y is equal to the probability of Y being greater than X".
|
||||
*Or "the distribution F of first sample equals to the distribution G of second sample".
|
||||
*Then alternative for this hypotesis (H1) is "two-sided"(F != G), "less"(F < G), "greater" (F > G). */
|
||||
*Or "the distribution F of first sample equals to the distribution G of second sample".
|
||||
*Then alternative for this hypothesis (H1) is "two-sided"(F != G), "less"(F < G), "greater" (F > G). */
|
||||
enum class Alternative
|
||||
{
|
||||
TwoSided,
|
||||
@ -67,7 +67,7 @@ struct MannWhitneyData : public StatisticalSample<Float64, Float64>
|
||||
const Float64 u1 = n1 * n2 + (n1 * (n1 + 1.)) / 2. - r1;
|
||||
const Float64 u2 = n1 * n2 - u1;
|
||||
|
||||
/// The distribution of U-statistic under null hypotesis H0 is symmetric with respect to meanrank.
|
||||
/// The distribution of U-statistic under null hypothesis H0 is symmetric with respect to meanrank.
|
||||
const Float64 meanrank = n1 * n2 /2. + 0.5 * continuity_correction;
|
||||
const Float64 sd = std::sqrt(tie_correction * n1 * n2 * (n1 + n2 + 1) / 12.0);
|
||||
|
||||
@ -82,7 +82,7 @@ struct MannWhitneyData : public StatisticalSample<Float64, Float64>
|
||||
|
||||
const Float64 z = std::abs((u - meanrank) / sd);
|
||||
/// In fact cdf is a probability function, so it is intergral of density from (-inf, z].
|
||||
/// But since standart normal distribution is symmetric, cdf(0) = 0.5 and we have to compute integral from [0, z].
|
||||
/// But since standard normal distribution is symmetric, cdf(0) = 0.5 and we have to compute integral from [0, z].
|
||||
const Float64 cdf = integrateSimpson(0, z, [] (Float64 t) { return std::pow(M_E, -0.5 * t * t) / std::sqrt(2 * M_PI);});
|
||||
|
||||
Float64 p_value = 0;
|
||||
|
@ -30,30 +30,30 @@ class WriteBuffer;
|
||||
/**
|
||||
* If you have a cumulative distribution function F, then calculating the p-value for given statistic T is simply 1−F(T)
|
||||
* In our case p-value is two-sided, so we multiply it by 2.
|
||||
* So cumulative distribution function F equals to
|
||||
* So cumulative distribution function F equals to
|
||||
* \[ F(t) = \int_{-\infty}^{t} f(u)du = 1 - \frac{1}{2} I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \]
|
||||
* where \[ x(t) = \frac{v}{t^2 + v} \]: https://en.wikipedia.org/wiki/Student%27s_t-distribution#Cumulative_distribution_function
|
||||
*
|
||||
*
|
||||
* so our resulting \[ p-value = I_{x(t)}(\frac{v}{2}, \frac{1}{2}) \].
|
||||
*
|
||||
*
|
||||
* And I is regularized incomplete beta function: https://en.wikipedia.org/wiki/Beta_function#Incomplete_beta_function
|
||||
*
|
||||
* Keepenig in mind that \[ \mathrm {B} (x;a,b)=\int _{0}^{x}r^{a-1}\,(1-r)^{b-1}\,\mathrm {d} r.\! \]
|
||||
*
|
||||
* Keepenig in mind that \[ \mathrm {B} (x;a,b)=\int _{0}^{x}r^{a-1}\,(1-r)^{b-1}\,\mathrm {d} r.\! \]
|
||||
* and
|
||||
* \[ \mathrm {B} (x,y)={\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}}=\
|
||||
* \exp(\ln {\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}})=\exp((\ln(\Gamma (x))+\ln(\Gamma (y))-\ln(\Gamma (x+y))) \]
|
||||
*
|
||||
* \exp(\ln {\dfrac {\Gamma (x)\,\Gamma (y)}{\Gamma (x+y)}})=\exp((\ln(\Gamma (x))+\ln(\Gamma (y))-\ln(\Gamma (x+y))) \]
|
||||
*
|
||||
* p-value can be calculated in terms of gamma functions and integrals more simply:
|
||||
* \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}r^{{\frac {\nu }{2}}-1}\,(1-r)^{-0.5}\,\mathrm {d} r}\
|
||||
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
|
||||
*
|
||||
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
|
||||
*
|
||||
* which simplifies to:
|
||||
*
|
||||
*
|
||||
* \[ {\frac {\int _{0}^{\frac {\nu }{t^{2}+\nu }}{\frac {r^{{\frac {\nu }{2}}-1}}{\sqrt {1-r}}}\,\mathrm {d} r}\
|
||||
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
|
||||
*
|
||||
* {\exp((\ln(\Gamma ({\frac {\nu }{2}}))+\ln(\Gamma (0.5))-\ln(\Gamma ({\frac {\nu }{2}}+0.5)))}} \]
|
||||
*
|
||||
* Read here for details https://rosettacode.org/wiki/Welch%27s_t-test#
|
||||
*
|
||||
*
|
||||
* Both WelchTTest and StudentTTest have t-statistric with Student distribution but with different degrees of freedom.
|
||||
* So the procedure of computing p-value is the same.
|
||||
*/
|
||||
|
@ -82,7 +82,7 @@ struct VarMoments
|
||||
// to avoid accuracy problem
|
||||
if (m[0] == 1)
|
||||
return 0;
|
||||
/// \[ \frac{1}{m_0} (m_3 - (3 * m_2 - \frac{2 * {m_1}^2}{m_0}) * \frac{m_1}{m_0});\]
|
||||
/// \[ \frac{1}{m_0} (m_3 - (3 * m_2 - \frac{2 * {m_1}^2}{m_0}) * \frac{m_1}{m_0});\]
|
||||
return (m[3]
|
||||
- (3 * m[2]
|
||||
- 2 * m[1] * m[1] / m[0]
|
||||
@ -97,7 +97,7 @@ struct VarMoments
|
||||
// to avoid accuracy problem
|
||||
if (m[0] == 1)
|
||||
return 0;
|
||||
/// \[ \frac{1}{m_0}(m_4 - (4 * m_3 - (6 * m_2 - \frac{3 * m_1^2}{m_0} ) \frac{m_1}{m_0})\frac{m_1}{m_0})\]
|
||||
/// \[ \frac{1}{m_0}(m_4 - (4 * m_3 - (6 * m_2 - \frac{3 * m_1^2}{m_0} ) \frac{m_1}{m_0})\frac{m_1}{m_0})\]
|
||||
return (m[4]
|
||||
- (4 * m[3]
|
||||
- (6 * m[2]
|
||||
|
Loading…
Reference in New Issue
Block a user