mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 09:32:06 +00:00
Make at least some sense
This commit is contained in:
parent
04a39317c1
commit
ac84015f01
@ -23,16 +23,7 @@ struct ContingencyData : CrossTabData
|
||||
if (count < 2)
|
||||
return std::numeric_limits<Float64>::quiet_NaN();
|
||||
|
||||
Float64 phi = 0.0;
|
||||
for (const auto & [key, value_ab] : count_ab)
|
||||
{
|
||||
Float64 value_a = count_a.at(key.items[0]);
|
||||
Float64 value_b = count_b.at(key.items[1]);
|
||||
|
||||
phi += value_ab * value_ab / (value_a * value_b) * count - 2 * value_ab + (value_a * value_b) / count;
|
||||
}
|
||||
phi /= count;
|
||||
|
||||
Float64 phi = getPhiSquared();
|
||||
return sqrt(phi / (phi + count));
|
||||
}
|
||||
};
|
||||
|
@ -23,20 +23,8 @@ struct CramersVData : CrossTabData
|
||||
if (count < 2)
|
||||
return std::numeric_limits<Float64>::quiet_NaN();
|
||||
|
||||
Float64 phi = 0.0;
|
||||
for (const auto & [key, value_ab] : count_ab)
|
||||
{
|
||||
Float64 value_a = count_a.at(key.items[0]);
|
||||
Float64 value_b = count_b.at(key.items[1]);
|
||||
|
||||
phi += value_ab * value_ab / (value_a * value_b) * count - 2 * value_ab + (value_a * value_b) / count;
|
||||
}
|
||||
|
||||
phi /= count;
|
||||
UInt64 q = std::min(count_a.size(), count_b.size());
|
||||
phi /= q - 1;
|
||||
|
||||
return sqrt(phi);
|
||||
return sqrt(getPhiSquared() / (q - 1));
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -23,22 +23,17 @@ struct CramersVBiasCorrectedData : CrossTabData
|
||||
if (count < 2)
|
||||
return std::numeric_limits<Float64>::quiet_NaN();
|
||||
|
||||
Float64 phi = 0.0;
|
||||
for (const auto & [key, value_ab] : count_ab)
|
||||
{
|
||||
Float64 value_a = count_a.at(key.items[0]);
|
||||
Float64 value_b = count_b.at(key.items[1]);
|
||||
Float64 phi = getPhiSquared();
|
||||
|
||||
phi += value_ab * value_ab / (value_a * value_b) * count - 2 * value_ab + (value_a * value_b) / count;
|
||||
}
|
||||
Float64 a_size_adjusted = count_a.size() - 1;
|
||||
Float64 b_size_adjusted = count_b.size() - 1;
|
||||
Float64 count_adjusted = count - 1;
|
||||
|
||||
phi /= count;
|
||||
Float64 res = std::max(0.0, phi - a_size_adjusted * b_size_adjusted / count_adjusted);
|
||||
Float64 correction_a = count_a.size() - a_size_adjusted * a_size_adjusted / count_adjusted;
|
||||
Float64 correction_b = count_b.size() - b_size_adjusted * b_size_adjusted / count_adjusted;
|
||||
|
||||
Float64 res = std::max(0.0, phi - (static_cast<Float64>(count_a.size()) - 1) * (static_cast<Float64>(count_b.size()) - 1) / (count - 1));
|
||||
Float64 correction_a = count_a.size() - (static_cast<Float64>(count_a.size()) - 1) * (static_cast<Float64>(count_a.size()) - 1) / (count - 1);
|
||||
Float64 correction_b = count_b.size() - (static_cast<Float64>(count_b.size()) - 1) * (static_cast<Float64>(count_b.size()) - 1) / (count - 1);
|
||||
res /= std::min(correction_a, correction_b) - 1;
|
||||
|
||||
return sqrt(res);
|
||||
}
|
||||
};
|
||||
|
@ -69,6 +69,47 @@ struct CrossTabData
|
||||
count_b.read(buf);
|
||||
count_ab.read(buf);
|
||||
}
|
||||
|
||||
/** See https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
|
||||
*
|
||||
* φ² is χ² divided by the sample size (count).
|
||||
* χ² is the sum of the squared differences between the "observed" and "expected" counts, each normalized by the expected count.
|
||||
* ("Expected" means the count predicted under the hypothesis that there is no association between a and b).
|
||||
* Something resembling the L2 distance.
|
||||
*
|
||||
* Note: statisticians use the name χ² for any statistic that has a χ² distribution, in many different contexts.
|
||||
*
|
||||
* Let's suppose that there is no association between the values a and b.
|
||||
* Then the frequency (i.e. probability) of the (a, b) pair is equal to the product of the frequencies of a and b:
|
||||
* count_ab / count = (count_a / count) * (count_b / count)
|
||||
* count_ab = count_a * count_b / count
|
||||
*
|
||||
* Let's calculate the difference between the values that are supposed to be equal if there is no association between a and b:
|
||||
* count_ab - count_a * count_b / count
|
||||
*
|
||||
* Let's square each difference and divide it by the expected value for normalization: (count_a * count_b / count)
|
||||
* Then sum the results across all (a, b) pairs.
|
||||
*
|
||||
* This will be the χ² statistic.
|
||||
* This statistic is used as a base for many other statistics.
|
||||
*/
|
||||
Float64 getPhiSquared() const
{
    /// Returns φ² = χ² / count for the accumulated contingency table.
    /// Divides by count, so the caller must not invoke it with count == 0
    /// (the visible callers return NaN early when count < 2).

    Float64 chi_squared = 0;

    /// Sum of the expected values over the (a, b) pairs that are present in count_ab.
    Float64 expected_sum_over_observed = 0;

    for (const auto & [key, value_ab] : count_ab)
    {
        Float64 value_a = count_a.at(key.items[0]);
        Float64 value_b = count_b.at(key.items[1]);

        /// The count this pair would have if a and b were not associated.
        Float64 expected_value_ab = (value_a * value_b) / count;

        Float64 difference = value_ab - expected_value_ab;
        chi_squared += difference * difference / expected_value_ab;

        expected_sum_over_observed += expected_value_ab;
    }

    /// The loop above visits only the pairs stored in count_ab, but χ² is defined over every cell
    /// of the table. A pair (a, b) that never occurred has observed value 0 and contributes
    /// (0 - expected)² / expected = expected. The expected values of all cells add up to count
    /// (assuming count_a and count_b each sum to count - NOTE(review): confirm against add/merge),
    /// so the missing contribution is count minus the part already accounted for.
    /// The comparison guards against a tiny negative remainder caused by floating-point rounding
    /// when every cell has been observed.
    Float64 expected_sum_over_unobserved = count - expected_sum_over_observed;
    if (expected_sum_over_unobserved > 0)
        chi_squared += expected_sum_over_unobserved;

    return chi_squared / count;
}
|
||||
};
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user