Make at least some sense

This commit is contained in:
Alexey Milovidov 2022-01-03 22:18:57 +03:00
parent 04a39317c1
commit ac84015f01
4 changed files with 50 additions and 35 deletions

View File

@ -23,16 +23,7 @@ struct ContingencyData : CrossTabData
if (count < 2)
return std::numeric_limits<Float64>::quiet_NaN();
Float64 phi = 0.0;
for (const auto & [key, value_ab] : count_ab)
{
Float64 value_a = count_a.at(key.items[0]);
Float64 value_b = count_b.at(key.items[1]);
phi += value_ab * value_ab / (value_a * value_b) * count - 2 * value_ab + (value_a * value_b) / count;
}
phi /= count;
Float64 phi = getPhiSquared();
return sqrt(phi / (phi + count));
}
};

View File

@ -23,20 +23,8 @@ struct CramersVData : CrossTabData
if (count < 2)
return std::numeric_limits<Float64>::quiet_NaN();
Float64 phi = 0.0;
for (const auto & [key, value_ab] : count_ab)
{
Float64 value_a = count_a.at(key.items[0]);
Float64 value_b = count_b.at(key.items[1]);
phi += value_ab * value_ab / (value_a * value_b) * count - 2 * value_ab + (value_a * value_b) / count;
}
phi /= count;
UInt64 q = std::min(count_a.size(), count_b.size());
phi /= q - 1;
return sqrt(phi);
return sqrt(getPhiSquared() / (q - 1));
}
};

View File

@ -23,22 +23,17 @@ struct CramersVBiasCorrectedData : CrossTabData
if (count < 2)
return std::numeric_limits<Float64>::quiet_NaN();
Float64 phi = 0.0;
for (const auto & [key, value_ab] : count_ab)
{
Float64 value_a = count_a.at(key.items[0]);
Float64 value_b = count_b.at(key.items[1]);
Float64 phi = getPhiSquared();
phi += value_ab * value_ab / (value_a * value_b) * count - 2 * value_ab + (value_a * value_b) / count;
}
Float64 a_size_adjusted = count_a.size() - 1;
Float64 b_size_adjusted = count_b.size() - 1;
Float64 count_adjusted = count - 1;
phi /= count;
Float64 res = std::max(0.0, phi - a_size_adjusted * b_size_adjusted / count_adjusted);
Float64 correction_a = count_a.size() - a_size_adjusted * a_size_adjusted / count_adjusted;
Float64 correction_b = count_b.size() - b_size_adjusted * b_size_adjusted / count_adjusted;
Float64 res = std::max(0.0, phi - (static_cast<Float64>(count_a.size()) - 1) * (static_cast<Float64>(count_b.size()) - 1) / (count - 1));
Float64 correction_a = count_a.size() - (static_cast<Float64>(count_a.size()) - 1) * (static_cast<Float64>(count_a.size()) - 1) / (count - 1);
Float64 correction_b = count_b.size() - (static_cast<Float64>(count_b.size()) - 1) * (static_cast<Float64>(count_b.size()) - 1) / (count - 1);
res /= std::min(correction_a, correction_b) - 1;
return sqrt(res);
}
};

View File

@ -69,6 +69,47 @@ struct CrossTabData
count_b.read(buf);
count_ab.read(buf);
}
/** Returns φ²: Pearson's χ² statistic of the (a, b) contingency table divided by the sample size (count).
  * See https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
  *
  * χ² is the sum of squares of the normalized differences between the "expected" and "observed" statistics.
  * ("Expected" in the case when one of the hypotheses is true).
  * Something resembling the L2 distance.
  *
  * Note: statisticians use the name χ² for every statistic that has χ² distribution in many various contexts.
  *
  * Let's suppose that there is no association between the values a and b.
  * Then the frequency (e.g. probability) of (a, b) pair is equal to the multiplied frequencies of a and b:
  *   count_ab / count = (count_a / count) * (count_b / count)
  *   count_ab = count_a * count_b / count
  *
  * Let's calculate the difference between the values that are supposed to be equal if there is no association between a and b:
  *   count_ab - count_a * count_b / count
  *
  * Let's square the difference and divide it by the second term for normalization: (count_a * count_b / count).
  * Then sum these values across all (a, b) pairs, including the pairs that never occurred together:
  * for such a pair count_ab is zero and the term is equal to its expected value (count_a * count_b / count).
  *
  * This will be the χ² statistic.
  * This statistic is used as a base for many other statistics.
  *
  * The result is NaN when count is zero (0 / 0); the callers visible in this commit check count before calling.
  */
Float64 getPhiSquared() const
{
    Float64 chi_squared = 0;

    /// Sum of the expected values over the pairs that are present in count_ab.
    Float64 visited_expected_sum = 0;

    for (const auto & [key, value_ab] : count_ab)
    {
        Float64 value_a = count_a.at(key.items[0]);
        Float64 value_b = count_b.at(key.items[1]);

        Float64 expected_value_ab = (value_a * value_b) / count;
        Float64 difference = value_ab - expected_value_ab;

        chi_squared += difference * difference / expected_value_ab;
        visited_expected_sum += expected_value_ab;
    }

    /// The expected values over the whole table sum up to count, so the remainder is the total
    /// contribution of the pairs that are absent from count_ab (observed zero times).
    /// NOTE(review): this assumes that count_a and count_b hold the marginal totals (each summing up to count)
    /// and that count_ab has no entries for pairs that never occurred - confirm against add() / merge().
    /// If count_ab does list every pair, the remainder is just rounding noise around zero.
    Float64 unvisited_expected_sum = count - visited_expected_sum;

    /// Only a positive remainder is added, so rounding errors cannot make the statistic negative.
    if (unvisited_expected_sum > 0)
        chi_squared += unvisited_expected_sum;

    return chi_squared / count;
}
};