Make at least some sense

2024-11-10 09:32:06 +00:00 · 2022-01-03 22:18:57 +03:00 · 2022-01-03 22:18:57 +03:00 · ac84015f01
commit ac84015f01
parent 04a39317c1
4 changed files with 50 additions and 35 deletions
--- a/src/AggregateFunctions/AggregateFunctionContingencyCoefficient.cpp
+++ b/src/AggregateFunctions/AggregateFunctionContingencyCoefficient.cpp
@ -23,16 +23,7 @@ struct ContingencyData : CrossTabData
        if (count < 2)
            return std::numeric_limits<Float64>::quiet_NaN();

-        Float64 phi = 0.0;
-        for (const auto & [key, value_ab] : count_ab)
-        {
-            Float64 value_a = count_a.at(key.items[0]);
-            Float64 value_b = count_b.at(key.items[1]);
-
-            phi += value_ab * value_ab / (value_a * value_b) * count - 2 * value_ab + (value_a * value_b) / count;
-        }
-        phi /= count;
-
+        Float64 phi = getPhiSquared();
        return sqrt(phi / (phi + count));
    }
 };
--- a/src/AggregateFunctions/AggregateFunctionCramersV.cpp
+++ b/src/AggregateFunctions/AggregateFunctionCramersV.cpp
@ -23,20 +23,8 @@ struct CramersVData : CrossTabData
        if (count < 2)
            return std::numeric_limits<Float64>::quiet_NaN();

-        Float64 phi = 0.0;
-        for (const auto & [key, value_ab] : count_ab)
-        {
-            Float64 value_a = count_a.at(key.items[0]);
-            Float64 value_b = count_b.at(key.items[1]);
-
-            phi += value_ab * value_ab / (value_a * value_b) * count - 2 * value_ab + (value_a * value_b) / count;
-        }
-
-        phi /= count;
        UInt64 q = std::min(count_a.size(), count_b.size());
-        phi /= q - 1;
-
-        return sqrt(phi);
+        return sqrt(getPhiSquared() / (q - 1));
    }
 };

--- a/src/AggregateFunctions/AggregateFunctionCramersVBiasCorrected.cpp
+++ b/src/AggregateFunctions/AggregateFunctionCramersVBiasCorrected.cpp
@ -23,22 +23,17 @@ struct CramersVBiasCorrectedData : CrossTabData
        if (count < 2)
            return std::numeric_limits<Float64>::quiet_NaN();

-        Float64 phi = 0.0;
-        for (const auto & [key, value_ab] : count_ab)
-        {
-            Float64 value_a = count_a.at(key.items[0]);
-            Float64 value_b = count_b.at(key.items[1]);
+        Float64 phi = getPhiSquared();

-            phi += value_ab * value_ab / (value_a * value_b) * count - 2 * value_ab + (value_a * value_b) / count;
-        }
+        Float64 a_size_adjusted = count_a.size() - 1;
+        Float64 b_size_adjusted = count_b.size() - 1;
+        Float64 count_adjusted = count - 1;

-        phi /= count;
+        Float64 res = std::max(0.0, phi - a_size_adjusted * b_size_adjusted / count_adjusted);
+        Float64 correction_a = count_a.size() - a_size_adjusted * a_size_adjusted / count_adjusted;
+        Float64 correction_b = count_b.size() - b_size_adjusted * b_size_adjusted / count_adjusted;

-        Float64 res = std::max(0.0, phi - (static_cast<Float64>(count_a.size()) - 1) * (static_cast<Float64>(count_b.size()) - 1) / (count - 1));
-        Float64 correction_a = count_a.size() - (static_cast<Float64>(count_a.size()) - 1) * (static_cast<Float64>(count_a.size()) - 1) / (count - 1);
-        Float64 correction_b = count_b.size() - (static_cast<Float64>(count_b.size()) - 1) * (static_cast<Float64>(count_b.size()) - 1) / (count - 1);
        res /= std::min(correction_a, correction_b) - 1;
-
        return sqrt(res);
    }
 };
--- a/src/AggregateFunctions/CrossTab.h
+++ b/src/AggregateFunctions/CrossTab.h
@ -69,6 +69,47 @@ struct CrossTabData
        count_b.read(buf);
        count_ab.read(buf);
    }
+
+    /** Computes φ² = χ² / count for the (a, b) contingency table; see https://en.wikipedia.org/wiki/Cram%C3%A9r%27s_V
+      *
+      * χ² is the sum over all cells of (observed - expected)² / expected - something resembling the L2 distance.
+      * Note: statisticians use the name χ² for every statistic that has a χ² distribution, in many various contexts.
+      *
+      * Suppose that there is no association between the values a and b.
+      * Then the frequency (i.e. probability) of an (a, b) pair is the product of the frequencies of a and b:
+      * count_ab / count = (count_a / count) * (count_b / count)
+      * count_ab = count_a * count_b / count
+      * This is the "expected" value; the difference from the observed one is count_ab - count_a * count_b / count.
+      * The squared difference is divided by the expected value for normalization and summed across all (a, b) pairs,
+      * including the pairs that never occurred together: count_ab is presumed to hold only the observed pairs,
+      * while an absent pair has observed = 0 and therefore contributes exactly its expected value.
+      *
+      * This χ² statistic is used as a base for many other statistics.
+      * The caller must ensure count > 0 (the visible callers return NaN earlier when count < 2).
+      */
+    Float64 getPhiSquared() const
+    {
+        Float64 chi_squared = 0;
+        Float64 expected_in_observed_cells = 0;
+        for (const auto & [key, value_ab] : count_ab)
+        {
+            Float64 value_a = count_a.at(key.items[0]);
+            Float64 value_b = count_b.at(key.items[1]);
+
+            Float64 expected_value_ab = (value_a * value_b) / count;
+            Float64 difference = value_ab - expected_value_ab;
+
+            chi_squared += difference * difference / expected_value_ab;
+            expected_in_observed_cells += expected_value_ab;
+        }
+
+        /// Expected values over all cells sum to count; the remainder belongs to cells absent from count_ab.
+        Float64 expected_in_absent_cells = count - expected_in_observed_cells;
+        if (expected_in_absent_cells > 0) /// Guard against a tiny negative remainder caused by rounding.
+            chi_squared += expected_in_absent_cells;
+
+        return chi_squared / count;
+    }
 };