add student t test

2024-11-21 15:12:02 +00:00 · 2020-10-12 21:10:01 +03:00 · 2020-10-12 21:10:01 +03:00 · e65a2a1cbd
commit e65a2a1cbd
parent 20ebd4fd5b
4 changed files with 288 additions and 0 deletions
--- a/src/AggregateFunctions/AggregateFunctionStudentTTest.cpp
+++ b/src/AggregateFunctions/AggregateFunctionStudentTTest.cpp
@ -0,0 +1,56 @@
+#include <AggregateFunctions/AggregateFunctionFactory.h>
+#include <AggregateFunctions/AggregateFunctionStudentTTest.h>
+#include <AggregateFunctions/FactoryHelpers.h>
+#include "registerAggregateFunctions.h"
+
+#include <AggregateFunctions/Helpers.h>
+#include <DataTypes/DataTypeAggregateFunction.h>
+
+
+/// The aggregate function returns the p-value of the test as Float64.
+
+namespace ErrorCodes
+{
+extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+extern const int NOT_IMPLEMENTED;
+}
+
+namespace DB
+{
+
+namespace
+{
+
+/// Factory callback: builds the StudentTTest aggregate function for a pair of argument types.
+/// Requires exactly two arguments and no parameters (checked by assertBinary / assertNoParameters).
+/// Throws NOT_IMPLEMENTED when either argument is Decimal or when no implementation
+/// could be instantiated for the given pair of types.
+AggregateFunctionPtr createAggregateFunctionStudentTTest(const std::string & name, const DataTypes & argument_types, const Array & parameters)
+{
+    assertBinary(name, argument_types);
+    assertNoParameters(name, parameters);
+
+    /// Decimal arguments are rejected before any instantiation is attempted.
+    const bool has_decimal_argument = isDecimal(argument_types[0]) || isDecimal(argument_types[1]);
+    if (has_decimal_argument)
+        throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::NOT_IMPLEMENTED);
+
+    AggregateFunctionPtr function(
+        createWithTwoNumericTypes<AggregateFunctionStudentTTest>(*argument_types[0], *argument_types[1], argument_types));
+
+    /// A null result is reported to the user as an unsupported combination of types.
+    if (!function)
+        throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::NOT_IMPLEMENTED);
+
+    return function;
+}
+
+}
+
+
+/// Registers the function under the name "StudentTTest"; the name is matched case-insensitively.
+void registerAggregateFunctionStudentTTest(AggregateFunctionFactory & factory)
+{
+    const auto case_sensitiveness = AggregateFunctionFactory::CaseInsensitive;
+    factory.registerFunction("StudentTTest", createAggregateFunctionStudentTTest, case_sensitiveness);
+}
+
+}
--- a/src/AggregateFunctions/AggregateFunctionStudentTTest.h
+++ b/src/AggregateFunctions/AggregateFunctionStudentTTest.h
@ -0,0 +1,230 @@
+#pragma once
+
+#include <AggregateFunctions/IAggregateFunction.h>
+#include <Columns/ColumnVector.h>
+#include <Columns/ColumnTuple.h>
+#include <Common/assert_cast.h>
+#include <Common/FieldVisitors.h>
+#include <Core/Types.h>
+#include <DataTypes/DataTypesDecimal.h>
+#include <DataTypes/DataTypeNullable.h>
+#include <DataTypes/DataTypesNumber.h>
+#include <DataTypes/DataTypeTuple.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+#include <limits>
+#include <cmath>
+#include <functional>
+
+#include <type_traits>
+
+namespace ErrorCodes
+{
+extern const int BAD_ARGUMENTS;
+}
+
+namespace DB
+{
+
+/** Running state of the two-sample Student's t-test (pooled-variance form).
+  *
+  * Only the sample sizes, the sums and the sums of squares are accumulated, so the
+  * original sequences are never stored and two states can be merged by simple addition.
+  *
+  * All sums are kept in Float64 regardless of the argument types X and Y: accumulating
+  * (or squaring) integer inputs in their own type would silently wrap around.
+  * As a consequence the serialized state always contains six Float64 values followed
+  * by the two sizes, independently of X and Y.
+  *
+  * add() consumes one x and one y per row, so both samples always have the same size.
+  */
+template <typename X = Float64, typename Y = Float64>
+struct AggregateFunctionStudentTTestData final
+{
+    size_t size_x = 0;
+    size_t size_y = 0;
+    Float64 sum_x = 0;
+    Float64 sum_y = 0;
+    Float64 square_sum_x = 0;
+    Float64 square_sum_y = 0;
+    Float64 mean_x = 0;
+    Float64 mean_y = 0;
+
+    /// Recomputes the cached means from the sums; an empty sample has mean 0 (not NaN).
+    void updateMeans()
+    {
+        mean_x = size_x ? sum_x / size_x : 0.0;
+        mean_y = size_y ? sum_y / size_y : 0.0;
+    }
+
+    /// Adds one observation to each of the two samples.
+    void add(X x, Y y)
+    {
+        /// Convert first: x * x in the argument type may overflow for integer types.
+        const Float64 value_x = static_cast<Float64>(x);
+        const Float64 value_y = static_cast<Float64>(y);
+
+        sum_x += value_x;
+        sum_y += value_y;
+        square_sum_x += value_x * value_x;
+        square_sum_y += value_y * value_y;
+        ++size_x;
+        ++size_y;
+        updateMeans();
+    }
+
+    /// Combines this state with another one; the result is the state of the concatenated samples.
+    void merge(const AggregateFunctionStudentTTestData & other)
+    {
+        sum_x += other.sum_x;
+        sum_y += other.sum_y;
+        square_sum_x += other.square_sum_x;
+        square_sum_y += other.square_sum_y;
+        size_x += other.size_x;
+        size_y += other.size_y;
+        updateMeans();
+    }
+
+    /// Writes the state; the field order must match deserialize().
+    void serialize(WriteBuffer & buf) const
+    {
+        writeBinary(mean_x, buf);
+        writeBinary(mean_y, buf);
+        writeBinary(sum_x, buf);
+        writeBinary(sum_y, buf);
+        writeBinary(square_sum_x, buf);
+        writeBinary(square_sum_y, buf);
+        writeBinary(size_x, buf);
+        writeBinary(size_y, buf);
+    }
+
+    /// Reads the state; the field order must match serialize().
+    void deserialize(ReadBuffer & buf)
+    {
+        readBinary(mean_x, buf);
+        readBinary(mean_y, buf);
+        readBinary(sum_x, buf);
+        readBinary(sum_y, buf);
+        readBinary(square_sum_x, buf);
+        readBinary(square_sum_y, buf);
+        readBinary(size_x, buf);
+        readBinary(size_y, buf);
+    }
+
+    size_t getSizeY() const
+    {
+        return size_y;
+    }
+
+    size_t getSizeX() const
+    {
+        return size_x;
+    }
+
+    /// Pooled sample variance:
+    ///   s^2 = (sum_i (x_i - mean_x)^2 + sum_j (y_j - mean_y)^2) / (size_x + size_y - 2)
+    /// Each sum of squared deviations is expanded so that only accumulated values are needed:
+    ///   sum_i (x_i - mean_x)^2 = sum_i x_i^2 + size_x * mean_x^2 - 2 * mean_x * sum_i x_i
+    /// The square root is not taken because callers only use s^2.
+    /// The result is meaningful only when size_x + size_y > 2.
+    Float64 getSSquared() const
+    {
+        const Float64 all_x = square_sum_x + size_x * mean_x * mean_x - 2 * mean_x * sum_x;
+        const Float64 all_y = square_sum_y + size_y * mean_y * mean_y - 2 * mean_y * sum_y;
+        return (all_x + all_y) / getDegreesOfFreedom();
+    }
+
+    /// Squared t statistic: t^2 = (mean_x - mean_y)^2 / (s^2 * (1 / size_x + 1 / size_y)).
+    /// Throws BAD_ARGUMENTS if either sample is empty.
+    Float64 getTStatisticSquared() const
+    {
+        if (size_x == 0 || size_y == 0)
+        {
+            throw Exception("Division by zero encountered in Aggregate function StudentTTest", ErrorCodes::BAD_ARGUMENTS);
+        }
+
+        /// Means that differ by less than 1e-8 in absolute value are treated as equal.
+        /// The absolute value matters: the sign of the difference must not affect the test.
+        const Float64 mean_difference = mean_x - mean_y;
+        if (std::abs(mean_difference) < 1e-8)
+        {
+            return 0.0;
+        }
+
+        return mean_difference * mean_difference / getStandartErrorSquared();
+    }
+
+    /// Squared standard error of the difference of means: s^2 * (1 / size_x + 1 / size_y).
+    Float64 getStandartErrorSquared() const
+    {
+        /// 1.0, not 1: the sizes are integers and 1 / size would be integer division.
+        return getSSquared() * (1.0 / size_x + 1.0 / size_y);
+    }
+
+    /// Degrees of freedom: size_x + size_y - 2.
+    Float64 getDegreesOfFreedom() const
+    {
+        /// Subtract in floating point so that tiny samples cannot wrap around in size_t.
+        return static_cast<Float64>(size_x + size_y) - 2;
+    }
+
+    /// Composite Simpson's rule for the integral of func over [a, b], using `iterations`
+    /// equal subintervals. `iterations` is expected to be even.
+    static Float64 integrateSimpson(Float64 a, Float64 b, std::function<Float64(Float64)> func, size_t iterations = 1000000)
+    {
+        const Float64 h = (b - a) / iterations;
+
+        Float64 sum_odds = 0.0;
+        for (size_t i = 1; i < iterations; i += 2)
+            sum_odds += func(a + i * h);
+
+        Float64 sum_evens = 0.0;
+        for (size_t i = 2; i < iterations; i += 2)
+            sum_evens += func(a + i * h);
+
+        return (func(a) + func(b) + 2 * sum_evens + 4 * sum_odds) * h / 3;
+    }
+
+    /// Two-sided p-value, computed as the regularized incomplete beta function
+    ///   I_z(v / 2, 1 / 2) with z = v / (t^2 + v),
+    /// i.e. integral_0^z x^(v/2 - 1) * (1 - x)^(-1/2) dx divided by B(v / 2, 1 / 2),
+    /// where v is the number of degrees of freedom and t^2 the squared t statistic.
+    Float64 getPValue() const
+    {
+        const Float64 v = getDegreesOfFreedom();
+        const Float64 t = getTStatisticSquared();
+
+        /// With t^2 == 0 the upper limit is exactly 1, where the integrand is singular
+        /// and the numerical integration would produce inf; the exact value there is 1.
+        if (t == 0)
+            return 1.0;
+
+        auto f = [v](Float64 x) { return std::pow(x, v / 2 - 1) / std::sqrt(1 - x); };
+        const Float64 numerator = integrateSimpson(0, v / (t + v), f);
+        /// B(v / 2, 1 / 2) = Gamma(v / 2) * Gamma(1 / 2) / Gamma(v / 2 + 1 / 2), evaluated via logarithms.
+        const Float64 denominator = std::exp(std::lgamma(v / 2) + std::lgamma(0.5) - std::lgamma(v / 2 + 0.5));
+        return numerator / denominator;
+    }
+
+    /// Final result of the aggregate function: the p-value.
+    Float64 getResult() const
+    {
+        return getPValue();
+    }
+};
+
+/// Aggregate function StudentTTest(x, y): takes two numeric columns and returns
+/// the p-value of the test as Float64.
+/// X and Y are the element types of the two argument columns.
+/// https://cpb-us-w2.wpmucdn.com/voices.uchicago.edu/dist/9/1193/files/2016/01/05b-TandP.pdf
+template <typename X = Float64, typename Y = Float64>
+class AggregateFunctionStudentTTest :
+    public IAggregateFunctionDataHelper<AggregateFunctionStudentTTestData<X, Y>, AggregateFunctionStudentTTest<X, Y>>
+{
+
+public:
+    /// `arguments` are the argument types; the function takes no parameters.
+    AggregateFunctionStudentTTest(const DataTypes & arguments)
+        : IAggregateFunctionDataHelper<AggregateFunctionStudentTTestData<X, Y>, AggregateFunctionStudentTTest<X, Y>>(arguments, {})
+    {}
+
+    String getName() const override
+    {
+        return "StudentTTest";
+    }
+
+    DataTypePtr getReturnType() const override
+    {
+        return std::make_shared<DataTypeNumber<Float64>>();
+    }
+
+    /// Feeds row `row_num` of both argument columns into the state.
+    void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena *) const override
+    {
+        const auto * col_x = assert_cast<const ColumnVector<X> *>(columns[0]);
+        const auto * col_y = assert_cast<const ColumnVector<Y> *>(columns[1]);
+
+        const X x = col_x->getData()[row_num];
+        const Y y = col_y->getData()[row_num];
+
+        this->data(place).add(x, y);
+    }
+
+    void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
+    {
+        this->data(place).merge(this->data(rhs));
+    }
+
+    void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
+    {
+        this->data(place).serialize(buf);
+    }
+
+    void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override
+    {
+        this->data(place).deserialize(buf);
+    }
+
+    /// Appends the p-value to the result column.
+    /// Throws BAD_ARGUMENTS unless both samples contain at least two values.
+    void insertResultInto(AggregateDataPtr place, IColumn & to, Arena * /*arena*/) const override
+    {
+        const auto & state = this->data(place);
+
+        if (state.getSizeX() < 2 || state.getSizeY() < 2)
+        {
+            throw Exception("Aggregate function " + getName() + " requires samples to be of size > 1", ErrorCodes::BAD_ARGUMENTS);
+        }
+
+        /// assert_cast for consistency with add(), instead of an unchecked static_cast.
+        auto & column = assert_cast<ColumnVector<Float64> &>(to);
+        column.getData().push_back(state.getResult());
+    }
+
+};
+
+};
--- a/src/AggregateFunctions/registerAggregateFunctions.cpp
+++ b/src/AggregateFunctions/registerAggregateFunctions.cpp
@ -46,6 +46,7 @@ void registerAggregateFunctions()
        registerAggregateFunctionCategoricalIV(factory);
        registerAggregateFunctionAggThrow(factory);
        registerAggregateFunctionWelchTTest(factory);
+        registerAggregateFunctionStudentTTest(factory);
        registerAggregateFunctionRankCorrelation(factory);
    }

--- a/src/AggregateFunctions/registerAggregateFunctions.h
+++ b/src/AggregateFunctions/registerAggregateFunctions.h
@ -36,6 +36,7 @@ void registerAggregateFunctionMoving(AggregateFunctionFactory &);
 void registerAggregateFunctionCategoricalIV(AggregateFunctionFactory &);
 void registerAggregateFunctionAggThrow(AggregateFunctionFactory &);
 void registerAggregateFunctionWelchTTest(AggregateFunctionFactory &);
+void registerAggregateFunctionStudentTTest(AggregateFunctionFactory &);
 void registerAggregateFunctionRankCorrelation(AggregateFunctionFactory &);

 class AggregateFunctionCombinatorFactory;