2020-11-05 19:08:49 +00:00
|
|
|
#include <AggregateFunctions/AggregateFunctionFactory.h>
|
2020-11-02 09:50:08 +00:00
|
|
|
#include <AggregateFunctions/AggregateFunctionTTest.h>
|
2020-11-05 19:08:49 +00:00
|
|
|
#include <AggregateFunctions/FactoryHelpers.h>
|
2020-11-02 09:50:08 +00:00
|
|
|
#include <AggregateFunctions/Moments.h>
|
2020-11-05 19:08:49 +00:00
|
|
|
|
|
|
|
|
|
|
|
/// Forward declarations of the error codes thrown in this file.
/// They are declared inside DB because every use below lives in namespace DB,
/// where the unqualified name `ErrorCodes::...` resolves to DB::ErrorCodes.
/// Declaring them at global scope would introduce unrelated ::ErrorCodes symbols
/// that are never referenced here.
/// NOTE(review): assumes the project defines its error codes in DB::ErrorCodes - confirm.
namespace DB
{
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
}
|
|
|
|
|
2020-11-02 09:50:08 +00:00
|
|
|
|
2020-11-05 19:08:49 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
2021-05-26 11:32:14 +00:00
|
|
|
struct Settings;
|
2020-11-05 19:08:49 +00:00
|
|
|
|
|
|
|
namespace
|
|
|
|
{
|
|
|
|
|
2020-11-02 16:44:12 +00:00
|
|
|
/** Student T-test applies to two samples of independent random variables
|
|
|
|
* that have normal distributions with equal (but unknown) variances.
|
|
|
|
* It allows to answer the question whether means of the distributions differ.
|
|
|
|
*
|
|
|
|
* If variances are not considered equal, Welch T-test should be used instead.
|
|
|
|
*/
|
2020-11-02 09:50:08 +00:00
|
|
|
struct StudentTTestData : public TTestMoments<Float64>
|
2020-11-05 19:08:49 +00:00
|
|
|
{
|
2020-11-02 09:50:08 +00:00
|
|
|
static constexpr auto name = "studentTTest";
|
2020-11-05 19:08:49 +00:00
|
|
|
|
2022-01-15 10:33:27 +00:00
|
|
|
bool hasEnoughObservations() const
|
|
|
|
{
|
|
|
|
return nx > 0 && ny > 0 && nx + ny > 2;
|
|
|
|
}
|
|
|
|
|
2021-11-09 03:08:13 +00:00
|
|
|
Float64 getDegreesOfFreedom() const
|
2020-11-05 19:08:49 +00:00
|
|
|
{
|
2021-11-09 03:08:13 +00:00
|
|
|
return nx + ny - 2;
|
|
|
|
}
|
|
|
|
|
|
|
|
std::tuple<Float64, Float64> getResult() const
|
|
|
|
{
|
|
|
|
Float64 mean_x = getMeanX();
|
|
|
|
Float64 mean_y = getMeanY();
|
2020-11-02 09:50:08 +00:00
|
|
|
|
2020-11-02 16:44:12 +00:00
|
|
|
/// To estimate the variance we first estimate two means.
|
|
|
|
/// That's why the number of degrees of freedom is the total number of values of both samples minus 2.
|
2021-11-09 03:08:13 +00:00
|
|
|
Float64 degrees_of_freedom = getDegreesOfFreedom();
|
2020-11-02 09:50:08 +00:00
|
|
|
|
2020-11-02 16:44:12 +00:00
|
|
|
/// Calculate s^2
|
2020-11-02 09:50:08 +00:00
|
|
|
/// The original formulae looks like
|
|
|
|
/// \frac{\sum_{i = 1}^{n_x}{(x_i - \bar{x}) ^ 2} + \sum_{i = 1}^{n_y}{(y_i - \bar{y}) ^ 2}}{n_x + n_y - 2}
|
|
|
|
/// But we made some mathematical transformations not to store original sequences.
|
|
|
|
/// Also we dropped sqrt, because later it will be squared later.
|
|
|
|
|
2020-11-06 17:48:58 +00:00
|
|
|
Float64 all_x = x2 + nx * mean_x * mean_x - 2 * mean_x * x1;
|
|
|
|
Float64 all_y = y2 + ny * mean_y * mean_y - 2 * mean_y * y1;
|
2020-11-02 09:50:08 +00:00
|
|
|
|
|
|
|
Float64 s2 = (all_x + all_y) / degrees_of_freedom;
|
2020-11-25 14:45:27 +00:00
|
|
|
Float64 std_err2 = s2 * (1. / nx + 1. / ny);
|
2020-11-02 09:50:08 +00:00
|
|
|
|
2020-11-02 16:44:12 +00:00
|
|
|
/// t-statistic
|
2020-11-02 09:50:08 +00:00
|
|
|
Float64 t_stat = (mean_x - mean_y) / sqrt(std_err2);
|
|
|
|
|
|
|
|
return {t_stat, getPValue(degrees_of_freedom, t_stat * t_stat)};
|
2020-11-05 19:08:49 +00:00
|
|
|
}
|
2020-11-02 09:50:08 +00:00
|
|
|
};
|
2020-11-05 19:08:49 +00:00
|
|
|
|
2021-06-07 00:15:11 +00:00
|
|
|
/// Factory callback for the Student's t-test aggregate function.
/// Requires exactly two arguments (checked by assertBinary), both of numeric types,
/// and accepts at most one parameter, which is forwarded to AggregateFunctionTTest.
/// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH when more than one parameter is given
/// and BAD_ARGUMENTS when an argument type is not a number.
AggregateFunctionPtr createAggregateFunctionStudentTTest(
    const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{
    assertBinary(name, argument_types);

    const bool too_many_parameters = parameters.size() > 1;
    if (too_many_parameters)
        throw Exception("Aggregate function " + name + " requires zero or one parameter.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);

    const bool both_numeric = isNumber(argument_types[0]) && isNumber(argument_types[1]);
    if (!both_numeric)
        throw Exception("Aggregate function " + name + " only supports numerical types", ErrorCodes::BAD_ARGUMENTS);

    using Function = AggregateFunctionTTest<StudentTTestData>;
    return std::make_shared<Function>(argument_types, parameters);
}
|
2020-11-02 09:50:08 +00:00
|
|
|
|
2020-11-05 19:08:49 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Makes the Student's t-test aggregate function available in the given factory
/// under the name "studentTTest".
void registerAggregateFunctionStudentTTest(AggregateFunctionFactory & factory)
{
    factory.registerFunction("studentTTest", createAggregateFunctionStudentTTest);
}
|
2020-11-02 09:50:08 +00:00
|
|
|
|
2020-11-05 19:08:49 +00:00
|
|
|
}
|