mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
Replace vector with PODArray and add testcases
This commit is contained in:
parent
769a6dfb08
commit
3554e6c36a
@ -29,16 +29,16 @@ static const String BETA = "beta";
|
||||
static const String GAMMA = "gamma";
|
||||
|
||||
template <bool higher_is_better>
|
||||
Variants bayesian_ab_test(String distribution, std::vector<double> xs, std::vector<double> ys)
|
||||
Variants bayesian_ab_test(String distribution, PODArray<Float64> & xs, PODArray<Float64> & ys)
|
||||
{
|
||||
const size_t r = 1000, c = 100;
|
||||
|
||||
Variants variants(xs.size());
|
||||
std::vector<std::vector<double>> samples_matrix;
|
||||
Variants variants(xs.size(), {0.0, 0.0});
|
||||
std::vector<std::vector<Float64>> samples_matrix;
|
||||
|
||||
if (distribution == BETA)
|
||||
{
|
||||
double alpha, beta;
|
||||
Float64 alpha, beta;
|
||||
|
||||
for (size_t i = 0; i < xs.size(); ++i)
|
||||
if (xs[i] < ys[i])
|
||||
@ -49,29 +49,32 @@ Variants bayesian_ab_test(String distribution, std::vector<double> xs, std::vect
|
||||
alpha = 1.0 + ys[i];
|
||||
beta = 1.0 + xs[i] - ys[i];
|
||||
|
||||
samples_matrix.push_back(stats::rbeta<std::vector<double>>(r, c, alpha, beta));
|
||||
samples_matrix.emplace_back(std::move(stats::rbeta<std::vector<Float64>>(r, c, alpha, beta)));
|
||||
}
|
||||
}
|
||||
else if (distribution == GAMMA)
|
||||
{
|
||||
double shape, scale;
|
||||
Float64 shape, scale;
|
||||
|
||||
for (size_t i = 0; i < xs.size(); ++i)
|
||||
{
|
||||
shape = 1.0 + xs[i];
|
||||
scale = 250.0 / (1 + 250.0 * ys[i]);
|
||||
std::vector<double> samples = stats::rgamma<std::vector<double>>(r, c, shape, scale);
|
||||
|
||||
std::vector<Float64> samples = stats::rgamma<std::vector<Float64>>(r, c, shape, scale);
|
||||
for (size_t j = 0; j < samples.size(); ++j)
|
||||
samples[j] = 1 / samples[j];
|
||||
samples_matrix.push_back(samples);
|
||||
samples_matrix.emplace_back(std::move(samples));
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<double> means;
|
||||
PODArray<Float64> means;
|
||||
for (size_t i = 0; i < xs.size(); ++i)
|
||||
{
|
||||
auto mean = accumulate(samples_matrix[i].begin(), samples_matrix[i].end(), 0.0) / samples_matrix[i].size();
|
||||
means.push_back(mean);
|
||||
Float64 total = 0.0;
|
||||
for (size_t j = 0; j < samples_matrix[i].size(); ++j)
|
||||
total += samples_matrix[i][j];
|
||||
means.push_back(total / samples_matrix[i].size());
|
||||
}
|
||||
|
||||
// Beats control
|
||||
@ -93,18 +96,18 @@ Variants bayesian_ab_test(String distribution, std::vector<double> xs, std::vect
|
||||
}
|
||||
|
||||
for (size_t i = 1; i < xs.size(); ++i)
|
||||
variants[i].beats_control = static_cast<double>(variants[i].beats_control) / r / c;
|
||||
variants[i].beats_control = static_cast<Float64>(variants[i].beats_control) / r / c;
|
||||
|
||||
// To be best
|
||||
std::vector<size_t> count_m(xs.size(), 0);
|
||||
std::vector<double> row(xs.size(), 0);
|
||||
PODArray<size_t> count_m(xs.size(), 0);
|
||||
PODArray<Float64> row(xs.size(), 0);
|
||||
|
||||
for (size_t n = 0; n < r * c; ++n)
|
||||
{
|
||||
for (size_t i = 0; i < xs.size(); ++i)
|
||||
row[i] = samples_matrix[i][n];
|
||||
|
||||
double m;
|
||||
Float64 m;
|
||||
if (higher_is_better)
|
||||
m = *std::max_element(row.begin(), row.end());
|
||||
else
|
||||
@ -121,11 +124,37 @@ Variants bayesian_ab_test(String distribution, std::vector<double> xs, std::vect
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < xs.size(); ++i)
|
||||
variants[i].best = static_cast<double>(variants[i].best) / r / c;
|
||||
variants[i].best = static_cast<Float64>(variants[i].best) / r / c;
|
||||
|
||||
return variants;
|
||||
}
|
||||
|
||||
String convertToJson(const PODArray<String> & variant_names, const Variants & variants)
|
||||
{
|
||||
FormatSettings settings;
|
||||
std::stringstream s;
|
||||
|
||||
{
|
||||
WriteBufferFromOStream buf(s);
|
||||
|
||||
writeCString("{\"data\":[", buf);
|
||||
for (size_t i = 0; i < variants.size(); ++i)
|
||||
{
|
||||
writeCString("{\"variant_name\":", buf);
|
||||
writeJSONString(variant_names[i], buf, settings);
|
||||
writeCString(",\"beats_control\":", buf);
|
||||
writeText(variants[i].beats_control, buf);
|
||||
writeCString(",\"to_be_best\":", buf);
|
||||
writeText(variants[i].best, buf);
|
||||
writeCString("}", buf);
|
||||
if (i != variant_names.size() -1) writeCString(",", buf);
|
||||
}
|
||||
writeCString("]}", buf);
|
||||
}
|
||||
|
||||
return s.str();
|
||||
}
|
||||
|
||||
class FunctionBayesAB : public IFunction
|
||||
{
|
||||
public:
|
||||
@ -159,8 +188,8 @@ public:
|
||||
return;
|
||||
}
|
||||
|
||||
std::vector<double> xs, ys;
|
||||
std::vector<std::string> variant_names;
|
||||
PODArray<Float64> xs, ys;
|
||||
PODArray<String> variant_names;
|
||||
String dist;
|
||||
bool higher_is_better;
|
||||
|
||||
@ -192,7 +221,7 @@ public:
|
||||
if (const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(block.getByPosition(arguments[3]).column.get()))
|
||||
{
|
||||
if (!col_const_arr)
|
||||
throw Exception("Forth argument for function " + getName() + " must be Array of constant doubles", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
throw Exception("Forth argument for function " + getName() + " must be Array of constant flaot64", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
Array src_arr = col_const_arr->getValue<Array>();
|
||||
|
||||
@ -203,7 +232,7 @@ public:
|
||||
if (const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(block.getByPosition(arguments[4]).column.get()))
|
||||
{
|
||||
if (!col_const_arr)
|
||||
throw Exception("Fifth argument for function " + getName() + " must be Array of constant doubles", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
throw Exception("Fifth argument for function " + getName() + " must be Array of constant float64", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
Array src_arr = col_const_arr->getValue<Array>();
|
||||
|
||||
@ -214,12 +243,11 @@ public:
|
||||
if (variant_names.size() != xs.size() || xs.size() != ys.size())
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Sizes of arguments doen't match: variant_names: {}, xs: {}, ys: {}", variant_names.size(), xs.size(), ys.size());
|
||||
|
||||
if (std::count_if(xs.begin(), xs.end(), [](double v) { return v < 0; }) > 0 ||
|
||||
std::count_if(ys.begin(), ys.end(), [](double v) { return v < 0; }) > 0)
|
||||
if (std::count_if(xs.begin(), xs.end(), [](Float64 v) { return v < 0; }) > 0 ||
|
||||
std::count_if(ys.begin(), ys.end(), [](Float64 v) { return v < 0; }) > 0)
|
||||
throw Exception("Negative values don't allowed", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
Variants variants;
|
||||
|
||||
if (higher_is_better)
|
||||
variants = bayesian_ab_test<true>(dist, xs, ys);
|
||||
else
|
||||
@ -237,9 +265,9 @@ public:
|
||||
writeCString("{\"variant_name\":", buf);
|
||||
writeJSONString(variant_names[i], buf, settings);
|
||||
writeCString(",\"beats_control\":", buf);
|
||||
writeText(variants[i].beats_control, buf);
|
||||
writeFloatText<Float64>(variants[i].beats_control, buf);
|
||||
writeCString(",\"to_be_best\":", buf);
|
||||
writeText(variants[i].best, buf);
|
||||
writeFloatText<Float64>(variants[i].best, buf);
|
||||
writeCString("}", buf);
|
||||
if (i != xs.size() -1) writeCString(",", buf);
|
||||
}
|
||||
@ -247,7 +275,7 @@ public:
|
||||
}
|
||||
|
||||
auto dst = ColumnString::create();
|
||||
std::string result_str = s.str();
|
||||
std::string result_str = convertToJson(variant_names, variants).c_str();
|
||||
dst->insertData(result_str.c_str(), result_str.length());
|
||||
block.getByPosition(result).column = std::move(dst);
|
||||
}
|
||||
|
@ -5,21 +5,25 @@
|
||||
#include <vector>
|
||||
#include <algorithm>
|
||||
|
||||
#include <Core/Types.h>
|
||||
#include <Common/PODArray.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
typedef struct _Variant
|
||||
{
|
||||
double beats_control;
|
||||
double best;
|
||||
std::vector<double> samples;
|
||||
Float64 beats_control;
|
||||
Float64 best;
|
||||
} Variant;
|
||||
|
||||
using Variants = std::vector<Variant>;
|
||||
using Variants = PODArray<Variant>;
|
||||
|
||||
template <bool higher_is_better>
|
||||
Variants bayesian_ab_test(std::string distribution, std::vector<double> xs, std::vector<double> ys);
|
||||
Variants bayesian_ab_test(String distribution, PODArray<Float64> & xs, PODArray<Float64> & ys);
|
||||
|
||||
String convertToJson(const PODArray<String> & variant_names, const Variants & variants);
|
||||
|
||||
}
|
||||
#endif
|
||||
|
@ -2,9 +2,11 @@
|
||||
#include <iostream>
|
||||
#include <stdio.h>
|
||||
|
||||
DB::ABTestResult test_bayesab(std::string dist, std::vector<double> xs, std::vector<double> ys, size_t & max, size_t & min)
|
||||
using namespace DB;
|
||||
|
||||
Variants test_bayesab(std::string dist, PODArray<Float64> xs, PODArray<Float64> ys, size_t & max, size_t & min)
|
||||
{
|
||||
DB::ABTestResult ret;
|
||||
Variants variants;
|
||||
|
||||
std::cout << std::fixed;
|
||||
if (dist == "beta")
|
||||
@ -17,7 +19,7 @@ DB::ABTestResult test_bayesab(std::string dist, std::vector<double> xs, std::vec
|
||||
|
||||
std::cout << "\n";
|
||||
|
||||
ret = DB::bayesian_ab_test<true>(dist, xs, ys);
|
||||
variants = bayesian_ab_test<true>(dist, xs, ys);
|
||||
}
|
||||
else if (dist == "gamma")
|
||||
{
|
||||
@ -28,19 +30,34 @@ DB::ABTestResult test_bayesab(std::string dist, std::vector<double> xs, std::vec
|
||||
for (auto y : ys) std::cout << y << " ";
|
||||
|
||||
std::cout << "\n";
|
||||
ret = DB::bayesian_ab_test<false>(dist, xs, ys);
|
||||
variants = bayesian_ab_test<true>(dist, xs, ys);
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < ret.beats_control.size(); ++i)
|
||||
std::cout << i << " beats 0: " << ret.beats_control[i] << std::endl;
|
||||
for (size_t i = 0; i < variants.size(); ++i)
|
||||
std::cout << i << " beats 0: " << variants[i].beats_control << std::endl;
|
||||
|
||||
for (size_t i = 0; i < ret.beats_control.size(); ++i)
|
||||
std::cout << i << " to be best: " << ret.best[i] << std::endl;
|
||||
for (size_t i = 0; i < variants.size(); ++i)
|
||||
std::cout << i << " to be best: " << variants[i].best << std::endl;
|
||||
|
||||
max = std::max_element(ret.best.begin(), ret.best.end()) - ret.best.begin();
|
||||
min = std::min_element(ret.best.begin(), ret.best.end()) - ret.best.begin();
|
||||
std::cout << convertToJson({"0", "1", "2"}, variants) << std::endl;
|
||||
|
||||
return ret;
|
||||
Float64 max_val = 0.0, min_val = 2.0;
|
||||
for (size_t i = 0; i < variants.size(); ++i)
|
||||
{
|
||||
if (variants[i].best > max_val)
|
||||
{
|
||||
max_val = variants[i].best;
|
||||
max = i;
|
||||
}
|
||||
|
||||
if (variants[i].best < min_val)
|
||||
{
|
||||
min_val = variants[i].best;
|
||||
min = i;
|
||||
}
|
||||
}
|
||||
|
||||
return variants;
|
||||
}
|
||||
|
||||
|
||||
@ -48,27 +65,30 @@ int main(int, char **)
|
||||
{
|
||||
size_t max, min;
|
||||
|
||||
auto ret = test_bayesab("beta", {10000, 1000, 900}, {600, 110, 90}, max, min);
|
||||
auto variants = test_bayesab("beta", {10000, 1000, 900}, {600, 110, 90}, max, min);
|
||||
if (max != 1) exit(1);
|
||||
|
||||
ret = test_bayesab("beta", {3000, 3000, 3000}, {600, 100, 90}, max, min);
|
||||
variants = test_bayesab("beta", {3000, 3000, 3000}, {600, 100, 90}, max, min);
|
||||
if (max != 0) exit(1);
|
||||
|
||||
ret = test_bayesab("beta", {3000, 3000, 3000}, {100, 90, 110}, max, min);
|
||||
variants = test_bayesab("beta", {3000, 3000, 3000}, {100, 90, 110}, max, min);
|
||||
if (max != 2) exit(1);
|
||||
|
||||
ret = test_bayesab("beta", {3000, 3000, 3000}, {110, 90, 100}, max, min);
|
||||
variants = test_bayesab("beta", {3000, 3000, 3000}, {110, 90, 100}, max, min);
|
||||
if (max != 0) exit(1);
|
||||
|
||||
ret = test_bayesab("gamma", {10000, 1000, 900}, {600, 110, 90}, max, min);
|
||||
variants = test_bayesab("gamma", {10000, 1000, 900}, {600, 110, 90}, max, min);
|
||||
if (max != 1) exit(1);
|
||||
|
||||
ret = test_bayesab("gamma", {3000, 3000, 3000}, {600, 100, 90}, max, min);
|
||||
variants = test_bayesab("gamma", {3000, 3000, 3000}, {600, 100, 90}, max, min);
|
||||
if (max != 0) exit(1);
|
||||
|
||||
ret = test_bayesab("gamma", {3000, 3000, 3000}, {100, 90, 110}, max, min);
|
||||
variants = test_bayesab("gamma", {3000, 3000, 3000}, {100, 90, 110}, max, min);
|
||||
if (max != 2) exit(1);
|
||||
|
||||
ret = test_bayesab("gamma", {3000, 3000, 3000}, {110, 90, 100}, max, min);
|
||||
variants = test_bayesab("gamma", {3000, 3000, 3000}, {110, 90, 100}, max, min);
|
||||
if (max != 0) exit(1);
|
||||
|
||||
std::cout << "Successfully done\n";
|
||||
return 0;
|
||||
}
|
||||
|
@ -0,0 +1,4 @@
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
4
tests/queries/0_stateless/01411_bayesian_ab_testing.sql
Normal file
4
tests/queries/0_stateless/01411_bayesian_ab_testing.sql
Normal file
@ -0,0 +1,4 @@
|
||||
SELECT count() FROM (SELECT bayesAB('beta', 1, ['Control', 'A', 'B'], [3000.0, 3000.0, 2000.0], [1000.0, 1100.0, 800.0]));
|
||||
SELECT count() FROM (SELECT bayesAB('gamma', 1, ['Control', 'A', 'B'], [3000.0, 3000.0, 2000.0], [1000.0, 1100.0, 800.0]));
|
||||
SELECT count() FROM (SELECT bayesAB('beta', 0, ['Control', 'A', 'B'], [3000.0, 3000.0, 2000.0], [1000.0, 1100.0, 800.0]));
|
||||
SELECT count() FROM (SELECT bayesAB('gamma', 0, ['Control', 'A', 'B'], [3000.0, 3000.0, 2000.0], [1000.0, 1100.0, 800.0]));
|
Loading…
Reference in New Issue
Block a user