Replace vector with PODArray and add testcases

This commit is contained in:
philip.han 2020-07-17 21:06:29 +09:00
parent 769a6dfb08
commit 3554e6c36a
5 changed files with 110 additions and 50 deletions

View File

@ -29,16 +29,16 @@ static const String BETA = "beta";
static const String GAMMA = "gamma";
template <bool higher_is_better>
Variants bayesian_ab_test(String distribution, std::vector<double> xs, std::vector<double> ys)
Variants bayesian_ab_test(String distribution, PODArray<Float64> & xs, PODArray<Float64> & ys)
{
const size_t r = 1000, c = 100;
Variants variants(xs.size());
std::vector<std::vector<double>> samples_matrix;
Variants variants(xs.size(), {0.0, 0.0});
std::vector<std::vector<Float64>> samples_matrix;
if (distribution == BETA)
{
double alpha, beta;
Float64 alpha, beta;
for (size_t i = 0; i < xs.size(); ++i)
if (xs[i] < ys[i])
@ -49,29 +49,32 @@ Variants bayesian_ab_test(String distribution, std::vector<double> xs, std::vect
alpha = 1.0 + ys[i];
beta = 1.0 + xs[i] - ys[i];
samples_matrix.push_back(stats::rbeta<std::vector<double>>(r, c, alpha, beta));
samples_matrix.emplace_back(std::move(stats::rbeta<std::vector<Float64>>(r, c, alpha, beta)));
}
}
else if (distribution == GAMMA)
{
double shape, scale;
Float64 shape, scale;
for (size_t i = 0; i < xs.size(); ++i)
{
shape = 1.0 + xs[i];
scale = 250.0 / (1 + 250.0 * ys[i]);
std::vector<double> samples = stats::rgamma<std::vector<double>>(r, c, shape, scale);
std::vector<Float64> samples = stats::rgamma<std::vector<Float64>>(r, c, shape, scale);
for (size_t j = 0; j < samples.size(); ++j)
samples[j] = 1 / samples[j];
samples_matrix.push_back(samples);
samples_matrix.emplace_back(std::move(samples));
}
}
std::vector<double> means;
PODArray<Float64> means;
for (size_t i = 0; i < xs.size(); ++i)
{
auto mean = accumulate(samples_matrix[i].begin(), samples_matrix[i].end(), 0.0) / samples_matrix[i].size();
means.push_back(mean);
Float64 total = 0.0;
for (size_t j = 0; j < samples_matrix[i].size(); ++j)
total += samples_matrix[i][j];
means.push_back(total / samples_matrix[i].size());
}
// Beats control
@ -93,18 +96,18 @@ Variants bayesian_ab_test(String distribution, std::vector<double> xs, std::vect
}
for (size_t i = 1; i < xs.size(); ++i)
variants[i].beats_control = static_cast<double>(variants[i].beats_control) / r / c;
variants[i].beats_control = static_cast<Float64>(variants[i].beats_control) / r / c;
// To be best
std::vector<size_t> count_m(xs.size(), 0);
std::vector<double> row(xs.size(), 0);
PODArray<size_t> count_m(xs.size(), 0);
PODArray<Float64> row(xs.size(), 0);
for (size_t n = 0; n < r * c; ++n)
{
for (size_t i = 0; i < xs.size(); ++i)
row[i] = samples_matrix[i][n];
double m;
Float64 m;
if (higher_is_better)
m = *std::max_element(row.begin(), row.end());
else
@ -121,11 +124,37 @@ Variants bayesian_ab_test(String distribution, std::vector<double> xs, std::vect
}
for (size_t i = 0; i < xs.size(); ++i)
variants[i].best = static_cast<double>(variants[i].best) / r / c;
variants[i].best = static_cast<Float64>(variants[i].best) / r / c;
return variants;
}
/// Serializes the per-variant results as a JSON document of the form
///   {"data":[{"variant_name":<name>,"beats_control":<x>,"to_be_best":<y>},...]}
///
/// One object is written per element of `variants`; `variant_names[i]` labels `variants[i]`.
/// NOTE(review): `variant_names` is indexed with the same `i` as `variants`, so it must hold
/// at least `variants.size()` entries -- confirm that every caller guarantees this.
///
/// @param variant_names  display names, one per variant
/// @param variants       computed `beats_control` / `best` values per variant
/// @return               the JSON text
String convertToJson(const PODArray<String> & variant_names, const Variants & variants)
{
    FormatSettings settings;
    std::stringstream s;

    {
        /// Inner scope ends the buffer's lifetime before s.str() is read
        /// (presumably destruction is what pushes pending bytes into the stream -- confirm).
        WriteBufferFromOStream buf(s);

        writeCString("{\"data\":[", buf);
        for (size_t i = 0; i < variants.size(); ++i)
        {
            /// The separator depends only on the loop index, so it always agrees with the
            /// loop bound. The previous check against `variant_names.size() - 1` produced a
            /// trailing (or missing) comma whenever the two containers differed in size and
            /// wrapped around for an empty name list.
            if (i != 0)
                writeCString(",", buf);

            writeCString("{\"variant_name\":", buf);
            writeJSONString(variant_names[i], buf, settings);
            writeCString(",\"beats_control\":", buf);
            writeText(variants[i].beats_control, buf);
            writeCString(",\"to_be_best\":", buf);
            writeText(variants[i].best, buf);
            writeCString("}", buf);
        }
        writeCString("]}", buf);
    }

    return s.str();
}
class FunctionBayesAB : public IFunction
{
public:
@ -159,8 +188,8 @@ public:
return;
}
std::vector<double> xs, ys;
std::vector<std::string> variant_names;
PODArray<Float64> xs, ys;
PODArray<String> variant_names;
String dist;
bool higher_is_better;
@ -192,7 +221,7 @@ public:
if (const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(block.getByPosition(arguments[3]).column.get()))
{
if (!col_const_arr)
throw Exception("Forth argument for function " + getName() + " must be Array of constant doubles", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
throw Exception("Forth argument for function " + getName() + " must be Array of constant flaot64", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
Array src_arr = col_const_arr->getValue<Array>();
@ -203,7 +232,7 @@ public:
if (const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(block.getByPosition(arguments[4]).column.get()))
{
if (!col_const_arr)
throw Exception("Fifth argument for function " + getName() + " must be Array of constant doubles", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
throw Exception("Fifth argument for function " + getName() + " must be Array of constant float64", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
Array src_arr = col_const_arr->getValue<Array>();
@ -214,12 +243,11 @@ public:
if (variant_names.size() != xs.size() || xs.size() != ys.size())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Sizes of arguments doen't match: variant_names: {}, xs: {}, ys: {}", variant_names.size(), xs.size(), ys.size());
if (std::count_if(xs.begin(), xs.end(), [](double v) { return v < 0; }) > 0 ||
std::count_if(ys.begin(), ys.end(), [](double v) { return v < 0; }) > 0)
if (std::count_if(xs.begin(), xs.end(), [](Float64 v) { return v < 0; }) > 0 ||
std::count_if(ys.begin(), ys.end(), [](Float64 v) { return v < 0; }) > 0)
throw Exception("Negative values don't allowed", ErrorCodes::BAD_ARGUMENTS);
Variants variants;
if (higher_is_better)
variants = bayesian_ab_test<true>(dist, xs, ys);
else
@ -237,9 +265,9 @@ public:
writeCString("{\"variant_name\":", buf);
writeJSONString(variant_names[i], buf, settings);
writeCString(",\"beats_control\":", buf);
writeText(variants[i].beats_control, buf);
writeFloatText<Float64>(variants[i].beats_control, buf);
writeCString(",\"to_be_best\":", buf);
writeText(variants[i].best, buf);
writeFloatText<Float64>(variants[i].best, buf);
writeCString("}", buf);
if (i != xs.size() -1) writeCString(",", buf);
}
@ -247,7 +275,7 @@ public:
}
auto dst = ColumnString::create();
std::string result_str = s.str();
std::string result_str = convertToJson(variant_names, variants).c_str();
dst->insertData(result_str.c_str(), result_str.length());
block.getByPosition(result).column = std::move(dst);
}

View File

@ -5,21 +5,25 @@
#include <vector>
#include <algorithm>
#include <Core/Types.h>
#include <Common/PODArray.h>
namespace DB
{
typedef struct _Variant
{
double beats_control;
double best;
std::vector<double> samples;
Float64 beats_control;
Float64 best;
} Variant;
using Variants = std::vector<Variant>;
using Variants = PODArray<Variant>;
template <bool higher_is_better>
Variants bayesian_ab_test(std::string distribution, std::vector<double> xs, std::vector<double> ys);
Variants bayesian_ab_test(String distribution, PODArray<Float64> & xs, PODArray<Float64> & ys);
String convertToJson(const PODArray<String> & variant_names, const Variants & variants);
}
#endif

View File

@ -2,9 +2,11 @@
#include <iostream>
#include <stdio.h>
DB::ABTestResult test_bayesab(std::string dist, std::vector<double> xs, std::vector<double> ys, size_t & max, size_t & min)
using namespace DB;
Variants test_bayesab(std::string dist, PODArray<Float64> xs, PODArray<Float64> ys, size_t & max, size_t & min)
{
DB::ABTestResult ret;
Variants variants;
std::cout << std::fixed;
if (dist == "beta")
@ -17,7 +19,7 @@ DB::ABTestResult test_bayesab(std::string dist, std::vector<double> xs, std::vec
std::cout << "\n";
ret = DB::bayesian_ab_test<true>(dist, xs, ys);
variants = bayesian_ab_test<true>(dist, xs, ys);
}
else if (dist == "gamma")
{
@ -28,19 +30,34 @@ DB::ABTestResult test_bayesab(std::string dist, std::vector<double> xs, std::vec
for (auto y : ys) std::cout << y << " ";
std::cout << "\n";
ret = DB::bayesian_ab_test<false>(dist, xs, ys);
variants = bayesian_ab_test<true>(dist, xs, ys);
}
for (size_t i = 0; i < ret.beats_control.size(); ++i)
std::cout << i << " beats 0: " << ret.beats_control[i] << std::endl;
for (size_t i = 0; i < variants.size(); ++i)
std::cout << i << " beats 0: " << variants[i].beats_control << std::endl;
for (size_t i = 0; i < ret.beats_control.size(); ++i)
std::cout << i << " to be best: " << ret.best[i] << std::endl;
for (size_t i = 0; i < variants.size(); ++i)
std::cout << i << " to be best: " << variants[i].best << std::endl;
max = std::max_element(ret.best.begin(), ret.best.end()) - ret.best.begin();
min = std::min_element(ret.best.begin(), ret.best.end()) - ret.best.begin();
std::cout << convertToJson({"0", "1", "2"}, variants) << std::endl;
return ret;
Float64 max_val = 0.0, min_val = 2.0;
for (size_t i = 0; i < variants.size(); ++i)
{
if (variants[i].best > max_val)
{
max_val = variants[i].best;
max = i;
}
if (variants[i].best < min_val)
{
min_val = variants[i].best;
min = i;
}
}
return variants;
}
@ -48,27 +65,30 @@ int main(int, char **)
{
size_t max, min;
auto ret = test_bayesab("beta", {10000, 1000, 900}, {600, 110, 90}, max, min);
auto variants = test_bayesab("beta", {10000, 1000, 900}, {600, 110, 90}, max, min);
if (max != 1) exit(1);
ret = test_bayesab("beta", {3000, 3000, 3000}, {600, 100, 90}, max, min);
variants = test_bayesab("beta", {3000, 3000, 3000}, {600, 100, 90}, max, min);
if (max != 0) exit(1);
ret = test_bayesab("beta", {3000, 3000, 3000}, {100, 90, 110}, max, min);
variants = test_bayesab("beta", {3000, 3000, 3000}, {100, 90, 110}, max, min);
if (max != 2) exit(1);
ret = test_bayesab("beta", {3000, 3000, 3000}, {110, 90, 100}, max, min);
variants = test_bayesab("beta", {3000, 3000, 3000}, {110, 90, 100}, max, min);
if (max != 0) exit(1);
ret = test_bayesab("gamma", {10000, 1000, 900}, {600, 110, 90}, max, min);
variants = test_bayesab("gamma", {10000, 1000, 900}, {600, 110, 90}, max, min);
if (max != 1) exit(1);
ret = test_bayesab("gamma", {3000, 3000, 3000}, {600, 100, 90}, max, min);
variants = test_bayesab("gamma", {3000, 3000, 3000}, {600, 100, 90}, max, min);
if (max != 0) exit(1);
ret = test_bayesab("gamma", {3000, 3000, 3000}, {100, 90, 110}, max, min);
variants = test_bayesab("gamma", {3000, 3000, 3000}, {100, 90, 110}, max, min);
if (max != 2) exit(1);
ret = test_bayesab("gamma", {3000, 3000, 3000}, {110, 90, 100}, max, min);
variants = test_bayesab("gamma", {3000, 3000, 3000}, {110, 90, 100}, max, min);
if (max != 0) exit(1);
std::cout << "Successfully done\n";
return 0;
}

View File

@ -0,0 +1,4 @@
1
1
1
1

View File

@ -0,0 +1,4 @@
-- Smoke test for bayesAB: each statement wraps one bayesAB call in a subquery and counts its rows,
-- so only successful evaluation is checked, not the JSON text the function produces.
-- Covers both distribution names ('beta', 'gamma') with the second argument set to 1 and to 0
-- (presumably the higher_is_better flag -- confirm against the function's argument handling).
-- The remaining arguments are the variant names followed by two Float64 arrays of the same length.
SELECT count() FROM (SELECT bayesAB('beta', 1, ['Control', 'A', 'B'], [3000.0, 3000.0, 2000.0], [1000.0, 1100.0, 800.0]));
SELECT count() FROM (SELECT bayesAB('gamma', 1, ['Control', 'A', 'B'], [3000.0, 3000.0, 2000.0], [1000.0, 1100.0, 800.0]));
SELECT count() FROM (SELECT bayesAB('beta', 0, ['Control', 'A', 'B'], [3000.0, 3000.0, 2000.0], [1000.0, 1100.0, 800.0]));
SELECT count() FROM (SELECT bayesAB('gamma', 0, ['Control', 'A', 'B'], [3000.0, 3000.0, 2000.0], [1000.0, 1100.0, 800.0]));