From 3554e6c36aba948a138e2c3f1473ad5275ea2aaa Mon Sep 17 00:00:00 2001 From: "philip.han" Date: Fri, 17 Jul 2020 21:06:29 +0900 Subject: [PATCH] Replace vector with PODArray and add testcases --- src/Functions/abtesting.cpp | 80 +++++++++++++------ src/Functions/abtesting.h | 14 ++-- src/Functions/tests/abtesting.cpp | 58 +++++++++----- .../01411_bayesian_ab_testing.reference | 4 + .../0_stateless/01411_bayesian_ab_testing.sql | 4 + 5 files changed, 110 insertions(+), 50 deletions(-) create mode 100644 tests/queries/0_stateless/01411_bayesian_ab_testing.reference create mode 100644 tests/queries/0_stateless/01411_bayesian_ab_testing.sql diff --git a/src/Functions/abtesting.cpp b/src/Functions/abtesting.cpp index 52131908087..f73fa9efa65 100644 --- a/src/Functions/abtesting.cpp +++ b/src/Functions/abtesting.cpp @@ -29,16 +29,16 @@ static const String BETA = "beta"; static const String GAMMA = "gamma"; template -Variants bayesian_ab_test(String distribution, std::vector xs, std::vector ys) +Variants bayesian_ab_test(String distribution, PODArray & xs, PODArray & ys) { const size_t r = 1000, c = 100; - Variants variants(xs.size()); - std::vector> samples_matrix; + Variants variants(xs.size(), {0.0, 0.0}); + std::vector> samples_matrix; if (distribution == BETA) { - double alpha, beta; + Float64 alpha, beta; for (size_t i = 0; i < xs.size(); ++i) if (xs[i] < ys[i]) @@ -49,29 +49,32 @@ Variants bayesian_ab_test(String distribution, std::vector xs, std::vect alpha = 1.0 + ys[i]; beta = 1.0 + xs[i] - ys[i]; - samples_matrix.push_back(stats::rbeta>(r, c, alpha, beta)); + samples_matrix.emplace_back(std::move(stats::rbeta>(r, c, alpha, beta))); } } else if (distribution == GAMMA) { - double shape, scale; + Float64 shape, scale; for (size_t i = 0; i < xs.size(); ++i) { shape = 1.0 + xs[i]; scale = 250.0 / (1 + 250.0 * ys[i]); - std::vector samples = stats::rgamma>(r, c, shape, scale); + + std::vector samples = stats::rgamma>(r, c, shape, scale); for (size_t j = 0; j < samples.size(); ++j) samples[j] = 1 / samples[j]; - samples_matrix.push_back(samples); + samples_matrix.emplace_back(std::move(samples)); } } - std::vector means; + PODArray means; for (size_t i = 0; i < xs.size(); ++i) { - auto mean = accumulate(samples_matrix[i].begin(), samples_matrix[i].end(), 0.0) / samples_matrix[i].size(); - means.push_back(mean); + Float64 total = 0.0; + for (size_t j = 0; j < samples_matrix[i].size(); ++j) + total += samples_matrix[i][j]; + means.push_back(total / samples_matrix[i].size()); } // Beats control @@ -93,18 +96,18 @@ Variants bayesian_ab_test(String distribution, std::vector xs, std::vect } for (size_t i = 1; i < xs.size(); ++i) - variants[i].beats_control = static_cast(variants[i].beats_control) / r / c; + variants[i].beats_control = static_cast(variants[i].beats_control) / r / c; // To be best - std::vector count_m(xs.size(), 0); - std::vector row(xs.size(), 0); + PODArray count_m(xs.size(), 0); + PODArray row(xs.size(), 0); for (size_t n = 0; n < r * c; ++n) { for (size_t i = 0; i < xs.size(); ++i) row[i] = samples_matrix[i][n]; - double m; + Float64 m; if (higher_is_better) m = *std::max_element(row.begin(), row.end()); else @@ -121,11 +124,37 @@ Variants bayesian_ab_test(String distribution, std::vector xs, std::vect } for (size_t i = 0; i < xs.size(); ++i) - variants[i].best = static_cast(variants[i].best) / r / c; + variants[i].best = static_cast(variants[i].best) / r / c; return variants; } +String convertToJson(const PODArray & variant_names, const Variants & variants) +{ + FormatSettings settings; + std::stringstream s; + + { + WriteBufferFromOStream buf(s); + + writeCString("{\"data\":[", buf); + for (size_t i = 0; i < variants.size(); ++i) + { + writeCString("{\"variant_name\":", buf); + writeJSONString(variant_names[i], buf, settings); + writeCString(",\"beats_control\":", buf); + writeText(variants[i].beats_control, buf); + writeCString(",\"to_be_best\":", buf); + writeText(variants[i].best, buf); + writeCString("}", buf); + if (i != variant_names.size() -1) writeCString(",", buf); + } + writeCString("]}", buf); + } + + return s.str(); +} + class FunctionBayesAB : public IFunction { public: @@ -159,8 +188,8 @@ public: return; } - std::vector xs, ys; - std::vector variant_names; + PODArray xs, ys; + PODArray variant_names; String dist; bool higher_is_better; @@ -192,7 +221,7 @@ public: if (const ColumnConst * col_const_arr = checkAndGetColumnConst(block.getByPosition(arguments[3]).column.get())) { if (!col_const_arr) - throw Exception("Forth argument for function " + getName() + " must be Array of constant doubles", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception("Forth argument for function " + getName() + " must be Array of constant flaot64", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); Array src_arr = col_const_arr->getValue(); @@ -203,7 +232,7 @@ public: if (const ColumnConst * col_const_arr = checkAndGetColumnConst(block.getByPosition(arguments[4]).column.get())) { if (!col_const_arr) - throw Exception("Fifth argument for function " + getName() + " must be Array of constant doubles", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + throw Exception("Fifth argument for function " + getName() + " must be Array of constant float64", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); Array src_arr = col_const_arr->getValue(); @@ -214,12 +243,11 @@ public: if (variant_names.size() != xs.size() || xs.size() != ys.size()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Sizes of arguments doen't match: variant_names: {}, xs: {}, ys: {}", variant_names.size(), xs.size(), ys.size()); - if (std::count_if(xs.begin(), xs.end(), [](double v) { return v < 0; }) > 0 || - std::count_if(ys.begin(), ys.end(), [](double v) { return v < 0; }) > 0) + if (std::count_if(xs.begin(), xs.end(), [](Float64 v) { return v < 0; }) > 0 || + std::count_if(ys.begin(), ys.end(), [](Float64 v) { return v < 0; }) > 0) throw Exception("Negative values don't allowed", ErrorCodes::BAD_ARGUMENTS); Variants variants; - if (higher_is_better) variants = bayesian_ab_test(dist, xs, ys); else @@ -237,9 +265,9 @@ public: writeCString("{\"variant_name\":", buf); writeJSONString(variant_names[i], buf, settings); writeCString(",\"beats_control\":", buf); - writeText(variants[i].beats_control, buf); + writeFloatText(variants[i].beats_control, buf); writeCString(",\"to_be_best\":", buf); - writeText(variants[i].best, buf); + writeFloatText(variants[i].best, buf); writeCString("}", buf); if (i != xs.size() -1) writeCString(",", buf); } @@ -247,7 +275,7 @@ public: } auto dst = ColumnString::create(); - std::string result_str = s.str(); + std::string result_str = convertToJson(variant_names, variants).c_str(); dst->insertData(result_str.c_str(), result_str.length()); block.getByPosition(result).column = std::move(dst); } diff --git a/src/Functions/abtesting.h b/src/Functions/abtesting.h index 0d1fd7cf35c..fac79a76262 100644 --- a/src/Functions/abtesting.h +++ b/src/Functions/abtesting.h @@ -5,21 +5,25 @@ #include #include +#include +#include + namespace DB { typedef struct _Variant { - double beats_control; - double best; - std::vector samples; + Float64 beats_control; + Float64 best; } Variant; -using Variants = std::vector; +using Variants = PODArray; template -Variants bayesian_ab_test(std::string distribution, std::vector xs, std::vector ys); +Variants bayesian_ab_test(String distribution, PODArray & xs, PODArray & ys); + +String convertToJson(const PODArray & variant_names, const Variants & variants); } #endif diff --git a/src/Functions/tests/abtesting.cpp b/src/Functions/tests/abtesting.cpp index adc14b5e7df..b9c65fde728 100644 --- a/src/Functions/tests/abtesting.cpp +++ b/src/Functions/tests/abtesting.cpp @@ -2,9 +2,11 @@ #include #include -DB::ABTestResult test_bayesab(std::string dist, std::vector xs, std::vector ys, size_t & max, size_t & min) +using namespace DB; + +Variants test_bayesab(std::string dist, PODArray xs, PODArray ys, size_t & max, size_t & min) { - DB::ABTestResult ret; + Variants variants; std::cout << std::fixed; if (dist == "beta") @@ -17,7 +19,7 @@ DB::ABTestResult test_bayesab(std::string dist, std::vector xs, std::vec std::cout << "\n"; - ret = DB::bayesian_ab_test(dist, xs, ys); + variants = bayesian_ab_test(dist, xs, ys); } else if (dist == "gamma") { @@ -28,19 +30,34 @@ DB::ABTestResult test_bayesab(std::string dist, std::vector xs, std::vec for (auto y : ys) std::cout << y << " "; std::cout << "\n"; - ret = DB::bayesian_ab_test(dist, xs, ys); + variants = bayesian_ab_test(dist, xs, ys); } - for (size_t i = 0; i < ret.beats_control.size(); ++i) - std::cout << i << " beats 0: " << ret.beats_control[i] << std::endl; + for (size_t i = 0; i < variants.size(); ++i) + std::cout << i << " beats 0: " << variants[i].beats_control << std::endl; - for (size_t i = 0; i < ret.beats_control.size(); ++i) - std::cout << i << " to be best: " << ret.best[i] << std::endl; + for (size_t i = 0; i < variants.size(); ++i) + std::cout << i << " to be best: " << variants[i].best << std::endl; - max = std::max_element(ret.best.begin(), ret.best.end()) - ret.best.begin(); - min = std::min_element(ret.best.begin(), ret.best.end()) - ret.best.begin(); + std::cout << convertToJson({"0", "1", "2"}, variants) << std::endl; - return ret; + Float64 max_val = 0.0, min_val = 2.0; + for (size_t i = 0; i < variants.size(); ++i) + { + if (variants[i].best > max_val) + { + max_val = variants[i].best; + max = i; + } + + if (variants[i].best < min_val) + { + min_val = variants[i].best; + min = i; + } + } + + return variants; } @@ -48,27 +65,30 @@ int main(int, char **) { size_t max, min; - auto ret = test_bayesab("beta", {10000, 1000, 900}, {600, 110, 90}, max, min); + auto variants = test_bayesab("beta", {10000, 1000, 900}, {600, 110, 90}, max, min); if (max != 1) exit(1); - ret = test_bayesab("beta", {3000, 3000, 3000}, {600, 100, 90}, max, min); + variants = test_bayesab("beta", {3000, 3000, 3000}, {600, 100, 90}, max, min); if (max != 0) exit(1); - ret = test_bayesab("beta", {3000, 3000, 3000}, {100, 90, 110}, max, min); + variants = test_bayesab("beta", {3000, 3000, 3000}, {100, 90, 110}, max, min); if (max != 2) exit(1); - ret = test_bayesab("beta", {3000, 3000, 3000}, {110, 90, 100}, max, min); + variants = test_bayesab("beta", {3000, 3000, 3000}, {110, 90, 100}, max, min); if (max != 0) exit(1); - ret = test_bayesab("gamma", {10000, 1000, 900}, {600, 110, 90}, max, min); + variants = test_bayesab("gamma", {10000, 1000, 900}, {600, 110, 90}, max, min); if (max != 1) exit(1); - ret = test_bayesab("gamma", {3000, 3000, 3000}, {600, 100, 90}, max, min); + variants = test_bayesab("gamma", {3000, 3000, 3000}, {600, 100, 90}, max, min); if (max != 0) exit(1); - ret = test_bayesab("gamma", {3000, 3000, 3000}, {100, 90, 110}, max, min); + variants = test_bayesab("gamma", {3000, 3000, 3000}, {100, 90, 110}, max, min); if (max != 2) exit(1); - ret = test_bayesab("gamma", {3000, 3000, 3000}, {110, 90, 100}, max, min); + variants = test_bayesab("gamma", {3000, 3000, 3000}, {110, 90, 100}, max, min); if (max != 0) exit(1); + + std::cout << "Successfully done\n"; + return 0; } diff --git a/tests/queries/0_stateless/01411_bayesian_ab_testing.reference b/tests/queries/0_stateless/01411_bayesian_ab_testing.reference new file mode 100644 index 00000000000..98fb6a68656 --- /dev/null +++ b/tests/queries/0_stateless/01411_bayesian_ab_testing.reference @@ -0,0 +1,4 @@ +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/01411_bayesian_ab_testing.sql b/tests/queries/0_stateless/01411_bayesian_ab_testing.sql new file mode 100644 index 00000000000..e162c3a8e1a --- /dev/null +++ b/tests/queries/0_stateless/01411_bayesian_ab_testing.sql @@ -0,0 +1,4 @@ +SELECT count() FROM (SELECT bayesAB('beta', 1, ['Control', 'A', 'B'], [3000.0, 3000.0, 2000.0], [1000.0, 1100.0, 800.0])); +SELECT count() FROM (SELECT bayesAB('gamma', 1, ['Control', 'A', 'B'], [3000.0, 3000.0, 2000.0], [1000.0, 1100.0, 800.0])); +SELECT count() FROM (SELECT bayesAB('beta', 0, ['Control', 'A', 'B'], [3000.0, 3000.0, 2000.0], [1000.0, 1100.0, 800.0])); +SELECT count() FROM (SELECT bayesAB('gamma', 0, ['Control', 'A', 'B'], [3000.0, 3000.0, 2000.0], [1000.0, 1100.0, 800.0]));