2023-03-23 15:33:23 +00:00
|
|
|
#!/usr/bin/env python3
|
2022-01-20 13:57:37 +00:00
|
|
|
import os
|
|
|
|
import sys
|
|
|
|
from statistics import variance
|
|
|
|
from scipy import stats
|
|
|
|
import pandas as pd
|
|
|
|
import numpy as np
|
|
|
|
|
|
|
|
CURDIR = os.path.dirname(os.path.realpath(__file__))
|
2023-03-23 15:33:23 +00:00
|
|
|
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
|
2022-01-20 13:57:37 +00:00
|
|
|
|
|
|
|
from pure_http_client import ClickHouseClient
|
|
|
|
|
|
|
|
|
|
|
|
# unpooled variance z-test for means of two samples
|
|
|
|
def twosample_mean_ztest(rvs1, rvs2, alpha=0.05):
    """Run a two-sample z-test on the difference of means (unpooled variances).

    Each sample's variance is estimated with ``statistics.variance`` and the
    two are combined without pooling to form the standard error.

    Args:
        rvs1: First sample (sequence of numbers).
        rvs2: Second sample (sequence of numbers).
        alpha: Significance level; the interval returned has level
            ``1 - alpha``.

    Returns:
        A tuple ``(z_stat, p_val, ci_low, ci_high)``: the z statistic, the
        two-sided p-value, and the bounds of the confidence interval for
        ``mean(rvs1) - mean(rvs2)``.
    """
    first_mean = np.mean(rvs1)
    second_mean = np.mean(rvs2)
    first_var = variance(rvs1)
    second_var = variance(rvs2)

    mean_diff = first_mean - second_mean
    std_err = np.sqrt(first_var / len(rvs1) + second_var / len(rvs2))

    z_stat = mean_diff / std_err
    # Two-sided p-value: twice the lower-tail mass beyond -|z|.
    p_val = 2 * stats.norm.cdf(-1 * abs(z_stat))

    # Half-width of the interval is the normal critical value times the SE.
    critical = stats.norm.ppf(1 - alpha / 2)
    margin = critical * std_err
    return z_stat, p_val, mean_diff - margin, mean_diff + margin
|
|
|
|
|
|
|
|
|
|
|
|
def test_and_check(name, a, b, t_stat, p_value, ci_low, ci_high, precision=1e-2):
    """Compare a ClickHouse z-test aggregate against reference values.

    Recreates the ``ztest`` table, loads both samples into it, evaluates the
    aggregate expression ``name`` server-side and asserts that each of the
    four tuple elements it returns is within ``precision`` of the expected
    value. The table is dropped again once all assertions have passed.

    Args:
        name: Text of the aggregate function (including any parameters)
            that is spliced into the SELECT as ``name(left, right)``.
        a: Values inserted with ``right = 0``.
        b: Values inserted with ``right = 1``.
        t_stat: Expected first tuple element.
        p_value: Expected second tuple element.
        ci_low: Expected third tuple element.
        ci_high: Expected fourth tuple element.
        precision: Maximum allowed absolute difference.

    Raises:
        AssertionError: If any returned value differs from its expected
            value by ``precision`` or more.
    """
    client = ClickHouseClient()
    client.query("DROP TABLE IF EXISTS ztest;")
    client.query("CREATE TABLE ztest (left Float64, right UInt8) ENGINE = Memory;")

    # One INSERT per sample; the second column tags which sample a row is from.
    for sample, tag in ((a, 0), (b, 1)):
        rows = ", ".join(["({},{})".format(value, tag) for value in sample])
        client.query("INSERT INTO ztest VALUES {};".format(rows))

    select_parts = [
        "SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name),
        "roundBankers({}(left, right).2, 16) as p_value, ".format(name),
        "roundBankers({}(left, right).3, 16) as ci_low, ".format(name),
        "roundBankers({}(left, right).4, 16) as ci_high ".format(name),
        "FROM ztest FORMAT TabSeparatedWithNames;",
    ]
    real = client.query_return_df("".join(select_parts))

    # Pull the single result row out of the frame before checking anything.
    real_t_stat, real_p_value, real_ci_low, real_ci_high = [
        real[column][0] for column in ("t_stat", "p_value", "ci_low", "ci_high")
    ]

    assert (
        abs(real_t_stat - np.float64(t_stat)) < precision
    ), "clickhouse_t_stat {}, py_t_stat {}".format(real_t_stat, t_stat)
    assert (
        abs(real_p_value - np.float64(p_value)) < precision
    ), "clickhouse_p_value {}, py_p_value {}".format(real_p_value, p_value)
    assert (
        abs(real_ci_low - np.float64(ci_low)) < precision
    ), "clickhouse_ci_low {}, py_ci_low {}".format(real_ci_low, ci_low)
    assert (
        abs(real_ci_high - np.float64(ci_high)) < precision
    ), "clickhouse_ci_high {}, py_ci_high {}".format(real_ci_high, ci_high)

    client.query("DROP TABLE IF EXISTS ztest;")
|
|
|
|
|
|
|
|
|
|
|
|
def test_mean_ztest():
    """Check ``meanZTest`` on several pairs of random normal samples.

    For each scenario two normal samples are drawn and rounded to two
    decimals, the reference statistics are computed with
    ``twosample_mean_ztest``, and ``test_and_check`` compares them with the
    server-side result. The sample variances are passed to ``meanZTest`` as
    its first two parameters; the third is the literal ``0.95``
    (presumably the confidence level -- confirm against the ClickHouse docs).
    """
    # Each entry: (loc, scale, size) of the first sample, then of the second.
    scenarios = (
        ((1, 5, 500), (10, 5, 500)),
        ((0, 5, 500), (0, 5, 500)),
        ((2, 10, 512), (5, 20, 1024)),
        ((0, 10, 1024), (0, 10, 512)),
    )

    for (loc1, scale1, size1), (loc2, scale2, size2) in scenarios:
        rvs1 = np.round(stats.norm.rvs(loc=loc1, scale=scale1, size=size1), 2)
        rvs2 = np.round(stats.norm.rvs(loc=loc2, scale=scale2, size=size2), 2)

        expected = twosample_mean_ztest(rvs1, rvs2)
        function_call = "meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2))
        test_and_check(function_call, rvs1, rvs2, *expected)
|
2022-01-20 13:57:37 +00:00
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Run every z-test scenario; a failed assertion raises before "Ok." is printed.
    test_mean_ztest()
    print("Ok.")
|