mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-01 03:52:15 +00:00
87 lines
2.6 KiB
Plaintext
87 lines
2.6 KiB
Plaintext
|
#!/usr/bin/env python3
|
||
|
import os
|
||
|
import sys
|
||
|
from statistics import variance
|
||
|
from scipy import stats
|
||
|
import pandas as pd
|
||
|
import numpy as np
|
||
|
|
||
|
CURDIR = os.path.dirname(os.path.realpath(__file__))
|
||
|
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
|
||
|
|
||
|
from pure_http_client import ClickHouseClient
|
||
|
|
||
|
|
||
|
# unpooled variance z-test for means of two samples
|
||
|
def scipy_anova(rvs):
|
||
|
return stats.f_oneway(*rvs)
|
||
|
|
||
|
|
||
|
def test_and_check(rvs, n_groups, f_stat, p_value, precision=1e-2):
|
||
|
client = ClickHouseClient()
|
||
|
client.query("DROP TABLE IF EXISTS anova;")
|
||
|
client.query("CREATE TABLE anova (left Float64, right UInt64) ENGINE = Memory;")
|
||
|
for group in range(n_groups):
|
||
|
client.query(f'''INSERT INTO anova VALUES {", ".join([f'({i},{group})' for i in rvs[group]])};''')
|
||
|
|
||
|
real = client.query_return_df(
|
||
|
'''SELECT roundBankers(a.1, 16) as f_stat, roundBankers(a.2, 16) as p_value FROM (SELECT anova(left, right) as a FROM anova) FORMAT TabSeparatedWithNames;''')
|
||
|
|
||
|
real_f_stat = real['f_stat'][0]
|
||
|
real_p_value = real['p_value'][0]
|
||
|
assert(abs(real_f_stat - np.float64(f_stat)) < precision), f"clickhouse_f_stat {real_f_stat}, py_f_stat {f_stat}"
|
||
|
assert(abs(real_p_value - np.float64(p_value)) < precision), f"clickhouse_p_value {real_p_value}, py_p_value {p_value}"
|
||
|
client.query("DROP TABLE IF EXISTS anova;")
|
||
|
|
||
|
|
||
|
def test_anova():
|
||
|
n_groups = 3
|
||
|
rvs = []
|
||
|
loc = 0
|
||
|
scale = 5
|
||
|
size = 500
|
||
|
for _ in range(n_groups):
|
||
|
rvs.append(np.round(stats.norm.rvs(loc=loc, scale=scale, size=size), 2))
|
||
|
loc += 5
|
||
|
f_stat, p_value = scipy_anova(rvs)
|
||
|
test_and_check(rvs, n_groups, f_stat, p_value)
|
||
|
|
||
|
n_groups = 6
|
||
|
rvs = []
|
||
|
loc = 0
|
||
|
scale = 5
|
||
|
size = 500
|
||
|
for _ in range(n_groups):
|
||
|
rvs.append(np.round(stats.norm.rvs(loc=loc, scale=scale, size=size), 2))
|
||
|
f_stat, p_value = scipy_anova(rvs)
|
||
|
test_and_check(rvs, n_groups, f_stat, p_value)
|
||
|
|
||
|
n_groups = 10
|
||
|
rvs = []
|
||
|
loc = 1
|
||
|
scale = 2
|
||
|
size = 100
|
||
|
for _ in range(n_groups):
|
||
|
rvs.append(np.round(stats.norm.rvs(loc=loc, scale=scale, size=size), 2))
|
||
|
loc += 1
|
||
|
scale += 2
|
||
|
size += 100
|
||
|
f_stat, p_value = scipy_anova(rvs)
|
||
|
test_and_check(rvs, n_groups, f_stat, p_value)
|
||
|
|
||
|
n_groups = 20
|
||
|
rvs = []
|
||
|
loc = 0
|
||
|
scale = 10
|
||
|
size = 1100
|
||
|
for _ in range(n_groups):
|
||
|
rvs.append(np.round(stats.norm.rvs(loc=loc, scale=scale, size=size), 2))
|
||
|
size -= 50
|
||
|
f_stat, p_value = scipy_anova(rvs)
|
||
|
test_and_check(rvs, n_groups, f_stat, p_value)
|
||
|
|
||
|
|
||
|
if __name__ == "__main__":
|
||
|
test_anova()
|
||
|
print("Ok.")
|