mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
add python test and fix build
This commit is contained in:
parent
252b9b2ec1
commit
ff7601a52c
@ -309,10 +309,10 @@ long double __lgammal_r(long double x, int *sg) {
|
||||
return r;
|
||||
}
|
||||
|
||||
int signgam;
|
||||
int signgam_lgammal;
|
||||
|
||||
long double lgammal(long double x)
|
||||
{
|
||||
return lgammal_r(x, &signgam);
|
||||
return lgammal_r(x, &signgam_lgammal);
|
||||
}
|
||||
|
||||
|
@ -50,7 +50,7 @@ AggregateFunctionPtr createAggregateFunctionStudentTTest(const std::string & nam
|
||||
|
||||
void registerAggregateFunctionStudentTTest(AggregateFunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction("StudentTTest", createAggregateFunctionStudentTTest, AggregateFunctionFactory::CaseInsensitive);
|
||||
factory.registerFunction("studentTTest", createAggregateFunctionStudentTTest, AggregateFunctionFactory::CaseInsensitive);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -98,8 +98,8 @@ struct AggregateFunctionStudentTTestData final
|
||||
|
||||
Float64 getSSquared() const
|
||||
{
|
||||
/// TODO: Update comment with Tex.
|
||||
/// The original formulae looks like ...
|
||||
/// The original formula looks like
|
||||
/// \frac{\sum_{i = 1}^{n_x}{(x_i - \bar{x}) ^ 2} + \sum_{i = 1}^{n_y}{(y_i - \bar{y}) ^ 2}}{n_x + n_y - 2}
|
||||
/// But we made some mathematical transformations not to store original sequences.
|
||||
/// Also we dropped sqrt, because the result will be squared later.
|
||||
const Float64 all_x = square_sum_x + size_x * std::pow(mean_x, 2) - 2 * mean_x * sum_x;
|
||||
@ -110,26 +110,19 @@ struct AggregateFunctionStudentTTestData final
|
||||
|
||||
Float64 getTStatisticSquared() const
|
||||
{
|
||||
if (size_x == 0 || size_y == 0)
|
||||
{
|
||||
throw Exception("Division by zero encountered in Aggregate function StudentTTest", ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
|
||||
return std::pow(mean_x - mean_y, 2) / getStandartErrorSquared();
|
||||
}
|
||||
|
||||
Float64 getTStatistic() const
|
||||
{
|
||||
if (size_x == 0 || size_y == 0)
|
||||
{
|
||||
throw Exception("Division by zero encountered in Aggregate function StudentTTest", ErrorCodes::BAD_ARGUMENTS);
|
||||
}
|
||||
|
||||
return (mean_x - mean_y) / std::sqrt(getStandartErrorSquared());
|
||||
}
|
||||
|
||||
Float64 getStandartErrorSquared() const
|
||||
{
|
||||
if (size_x == 0 || size_y == 0)
|
||||
throw Exception("Division by zero encountered in Aggregate function StudentTTest", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
return getSSquared() * (1.0 / static_cast<Float64>(size_x) + 1.0 / static_cast<Float64>(size_y));
|
||||
}
|
||||
|
||||
@ -138,9 +131,10 @@ struct AggregateFunctionStudentTTestData final
|
||||
return static_cast<Float64>(size_x + size_y - 2);
|
||||
}
|
||||
|
||||
static Float64 integrateSimpson(Float64 a, Float64 b, std::function<Float64(Float64)> func, size_t iterations = 1e6)
|
||||
static Float64 integrateSimpson(Float64 a, Float64 b, std::function<Float64(Float64)> func)
|
||||
{
|
||||
double h = (b - a) / iterations;
|
||||
const size_t iterations = std::max(1e6, 1e4 * std::abs(std::round(b)));
|
||||
const long double h = (b - a) / iterations;
|
||||
Float64 sum_odds = 0.0;
|
||||
for (size_t i = 1; i < iterations; i += 2)
|
||||
sum_odds += func(a + i * h);
|
||||
@ -154,13 +148,9 @@ struct AggregateFunctionStudentTTestData final
|
||||
{
|
||||
const Float64 v = getDegreesOfFreedom();
|
||||
const Float64 t = getTStatisticSquared();
|
||||
std::cout << "getDegreesOfFreedom() " << getDegreesOfFreedom() << std::endl;
|
||||
std::cout << "getTStatisticSquared() " << getTStatisticSquared() << std::endl;
|
||||
auto f = [&v] (double x) { return std::pow(x, v/2 - 1) / std::sqrt(1 - x); };
|
||||
Float64 numenator = integrateSimpson(0, v / (t + v), f);
|
||||
Float64 denominator = std::exp(std::lgammal(v/2) + std::lgammal(0.5) - std::lgammal(v/2 + 0.5));
|
||||
std::cout << "numenator " << numenator << std::endl;
|
||||
std::cout << "denominator " << denominator << std::endl;
|
||||
return numenator / denominator;
|
||||
}
|
||||
|
||||
@ -184,7 +174,7 @@ public:
|
||||
|
||||
String getName() const override
|
||||
{
|
||||
return "StudentTTest";
|
||||
return "studentTTest";
|
||||
}
|
||||
|
||||
DataTypePtr getReturnType() const override
|
||||
|
@ -50,7 +50,7 @@ AggregateFunctionPtr createAggregateFunctionWelchTTest(const std::string & name,
|
||||
|
||||
void registerAggregateFunctionWelchTTest(AggregateFunctionFactory & factory)
|
||||
{
|
||||
factory.registerFunction("WelchTTest", createAggregateFunctionWelchTTest, AggregateFunctionFactory::CaseInsensitive);
|
||||
factory.registerFunction("welchTTest", createAggregateFunctionWelchTTest, AggregateFunctionFactory::CaseInsensitive);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -142,8 +142,9 @@ struct AggregateFunctionWelchTTestData final
|
||||
return numerator / (denominator_first + denominator_second);
|
||||
}
|
||||
|
||||
static Float64 integrateSimpson(Float64 a, Float64 b, std::function<Float64(Float64)> func, size_t iterations = 1e6)
|
||||
static Float64 integrateSimpson(Float64 a, Float64 b, std::function<Float64(Float64)> func)
|
||||
{
|
||||
size_t iterations = std::max(1e6, 1e4 * std::abs(std::round(b)));
|
||||
double h = (b - a) / iterations;
|
||||
Float64 sum_odds = 0.0;
|
||||
for (size_t i = 1; i < iterations; i += 2)
|
||||
@ -170,7 +171,8 @@ struct AggregateFunctionWelchTTestData final
|
||||
}
|
||||
};
|
||||
|
||||
/// Returns p-value
|
||||
/// Returns tuple of (t-statistic, p-value)
|
||||
/// https://cpb-us-w2.wpmucdn.com/voices.uchicago.edu/dist/9/1193/files/2016/01/05b-TandP.pdf
|
||||
template <typename X = Float64, typename Y = Float64>
|
||||
class AggregateFunctionWelchTTest :
|
||||
public IAggregateFunctionDataHelper<AggregateFunctionWelchTTestData<X, Y>,AggregateFunctionWelchTTest<X, Y>>
|
||||
@ -183,7 +185,7 @@ public:
|
||||
|
||||
String getName() const override
|
||||
{
|
||||
return "WelchTTest";
|
||||
return "welchTTest";
|
||||
}
|
||||
|
||||
DataTypePtr getReturnType() const override
|
||||
|
108
tests/queries/0_stateless/01322_ttest_scipy.py
Normal file
108
tests/queries/0_stateless/01322_ttest_scipy.py
Normal file
@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import io
|
||||
import sys
|
||||
import requests
|
||||
import time
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
|
||||
# Connection settings for the ClickHouse HTTP interface. Each value can be
# overridden through the environment variable of the same name.
CLICKHOUSE_HOST = os.getenv('CLICKHOUSE_HOST', '127.0.0.1')
CLICKHOUSE_PORT_HTTP = os.getenv('CLICKHOUSE_PORT_HTTP', '8123')
# Base URL every request is posted to, e.g. http://127.0.0.1:8123/
CLICKHOUSE_SERVER_URL_STR = 'http://{}:{}/'.format(CLICKHOUSE_HOST, CLICKHOUSE_PORT_HTTP)
|
||||
|
||||
class ClickHouseClient:
    """Small helper that sends queries to a ClickHouse server over HTTP POST."""

    def __init__(self, host = CLICKHOUSE_SERVER_URL_STR):
        # URL that every request is posted to.
        self.host = host

    def query(self, query, connection_timeout = 1500):
        """Post `query` as the request body and return the response text.

        A non-200 response is retried up to NUMBER_OF_TRIES times in total,
        sleeping DELAY * (attempt number) seconds between attempts.

        :param query: text sent as the POST body.
        :param connection_timeout: value passed to `requests.post(timeout=...)`.
        :return: the response body of the first attempt that returned HTTP 200.
        :raises ValueError: with the response text, when the last attempt also
            returns a non-200 status.
        """
        NUMBER_OF_TRIES = 30
        DELAY = 10

        for i in range(NUMBER_OF_TRIES):
            r = requests.post(
                self.host,
                params = {'timeout_before_checking_execution_speed': 120, 'max_execution_time': 6000},
                timeout = connection_timeout,
                data = query)
            if r.status_code == 200:
                return r.text
            else:
                print('ATTENTION: try #%d failed' % i)
                if i != (NUMBER_OF_TRIES-1):
                    # Show what failed, then back off a little longer each time.
                    print(query)
                    print(r.text)
                    time.sleep(DELAY*(i+1))
                else:
                    raise ValueError(r.text)

    def query_return_df(self, query, connection_timeout = 1500):
        """Run `query` through `self.query` and parse the tab-separated reply into a pandas DataFrame."""
        data = self.query(query, connection_timeout)
        df = pd.read_csv(io.StringIO(data), sep = '\t')
        return df

    def query_with_data(self, query, content):
        """Send `query` together with `content` as its data in a single request (no retries).

        :param query: statement text, sent as the `query` URL parameter.
        :param content: str payload, UTF-8 encoded and sent as the POST body.
        :return: the response text on HTTP 200.
        :raises ValueError: with the response text on any other status.
        """
        content = content.encode('utf-8')
        # The statement goes into the `query` URL parameter and the payload into
        # the body. Previously `query` was accepted but never sent, so only
        # `content` reached the server.
        r = requests.post(self.host, params = {'query': query}, data = content)
        result = r.text
        if r.status_code == 200:
            return result
        else:
            raise ValueError(r.text)
|
||||
|
||||
def test_and_check(name, a, b, t_stat, p_value):
    """Run aggregate function `name` in ClickHouse on two samples and compare with reference values.

    The samples are zipped into the rows of a temporary `ttest` table, which is
    dropped and recreated at the start and dropped again after the checks pass.

    :param name: aggregate function name; its result is read as a tuple whose
        first element is the t-statistic and second the p-value.
    :param a: values for the `left` column.
    :param b: values for the `right` column.
    :param t_stat: reference t-statistic.
    :param p_value: reference p-value.
    :raises AssertionError: when either value differs from its reference by
        1e-4 or more in absolute value.
    """
    client = ClickHouseClient()
    client.query("DROP TABLE IF EXISTS ttest;")
    client.query("CREATE TABLE ttest (left Float64, right Float64) ENGINE = Memory;")
    client.query("INSERT INTO ttest VALUES {};".format(", ".join(['({},{})'.format(i, j) for i,j in zip(a, b)])))

    real = client.query_return_df(
        "SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name) +
        "roundBankers({}(left, right).2, 16) as p_value ".format(name) +
        "FROM ttest FORMAT TabSeparatedWithNames;")
    real_t_stat = real['t_stat'][0]
    real_p_value = real['p_value'][0]
    # abs() must wrap only the difference. The previous form took abs() of the
    # comparison result, so any ClickHouse value smaller than the reference passed.
    assert abs(real_t_stat - np.float64(t_stat)) < 1e-4, "clickhouse_t_stat {}, scipy_t_stat {}".format(real_t_stat, t_stat)
    assert abs(real_p_value - np.float64(p_value)) < 1e-4, "clickhouse_p_value {}, scipy_p_value {}".format(real_p_value, p_value)
    client.query("DROP TABLE IF EXISTS ttest;")
|
||||
|
||||
|
||||
def test_student():
    """Compare ClickHouse `studentTTest` with scipy's equal-variance t-test on three random sample pairs."""
    # (loc, scale, size) of the normal distribution for the left and the right sample.
    scenarios = (
        ((1, 5, 500), (10, 5, 500)),
        ((0, 5, 500), (0, 5, 500)),
        ((0, 10, 65536), (5, 1, 65536)),
    )
    for (left_loc, left_scale, left_size), (right_loc, right_scale, right_size) in scenarios:
        # Samples are rounded to 5 decimal places before being used on both sides.
        left = np.round(stats.norm.rvs(loc=left_loc, scale=left_scale, size=left_size), 5)
        right = np.round(stats.norm.rvs(loc=right_loc, scale=right_scale, size=right_size), 5)
        expected_t, expected_p = stats.ttest_ind(left, right, equal_var = True)
        test_and_check("studentTTest", left, right, expected_t, expected_p)
|
||||
|
||||
def test_welch():
    """Compare ClickHouse `welchTTest` with scipy's unequal-variance (Welch) t-test on three random sample pairs.

    The previous version called `studentTTest` with `equal_var = True`, so the
    Welch implementation was never exercised.
    """
    rvs1 = np.round(stats.norm.rvs(loc=1, scale=15,size=500), 5)
    rvs2 = np.round(stats.norm.rvs(loc=10, scale=5,size=500), 5)
    # equal_var = False makes scipy perform Welch's t-test.
    s, p = stats.ttest_ind(rvs1, rvs2, equal_var = False)
    test_and_check("welchTTest", rvs1, rvs2, s, p)

    rvs1 = np.round(stats.norm.rvs(loc=0, scale=7,size=500), 5)
    rvs2 = np.round(stats.norm.rvs(loc=0, scale=3,size=500), 5)
    s, p = stats.ttest_ind(rvs1, rvs2, equal_var = False)
    test_and_check("welchTTest", rvs1, rvs2, s, p)

    rvs1 = np.round(stats.norm.rvs(loc=0, scale=10,size=65536), 5)
    rvs2 = np.round(stats.norm.rvs(loc=5, scale=1,size=65536), 5)
    s, p = stats.ttest_ind(rvs1, rvs2, equal_var = False)
    test_and_check("welchTTest", rvs1, rvs2, s, p)
|
||||
|
||||
if __name__ == "__main__":
    # Run the Student checks, then the Welch checks. The final marker is
    # printed only when neither raised.
    for run_checks in (test_student, test_welch):
        run_checks()
    print("Ok.")
|
Loading…
Reference in New Issue
Block a user