mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
better merge
This commit is contained in:
parent
3f874af323
commit
f764332070
@ -288,6 +288,7 @@ TESTS_TO_SKIP=(
|
|||||||
|
|
||||||
# Require python libraries like scipy, pandas and numpy
|
# Require python libraries like scipy, pandas and numpy
|
||||||
01322_ttest_scipy
|
01322_ttest_scipy
|
||||||
|
01561_mann_whitney_scipy
|
||||||
|
|
||||||
01545_system_errors
|
01545_system_errors
|
||||||
# Checks system.errors
|
# Checks system.errors
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <AggregateFunctions/IAggregateFunction.h>
|
#include <AggregateFunctions/IAggregateFunction.h>
|
||||||
|
#include <AggregateFunctions/StatCommon.h>
|
||||||
#include <Columns/ColumnVector.h>
|
#include <Columns/ColumnVector.h>
|
||||||
#include <Columns/ColumnTuple.h>
|
#include <Columns/ColumnTuple.h>
|
||||||
#include <Common/assert_cast.h>
|
#include <Common/assert_cast.h>
|
||||||
@ -27,20 +28,6 @@ class ReadBuffer;
|
|||||||
class WriteBuffer;
|
class WriteBuffer;
|
||||||
|
|
||||||
|
|
||||||
template <typename F>
|
|
||||||
static Float64 integrateSimpson(Float64 a, Float64 b, F && func)
|
|
||||||
{
|
|
||||||
const size_t iterations = std::max(1e6, 1e4 * std::abs(std::round(b)));
|
|
||||||
const long double h = (b - a) / iterations;
|
|
||||||
Float64 sum_odds = 0.0;
|
|
||||||
for (size_t i = 1; i < iterations; i += 2)
|
|
||||||
sum_odds += func(a + i * h);
|
|
||||||
Float64 sum_evens = 0.0;
|
|
||||||
for (size_t i = 2; i < iterations; i += 2)
|
|
||||||
sum_evens += func(a + i * h);
|
|
||||||
return (func(a) + func(b) + 2 * sum_evens + 4 * sum_odds) * h / 3;
|
|
||||||
}
|
|
||||||
|
|
||||||
static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2)
|
static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2)
|
||||||
{
|
{
|
||||||
Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom),
|
Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom),
|
||||||
@ -98,7 +85,10 @@ public:
|
|||||||
Float64 value = columns[0]->getFloat64(row_num);
|
Float64 value = columns[0]->getFloat64(row_num);
|
||||||
UInt8 is_second = columns[1]->getUInt(row_num);
|
UInt8 is_second = columns[1]->getUInt(row_num);
|
||||||
|
|
||||||
this->data(place).add(value, static_cast<bool>(is_second));
|
if (is_second)
|
||||||
|
this->data(place).addY(value);
|
||||||
|
else
|
||||||
|
this->data(place).addX(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
|
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
|
||||||
|
@ -321,17 +321,18 @@ struct TTestMoments
|
|||||||
T x2{};
|
T x2{};
|
||||||
T y2{};
|
T y2{};
|
||||||
|
|
||||||
void add(T value, bool second_sample)
|
void addX(T value)
|
||||||
{
|
{
|
||||||
if (second_sample) {
|
|
||||||
++ny;
|
|
||||||
y1 += value;
|
|
||||||
y2 += value * value;
|
|
||||||
} else {
|
|
||||||
++nx;
|
++nx;
|
||||||
x1 += value;
|
x1 += value;
|
||||||
x2 += value * value;
|
x2 += value * value;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void addY(T value)
|
||||||
|
{
|
||||||
|
++ny;
|
||||||
|
y1 += value;
|
||||||
|
y2 += value * value;
|
||||||
}
|
}
|
||||||
|
|
||||||
void merge(const TTestMoments & rhs)
|
void merge(const TTestMoments & rhs)
|
||||||
|
@ -66,8 +66,8 @@ struct StatisticalSample
|
|||||||
using AllocatorYSample = MixedAlignedArenaAllocator<alignof(Y), 4096>;
|
using AllocatorYSample = MixedAlignedArenaAllocator<alignof(Y), 4096>;
|
||||||
using SampleY = PODArray<Y, 32, AllocatorYSample>;
|
using SampleY = PODArray<Y, 32, AllocatorYSample>;
|
||||||
|
|
||||||
SampleX x;
|
SampleX x{};
|
||||||
SampleY y;
|
SampleY y{};
|
||||||
size_t size_x{0};
|
size_t size_x{0};
|
||||||
size_t size_y{0};
|
size_t size_y{0};
|
||||||
|
|
||||||
|
@ -44,6 +44,7 @@ SRCS(
|
|||||||
AggregateFunctionState.cpp
|
AggregateFunctionState.cpp
|
||||||
AggregateFunctionStatistics.cpp
|
AggregateFunctionStatistics.cpp
|
||||||
AggregateFunctionStatisticsSimple.cpp
|
AggregateFunctionStatisticsSimple.cpp
|
||||||
|
AggregateFunctionStudentTTest.cpp
|
||||||
AggregateFunctionSum.cpp
|
AggregateFunctionSum.cpp
|
||||||
AggregateFunctionSumMap.cpp
|
AggregateFunctionSumMap.cpp
|
||||||
AggregateFunctionTimeSeriesGroupSum.cpp
|
AggregateFunctionTimeSeriesGroupSum.cpp
|
||||||
@ -51,6 +52,7 @@ SRCS(
|
|||||||
AggregateFunctionUniq.cpp
|
AggregateFunctionUniq.cpp
|
||||||
AggregateFunctionUniqCombined.cpp
|
AggregateFunctionUniqCombined.cpp
|
||||||
AggregateFunctionUniqUpTo.cpp
|
AggregateFunctionUniqUpTo.cpp
|
||||||
|
AggregateFunctionWelchTTest.cpp
|
||||||
AggregateFunctionWindowFunnel.cpp
|
AggregateFunctionWindowFunnel.cpp
|
||||||
UniqCombinedBiasData.cpp
|
UniqCombinedBiasData.cpp
|
||||||
UniqVariadicHash.cpp
|
UniqVariadicHash.cpp
|
||||||
|
@ -1,55 +1,14 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
import os
|
import os
|
||||||
import io
|
|
||||||
import sys
|
import sys
|
||||||
import requests
|
from scipy import stats
|
||||||
import time
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy import stats
|
|
||||||
|
|
||||||
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1')
|
CURDIR = os.path.dirname(os.path.realpath(__file__))
|
||||||
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
|
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
|
||||||
CLICKHOUSE_SERVER_URL_STR = 'http://' + ':'.join(str(s) for s in [CLICKHOUSE_HOST, CLICKHOUSE_PORT_HTTP]) + "/"
|
|
||||||
|
|
||||||
class ClickHouseClient:
|
from pure_http_client import ClickHouseClient
|
||||||
def __init__(self, host = CLICKHOUSE_SERVER_URL_STR):
|
|
||||||
self.host = host
|
|
||||||
|
|
||||||
def query(self, query, connection_timeout = 1500):
|
|
||||||
NUMBER_OF_TRIES = 30
|
|
||||||
DELAY = 10
|
|
||||||
|
|
||||||
for i in range(NUMBER_OF_TRIES):
|
|
||||||
r = requests.post(
|
|
||||||
self.host,
|
|
||||||
params = {'timeout_before_checking_execution_speed': 120, 'max_execution_time': 6000},
|
|
||||||
timeout = connection_timeout,
|
|
||||||
data = query)
|
|
||||||
if r.status_code == 200:
|
|
||||||
return r.text
|
|
||||||
else:
|
|
||||||
print('ATTENTION: try #%d failed' % i)
|
|
||||||
if i != (NUMBER_OF_TRIES-1):
|
|
||||||
print(query)
|
|
||||||
print(r.text)
|
|
||||||
time.sleep(DELAY*(i+1))
|
|
||||||
else:
|
|
||||||
raise ValueError(r.text)
|
|
||||||
|
|
||||||
def query_return_df(self, query, connection_timeout = 1500):
|
|
||||||
data = self.query(query, connection_timeout)
|
|
||||||
df = pd.read_csv(io.StringIO(data), sep = '\t')
|
|
||||||
return df
|
|
||||||
|
|
||||||
def query_with_data(self, query, content):
|
|
||||||
content = content.encode('utf-8')
|
|
||||||
r = requests.post(self.host, data=content)
|
|
||||||
result = r.text
|
|
||||||
if r.status_code == 200:
|
|
||||||
return result
|
|
||||||
else:
|
|
||||||
raise ValueError(r.text)
|
|
||||||
|
|
||||||
def test_and_check(name, a, b, t_stat, p_value):
|
def test_and_check(name, a, b, t_stat, p_value):
|
||||||
client = ClickHouseClient()
|
client = ClickHouseClient()
|
||||||
|
@ -11,7 +11,6 @@ sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
|
|||||||
from pure_http_client import ClickHouseClient
|
from pure_http_client import ClickHouseClient
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def test_and_check(name, a, b, t_stat, p_value):
|
def test_and_check(name, a, b, t_stat, p_value):
|
||||||
client = ClickHouseClient()
|
client = ClickHouseClient()
|
||||||
client.query("DROP TABLE IF EXISTS mann_whitney;")
|
client.query("DROP TABLE IF EXISTS mann_whitney;")
|
||||||
|
Loading…
Reference in New Issue
Block a user