mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-21 15:12:02 +00:00
better merge
This commit is contained in:
parent
3f874af323
commit
f764332070
@ -288,6 +288,7 @@ TESTS_TO_SKIP=(
|
||||
|
||||
# Require python libraries like scipy, pandas and numpy
|
||||
01322_ttest_scipy
|
||||
01561_mann_whitney_scipy
|
||||
|
||||
01545_system_errors
|
||||
# Checks system.errors
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include <AggregateFunctions/IAggregateFunction.h>
|
||||
#include <AggregateFunctions/StatCommon.h>
|
||||
#include <Columns/ColumnVector.h>
|
||||
#include <Columns/ColumnTuple.h>
|
||||
#include <Common/assert_cast.h>
|
||||
@ -27,20 +28,6 @@ class ReadBuffer;
|
||||
class WriteBuffer;
|
||||
|
||||
|
||||
template <typename F>
|
||||
static Float64 integrateSimpson(Float64 a, Float64 b, F && func)
|
||||
{
|
||||
const size_t iterations = std::max(1e6, 1e4 * std::abs(std::round(b)));
|
||||
const long double h = (b - a) / iterations;
|
||||
Float64 sum_odds = 0.0;
|
||||
for (size_t i = 1; i < iterations; i += 2)
|
||||
sum_odds += func(a + i * h);
|
||||
Float64 sum_evens = 0.0;
|
||||
for (size_t i = 2; i < iterations; i += 2)
|
||||
sum_evens += func(a + i * h);
|
||||
return (func(a) + func(b) + 2 * sum_evens + 4 * sum_odds) * h / 3;
|
||||
}
|
||||
|
||||
static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2)
|
||||
{
|
||||
Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom),
|
||||
@ -98,7 +85,10 @@ public:
|
||||
Float64 value = columns[0]->getFloat64(row_num);
|
||||
UInt8 is_second = columns[1]->getUInt(row_num);
|
||||
|
||||
this->data(place).add(value, static_cast<bool>(is_second));
|
||||
if (is_second)
|
||||
this->data(place).addY(value);
|
||||
else
|
||||
this->data(place).addX(value);
|
||||
}
|
||||
|
||||
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
|
||||
|
@ -321,17 +321,18 @@ struct TTestMoments
|
||||
T x2{};
|
||||
T y2{};
|
||||
|
||||
void add(T value, bool second_sample)
|
||||
void addX(T value)
|
||||
{
|
||||
if (second_sample) {
|
||||
++ny;
|
||||
y1 += value;
|
||||
y2 += value * value;
|
||||
} else {
|
||||
++nx;
|
||||
x1 += value;
|
||||
x2 += value * value;
|
||||
}
|
||||
|
||||
/// Accounts for one more observation of the second sample:
/// updates its sum of squares, its sum and its element count.
void addY(T value)
{
    const auto squared = value * value;
    y2 += squared;
    y1 += value;
    ++ny;
}
|
||||
|
||||
void merge(const TTestMoments & rhs)
|
||||
|
@ -66,8 +66,8 @@ struct StatisticalSample
|
||||
using AllocatorYSample = MixedAlignedArenaAllocator<alignof(Y), 4096>;
|
||||
using SampleY = PODArray<Y, 32, AllocatorYSample>;
|
||||
|
||||
SampleX x;
|
||||
SampleY y;
|
||||
SampleX x{};
|
||||
SampleY y{};
|
||||
size_t size_x{0};
|
||||
size_t size_y{0};
|
||||
|
||||
|
@ -44,6 +44,7 @@ SRCS(
|
||||
AggregateFunctionState.cpp
|
||||
AggregateFunctionStatistics.cpp
|
||||
AggregateFunctionStatisticsSimple.cpp
|
||||
AggregateFunctionStudentTTest.cpp
|
||||
AggregateFunctionSum.cpp
|
||||
AggregateFunctionSumMap.cpp
|
||||
AggregateFunctionTimeSeriesGroupSum.cpp
|
||||
@ -51,6 +52,7 @@ SRCS(
|
||||
AggregateFunctionUniq.cpp
|
||||
AggregateFunctionUniqCombined.cpp
|
||||
AggregateFunctionUniqUpTo.cpp
|
||||
AggregateFunctionWelchTTest.cpp
|
||||
AggregateFunctionWindowFunnel.cpp
|
||||
UniqCombinedBiasData.cpp
|
||||
UniqVariadicHash.cpp
|
||||
|
@ -1,55 +1,14 @@
|
||||
#!/usr/bin/env python3
|
||||
import os
|
||||
import io
|
||||
import sys
|
||||
import requests
|
||||
import time
|
||||
from scipy import stats
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from scipy import stats
|
||||
|
||||
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1')
|
||||
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
|
||||
CLICKHOUSE_SERVER_URL_STR = 'http://' + ':'.join(str(s) for s in [CLICKHOUSE_HOST, CLICKHOUSE_PORT_HTTP]) + "/"
|
||||
CURDIR = os.path.dirname(os.path.realpath(__file__))
|
||||
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
|
||||
|
||||
class ClickHouseClient:
    """Minimal ClickHouse client that POSTs statements to the server's HTTP endpoint."""

    def __init__(self, host = CLICKHOUSE_SERVER_URL_STR):
        # URL every request is POSTed to.
        self.host = host

    def query(self, query, connection_timeout = 1500):
        """Send `query` as the POST body and return the response text.

        A non-200 response is retried up to NUMBER_OF_TRIES times, sleeping
        DELAY * (attempt number) seconds between attempts.

        Raises:
            ValueError: with the server's response text when the last attempt
                also returns a non-200 status.
        """
        NUMBER_OF_TRIES = 30
        DELAY = 10

        for i in range(NUMBER_OF_TRIES):
            r = requests.post(
                self.host,
                params = {'timeout_before_checking_execution_speed': 120, 'max_execution_time': 6000},
                timeout = connection_timeout,
                data = query)
            if r.status_code == 200:
                return r.text

            print('ATTENTION: try #%d failed' % i)
            if i != (NUMBER_OF_TRIES-1):
                # Show what failed, then back off a little longer on every attempt.
                print(query)
                print(r.text)
                time.sleep(DELAY*(i+1))
            else:
                raise ValueError(r.text)

    def query_return_df(self, query, connection_timeout = 1500):
        """Run `query` via `self.query` and parse the tab-separated response into a pandas DataFrame."""
        data = self.query(query, connection_timeout)
        df = pd.read_csv(io.StringIO(data), sep = '\t')
        return df

    def query_with_data(self, query, content, connection_timeout = 1500):
        """Send `query` as the `query` URL parameter with `content` (UTF-8 encoded) as the POST body.

        Previously `query` was accepted but never transmitted, so the statement
        was silently dropped; it is now sent. An empty or None `query` adds no
        URL parameter, so callers that put the complete statement into
        `content` keep working. A timeout is applied so that a stuck server
        cannot hang the caller indefinitely.

        Raises:
            ValueError: with the server's response text on a non-200 status.
        """
        content = content.encode('utf-8')
        params = {'query': query} if query else None
        r = requests.post(
            self.host,
            params = params,
            timeout = connection_timeout,
            data = content)
        result = r.text
        if r.status_code == 200:
            return result
        raise ValueError(r.text)
|
||||
from pure_http_client import ClickHouseClient
|
||||
|
||||
def test_and_check(name, a, b, t_stat, p_value):
|
||||
client = ClickHouseClient()
|
||||
|
@ -11,7 +11,6 @@ sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
|
||||
from pure_http_client import ClickHouseClient
|
||||
|
||||
|
||||
|
||||
def test_and_check(name, a, b, t_stat, p_value):
|
||||
client = ClickHouseClient()
|
||||
client.query("DROP TABLE IF EXISTS mann_whitney;")
|
||||
|
Loading…
Reference in New Issue
Block a user