better merge

This commit is contained in:
nikitamikhaylov 2020-11-13 02:10:58 +03:00
parent 3f874af323
commit f764332070
7 changed files with 27 additions and 75 deletions

View File

@ -288,6 +288,7 @@ TESTS_TO_SKIP=(
# Require python libraries like scipy, pandas and numpy
01322_ttest_scipy
01561_mann_whitney_scipy
01545_system_errors
# Checks system.errors

View File

@ -1,6 +1,7 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
@ -27,20 +28,6 @@ class ReadBuffer;
class WriteBuffer;
/// Approximates the definite integral of `func` over [a, b] using the composite Simpson rule.
/// The interval is split into max(1e6, 1e4 * |round(b)|) subintervals; the step width is
/// held in long double, while the partial sums are accumulated in Float64.
template <typename F>
static Float64 integrateSimpson(Float64 a, Float64 b, F && func)
{
    const size_t steps = std::max(1e6, 1e4 * std::abs(std::round(b)));
    const long double width = (b - a) / steps;

    /// Adds up func at the interior nodes first, first + 2, ... that lie strictly below `steps`.
    auto sum_every_other = [&](size_t first)
    {
        Float64 acc = 0.0;
        for (size_t node = first; node < steps; node += 2)
            acc += func(a + node * width);
        return acc;
    };

    /// Odd nodes carry weight 4, even interior nodes weight 2, the two endpoints weight 1.
    const Float64 odd_nodes = sum_every_other(1);
    const Float64 even_nodes = sum_every_other(2);

    return (func(a) + func(b) + 2 * even_nodes + 4 * odd_nodes) * width / 3;
}
static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2)
{
Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom),
@ -98,7 +85,10 @@ public:
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
this->data(place).add(value, static_cast<bool>(is_second));
if (is_second)
this->data(place).addY(value);
else
this->data(place).addX(value);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override

View File

@ -321,17 +321,18 @@ struct TTestMoments
T x2{};
T y2{};
void add(T value, bool second_sample)
void addX(T value)
{
if (second_sample) {
++ny;
y1 += value;
y2 += value * value;
} else {
++nx;
x1 += value;
x2 += value * value;
}
++nx;
x1 += value;
x2 += value * value;
}
/// Folds one observation into the Y-side moments: the running sum of squares (y2),
/// the running sum (y1) and the observation count (ny).
void addY(T value)
{
    y2 += value * value;
    y1 += value;
    ++ny;
}
void merge(const TTestMoments & rhs)

View File

@ -66,8 +66,8 @@ struct StatisticalSample
using AllocatorYSample = MixedAlignedArenaAllocator<alignof(Y), 4096>;
using SampleY = PODArray<Y, 32, AllocatorYSample>;
SampleX x;
SampleY y;
SampleX x{};
SampleY y{};
size_t size_x{0};
size_t size_y{0};

View File

@ -44,6 +44,7 @@ SRCS(
AggregateFunctionState.cpp
AggregateFunctionStatistics.cpp
AggregateFunctionStatisticsSimple.cpp
AggregateFunctionStudentTTest.cpp
AggregateFunctionSum.cpp
AggregateFunctionSumMap.cpp
AggregateFunctionTimeSeriesGroupSum.cpp
@ -51,6 +52,7 @@ SRCS(
AggregateFunctionUniq.cpp
AggregateFunctionUniqCombined.cpp
AggregateFunctionUniqUpTo.cpp
AggregateFunctionWelchTTest.cpp
AggregateFunctionWindowFunnel.cpp
UniqCombinedBiasData.cpp
UniqVariadicHash.cpp

View File

@ -1,55 +1,14 @@
#!/usr/bin/env python3
import os
import io
import sys
import requests
import time
import os
import sys
from scipy import stats
import pandas as pd
import numpy as np
from scipy import stats
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1')
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
CLICKHOUSE_SERVER_URL_STR = 'http://' + ':'.join(str(s) for s in [CLICKHOUSE_HOST, CLICKHOUSE_PORT_HTTP]) + "/"
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
class ClickHouseClient:
    """Small HTTP client that sends queries to a ClickHouse server via `requests`."""

    def __init__(self, host = CLICKHOUSE_SERVER_URL_STR):
        # Base URL every request is POSTed to.
        self.host = host

    def query(self, query, connection_timeout = 1500):
        """POST `query` as the request body and return the response text.

        A non-200 response is retried, up to NUMBER_OF_TRIES attempts in total,
        sleeping DELAY * (attempt number) seconds between attempts.

        Raises:
            ValueError: with the server's response text if the last attempt
                also returns a non-200 status.
        """
        NUMBER_OF_TRIES = 30
        DELAY = 10

        for i in range(NUMBER_OF_TRIES):
            r = requests.post(
                self.host,
                params = {'timeout_before_checking_execution_speed': 120, 'max_execution_time': 6000},
                timeout = connection_timeout,
                data = query)
            if r.status_code == 200:
                return r.text
            else:
                print('ATTENTION: try #%d failed' % i)
                if i != (NUMBER_OF_TRIES-1):
                    # Not the last attempt: log the failure and back off before retrying.
                    print(query)
                    print(r.text)
                    time.sleep(DELAY*(i+1))
                else:
                    raise ValueError(r.text)

    def query_return_df(self, query, connection_timeout = 1500):
        """Run `query` via `self.query` and parse the tab-separated response into a DataFrame."""
        data = self.query(query, connection_timeout)
        df = pd.read_csv(io.StringIO(data), sep = '\t')
        return df

    def query_with_data(self, query, content):
        """POST `content` (UTF-8 encoded) as the body of `query` and return the response text.

        The statement itself travels in the `query` URL parameter and the payload
        in the request body. Previously `query` was accepted but never sent, so
        the server received only the raw data.

        Raises:
            ValueError: with the server's response text on a non-200 status.
        """
        content = content.encode('utf-8')
        r = requests.post(
            self.host,
            params = {'query': query},
            data = content)
        result = r.text
        if r.status_code == 200:
            return result
        else:
            raise ValueError(r.text)
from pure_http_client import ClickHouseClient
def test_and_check(name, a, b, t_stat, p_value):
client = ClickHouseClient()

View File

@ -11,7 +11,6 @@ sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
from pure_http_client import ClickHouseClient
def test_and_check(name, a, b, t_stat, p_value):
client = ClickHouseClient()
client.query("DROP TABLE IF EXISTS mann_whitney;")