better merge

This commit is contained in:
nikitamikhaylov 2020-11-13 02:10:58 +03:00
parent 3f874af323
commit f764332070
7 changed files with 27 additions and 75 deletions

View File

@ -288,6 +288,7 @@ TESTS_TO_SKIP=(
# Require python libraries like scipy, pandas and numpy
01322_ttest_scipy
01561_mann_whitney_scipy
01545_system_errors
# Checks system.errors

View File

@ -1,6 +1,7 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
@ -27,20 +28,6 @@ class ReadBuffer;
class WriteBuffer;
/// Approximates the definite integral of `func` over [a, b] with the composite Simpson's rule.
/// The number of subintervals is max(1e6, 1e4 * |round(b)|), truncated to size_t.
/// Interior sample points with odd and even indices are accumulated separately,
/// then weighted by 4 and 2 respectively in the final formula.
template <typename F>
static Float64 integrateSimpson(Float64 a, Float64 b, F && func)
{
    const size_t steps = std::max(1e6, 1e4 * std::abs(std::round(b)));
    const long double step = (b - a) / steps;

    /// Contribution of the odd-indexed interior points (weight 4).
    Float64 odd_total = 0.0;
    size_t idx = 1;
    while (idx < steps)
    {
        odd_total += func(a + idx * step);
        idx += 2;
    }

    /// Contribution of the even-indexed interior points (weight 2).
    Float64 even_total = 0.0;
    idx = 2;
    while (idx < steps)
    {
        even_total += func(a + idx * step);
        idx += 2;
    }

    return (func(a) + func(b) + 2 * even_total + 4 * odd_total) * step / 3;
}
static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2)
{
Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom),
@ -98,7 +85,10 @@ public:
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
this->data(place).add(value, static_cast<bool>(is_second));
if (is_second)
this->data(place).addY(value);
else
this->data(place).addX(value);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override

View File

@ -321,17 +321,18 @@ struct TTestMoments
T x2{};
T y2{};
void add(T value, bool second_sample)
void addX(T value)
{
if (second_sample) {
++ny;
y1 += value;
y2 += value * value;
} else {
++nx;
x1 += value;
x2 += value * value;
}
/// Folds one observation into the `y` accumulators:
/// the running sum of squares, the running sum, and the element count.
void addY(T value)
{
    y2 += value * value;
    y1 += value;
    ++ny;
}
void merge(const TTestMoments & rhs)

View File

@ -66,8 +66,8 @@ struct StatisticalSample
using AllocatorYSample = MixedAlignedArenaAllocator<alignof(Y), 4096>;
using SampleY = PODArray<Y, 32, AllocatorYSample>;
SampleX x;
SampleY y;
SampleX x{};
SampleY y{};
size_t size_x{0};
size_t size_y{0};

View File

@ -44,6 +44,7 @@ SRCS(
AggregateFunctionState.cpp
AggregateFunctionStatistics.cpp
AggregateFunctionStatisticsSimple.cpp
AggregateFunctionStudentTTest.cpp
AggregateFunctionSum.cpp
AggregateFunctionSumMap.cpp
AggregateFunctionTimeSeriesGroupSum.cpp
@ -51,6 +52,7 @@ SRCS(
AggregateFunctionUniq.cpp
AggregateFunctionUniqCombined.cpp
AggregateFunctionUniqUpTo.cpp
AggregateFunctionWelchTTest.cpp
AggregateFunctionWindowFunnel.cpp
UniqCombinedBiasData.cpp
UniqVariadicHash.cpp

View File

@ -1,55 +1,14 @@
#!/usr/bin/env python3
import os
import io
import sys
import requests
import time
from scipy import stats
import pandas as pd
import numpy as np
from scipy import stats
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1')
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
CLICKHOUSE_SERVER_URL_STR = 'http://' + ':'.join(str(s) for s in [CLICKHOUSE_HOST, CLICKHOUSE_PORT_HTTP]) + "/"
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
class ClickHouseClient:
    """Small HTTP helper that POSTs queries to the server URL stored in `self.host`."""

    def __init__(self, host = CLICKHOUSE_SERVER_URL_STR):
        self.host = host

    def query(self, query, connection_timeout = 1500):
        """POST `query` and return the response text, retrying on non-200 answers.

        Up to NUMBER_OF_TRIES attempts are made. After each failed attempt except
        the last, the query and the response body are printed and the call sleeps
        for DELAY * (attempt + 1). The last failed attempt raises ValueError
        carrying the response body.
        """
        NUMBER_OF_TRIES = 30
        DELAY = 10

        for attempt in range(NUMBER_OF_TRIES):
            response = requests.post(
                self.host,
                params = {'timeout_before_checking_execution_speed': 120, 'max_execution_time': 6000},
                timeout = connection_timeout,
                data = query)

            if response.status_code == 200:
                return response.text

            print('ATTENTION: try #%d failed' % attempt)

            # The final attempt gives up instead of sleeping again.
            if attempt == (NUMBER_OF_TRIES-1):
                raise ValueError(response.text)

            print(query)
            print(response.text)
            time.sleep(DELAY*(attempt+1))

    def query_return_df(self, query, connection_timeout = 1500):
        """Run `query` via `self.query` and parse the tab-separated response into a DataFrame."""
        raw_text = self.query(query, connection_timeout)
        return pd.read_csv(io.StringIO(raw_text), sep = '\t')

    def query_with_data(self, query, content):
        """POST `content` (encoded as UTF-8) and return the response text.

        Raises ValueError with the response body when the status code is not 200.
        """
        # NOTE(review): `query` is accepted but never sent; only `content` is posted.
        # Confirm with callers whether that is intentional.
        payload = content.encode('utf-8')
        response = requests.post(self.host, data=payload)
        if response.status_code != 200:
            raise ValueError(response.text)
        return response.text
from pure_http_client import ClickHouseClient
def test_and_check(name, a, b, t_stat, p_value):
client = ClickHouseClient()

View File

@ -11,7 +11,6 @@ sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
from pure_http_client import ClickHouseClient
def test_and_check(name, a, b, t_stat, p_value):
client = ClickHouseClient()
client.query("DROP TABLE IF EXISTS mann_whitney;")