better merge

This commit is contained in:
nikitamikhaylov 2020-11-13 02:10:58 +03:00
parent 3f874af323
commit f764332070
7 changed files with 27 additions and 75 deletions

View File

@ -288,6 +288,7 @@ TESTS_TO_SKIP=(
# Require python libraries like scipy, pandas and numpy # Require python libraries like scipy, pandas and numpy
01322_ttest_scipy 01322_ttest_scipy
01561_mann_whitney_scipy
01545_system_errors 01545_system_errors
# Checks system.errors # Checks system.errors

View File

@ -1,6 +1,7 @@
#pragma once #pragma once
#include <AggregateFunctions/IAggregateFunction.h> #include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnVector.h> #include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h> #include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h> #include <Common/assert_cast.h>
@ -27,20 +28,6 @@ class ReadBuffer;
class WriteBuffer; class WriteBuffer;
/// Approximates the integral of `func` over [a, b] with the composite Simpson's rule.
///
/// The interval is split into max(1e6, 1e4 * |round(b)|) subintervals of equal width.
/// Interior nodes with an odd index are weighted by 4, interior nodes with an even index by 2,
/// and the two endpoints by 1; the weighted sum is then scaled by (step / 3).
///
/// @param a    lower integration bound
/// @param b    upper integration bound (also drives the number of subintervals)
/// @param func callable evaluated at every node; invoked as func(x)
/// @return     the Simpson approximation of the integral
template <typename F>
static Float64 integrateSimpson(Float64 a, Float64 b, F && func)
{
    const size_t steps = std::max(1e6, 1e4 * std::abs(std::round(b)));
    const long double step = (b - a) / steps;

    /// Contribution of the odd-indexed interior nodes (weight 4).
    Float64 odd_total = 0.0;
    size_t node = 1;
    while (node < steps)
    {
        odd_total += func(a + node * step);
        node += 2;
    }

    /// Contribution of the even-indexed interior nodes (weight 2).
    Float64 even_total = 0.0;
    node = 2;
    while (node < steps)
    {
        even_total += func(a + node * step);
        node += 2;
    }

    return (func(a) + func(b) + 2 * even_total + 4 * odd_total) * step / 3;
}
static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2) static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2)
{ {
Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom), Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom),
@ -98,7 +85,10 @@ public:
Float64 value = columns[0]->getFloat64(row_num); Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num); UInt8 is_second = columns[1]->getUInt(row_num);
this->data(place).add(value, static_cast<bool>(is_second)); if (is_second)
this->data(place).addY(value);
else
this->data(place).addX(value);
} }
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override

View File

@ -321,17 +321,18 @@ struct TTestMoments
T x2{}; T x2{};
T y2{}; T y2{};
void add(T value, bool second_sample) void addX(T value)
{ {
if (second_sample) {
++ny;
y1 += value;
y2 += value * value;
} else {
++nx; ++nx;
x1 += value; x1 += value;
x2 += value * value; x2 += value * value;
} }
void addY(T value)
{
++ny;
y1 += value;
y2 += value * value;
} }
void merge(const TTestMoments & rhs) void merge(const TTestMoments & rhs)

View File

@ -66,8 +66,8 @@ struct StatisticalSample
using AllocatorYSample = MixedAlignedArenaAllocator<alignof(Y), 4096>; using AllocatorYSample = MixedAlignedArenaAllocator<alignof(Y), 4096>;
using SampleY = PODArray<Y, 32, AllocatorYSample>; using SampleY = PODArray<Y, 32, AllocatorYSample>;
SampleX x; SampleX x{};
SampleY y; SampleY y{};
size_t size_x{0}; size_t size_x{0};
size_t size_y{0}; size_t size_y{0};

View File

@ -44,6 +44,7 @@ SRCS(
AggregateFunctionState.cpp AggregateFunctionState.cpp
AggregateFunctionStatistics.cpp AggregateFunctionStatistics.cpp
AggregateFunctionStatisticsSimple.cpp AggregateFunctionStatisticsSimple.cpp
AggregateFunctionStudentTTest.cpp
AggregateFunctionSum.cpp AggregateFunctionSum.cpp
AggregateFunctionSumMap.cpp AggregateFunctionSumMap.cpp
AggregateFunctionTimeSeriesGroupSum.cpp AggregateFunctionTimeSeriesGroupSum.cpp
@ -51,6 +52,7 @@ SRCS(
AggregateFunctionUniq.cpp AggregateFunctionUniq.cpp
AggregateFunctionUniqCombined.cpp AggregateFunctionUniqCombined.cpp
AggregateFunctionUniqUpTo.cpp AggregateFunctionUniqUpTo.cpp
AggregateFunctionWelchTTest.cpp
AggregateFunctionWindowFunnel.cpp AggregateFunctionWindowFunnel.cpp
UniqCombinedBiasData.cpp UniqCombinedBiasData.cpp
UniqVariadicHash.cpp UniqVariadicHash.cpp

View File

@ -1,55 +1,14 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os import os
import io
import sys import sys
import requests from scipy import stats
import time
import pandas as pd import pandas as pd
import numpy as np import numpy as np
from scipy import stats
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1') CURDIR = os.path.dirname(os.path.realpath(__file__))
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123') sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
CLICKHOUSE_SERVER_URL_STR = 'http://' + ':'.join(str(s) for s in [CLICKHOUSE_HOST, CLICKHOUSE_PORT_HTTP]) + "/"
class ClickHouseClient: from pure_http_client import ClickHouseClient
def __init__(self, host = CLICKHOUSE_SERVER_URL_STR):
    """Remember the URL that every request of this client is posted to.

    host: base URL of the server; defaults to CLICKHOUSE_SERVER_URL_STR.
    """
    self.host = host
def query(self, query, connection_timeout = 1500):
    """POST `query` to self.host and return the response body as text.

    The request is attempted up to 30 times. After a non-200 response the
    failure is reported on stdout, and the client sleeps 10 * (attempt + 1)
    seconds before trying again.

    query: text sent as the request body.
    connection_timeout: passed to requests as the `timeout` argument.
    Returns: the response text of the first attempt answered with HTTP 200.
    Raises: ValueError carrying the response text when the last attempt fails.
    """
    max_attempts = 30
    base_delay = 10

    for attempt in range(max_attempts):
        response = requests.post(
            self.host,
            params = {'timeout_before_checking_execution_speed': 120, 'max_execution_time': 6000},
            timeout = connection_timeout,
            data = query)

        if response.status_code == 200:
            return response.text

        print('ATTENTION: try #%d failed' % attempt)

        # Last attempt: give up and surface the server's answer to the caller.
        if attempt == (max_attempts - 1):
            raise ValueError(response.text)

        # Otherwise log what was sent and received, then back off linearly.
        print(query)
        print(response.text)
        time.sleep(base_delay * (attempt + 1))
def query_return_df(self, query, connection_timeout = 1500):
    """Run `query` through self.query and parse the reply as tab-separated text.

    query: forwarded unchanged to self.query.
    connection_timeout: forwarded unchanged to self.query.
    Returns: a pandas DataFrame built by pd.read_csv with sep='\\t'.
    """
    raw_text = self.query(query, connection_timeout)
    return pd.read_csv(io.StringIO(raw_text), sep = '\t')
def query_with_data(self, query, content):
    """Send `query` together with a data payload and return the response text.

    The statement is passed as the `query` URL parameter and `content`
    (UTF-8 encoded) as the POST body. Previously `query` was accepted but
    never transmitted, so the server only ever received `content`.

    query: statement the payload belongs to (e.g. an INSERT ... FORMAT clause).
    content: str payload, encoded as UTF-8 before sending.
    Returns: the response text when the server answers with HTTP 200.
    Raises: ValueError carrying the response text for any other status code.
    """
    payload = content.encode('utf-8')
    response = requests.post(self.host, params={'query': query}, data=payload)
    if response.status_code != 200:
        raise ValueError(response.text)
    return response.text
def test_and_check(name, a, b, t_stat, p_value): def test_and_check(name, a, b, t_stat, p_value):
client = ClickHouseClient() client = ClickHouseClient()

View File

@ -11,7 +11,6 @@ sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
from pure_http_client import ClickHouseClient from pure_http_client import ClickHouseClient
def test_and_check(name, a, b, t_stat, p_value): def test_and_check(name, a, b, t_stat, p_value):
client = ClickHouseClient() client = ClickHouseClient()
client.query("DROP TABLE IF EXISTS mann_whitney;") client.query("DROP TABLE IF EXISTS mann_whitney;")