From f764332070c6b365c9de79ce8a36b2d230ac7644 Mon Sep 17 00:00:00 2001 From: nikitamikhaylov Date: Fri, 13 Nov 2020 02:10:58 +0300 Subject: [PATCH] better merge --- docker/test/fasttest/run.sh | 1 + .../AggregateFunctionTTest.h | 20 ++----- src/AggregateFunctions/Moments.h | 21 ++++---- src/AggregateFunctions/StatCommon.h | 4 +- src/AggregateFunctions/ya.make | 2 + .../0_stateless/01558_ttest_scipy.python | 53 +++---------------- .../01561_mann_whitney_scipy.python | 1 - 7 files changed, 27 insertions(+), 75 deletions(-) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index c95344eeca2..b286e57c206 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -288,6 +288,7 @@ TESTS_TO_SKIP=( # Require python libraries like scipy, pandas and numpy 01322_ttest_scipy + 01561_mann_whitney_scipy 01545_system_errors # Checks system.errors diff --git a/src/AggregateFunctions/AggregateFunctionTTest.h b/src/AggregateFunctions/AggregateFunctionTTest.h index 6fddbc70422..10f47a6e516 100644 --- a/src/AggregateFunctions/AggregateFunctionTTest.h +++ b/src/AggregateFunctions/AggregateFunctionTTest.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -27,20 +28,6 @@ class ReadBuffer; class WriteBuffer; -template -static Float64 integrateSimpson(Float64 a, Float64 b, F && func) -{ - const size_t iterations = std::max(1e6, 1e4 * std::abs(std::round(b))); - const long double h = (b - a) / iterations; - Float64 sum_odds = 0.0; - for (size_t i = 1; i < iterations; i += 2) - sum_odds += func(a + i * h); - Float64 sum_evens = 0.0; - for (size_t i = 2; i < iterations; i += 2) - sum_evens += func(a + i * h); - return (func(a) + func(b) + 2 * sum_evens + 4 * sum_odds) * h / 3; -} - static inline Float64 getPValue(Float64 degrees_of_freedom, Float64 t_stat2) { Float64 numerator = integrateSimpson(0, degrees_of_freedom / (t_stat2 + degrees_of_freedom), @@ -98,7 +85,10 @@ public: Float64 value = columns[0]->getFloat64(row_num); UInt8 is_second = columns[1]->getUInt(row_num); - this->data(place).add(value, static_cast(is_second)); + if (is_second) + this->data(place).addY(value); + else + this->data(place).addX(value); } void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override diff --git a/src/AggregateFunctions/Moments.h b/src/AggregateFunctions/Moments.h index 5b08ff4f6fc..c7fcda1ba1e 100644 --- a/src/AggregateFunctions/Moments.h +++ b/src/AggregateFunctions/Moments.h @@ -321,17 +321,18 @@ struct TTestMoments T x2{}; T y2{}; - void add(T value, bool second_sample) + void addX(T value) { - if (second_sample) { - ++ny; - y1 += value; - y2 += value * value; - } else { - ++nx; - x1 += value; - x2 += value * value; - } + ++nx; + x1 += value; + x2 += value * value; + } + + void addY(T value) + { + ++ny; + y1 += value; + y2 += value * value; } void merge(const TTestMoments & rhs) diff --git a/src/AggregateFunctions/StatCommon.h b/src/AggregateFunctions/StatCommon.h index 437b3bbf4d2..e3221ff77aa 100644 --- a/src/AggregateFunctions/StatCommon.h +++ b/src/AggregateFunctions/StatCommon.h @@ -66,8 +66,8 @@ struct StatisticalSample using AllocatorYSample = MixedAlignedArenaAllocator; using SampleY = PODArray; - SampleX x; - SampleY y; + SampleX x{}; + SampleY y{}; size_t size_x{0}; size_t size_y{0}; diff --git a/src/AggregateFunctions/ya.make b/src/AggregateFunctions/ya.make index ea36a6acd91..30b0efdc77f 100644 --- a/src/AggregateFunctions/ya.make +++ b/src/AggregateFunctions/ya.make @@ -44,6 +44,7 @@ SRCS( AggregateFunctionState.cpp AggregateFunctionStatistics.cpp AggregateFunctionStatisticsSimple.cpp + AggregateFunctionStudentTTest.cpp AggregateFunctionSum.cpp AggregateFunctionSumMap.cpp AggregateFunctionTimeSeriesGroupSum.cpp @@ -51,6 +52,7 @@ SRCS( AggregateFunctionUniq.cpp AggregateFunctionUniqCombined.cpp AggregateFunctionUniqUpTo.cpp + AggregateFunctionWelchTTest.cpp AggregateFunctionWindowFunnel.cpp UniqCombinedBiasData.cpp UniqVariadicHash.cpp diff --git a/tests/queries/0_stateless/01558_ttest_scipy.python b/tests/queries/0_stateless/01558_ttest_scipy.python index 5a490bb6421..727ca08c080 100644 --- a/tests/queries/0_stateless/01558_ttest_scipy.python +++ b/tests/queries/0_stateless/01558_ttest_scipy.python @@ -1,55 +1,14 @@ #!/usr/bin/env python3 -import os -import io -import sys -import requests -import time +import os +import sys +from scipy import stats import pandas as pd import numpy as np -from scipy import stats -CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1') -CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123') -CLICKHOUSE_SERVER_URL_STR = 'http://' + ':'.join(str(s) for s in [CLICKHOUSE_HOST, CLICKHOUSE_PORT_HTTP]) + "/" +CURDIR = os.path.dirname(os.path.realpath(__file__)) +sys.path.insert(0, os.path.join(CURDIR, 'helpers')) -class ClickHouseClient: - def __init__(self, host = CLICKHOUSE_SERVER_URL_STR): - self.host = host - - def query(self, query, connection_timeout = 1500): - NUMBER_OF_TRIES = 30 - DELAY = 10 - - for i in range(NUMBER_OF_TRIES): - r = requests.post( - self.host, - params = {'timeout_before_checking_execution_speed': 120, 'max_execution_time': 6000}, - timeout = connection_timeout, - data = query) - if r.status_code == 200: - return r.text - else: - print('ATTENTION: try #%d failed' % i) - if i != (NUMBER_OF_TRIES-1): - print(query) - print(r.text) - time.sleep(DELAY*(i+1)) - else: - raise ValueError(r.text) - - def query_return_df(self, query, connection_timeout = 1500): - data = self.query(query, connection_timeout) - df = pd.read_csv(io.StringIO(data), sep = '\t') - return df - - def query_with_data(self, query, content): - content = content.encode('utf-8') - r = requests.post(self.host, data=content) - result = r.text - if r.status_code == 200: - return result - else: - raise ValueError(r.text) +from pure_http_client import ClickHouseClient def test_and_check(name, a, b, t_stat, p_value): client = ClickHouseClient() diff --git a/tests/queries/0_stateless/01561_mann_whitney_scipy.python b/tests/queries/0_stateless/01561_mann_whitney_scipy.python index 6905c758550..7958e8bbaf1 100644 --- a/tests/queries/0_stateless/01561_mann_whitney_scipy.python +++ b/tests/queries/0_stateless/01561_mann_whitney_scipy.python @@ -11,7 +11,6 @@ sys.path.insert(0, os.path.join(CURDIR, 'helpers')) from pure_http_client import ClickHouseClient - def test_and_check(name, a, b, t_stat, p_value): client = ClickHouseClient() client.query("DROP TABLE IF EXISTS mann_whitney;")