mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-04 05:22:17 +00:00
121 lines
3.2 KiB
C++
121 lines
3.2 KiB
C++
#pragma once
|
|
|
|
#include <numeric>
|
|
#include <algorithm>
|
|
#include <utility>
|
|
|
|
#include <base/sort.h>
|
|
|
|
#include <Common/ArenaAllocator.h>
|
|
|
|
#include <IO/WriteHelpers.h>
|
|
#include <IO/ReadHelpers.h>
|
|
|
|
|
|
namespace DB
|
|
{
|
|
/// Forward declaration only; the definition is not part of this header.
struct Settings;
|
|
|
|
/// Error codes thrown by the helpers in this header.
/// They are only declared here (`extern`); the definitions live elsewhere.
namespace ErrorCodes
|
|
{
|
|
/// Thrown by computeRanksAndTieCorrection when all input values are identical.
extern const int BAD_ARGUMENTS;
|
|
}
|
|
|
|
/// Tied values all receive the average of the positions they occupy in sorted order,
/// so a rank can be fractional (e.g. 2.5) and has to be stored as a floating point number.
|
|
using RanksArray = std::vector<Float64>;
|
|
|
|
template <typename Values>
|
|
std::pair<RanksArray, Float64> computeRanksAndTieCorrection(const Values & values)
|
|
{
|
|
const size_t size = values.size();
|
|
/// Save initial positions, than sort indices according to the values.
|
|
std::vector<size_t> indexes(size);
|
|
std::iota(indexes.begin(), indexes.end(), 0);
|
|
std::sort(indexes.begin(), indexes.end(),
|
|
[&] (size_t lhs, size_t rhs) { return values[lhs] < values[rhs]; });
|
|
|
|
size_t left = 0;
|
|
Float64 tie_numenator = 0;
|
|
RanksArray out(size);
|
|
while (left < size)
|
|
{
|
|
size_t right = left;
|
|
while (right < size && values[indexes[left]] == values[indexes[right]])
|
|
++right;
|
|
auto adjusted = (left + right + 1.) / 2.;
|
|
auto count_equal = right - left;
|
|
|
|
/// Scipy implementation throws exception in this case too.
|
|
if (count_equal == size)
|
|
throw Exception("All numbers in both samples are identical", ErrorCodes::BAD_ARGUMENTS);
|
|
|
|
tie_numenator += std::pow(count_equal, 3) - count_equal;
|
|
for (size_t iter = left; iter < right; ++iter)
|
|
out[indexes[iter]] = adjusted;
|
|
left = right;
|
|
}
|
|
return {out, 1 - (tie_numenator / (std::pow(size, 3) - size))};
|
|
}
|
|
|
|
|
|
template <typename X, typename Y>
|
|
struct StatisticalSample
|
|
{
|
|
using AllocatorXSample = MixedAlignedArenaAllocator<alignof(X), 4096>;
|
|
using SampleX = PODArray<X, 32, AllocatorXSample>;
|
|
|
|
using AllocatorYSample = MixedAlignedArenaAllocator<alignof(Y), 4096>;
|
|
using SampleY = PODArray<Y, 32, AllocatorYSample>;
|
|
|
|
SampleX x{};
|
|
SampleY y{};
|
|
size_t size_x{0};
|
|
size_t size_y{0};
|
|
|
|
void addX(X value, Arena * arena)
|
|
{
|
|
if (isNaN(value))
|
|
return;
|
|
|
|
++size_x;
|
|
x.push_back(value, arena);
|
|
}
|
|
|
|
void addY(Y value, Arena * arena)
|
|
{
|
|
if (isNaN(value))
|
|
return;
|
|
|
|
++size_y;
|
|
y.push_back(value, arena);
|
|
}
|
|
|
|
void merge(const StatisticalSample & rhs, Arena * arena)
|
|
{
|
|
size_x += rhs.size_x;
|
|
size_y += rhs.size_y;
|
|
x.insert(rhs.x.begin(), rhs.x.end(), arena);
|
|
y.insert(rhs.y.begin(), rhs.y.end(), arena);
|
|
}
|
|
|
|
void write(WriteBuffer & buf) const
|
|
{
|
|
writeVarUInt(size_x, buf);
|
|
writeVarUInt(size_y, buf);
|
|
buf.write(reinterpret_cast<const char *>(x.data()), size_x * sizeof(x[0]));
|
|
buf.write(reinterpret_cast<const char *>(y.data()), size_y * sizeof(y[0]));
|
|
}
|
|
|
|
void read(ReadBuffer & buf, Arena * arena)
|
|
{
|
|
readVarUInt(size_x, buf);
|
|
readVarUInt(size_y, buf);
|
|
x.resize(size_x, arena);
|
|
y.resize(size_y, arena);
|
|
buf.read(reinterpret_cast<char *>(x.data()), size_x * sizeof(x[0]));
|
|
buf.read(reinterpret_cast<char *>(y.data()), size_y * sizeof(y[0]));
|
|
}
|
|
};
|
|
|
|
}
|