Remove useless header files

Alexey Milovidov 2023-11-06 03:49:55 +01:00
parent a710ac6011
commit 184e6f840e
71 changed files with 7491 additions and 7872 deletions

View File

@@ -12,7 +12,7 @@
#include <Common/Config/ConfigReloader.h>
#include <Common/StringUtils/StringUtils.h>
#include <Common/quoteString.h>
#include <Common/TransformEndianness.hpp>
#include <Common/transformEndianness.h>
#include <Core/Settings.h>
#include <Interpreters/executeQuery.h>
#include <Parsers/Access/ASTGrantQuery.h>

View File

@@ -1,7 +1,18 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionAnalysisOfVariance.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <IO/VarInt.h>
#include <array>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <Columns/ColumnNullable.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/Moments.h>
#include "Common/NaNUtils.h"
#include <Common/assert_cast.h>
namespace DB
{
@@ -13,6 +24,82 @@ namespace ErrorCodes
namespace
{
using AggregateFunctionAnalysisOfVarianceData = AnalysisOfVarianceMoments<Float64>;
/// One-way analysis of variance.
/// Provides a statistical test of whether two or more population means are equal (the null hypothesis).
/// Assumes that the observations within each group are normally distributed.
/// Accepts two arguments: a value and the number of the group this value belongs to.
/// Groups are enumerated starting from 0, and there must be at least two groups to perform the test.
/// Moreover, at least one group must contain more than one observation.
class AggregateFunctionAnalysisOfVariance final : public IAggregateFunctionDataHelper<AggregateFunctionAnalysisOfVarianceData, AggregateFunctionAnalysisOfVariance>
{
public:
explicit AggregateFunctionAnalysisOfVariance(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper(arguments, params, createResultType())
{}
DataTypePtr createResultType() const
{
DataTypes types {std::make_shared<DataTypeNumber<Float64>>(), std::make_shared<DataTypeNumber<Float64>>() };
Strings names {"f_statistic", "p_value"};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
String getName() const override { return "analysisOfVariance"; }
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
data(place).add(columns[0]->getFloat64(row_num), columns[1]->getUInt(row_num));
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
data(place).merge(data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
data(place).read(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto f_stat = data(place).getFStatistic();
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
if (unlikely(!std::isfinite(f_stat) || f_stat < 0))
{
column_stat.getData().push_back(std::numeric_limits<Float64>::quiet_NaN());
column_value.getData().push_back(std::numeric_limits<Float64>::quiet_NaN());
return;
}
auto p_value = data(place).getPValue(f_stat);
/// Clamp to [0, 1], because a p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
column_stat.getData().push_back(f_stat);
column_value.getData().push_back(p_value);
}
};
AggregateFunctionPtr createAggregateFunctionAnalysisOfVariance(const std::string & name, const DataTypes & arguments, const Array & parameters, const Settings *)
{
assertNoParameters(name, parameters);
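
For readers unfamiliar with the statistic produced above, here is a minimal standalone sketch of the one-way ANOVA F-statistic, computed from raw groups rather than from incremental moments. It is a hypothetical illustration, not the AnalysisOfVarianceMoments implementation from this diff, and it omits the p-value step (getPValue, which needs the F-distribution CDF).

#include <cstddef>
#include <iostream>
#include <vector>

/// F = (between-group variability) / (within-group variability).
/// Assumes at least two groups and more observations than groups.
double anovaFStatistic(const std::vector<std::vector<double>> & groups)
{
    std::size_t k = groups.size(); /// number of groups
    std::size_t n = 0;             /// total number of observations
    double grand_sum = 0;
    for (const auto & g : groups)
        for (double x : g)
        {
            grand_sum += x;
            ++n;
        }
    double grand_mean = grand_sum / n;

    double ss_between = 0;
    double ss_within = 0;
    for (const auto & g : groups)
    {
        double sum = 0;
        for (double x : g)
            sum += x;
        double mean = sum / g.size();
        ss_between += g.size() * (mean - grand_mean) * (mean - grand_mean);
        for (double x : g)
            ss_within += (x - mean) * (x - mean);
    }
    return (ss_between / (k - 1)) / (ss_within / (n - k));
}

int main()
{
    /// Three groups of three observations; prints 13 for this input.
    std::cout << anovaFStatistic({{1, 2, 3}, {2, 3, 4}, {5, 6, 7}}) << '\n';
}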

View File

@@ -1,97 +0,0 @@
#pragma once
#include <IO/VarInt.h>
#include <IO/WriteHelpers.h>
#include <array>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <Columns/ColumnNullable.h>
#include <Columns/ColumnsCommon.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/Moments.h>
#include "Common/NaNUtils.h"
#include <Common/assert_cast.h>
#include <Core/Types.h>
namespace DB
{
using AggregateFunctionAnalysisOfVarianceData = AnalysisOfVarianceMoments<Float64>;
/// One-way analysis of variance.
/// Provides a statistical test of whether two or more population means are equal (the null hypothesis).
/// Assumes that the observations within each group are normally distributed.
/// Accepts two arguments: a value and the number of the group this value belongs to.
/// Groups are enumerated starting from 0, and there must be at least two groups to perform the test.
/// Moreover, at least one group must contain more than one observation.
class AggregateFunctionAnalysisOfVariance final : public IAggregateFunctionDataHelper<AggregateFunctionAnalysisOfVarianceData, AggregateFunctionAnalysisOfVariance>
{
public:
explicit AggregateFunctionAnalysisOfVariance(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper(arguments, params, createResultType())
{}
DataTypePtr createResultType() const
{
DataTypes types {std::make_shared<DataTypeNumber<Float64>>(), std::make_shared<DataTypeNumber<Float64>>() };
Strings names {"f_statistic", "p_value"};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
String getName() const override { return "analysisOfVariance"; }
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
data(place).add(columns[0]->getFloat64(row_num), columns[1]->getUInt(row_num));
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
data(place).merge(data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
data(place).read(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto f_stat = data(place).getFStatistic();
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
if (unlikely(!std::isfinite(f_stat) || f_stat < 0))
{
column_stat.getData().push_back(std::numeric_limits<Float64>::quiet_NaN());
column_value.getData().push_back(std::numeric_limits<Float64>::quiet_NaN());
return;
}
auto p_value = data(place).getPValue(f_stat);
/// Clamp to [0, 1], because a p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
column_stat.getData().push_back(f_stat);
column_value.getData().push_back(p_value);
}
};
}

View File

@@ -1,12 +1,14 @@
#include <memory>
#include <type_traits>
#include <AggregateFunctions/AggregateFunctionAvg.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionAvgWeighted.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/FactoryHelpers.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
@@ -16,6 +18,88 @@ namespace ErrorCodes
namespace
{
template <typename T>
using AvgWeightedFieldType = std::conditional_t<is_decimal<T>,
std::conditional_t<std::is_same_v<T, Decimal256>, Decimal256, Decimal128>,
std::conditional_t<DecimalOrExtendedInt<T>,
Float64, // there is no way to multiply UInt128 * UInt128, better to cast to Float64
NearestFieldType<T>>>;
template <typename T, typename U>
using MaxFieldType = std::conditional_t<(sizeof(AvgWeightedFieldType<T>) > sizeof(AvgWeightedFieldType<U>)),
AvgWeightedFieldType<T>, AvgWeightedFieldType<U>>;
template <typename Value, typename Weight>
class AggregateFunctionAvgWeighted final :
public AggregateFunctionAvgBase<
MaxFieldType<Value, Weight>, AvgWeightedFieldType<Weight>, AggregateFunctionAvgWeighted<Value, Weight>>
{
public:
using Base = AggregateFunctionAvgBase<
MaxFieldType<Value, Weight>, AvgWeightedFieldType<Weight>, AggregateFunctionAvgWeighted<Value, Weight>>;
using Base::Base;
using Numerator = typename Base::Numerator;
using Denominator = typename Base::Denominator;
using Fraction = typename Base::Fraction;
void NO_SANITIZE_UNDEFINED add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
const auto& weights = static_cast<const ColumnVectorOrDecimal<Weight> &>(*columns[1]);
this->data(place).numerator += static_cast<Numerator>(
static_cast<const ColumnVectorOrDecimal<Value> &>(*columns[0]).getData()[row_num]) *
static_cast<Numerator>(weights.getData()[row_num]);
this->data(place).denominator += static_cast<Denominator>(weights.getData()[row_num]);
}
String getName() const override { return "avgWeighted"; }
#if USE_EMBEDDED_COMPILER
bool isCompilable() const override
{
bool can_be_compiled = Base::isCompilable();
can_be_compiled &= canBeNativeType<Weight>();
return can_be_compiled;
}
void compileAdd(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_ptr, const ValuesWithType & arguments) const override
{
llvm::IRBuilder<> & b = static_cast<llvm::IRBuilder<> &>(builder);
auto * numerator_type = toNativeType<Numerator>(b);
auto * numerator_ptr = aggregate_data_ptr;
auto * numerator_value = b.CreateLoad(numerator_type, numerator_ptr);
auto numerator_data_type = toNativeDataType<Numerator>();
auto * argument = nativeCast(b, arguments[0], numerator_data_type);
auto * weight = nativeCast(b, arguments[1], numerator_data_type);
llvm::Value * value_weight_multiplication = argument->getType()->isIntegerTy() ? b.CreateMul(argument, weight) : b.CreateFMul(argument, weight);
auto * numerator_result_value = numerator_type->isIntegerTy() ? b.CreateAdd(numerator_value, value_weight_multiplication) : b.CreateFAdd(numerator_value, value_weight_multiplication);
b.CreateStore(numerator_result_value, numerator_ptr);
auto * denominator_type = toNativeType<Denominator>(b);
static constexpr size_t denominator_offset = offsetof(Fraction, denominator);
auto * denominator_ptr = b.CreateConstInBoundsGEP1_64(b.getInt8Ty(), aggregate_data_ptr, denominator_offset);
auto * weight_cast_to_denominator = nativeCast(b, arguments[1], toNativeDataType<Denominator>());
auto * denominator_value = b.CreateLoad(denominator_type, denominator_ptr);
auto * denominator_value_updated = denominator_type->isIntegerTy() ? b.CreateAdd(denominator_value, weight_cast_to_denominator) : b.CreateFAdd(denominator_value, weight_cast_to_denominator);
b.CreateStore(denominator_value_updated, denominator_ptr);
}
#endif
};
bool allowTypes(const DataTypePtr& left, const DataTypePtr& right) noexcept
{
const WhichDataType l_dt(left), r_dt(right);
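
As a note on the accumulation above: avgWeighted keeps a separate numerator (the sum of value * weight) and denominator (the sum of weights), and both add() and merge() are plain additions on that pair. A minimal sketch with doubles, assuming away the Decimal and extended-integer promotion that AvgWeightedFieldType / MaxFieldType handle:

#include <iostream>

/// Hypothetical standalone counterpart of the Fraction state used above.
struct WeightedAvgState
{
    double numerator = 0;
    double denominator = 0;

    void add(double value, double weight)
    {
        numerator += value * weight;
        denominator += weight;
    }

    void merge(const WeightedAvgState & rhs)
    {
        numerator += rhs.numerator;
        denominator += rhs.denominator;
    }

    double result() const { return numerator / denominator; }
};

int main()
{
    WeightedAvgState state;
    state.add(10.0, 1.0);
    state.add(20.0, 3.0);
    std::cout << state.result() << '\n'; /// (10*1 + 20*3) / (1 + 3) = 17.5
}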

View File

@@ -1,90 +0,0 @@
#pragma once
#include <type_traits>
#include <AggregateFunctions/AggregateFunctionAvg.h>
namespace DB
{
struct Settings;
template <typename T>
using AvgWeightedFieldType = std::conditional_t<is_decimal<T>,
std::conditional_t<std::is_same_v<T, Decimal256>, Decimal256, Decimal128>,
std::conditional_t<DecimalOrExtendedInt<T>,
Float64, // there is no way to multiply UInt128 * UInt128, better to cast to Float64
NearestFieldType<T>>>;
template <typename T, typename U>
using MaxFieldType = std::conditional_t<(sizeof(AvgWeightedFieldType<T>) > sizeof(AvgWeightedFieldType<U>)),
AvgWeightedFieldType<T>, AvgWeightedFieldType<U>>;
template <typename Value, typename Weight>
class AggregateFunctionAvgWeighted final :
public AggregateFunctionAvgBase<
MaxFieldType<Value, Weight>, AvgWeightedFieldType<Weight>, AggregateFunctionAvgWeighted<Value, Weight>>
{
public:
using Base = AggregateFunctionAvgBase<
MaxFieldType<Value, Weight>, AvgWeightedFieldType<Weight>, AggregateFunctionAvgWeighted<Value, Weight>>;
using Base::Base;
using Numerator = typename Base::Numerator;
using Denominator = typename Base::Denominator;
using Fraction = typename Base::Fraction;
void NO_SANITIZE_UNDEFINED add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
const auto& weights = static_cast<const ColumnVectorOrDecimal<Weight> &>(*columns[1]);
this->data(place).numerator += static_cast<Numerator>(
static_cast<const ColumnVectorOrDecimal<Value> &>(*columns[0]).getData()[row_num]) *
static_cast<Numerator>(weights.getData()[row_num]);
this->data(place).denominator += static_cast<Denominator>(weights.getData()[row_num]);
}
String getName() const override { return "avgWeighted"; }
#if USE_EMBEDDED_COMPILER
bool isCompilable() const override
{
bool can_be_compiled = Base::isCompilable();
can_be_compiled &= canBeNativeType<Weight>();
return can_be_compiled;
}
void compileAdd(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_ptr, const ValuesWithType & arguments) const override
{
llvm::IRBuilder<> & b = static_cast<llvm::IRBuilder<> &>(builder);
auto * numerator_type = toNativeType<Numerator>(b);
auto * numerator_ptr = aggregate_data_ptr;
auto * numerator_value = b.CreateLoad(numerator_type, numerator_ptr);
auto numerator_data_type = toNativeDataType<Numerator>();
auto * argument = nativeCast(b, arguments[0], numerator_data_type);
auto * weight = nativeCast(b, arguments[1], numerator_data_type);
llvm::Value * value_weight_multiplication = argument->getType()->isIntegerTy() ? b.CreateMul(argument, weight) : b.CreateFMul(argument, weight);
auto * numerator_result_value = numerator_type->isIntegerTy() ? b.CreateAdd(numerator_value, value_weight_multiplication) : b.CreateFAdd(numerator_value, value_weight_multiplication);
b.CreateStore(numerator_result_value, numerator_ptr);
auto * denominator_type = toNativeType<Denominator>(b);
static constexpr size_t denominator_offset = offsetof(Fraction, denominator);
auto * denominator_ptr = b.CreateConstInBoundsGEP1_64(b.getInt8Ty(), aggregate_data_ptr, denominator_offset);
auto * weight_cast_to_denominator = nativeCast(b, arguments[1], toNativeDataType<Denominator>());
auto * denominator_value = b.CreateLoad(denominator_type, denominator_ptr);
auto * denominator_value_updated = denominator_type->isIntegerTy() ? b.CreateAdd(denominator_value, weight_cast_to_denominator) : b.CreateFAdd(denominator_value, weight_cast_to_denominator);
b.CreateStore(denominator_value_updated, denominator_ptr);
}
#endif
};
}

View File

@@ -1,11 +1,27 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionBitwise.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnVector.h>
#include <Common/assert_cast.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include "config.h"
#if USE_EMBEDDED_COMPILER
# include <llvm/IR/IRBuilder.h>
# include <DataTypes/Native.h>
#endif
namespace DB
{
struct Settings;
namespace ErrorCodes
@@ -16,6 +32,179 @@ namespace ErrorCodes
namespace
{
template <typename T>
struct AggregateFunctionGroupBitOrData
{
T value = 0;
static const char * name() { return "groupBitOr"; }
void update(T x) { value |= x; }
#if USE_EMBEDDED_COMPILER
static void compileCreate(llvm::IRBuilderBase & builder, llvm::Value * value_ptr)
{
auto type = toNativeType<T>(builder);
builder.CreateStore(llvm::Constant::getNullValue(type), value_ptr);
}
static llvm::Value* compileUpdate(llvm::IRBuilderBase & builder, llvm::Value * lhs, llvm::Value * rhs)
{
return builder.CreateOr(lhs, rhs);
}
#endif
};
template <typename T>
struct AggregateFunctionGroupBitAndData
{
T value = -1; /// Two's complement arithmetic, sign extension.
static const char * name() { return "groupBitAnd"; }
void update(T x) { value &= x; }
#if USE_EMBEDDED_COMPILER
static void compileCreate(llvm::IRBuilderBase & builder, llvm::Value * value_ptr)
{
auto type = toNativeType<T>(builder);
builder.CreateStore(llvm::ConstantInt::get(type, -1), value_ptr);
}
static llvm::Value* compileUpdate(llvm::IRBuilderBase & builder, llvm::Value * lhs, llvm::Value * rhs)
{
return builder.CreateAnd(lhs, rhs);
}
#endif
};
template <typename T>
struct AggregateFunctionGroupBitXorData
{
T value = 0;
static const char * name() { return "groupBitXor"; }
void update(T x) { value ^= x; }
#if USE_EMBEDDED_COMPILER
static void compileCreate(llvm::IRBuilderBase & builder, llvm::Value * value_ptr)
{
auto type = toNativeType<T>(builder);
builder.CreateStore(llvm::Constant::getNullValue(type), value_ptr);
}
static llvm::Value* compileUpdate(llvm::IRBuilderBase & builder, llvm::Value * lhs, llvm::Value * rhs)
{
return builder.CreateXor(lhs, rhs);
}
#endif
};
/// Computes a bitwise operation over numbers.
template <typename T, typename Data>
class AggregateFunctionBitwise final : public IAggregateFunctionDataHelper<Data, AggregateFunctionBitwise<T, Data>>
{
public:
explicit AggregateFunctionBitwise(const DataTypePtr & type)
: IAggregateFunctionDataHelper<Data, AggregateFunctionBitwise<T, Data>>({type}, {}, createResultType())
{}
String getName() const override { return Data::name(); }
static DataTypePtr createResultType()
{
return std::make_shared<DataTypeNumber<T>>();
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).update(assert_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num]);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).update(this->data(rhs).value);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
writeBinary(this->data(place).value, buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
readBinary(this->data(place).value, buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnVector<T> &>(to).getData().push_back(this->data(place).value);
}
#if USE_EMBEDDED_COMPILER
bool isCompilable() const override
{
auto return_type = this->getResultType();
return canBeNativeType(*return_type);
}
void compileCreate(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_ptr) const override
{
auto * value_ptr = aggregate_data_ptr;
Data::compileCreate(builder, value_ptr);
}
void compileAdd(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_ptr, const ValuesWithType & arguments) const override
{
llvm::IRBuilder<> & b = static_cast<llvm::IRBuilder<> &>(builder);
auto * return_type = toNativeType(b, this->getResultType());
auto * value_ptr = aggregate_data_ptr;
auto * value = b.CreateLoad(return_type, value_ptr);
auto * result_value = Data::compileUpdate(builder, value, arguments[0].value);
b.CreateStore(result_value, value_ptr);
}
void compileMerge(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_dst_ptr, llvm::Value * aggregate_data_src_ptr) const override
{
llvm::IRBuilder<> & b = static_cast<llvm::IRBuilder<> &>(builder);
auto * return_type = toNativeType(b, this->getResultType());
auto * value_dst_ptr = aggregate_data_dst_ptr;
auto * value_dst = b.CreateLoad(return_type, value_dst_ptr);
auto * value_src_ptr = aggregate_data_src_ptr;
auto * value_src = b.CreateLoad(return_type, value_src_ptr);
auto * result_value = Data::compileUpdate(builder, value_dst, value_src);
b.CreateStore(result_value, value_dst_ptr);
}
llvm::Value * compileGetResult(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_ptr) const override
{
llvm::IRBuilder<> & b = static_cast<llvm::IRBuilder<> &>(builder);
auto * return_type = toNativeType(b, this->getResultType());
auto * value_ptr = aggregate_data_ptr;
return b.CreateLoad(return_type, value_ptr);
}
#endif
};
template <template <typename> class Data>
AggregateFunctionPtr createAggregateFunctionBitwise(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{
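
Each of the three Data structs above is just an identity element plus an associative, commutative update, which is why the same compileUpdate serves both add and merge. A standalone sketch of that fold, assuming nothing from ClickHouse:

#include <cstdint>
#include <iostream>
#include <vector>

/// Fold an initial value through a bitwise update; the same op handles
/// per-row accumulation and state merging.
template <typename T, typename Op>
T bitFold(const std::vector<T> & values, T init, Op op)
{
    T acc = init;
    for (T v : values)
        acc = op(acc, v);
    return acc;
}

int main()
{
    std::vector<uint8_t> xs{0b0011, 0b0110, 0b0101};
    auto bit_or  = bitFold<uint8_t>(xs, 0, [](uint8_t a, uint8_t b) { return uint8_t(a | b); });
    auto bit_and = bitFold<uint8_t>(xs, uint8_t(-1), [](uint8_t a, uint8_t b) { return uint8_t(a & b); }); /// identity is all ones
    auto bit_xor = bitFold<uint8_t>(xs, 0, [](uint8_t a, uint8_t b) { return uint8_t(a ^ b); });
    std::cout << int(bit_or) << ' ' << int(bit_and) << ' ' << int(bit_xor) << '\n'; /// prints: 7 0 0
}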

View File

@@ -1,197 +0,0 @@
#pragma once
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnVector.h>
#include <Common/assert_cast.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include "config.h"
#if USE_EMBEDDED_COMPILER
# include <llvm/IR/IRBuilder.h>
# include <DataTypes/Native.h>
#endif
namespace DB
{
struct Settings;
template <typename T>
struct AggregateFunctionGroupBitOrData
{
T value = 0;
static const char * name() { return "groupBitOr"; }
void update(T x) { value |= x; }
#if USE_EMBEDDED_COMPILER
static void compileCreate(llvm::IRBuilderBase & builder, llvm::Value * value_ptr)
{
auto type = toNativeType<T>(builder);
builder.CreateStore(llvm::Constant::getNullValue(type), value_ptr);
}
static llvm::Value* compileUpdate(llvm::IRBuilderBase & builder, llvm::Value * lhs, llvm::Value * rhs)
{
return builder.CreateOr(lhs, rhs);
}
#endif
};
template <typename T>
struct AggregateFunctionGroupBitAndData
{
T value = -1; /// Two's complement arithmetic, sign extension.
static const char * name() { return "groupBitAnd"; }
void update(T x) { value &= x; }
#if USE_EMBEDDED_COMPILER
static void compileCreate(llvm::IRBuilderBase & builder, llvm::Value * value_ptr)
{
auto type = toNativeType<T>(builder);
builder.CreateStore(llvm::ConstantInt::get(type, -1), value_ptr);
}
static llvm::Value* compileUpdate(llvm::IRBuilderBase & builder, llvm::Value * lhs, llvm::Value * rhs)
{
return builder.CreateAnd(lhs, rhs);
}
#endif
};
template <typename T>
struct AggregateFunctionGroupBitXorData
{
T value = 0;
static const char * name() { return "groupBitXor"; }
void update(T x) { value ^= x; }
#if USE_EMBEDDED_COMPILER
static void compileCreate(llvm::IRBuilderBase & builder, llvm::Value * value_ptr)
{
auto type = toNativeType<T>(builder);
builder.CreateStore(llvm::Constant::getNullValue(type), value_ptr);
}
static llvm::Value* compileUpdate(llvm::IRBuilderBase & builder, llvm::Value * lhs, llvm::Value * rhs)
{
return builder.CreateXor(lhs, rhs);
}
#endif
};
/// Computes a bitwise operation over numbers.
template <typename T, typename Data>
class AggregateFunctionBitwise final : public IAggregateFunctionDataHelper<Data, AggregateFunctionBitwise<T, Data>>
{
public:
explicit AggregateFunctionBitwise(const DataTypePtr & type)
: IAggregateFunctionDataHelper<Data, AggregateFunctionBitwise<T, Data>>({type}, {}, createResultType())
{}
String getName() const override { return Data::name(); }
static DataTypePtr createResultType()
{
return std::make_shared<DataTypeNumber<T>>();
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).update(assert_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num]);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).update(this->data(rhs).value);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
writeBinary(this->data(place).value, buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
readBinary(this->data(place).value, buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnVector<T> &>(to).getData().push_back(this->data(place).value);
}
#if USE_EMBEDDED_COMPILER
bool isCompilable() const override
{
auto return_type = this->getResultType();
return canBeNativeType(*return_type);
}
void compileCreate(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_ptr) const override
{
auto * value_ptr = aggregate_data_ptr;
Data::compileCreate(builder, value_ptr);
}
void compileAdd(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_ptr, const ValuesWithType & arguments) const override
{
llvm::IRBuilder<> & b = static_cast<llvm::IRBuilder<> &>(builder);
auto * return_type = toNativeType(b, this->getResultType());
auto * value_ptr = aggregate_data_ptr;
auto * value = b.CreateLoad(return_type, value_ptr);
auto * result_value = Data::compileUpdate(builder, value, arguments[0].value);
b.CreateStore(result_value, value_ptr);
}
void compileMerge(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_dst_ptr, llvm::Value * aggregate_data_src_ptr) const override
{
llvm::IRBuilder<> & b = static_cast<llvm::IRBuilder<> &>(builder);
auto * return_type = toNativeType(b, this->getResultType());
auto * value_dst_ptr = aggregate_data_dst_ptr;
auto * value_dst = b.CreateLoad(return_type, value_dst_ptr);
auto * value_src_ptr = aggregate_data_src_ptr;
auto * value_src = b.CreateLoad(return_type, value_src_ptr);
auto * result_value = Data::compileUpdate(builder, value_dst, value_src);
b.CreateStore(result_value, value_dst_ptr);
}
llvm::Value * compileGetResult(llvm::IRBuilderBase & builder, llvm::Value * aggregate_data_ptr) const override
{
llvm::IRBuilder<> & b = static_cast<llvm::IRBuilder<> &>(builder);
auto * return_type = toNativeType(b, this->getResultType());
auto * value_ptr = aggregate_data_ptr;
return b.CreateLoad(return_type, value_ptr);
}
#endif
};
}

View File

@@ -1,7 +1,14 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionBoundingRatio.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnsNumber.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <Common/assert_cast.h>
#include <Common/transformEndianness.h>
namespace DB
{
@@ -10,11 +17,169 @@ struct Settings;
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
}
namespace
{
/** Tracks the leftmost and rightmost (x, y) data points.
*/
struct AggregateFunctionBoundingRatioData
{
struct Point
{
Float64 x;
Float64 y;
};
bool empty = true;
Point left;
Point right;
void add(Float64 x, Float64 y)
{
Point point{x, y};
if (empty)
{
left = point;
right = point;
empty = false;
}
else if (point.x < left.x)
{
left = point;
}
else if (point.x > right.x)
{
right = point;
}
}
void merge(const AggregateFunctionBoundingRatioData & other)
{
if (empty)
{
*this = other;
}
else
{
if (other.left.x < left.x)
left = other.left;
if (other.right.x > right.x)
right = other.right;
}
}
void serialize(WriteBuffer & buf) const;
void deserialize(ReadBuffer & buf);
};
template <std::endian endian>
inline void transformEndianness(AggregateFunctionBoundingRatioData::Point & p)
{
DB::transformEndianness<endian>(p.x);
DB::transformEndianness<endian>(p.y);
}
void AggregateFunctionBoundingRatioData::serialize(WriteBuffer & buf) const
{
writeBinaryLittleEndian(empty, buf);
if (!empty)
{
writeBinaryLittleEndian(left, buf);
writeBinaryLittleEndian(right, buf);
}
}
void AggregateFunctionBoundingRatioData::deserialize(ReadBuffer & buf)
{
readBinaryLittleEndian(empty, buf);
if (!empty)
{
readBinaryLittleEndian(left, buf);
readBinaryLittleEndian(right, buf);
}
}
inline void writeBinary(const AggregateFunctionBoundingRatioData::Point & p, WriteBuffer & buf)
{
writePODBinary(p, buf);
}
inline void readBinary(AggregateFunctionBoundingRatioData::Point & p, ReadBuffer & buf)
{
readPODBinary(p, buf);
}
class AggregateFunctionBoundingRatio final : public IAggregateFunctionDataHelper<AggregateFunctionBoundingRatioData, AggregateFunctionBoundingRatio>
{
private:
/** Calculates the slope of a line between leftmost and rightmost data points.
* (y2 - y1) / (x2 - x1)
*/
static Float64 NO_SANITIZE_UNDEFINED getBoundingRatio(const AggregateFunctionBoundingRatioData & data)
{
if (data.empty)
return std::numeric_limits<Float64>::quiet_NaN();
return (data.right.y - data.left.y) / (data.right.x - data.left.x);
}
public:
String getName() const override
{
return "boundingRatio";
}
explicit AggregateFunctionBoundingRatio(const DataTypes & arguments)
: IAggregateFunctionDataHelper<AggregateFunctionBoundingRatioData, AggregateFunctionBoundingRatio>(arguments, {}, std::make_shared<DataTypeFloat64>())
{
const auto * x_arg = arguments.at(0).get();
const auto * y_arg = arguments.at(1).get();
if (!x_arg->isValueRepresentedByNumber() || !y_arg->isValueRepresentedByNumber())
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Illegal types of arguments of aggregate function {}, must have number representation.",
getName());
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, const size_t row_num, Arena *) const override
{
/// NOTE Slightly inefficient.
const auto x = columns[0]->getFloat64(row_num);
const auto y = columns[1]->getFloat64(row_num);
data(place).add(x, y);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
data(place).merge(data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnFloat64 &>(to).getData().push_back(getBoundingRatio(data(place)));
}
};
AggregateFunctionPtr createAggregateFunctionRate(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{
assertNoParameters(name, parameters);
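
Stripped of the aggregate-function plumbing, boundingRatio tracks the two points with extreme x and returns the slope between them. A hypothetical standalone version (a free function here, not the class above):

#include <iostream>
#include <limits>
#include <vector>

struct Point { double x; double y; };

double boundingRatio(const std::vector<Point> & points)
{
    if (points.empty())
        return std::numeric_limits<double>::quiet_NaN();

    Point left = points.front();
    Point right = points.front();
    for (const auto & p : points)
    {
        if (p.x < left.x)
            left = p;
        if (p.x > right.x)
            right = p;
    }
    /// Slope between the extreme points: (y2 - y1) / (x2 - x1).
    return (right.y - left.y) / (right.x - left.x);
}

int main()
{
    std::cout << boundingRatio({{0, 1}, {2, 5}, {4, 9}}) << '\n'; /// (9 - 1) / (4 - 0) = 2
}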

View File

@@ -1,177 +0,0 @@
#pragma once
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnsNumber.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <Common/assert_cast.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
/** Tracks the leftmost and rightmost (x, y) data points.
*/
struct AggregateFunctionBoundingRatioData
{
struct Point
{
Float64 x;
Float64 y;
};
bool empty = true;
Point left;
Point right;
void add(Float64 x, Float64 y)
{
Point point{x, y};
if (empty)
{
left = point;
right = point;
empty = false;
}
else if (point.x < left.x)
{
left = point;
}
else if (point.x > right.x)
{
right = point;
}
}
void merge(const AggregateFunctionBoundingRatioData & other)
{
if (empty)
{
*this = other;
}
else
{
if (other.left.x < left.x)
left = other.left;
if (other.right.x > right.x)
right = other.right;
}
}
void serialize(WriteBuffer & buf) const;
void deserialize(ReadBuffer & buf);
};
template <std::endian endian>
inline void transformEndianness(AggregateFunctionBoundingRatioData::Point & p)
{
transformEndianness<endian>(p.x);
transformEndianness<endian>(p.y);
}
void AggregateFunctionBoundingRatioData::serialize(WriteBuffer & buf) const
{
writeBinaryLittleEndian(empty, buf);
if (!empty)
{
writeBinaryLittleEndian(left, buf);
writeBinaryLittleEndian(right, buf);
}
}
void AggregateFunctionBoundingRatioData::deserialize(ReadBuffer & buf)
{
readBinaryLittleEndian(empty, buf);
if (!empty)
{
readBinaryLittleEndian(left, buf);
readBinaryLittleEndian(right, buf);
}
}
inline void writeBinary(const AggregateFunctionBoundingRatioData::Point & p, WriteBuffer & buf)
{
writePODBinary(p, buf);
}
inline void readBinary(AggregateFunctionBoundingRatioData::Point & p, ReadBuffer & buf)
{
readPODBinary(p, buf);
}
class AggregateFunctionBoundingRatio final : public IAggregateFunctionDataHelper<AggregateFunctionBoundingRatioData, AggregateFunctionBoundingRatio>
{
private:
/** Calculates the slope of a line between leftmost and rightmost data points.
* (y2 - y1) / (x2 - x1)
*/
static Float64 NO_SANITIZE_UNDEFINED getBoundingRatio(const AggregateFunctionBoundingRatioData & data)
{
if (data.empty)
return std::numeric_limits<Float64>::quiet_NaN();
return (data.right.y - data.left.y) / (data.right.x - data.left.x);
}
public:
String getName() const override
{
return "boundingRatio";
}
explicit AggregateFunctionBoundingRatio(const DataTypes & arguments)
: IAggregateFunctionDataHelper<AggregateFunctionBoundingRatioData, AggregateFunctionBoundingRatio>(arguments, {}, std::make_shared<DataTypeFloat64>())
{
const auto * x_arg = arguments.at(0).get();
const auto * y_arg = arguments.at(1).get();
if (!x_arg->isValueRepresentedByNumber() || !y_arg->isValueRepresentedByNumber())
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Illegal types of arguments of aggregate function {}, must have number representation.",
getName());
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, const size_t row_num, Arena *) const override
{
/// NOTE Slightly inefficient.
const auto x = columns[0]->getFloat64(row_num);
const auto y = columns[1]->getFloat64(row_num);
data(place).add(x, y);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
data(place).merge(data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnFloat64 &>(to).getData().push_back(getBoundingRatio(data(place)));
}
};
}

View File

@@ -1,9 +1,15 @@
#include <AggregateFunctions/AggregateFunctionDeltaSum.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypesNumber.h>
#include <AggregateFunctions/IAggregateFunction.h>
namespace DB
{
@@ -18,6 +24,113 @@ namespace ErrorCodes
namespace
{
template <typename T>
struct AggregationFunctionDeltaSumData
{
T sum = 0;
T last = 0;
T first = 0;
bool seen = false;
};
template <typename T>
class AggregationFunctionDeltaSum final
: public IAggregateFunctionDataHelper<AggregationFunctionDeltaSumData<T>, AggregationFunctionDeltaSum<T>>
{
public:
AggregationFunctionDeltaSum(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<AggregationFunctionDeltaSumData<T>, AggregationFunctionDeltaSum<T>>{arguments, params, createResultType()}
{}
AggregationFunctionDeltaSum()
: IAggregateFunctionDataHelper<AggregationFunctionDeltaSumData<T>, AggregationFunctionDeltaSum<T>>{}
{}
String getName() const override { return "deltaSum"; }
static DataTypePtr createResultType() { return std::make_shared<DataTypeNumber<T>>(); }
bool allocatesMemoryInArena() const override { return false; }
void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
auto value = assert_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num];
if ((this->data(place).last < value) && this->data(place).seen)
{
this->data(place).sum += (value - this->data(place).last);
}
this->data(place).last = value;
if (!this->data(place).seen)
{
this->data(place).first = value;
this->data(place).seen = true;
}
}
void NO_SANITIZE_UNDEFINED ALWAYS_INLINE merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
auto place_data = &this->data(place);
auto rhs_data = &this->data(rhs);
if ((place_data->last < rhs_data->first) && place_data->seen && rhs_data->seen)
{
// If the last number the lhs saw is less than the first number the rhs saw, the lhs comes
// before the rhs, for example [0, 2] [4, 7]. So we want to add the delta sums, but also add
// the difference between the lhs last number and the rhs first number (the 2 and the 4).
// Then we take the last value from the rhs, so first and last become 0 and 7.
place_data->sum += rhs_data->sum + (rhs_data->first - place_data->last);
place_data->last = rhs_data->last;
}
else if ((rhs_data->first < place_data->last && rhs_data->seen && place_data->seen))
{
// In the opposite scenario, the lhs comes after the rhs, e.g. [4, 6] [1, 2]. Since we
// assume the input interval states are sorted by time, we assume this is a counter
// reset, and therefore do *not* add the difference between our first value and the
// rhs last value.
place_data->sum += rhs_data->sum;
place_data->last = rhs_data->last;
}
else if (rhs_data->seen && !place_data->seen)
{
// If we're here then the lhs is an empty state and the rhs does have some state, so
// we'll just take that state.
place_data->first = rhs_data->first;
place_data->last = rhs_data->last;
place_data->sum = rhs_data->sum;
place_data->seen = rhs_data->seen;
}
// Otherwise lhs either has data or is uninitialized, so we don't need to modify its values.
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
writeBinaryLittleEndian(this->data(place).sum, buf);
writeBinaryLittleEndian(this->data(place).first, buf);
writeBinaryLittleEndian(this->data(place).last, buf);
writeBinaryLittleEndian(this->data(place).seen, buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
readBinaryLittleEndian(this->data(place).sum, buf);
readBinaryLittleEndian(this->data(place).first, buf);
readBinaryLittleEndian(this->data(place).last, buf);
readBinaryLittleEndian(this->data(place).seen, buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnVector<T> &>(to).getData().push_back(this->data(place).sum);
}
};
AggregateFunctionPtr createAggregateFunctionDeltaSum(
const String & name,
const DataTypes & arguments,
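
The three commented merge branches above are easier to follow in isolation. A standalone sketch with doubles, mirroring the semantics of AggregationFunctionDeltaSumData (only positive increments are summed; when the rhs precedes the lhs it is treated as a counter reset and the gap contributes nothing; equal boundaries leave the lhs untouched):

#include <iostream>

struct DeltaSumState
{
    double sum = 0;
    double first = 0;
    double last = 0;
    bool seen = false;

    void add(double value)
    {
        if (seen && last < value)
            sum += value - last; /// only count increases
        last = value;
        if (!seen)
        {
            first = value;
            seen = true;
        }
    }

    void merge(const DeltaSumState & rhs)
    {
        if (seen && rhs.seen && last < rhs.first)
        {
            /// lhs comes before rhs: bridge the gap between last and rhs.first.
            sum += rhs.sum + (rhs.first - last);
            last = rhs.last;
        }
        else if (seen && rhs.seen && rhs.first < last)
        {
            /// rhs comes before lhs: assume a counter reset, so no gap is added.
            sum += rhs.sum;
            last = rhs.last;
        }
        else if (rhs.seen && !seen)
        {
            *this = rhs; /// lhs was empty, take the rhs state wholesale.
        }
    }
};

int main()
{
    DeltaSumState a;
    a.add(0);
    a.add(2); /// interval [0, 2], sum = 2
    DeltaSumState b;
    b.add(4);
    b.add(7); /// interval [4, 7], sum = 3
    a.merge(b); /// bridging the gap adds 4 - 2 = 2
    std::cout << a.sum << '\n'; /// prints: 7
}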

View File

@@ -1,126 +0,0 @@
#pragma once
#include <type_traits>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypesNumber.h>
#include <AggregateFunctions/IAggregateFunction.h>
namespace DB
{
struct Settings;
template <typename T>
struct AggregationFunctionDeltaSumData
{
T sum = 0;
T last = 0;
T first = 0;
bool seen = false;
};
template <typename T>
class AggregationFunctionDeltaSum final
: public IAggregateFunctionDataHelper<AggregationFunctionDeltaSumData<T>, AggregationFunctionDeltaSum<T>>
{
public:
AggregationFunctionDeltaSum(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<AggregationFunctionDeltaSumData<T>, AggregationFunctionDeltaSum<T>>{arguments, params, createResultType()}
{}
AggregationFunctionDeltaSum()
: IAggregateFunctionDataHelper<AggregationFunctionDeltaSumData<T>, AggregationFunctionDeltaSum<T>>{}
{}
String getName() const override { return "deltaSum"; }
static DataTypePtr createResultType() { return std::make_shared<DataTypeNumber<T>>(); }
bool allocatesMemoryInArena() const override { return false; }
void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
auto value = assert_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num];
if ((this->data(place).last < value) && this->data(place).seen)
{
this->data(place).sum += (value - this->data(place).last);
}
this->data(place).last = value;
if (!this->data(place).seen)
{
this->data(place).first = value;
this->data(place).seen = true;
}
}
void NO_SANITIZE_UNDEFINED ALWAYS_INLINE merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
auto place_data = &this->data(place);
auto rhs_data = &this->data(rhs);
if ((place_data->last < rhs_data->first) && place_data->seen && rhs_data->seen)
{
// If the last number the lhs saw is less than the first number the rhs saw, the lhs comes
// before the rhs, for example [0, 2] [4, 7]. So we want to add the delta sums, but also add
// the difference between the lhs last number and the rhs first number (the 2 and the 4).
// Then we take the last value from the rhs, so first and last become 0 and 7.
place_data->sum += rhs_data->sum + (rhs_data->first - place_data->last);
place_data->last = rhs_data->last;
}
else if ((rhs_data->first < place_data->last && rhs_data->seen && place_data->seen))
{
// In the opposite scenario, the lhs comes after the rhs, e.g. [4, 6] [1, 2]. Since we
// assume the input interval states are sorted by time, we assume this is a counter
// reset, and therefore do *not* add the difference between our first value and the
// rhs last value.
place_data->sum += rhs_data->sum;
place_data->last = rhs_data->last;
}
else if (rhs_data->seen && !place_data->seen)
{
// If we're here then the lhs is an empty state and the rhs does have some state, so
// we'll just take that state.
place_data->first = rhs_data->first;
place_data->last = rhs_data->last;
place_data->sum = rhs_data->sum;
place_data->seen = rhs_data->seen;
}
// Otherwise lhs either has data or is uninitialized, so we don't need to modify its values.
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
writeBinaryLittleEndian(this->data(place).sum, buf);
writeBinaryLittleEndian(this->data(place).first, buf);
writeBinaryLittleEndian(this->data(place).last, buf);
writeBinaryLittleEndian(this->data(place).seen, buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
readBinaryLittleEndian(this->data(place).sum, buf);
readBinaryLittleEndian(this->data(place).first, buf);
readBinaryLittleEndian(this->data(place).last, buf);
readBinaryLittleEndian(this->data(place).seen, buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnVector<T> &>(to).getData().push_back(this->data(place).sum);
}
};
}

View File

@@ -1,9 +1,15 @@
#include <AggregateFunctions/AggregateFunctionDeltaSumTimestamp.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypesNumber.h>
#include <AggregateFunctions/IAggregateFunction.h>
namespace DB
{
@@ -17,6 +23,159 @@ namespace ErrorCodes
namespace
{
template <typename ValueType, typename TimestampType>
struct AggregationFunctionDeltaSumTimestampData
{
ValueType sum = 0;
ValueType first = 0;
ValueType last = 0;
TimestampType first_ts = 0;
TimestampType last_ts = 0;
bool seen = false;
};
template <typename ValueType, typename TimestampType>
class AggregationFunctionDeltaSumTimestamp final
: public IAggregateFunctionDataHelper<
AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType>,
AggregationFunctionDeltaSumTimestamp<ValueType, TimestampType>
>
{
public:
AggregationFunctionDeltaSumTimestamp(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<
AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType>,
AggregationFunctionDeltaSumTimestamp<ValueType, TimestampType>
>{arguments, params, createResultType()}
{}
AggregationFunctionDeltaSumTimestamp()
: IAggregateFunctionDataHelper<
AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType>,
AggregationFunctionDeltaSumTimestamp<ValueType, TimestampType>
>{}
{}
bool allocatesMemoryInArena() const override { return false; }
String getName() const override { return "deltaSumTimestamp"; }
static DataTypePtr createResultType() { return std::make_shared<DataTypeNumber<ValueType>>(); }
void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
auto value = assert_cast<const ColumnVector<ValueType> &>(*columns[0]).getData()[row_num];
auto ts = assert_cast<const ColumnVector<TimestampType> &>(*columns[1]).getData()[row_num];
if ((this->data(place).last < value) && this->data(place).seen)
{
this->data(place).sum += (value - this->data(place).last);
}
this->data(place).last = value;
this->data(place).last_ts = ts;
if (!this->data(place).seen)
{
this->data(place).first = value;
this->data(place).seen = true;
this->data(place).first_ts = ts;
}
}
// Returns true if lhs comes before rhs, and false if it does not or if the order cannot be determined.
bool ALWAYS_INLINE before(
const AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType> * lhs,
const AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType> * rhs
) const
{
if (lhs->last_ts < rhs->first_ts)
{
return true;
}
if (lhs->last_ts == rhs->first_ts && (lhs->last_ts < rhs->last_ts || lhs->first_ts < rhs->first_ts))
{
return true;
}
return false;
}
void NO_SANITIZE_UNDEFINED ALWAYS_INLINE merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
auto place_data = &this->data(place);
auto rhs_data = &this->data(rhs);
if (!place_data->seen && rhs_data->seen)
{
place_data->sum = rhs_data->sum;
place_data->seen = true;
place_data->first = rhs_data->first;
place_data->first_ts = rhs_data->first_ts;
place_data->last = rhs_data->last;
place_data->last_ts = rhs_data->last_ts;
}
else if (place_data->seen && !rhs_data->seen)
return;
else if (before(place_data, rhs_data))
{
// This state came before the rhs state
if (rhs_data->first > place_data->last)
place_data->sum += (rhs_data->first - place_data->last);
place_data->sum += rhs_data->sum;
place_data->last = rhs_data->last;
place_data->last_ts = rhs_data->last_ts;
}
else if (before(rhs_data, place_data))
{
// This state came after the rhs state
if (place_data->first > rhs_data->last)
place_data->sum += (place_data->first - rhs_data->last);
place_data->sum += rhs_data->sum;
place_data->first = rhs_data->first;
place_data->first_ts = rhs_data->first_ts;
}
else
{
// If none of those conditions matched, both states we are merging have all the same
// timestamps. We have to pick either the smaller or the larger value consistently so
// that the result is deterministic.
if (place_data->first < rhs_data->first)
{
place_data->first = rhs_data->first;
place_data->last = rhs_data->last;
}
}
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
writeBinaryLittleEndian(this->data(place).sum, buf);
writeBinaryLittleEndian(this->data(place).first, buf);
writeBinaryLittleEndian(this->data(place).first_ts, buf);
writeBinaryLittleEndian(this->data(place).last, buf);
writeBinaryLittleEndian(this->data(place).last_ts, buf);
writeBinaryLittleEndian(this->data(place).seen, buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
readBinaryLittleEndian(this->data(place).sum, buf);
readBinaryLittleEndian(this->data(place).first, buf);
readBinaryLittleEndian(this->data(place).first_ts, buf);
readBinaryLittleEndian(this->data(place).last, buf);
readBinaryLittleEndian(this->data(place).last_ts, buf);
readBinaryLittleEndian(this->data(place).seen, buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnVector<ValueType> &>(to).getData().push_back(this->data(place).sum);
}
};
AggregateFunctionPtr createAggregateFunctionDeltaSumTimestamp(
const String & name,
const DataTypes & arguments,
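
The subtle piece of this merge is the before() predicate: it orders two interval states by timestamp and deliberately returns false whenever the order cannot be determined, which routes the merge into the deterministic tie-breaking branch. A standalone sketch of just that predicate, with plain doubles standing in for TimestampType:

#include <iostream>

struct TsInterval
{
    double first_ts = 0;
    double last_ts = 0;
};

/// True if lhs comes strictly before rhs, or if their boundaries touch
/// and the two intervals are not both the same single point.
bool before(const TsInterval & lhs, const TsInterval & rhs)
{
    if (lhs.last_ts < rhs.first_ts)
        return true;
    return lhs.last_ts == rhs.first_ts
        && (lhs.last_ts < rhs.last_ts || lhs.first_ts < rhs.first_ts);
}

int main()
{
    TsInterval a{0, 5};
    TsInterval b{5, 9};
    TsInterval c{5, 5};
    std::cout << before(a, b) << ' '   /// 1: boundaries touch at 5, b extends past it
              << before(b, a) << ' '   /// 0: b comes after a
              << before(c, c) << '\n'; /// 0: identical point intervals, order undetermined
}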

View File

@@ -1,171 +0,0 @@
#pragma once
#include <type_traits>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Columns/ColumnVector.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypesNumber.h>
#include <AggregateFunctions/IAggregateFunction.h>
namespace DB
{
template <typename ValueType, typename TimestampType>
struct AggregationFunctionDeltaSumTimestampData
{
ValueType sum = 0;
ValueType first = 0;
ValueType last = 0;
TimestampType first_ts = 0;
TimestampType last_ts = 0;
bool seen = false;
};
template <typename ValueType, typename TimestampType>
class AggregationFunctionDeltaSumTimestamp final
: public IAggregateFunctionDataHelper<
AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType>,
AggregationFunctionDeltaSumTimestamp<ValueType, TimestampType>
>
{
public:
AggregationFunctionDeltaSumTimestamp(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<
AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType>,
AggregationFunctionDeltaSumTimestamp<ValueType, TimestampType>
>{arguments, params, createResultType()}
{}
AggregationFunctionDeltaSumTimestamp()
: IAggregateFunctionDataHelper<
AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType>,
AggregationFunctionDeltaSumTimestamp<ValueType, TimestampType>
>{}
{}
bool allocatesMemoryInArena() const override { return false; }
String getName() const override { return "deltaSumTimestamp"; }
static DataTypePtr createResultType() { return std::make_shared<DataTypeNumber<ValueType>>(); }
void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
auto value = assert_cast<const ColumnVector<ValueType> &>(*columns[0]).getData()[row_num];
auto ts = assert_cast<const ColumnVector<TimestampType> &>(*columns[1]).getData()[row_num];
if ((this->data(place).last < value) && this->data(place).seen)
{
this->data(place).sum += (value - this->data(place).last);
}
this->data(place).last = value;
this->data(place).last_ts = ts;
if (!this->data(place).seen)
{
this->data(place).first = value;
this->data(place).seen = true;
this->data(place).first_ts = ts;
}
}
// Returns true if lhs comes before rhs, and false if it does not or if the order cannot be determined.
bool ALWAYS_INLINE before(
const AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType> * lhs,
const AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType> * rhs
) const
{
if (lhs->last_ts < rhs->first_ts)
{
return true;
}
if (lhs->last_ts == rhs->first_ts && (lhs->last_ts < rhs->last_ts || lhs->first_ts < rhs->first_ts))
{
return true;
}
return false;
}
void NO_SANITIZE_UNDEFINED ALWAYS_INLINE merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
auto place_data = &this->data(place);
auto rhs_data = &this->data(rhs);
if (!place_data->seen && rhs_data->seen)
{
place_data->sum = rhs_data->sum;
place_data->seen = true;
place_data->first = rhs_data->first;
place_data->first_ts = rhs_data->first_ts;
place_data->last = rhs_data->last;
place_data->last_ts = rhs_data->last_ts;
}
else if (place_data->seen && !rhs_data->seen)
return;
else if (before(place_data, rhs_data))
{
// This state came before the rhs state
if (rhs_data->first > place_data->last)
place_data->sum += (rhs_data->first - place_data->last);
place_data->sum += rhs_data->sum;
place_data->last = rhs_data->last;
place_data->last_ts = rhs_data->last_ts;
}
else if (before(rhs_data, place_data))
{
// This state came after the rhs state
if (place_data->first > rhs_data->last)
place_data->sum += (place_data->first - rhs_data->last);
place_data->sum += rhs_data->sum;
place_data->first = rhs_data->first;
place_data->first_ts = rhs_data->first_ts;
}
else
{
// If none of those conditions matched, both states we are merging have all the same
// timestamps. We have to pick either the smaller or the larger value consistently so
// that the result is deterministic.
if (place_data->first < rhs_data->first)
{
place_data->first = rhs_data->first;
place_data->last = rhs_data->last;
}
}
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
writeBinaryLittleEndian(this->data(place).sum, buf);
writeBinaryLittleEndian(this->data(place).first, buf);
writeBinaryLittleEndian(this->data(place).first_ts, buf);
writeBinaryLittleEndian(this->data(place).last, buf);
writeBinaryLittleEndian(this->data(place).last_ts, buf);
writeBinaryLittleEndian(this->data(place).seen, buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
readBinaryLittleEndian(this->data(place).sum, buf);
readBinaryLittleEndian(this->data(place).first, buf);
readBinaryLittleEndian(this->data(place).first_ts, buf);
readBinaryLittleEndian(this->data(place).last, buf);
readBinaryLittleEndian(this->data(place).last_ts, buf);
readBinaryLittleEndian(this->data(place).seen, buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnVector<ValueType> &>(to).getData().push_back(this->data(place).sum);
}
};
}

View File

@@ -1,8 +1,18 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionEntropy.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <Common/HashTable/HashMap.h>
#include <Common/NaNUtils.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/UniqVariadicHash.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnVector.h>
#include <Common/assert_cast.h>
#include <cmath>
namespace DB
{
@@ -16,6 +26,133 @@ namespace ErrorCodes
namespace
{
/** Calculates Shannon entropy, using a HashMap to compute the empirical distribution.
* Entropy is measured in bits (base-2 logarithm is used).
*/
template <typename Value>
struct EntropyData
{
using Weight = UInt64;
using HashingMap = HashMapWithStackMemory<Value, Weight, HashCRC32<Value>, 4>;
/// For the case of pre-hashed values.
using TrivialMap = HashMapWithStackMemory<Value, Weight, UInt128TrivialHash, 4>;
using Map = std::conditional_t<std::is_same_v<UInt128, Value>, TrivialMap, HashingMap>;
Map map;
void add(const Value & x)
{
if (!isNaN(x))
++map[x];
}
void add(const Value & x, const Weight & weight)
{
if (!isNaN(x))
map[x] += weight;
}
void merge(const EntropyData & rhs)
{
for (const auto & pair : rhs.map)
map[pair.getKey()] += pair.getMapped();
}
void serialize(WriteBuffer & buf) const
{
map.write(buf);
}
void deserialize(ReadBuffer & buf)
{
typename Map::Reader reader(buf);
while (reader.next())
{
const auto & pair = reader.get();
map[pair.first] = pair.second;
}
}
Float64 get() const
{
UInt64 total_value = 0;
for (const auto & pair : map)
total_value += pair.getMapped();
Float64 shannon_entropy = 0;
for (const auto & pair : map)
{
Float64 frequency = Float64(pair.getMapped()) / total_value;
shannon_entropy -= frequency * log2(frequency);
}
return shannon_entropy;
}
};
template <typename Value>
class AggregateFunctionEntropy final : public IAggregateFunctionDataHelper<EntropyData<Value>, AggregateFunctionEntropy<Value>>
{
private:
size_t num_args;
public:
explicit AggregateFunctionEntropy(const DataTypes & argument_types_)
: IAggregateFunctionDataHelper<EntropyData<Value>, AggregateFunctionEntropy<Value>>(argument_types_, {}, createResultType())
, num_args(argument_types_.size())
{
}
String getName() const override { return "entropy"; }
static DataTypePtr createResultType()
{
return std::make_shared<DataTypeNumber<Float64>>();
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
if constexpr (!std::is_same_v<UInt128, Value>)
{
/// Here we deal only with numeric types.
const auto & column = assert_cast<const ColumnVector <Value> &>(*columns[0]);
this->data(place).add(column.getData()[row_num]);
}
else
{
this->data(place).add(UniqVariadicHash<true, false>::apply(num_args, columns, row_num));
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(const_cast<AggregateDataPtr>(place)).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & column = assert_cast<ColumnVector<Float64> &>(to);
column.getData().push_back(this->data(place).get());
}
};
AggregateFunctionPtr createAggregateFunctionEntropy(
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{

View File

@ -1,145 +0,0 @@
#pragma once
#include <Common/HashTable/HashMap.h>
#include <Common/NaNUtils.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/UniqVariadicHash.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnVector.h>
#include <Common/assert_cast.h>
#include <cmath>
namespace DB
{
struct Settings;
/** Calculates Shannon entropy, using a HashMap to compute the empirical distribution function.
* Entropy is measured in bits (base-2 logarithm is used).
*/
template <typename Value>
struct EntropyData
{
using Weight = UInt64;
using HashingMap = HashMapWithStackMemory<Value, Weight, HashCRC32<Value>, 4>;
/// For the case of pre-hashed values.
using TrivialMap = HashMapWithStackMemory<Value, Weight, UInt128TrivialHash, 4>;
using Map = std::conditional_t<std::is_same_v<UInt128, Value>, TrivialMap, HashingMap>;
Map map;
void add(const Value & x)
{
if (!isNaN(x))
++map[x];
}
void add(const Value & x, const Weight & weight)
{
if (!isNaN(x))
map[x] += weight;
}
void merge(const EntropyData & rhs)
{
for (const auto & pair : rhs.map)
map[pair.getKey()] += pair.getMapped();
}
void serialize(WriteBuffer & buf) const
{
map.write(buf);
}
void deserialize(ReadBuffer & buf)
{
typename Map::Reader reader(buf);
while (reader.next())
{
const auto & pair = reader.get();
map[pair.first] = pair.second;
}
}
Float64 get() const
{
UInt64 total_value = 0;
for (const auto & pair : map)
total_value += pair.getMapped();
Float64 shannon_entropy = 0;
for (const auto & pair : map)
{
Float64 frequency = Float64(pair.getMapped()) / total_value;
shannon_entropy -= frequency * log2(frequency);
}
return shannon_entropy;
}
};
template <typename Value>
class AggregateFunctionEntropy final : public IAggregateFunctionDataHelper<EntropyData<Value>, AggregateFunctionEntropy<Value>>
{
private:
size_t num_args;
public:
explicit AggregateFunctionEntropy(const DataTypes & argument_types_)
: IAggregateFunctionDataHelper<EntropyData<Value>, AggregateFunctionEntropy<Value>>(argument_types_, {}, createResultType())
, num_args(argument_types_.size())
{
}
String getName() const override { return "entropy"; }
static DataTypePtr createResultType()
{
return std::make_shared<DataTypeNumber<Float64>>();
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
if constexpr (!std::is_same_v<UInt128, Value>)
{
/// Here we deal only with numerical types.
const auto & column = assert_cast<const ColumnVector <Value> &>(*columns[0]);
this->data(place).add(column.getData()[row_num]);
}
else
{
this->data(place).add(UniqVariadicHash<true, false>::apply(num_args, columns, row_num));
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(const_cast<AggregateDataPtr>(place)).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & column = assert_cast<ColumnVector<Float64> &>(to);
column.getData().push_back(this->data(place).get());
}
};
}

View File

@ -1,9 +1,31 @@
#include <AggregateFunctions/AggregateFunctionHistogram.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <Common/FieldVisitorConvertToNumber.h>
#include <Common/NaNUtils.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnArray.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/VarInt.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <math.h>
#include <queue>
#include <stddef.h>
namespace DB
{
@ -16,12 +38,357 @@ namespace ErrorCodes
extern const int BAD_ARGUMENTS;
extern const int UNSUPPORTED_PARAMETER;
extern const int PARAMETER_OUT_OF_BOUND;
extern const int TOO_LARGE_ARRAY_SIZE;
extern const int INCORRECT_DATA;
}
namespace
{
/** Implementation of the "distance compression" streaming histogram algorithm by Ben-Haim and Tom-Tov:
* http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf
*/
class AggregateFunctionHistogramData
{
public:
using Mean = Float64;
using Weight = Float64;
constexpr static size_t bins_count_limit = 250;
private:
struct WeightedValue
{
Mean mean;
Weight weight;
WeightedValue operator+(const WeightedValue & other) const
{
return {mean + other.weight * (other.mean - mean) / (other.weight + weight), other.weight + weight};
}
};
// number of stored weighted values
UInt32 size;
// calculated lower and upper bounds of seen points
Mean lower_bound;
Mean upper_bound;
// Weighted-value representation of the histogram.
WeightedValue points[0];
void sort()
{
::sort(points, points + size,
[](const WeightedValue & first, const WeightedValue & second)
{
return first.mean < second.mean;
});
}
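/// Minimal container interface required by std::priority_queue, backed by caller-provided stack storage, so that compression avoids heap allocation.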
template <typename T>
struct PriorityQueueStorage
{
size_t size = 0;
T * data_ptr;
explicit PriorityQueueStorage(T * value)
: data_ptr(value)
{
}
void push_back(T val) /// NOLINT
{
data_ptr[size] = std::move(val);
++size;
}
void pop_back() { --size; } /// NOLINT
T * begin() { return data_ptr; }
T * end() const { return data_ptr + size; }
bool empty() const { return size == 0; }
T & front() { return *data_ptr; }
const T & front() const { return *data_ptr; }
using value_type = T;
using reference = T&;
using const_reference = const T&;
using size_type = size_t;
};
/**
* Repeatedly fuse the closest values until at most max_bins bins are left.
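* For example, with max_bins = 2, the points {1, 2, 10} with unit weights fuse the closest pair (1 and 2)
* into their weighted mean, leaving {(1.5, weight 2), (10, weight 1)}.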
*/
void compress(UInt32 max_bins)
{
sort();
auto new_size = size;
if (size <= max_bins)
return;
// Maintain doubly-linked list of "active" points
// and store neighbour pairs in priority queue by distance
UInt32 previous[size + 1];
UInt32 next[size + 1];
bool active[size + 1];
std::fill(active, active + size, true);
active[size] = false;
auto delete_node = [&](UInt32 i)
{
previous[next[i]] = previous[i];
next[previous[i]] = next[i];
active[i] = false;
};
for (size_t i = 0; i <= size; ++i)
{
previous[i] = static_cast<UInt32>(i - 1);
next[i] = static_cast<UInt32>(i + 1);
}
next[size] = 0;
previous[0] = size;
using QueueItem = std::pair<Mean, UInt32>;
QueueItem storage[2 * size - max_bins];
std::priority_queue<
QueueItem,
PriorityQueueStorage<QueueItem>,
std::greater<QueueItem>>
queue{std::greater<QueueItem>(),
PriorityQueueStorage<QueueItem>(storage)};
auto quality = [&](UInt32 i) { return points[next[i]].mean - points[i].mean; };
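// The fusion priority of a neighbouring pair is the distance between the two means: the closest pair is fused first.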
for (size_t i = 0; i + 1 < size; ++i)
queue.push({quality(static_cast<UInt32>(i)), i});
while (new_size > max_bins && !queue.empty())
{
auto min_item = queue.top();
queue.pop();
auto left = min_item.second;
auto right = next[left];
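// Lazy deletion: skip queue entries whose points were already fused or whose stored distance is stale.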
if (!active[left] || !active[right] || quality(left) > min_item.first)
continue;
points[left] = points[left] + points[right];
delete_node(right);
if (active[next[left]])
queue.push({quality(left), left});
if (active[previous[left]])
queue.push({quality(previous[left]), previous[left]});
--new_size;
}
size_t left = 0;
for (size_t right = 0; right < size; ++right)
{
if (active[right])
{
points[left] = points[right];
++left;
}
}
size = new_size;
}
/**
* Delete points that are too close to each other from the histogram.
* Assumes that points are sorted.
*/
void unique()
{
if (size == 0)
return;
size_t left = 0;
for (auto right = left + 1; right < size; ++right)
{
// Fuse points if their text representations differ only in the last digit
auto min_diff = 10 * (points[left].mean + points[right].mean) * std::numeric_limits<Mean>::epsilon();
if (points[left].mean + std::fabs(min_diff) >= points[right].mean)
{
points[left] = points[left] + points[right];
}
else
{
++left;
points[left] = points[right];
}
}
size = static_cast<UInt32>(left + 1);
}
public:
AggregateFunctionHistogramData()
: size(0)
, lower_bound(std::numeric_limits<Mean>::max())
, upper_bound(std::numeric_limits<Mean>::lowest())
{
static_assert(offsetof(AggregateFunctionHistogramData, points) == sizeof(AggregateFunctionHistogramData), "points should be last member");
}
static size_t structSize(size_t max_bins)
{
return sizeof(AggregateFunctionHistogramData) + max_bins * 2 * sizeof(WeightedValue);
}
void insertResultInto(ColumnVector<Mean> & to_lower, ColumnVector<Mean> & to_upper, ColumnVector<Weight> & to_weights, UInt32 max_bins)
{
compress(max_bins);
unique();
for (size_t i = 0; i < size; ++i)
{
to_lower.insertValue((i == 0) ? lower_bound : (points[i].mean + points[i - 1].mean) / 2);
to_upper.insertValue((i + 1 == size) ? upper_bound : (points[i].mean + points[i + 1].mean) / 2);
// linear density approximation
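// (each interior bin edge gets a 3:1 blend of this bin's weight with its neighbour's; the bin reports the mean of its two edge weights)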
Weight lower_weight = (i == 0) ? points[i].weight : ((points[i - 1].weight) + points[i].weight * 3) / 4;
Weight upper_weight = (i + 1 == size) ? points[i].weight : (points[i + 1].weight + points[i].weight * 3) / 4;
to_weights.insertValue((lower_weight + upper_weight) / 2);
}
}
void add(Mean value, Weight weight, UInt32 max_bins)
{
// NaNs break sorting and compression;
// Infs don't fit into the bin partitioning method.
if (!isFinite(value))
throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid value (inf or nan) for aggregation by 'histogram' function");
points[size] = {value, weight};
++size;
lower_bound = std::min(lower_bound, value);
upper_bound = std::max(upper_bound, value);
if (size >= max_bins * 2)
compress(max_bins);
}
void merge(const AggregateFunctionHistogramData & other, UInt32 max_bins)
{
lower_bound = std::min(lower_bound, other.lower_bound);
upper_bound = std::max(upper_bound, other.upper_bound);
for (size_t i = 0; i < other.size; ++i)
add(other.points[i].mean, other.points[i].weight, max_bins);
}
void write(WriteBuffer & buf) const
{
writeBinary(lower_bound, buf);
writeBinary(upper_bound, buf);
writeVarUInt(size, buf);
buf.write(reinterpret_cast<const char *>(points), size * sizeof(WeightedValue));
}
void read(ReadBuffer & buf, UInt32 max_bins)
{
readBinary(lower_bound, buf);
readBinary(upper_bound, buf);
readVarUInt(size, buf);
if (size > max_bins * 2)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too many bins");
static constexpr size_t max_size = 1_GiB;
if (size > max_size)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size in histogram (maximum: {})", max_size);
buf.readStrict(reinterpret_cast<char *>(points), size * sizeof(WeightedValue));
}
};
template <typename T>
class AggregateFunctionHistogram final: public IAggregateFunctionDataHelper<AggregateFunctionHistogramData, AggregateFunctionHistogram<T>>
{
private:
using Data = AggregateFunctionHistogramData;
const UInt32 max_bins;
public:
AggregateFunctionHistogram(UInt32 max_bins_, const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionHistogramData, AggregateFunctionHistogram<T>>(arguments, params, createResultType())
, max_bins(max_bins_)
{
}
size_t sizeOfData() const override
{
return Data::structSize(max_bins);
}
static DataTypePtr createResultType()
{
DataTypes types;
auto mean = std::make_shared<DataTypeNumber<Data::Mean>>();
auto weight = std::make_shared<DataTypeNumber<Data::Weight>>();
// lower bound
types.emplace_back(mean);
// upper bound
types.emplace_back(mean);
// weight
types.emplace_back(weight);
auto tuple = std::make_shared<DataTypeTuple>(types);
return std::make_shared<DataTypeArray>(tuple);
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
auto val = assert_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num];
this->data(place).add(static_cast<Data::Mean>(val), 1, max_bins);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs), max_bins);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).read(buf, max_bins);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & data = this->data(place);
auto & to_array = assert_cast<ColumnArray &>(to);
ColumnArray::Offsets & offsets_to = to_array.getOffsets();
auto & to_tuple = assert_cast<ColumnTuple &>(to_array.getData());
auto & to_lower = assert_cast<ColumnVector<Data::Mean> &>(to_tuple.getColumn(0));
auto & to_upper = assert_cast<ColumnVector<Data::Mean> &>(to_tuple.getColumn(1));
auto & to_weights = assert_cast<ColumnVector<Data::Weight> &>(to_tuple.getColumn(2));
data.insertResultInto(to_lower, to_upper, to_weights, max_bins);
offsets_to.push_back(to_tuple.size());
}
String getName() const override { return "histogram"; }
};
AggregateFunctionPtr createAggregateFunctionHistogram(const std::string & name, const DataTypes & arguments, const Array & params, const Settings *)
{
if (params.size() != 1)

View File

@ -1,382 +0,0 @@
#pragma once
#include <base/sort.h>
#include <Common/NaNUtils.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnArray.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/VarInt.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <math.h>
#include <queue>
#include <stddef.h>
namespace DB
{
struct Settings;
class Arena;
namespace ErrorCodes
{
extern const int TOO_LARGE_ARRAY_SIZE;
extern const int INCORRECT_DATA;
}
/**
* Implementation of the "distance compression" streaming histogram algorithm by Ben-Haim and Tom-Tov:
* http://jmlr.org/papers/volume11/ben-haim10a/ben-haim10a.pdf
*/
class AggregateFunctionHistogramData
{
public:
using Mean = Float64;
using Weight = Float64;
constexpr static size_t bins_count_limit = 250;
private:
struct WeightedValue
{
Mean mean;
Weight weight;
WeightedValue operator+(const WeightedValue & other) const
{
return {mean + other.weight * (other.mean - mean) / (other.weight + weight), other.weight + weight};
}
};
// number of stored weighted values
UInt32 size;
// calculated lower and upper bounds of seen points
Mean lower_bound;
Mean upper_bound;
// Weighted-value representation of the histogram.
WeightedValue points[0];
void sort()
{
::sort(points, points + size,
[](const WeightedValue & first, const WeightedValue & second)
{
return first.mean < second.mean;
});
}
template <typename T>
struct PriorityQueueStorage
{
size_t size = 0;
T * data_ptr;
explicit PriorityQueueStorage(T * value)
: data_ptr(value)
{
}
void push_back(T val) /// NOLINT
{
data_ptr[size] = std::move(val);
++size;
}
void pop_back() { --size; } /// NOLINT
T * begin() { return data_ptr; }
T * end() const { return data_ptr + size; }
bool empty() const { return size == 0; }
T & front() { return *data_ptr; }
const T & front() const { return *data_ptr; }
using value_type = T;
using reference = T&;
using const_reference = const T&;
using size_type = size_t;
};
/**
* Repeatedly fuse the closest values until at most max_bins bins are left.
*/
void compress(UInt32 max_bins)
{
sort();
auto new_size = size;
if (size <= max_bins)
return;
// Maintain doubly-linked list of "active" points
// and store neighbour pairs in priority queue by distance
UInt32 previous[size + 1];
UInt32 next[size + 1];
bool active[size + 1];
std::fill(active, active + size, true);
active[size] = false;
auto delete_node = [&](UInt32 i)
{
previous[next[i]] = previous[i];
next[previous[i]] = next[i];
active[i] = false;
};
for (size_t i = 0; i <= size; ++i)
{
previous[i] = static_cast<UInt32>(i - 1);
next[i] = static_cast<UInt32>(i + 1);
}
next[size] = 0;
previous[0] = size;
using QueueItem = std::pair<Mean, UInt32>;
QueueItem storage[2 * size - max_bins];
std::priority_queue<
QueueItem,
PriorityQueueStorage<QueueItem>,
std::greater<QueueItem>>
queue{std::greater<QueueItem>(),
PriorityQueueStorage<QueueItem>(storage)};
auto quality = [&](UInt32 i) { return points[next[i]].mean - points[i].mean; };
for (size_t i = 0; i + 1 < size; ++i)
queue.push({quality(static_cast<UInt32>(i)), i});
while (new_size > max_bins && !queue.empty())
{
auto min_item = queue.top();
queue.pop();
auto left = min_item.second;
auto right = next[left];
if (!active[left] || !active[right] || quality(left) > min_item.first)
continue;
points[left] = points[left] + points[right];
delete_node(right);
if (active[next[left]])
queue.push({quality(left), left});
if (active[previous[left]])
queue.push({quality(previous[left]), previous[left]});
--new_size;
}
size_t left = 0;
for (size_t right = 0; right < size; ++right)
{
if (active[right])
{
points[left] = points[right];
++left;
}
}
size = new_size;
}
/**
* Delete points that are too close to each other from the histogram.
* Assumes that points are sorted.
*/
void unique()
{
if (size == 0)
return;
size_t left = 0;
for (auto right = left + 1; right < size; ++right)
{
// Fuse points if their text representations differ only in the last digit
auto min_diff = 10 * (points[left].mean + points[right].mean) * std::numeric_limits<Mean>::epsilon();
if (points[left].mean + std::fabs(min_diff) >= points[right].mean)
{
points[left] = points[left] + points[right];
}
else
{
++left;
points[left] = points[right];
}
}
size = static_cast<UInt32>(left + 1);
}
public:
AggregateFunctionHistogramData()
: size(0)
, lower_bound(std::numeric_limits<Mean>::max())
, upper_bound(std::numeric_limits<Mean>::lowest())
{
static_assert(offsetof(AggregateFunctionHistogramData, points) == sizeof(AggregateFunctionHistogramData), "points should be last member");
}
static size_t structSize(size_t max_bins)
{
return sizeof(AggregateFunctionHistogramData) + max_bins * 2 * sizeof(WeightedValue);
}
void insertResultInto(ColumnVector<Mean> & to_lower, ColumnVector<Mean> & to_upper, ColumnVector<Weight> & to_weights, UInt32 max_bins)
{
compress(max_bins);
unique();
for (size_t i = 0; i < size; ++i)
{
to_lower.insertValue((i == 0) ? lower_bound : (points[i].mean + points[i - 1].mean) / 2);
to_upper.insertValue((i + 1 == size) ? upper_bound : (points[i].mean + points[i + 1].mean) / 2);
// linear density approximation
Weight lower_weight = (i == 0) ? points[i].weight : ((points[i - 1].weight) + points[i].weight * 3) / 4;
Weight upper_weight = (i + 1 == size) ? points[i].weight : (points[i + 1].weight + points[i].weight * 3) / 4;
to_weights.insertValue((lower_weight + upper_weight) / 2);
}
}
void add(Mean value, Weight weight, UInt32 max_bins)
{
// NaNs break sorting and compression;
// Infs don't fit into the bin partitioning method.
if (!isFinite(value))
throw Exception(ErrorCodes::INCORRECT_DATA, "Invalid value (inf or nan) for aggregation by 'histogram' function");
points[size] = {value, weight};
++size;
lower_bound = std::min(lower_bound, value);
upper_bound = std::max(upper_bound, value);
if (size >= max_bins * 2)
compress(max_bins);
}
void merge(const AggregateFunctionHistogramData & other, UInt32 max_bins)
{
lower_bound = std::min(lower_bound, other.lower_bound);
upper_bound = std::max(upper_bound, other.upper_bound);
for (size_t i = 0; i < other.size; ++i)
add(other.points[i].mean, other.points[i].weight, max_bins);
}
void write(WriteBuffer & buf) const
{
writeBinary(lower_bound, buf);
writeBinary(upper_bound, buf);
writeVarUInt(size, buf);
buf.write(reinterpret_cast<const char *>(points), size * sizeof(WeightedValue));
}
void read(ReadBuffer & buf, UInt32 max_bins)
{
readBinary(lower_bound, buf);
readBinary(upper_bound, buf);
readVarUInt(size, buf);
if (size > max_bins * 2)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too many bins");
static constexpr size_t max_size = 1_GiB;
if (size > max_size)
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size in histogram (maximum: {})", max_size);
buf.readStrict(reinterpret_cast<char *>(points), size * sizeof(WeightedValue));
}
};
template <typename T>
class AggregateFunctionHistogram final: public IAggregateFunctionDataHelper<AggregateFunctionHistogramData, AggregateFunctionHistogram<T>>
{
private:
using Data = AggregateFunctionHistogramData;
const UInt32 max_bins;
public:
AggregateFunctionHistogram(UInt32 max_bins_, const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionHistogramData, AggregateFunctionHistogram<T>>(arguments, params, createResultType())
, max_bins(max_bins_)
{
}
size_t sizeOfData() const override
{
return Data::structSize(max_bins);
}
static DataTypePtr createResultType()
{
DataTypes types;
auto mean = std::make_shared<DataTypeNumber<Data::Mean>>();
auto weight = std::make_shared<DataTypeNumber<Data::Weight>>();
// lower bound
types.emplace_back(mean);
// upper bound
types.emplace_back(mean);
// weight
types.emplace_back(weight);
auto tuple = std::make_shared<DataTypeTuple>(types);
return std::make_shared<DataTypeArray>(tuple);
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
auto val = assert_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num];
this->data(place).add(static_cast<Data::Mean>(val), 1, max_bins);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs), max_bins);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).read(buf, max_bins);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & data = this->data(place);
auto & to_array = assert_cast<ColumnArray &>(to);
ColumnArray::Offsets & offsets_to = to_array.getOffsets();
auto & to_tuple = assert_cast<ColumnTuple &>(to_array.getData());
auto & to_lower = assert_cast<ColumnVector<Data::Mean> &>(to_tuple.getColumn(0));
auto & to_upper = assert_cast<ColumnVector<Data::Mean> &>(to_tuple.getColumn(1));
auto & to_weights = assert_cast<ColumnVector<Data::Weight> &>(to_tuple.getColumn(2));
data.insertResultInto(to_lower, to_upper, to_weights, max_bins);
offsets_to.push_back(to_tuple.size());
}
String getName() const override { return "histogram"; }
};
}

View File

@ -1,57 +1,272 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionIntervalLengthSum.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <base/range.h>
#include <unordered_set>
#include <AggregateFunctions/Combinators/AggregateFunctionNull.h>
#include <Columns/ColumnsNumber.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int TOO_LARGE_ARRAY_SIZE;
}
struct Settings;
namespace
{
/** Calculate the total length of intervals without intersections. Each interval is a pair of numbers [begin, end];
* Returns UInt64 for integral types (UInt/Int*, Date/DateTime) and returns Float64 for Float*.
*
* The implementation simply stores intervals sorted by beginning and sums the lengths at finalization.
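* For example, the intervals [1, 3], [2, 5] and [7, 8] give 5: [1, 3] and [2, 5] merge into [1, 5] of length 4, and [7, 8] adds 1.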
*/
template <typename T>
struct AggregateFunctionIntervalLengthSumData
{
constexpr static size_t MAX_ARRAY_SIZE = 0xFFFFFF;
using Segment = std::pair<T, T>;
using Segments = PODArrayWithStackMemory<Segment, 64>;
bool sorted = false;
Segments segments;
void add(T begin, T end)
{
/// Reversed intervals are counted by absolute value of their length.
if (unlikely(end < begin))
std::swap(begin, end);
else if (unlikely(begin == end))
return;
if (sorted && !segments.empty())
sorted = segments.back().first <= begin;
segments.emplace_back(begin, end);
}
void merge(const AggregateFunctionIntervalLengthSumData & other)
{
if (other.segments.empty())
return;
const auto size = segments.size();
segments.insert(std::begin(other.segments), std::end(other.segments));
/// Either sort the whole container, or sort the unsorted halves separately and merge the sorted ranges afterwards.
if (!sorted && !other.sorted)
{
::sort(std::begin(segments), std::end(segments));
}
else
{
const auto begin = std::begin(segments);
const auto middle = std::next(begin, size);
const auto end = std::end(segments);
if (!sorted)
::sort(begin, middle);
if (!other.sorted)
::sort(middle, end);
std::inplace_merge(begin, middle, end);
}
sorted = true;
}
void sort()
{
if (sorted)
return;
::sort(std::begin(segments), std::end(segments));
sorted = true;
}
void serialize(WriteBuffer & buf) const
{
writeBinary(sorted, buf);
writeBinary(segments.size(), buf);
for (const auto & time_gap : segments)
{
writeBinary(time_gap.first, buf);
writeBinary(time_gap.second, buf);
}
}
void deserialize(ReadBuffer & buf)
{
readBinary(sorted, buf);
size_t size;
readBinary(size, buf);
if (unlikely(size > MAX_ARRAY_SIZE))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size (maximum: {})", MAX_ARRAY_SIZE);
segments.clear();
segments.reserve(size);
Segment segment;
for (size_t i = 0; i < size; ++i)
{
readBinary(segment.first, buf);
readBinary(segment.second, buf);
segments.emplace_back(segment);
}
}
};
template <typename T, typename Data>
class AggregateFunctionIntervalLengthSum final : public IAggregateFunctionDataHelper<Data, AggregateFunctionIntervalLengthSum<T, Data>>
{
private:
static auto NO_SANITIZE_UNDEFINED length(typename Data::Segment segment)
{
return segment.second - segment.first;
}
template <typename TResult>
TResult getIntervalLengthSum(Data & data) const
{
if (data.segments.empty())
return 0;
data.sort();
TResult res = 0;
typename Data::Segment curr_segment = data.segments[0];
for (size_t i = 1, size = data.segments.size(); i < size; ++i)
{
const typename Data::Segment & next_segment = data.segments[i];
/// If the current interval does not intersect the next one, add its length and move to the next; otherwise extend the current interval's end.
if (curr_segment.second < next_segment.first)
{
res += length(curr_segment);
curr_segment = next_segment;
}
else if (next_segment.second > curr_segment.second)
{
curr_segment.second = next_segment.second;
}
}
res += length(curr_segment);
return res;
}
public:
String getName() const override { return "intervalLengthSum"; }
explicit AggregateFunctionIntervalLengthSum(const DataTypes & arguments)
: IAggregateFunctionDataHelper<Data, AggregateFunctionIntervalLengthSum<T, Data>>(arguments, {}, createResultType())
{
}
static DataTypePtr createResultType()
{
if constexpr (std::is_floating_point_v<T>)
return std::make_shared<DataTypeFloat64>();
return std::make_shared<DataTypeUInt64>();
}
bool allocatesMemoryInArena() const override { return false; }
AggregateFunctionPtr getOwnNullAdapter(
const AggregateFunctionPtr & nested_function,
const DataTypes & arguments,
const Array & params,
const AggregateFunctionProperties & /*properties*/) const override
{
return std::make_shared<AggregateFunctionNullVariadic<false, false>>(nested_function, arguments, params);
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, const size_t row_num, Arena *) const override
{
auto begin = assert_cast<const ColumnVector<T> *>(columns[0])->getData()[row_num];
auto end = assert_cast<const ColumnVector<T> *>(columns[1])->getData()[row_num];
this->data(place).add(begin, end);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
if constexpr (std::is_floating_point_v<T>)
assert_cast<ColumnFloat64 &>(to).getData().push_back(getIntervalLengthSum<Float64>(this->data(place)));
else
assert_cast<ColumnUInt64 &>(to).getData().push_back(getIntervalLengthSum<UInt64>(this->data(place)));
}
};
template <template <typename> class Data>
AggregateFunctionPtr
createAggregateFunctionIntervalLengthSum(const std::string & name, const DataTypes & arguments, const Array &, const Settings *)
{
if (arguments.size() != 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
"Aggregate function {} requires two timestamps argument.", name);
auto args = {arguments[0].get(), arguments[1].get()};
if (WhichDataType{args.begin()[0]}.idx != WhichDataType{args.begin()[1]}.idx)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument of aggregate function {}, must "
"be native integral type, Date/DateTime or Float", arguments.front().get()->getName(), name);
"Illegal types {} and {} of arguments "
"of aggregate function {}, both arguments should have same data type",
args.begin()[0]->getName(), args.begin()[1]->getName(), name);
for (const auto & arg : args)
{
if (!isNativeNumber(arg) && !isDate(arg) && !isDateTime(arg))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument of aggregate function {}, must "
"be native integral type, Date/DateTime or Float", arg->getName(), name);
}
AggregateFunctionPtr res(createWithBasicNumberOrDateOrDateTime<AggregateFunctionIntervalLengthSum, Data>(*arguments[0], arguments));
if (res)
return res;
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument of aggregate function {}, must "
"be native integral type, Date/DateTime or Float", arguments.front().get()->getName(), name);
}
}

View File

@ -1,232 +0,0 @@
#pragma once
#include <unordered_set>
#include <AggregateFunctions/Combinators/AggregateFunctionNull.h>
#include <Columns/ColumnsNumber.h>
#include <Common/assert_cast.h>
#include <base/arithmeticOverflow.h>
#include <base/sort.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int TOO_LARGE_ARRAY_SIZE;
}
/** Calculate the total length of intervals without intersections. Each interval is a pair of numbers [begin, end];
* Returns UInt64 for integral types (UInt/Int*, Date/DateTime) and returns Float64 for Float*.
*
* The implementation simply stores intervals sorted by beginning and sums the lengths at finalization.
*/
template <typename T>
struct AggregateFunctionIntervalLengthSumData
{
constexpr static size_t MAX_ARRAY_SIZE = 0xFFFFFF;
using Segment = std::pair<T, T>;
using Segments = PODArrayWithStackMemory<Segment, 64>;
bool sorted = false;
Segments segments;
void add(T begin, T end)
{
/// Reversed intervals are counted by absolute value of their length.
if (unlikely(end < begin))
std::swap(begin, end);
else if (unlikely(begin == end))
return;
if (sorted && !segments.empty())
sorted = segments.back().first <= begin;
segments.emplace_back(begin, end);
}
void merge(const AggregateFunctionIntervalLengthSumData & other)
{
if (other.segments.empty())
return;
const auto size = segments.size();
segments.insert(std::begin(other.segments), std::end(other.segments));
/// Either sort the whole container, or sort the unsorted halves separately and merge the sorted ranges afterwards.
if (!sorted && !other.sorted)
{
::sort(std::begin(segments), std::end(segments));
}
else
{
const auto begin = std::begin(segments);
const auto middle = std::next(begin, size);
const auto end = std::end(segments);
if (!sorted)
::sort(begin, middle);
if (!other.sorted)
::sort(middle, end);
std::inplace_merge(begin, middle, end);
}
sorted = true;
}
void sort()
{
if (sorted)
return;
::sort(std::begin(segments), std::end(segments));
sorted = true;
}
void serialize(WriteBuffer & buf) const
{
writeBinary(sorted, buf);
writeBinary(segments.size(), buf);
for (const auto & time_gap : segments)
{
writeBinary(time_gap.first, buf);
writeBinary(time_gap.second, buf);
}
}
void deserialize(ReadBuffer & buf)
{
readBinary(sorted, buf);
size_t size;
readBinary(size, buf);
if (unlikely(size > MAX_ARRAY_SIZE))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size (maximum: {})", MAX_ARRAY_SIZE);
segments.clear();
segments.reserve(size);
Segment segment;
for (size_t i = 0; i < size; ++i)
{
readBinary(segment.first, buf);
readBinary(segment.second, buf);
segments.emplace_back(segment);
}
}
};
template <typename T, typename Data>
class AggregateFunctionIntervalLengthSum final : public IAggregateFunctionDataHelper<Data, AggregateFunctionIntervalLengthSum<T, Data>>
{
private:
static auto NO_SANITIZE_UNDEFINED length(typename Data::Segment segment)
{
return segment.second - segment.first;
}
template <typename TResult>
TResult getIntervalLengthSum(Data & data) const
{
if (data.segments.empty())
return 0;
data.sort();
TResult res = 0;
typename Data::Segment curr_segment = data.segments[0];
for (size_t i = 1, size = data.segments.size(); i < size; ++i)
{
const typename Data::Segment & next_segment = data.segments[i];
/// If the current interval does not intersect the next one, add its length and move to the next; otherwise extend the current interval's end.
if (curr_segment.second < next_segment.first)
{
res += length(curr_segment);
curr_segment = next_segment;
}
else if (next_segment.second > curr_segment.second)
{
curr_segment.second = next_segment.second;
}
}
res += length(curr_segment);
return res;
}
public:
String getName() const override { return "intervalLengthSum"; }
explicit AggregateFunctionIntervalLengthSum(const DataTypes & arguments)
: IAggregateFunctionDataHelper<Data, AggregateFunctionIntervalLengthSum<T, Data>>(arguments, {}, createResultType())
{
}
static DataTypePtr createResultType()
{
if constexpr (std::is_floating_point_v<T>)
return std::make_shared<DataTypeFloat64>();
return std::make_shared<DataTypeUInt64>();
}
bool allocatesMemoryInArena() const override { return false; }
AggregateFunctionPtr getOwnNullAdapter(
const AggregateFunctionPtr & nested_function,
const DataTypes & arguments,
const Array & params,
const AggregateFunctionProperties & /*properties*/) const override
{
return std::make_shared<AggregateFunctionNullVariadic<false, false>>(nested_function, arguments, params);
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, const size_t row_num, Arena *) const override
{
auto begin = assert_cast<const ColumnVector<T> *>(columns[0])->getData()[row_num];
auto end = assert_cast<const ColumnVector<T> *>(columns[1])->getData()[row_num];
this->data(place).add(begin, end);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
if constexpr (std::is_floating_point_v<T>)
assert_cast<ColumnFloat64 &>(to).getData().push_back(getIntervalLengthSum<Float64>(this->data(place)));
else
assert_cast<ColumnUInt64 &>(to).getData().push_back(getIntervalLengthSum<UInt64>(this->data(place)));
}
};
}

View File

@ -1,19 +1,339 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionKolmogorovSmirnovTest.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/Exception.h>
#include <Common/assert_cast.h>
#include <Common/PODArray_fwd.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
}
namespace DB
{
struct Settings;
namespace
{
struct KolmogorovSmirnov : public StatisticalSample<Float64, Float64>
{
enum class Alternative
{
TwoSided,
Less,
Greater
};
std::pair<Float64, Float64> getResult(Alternative alternative, String method)
{
::sort(x.begin(), x.end());
::sort(y.begin(), y.end());
Float64 max_s = std::numeric_limits<Float64>::min();
Float64 min_s = std::numeric_limits<Float64>::max();
Float64 now_s = 0;
UInt64 pos_x = 0;
UInt64 pos_y = 0;
UInt64 pos_tmp;
UInt64 n1 = x.size();
UInt64 n2 = y.size();
const Float64 n1_d = 1. / n1;
const Float64 n2_d = 1. / n2;
const Float64 tol = 1e-7;
// reference: https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
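// now_s tracks the running difference F1(t) - F2(t) of the two empirical distribution functions; the statistic is its extreme deviation.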
while (pos_x < x.size() && pos_y < y.size())
{
if (likely(fabs(x[pos_x] - y[pos_y]) >= tol))
{
if (x[pos_x] < y[pos_y])
{
now_s += n1_d;
++pos_x;
}
else
{
now_s -= n2_d;
++pos_y;
}
}
else
{
pos_tmp = pos_x + 1;
while (pos_tmp < x.size() && unlikely(fabs(x[pos_tmp] - x[pos_x]) <= tol))
pos_tmp++;
now_s += n1_d * (pos_tmp - pos_x);
pos_x = pos_tmp;
pos_tmp = pos_y + 1;
while (pos_tmp < y.size() && unlikely(fabs(y[pos_tmp] - y[pos_y]) <= tol))
pos_tmp++;
now_s -= n2_d * (pos_tmp - pos_y);
pos_y = pos_tmp;
}
max_s = std::max(max_s, now_s);
min_s = std::min(min_s, now_s);
}
now_s += n1_d * (x.size() - pos_x) - n2_d * (y.size() - pos_y);
min_s = std::min(min_s, now_s);
max_s = std::max(max_s, now_s);
Float64 d = 0;
if (alternative == Alternative::TwoSided)
d = std::max(std::abs(max_s), std::abs(min_s));
else if (alternative == Alternative::Less)
d = -min_s;
else if (alternative == Alternative::Greater)
d = max_s;
UInt64 g = std::__gcd(n1, n2);
UInt64 nx_g = n1 / g;
UInt64 ny_g = n2 / g;
if (method == "auto")
method = std::max(n1, n2) <= 10000 ? "exact" : "asymptotic";
else if (method == "exact" && nx_g >= std::numeric_limits<Int32>::max() / ny_g)
method = "asymptotic";
Float64 p_value = std::numeric_limits<Float64>::infinity();
if (method == "exact")
{
/* reference:
* Gunar Schröer and Dietrich Trenkler
* Exact and Randomization Distributions of Kolmogorov-Smirnov Tests for Two or Three Samples
*
* and
*
* Thomas Viehmann
* Numerically more stable computation of the p-values for the two-sample Kolmogorov-Smirnov test
*/
if (n2 > n1)
std::swap(n1, n2);
const Float64 f_n1 = static_cast<Float64>(n1);
const Float64 f_n2 = static_cast<Float64>(n2);
const Float64 k_d = (0.5 + floor(d * f_n2 * f_n1 - tol)) / (f_n2 * f_n1);
PaddedPODArray<Float64> c(n1 + 1);
auto check = alternative == Alternative::TwoSided ?
[](const Float64 & q, const Float64 & r, const Float64 & s) { return fabs(r - s) >= q; }
: [](const Float64 & q, const Float64 & r, const Float64 & s) { return r - s >= q; };
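// Dynamic programming over the n1 x n2 lattice of sample interleavings (Viehmann's numerically stable scheme):
// c[j] is the probability that a random lattice path reaching cell (i, j) has already entered the region flagged by the 'check' predicate.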
c[0] = 0;
for (UInt64 j = 1; j <= n1; j++)
if (check(k_d, 0., j / f_n1))
c[j] = 1.;
else
c[j] = c[j - 1];
for (UInt64 i = 1; i <= n2; i++)
{
if (check(k_d, i / f_n2, 0.))
c[0] = 1.;
for (UInt64 j = 1; j <= n1; j++)
if (check(k_d, i / f_n2, j / f_n1))
c[j] = 1.;
else
{
Float64 v = i / static_cast<Float64>(i + j);
Float64 w = j / static_cast<Float64>(i + j);
c[j] = v * c[j] + w * c[j - 1];
}
}
p_value = c[n1];
}
else if (method == "asymp" || method == "asymptotic")
{
Float64 n = std::min(n1, n2);
Float64 m = std::max(n1, n2);
Float64 p = sqrt((n * m) / (n + m)) * d;
if (alternative == Alternative::TwoSided)
{
/* reference:
* J. Durbin
* Distribution theory for tests based on the sample distribution function
*/
Float64 new_val, old_val, s, w, z;
UInt64 k_max = static_cast<UInt64>(sqrt(2 - log(tol)));
if (p < 1)
{
z = - (M_PI_2 * M_PI_4) / (p * p);
w = log(p);
s = 0;
for (UInt64 k = 1; k < k_max; k += 2)
s += exp(k * k * z - w);
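// 0.398942280401432677939946059934 = 1 / sqrt(2 * pi).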
p = s / 0.398942280401432677939946059934;
}
else
{
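// Kolmogorov's limiting distribution: P(D <= p) ~= 1 - 2 * sum_{k >= 1} (-1)^(k - 1) * exp(-2 * k^2 * p^2), summed until the partial sums stop changing.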
z = -2 * p * p;
s = -1;
UInt64 k = 1;
old_val = 0;
new_val = 1;
while (fabs(old_val - new_val) > tol)
{
old_val = new_val;
new_val += 2 * s * exp(z * k * k);
s *= -1;
k++;
}
p = new_val;
}
p_value = 1 - p;
}
else
{
/* reference:
* J. L. HODGES, Jr
* The significance probability of the Smirnov two-sample test
*/
// Use Hodges' suggested approximation Eqn 5.3
// Requires m to be the larger of (n1, n2)
Float64 expt = -2 * p * p - 2 * p * (m + 2 * n) / sqrt(m * n * (m + n)) / 3.0;
p_value = exp(expt);
}
}
return {d, p_value};
}
};
class AggregateFunctionKolmogorovSmirnov final:
public IAggregateFunctionDataHelper<KolmogorovSmirnov, AggregateFunctionKolmogorovSmirnov>
{
private:
using Alternative = typename KolmogorovSmirnov::Alternative;
Alternative alternative = Alternative::TwoSided;
String method = "auto";
public:
explicit AggregateFunctionKolmogorovSmirnov(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<KolmogorovSmirnov, AggregateFunctionKolmogorovSmirnov> ({arguments}, {}, createResultType())
{
if (params.size() > 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires two parameters or less", getName());
if (params.empty())
return;
if (params[0].getType() != Field::Types::String)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} requires the first parameter to be a String", getName());
const auto & param = params[0].get<String>();
if (param == "two-sided")
alternative = Alternative::TwoSided;
else if (param == "less")
alternative = Alternative::Less;
else if (param == "greater")
alternative = Alternative::Greater;
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown parameter in aggregate function {}. "
"It must be one of: 'two-sided', 'less', 'greater'", getName());
if (params.size() != 2)
return;
if (params[1].getType() != Field::Types::String)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} requires the second parameter to be a String", getName());
method = params[1].get<String>();
if (method != "auto" && method != "exact" && method != "asymp" && method != "asymptotic")
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown method in aggregate function {}. "
"It must be one of: 'auto', 'exact', 'asymp' (or 'asymptotic')", getName());
}
String getName() const override
{
return "kolmogorovSmirnovTest";
}
bool allocatesMemoryInArena() const override { return true; }
static DataTypePtr createResultType()
{
DataTypes types
{
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
};
Strings names
{
"d_statistic",
"p_value"
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
if (is_second)
this->data(place).addY(value, arena);
else
this->data(place).addX(value, arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
this->data(place).merge(this->data(rhs), arena);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
if (!this->data(place).size_x || !this->data(place).size_y)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} requires both samples to be non-empty", getName());
auto [d_statistic, p_value] = this->data(place).getResult(alternative, method);
/// Clamp because the p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
column_stat.getData().push_back(d_statistic);
column_value.getData().push_back(p_value);
}
};
AggregateFunctionPtr createAggregateFunctionKolmogorovSmirnovTest(
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{

View File

@ -1,331 +0,0 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/Exception.h>
#include <Common/assert_cast.h>
#include <Common/PODArray_fwd.h>
#include <base/types.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
}
struct KolmogorovSmirnov : public StatisticalSample<Float64, Float64>
{
enum class Alternative
{
TwoSided,
Less,
Greater
};
std::pair<Float64, Float64> getResult(Alternative alternative, String method)
{
::sort(x.begin(), x.end());
::sort(y.begin(), y.end());
Float64 max_s = std::numeric_limits<Float64>::min();
Float64 min_s = std::numeric_limits<Float64>::max();
Float64 now_s = 0;
UInt64 pos_x = 0;
UInt64 pos_y = 0;
UInt64 pos_tmp;
UInt64 n1 = x.size();
UInt64 n2 = y.size();
const Float64 n1_d = 1. / n1;
const Float64 n2_d = 1. / n2;
const Float64 tol = 1e-7;
// reference: https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test
while (pos_x < x.size() && pos_y < y.size())
{
if (likely(fabs(x[pos_x] - y[pos_y]) >= tol))
{
if (x[pos_x] < y[pos_y])
{
now_s += n1_d;
++pos_x;
}
else
{
now_s -= n2_d;
++pos_y;
}
}
else
{
pos_tmp = pos_x + 1;
while (pos_tmp < x.size() && unlikely(fabs(x[pos_tmp] - x[pos_x]) <= tol))
pos_tmp++;
now_s += n1_d * (pos_tmp - pos_x);
pos_x = pos_tmp;
pos_tmp = pos_y + 1;
while (pos_tmp < y.size() && unlikely(fabs(y[pos_tmp] - y[pos_y]) <= tol))
pos_tmp++;
now_s -= n2_d * (pos_tmp - pos_y);
pos_y = pos_tmp;
}
max_s = std::max(max_s, now_s);
min_s = std::min(min_s, now_s);
}
now_s += n1_d * (x.size() - pos_x) - n2_d * (y.size() - pos_y);
min_s = std::min(min_s, now_s);
max_s = std::max(max_s, now_s);
Float64 d = 0;
if (alternative == Alternative::TwoSided)
d = std::max(std::abs(max_s), std::abs(min_s));
else if (alternative == Alternative::Less)
d = -min_s;
else if (alternative == Alternative::Greater)
d = max_s;
UInt64 g = std::__gcd(n1, n2);
UInt64 nx_g = n1 / g;
UInt64 ny_g = n2 / g;
if (method == "auto")
method = std::max(n1, n2) <= 10000 ? "exact" : "asymptotic";
else if (method == "exact" && nx_g >= std::numeric_limits<Int32>::max() / ny_g)
method = "asymptotic";
Float64 p_value = std::numeric_limits<Float64>::infinity();
if (method == "exact")
{
/* reference:
* Gunar Schröer and Dietrich Trenkler
* Exact and Randomization Distributions of Kolmogorov-Smirnov Tests for Two or Three Samples
*
* and
*
* Thomas Viehmann
* Numerically more stable computation of the p-values for the two-sample Kolmogorov-Smirnov test
*/
if (n2 > n1)
std::swap(n1, n2);
const Float64 f_n1 = static_cast<Float64>(n1);
const Float64 f_n2 = static_cast<Float64>(n2);
const Float64 k_d = (0.5 + floor(d * f_n2 * f_n1 - tol)) / (f_n2 * f_n1);
PaddedPODArray<Float64> c(n1 + 1);
auto check = alternative == Alternative::TwoSided ?
[](const Float64 & q, const Float64 & r, const Float64 & s) { return fabs(r - s) >= q; }
: [](const Float64 & q, const Float64 & r, const Float64 & s) { return r - s >= q; };
c[0] = 0;
for (UInt64 j = 1; j <= n1; j++)
if (check(k_d, 0., j / f_n1))
c[j] = 1.;
else
c[j] = c[j - 1];
for (UInt64 i = 1; i <= n2; i++)
{
if (check(k_d, i / f_n2, 0.))
c[0] = 1.;
for (UInt64 j = 1; j <= n1; j++)
if (check(k_d, i / f_n2, j / f_n1))
c[j] = 1.;
else
{
Float64 v = i / static_cast<Float64>(i + j);
Float64 w = j / static_cast<Float64>(i + j);
c[j] = v * c[j] + w * c[j - 1];
}
}
p_value = c[n1];
}
else if (method == "asymp" || method == "asymptotic")
{
Float64 n = std::min(n1, n2);
Float64 m = std::max(n1, n2);
Float64 p = sqrt((n * m) / (n + m)) * d;
if (alternative == Alternative::TwoSided)
{
/* reference:
* J. Durbin
* Distribution theory for tests based on the sample distribution function
*/
Float64 new_val, old_val, s, w, z;
UInt64 k_max = static_cast<UInt64>(sqrt(2 - log(tol)));
if (p < 1)
{
z = - (M_PI_2 * M_PI_4) / (p * p);
w = log(p);
s = 0;
for (UInt64 k = 1; k < k_max; k += 2)
s += exp(k * k * z - w);
p = s / 0.398942280401432677939946059934;
}
else
{
z = -2 * p * p;
s = -1;
UInt64 k = 1;
old_val = 0;
new_val = 1;
while (fabs(old_val - new_val) > tol)
{
old_val = new_val;
new_val += 2 * s * exp(z * k * k);
s *= -1;
k++;
}
p = new_val;
}
p_value = 1 - p;
}
else
{
/* reference:
* J. L. HODGES, Jr
* The significance probability of the Smirnov two-sample test
*/
// Use Hodges' suggested approximation Eqn 5.3
// Requires m to be the larger of (n1, n2)
Float64 expt = -2 * p * p - 2 * p * (m + 2 * n) / sqrt(m * n * (m + n)) / 3.0;
p_value = exp(expt);
}
}
return {d, p_value};
}
};
class AggregateFunctionKolmogorovSmirnov final:
public IAggregateFunctionDataHelper<KolmogorovSmirnov, AggregateFunctionKolmogorovSmirnov>
{
private:
using Alternative = typename KolmogorovSmirnov::Alternative;
Alternative alternative = Alternative::TwoSided;
String method = "auto";
public:
explicit AggregateFunctionKolmogorovSmirnov(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<KolmogorovSmirnov, AggregateFunctionKolmogorovSmirnov> ({arguments}, {}, createResultType())
{
if (params.size() > 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires two parameters or less", getName());
if (params.empty())
return;
if (params[0].getType() != Field::Types::String)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} requires the first parameter to be a String", getName());
const auto & param = params[0].get<String>();
if (param == "two-sided")
alternative = Alternative::TwoSided;
else if (param == "less")
alternative = Alternative::Less;
else if (param == "greater")
alternative = Alternative::Greater;
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown parameter in aggregate function {}. "
"It must be one of: 'two-sided', 'less', 'greater'", getName());
if (params.size() != 2)
return;
if (params[1].getType() != Field::Types::String)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} requires the second parameter to be a String", getName());
method = params[1].get<String>();
if (method != "auto" && method != "exact" && method != "asymp" && method != "asymptotic")
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown method in aggregate function {}. "
"It must be one of: 'auto', 'exact', 'asymp' (or 'asymptotic')", getName());
}
String getName() const override
{
return "kolmogorovSmirnovTest";
}
bool allocatesMemoryInArena() const override { return true; }
static DataTypePtr createResultType()
{
DataTypes types
{
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
};
Strings names
{
"d_statistic",
"p_value"
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
if (is_second)
this->data(place).addY(value, arena);
else
this->data(place).addX(value, arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
this->data(place).merge(this->data(rhs), arena);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
if (!this->data(place).size_x || !this->data(place).size_y)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} requires both samples to be non-empty", getName());
auto [d_statistic, p_value] = this->data(place).getResult(alternative, method);
/// Clamp because the p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
column_stat.getData().push_back(d_statistic);
column_value.getData().push_back(p_value);
}
};
}

View File

@ -1,12 +1,30 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionLargestTriangleThreeBuckets.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <numeric>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnsDateTime.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/ReadHelpers.h>
#include <Common/PODArray_fwd.h>
#include <Common/assert_cast.h>
#include <boost/math/distributions/normal.hpp>
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
}
namespace DB
@ -16,29 +34,321 @@ struct Settings;
namespace
{
struct LargestTriangleThreeBucketsData : public StatisticalSample<Float64, Float64>
{
void add(const Float64 xval, const Float64 yval, Arena * arena)
{
this->addX(xval, arena);
this->addY(yval, arena);
}
void sort(Arena * arena)
{
// sort the this->x and this->y in ascending order of this->x using index
std::vector<size_t> index(this->x.size());
std::iota(index.begin(), index.end(), 0);
::sort(index.begin(), index.end(), [&](size_t i1, size_t i2) { return this->x[i1] < this->x[i2]; });
SampleX temp_x{};
SampleY temp_y{};
for (size_t i = 0; i < this->x.size(); ++i)
{
temp_x.push_back(this->x[index[i]], arena);
temp_y.push_back(this->y[index[i]], arena);
}
for (size_t i = 0; i < this->x.size(); ++i)
{
this->x[i] = temp_x[i];
this->y[i] = temp_y[i];
}
}
PODArray<std::pair<Float64, Float64>> getResult(size_t total_buckets, Arena * arena)
{
// Sort the data
this->sort(arena);
PODArray<std::pair<Float64, Float64>> result;
// Handle special cases for small data list
if (this->x.size() <= total_buckets)
{
for (size_t i = 0; i < this->x.size(); ++i)
{
result.emplace_back(std::make_pair(this->x[i], this->y[i]));
}
return result;
}
// Handle special cases for 0 or 1 or 2 buckets
if (total_buckets == 0)
return result;
if (total_buckets == 1)
{
result.emplace_back(std::make_pair(this->x.front(), this->y.front()));
return result;
}
if (total_buckets == 2)
{
result.emplace_back(std::make_pair(this->x.front(), this->y.front()));
result.emplace_back(std::make_pair(this->x.back(), this->y.back()));
return result;
}
// Find the size of each bucket
size_t single_bucket_size = this->x.size() / total_buckets;
// Include the first data point
result.emplace_back(std::make_pair(this->x[0], this->y[0]));
for (size_t i = 1; i < total_buckets - 1; ++i) // Skip the first and last bucket
{
size_t start_index = i * single_bucket_size;
size_t end_index = (i + 1) * single_bucket_size;
// Compute the average point in the next bucket
Float64 avg_x = 0;
Float64 avg_y = 0;
for (size_t j = end_index; j < (i + 2) * single_bucket_size; ++j)
{
avg_x += this->x[j];
avg_y += this->y[j];
}
avg_x /= single_bucket_size;
avg_y /= single_bucket_size;
// Find the point in the current bucket that forms the largest triangle
size_t max_index = start_index;
Float64 max_area = 0.0;
for (size_t j = start_index; j < end_index; ++j)
{
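/// Triangle area via the shoelace formula, with vertices at the last kept point,
/// the candidate point and the average point of the next bucket.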
Float64 area = std::abs(
0.5
* (result.back().first * this->y[j] + this->x[j] * avg_y + avg_x * result.back().second - result.back().first * avg_y
- this->x[j] * result.back().second - avg_x * this->y[j]));
if (area > max_area)
{
max_area = area;
max_index = j;
}
}
// Include the selected point
result.emplace_back(std::make_pair(this->x[max_index], this->y[max_index]));
}
// Include the last data point
result.emplace_back(std::make_pair(this->x.back(), this->y.back()));
return result;
}
};
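Outside the aggregate-function plumbing, the selection above is short. A minimal standalone sketch of the same largest-triangle-three-buckets pass over plain vectors (hypothetical lttb helper; assumes x is already sorted, x and y have equal size, and total_buckets >= 3):

#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

/// Keep the first and last points; from each interior bucket keep the point that
/// forms the largest triangle with the previously kept point and the average of
/// the next bucket. A sketch, not the production code path.
std::vector<std::pair<double, double>> lttb(
    const std::vector<double> & x, const std::vector<double> & y, size_t total_buckets)
{
    std::vector<std::pair<double, double>> result;
    if (x.size() <= total_buckets)
    {
        for (size_t i = 0; i < x.size(); ++i)
            result.emplace_back(x[i], y[i]);
        return result;
    }
    size_t bucket = x.size() / total_buckets;
    result.emplace_back(x[0], y[0]);
    for (size_t i = 1; i + 1 < total_buckets; ++i)
    {
        size_t begin = i * bucket;
        size_t end = (i + 1) * bucket;
        double avg_x = 0;
        double avg_y = 0;
        for (size_t j = end; j < (i + 2) * bucket; ++j)
        {
            avg_x += x[j];
            avg_y += y[j];
        }
        avg_x /= bucket;
        avg_y /= bucket;
        size_t best = begin;
        double best_area = -1;
        for (size_t j = begin; j < end; ++j)
        {
            /// Shoelace formula for the triangle (kept point, candidate, average).
            double area = std::fabs(
                result.back().first * (y[j] - avg_y)
                + x[j] * (avg_y - result.back().second)
                + avg_x * (result.back().second - y[j])) / 2;
            if (area > best_area)
            {
                best_area = area;
                best = j;
            }
        }
        result.emplace_back(x[best], y[best]);
    }
    result.emplace_back(x.back(), y.back());
    return result;
}

With, say, total_buckets = 1000 over a long series, this keeps visually salient extremes that a plain every-Nth downsampling would drop.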
class AggregateFunctionLargestTriangleThreeBuckets final : public IAggregateFunctionDataHelper<LargestTriangleThreeBucketsData, AggregateFunctionLargestTriangleThreeBuckets>
{
private:
UInt64 total_buckets{0};
TypeIndex x_type;
TypeIndex y_type;
public:
explicit AggregateFunctionLargestTriangleThreeBuckets(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<LargestTriangleThreeBucketsData, AggregateFunctionLargestTriangleThreeBuckets>({arguments}, {}, createResultType(arguments))
{
if (params.size() != 1)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} require one parameter", getName());
if (params[0].getType() != Field::Types::UInt64)
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} require first parameter to be a UInt64", getName());
total_buckets = params[0].get<UInt64>();
this->x_type = WhichDataType(arguments[0]).idx;
this->y_type = WhichDataType(arguments[1]).idx;
}
static constexpr auto name = "largestTriangleThreeBuckets";
String getName() const override { return name; }
bool allocatesMemoryInArena() const override { return true; }
static DataTypePtr createResultType(const DataTypes & arguments)
{
TypeIndex x_type = arguments[0]->getTypeId();
TypeIndex y_type = arguments[1]->getTypeId();
UInt32 x_scale = 0;
UInt32 y_scale = 0;
if (const auto * datetime64_type = typeid_cast<const DataTypeDateTime64 *>(arguments[0].get()))
{
x_scale = datetime64_type->getScale();
}
if (const auto * datetime64_type = typeid_cast<const DataTypeDateTime64 *>(arguments[1].get()))
{
y_scale = datetime64_type->getScale();
}
DataTypes types = {getDataTypeFromTypeIndex(x_type, x_scale), getDataTypeFromTypeIndex(y_type, y_scale)};
auto tuple = std::make_shared<DataTypeTuple>(std::move(types));
return std::make_shared<DataTypeArray>(tuple);
}
static DataTypePtr getDataTypeFromTypeIndex(TypeIndex type_index, UInt32 scale)
{
DataTypePtr data_type;
switch (type_index)
{
case TypeIndex::Date:
data_type = std::make_shared<DataTypeDate>();
break;
case TypeIndex::Date32:
data_type = std::make_shared<DataTypeDate32>();
break;
case TypeIndex::DateTime:
data_type = std::make_shared<DataTypeDateTime>();
break;
case TypeIndex::DateTime64:
data_type = std::make_shared<DataTypeDateTime64>(scale);
break;
default:
data_type = std::make_shared<DataTypeNumber<Float64>>();
}
return data_type;
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Float64 x = getFloat64DataFromColumn(columns[0], row_num, this->x_type);
Float64 y = getFloat64DataFromColumn(columns[1], row_num, this->y_type);
this->data(place).add(x, y, arena);
}
Float64 getFloat64DataFromColumn(const IColumn * column, size_t row_num, TypeIndex type_index) const
{
switch (type_index)
{
case TypeIndex::Date:
return static_cast<const ColumnDate &>(*column).getData()[row_num];
case TypeIndex::Date32:
return static_cast<const ColumnDate32 &>(*column).getData()[row_num];
case TypeIndex::DateTime:
return static_cast<const ColumnDateTime &>(*column).getData()[row_num];
case TypeIndex::DateTime64:
return static_cast<const ColumnDateTime64 &>(*column).getData()[row_num];
default:
return column->getFloat64(row_num);
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & a = this->data(place);
const auto & b = this->data(rhs);
a.merge(b, arena);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena * arena) const override
{
auto res = this->data(place).getResult(total_buckets, arena);
auto & col = assert_cast<ColumnArray &>(to);
auto & col_offsets = assert_cast<ColumnArray::ColumnOffsets &>(col.getOffsetsColumn());
auto column_x_adder_func = getColumnAdderFunc(x_type);
auto column_y_adder_func = getColumnAdderFunc(y_type);
for (size_t i = 0; i < res.size(); ++i)
{
auto & column_tuple = assert_cast<ColumnTuple &>(col.getData());
column_x_adder_func(column_tuple.getColumn(0), res[i].first);
column_y_adder_func(column_tuple.getColumn(1), res[i].second);
}
col_offsets.getData().push_back(col.getData().size());
}
std::function<void(IColumn &, Float64)> getColumnAdderFunc(TypeIndex type_index) const
{
switch (type_index)
{
case TypeIndex::Date:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnDate &>(column);
col.getData().push_back(static_cast<UInt16>(value));
};
case TypeIndex::Date32:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnDate32 &>(column);
col.getData().push_back(static_cast<UInt32>(value));
};
case TypeIndex::DateTime:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnDateTime &>(column);
col.getData().push_back(static_cast<UInt32>(value));
};
case TypeIndex::DateTime64:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnDateTime64 &>(column);
col.getData().push_back(static_cast<UInt64>(value));
};
default:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnFloat64 &>(column);
col.getData().push_back(value);
};
}
}
};
AggregateFunctionPtr
createAggregateFunctionLargestTriangleThreeBuckets(const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{
assertBinary(name, argument_types);
if (!(isNumber(argument_types[0]) || isDateOrDate32(argument_types[0]) || isDateTime(argument_types[0])
|| isDateTime64(argument_types[0])))
throw Exception(
ErrorCodes::NOT_IMPLEMENTED,
"Aggregate function {} only supports Date, Date32, DateTime, DateTime64 and Number as the first argument",
name);
if (!(isNumber(argument_types[1]) || isDateOrDate32(argument_types[1]) || isDateTime(argument_types[1])
|| isDateTime64(argument_types[1])))
throw Exception(
ErrorCodes::NOT_IMPLEMENTED,
"Aggregate function {} only supports Date, Date32, DateTime, DateTime64 and Number as the second argument",
name);
return std::make_shared<AggregateFunctionLargestTriangleThreeBuckets>(argument_types, parameters);
}
}

View File

@ -33,295 +33,6 @@ namespace ErrorCodes
}
struct LargestTriangleThreeBucketsData : public StatisticalSample<Float64, Float64>
{
void add(const Float64 xval, const Float64 yval, Arena * arena)
{
this->addX(xval, arena);
this->addY(yval, arena);
}
void sort(Arena * arena)
{
// sort the this->x and this->y in ascending order of this->x using index
std::vector<size_t> index(this->x.size());
std::iota(index.begin(), index.end(), 0);
::sort(index.begin(), index.end(), [&](size_t i1, size_t i2) { return this->x[i1] < this->x[i2]; });
SampleX temp_x{};
SampleY temp_y{};
for (size_t i = 0; i < this->x.size(); ++i)
{
temp_x.push_back(this->x[index[i]], arena);
temp_y.push_back(this->y[index[i]], arena);
}
for (size_t i = 0; i < this->x.size(); ++i)
{
this->x[i] = temp_x[i];
this->y[i] = temp_y[i];
}
}
PODArray<std::pair<Float64, Float64>> getResult(size_t total_buckets, Arena * arena)
{
// Sort the data
this->sort(arena);
PODArray<std::pair<Float64, Float64>> result;
// Handle special cases for small data list
if (this->x.size() <= total_buckets)
{
for (size_t i = 0; i < this->x.size(); ++i)
{
result.emplace_back(std::make_pair(this->x[i], this->y[i]));
}
return result;
}
// Handle special cases for 0 or 1 or 2 buckets
if (total_buckets == 0)
return result;
if (total_buckets == 1)
{
result.emplace_back(std::make_pair(this->x.front(), this->y.front()));
return result;
}
if (total_buckets == 2)
{
result.emplace_back(std::make_pair(this->x.front(), this->y.front()));
result.emplace_back(std::make_pair(this->x.back(), this->y.back()));
return result;
}
// Find the size of each bucket
size_t single_bucket_size = this->x.size() / total_buckets;
// Include the first data point
result.emplace_back(std::make_pair(this->x[0], this->y[0]));
for (size_t i = 1; i < total_buckets - 1; ++i) // Skip the first and last bucket
{
size_t start_index = i * single_bucket_size;
size_t end_index = (i + 1) * single_bucket_size;
// Compute the average point in the next bucket
Float64 avg_x = 0;
Float64 avg_y = 0;
for (size_t j = end_index; j < (i + 2) * single_bucket_size; ++j)
{
avg_x += this->x[j];
avg_y += this->y[j];
}
avg_x /= single_bucket_size;
avg_y /= single_bucket_size;
// Find the point in the current bucket that forms the largest triangle
size_t max_index = start_index;
Float64 max_area = 0.0;
for (size_t j = start_index; j < end_index; ++j)
{
Float64 area = std::abs(
0.5
* (result.back().first * this->y[j] + this->x[j] * avg_y + avg_x * result.back().second - result.back().first * avg_y
- this->x[j] * result.back().second - avg_x * this->y[j]));
if (area > max_area)
{
max_area = area;
max_index = j;
}
}
// Include the selected point
result.emplace_back(std::make_pair(this->x[max_index], this->y[max_index]));
}
// Include the last data point
result.emplace_back(std::make_pair(this->x.back(), this->y.back()));
return result;
}
};
class AggregateFunctionLargestTriangleThreeBuckets final : public IAggregateFunctionDataHelper<LargestTriangleThreeBucketsData, AggregateFunctionLargestTriangleThreeBuckets>
{
private:
UInt64 total_buckets{0};
TypeIndex x_type;
TypeIndex y_type;
public:
explicit AggregateFunctionLargestTriangleThreeBuckets(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<LargestTriangleThreeBucketsData, AggregateFunctionLargestTriangleThreeBuckets>({arguments}, {}, createResultType(arguments))
{
if (params.size() != 1)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} require one parameter", getName());
if (params[0].getType() != Field::Types::UInt64)
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} require first parameter to be a UInt64", getName());
total_buckets = params[0].get<UInt64>();
this->x_type = WhichDataType(arguments[0]).idx;
this->y_type = WhichDataType(arguments[1]).idx;
}
static constexpr auto name = "largestTriangleThreeBuckets";
String getName() const override { return name; }
bool allocatesMemoryInArena() const override { return true; }
static DataTypePtr createResultType(const DataTypes & arguments)
{
TypeIndex x_type = arguments[0]->getTypeId();
TypeIndex y_type = arguments[1]->getTypeId();
UInt32 x_scale = 0;
UInt32 y_scale = 0;
if (const auto * datetime64_type = typeid_cast<const DataTypeDateTime64 *>(arguments[0].get()))
{
x_scale = datetime64_type->getScale();
}
if (const auto * datetime64_type = typeid_cast<const DataTypeDateTime64 *>(arguments[1].get()))
{
y_scale = datetime64_type->getScale();
}
DataTypes types = {getDataTypeFromTypeIndex(x_type, x_scale), getDataTypeFromTypeIndex(y_type, y_scale)};
auto tuple = std::make_shared<DataTypeTuple>(std::move(types));
return std::make_shared<DataTypeArray>(tuple);
}
static DataTypePtr getDataTypeFromTypeIndex(TypeIndex type_index, UInt32 scale)
{
DataTypePtr data_type;
switch (type_index)
{
case TypeIndex::Date:
data_type = std::make_shared<DataTypeDate>();
break;
case TypeIndex::Date32:
data_type = std::make_shared<DataTypeDate32>();
break;
case TypeIndex::DateTime:
data_type = std::make_shared<DataTypeDateTime>();
break;
case TypeIndex::DateTime64:
data_type = std::make_shared<DataTypeDateTime64>(scale);
break;
default:
data_type = std::make_shared<DataTypeNumber<Float64>>();
}
return data_type;
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Float64 x = getFloat64DataFromColumn(columns[0], row_num, this->x_type);
Float64 y = getFloat64DataFromColumn(columns[1], row_num, this->y_type);
this->data(place).add(x, y, arena);
}
Float64 getFloat64DataFromColumn(const IColumn * column, size_t row_num, TypeIndex type_index) const
{
switch (type_index)
{
case TypeIndex::Date:
return static_cast<const ColumnDate &>(*column).getData()[row_num];
case TypeIndex::Date32:
return static_cast<const ColumnDate32 &>(*column).getData()[row_num];
case TypeIndex::DateTime:
return static_cast<const ColumnDateTime &>(*column).getData()[row_num];
case TypeIndex::DateTime64:
return static_cast<const ColumnDateTime64 &>(*column).getData()[row_num];
default:
return column->getFloat64(row_num);
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & a = this->data(place);
const auto & b = this->data(rhs);
a.merge(b, arena);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena * arena) const override
{
auto res = this->data(place).getResult(total_buckets, arena);
auto & col = assert_cast<ColumnArray &>(to);
auto & col_offsets = assert_cast<ColumnArray::ColumnOffsets &>(col.getOffsetsColumn());
auto column_x_adder_func = getColumnAdderFunc(x_type);
auto column_y_adder_func = getColumnAdderFunc(y_type);
for (size_t i = 0; i < res.size(); ++i)
{
auto & column_tuple = assert_cast<ColumnTuple &>(col.getData());
column_x_adder_func(column_tuple.getColumn(0), res[i].first);
column_y_adder_func(column_tuple.getColumn(1), res[i].second);
}
col_offsets.getData().push_back(col.getData().size());
}
std::function<void(IColumn &, Float64)> getColumnAdderFunc(TypeIndex type_index) const
{
switch (type_index)
{
case TypeIndex::Date:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnDate &>(column);
col.getData().push_back(static_cast<UInt16>(value));
};
case TypeIndex::Date32:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnDate32 &>(column);
col.getData().push_back(static_cast<UInt32>(value));
};
case TypeIndex::DateTime:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnDateTime &>(column);
col.getData().push_back(static_cast<UInt32>(value));
};
case TypeIndex::DateTime64:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnDateTime64 &>(column);
col.getData().push_back(static_cast<UInt64>(value));
};
default:
return [](IColumn & column, Float64 value)
{
auto & col = assert_cast<ColumnFloat64 &>(column);
col.getData().push_back(value);
};
}
}
};
}

View File

@ -1,21 +1,254 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionMannWhitney.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <Common/PODArray_fwd.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <limits>
#include <boost/math/distributions/normal.hpp>
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
}
namespace DB
{
struct Settings;
namespace
{
struct MannWhitneyData : public StatisticalSample<Float64, Float64>
{
/* The null hypothesis is "for randomly selected values X and Y from two populations,
* the probability of X being greater than Y is equal to the probability of Y being greater than X",
* i.e. "the distribution F of the first sample equals the distribution G of the second sample".
* The alternative hypothesis (H1) is then "two-sided" (F != G), "less" (F < G) or "greater" (F > G). */
enum class Alternative
{
TwoSided,
Less,
Greater
};
/// The behaviour matches that of the analogous function from scipy.
/// https://github.com/scipy/scipy/blob/ab9e9f17e0b7b2d618c4d4d8402cd4c0c200d6c0/scipy/stats/stats.py#L6978
std::pair<Float64, Float64> getResult(Alternative alternative, bool continuity_correction)
{
ConcatenatedSamples both(this->x, this->y);
RanksArray ranks;
Float64 tie_correction;
/// Compute ranks according to both samples.
std::tie(ranks, tie_correction) = computeRanksAndTieCorrection(both);
const Float64 n1 = this->size_x;
const Float64 n2 = this->size_y;
Float64 r1 = 0;
for (size_t i = 0; i < n1; ++i)
r1 += ranks[i];
const Float64 u1 = n1 * n2 + (n1 * (n1 + 1.)) / 2. - r1;
const Float64 u2 = n1 * n2 - u1;
/// The distribution of the U-statistic under the null hypothesis H0 is symmetric with respect to meanrank.
const Float64 meanrank = n1 * n2 / 2. + 0.5 * continuity_correction;
const Float64 sd = std::sqrt(tie_correction * n1 * n2 * (n1 + n2 + 1) / 12.0);
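/// Under H0 with no ties: E[U] = n1 * n2 / 2 and Var[U] = n1 * n2 * (n1 + n2 + 1) / 12;
/// the tie correction rescales the variance.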
Float64 u = 0;
if (alternative == Alternative::TwoSided)
/// It makes no difference which u_i to take as u, because z will differ only in sign, and we take std::abs() of it.
u = std::max(u1, u2);
else if (alternative == Alternative::Less)
u = u1;
else if (alternative == Alternative::Greater)
u = u2;
Float64 z = (u - meanrank) / sd;
if (unlikely(!std::isfinite(z)))
return {std::numeric_limits<Float64>::quiet_NaN(), std::numeric_limits<Float64>::quiet_NaN()};
if (alternative == Alternative::TwoSided)
z = std::abs(z);
auto standard_normal_distribution = boost::math::normal_distribution<Float64>();
auto cdf = boost::math::cdf(standard_normal_distribution, z);
Float64 p_value = 0;
if (alternative == Alternative::TwoSided)
p_value = 2 - 2 * cdf;
else
p_value = 1 - cdf;
return {u2, p_value};
}
private:
using Sample = typename StatisticalSample<Float64, Float64>::SampleX;
/// We need to compute ranks according to all samples. Use this class to avoid extra copy and memory allocation.
class ConcatenatedSamples
{
public:
ConcatenatedSamples(const Sample & first_, const Sample & second_)
: first(first_), second(second_) {}
const Float64 & operator[](size_t ind) const
{
if (ind < first.size())
return first[ind];
return second[ind - first.size()];
}
size_t size() const
{
return first.size() + second.size();
}
private:
const Sample & first;
const Sample & second;
};
};
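The computation is easy to replay on toy data. A self-contained sketch of the two-sided test above, assuming no ties and no continuity correction (so tie_correction == 1 and meanrank == n1 * n2 / 2), with std::erfc standing in for the boost normal CDF:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main()
{
    std::vector<double> x{1.0, 2.5, 3.1};
    std::vector<double> y{2.0, 4.2, 5.0, 6.1};

    /// Rank the concatenated samples (1-based); the real code averages ranks of ties.
    std::vector<double> all(x);
    all.insert(all.end(), y.begin(), y.end());
    std::vector<size_t> order(all.size());
    std::iota(order.begin(), order.end(), 0);
    std::sort(order.begin(), order.end(), [&](size_t a, size_t b) { return all[a] < all[b]; });
    std::vector<double> ranks(all.size());
    for (size_t i = 0; i < order.size(); ++i)
        ranks[order[i]] = static_cast<double>(i + 1);

    const double n1 = x.size();
    const double n2 = y.size();
    const double r1 = std::accumulate(ranks.begin(), ranks.begin() + x.size(), 0.0);
    const double u1 = n1 * n2 + n1 * (n1 + 1.) / 2. - r1;
    const double u2 = n1 * n2 - u1;

    const double z = (std::max(u1, u2) - n1 * n2 / 2.) / std::sqrt(n1 * n2 * (n1 + n2 + 1.) / 12.);
    /// Two-sided p-value: 2 * (1 - Phi(|z|)) == erfc(|z| / sqrt(2)).
    const double p_value = std::erfc(std::abs(z) / std::sqrt(2.));
    std::cout << "u2 = " << u2 << ", p = " << p_value << '\n';
}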
class AggregateFunctionMannWhitney final:
public IAggregateFunctionDataHelper<MannWhitneyData, AggregateFunctionMannWhitney>
{
private:
using Alternative = typename MannWhitneyData::Alternative;
Alternative alternative;
bool continuity_correction{true};
public:
explicit AggregateFunctionMannWhitney(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<MannWhitneyData, AggregateFunctionMannWhitney> ({arguments}, {}, createResultType())
{
if (params.size() > 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} require two parameter or less", getName());
if (params.empty())
{
alternative = Alternative::TwoSided;
return;
}
if (params[0].getType() != Field::Types::String)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} require first parameter to be a String", getName());
const auto & param = params[0].get<String>();
if (param == "two-sided")
alternative = Alternative::TwoSided;
else if (param == "less")
alternative = Alternative::Less;
else if (param == "greater")
alternative = Alternative::Greater;
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown parameter in aggregate function {}. "
"It must be one of: 'two-sided', 'less', 'greater'", getName());
if (params.size() != 2)
return;
if (params[1].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} require second parameter to be a UInt64", getName());
continuity_correction = static_cast<bool>(params[1].get<UInt64>());
}
String getName() const override
{
return "mannWhitneyUTest";
}
bool allocatesMemoryInArena() const override { return true; }
static DataTypePtr createResultType()
{
DataTypes types
{
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
};
Strings names
{
"u_statistic",
"p_value"
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
if (is_second)
this->data(place).addY(value, arena);
else
this->data(place).addX(value, arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & a = this->data(place);
const auto & b = this->data(rhs);
a.merge(b, arena);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
if (!this->data(place).size_x || !this->data(place).size_y)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} require both samples to be non empty", getName());
auto [u_statistic, p_value] = this->data(place).getResult(alternative, continuity_correction);
/// Because p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
column_stat.getData().push_back(u_statistic);
column_value.getData().push_back(p_value);
}
};
AggregateFunctionPtr createAggregateFunctionMannWhitneyUTest(
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{

View File

@ -1,249 +0,0 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <Common/PODArray_fwd.h>
#include <base/types.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <limits>
#include <boost/math/distributions/normal.hpp>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
}
struct MannWhitneyData : public StatisticalSample<Float64, Float64>
{
/* The null hypothesis is "for randomly selected values X and Y from two populations,
* the probability of X being greater than Y is equal to the probability of Y being greater than X",
* i.e. "the distribution F of the first sample equals the distribution G of the second sample".
* The alternative hypothesis (H1) is then "two-sided" (F != G), "less" (F < G) or "greater" (F > G). */
enum class Alternative
{
TwoSided,
Less,
Greater
};
/// The behaviour matches that of the analogous function from scipy.
/// https://github.com/scipy/scipy/blob/ab9e9f17e0b7b2d618c4d4d8402cd4c0c200d6c0/scipy/stats/stats.py#L6978
std::pair<Float64, Float64> getResult(Alternative alternative, bool continuity_correction)
{
ConcatenatedSamples both(this->x, this->y);
RanksArray ranks;
Float64 tie_correction;
/// Compute ranks according to both samples.
std::tie(ranks, tie_correction) = computeRanksAndTieCorrection(both);
const Float64 n1 = this->size_x;
const Float64 n2 = this->size_y;
Float64 r1 = 0;
for (size_t i = 0; i < n1; ++i)
r1 += ranks[i];
const Float64 u1 = n1 * n2 + (n1 * (n1 + 1.)) / 2. - r1;
const Float64 u2 = n1 * n2 - u1;
/// The distribution of the U-statistic under the null hypothesis H0 is symmetric with respect to meanrank.
const Float64 meanrank = n1 * n2 / 2. + 0.5 * continuity_correction;
const Float64 sd = std::sqrt(tie_correction * n1 * n2 * (n1 + n2 + 1) / 12.0);
Float64 u = 0;
if (alternative == Alternative::TwoSided)
/// It makes no difference which u_i to take as u, because z will differ only in sign, and we take std::abs() of it.
u = std::max(u1, u2);
else if (alternative == Alternative::Less)
u = u1;
else if (alternative == Alternative::Greater)
u = u2;
Float64 z = (u - meanrank) / sd;
if (unlikely(!std::isfinite(z)))
return {std::numeric_limits<Float64>::quiet_NaN(), std::numeric_limits<Float64>::quiet_NaN()};
if (alternative == Alternative::TwoSided)
z = std::abs(z);
auto standard_normal_distribution = boost::math::normal_distribution<Float64>();
auto cdf = boost::math::cdf(standard_normal_distribution, z);
Float64 p_value = 0;
if (alternative == Alternative::TwoSided)
p_value = 2 - 2 * cdf;
else
p_value = 1 - cdf;
return {u2, p_value};
}
private:
using Sample = typename StatisticalSample<Float64, Float64>::SampleX;
/// We need to compute ranks according to all samples. Use this class to avoid extra copy and memory allocation.
class ConcatenatedSamples
{
public:
ConcatenatedSamples(const Sample & first_, const Sample & second_)
: first(first_), second(second_) {}
const Float64 & operator[](size_t ind) const
{
if (ind < first.size())
return first[ind];
return second[ind - first.size()];
}
size_t size() const
{
return first.size() + second.size();
}
private:
const Sample & first;
const Sample & second;
};
};
class AggregateFunctionMannWhitney final:
public IAggregateFunctionDataHelper<MannWhitneyData, AggregateFunctionMannWhitney>
{
private:
using Alternative = typename MannWhitneyData::Alternative;
Alternative alternative;
bool continuity_correction{true};
public:
explicit AggregateFunctionMannWhitney(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<MannWhitneyData, AggregateFunctionMannWhitney> ({arguments}, {}, createResultType())
{
if (params.size() > 2)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} require two parameter or less", getName());
if (params.empty())
{
alternative = Alternative::TwoSided;
return;
}
if (params[0].getType() != Field::Types::String)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} require first parameter to be a String", getName());
const auto & param = params[0].get<String>();
if (param == "two-sided")
alternative = Alternative::TwoSided;
else if (param == "less")
alternative = Alternative::Less;
else if (param == "greater")
alternative = Alternative::Greater;
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown parameter in aggregate function {}. "
"It must be one of: 'two-sided', 'less', 'greater'", getName());
if (params.size() != 2)
return;
if (params[1].getType() != Field::Types::UInt64)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Aggregate function {} require second parameter to be a UInt64", getName());
continuity_correction = static_cast<bool>(params[1].get<UInt64>());
}
String getName() const override
{
return "mannWhitneyUTest";
}
bool allocatesMemoryInArena() const override { return true; }
static DataTypePtr createResultType()
{
DataTypes types
{
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
};
Strings names
{
"u_statistic",
"p_value"
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
if (is_second)
this->data(place).addY(value, arena);
else
this->data(place).addX(value, arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & a = this->data(place);
const auto & b = this->data(rhs);
a.merge(b, arena);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
if (!this->data(place).size_x || !this->data(place).size_y)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} require both samples to be non empty", getName());
auto [u_statistic, p_value] = this->data(place).getResult(alternative, continuity_correction);
/// Because p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
column_stat.getData().push_back(u_statistic);
column_value.getData().push_back(p_value);
}
};
}

View File

@ -1,8 +1,21 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionMaxIntersections.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnsNumber.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Common/ArenaAllocator.h>
#include <Common/NaNUtils.h>
#include <Common/assert_cast.h>
#include <AggregateFunctions/IAggregateFunction.h>
#define AGGREGATE_FUNCTION_MAX_INTERSECTIONS_MAX_ARRAY_SIZE 0xFFFFFF
namespace DB
{
@ -11,24 +24,187 @@ struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int TOO_LARGE_ARRAY_SIZE;
}
namespace
{
/** maxIntersections: returns maximum count of the intersected intervals defined by start_column and end_column values,
* maxIntersectionsPosition: returns leftmost position of maximum intersection of intervals.
*/
/// Similar to GroupArrayNumericData.
template <typename T>
struct MaxIntersectionsData
{
/// Left or right end of the interval and signed weight; with positive sign for begin of interval and negative sign for end of interval.
using Value = std::pair<T, Int64>;
// Switch to ordinary Allocator after 4096 bytes to avoid fragmentation and trash in Arena
using Allocator = MixedAlignedArenaAllocator<alignof(Value), 4096>;
using Array = PODArray<Value, 32, Allocator>;
Array value;
};
enum class AggregateFunctionIntersectionsKind
{
Count,
Position
};
template <typename PointType>
class AggregateFunctionIntersectionsMax final
: public IAggregateFunctionDataHelper<MaxIntersectionsData<PointType>, AggregateFunctionIntersectionsMax<PointType>>
{
private:
AggregateFunctionIntersectionsKind kind;
public:
AggregateFunctionIntersectionsMax(AggregateFunctionIntersectionsKind kind_, const DataTypes & arguments)
: IAggregateFunctionDataHelper<MaxIntersectionsData<PointType>, AggregateFunctionIntersectionsMax<PointType>>(arguments, {}, createResultType(kind_))
, kind(kind_)
{
if (!isNativeNumber(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{}: first argument must be represented by integer", getName());
if (!isNativeNumber(arguments[1]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{}: second argument must be represented by integer", getName());
if (!arguments[0]->equals(*arguments[1]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{}: arguments must have the same type", getName());
}
String getName() const override
{
return kind == AggregateFunctionIntersectionsKind::Count
? "maxIntersections"
: "maxIntersectionsPosition";
}
static DataTypePtr createResultType(AggregateFunctionIntersectionsKind kind_)
{
if (kind_ == AggregateFunctionIntersectionsKind::Count)
return std::make_shared<DataTypeUInt64>();
else
return std::make_shared<DataTypeNumber<PointType>>();
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
PointType left = assert_cast<const ColumnVector<PointType> &>(*columns[0]).getData()[row_num];
PointType right = assert_cast<const ColumnVector<PointType> &>(*columns[1]).getData()[row_num];
if (!isNaN(left))
this->data(place).value.push_back(std::make_pair(left, Int64(1)), arena);
if (!isNaN(right))
this->data(place).value.push_back(std::make_pair(right, Int64(-1)), arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & cur_elems = this->data(place);
auto & rhs_elems = this->data(rhs);
cur_elems.value.insert(rhs_elems.value.begin(), rhs_elems.value.end(), arena);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
const auto & value = this->data(place).value;
size_t size = value.size();
writeVarUInt(size, buf);
/// In this version, pairs were serialized with padding.
/// We must ensure that padding bytes are zero-filled.
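/// E.g. for PointType = UInt32, Value is std::pair<UInt32, Int64>: 4 bytes of value,
/// 4 padding bytes, then 8 bytes of weight (assuming the usual alignof(Int64) == 8).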
static_assert(offsetof(typename MaxIntersectionsData<PointType>::Value, first) == 0);
static_assert(offsetof(typename MaxIntersectionsData<PointType>::Value, second) > 0);
char zero_padding[offsetof(typename MaxIntersectionsData<PointType>::Value, second) - sizeof(value[0].first)]{};
for (size_t i = 0; i < size; ++i)
{
writePODBinary(value[i].first, buf);
writePODBinary(zero_padding, buf);
if constexpr (std::endian::native == std::endian::little)
writePODBinary(value[i].second, buf);
else
writePODBinary(std::byteswap(value[i].second), buf);
}
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
size_t size = 0;
readVarUInt(size, buf);
if (unlikely(size > AGGREGATE_FUNCTION_MAX_INTERSECTIONS_MAX_ARRAY_SIZE))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size (maximum: {})", AGGREGATE_FUNCTION_MAX_INTERSECTIONS_MAX_ARRAY_SIZE);
auto & value = this->data(place).value;
value.resize(size, arena);
buf.readStrict(reinterpret_cast<char *>(value.data()), size * sizeof(value[0]));
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
Int64 current_intersections = 0;
Int64 max_intersections = 0;
PointType position_of_max_intersections = 0;
/// The array is sorted in place: insertResultInto is allowed to modify the aggregation state.
auto & array = this->data(place).value;
/// Sort by position; for equal position, sort by weight to get deterministic result.
::sort(array.begin(), array.end());
for (const auto & point_weight : array)
{
current_intersections += point_weight.second;
if (current_intersections > max_intersections)
{
max_intersections = current_intersections;
position_of_max_intersections = point_weight.first;
}
}
if (kind == AggregateFunctionIntersectionsKind::Count)
{
auto & result_column = assert_cast<ColumnUInt64 &>(to).getData();
result_column.push_back(max_intersections);
}
else
{
auto & result_column = assert_cast<ColumnVector<PointType> &>(to).getData();
result_column.push_back(position_of_max_intersections);
}
}
};
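add() and insertResultInto() together implement a classic endpoint sweep: +1 at each interval start, -1 at each end, and the running sum is the number of currently open intervals. A minimal standalone sketch of the same idea (hypothetical helper; as in the sort above, ends order before starts at equal positions, so touching intervals do not count as intersecting):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

/// Returns the maximum number of overlapping intervals and the leftmost
/// position at which that maximum is first reached.
std::pair<int64_t, double> maxIntersectionsSketch(const std::vector<std::pair<double, double>> & intervals)
{
    std::vector<std::pair<double, int64_t>> events;
    for (const auto & [left, right] : intervals)
    {
        events.emplace_back(left, 1);   /// An interval opens.
        events.emplace_back(right, -1); /// An interval closes.
    }
    std::sort(events.begin(), events.end());

    int64_t current = 0;
    int64_t best = 0;
    double best_pos = 0;
    for (const auto & [pos, weight] : events)
    {
        current += weight;
        if (current > best)
        {
            best = current;
            best_pos = pos;
        }
    }
    return {best, best_pos};
}

For intervals {1, 5}, {2, 7}, {6, 9} this yields {2, 2}: at most two intervals are open at once, first at position 2.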
AggregateFunctionPtr createAggregateFunctionMaxIntersections(
AggregateFunctionIntersectionsKind kind,
const std::string & name, const DataTypes & argument_types, const Array & parameters)
{
assertBinary(name, argument_types);
assertNoParameters(name, parameters);
AggregateFunctionPtr res(createWithNumericType<AggregateFunctionIntersectionsMax>(*argument_types[0], kind, argument_types));
if (!res)
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal types {} and {} of argument for aggregate function {}",
argument_types[0]->getName(), argument_types[1]->getName(), name);
return res;
}
}
void registerAggregateFunctionsMaxIntersections(AggregateFunctionFactory & factory)

View File

@ -1,189 +0,0 @@
#pragma once
#include <base/sort.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnsNumber.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Common/ArenaAllocator.h>
#include <Common/NaNUtils.h>
#include <Common/assert_cast.h>
#include <AggregateFunctions/IAggregateFunction.h>
#define AGGREGATE_FUNCTION_MAX_INTERSECTIONS_MAX_ARRAY_SIZE 0xFFFFFF
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int TOO_LARGE_ARRAY_SIZE;
}
/** maxIntersections: returns maximum count of the intersected intervals defined by start_column and end_column values,
* maxIntersectionsPosition: returns leftmost position of maximum intersection of intervals.
*/
/// Similar to GroupArrayNumericData.
template <typename T>
struct MaxIntersectionsData
{
/// Left or right end of the interval and signed weight; with positive sign for begin of interval and negative sign for end of interval.
using Value = std::pair<T, Int64>;
// Switch to ordinary Allocator after 4096 bytes to avoid fragmentation and trash in Arena
using Allocator = MixedAlignedArenaAllocator<alignof(Value), 4096>;
using Array = PODArray<Value, 32, Allocator>;
Array value;
};
enum class AggregateFunctionIntersectionsKind
{
Count,
Position
};
template <typename PointType>
class AggregateFunctionIntersectionsMax final
: public IAggregateFunctionDataHelper<MaxIntersectionsData<PointType>, AggregateFunctionIntersectionsMax<PointType>>
{
private:
AggregateFunctionIntersectionsKind kind;
public:
AggregateFunctionIntersectionsMax(AggregateFunctionIntersectionsKind kind_, const DataTypes & arguments)
: IAggregateFunctionDataHelper<MaxIntersectionsData<PointType>, AggregateFunctionIntersectionsMax<PointType>>(arguments, {}, createResultType(kind_))
, kind(kind_)
{
if (!isNativeNumber(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{}: first argument must be represented by integer", getName());
if (!isNativeNumber(arguments[1]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{}: second argument must be represented by integer", getName());
if (!arguments[0]->equals(*arguments[1]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "{}: arguments must have the same type", getName());
}
String getName() const override
{
return kind == AggregateFunctionIntersectionsKind::Count
? "maxIntersections"
: "maxIntersectionsPosition";
}
static DataTypePtr createResultType(AggregateFunctionIntersectionsKind kind_)
{
if (kind_ == AggregateFunctionIntersectionsKind::Count)
return std::make_shared<DataTypeUInt64>();
else
return std::make_shared<DataTypeNumber<PointType>>();
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
PointType left = assert_cast<const ColumnVector<PointType> &>(*columns[0]).getData()[row_num];
PointType right = assert_cast<const ColumnVector<PointType> &>(*columns[1]).getData()[row_num];
if (!isNaN(left))
this->data(place).value.push_back(std::make_pair(left, Int64(1)), arena);
if (!isNaN(right))
this->data(place).value.push_back(std::make_pair(right, Int64(-1)), arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & cur_elems = this->data(place);
auto & rhs_elems = this->data(rhs);
cur_elems.value.insert(rhs_elems.value.begin(), rhs_elems.value.end(), arena);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
const auto & value = this->data(place).value;
size_t size = value.size();
writeVarUInt(size, buf);
/// In this version, pairs were serialized with padding.
/// We must ensure that padding bytes are zero-filled.
static_assert(offsetof(typename MaxIntersectionsData<PointType>::Value, first) == 0);
static_assert(offsetof(typename MaxIntersectionsData<PointType>::Value, second) > 0);
char zero_padding[offsetof(typename MaxIntersectionsData<PointType>::Value, second) - sizeof(value[0].first)]{};
for (size_t i = 0; i < size; ++i)
{
writePODBinary(value[i].first, buf);
writePODBinary(zero_padding, buf);
if constexpr (std::endian::native == std::endian::little)
writePODBinary(value[i].second, buf);
else
writePODBinary(std::byteswap(value[i].second), buf);
}
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
size_t size = 0;
readVarUInt(size, buf);
if (unlikely(size > AGGREGATE_FUNCTION_MAX_INTERSECTIONS_MAX_ARRAY_SIZE))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size (maximum: {})", AGGREGATE_FUNCTION_MAX_INTERSECTIONS_MAX_ARRAY_SIZE);
auto & value = this->data(place).value;
value.resize(size, arena);
buf.readStrict(reinterpret_cast<char *>(value.data()), size * sizeof(value[0]));
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
Int64 current_intersections = 0;
Int64 max_intersections = 0;
PointType position_of_max_intersections = 0;
/// The array is sorted in place: insertResultInto is allowed to modify the aggregation state.
auto & array = this->data(place).value;
/// Sort by position; for equal position, sort by weight to get deterministic result.
::sort(array.begin(), array.end());
for (const auto & point_weight : array)
{
current_intersections += point_weight.second;
if (current_intersections > max_intersections)
{
max_intersections = current_intersections;
position_of_max_intersections = point_weight.first;
}
}
if (kind == AggregateFunctionIntersectionsKind::Count)
{
auto & result_column = assert_cast<ColumnUInt64 &>(to).getData();
result_column.push_back(max_intersections);
}
else
{
auto & result_column = assert_cast<ColumnVector<PointType> &>(to).getData();
result_column.push_back(position_of_max_intersections);
}
}
};
}

View File

@ -1,8 +1,16 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionMeanZTest.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Moments.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <cmath>
namespace ErrorCodes
{
@ -18,6 +26,121 @@ struct Settings;
namespace
{
/// Returns tuple of (z-statistic, p-value, confidence-interval-low, confidence-interval-high)
template <typename Data>
class AggregateFunctionMeanZTest :
public IAggregateFunctionDataHelper<Data, AggregateFunctionMeanZTest<Data>>
{
private:
Float64 pop_var_x;
Float64 pop_var_y;
Float64 confidence_level;
public:
AggregateFunctionMeanZTest(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<Data, AggregateFunctionMeanZTest<Data>>({arguments}, params, createResultType())
{
pop_var_x = params.at(0).safeGet<Float64>();
pop_var_y = params.at(1).safeGet<Float64>();
confidence_level = params.at(2).safeGet<Float64>();
if (!std::isfinite(pop_var_x) || !std::isfinite(pop_var_y) || !std::isfinite(confidence_level))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} requires finite parameter values.", Data::name);
}
if (pop_var_x < 0.0 || pop_var_y < 0.0)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Population variance parameters must be larger than or equal to zero "
"in aggregate function {}.", Data::name);
}
if (confidence_level <= 0.0 || confidence_level >= 1.0)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Confidence level parameter must be between 0 and 1 in aggregate function {}.", Data::name);
}
}
String getName() const override
{
return Data::name;
}
static DataTypePtr createResultType()
{
DataTypes types
{
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
};
Strings names
{
"z_statistic",
"p_value",
"confidence_interval_low",
"confidence_interval_high"
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
if (is_second)
this->data(place).addY(value);
else
this->data(place).addX(value);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).read(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto [z_stat, p_value] = this->data(place).getResult(pop_var_x, pop_var_y);
auto [ci_low, ci_high] = this->data(place).getConfidenceIntervals(pop_var_x, pop_var_y, confidence_level);
/// Because p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
auto & column_ci_low = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(2));
auto & column_ci_high = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(3));
column_stat.getData().push_back(z_stat);
column_value.getData().push_back(p_value);
column_ci_low.getData().push_back(ci_low);
column_ci_high.getData().push_back(ci_high);
}
};
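For reference, the statistic this class expects its Data (a ZTestMoments specialization) to produce follows the textbook two-sample z-test with known population variances; a sketch, not the production formulas:

#include <cmath>

/// z = (mean_x - mean_y) / standard_error. The confidence interval returned next
/// to it is (mean_x - mean_y) +/- normal_quantile(confidence_level) * standard_error,
/// where normal_quantile is the conceptual inverse of the standard normal CDF.
double zStatistic(double mean_x, double mean_y,
                  double pop_var_x, double pop_var_y,
                  double n_x, double n_y)
{
    const double standard_error = std::sqrt(pop_var_x / n_x + pop_var_y / n_y);
    return (mean_x - mean_y) / standard_error;
}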
struct MeanZTestData : public ZTestMoments<Float64>
{
static constexpr auto name = "meanZTest";

View File

@ -1,141 +0,0 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <Core/Types.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <cmath>
namespace DB
{
struct Settings;
class ReadBuffer;
class WriteBuffer;
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
/// Returns tuple of (z-statistic, p-value, confidence-interval-low, confidence-interval-high)
template <typename Data>
class AggregateFunctionMeanZTest :
public IAggregateFunctionDataHelper<Data, AggregateFunctionMeanZTest<Data>>
{
private:
Float64 pop_var_x;
Float64 pop_var_y;
Float64 confidence_level;
public:
AggregateFunctionMeanZTest(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<Data, AggregateFunctionMeanZTest<Data>>({arguments}, params, createResultType())
{
pop_var_x = params.at(0).safeGet<Float64>();
pop_var_y = params.at(1).safeGet<Float64>();
confidence_level = params.at(2).safeGet<Float64>();
if (!std::isfinite(pop_var_x) || !std::isfinite(pop_var_y) || !std::isfinite(confidence_level))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} requires finite parameter values.", Data::name);
}
if (pop_var_x < 0.0 || pop_var_y < 0.0)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Population variance parameters must be larger than or equal to zero "
"in aggregate function {}.", Data::name);
}
if (confidence_level <= 0.0 || confidence_level >= 1.0)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Confidence level parameter must be between 0 and 1 in aggregate function {}.", Data::name);
}
}
String getName() const override
{
return Data::name;
}
static DataTypePtr createResultType()
{
DataTypes types
{
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
std::make_shared<DataTypeNumber<Float64>>(),
};
Strings names
{
"z_statistic",
"p_value",
"confidence_interval_low",
"confidence_interval_high"
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
Float64 value = columns[0]->getFloat64(row_num);
UInt8 is_second = columns[1]->getUInt(row_num);
if (is_second)
this->data(place).addY(value);
else
this->data(place).addX(value);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).read(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto [z_stat, p_value] = this->data(place).getResult(pop_var_x, pop_var_y);
auto [ci_low, ci_high] = this->data(place).getConfidenceIntervals(pop_var_x, pop_var_y, confidence_level);
/// Because p-value is a probability.
p_value = std::min(1.0, std::max(0.0, p_value));
auto & column_tuple = assert_cast<ColumnTuple &>(to);
auto & column_stat = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(0));
auto & column_value = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(1));
auto & column_ci_low = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(2));
auto & column_ci_high = assert_cast<ColumnVector<Float64> &>(column_tuple.getColumn(3));
column_stat.getData().push_back(z_stat);
column_value.getData().push_back(p_value);
column_ci_low.getData().push_back(ci_low);
column_ci_high.getData().push_back(ci_high);
}
};
}

View File

@ -1,5 +1,5 @@
#include <AggregateFunctions/AggregateFunctionQuantile.h>
#include <AggregateFunctions/QuantileReservoirSampler.h>
#include <AggregateFunctions/ReservoirSampler.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Helpers.h>
#include <DataTypes/DataTypeDate.h>
@ -9,16 +9,106 @@
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NOT_IMPLEMENTED;
}
namespace
{
/** Quantile calculation with "reservoir sample" algorithm.
* It collects pseudorandom subset of limited size from a stream of values,
* and approximate quantile from it.
* The result is non-deterministic. Also look at QuantileReservoirSamplerDeterministic.
*
* This algorithm is quite inefficient in terms of precision for memory usage,
* but very efficient in CPU (though less efficient than QuantileTiming and than QuantileExact for small sets).
*/
template <typename Value>
struct QuantileReservoirSampler
{
using Data = ReservoirSampler<Value, ReservoirSamplerOnEmpty::RETURN_NAN_OR_ZERO>;
Data data;
void add(const Value & x)
{
data.insert(x);
}
template <typename Weight>
void add(const Value &, const Weight &)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method add with weight is not implemented for ReservoirSampler");
}
void merge(const QuantileReservoirSampler & rhs)
{
data.merge(rhs.data);
}
void serialize(WriteBuffer & buf) const
{
data.write(buf);
}
void deserialize(ReadBuffer & buf)
{
data.read(buf);
}
/// Get the value of the `level` quantile. The level must be between 0 and 1.
Value get(Float64 level)
{
if (data.empty())
return {};
if constexpr (is_decimal<Value>)
return Value(static_cast<typename Value::NativeType>(data.quantileInterpolated(level)));
else
return static_cast<Value>(data.quantileInterpolated(level));
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of level indices such that the corresponding levels are visited in ascending order.
void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result)
{
bool is_empty = data.empty();
for (size_t i = 0; i < size; ++i)
{
if (is_empty)
{
result[i] = Value{};
}
else
{
if constexpr (is_decimal<Value>)
result[indices[i]] = Value(static_cast<typename Value::NativeType>(data.quantileInterpolated(levels[indices[i]])));
else
result[indices[i]] = Value(data.quantileInterpolated(levels[indices[i]]));
}
}
}
/// The same, but in the case of an empty state, NaN is returned.
Float64 getFloat(Float64 level)
{
return data.quantileInterpolated(level);
}
void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result)
{
for (size_t i = 0; i < size; ++i)
result[indices[i]] = data.quantileInterpolated(levels[indices[i]]);
}
};
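The ReservoirSampler behind this wrapper also supports merging and interpolated quantiles, but its core is plain reservoir sampling. A minimal "Algorithm R" sketch of that core:

#include <cstddef>
#include <random>
#include <vector>

/// Every element of the stream ends up in the sample with equal probability
/// capacity / seen. Not the production implementation.
template <typename T>
class ReservoirSketch
{
public:
    explicit ReservoirSketch(size_t capacity_) : capacity(capacity_) {}

    void insert(const T & value)
    {
        ++seen;
        if (sample.size() < capacity)
        {
            sample.push_back(value);
            return;
        }
        /// Keep the new element with probability capacity / seen,
        /// evicting a uniformly random current element.
        std::uniform_int_distribution<size_t> dist(0, seen - 1);
        const size_t pos = dist(rng);
        if (pos < capacity)
            sample[pos] = value;
    }

    const std::vector<T> & values() const { return sample; }

private:
    size_t capacity;
    size_t seen = 0;
    std::vector<T> sample;
    std::mt19937_64 rng{42}; /// Fixed seed only to make the sketch reproducible.
};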
template <typename Value, bool float_return> using FuncQuantile = AggregateFunctionQuantile<Value, QuantileReservoirSampler<Value>, NameQuantile, false, std::conditional_t<float_return, Float64, void>, false>;
template <typename Value, bool float_return> using FuncQuantiles = AggregateFunctionQuantile<Value, QuantileReservoirSampler<Value>, NameQuantiles, false, std::conditional_t<float_return, Float64, void>, true>;

View File

@ -1,10 +1,15 @@
#include <AggregateFunctions/AggregateFunctionQuantile.h>
#include <AggregateFunctions/QuantileApprox.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Helpers.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <Core/Field.h>
#include <cmath>
#include <Common/RadixSort.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
namespace DB
{
@ -12,11 +17,471 @@ namespace DB
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int LOGICAL_ERROR;
extern const int NOT_IMPLEMENTED;
}
namespace
{
template <typename T>
class ApproxSampler
{
public:
struct Stats
{
T value; // the sampled value
Int64 g; // the minimum rank jump from the previous value's minimum rank
Int64 delta; // the maximum span of the rank
Stats() = default;
Stats(T value_, Int64 g_, Int64 delta_) : value(value_), g(g_), delta(delta_) {}
};
struct QueryResult
{
size_t index;
Int64 rank;
T value;
QueryResult(size_t index_, Int64 rank_, T value_) : index(index_), rank(rank_), value(value_) { }
};
ApproxSampler() = default;
explicit ApproxSampler(
double relative_error_,
size_t compress_threshold_ = default_compress_threshold,
size_t count_ = 0,
bool compressed_ = false)
: relative_error(relative_error_)
, compress_threshold(compress_threshold_)
, count(count_)
, compressed(compressed_)
{
sampled.reserve(compress_threshold);
backup_sampled.reserve(compress_threshold);
head_sampled.reserve(default_head_size);
}
bool isCompressed() const { return compressed; }
void setCompressed() { compressed = true; }
void insert(T x)
{
head_sampled.push_back(x);
compressed = false;
if (head_sampled.size() >= default_head_size)
{
withHeadBufferInserted();
if (sampled.size() >= compress_threshold)
compress();
}
}
void query(const Float64 * percentiles, const size_t * indices, size_t size, T * result) const
{
if (!head_sampled.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot operate on an uncompressed summary, call compress() first");
if (sampled.empty())
{
for (size_t i = 0; i < size; ++i)
result[i] = T();
return;
}
Int64 current_max = std::numeric_limits<Int64>::min();
for (const auto & stats : sampled)
current_max = std::max(stats.delta + stats.g, current_max);
Int64 target_error = current_max / 2;
size_t index = 0;
auto min_rank = sampled[0].g;
for (size_t i = 0; i < size; ++i)
{
double percentile = percentiles[indices[i]];
if (percentile <= relative_error)
{
result[indices[i]] = sampled.front().value;
}
else if (percentile >= 1 - relative_error)
{
result[indices[i]] = sampled.back().value;
}
else
{
QueryResult res = findApproxQuantile(index, min_rank, target_error, percentile);
index = res.index;
min_rank = res.rank;
result[indices[i]] = res.value;
}
}
}
void compress()
{
if (compressed)
return;
withHeadBufferInserted();
doCompress(2 * relative_error * count);
compressed = true;
}
void merge(const ApproxSampler & other)
{
if (other.count == 0)
return;
else if (count == 0)
{
compress_threshold = other.compress_threshold;
relative_error = other.relative_error;
count = other.count;
compressed = other.compressed;
sampled.resize(other.sampled.size());
memcpy(sampled.data(), other.sampled.data(), sizeof(Stats) * other.sampled.size());
return;
}
else
{
// Merge the two buffers.
// The GK algorithm is a bit unclear about it, but we need to adjust the statistics during the
// merging. The main idea is that samples that come from one side will suffer from the lack of
// precision of the other.
// As a concrete example, take two QuantileSummaries whose samples (value, g, delta) are:
// `a = [(0, 1, 0), (20, 99, 0)]` and `b = [(10, 1, 0), (30, 49, 0)]`
// This means `a` has 100 values, whose minimum is 0 and maximum is 20,
// while `b` has 50 values, between 10 and 30.
// The resulting samples of the merge will be:
// a+b = [(0, 1, 0), (10, 1, ??), (20, 99, ??), (30, 49, 0)]
// The values of `g` do not change, as they represent the minimum number of values between two
// consecutive samples. The values of `delta` should be adjusted, however.
// Take the case of the sample `10` from `b`. In the original stream, it could have appeared
// right after `0` (as expressed by `g=1`) or right before `20`, so `delta=99+0-1=98`.
// In the GK algorithm's style of working in terms of maximum bounds, one can observe that the
// maximum additional uncertainty over samples coming from `b` is `max(g_a + delta_a) =
// floor(2 * eps_a * n_a)`. Likewise, additional uncertainty over samples from `a` is
// `floor(2 * eps_b * n_b)`.
// Only samples that interleave the other side are affected. That means that samples from
// one side that are lesser (or greater) than all samples from the other side are just copied
// unmodified.
// If the merging instances have different `relativeError`, the resulting instance will carry
// the largest one: `eps_ab = max(eps_a, eps_b)`.
// The main invariant of the GK algorithm is kept:
// `max(g_ab + delta_ab) <= floor(2 * eps_ab * (n_a + n_b))` since
// `max(g_ab + delta_ab) <= floor(2 * eps_a * n_a) + floor(2 * eps_b * n_b)`
// Finally, one can see how the `insert(x)` operation can be expressed as `merge([(x, 1, 0])`
compress();
backup_sampled.clear();
backup_sampled.reserve(sampled.size() + other.sampled.size());
double merged_relative_error = std::max(relative_error, other.relative_error);
size_t merged_count = count + other.count;
Int64 additional_self_delta = static_cast<Int64>(std::floor(2 * other.relative_error * other.count));
Int64 additional_other_delta = static_cast<Int64>(std::floor(2 * relative_error * count));
// Do a merge of two sorted lists until one of the lists is fully consumed
size_t self_idx = 0;
size_t other_idx = 0;
while (self_idx < sampled.size() && other_idx < other.sampled.size())
{
const Stats & self_sample = sampled[self_idx];
const Stats & other_sample = other.sampled[other_idx];
// Detect next sample
Stats next_sample;
Int64 additional_delta = 0;
if (self_sample.value < other_sample.value)
{
++self_idx;
next_sample = self_sample;
additional_delta = other_idx > 0 ? additional_self_delta : 0;
}
else
{
++other_idx;
next_sample = other_sample;
additional_delta = self_idx > 0 ? additional_other_delta : 0;
}
// Insert it
next_sample.delta += additional_delta;
backup_sampled.emplace_back(std::move(next_sample));
}
// Copy the remaining samples from the other list
// (by construction, at most one `while` loop will run)
while (self_idx < sampled.size())
{
backup_sampled.emplace_back(sampled[self_idx]);
++self_idx;
}
while (other_idx < other.sampled.size())
{
backup_sampled.emplace_back(other.sampled[other_idx]);
++other_idx;
}
std::swap(sampled, backup_sampled);
relative_error = merged_relative_error;
count = merged_count;
compress_threshold = other.compress_threshold;
doCompress(2 * merged_relative_error * merged_count);
compressed = true;
}
}
void write(WriteBuffer & buf) const
{
writeBinaryLittleEndian(compress_threshold, buf);
writeBinaryLittleEndian(relative_error, buf);
writeBinaryLittleEndian(count, buf);
writeBinaryLittleEndian(sampled.size(), buf);
for (const auto & stats : sampled)
{
writeBinaryLittleEndian(stats.value, buf);
writeBinaryLittleEndian(stats.g, buf);
writeBinaryLittleEndian(stats.delta, buf);
}
}
void read(ReadBuffer & buf)
{
readBinaryLittleEndian(compress_threshold, buf);
readBinaryLittleEndian(relative_error, buf);
readBinaryLittleEndian(count, buf);
size_t sampled_len = 0;
readBinaryLittleEndian(sampled_len, buf);
sampled.resize(sampled_len);
for (size_t i = 0; i < sampled_len; ++i)
{
auto & stats = sampled[i]; /// must be a reference, otherwise the deserialized values are discarded
readBinaryLittleEndian(stats.value, buf);
readBinaryLittleEndian(stats.g, buf);
readBinaryLittleEndian(stats.delta, buf);
}
}
private:
QueryResult findApproxQuantile(size_t index, Int64 min_rank_at_index, double target_error, double percentile) const
{
Stats curr_sample = sampled[index];
Int64 rank = static_cast<Int64>(std::ceil(percentile * count));
size_t i = index;
Int64 min_rank = min_rank_at_index;
while (i < sampled.size() - 1)
{
Int64 max_rank = min_rank + curr_sample.delta;
if (max_rank - target_error <= rank && rank <= min_rank + target_error)
return {i, min_rank, curr_sample.value};
else
{
++i;
curr_sample = sampled[i];
min_rank += curr_sample.g;
}
}
return {sampled.size() - 1, 0, sampled.back().value};
}
void withHeadBufferInserted()
{
if (head_sampled.empty())
return;
bool use_radix_sort = head_sampled.size() >= 256 && (is_arithmetic_v<T> && !is_big_int_v<T>);
if (use_radix_sort)
RadixSort<RadixSortNumTraits<T>>::executeLSD(head_sampled.data(), head_sampled.size());
else
::sort(head_sampled.begin(), head_sampled.end());
backup_sampled.clear();
backup_sampled.reserve(sampled.size() + head_sampled.size());
size_t sample_idx = 0;
size_t ops_idx = 0;
size_t current_count = count;
for (; ops_idx < head_sampled.size(); ++ops_idx)
{
T current_sample = head_sampled[ops_idx];
// Add all the samples before the next observation.
while (sample_idx < sampled.size() && sampled[sample_idx].value <= current_sample)
{
backup_sampled.emplace_back(sampled[sample_idx]);
++sample_idx;
}
// If it is the first one to insert, or if it is the last one
++current_count;
Int64 delta;
if (backup_sampled.empty() || (sample_idx == sampled.size() && ops_idx == (head_sampled.size() - 1)))
delta = 0;
else
delta = static_cast<Int64>(std::floor(2 * relative_error * current_count));
backup_sampled.emplace_back(current_sample, 1, delta);
}
// Add all the remaining existing samples
for (; sample_idx < sampled.size(); ++sample_idx)
backup_sampled.emplace_back(sampled[sample_idx]);
std::swap(sampled, backup_sampled);
head_sampled.clear();
count = current_count;
}
void doCompress(double merge_threshold)
{
if (sampled.empty())
return;
backup_sampled.clear();
// Start from the last element, which is always part of the set.
// The head contains the current new head, that may be merged with the current element.
Stats head = sampled.back();
ssize_t i = sampled.size() - 2;
// Do not compress the last element
while (i >= 1)
{
// The current sample:
const auto & sample1 = sampled[i];
// Do we need to compress?
if (sample1.g + head.g + head.delta < merge_threshold)
{
// Do not insert yet, just merge the current element into the head.
head.g += sample1.g;
}
else
{
// Prepend the current head, and keep the current sample as target for merging.
backup_sampled.push_back(head);
head = sample1;
}
--i;
}
backup_sampled.push_back(head);
// If necessary, add the minimum element:
auto curr_head = sampled.front();
// Don't add the minimum element if `sampled` has only one element (both `curr_head` and
// `head` point to the same element).
if (curr_head.value <= head.value && sampled.size() > 1)
backup_sampled.emplace_back(sampled.front());
std::reverse(backup_sampled.begin(), backup_sampled.end());
std::swap(sampled, backup_sampled);
}
double relative_error;
size_t compress_threshold;
size_t count = 0;
bool compressed;
PaddedPODArray<Stats> sampled;
PaddedPODArray<Stats> backup_sampled;
PaddedPODArray<T> head_sampled;
static constexpr size_t default_compress_threshold = 10000;
static constexpr size_t default_head_size = 50000;
};
template <typename Value>
class QuantileGK
{
private:
using Data = ApproxSampler<Value>;
mutable Data data;
public:
QuantileGK() = default;
explicit QuantileGK(size_t accuracy) : data(1.0 / static_cast<double>(accuracy)) { }
void add(const Value & x)
{
data.insert(x);
}
template <typename Weight>
void add(const Value &, const Weight &)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method add with weight is not implemented for GKSampler");
}
void merge(const QuantileGK & rhs)
{
if (!data.isCompressed())
data.compress();
data.merge(rhs.data);
}
void serialize(WriteBuffer & buf) const
{
/// Always compress before serialization
if (!data.isCompressed())
data.compress();
data.write(buf);
}
void deserialize(ReadBuffer & buf)
{
data.read(buf);
data.setCompressed();
}
/// Get the value of the `level` quantile. The level must be between 0 and 1.
Value get(Float64 level)
{
if (!data.isCompressed())
data.compress();
Value res;
size_t indice = 0;
data.query(&level, &indice, 1, &res);
return res;
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of index levels such that the corresponding elements will go in ascending order.
void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result)
{
if (!data.isCompressed())
data.compress();
data.query(levels, indices, size, result);
}
Float64 getFloat64(Float64 /*level*/)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getFloat64 is not implemented for GKSampler");
}
void getManyFloat(const Float64 * /*levels*/, const size_t * /*indices*/, size_t /*size*/, Float64 * /*result*/)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getManyFloat is not implemented for GKSampler");
}
};
template <typename Value, bool _> using FuncQuantileGK = AggregateFunctionQuantile<Value, QuantileGK<Value>, NameQuantileGK, false, void, false>;
template <typename Value, bool _> using FuncQuantilesGK = AggregateFunctionQuantile<Value, QuantileGK<Value>, NameQuantilesGK, false, void, true>;
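To make the (g, delta) bookkeeping above concrete, here is a hedged sketch of how a compressed Greenwald-Khanna summary answers a query: the minimum possible rank of sample i is the running sum of g, the maximum adds delta, and the first sample whose rank window covers the desired rank within the allowed error is an acceptable answer. The toy Sample type and the free function are illustrative assumptions, not the ApproxSampler API.
#include <cmath>
#include <cstdint>
#include <vector>
struct Sample { double value; int64_t g; int64_t delta; };
/// Assumes a compressed, non-empty summary sorted by value.
double queryGK(const std::vector<Sample> & samples, double level, size_t count, int64_t target_error)
{
int64_t desired_rank = static_cast<int64_t>(std::ceil(level * count));
int64_t min_rank = 0;
for (const auto & s : samples)
{
min_rank += s.g;                       /// smallest rank s.value can have
int64_t max_rank = min_rank + s.delta; /// largest rank s.value can have
if (max_rank - target_error <= desired_rank && desired_rank <= min_rank + target_error)
return s.value;
}
return samples.back().value; /// desired rank beyond all windows: return the maximum
}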

View File

@ -1,5 +1,5 @@
#include <AggregateFunctions/AggregateFunctionQuantile.h>
#include <AggregateFunctions/QuantileReservoirSamplerDeterministic.h>
#include <AggregateFunctions/ReservoirSamplerDeterministic.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Helpers.h>
#include <DataTypes/DataTypeDate.h>
@ -9,16 +9,106 @@
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NOT_IMPLEMENTED;
}
namespace
{
/** Quantile calculation with the "reservoir sample" algorithm.
* It collects a pseudorandom subset of limited size from a stream of values
* and approximates the quantile from it.
* The function accepts a second argument, named "determinator":
* a hash of it is calculated and used as the source of randomness
* for the sampling.
* The function is deterministic, but care should be taken with the choice of the "determinator" argument.
*/
template <typename Value>
struct QuantileReservoirSamplerDeterministic
{
using Data = ReservoirSamplerDeterministic<Value, ReservoirSamplerDeterministicOnEmpty::RETURN_NAN_OR_ZERO>;
Data data;
void add(const Value &)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method add without determinator is not implemented for ReservoirSamplerDeterministic");
}
template <typename Determinator>
void add(const Value & x, const Determinator & determinator)
{
data.insert(x, determinator);
}
void merge(const QuantileReservoirSamplerDeterministic & rhs)
{
data.merge(rhs.data);
}
void serialize(WriteBuffer & buf) const
{
data.write(buf);
}
void deserialize(ReadBuffer & buf)
{
data.read(buf);
}
/// Get the value of the `level` quantile. The level must be between 0 and 1.
Value get(Float64 level)
{
if (data.empty())
return {};
if constexpr (is_decimal<Value>)
return static_cast<typename Value::NativeType>(data.quantileInterpolated(level));
else
return static_cast<Value>(data.quantileInterpolated(level));
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of index levels such that the corresponding elements will go in ascending order.
void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result)
{
bool is_empty = data.empty();
for (size_t i = 0; i < size; ++i)
{
if (is_empty)
{
result[i] = Value{};
}
else
{
if constexpr (is_decimal<Value>)
result[indices[i]] = static_cast<typename Value::NativeType>(data.quantileInterpolated(levels[indices[i]]));
else
result[indices[i]] = static_cast<Value>(data.quantileInterpolated(levels[indices[i]]));
}
}
}
/// The same, but in the case of an empty state, NaN is returned.
Float64 getFloat(Float64 level)
{
return data.quantileInterpolated(level);
}
void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result)
{
for (size_t i = 0; i < size; ++i)
result[indices[i]] = data.quantileInterpolated(levels[indices[i]]);
}
};
template <typename Value, bool float_return> using FuncQuantileDeterministic = AggregateFunctionQuantile<Value, QuantileReservoirSamplerDeterministic<Value>, NameQuantileDeterministic, true, std::conditional_t<float_return, Float64, void>, false>;
template <typename Value, bool float_return> using FuncQuantilesDeterministic = AggregateFunctionQuantile<Value, QuantileReservoirSamplerDeterministic<Value>, NameQuantilesDeterministic, true, std::conditional_t<float_return, Float64, void>, true>;
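The gist of the "determinator" trick, as a hedged sketch (the capacity, the std::hash stand-in and the acceptance rule are assumptions, not the ReservoirSamplerDeterministic internals): every random decision is replaced by a hash of the user-supplied column, so identical inputs always produce identical samples.
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>
struct ToyDeterministicReservoir
{
static constexpr size_t capacity = 1024; /// illustrative limit
uint64_t seen = 0;
std::vector<double> sample;
void insert(double x, uint64_t determinator)
{
++seen;
uint64_t h = std::hash<uint64_t>{}(determinator); /// stand-in for the real hash
if (sample.size() < capacity)
sample.push_back(x);
else if (h % seen < capacity) /// pseudo-random but reproducible acceptance
sample[h % capacity] = x;
}
};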

View File

@ -1,24 +1,214 @@
#include <AggregateFunctions/AggregateFunctionQuantile.h>
#include <AggregateFunctions/QuantileExactWeighted.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Helpers.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <Core/Field.h>
#include <Common/HashTable/HashMap.h>
#include <Common/NaNUtils.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NOT_IMPLEMENTED;
}
namespace
{
/** Calculates the quantile by counting the number of occurrences of each value in a hash map.
*
* It uses O(distinct(N)) memory. It can naturally be applied to weighted values.
* In case of many identical values, it can be more efficient than QuantileExact even when weights are not used.
*/
template <typename Value>
struct QuantileExactWeighted
{
struct Int128Hash
{
size_t operator()(Int128 x) const
{
return CityHash_v1_0_2::Hash128to64({x >> 64, x & 0xffffffffffffffffll});
}
};
using Weight = UInt64;
using UnderlyingType = NativeType<Value>;
using Hasher = HashCRC32<UnderlyingType>;
/// When creating, the hash table must be small.
using Map = HashMapWithStackMemory<UnderlyingType, Weight, Hasher, 4>;
Map map;
void add(const Value & x)
{
/// We must skip NaNs as they are not compatible with comparison sorting.
if (!isNaN(x))
++map[x];
}
void add(const Value & x, Weight weight)
{
if (!isNaN(x))
map[x] += weight;
}
void merge(const QuantileExactWeighted & rhs)
{
for (const auto & pair : rhs.map)
map[pair.getKey()] += pair.getMapped();
}
void serialize(WriteBuffer & buf) const
{
map.write(buf);
}
void deserialize(ReadBuffer & buf)
{
typename Map::Reader reader(buf);
while (reader.next())
{
const auto & pair = reader.get();
map[pair.first] = pair.second;
}
}
/// Get the value of the `level` quantile. The level must be between 0 and 1.
Value get(Float64 level) const
{
size_t size = map.size();
if (0 == size)
return std::numeric_limits<Value>::quiet_NaN();
/// Copy the data to a temporary array to get the element you need in order.
using Pair = typename Map::value_type;
std::unique_ptr<Pair[]> array_holder(new Pair[size]);
Pair * array = array_holder.get();
/// Note: the 64-bit integer weight can overflow.
/// The behaviour in that case is implementation specific (approximate or garbage results).
/// Float64 is used as the accumulator here to get approximate results.
/// But the weight can already have overflowed in the 'add' and 'merge' methods.
/// It would be reasonable to change the type of the weight to Float64 in the map,
/// but we don't do that, for compatibility of serialized data.
size_t i = 0;
Float64 sum_weight = 0;
for (const auto & pair : map)
{
sum_weight += pair.getMapped();
array[i] = pair.getValue();
++i;
}
::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 threshold = std::ceil(sum_weight * level);
Float64 accumulated = 0;
const Pair * it = array;
const Pair * end = array + size;
while (it < end)
{
accumulated += it->second;
if (accumulated >= threshold)
break;
++it;
}
if (it == end)
--it;
return it->first;
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of index levels such that the corresponding elements will go in ascending order.
void getMany(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const
{
size_t size = map.size();
if (0 == size)
{
for (size_t i = 0; i < num_levels; ++i)
result[i] = Value();
return;
}
/// Copy the data to a temporary array to get the element you need in order.
using Pair = typename Map::value_type;
std::unique_ptr<Pair[]> array_holder(new Pair[size]);
Pair * array = array_holder.get();
size_t i = 0;
Float64 sum_weight = 0;
for (const auto & pair : map)
{
sum_weight += pair.getMapped();
array[i] = pair.getValue();
++i;
}
::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 accumulated = 0;
const Pair * it = array;
const Pair * end = array + size;
size_t level_index = 0;
Float64 threshold = std::ceil(sum_weight * levels[indices[level_index]]);
while (it < end)
{
accumulated += it->second;
while (accumulated >= threshold)
{
result[indices[level_index]] = it->first;
++level_index;
if (level_index == num_levels)
return;
threshold = std::ceil(sum_weight * levels[indices[level_index]]);
}
++it;
}
while (level_index < num_levels)
{
result[indices[level_index]] = array[size - 1].first;
++level_index;
}
}
/// Float-returning methods are not supported for this data structure.
Float64 getFloat(Float64) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getFloat is not implemented for QuantileExact");
}
void getManyFloat(const Float64 *, const size_t *, size_t, Float64 *) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getManyFloat is not implemented for QuantileExact");
}
};
template <typename Value, bool _> using FuncQuantileExactWeighted = AggregateFunctionQuantile<Value, QuantileExactWeighted<Value>, NameQuantileExactWeighted, true, void, false>;
template <typename Value, bool _> using FuncQuantilesExactWeighted = AggregateFunctionQuantile<Value, QuantileExactWeighted<Value>, NameQuantilesExactWeighted, true, void, true>;
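A worked toy example of the threshold walk in get() above (the values and weights are made up): the answer is the first value whose accumulated weight reaches ceil(sum_weight * level).
#include <cmath>
#include <iostream>
#include <utility>
#include <vector>
int main()
{
/// (value, weight) pairs, already sorted by value.
std::vector<std::pair<double, double>> sorted = {{1.0, 1.0}, {2.0, 9.0}};
double level = 0.5;
double sum_weight = 10.0;
double threshold = std::ceil(sum_weight * level); /// = 5
double accumulated = 0;
for (const auto & [value, weight] : sorted)
{
accumulated += weight;
if (accumulated >= threshold)
{
std::cout << "quantile(0.5) = " << value << '\n'; /// prints 2: the heavy value dominates
break;
}
}
return 0;
}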

View File

@ -1,58 +1,353 @@
#include <AggregateFunctions/AggregateFunctionQuantile.h>
#include <AggregateFunctions/QuantileInterpolatedWeighted.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Helpers.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <Core/Field.h>
#include <Common/HashTable/HashMap.h>
#include <Common/NaNUtils.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NOT_IMPLEMENTED;
}
namespace
{
/** Approximates Quantile by:
* - sorting input values and weights
* - building a cumulative distribution based on weights
* - performing linear interpolation between the weights and values
*/
template <typename Value>
struct QuantileInterpolatedWeighted
{
struct Int128Hash
{
size_t operator()(Int128 x) const
{
return CityHash_v1_0_2::Hash128to64({x >> 64, x & 0xffffffffffffffffll});
}
};
using Weight = UInt64;
using UnderlyingType = NativeType<Value>;
using Hasher = HashCRC32<UnderlyingType>;
/// When creating, the hash table must be small.
using Map = HashMapWithStackMemory<UnderlyingType, Weight, Hasher, 4>;
Map map;
void add(const Value & x)
{
/// We must skip NaNs as they are not compatible with comparison sorting.
if (!isNaN(x))
++map[x];
}
void add(const Value & x, Weight weight)
{
if (!isNaN(x))
map[x] += weight;
}
void merge(const QuantileInterpolatedWeighted & rhs)
{
for (const auto & pair : rhs.map)
map[pair.getKey()] += pair.getMapped();
}
void serialize(WriteBuffer & buf) const
{
map.write(buf);
}
void deserialize(ReadBuffer & buf)
{
typename Map::Reader reader(buf);
while (reader.next())
{
const auto & pair = reader.get();
map[pair.first] = pair.second;
}
}
Value get(Float64 level) const
{
return getImpl<Value>(level);
}
void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result) const
{
getManyImpl<Value>(levels, indices, size, result);
}
/// Float-returning methods are not supported for this data structure.
Float64 getFloat(Float64) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getFloat is not implemented for QuantileInterpolatedWeighted");
}
void getManyFloat(const Float64 *, const size_t *, size_t, Float64 *) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getManyFloat is not implemented for QuantileInterpolatedWeighted");
}
private:
using Pair = typename std::pair<UnderlyingType, Float64>;
/// Get the value of the `level` quantile. The level must be between 0 and 1.
template <typename T>
T getImpl(Float64 level) const
{
size_t size = map.size();
if (0 == size)
return std::numeric_limits<Value>::quiet_NaN();
/// Maintain a vector of pair of values and weights for easier sorting and for building
/// a cumulative distribution using the provided weights.
std::vector<Pair> value_weight_pairs;
value_weight_pairs.reserve(size);
/// Note: the weight provided must be a 64-bit integer.
/// Float64 is used as the accumulator here to get approximate results.
/// The weight in the internal array is stored as Float64 because
/// the quantile estimation below involves division and
/// requires Float64 precision.
Float64 sum_weight = 0;
for (const auto & pair : map)
{
sum_weight += pair.getMapped();
auto value = pair.getKey();
auto weight = pair.getMapped();
value_weight_pairs.push_back({value, weight});
}
::sort(value_weight_pairs.begin(), value_weight_pairs.end(), [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 accumulated = 0;
/// vector for populating and storing the cumulative sum using the provided weights.
/// example: [0,1,2,3,4,5] -> [0,1,3,6,10,15]
std::vector<Float64> weights_cum_sum;
weights_cum_sum.reserve(size);
for (size_t idx = 0; idx < size; ++idx)
{
accumulated += value_weight_pairs[idx].second;
weights_cum_sum.push_back(accumulated);
}
/// The following quantile estimation is the general weighted percentile method:
/// https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method
/// It calculates a simple cumulative distribution based on weights.
if (sum_weight != 0)
{
for (size_t idx = 0; idx < size; ++idx)
value_weight_pairs[idx].second = (weights_cum_sum[idx] - 0.5 * value_weight_pairs[idx].second) / sum_weight;
}
/// perform linear interpolation
size_t idx = 0;
if (size >= 2)
{
if (level >= value_weight_pairs[size - 2].second)
{
idx = size - 2;
}
else
{
size_t start = 0, end = size - 1;
while (start <= end)
{
size_t mid = start + (end - start) / 2;
if (mid > size)
break;
if (level > value_weight_pairs[mid + 1].second)
start = mid + 1;
else
{
idx = mid;
end = mid - 1;
}
}
}
}
size_t l = idx;
size_t u = idx + 1 < size ? idx + 1 : idx;
Float64 xl = value_weight_pairs[l].second, xr = value_weight_pairs[u].second;
UnderlyingType yl = value_weight_pairs[l].first, yr = value_weight_pairs[u].first;
if (level < xl)
yr = yl;
if (level > xr)
yl = yr;
return static_cast<T>(interpolate(level, xl, xr, yl, yr));
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of index levels such that the corresponding elements will go in ascending order.
template <typename T>
void getManyImpl(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const
{
size_t size = map.size();
if (0 == size)
{
for (size_t i = 0; i < num_levels; ++i)
result[i] = Value();
return;
}
std::vector<Pair> value_weight_pairs;
value_weight_pairs.reserve(size);
Float64 sum_weight = 0;
for (const auto & pair : map)
{
sum_weight += pair.getMapped();
auto value = pair.getKey();
auto weight = pair.getMapped();
value_weight_pairs.push_back({value, weight});
}
::sort(value_weight_pairs.begin(), value_weight_pairs.end(), [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 accumulated = 0;
/// vector for populating and storing the cumulative sum using the provided weights.
/// example: [0,1,2,3,4,5] -> [0,1,3,6,10,15]
std::vector<Float64> weights_cum_sum;
weights_cum_sum.reserve(size);
for (size_t idx = 0; idx < size; ++idx)
{
accumulated += value_weight_pairs[idx].second;
weights_cum_sum.emplace_back(accumulated);
}
/// The following quantile estimation is the general weighted percentile method:
/// https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method
/// It calculates a simple cumulative distribution based on weights.
if (sum_weight != 0)
{
for (size_t idx = 0; idx < size; ++idx)
value_weight_pairs[idx].second = (weights_cum_sum[idx] - 0.5 * value_weight_pairs[idx].second) / sum_weight;
}
for (size_t level_index = 0; level_index < num_levels; ++level_index)
{
/// perform linear interpolation for every level
auto level = levels[indices[level_index]];
size_t idx = 0;
if (size >= 2)
{
if (level >= value_weight_pairs[size - 2].second)
{
idx = size - 2;
}
else
{
size_t start = 0, end = size - 1;
while (start <= end)
{
size_t mid = start + (end - start) / 2;
if (mid > size)
break;
if (level > value_weight_pairs[mid + 1].second)
start = mid + 1;
else
{
idx = mid;
end = mid - 1;
}
}
}
}
size_t l = idx;
size_t u = idx + 1 < size ? idx + 1 : idx;
Float64 xl = value_weight_pairs[l].second, xr = value_weight_pairs[u].second;
UnderlyingType yl = value_weight_pairs[l].first, yr = value_weight_pairs[u].first;
if (level < xl)
yr = yl;
if (level > xr)
yl = yr;
result[indices[level_index]] = static_cast<T>(interpolate(level, xl, xr, yl, yr));
}
}
/// This ignores overflows or NaNs that might arise during add, sub and mul operations and doesn't aim to provide exact
/// results since the `quantileInterpolatedWeighted` function itself relies mainly on approximation.
UnderlyingType NO_SANITIZE_UNDEFINED interpolate(Float64 level, Float64 xl, Float64 xr, UnderlyingType yl, UnderlyingType yr) const
{
UnderlyingType dy = yr - yl;
Float64 dx = xr - xl;
dx = dx == 0 ? 1 : dx; /// to handle NaN behavior that might arise during integer division below.
/// yl + (dy / dx) * (level - xl)
return static_cast<UnderlyingType>(yl + (dy / dx) * (level - xl));
}
};
template <typename Value, bool _> using FuncQuantileInterpolatedWeighted = AggregateFunctionQuantile<Value, QuantileInterpolatedWeighted<Value>, NameQuantileInterpolatedWeighted, true, void, false>;
template <typename Value, bool _> using FuncQuantilesInterpolatedWeighted = AggregateFunctionQuantile<Value, QuantileInterpolatedWeighted<Value>, NameQuantilesInterpolatedWeighted, true, void, true>;
template <template <typename, bool> class Function>
AggregateFunctionPtr createAggregateFunctionQuantile(
const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *)
{
/// Second argument type check doesn't depend on the type of the first one.
Function<void, true>::assertSecondArg(argument_types);
const DataTypePtr & argument_type = argument_types[0];
WhichDataType which(argument_type);
#define DISPATCH(TYPE) \
if (which.idx == TypeIndex::TYPE) return std::make_shared<Function<TYPE, true>>(argument_types, params);
FOR_BASIC_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
if (which.idx == TypeIndex::Date) return std::make_shared<Function<DataTypeDate::FieldType, false>>(argument_types, params);
if (which.idx == TypeIndex::DateTime) return std::make_shared<Function<DataTypeDateTime::FieldType, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal32) return std::make_shared<Function<Decimal32, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal64) return std::make_shared<Function<Decimal64, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal128) return std::make_shared<Function<Decimal128, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal256) return std::make_shared<Function<Decimal256, false>>(argument_types, params);
if (which.idx == TypeIndex::DateTime64) return std::make_shared<Function<DateTime64, false>>(argument_types, params);
if (which.idx == TypeIndex::Int128) return std::make_shared<Function<Int128, true>>(argument_types, params);
if (which.idx == TypeIndex::UInt128) return std::make_shared<Function<UInt128, true>>(argument_types, params);
if (which.idx == TypeIndex::Int256) return std::make_shared<Function<Int256, true>>(argument_types, params);
if (which.idx == TypeIndex::UInt256) return std::make_shared<Function<UInt256, true>>(argument_types, params);
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument for aggregate function {}",
argument_type->getName(), name);
}
}
void registerAggregateFunctionsQuantileInterpolatedWeighted(AggregateFunctionFactory & factory)
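A minimal sketch of the weighted percentile method that both getImpl() and getManyImpl() implement (illustrative; assumes sorted values, matching positive weights and a non-degenerate bracketing interval): each value k gets the coordinate x_k = (cumsum_k - 0.5 * w_k) / sum_w, and the quantile is linearly interpolated between neighbouring coordinates.
#include <cstddef>
#include <vector>
double weightedPercentile(const std::vector<double> & values,  /// sorted ascending, non-empty
                          const std::vector<double> & weights, /// same length, positive
                          double level)
{
double sum_w = 0;
for (double w : weights)
sum_w += w;
/// Cumulative-distribution coordinate of every value.
std::vector<double> x(values.size());
double cumsum = 0;
for (size_t k = 0; k < values.size(); ++k)
{
cumsum += weights[k];
x[k] = (cumsum - 0.5 * weights[k]) / sum_w;
}
/// Clamp outside the covered range, interpolate inside it.
if (level <= x.front())
return values.front();
if (level >= x.back())
return values.back();
for (size_t k = 0; k + 1 < x.size(); ++k)
if (level <= x[k + 1])
return values[k] + (values[k + 1] - values[k]) * (level - x[k]) / (x[k + 1] - x[k]);
return values.back();
}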

View File

@ -1,7 +1,13 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionRankCorrelation.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <Common/PODArray_fwd.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
namespace ErrorCodes
@ -16,6 +22,83 @@ struct Settings;
namespace
{
struct RankCorrelationData : public StatisticalSample<Float64, Float64>
{
Float64 getResult()
{
RanksArray ranks_x;
std::tie(ranks_x, std::ignore) = computeRanksAndTieCorrection(this->x);
RanksArray ranks_y;
std::tie(ranks_y, std::ignore) = computeRanksAndTieCorrection(this->y);
/// Sizes can be non-equal due to skipped NaNs.
const Float64 size = static_cast<Float64>(std::min(this->size_x, this->size_y));
/// Count d^2 sum
Float64 answer = 0;
for (size_t j = 0; j < size; ++j)
answer += (ranks_x[j] - ranks_y[j]) * (ranks_x[j] - ranks_y[j]);
answer *= 6;
answer /= size * (size * size - 1);
answer = 1 - answer;
return answer;
}
};
class AggregateFunctionRankCorrelation :
public IAggregateFunctionDataHelper<RankCorrelationData, AggregateFunctionRankCorrelation>
{
public:
explicit AggregateFunctionRankCorrelation(const DataTypes & arguments)
: IAggregateFunctionDataHelper<RankCorrelationData, AggregateFunctionRankCorrelation> ({arguments}, {}, std::make_shared<DataTypeNumber<Float64>>())
{}
String getName() const override
{
return "rankCorr";
}
bool allocatesMemoryInArena() const override { return true; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Float64 new_x = columns[0]->getFloat64(row_num);
Float64 new_y = columns[1]->getFloat64(row_num);
this->data(place).addX(new_x, arena);
this->data(place).addY(new_y, arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & a = this->data(place);
const auto & b = this->data(rhs);
a.merge(b, arena);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto answer = this->data(place).getResult();
auto & column = static_cast<ColumnVector<Float64> &>(to);
column.getData().push_back(answer);
}
};
AggregateFunctionPtr createAggregateFunctionRankCorrelation(
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)
{
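The formula computed by getResult() above is Spearman's rank correlation coefficient for the no-ties case, rho = 1 - 6 * sum(d_i^2) / (n * (n^2 - 1)), where d_i is the rank difference of the i-th pair. A toy check with made-up ranks:
#include <cstddef>
#include <iostream>
#include <vector>
int main()
{
/// Ranks of two perfectly reversed orderings of n = 4 observations.
std::vector<double> ranks_x = {1, 2, 3, 4};
std::vector<double> ranks_y = {4, 3, 2, 1};
double n = 4;
double d2 = 0;
for (size_t i = 0; i < ranks_x.size(); ++i)
d2 += (ranks_x[i] - ranks_y[i]) * (ranks_x[i] - ranks_y[i]);
double rho = 1 - 6 * d2 / (n * (n * n - 1));
std::cout << rho << '\n'; /// prints -1: perfect negative rank correlation
return 0;
}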

View File

@ -1,98 +0,0 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/StatCommon.h>
#include <Columns/ColumnArray.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <Common/PODArray_fwd.h>
#include <base/types.h>
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeArray.h>
namespace DB
{
struct Settings;
struct RankCorrelationData : public StatisticalSample<Float64, Float64>
{
Float64 getResult()
{
RanksArray ranks_x;
std::tie(ranks_x, std::ignore) = computeRanksAndTieCorrection(this->x);
RanksArray ranks_y;
std::tie(ranks_y, std::ignore) = computeRanksAndTieCorrection(this->y);
/// Sizes can be non-equal due to skipped NaNs.
const Float64 size = static_cast<Float64>(std::min(this->size_x, this->size_y));
/// Count d^2 sum
Float64 answer = 0;
for (size_t j = 0; j < size; ++j)
answer += (ranks_x[j] - ranks_y[j]) * (ranks_x[j] - ranks_y[j]);
answer *= 6;
answer /= size * (size * size - 1);
answer = 1 - answer;
return answer;
}
};
class AggregateFunctionRankCorrelation :
public IAggregateFunctionDataHelper<RankCorrelationData, AggregateFunctionRankCorrelation>
{
public:
explicit AggregateFunctionRankCorrelation(const DataTypes & arguments)
:IAggregateFunctionDataHelper<RankCorrelationData, AggregateFunctionRankCorrelation> ({arguments}, {}, std::make_shared<DataTypeNumber<Float64>>())
{}
String getName() const override
{
return "rankCorr";
}
bool allocatesMemoryInArena() const override { return true; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Float64 new_x = columns[0]->getFloat64(row_num);
Float64 new_y = columns[1]->getFloat64(row_num);
this->data(place).addX(new_x, arena);
this->data(place).addY(new_y, arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
auto & a = this->data(place);
const auto & b = this->data(rhs);
a.merge(b, arena);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
this->data(place).read(buf, arena);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto answer = this->data(place).getResult();
auto & column = static_cast<ColumnVector<Float64> &>(to);
column.getData().push_back(answer);
}
};
}

View File

@ -1,21 +1,150 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionRetention.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <unordered_set>
#include <Columns/ColumnArray.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeArray.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <bitset>
#include <AggregateFunctions/IAggregateFunction.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
namespace
{
struct AggregateFunctionRetentionData
{
static constexpr auto max_events = 32;
using Events = std::bitset<max_events>;
Events events;
void add(UInt8 event)
{
events.set(event);
}
void merge(const AggregateFunctionRetentionData & other)
{
events |= other.events;
}
void serialize(WriteBuffer & buf) const
{
UInt32 event_value = static_cast<UInt32>(events.to_ulong());
writeBinary(event_value, buf);
}
void deserialize(ReadBuffer & buf)
{
UInt32 event_value;
readBinary(event_value, buf);
events = event_value;
}
};
/**
* The maximum number of events is 32; that's enough for retention analytics.
*
* Usage:
* - retention(cond1, cond2, cond3, ...)
* - returns [cond1_flag, cond1_flag && cond2_flag, cond1_flag && cond3_flag, ...]
*/
class AggregateFunctionRetention final
: public IAggregateFunctionDataHelper<AggregateFunctionRetentionData, AggregateFunctionRetention>
{
private:
UInt8 events_size;
public:
String getName() const override
{
return "retention";
}
explicit AggregateFunctionRetention(const DataTypes & arguments)
: IAggregateFunctionDataHelper<AggregateFunctionRetentionData, AggregateFunctionRetention>(arguments, {}, std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt8>()))
{
for (const auto i : collections::range(0, arguments.size()))
{
const auto * cond_arg = arguments[i].get();
if (!isUInt8(cond_arg))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument {} of aggregate function {}, must be UInt8",
cond_arg->getName(), i, getName());
}
events_size = static_cast<UInt8>(arguments.size());
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, const size_t row_num, Arena *) const override
{
for (const auto i : collections::range(0, events_size))
{
auto event = assert_cast<const ColumnVector<UInt8> *>(columns[i])->getData()[row_num];
if (event)
{
this->data(place).add(i);
}
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & data_to = assert_cast<ColumnUInt8 &>(assert_cast<ColumnArray &>(to).getData()).getData();
auto & offsets_to = assert_cast<ColumnArray &>(to).getOffsets();
ColumnArray::Offset current_offset = data_to.size();
data_to.resize(current_offset + events_size);
const bool first_flag = this->data(place).events.test(0);
data_to[current_offset] = first_flag;
++current_offset;
for (size_t i = 1; i < events_size; ++i)
{
data_to[current_offset] = (first_flag && this->data(place).events.test(i));
++current_offset;
}
offsets_to.push_back(current_offset);
}
};
AggregateFunctionPtr createAggregateFunctionRetention(const std::string & name, const DataTypes & arguments, const Array & params, const Settings *)
{
assertNoParameters(name, params);
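A toy illustration (not the ClickHouse API) of how insertResultInto() maps the event bitset to the returned array: element 0 says whether cond1 fired, and element i is cond1 && cond(i+1).
#include <bitset>
#include <cstddef>
#include <iostream>
#include <vector>
int main()
{
std::bitset<32> events;
events.set(0); /// cond1 fired
events.set(2); /// cond3 fired, cond2 did not
size_t events_size = 3;
std::vector<int> result(events_size);
bool first_flag = events.test(0);
result[0] = first_flag;
for (size_t i = 1; i < events_size; ++i)
result[i] = first_flag && events.test(i);
for (int v : result)
std::cout << v << ' '; /// prints "1 0 1"
return 0;
}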

View File

@ -1,143 +0,0 @@
#pragma once
#include <unordered_set>
#include <Columns/ColumnsNumber.h>
#include <Columns/ColumnArray.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeArray.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <base/range.h>
#include <bitset>
#include <AggregateFunctions/IAggregateFunction.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
struct AggregateFunctionRetentionData
{
static constexpr auto max_events = 32;
using Events = std::bitset<max_events>;
Events events;
void add(UInt8 event)
{
events.set(event);
}
void merge(const AggregateFunctionRetentionData & other)
{
events |= other.events;
}
void serialize(WriteBuffer & buf) const
{
UInt32 event_value = static_cast<UInt32>(events.to_ulong());
writeBinary(event_value, buf);
}
void deserialize(ReadBuffer & buf)
{
UInt32 event_value;
readBinary(event_value, buf);
events = event_value;
}
};
/**
* The max size of events is 32, that's enough for retention analytics
*
* Usage:
* - retention(cond1, cond2, cond3, ....)
* - returns [cond1_flag, cond1_flag && cond2_flag, cond1_flag && cond3_flag, ...]
*/
class AggregateFunctionRetention final
: public IAggregateFunctionDataHelper<AggregateFunctionRetentionData, AggregateFunctionRetention>
{
private:
UInt8 events_size;
public:
String getName() const override
{
return "retention";
}
explicit AggregateFunctionRetention(const DataTypes & arguments)
: IAggregateFunctionDataHelper<AggregateFunctionRetentionData, AggregateFunctionRetention>(arguments, {}, std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt8>()))
{
for (const auto i : collections::range(0, arguments.size()))
{
const auto * cond_arg = arguments[i].get();
if (!isUInt8(cond_arg))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument {} of aggregate function {}, must be UInt8",
cond_arg->getName(), i, getName());
}
events_size = static_cast<UInt8>(arguments.size());
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, const size_t row_num, Arena *) const override
{
for (const auto i : collections::range(0, events_size))
{
auto event = assert_cast<const ColumnVector<UInt8> *>(columns[i])->getData()[row_num];
if (event)
{
this->data(place).add(i);
}
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & data_to = assert_cast<ColumnUInt8 &>(assert_cast<ColumnArray &>(to).getData()).getData();
auto & offsets_to = assert_cast<ColumnArray &>(to).getOffsets();
ColumnArray::Offset current_offset = data_to.size();
data_to.resize(current_offset + events_size);
const bool first_flag = this->data(place).events.test(0);
data_to[current_offset] = first_flag;
++current_offset;
for (size_t i = 1; i < events_size; ++i)
{
data_to[current_offset] = (first_flag && this->data(place).events.test(i));
++current_offset;
}
offsets_to.push_back(current_offset);
}
};
}

View File

@ -1,15 +1,22 @@
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionSequenceMatch.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDate32.h>
#include <DataTypes/DataTypeDateTime.h>
#include <base/range.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnsNumber.h>
#include <Common/assert_cast.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <bitset>
#include <stack>
namespace DB
{
struct Settings;
namespace ErrorCodes
@ -18,11 +25,689 @@ namespace ErrorCodes
extern const int TOO_MANY_ARGUMENTS_FOR_FUNCTION;
extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int TOO_SLOW;
extern const int SYNTAX_ERROR;
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
}
namespace
{
/// helper type for comparing `std::pair`s using solely the .first member
template <template <typename> class Comparator>
struct ComparePairFirst final
{
template <typename T1, typename T2>
bool operator()(const std::pair<T1, T2> & lhs, const std::pair<T1, T2> & rhs) const
{
return Comparator<T1>{}(lhs.first, rhs.first);
}
};
static constexpr size_t max_events = 32;
template <typename T>
struct AggregateFunctionSequenceMatchData final
{
using Timestamp = T;
using Events = std::bitset<max_events>;
using TimestampEvents = std::pair<Timestamp, Events>;
using Comparator = ComparePairFirst<std::less>;
bool sorted = true;
PODArrayWithStackMemory<TimestampEvents, 64> events_list;
/// sequenceMatch conditions met at least once in events_list
Events conditions_met;
void add(const Timestamp timestamp, const Events & events)
{
/// store information exclusively for rows with at least one event
if (events.any())
{
events_list.emplace_back(timestamp, events);
sorted = false;
conditions_met |= events;
}
}
void merge(const AggregateFunctionSequenceMatchData & other)
{
if (other.events_list.empty())
return;
events_list.insert(std::begin(other.events_list), std::end(other.events_list));
sorted = false;
conditions_met |= other.conditions_met;
}
void sort()
{
if (sorted)
return;
::sort(std::begin(events_list), std::end(events_list), Comparator{});
sorted = true;
}
void serialize(WriteBuffer & buf) const
{
writeBinary(sorted, buf);
writeBinary(events_list.size(), buf);
for (const auto & events : events_list)
{
writeBinary(events.first, buf);
writeBinary(events.second.to_ulong(), buf);
}
}
void deserialize(ReadBuffer & buf)
{
readBinary(sorted, buf);
size_t size;
readBinary(size, buf);
/// If we lose these flags, functionality is broken
/// If we serialize/deserialize these flags, we have compatibility issues
/// If we set these flags to 1, we have a minor performance penalty, which seems acceptable
conditions_met.set();
events_list.clear();
events_list.reserve(size);
for (size_t i = 0; i < size; ++i)
{
Timestamp timestamp;
readBinary(timestamp, buf);
UInt64 events;
readBinary(events, buf);
events_list.emplace_back(timestamp, Events{events});
}
}
};
/// Max number of iterations to match the pattern against a sequence, exception thrown when exceeded
constexpr auto sequence_match_max_iterations = 1000000;
template <typename T, typename Data, typename Derived>
class AggregateFunctionSequenceBase : public IAggregateFunctionDataHelper<Data, Derived>
{
public:
AggregateFunctionSequenceBase(const DataTypes & arguments, const Array & params, const String & pattern_, const DataTypePtr & result_type_)
: IAggregateFunctionDataHelper<Data, Derived>(arguments, params, result_type_)
, pattern(pattern_)
{
arg_count = arguments.size();
parsePattern();
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, const size_t row_num, Arena *) const override
{
const auto timestamp = assert_cast<const ColumnVector<T> *>(columns[0])->getData()[row_num];
typename Data::Events events;
for (const auto i : collections::range(1, arg_count))
{
const auto event = assert_cast<const ColumnUInt8 *>(columns[i])->getData()[row_num];
events.set(i - 1, event);
}
this->data(place).add(timestamp, events);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
bool haveSameStateRepresentationImpl(const IAggregateFunction & rhs) const override
{
return this->getName() == rhs.getName() && this->haveEqualArgumentTypes(rhs);
}
private:
enum class PatternActionType
{
SpecificEvent,
AnyEvent,
KleeneStar,
TimeLessOrEqual,
TimeLess,
TimeGreaterOrEqual,
TimeGreater,
TimeEqual
};
struct PatternAction final
{
PatternActionType type;
std::uint64_t extra;
PatternAction() = default;
explicit PatternAction(const PatternActionType type_, const std::uint64_t extra_ = 0) : type{type_}, extra{extra_} {}
};
using PatternActions = PODArrayWithStackMemory<PatternAction, 64>;
Derived & derived() { return static_cast<Derived &>(*this); }
void parsePattern()
{
actions.clear();
actions.emplace_back(PatternActionType::KleeneStar);
dfa_states.clear();
dfa_states.emplace_back(true);
pattern_has_time = false;
const char * pos = pattern.data();
const char * begin = pos;
const char * end = pos + pattern.size();
auto throw_exception = [&](const std::string & msg)
{
throw Exception(ErrorCodes::SYNTAX_ERROR, "{} '{}' at position {}", msg, std::string(pos, end), toString(pos - begin));
};
auto match = [&pos, end](const char * str) mutable
{
size_t length = strlen(str);
if (pos + length <= end && 0 == memcmp(pos, str, length))
{
pos += length;
return true;
}
return false;
};
while (pos < end)
{
if (match("(?"))
{
if (match("t"))
{
PatternActionType type;
if (match("<="))
type = PatternActionType::TimeLessOrEqual;
else if (match("<"))
type = PatternActionType::TimeLess;
else if (match(">="))
type = PatternActionType::TimeGreaterOrEqual;
else if (match(">"))
type = PatternActionType::TimeGreater;
else if (match("=="))
type = PatternActionType::TimeEqual;
else
throw_exception("Unknown time condition");
UInt64 duration = 0;
const auto * prev_pos = pos;
pos = tryReadIntText(duration, pos, end);
if (pos == prev_pos)
throw_exception("Could not parse number");
if (actions.back().type != PatternActionType::SpecificEvent &&
actions.back().type != PatternActionType::AnyEvent &&
actions.back().type != PatternActionType::KleeneStar)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Temporal condition should be preceded by an event condition");
pattern_has_time = true;
actions.emplace_back(type, duration);
}
else
{
UInt64 event_number = 0;
const auto * prev_pos = pos;
pos = tryReadIntText(event_number, pos, end);
if (pos == prev_pos)
throw_exception("Could not parse number");
if (event_number > arg_count - 1)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Event number {} is out of range", event_number);
actions.emplace_back(PatternActionType::SpecificEvent, event_number - 1);
dfa_states.back().transition = DFATransition::SpecificEvent;
dfa_states.back().event = static_cast<uint32_t>(event_number - 1);
dfa_states.emplace_back();
conditions_in_pattern.set(event_number - 1);
}
if (!match(")"))
throw_exception("Expected closing parenthesis, found");
}
else if (match(".*"))
{
actions.emplace_back(PatternActionType::KleeneStar);
dfa_states.back().has_kleene = true;
}
else if (match("."))
{
actions.emplace_back(PatternActionType::AnyEvent);
dfa_states.back().transition = DFATransition::AnyEvent;
dfa_states.emplace_back();
}
else
throw_exception("Could not parse pattern, unexpected starting symbol");
}
}
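/// Illustrative examples of patterns accepted by parsePattern() above
/// (assuming at least two event arguments):
///
///   "(?1)(?2)"            event 2 on the very next stored row after event 1
///   "(?1).*(?2)"          event 1 eventually followed by event 2
///   "(?1).(?2)"           exactly one arbitrary event between them
///   "(?1)(?t<=3600)(?2)"  event 2 at most 3600 time units after event 1
///
/// A time condition such as (?t<=3600) must directly follow an event
/// condition, otherwise BAD_ARGUMENTS is thrown.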
protected:
/// Uses a DFA based approach in order to better handle patterns without
/// time assertions.
///
/// NOTE: This implementation relies on the assumption that the pattern is *small*.
///
/// This algorithm performs in O(m * n) (with m the number of DFA states and n the number
/// of events), with memory consumption and memory allocations in O(m). This means that
/// if n >> m (which is expected to be the case), this algorithm can be considered linear.
template <typename EventEntry>
bool dfaMatch(EventEntry & events_it, const EventEntry events_end) const
{
using ActiveStates = std::vector<bool>;
/// Those two vectors keep track of which states should be considered for the current
/// event as well as the states which should be considered for the next event.
ActiveStates active_states(dfa_states.size(), false);
ActiveStates next_active_states(dfa_states.size(), false);
active_states[0] = true;
/// Tracks how many states are still active, so we can bail out early once every
/// path is a dead end instead of scanning the remaining events.
size_t n_active = 1;
for (/* empty */; events_it != events_end && n_active > 0 && !active_states.back(); ++events_it)
{
n_active = 0;
next_active_states.assign(dfa_states.size(), false);
for (size_t state = 0; state < dfa_states.size(); ++state)
{
if (!active_states[state])
{
continue;
}
switch (dfa_states[state].transition)
{
case DFATransition::None:
break;
case DFATransition::AnyEvent:
next_active_states[state + 1] = true;
++n_active;
break;
case DFATransition::SpecificEvent:
if (events_it->second.test(dfa_states[state].event))
{
next_active_states[state + 1] = true;
++n_active;
}
break;
}
if (dfa_states[state].has_kleene)
{
next_active_states[state] = true;
++n_active;
}
}
swap(active_states, next_active_states);
}
return active_states.back();
}
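/// Worked example (a sketch, not exhaustive): the pattern ".(?2)" compiles to
/// state 0 (implicit Kleene star, AnyEvent -> 1), state 1 (SpecificEvent #1 -> 2)
/// and the accepting state 2. Feeding two rows [event 1, event 2]:
///     start:        active = {0}
///     after row 1:  AnyEvent moves 0 -> 1, the Kleene star keeps 0: active = {0, 1}
///     after row 2:  state 1 sees event bit #1 set and moves to 2:  active = {0, 1, 2}
/// The accepting state became active, so dfaMatch() returns true.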
template <typename EventEntry>
bool backtrackingMatch(EventEntry & events_it, const EventEntry events_end) const
{
const auto action_begin = std::begin(actions);
const auto action_end = std::end(actions);
auto action_it = action_begin;
const auto events_begin = events_it;
auto base_it = events_it;
/// an action iterator, an iterator to the current row in the events list, and an iterator
/// to the base row (start of the sequence) against which time conditions are checked
using backtrack_info = std::tuple<decltype(action_it), EventEntry, EventEntry>;
std::stack<backtrack_info> back_stack;
/// backtrack if possible
const auto do_backtrack = [&]
{
while (!back_stack.empty())
{
auto & top = back_stack.top();
action_it = std::get<0>(top);
events_it = std::next(std::get<1>(top));
base_it = std::get<2>(top);
back_stack.pop();
if (events_it != events_end)
return true;
}
return false;
};
size_t i = 0;
while (action_it != action_end && events_it != events_end)
{
if (action_it->type == PatternActionType::SpecificEvent)
{
if (events_it->second.test(action_it->extra))
{
/// move to the next action and events
base_it = events_it;
++action_it, ++events_it;
}
else if (!do_backtrack())
/// backtracking failed, bail out
break;
}
else if (action_it->type == PatternActionType::AnyEvent)
{
base_it = events_it;
++action_it, ++events_it;
}
else if (action_it->type == PatternActionType::KleeneStar)
{
back_stack.emplace(action_it, events_it, base_it);
base_it = events_it;
++action_it;
}
else if (action_it->type == PatternActionType::TimeLessOrEqual)
{
if (events_it->first <= base_it->first + action_it->extra)
{
/// condition satisfied, move onto next action
back_stack.emplace(action_it, events_it, base_it);
base_it = events_it;
++action_it;
}
else if (!do_backtrack())
break;
}
else if (action_it->type == PatternActionType::TimeLess)
{
if (events_it->first < base_it->first + action_it->extra)
{
back_stack.emplace(action_it, events_it, base_it);
base_it = events_it;
++action_it;
}
else if (!do_backtrack())
break;
}
else if (action_it->type == PatternActionType::TimeGreaterOrEqual)
{
if (events_it->first >= base_it->first + action_it->extra)
{
back_stack.emplace(action_it, events_it, base_it);
base_it = events_it;
++action_it;
}
else if (++events_it == events_end && !do_backtrack())
break;
}
else if (action_it->type == PatternActionType::TimeGreater)
{
if (events_it->first > base_it->first + action_it->extra)
{
back_stack.emplace(action_it, events_it, base_it);
base_it = events_it;
++action_it;
}
else if (++events_it == events_end && !do_backtrack())
break;
}
else if (action_it->type == PatternActionType::TimeEqual)
{
if (events_it->first == base_it->first + action_it->extra)
{
back_stack.emplace(action_it, events_it, base_it);
base_it = events_it;
++action_it;
}
else if (++events_it == events_end && !do_backtrack())
break;
}
else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown PatternActionType");
if (++i > sequence_match_max_iterations)
throw Exception(ErrorCodes::TOO_SLOW, "Pattern application proves too difficult, exceeding max iterations ({})",
sequence_match_max_iterations);
}
/// if there are some actions remaining
if (action_it != action_end)
{
/// match multiple empty strings at end
while (action_it->type == PatternActionType::KleeneStar ||
action_it->type == PatternActionType::TimeLessOrEqual ||
action_it->type == PatternActionType::TimeLess ||
(action_it->type == PatternActionType::TimeGreaterOrEqual && action_it->extra == 0))
++action_it;
}
if (events_it == events_begin)
++events_it;
return action_it == action_end;
}
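/// Worked example (a sketch): "(?1)(?t<=10)(?2)" against rows (t=1, event 1), (t=5, event 2).
/// (?1) matches at t=1 and becomes the base; the time condition checks the current row
/// against the base (5 <= 1 + 10) and re-bases at t=5; (?2) then matches there, so the
/// function returns true. With (t=20, event 2) instead, 20 <= 11 fails and the matcher
/// backtracks to the checkpoint pushed by the implicit leading Kleene star before failing.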
/// Splits the pattern into deterministic parts separated by non-deterministic fragments
/// (time constraints and Kleene stars), and tries to match the deterministic parts in their specified order,
/// ignoring the non-deterministic fragments.
/// This function can quickly check that a full match is not possible if some deterministic fragment is missing.
template <typename EventEntry>
bool couldMatchDeterministicParts(const EventEntry events_begin, const EventEntry events_end, bool limit_iterations = true) const
{
size_t events_processed = 0;
auto events_it = events_begin;
const auto actions_end = std::end(actions);
auto actions_it = std::begin(actions);
auto det_part_begin = actions_it;
auto match_deterministic_part = [&events_it, events_end, &events_processed, det_part_begin, actions_it, limit_iterations]()
{
auto events_it_init = events_it;
auto det_part_it = det_part_begin;
while (det_part_it != actions_it && events_it != events_end)
{
/// matching any event
if (det_part_it->type == PatternActionType::AnyEvent)
++events_it, ++det_part_it;
/// matching specific event
else
{
if (events_it->second.test(det_part_it->extra))
++events_it, ++det_part_it;
/// abandon current matching, try to match the deterministic fragment further in the list
else
{
events_it = ++events_it_init;
det_part_it = det_part_begin;
}
}
if (limit_iterations && ++events_processed > sequence_match_max_iterations)
throw Exception(ErrorCodes::TOO_SLOW, "Pattern application proves too difficult, exceeding max iterations ({})",
sequence_match_max_iterations);
}
return det_part_it == actions_it;
};
for (; actions_it != actions_end; ++actions_it)
if (actions_it->type != PatternActionType::SpecificEvent && actions_it->type != PatternActionType::AnyEvent)
{
if (!match_deterministic_part())
return false;
det_part_begin = std::next(actions_it);
}
return match_deterministic_part();
}
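/// Example: for "(?1).*(?2)(?3)" the deterministic fragments are [(?1)] and [(?2)(?3)].
/// If the event list has no row matching event 1 followed, anywhere later, by two
/// consecutive rows matching events 2 and 3, a full match is impossible and the far
/// more expensive backtracking matcher can be skipped entirely.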
private:
enum class DFATransition : char
{
/// .-------.
/// | |
/// `-------'
None,
/// .-------. (?[0-9])
/// | | ----------
/// `-------'
SpecificEvent,
/// .-------. .
/// | | ----------
/// `-------'
AnyEvent,
};
struct DFAState
{
explicit DFAState(bool has_kleene_ = false)
: has_kleene{has_kleene_}, event{0}, transition{DFATransition::None}
{}
/// .-------.
/// | | - - -
/// `-------'
/// |_^
bool has_kleene;
/// In the case of a state transition on a `SpecificEvent`,
/// `event` contains the number of the event.
uint32_t event;
/// The kind of transition out of this state.
DFATransition transition;
};
using DFAStates = std::vector<DFAState>;
protected:
/// `True` if the parsed pattern contains time assertions (?t...), `false` otherwise.
bool pattern_has_time;
/// sequenceMatch conditions met at least once in the pattern
std::bitset<max_events> conditions_in_pattern;
private:
std::string pattern;
size_t arg_count;
PatternActions actions;
DFAStates dfa_states;
};
template <typename T, typename Data>
class AggregateFunctionSequenceMatch final : public AggregateFunctionSequenceBase<T, Data, AggregateFunctionSequenceMatch<T, Data>>
{
public:
AggregateFunctionSequenceMatch(const DataTypes & arguments, const Array & params, const String & pattern_)
: AggregateFunctionSequenceBase<T, Data, AggregateFunctionSequenceMatch<T, Data>>(arguments, params, pattern_, std::make_shared<DataTypeUInt8>()) {}
using AggregateFunctionSequenceBase<T, Data, AggregateFunctionSequenceMatch<T, Data>>::AggregateFunctionSequenceBase;
String getName() const override { return "sequenceMatch"; }
bool allocatesMemoryInArena() const override { return false; }
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & output = assert_cast<ColumnUInt8 &>(to).getData();
if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern)
{
output.push_back(false);
return;
}
this->data(place).sort();
const auto & data_ref = this->data(place);
const auto events_begin = std::begin(data_ref.events_list);
const auto events_end = std::end(data_ref.events_list);
auto events_it = events_begin;
bool match = (this->pattern_has_time ?
(this->couldMatchDeterministicParts(events_begin, events_end) && this->backtrackingMatch(events_it, events_end)) :
this->dfaMatch(events_it, events_end));
output.push_back(match);
}
};
template <typename T, typename Data>
class AggregateFunctionSequenceCount final : public AggregateFunctionSequenceBase<T, Data, AggregateFunctionSequenceCount<T, Data>>
{
public:
AggregateFunctionSequenceCount(const DataTypes & arguments, const Array & params, const String & pattern_)
: AggregateFunctionSequenceBase<T, Data, AggregateFunctionSequenceCount<T, Data>>(arguments, params, pattern_, std::make_shared<DataTypeUInt64>()) {}
using AggregateFunctionSequenceBase<T, Data, AggregateFunctionSequenceCount<T, Data>>::AggregateFunctionSequenceBase;
String getName() const override { return "sequenceCount"; }
bool allocatesMemoryInArena() const override { return false; }
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & output = assert_cast<ColumnUInt64 &>(to).getData();
if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern)
{
output.push_back(0);
return;
}
this->data(place).sort();
output.push_back(count(place));
}
private:
UInt64 count(ConstAggregateDataPtr __restrict place) const
{
const auto & data_ref = this->data(place);
const auto events_begin = std::begin(data_ref.events_list);
const auto events_end = std::end(data_ref.events_list);
auto events_it = events_begin;
size_t count = 0;
// check if there is a chance of matching the sequence at least once
if (this->couldMatchDeterministicParts(events_begin, events_end))
{
while (events_it != events_end && this->backtrackingMatch(events_it, events_end))
++count;
}
return count;
}
};
template <template <typename, typename> typename AggregateFunction, template <typename> typename Data>
AggregateFunctionPtr createAggregateFunctionSequenceBase(
const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *)

View File

@ -1,702 +0,0 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnsNumber.h>
#include <Common/assert_cast.h>
#include <base/range.h>
#include <base/sort.h>
#include <Common/PODArray.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <bitset>
#include <stack>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int TOO_SLOW;
extern const int SYNTAX_ERROR;
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
}
/// helper type for comparing `std::pair`s using solely the .first member
template <template <typename> class Comparator>
struct ComparePairFirst final
{
template <typename T1, typename T2>
bool operator()(const std::pair<T1, T2> & lhs, const std::pair<T1, T2> & rhs) const
{
return Comparator<T1>{}(lhs.first, rhs.first);
}
};
static constexpr size_t max_events = 32;
template <typename T>
struct AggregateFunctionSequenceMatchData final
{
using Timestamp = T;
using Events = std::bitset<max_events>;
using TimestampEvents = std::pair<Timestamp, Events>;
using Comparator = ComparePairFirst<std::less>;
bool sorted = true;
PODArrayWithStackMemory<TimestampEvents, 64> events_list;
/// sequenceMatch conditions met at least once in events_list
Events conditions_met;
void add(const Timestamp timestamp, const Events & events)
{
/// store information exclusively for rows with at least one event
if (events.any())
{
events_list.emplace_back(timestamp, events);
sorted = false;
conditions_met |= events;
}
}
void merge(const AggregateFunctionSequenceMatchData & other)
{
if (other.events_list.empty())
return;
events_list.insert(std::begin(other.events_list), std::end(other.events_list));
sorted = false;
conditions_met |= other.conditions_met;
}
void sort()
{
if (sorted)
return;
::sort(std::begin(events_list), std::end(events_list), Comparator{});
sorted = true;
}
void serialize(WriteBuffer & buf) const
{
writeBinary(sorted, buf);
writeBinary(events_list.size(), buf);
for (const auto & events : events_list)
{
writeBinary(events.first, buf);
writeBinary(events.second.to_ulong(), buf);
}
}
void deserialize(ReadBuffer & buf)
{
readBinary(sorted, buf);
size_t size;
readBinary(size, buf);
/// If we lose these flags, functionality is broken
/// If we serialize/deserialize these flags, we have compatibility issues
/// If we set these flags to 1, we have a minor performance penalty, which seems acceptable
conditions_met.set();
events_list.clear();
events_list.reserve(size);
for (size_t i = 0; i < size; ++i)
{
Timestamp timestamp;
readBinary(timestamp, buf);
UInt64 events;
readBinary(events, buf);
events_list.emplace_back(timestamp, Events{events});
}
}
};
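/// Serialized layout used by serialize()/deserialize() above:
///     sorted flag | element count | (timestamp, events bitset as UInt64) per element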
/// Max number of iterations to match the pattern against a sequence, exception thrown when exceeded
constexpr auto sequence_match_max_iterations = 1000000;
template <typename T, typename Data, typename Derived>
class AggregateFunctionSequenceBase : public IAggregateFunctionDataHelper<Data, Derived>
{
public:
AggregateFunctionSequenceBase(const DataTypes & arguments, const Array & params, const String & pattern_, const DataTypePtr & result_type_)
: IAggregateFunctionDataHelper<Data, Derived>(arguments, params, result_type_)
, pattern(pattern_)
{
arg_count = arguments.size();
parsePattern();
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, const size_t row_num, Arena *) const override
{
const auto timestamp = assert_cast<const ColumnVector<T> *>(columns[0])->getData()[row_num];
typename Data::Events events;
for (const auto i : collections::range(1, arg_count))
{
const auto event = assert_cast<const ColumnUInt8 *>(columns[i])->getData()[row_num];
events.set(i - 1, event);
}
this->data(place).add(timestamp, events);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
bool haveSameStateRepresentationImpl(const IAggregateFunction & rhs) const override
{
return this->getName() == rhs.getName() && this->haveEqualArgumentTypes(rhs);
}
private:
enum class PatternActionType
{
SpecificEvent,
AnyEvent,
KleeneStar,
TimeLessOrEqual,
TimeLess,
TimeGreaterOrEqual,
TimeGreater,
TimeEqual
};
struct PatternAction final
{
PatternActionType type;
std::uint64_t extra;
PatternAction() = default;
explicit PatternAction(const PatternActionType type_, const std::uint64_t extra_ = 0) : type{type_}, extra{extra_} {}
};
using PatternActions = PODArrayWithStackMemory<PatternAction, 64>;
Derived & derived() { return static_cast<Derived &>(*this); }
void parsePattern()
{
actions.clear();
actions.emplace_back(PatternActionType::KleeneStar);
dfa_states.clear();
dfa_states.emplace_back(true);
pattern_has_time = false;
const char * pos = pattern.data();
const char * begin = pos;
const char * end = pos + pattern.size();
auto throw_exception = [&](const std::string & msg)
{
throw Exception(ErrorCodes::SYNTAX_ERROR, "{} '{}' at position {}", msg, std::string(pos, end), toString(pos - begin));
};
auto match = [&pos, end](const char * str) mutable
{
size_t length = strlen(str);
if (pos + length <= end && 0 == memcmp(pos, str, length))
{
pos += length;
return true;
}
return false;
};
while (pos < end)
{
if (match("(?"))
{
if (match("t"))
{
PatternActionType type;
if (match("<="))
type = PatternActionType::TimeLessOrEqual;
else if (match("<"))
type = PatternActionType::TimeLess;
else if (match(">="))
type = PatternActionType::TimeGreaterOrEqual;
else if (match(">"))
type = PatternActionType::TimeGreater;
else if (match("=="))
type = PatternActionType::TimeEqual;
else
throw_exception("Unknown time condition");
UInt64 duration = 0;
const auto * prev_pos = pos;
pos = tryReadIntText(duration, pos, end);
if (pos == prev_pos)
throw_exception("Could not parse number");
if (actions.back().type != PatternActionType::SpecificEvent &&
actions.back().type != PatternActionType::AnyEvent &&
actions.back().type != PatternActionType::KleeneStar)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Temporal condition should be preceded by an event condition");
pattern_has_time = true;
actions.emplace_back(type, duration);
}
else
{
UInt64 event_number = 0;
const auto * prev_pos = pos;
pos = tryReadIntText(event_number, pos, end);
if (pos == prev_pos)
throw_exception("Could not parse number");
if (event_number == 0 || event_number > arg_count - 1) /// events are numbered starting from 1; 0 would underflow below
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Event number {} is out of range", event_number);
actions.emplace_back(PatternActionType::SpecificEvent, event_number - 1);
dfa_states.back().transition = DFATransition::SpecificEvent;
dfa_states.back().event = static_cast<uint32_t>(event_number - 1);
dfa_states.emplace_back();
conditions_in_pattern.set(event_number - 1);
}
if (!match(")"))
throw_exception("Expected closing parenthesis, found");
}
else if (match(".*"))
{
actions.emplace_back(PatternActionType::KleeneStar);
dfa_states.back().has_kleene = true;
}
else if (match("."))
{
actions.emplace_back(PatternActionType::AnyEvent);
dfa_states.back().transition = DFATransition::AnyEvent;
dfa_states.emplace_back();
}
else
throw_exception("Could not parse pattern, unexpected starting symbol");
}
}
protected:
/// Uses a DFA-based approach in order to better handle patterns without
/// time assertions.
///
/// NOTE: This implementation relies on the assumption that the pattern is *small*.
///
/// This algorithm performs in O(mn) (with m the number of DFA states and n the number
/// of events) with memory consumption and memory allocations in O(m). This means that
/// if n is much larger than m (which is expected to be the case), this algorithm
/// can be considered linear.
template <typename EventEntry>
bool dfaMatch(EventEntry & events_it, const EventEntry events_end) const
{
using ActiveStates = std::vector<bool>;
/// Those two vectors keep track of which states should be considered for the current
/// event as well as the states which should be considered for the next event.
ActiveStates active_states(dfa_states.size(), false);
ActiveStates next_active_states(dfa_states.size(), false);
active_states[0] = true;
/// Tracks how many states are still active, so we can bail out early once every
/// path is a dead end instead of scanning the remaining events.
size_t n_active = 1;
for (/* empty */; events_it != events_end && n_active > 0 && !active_states.back(); ++events_it)
{
n_active = 0;
next_active_states.assign(dfa_states.size(), false);
for (size_t state = 0; state < dfa_states.size(); ++state)
{
if (!active_states[state])
{
continue;
}
switch (dfa_states[state].transition)
{
case DFATransition::None:
break;
case DFATransition::AnyEvent:
next_active_states[state + 1] = true;
++n_active;
break;
case DFATransition::SpecificEvent:
if (events_it->second.test(dfa_states[state].event))
{
next_active_states[state + 1] = true;
++n_active;
}
break;
}
if (dfa_states[state].has_kleene)
{
next_active_states[state] = true;
++n_active;
}
}
swap(active_states, next_active_states);
}
return active_states.back();
}
template <typename EventEntry>
bool backtrackingMatch(EventEntry & events_it, const EventEntry events_end) const
{
const auto action_begin = std::begin(actions);
const auto action_end = std::end(actions);
auto action_it = action_begin;
const auto events_begin = events_it;
auto base_it = events_it;
/// an action iterator, an iterator to the current row in the events list, and an iterator
/// to the base row (start of the sequence) against which time conditions are checked
using backtrack_info = std::tuple<decltype(action_it), EventEntry, EventEntry>;
std::stack<backtrack_info> back_stack;
/// backtrack if possible
const auto do_backtrack = [&]
{
while (!back_stack.empty())
{
auto & top = back_stack.top();
action_it = std::get<0>(top);
events_it = std::next(std::get<1>(top));
base_it = std::get<2>(top);
back_stack.pop();
if (events_it != events_end)
return true;
}
return false;
};
size_t i = 0;
while (action_it != action_end && events_it != events_end)
{
if (action_it->type == PatternActionType::SpecificEvent)
{
if (events_it->second.test(action_it->extra))
{
/// move to the next action and events
base_it = events_it;
++action_it, ++events_it;
}
else if (!do_backtrack())
/// backtracking failed, bail out
break;
}
else if (action_it->type == PatternActionType::AnyEvent)
{
base_it = events_it;
++action_it, ++events_it;
}
else if (action_it->type == PatternActionType::KleeneStar)
{
back_stack.emplace(action_it, events_it, base_it);
base_it = events_it;
++action_it;
}
else if (action_it->type == PatternActionType::TimeLessOrEqual)
{
if (events_it->first <= base_it->first + action_it->extra)
{
/// condition satisfied, move onto next action
back_stack.emplace(action_it, events_it, base_it);
base_it = events_it;
++action_it;
}
else if (!do_backtrack())
break;
}
else if (action_it->type == PatternActionType::TimeLess)
{
if (events_it->first < base_it->first + action_it->extra)
{
back_stack.emplace(action_it, events_it, base_it);
base_it = events_it;
++action_it;
}
else if (!do_backtrack())
break;
}
else if (action_it->type == PatternActionType::TimeGreaterOrEqual)
{
if (events_it->first >= base_it->first + action_it->extra)
{
back_stack.emplace(action_it, events_it, base_it);
base_it = events_it;
++action_it;
}
else if (++events_it == events_end && !do_backtrack())
break;
}
else if (action_it->type == PatternActionType::TimeGreater)
{
if (events_it->first > base_it->first + action_it->extra)
{
back_stack.emplace(action_it, events_it, base_it);
base_it = events_it;
++action_it;
}
else if (++events_it == events_end && !do_backtrack())
break;
}
else if (action_it->type == PatternActionType::TimeEqual)
{
if (events_it->first == base_it->first + action_it->extra)
{
back_stack.emplace(action_it, events_it, base_it);
base_it = events_it;
++action_it;
}
else if (++events_it == events_end && !do_backtrack())
break;
}
else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown PatternActionType");
if (++i > sequence_match_max_iterations)
throw Exception(ErrorCodes::TOO_SLOW, "Pattern application proves too difficult, exceeding max iterations ({})",
sequence_match_max_iterations);
}
/// if there are some actions remaining
if (action_it != action_end)
{
/// match multiple empty strings at end
while (action_it->type == PatternActionType::KleeneStar ||
action_it->type == PatternActionType::TimeLessOrEqual ||
action_it->type == PatternActionType::TimeLess ||
(action_it->type == PatternActionType::TimeGreaterOrEqual && action_it->extra == 0))
++action_it;
}
if (events_it == events_begin)
++events_it;
return action_it == action_end;
}
/// Splits the pattern into deterministic parts separated by non-deterministic fragments
/// (time constraints and Kleene stars), and tries to match the deterministic parts in their specified order,
/// ignoring the non-deterministic fragments.
/// This function can quickly check that a full match is not possible if some deterministic fragment is missing.
template <typename EventEntry>
bool couldMatchDeterministicParts(const EventEntry events_begin, const EventEntry events_end, bool limit_iterations = true) const
{
size_t events_processed = 0;
auto events_it = events_begin;
const auto actions_end = std::end(actions);
auto actions_it = std::begin(actions);
auto det_part_begin = actions_it;
auto match_deterministic_part = [&events_it, events_end, &events_processed, det_part_begin, actions_it, limit_iterations]()
{
auto events_it_init = events_it;
auto det_part_it = det_part_begin;
while (det_part_it != actions_it && events_it != events_end)
{
/// matching any event
if (det_part_it->type == PatternActionType::AnyEvent)
++events_it, ++det_part_it;
/// matching specific event
else
{
if (events_it->second.test(det_part_it->extra))
++events_it, ++det_part_it;
/// abandon current matching, try to match the deterministic fragment further in the list
else
{
events_it = ++events_it_init;
det_part_it = det_part_begin;
}
}
if (limit_iterations && ++events_processed > sequence_match_max_iterations)
throw Exception(ErrorCodes::TOO_SLOW, "Pattern application proves too difficult, exceeding max iterations ({})",
sequence_match_max_iterations);
}
return det_part_it == actions_it;
};
for (; actions_it != actions_end; ++actions_it)
if (actions_it->type != PatternActionType::SpecificEvent && actions_it->type != PatternActionType::AnyEvent)
{
if (!match_deterministic_part())
return false;
det_part_begin = std::next(actions_it);
}
return match_deterministic_part();
}
private:
enum class DFATransition : char
{
/// .-------.
/// | |
/// `-------'
None,
/// .-------. (?[0-9])
/// | | ----------
/// `-------'
SpecificEvent,
/// .-------. .
/// | | ----------
/// `-------'
AnyEvent,
};
struct DFAState
{
explicit DFAState(bool has_kleene_ = false)
: has_kleene{has_kleene_}, event{0}, transition{DFATransition::None}
{}
/// .-------.
/// | | - - -
/// `-------'
/// |_^
bool has_kleene;
/// In the case of a state transition on a `SpecificEvent`,
/// `event` contains the number of the event.
uint32_t event;
/// The kind of transition out of this state.
DFATransition transition;
};
using DFAStates = std::vector<DFAState>;
protected:
/// `True` if the parsed pattern contains time assertions (?t...), `false` otherwise.
bool pattern_has_time;
/// sequenceMatch conditions met at least once in the pattern
std::bitset<max_events> conditions_in_pattern;
private:
std::string pattern;
size_t arg_count;
PatternActions actions;
DFAStates dfa_states;
};
template <typename T, typename Data>
class AggregateFunctionSequenceMatch final : public AggregateFunctionSequenceBase<T, Data, AggregateFunctionSequenceMatch<T, Data>>
{
public:
AggregateFunctionSequenceMatch(const DataTypes & arguments, const Array & params, const String & pattern_)
: AggregateFunctionSequenceBase<T, Data, AggregateFunctionSequenceMatch<T, Data>>(arguments, params, pattern_, std::make_shared<DataTypeUInt8>()) {}
using AggregateFunctionSequenceBase<T, Data, AggregateFunctionSequenceMatch<T, Data>>::AggregateFunctionSequenceBase;
String getName() const override { return "sequenceMatch"; }
bool allocatesMemoryInArena() const override { return false; }
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & output = assert_cast<ColumnUInt8 &>(to).getData();
if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern)
{
output.push_back(false);
return;
}
this->data(place).sort();
const auto & data_ref = this->data(place);
const auto events_begin = std::begin(data_ref.events_list);
const auto events_end = std::end(data_ref.events_list);
auto events_it = events_begin;
bool match = (this->pattern_has_time ?
(this->couldMatchDeterministicParts(events_begin, events_end) && this->backtrackingMatch(events_it, events_end)) :
this->dfaMatch(events_it, events_end));
output.push_back(match);
}
};
template <typename T, typename Data>
class AggregateFunctionSequenceCount final : public AggregateFunctionSequenceBase<T, Data, AggregateFunctionSequenceCount<T, Data>>
{
public:
AggregateFunctionSequenceCount(const DataTypes & arguments, const Array & params, const String & pattern_)
: AggregateFunctionSequenceBase<T, Data, AggregateFunctionSequenceCount<T, Data>>(arguments, params, pattern_, std::make_shared<DataTypeUInt64>()) {}
using AggregateFunctionSequenceBase<T, Data, AggregateFunctionSequenceCount<T, Data>>::AggregateFunctionSequenceBase;
String getName() const override { return "sequenceCount"; }
bool allocatesMemoryInArena() const override { return false; }
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & output = assert_cast<ColumnUInt64 &>(to).getData();
if ((this->conditions_in_pattern & this->data(place).conditions_met) != this->conditions_in_pattern)
{
output.push_back(0);
return;
}
this->data(place).sort();
output.push_back(count(place));
}
private:
UInt64 count(ConstAggregateDataPtr __restrict place) const
{
const auto & data_ref = this->data(place);
const auto events_begin = std::begin(data_ref.events_list);
const auto events_end = std::end(data_ref.events_list);
auto events_it = events_begin;
size_t count = 0;
// check if there is a chance of matching the sequence at least once
if (this->couldMatchDeterministicParts(events_begin, events_end))
{
while (events_it != events_end && this->backtrackingMatch(events_it, events_end))
++count;
}
return count;
}
};
}

View File

@ -1,14 +1,25 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionSequenceNextNode.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <Core/Settings.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeNullable.h>
#include <Interpreters/Context.h>
#include <Common/CurrentThread.h>
#include <base/range.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
#include <DataTypes/DataTypesNumber.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnNullable.h>
#include <Common/ArenaAllocator.h>
#include <Common/assert_cast.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <bitset>
namespace DB
@ -24,11 +35,409 @@ namespace ErrorCodes
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
extern const int UNKNOWN_AGGREGATE_FUNCTION;
extern const int TOO_LARGE_ARRAY_SIZE;
}
namespace
{
enum class SequenceDirection
{
Forward,
Backward,
};
enum SequenceBase
{
Head,
Tail,
FirstMatch,
LastMatch,
};
/// Limit on the deserialized node size, to protect against corrupted or malicious data
static const UInt64 max_node_size_deserialize = 0xFFFFFF;
/// NodeBase used to implement a linked list for storage of SequenceNextNodeImpl
template <typename Node, size_t MaxEventsSize>
struct NodeBase
{
UInt64 size; /// size of payload
DataTypeDateTime::FieldType event_time;
std::bitset<MaxEventsSize> events_bitset;
bool can_be_base;
char * data() { return reinterpret_cast<char *>(this) + sizeof(Node); }
const char * data() const { return reinterpret_cast<const char *>(this) + sizeof(Node); }
Node * clone(Arena * arena) const
{
return reinterpret_cast<Node *>(
const_cast<char *>(arena->alignedInsert(reinterpret_cast<const char *>(this), sizeof(Node) + size, alignof(Node))));
}
void write(WriteBuffer & buf) const
{
writeVarUInt(size, buf);
buf.write(data(), size);
writeBinary(event_time, buf);
UInt64 ulong_bitset = events_bitset.to_ulong();
writeBinary(ulong_bitset, buf);
writeBinary(can_be_base, buf);
}
static Node * read(ReadBuffer & buf, Arena * arena)
{
UInt64 size;
readVarUInt(size, buf);
if (unlikely(size > max_node_size_deserialize))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large node state size");
Node * node = reinterpret_cast<Node *>(arena->alignedAlloc(sizeof(Node) + size, alignof(Node)));
node->size = size;
buf.readStrict(node->data(), size);
readBinary(node->event_time, buf);
UInt64 ulong_bitset;
readBinary(ulong_bitset, buf);
node->events_bitset = ulong_bitset;
readBinary(node->can_be_base, buf);
return node;
}
};
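/// Serialized layout produced by write() and consumed by read(), in order:
///     varUInt payload size | payload bytes | event_time | events_bitset as UInt64 | can_be_base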
/// Stores a String value, a timestamp, and a bitset of matched events.
template <size_t MaxEventsSize>
struct NodeString : public NodeBase<NodeString<MaxEventsSize>, MaxEventsSize>
{
using Node = NodeString<MaxEventsSize>;
static Node * allocate(const IColumn & column, size_t row_num, Arena * arena)
{
StringRef string = assert_cast<const ColumnString &>(column).getDataAt(row_num);
Node * node = reinterpret_cast<Node *>(arena->alignedAlloc(sizeof(Node) + string.size, alignof(Node)));
node->size = string.size;
memcpy(node->data(), string.data, string.size);
return node;
}
void insertInto(IColumn & column)
{
assert_cast<ColumnString &>(column).insertData(this->data(), this->size);
}
bool compare(const Node * rhs) const
{
auto cmp = strncmp(this->data(), rhs->data(), std::min(this->size, rhs->size));
return (cmp == 0) ? this->size < rhs->size : cmp < 0;
}
};
/// TODO: support types other than String
template <typename Node>
struct SequenceNextNodeGeneralData
{
using Allocator = MixedAlignedArenaAllocator<alignof(Node *), 4096>;
using Array = PODArray<Node *, 32, Allocator>;
Array value;
bool sorted = false;
struct Comparator final
{
bool operator()(const Node * lhs, const Node * rhs) const
{
return lhs->event_time == rhs->event_time ? lhs->compare(rhs) : lhs->event_time < rhs->event_time;
}
};
void sort()
{
if (!sorted)
{
std::stable_sort(std::begin(value), std::end(value), Comparator{});
sorted = true;
}
}
};
/// Implementation of sequenceNextNode
template <typename T, typename Node>
class SequenceNextNodeImpl final
: public IAggregateFunctionDataHelper<SequenceNextNodeGeneralData<Node>, SequenceNextNodeImpl<T, Node>>
{
using Self = SequenceNextNodeImpl<T, Node>;
using Data = SequenceNextNodeGeneralData<Node>;
static Data & data(AggregateDataPtr __restrict place) { return *reinterpret_cast<Data *>(place); }
static const Data & data(ConstAggregateDataPtr __restrict place) { return *reinterpret_cast<const Data *>(place); }
static constexpr size_t base_cond_column_idx = 2;
static constexpr size_t event_column_idx = 1;
SequenceBase seq_base_kind;
SequenceDirection seq_direction;
const size_t min_required_args;
DataTypePtr & data_type;
UInt8 events_size;
UInt64 max_elems;
public:
SequenceNextNodeImpl(
const DataTypePtr & data_type_,
const DataTypes & arguments,
const Array & parameters_,
SequenceBase seq_base_kind_,
SequenceDirection seq_direction_,
size_t min_required_args_,
UInt64 max_elems_ = std::numeric_limits<UInt64>::max())
: IAggregateFunctionDataHelper<SequenceNextNodeGeneralData<Node>, Self>(arguments, parameters_, data_type_)
, seq_base_kind(seq_base_kind_)
, seq_direction(seq_direction_)
, min_required_args(min_required_args_)
, data_type(this->argument_types[0])
, events_size(arguments.size() - min_required_args)
, max_elems(max_elems_)
{
}
String getName() const override { return "sequenceNextNode"; }
bool haveSameStateRepresentationImpl(const IAggregateFunction & rhs) const override
{
return this->getName() == rhs.getName() && this->haveEqualArgumentTypes(rhs);
}
void insert(Data & a, const Node * v, Arena * arena) const
{
a.value.push_back(v->clone(arena), arena);
}
void create(AggregateDataPtr __restrict place) const override /// NOLINT
{
new (place) Data;
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Node * node = Node::allocate(*columns[event_column_idx], row_num, arena);
const auto timestamp = assert_cast<const ColumnVector<T> *>(columns[0])->getData()[row_num];
/// The events_bitset variable stores matched events as a bitset.
/// The Nth bit indicates whether the Nth event matched.
/// For example, if event1 and event3 matched, then the value of events_bitset is 0x00000005:
/// 0x00000000
/// + 1 (bit of event1)
/// + 4 (bit of event3)
node->events_bitset.reset();
for (UInt8 i = 0; i < events_size; ++i)
if (assert_cast<const ColumnVector<UInt8> *>(columns[min_required_args + i])->getData()[row_num])
node->events_bitset.set(i);
node->event_time = static_cast<DataTypeDateTime::FieldType>(timestamp);
node->can_be_base = assert_cast<const ColumnVector<UInt8> *>(columns[base_cond_column_idx])->getData()[row_num];
data(place).value.push_back(node, arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
if (data(rhs).value.empty())
return;
if (data(place).value.size() >= max_elems)
return;
auto & a = data(place).value;
auto & b = data(rhs).value;
const auto a_size = a.size();
const UInt64 new_elems = std::min(data(rhs).value.size(), static_cast<size_t>(max_elems) - data(place).value.size());
for (UInt64 i = 0; i < new_elems; ++i)
a.push_back(b[i]->clone(arena), arena);
/// Either sort whole container or do so partially merging ranges afterwards
using Comparator = typename SequenceNextNodeGeneralData<Node>::Comparator;
if (!data(place).sorted && !data(rhs).sorted)
std::stable_sort(std::begin(a), std::end(a), Comparator{});
else
{
const auto begin = std::begin(a);
const auto middle = std::next(begin, a_size);
const auto end = std::end(a);
if (!data(place).sorted)
std::stable_sort(begin, middle, Comparator{});
if (!data(rhs).sorted)
std::stable_sort(middle, end, Comparator{});
std::inplace_merge(begin, middle, end, Comparator{});
}
data(place).sorted = true;
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
/// Temporarily do a const_cast to sort the values. It helps to reduce the computational burden on the initiator node.
this->data(const_cast<AggregateDataPtr>(place)).sort();
writeBinary(data(place).sorted, buf);
auto & value = data(place).value;
size_t size = std::min(static_cast<size_t>(events_size + 1), value.size());
switch (seq_base_kind)
{
case SequenceBase::Head:
writeVarUInt(size, buf);
for (size_t i = 0; i < size; ++i)
value[i]->write(buf);
break;
case SequenceBase::Tail:
writeVarUInt(size, buf);
for (size_t i = 0; i < size; ++i)
value[value.size() - size + i]->write(buf);
break;
case SequenceBase::FirstMatch:
case SequenceBase::LastMatch:
writeVarUInt(value.size(), buf);
for (auto & node : value)
node->write(buf);
break;
}
}
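/// Note on the truncation above: for the Head/Tail bases the answer only ever needs the
/// first/last (events_size + 1) nodes, so serialization ships just that prefix/suffix;
/// FirstMatch/LastMatch must keep every node because the base can sit anywhere in the list.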
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
readBinary(data(place).sorted, buf);
UInt64 size;
readVarUInt(size, buf);
if (unlikely(size == 0))
return;
if (unlikely(size > max_node_size_deserialize))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size (maximum: {})", max_node_size_deserialize);
auto & value = data(place).value;
value.resize(size, arena);
for (UInt64 i = 0; i < size; ++i)
value[i] = Node::read(buf, arena);
}
inline std::optional<size_t> getBaseIndex(Data & data) const
{
if (data.value.size() == 0)
return {};
switch (seq_base_kind)
{
case SequenceBase::Head:
if (data.value[0]->can_be_base)
return 0;
break;
case SequenceBase::Tail:
if (data.value[data.value.size() - 1]->can_be_base)
return data.value.size() - 1;
break;
case SequenceBase::FirstMatch:
for (size_t i = 0; i < data.value.size(); ++i)
{
if (data.value[i]->events_bitset.test(0) && data.value[i]->can_be_base)
return i;
}
break;
case SequenceBase::LastMatch:
for (size_t i = 0; i < data.value.size(); ++i)
{
auto reversed_i = data.value.size() - i - 1;
if (data.value[reversed_i]->events_bitset.test(0) && data.value[reversed_i]->can_be_base)
return reversed_i;
}
break;
}
return {};
}
/// This method returns the index of the next node that matched the events.
/// Matched events in the chain of events are represented as a bitmask:
/// the first matched event is 0x00000001, the second one is 0x00000002, the third one is 0x00000004, and so on.
UInt32 getNextNodeIndex(Data & data) const
{
const UInt32 unmatched_idx = static_cast<UInt32>(data.value.size());
if (data.value.size() <= events_size)
return unmatched_idx;
data.sort();
std::optional<size_t> base_opt = getBaseIndex(data);
if (!base_opt.has_value())
return unmatched_idx;
UInt32 base = static_cast<UInt32>(base_opt.value());
if (events_size == 0)
return data.value.size() > 0 ? base : unmatched_idx;
UInt32 i = 0;
switch (seq_direction)
{
case SequenceDirection::Forward:
for (i = 0; i < events_size && base + i < data.value.size(); ++i)
if (!data.value[base + i]->events_bitset.test(i))
break;
return (i == events_size) ? base + i : unmatched_idx;
case SequenceDirection::Backward:
for (i = 0; i < events_size && i < base; ++i)
if (!data.value[base - i]->events_bitset.test(i))
break;
return (i == events_size) ? base - i : unmatched_idx;
}
UNREACHABLE();
}
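/// Example (Forward direction, Head base): with two event conditions (A, B), the rows
/// must satisfy rows[base] matching A and rows[base + 1] matching B; the function then
/// returns base + 2, i.e. the row right after the matched chain, whose value
/// insertResultInto() emits (or NULL when the index is out of range).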
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & value = data(place).value;
UInt32 event_idx = getNextNodeIndex(this->data(place));
if (event_idx < value.size())
{
ColumnNullable & to_concrete = assert_cast<ColumnNullable &>(to);
value[event_idx]->insertInto(to_concrete.getNestedColumn());
to_concrete.getNullMapData().push_back(0);
}
else
{
to.insertDefault();
}
}
bool allocatesMemoryInArena() const override { return true; }
};
template <typename T>
inline AggregateFunctionPtr createAggregateFunctionSequenceNodeImpl(
const DataTypePtr data_type, const DataTypes & argument_types, const Array & parameters, SequenceDirection direction, SequenceBase base)

View File

@ -1,432 +0,0 @@
#pragma once
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
#include <IO/Operators.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeDateTime.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnNullable.h>
#include <Common/ArenaAllocator.h>
#include <Common/assert_cast.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <type_traits>
#include <bitset>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int TOO_LARGE_ARRAY_SIZE;
}
enum class SequenceDirection
{
Forward,
Backward,
};
enum SequenceBase
{
Head,
Tail,
FirstMatch,
LastMatch,
};
/// Limit on the deserialized node size, to protect against corrupted or malicious data
static const UInt64 max_node_size_deserialize = 0xFFFFFF;
/// NodeBase used to implement a linked list for storage of SequenceNextNodeImpl
template <typename Node, size_t MaxEventsSize>
struct NodeBase
{
UInt64 size; /// size of payload
DataTypeDateTime::FieldType event_time;
std::bitset<MaxEventsSize> events_bitset;
bool can_be_base;
char * data() { return reinterpret_cast<char *>(this) + sizeof(Node); }
const char * data() const { return reinterpret_cast<const char *>(this) + sizeof(Node); }
Node * clone(Arena * arena) const
{
return reinterpret_cast<Node *>(
const_cast<char *>(arena->alignedInsert(reinterpret_cast<const char *>(this), sizeof(Node) + size, alignof(Node))));
}
void write(WriteBuffer & buf) const
{
writeVarUInt(size, buf);
buf.write(data(), size);
writeBinary(event_time, buf);
UInt64 ulong_bitset = events_bitset.to_ulong();
writeBinary(ulong_bitset, buf);
writeBinary(can_be_base, buf);
}
static Node * read(ReadBuffer & buf, Arena * arena)
{
UInt64 size;
readVarUInt(size, buf);
if (unlikely(size > max_node_size_deserialize))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large node state size");
Node * node = reinterpret_cast<Node *>(arena->alignedAlloc(sizeof(Node) + size, alignof(Node)));
node->size = size;
buf.readStrict(node->data(), size);
readBinary(node->event_time, buf);
UInt64 ulong_bitset;
readBinary(ulong_bitset, buf);
node->events_bitset = ulong_bitset;
readBinary(node->can_be_base, buf);
return node;
}
};
/// Stores a String value, a timestamp, and a bitset of matched events.
template <size_t MaxEventsSize>
struct NodeString : public NodeBase<NodeString<MaxEventsSize>, MaxEventsSize>
{
using Node = NodeString<MaxEventsSize>;
static Node * allocate(const IColumn & column, size_t row_num, Arena * arena)
{
StringRef string = assert_cast<const ColumnString &>(column).getDataAt(row_num);
Node * node = reinterpret_cast<Node *>(arena->alignedAlloc(sizeof(Node) + string.size, alignof(Node)));
node->size = string.size;
memcpy(node->data(), string.data, string.size);
return node;
}
void insertInto(IColumn & column)
{
assert_cast<ColumnString &>(column).insertData(this->data(), this->size);
}
bool compare(const Node * rhs) const
{
auto cmp = strncmp(this->data(), rhs->data(), std::min(this->size, rhs->size));
return (cmp == 0) ? this->size < rhs->size : cmp < 0;
}
};
/// TODO: support types other than String
template <typename Node>
struct SequenceNextNodeGeneralData
{
using Allocator = MixedAlignedArenaAllocator<alignof(Node *), 4096>;
using Array = PODArray<Node *, 32, Allocator>;
Array value;
bool sorted = false;
struct Comparator final
{
bool operator()(const Node * lhs, const Node * rhs) const
{
return lhs->event_time == rhs->event_time ? lhs->compare(rhs) : lhs->event_time < rhs->event_time;
}
};
void sort()
{
if (!sorted)
{
std::stable_sort(std::begin(value), std::end(value), Comparator{});
sorted = true;
}
}
};
/// Implementation of sequenceNextNode
template <typename T, typename Node>
class SequenceNextNodeImpl final
: public IAggregateFunctionDataHelper<SequenceNextNodeGeneralData<Node>, SequenceNextNodeImpl<T, Node>>
{
using Self = SequenceNextNodeImpl<T, Node>;
using Data = SequenceNextNodeGeneralData<Node>;
static Data & data(AggregateDataPtr __restrict place) { return *reinterpret_cast<Data *>(place); }
static const Data & data(ConstAggregateDataPtr __restrict place) { return *reinterpret_cast<const Data *>(place); }
static constexpr size_t base_cond_column_idx = 2;
static constexpr size_t event_column_idx = 1;
SequenceBase seq_base_kind;
SequenceDirection seq_direction;
const size_t min_required_args;
DataTypePtr & data_type;
UInt8 events_size;
UInt64 max_elems;
public:
SequenceNextNodeImpl(
const DataTypePtr & data_type_,
const DataTypes & arguments,
const Array & parameters_,
SequenceBase seq_base_kind_,
SequenceDirection seq_direction_,
size_t min_required_args_,
UInt64 max_elems_ = std::numeric_limits<UInt64>::max())
: IAggregateFunctionDataHelper<SequenceNextNodeGeneralData<Node>, Self>(arguments, parameters_, data_type_)
, seq_base_kind(seq_base_kind_)
, seq_direction(seq_direction_)
, min_required_args(min_required_args_)
, data_type(this->argument_types[0])
, events_size(arguments.size() - min_required_args)
, max_elems(max_elems_)
{
}
String getName() const override { return "sequenceNextNode"; }
bool haveSameStateRepresentationImpl(const IAggregateFunction & rhs) const override
{
return this->getName() == rhs.getName() && this->haveEqualArgumentTypes(rhs);
}
void insert(Data & a, const Node * v, Arena * arena) const
{
a.value.push_back(v->clone(arena), arena);
}
void create(AggregateDataPtr __restrict place) const override /// NOLINT
{
new (place) Data;
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
Node * node = Node::allocate(*columns[event_column_idx], row_num, arena);
const auto timestamp = assert_cast<const ColumnVector<T> *>(columns[0])->getData()[row_num];
/// The events_bitset variable stores matched events as a bitset.
/// The Nth bit indicates whether the Nth event matched.
/// For example, if event1 and event3 matched, then the value of events_bitset is 0x00000005:
/// 0x00000000
/// + 1 (bit of event1)
/// + 4 (bit of event3)
node->events_bitset.reset();
for (UInt8 i = 0; i < events_size; ++i)
if (assert_cast<const ColumnVector<UInt8> *>(columns[min_required_args + i])->getData()[row_num])
node->events_bitset.set(i);
node->event_time = static_cast<DataTypeDateTime::FieldType>(timestamp);
node->can_be_base = assert_cast<const ColumnVector<UInt8> *>(columns[base_cond_column_idx])->getData()[row_num];
data(place).value.push_back(node, arena);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena * arena) const override
{
if (data(rhs).value.empty())
return;
if (data(place).value.size() >= max_elems)
return;
auto & a = data(place).value;
auto & b = data(rhs).value;
const auto a_size = a.size();
const UInt64 new_elems = std::min(data(rhs).value.size(), static_cast<size_t>(max_elems) - data(place).value.size());
for (UInt64 i = 0; i < new_elems; ++i)
a.push_back(b[i]->clone(arena), arena);
/// Either sort whole container or do so partially merging ranges afterwards
using Comparator = typename SequenceNextNodeGeneralData<Node>::Comparator;
if (!data(place).sorted && !data(rhs).sorted)
std::stable_sort(std::begin(a), std::end(a), Comparator{});
else
{
const auto begin = std::begin(a);
const auto middle = std::next(begin, a_size);
const auto end = std::end(a);
if (!data(place).sorted)
std::stable_sort(begin, middle, Comparator{});
if (!data(rhs).sorted)
std::stable_sort(middle, end, Comparator{});
std::inplace_merge(begin, middle, end, Comparator{});
}
data(place).sorted = true;
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
/// Temporarily do a const_cast to sort the values. It helps to reduce the computational burden on the initiator node.
this->data(const_cast<AggregateDataPtr>(place)).sort();
writeBinary(data(place).sorted, buf);
auto & value = data(place).value;
size_t size = std::min(static_cast<size_t>(events_size + 1), value.size());
switch (seq_base_kind)
{
case SequenceBase::Head:
writeVarUInt(size, buf);
for (size_t i = 0; i < size; ++i)
value[i]->write(buf);
break;
case SequenceBase::Tail:
writeVarUInt(size, buf);
for (size_t i = 0; i < size; ++i)
value[value.size() - size + i]->write(buf);
break;
case SequenceBase::FirstMatch:
case SequenceBase::LastMatch:
writeVarUInt(value.size(), buf);
for (auto & node : value)
node->write(buf);
break;
}
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
readBinary(data(place).sorted, buf);
UInt64 size;
readVarUInt(size, buf);
if (unlikely(size == 0))
return;
if (unlikely(size > max_node_size_deserialize))
throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE,
"Too large array size (maximum: {})", max_node_size_deserialize);
auto & value = data(place).value;
value.resize(size, arena);
for (UInt64 i = 0; i < size; ++i)
value[i] = Node::read(buf, arena);
}
inline std::optional<size_t> getBaseIndex(Data & data) const
{
if (data.value.size() == 0)
return {};
switch (seq_base_kind)
{
case SequenceBase::Head:
if (data.value[0]->can_be_base)
return 0;
break;
case SequenceBase::Tail:
if (data.value[data.value.size() - 1]->can_be_base)
return data.value.size() - 1;
break;
case SequenceBase::FirstMatch:
for (size_t i = 0; i < data.value.size(); ++i)
{
if (data.value[i]->events_bitset.test(0) && data.value[i]->can_be_base)
return i;
}
break;
case SequenceBase::LastMatch:
for (size_t i = 0; i < data.value.size(); ++i)
{
auto reversed_i = data.value.size() - i - 1;
if (data.value[reversed_i]->events_bitset.test(0) && data.value[reversed_i]->can_be_base)
return reversed_i;
}
break;
}
return {};
}
/// This method returns the index of the next node that matched the events.
/// Matched events in the chain of events are represented as a bitmask:
/// the first matched event is 0x00000001, the second one is 0x00000002, the third one is 0x00000004, and so on.
UInt32 getNextNodeIndex(Data & data) const
{
const UInt32 unmatched_idx = static_cast<UInt32>(data.value.size());
if (data.value.size() <= events_size)
return unmatched_idx;
data.sort();
std::optional<size_t> base_opt = getBaseIndex(data);
if (!base_opt.has_value())
return unmatched_idx;
UInt32 base = static_cast<UInt32>(base_opt.value());
if (events_size == 0)
return data.value.size() > 0 ? base : unmatched_idx;
UInt32 i = 0;
switch (seq_direction)
{
case SequenceDirection::Forward:
for (i = 0; i < events_size && base + i < data.value.size(); ++i)
if (!data.value[base + i]->events_bitset.test(i))
break;
return (i == events_size) ? base + i : unmatched_idx;
case SequenceDirection::Backward:
for (i = 0; i < events_size && i < base; ++i)
if (!data.value[base - i]->events_bitset.test(i))
break;
return (i == events_size) ? base - i : unmatched_idx;
}
UNREACHABLE();
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
auto & value = data(place).value;
UInt32 event_idx = getNextNodeIndex(this->data(place));
if (event_idx < value.size())
{
ColumnNullable & to_concrete = assert_cast<ColumnNullable &>(to);
value[event_idx]->insertInto(to_concrete.getNestedColumn());
to_concrete.getNullMapData().push_back(0);
}
else
{
to.insertDefault();
}
}
bool allocatesMemoryInArena() const override { return true; }
};
}

View File

@ -1,10 +1,21 @@
#include <AggregateFunctions/AggregateFunctionSimpleLinearRegression.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <limits>
namespace DB
{
struct Settings;
namespace ErrorCodes
@ -15,6 +26,167 @@ namespace ErrorCodes
namespace
{
template <typename T>
struct AggregateFunctionSimpleLinearRegressionData final
{
size_t count = 0;
T sum_x = 0;
T sum_y = 0;
T sum_xx = 0;
T sum_xy = 0;
void add(T x, T y)
{
count += 1;
sum_x += x;
sum_y += y;
sum_xx += x * x;
sum_xy += x * y;
}
void merge(const AggregateFunctionSimpleLinearRegressionData & other)
{
count += other.count;
sum_x += other.sum_x;
sum_y += other.sum_y;
sum_xx += other.sum_xx;
sum_xy += other.sum_xy;
}
void serialize(WriteBuffer & buf) const
{
writeBinary(count, buf);
writeBinary(sum_x, buf);
writeBinary(sum_y, buf);
writeBinary(sum_xx, buf);
writeBinary(sum_xy, buf);
}
void deserialize(ReadBuffer & buf)
{
readBinary(count, buf);
readBinary(sum_x, buf);
readBinary(sum_y, buf);
readBinary(sum_xx, buf);
readBinary(sum_xy, buf);
}
T getK() const
{
T divisor = sum_xx * count - sum_x * sum_x;
if (divisor == 0)
return std::numeric_limits<T>::quiet_NaN();
return (sum_xy * count - sum_x * sum_y) / divisor;
}
T getB(T k) const
{
if (count == 0)
return std::numeric_limits<T>::quiet_NaN();
return (sum_y - k * sum_x) / count;
}
};
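/// Sanity check of the closed-form solution above (standard least squares):
///     k = (n * sum_xy - sum_x * sum_y) / (n * sum_xx - sum_x^2)
///     b = (sum_y - k * sum_x) / n
/// For the points (0, 1), (1, 3), (2, 5): sum_x = 3, sum_y = 9, sum_xx = 5, sum_xy = 13,
/// so k = (3 * 13 - 3 * 9) / (3 * 5 - 9) = 12 / 6 = 2 and b = (9 - 2 * 3) / 3 = 1,
/// recovering y = 2 * x + 1 exactly.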
/// Calculates simple linear regression parameters.
/// The result is a tuple (k, b) for the equation y = k * x + b, solved by least-squares approximation.
template <typename X, typename Y, typename Ret = Float64>
class AggregateFunctionSimpleLinearRegression final : public IAggregateFunctionDataHelper<
AggregateFunctionSimpleLinearRegressionData<Ret>,
AggregateFunctionSimpleLinearRegression<X, Y, Ret>
>
{
public:
AggregateFunctionSimpleLinearRegression(
const DataTypes & arguments,
const Array & params
):
IAggregateFunctionDataHelper<
AggregateFunctionSimpleLinearRegressionData<Ret>,
AggregateFunctionSimpleLinearRegression<X, Y, Ret>
> {arguments, params, createResultType()}
{
// note: the arguments have already been validated by the factory
}
String getName() const override
{
return "simpleLinearRegression";
}
void add(
AggregateDataPtr __restrict place,
const IColumn ** columns,
size_t row_num,
Arena *
) const override
{
auto col_x = assert_cast<const ColumnVector<X> *>(columns[0]);
auto col_y = assert_cast<const ColumnVector<Y> *>(columns[1]);
X x = col_x->getData()[row_num];
Y y = col_y->getData()[row_num];
this->data(place).add(x, y);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
static DataTypePtr createResultType()
{
DataTypes types
{
std::make_shared<DataTypeNumber<Ret>>(),
std::make_shared<DataTypeNumber<Ret>>(),
};
Strings names
{
"k",
"b",
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
bool allocatesMemoryInArena() const override { return false; }
void insertResultInto(
AggregateDataPtr __restrict place,
IColumn & to,
Arena *) const override
{
Ret k = this->data(place).getK();
Ret b = this->data(place).getB(k);
auto & col_tuple = assert_cast<ColumnTuple &>(to);
auto & col_k = assert_cast<ColumnVector<Ret> &>(col_tuple.getColumn(0));
auto & col_b = assert_cast<ColumnVector<Ret> &>(col_tuple.getColumn(1));
col_k.getData().push_back(k);
col_b.getData().push_back(b);
}
};
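/// Usage sketch: `SELECT simpleLinearRegression(x, y) FROM t` returns the tuple (k, b);
/// with the example points above it would return (2, 1).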
AggregateFunctionPtr createAggregateFunctionSimpleLinearRegression(
const String & name,
const DataTypes & arguments,

View File

@ -1,182 +0,0 @@
#pragma once
#include <AggregateFunctions/IAggregateFunction.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnTuple.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypeNullable.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <limits>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
}
template <typename T>
struct AggregateFunctionSimpleLinearRegressionData final
{
size_t count = 0;
T sum_x = 0;
T sum_y = 0;
T sum_xx = 0;
T sum_xy = 0;
void add(T x, T y)
{
count += 1;
sum_x += x;
sum_y += y;
sum_xx += x * x;
sum_xy += x * y;
}
void merge(const AggregateFunctionSimpleLinearRegressionData & other)
{
count += other.count;
sum_x += other.sum_x;
sum_y += other.sum_y;
sum_xx += other.sum_xx;
sum_xy += other.sum_xy;
}
void serialize(WriteBuffer & buf) const
{
writeBinary(count, buf);
writeBinary(sum_x, buf);
writeBinary(sum_y, buf);
writeBinary(sum_xx, buf);
writeBinary(sum_xy, buf);
}
void deserialize(ReadBuffer & buf)
{
readBinary(count, buf);
readBinary(sum_x, buf);
readBinary(sum_y, buf);
readBinary(sum_xx, buf);
readBinary(sum_xy, buf);
}
T getK() const
{
T divisor = sum_xx * count - sum_x * sum_x;
if (divisor == 0)
return std::numeric_limits<T>::quiet_NaN();
return (sum_xy * count - sum_x * sum_y) / divisor;
}
T getB(T k) const
{
if (count == 0)
return std::numeric_limits<T>::quiet_NaN();
return (sum_y - k * sum_x) / count;
}
};
/// Calculates simple linear regression parameters.
/// Result is a tuple (k, b) for y = k * x + b equation, solved by least squares approximation.
template <typename X, typename Y, typename Ret = Float64>
class AggregateFunctionSimpleLinearRegression final : public IAggregateFunctionDataHelper<
AggregateFunctionSimpleLinearRegressionData<Ret>,
AggregateFunctionSimpleLinearRegression<X, Y, Ret>
>
{
public:
AggregateFunctionSimpleLinearRegression(
const DataTypes & arguments,
const Array & params
):
IAggregateFunctionDataHelper<
AggregateFunctionSimpleLinearRegressionData<Ret>,
AggregateFunctionSimpleLinearRegression<X, Y, Ret>
> {arguments, params, createResultType()}
{
// Note: the arguments have already been checked before this point.
}
String getName() const override
{
return "simpleLinearRegression";
}
void add(
AggregateDataPtr __restrict place,
const IColumn ** columns,
size_t row_num,
Arena *
) const override
{
auto col_x = assert_cast<const ColumnVector<X> *>(columns[0]);
auto col_y = assert_cast<const ColumnVector<Y> *>(columns[1]);
X x = col_x->getData()[row_num];
Y y = col_y->getData()[row_num];
this->data(place).add(x, y);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
static DataTypePtr createResultType()
{
DataTypes types
{
std::make_shared<DataTypeNumber<Ret>>(),
std::make_shared<DataTypeNumber<Ret>>(),
};
Strings names
{
"k",
"b",
};
return std::make_shared<DataTypeTuple>(
std::move(types),
std::move(names)
);
}
bool allocatesMemoryInArena() const override { return false; }
void insertResultInto(
AggregateDataPtr __restrict place,
IColumn & to,
Arena *) const override
{
Ret k = this->data(place).getK();
Ret b = this->data(place).getB(k);
auto & col_tuple = assert_cast<ColumnTuple &>(to);
auto & col_k = assert_cast<ColumnVector<Ret> &>(col_tuple.getColumn(0));
auto & col_b = assert_cast<ColumnVector<Ret> &>(col_tuple.getColumn(1));
col_k.getData().push_back(k);
col_b.getData().push_back(b);
}
};
}

View File

@ -1,8 +1,18 @@
#include <AggregateFunctions/AggregateFunctionSparkbar.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <array>
#include <string_view>
#include <DataTypes/DataTypeString.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Columns/ColumnString.h>
#include <Common/PODArray.h>
#include <IO/ReadBufferFromString.h>
#include <Common/HashTable/HashMap.h>
#include <Columns/IColumn.h>
namespace DB
{
@ -13,11 +23,309 @@ namespace ErrorCodes
{
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int BAD_ARGUMENTS;
}
namespace
{
template<typename X, typename Y>
struct AggregateFunctionSparkbarData
{
/// TODO: calculate histogram instead of storing all points
using Points = HashMap<X, Y>;
Points points;
X min_x = std::numeric_limits<X>::max();
X max_x = std::numeric_limits<X>::lowest();
Y min_y = std::numeric_limits<Y>::max();
Y max_y = std::numeric_limits<Y>::lowest();
Y insert(const X & x, const Y & y)
{
if (isNaN(y) || y <= 0)
return 0;
auto [it, inserted] = points.insert({x, y});
if (!inserted)
{
if constexpr (std::is_floating_point_v<Y>)
{
it->getMapped() += y;
return it->getMapped();
}
else
{
Y res;
bool has_overflow = common::addOverflow(it->getMapped(), y, res);
it->getMapped() = has_overflow ? std::numeric_limits<Y>::max() : res;
}
}
return it->getMapped();
}
void add(X x, Y y)
{
auto new_y = insert(x, y);
min_x = std::min(x, min_x);
max_x = std::max(x, max_x);
min_y = std::min(y, min_y);
max_y = std::max(new_y, max_y);
}
void merge(const AggregateFunctionSparkbarData & other)
{
if (other.points.empty())
return;
for (auto & point : other.points)
{
auto new_y = insert(point.getKey(), point.getMapped());
max_y = std::max(new_y, max_y);
}
min_x = std::min(other.min_x, min_x);
max_x = std::max(other.max_x, max_x);
min_y = std::min(other.min_y, min_y);
max_y = std::max(other.max_y, max_y);
}
void serialize(WriteBuffer & buf) const
{
writeBinary(min_x, buf);
writeBinary(max_x, buf);
writeBinary(min_y, buf);
writeBinary(max_y, buf);
writeVarUInt(points.size(), buf);
for (const auto & elem : points)
{
writeBinary(elem.getKey(), buf);
writeBinary(elem.getMapped(), buf);
}
}
void deserialize(ReadBuffer & buf)
{
readBinary(min_x, buf);
readBinary(max_x, buf);
readBinary(min_y, buf);
readBinary(max_y, buf);
size_t size;
readVarUInt(size, buf);
X x;
Y y;
for (size_t i = 0; i < size; ++i)
{
readBinary(x, buf);
readBinary(y, buf);
insert(x, y);
}
}
};
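/// A minimal illustrative sketch (not part of this file) of the saturate-on-overflow
/// policy used by insert() above, written with the plain compiler builtin that
/// common::addOverflow amounts to for built-in integer types:
///
///     UInt8 acc = 250;
///     UInt8 sum;
///     bool has_overflow = __builtin_add_overflow(acc, UInt8(10), &sum);
///     acc = has_overflow ? std::numeric_limits<UInt8>::max() : sum; /// acc == 255, not the wrapped-around 4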
template<typename X, typename Y>
class AggregateFunctionSparkbar final
: public IAggregateFunctionDataHelper<AggregateFunctionSparkbarData<X, Y>, AggregateFunctionSparkbar<X, Y>>
{
private:
static constexpr size_t BAR_LEVELS = 8;
const size_t width = 0;
/// Range for x specified in parameters.
const bool is_specified_range_x = false;
const X begin_x = std::numeric_limits<X>::min();
const X end_x = std::numeric_limits<X>::max();
size_t updateFrame(ColumnString::Chars & frame, Y value) const
{
static constexpr std::array<std::string_view, BAR_LEVELS + 1> bars{" ", "▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"};
const auto & bar = (isNaN(value) || value < 1 || static_cast<Y>(BAR_LEVELS) < value) ? bars[0] : bars[static_cast<UInt8>(value)];
frame.insert(bar.begin(), bar.end());
return bar.size();
}
/**
* The minimum value of y is rendered as the lowest height "▁",
* the maximum value of y is rendered as the highest height "█", and the middle value will be rendered proportionally.
* If a bucket has no y value, it will be rendered as " ".
*/
void render(ColumnString & to_column, const AggregateFunctionSparkbarData<X, Y> & data) const
{
auto & values = to_column.getChars();
auto & offsets = to_column.getOffsets();
if (data.points.empty())
{
values.push_back('\0');
offsets.push_back(offsets.empty() ? 1 : offsets.back() + 1);
return;
}
auto from_x = is_specified_range_x ? begin_x : data.min_x;
auto to_x = is_specified_range_x ? end_x : data.max_x;
if (from_x >= to_x)
{
size_t sz = updateFrame(values, 8);
values.push_back('\0');
offsets.push_back(offsets.empty() ? sz + 1 : offsets.back() + sz + 1);
return;
}
PaddedPODArray<Y> histogram(width, 0);
PaddedPODArray<UInt64> count_histogram(width, 0); /// The number of points in each bucket
for (const auto & point : data.points)
{
if (point.getKey() < from_x || to_x < point.getKey())
continue;
X delta = to_x - from_x;
if (delta < std::numeric_limits<X>::max())
delta = delta + 1;
X value = point.getKey() - from_x;
Float64 w = histogram.size();
size_t index = std::min<size_t>(static_cast<size_t>(w / delta * value), histogram.size() - 1);
Y res;
bool has_overflow = false;
if constexpr (std::is_floating_point_v<Y>)
res = histogram[index] + point.getMapped();
else
has_overflow = common::addOverflow(histogram[index], point.getMapped(), res);
if (unlikely(has_overflow))
{
/// In case of overflow, just saturate
/// Do not count new values, because we do not know how many of them were added
histogram[index] = std::numeric_limits<Y>::max();
}
else
{
histogram[index] = res;
count_histogram[index] += 1;
}
}
for (size_t i = 0; i < histogram.size(); ++i)
{
if (count_histogram[i] > 0)
histogram[i] /= count_histogram[i];
}
Y y_max = 0;
for (auto & y : histogram)
{
if (isNaN(y) || y <= 0)
continue;
y_max = std::max(y_max, y);
}
if (y_max == 0)
{
values.push_back('\0');
offsets.push_back(offsets.empty() ? 1 : offsets.back() + 1);
return;
}
/// Scale the histogram to the range [0, BAR_LEVELS]
for (auto & y : histogram)
{
if (isNaN(y) || y <= 0)
{
y = 0;
continue;
}
constexpr auto levels_num = static_cast<Y>(BAR_LEVELS - 1);
if constexpr (std::is_floating_point_v<Y>)
{
y = y / (y_max / levels_num) + 1;
}
else
{
Y scaled;
bool has_overflow = common::mulOverflow<Y>(y, levels_num, scaled);
if (has_overflow)
y = y / (y_max / levels_num) + 1;
else
y = scaled / y_max + 1;
}
}
size_t sz = 0;
for (const auto & y : histogram)
sz += updateFrame(values, y);
values.push_back('\0');
offsets.push_back(offsets.empty() ? sz + 1 : offsets.back() + sz + 1);
}
public:
AggregateFunctionSparkbar(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionSparkbarData<X, Y>, AggregateFunctionSparkbar>(arguments, params, std::make_shared<DataTypeString>())
, width(params.empty() ? 0 : params.at(0).safeGet<UInt64>())
, is_specified_range_x(params.size() >= 3)
, begin_x(is_specified_range_x ? static_cast<X>(params.at(1).safeGet<X>()) : std::numeric_limits<X>::min())
, end_x(is_specified_range_x ? static_cast<X>(params.at(2).safeGet<X>()) : std::numeric_limits<X>::max())
{
if (width < 2 || 1024 < width)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter width must be in range [2, 1024]");
if (begin_x >= end_x)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter `min_x` must be less than `max_x`");
}
String getName() const override
{
return "sparkbar";
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * /*arena*/) const override
{
X x = assert_cast<const ColumnVector<X> *>(columns[0])->getData()[row_num];
if (begin_x <= x && x <= end_x)
{
Y y = assert_cast<const ColumnVector<Y> *>(columns[1])->getData()[row_num];
this->data(place).add(x, y);
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr __restrict rhs, Arena * /*arena*/) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
bool allocatesMemoryInArena() const override { return false; }
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena * /*arena*/) const override
{
auto & to_column = assert_cast<ColumnString &>(to);
const auto & data = this->data(place);
render(to_column, data);
}
};
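/// A minimal illustrative sketch (not part of this file) of the bucketing in render()
/// above: an x value maps to bucket width / delta * (x - from_x), clamped to the last
/// bucket, where delta covers the inclusive range [from_x, to_x].
///
///     size_t width = 4;
///     UInt64 from_x = 0;
///     Float64 delta = 10; /// to_x = 9, plus the +1 correction for the inclusive range
///     for (UInt64 x : {0, 4, 9})
///     {
///         size_t index = std::min<size_t>(static_cast<size_t>(width / delta * (x - from_x)), width - 1);
///         /// x = 0 -> bucket 0, x = 4 -> bucket 1, x = 9 -> bucket 3
///     }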
template <template <typename, typename> class AggregateFunctionTemplate, typename Data, typename ... TArgs>
IAggregateFunction * createWithUIntegerOrTimeType(const std::string & name, const IDataType & argument_type, TArgs && ... args)
{

View File

@ -1,323 +0,0 @@
#pragma once
#include <base/arithmeticOverflow.h>
#include <array>
#include <string_view>
#include <DataTypes/DataTypeString.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <base/range.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Columns/ColumnString.h>
#include <Common/PODArray.h>
#include <IO/ReadBufferFromString.h>
#include <Common/HashTable/HashMap.h>
#include <Columns/IColumn.h>
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
template<typename X, typename Y>
struct AggregateFunctionSparkbarData
{
/// TODO: calculate histogram instead of storing all points
using Points = HashMap<X, Y>;
Points points;
X min_x = std::numeric_limits<X>::max();
X max_x = std::numeric_limits<X>::lowest();
Y min_y = std::numeric_limits<Y>::max();
Y max_y = std::numeric_limits<Y>::lowest();
Y insert(const X & x, const Y & y)
{
if (isNaN(y) || y <= 0)
return 0;
auto [it, inserted] = points.insert({x, y});
if (!inserted)
{
if constexpr (std::is_floating_point_v<Y>)
{
it->getMapped() += y;
return it->getMapped();
}
else
{
Y res;
bool has_overflow = common::addOverflow(it->getMapped(), y, res);
it->getMapped() = has_overflow ? std::numeric_limits<Y>::max() : res;
}
}
return it->getMapped();
}
void add(X x, Y y)
{
auto new_y = insert(x, y);
min_x = std::min(x, min_x);
max_x = std::max(x, max_x);
min_y = std::min(y, min_y);
max_y = std::max(new_y, max_y);
}
void merge(const AggregateFunctionSparkbarData & other)
{
if (other.points.empty())
return;
for (auto & point : other.points)
{
auto new_y = insert(point.getKey(), point.getMapped());
max_y = std::max(new_y, max_y);
}
min_x = std::min(other.min_x, min_x);
max_x = std::max(other.max_x, max_x);
min_y = std::min(other.min_y, min_y);
max_y = std::max(other.max_y, max_y);
}
void serialize(WriteBuffer & buf) const
{
writeBinary(min_x, buf);
writeBinary(max_x, buf);
writeBinary(min_y, buf);
writeBinary(max_y, buf);
writeVarUInt(points.size(), buf);
for (const auto & elem : points)
{
writeBinary(elem.getKey(), buf);
writeBinary(elem.getMapped(), buf);
}
}
void deserialize(ReadBuffer & buf)
{
readBinary(min_x, buf);
readBinary(max_x, buf);
readBinary(min_y, buf);
readBinary(max_y, buf);
size_t size;
readVarUInt(size, buf);
X x;
Y y;
for (size_t i = 0; i < size; ++i)
{
readBinary(x, buf);
readBinary(y, buf);
insert(x, y);
}
}
};
template<typename X, typename Y>
class AggregateFunctionSparkbar final
: public IAggregateFunctionDataHelper<AggregateFunctionSparkbarData<X, Y>, AggregateFunctionSparkbar<X, Y>>
{
private:
static constexpr size_t BAR_LEVELS = 8;
const size_t width = 0;
/// Range for x specified in parameters.
const bool is_specified_range_x = false;
const X begin_x = std::numeric_limits<X>::min();
const X end_x = std::numeric_limits<X>::max();
size_t updateFrame(ColumnString::Chars & frame, Y value) const
{
static constexpr std::array<std::string_view, BAR_LEVELS + 1> bars{" ", "▁", "▂", "▃", "▄", "▅", "▆", "▇", "█"};
const auto & bar = (isNaN(value) || value < 1 || static_cast<Y>(BAR_LEVELS) < value) ? bars[0] : bars[static_cast<UInt8>(value)];
frame.insert(bar.begin(), bar.end());
return bar.size();
}
/**
* The minimum value of y is rendered as the lowest height "▁",
* the maximum value of y is rendered as the highest height "█", and the middle value will be rendered proportionally.
* If a bucket has no y value, it will be rendered as " ".
*/
void render(ColumnString & to_column, const AggregateFunctionSparkbarData<X, Y> & data) const
{
auto & values = to_column.getChars();
auto & offsets = to_column.getOffsets();
if (data.points.empty())
{
values.push_back('\0');
offsets.push_back(offsets.empty() ? 1 : offsets.back() + 1);
return;
}
auto from_x = is_specified_range_x ? begin_x : data.min_x;
auto to_x = is_specified_range_x ? end_x : data.max_x;
if (from_x >= to_x)
{
size_t sz = updateFrame(values, 8);
values.push_back('\0');
offsets.push_back(offsets.empty() ? sz + 1 : offsets.back() + sz + 1);
return;
}
PaddedPODArray<Y> histogram(width, 0);
PaddedPODArray<UInt64> count_histogram(width, 0); /// The number of points in each bucket
for (const auto & point : data.points)
{
if (point.getKey() < from_x || to_x < point.getKey())
continue;
X delta = to_x - from_x;
if (delta < std::numeric_limits<X>::max())
delta = delta + 1;
X value = point.getKey() - from_x;
Float64 w = histogram.size();
size_t index = std::min<size_t>(static_cast<size_t>(w / delta * value), histogram.size() - 1);
Y res;
bool has_overflow = false;
if constexpr (std::is_floating_point_v<Y>)
res = histogram[index] + point.getMapped();
else
has_overflow = common::addOverflow(histogram[index], point.getMapped(), res);
if (unlikely(has_overflow))
{
/// In case of overflow, just saturate
/// Do not count new values, because we do not know how many of them were added
histogram[index] = std::numeric_limits<Y>::max();
}
else
{
histogram[index] = res;
count_histogram[index] += 1;
}
}
for (size_t i = 0; i < histogram.size(); ++i)
{
if (count_histogram[i] > 0)
histogram[i] /= count_histogram[i];
}
Y y_max = 0;
for (auto & y : histogram)
{
if (isNaN(y) || y <= 0)
continue;
y_max = std::max(y_max, y);
}
if (y_max == 0)
{
values.push_back('\0');
offsets.push_back(offsets.empty() ? 1 : offsets.back() + 1);
return;
}
/// Scale the histogram to the range [0, BAR_LEVELS]
for (auto & y : histogram)
{
if (isNaN(y) || y <= 0)
{
y = 0;
continue;
}
constexpr auto levels_num = static_cast<Y>(BAR_LEVELS - 1);
if constexpr (std::is_floating_point_v<Y>)
{
y = y / (y_max / levels_num) + 1;
}
else
{
Y scaled;
bool has_overflow = common::mulOverflow<Y>(y, levels_num, scaled);
if (has_overflow)
y = y / (y_max / levels_num) + 1;
else
y = scaled / y_max + 1;
}
}
size_t sz = 0;
for (const auto & y : histogram)
sz += updateFrame(values, y);
values.push_back('\0');
offsets.push_back(offsets.empty() ? sz + 1 : offsets.back() + sz + 1);
}
public:
AggregateFunctionSparkbar(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionSparkbarData<X, Y>, AggregateFunctionSparkbar>(arguments, params, std::make_shared<DataTypeString>())
, width(params.empty() ? 0 : params.at(0).safeGet<UInt64>())
, is_specified_range_x(params.size() >= 3)
, begin_x(is_specified_range_x ? static_cast<X>(params.at(1).safeGet<X>()) : std::numeric_limits<X>::min())
, end_x(is_specified_range_x ? static_cast<X>(params.at(2).safeGet<X>()) : std::numeric_limits<X>::max())
{
if (width < 2 || 1024 < width)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter width must be in range [2, 1024]");
if (begin_x >= end_x)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter `min_x` must be less than `max_x`");
}
String getName() const override
{
return "sparkbar";
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * /*arena*/) const override
{
X x = assert_cast<const ColumnVector<X> *>(columns[0])->getData()[row_num];
if (begin_x <= x && x <= end_x)
{
Y y = assert_cast<const ColumnVector<Y> *>(columns[1])->getData()[row_num];
this->data(place).add(x, y);
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr __restrict rhs, Arena * /*arena*/) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
bool allocatesMemoryInArena() const override { return false; }
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena * /*arena*/) const override
{
auto & to_column = assert_cast<ColumnString &>(to);
const auto & data = this->data(place);
render(to_column, data);
}
};
}

View File

@ -1,7 +1,15 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/AggregateFunctionStatistics.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <DataTypes/DataTypesNumber.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <Columns/ColumnsNumber.h>
#include <Common/assert_cast.h>
#include <cmath>
namespace DB
@ -16,6 +24,454 @@ namespace ErrorCodes
namespace
{
/// This function returns true if both values are large and comparable.
/// It is used to calculate the mean value by merging two sources.
/// It means that if the sizes of both sources are large and comparable, then we must apply a special
/// formula guaranteeing more stability.
bool areComparable(UInt64 a, UInt64 b)
{
const Float64 sensitivity = 0.001;
const UInt64 threshold = 10000;
if ((a == 0) || (b == 0))
return false;
auto res = std::minmax(a, b);
return (((1 - static_cast<Float64>(res.first) / res.second) < sensitivity) && (res.first > threshold));
}
/** Statistical aggregate functions
* varSamp - sample variance
* stddevSamp - sample standard deviation
* varPop - population variance
* stddevPop - population standard deviation
* covarSamp - sample covariance
* covarPop - population covariance
* corr - Pearson correlation coefficient
*/
/** Parallel and incremental algorithm for calculating variance.
* Source: "Updating formulae and a pairwise algorithm for computing sample variances"
* (Chan et al., Stanford University, 12.1979)
*/
template <typename T, typename Op>
class AggregateFunctionVarianceData
{
public:
void update(const IColumn & column, size_t row_num)
{
T received = assert_cast<const ColumnVector<T> &>(column).getData()[row_num];
Float64 val = static_cast<Float64>(received);
Float64 delta = val - mean;
++count;
mean += delta / count;
m2 += delta * (val - mean);
}
void mergeWith(const AggregateFunctionVarianceData & source)
{
UInt64 total_count = count + source.count;
if (total_count == 0)
return;
Float64 factor = static_cast<Float64>(count * source.count) / total_count;
Float64 delta = mean - source.mean;
if (areComparable(count, source.count))
mean = (source.count * source.mean + count * mean) / total_count;
else
mean = source.mean + delta * (static_cast<Float64>(count) / total_count);
m2 += source.m2 + delta * delta * factor;
count = total_count;
}
void serialize(WriteBuffer & buf) const
{
writeVarUInt(count, buf);
writeBinary(mean, buf);
writeBinary(m2, buf);
}
void deserialize(ReadBuffer & buf)
{
readVarUInt(count, buf);
readBinary(mean, buf);
readBinary(m2, buf);
}
void publish(IColumn & to) const
{
assert_cast<ColumnFloat64 &>(to).getData().push_back(Op::apply(m2, count));
}
private:
UInt64 count = 0;
Float64 mean = 0.0;
Float64 m2 = 0.0;
};
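/// A worked example (not part of this file) of mergeWith() above, following Chan et al.:
/// merging the partial states of {1, 2} and {4, 7} reproduces the single-pass result.
///
///     this:   count = 2, mean = 1.5, m2 = 0.5
///     source: count = 2, mean = 5.5, m2 = 4.5
///     merged: total_count = 4
///             delta  = 1.5 - 5.5 = -4
///             factor = 2 * 2 / 4 = 1
///             mean   = 5.5 + (-4) * (2 / 4.0) = 3.5
///             m2     = 0.5 + 4.5 + (-4) * (-4) * 1 = 21 /// equals sum((x - 3.5)^2) over {1, 2, 4, 7}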
/** The main code for the implementation of varSamp, stddevSamp, varPop, stddevPop.
*/
template <typename T, typename Op>
class AggregateFunctionVariance final
: public IAggregateFunctionDataHelper<AggregateFunctionVarianceData<T, Op>, AggregateFunctionVariance<T, Op>>
{
public:
explicit AggregateFunctionVariance(const DataTypePtr & arg)
: IAggregateFunctionDataHelper<AggregateFunctionVarianceData<T, Op>, AggregateFunctionVariance<T, Op>>({arg}, {}, std::make_shared<DataTypeFloat64>())
{}
String getName() const override { return Op::name; }
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).update(*columns[0], row_num);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).mergeWith(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
this->data(place).publish(to);
}
};
/** Implementing the varSamp function.
*/
struct AggregateFunctionVarSampImpl
{
static constexpr auto name = "varSampStable";
static inline Float64 apply(Float64 m2, UInt64 count)
{
if (count < 2)
return std::numeric_limits<Float64>::infinity();
else
return m2 / (count - 1);
}
};
/** Implementing the stddevSamp function.
*/
struct AggregateFunctionStdDevSampImpl
{
static constexpr auto name = "stddevSampStable";
static inline Float64 apply(Float64 m2, UInt64 count)
{
return sqrt(AggregateFunctionVarSampImpl::apply(m2, count));
}
};
/** Implementing the varPop function.
*/
struct AggregateFunctionVarPopImpl
{
static constexpr auto name = "varPopStable";
static inline Float64 apply(Float64 m2, UInt64 count)
{
if (count == 0)
return std::numeric_limits<Float64>::infinity();
else if (count == 1)
return 0.0;
else
return m2 / count;
}
};
/** Implementing the stddevPop function.
*/
struct AggregateFunctionStdDevPopImpl
{
static constexpr auto name = "stddevPopStable";
static inline Float64 apply(Float64 m2, UInt64 count)
{
return sqrt(AggregateFunctionVarPopImpl::apply(m2, count));
}
};
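/// A worked example (not part of this file) of the four estimators above, applied to
/// the merged state from the {1, 2, 4, 7} example (m2 = 21, count = 4):
///
///     varSampStable:    m2 / (count - 1) = 7
///     stddevSampStable: sqrt(7)          ~ 2.6458
///     varPopStable:     m2 / count       = 5.25
///     stddevPopStable:  sqrt(5.25)       ~ 2.2913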
/** If the `compute_marginal_moments` flag is set, this class provides its successor,
* CovarianceData, with support for the marginal moments needed to calculate the correlation.
*/
template <bool compute_marginal_moments>
class BaseCovarianceData
{
protected:
void incrementMarginalMoments(Float64, Float64) {}
void mergeWith(const BaseCovarianceData &) {}
void serialize(WriteBuffer &) const {}
void deserialize(const ReadBuffer &) {}
};
template <>
class BaseCovarianceData<true>
{
protected:
void incrementMarginalMoments(Float64 left_incr, Float64 right_incr)
{
left_m2 += left_incr;
right_m2 += right_incr;
}
void mergeWith(const BaseCovarianceData & source)
{
left_m2 += source.left_m2;
right_m2 += source.right_m2;
}
void serialize(WriteBuffer & buf) const
{
writeBinary(left_m2, buf);
writeBinary(right_m2, buf);
}
void deserialize(ReadBuffer & buf)
{
readBinary(left_m2, buf);
readBinary(right_m2, buf);
}
Float64 left_m2 = 0.0;
Float64 right_m2 = 0.0;
};
/** Parallel and incremental algorithm for calculating covariance.
* Source: "Numerically Stable, Single-Pass, Parallel Statistics Algorithms"
* (J. Bennett et al., Sandia National Laboratories,
* 2009 IEEE International Conference on Cluster Computing)
*/
template <typename T, typename U, typename Op, bool compute_marginal_moments>
class CovarianceData : public BaseCovarianceData<compute_marginal_moments>
{
private:
using Base = BaseCovarianceData<compute_marginal_moments>;
public:
void update(const IColumn & column_left, const IColumn & column_right, size_t row_num)
{
T left_received = assert_cast<const ColumnVector<T> &>(column_left).getData()[row_num];
Float64 left_val = static_cast<Float64>(left_received);
Float64 left_delta = left_val - left_mean;
U right_received = assert_cast<const ColumnVector<U> &>(column_right).getData()[row_num];
Float64 right_val = static_cast<Float64>(right_received);
Float64 right_delta = right_val - right_mean;
Float64 old_right_mean = right_mean;
++count;
left_mean += left_delta / count;
right_mean += right_delta / count;
co_moment += (left_val - left_mean) * (right_val - old_right_mean);
/// Update the marginal moments, if any.
if (compute_marginal_moments)
{
Float64 left_incr = left_delta * (left_val - left_mean);
Float64 right_incr = right_delta * (right_val - right_mean);
Base::incrementMarginalMoments(left_incr, right_incr);
}
}
void mergeWith(const CovarianceData & source)
{
UInt64 total_count = count + source.count;
if (total_count == 0)
return;
Float64 factor = static_cast<Float64>(count * source.count) / total_count;
Float64 left_delta = left_mean - source.left_mean;
Float64 right_delta = right_mean - source.right_mean;
if (areComparable(count, source.count))
{
left_mean = (source.count * source.left_mean + count * left_mean) / total_count;
right_mean = (source.count * source.right_mean + count * right_mean) / total_count;
}
else
{
left_mean = source.left_mean + left_delta * (static_cast<Float64>(count) / total_count);
right_mean = source.right_mean + right_delta * (static_cast<Float64>(count) / total_count);
}
co_moment += source.co_moment + left_delta * right_delta * factor;
count = total_count;
/// Update the marginal moments, if any.
if (compute_marginal_moments)
{
Float64 left_incr = left_delta * left_delta * factor;
Float64 right_incr = right_delta * right_delta * factor;
Base::mergeWith(source);
Base::incrementMarginalMoments(left_incr, right_incr);
}
}
void serialize(WriteBuffer & buf) const
{
writeVarUInt(count, buf);
writeBinary(left_mean, buf);
writeBinary(right_mean, buf);
writeBinary(co_moment, buf);
Base::serialize(buf);
}
void deserialize(ReadBuffer & buf)
{
readVarUInt(count, buf);
readBinary(left_mean, buf);
readBinary(right_mean, buf);
readBinary(co_moment, buf);
Base::deserialize(buf);
}
void publish(IColumn & to) const
{
if constexpr (compute_marginal_moments)
assert_cast<ColumnFloat64 &>(to).getData().push_back(Op::apply(co_moment, Base::left_m2, Base::right_m2, count));
else
assert_cast<ColumnFloat64 &>(to).getData().push_back(Op::apply(co_moment, count));
}
private:
UInt64 count = 0;
Float64 left_mean = 0.0;
Float64 right_mean = 0.0;
Float64 co_moment = 0.0;
};
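/// A worked example (not part of this file) of the state that update() above
/// accumulates for the perfectly linear pairs (1, 2), (2, 4), (3, 6):
///
///     count = 3, left_mean = 2, right_mean = 4
///     co_moment = sum((x - 2) * (y - 4)) = 4
///     left_m2 = 2, right_m2 = 8
///     covarSampStable: co_moment / (count - 1) = 2
///     corrStable:      co_moment / sqrt(left_m2 * right_m2) = 4 / sqrt(16) = 1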
template <typename T, typename U, typename Op, bool compute_marginal_moments = false>
class AggregateFunctionCovariance final
: public IAggregateFunctionDataHelper<
CovarianceData<T, U, Op, compute_marginal_moments>,
AggregateFunctionCovariance<T, U, Op, compute_marginal_moments>>
{
public:
explicit AggregateFunctionCovariance(const DataTypes & args) : IAggregateFunctionDataHelper<
CovarianceData<T, U, Op, compute_marginal_moments>,
AggregateFunctionCovariance<T, U, Op, compute_marginal_moments>>(args, {}, std::make_shared<DataTypeFloat64>())
{}
String getName() const override { return Op::name; }
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).update(*columns[0], *columns[1], row_num);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).mergeWith(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
this->data(place).publish(to);
}
};
/** Implementing the covarSamp function.
*/
struct AggregateFunctionCovarSampImpl
{
static constexpr auto name = "covarSampStable";
static inline Float64 apply(Float64 co_moment, UInt64 count)
{
if (count < 2)
return std::numeric_limits<Float64>::infinity();
else
return co_moment / (count - 1);
}
};
/** Implementing the covarPop function.
*/
struct AggregateFunctionCovarPopImpl
{
static constexpr auto name = "covarPopStable";
static inline Float64 apply(Float64 co_moment, UInt64 count)
{
if (count == 0)
return std::numeric_limits<Float64>::infinity();
else if (count == 1)
return 0.0;
else
return co_moment / count;
}
};
/** `corr` function implementation.
*/
struct AggregateFunctionCorrImpl
{
static constexpr auto name = "corrStable";
static inline Float64 apply(Float64 co_moment, Float64 left_m2, Float64 right_m2, UInt64 count)
{
if (count < 2)
return std::numeric_limits<Float64>::infinity();
else
return co_moment / sqrt(left_m2 * right_m2);
}
};
template <typename T>
using AggregateFunctionVarSampStable = AggregateFunctionVariance<T, AggregateFunctionVarSampImpl>;
template <typename T>
using AggregateFunctionStddevSampStable = AggregateFunctionVariance<T, AggregateFunctionStdDevSampImpl>;
template <typename T>
using AggregateFunctionVarPopStable = AggregateFunctionVariance<T, AggregateFunctionVarPopImpl>;
template <typename T>
using AggregateFunctionStddevPopStable = AggregateFunctionVariance<T, AggregateFunctionStdDevPopImpl>;
template <typename T, typename U>
using AggregateFunctionCovarSampStable = AggregateFunctionCovariance<T, U, AggregateFunctionCovarSampImpl>;
template <typename T, typename U>
using AggregateFunctionCovarPopStable = AggregateFunctionCovariance<T, U, AggregateFunctionCovarPopImpl>;
template <typename T, typename U>
using AggregateFunctionCorrStable = AggregateFunctionCovariance<T, U, AggregateFunctionCorrImpl, true>;
template <template <typename> typename FunctionTemplate>
AggregateFunctionPtr createAggregateFunctionStatisticsUnary(
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)

View File

@ -1,468 +0,0 @@
#pragma once
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <DataTypes/DataTypesNumber.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <Columns/ColumnsNumber.h>
#include <Common/assert_cast.h>
#include <cmath>
namespace DB
{
struct Settings;
namespace detail
{
/// This function returns true if both values are large and comparable.
/// It is used to calculate the mean value by merging two sources.
/// It means that if the sizes of both sources are large and comparable, then we must apply a special
/// formula guaranteeing more stability.
bool areComparable(UInt64 a, UInt64 b)
{
const Float64 sensitivity = 0.001;
const UInt64 threshold = 10000;
if ((a == 0) || (b == 0))
return false;
auto res = std::minmax(a, b);
return (((1 - static_cast<Float64>(res.first) / res.second) < sensitivity) && (res.first > threshold));
}
}
/** Statistical aggregate functions
* varSamp - sample variance
* stddevSamp - sample standard deviation
* varPop - population variance
* stddevPop - population standard deviation
* covarSamp - sample covariance
* covarPop - population covariance
* corr - Pearson correlation coefficient
*/
/** Parallel and incremental algorithm for calculating variance.
* Source: "Updating formulae and a pairwise algorithm for computing sample variances"
* (Chan et al., Stanford University, 12.1979)
*/
template <typename T, typename Op>
class AggregateFunctionVarianceData
{
public:
void update(const IColumn & column, size_t row_num)
{
T received = assert_cast<const ColumnVector<T> &>(column).getData()[row_num];
Float64 val = static_cast<Float64>(received);
Float64 delta = val - mean;
++count;
mean += delta / count;
m2 += delta * (val - mean);
}
void mergeWith(const AggregateFunctionVarianceData & source)
{
UInt64 total_count = count + source.count;
if (total_count == 0)
return;
Float64 factor = static_cast<Float64>(count * source.count) / total_count;
Float64 delta = mean - source.mean;
if (detail::areComparable(count, source.count))
mean = (source.count * source.mean + count * mean) / total_count;
else
mean = source.mean + delta * (static_cast<Float64>(count) / total_count);
m2 += source.m2 + delta * delta * factor;
count = total_count;
}
void serialize(WriteBuffer & buf) const
{
writeVarUInt(count, buf);
writeBinary(mean, buf);
writeBinary(m2, buf);
}
void deserialize(ReadBuffer & buf)
{
readVarUInt(count, buf);
readBinary(mean, buf);
readBinary(m2, buf);
}
void publish(IColumn & to) const
{
assert_cast<ColumnFloat64 &>(to).getData().push_back(Op::apply(m2, count));
}
private:
UInt64 count = 0;
Float64 mean = 0.0;
Float64 m2 = 0.0;
};
/** The main code for the implementation of varSamp, stddevSamp, varPop, stddevPop.
*/
template <typename T, typename Op>
class AggregateFunctionVariance final
: public IAggregateFunctionDataHelper<AggregateFunctionVarianceData<T, Op>, AggregateFunctionVariance<T, Op>>
{
public:
explicit AggregateFunctionVariance(const DataTypePtr & arg)
: IAggregateFunctionDataHelper<AggregateFunctionVarianceData<T, Op>, AggregateFunctionVariance<T, Op>>({arg}, {}, std::make_shared<DataTypeFloat64>())
{}
String getName() const override { return Op::name; }
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).update(*columns[0], row_num);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).mergeWith(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
this->data(place).publish(to);
}
};
/** Implementing the varSamp function.
*/
struct AggregateFunctionVarSampImpl
{
static constexpr auto name = "varSampStable";
static inline Float64 apply(Float64 m2, UInt64 count)
{
if (count < 2)
return std::numeric_limits<Float64>::infinity();
else
return m2 / (count - 1);
}
};
/** Implementing the stddevSamp function.
*/
struct AggregateFunctionStdDevSampImpl
{
static constexpr auto name = "stddevSampStable";
static inline Float64 apply(Float64 m2, UInt64 count)
{
return sqrt(AggregateFunctionVarSampImpl::apply(m2, count));
}
};
/** Implementing the varPop function.
*/
struct AggregateFunctionVarPopImpl
{
static constexpr auto name = "varPopStable";
static inline Float64 apply(Float64 m2, UInt64 count)
{
if (count == 0)
return std::numeric_limits<Float64>::infinity();
else if (count == 1)
return 0.0;
else
return m2 / count;
}
};
/** Implementing the stddevPop function.
*/
struct AggregateFunctionStdDevPopImpl
{
static constexpr auto name = "stddevPopStable";
static inline Float64 apply(Float64 m2, UInt64 count)
{
return sqrt(AggregateFunctionVarPopImpl::apply(m2, count));
}
};
/** If the `compute_marginal_moments` flag is set, this class provides its successor,
* CovarianceData, with support for the marginal moments needed to calculate the correlation.
*/
template <bool compute_marginal_moments>
class BaseCovarianceData
{
protected:
void incrementMarginalMoments(Float64, Float64) {}
void mergeWith(const BaseCovarianceData &) {}
void serialize(WriteBuffer &) const {}
void deserialize(const ReadBuffer &) {}
};
template <>
class BaseCovarianceData<true>
{
protected:
void incrementMarginalMoments(Float64 left_incr, Float64 right_incr)
{
left_m2 += left_incr;
right_m2 += right_incr;
}
void mergeWith(const BaseCovarianceData & source)
{
left_m2 += source.left_m2;
right_m2 += source.right_m2;
}
void serialize(WriteBuffer & buf) const
{
writeBinary(left_m2, buf);
writeBinary(right_m2, buf);
}
void deserialize(ReadBuffer & buf)
{
readBinary(left_m2, buf);
readBinary(right_m2, buf);
}
Float64 left_m2 = 0.0;
Float64 right_m2 = 0.0;
};
/** Parallel and incremental algorithm for calculating covariance.
* Source: "Numerically Stable, Single-Pass, Parallel Statistics Algorithms"
* (J. Bennett et al., Sandia National Laboratories,
* 2009 IEEE International Conference on Cluster Computing)
*/
template <typename T, typename U, typename Op, bool compute_marginal_moments>
class CovarianceData : public BaseCovarianceData<compute_marginal_moments>
{
private:
using Base = BaseCovarianceData<compute_marginal_moments>;
public:
void update(const IColumn & column_left, const IColumn & column_right, size_t row_num)
{
T left_received = assert_cast<const ColumnVector<T> &>(column_left).getData()[row_num];
Float64 left_val = static_cast<Float64>(left_received);
Float64 left_delta = left_val - left_mean;
U right_received = assert_cast<const ColumnVector<U> &>(column_right).getData()[row_num];
Float64 right_val = static_cast<Float64>(right_received);
Float64 right_delta = right_val - right_mean;
Float64 old_right_mean = right_mean;
++count;
left_mean += left_delta / count;
right_mean += right_delta / count;
co_moment += (left_val - left_mean) * (right_val - old_right_mean);
/// Update the marginal moments, if any.
if (compute_marginal_moments)
{
Float64 left_incr = left_delta * (left_val - left_mean);
Float64 right_incr = right_delta * (right_val - right_mean);
Base::incrementMarginalMoments(left_incr, right_incr);
}
}
void mergeWith(const CovarianceData & source)
{
UInt64 total_count = count + source.count;
if (total_count == 0)
return;
Float64 factor = static_cast<Float64>(count * source.count) / total_count;
Float64 left_delta = left_mean - source.left_mean;
Float64 right_delta = right_mean - source.right_mean;
if (detail::areComparable(count, source.count))
{
left_mean = (source.count * source.left_mean + count * left_mean) / total_count;
right_mean = (source.count * source.right_mean + count * right_mean) / total_count;
}
else
{
left_mean = source.left_mean + left_delta * (static_cast<Float64>(count) / total_count);
right_mean = source.right_mean + right_delta * (static_cast<Float64>(count) / total_count);
}
co_moment += source.co_moment + left_delta * right_delta * factor;
count = total_count;
/// Update the marginal moments, if any.
if (compute_marginal_moments)
{
Float64 left_incr = left_delta * left_delta * factor;
Float64 right_incr = right_delta * right_delta * factor;
Base::mergeWith(source);
Base::incrementMarginalMoments(left_incr, right_incr);
}
}
void serialize(WriteBuffer & buf) const
{
writeVarUInt(count, buf);
writeBinary(left_mean, buf);
writeBinary(right_mean, buf);
writeBinary(co_moment, buf);
Base::serialize(buf);
}
void deserialize(ReadBuffer & buf)
{
readVarUInt(count, buf);
readBinary(left_mean, buf);
readBinary(right_mean, buf);
readBinary(co_moment, buf);
Base::deserialize(buf);
}
void publish(IColumn & to) const
{
if constexpr (compute_marginal_moments)
assert_cast<ColumnFloat64 &>(to).getData().push_back(Op::apply(co_moment, Base::left_m2, Base::right_m2, count));
else
assert_cast<ColumnFloat64 &>(to).getData().push_back(Op::apply(co_moment, count));
}
private:
UInt64 count = 0;
Float64 left_mean = 0.0;
Float64 right_mean = 0.0;
Float64 co_moment = 0.0;
};
template <typename T, typename U, typename Op, bool compute_marginal_moments = false>
class AggregateFunctionCovariance final
: public IAggregateFunctionDataHelper<
CovarianceData<T, U, Op, compute_marginal_moments>,
AggregateFunctionCovariance<T, U, Op, compute_marginal_moments>>
{
public:
explicit AggregateFunctionCovariance(const DataTypes & args) : IAggregateFunctionDataHelper<
CovarianceData<T, U, Op, compute_marginal_moments>,
AggregateFunctionCovariance<T, U, Op, compute_marginal_moments>>(args, {}, std::make_shared<DataTypeFloat64>())
{}
String getName() const override { return Op::name; }
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).update(*columns[0], *columns[1], row_num);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).mergeWith(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
this->data(place).publish(to);
}
};
/** Implementing the covarSamp function.
*/
struct AggregateFunctionCovarSampImpl
{
static constexpr auto name = "covarSampStable";
static inline Float64 apply(Float64 co_moment, UInt64 count)
{
if (count < 2)
return std::numeric_limits<Float64>::infinity();
else
return co_moment / (count - 1);
}
};
/** Implementing the covarPop function.
*/
struct AggregateFunctionCovarPopImpl
{
static constexpr auto name = "covarPopStable";
static inline Float64 apply(Float64 co_moment, UInt64 count)
{
if (count == 0)
return std::numeric_limits<Float64>::infinity();
else if (count == 1)
return 0.0;
else
return co_moment / count;
}
};
/** `corr` function implementation.
*/
struct AggregateFunctionCorrImpl
{
static constexpr auto name = "corrStable";
static inline Float64 apply(Float64 co_moment, Float64 left_m2, Float64 right_m2, UInt64 count)
{
if (count < 2)
return std::numeric_limits<Float64>::infinity();
else
return co_moment / sqrt(left_m2 * right_m2);
}
};
template <typename T>
using AggregateFunctionVarSampStable = AggregateFunctionVariance<T, AggregateFunctionVarSampImpl>;
template <typename T>
using AggregateFunctionStddevSampStable = AggregateFunctionVariance<T, AggregateFunctionStdDevSampImpl>;
template <typename T>
using AggregateFunctionVarPopStable = AggregateFunctionVariance<T, AggregateFunctionVarPopImpl>;
template <typename T>
using AggregateFunctionStddevPopStable = AggregateFunctionVariance<T, AggregateFunctionStdDevPopImpl>;
template <typename T, typename U>
using AggregateFunctionCovarSampStable = AggregateFunctionCovariance<T, U, AggregateFunctionCovarSampImpl>;
template <typename T, typename U>
using AggregateFunctionCovarPopStable = AggregateFunctionCovariance<T, U, AggregateFunctionCovarPopImpl>;
template <typename T, typename U>
using AggregateFunctionCorrStable = AggregateFunctionCovariance<T, U, AggregateFunctionCorrImpl, true>;
}

View File

@ -1,7 +1,8 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionSumCount.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <DataTypes/DataTypeTuple.h>
#include <AggregateFunctions/AggregateFunctionAvg.h>
namespace DB
@ -16,6 +17,59 @@ namespace ErrorCodes
namespace
{
template <typename T>
class AggregateFunctionSumCount final : public AggregateFunctionAvg<T>
{
public:
using Base = AggregateFunctionAvg<T>;
explicit AggregateFunctionSumCount(const DataTypes & argument_types_, UInt32 num_scale_ = 0)
: Base(argument_types_, createResultType(num_scale_), num_scale_)
{}
static DataTypePtr createResultType(UInt32 num_scale_)
{
auto second_elem = std::make_shared<DataTypeUInt64>();
return std::make_shared<DataTypeTuple>(DataTypes{getReturnTypeFirstElement(num_scale_), std::move(second_elem)});
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const final
{
assert_cast<ColumnVectorOrDecimal<AvgFieldType<T>> &>((assert_cast<ColumnTuple &>(to)).getColumn(0)).getData().push_back(
this->data(place).numerator);
assert_cast<ColumnUInt64 &>((assert_cast<ColumnTuple &>(to)).getColumn(1)).getData().push_back(
this->data(place).denominator);
}
String getName() const final { return "sumCount"; }
#if USE_EMBEDDED_COMPILER
bool isCompilable() const override
{
return false;
}
#endif
private:
static auto getReturnTypeFirstElement(UInt32 num_scale_)
{
using FieldType = AvgFieldType<T>;
if constexpr (!is_decimal<T>)
return std::make_shared<DataTypeNumber<FieldType>>();
else
{
using DataType = DataTypeDecimal<FieldType>;
return std::make_shared<DataType>(DataType::maxPrecision(), num_scale_);
}
}
};
bool allowType(const DataTypePtr& type) noexcept
{
const WhichDataType t(type);

View File

@ -1,61 +0,0 @@
#pragma once
#include <type_traits>
#include <DataTypes/DataTypeTuple.h>
#include <AggregateFunctions/AggregateFunctionAvg.h>
namespace DB
{
template <typename T>
class AggregateFunctionSumCount final : public AggregateFunctionAvg<T>
{
public:
using Base = AggregateFunctionAvg<T>;
explicit AggregateFunctionSumCount(const DataTypes & argument_types_, UInt32 num_scale_ = 0)
: Base(argument_types_, createResultType(num_scale_), num_scale_)
{}
static DataTypePtr createResultType(UInt32 num_scale_)
{
auto second_elem = std::make_shared<DataTypeUInt64>();
return std::make_shared<DataTypeTuple>(DataTypes{getReturnTypeFirstElement(num_scale_), std::move(second_elem)});
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const final
{
assert_cast<ColumnVectorOrDecimal<AvgFieldType<T>> &>((assert_cast<ColumnTuple &>(to)).getColumn(0)).getData().push_back(
this->data(place).numerator);
assert_cast<ColumnUInt64 &>((assert_cast<ColumnTuple &>(to)).getColumn(1)).getData().push_back(
this->data(place).denominator);
}
String getName() const final { return "sumCount"; }
#if USE_EMBEDDED_COMPILER
bool isCompilable() const override
{
return false;
}
#endif
private:
static auto getReturnTypeFirstElement(UInt32 num_scale_)
{
using FieldType = AvgFieldType<T>;
if constexpr (!is_decimal<T>)
return std::make_shared<DataTypeNumber<FieldType>>();
else
{
using DataType = DataTypeDecimal<FieldType>;
return std::make_shared<DataType>(DataType::maxPrecision(), num_scale_);
}
}
};
}

View File

@ -1,5 +1,4 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionTopK.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <Common/FieldVisitorConvertToNumber.h>
@ -7,6 +6,20 @@
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeIPv4andIPv6.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/ReadHelpersArena.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
#include <Columns/ColumnArray.h>
#include <Common/SpaceSaving.h>
#include <Common/assert_cast.h>
#include <AggregateFunctions/IAggregateFunction.h>
namespace DB
{
@ -25,6 +38,229 @@ namespace ErrorCodes
namespace
{
static inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF;
template <typename T>
struct AggregateFunctionTopKData
{
using Set = SpaceSaving<T, HashCRC32<T>>;
Set value;
};
template <typename T, bool is_weighted>
class AggregateFunctionTopK
: public IAggregateFunctionDataHelper<AggregateFunctionTopKData<T>, AggregateFunctionTopK<T, is_weighted>>
{
protected:
using State = AggregateFunctionTopKData<T>;
UInt64 threshold;
UInt64 reserved;
public:
AggregateFunctionTopK(UInt64 threshold_, UInt64 load_factor, const DataTypes & argument_types_, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionTopKData<T>, AggregateFunctionTopK<T, is_weighted>>(argument_types_, params, createResultType(argument_types_))
, threshold(threshold_), reserved(load_factor * threshold)
{}
AggregateFunctionTopK(UInt64 threshold_, UInt64 load_factor, const DataTypes & argument_types_, const Array & params, const DataTypePtr & result_type_)
: IAggregateFunctionDataHelper<AggregateFunctionTopKData<T>, AggregateFunctionTopK<T, is_weighted>>(argument_types_, params, result_type_)
, threshold(threshold_), reserved(load_factor * threshold)
{}
String getName() const override { return is_weighted ? "topKWeighted" : "topK"; }
static DataTypePtr createResultType(const DataTypes & argument_types_)
{
return std::make_shared<DataTypeArray>(argument_types_[0]);
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
auto & set = this->data(place).value;
if (set.capacity() != reserved)
set.resize(reserved);
if constexpr (is_weighted)
set.insert(assert_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num], columns[1]->getUInt(row_num));
else
set.insert(assert_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num]);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
auto & set = this->data(place).value;
if (set.capacity() != reserved)
set.resize(reserved);
set.merge(this->data(rhs).value);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).value.write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
auto & set = this->data(place).value;
set.resize(reserved);
set.read(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
ColumnArray & arr_to = assert_cast<ColumnArray &>(to);
ColumnArray::Offsets & offsets_to = arr_to.getOffsets();
const typename State::Set & set = this->data(place).value;
auto result_vec = set.topK(threshold);
size_t size = result_vec.size();
offsets_to.push_back(offsets_to.back() + size);
typename ColumnVector<T>::Container & data_to = assert_cast<ColumnVector<T> &>(arr_to.getData()).getData();
size_t old_size = data_to.size();
data_to.resize(old_size + size);
size_t i = 0;
for (auto it = result_vec.begin(); it != result_vec.end(); ++it, ++i)
data_to[old_size + i] = it->key;
}
};
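/// A minimal illustrative sketch (not part of this file) of the SpaceSaving state
/// behind topK, using only the calls seen above. With room for load_factor * threshold
/// counters, the structure tracks heavy hitters approximately, and topK(threshold)
/// returns the most frequent keys with their estimated counts.
///
///     SpaceSaving<UInt64, HashCRC32<UInt64>> set;
///     set.resize(3 * 2); /// reserved = load_factor (3) * threshold (2)
///     for (UInt64 x : {1, 1, 1, 2, 2, 3})
///         set.insert(x);
///     auto top = set.topK(2); /// keys 1 and 2, with their estimated counts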
/// Generic implementation, it uses serialized representation as object descriptor.
struct AggregateFunctionTopKGenericData
{
using Set = SpaceSaving<StringRef, StringRefHash>;
Set value;
};
/** The template parameter with true value should be used for columns that store their elements in memory contiguously.
* For such columns topK() can be implemented more efficiently (especially for small numeric arrays).
*/
template <bool is_plain_column, bool is_weighted>
class AggregateFunctionTopKGeneric
: public IAggregateFunctionDataHelper<AggregateFunctionTopKGenericData, AggregateFunctionTopKGeneric<is_plain_column, is_weighted>>
{
private:
using State = AggregateFunctionTopKGenericData;
UInt64 threshold;
UInt64 reserved;
static void deserializeAndInsert(StringRef str, IColumn & data_to);
public:
AggregateFunctionTopKGeneric(
UInt64 threshold_, UInt64 load_factor, const DataTypes & argument_types_, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionTopKGenericData, AggregateFunctionTopKGeneric<is_plain_column, is_weighted>>(argument_types_, params, createResultType(argument_types_))
, threshold(threshold_), reserved(load_factor * threshold) {}
String getName() const override { return is_weighted ? "topKWeighted" : "topK"; }
static DataTypePtr createResultType(const DataTypes & argument_types_)
{
return std::make_shared<DataTypeArray>(argument_types_[0]);
}
bool allocatesMemoryInArena() const override
{
return true;
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).value.write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
auto & set = this->data(place).value;
set.clear();
// Specialized here because there's no deserialiser for StringRef
size_t size = 0;
readVarUInt(size, buf);
if (unlikely(size > TOP_K_MAX_SIZE))
throw Exception(
ErrorCodes::ARGUMENT_OUT_OF_BOUND,
"Too large size ({}) for aggregate function '{}' state (maximum is {})",
size,
getName(),
TOP_K_MAX_SIZE);
set.resize(size);
for (size_t i = 0; i < size; ++i)
{
auto ref = readStringBinaryInto(*arena, buf);
UInt64 count;
UInt64 error;
readVarUInt(count, buf);
readVarUInt(error, buf);
set.insert(ref, count, error);
arena->rollback(ref.size);
}
set.readAlphaMap(buf);
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
auto & set = this->data(place).value;
if (set.capacity() != reserved)
set.resize(reserved);
if constexpr (is_plain_column)
{
if constexpr (is_weighted)
set.insert(columns[0]->getDataAt(row_num), columns[1]->getUInt(row_num));
else
set.insert(columns[0]->getDataAt(row_num));
}
else
{
const char * begin = nullptr;
StringRef str_serialized = columns[0]->serializeValueIntoArena(row_num, *arena, begin);
if constexpr (is_weighted)
set.insert(str_serialized, columns[1]->getUInt(row_num));
else
set.insert(str_serialized);
arena->rollback(str_serialized.size); /// SpaceSaving keeps its own copy of the key, so the temporary serialization can be rolled back (as after insert in deserialize above).
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
auto & set = this->data(place).value;
if (set.capacity() != reserved)
set.resize(reserved);
set.merge(this->data(rhs).value);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
ColumnArray & arr_to = assert_cast<ColumnArray &>(to);
ColumnArray::Offsets & offsets_to = arr_to.getOffsets();
IColumn & data_to = arr_to.getData();
auto result_vec = this->data(place).value.topK(threshold);
offsets_to.push_back(offsets_to.back() + result_vec.size());
for (auto & elem : result_vec)
{
if constexpr (is_plain_column)
data_to.insertData(elem.key.data, elem.key.size);
else
data_to.deserializeAndInsertFromArena(elem.key.data);
}
}
};
/// Substitute return type for Date and DateTime
template <bool is_weighted>
class AggregateFunctionTopKDate : public AggregateFunctionTopK<DataTypeDate::FieldType, is_weighted>

View File

@ -1,250 +0,0 @@
#pragma once
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <IO/ReadHelpersArena.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <Columns/ColumnArray.h>
#include <Common/SpaceSaving.h>
#include <Common/assert_cast.h>
#include <AggregateFunctions/IAggregateFunction.h>
namespace DB
{
struct Settings;
static inline constexpr UInt64 TOP_K_MAX_SIZE = 0xFFFFFF;
namespace ErrorCodes
{
extern const int ARGUMENT_OUT_OF_BOUND;
}
template <typename T>
struct AggregateFunctionTopKData
{
using Set = SpaceSaving<T, HashCRC32<T>>;
Set value;
};
template <typename T, bool is_weighted>
class AggregateFunctionTopK
: public IAggregateFunctionDataHelper<AggregateFunctionTopKData<T>, AggregateFunctionTopK<T, is_weighted>>
{
protected:
using State = AggregateFunctionTopKData<T>;
UInt64 threshold;
UInt64 reserved;
public:
AggregateFunctionTopK(UInt64 threshold_, UInt64 load_factor, const DataTypes & argument_types_, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionTopKData<T>, AggregateFunctionTopK<T, is_weighted>>(argument_types_, params, createResultType(argument_types_))
, threshold(threshold_), reserved(load_factor * threshold)
{}
AggregateFunctionTopK(UInt64 threshold_, UInt64 load_factor, const DataTypes & argument_types_, const Array & params, const DataTypePtr & result_type_)
: IAggregateFunctionDataHelper<AggregateFunctionTopKData<T>, AggregateFunctionTopK<T, is_weighted>>(argument_types_, params, result_type_)
, threshold(threshold_), reserved(load_factor * threshold)
{}
String getName() const override { return is_weighted ? "topKWeighted" : "topK"; }
static DataTypePtr createResultType(const DataTypes & argument_types_)
{
return std::make_shared<DataTypeArray>(argument_types_[0]);
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
auto & set = this->data(place).value;
if (set.capacity() != reserved)
set.resize(reserved);
if constexpr (is_weighted)
set.insert(assert_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num], columns[1]->getUInt(row_num));
else
set.insert(assert_cast<const ColumnVector<T> &>(*columns[0]).getData()[row_num]);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
auto & set = this->data(place).value;
if (set.capacity() != reserved)
set.resize(reserved);
set.merge(this->data(rhs).value);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).value.write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
auto & set = this->data(place).value;
set.resize(reserved);
set.read(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
ColumnArray & arr_to = assert_cast<ColumnArray &>(to);
ColumnArray::Offsets & offsets_to = arr_to.getOffsets();
const typename State::Set & set = this->data(place).value;
auto result_vec = set.topK(threshold);
size_t size = result_vec.size();
offsets_to.push_back(offsets_to.back() + size);
typename ColumnVector<T>::Container & data_to = assert_cast<ColumnVector<T> &>(arr_to.getData()).getData();
size_t old_size = data_to.size();
data_to.resize(old_size + size);
size_t i = 0;
for (auto it = result_vec.begin(); it != result_vec.end(); ++it, ++i)
data_to[old_size + i] = it->key;
}
};
/// Generic implementation, it uses serialized representation as object descriptor.
struct AggregateFunctionTopKGenericData
{
using Set = SpaceSaving<StringRef, StringRefHash>;
Set value;
};
/** Template parameter with true value should be used for columns that store their elements in memory continuously.
* For such columns topK() can be implemented more efficiently (especially for small numeric arrays).
*/
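/// For example, String values are stored contiguously, so getDataAt() can return the key directly
/// (is_plain_column = true), while e.g. Array values must first be serialized into the arena.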
template <bool is_plain_column, bool is_weighted>
class AggregateFunctionTopKGeneric
: public IAggregateFunctionDataHelper<AggregateFunctionTopKGenericData, AggregateFunctionTopKGeneric<is_plain_column, is_weighted>>
{
private:
using State = AggregateFunctionTopKGenericData;
UInt64 threshold;
UInt64 reserved;
static void deserializeAndInsert(StringRef str, IColumn & data_to);
public:
AggregateFunctionTopKGeneric(
UInt64 threshold_, UInt64 load_factor, const DataTypes & argument_types_, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionTopKGenericData, AggregateFunctionTopKGeneric<is_plain_column, is_weighted>>(argument_types_, params, createResultType(argument_types_))
, threshold(threshold_), reserved(load_factor * threshold) {}
String getName() const override { return is_weighted ? "topKWeighted" : "topK"; }
static DataTypePtr createResultType(const DataTypes & argument_types_)
{
return std::make_shared<DataTypeArray>(argument_types_[0]);
}
bool allocatesMemoryInArena() const override
{
return true;
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).value.write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena * arena) const override
{
auto & set = this->data(place).value;
set.clear();
// Specialized here because there's no deserialiser for StringRef
size_t size = 0;
readVarUInt(size, buf);
if (unlikely(size > TOP_K_MAX_SIZE))
throw Exception(
ErrorCodes::ARGUMENT_OUT_OF_BOUND,
"Too large size ({}) for aggregate function '{}' state (maximum is {})",
size,
getName(),
TOP_K_MAX_SIZE);
set.resize(size);
for (size_t i = 0; i < size; ++i)
{
auto ref = readStringBinaryInto(*arena, buf);
UInt64 count;
UInt64 error;
readVarUInt(count, buf);
readVarUInt(error, buf);
set.insert(ref, count, error);
arena->rollback(ref.size);
}
set.readAlphaMap(buf);
}
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
auto & set = this->data(place).value;
if (set.capacity() != reserved)
set.resize(reserved);
if constexpr (is_plain_column)
{
if constexpr (is_weighted)
set.insert(columns[0]->getDataAt(row_num), columns[1]->getUInt(row_num));
else
set.insert(columns[0]->getDataAt(row_num));
}
else
{
const char * begin = nullptr;
StringRef str_serialized = columns[0]->serializeValueIntoArena(row_num, *arena, begin);
if constexpr (is_weighted)
set.insert(str_serialized, columns[1]->getUInt(row_num));
else
set.insert(str_serialized);
arena->rollback(str_serialized.size);
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
auto & set = this->data(place).value;
if (set.capacity() != reserved)
set.resize(reserved);
set.merge(this->data(rhs).value);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
ColumnArray & arr_to = assert_cast<ColumnArray &>(to);
ColumnArray::Offsets & offsets_to = arr_to.getOffsets();
IColumn & data_to = arr_to.getData();
auto result_vec = this->data(place).value.topK(threshold);
offsets_to.push_back(offsets_to.back() + result_vec.size());
for (auto & elem : result_vec)
{
if constexpr (is_plain_column)
data_to.insertData(elem.key.data, elem.key.size);
else
data_to.deserializeAndInsertFromArena(elem.key.data);
}
}
};
}


@ -1,5 +1,3 @@
#include <AggregateFunctions/AggregateFunctionUniqCombined.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Helpers.h>
@ -10,6 +8,23 @@
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeIPv4andIPv6.h>
#include <base/bit_cast.h>
#include <Common/CombinedCardinalityEstimator.h>
#include <Common/SipHash.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypesNumber.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/UniqCombinedBiasData.h>
#include <AggregateFunctions/UniqVariadicHash.h>
#include <Columns/ColumnVector.h>
#include <functional>
@ -26,119 +41,332 @@ namespace ErrorCodes
namespace
{
/** Hash function for uniqCombined/uniqCombined64 (based on Ret).
*/
template <typename T, typename Ret>
struct AggregateFunctionUniqCombinedTraits
{
static Ret hash(T x)
{
if constexpr (sizeof(T) > sizeof(UInt64))
return static_cast<Ret>(DefaultHash64<T>(x));
else
return static_cast<Ret>(intHash64(x));
}
};
template <typename Ret>
struct AggregateFunctionUniqCombinedTraits<UInt128, Ret>
{
static Ret hash(UInt128 x)
{
return static_cast<Ret>(sipHash64(x));
}
};
template <typename Ret>
struct AggregateFunctionUniqCombinedTraits<Float32, Ret>
{
static Ret hash(Float32 x)
{
UInt64 res = bit_cast<UInt64>(x);
return static_cast<Ret>(intHash64(res));
}
};
template <typename Ret>
struct AggregateFunctionUniqCombinedTraits<Float64, Ret>
{
static Ret hash(Float64 x)
{
UInt64 res = bit_cast<UInt64>(x);
return static_cast<Ret>(intHash64(res));
}
};
// Unlike HashTableGrower always grows to power of 2.
struct UniqCombinedHashTableGrower : public HashTableGrowerWithPrecalculation<>
{
void increaseSize() { increaseSizeDegree(1); }
};
template <typename Key, UInt8 K>
struct AggregateFunctionUniqCombinedDataWithKey
{
// TODO(ilezhankin): pre-generate values for |UniqCombinedBiasData|,
// at the moment gen-bias-data.py script doesn't work.
// We want to migrate from |HashSet| to |HyperLogLogCounter| when the sizes in memory become almost equal.
// The size per element in |HashSet| is sizeof(Key)*2 bytes, and the overall size of |HyperLogLogCounter| is 2^K * 6 bits.
// For Key=UInt32 we can calculate: 2^X * 4 * 2 ≤ 2^(K-3) * 6 ⇒ X ≤ K-4.
using Set = CombinedCardinalityEstimator<Key, HashSet<Key, TrivialHash, UniqCombinedHashTableGrower>, 16, K - 5 + (sizeof(Key) == sizeof(UInt32)), K, TrivialHash, Key>;
Set set;
};
template <typename Key>
struct AggregateFunctionUniqCombinedDataWithKey<Key, 17>
{
using Set = CombinedCardinalityEstimator<Key,
HashSet<Key, TrivialHash, UniqCombinedHashTableGrower>,
16,
12 + (sizeof(Key) == sizeof(UInt32)),
17,
TrivialHash,
Key,
HyperLogLogBiasEstimator<UniqCombinedBiasData>,
HyperLogLogMode::FullFeatured>;
Set set;
};
template <typename T, UInt8 K, typename HashValueType>
struct AggregateFunctionUniqCombinedData : public AggregateFunctionUniqCombinedDataWithKey<HashValueType, K>
{
};
/// For String keys, 64 bit hash is always used (both for uniqCombined and uniqCombined64),
/// because of backwards compatibility (64 bit hash was already used for uniqCombined).
template <UInt8 K, typename HashValueType>
struct AggregateFunctionUniqCombinedData<String, K, HashValueType> : public AggregateFunctionUniqCombinedDataWithKey<UInt64 /*always*/, K>
{
};
template <UInt8 K, typename HashValueType>
struct AggregateFunctionUniqCombinedData<IPv6, K, HashValueType> : public AggregateFunctionUniqCombinedDataWithKey<UInt64 /*always*/, K>
{
};
template <typename T, UInt8 K, typename HashValueType>
class AggregateFunctionUniqCombined final
: public IAggregateFunctionDataHelper<AggregateFunctionUniqCombinedData<T, K, HashValueType>, AggregateFunctionUniqCombined<T, K, HashValueType>>
{
public:
AggregateFunctionUniqCombined(const DataTypes & argument_types_, const Array & params_)
: IAggregateFunctionDataHelper<AggregateFunctionUniqCombinedData<T, K, HashValueType>, AggregateFunctionUniqCombined<T, K, HashValueType>>(argument_types_, params_, std::make_shared<DataTypeUInt64>())
{}
String getName() const override
{
if constexpr (std::is_same_v<HashValueType, UInt64>)
return "uniqCombined64";
else
return "uniqCombined";
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
if constexpr (std::is_same_v<T, String> || std::is_same_v<T, IPv6>)
{
StringRef value = columns[0]->getDataAt(row_num);
this->data(place).set.insert(CityHash_v1_0_2::CityHash64(value.data, value.size));
}
else
{
const auto & value = assert_cast<const ColumnVector<T> &>(*columns[0]).getElement(row_num);
this->data(place).set.insert(AggregateFunctionUniqCombinedTraits<T, HashValueType>::hash(value));
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).set.merge(this->data(rhs).set);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).set.write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).set.read(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnUInt64 &>(to).getData().push_back(this->data(place).set.size());
}
};
/** For multiple arguments. To compute, it hashes them.
* You can pass multiple arguments as is; you can also pass one argument - a tuple.
* But (to allow an efficient implementation) you cannot pass several arguments among which there are tuples.
*/
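/// Illustration: uniqCombined(UserID, URL) and uniqCombined((UserID, URL)) both fold the values of
/// a row into a single 64-bit hash via UniqVariadicHash before inserting it into the set.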
template <bool is_exact, bool argument_is_tuple, UInt8 K, typename HashValueType>
class AggregateFunctionUniqCombinedVariadic final : public IAggregateFunctionDataHelper<AggregateFunctionUniqCombinedData<UInt64, K, HashValueType>,
AggregateFunctionUniqCombinedVariadic<is_exact, argument_is_tuple, K, HashValueType>>
{
private:
size_t num_args = 0;
public:
explicit AggregateFunctionUniqCombinedVariadic(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionUniqCombinedData<UInt64, K, HashValueType>,
AggregateFunctionUniqCombinedVariadic<is_exact, argument_is_tuple, K, HashValueType>>(arguments, params, std::make_shared<DataTypeUInt64>())
{
if (argument_is_tuple)
num_args = typeid_cast<const DataTypeTuple &>(*arguments[0]).getElements().size();
else
num_args = arguments.size();
}
String getName() const override
{
if constexpr (std::is_same_v<HashValueType, UInt64>)
return "uniqCombined64";
else
return "uniqCombined";
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).set.insert(typename AggregateFunctionUniqCombinedData<UInt64, K, HashValueType>::Set::value_type(
UniqVariadicHash<is_exact, argument_is_tuple>::apply(num_args, columns, row_num)));
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).set.merge(this->data(rhs).set);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).set.write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).set.read(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnUInt64 &>(to).getData().push_back(this->data(place).set.size());
}
};
template <UInt8 K, typename HashValueType>
struct WithK
{
template <typename T>
using AggregateFunction = AggregateFunctionUniqCombined<T, K, HashValueType>;
template <bool is_exact, bool argument_is_tuple>
using AggregateFunctionVariadic = AggregateFunctionUniqCombinedVariadic<is_exact, argument_is_tuple, K, HashValueType>;
};
template <UInt8 K, typename HashValueType>
AggregateFunctionPtr createAggregateFunctionWithK(const DataTypes & argument_types, const Array & params)
{
/// We use the exact hash function if the arguments are not contiguous in memory, because only the exact hash function supports this case.
bool use_exact_hash_function = !isAllArgumentsContiguousInMemory(argument_types);
if (argument_types.size() == 1)
{
const IDataType & argument_type = *argument_types[0];
AggregateFunctionPtr res(createWithNumericType<WithK<K, HashValueType>::template AggregateFunction>(*argument_types[0], argument_types, params));
WhichDataType which(argument_type);
if (res)
return res;
else if (which.isDate())
return std::make_shared<typename WithK<K, HashValueType>::template AggregateFunction<DataTypeDate::FieldType>>(argument_types, params);
else if (which.isDate32())
return std::make_shared<typename WithK<K, HashValueType>::template AggregateFunction<DataTypeDate32::FieldType>>(argument_types, params);
else if (which.isDateTime())
return std::make_shared<typename WithK<K, HashValueType>::template AggregateFunction<DataTypeDateTime::FieldType>>(argument_types, params);
else if (which.isStringOrFixedString())
return std::make_shared<typename WithK<K, HashValueType>::template AggregateFunction<String>>(argument_types, params);
else if (which.isUUID())
return std::make_shared<typename WithK<K, HashValueType>::template AggregateFunction<DataTypeUUID::FieldType>>(argument_types, params);
else if (which.isIPv4())
return std::make_shared<typename WithK<K, HashValueType>::template AggregateFunction<DataTypeIPv4::FieldType>>(argument_types, params);
else if (which.isIPv6())
return std::make_shared<typename WithK<K, HashValueType>::template AggregateFunction<DataTypeIPv6::FieldType>>(argument_types, params);
else if (which.isTuple())
{
if (use_exact_hash_function)
return std::make_shared<typename WithK<K, HashValueType>::template AggregateFunctionVariadic<true, true>>(argument_types, params);
else
return std::make_shared<typename WithK<K, HashValueType>::template AggregateFunctionVariadic<false, true>>(argument_types, params);
}
}
/// "Variadic" method also works as a fallback generic case for a single argument.
if (use_exact_hash_function)
return std::make_shared<typename WithK<K, HashValueType>::template AggregateFunctionVariadic<true, false>>(argument_types, params);
else
return std::make_shared<typename WithK<K, HashValueType>::template AggregateFunctionVariadic<false, false>>(argument_types, params);
}
template <UInt8 K>
AggregateFunctionPtr createAggregateFunctionWithHashType(bool use_64_bit_hash, const DataTypes & argument_types, const Array & params)
{
if (use_64_bit_hash)
return createAggregateFunctionWithK<K, UInt64>(argument_types, params);
else
return createAggregateFunctionWithK<K, UInt32>(argument_types, params);
}
AggregateFunctionPtr createAggregateFunctionUniqCombined(bool use_64_bit_hash,
const std::string & name, const DataTypes & argument_types, const Array & params)
{
/// log2 of the number of cells in HyperLogLog.
/// Reasonable default value, selected to be comparable in quality with "uniq" aggregate function.
UInt8 precision = 17;
if (!params.empty())
{
if (params.size() != 1)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Aggregate function {} requires one parameter or less.",
name);
UInt64 precision_param = applyVisitor(FieldVisitorConvertToNumber<UInt64>(), params[0]);
// This range is hardcoded below
if (precision_param > 20 || precision_param < 12)
throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Parameter for aggregate function {} is out of range: [12, 20].",
name);
precision = precision_param;
}
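/// Rough size estimate: the default precision of 17 gives 2^17 HyperLogLog cells; at 6 bits per
/// cell (see the data layout comment above) that is about 96 KiB of state.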
if (argument_types.empty())
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Incorrect number of arguments for aggregate function {}", name);
switch (precision) // NOLINT(bugprone-switch-missing-default-case)
{
case 12:
return createAggregateFunctionWithHashType<12>(use_64_bit_hash, argument_types, params);
case 13:
return createAggregateFunctionWithHashType<13>(use_64_bit_hash, argument_types, params);
case 14:
return createAggregateFunctionWithHashType<14>(use_64_bit_hash, argument_types, params);
case 15:
return createAggregateFunctionWithHashType<15>(use_64_bit_hash, argument_types, params);
case 16:
return createAggregateFunctionWithHashType<16>(use_64_bit_hash, argument_types, params);
case 17:
return createAggregateFunctionWithHashType<17>(use_64_bit_hash, argument_types, params);
case 18:
return createAggregateFunctionWithHashType<18>(use_64_bit_hash, argument_types, params);
case 19:
return createAggregateFunctionWithHashType<19>(use_64_bit_hash, argument_types, params);
case 20:
return createAggregateFunctionWithHashType<20>(use_64_bit_hash, argument_types, params);
}
UNREACHABLE();
}
}
void registerAggregateFunctionUniqCombined(AggregateFunctionFactory & factory)


@ -1,240 +0,0 @@
#pragma once
#include <base/bit_cast.h>
#include <Common/CombinedCardinalityEstimator.h>
#include <Common/SipHash.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeUUID.h>
#include <DataTypes/DataTypesNumber.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/UniqCombinedBiasData.h>
#include <AggregateFunctions/UniqVariadicHash.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnsNumber.h>
namespace DB
{
struct Settings;
namespace detail
{
/** Hash function for uniqCombined/uniqCombined64 (based on Ret).
*/
template <typename T, typename Ret>
struct AggregateFunctionUniqCombinedTraits
{
static Ret hash(T x)
{
if constexpr (sizeof(T) > sizeof(UInt64))
return static_cast<Ret>(DefaultHash64<T>(x));
else
return static_cast<Ret>(intHash64(x));
}
};
template <typename Ret>
struct AggregateFunctionUniqCombinedTraits<UInt128, Ret>
{
static Ret hash(UInt128 x)
{
return static_cast<Ret>(sipHash64(x));
}
};
template <typename Ret>
struct AggregateFunctionUniqCombinedTraits<Float32, Ret>
{
static Ret hash(Float32 x)
{
UInt64 res = bit_cast<UInt64>(x);
return static_cast<Ret>(intHash64(res));
}
};
template <typename Ret>
struct AggregateFunctionUniqCombinedTraits<Float64, Ret>
{
static Ret hash(Float64 x)
{
UInt64 res = bit_cast<UInt64>(x);
return static_cast<Ret>(intHash64(res));
}
};
}
// Unlike HashTableGrower always grows to power of 2.
struct UniqCombinedHashTableGrower : public HashTableGrowerWithPrecalculation<>
{
void increaseSize() { increaseSizeDegree(1); }
};
template <typename Key, UInt8 K>
struct AggregateFunctionUniqCombinedDataWithKey
{
// TODO(ilezhankin): pre-generate values for |UniqCombinedBiasData|,
// at the moment gen-bias-data.py script doesn't work.
// We want to migrate from |HashSet| to |HyperLogLogCounter| when the sizes in memory become almost equal.
// The size per element in |HashSet| is sizeof(Key)*2 bytes, and the overall size of |HyperLogLogCounter| is 2^K * 6 bits.
// For Key=UInt32 we can calculate: 2^X * 4 * 2 ≤ 2^(K-3) * 6 ⇒ X ≤ K-4.
using Set = CombinedCardinalityEstimator<Key, HashSet<Key, TrivialHash, UniqCombinedHashTableGrower>, 16, K - 5 + (sizeof(Key) == sizeof(UInt32)), K, TrivialHash, Key>;
Set set;
};
template <typename Key>
struct AggregateFunctionUniqCombinedDataWithKey<Key, 17>
{
using Set = CombinedCardinalityEstimator<Key,
HashSet<Key, TrivialHash, UniqCombinedHashTableGrower>,
16,
12 + (sizeof(Key) == sizeof(UInt32)),
17,
TrivialHash,
Key,
HyperLogLogBiasEstimator<UniqCombinedBiasData>,
HyperLogLogMode::FullFeatured>;
Set set;
};
template <typename T, UInt8 K, typename HashValueType>
struct AggregateFunctionUniqCombinedData : public AggregateFunctionUniqCombinedDataWithKey<HashValueType, K>
{
};
/// For String keys, 64 bit hash is always used (both for uniqCombined and uniqCombined64),
/// because of backwards compatibility (64 bit hash was already used for uniqCombined).
template <UInt8 K, typename HashValueType>
struct AggregateFunctionUniqCombinedData<String, K, HashValueType> : public AggregateFunctionUniqCombinedDataWithKey<UInt64 /*always*/, K>
{
};
template <UInt8 K, typename HashValueType>
struct AggregateFunctionUniqCombinedData<IPv6, K, HashValueType> : public AggregateFunctionUniqCombinedDataWithKey<UInt64 /*always*/, K>
{
};
template <typename T, UInt8 K, typename HashValueType>
class AggregateFunctionUniqCombined final
: public IAggregateFunctionDataHelper<AggregateFunctionUniqCombinedData<T, K, HashValueType>, AggregateFunctionUniqCombined<T, K, HashValueType>>
{
public:
AggregateFunctionUniqCombined(const DataTypes & argument_types_, const Array & params_)
: IAggregateFunctionDataHelper<AggregateFunctionUniqCombinedData<T, K, HashValueType>, AggregateFunctionUniqCombined<T, K, HashValueType>>(argument_types_, params_, std::make_shared<DataTypeUInt64>())
{}
String getName() const override
{
if constexpr (std::is_same_v<HashValueType, UInt64>)
return "uniqCombined64";
else
return "uniqCombined";
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
if constexpr (std::is_same_v<T, String> || std::is_same_v<T, IPv6>)
{
StringRef value = columns[0]->getDataAt(row_num);
this->data(place).set.insert(CityHash_v1_0_2::CityHash64(value.data, value.size));
}
else
{
const auto & value = assert_cast<const ColumnVector<T> &>(*columns[0]).getElement(row_num);
this->data(place).set.insert(detail::AggregateFunctionUniqCombinedTraits<T, HashValueType>::hash(value));
}
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).set.merge(this->data(rhs).set);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).set.write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).set.read(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnUInt64 &>(to).getData().push_back(this->data(place).set.size());
}
};
/** For multiple arguments. To compute, it hashes them.
* You can pass multiple arguments as is; you can also pass one argument - a tuple.
* But (to allow an efficient implementation) you cannot pass several arguments among which there are tuples.
*/
template <bool is_exact, bool argument_is_tuple, UInt8 K, typename HashValueType>
class AggregateFunctionUniqCombinedVariadic final : public IAggregateFunctionDataHelper<AggregateFunctionUniqCombinedData<UInt64, K, HashValueType>,
AggregateFunctionUniqCombinedVariadic<is_exact, argument_is_tuple, K, HashValueType>>
{
private:
size_t num_args = 0;
public:
explicit AggregateFunctionUniqCombinedVariadic(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<AggregateFunctionUniqCombinedData<UInt64, K, HashValueType>,
AggregateFunctionUniqCombinedVariadic<is_exact, argument_is_tuple, K, HashValueType>>(arguments, params, std::make_shared<DataTypeUInt64>())
{
if (argument_is_tuple)
num_args = typeid_cast<const DataTypeTuple &>(*arguments[0]).getElements().size();
else
num_args = arguments.size();
}
String getName() const override
{
if constexpr (std::is_same_v<HashValueType, UInt64>)
return "uniqCombined64";
else
return "uniqCombined";
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).set.insert(typename AggregateFunctionUniqCombinedData<UInt64, K, HashValueType>::Set::value_type(
UniqVariadicHash<is_exact, argument_is_tuple>::apply(num_args, columns, row_num)));
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).set.merge(this->data(rhs).set);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).set.write(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).set.read(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnUInt64 &>(to).getData().push_back(this->data(place).set.size());
}
};
}


@ -1,12 +1,24 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/AggregateFunctionUniqUpTo.h>
#include <Common/FieldVisitorConvertToNumber.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDate32.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypeFixedString.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/UniqVariadicHash.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeUUID.h>
#include <Columns/ColumnsNumber.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
namespace DB
@ -24,6 +36,258 @@ namespace ErrorCodes
namespace
{
/** Counts the number of unique values, but only up to a limit specified in the parameter.
*
* Example: uniqUpTo(3)(UserID)
* - counts the number of unique visitors; returns the exact number if it is at most 3, and 4 if there are >= 4 visitors.
*
* For strings, a non-cryptographic hash function is used, so the result may be slightly inaccurate.
*/
template <typename T>
struct AggregateFunctionUniqUpToData
{
/** If count == threshold + 1, the state is "overflowed" (it has seen more distinct values than the threshold).
* In this case (for example, after calling the merge function) the `data` array does not necessarily contain initialized values
* - example: when a state with few values is combined with another state that has overflowed,
* count is simply set to `threshold + 1` and the values of the other state are not copied.
*/
UInt8 count = 0;
char data_ptr[0];
T load(size_t i) const
{
return unalignedLoad<T>(data_ptr + i * sizeof(T));
}
void store(size_t i, const T & x)
{
unalignedStore<T>(data_ptr + i * sizeof(T), x);
}
size_t size() const
{
return count;
}
/// threshold - how many elements the `data` array has room for.
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE insert(T x, UInt8 threshold)
{
/// The state is already full - nothing needs to be done.
if (count > threshold)
return;
/// Linear search for the matching element.
for (size_t i = 0; i < count; ++i)
if (load(i) == x)
return;
/// Did not find the matching element. If there is room for one more element, insert it.
if (count < threshold)
store(count, x);
/// After increasing count, the state may be overflowed.
++count;
}
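/// Illustration: with threshold = 3, inserting 1, 2, 2, 3 leaves count == 3 (an exact answer); one
/// more distinct value only bumps count to 4 == threshold + 1, the overflow marker, without storing anything.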
void merge(const AggregateFunctionUniqUpToData<T> & rhs, UInt8 threshold)
{
if (count > threshold)
return;
if (rhs.count > threshold)
{
/// If `rhs` is overflowed, then mark the current state as overflowed too.
count = rhs.count;
return;
}
for (size_t i = 0; i < rhs.count; ++i)
insert(rhs.load(i), threshold);
}
void write(WriteBuffer & wb, UInt8 threshold) const
{
writeBinary(count, wb);
/// Write values only if the state is not overflowed. Otherwise, they are not needed, and only the fact that the state is overflowed is important.
if (count <= threshold)
wb.write(data_ptr, count * sizeof(T));
}
void read(ReadBuffer & rb, UInt8 threshold)
{
readBinary(count, rb);
if (count <= threshold)
rb.readStrict(data_ptr, count * sizeof(T));
}
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE add(const IColumn & column, size_t row_num, UInt8 threshold)
{
insert(assert_cast<const ColumnVector<T> &>(column).getData()[row_num], threshold);
}
};
/// For strings, their hashes are remembered.
template <>
struct AggregateFunctionUniqUpToData<String> : AggregateFunctionUniqUpToData<UInt64>
{
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE add(const IColumn & column, size_t row_num, UInt8 threshold)
{
/// Keep in mind that calculations are approximate.
StringRef value = column.getDataAt(row_num);
insert(CityHash_v1_0_2::CityHash64(value.data, value.size), threshold);
}
};
template <>
struct AggregateFunctionUniqUpToData<UInt128> : AggregateFunctionUniqUpToData<UInt64>
{
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE add(const IColumn & column, size_t row_num, UInt8 threshold)
{
UInt128 value = assert_cast<const ColumnVector<UInt128> &>(column).getData()[row_num];
insert(sipHash64(value), threshold);
}
};
template <>
struct AggregateFunctionUniqUpToData<UInt256> : AggregateFunctionUniqUpToData<UInt64>
{
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE add(const IColumn & column, size_t row_num, UInt8 threshold)
{
UInt256 value = assert_cast<const ColumnVector<UInt256> &>(column).getData()[row_num];
insert(sipHash64(value), threshold);
}
};
template <>
struct AggregateFunctionUniqUpToData<Int256> : AggregateFunctionUniqUpToData<UInt64>
{
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE add(const IColumn & column, size_t row_num, UInt8 threshold)
{
Int256 value = assert_cast<const ColumnVector<Int256> &>(column).getData()[row_num];
insert(sipHash64(value), threshold);
}
};
template <typename T>
class AggregateFunctionUniqUpTo final : public IAggregateFunctionDataHelper<AggregateFunctionUniqUpToData<T>, AggregateFunctionUniqUpTo<T>>
{
private:
UInt8 threshold;
public:
AggregateFunctionUniqUpTo(UInt8 threshold_, const DataTypes & argument_types_, const Array & params_)
: IAggregateFunctionDataHelper<AggregateFunctionUniqUpToData<T>, AggregateFunctionUniqUpTo<T>>(argument_types_, params_, std::make_shared<DataTypeUInt64>())
, threshold(threshold_)
{
}
size_t sizeOfData() const override
{
return sizeof(AggregateFunctionUniqUpToData<T>) + sizeof(T) * threshold;
}
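/// The data ends with a zero-length array member (char data_ptr[0]), so one allocation of this size
/// holds the fixed header plus room for `threshold` inline values of T.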
String getName() const override { return "uniqUpTo"; }
bool allocatesMemoryInArena() const override { return false; }
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).add(*columns[0], row_num, threshold);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs), threshold);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf, threshold);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).read(buf, threshold);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnUInt64 &>(to).getData().push_back(this->data(place).size());
}
};
/** For multiple arguments. To compute, it hashes them.
* You can pass multiple arguments as is; you can also pass one argument - a tuple.
* But (to allow an efficient implementation) you cannot pass several arguments among which there are tuples.
*/
template <bool is_exact, bool argument_is_tuple>
class AggregateFunctionUniqUpToVariadic final
: public IAggregateFunctionDataHelper<AggregateFunctionUniqUpToData<UInt64>, AggregateFunctionUniqUpToVariadic<is_exact, argument_is_tuple>>
{
private:
size_t num_args = 0;
UInt8 threshold;
public:
AggregateFunctionUniqUpToVariadic(const DataTypes & arguments, const Array & params, UInt8 threshold_)
: IAggregateFunctionDataHelper<AggregateFunctionUniqUpToData<UInt64>, AggregateFunctionUniqUpToVariadic<is_exact, argument_is_tuple>>(arguments, params, std::make_shared<DataTypeUInt64>())
, threshold(threshold_)
{
if (argument_is_tuple)
num_args = typeid_cast<const DataTypeTuple &>(*arguments[0]).getElements().size();
else
num_args = arguments.size();
}
size_t sizeOfData() const override
{
return sizeof(AggregateFunctionUniqUpToData<UInt64>) + sizeof(UInt64) * threshold;
}
String getName() const override { return "uniqUpTo"; }
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).insert(UInt64(UniqVariadicHash<is_exact, argument_is_tuple>::apply(num_args, columns, row_num)), threshold);
}
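/// UniqVariadicHash folds all argument columns (or the elements of a single tuple argument) into one
/// UInt64, which is why the variadic variant reuses the AggregateFunctionUniqUpToData<UInt64> state.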
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs), threshold);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf, threshold);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).read(buf, threshold);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnUInt64 &>(to).getData().push_back(this->data(place).size());
}
};
constexpr UInt8 uniq_upto_max_threshold = 100;


@ -1,278 +0,0 @@
#pragma once
#include <base/unaligned.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/UniqVariadicHash.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeUUID.h>
#include <Columns/ColumnsNumber.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
namespace DB
{
struct Settings;
/** Counts the number of unique values, but only up to a limit specified in the parameter.
*
* Example: uniqUpTo(3)(UserID)
* - counts the number of unique visitors; returns the exact number if it is at most 3, and 4 if there are >= 4 visitors.
*
* For strings, a non-cryptographic hash function is used, so the result may be slightly inaccurate.
*/
template <typename T>
struct AggregateFunctionUniqUpToData
{
/** If count == threshold + 1, the state is "overflowed" (it has seen more distinct values than the threshold).
* In this case (for example, after calling the merge function) the `data` array does not necessarily contain initialized values
* - example: when a state with few values is combined with another state that has overflowed,
* count is simply set to `threshold + 1` and the values of the other state are not copied.
*/
UInt8 count = 0;
char data_ptr[0];
T load(size_t i) const
{
return unalignedLoad<T>(data_ptr + i * sizeof(T));
}
void store(size_t i, const T & x)
{
unalignedStore<T>(data_ptr + i * sizeof(T), x);
}
size_t size() const
{
return count;
}
/// threshold - how many elements the `data` array has room for.
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE insert(T x, UInt8 threshold)
{
/// The state is already full - nothing needs to be done.
if (count > threshold)
return;
/// Linear search for the matching element.
for (size_t i = 0; i < count; ++i)
if (load(i) == x)
return;
/// Did not find the matching element. If there is room for one more element, insert it.
if (count < threshold)
store(count, x);
/// After increasing count, the state may be overflowed.
++count;
}
void merge(const AggregateFunctionUniqUpToData<T> & rhs, UInt8 threshold)
{
if (count > threshold)
return;
if (rhs.count > threshold)
{
/// If `rhs` is overflowed, then mark the current state as overflowed too.
count = rhs.count;
return;
}
for (size_t i = 0; i < rhs.count; ++i)
insert(rhs.load(i), threshold);
}
void write(WriteBuffer & wb, UInt8 threshold) const
{
writeBinary(count, wb);
/// Write values only if the state is not overflowed. Otherwise, they are not needed, and only the fact that the state is overflowed is important.
if (count <= threshold)
wb.write(data_ptr, count * sizeof(T));
}
void read(ReadBuffer & rb, UInt8 threshold)
{
readBinary(count, rb);
if (count <= threshold)
rb.readStrict(data_ptr, count * sizeof(T));
}
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE add(const IColumn & column, size_t row_num, UInt8 threshold)
{
insert(assert_cast<const ColumnVector<T> &>(column).getData()[row_num], threshold);
}
};
/// For strings, their hashes are remembered.
template <>
struct AggregateFunctionUniqUpToData<String> : AggregateFunctionUniqUpToData<UInt64>
{
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE add(const IColumn & column, size_t row_num, UInt8 threshold)
{
/// Keep in mind that calculations are approximate.
StringRef value = column.getDataAt(row_num);
insert(CityHash_v1_0_2::CityHash64(value.data, value.size), threshold);
}
};
template <>
struct AggregateFunctionUniqUpToData<UInt128> : AggregateFunctionUniqUpToData<UInt64>
{
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE add(const IColumn & column, size_t row_num, UInt8 threshold)
{
UInt128 value = assert_cast<const ColumnVector<UInt128> &>(column).getData()[row_num];
insert(sipHash64(value), threshold);
}
};
template <>
struct AggregateFunctionUniqUpToData<UInt256> : AggregateFunctionUniqUpToData<UInt64>
{
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE add(const IColumn & column, size_t row_num, UInt8 threshold)
{
UInt256 value = assert_cast<const ColumnVector<UInt256> &>(column).getData()[row_num];
insert(sipHash64(value), threshold);
}
};
template <>
struct AggregateFunctionUniqUpToData<Int256> : AggregateFunctionUniqUpToData<UInt64>
{
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE add(const IColumn & column, size_t row_num, UInt8 threshold)
{
Int256 value = assert_cast<const ColumnVector<Int256> &>(column).getData()[row_num];
insert(sipHash64(value), threshold);
}
};
template <typename T>
class AggregateFunctionUniqUpTo final : public IAggregateFunctionDataHelper<AggregateFunctionUniqUpToData<T>, AggregateFunctionUniqUpTo<T>>
{
private:
UInt8 threshold;
public:
AggregateFunctionUniqUpTo(UInt8 threshold_, const DataTypes & argument_types_, const Array & params_)
: IAggregateFunctionDataHelper<AggregateFunctionUniqUpToData<T>, AggregateFunctionUniqUpTo<T>>(argument_types_, params_, std::make_shared<DataTypeUInt64>())
, threshold(threshold_)
{
}
size_t sizeOfData() const override
{
return sizeof(AggregateFunctionUniqUpToData<T>) + sizeof(T) * threshold;
}
String getName() const override { return "uniqUpTo"; }
bool allocatesMemoryInArena() const override { return false; }
/// ALWAYS_INLINE is required to have better code layout for uniqUpTo function
void ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).add(*columns[0], row_num, threshold);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs), threshold);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf, threshold);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).read(buf, threshold);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnUInt64 &>(to).getData().push_back(this->data(place).size());
}
};
/** For multiple arguments. To compute, it hashes them.
* You can pass multiple arguments as is; you can also pass one argument - a tuple.
* But (to allow an efficient implementation) you cannot pass several arguments among which there are tuples.
*/
template <bool is_exact, bool argument_is_tuple>
class AggregateFunctionUniqUpToVariadic final
: public IAggregateFunctionDataHelper<AggregateFunctionUniqUpToData<UInt64>, AggregateFunctionUniqUpToVariadic<is_exact, argument_is_tuple>>
{
private:
size_t num_args = 0;
UInt8 threshold;
public:
AggregateFunctionUniqUpToVariadic(const DataTypes & arguments, const Array & params, UInt8 threshold_)
: IAggregateFunctionDataHelper<AggregateFunctionUniqUpToData<UInt64>, AggregateFunctionUniqUpToVariadic<is_exact, argument_is_tuple>>(arguments, params, std::make_shared<DataTypeUInt64>())
, threshold(threshold_)
{
if (argument_is_tuple)
num_args = typeid_cast<const DataTypeTuple &>(*arguments[0]).getElements().size();
else
num_args = arguments.size();
}
size_t sizeOfData() const override
{
return sizeof(AggregateFunctionUniqUpToData<UInt64>) + sizeof(UInt64) * threshold;
}
String getName() const override { return "uniqUpTo"; }
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).insert(UInt64(UniqVariadicHash<is_exact, argument_is_tuple>::apply(num_args, columns, row_num)), threshold);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs), threshold);
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).write(buf, threshold);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).read(buf, threshold);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnUInt64 &>(to).getData().push_back(this->data(place).size());
}
};
}


@ -1,7 +1,11 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Helpers.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/AggregateFunctionVarianceMatrix.h>
#include <Columns/ColumnArray.h>
#include <Common/PODArray_fwd.h>
#include <DataTypes/DataTypeArray.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/Moments.h>
#include <DataTypes/DataTypesNumber.h>
namespace DB
@ -16,6 +20,149 @@ namespace ErrorCodes
namespace
{
enum class StatisticsMatrixFunctionKind
{
covarPopMatrix,
covarSampMatrix,
corrMatrix
};
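/// covarPopMatrix and covarSampMatrix both accumulate CovarMoments and differ only in the final
/// estimate (population vs. sample); corrMatrix accumulates CorrMoments instead.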
template <StatisticsMatrixFunctionKind _kind>
struct AggregateFunctionVarianceMatrixData
{
using DataType = std::conditional_t<_kind == StatisticsMatrixFunctionKind::corrMatrix, CorrMoments<Float64>, CovarMoments<Float64>>;
AggregateFunctionVarianceMatrixData() = default;
explicit AggregateFunctionVarianceMatrixData(const size_t _num_args)
: num_args(_num_args)
{
data_matrix.resize_fill(num_args * (num_args + 1) / 2, DataType());
}
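/// The matrix is symmetric, so only the lower triangle is stored, packed row by row:
/// cell (i, j) with j <= i lives at index i * (i + 1) / 2 + j. For num_args = 3 the six stored
/// cells map as (0,0)->0, (1,0)->1, (1,1)->2, (2,0)->3, (2,1)->4, (2,2)->5; insertResultInto()
/// obtains the upper triangle from the same cells by symmetry.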
void add(const IColumn ** column, const size_t row_num)
{
for (size_t i = 0; i < num_args; ++i)
for (size_t j = 0; j <= i; ++j)
data_matrix[i * (i + 1) / 2 + j].add(column[i]->getFloat64(row_num), column[j]->getFloat64(row_num));
}
void merge(const AggregateFunctionVarianceMatrixData & other)
{
for (size_t i = 0; i < num_args; ++i)
for (size_t j = 0; j <= i; ++j)
data_matrix[i * (i + 1) / 2 + j].merge(other.data_matrix[i * (i + 1) / 2 + j]);
}
void serialize(WriteBuffer & buf) const
{
for (size_t i = 0; i < num_args; ++i)
for (size_t j = 0; j <= i; ++j)
data_matrix[i * (i + 1) / 2 + j].write(buf);
}
void deserialize(ReadBuffer & buf)
{
for (size_t i = 0; i < num_args; ++i)
for (size_t j = 0; j <= i; ++j)
data_matrix[i * (i + 1) / 2 + j].read(buf);
}
void insertResultInto(IColumn & to) const
{
auto & data_to = assert_cast<ColumnFloat64 &>(assert_cast<ColumnArray &>(assert_cast<ColumnArray &>(to).getData()).getData()).getData();
auto & root_offsets_to = assert_cast<ColumnArray &>(to).getOffsets();
auto & nested_offsets_to = assert_cast<ColumnArray &>(assert_cast<ColumnArray &>(to).getData()).getOffsets();
for (size_t i = 0; i < num_args; ++i)
{
for (size_t j = 0; j < num_args; ++j)
{
auto & data = i < j ? data_matrix[j * (j + 1) / 2 + i] : data_matrix[i * (i + 1) / 2 + j];
if constexpr (kind == StatisticsMatrixFunctionKind::covarPopMatrix)
data_to.push_back(data.getPopulation());
if constexpr (kind == StatisticsMatrixFunctionKind::covarSampMatrix)
data_to.push_back(data.getSample());
if constexpr (kind == StatisticsMatrixFunctionKind::corrMatrix)
data_to.push_back(data.get());
}
nested_offsets_to.push_back(nested_offsets_to.back() + num_args);
}
root_offsets_to.push_back(root_offsets_to.back() + num_args);
}
static constexpr StatisticsMatrixFunctionKind kind = _kind;
PaddedPODArray<DataType> data_matrix;
size_t num_args;
};
template <typename Data>
class AggregateFunctionVarianceMatrix final
: public IAggregateFunctionDataHelper<Data, AggregateFunctionVarianceMatrix<Data>>
{
public:
explicit AggregateFunctionVarianceMatrix(const DataTypes & argument_types_)
: IAggregateFunctionDataHelper<Data, AggregateFunctionVarianceMatrix<Data>>(argument_types_, {}, createResultType())
{}
AggregateFunctionVarianceMatrix(const IDataType &, const DataTypes & argument_types_)
: IAggregateFunctionDataHelper<Data, AggregateFunctionVarianceMatrix<Data>>(argument_types_, {}, createResultType())
{}
String getName() const override
{
if constexpr (Data::kind == StatisticsMatrixFunctionKind::covarPopMatrix)
return "covarPopMatrix";
if constexpr (Data::kind == StatisticsMatrixFunctionKind::covarSampMatrix)
return "covarSampMatrix";
if constexpr (Data::kind == StatisticsMatrixFunctionKind::corrMatrix)
return "corrMatrix";
UNREACHABLE();
}
void create(AggregateDataPtr __restrict place) const override
{
new (place) Data(this->argument_types.size());
}
static DataTypePtr createResultType()
{
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeArray>(std::make_shared<DataTypeFloat64>()));
}
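/// I.e. the result is a square num_args x num_args matrix of Float64 values, emitted row by row as
/// nested arrays in insertResultInto().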
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).add(columns, row_num);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
this->data(place).insertResultInto(to);
}
};
using AggregateFunctionCovarPopMatrix = AggregateFunctionVarianceMatrix<AggregateFunctionVarianceMatrixData<StatisticsMatrixFunctionKind::covarPopMatrix>>;
using AggregateFunctionCovarSampMatrix = AggregateFunctionVarianceMatrix<AggregateFunctionVarianceMatrixData<StatisticsMatrixFunctionKind::covarSampMatrix>>;
using AggregateFunctionCorrMatrix = AggregateFunctionVarianceMatrix<AggregateFunctionVarianceMatrixData<StatisticsMatrixFunctionKind::corrMatrix>>;
template <typename FunctionTemplate>
AggregateFunctionPtr createAggregateFunctionVarianceMatrix(
const std::string & name, const DataTypes & argument_types, const Array & parameters, const Settings *)


@ -1,159 +0,0 @@
#pragma once
#include <Columns/ColumnArray.h>
#include <Columns/ColumnsNumber.h>
#include <Common/PODArray.h>
#include <Common/PODArray_fwd.h>
#include <DataTypes/DataTypeArray.h>
#include <AggregateFunctions/IAggregateFunction.h>
#include <AggregateFunctions/Moments.h>
#include <DataTypes/DataTypesNumber.h>
namespace DB
{
struct Settings;
enum class StatisticsMatrixFunctionKind
{
covarPopMatrix,
covarSampMatrix,
corrMatrix
};
template <StatisticsMatrixFunctionKind _kind>
struct AggregateFunctionVarianceMatrixData
{
using DataType = std::conditional_t<_kind == StatisticsMatrixFunctionKind::corrMatrix, CorrMoments<Float64>, CovarMoments<Float64>>;
AggregateFunctionVarianceMatrixData() = default;
explicit AggregateFunctionVarianceMatrixData(const size_t _num_args)
: num_args(_num_args)
{
data_matrix.resize_fill(num_args * (num_args + 1) / 2, DataType());
}
void add(const IColumn ** column, const size_t row_num)
{
for (size_t i = 0; i < num_args; ++i)
for (size_t j = 0; j <= i; ++j)
data_matrix[i * (i + 1) / 2 + j].add(column[i]->getFloat64(row_num), column[j]->getFloat64(row_num));
}
void merge(const AggregateFunctionVarianceMatrixData & other)
{
for (size_t i = 0; i < num_args; ++i)
for (size_t j = 0; j <= i; ++j)
data_matrix[i * (i + 1) / 2 + j].merge(other.data_matrix[i * (i + 1) / 2 + j]);
}
void serialize(WriteBuffer & buf) const
{
for (size_t i = 0; i < num_args; ++i)
for (size_t j = 0; j <= i; ++j)
data_matrix[i * (i + 1) / 2 + j].write(buf);
}
void deserialize(ReadBuffer & buf)
{
for (size_t i = 0; i < num_args; ++i)
for (size_t j = 0; j <= i; ++j)
data_matrix[i * (i + 1) / 2 + j].read(buf);
}
void insertResultInto(IColumn & to) const
{
auto & data_to = assert_cast<ColumnFloat64 &>(assert_cast<ColumnArray &>(assert_cast<ColumnArray &>(to).getData()).getData()).getData();
auto & root_offsets_to = assert_cast<ColumnArray &>(to).getOffsets();
auto & nested_offsets_to = assert_cast<ColumnArray &>(assert_cast<ColumnArray &>(to).getData()).getOffsets();
for (size_t i = 0; i < num_args; ++i)
{
for (size_t j = 0; j < num_args; ++j)
{
auto & data = i < j ? data_matrix[j * (j + 1) / 2 + i] : data_matrix[i * (i + 1) / 2 + j];
if constexpr (kind == StatisticsMatrixFunctionKind::covarPopMatrix)
data_to.push_back(data.getPopulation());
if constexpr (kind == StatisticsMatrixFunctionKind::covarSampMatrix)
data_to.push_back(data.getSample());
if constexpr (kind == StatisticsMatrixFunctionKind::corrMatrix)
data_to.push_back(data.get());
}
nested_offsets_to.push_back(nested_offsets_to.back() + num_args);
}
root_offsets_to.push_back(root_offsets_to.back() + num_args);
}
static constexpr StatisticsMatrixFunctionKind kind = _kind;
PaddedPODArray<DataType> data_matrix;
size_t num_args;
};
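data_matrix above stores the symmetric matrix of pairwise moments as a packed lower triangle; the expression `i * (i + 1) / 2 + j` is the flat index of cell (i, j) with j <= i. A minimal standalone sketch of that mapping (the names here are illustrative, not from the source):

#include <cassert>
#include <cstddef>

/// Flat index of element (i, j), j <= i, in a packed lower-triangular matrix.
static size_t packedIndex(size_t i, size_t j)
{
    return i * (i + 1) / 2 + j;
}

int main()
{
    const size_t num_args = 3;
    /// A 3x3 symmetric matrix needs only 3 * 4 / 2 = 6 stored cells.
    assert(packedIndex(num_args - 1, num_args - 1) + 1 == num_args * (num_args + 1) / 2);
    /// Lookup above the diagonal swaps the indices, as insertResultInto does.
    size_t i = 0, j = 2;
    assert((i < j ? packedIndex(j, i) : packedIndex(i, j)) == packedIndex(2, 0));
}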
template <typename Data>
class AggregateFunctionVarianceMatrix final
: public IAggregateFunctionDataHelper<Data, AggregateFunctionVarianceMatrix<Data>>
{
public:
explicit AggregateFunctionVarianceMatrix(const DataTypes & argument_types_)
: IAggregateFunctionDataHelper<Data, AggregateFunctionVarianceMatrix<Data>>(argument_types_, {}, createResultType())
{}
AggregateFunctionVarianceMatrix(const IDataType &, const DataTypes & argument_types_)
: IAggregateFunctionDataHelper<Data, AggregateFunctionVarianceMatrix<Data>>(argument_types_, {}, createResultType())
{}
String getName() const override
{
if constexpr (Data::kind == StatisticsMatrixFunctionKind::covarPopMatrix)
return "covarPopMatrix";
if constexpr (Data::kind == StatisticsMatrixFunctionKind::covarSampMatrix)
return "covarSampMatrix";
if constexpr (Data::kind == StatisticsMatrixFunctionKind::corrMatrix)
return "corrMatrix";
UNREACHABLE();
}
void create(AggregateDataPtr __restrict place) const override
{
new (place) Data(this->argument_types.size());
}
static DataTypePtr createResultType()
{
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeArray>(std::make_shared<DataTypeFloat64>()));
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
{
this->data(place).add(columns, row_num);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
this->data(place).insertResultInto(to);
}
};
using AggregateFunctionCovarPopMatrix = AggregateFunctionVarianceMatrix<AggregateFunctionVarianceMatrixData<StatisticsMatrixFunctionKind::covarPopMatrix>>;
using AggregateFunctionCovarSampMatrix = AggregateFunctionVarianceMatrix<AggregateFunctionVarianceMatrixData<StatisticsMatrixFunctionKind::covarSampMatrix>>;
using AggregateFunctionCorrMatrix = AggregateFunctionVarianceMatrix<AggregateFunctionVarianceMatrixData<StatisticsMatrixFunctionKind::corrMatrix>>;
}

View File

@ -1,13 +1,15 @@
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/AggregateFunctionWindowFunnel.h>
#include <AggregateFunctions/FactoryHelpers.h>
#include <AggregateFunctions/Helpers.h>
#include <Core/Settings.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDate32.h>
#include <DataTypes/DataTypeDateTime.h>
#include <base/range.h>
#include <unordered_set>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Common/assert_cast.h>
namespace DB
@ -18,11 +20,280 @@ namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int BAD_ARGUMENTS;
}
namespace
{
static constexpr size_t max_events = 32;
template <typename T>
struct AggregateFunctionWindowFunnelData
{
using TimestampEvent = std::pair<T, UInt8>;
using TimestampEvents = PODArrayWithStackMemory<TimestampEvent, 64>;
bool sorted = true;
TimestampEvents events_list;
size_t size() const
{
return events_list.size();
}
void add(T timestamp, UInt8 event)
{
/// Since most events should have already been sorted by timestamp.
if (sorted && events_list.size() > 0)
{
if (events_list.back().first == timestamp)
sorted = events_list.back().second <= event;
else
sorted = events_list.back().first <= timestamp;
}
events_list.emplace_back(timestamp, event);
}
void merge(const AggregateFunctionWindowFunnelData & other)
{
if (other.events_list.empty())
return;
const auto size = events_list.size();
events_list.insert(std::begin(other.events_list), std::end(other.events_list));
/// Either sort the whole container, or sort only the newly appended part and merge the sorted ranges afterwards.
if (!sorted && !other.sorted)
std::stable_sort(std::begin(events_list), std::end(events_list));
else
{
const auto begin = std::begin(events_list);
const auto middle = std::next(begin, size);
const auto end = std::end(events_list);
if (!sorted)
std::stable_sort(begin, middle);
if (!other.sorted)
std::stable_sort(middle, end);
std::inplace_merge(begin, middle, end);
}
sorted = true;
}
void sort()
{
if (!sorted)
{
std::stable_sort(std::begin(events_list), std::end(events_list));
sorted = true;
}
}
void serialize(WriteBuffer & buf) const
{
writeBinary(sorted, buf);
writeBinary(events_list.size(), buf);
for (const auto & events : events_list)
{
writeBinary(events.first, buf);
writeBinary(events.second, buf);
}
}
void deserialize(ReadBuffer & buf)
{
readBinary(sorted, buf);
size_t size;
readBinary(size, buf);
/// TODO Protection against huge size
events_list.clear();
events_list.reserve(size);
T timestamp;
UInt8 event;
for (size_t i = 0; i < size; ++i)
{
readBinary(timestamp, buf);
readBinary(event, buf);
events_list.emplace_back(timestamp, event);
}
}
};
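The merge() above avoids a full re-sort when one side is already ordered: it sorts only the unsorted run(s) and then combines the two sorted runs with std::inplace_merge. A self-contained sketch of that pattern (toy data and names, illustrative only, not from the source):

#include <algorithm>
#include <cstddef>
#include <vector>

int main()
{
    std::vector<int> events = {1, 3, 5};   /// already-sorted run
    std::vector<int> other = {4, 2};       /// unsorted run to append
    const std::ptrdiff_t old_size = events.size();
    events.insert(events.end(), other.begin(), other.end());

    const auto begin = events.begin();
    const auto middle = begin + old_size;
    std::stable_sort(middle, events.end());           /// sort only the appended part
    std::inplace_merge(begin, middle, events.end());  /// linear-time merge of two sorted runs
    /// events is now {1, 2, 3, 4, 5}
}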
/** Calculates the max event level in a sliding window.
* The max size of events is 32, that's enough for funnel analytics
*
* Usage:
* - windowFunnel(window)(timestamp, cond1, cond2, cond3, ....)
*/
template <typename T, typename Data>
class AggregateFunctionWindowFunnel final
: public IAggregateFunctionDataHelper<Data, AggregateFunctionWindowFunnel<T, Data>>
{
private:
UInt64 window;
UInt8 events_size;
/// When 'strict_deduplication' is set, the conditions are applied only to non-repeating values.
bool strict_deduplication;
/// When the 'strict_order' is set, it doesn't allow interventions of other events.
/// In the case of 'A->B->D->C', it stops finding 'A->B->C' at the 'D' and the max event level is 2.
bool strict_order;
/// Applies conditions only to events with strictly increasing timestamps
bool strict_increase;
/// Loops through the entire events_list, updating the event timestamps.
/// The level path must be 1---2---3---...---events_size; finds the max event level that satisfies the path in the sliding window.
/// If found, returns the max event level, otherwise returns 0.
/// The algorithm works in O(n) time, but the overall function works in O(n * log(n)) due to sorting.
UInt8 getEventLevel(Data & data) const
{
if (data.size() == 0)
return 0;
if (!strict_order && events_size == 1)
return 1;
data.sort();
/// events_timestamp stores the timestamps of the first and the previous i-th level event happening within the time window
std::vector<std::optional<std::pair<UInt64, UInt64>>> events_timestamp(events_size);
bool first_event = false;
for (size_t i = 0; i < data.events_list.size(); ++i)
{
const T & timestamp = data.events_list[i].first;
const auto & event_idx = data.events_list[i].second - 1;
if (strict_order && event_idx == -1)
{
if (first_event)
break;
else
continue;
}
else if (event_idx == 0)
{
events_timestamp[0] = std::make_pair(timestamp, timestamp);
first_event = true;
}
else if (strict_deduplication && events_timestamp[event_idx].has_value())
{
return data.events_list[i - 1].second;
}
else if (strict_order && first_event && !events_timestamp[event_idx - 1].has_value())
{
for (size_t event = 0; event < events_timestamp.size(); ++event)
{
if (!events_timestamp[event].has_value())
return event;
}
}
else if (events_timestamp[event_idx - 1].has_value())
{
auto first_timestamp = events_timestamp[event_idx - 1]->first;
bool time_matched = timestamp <= first_timestamp + window;
if (strict_increase)
time_matched = time_matched && events_timestamp[event_idx - 1]->second < timestamp;
if (time_matched)
{
events_timestamp[event_idx] = std::make_pair(first_timestamp, timestamp);
if (event_idx + 1 == events_size)
return events_size;
}
}
}
for (size_t event = events_timestamp.size(); event > 0; --event)
{
if (events_timestamp[event - 1].has_value())
return event;
}
return 0;
}
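/// A worked trace of getEventLevel on a toy input (illustrative, not from the source;
/// window = 5, events_size = 3, no strict options):
/// sorted events_list = [(1, 1), (3, 2), (10, 3)]
/// - (1, 1):  level-1 event, so events_timestamp[0] = (1, 1)
/// - (3, 2):  3 <= 1 + 5, so events_timestamp[1] = (1, 3)
/// - (10, 3): 10 > 1 + 5, the window is exceeded, so level 3 is not reached
/// The final backward scan finds events_timestamp[1] set and returns 2.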
public:
String getName() const override
{
return "windowFunnel";
}
AggregateFunctionWindowFunnel(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<Data, AggregateFunctionWindowFunnel<T, Data>>(arguments, params, std::make_shared<DataTypeUInt8>())
{
events_size = arguments.size() - 1;
window = params.at(0).safeGet<UInt64>();
strict_deduplication = false;
strict_order = false;
strict_increase = false;
for (size_t i = 1; i < params.size(); ++i)
{
String option = params.at(i).safeGet<String>();
if (option == "strict_deduplication")
strict_deduplication = true;
else if (option == "strict_order")
strict_order = true;
else if (option == "strict_increase")
strict_increase = true;
else if (option == "strict")
throw Exception(ErrorCodes::BAD_ARGUMENTS, "strict is replaced with strict_deduplication in Aggregate function {}", getName());
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} doesn't support a parameter: {}", getName(), option);
}
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, const size_t row_num, Arena *) const override
{
bool has_event = false;
const auto timestamp = assert_cast<const ColumnVector<T> *>(columns[0])->getData()[row_num];
/// reverse iteration and stable sorting are needed for events that are qualified by more than one condition.
for (auto i = events_size; i > 0; --i)
{
auto event = assert_cast<const ColumnVector<UInt8> *>(columns[i])->getData()[row_num];
if (event)
{
this->data(place).add(timestamp, i);
has_event = true;
}
}
if (strict_order && !has_event)
this->data(place).add(timestamp, 0);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnUInt8 &>(to).getData().push_back(getEventLevel(this->data(place)));
}
};
template <template <typename> class Data>
AggregateFunctionPtr
createAggregateFunctionWindowFunnel(const std::string & name, const DataTypes & arguments, const Array & params, const Settings *)

View File

@ -1,287 +0,0 @@
#pragma once
#include <unordered_set>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypeDateTime.h>
#include <DataTypes/DataTypesNumber.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Common/assert_cast.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
static constexpr size_t max_events = 32;
template <typename T>
struct AggregateFunctionWindowFunnelData
{
using TimestampEvent = std::pair<T, UInt8>;
using TimestampEvents = PODArrayWithStackMemory<TimestampEvent, 64>;
bool sorted = true;
TimestampEvents events_list;
size_t size() const
{
return events_list.size();
}
void add(T timestamp, UInt8 event)
{
/// Since most events should have already been sorted by timestamp.
if (sorted && events_list.size() > 0)
{
if (events_list.back().first == timestamp)
sorted = events_list.back().second <= event;
else
sorted = events_list.back().first <= timestamp;
}
events_list.emplace_back(timestamp, event);
}
void merge(const AggregateFunctionWindowFunnelData & other)
{
if (other.events_list.empty())
return;
const auto size = events_list.size();
events_list.insert(std::begin(other.events_list), std::end(other.events_list));
/// Either sort the whole container, or sort only the newly appended part and merge the sorted ranges afterwards.
if (!sorted && !other.sorted)
std::stable_sort(std::begin(events_list), std::end(events_list));
else
{
const auto begin = std::begin(events_list);
const auto middle = std::next(begin, size);
const auto end = std::end(events_list);
if (!sorted)
std::stable_sort(begin, middle);
if (!other.sorted)
std::stable_sort(middle, end);
std::inplace_merge(begin, middle, end);
}
sorted = true;
}
void sort()
{
if (!sorted)
{
std::stable_sort(std::begin(events_list), std::end(events_list));
sorted = true;
}
}
void serialize(WriteBuffer & buf) const
{
writeBinary(sorted, buf);
writeBinary(events_list.size(), buf);
for (const auto & events : events_list)
{
writeBinary(events.first, buf);
writeBinary(events.second, buf);
}
}
void deserialize(ReadBuffer & buf)
{
readBinary(sorted, buf);
size_t size;
readBinary(size, buf);
/// TODO Protection against huge size
events_list.clear();
events_list.reserve(size);
T timestamp;
UInt8 event;
for (size_t i = 0; i < size; ++i)
{
readBinary(timestamp, buf);
readBinary(event, buf);
events_list.emplace_back(timestamp, event);
}
}
};
/** Calculates the max event level in a sliding window.
* The max size of events is 32, that's enough for funnel analytics
*
* Usage:
* - windowFunnel(window)(timestamp, cond1, cond2, cond3, ....)
*/
template <typename T, typename Data>
class AggregateFunctionWindowFunnel final
: public IAggregateFunctionDataHelper<Data, AggregateFunctionWindowFunnel<T, Data>>
{
private:
UInt64 window;
UInt8 events_size;
/// When 'strict_deduplication' is set, the conditions are applied only to non-repeating values.
bool strict_deduplication;
/// When the 'strict_order' is set, it doesn't allow interventions of other events.
/// In the case of 'A->B->D->C', it stops finding 'A->B->C' at the 'D' and the max event level is 2.
bool strict_order;
/// Applies conditions only to events with strictly increasing timestamps
bool strict_increase;
/// Loops through the entire events_list, updating the event timestamps.
/// The level path must be 1---2---3---...---events_size; finds the max event level that satisfies the path in the sliding window.
/// If found, returns the max event level, otherwise returns 0.
/// The algorithm works in O(n) time, but the overall function works in O(n * log(n)) due to sorting.
UInt8 getEventLevel(Data & data) const
{
if (data.size() == 0)
return 0;
if (!strict_order && events_size == 1)
return 1;
data.sort();
/// events_timestamp stores the timestamps of the first and the previous i-th level event happening within the time window
std::vector<std::optional<std::pair<UInt64, UInt64>>> events_timestamp(events_size);
bool first_event = false;
for (size_t i = 0; i < data.events_list.size(); ++i)
{
const T & timestamp = data.events_list[i].first;
const auto & event_idx = data.events_list[i].second - 1;
if (strict_order && event_idx == -1)
{
if (first_event)
break;
else
continue;
}
else if (event_idx == 0)
{
events_timestamp[0] = std::make_pair(timestamp, timestamp);
first_event = true;
}
else if (strict_deduplication && events_timestamp[event_idx].has_value())
{
return data.events_list[i - 1].second;
}
else if (strict_order && first_event && !events_timestamp[event_idx - 1].has_value())
{
for (size_t event = 0; event < events_timestamp.size(); ++event)
{
if (!events_timestamp[event].has_value())
return event;
}
}
else if (events_timestamp[event_idx - 1].has_value())
{
auto first_timestamp = events_timestamp[event_idx - 1]->first;
bool time_matched = timestamp <= first_timestamp + window;
if (strict_increase)
time_matched = time_matched && events_timestamp[event_idx - 1]->second < timestamp;
if (time_matched)
{
events_timestamp[event_idx] = std::make_pair(first_timestamp, timestamp);
if (event_idx + 1 == events_size)
return events_size;
}
}
}
for (size_t event = events_timestamp.size(); event > 0; --event)
{
if (events_timestamp[event - 1].has_value())
return event;
}
return 0;
}
public:
String getName() const override
{
return "windowFunnel";
}
AggregateFunctionWindowFunnel(const DataTypes & arguments, const Array & params)
: IAggregateFunctionDataHelper<Data, AggregateFunctionWindowFunnel<T, Data>>(arguments, params, std::make_shared<DataTypeUInt8>())
{
events_size = arguments.size() - 1;
window = params.at(0).safeGet<UInt64>();
strict_deduplication = false;
strict_order = false;
strict_increase = false;
for (size_t i = 1; i < params.size(); ++i)
{
String option = params.at(i).safeGet<String>();
if (option == "strict_deduplication")
strict_deduplication = true;
else if (option == "strict_order")
strict_order = true;
else if (option == "strict_increase")
strict_increase = true;
else if (option == "strict")
throw Exception(ErrorCodes::BAD_ARGUMENTS, "strict is replaced with strict_deduplication in Aggregate function {}", getName());
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Aggregate function {} doesn't support a parameter: {}", getName(), option);
}
}
bool allocatesMemoryInArena() const override { return false; }
void add(AggregateDataPtr __restrict place, const IColumn ** columns, const size_t row_num, Arena *) const override
{
bool has_event = false;
const auto timestamp = assert_cast<const ColumnVector<T> *>(columns[0])->getData()[row_num];
/// reverse iteration and stable sorting are needed for events that are qualified by more than one condition.
for (auto i = events_size; i > 0; --i)
{
auto event = assert_cast<const ColumnVector<UInt8> *>(columns[i])->getData()[row_num];
if (event)
{
this->data(place).add(timestamp, i);
has_event = true;
}
}
if (strict_order && !has_event)
this->data(place).add(timestamp, 0);
}
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, Arena *) const override
{
this->data(place).merge(this->data(rhs));
}
void serialize(ConstAggregateDataPtr __restrict place, WriteBuffer & buf, std::optional<size_t> /* version */) const override
{
this->data(place).serialize(buf);
}
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
this->data(place).deserialize(buf);
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
{
assert_cast<ColumnUInt8 &>(to).getData().push_back(getEventLevel(this->data(place)));
}
};
}

View File

@ -1,477 +0,0 @@
#pragma once
#include <cmath>
#include <base/sort.h>
#include <Common/RadixSort.h>
#include <IO/WriteBuffer.h>
#include <IO/ReadBuffer.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int NOT_IMPLEMENTED;
}
template <typename T>
class ApproxSampler
{
public:
struct Stats
{
T value; // the sampled value
Int64 g; // the minimum rank jump from the previous value's minimum rank
Int64 delta; // the maximum span of the rank
Stats() = default;
Stats(T value_, Int64 g_, Int64 delta_) : value(value_), g(g_), delta(delta_) {}
};
struct QueryResult
{
size_t index;
Int64 rank;
T value;
QueryResult(size_t index_, Int64 rank_, T value_) : index(index_), rank(rank_), value(value_) { }
};
ApproxSampler() = default;
explicit ApproxSampler(
double relative_error_,
size_t compress_threshold_ = default_compress_threshold,
size_t count_ = 0,
bool compressed_ = false)
: relative_error(relative_error_)
, compress_threshold(compress_threshold_)
, count(count_)
, compressed(compressed_)
{
sampled.reserve(compress_threshold);
backup_sampled.reserve(compress_threshold);
head_sampled.reserve(default_head_size);
}
bool isCompressed() const { return compressed; }
void setCompressed() { compressed = true; }
void insert(T x)
{
head_sampled.push_back(x);
compressed = false;
if (head_sampled.size() >= default_head_size)
{
withHeadBufferInserted();
if (sampled.size() >= compress_threshold)
compress();
}
}
void query(const Float64 * percentiles, const size_t * indices, size_t size, T * result) const
{
if (!head_sampled.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot operate on an uncompressed summary, call compress() first");
if (sampled.empty())
{
for (size_t i = 0; i < size; ++i)
result[i] = T();
return;
}
Int64 current_max = std::numeric_limits<Int64>::min();
for (const auto & stats : sampled)
current_max = std::max(stats.delta + stats.g, current_max);
Int64 target_error = current_max / 2;
size_t index = 0;
auto min_rank = sampled[0].g;
for (size_t i = 0; i < size; ++i)
{
double percentile = percentiles[indices[i]];
if (percentile <= relative_error)
{
result[indices[i]] = sampled.front().value;
}
else if (percentile >= 1 - relative_error)
{
result[indices[i]] = sampled.back().value;
}
else
{
QueryResult res = findApproxQuantile(index, min_rank, target_error, percentile);
index = res.index;
min_rank = res.rank;
result[indices[i]] = res.value;
}
}
}
void compress()
{
if (compressed)
return;
withHeadBufferInserted();
doCompress(2 * relative_error * count);
compressed = true;
}
void merge(const ApproxSampler & other)
{
if (other.count == 0)
return;
else if (count == 0)
{
compress_threshold = other.compress_threshold;
relative_error = other.relative_error;
count = other.count;
compressed = other.compressed;
sampled.resize(other.sampled.size());
memcpy(sampled.data(), other.sampled.data(), sizeof(Stats) * other.sampled.size());
return;
}
else
{
// Merge the two buffers.
// The GK algorithm is a bit unclear about it, but we need to adjust the statistics during the
// merging. The main idea is that samples that come from one side will suffer from the lack of
// precision of the other.
// As a concrete example, take two QuantileSummaries whose samples (value, g, delta) are:
// `a = [(0, 1, 0), (20, 99, 0)]` and `b = [(10, 1, 0), (30, 49, 0)]`
// This means `a` has 100 values, whose minimum is 0 and maximum is 20,
// while `b` has 50 values, between 10 and 30.
// The resulting samples of the merge will be:
// a+b = [(0, 1, 0), (10, 1, ??), (20, 99, ??), (30, 49, 0)]
// The values of `g` do not change, as they represent the minimum number of values between two
// consecutive samples. The values of `delta` should be adjusted, however.
// Take the case of the sample `10` from `b`. In the original stream, it could have appeared
// right after `0` (as expressed by `g=1`) or right before `20`, so `delta=99+0-1=98`.
// In the GK algorithm's style of working in terms of maximum bounds, one can observe that the
// maximum additional uncertainty over samples coming from `b` is `max(g_a + delta_a) =
// floor(2 * eps_a * n_a)`. Likewise, additional uncertainty over samples from `a` is
// `floor(2 * eps_b * n_b)`.
// Only samples that interleave the other side are affected. That means that samples from
// one side that are lesser (or greater) than all samples from the other side are just copied
// unmodified.
// If the merging instances have different `relativeError`, the resulting instance will carry
// the largest one: `eps_ab = max(eps_a, eps_b)`.
// The main invariant of the GK algorithm is kept:
// `max(g_ab + delta_ab) <= floor(2 * eps_ab * (n_a + n_b))` since
// `max(g_ab + delta_ab) <= floor(2 * eps_a * n_a) + floor(2 * eps_b * n_b)`
// Finally, one can see how the `insert(x)` operation can be expressed as `merge([(x, 1, 0])`
compress();
backup_sampled.clear();
backup_sampled.reserve(sampled.size() + other.sampled.size());
double merged_relative_error = std::max(relative_error, other.relative_error);
size_t merged_count = count + other.count;
Int64 additional_self_delta = static_cast<Int64>(std::floor(2 * other.relative_error * other.count));
Int64 additional_other_delta = static_cast<Int64>(std::floor(2 * relative_error * count));
// Do a merge of two sorted lists until one of the lists is fully consumed
size_t self_idx = 0;
size_t other_idx = 0;
while (self_idx < sampled.size() && other_idx < other.sampled.size())
{
const Stats & self_sample = sampled[self_idx];
const Stats & other_sample = other.sampled[other_idx];
// Detect next sample
Stats next_sample;
Int64 additional_delta = 0;
if (self_sample.value < other_sample.value)
{
++self_idx;
next_sample = self_sample;
additional_delta = other_idx > 0 ? additional_self_delta : 0;
}
else
{
++other_idx;
next_sample = other_sample;
additional_delta = self_idx > 0 ? additional_other_delta : 0;
}
// Insert it
next_sample.delta += additional_delta;
backup_sampled.emplace_back(std::move(next_sample));
}
// Copy the remaining samples from the other list
// (by construction, at most one `while` loop will run)
while (self_idx < sampled.size())
{
backup_sampled.emplace_back(sampled[self_idx]);
++self_idx;
}
while (other_idx < other.sampled.size())
{
backup_sampled.emplace_back(other.sampled[other_idx]);
++other_idx;
}
std::swap(sampled, backup_sampled);
relative_error = merged_relative_error;
count = merged_count;
compress_threshold = other.compress_threshold;
doCompress(2 * merged_relative_error * merged_count);
compressed = true;
}
}
void write(WriteBuffer & buf) const
{
writeBinaryLittleEndian(compress_threshold, buf);
writeBinaryLittleEndian(relative_error, buf);
writeBinaryLittleEndian(count, buf);
writeBinaryLittleEndian(sampled.size(), buf);
for (const auto & stats : sampled)
{
writeBinaryLittleEndian(stats.value, buf);
writeBinaryLittleEndian(stats.g, buf);
writeBinaryLittleEndian(stats.delta, buf);
}
}
void read(ReadBuffer & buf)
{
readBinaryLittleEndian(compress_threshold, buf);
readBinaryLittleEndian(relative_error, buf);
readBinaryLittleEndian(count, buf);
size_t sampled_len = 0;
readBinaryLittleEndian(sampled_len, buf);
sampled.resize(sampled_len);
for (size_t i = 0; i < sampled_len; ++i)
{
/// Take a reference: otherwise the deserialized values would be written into a discarded copy.
auto & stats = sampled[i];
readBinaryLittleEndian(stats.value, buf);
readBinaryLittleEndian(stats.g, buf);
readBinaryLittleEndian(stats.delta, buf);
}
}
private:
QueryResult findApproxQuantile(size_t index, Int64 min_rank_at_index, double target_error, double percentile) const
{
Stats curr_sample = sampled[index];
Int64 rank = static_cast<Int64>(std::ceil(percentile * count));
size_t i = index;
Int64 min_rank = min_rank_at_index;
while (i < sampled.size() - 1)
{
Int64 max_rank = min_rank + curr_sample.delta;
if (max_rank - target_error <= rank && rank <= min_rank + target_error)
return {i, min_rank, curr_sample.value};
else
{
++i;
curr_sample = sampled[i];
min_rank += curr_sample.g;
}
}
return {sampled.size() - 1, 0, sampled.back().value};
}
void withHeadBufferInserted()
{
if (head_sampled.empty())
return;
bool use_radix_sort = head_sampled.size() >= 256 && (is_arithmetic_v<T> && !is_big_int_v<T>);
if (use_radix_sort)
RadixSort<RadixSortNumTraits<T>>::executeLSD(head_sampled.data(), head_sampled.size());
else
::sort(head_sampled.begin(), head_sampled.end());
backup_sampled.clear();
backup_sampled.reserve(sampled.size() + head_sampled.size());
size_t sample_idx = 0;
size_t ops_idx = 0;
size_t current_count = count;
for (; ops_idx < head_sampled.size(); ++ops_idx)
{
T current_sample = head_sampled[ops_idx];
// Add all the samples before the next observation.
while (sample_idx < sampled.size() && sampled[sample_idx].value <= current_sample)
{
backup_sampled.emplace_back(sampled[sample_idx]);
++sample_idx;
}
// If it is the first one to insert, or if it is the last one
++current_count;
Int64 delta;
if (backup_sampled.empty() || (sample_idx == sampled.size() && ops_idx == (head_sampled.size() - 1)))
delta = 0;
else
delta = static_cast<Int64>(std::floor(2 * relative_error * current_count));
backup_sampled.emplace_back(current_sample, 1, delta);
}
// Add all the remaining existing samples
for (; sample_idx < sampled.size(); ++sample_idx)
backup_sampled.emplace_back(sampled[sample_idx]);
std::swap(sampled, backup_sampled);
head_sampled.clear();
count = current_count;
}
void doCompress(double merge_threshold)
{
if (sampled.empty())
return;
backup_sampled.clear();
// Start for the last element, which is always part of the set.
// The head contains the current new head, that may be merged with the current element.
Stats head = sampled.back();
ssize_t i = sampled.size() - 2;
// Do not compress the last element
while (i >= 1)
{
// The current sample:
const auto & sample1 = sampled[i];
// Do we need to compress?
if (sample1.g + head.g + head.delta < merge_threshold)
{
// Do not insert yet, just merge the current element into the head.
head.g += sample1.g;
}
else
{
// Prepend the current head, and keep the current sample as target for merging.
backup_sampled.push_back(head);
head = sample1;
}
--i;
}
backup_sampled.push_back(head);
// If necessary, add the minimum element:
auto curr_head = sampled.front();
// don't add the minimum element if `sampled` has only one element (both `curr_head` and
// `head` point to the same element)
if (curr_head.value <= head.value && sampled.size() > 1)
backup_sampled.emplace_back(sampled.front());
std::reverse(backup_sampled.begin(), backup_sampled.end());
std::swap(sampled, backup_sampled);
}
double relative_error;
size_t compress_threshold;
size_t count = 0;
bool compressed;
PaddedPODArray<Stats> sampled;
PaddedPODArray<Stats> backup_sampled;
PaddedPODArray<T> head_sampled;
static constexpr size_t default_compress_threshold = 10000;
static constexpr size_t default_head_size = 50000;
};
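For reference, a minimal usage sketch of the API defined above (the input values are illustrative):

/// Assumes ApproxSampler<Float64> from above is in scope.
ApproxSampler<Float64> sampler(0.01);   /// 1% relative rank error
for (size_t i = 1; i <= 1000; ++i)
    sampler.insert(static_cast<Float64>(i));
sampler.compress();                     /// query() requires a compressed summary

Float64 levels[1] = {0.5};
size_t indices[1] = {0};
Float64 result[1] = {};
sampler.query(levels, indices, 1, result);
/// result[0] is within ~1% rank error of the true median (about 500).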
template <typename Value>
class QuantileGK
{
private:
using Data = ApproxSampler<Value>;
mutable Data data;
public:
QuantileGK() = default;
explicit QuantileGK(size_t accuracy) : data(1.0 / static_cast<double>(accuracy)) { }
void add(const Value & x)
{
data.insert(x);
}
template <typename Weight>
void add(const Value &, const Weight &)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method add with weight is not implemented for GKSampler");
}
void merge(const QuantileGK & rhs)
{
if (!data.isCompressed())
data.compress();
data.merge(rhs.data);
}
void serialize(WriteBuffer & buf) const
{
/// Always compress before serialization
if (!data.isCompressed())
data.compress();
data.write(buf);
}
void deserialize(ReadBuffer & buf)
{
data.read(buf);
data.setCompressed();
}
/// Get the value of the `level` quantile. The level must be between 0 and 1.
Value get(Float64 level)
{
if (!data.isCompressed())
data.compress();
Value res;
size_t index = 0;
data.query(&level, &index, 1, &res);
return res;
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of index levels such that the corresponding elements will go in ascending order.
void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result)
{
if (!data.isCompressed())
data.compress();
data.query(levels, indices, size, result);
}
Float64 getFloat64(Float64 /*level*/)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getFloat64 is not implemented for GKSampler");
}
void getManyFloat(const Float64 * /*levels*/, const size_t * /*indices*/, size_t /*size*/, Float64 * /*result*/)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getManyFloat is not implemented for GKSampler");
}
};
}

View File

@ -1,203 +0,0 @@
#pragma once
#include <base/sort.h>
#include <Common/HashTable/HashMap.h>
#include <Common/NaNUtils.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/** Calculates quantile by counting number of occurrences for each value in a hash map.
*
* It uses O(distinct(N)) memory. Can be naturally applied for values with weight.
* In case of many identical values, it can be more efficient than QuantileExact even when weight is not used.
*/
template <typename Value>
struct QuantileExactWeighted
{
struct Int128Hash
{
size_t operator()(Int128 x) const
{
return CityHash_v1_0_2::Hash128to64({x >> 64, x & 0xffffffffffffffffll});
}
};
using Weight = UInt64;
using UnderlyingType = NativeType<Value>;
using Hasher = HashCRC32<UnderlyingType>;
/// When creating, the hash table must be small.
using Map = HashMapWithStackMemory<UnderlyingType, Weight, Hasher, 4>;
Map map;
void add(const Value & x)
{
/// We must skip NaNs as they are not compatible with comparison sorting.
if (!isNaN(x))
++map[x];
}
void add(const Value & x, Weight weight)
{
if (!isNaN(x))
map[x] += weight;
}
void merge(const QuantileExactWeighted & rhs)
{
for (const auto & pair : rhs.map)
map[pair.getKey()] += pair.getMapped();
}
void serialize(WriteBuffer & buf) const
{
map.write(buf);
}
void deserialize(ReadBuffer & buf)
{
typename Map::Reader reader(buf);
while (reader.next())
{
const auto & pair = reader.get();
map[pair.first] = pair.second;
}
}
/// Get the value of the `level` quantile. The level must be between 0 and 1.
Value get(Float64 level) const
{
size_t size = map.size();
if (0 == size)
return std::numeric_limits<Value>::quiet_NaN();
/// Copy the data to a temporary array to get the element you need in order.
using Pair = typename Map::value_type;
std::unique_ptr<Pair[]> array_holder(new Pair[size]);
Pair * array = array_holder.get();
/// Note: 64-bit integer weight can overflow.
/// We do some implementation specific behaviour (return approximate or garbage results).
/// Float64 is used as accumulator here to get approximate results.
/// But weight can be already overflowed in computations in 'add' and 'merge' methods.
/// It will be reasonable to change the type of weight to Float64 in the map,
/// but we don't do that for compatibility of serialized data.
size_t i = 0;
Float64 sum_weight = 0;
for (const auto & pair : map)
{
sum_weight += pair.getMapped();
array[i] = pair.getValue();
++i;
}
::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 threshold = std::ceil(sum_weight * level);
Float64 accumulated = 0;
const Pair * it = array;
const Pair * end = array + size;
while (it < end)
{
accumulated += it->second;
if (accumulated >= threshold)
break;
++it;
}
if (it == end)
--it;
return it->first;
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of index levels such that the corresponding elements will go in ascending order.
void getMany(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const
{
size_t size = map.size();
if (0 == size)
{
for (size_t i = 0; i < num_levels; ++i)
result[i] = Value();
return;
}
/// Copy the data to a temporary array to get the element you need in order.
using Pair = typename Map::value_type;
std::unique_ptr<Pair[]> array_holder(new Pair[size]);
Pair * array = array_holder.get();
size_t i = 0;
Float64 sum_weight = 0;
for (const auto & pair : map)
{
sum_weight += pair.getMapped();
array[i] = pair.getValue();
++i;
}
::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 accumulated = 0;
const Pair * it = array;
const Pair * end = array + size;
size_t level_index = 0;
Float64 threshold = std::ceil(sum_weight * levels[indices[level_index]]);
while (it < end)
{
accumulated += it->second;
while (accumulated >= threshold)
{
result[indices[level_index]] = it->first;
++level_index;
if (level_index == num_levels)
return;
threshold = std::ceil(sum_weight * levels[indices[level_index]]);
}
++it;
}
while (level_index < num_levels)
{
result[indices[level_index]] = array[size - 1].first;
++level_index;
}
}
/// Float variants are not implemented for this data structure.
Float64 getFloat(Float64) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getFloat is not implemented for QuantileExactWeighted");
}
void getManyFloat(const Float64 *, const size_t *, size_t, Float64 *) const
{
    throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getManyFloat is not implemented for QuantileExactWeighted");
}
};
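The get() method above boils down to: sort the (value, weight) pairs, then walk the cumulative weight until it reaches ceil(sum_weight * level). A standalone sketch with toy data (values and weights are illustrative, not from the source):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    /// (value, weight) pairs, e.g. value 3.0 was seen with total weight 6.
    std::vector<std::pair<double, double>> pairs = {{3.0, 6}, {1.0, 3}, {2.0, 1}};
    std::sort(pairs.begin(), pairs.end());

    double sum_weight = 0;
    for (const auto & p : pairs)
        sum_weight += p.second;

    const double level = 0.5;
    const double threshold = std::ceil(sum_weight * level);  /// ceil(10 * 0.5) = 5

    double accumulated = 0;
    for (const auto & [value, weight] : pairs)
    {
        accumulated += weight;  /// 3, 4, 10 after each step
        if (accumulated >= threshold)
        {
            std::printf("quantile(0.5) = %g\n", value);  /// prints 3
            break;
        }
    }
}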
}

View File

@ -1,308 +0,0 @@
#pragma once
#include <base/sort.h>
#include <Common/HashTable/HashMap.h>
#include <Common/NaNUtils.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/** Approximates Quantile by:
* - sorting input values and weights
* - building a cumulative distribution based on weights
* - performing linear interpolation between the weights and values
*
*/
template <typename Value>
struct QuantileInterpolatedWeighted
{
struct Int128Hash
{
size_t operator()(Int128 x) const
{
return CityHash_v1_0_2::Hash128to64({x >> 64, x & 0xffffffffffffffffll});
}
};
using Weight = UInt64;
using UnderlyingType = NativeType<Value>;
using Hasher = HashCRC32<UnderlyingType>;
/// When creating, the hash table must be small.
using Map = HashMapWithStackMemory<UnderlyingType, Weight, Hasher, 4>;
Map map;
void add(const Value & x)
{
/// We must skip NaNs as they are not compatible with comparison sorting.
if (!isNaN(x))
++map[x];
}
void add(const Value & x, Weight weight)
{
if (!isNaN(x))
map[x] += weight;
}
void merge(const QuantileInterpolatedWeighted & rhs)
{
for (const auto & pair : rhs.map)
map[pair.getKey()] += pair.getMapped();
}
void serialize(WriteBuffer & buf) const
{
map.write(buf);
}
void deserialize(ReadBuffer & buf)
{
typename Map::Reader reader(buf);
while (reader.next())
{
const auto & pair = reader.get();
map[pair.first] = pair.second;
}
}
Value get(Float64 level) const
{
return getImpl<Value>(level);
}
void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result) const
{
getManyImpl<Value>(levels, indices, size, result);
}
/// Float variants are not implemented for this data structure.
Float64 getFloat(Float64) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getFloat is not implemented for QuantileInterpolatedWeighted");
}
void getManyFloat(const Float64 *, const size_t *, size_t, Float64 *) const
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getManyFloat is not implemented for QuantileInterpolatedWeighted");
}
private:
using Pair = typename std::pair<UnderlyingType, Float64>;
/// Get the value of the `level` quantile. The level must be between 0 and 1.
template <typename T>
T getImpl(Float64 level) const
{
size_t size = map.size();
if (0 == size)
return std::numeric_limits<Value>::quiet_NaN();
/// Maintain a vector of pair of values and weights for easier sorting and for building
/// a cumulative distribution using the provided weights.
std::vector<Pair> value_weight_pairs;
value_weight_pairs.reserve(size);
/// Note: the weight provided must be a 64-bit integer.
/// Float64 is used as the accumulator here to get approximate results.
/// But the weight in the internal array is stored as Float64, since the
/// quantile estimation involves division and requires Float64 precision.
Float64 sum_weight = 0;
for (const auto & pair : map)
{
sum_weight += pair.getMapped();
auto value = pair.getKey();
auto weight = pair.getMapped();
value_weight_pairs.push_back({value, weight});
}
::sort(value_weight_pairs.begin(), value_weight_pairs.end(), [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 accumulated = 0;
/// vector for populating and storing the cumulative sum using the provided weights.
/// example: [0,1,2,3,4,5] -> [0,1,3,6,10,15]
std::vector<Float64> weights_cum_sum;
weights_cum_sum.reserve(size);
for (size_t idx = 0; idx < size; ++idx)
{
accumulated += value_weight_pairs[idx].second;
weights_cum_sum.push_back(accumulated);
}
/// The following estimation of quantile is general and the idea is:
/// https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method
/// calculates a simple cumulative distribution based on weights
if (sum_weight != 0)
{
for (size_t idx = 0; idx < size; ++idx)
value_weight_pairs[idx].second = (weights_cum_sum[idx] - 0.5 * value_weight_pairs[idx].second) / sum_weight;
}
/// perform linear interpolation
size_t idx = 0;
if (size >= 2)
{
if (level >= value_weight_pairs[size - 2].second)
{
idx = size - 2;
}
else
{
size_t start = 0, end = size - 1;
while (start <= end)
{
size_t mid = start + (end - start) / 2;
if (mid > size)
break;
if (level > value_weight_pairs[mid + 1].second)
start = mid + 1;
else
{
idx = mid;
end = mid - 1;
}
}
}
}
size_t l = idx;
size_t u = idx + 1 < size ? idx + 1 : idx;
Float64 xl = value_weight_pairs[l].second, xr = value_weight_pairs[u].second;
UnderlyingType yl = value_weight_pairs[l].first, yr = value_weight_pairs[u].first;
if (level < xl)
yr = yl;
if (level > xr)
yl = yr;
return static_cast<T>(interpolate(level, xl, xr, yl, yr));
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of index levels such that the corresponding elements will go in ascending order.
template <typename T>
void getManyImpl(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const
{
size_t size = map.size();
if (0 == size)
{
for (size_t i = 0; i < num_levels; ++i)
result[i] = Value();
return;
}
std::vector<Pair> value_weight_pairs;
value_weight_pairs.reserve(size);
Float64 sum_weight = 0;
for (const auto & pair : map)
{
sum_weight += pair.getMapped();
auto value = pair.getKey();
auto weight = pair.getMapped();
value_weight_pairs.push_back({value, weight});
}
::sort(value_weight_pairs.begin(), value_weight_pairs.end(), [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 accumulated = 0;
/// vector for populating and storing the cumulative sum using the provided weights.
/// example: [0,1,2,3,4,5] -> [0,1,3,6,10,15]
std::vector<Float64> weights_cum_sum;
weights_cum_sum.reserve(size);
for (size_t idx = 0; idx < size; ++idx)
{
accumulated += value_weight_pairs[idx].second;
weights_cum_sum.emplace_back(accumulated);
}
/// The following estimation of quantile is general and the idea is:
/// https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method
/// calculates a simple cumulative distribution based on weights
if (sum_weight != 0)
{
for (size_t idx = 0; idx < size; ++idx)
value_weight_pairs[idx].second = (weights_cum_sum[idx] - 0.5 * value_weight_pairs[idx].second) / sum_weight;
}
for (size_t level_index = 0; level_index < num_levels; ++level_index)
{
/// perform linear interpolation for every level
auto level = levels[indices[level_index]];
size_t idx = 0;
if (size >= 2)
{
if (level >= value_weight_pairs[size - 2].second)
{
idx = size - 2;
}
else
{
size_t start = 0, end = size - 1;
while (start <= end)
{
size_t mid = start + (end - start) / 2;
if (mid > size)
break;
if (level > value_weight_pairs[mid + 1].second)
start = mid + 1;
else
{
idx = mid;
end = mid - 1;
}
}
}
}
size_t l = idx;
size_t u = idx + 1 < size ? idx + 1 : idx;
Float64 xl = value_weight_pairs[l].second, xr = value_weight_pairs[u].second;
UnderlyingType yl = value_weight_pairs[l].first, yr = value_weight_pairs[u].first;
if (level < xl)
yr = yl;
if (level > xr)
yl = yr;
result[indices[level_index]] = static_cast<T>(interpolate(level, xl, xr, yl, yr));
}
}
/// This ignores overflows or NaNs that might arise during add, sub and mul operations and doesn't aim to provide exact
/// results since the `quantileInterpolatedWeighted` function itself relies mainly on approximation.
UnderlyingType NO_SANITIZE_UNDEFINED interpolate(Float64 level, Float64 xl, Float64 xr, UnderlyingType yl, UnderlyingType yr) const
{
UnderlyingType dy = yr - yl;
Float64 dx = xr - xl;
dx = dx == 0 ? 1 : dx; /// to handle NaN behavior that might arise during integer division below.
/// yl + (dy / dx) * (level - xl)
return static_cast<UnderlyingType>(yl + (dy / dx) * (level - xl));
}
};
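The interpolate() helper above is plain linear interpolation between the two cumulative-weight positions that bracket the requested level. A toy evaluation of the same formula (the numbers are illustrative):

#include <cassert>

/// yl + (dy / dx) * (level - xl), as in interpolate() above.
static double interpolate(double level, double xl, double xr, double yl, double yr)
{
    double dx = xr - xl;
    dx = dx == 0 ? 1 : dx;  /// guard against division by zero
    return yl + ((yr - yl) / dx) * (level - xl);
}

int main()
{
    /// level 0.5 sits halfway between positions 0.25 and 0.75,
    /// so the result is halfway between the values 10 and 20.
    assert(interpolate(0.5, 0.25, 0.75, 10.0, 20.0) == 15.0);
}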
}

View File

@ -1,102 +0,0 @@
#pragma once
#include <AggregateFunctions/ReservoirSampler.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/** Quantile calculation with "reservoir sample" algorithm.
* It collects pseudorandom subset of limited size from a stream of values,
* and approximate quantile from it.
* The result is non-deterministic. Also look at QuantileReservoirSamplerDeterministic.
*
* This algorithm is quite inefficient in terms of precision relative to memory usage,
* but very efficient in CPU (though less efficient than QuantileTiming and QuantileExact for small sets).
*/
template <typename Value>
struct QuantileReservoirSampler
{
using Data = ReservoirSampler<Value, ReservoirSamplerOnEmpty::RETURN_NAN_OR_ZERO>;
Data data;
void add(const Value & x)
{
data.insert(x);
}
template <typename Weight>
void add(const Value &, const Weight &)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method add with weight is not implemented for ReservoirSampler");
}
void merge(const QuantileReservoirSampler & rhs)
{
data.merge(rhs.data);
}
void serialize(WriteBuffer & buf) const
{
data.write(buf);
}
void deserialize(ReadBuffer & buf)
{
data.read(buf);
}
/// Get the value of the `level` quantile. The level must be between 0 and 1.
Value get(Float64 level)
{
if (data.empty())
return {};
if constexpr (is_decimal<Value>)
return Value(static_cast<typename Value::NativeType>(data.quantileInterpolated(level)));
else
return static_cast<Value>(data.quantileInterpolated(level));
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of index levels such that the corresponding elements will go in ascending order.
void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result)
{
bool is_empty = data.empty();
for (size_t i = 0; i < size; ++i)
{
if (is_empty)
{
result[i] = Value{};
}
else
{
if constexpr (is_decimal<Value>)
result[indices[i]] = Value(static_cast<typename Value::NativeType>(data.quantileInterpolated(levels[indices[i]])));
else
result[indices[i]] = Value(data.quantileInterpolated(levels[indices[i]]));
}
}
}
/// The same, but in the case of an empty state, NaN is returned.
Float64 getFloat(Float64 level)
{
return data.quantileInterpolated(level);
}
void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result)
{
for (size_t i = 0; i < size; ++i)
result[indices[i]] = data.quantileInterpolated(levels[indices[i]]);
}
};
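The ReservoirSampler class itself is not part of this diff; the classic reservoir-sampling idea it refers to can be sketched as Algorithm R (a generic illustration under that assumption, not the actual implementation):

#include <cstddef>
#include <cstdlib>
#include <vector>

/// Keep a uniform random subset of at most `capacity` values from a stream.
struct Reservoir
{
    size_t capacity = 8192;   /// illustrative; the real sample size is configurable
    size_t seen = 0;
    std::vector<double> sample;

    void insert(double x)
    {
        ++seen;
        if (sample.size() < capacity)
            sample.push_back(x);
        else if (size_t pos = std::rand() % seen; pos < capacity)  /// rand() is biased; fine for a sketch
            sample[pos] = x;  /// each element is kept with probability capacity / seen
    }
};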
}

View File

@ -1,102 +0,0 @@
#pragma once
#include <AggregateFunctions/ReservoirSamplerDeterministic.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/** Quantile calculation with "reservoir sample" algorithm.
* It collects pseudorandom subset of limited size from a stream of values,
* and approximate quantile from it.
* The function accepts a second argument, named "determinator":
* a hash of it is calculated and used as the source of randomness
* for the sampling.
* The function is deterministic, but care should be taken with the choice of the "determinator" argument.
*/
template <typename Value>
struct QuantileReservoirSamplerDeterministic
{
using Data = ReservoirSamplerDeterministic<Value, ReservoirSamplerDeterministicOnEmpty::RETURN_NAN_OR_ZERO>;
Data data;
void add(const Value &)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method add without determinator is not implemented for ReservoirSamplerDeterministic");
}
template <typename Determinator>
void add(const Value & x, const Determinator & determinator)
{
data.insert(x, determinator);
}
void merge(const QuantileReservoirSamplerDeterministic & rhs)
{
data.merge(rhs.data);
}
void serialize(WriteBuffer & buf) const
{
data.write(buf);
}
void deserialize(ReadBuffer & buf)
{
data.read(buf);
}
/// Get the value of the `level` quantile. The level must be between 0 and 1.
Value get(Float64 level)
{
if (data.empty())
return {};
if constexpr (is_decimal<Value>)
return static_cast<typename Value::NativeType>(data.quantileInterpolated(level));
else
return static_cast<Value>(data.quantileInterpolated(level));
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of index levels such that the corresponding elements will go in ascending order.
void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result)
{
bool is_empty = data.empty();
for (size_t i = 0; i < size; ++i)
{
if (is_empty)
{
result[i] = Value{};
}
else
{
if constexpr (is_decimal<Value>)
result[indices[i]] = static_cast<typename Value::NativeType>(data.quantileInterpolated(levels[indices[i]]));
else
result[indices[i]] = static_cast<Value>(data.quantileInterpolated(levels[indices[i]]));
}
}
}
/// The same, but in the case of an empty state, NaN is returned.
Float64 getFloat(Float64 level)
{
return data.quantileInterpolated(level);
}
void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result)
{
for (size_t i = 0; i < size; ++i)
result[indices[i]] = data.quantileInterpolated(levels[indices[i]]);
}
};
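ReservoirSamplerDeterministic is likewise not shown in this diff; the key difference from the sketch above is that the randomness is derived from a hash of the user-supplied determinator, so the same input always produces the same sample. A hypothetical illustration of that substitution:

#include <cstddef>
#include <functional>
#include <vector>

struct DeterministicReservoir
{
    size_t capacity = 8192;   /// illustrative
    size_t seen = 0;
    std::vector<double> sample;

    /// The randomness comes from the determinator rather than an RNG,
    /// so repeated runs over the same data produce identical samples.
    void insert(double x, unsigned long long determinator)
    {
        ++seen;
        size_t rnd = std::hash<unsigned long long>{}(determinator);
        if (sample.size() < capacity)
            sample.push_back(x);
        else if (rnd % seen < capacity)
            sample[rnd % seen] = x;
    }
};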
}

View File

@ -4,7 +4,7 @@
#include <Common/HyperLogLogBiasEstimator.h>
#include <Common/CompactArray.h>
#include <Common/HashTable/Hash.h>
#include <Common/TransformEndianness.hpp>
#include <Common/transformEndianness.h>
#include <IO/ReadBuffer.h>
#include <IO/WriteBuffer.h>

View File

@ -13,8 +13,6 @@
* (~ 700 MB/sec, 15 million strings per second)
*/
#include "TransformEndianness.hpp"
#include <bit>
#include <string>
#include <type_traits>
@ -24,9 +22,11 @@
#include <base/unaligned.h>
#include <base/hex.h>
#include <Common/Exception.h>
#include <Common/transformEndianness.h>
#include <city.h>
namespace DB::ErrorCodes
{
extern const int LOGICAL_ERROR;

View File

@ -19,7 +19,7 @@
#include <DataTypes/DataTypesDecimal.h>
#include <DataTypes/DataTypesNumber.h>
#include <Common/TransformEndianness.hpp>
#include <Common/transformEndianness.h>
#include <Common/memcpySmall.h>
#include <Common/typeid_cast.h>

View File

@ -17,7 +17,7 @@
#include <Common/DateLUT.h>
#include <Common/LocalDate.h>
#include <Common/LocalDateTime.h>
#include <Common/TransformEndianness.hpp>
#include <Common/transformEndianness.h>
#include <base/StringRef.h>
#include <base/arithmeticOverflow.h>
#include <base/sort.h>

View File

@ -15,7 +15,7 @@
#include <Common/DateLUT.h>
#include <Common/LocalDate.h>
#include <Common/LocalDateTime.h>
#include <Common/TransformEndianness.hpp>
#include <Common/transformEndianness.h>
#include <base/find_symbols.h>
#include <base/StringRef.h>
#include <base/DecomposedFloat.h>