From a2f9329e18c07ffbdcf63492e8176129e06e6316 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Mon, 16 Sep 2024 13:56:43 +0800 Subject: [PATCH] support quantileExactWeightedInterpolated --- .../AggregateFunctionQuantile.h | 3 + ...AggregateFunctionQuantileExactWeighted.cpp | 237 ++++++++++++++++-- 2 files changed, 214 insertions(+), 26 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionQuantile.h b/src/AggregateFunctions/AggregateFunctionQuantile.h index 423fd4bc569..aa6755f237d 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantile.h +++ b/src/AggregateFunctions/AggregateFunctionQuantile.h @@ -312,6 +312,9 @@ struct NameQuantilesExactInclusive { static constexpr auto name = "quantilesExac struct NameQuantileExactWeighted { static constexpr auto name = "quantileExactWeighted"; }; struct NameQuantilesExactWeighted { static constexpr auto name = "quantilesExactWeighted"; }; +struct NameQuantileExactWeightedInterpolated { static constexpr auto name = "quantileExactWeightedInterpolated"; }; +struct NameQuantilesExactWeightedInterpolated { static constexpr auto name = "quantilesExactWeightedInterpolated"; }; + struct NameQuantileInterpolatedWeighted { static constexpr auto name = "quantileInterpolatedWeighted"; }; struct NameQuantilesInterpolatedWeighted { static constexpr auto name = "quantilesInterpolatedWeighted"; }; diff --git a/src/AggregateFunctions/AggregateFunctionQuantileExactWeighted.cpp b/src/AggregateFunctions/AggregateFunctionQuantileExactWeighted.cpp index 469abdf45a2..85acac8cb50 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantileExactWeighted.cpp +++ b/src/AggregateFunctions/AggregateFunctionQuantileExactWeighted.cpp @@ -29,7 +29,7 @@ namespace * It uses O(distinct(N)) memory. Can be naturally applied for values with weight. * In case of many identical values, it can be more efficient than QuantileExact even when weight is not used. */ -template +template struct QuantileExactWeighted { struct Int128Hash @@ -46,6 +46,7 @@ struct QuantileExactWeighted /// When creating, the hash table must be small. using Map = HashMapWithStackMemory; + using Pair = typename Map::value_type; Map map; @@ -85,6 +86,42 @@ struct QuantileExactWeighted /// Get the value of the `level` quantile. The level must be between 0 and 1. Value get(Float64 level) const + { + if constexpr (interpolated) + return getInterpolatedImpl(level); + else + return getImpl(level); + } + + /// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address. + /// indices - an array of index levels such that the corresponding elements will go in ascending order. + void getMany(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const + { + if constexpr (interpolated) + getManyInterpolatedImpl(levels, indices, num_levels, result); + else + getManyImpl(levels, indices, num_levels, result); + } + + Float64 getFloat(Float64 level) const + { + if constexpr (interpolated) + return getFloatInterpolatedImpl(level); + else + return getFloatImpl(level); + } + + void getManyFloat(const Float64 * levels, const size_t * indices, size_t num_levels, Float64 * result) const + { + if constexpr (interpolated) + getManyFloatInterpolatedImpl(levels, indices, num_levels, result); + else + getManyFloatImpl(levels, indices, num_levels, result); + } + +private: + /// get implementation without interpolation + Value getImpl(Float64 level) const { size_t size = map.size(); @@ -92,7 +129,6 @@ struct QuantileExactWeighted return std::numeric_limits::quiet_NaN(); /// Copy the data to a temporary array to get the element you need in order. - using Pair = typename Map::value_type; std::unique_ptr array_holder(new Pair[size]); Pair * array = array_holder.get(); @@ -135,9 +171,8 @@ struct QuantileExactWeighted return it->first; } - /// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address. - /// indices - an array of index levels such that the corresponding elements will go in ascending order. - void getMany(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const + /// getMany implementation without interpolation + void getManyImpl(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const { size_t size = map.size(); @@ -149,7 +184,6 @@ struct QuantileExactWeighted } /// Copy the data to a temporary array to get the element you need in order. - using Pair = typename Map::value_type; std::unique_ptr array_holder(new Pair[size]); Pair * array = array_holder.get(); @@ -197,23 +231,167 @@ struct QuantileExactWeighted } } - /// The same, but in the case of an empty state, NaN is returned. - Float64 getFloat(Float64) const + /// getFloat implementation without interpolation + Float64 getFloatImpl(Float64) const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getFloat is not implemented for QuantileExact"); } - void getManyFloat(const Float64 *, const size_t *, size_t, Float64 *) const + /// getManyFloat implementation without interpolation + void getManyFloatImpl(const Float64 *, const size_t *, size_t, Float64 *) const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getManyFloat is not implemented for QuantileExact"); } + + /// get implementation with interpolation + Value getInterpolatedImpl(Float64 level) const + { + size_t size = map.size(); + if (0 == size) + return std::numeric_limits::quiet_NaN(); + + Float64 res = getFloatInterpolatedImpl(level); + if constexpr (is_decimal) + return Value(static_cast(res)); + else + return static_cast(res); + } + + /// getMany implementation with interpolation + void getManyInterpolatedImpl(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const + { + size_t size = map.size(); + if (0 == size) + { + for (size_t i = 0; i < num_levels; ++i) + result[i] = Value(); + return; + } + + std::unique_ptr res_holder(new Float64[num_levels]); + Float64 * res = res_holder.get(); + getManyFloatInterpolatedImpl(levels, indices, num_levels, res); + for (size_t i = 0; i < num_levels; ++i) + { + if constexpr (is_decimal) + result[i] = Value(static_cast(res[i])); + else + result[i] = Value(res[i]); + } + } + + /// getFloat implementation with interpolation + Float64 getFloatInterpolatedImpl(Float64 level) const + { + size_t size = map.size(); + + if (0 == size) + return std::numeric_limits::quiet_NaN(); + + /// Copy the data to a temporary array to get the element you need in order. + std::unique_ptr array_holder(new Pair[size]); + Pair * array = array_holder.get(); + + size_t i = 0; + for (const auto & pair : map) + { + array[i] = pair.getValue(); + ++i; + } + + ::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; }); + std::partial_sum(array, array + size, array, [](const Pair & acc, const Pair & p) { return Pair(p.first, acc.second + p.second); }); + Weight max_position = array[size - 1].second - 1; + Float64 position = max_position * level; + return quantileInterpolated(array, size, position); + } + + /// getManyFloat implementation with interpolation + void getManyFloatInterpolatedImpl(const Float64 * levels, const size_t * indices, size_t num_levels, Float64 * result) const + { + size_t size = map.size(); + if (0 == size) + { + for (size_t i = 0; i < num_levels; ++i) + result[i] = std::numeric_limits::quiet_NaN(); + return; + } + + /// Copy the data to a temporary array to get the element you need in order. + std::unique_ptr array_holder(new Pair[size]); + Pair * array = array_holder.get(); + + size_t i = 0; + for (const auto & pair : map) + { + array[i] = pair.getValue(); + ++i; + } + + ::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; }); + std::partial_sum(array, array + size, array, [](Pair acc, Pair & p) { return Pair(p.first, acc.second + p.second); }); + Weight max_position = array[size - 1].second - 1; + + for (size_t j = 0; j < num_levels; ++j) + { + Float64 position = max_position * levels[indices[j]]; + result[indices[j]] = quantileInterpolated(array, size, position); + } + } + + /// Calculate quantile, using linear interpolation between two closest values + Float64 NO_SANITIZE_UNDEFINED quantileInterpolated(const Pair * array, size_t size, Float64 position) const + { + /* + for (size_t i = 0; i < size; ++i) + std::cout << "array[" << i << "]: " << toString(Field(array[i].first)) << ", " << array[i].second << std::endl; + std::cout << "position: " << position << std::endl; + */ + size_t lower = static_cast(std::floor(position)); + size_t higher = static_cast(std::ceil(position)); + // std::cout << "lower: " << lower << ", higher: " << higher << std::endl; + + const auto * lower_it = std::lower_bound(array, array + size, lower + 1, [](const Pair & a, size_t b) { return a.second < b; }); + const auto * higher_it = std::lower_bound(array, array + size, higher + 1, [](const Pair & a, size_t b) { return a.second < b; }); + if (lower_it == array + size) + lower_it = array + size - 1; + if (higher_it == array + size) + higher_it = array + size - 1; + // std::cout << "lower_index:" << lower_it - array << ", higher_index:" << higher_it - array << std::endl; + + UnderlyingType lower_key = lower_it->first; + UnderlyingType higher_key = higher_it->first; + + if (lower == higher) + return static_cast(lower_key); + if (lower_key == higher_key) + return static_cast(lower_key); + + return (static_cast(higher) - position) * lower_key + (position - static_cast(lower)) * higher_key; + } }; -template using FuncQuantileExactWeighted = AggregateFunctionQuantile, NameQuantileExactWeighted, true, void, false, false>; -template using FuncQuantilesExactWeighted = AggregateFunctionQuantile, NameQuantilesExactWeighted, true, void, true, false>; +template +using FuncQuantileExactWeighted = AggregateFunctionQuantile< + Value, + QuantileExactWeighted, + NameQuantileExactWeighted, + true, + std::conditional_t, + false, + false>; +template +using FuncQuantilesExactWeighted = AggregateFunctionQuantile< + Value, + QuantileExactWeighted, + NameQuantilesExactWeighted, + true, + std::conditional_t, + true, + false>; -template