ClickHouse/src/AggregateFunctions/QuantileBFloat16Histogram.h

209 lines
5.7 KiB
C++
Raw Normal View History

2021-04-14 20:38:56 +00:00
#pragma once
2021-06-15 19:55:21 +00:00
#include <common/types.h>
#include <common/bit_cast.h>
#include <Common/HashTable/HashMap.h>
2021-04-28 14:54:10 +00:00
#include <IO/ReadBuffer.h>
#include <IO/WriteBuffer.h>
2021-04-14 20:38:56 +00:00
2021-05-21 06:30:13 +00:00
2021-04-24 19:11:56 +00:00
namespace DB
2021-04-14 20:38:56 +00:00
{
2021-05-21 06:30:13 +00:00
/** `bfloat16` is a 16-bit floating point data type that is the same as the corresponding most significant 16 bits of the `float`.
* https://en.wikipedia.org/wiki/Bfloat16_floating-point_format
*
* To calculate quantile, simply convert input value to 16 bit (convert to float, then take the most significant 16 bits),
* and calculate the histogram of these values.
*
* Hash table is the preferred way to store histogram, because the number of distinct values is small:
* ```
* SELECT uniq(bfloat)
* FROM
* (
* SELECT
* number,
* toFloat32(number) AS f,
* bitShiftRight(bitAnd(reinterpretAsUInt32(reinterpretAsFixedString(f)), 4294901760) AS cut, 16),
* reinterpretAsFloat32(reinterpretAsFixedString(cut)) AS bfloat
* FROM numbers(100000000)
* )
*
* uniq(bfloat)
* 2623
*
* ```
* (when increasing the range of values 1000 times, the number of distinct bfloat16 values increases just by 1280).
*
* Then calculate quantile from the histogram.
*
* This sketch is very simple and rough. Its relative precision is constant 1 / 256 = 0.390625%.
*/
2021-04-24 19:11:56 +00:00
template <typename Value>
2021-04-26 09:39:08 +00:00
struct QuantileBFloat16Histogram
2021-04-14 20:38:56 +00:00
{
2021-05-21 06:30:13 +00:00
using BFloat16 = UInt16;
2021-04-28 14:54:10 +00:00
using Weight = UInt64;
2021-05-21 06:33:00 +00:00
/// Make automatic memory for 16 elements to avoid allocations for small states.
/// The usage of trivial hash is ok, because we effectively take logarithm of the values and pathological cases are unlikely.
2021-05-21 06:30:13 +00:00
using Data = HashMapWithStackMemory<BFloat16, Weight, TrivialHash, 4>;
2021-04-28 14:54:10 +00:00
Data data;
2021-04-14 20:38:56 +00:00
2021-05-21 06:30:13 +00:00
void add(const Value & x)
{
add(x, 1);
}
2021-04-14 20:38:56 +00:00
2021-04-28 14:54:10 +00:00
void add(const Value & x, Weight w)
2021-04-14 20:38:56 +00:00
{
2021-04-28 14:54:10 +00:00
if (!isNaN(x))
2021-05-21 06:30:13 +00:00
data[toBFloat16(x)] += w;
2021-04-14 20:38:56 +00:00
}
2021-04-28 14:54:10 +00:00
void merge(const QuantileBFloat16Histogram & rhs)
{
for (const auto & pair : rhs.data)
data[pair.getKey()] += pair.getMapped();
}
2021-04-14 20:38:56 +00:00
2021-05-21 06:30:13 +00:00
void serialize(WriteBuffer & buf) const
{
data.write(buf);
}
2021-04-14 20:38:56 +00:00
2021-05-21 06:30:13 +00:00
void deserialize(ReadBuffer & buf)
{
data.read(buf);
}
2021-04-14 20:38:56 +00:00
2021-05-21 06:30:13 +00:00
Value get(Float64 level) const
{
return getImpl<Value>(level);
}
2021-04-14 20:38:56 +00:00
2021-04-28 14:54:10 +00:00
void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result) const
2021-04-14 20:38:56 +00:00
{
2021-04-28 14:54:10 +00:00
getManyImpl(levels, indices, size, result);
2021-04-14 20:38:56 +00:00
}
2021-05-21 06:30:13 +00:00
Float64 getFloat(Float64 level) const
{
return getImpl<Float64>(level);
}
2021-04-14 20:38:56 +00:00
2021-04-28 14:54:10 +00:00
void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result) const
2021-04-14 20:38:56 +00:00
{
2021-04-28 14:54:10 +00:00
getManyImpl(levels, indices, size, result);
}
private:
2021-05-21 06:30:13 +00:00
/// Take the most significant 16 bits of the floating point number.
BFloat16 toBFloat16(const Value & x) const
{
2021-06-15 19:55:21 +00:00
return bit_cast<UInt32>(static_cast<Float32>(x)) >> 16;
2021-05-21 06:30:13 +00:00
}
2021-04-28 14:54:10 +00:00
2021-05-21 06:30:13 +00:00
/// Put the bits into most significant 16 bits of the floating point number and fill other bits with zeros.
Float32 toFloat32(const BFloat16 & x) const
{
2021-06-15 19:55:21 +00:00
return bit_cast<Float32>(x << 16);
2021-05-21 06:30:13 +00:00
}
2021-04-28 14:54:10 +00:00
using Pair = PairNoInit<Float32, Weight>;
template <typename T>
T getImpl(Float64 level) const
{
size_t size = data.size();
if (0 == size)
return std::numeric_limits<T>::quiet_NaN();
std::unique_ptr<Pair[]> array_holder(new Pair[size]);
Pair * array = array_holder.get();
Float64 sum_weight = 0;
Pair * arr_it = array;
for (const auto & pair : data)
{
sum_weight += pair.getMapped();
2021-05-21 06:30:13 +00:00
*arr_it = {toFloat32(pair.getKey()), pair.getMapped()};
2021-04-28 14:54:10 +00:00
++arr_it;
}
std::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 threshold = std::ceil(sum_weight * level);
Float64 accumulated = 0;
for (const Pair * p = array; p != (array + size); ++p)
{
accumulated += p->second;
if (accumulated >= threshold)
return p->first;
}
return array[size - 1].first;
}
template <typename T>
void getManyImpl(const Float64 * levels, const size_t * indices, size_t num_levels, T * result) const
{
size_t size = data.size();
if (0 == size)
{
for (size_t i = 0; i < num_levels; ++i)
result[i] = std::numeric_limits<T>::quiet_NaN();
return;
}
std::unique_ptr<Pair[]> array_holder(new Pair[size]);
Pair * array = array_holder.get();
Float64 sum_weight = 0;
Pair * arr_it = array;
for (const auto & pair : data)
{
sum_weight += pair.getMapped();
2021-05-21 06:30:13 +00:00
*arr_it = {toFloat32(pair.getKey()), pair.getMapped()};
2021-04-28 14:54:10 +00:00
++arr_it;
}
std::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; });
size_t level_index = 0;
Float64 accumulated = 0;
Float64 threshold = std::ceil(sum_weight * levels[indices[level_index]]);
for (const Pair * p = array; p != (array + size); ++p)
{
accumulated += p->second;
while (accumulated >= threshold)
{
result[indices[level_index]] = p->first;
++level_index;
if (level_index == num_levels)
return;
threshold = std::ceil(sum_weight * levels[indices[level_index]]);
}
}
while (level_index < num_levels)
{
result[indices[level_index]] = array[size - 1].first;
++level_index;
}
2021-04-14 20:38:56 +00:00
}
};
2021-04-14 21:06:22 +00:00
}