2017-12-20 07:36:30 +00:00
|
|
|
#pragma once
|
|
|
|
|
2017-12-20 20:25:22 +00:00
|
|
|
#include <Common/HashTable/HashMap.h>
|
2018-03-14 05:03:51 +00:00
|
|
|
#include <Common/NaNUtils.h>
|
2017-12-20 07:36:30 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
/// Error codes referenced by this header.
/// The constant is only declared here; its definition is expected to be provided elsewhere in the project.
namespace ErrorCodes
{
    extern const int NOT_IMPLEMENTED;
}
|
|
|
|
|
2019-01-22 19:56:53 +00:00
|
|
|
/** Calculates quantile by counting number of occurrences for each value in a hash map.
|
2017-12-20 20:25:22 +00:00
|
|
|
*
|
2019-02-02 14:27:43 +00:00
|
|
|
* It uses O(distinct(N)) memory. Can be naturally applied for values with weight.
|
2017-12-20 20:25:22 +00:00
|
|
|
* In case of many identical values, it can be more efficient than QuantileExact even when weight is not used.
|
|
|
|
*/
|
2017-12-20 07:36:30 +00:00
|
|
|
template <typename Value>
|
|
|
|
struct QuantileExactWeighted
|
|
|
|
{
|
2019-05-16 14:33:13 +00:00
|
|
|
struct Int128Hash
|
|
|
|
{
|
|
|
|
size_t operator()(Int128 x) const
|
|
|
|
{
|
|
|
|
return CityHash_v1_0_2::Hash128to64({x >> 64, x & 0xffffffffffffffffll});
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2017-12-20 08:39:21 +00:00
|
|
|
using Weight = UInt64;
|
2019-05-16 14:33:13 +00:00
|
|
|
using UnderlyingType = typename NativeType<Value>::Type;
|
|
|
|
using Hasher = std::conditional_t<std::is_same_v<Value, Decimal128>, Int128Hash, HashCRC32<UnderlyingType>>;
|
2017-12-20 08:39:21 +00:00
|
|
|
|
2017-12-20 07:36:30 +00:00
|
|
|
/// When creating, the hash table must be small.
|
|
|
|
using Map = HashMap<
|
2019-05-16 14:33:13 +00:00
|
|
|
UnderlyingType, Weight,
|
|
|
|
Hasher,
|
2017-12-20 07:36:30 +00:00
|
|
|
HashTableGrower<4>,
|
|
|
|
HashTableAllocatorWithStackMemory<sizeof(std::pair<Value, Weight>) * (1 << 3)>
|
|
|
|
>;
|
|
|
|
|
|
|
|
Map map;
|
|
|
|
|
|
|
|
void add(const Value & x)
|
|
|
|
{
|
2018-03-14 05:03:51 +00:00
|
|
|
/// We must skip NaNs as they are not compatible with comparison sorting.
|
|
|
|
if (!isNaN(x))
|
|
|
|
++map[x];
|
2017-12-20 07:36:30 +00:00
|
|
|
}
|
|
|
|
|
2019-05-16 14:33:13 +00:00
|
|
|
void add(const Value & x, Weight weight)
|
2017-12-20 07:36:30 +00:00
|
|
|
{
|
2018-03-14 05:03:51 +00:00
|
|
|
if (!isNaN(x))
|
|
|
|
map[x] += weight;
|
2017-12-20 07:36:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void merge(const QuantileExactWeighted & rhs)
|
|
|
|
{
|
|
|
|
for (const auto & pair : rhs.map)
|
2019-02-28 09:35:38 +00:00
|
|
|
map[pair.getFirst()] += pair.getSecond();
|
2017-12-20 07:36:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
void serialize(WriteBuffer & buf) const
|
|
|
|
{
|
|
|
|
map.write(buf);
|
|
|
|
}
|
|
|
|
|
|
|
|
void deserialize(ReadBuffer & buf)
|
|
|
|
{
|
|
|
|
typename Map::Reader reader(buf);
|
|
|
|
while (reader.next())
|
|
|
|
{
|
|
|
|
const auto & pair = reader.get();
|
2019-02-28 09:35:38 +00:00
|
|
|
map[pair.getFirst()] = pair.getSecond();
|
2017-12-20 07:36:30 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Get the value of the `level` quantile. The level must be between 0 and 1.
|
|
|
|
Value get(Float64 level) const
|
|
|
|
{
|
|
|
|
size_t size = map.size();
|
|
|
|
|
|
|
|
if (0 == size)
|
2018-08-13 08:33:51 +00:00
|
|
|
return std::numeric_limits<Value>::quiet_NaN();
|
2017-12-20 07:36:30 +00:00
|
|
|
|
|
|
|
/// Copy the data to a temporary array to get the element you need in order.
|
2017-12-20 08:39:21 +00:00
|
|
|
using Pair = typename Map::value_type;
|
2017-12-20 07:36:30 +00:00
|
|
|
std::unique_ptr<Pair[]> array_holder(new Pair[size]);
|
|
|
|
Pair * array = array_holder.get();
|
|
|
|
|
|
|
|
size_t i = 0;
|
|
|
|
UInt64 sum_weight = 0;
|
|
|
|
for (const auto & pair : map)
|
|
|
|
{
|
2019-02-28 09:35:38 +00:00
|
|
|
sum_weight += pair.getSecond();
|
|
|
|
array[i] = pair.getValue();
|
2017-12-20 07:36:30 +00:00
|
|
|
++i;
|
|
|
|
}
|
|
|
|
|
2019-02-28 09:35:38 +00:00
|
|
|
std::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.getFirst() < b.getFirst(); });
|
2017-12-20 07:36:30 +00:00
|
|
|
|
|
|
|
UInt64 threshold = std::ceil(sum_weight * level);
|
|
|
|
UInt64 accumulated = 0;
|
|
|
|
|
|
|
|
const Pair * it = array;
|
|
|
|
const Pair * end = array + size;
|
|
|
|
while (it < end)
|
|
|
|
{
|
2019-02-28 09:35:38 +00:00
|
|
|
accumulated += it->getSecond();
|
2017-12-20 07:36:30 +00:00
|
|
|
|
|
|
|
if (accumulated >= threshold)
|
|
|
|
break;
|
|
|
|
|
|
|
|
++it;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (it == end)
|
|
|
|
--it;
|
|
|
|
|
2019-02-28 09:35:38 +00:00
|
|
|
return it->getFirst();
|
2017-12-20 07:36:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
|
|
|
|
/// indices - an array of index levels such that the corresponding elements will go in ascending order.
|
2017-12-20 08:39:21 +00:00
|
|
|
void getMany(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const
|
2017-12-20 07:36:30 +00:00
|
|
|
{
|
|
|
|
size_t size = map.size();
|
|
|
|
|
|
|
|
if (0 == size)
|
|
|
|
{
|
2017-12-20 08:39:21 +00:00
|
|
|
for (size_t i = 0; i < num_levels; ++i)
|
2017-12-20 07:36:30 +00:00
|
|
|
result[i] = Value();
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Copy the data to a temporary array to get the element you need in order.
|
2017-12-20 08:39:21 +00:00
|
|
|
using Pair = typename Map::value_type;
|
2017-12-20 07:36:30 +00:00
|
|
|
std::unique_ptr<Pair[]> array_holder(new Pair[size]);
|
|
|
|
Pair * array = array_holder.get();
|
|
|
|
|
|
|
|
size_t i = 0;
|
|
|
|
UInt64 sum_weight = 0;
|
|
|
|
for (const auto & pair : map)
|
|
|
|
{
|
2019-02-28 09:35:38 +00:00
|
|
|
sum_weight += pair.getSecond();
|
|
|
|
array[i] = pair.getValue();
|
2017-12-20 07:36:30 +00:00
|
|
|
++i;
|
|
|
|
}
|
|
|
|
|
2019-02-28 09:35:38 +00:00
|
|
|
std::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.getFirst() < b.getFirst(); });
|
2017-12-20 07:36:30 +00:00
|
|
|
|
|
|
|
UInt64 accumulated = 0;
|
|
|
|
|
|
|
|
const Pair * it = array;
|
|
|
|
const Pair * end = array + size;
|
|
|
|
|
|
|
|
size_t level_index = 0;
|
2017-12-20 21:37:30 +00:00
|
|
|
UInt64 threshold = std::ceil(sum_weight * levels[indices[level_index]]);
|
2017-12-20 07:36:30 +00:00
|
|
|
|
|
|
|
while (it < end)
|
|
|
|
{
|
2019-02-28 09:35:38 +00:00
|
|
|
accumulated += it->getSecond();
|
2017-12-20 07:36:30 +00:00
|
|
|
|
|
|
|
while (accumulated >= threshold)
|
|
|
|
{
|
2019-02-28 09:35:38 +00:00
|
|
|
result[indices[level_index]] = it->getFirst();
|
2017-12-20 07:36:30 +00:00
|
|
|
++level_index;
|
|
|
|
|
2017-12-20 08:39:21 +00:00
|
|
|
if (level_index == num_levels)
|
2017-12-20 07:36:30 +00:00
|
|
|
return;
|
|
|
|
|
2017-12-20 21:37:30 +00:00
|
|
|
threshold = std::ceil(sum_weight * levels[indices[level_index]]);
|
2017-12-20 07:36:30 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
++it;
|
|
|
|
}
|
|
|
|
|
2017-12-20 08:39:21 +00:00
|
|
|
while (level_index < num_levels)
|
2017-12-20 07:36:30 +00:00
|
|
|
{
|
2019-02-28 09:35:38 +00:00
|
|
|
result[indices[level_index]] = array[size - 1].getFirst();
|
2017-12-20 07:36:30 +00:00
|
|
|
++level_index;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// The same, but in the case of an empty state, NaN is returned.
|
2017-12-21 01:19:25 +00:00
|
|
|
Float64 getFloat(Float64) const
|
2017-12-20 07:36:30 +00:00
|
|
|
{
|
|
|
|
throw Exception("Method getFloat is not implemented for QuantileExact", ErrorCodes::NOT_IMPLEMENTED);
|
|
|
|
}
|
|
|
|
|
2017-12-21 01:19:25 +00:00
|
|
|
void getManyFloat(const Float64 *, const size_t *, size_t, Float64 *) const
|
2017-12-20 07:36:30 +00:00
|
|
|
{
|
|
|
|
throw Exception("Method getManyFloat is not implemented for QuantileExact", ErrorCodes::NOT_IMPLEMENTED);
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
|
|
|
}
|