2015-11-15 08:31:08 +00:00
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
#include <DB/Common/HashTable/HashMap.h>
|
|
|
|
|
|
|
|
|
|
#include <DB/DataTypes/DataTypesNumberFixed.h>
|
|
|
|
|
#include <DB/DataTypes/DataTypeArray.h>
|
|
|
|
|
|
|
|
|
|
#include <DB/AggregateFunctions/IBinaryAggregateFunction.h>
|
2015-11-23 21:33:43 +00:00
|
|
|
|
#include <DB/AggregateFunctions/QuantilesCommon.h>
|
2015-11-15 08:31:08 +00:00
|
|
|
|
|
|
|
|
|
#include <DB/Columns/ColumnArray.h>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** В качестве состояния используется хэш-таблица вида: значение -> сколько раз встретилось.
|
|
|
|
|
*/
|
|
|
|
|
template <typename T>
|
|
|
|
|
struct AggregateFunctionQuantileExactWeightedData
|
|
|
|
|
{
|
|
|
|
|
using Key = T;
|
|
|
|
|
using Weight = UInt64;
|
|
|
|
|
|
|
|
|
|
/// При создании, хэш-таблица должна быть небольшой.
|
|
|
|
|
using Map = HashMap<
|
|
|
|
|
Key, Weight,
|
|
|
|
|
HashCRC32<Key>,
|
|
|
|
|
HashTableGrower<4>,
|
|
|
|
|
HashTableAllocatorWithStackMemory<sizeof(std::pair<Key, Weight>) * (1 << 3)>
|
|
|
|
|
>;
|
|
|
|
|
|
|
|
|
|
Map map;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** Точно вычисляет квантиль по множеству значений, для каждого из которых задан вес - сколько раз значение встречалось.
|
|
|
|
|
* Можно рассматривать набор пар value, weight - как набор гистограмм,
|
|
|
|
|
* в которых value - значение, округлённое до середины столбика, а weight - высота столбика.
|
|
|
|
|
* В качестве типа аргумента может быть только числовой тип (в том числе, дата и дата-с-временем).
|
|
|
|
|
* Тип результата совпадает с типом аргумента.
|
|
|
|
|
*/
|
|
|
|
|
template <typename ValueType, typename WeightType>
|
|
|
|
|
class AggregateFunctionQuantileExactWeighted final
|
|
|
|
|
: public IBinaryAggregateFunction<
|
|
|
|
|
AggregateFunctionQuantileExactWeightedData<ValueType>,
|
|
|
|
|
AggregateFunctionQuantileExactWeighted<ValueType, WeightType>>
|
|
|
|
|
{
|
|
|
|
|
private:
|
|
|
|
|
double level;
|
|
|
|
|
DataTypePtr type;
|
|
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
AggregateFunctionQuantileExactWeighted(double level_ = 0.5) : level(level_) {}
|
|
|
|
|
|
|
|
|
|
String getName() const override { return "quantileExactWeighted"; }
|
|
|
|
|
|
|
|
|
|
DataTypePtr getReturnType() const override
|
|
|
|
|
{
|
|
|
|
|
return type;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void setArgumentsImpl(const DataTypes & arguments)
|
|
|
|
|
{
|
|
|
|
|
type = arguments[0];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void setParameters(const Array & params) override
|
|
|
|
|
{
|
|
|
|
|
if (params.size() != 1)
|
|
|
|
|
throw Exception("Aggregate function " + getName() + " requires exactly one parameter.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
|
|
|
|
|
|
|
|
|
level = apply_visitor(FieldVisitorConvertToNumber<Float64>(), params[0]);
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-19 22:30:40 +00:00
|
|
|
|
void addImpl(AggregateDataPtr place, const IColumn & column_value, const IColumn & column_weight, size_t row_num, Arena *) const
|
2015-11-15 08:31:08 +00:00
|
|
|
|
{
|
|
|
|
|
this->data(place)
|
|
|
|
|
.map[static_cast<const ColumnVector<ValueType> &>(column_value).getData()[row_num]]
|
|
|
|
|
+= static_cast<const ColumnVector<WeightType> &>(column_weight).getData()[row_num];
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-23 23:33:17 +00:00
|
|
|
|
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
|
2015-11-15 08:31:08 +00:00
|
|
|
|
{
|
|
|
|
|
auto & map = this->data(place).map;
|
|
|
|
|
const auto & rhs_map = this->data(rhs).map;
|
|
|
|
|
|
|
|
|
|
for (const auto & pair : rhs_map)
|
|
|
|
|
map[pair.first] += pair.second;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
|
|
|
|
|
{
|
|
|
|
|
this->data(place).map.write(buf);
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-22 23:26:08 +00:00
|
|
|
|
void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override
|
2015-11-15 08:31:08 +00:00
|
|
|
|
{
|
|
|
|
|
typename AggregateFunctionQuantileExactWeightedData<ValueType>::Map::Reader reader(buf);
|
|
|
|
|
|
|
|
|
|
auto & map = this->data(place).map;
|
|
|
|
|
while (reader.next())
|
|
|
|
|
{
|
|
|
|
|
const auto & pair = reader.get();
|
2016-03-12 04:01:03 +00:00
|
|
|
|
map[pair.first] = pair.second;
|
2015-11-15 08:31:08 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override
|
|
|
|
|
{
|
|
|
|
|
auto & map = this->data(place).map;
|
|
|
|
|
size_t size = map.size();
|
|
|
|
|
|
|
|
|
|
if (0 == size)
|
|
|
|
|
{
|
|
|
|
|
static_cast<ColumnVector<ValueType> &>(to).getData().push_back(ValueType());
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Копируем данные во временный массив, чтобы получить нужный по порядку элемент.
|
|
|
|
|
using Pair = typename AggregateFunctionQuantileExactWeightedData<ValueType>::Map::value_type;
|
2015-11-16 20:33:43 +00:00
|
|
|
|
std::unique_ptr<Pair[]> array_holder(new Pair[size]);
|
2015-11-15 08:31:08 +00:00
|
|
|
|
Pair * array = array_holder.get();
|
|
|
|
|
|
|
|
|
|
size_t i = 0;
|
|
|
|
|
UInt64 sum_weight = 0;
|
|
|
|
|
for (const auto & pair : map)
|
|
|
|
|
{
|
|
|
|
|
sum_weight += pair.second;
|
|
|
|
|
array[i] = pair;
|
|
|
|
|
++i;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; });
|
|
|
|
|
|
2016-03-13 19:00:59 +00:00
|
|
|
|
UInt64 threshold = std::ceil(sum_weight * level);
|
2015-11-15 08:31:08 +00:00
|
|
|
|
UInt64 accumulated = 0;
|
|
|
|
|
|
|
|
|
|
const Pair * it = array;
|
|
|
|
|
const Pair * end = array + size;
|
2016-03-13 14:40:27 +00:00
|
|
|
|
while (it < end)
|
2015-11-15 08:31:08 +00:00
|
|
|
|
{
|
|
|
|
|
accumulated += it->second;
|
2016-03-13 14:40:27 +00:00
|
|
|
|
|
|
|
|
|
if (accumulated >= threshold)
|
|
|
|
|
break;
|
|
|
|
|
|
2015-11-15 08:31:08 +00:00
|
|
|
|
++it;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (it == end)
|
|
|
|
|
--it;
|
|
|
|
|
|
|
|
|
|
static_cast<ColumnVector<ValueType> &>(to).getData().push_back(it->first);
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** То же самое, но позволяет вычислить сразу несколько квантилей.
|
|
|
|
|
* Для этого, принимает в качестве параметров несколько уровней. Пример: quantilesExactWeighted(0.5, 0.8, 0.9, 0.95)(ConnectTiming, Weight).
|
|
|
|
|
* Возвращает массив результатов.
|
|
|
|
|
*/
|
|
|
|
|
template <typename ValueType, typename WeightType>
|
|
|
|
|
class AggregateFunctionQuantilesExactWeighted final
|
|
|
|
|
: public IBinaryAggregateFunction<
|
|
|
|
|
AggregateFunctionQuantileExactWeightedData<ValueType>,
|
|
|
|
|
AggregateFunctionQuantilesExactWeighted<ValueType, WeightType>>
|
|
|
|
|
{
|
|
|
|
|
private:
|
2015-11-23 21:33:43 +00:00
|
|
|
|
QuantileLevels<double> levels;
|
2015-11-15 08:31:08 +00:00
|
|
|
|
DataTypePtr type;
|
|
|
|
|
|
|
|
|
|
public:
|
|
|
|
|
String getName() const override { return "quantilesExactWeighted"; }
|
|
|
|
|
|
|
|
|
|
DataTypePtr getReturnType() const override
|
|
|
|
|
{
|
2016-05-28 07:48:40 +00:00
|
|
|
|
return std::make_shared<DataTypeArray>(type);
|
2015-11-15 08:31:08 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void setArgumentsImpl(const DataTypes & arguments)
|
|
|
|
|
{
|
|
|
|
|
type = arguments[0];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void setParameters(const Array & params) override
|
|
|
|
|
{
|
2015-11-23 21:33:43 +00:00
|
|
|
|
levels.set(params);
|
2015-11-15 08:31:08 +00:00
|
|
|
|
}
|
|
|
|
|
|
2016-09-19 22:30:40 +00:00
|
|
|
|
void addImpl(AggregateDataPtr place, const IColumn & column_value, const IColumn & column_weight, size_t row_num, Arena *) const
|
2015-11-15 08:31:08 +00:00
|
|
|
|
{
|
|
|
|
|
this->data(place)
|
|
|
|
|
.map[static_cast<const ColumnVector<ValueType> &>(column_value).getData()[row_num]]
|
|
|
|
|
+= static_cast<const ColumnVector<WeightType> &>(column_weight).getData()[row_num];
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-23 23:33:17 +00:00
|
|
|
|
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
|
2015-11-15 08:31:08 +00:00
|
|
|
|
{
|
|
|
|
|
auto & map = this->data(place).map;
|
|
|
|
|
const auto & rhs_map = this->data(rhs).map;
|
|
|
|
|
|
|
|
|
|
for (const auto & pair : rhs_map)
|
|
|
|
|
map[pair.first] += pair.second;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
|
|
|
|
|
{
|
|
|
|
|
this->data(place).map.write(buf);
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-22 23:26:08 +00:00
|
|
|
|
void deserialize(AggregateDataPtr place, ReadBuffer & buf, Arena *) const override
|
2015-11-15 08:31:08 +00:00
|
|
|
|
{
|
|
|
|
|
typename AggregateFunctionQuantileExactWeightedData<ValueType>::Map::Reader reader(buf);
|
|
|
|
|
|
|
|
|
|
auto & map = this->data(place).map;
|
|
|
|
|
while (reader.next())
|
|
|
|
|
{
|
|
|
|
|
const auto & pair = reader.get();
|
2016-03-12 04:01:03 +00:00
|
|
|
|
map[pair.first] = pair.second;
|
2015-11-15 08:31:08 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override
|
|
|
|
|
{
|
|
|
|
|
auto & map = this->data(place).map;
|
|
|
|
|
size_t size = map.size();
|
|
|
|
|
|
|
|
|
|
ColumnArray & arr_to = static_cast<ColumnArray &>(to);
|
|
|
|
|
ColumnArray::Offsets_t & offsets_to = arr_to.getOffsets();
|
|
|
|
|
|
|
|
|
|
size_t num_levels = levels.size();
|
|
|
|
|
offsets_to.push_back((offsets_to.size() == 0 ? 0 : offsets_to.back()) + num_levels);
|
|
|
|
|
|
|
|
|
|
typename ColumnVector<ValueType>::Container_t & data_to = static_cast<ColumnVector<ValueType> &>(arr_to.getData()).getData();
|
|
|
|
|
|
2015-11-23 21:33:43 +00:00
|
|
|
|
size_t old_size = data_to.size();
|
|
|
|
|
data_to.resize(old_size + num_levels);
|
|
|
|
|
|
2015-11-15 08:31:08 +00:00
|
|
|
|
if (0 == size)
|
|
|
|
|
{
|
|
|
|
|
for (size_t i = 0; i < num_levels; ++i)
|
2015-11-23 21:33:43 +00:00
|
|
|
|
data_to[old_size + i] = ValueType();
|
2015-11-15 08:31:08 +00:00
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Копируем данные во временный массив, чтобы получить нужный по порядку элемент.
|
|
|
|
|
using Pair = typename AggregateFunctionQuantileExactWeightedData<ValueType>::Map::value_type;
|
2015-11-16 23:49:18 +00:00
|
|
|
|
std::unique_ptr<Pair[]> array_holder(new Pair[size]);
|
2015-11-15 08:31:08 +00:00
|
|
|
|
Pair * array = array_holder.get();
|
|
|
|
|
|
|
|
|
|
size_t i = 0;
|
|
|
|
|
UInt64 sum_weight = 0;
|
|
|
|
|
for (const auto & pair : map)
|
|
|
|
|
{
|
|
|
|
|
sum_weight += pair.second;
|
|
|
|
|
array[i] = pair;
|
|
|
|
|
++i;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
std::sort(array, array + size, [](const Pair & a, const Pair & b) { return a.first < b.first; });
|
|
|
|
|
|
|
|
|
|
UInt64 accumulated = 0;
|
|
|
|
|
|
|
|
|
|
const Pair * it = array;
|
|
|
|
|
const Pair * end = array + size;
|
|
|
|
|
|
2016-03-13 14:40:27 +00:00
|
|
|
|
size_t level_index = 0;
|
2016-03-13 19:00:59 +00:00
|
|
|
|
UInt64 threshold = std::ceil(sum_weight * levels.levels[levels.permutation[level_index]]);
|
2016-03-13 14:40:27 +00:00
|
|
|
|
|
|
|
|
|
while (it < end)
|
2015-11-15 08:31:08 +00:00
|
|
|
|
{
|
2016-03-13 14:40:27 +00:00
|
|
|
|
accumulated += it->second;
|
2015-11-15 08:31:08 +00:00
|
|
|
|
|
2016-03-13 14:40:27 +00:00
|
|
|
|
while (accumulated >= threshold)
|
2015-11-15 08:31:08 +00:00
|
|
|
|
{
|
2016-03-13 18:15:41 +00:00
|
|
|
|
data_to[old_size + levels.permutation[level_index]] = it->first;
|
2016-03-13 14:40:27 +00:00
|
|
|
|
++level_index;
|
|
|
|
|
|
|
|
|
|
if (level_index == num_levels)
|
|
|
|
|
return;
|
|
|
|
|
|
2016-03-13 19:00:59 +00:00
|
|
|
|
threshold = std::ceil(sum_weight * levels.levels[levels.permutation[level_index]]);
|
2015-11-15 08:31:08 +00:00
|
|
|
|
}
|
|
|
|
|
|
2016-03-13 14:40:27 +00:00
|
|
|
|
++it;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
while (level_index < num_levels)
|
|
|
|
|
{
|
2016-03-13 18:15:41 +00:00
|
|
|
|
data_to[old_size + levels.permutation[level_index]] = array[size - 1].first;
|
2016-03-13 14:40:27 +00:00
|
|
|
|
++level_index;
|
2015-11-15 08:31:08 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
}
|