2014-07-24 20:20:36 +00:00
|
|
|
|
#pragma once
|
|
|
|
|
|
2015-10-12 07:05:54 +00:00
|
|
|
|
#include <DB/Core/FieldVisitors.h>
|
2014-07-25 01:32:59 +00:00
|
|
|
|
#include <DB/AggregateFunctions/IUnaryAggregateFunction.h>
|
2015-10-29 04:04:43 +00:00
|
|
|
|
#include <DB/AggregateFunctions/UniqVariadicHash.h>
|
2014-07-25 01:32:59 +00:00
|
|
|
|
#include <DB/DataTypes/DataTypesNumberFixed.h>
|
2015-10-29 03:41:09 +00:00
|
|
|
|
#include <DB/DataTypes/DataTypeTuple.h>
|
2014-07-24 20:20:36 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** Считает количество уникальных значений до не более чем указанного в параметре.
|
|
|
|
|
*
|
|
|
|
|
* Пример: uniqUpTo(3)(UserID)
|
|
|
|
|
* - посчитает количество уникальных посетителей, вернёт 1, 2, 3 или 4, если их >= 4.
|
|
|
|
|
*
|
|
|
|
|
* Для строк используется некриптографическая хэш-функция, за счёт чего рассчёт может быть немного неточным.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
2014-08-01 05:19:17 +00:00
|
|
|
|
struct __attribute__((__packed__)) AggregateFunctionUniqUpToData
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
2014-08-01 19:14:21 +00:00
|
|
|
|
/** Если count == threshold + 1 - это значит, что "переполнилось" (значений больше threshold).
|
|
|
|
|
* В этом случае (например, после вызова функции merge), массив data не обязательно содержит инициализированные значения
|
|
|
|
|
* - пример: объединяем состояние, в котором мало значений, с другим состоянием, которое переполнилось;
|
|
|
|
|
* тогда выставляем count в threshold + 1, а значения из другого состояния не копируем.
|
|
|
|
|
*/
|
2014-08-01 05:19:17 +00:00
|
|
|
|
UInt8 count = 0;
|
2014-08-01 19:14:21 +00:00
|
|
|
|
|
|
|
|
|
/// Данные идут после конца структуры. При вставке, делается линейный поиск.
|
|
|
|
|
T data[0];
|
2014-07-24 20:20:36 +00:00
|
|
|
|
|
2014-07-25 01:32:59 +00:00
|
|
|
|
|
2014-07-24 20:20:36 +00:00
|
|
|
|
size_t size() const
|
|
|
|
|
{
|
|
|
|
|
return count;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// threshold - для скольки элементов есть место в data.
|
2014-08-01 05:19:17 +00:00
|
|
|
|
void insert(T x, UInt8 threshold)
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
2014-08-01 19:14:21 +00:00
|
|
|
|
/// Состояние уже переполнено - ничего делать не нужно.
|
2014-07-24 20:20:36 +00:00
|
|
|
|
if (count > threshold)
|
|
|
|
|
return;
|
|
|
|
|
|
2014-08-01 19:14:21 +00:00
|
|
|
|
/// Линейный поиск совпадающего элемента.
|
|
|
|
|
for (size_t i = 0; i < count; ++i)
|
2014-07-24 20:20:36 +00:00
|
|
|
|
if (data[i] == x)
|
|
|
|
|
return;
|
|
|
|
|
|
2014-08-01 19:14:21 +00:00
|
|
|
|
/// Не нашли совпадающий элемент. Если есть место ещё для одного элемента - вставляем его.
|
2014-07-24 20:20:36 +00:00
|
|
|
|
if (count < threshold)
|
|
|
|
|
data[count] = x;
|
|
|
|
|
|
2014-08-01 19:14:21 +00:00
|
|
|
|
/// После увеличения count, состояние может оказаться переполненным.
|
2014-07-24 20:20:36 +00:00
|
|
|
|
++count;
|
|
|
|
|
}
|
|
|
|
|
|
2014-08-01 05:19:17 +00:00
|
|
|
|
void merge(const AggregateFunctionUniqUpToData<T> & rhs, UInt8 threshold)
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
|
|
|
|
if (count > threshold)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (rhs.count > threshold)
|
|
|
|
|
{
|
2014-08-01 19:14:21 +00:00
|
|
|
|
/// Если rhs переполнено, то выставляем у текущего состояния count тоже переполненным.
|
2014-07-24 20:20:36 +00:00
|
|
|
|
count = rhs.count;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
2014-08-01 19:14:21 +00:00
|
|
|
|
for (size_t i = 0; i < rhs.count; ++i)
|
2014-07-24 20:20:36 +00:00
|
|
|
|
insert(rhs.data[i], threshold);
|
|
|
|
|
}
|
|
|
|
|
|
2014-08-01 05:19:17 +00:00
|
|
|
|
void write(WriteBuffer & wb, UInt8 threshold) const
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
2014-08-01 19:14:21 +00:00
|
|
|
|
writeBinary(count, wb);
|
|
|
|
|
|
|
|
|
|
/// Пишем значения, только если состояние не переполнено. Иначе они не нужны, а важен только факт того, что состояние переполнено.
|
|
|
|
|
if (count <= threshold)
|
2014-10-02 19:20:09 +00:00
|
|
|
|
wb.write(reinterpret_cast<const char *>(&data[0]), count * sizeof(data[0]));
|
2014-07-24 20:20:36 +00:00
|
|
|
|
}
|
|
|
|
|
|
2016-03-12 04:01:03 +00:00
|
|
|
|
void read(ReadBuffer & rb, UInt8 threshold)
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
2016-03-12 04:01:03 +00:00
|
|
|
|
readBinary(count, rb);
|
2014-07-24 20:20:36 +00:00
|
|
|
|
|
2016-03-12 04:01:03 +00:00
|
|
|
|
if (count <= threshold)
|
|
|
|
|
rb.read(reinterpret_cast<char *>(&data[0]), count * sizeof(data[0]));
|
2014-07-24 20:20:36 +00:00
|
|
|
|
}
|
2014-07-25 01:32:59 +00:00
|
|
|
|
|
|
|
|
|
|
2015-11-15 06:23:44 +00:00
|
|
|
|
void addImpl(const IColumn & column, size_t row_num, UInt8 threshold)
|
2014-07-25 01:32:59 +00:00
|
|
|
|
{
|
|
|
|
|
insert(static_cast<const ColumnVector<T> &>(column).getData()[row_num], threshold);
|
|
|
|
|
}
|
2014-07-24 20:20:36 +00:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/// Для строк, запоминаются их хэши.
|
|
|
|
|
template <>
|
2014-07-25 01:32:59 +00:00
|
|
|
|
struct AggregateFunctionUniqUpToData<String> : AggregateFunctionUniqUpToData<UInt64>
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
2015-11-15 06:23:44 +00:00
|
|
|
|
void addImpl(const IColumn & column, size_t row_num, UInt8 threshold)
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
2014-07-25 01:32:59 +00:00
|
|
|
|
/// Имейте ввиду, что вычисление приближённое.
|
|
|
|
|
StringRef value = column.getDataAt(row_num);
|
|
|
|
|
insert(CityHash64(value.data, value.size), threshold);
|
|
|
|
|
}
|
|
|
|
|
};
|
2014-07-24 20:20:36 +00:00
|
|
|
|
|
|
|
|
|
|
2014-08-01 05:19:17 +00:00
|
|
|
|
constexpr UInt8 uniq_upto_max_threshold = 100;
|
2014-07-24 20:20:36 +00:00
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
class AggregateFunctionUniqUpTo final : public IUnaryAggregateFunction<AggregateFunctionUniqUpToData<T>, AggregateFunctionUniqUpTo<T> >
|
|
|
|
|
{
|
|
|
|
|
private:
|
2014-08-01 05:19:17 +00:00
|
|
|
|
UInt8 threshold = 5; /// Значение по-умолчанию, если параметр не указан.
|
2014-07-24 20:20:36 +00:00
|
|
|
|
|
|
|
|
|
public:
|
2015-11-11 02:04:23 +00:00
|
|
|
|
size_t sizeOfData() const override
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
|
|
|
|
return sizeof(AggregateFunctionUniqUpToData<T>) + sizeof(T) * threshold;
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
String getName() const override { return "uniqUpTo"; }
|
2014-07-24 20:20:36 +00:00
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
DataTypePtr getReturnType() const override
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
2016-05-28 07:48:40 +00:00
|
|
|
|
return std::make_shared<DataTypeUInt64>();
|
2014-07-24 20:20:36 +00:00
|
|
|
|
}
|
|
|
|
|
|
2015-11-15 08:31:08 +00:00
|
|
|
|
void setArgument(const DataTypePtr & argument)
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
void setParameters(const Array & params) override
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
|
|
|
|
if (params.size() != 1)
|
|
|
|
|
throw Exception("Aggregate function " + getName() + " requires exactly one parameter.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
|
|
|
|
|
2014-08-01 19:31:38 +00:00
|
|
|
|
UInt64 threshold_param = apply_visitor(FieldVisitorConvertToNumber<UInt64>(), params[0]);
|
2014-07-24 20:20:36 +00:00
|
|
|
|
|
2014-08-01 19:31:38 +00:00
|
|
|
|
if (threshold_param > uniq_upto_max_threshold)
|
2014-07-24 20:20:36 +00:00
|
|
|
|
throw Exception("Too large parameter for aggregate function " + getName() + ". Maximum: " + toString(uniq_upto_max_threshold),
|
|
|
|
|
ErrorCodes::ARGUMENT_OUT_OF_BOUND);
|
2014-08-01 19:31:38 +00:00
|
|
|
|
|
|
|
|
|
threshold = threshold_param;
|
2014-07-24 20:20:36 +00:00
|
|
|
|
}
|
|
|
|
|
|
2015-11-15 06:23:44 +00:00
|
|
|
|
void addImpl(AggregateDataPtr place, const IColumn & column, size_t row_num) const
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
2015-11-15 06:23:44 +00:00
|
|
|
|
this->data(place).addImpl(column, row_num, threshold);
|
2014-07-24 20:20:36 +00:00
|
|
|
|
}
|
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs) const override
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
|
|
|
|
this->data(place).merge(this->data(rhs), threshold);
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
2014-07-31 20:26:22 +00:00
|
|
|
|
this->data(place).write(buf, threshold);
|
2014-07-24 20:20:36 +00:00
|
|
|
|
}
|
|
|
|
|
|
2016-03-12 04:01:03 +00:00
|
|
|
|
void deserialize(AggregateDataPtr place, ReadBuffer & buf) const override
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
2016-03-12 04:01:03 +00:00
|
|
|
|
this->data(place).read(buf, threshold);
|
2014-07-24 20:20:36 +00:00
|
|
|
|
}
|
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override
|
2014-07-24 20:20:36 +00:00
|
|
|
|
{
|
|
|
|
|
static_cast<ColumnUInt64 &>(to).getData().push_back(this->data(place).size());
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
2015-10-29 03:41:09 +00:00
|
|
|
|
/** Для нескольких аргументов. Для вычисления, хэширует их.
|
|
|
|
|
* Можно передать несколько аргументов как есть; также можно передать один аргумент - кортеж.
|
|
|
|
|
* Но (для возможности эффективной реализации), нельзя передать несколько аргументов, среди которых есть кортежи.
|
|
|
|
|
*/
|
|
|
|
|
template <bool argument_is_tuple>
|
|
|
|
|
class AggregateFunctionUniqUpToVariadic final : public IAggregateFunctionHelper<AggregateFunctionUniqUpToData<UInt64>>
|
|
|
|
|
{
|
|
|
|
|
private:
|
|
|
|
|
size_t num_args = 0;
|
|
|
|
|
UInt8 threshold = 5; /// Значение по-умолчанию, если параметр не указан.
|
|
|
|
|
|
|
|
|
|
public:
|
2015-11-11 02:04:23 +00:00
|
|
|
|
size_t sizeOfData() const override
|
2015-10-30 02:34:24 +00:00
|
|
|
|
{
|
|
|
|
|
return sizeof(AggregateFunctionUniqUpToData<UInt64>) + sizeof(UInt64) * threshold;
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
String getName() const override { return "uniqUpTo"; }
|
2015-10-29 03:41:09 +00:00
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
DataTypePtr getReturnType() const override
|
2015-10-29 03:41:09 +00:00
|
|
|
|
{
|
2016-05-28 07:48:40 +00:00
|
|
|
|
return std::make_shared<DataTypeUInt64>();
|
2015-10-29 03:41:09 +00:00
|
|
|
|
}
|
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
void setArguments(const DataTypes & arguments) override
|
2015-10-29 03:41:09 +00:00
|
|
|
|
{
|
|
|
|
|
if (argument_is_tuple)
|
|
|
|
|
num_args = typeid_cast<const DataTypeTuple &>(*arguments[0]).getElements().size();
|
|
|
|
|
else
|
|
|
|
|
num_args = arguments.size();
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
void setParameters(const Array & params) override
|
2015-10-29 03:41:09 +00:00
|
|
|
|
{
|
|
|
|
|
if (params.size() != 1)
|
|
|
|
|
throw Exception("Aggregate function " + getName() + " requires exactly one parameter.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
|
|
|
|
|
|
|
|
|
|
UInt64 threshold_param = apply_visitor(FieldVisitorConvertToNumber<UInt64>(), params[0]);
|
|
|
|
|
|
|
|
|
|
if (threshold_param > uniq_upto_max_threshold)
|
|
|
|
|
throw Exception("Too large parameter for aggregate function " + getName() + ". Maximum: " + toString(uniq_upto_max_threshold),
|
|
|
|
|
ErrorCodes::ARGUMENT_OUT_OF_BOUND);
|
|
|
|
|
|
|
|
|
|
threshold = threshold_param;
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num) const override
|
2015-10-29 03:41:09 +00:00
|
|
|
|
{
|
2015-10-29 04:02:22 +00:00
|
|
|
|
this->data(place).insert(UniqVariadicHash<false, argument_is_tuple>::apply(num_args, columns, row_num), threshold);
|
2015-10-29 03:41:09 +00:00
|
|
|
|
}
|
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs) const override
|
2015-10-29 03:41:09 +00:00
|
|
|
|
{
|
|
|
|
|
this->data(place).merge(this->data(rhs), threshold);
|
|
|
|
|
}
|
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const override
|
2015-10-29 03:41:09 +00:00
|
|
|
|
{
|
|
|
|
|
this->data(place).write(buf, threshold);
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-12 04:01:03 +00:00
|
|
|
|
void deserialize(AggregateDataPtr place, ReadBuffer & buf) const override
|
2015-10-29 03:41:09 +00:00
|
|
|
|
{
|
2016-03-12 04:01:03 +00:00
|
|
|
|
this->data(place).read(buf, threshold);
|
2015-10-29 03:41:09 +00:00
|
|
|
|
}
|
|
|
|
|
|
2015-11-11 02:04:23 +00:00
|
|
|
|
void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const override
|
2015-10-29 03:41:09 +00:00
|
|
|
|
{
|
|
|
|
|
static_cast<ColumnUInt64 &>(to).getData().push_back(this->data(place).size());
|
|
|
|
|
}
|
2015-11-21 19:46:27 +00:00
|
|
|
|
|
|
|
|
|
static void addFree(const IAggregateFunction * that, AggregateDataPtr place, const IColumn ** columns, size_t row_num)
|
|
|
|
|
{
|
|
|
|
|
return static_cast<const AggregateFunctionUniqUpToVariadic &>(*that).add(place, columns, row_num);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
IAggregateFunction::AddFunc getAddressOfAddFunction() const override final { return &addFree; }
|
2015-10-29 03:41:09 +00:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
2014-07-24 20:20:36 +00:00
|
|
|
|
}
|