ClickHouse/dbms/include/DB/AggregateFunctions/AggregateFunctionUniq.h

259 lines
6.4 KiB
C
Raw Normal View History

2011-09-26 04:00:46 +00:00
#pragma once
#include <city.h>
#include <type_traits>
2011-09-26 04:00:46 +00:00
#include <stats/UniquesHashSet.h>
#include <DB/IO/WriteHelpers.h>
#include <DB/IO/ReadHelpers.h>
#include <DB/IO/WriteBufferFromString.h>
2011-09-26 04:00:46 +00:00
#include <DB/DataTypes/DataTypesNumberFixed.h>
#include <DB/DataTypes/DataTypeString.h>
2011-09-26 04:00:46 +00:00
#include <DB/Interpreters/AggregationCommon.h>
#include <DB/Common/HashTable/HashSet.h>
#include <DB/Common/HyperLogLogWithSmallSetOptimization.h>
2015-07-20 17:09:43 +00:00
#include <DB/Common/CombinedCardinalityEstimator.h>
#include <DB/Columns/ColumnString.h>
2011-09-26 04:00:46 +00:00
#include <DB/AggregateFunctions/IUnaryAggregateFunction.h>
namespace DB
{
template <typename T> struct AggregateFunctionUniqTraits
2011-09-26 04:00:46 +00:00
{
static UInt64 hash(T x) { return x; }
2011-09-26 04:00:46 +00:00
};
template <> struct AggregateFunctionUniqTraits<Float32>
2011-09-26 04:00:46 +00:00
{
static UInt64 hash(Float32 x)
{
UInt64 res = 0;
memcpy(reinterpret_cast<char *>(&res), reinterpret_cast<char *>(&x), sizeof(x));
return res;
}
2011-09-26 04:00:46 +00:00
};
/** Float64 specialization: the key is built from the object representation of the value.
  * The sizeof(x) bytes of the double are copied to the start of a zero-initialized UInt64,
  * so no numeric (float -> integer) conversion takes place.
  */
template <>
struct AggregateFunctionUniqTraits<Float64>
{
    static UInt64 hash(Float64 x)
    {
        UInt64 bits = 0;
        /// memcpy accepts void pointers, so the addresses can be passed directly.
        memcpy(&bits, &x, sizeof(x));
        return bits;
    }
};
/// Aggregation state of the function registered under the name "uniq":
/// a UniquesHashSet instantiated with DefaultHash<UInt64>.
struct AggregateFunctionUniqUniquesHashSetData
{
    using Set = UniquesHashSet<DefaultHash<UInt64>>;

    Set set;

    static String getName()
    {
        return "uniq";
    }
};
/// Aggregation state of "uniqHLL12" for values of type T:
/// a HyperLogLogWithSmallSetOptimization<T, 16, 12> holding the values themselves.
template <typename T>
struct AggregateFunctionUniqHLL12Data
{
    using Set = HyperLogLogWithSmallSetOptimization<T, 16, 12>;

    Set set;

    static String getName()
    {
        return "uniqHLL12";
    }
};
/// Aggregation state of "uniqHLL12" for strings: the set is keyed by UInt64
/// rather than by the string itself.
template <>
struct AggregateFunctionUniqHLL12Data<String>
{
    using Set = HyperLogLogWithSmallSetOptimization<UInt64, 16, 12>;

    Set set;

    static String getName()
    {
        return "uniqHLL12";
    }
};
/// Aggregation state of "uniqExact" for values of type T: a hash set of the values themselves.
template <typename T>
struct AggregateFunctionUniqExactData
{
    using Key = T;

    /// The hash table must be small right after creation.
    using Set = HashSet<
        Key,
        DefaultHash<Key>,
        HashTableGrower<4>,
        HashTableAllocatorWithStackMemory<sizeof(Key) * (1 << 4)>>;

    Set set;

    static String getName()
    {
        return "uniqExact";
    }
};
/// For strings, the hash table stores SipHash values (128 bits) instead of the strings.
template <>
struct AggregateFunctionUniqExactData<String>
{
    using Key = UInt128;

    /// The hash table must be small right after creation.
    using Set = HashSet<
        Key,
        UInt128TrivialHash,
        HashTableGrower<3>,
        HashTableAllocatorWithStackMemory<sizeof(Key) * (1 << 3)>>;

    Set set;

    static String getName()
    {
        return "uniqExact";
    }
};
2015-07-20 17:09:43 +00:00
/// Aggregation state of "uniqCombined" for values of type T:
/// a CombinedCardinalityEstimator built on top of a HashSet of the same key type.
template <typename T>
struct AggregateFunctionUniqCombinedData
{
    using Key = T;

    using Set = CombinedCardinalityEstimator<
        Key,
        HashSet<Key, DefaultHash<Key>, HashTableGrower<4>>,
        16,
        16,
        19>;

    Set set;

    static String getName()
    {
        return "uniqCombined";
    }
};
/// Aggregation state of "uniqCombined" for strings: the estimator is keyed by UInt64
/// rather than by the string itself.
template <>
struct AggregateFunctionUniqCombinedData<String>
{
    using Key = UInt64;

    using Set = CombinedCardinalityEstimator<
        Key,
        HashSet<Key, DefaultHash<Key>, HashTableGrower<4>>,
        16,
        16,
        19>;

    Set set;

    static String getName()
    {
        return "uniqCombined";
    }
};
namespace detail
{
/** Структура для делегации работы по добавлению одного элемента в агрегатные функции uniq.
* Используется для частичной специализации для добавления строк.
*/
template<typename T, typename Data>
struct OneAdder
{
static void addOne(Data & data, const IColumn & column, size_t row_num)
{
data.set.insert(AggregateFunctionUniqTraits<T>::hash(static_cast<const ColumnVector<T> &>(column).getData()[row_num]));
}
};
template<typename Data>
struct OneAdder<String, Data>
{
static void addOne(Data & data, const IColumn & column, size_t row_num)
{
/// Имейте ввиду, что вычисление приближённое.
StringRef value = column.getDataAt(row_num);
data.set.insert(CityHash64(value.data, value.size));
}
};
/// uniqExact over ColumnVector<T>: the element itself is inserted, without going through
/// AggregateFunctionUniqTraits.
template <typename T>
struct OneAdder<T, AggregateFunctionUniqExactData<T>>
{
    static void addOne(AggregateFunctionUniqExactData<T> & data, const IColumn & column, size_t row_num)
    {
        const auto & typed_column = static_cast<const ColumnVector<T> &>(column);
        const auto & value = typed_column.getData()[row_num];
        data.set.insert(value);
    }
};
/// uniqExact over strings: the 128-bit SipHash of the string bytes is inserted as the key.
template <>
struct OneAdder<String, AggregateFunctionUniqExactData<String>>
{
    static void addOne(AggregateFunctionUniqExactData<String> & data, const IColumn & column, size_t row_num)
    {
        const StringRef value = column.getDataAt(row_num);

        SipHash hash;
        hash.update(value.data, value.size);

        /// The two halves of the 128-bit result become the two fields of the key.
        UInt128 key;
        hash.get128(key.first, key.second);

        data.set.insert(key);
    }
};
2015-07-20 17:09:43 +00:00
/** uniqCombined over ColumnVector<T>.
  * While the estimator reports isMedium(), the raw element is inserted;
  * otherwise the element is first passed through AggregateFunctionUniqTraits<T>::hash.
  */
template <typename T>
struct OneAdder<T, AggregateFunctionUniqCombinedData<T>>
{
    static void addOne(AggregateFunctionUniqCombinedData<T> & data, const IColumn & column, size_t row_num)
    {
        const auto & typed_column = static_cast<const ColumnVector<T> &>(column);
        const auto & value = typed_column.getData()[row_num];

        if (data.set.isMedium())
        {
            data.set.insert(value);
            return;
        }

        data.set.insert(AggregateFunctionUniqTraits<T>::hash(value));
    }
};
/// uniqCombined over strings: CityHash64 of the string bytes is inserted as the key.
template <>
struct OneAdder<String, AggregateFunctionUniqCombinedData<String>>
{
    static void addOne(AggregateFunctionUniqCombinedData<String> & data, const IColumn & column, size_t row_num)
    {
        const StringRef value = column.getDataAt(row_num);
        const auto key = CityHash64(value.data, value.size);
        data.set.insert(key);
    }
};
}
2011-09-26 04:00:46 +00:00
/** Approximately computes the number of distinct values.
  *
  * T is the argument type used to pick the detail::OneAdder specialization;
  * Data is the aggregation state: it supplies the function name (Data::getName())
  * and the `set` member that all operations below are forwarded to.
  */
template <typename T, typename Data>
class AggregateFunctionUniq final : public IUnaryAggregateFunction<Data, AggregateFunctionUniq<T, Data>>
{
public:
    /// The name is defined by the state type.
    String getName() const
    {
        return Data::getName();
    }

    /// The result is always a UInt64.
    DataTypePtr getReturnType() const
    {
        return new DataTypeUInt64;
    }

    /// The argument type is not inspected here.
    void setArgument(const DataTypePtr & argument)
    {
    }

    /// Adds the value at row_num of the column to the state; the work is done by detail::OneAdder.
    void addOne(AggregateDataPtr place, const IColumn & column, size_t row_num) const
    {
        Data & state = this->data(place);
        detail::OneAdder<T, Data>::addOne(state, column, row_num);
    }

    /// Merges the set of the state `rhs` into the set of the state `place`.
    void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs) const
    {
        auto & target = this->data(place).set;
        const auto & source = this->data(rhs).set;
        target.merge(source);
    }

    /// Writes the set of the state to the buffer.
    void serialize(ConstAggregateDataPtr place, WriteBuffer & buf) const
    {
        const auto & source = this->data(place).set;
        source.write(buf);
    }

    /// Reads a set from the buffer and merges it into the set of the state.
    void deserializeMerge(AggregateDataPtr place, ReadBuffer & buf) const
    {
        auto & target = this->data(place).set;
        target.readAndMerge(buf);
    }

    /// Appends the size() of the set to the result column, which is treated as ColumnUInt64.
    void insertResultInto(ConstAggregateDataPtr place, IColumn & to) const
    {
        auto & result_column = static_cast<ColumnUInt64 &>(to);
        result_column.getData().push_back(this->data(place).set.size());
    }
};
2011-09-26 04:00:46 +00:00
}