2015-02-22 07:23:37 +00:00
|
|
|
#pragma once
|
|
|
|
|
2017-11-12 00:46:22 +00:00
|
|
|
#include <memory>
#include <utility>

#include <boost/noncopyable.hpp>

#include <Common/HyperLogLogCounter.h>
#include <Common/HashTable/SmallTable.h>
|
2017-04-08 01:32:05 +00:00
|
|
|
|
2015-02-22 07:23:37 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
/** For a small number of keys, the keys are kept in a fixed-size array stored inside the object ("on the stack").
  * Once that array is full, a HyperLogLog counter is allocated and used instead.
  * See also the more practical implementation in CombinedCardinalityEstimator.h,
  *  where a hash table is also used for medium-sized sets.
  */
|
2017-04-08 01:32:05 +00:00
|
|
|
template
|
|
|
|
<
|
2017-04-01 07:20:54 +00:00
|
|
|
typename Key,
|
|
|
|
UInt8 small_set_size,
|
|
|
|
UInt8 K,
|
|
|
|
typename Hash = IntHash32<Key>,
|
|
|
|
typename DenominatorType = double>
|
2017-11-12 00:46:22 +00:00
|
|
|
class HyperLogLogWithSmallSetOptimization : private boost::noncopyable
|
2015-02-22 07:23:37 +00:00
|
|
|
{
|
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
using Small = SmallSet<Key, small_set_size>;
|
|
|
|
using Large = HyperLogLogCounter<K, Hash, UInt32, DenominatorType>;
|
2020-08-19 11:52:17 +00:00
|
|
|
using LargeValueType = typename Large::value_type;
|
2015-02-22 07:23:37 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
Small small;
|
|
|
|
Large * large = nullptr;
|
2015-02-22 07:23:37 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
bool isLarge() const
|
|
|
|
{
|
|
|
|
return large != nullptr;
|
|
|
|
}
|
2015-02-22 07:23:37 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
void toLarge()
|
|
|
|
{
|
2017-05-07 20:25:26 +00:00
|
|
|
/// At the time of copying data from `tiny`, setting the value of `large` is still not possible (otherwise it will overwrite some data).
|
2017-04-01 07:20:54 +00:00
|
|
|
Large * tmp_large = new Large;
|
2015-02-22 07:23:37 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
for (const auto & x : small)
|
2020-08-19 11:52:17 +00:00
|
|
|
tmp_large->insert(static_cast<LargeValueType>(x.getValue()));
|
2015-02-22 07:23:37 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
large = tmp_large;
|
|
|
|
}
|
2015-02-22 07:23:37 +00:00
|
|
|
|
|
|
|
public:
|
2018-07-16 03:12:01 +00:00
|
|
|
using value_type = Key;
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
~HyperLogLogWithSmallSetOptimization()
|
|
|
|
{
|
|
|
|
if (isLarge())
|
|
|
|
delete large;
|
|
|
|
}
|
|
|
|
|
2019-11-11 08:36:19 +00:00
|
|
|
/// ALWAYS_INLINE is required to have better code layout for uniqHLL12 function
|
|
|
|
void ALWAYS_INLINE insert(Key value)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
if (!isLarge())
|
|
|
|
{
|
|
|
|
if (small.find(value) == small.end())
|
|
|
|
{
|
|
|
|
if (!small.full())
|
|
|
|
small.insert(value);
|
|
|
|
else
|
|
|
|
{
|
|
|
|
toLarge();
|
2020-08-19 11:52:17 +00:00
|
|
|
large->insert(static_cast<LargeValueType>(value));
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
2020-08-19 11:52:17 +00:00
|
|
|
large->insert(static_cast<LargeValueType>(value));
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
2018-01-31 11:36:01 +00:00
|
|
|
UInt64 size() const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
return !isLarge() ? small.size() : large->size();
|
|
|
|
}
|
|
|
|
|
|
|
|
void merge(const HyperLogLogWithSmallSetOptimization & rhs)
|
|
|
|
{
|
|
|
|
if (rhs.isLarge())
|
|
|
|
{
|
|
|
|
if (!isLarge())
|
|
|
|
toLarge();
|
|
|
|
|
|
|
|
large->merge(*rhs.large);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
for (const auto & x : rhs.small)
|
2019-02-28 09:35:38 +00:00
|
|
|
insert(x.getValue());
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/// You can only call for an empty object.
|
2017-04-01 07:20:54 +00:00
|
|
|
void read(DB::ReadBuffer & in)
|
|
|
|
{
|
|
|
|
bool is_large;
|
|
|
|
readBinary(is_large, in);
|
|
|
|
|
|
|
|
if (is_large)
|
|
|
|
{
|
|
|
|
toLarge();
|
|
|
|
large->read(in);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
small.read(in);
|
|
|
|
}
|
|
|
|
|
|
|
|
void readAndMerge(DB::ReadBuffer & in)
|
|
|
|
{
|
|
|
|
bool is_rhs_large;
|
|
|
|
readBinary(is_rhs_large, in);
|
|
|
|
|
|
|
|
if (!isLarge() && is_rhs_large)
|
|
|
|
toLarge();
|
|
|
|
|
|
|
|
if (!is_rhs_large)
|
|
|
|
{
|
|
|
|
typename Small::Reader reader(in);
|
|
|
|
while (reader.next())
|
|
|
|
insert(reader.get());
|
|
|
|
}
|
|
|
|
else
|
|
|
|
large->readAndMerge(in);
|
|
|
|
}
|
|
|
|
|
|
|
|
void write(DB::WriteBuffer & out) const
|
|
|
|
{
|
|
|
|
writeBinary(isLarge(), out);
|
|
|
|
|
|
|
|
if (isLarge())
|
|
|
|
large->write(out);
|
|
|
|
else
|
|
|
|
small.write(out);
|
|
|
|
}
|
2015-02-22 07:23:37 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
}
|