ClickHouse/dbms/src/Common/HyperLogLogWithSmallSetOptimization.h
Amos Bird 26ab5dd7a7 A Proper lookup table that uses HashTable's API
This is the first step of allowing heterogeneous cells in hash tables.

performance test results are

```

1. HashMap<UInt16, UInt8, TrivialHash, HashTableFixedGrower<16>>;
2. NewLookupMap<UInt16, UInt8>

ResolutionWidth 30000 1 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................223550276.46
ResolutionWidth 30000 2 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................248772721.24
Best: 2 - 24877272124

ResolutionWidth 100000 1 ..........................................................................................................................................................................................................................................................238498413.99
ResolutionWidth 100000 2 ..........................................................................................................................................................................................................................................................261808889.98
Best: 2 - 26180888998

ResolutionWidth 300000 1 ...................................................................................239307348.81
ResolutionWidth 300000 2 ...................................................................................257592761.30
Best: 2 - 25759276130

ResolutionWidth 1000000 1 .........................240144759.26
ResolutionWidth 1000000 2 .........................257093531.91
Best: 2 - 25709353191

ResolutionWidth 5000000 1 .....241573260.35
ResolutionWidth 5000000 2 .....259314162.79
Best: 2 - 25931416279

ResolutionDepth 30000 1 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................217108119.84
ResolutionDepth 30000 2 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................249459504.41
Best: 2 - 24945950441

ResolutionDepth 100000 1 ..........................................................................................................................................................................................................................................................229065162.17
ResolutionDepth 100000 2 ..........................................................................................................................................................................................................................................................253769105.64
Best: 2 - 25376910564

ResolutionDepth 300000 1 ...................................................................................233079225.18
ResolutionDepth 300000 2 ...................................................................................256316273.78
Best: 2 - 25631627378

ResolutionDepth 1000000 1 .........................234184633.51
ResolutionDepth 1000000 2 .........................261100491.57
Best: 2 - 26110049157

ResolutionDepth 5000000 1 .....233118795.66
ResolutionDepth 5000000 2 .....252436160.41
Best: 2 - 25243616041

```
2019-03-01 16:47:13 +08:00

152 lines
3.2 KiB
C++

#pragma once
#include <boost/noncopyable.hpp>
#include <Common/HyperLogLogCounter.h>
#include <Common/HashTable/SmallTable.h>
#include <Common/MemoryTracker.h>
namespace DB
{
/** For a small number of keys - an array of fixed size "on the stack".
* For large, HyperLogLog is allocated.
* See also the more practical implementation in CombinedCardinalityEstimator.h,
* where a hash table is also used for medium-sized sets.
*/
template
<
typename Key,
UInt8 small_set_size,
UInt8 K,
typename Hash = IntHash32<Key>,
typename DenominatorType = double>
class HyperLogLogWithSmallSetOptimization : private boost::noncopyable
{
private:
using Small = SmallSet<Key, small_set_size>;
using Large = HyperLogLogCounter<K, Hash, UInt32, DenominatorType>;
Small small;
Large * large = nullptr;
bool isLarge() const
{
return large != nullptr;
}
void toLarge()
{
CurrentMemoryTracker::alloc(sizeof(Large));
/// At the time of copying data from `tiny`, setting the value of `large` is still not possible (otherwise it will overwrite some data).
Large * tmp_large = new Large;
for (const auto & x : small)
tmp_large->insert(x.getValue());
large = tmp_large;
}
public:
using value_type = Key;
~HyperLogLogWithSmallSetOptimization()
{
if (isLarge())
{
delete large;
CurrentMemoryTracker::free(sizeof(Large));
}
}
void insert(Key value)
{
if (!isLarge())
{
if (small.find(value) == small.end())
{
if (!small.full())
small.insert(value);
else
{
toLarge();
large->insert(value);
}
}
}
else
large->insert(value);
}
UInt64 size() const
{
return !isLarge() ? small.size() : large->size();
}
void merge(const HyperLogLogWithSmallSetOptimization & rhs)
{
if (rhs.isLarge())
{
if (!isLarge())
toLarge();
large->merge(*rhs.large);
}
else
{
for (const auto & x : rhs.small)
insert(x.getValue());
}
}
/// You can only call for an empty object.
void read(DB::ReadBuffer & in)
{
bool is_large;
readBinary(is_large, in);
if (is_large)
{
toLarge();
large->read(in);
}
else
small.read(in);
}
void readAndMerge(DB::ReadBuffer & in)
{
bool is_rhs_large;
readBinary(is_rhs_large, in);
if (!isLarge() && is_rhs_large)
toLarge();
if (!is_rhs_large)
{
typename Small::Reader reader(in);
while (reader.next())
insert(reader.get());
}
else
large->readAndMerge(in);
}
void write(DB::WriteBuffer & out) const
{
writeBinary(isLarge(), out);
if (isLarge())
large->write(out);
else
small.write(out);
}
};
}