ClickHouse/src/Common/CombinedCardinalityEstimator.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

329 lines
9.0 KiB
C++
Raw Normal View History

2015-07-20 17:09:43 +00:00
#pragma once
#include <Common/HashTable/SmallTable.h>
#include <Common/HashTable/HashSet.h>
#include <Common/HyperLogLogCounter.h>
#include <Core/Defines.h>
2015-07-20 17:09:43 +00:00
namespace DB
{
2020-02-25 18:10:48 +00:00
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
2015-07-20 17:09:43 +00:00
namespace details
{

/// Tag identifying which of the three containers currently holds the data.
/// The numeric values grow with container capacity (SMALL < MEDIUM < LARGE); `max` below relies on that order.
enum class ContainerType : uint8_t { SMALL = 1, MEDIUM = 2, LARGE = 3 };

/// Returns whichever of the two container types has the greater underlying value.
static inline ContainerType max(const ContainerType & lhs, const ContainerType & rhs)
{
    /// Compared directly rather than through std::max, so no <algorithm> include is needed.
    return static_cast<uint8_t>(lhs) < static_cast<uint8_t>(rhs) ? rhs : lhs;
}

}
/** For a small number of keys - an array of fixed size "on the stack".
  * For a medium number of keys, a HashSet is allocated.
  * For a large number of keys, a HyperLogLog is allocated.
  */
template
<
typename Key,
typename HashContainer,
UInt8 small_set_size_max,
UInt8 medium_set_power2_max,
UInt8 K,
typename Hash = IntHash32<Key>,
typename HashValueType = UInt32,
typename BiasEstimator = TrivialBiasEstimator,
HyperLogLogMode mode = HyperLogLogMode::FullFeatured,
typename DenominatorType = double
>
2015-07-20 17:09:43 +00:00
class CombinedCardinalityEstimator
{
public:
using Self = CombinedCardinalityEstimator
<
Key,
HashContainer,
small_set_size_max,
medium_set_power2_max,
K,
Hash,
HashValueType,
BiasEstimator,
mode,
DenominatorType
>;
2015-07-20 17:09:43 +00:00
using value_type = Key;
2015-07-20 17:09:43 +00:00
private:
using Small = SmallSet<Key, small_set_size_max>;
using Medium = HashContainer;
using Large = HyperLogLogCounter<K, Key, Hash, HashValueType, DenominatorType, BiasEstimator, mode>;
2015-07-20 17:09:43 +00:00
public:
CombinedCardinalityEstimator()
2015-07-20 17:09:43 +00:00
{
setContainerType(details::ContainerType::SMALL);
}
/// Frees the heap-allocated medium or large container, if there is one.
~CombinedCardinalityEstimator()
{
    destroy();
}
2015-07-20 17:09:43 +00:00
/// Adds a value to the current container, upgrading SMALL -> MEDIUM when the small set is full
/// and MEDIUM -> LARGE when the medium container has reached medium_set_size_max elements.
void insert(Key value)
{
    switch (getContainerType())
    {
        case details::ContainerType::SMALL:
        {
            /// A value that is already present never triggers an upgrade.
            if (small.find(value) != small.end())
                break;

            if (!small.full())
                small.insert(value);
            else
            {
                toMedium();
                getContainer<Medium>().insert(value);
            }
            break;
        }
        case details::ContainerType::MEDIUM:
        {
            auto & container = getContainer<Medium>();

            if (container.size() < medium_set_size_max)
                container.insert(value);
            else
            {
                /// `container` is destroyed by toLarge() and must not be used afterwards.
                toLarge();
                getContainer<Large>().insert(value);
            }
            break;
        }
        case details::ContainerType::LARGE:
        {
            getContainer<Large>().insert(value);
            break;
        }
    }
}
/// Returns the size reported by whichever container is currently active.
/// Throws if the stored container tag is not one of SMALL, MEDIUM or LARGE.
UInt64 size() const
{
    switch (getContainerType())
    {
        case details::ContainerType::SMALL:
            return small.size();
        case details::ContainerType::MEDIUM:
            return getContainer<Medium>().size();
        case details::ContainerType::LARGE:
            return getContainer<Large>().size();
    }

    throw Poco::Exception("Internal error", ErrorCodes::LOGICAL_ERROR);
}
2015-07-20 17:09:43 +00:00
void merge(const Self & rhs)
{
auto container_type = getContainerType();
auto max_container_type = details::max(container_type, rhs.getContainerType());
2015-07-20 17:09:43 +00:00
if (container_type != max_container_type)
{
if (max_container_type == details::ContainerType::MEDIUM)
toMedium();
else if (max_container_type == details::ContainerType::LARGE)
toLarge();
}
if (rhs.getContainerType() == details::ContainerType::SMALL)
2015-07-20 17:09:43 +00:00
{
for (const auto & x : rhs.small)
A Proper lookup table that uses HashTable's API This is the first step of allowing heterogeneous cells in hash tables. performance test results are ``` 1. HashMap<UInt16, UInt8, TrivialHash, HashTableFixedGrower<16>>; 2. NewLookupMap<UInt16, UInt8> ResolutionWidthesolutionWidthest: 2 - 24877272124 ResolutionWidth 100000 1 ..........................................................................................................................................................................................................................................................238498413.99 ResolutionWidth 100000 2 ..........................................................................................................................................................................................................................................................261808889.98 Best: 2 - 26180888998 ResolutionWidth 300000 1 ...................................................................................239307348.81 ResolutionWidth 300000 2 ...................................................................................257592761.30 Best: 2 - 25759276130 ResolutionWidth 1000000 1 .........................240144759.26 ResolutionWidth 1000000 2 .........................257093531.91 Best: 2 - 25709353191 ResolutionWidth 5000000 1 .....241573260.35 ResolutionWidth 5000000 2 .....259314162.79 Best: 2 - 25931416279 ResolutionDepthesolutionDepthest: 2 - 24945950441 ResolutionDepth 100000 1 ..........................................................................................................................................................................................................................................................229065162.17 ResolutionDepth 100000 2 
..........................................................................................................................................................................................................................................................253769105.64 Best: 2 - 25376910564 ResolutionDepth 300000 1 ...................................................................................233079225.18 ResolutionDepth 300000 2 ...................................................................................256316273.78 Best: 2 - 25631627378 ResolutionDepth 1000000 1 .........................234184633.51 ResolutionDepth 1000000 2 .........................261100491.57 Best: 2 - 26110049157 ResolutionDepth 5000000 1 .....233118795.66 ResolutionDepth 5000000 2 .....252436160.41 Best: 2 - 25243616041 ```
2019-02-28 09:35:38 +00:00
insert(x.getValue());
2015-07-20 17:09:43 +00:00
}
else if (rhs.getContainerType() == details::ContainerType::MEDIUM)
2015-07-20 17:09:43 +00:00
{
for (const auto & x : rhs.getContainer<Medium>())
A Proper lookup table that uses HashTable's API This is the first step of allowing heterogeneous cells in hash tables. performance test results are ``` 1. HashMap<UInt16, UInt8, TrivialHash, HashTableFixedGrower<16>>; 2. NewLookupMap<UInt16, UInt8> ResolutionWidthesolutionWidth 30000 2 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................248772721.24 Best: 2 - 24877272124 ResolutionWidth 100000 1 ..........................................................................................................................................................................................................................................................238498413.99 ResolutionWidth 100000 2 ..........................................................................................................................................................................................................................................................261808889.98 Best: 2 - 26180888998 ResolutionWidth 300000 1 ...................................................................................239307348.81 ResolutionWidth 300000 2 ...................................................................................257592761.30 
Best: 2 - 25759276130 ResolutionWidth 1000000 1 .........................240144759.26 ResolutionWidth 1000000 2 .........................257093531.91 Best: 2 - 25709353191 ResolutionWidth 5000000 1 .....241573260.35 ResolutionWidth 5000000 2 .....259314162.79 Best: 2 - 25931416279 ResolutionDepthesolutionDepth 30000 2 .................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................249459504.41 Best: 2 - 24945950441 ResolutionDepth 100000 1 ..........................................................................................................................................................................................................................................................229065162.17 ResolutionDepth 100000 2 ..........................................................................................................................................................................................................................................................253769105.64 Best: 2 - 25376910564 ResolutionDepth 300000 1 ...................................................................................233079225.18 ResolutionDepth 300000 2 
...................................................................................256316273.78 Best: 2 - 25631627378 ResolutionDepth 1000000 1 .........................234184633.51 ResolutionDepth 1000000 2 .........................261100491.57 Best: 2 - 26110049157 ResolutionDepth 5000000 1 .....233118795.66 ResolutionDepth 5000000 2 .....252436160.41 Best: 2 - 25243616041 ```
2019-02-28 09:35:38 +00:00
insert(x.getValue());
2015-07-20 17:09:43 +00:00
}
else if (rhs.getContainerType() == details::ContainerType::LARGE)
getContainer<Large>().merge(rhs.getContainer<Large>());
2015-07-20 17:09:43 +00:00
}
2017-05-07 20:25:26 +00:00
/// You can only call for an empty object.
2015-07-20 17:09:43 +00:00
void read(DB::ReadBuffer & in)
{
UInt8 v;
readBinary(v, in);
auto container_type = static_cast<details::ContainerType>(v);
if (container_type == details::ContainerType::SMALL)
2015-07-20 17:09:43 +00:00
small.read(in);
else if (container_type == details::ContainerType::MEDIUM)
{
toMedium();
getContainer<Medium>().read(in);
}
else if (container_type == details::ContainerType::LARGE)
{
toLarge();
getContainer<Large>().read(in);
}
2015-07-20 17:09:43 +00:00
}
2015-07-20 17:09:43 +00:00
/// Deserializes a state from `in` and merges it into this object without building a temporary estimator.
void readAndMerge(DB::ReadBuffer & in)
{
    const auto lhs_type = getContainerType();

    /// If readAndMerge is called on an empty state, just deserialize
    /// the state passed as a parameter.
    if ((lhs_type == details::ContainerType::SMALL) && small.empty())
    {
        read(in);
        return;
    }

    UInt8 v;
    readBinary(v, in);
    const auto rhs_type = static_cast<details::ContainerType>(v);

    /// Upgrade this object to the larger of the two container types before merging.
    const auto target_type = details::max(lhs_type, rhs_type);

    if (lhs_type != target_type)
    {
        if (target_type == details::ContainerType::MEDIUM)
            toMedium();
        else if (target_type == details::ContainerType::LARGE)
            toLarge();
    }

    if (rhs_type == details::ContainerType::SMALL)
    {
        typename Small::Reader reader(in);
        while (reader.next())
            insert(reader.get());
    }
    else if (rhs_type == details::ContainerType::MEDIUM)
    {
        typename Medium::Reader reader(in);
        while (reader.next())
            insert(reader.get());
    }
    else if (rhs_type == details::ContainerType::LARGE)
    {
        getContainer<Large>().readAndMerge(in);
    }
}
2015-07-20 17:09:43 +00:00
/// Writes the container tag as one byte, followed by the active container's own serialized form.
void write(DB::WriteBuffer & out) const
{
    const auto container_type = getContainerType();
    writeBinary(static_cast<UInt8>(container_type), out);

    switch (container_type)
    {
        case details::ContainerType::SMALL:
            small.write(out);
            break;
        case details::ContainerType::MEDIUM:
            getContainer<Medium>().write(out);
            break;
        case details::ContainerType::LARGE:
            getContainer<Large>().write(out);
            break;
    }
}
private:
void toMedium()
{
if (getContainerType() != details::ContainerType::SMALL)
2015-07-20 17:09:43 +00:00
throw Poco::Exception("Internal error", ErrorCodes::LOGICAL_ERROR);
auto tmp_medium = std::make_unique<Medium>();
2015-07-20 17:09:43 +00:00
for (const auto & x : small)
A Proper lookup table that uses HashTable's API This is the first step of allowing heterogeneous cells in hash tables. performance test results are ``` 1. HashMap<UInt16, UInt8, TrivialHash, HashTableFixedGrower<16>>; 2. NewLookupMap<UInt16, UInt8> ResolutionWidthesolutionWidthest: 2 - 24877272124 ResolutionWidth 100000 1 ..........................................................................................................................................................................................................................................................238498413.99 ResolutionWidth 100000 2 ..........................................................................................................................................................................................................................................................261808889.98 Best: 2 - 26180888998 ResolutionWidth 300000 1 ...................................................................................239307348.81 ResolutionWidth 300000 2 ...................................................................................257592761.30 Best: 2 - 25759276130 ResolutionWidth 1000000 1 .........................240144759.26 ResolutionWidth 1000000 2 .........................257093531.91 Best: 2 - 25709353191 ResolutionWidth 5000000 1 .....241573260.35 ResolutionWidth 5000000 2 .....259314162.79 Best: 2 - 25931416279 ResolutionDepthesolutionDepthest: 2 - 24945950441 ResolutionDepth 100000 1 ..........................................................................................................................................................................................................................................................229065162.17 ResolutionDepth 100000 2 
..........................................................................................................................................................................................................................................................253769105.64 Best: 2 - 25376910564 ResolutionDepth 300000 1 ...................................................................................233079225.18 ResolutionDepth 300000 2 ...................................................................................256316273.78 Best: 2 - 25631627378 ResolutionDepth 1000000 1 .........................234184633.51 ResolutionDepth 1000000 2 .........................261100491.57 Best: 2 - 26110049157 ResolutionDepth 5000000 1 .....233118795.66 ResolutionDepth 5000000 2 .....252436160.41 Best: 2 - 25243616041 ```
2019-02-28 09:35:38 +00:00
tmp_medium->insert(x.getValue());
medium = tmp_medium.release();
setContainerType(details::ContainerType::MEDIUM);
2015-07-20 17:09:43 +00:00
}
2015-07-20 17:09:43 +00:00
void toLarge()
{
auto container_type = getContainerType();
2015-07-20 17:09:43 +00:00
if ((container_type != details::ContainerType::SMALL) && (container_type != details::ContainerType::MEDIUM))
throw Poco::Exception("Internal error", ErrorCodes::LOGICAL_ERROR);
auto tmp_large = std::make_unique<Large>();
2015-07-20 17:29:59 +00:00
if (container_type == details::ContainerType::SMALL)
{
for (const auto & x : small)
A Proper lookup table that uses HashTable's API This is the first step of allowing heterogeneous cells in hash tables. performance test results are ``` 1. HashMap<UInt16, UInt8, TrivialHash, HashTableFixedGrower<16>>; 2. NewLookupMap<UInt16, UInt8> ResolutionWidthesolutionWidthest: 2 - 24877272124 ResolutionWidth 100000 1 ..........................................................................................................................................................................................................................................................238498413.99 ResolutionWidth 100000 2 ..........................................................................................................................................................................................................................................................261808889.98 Best: 2 - 26180888998 ResolutionWidth 300000 1 ...................................................................................239307348.81 ResolutionWidth 300000 2 ...................................................................................257592761.30 Best: 2 - 25759276130 ResolutionWidth 1000000 1 .........................240144759.26 ResolutionWidth 1000000 2 .........................257093531.91 Best: 2 - 25709353191 ResolutionWidth 5000000 1 .....241573260.35 ResolutionWidth 5000000 2 .....259314162.79 Best: 2 - 25931416279 ResolutionDepthesolutionDepthest: 2 - 24945950441 ResolutionDepth 100000 1 ..........................................................................................................................................................................................................................................................229065162.17 ResolutionDepth 100000 2 
..........................................................................................................................................................................................................................................................253769105.64 Best: 2 - 25376910564 ResolutionDepth 300000 1 ...................................................................................233079225.18 ResolutionDepth 300000 2 ...................................................................................256316273.78 Best: 2 - 25631627378 ResolutionDepth 1000000 1 .........................234184633.51 ResolutionDepth 1000000 2 .........................261100491.57 Best: 2 - 26110049157 ResolutionDepth 5000000 1 .....233118795.66 ResolutionDepth 5000000 2 .....252436160.41 Best: 2 - 25243616041 ```
2019-02-28 09:35:38 +00:00
tmp_large->insert(x.getValue());
2015-07-20 17:29:59 +00:00
}
else if (container_type == details::ContainerType::MEDIUM)
{
for (const auto & x : getContainer<Medium>())
A Proper lookup table that uses HashTable's API This is the first step of allowing heterogeneous cells in hash tables. performance test results are ``` 1. HashMap<UInt16, UInt8, TrivialHash, HashTableFixedGrower<16>>; 2. NewLookupMap<UInt16, UInt8> ResolutionWidthesolutionWidthest: 2 - 24877272124 ResolutionWidth 100000 1 ..........................................................................................................................................................................................................................................................238498413.99 ResolutionWidth 100000 2 ..........................................................................................................................................................................................................................................................261808889.98 Best: 2 - 26180888998 ResolutionWidth 300000 1 ...................................................................................239307348.81 ResolutionWidth 300000 2 ...................................................................................257592761.30 Best: 2 - 25759276130 ResolutionWidth 1000000 1 .........................240144759.26 ResolutionWidth 1000000 2 .........................257093531.91 Best: 2 - 25709353191 ResolutionWidth 5000000 1 .....241573260.35 ResolutionWidth 5000000 2 .....259314162.79 Best: 2 - 25931416279 ResolutionDepth 30000 1 
.................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................217108119.84 ResolutionDepthest: 2 - 24945950441 ResolutionDepth 100000 1 ..........................................................................................................................................................................................................................................................229065162.17 ResolutionDepth 100000 2 ..........................................................................................................................................................................................................................................................253769105.64 Best: 2 - 25376910564 ResolutionDepth 300000 1 ...................................................................................233079225.18 ResolutionDepth 300000 2 ...................................................................................256316273.78 Best: 2 - 25631627378 ResolutionDepth 1000000 1 .........................234184633.51 ResolutionDepth 1000000 2 .........................261100491.57 Best: 2 - 26110049157 ResolutionDepth 5000000 1 .....233118795.66 ResolutionDepth 5000000 2 .....252436160.41 Best: 2 - 
25243616041 ```
2019-02-28 09:35:38 +00:00
tmp_large->insert(x.getValue());
destroy();
2015-07-20 17:29:59 +00:00
}
large = tmp_large.release();
setContainerType(details::ContainerType::LARGE);
}
/// Frees the heap-allocated medium or large container (if any) and clears the container tag.
void NO_INLINE destroy()
{
    const auto container_type = getContainerType();

    /// Strip the tag first: `medium` and `large` share storage with `address`,
    /// so they only hold the real pointer once the tag bits are zero.
    clearContainerType();

    if (container_type == details::ContainerType::MEDIUM)
    {
        delete medium;
        medium = nullptr;
    }
    else if (container_type == details::ContainerType::LARGE)
    {
        delete large;
        large = nullptr;
    }
}
2017-09-15 12:16:12 +00:00
template <typename T>
inline T & getContainer()
{
return *reinterpret_cast<T *>(address & mask);
}
2017-09-15 12:16:12 +00:00
template <typename T>
inline const T & getContainer() const
{
return *reinterpret_cast<T *>(address & mask);
}
/// Stores `t` in the tag bits of `address`, leaving the pointer bits untouched.
void setContainerType(details::ContainerType t)
{
    const UInt64 pointer_bits = address & mask;
    address = pointer_bits | static_cast<UInt8>(t);
}
/// Extracts the container tag from the bits of `address` that `mask` does not cover.
inline details::ContainerType getContainerType() const
{
    const UInt64 tag_bits = address & ~mask;
    return static_cast<details::ContainerType>(tag_bits);
}
/// Zeroes the tag bits of `address`, leaving only the pointer bits.
void clearContainerType()
{
    address &= mask;
}
Small small;
union
{
Medium * medium;
Large * large;
UInt64 address = 0;
};
static const UInt64 mask = 0xFFFFFFFFFFFFFFFC;
2022-01-11 15:45:41 +00:00
static const UInt32 medium_set_size_max = 1ULL << medium_set_power2_max;
2015-07-20 17:09:43 +00:00
};
}