2015-07-20 17:09:43 +00:00
|
|
|
|
#pragma once
|
|
|
|
|
|
|
|
|
|
#include <DB/Common/HashTable/SmallTable.h>
|
|
|
|
|
#include <DB/Common/HashTable/HashSet.h>
|
2015-07-24 16:46:00 +00:00
|
|
|
|
#include <statdaemons/HyperLogLogCounter.h>
|
2015-07-22 14:12:34 +00:00
|
|
|
|
#include <DB/Core/Defines.h>
|
2015-07-20 17:09:43 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
|
{
|
|
|
|
|
|
|
|
|
|
namespace details
|
|
|
|
|
{
|
|
|
|
|
|
2015-07-22 14:12:34 +00:00
|
|
|
|
enum class ContainerType : UInt8 { SMALL = 1, MEDIUM = 2, LARGE = 3 };
|
2015-07-20 17:09:43 +00:00
|
|
|
|
|
2015-07-22 14:12:34 +00:00
|
|
|
|
static inline ContainerType max(const ContainerType & lhs, const ContainerType & rhs)
|
2015-07-20 17:09:43 +00:00
|
|
|
|
{
|
2015-07-24 16:46:00 +00:00
|
|
|
|
UInt8 res = std::max(static_cast<UInt8>(lhs), static_cast<UInt8>(rhs));
|
2015-07-20 17:09:43 +00:00
|
|
|
|
return static_cast<ContainerType>(res);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/** Для маленького количества ключей - массив фиксированного размера "на стеке".
|
|
|
|
|
* Для среднего - выделяется HashSet.
|
|
|
|
|
* Для большого - выделяется HyperLogLog.
|
|
|
|
|
*/
|
2015-07-24 16:46:00 +00:00
|
|
|
|
template
|
|
|
|
|
<
|
|
|
|
|
typename Key,
|
|
|
|
|
typename HashContainer,
|
|
|
|
|
UInt8 small_set_size_max,
|
|
|
|
|
UInt8 medium_set_power2_max,
|
|
|
|
|
UInt8 K,
|
|
|
|
|
typename Hash = IntHash32<Key>,
|
2015-08-20 17:11:21 +00:00
|
|
|
|
typename HashValueType = UInt32,
|
2015-08-21 17:09:34 +00:00
|
|
|
|
typename BiasEstimator = TrivialBiasEstimator,
|
2015-08-20 15:59:18 +00:00
|
|
|
|
HyperLogLogMode mode = HyperLogLogMode::FullFeatured,
|
2015-08-11 17:31:31 +00:00
|
|
|
|
typename DenominatorType = double
|
2015-07-24 16:46:00 +00:00
|
|
|
|
>
|
2015-07-20 17:09:43 +00:00
|
|
|
|
class CombinedCardinalityEstimator
|
|
|
|
|
{
|
|
|
|
|
public:
|
2015-08-20 15:59:18 +00:00
|
|
|
|
using Self = CombinedCardinalityEstimator
|
|
|
|
|
<
|
|
|
|
|
Key,
|
|
|
|
|
HashContainer,
|
|
|
|
|
small_set_size_max,
|
|
|
|
|
medium_set_power2_max,
|
|
|
|
|
K,
|
|
|
|
|
Hash,
|
2015-08-20 17:11:21 +00:00
|
|
|
|
HashValueType,
|
2015-08-21 17:09:34 +00:00
|
|
|
|
BiasEstimator,
|
2015-08-20 15:59:18 +00:00
|
|
|
|
mode,
|
|
|
|
|
DenominatorType
|
|
|
|
|
>;
|
2015-07-20 17:09:43 +00:00
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
using Small = SmallSet<Key, small_set_size_max>;
|
|
|
|
|
using Medium = HashContainer;
|
2015-08-21 17:09:34 +00:00
|
|
|
|
using Large = HyperLogLogCounter<K, Hash, HashValueType, DenominatorType, BiasEstimator, mode>;
|
2015-07-20 17:09:43 +00:00
|
|
|
|
|
|
|
|
|
public:
|
2015-07-22 14:12:34 +00:00
|
|
|
|
CombinedCardinalityEstimator()
|
2015-07-20 17:09:43 +00:00
|
|
|
|
{
|
2015-07-22 14:12:34 +00:00
|
|
|
|
setContainerType(details::ContainerType::SMALL);
|
|
|
|
|
}
|
2015-07-20 17:09:43 +00:00
|
|
|
|
|
2015-07-22 14:12:34 +00:00
|
|
|
|
~CombinedCardinalityEstimator()
|
|
|
|
|
{
|
|
|
|
|
destroy();
|
2015-07-20 17:09:43 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void insert(Key value)
|
|
|
|
|
{
|
2015-07-22 14:12:34 +00:00
|
|
|
|
auto container_type = getContainerType();
|
|
|
|
|
|
2015-07-20 17:09:43 +00:00
|
|
|
|
if (container_type == details::ContainerType::SMALL)
|
|
|
|
|
{
|
|
|
|
|
if (small.find(value) == small.end())
|
|
|
|
|
{
|
|
|
|
|
if (!small.full())
|
|
|
|
|
small.insert(value);
|
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
toMedium();
|
2015-07-22 15:25:59 +00:00
|
|
|
|
getContainer<Medium>().insert(value);
|
2015-07-20 17:09:43 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else if (container_type == details::ContainerType::MEDIUM)
|
|
|
|
|
{
|
2015-07-22 15:25:59 +00:00
|
|
|
|
auto & container = getContainer<Medium>();
|
|
|
|
|
if (container.size() < medium_set_size_max)
|
|
|
|
|
container.insert(value);
|
2015-07-20 17:09:43 +00:00
|
|
|
|
else
|
|
|
|
|
{
|
|
|
|
|
toLarge();
|
2015-07-22 15:25:59 +00:00
|
|
|
|
getContainer<Large>().insert(value);
|
2015-07-20 17:09:43 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
else if (container_type == details::ContainerType::LARGE)
|
2015-07-22 15:25:59 +00:00
|
|
|
|
getContainer<Large>().insert(value);
|
2015-07-20 17:09:43 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
UInt32 size() const
|
|
|
|
|
{
|
2015-07-22 14:12:34 +00:00
|
|
|
|
auto container_type = getContainerType();
|
|
|
|
|
|
2015-07-20 17:09:43 +00:00
|
|
|
|
if (container_type == details::ContainerType::SMALL)
|
|
|
|
|
return small.size();
|
|
|
|
|
else if (container_type == details::ContainerType::MEDIUM)
|
2015-07-22 15:25:59 +00:00
|
|
|
|
return getContainer<Medium>().size();
|
2015-07-20 17:09:43 +00:00
|
|
|
|
else if (container_type == details::ContainerType::LARGE)
|
2015-07-22 15:25:59 +00:00
|
|
|
|
return getContainer<Large>().size();
|
2015-07-20 17:09:43 +00:00
|
|
|
|
else
|
|
|
|
|
throw Poco::Exception("Internal error", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void merge(const Self & rhs)
|
|
|
|
|
{
|
2015-07-22 14:12:34 +00:00
|
|
|
|
auto container_type = getContainerType();
|
|
|
|
|
auto max_container_type = details::max(container_type, rhs.getContainerType());
|
2015-07-20 17:09:43 +00:00
|
|
|
|
|
|
|
|
|
if (container_type != max_container_type)
|
|
|
|
|
{
|
|
|
|
|
if (max_container_type == details::ContainerType::MEDIUM)
|
|
|
|
|
toMedium();
|
|
|
|
|
else if (max_container_type == details::ContainerType::LARGE)
|
|
|
|
|
toLarge();
|
|
|
|
|
}
|
|
|
|
|
|
2015-07-24 16:46:00 +00:00
|
|
|
|
if (rhs.getContainerType() == details::ContainerType::SMALL)
|
2015-07-20 17:09:43 +00:00
|
|
|
|
{
|
|
|
|
|
for (const auto & x : rhs.small)
|
|
|
|
|
insert(x);
|
|
|
|
|
}
|
2015-07-24 16:46:00 +00:00
|
|
|
|
else if (rhs.getContainerType() == details::ContainerType::MEDIUM)
|
2015-07-20 17:09:43 +00:00
|
|
|
|
{
|
2015-07-24 16:46:00 +00:00
|
|
|
|
for (const auto & x : rhs.getContainer<Medium>())
|
|
|
|
|
insert(x);
|
2015-07-20 17:09:43 +00:00
|
|
|
|
}
|
2015-07-24 16:46:00 +00:00
|
|
|
|
else if (rhs.getContainerType() == details::ContainerType::LARGE)
|
|
|
|
|
getContainer<Large>().merge(rhs.getContainer<Large>());
|
2015-07-20 17:09:43 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Можно вызывать только для пустого объекта.
|
|
|
|
|
void read(DB::ReadBuffer & in)
|
|
|
|
|
{
|
|
|
|
|
UInt8 v;
|
|
|
|
|
readBinary(v, in);
|
2015-07-22 15:25:59 +00:00
|
|
|
|
auto container_type = static_cast<details::ContainerType>(v);
|
2015-07-20 17:09:43 +00:00
|
|
|
|
|
2015-07-22 15:25:59 +00:00
|
|
|
|
if (container_type == details::ContainerType::SMALL)
|
2015-07-20 17:09:43 +00:00
|
|
|
|
small.read(in);
|
2015-07-22 15:25:59 +00:00
|
|
|
|
else if (container_type == details::ContainerType::MEDIUM)
|
2015-07-20 17:09:43 +00:00
|
|
|
|
{
|
|
|
|
|
toMedium();
|
2015-07-22 15:25:59 +00:00
|
|
|
|
getContainer<Medium>().read(in);
|
2015-07-20 17:09:43 +00:00
|
|
|
|
}
|
2015-07-22 15:25:59 +00:00
|
|
|
|
else if (container_type == details::ContainerType::LARGE)
|
2015-07-20 17:09:43 +00:00
|
|
|
|
{
|
|
|
|
|
toLarge();
|
2015-07-22 15:25:59 +00:00
|
|
|
|
getContainer<Large>().read(in);
|
2015-07-20 17:09:43 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void readAndMerge(DB::ReadBuffer & in)
|
|
|
|
|
{
|
2015-07-24 16:46:00 +00:00
|
|
|
|
auto container_type = getContainerType();
|
|
|
|
|
|
|
|
|
|
UInt8 v;
|
|
|
|
|
readBinary(v, in);
|
|
|
|
|
auto rhs_container_type = static_cast<details::ContainerType>(v);
|
|
|
|
|
|
|
|
|
|
auto max_container_type = details::max(container_type, rhs_container_type);
|
|
|
|
|
|
|
|
|
|
if (container_type != max_container_type)
|
|
|
|
|
{
|
|
|
|
|
if (max_container_type == details::ContainerType::MEDIUM)
|
|
|
|
|
toMedium();
|
|
|
|
|
else if (max_container_type == details::ContainerType::LARGE)
|
|
|
|
|
toLarge();
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (rhs_container_type == details::ContainerType::SMALL)
|
|
|
|
|
{
|
|
|
|
|
typename Small::Reader reader(in);
|
|
|
|
|
while (reader.next())
|
|
|
|
|
insert(reader.get());
|
|
|
|
|
}
|
|
|
|
|
else if (rhs_container_type == details::ContainerType::MEDIUM)
|
|
|
|
|
{
|
|
|
|
|
typename Medium::Reader reader(in);
|
|
|
|
|
while (reader.next())
|
|
|
|
|
insert(reader.get());
|
|
|
|
|
}
|
|
|
|
|
else if (rhs_container_type == details::ContainerType::LARGE)
|
|
|
|
|
getContainer<Large>().readAndMerge(in);
|
2015-07-20 17:09:43 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void write(DB::WriteBuffer & out) const
|
|
|
|
|
{
|
2015-07-22 14:12:34 +00:00
|
|
|
|
auto container_type = getContainerType();
|
2015-07-22 15:25:59 +00:00
|
|
|
|
writeBinary(static_cast<UInt8>(container_type), out);
|
2015-07-20 17:09:43 +00:00
|
|
|
|
|
|
|
|
|
if (container_type == details::ContainerType::SMALL)
|
|
|
|
|
small.write(out);
|
|
|
|
|
else if (container_type == details::ContainerType::MEDIUM)
|
2015-07-22 15:25:59 +00:00
|
|
|
|
getContainer<Medium>().write(out);
|
2015-07-20 17:09:43 +00:00
|
|
|
|
else if (container_type == details::ContainerType::LARGE)
|
2015-07-22 15:25:59 +00:00
|
|
|
|
getContainer<Large>().write(out);
|
2015-07-20 17:09:43 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
void toMedium()
|
|
|
|
|
{
|
2015-07-22 14:12:34 +00:00
|
|
|
|
if (getContainerType() != details::ContainerType::SMALL)
|
2015-07-20 17:09:43 +00:00
|
|
|
|
throw Poco::Exception("Internal error", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
|
2015-07-22 14:12:34 +00:00
|
|
|
|
auto tmp_medium = std::make_unique<Medium>();
|
2015-07-20 17:09:43 +00:00
|
|
|
|
|
|
|
|
|
for (const auto & x : small)
|
|
|
|
|
tmp_medium->insert(x);
|
|
|
|
|
|
2015-08-10 13:47:43 +00:00
|
|
|
|
medium = tmp_medium.release();
|
2015-07-22 14:12:34 +00:00
|
|
|
|
setContainerType(details::ContainerType::MEDIUM);
|
|
|
|
|
|
|
|
|
|
if (current_memory_tracker)
|
|
|
|
|
current_memory_tracker->alloc(sizeof(medium));
|
2015-07-20 17:09:43 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void toLarge()
|
|
|
|
|
{
|
2015-07-22 14:12:34 +00:00
|
|
|
|
auto container_type = getContainerType();
|
|
|
|
|
|
2015-07-20 17:09:43 +00:00
|
|
|
|
if ((container_type != details::ContainerType::SMALL) && (container_type != details::ContainerType::MEDIUM))
|
|
|
|
|
throw Poco::Exception("Internal error", ErrorCodes::LOGICAL_ERROR);
|
|
|
|
|
|
2015-07-22 14:12:34 +00:00
|
|
|
|
auto tmp_large = std::make_unique<Large>();
|
2015-07-20 17:09:43 +00:00
|
|
|
|
|
2015-07-20 17:29:59 +00:00
|
|
|
|
if (container_type == details::ContainerType::SMALL)
|
|
|
|
|
{
|
|
|
|
|
for (const auto & x : small)
|
|
|
|
|
tmp_large->insert(x);
|
|
|
|
|
}
|
|
|
|
|
else if (container_type == details::ContainerType::MEDIUM)
|
|
|
|
|
{
|
2015-07-22 15:25:59 +00:00
|
|
|
|
for (const auto & x : getContainer<Medium>())
|
2015-07-20 17:29:59 +00:00
|
|
|
|
tmp_large->insert(x);
|
2015-07-22 14:12:34 +00:00
|
|
|
|
|
|
|
|
|
destroy();
|
2015-07-20 17:29:59 +00:00
|
|
|
|
}
|
2015-07-20 17:09:43 +00:00
|
|
|
|
|
2015-08-10 13:47:43 +00:00
|
|
|
|
large = tmp_large.release();
|
2015-07-22 14:12:34 +00:00
|
|
|
|
setContainerType(details::ContainerType::LARGE);
|
|
|
|
|
|
|
|
|
|
if (current_memory_tracker)
|
|
|
|
|
current_memory_tracker->alloc(sizeof(large));
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void NO_INLINE destroy()
|
|
|
|
|
{
|
|
|
|
|
auto container_type = getContainerType();
|
|
|
|
|
|
|
|
|
|
clearContainerType();
|
2015-07-20 17:09:43 +00:00
|
|
|
|
|
2015-07-20 17:29:59 +00:00
|
|
|
|
if (container_type == details::ContainerType::MEDIUM)
|
|
|
|
|
{
|
2015-08-10 13:47:43 +00:00
|
|
|
|
delete medium;
|
|
|
|
|
medium = nullptr;
|
|
|
|
|
|
2015-07-20 17:29:59 +00:00
|
|
|
|
if (current_memory_tracker)
|
|
|
|
|
current_memory_tracker->free(sizeof(medium));
|
|
|
|
|
}
|
2015-07-22 14:12:34 +00:00
|
|
|
|
else if (container_type == details::ContainerType::LARGE)
|
|
|
|
|
{
|
2015-08-10 13:47:43 +00:00
|
|
|
|
delete large;
|
|
|
|
|
large = nullptr;
|
|
|
|
|
|
2015-07-22 14:12:34 +00:00
|
|
|
|
if (current_memory_tracker)
|
|
|
|
|
current_memory_tracker->free(sizeof(large));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template<typename T>
|
2015-07-24 16:46:00 +00:00
|
|
|
|
inline T & getContainer()
|
2015-07-22 14:12:34 +00:00
|
|
|
|
{
|
2015-07-22 15:25:59 +00:00
|
|
|
|
return *reinterpret_cast<T *>(address & mask);
|
2015-07-22 14:12:34 +00:00
|
|
|
|
}
|
2015-07-20 17:29:59 +00:00
|
|
|
|
|
2015-07-22 14:12:34 +00:00
|
|
|
|
template<typename T>
|
2015-07-24 16:46:00 +00:00
|
|
|
|
inline const T & getContainer() const
|
2015-07-22 14:12:34 +00:00
|
|
|
|
{
|
2015-07-22 15:25:59 +00:00
|
|
|
|
return *reinterpret_cast<T *>(address & mask);
|
2015-07-22 14:12:34 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void setContainerType(details::ContainerType t)
|
|
|
|
|
{
|
2015-08-10 13:47:43 +00:00
|
|
|
|
address &= mask;
|
2015-07-22 14:12:34 +00:00
|
|
|
|
address |= static_cast<UInt8>(t);
|
|
|
|
|
}
|
|
|
|
|
|
2015-07-22 15:25:59 +00:00
|
|
|
|
inline details::ContainerType getContainerType() const
|
2015-07-22 14:12:34 +00:00
|
|
|
|
{
|
|
|
|
|
return static_cast<details::ContainerType>(address & ~mask);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void clearContainerType()
|
|
|
|
|
{
|
|
|
|
|
address &= mask;
|
2015-07-20 17:09:43 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
private:
|
|
|
|
|
Small small;
|
2015-07-22 14:12:34 +00:00
|
|
|
|
union
|
|
|
|
|
{
|
2015-08-10 13:47:43 +00:00
|
|
|
|
Medium * medium;
|
|
|
|
|
Large * large;
|
2015-07-22 14:12:34 +00:00
|
|
|
|
UInt64 address = 0;
|
|
|
|
|
};
|
2015-08-10 13:47:43 +00:00
|
|
|
|
static const UInt64 mask = 0xFFFFFFFFFFFFFFFC;
|
2015-07-22 14:12:34 +00:00
|
|
|
|
static const UInt32 medium_set_size_max = 1UL << medium_set_power2_max;
|
2015-07-20 17:09:43 +00:00
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
}
|