2017-05-02 21:08:37 +00:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <iostream>
|
|
|
|
#include <vector>
|
|
|
|
|
|
|
|
#include <boost/range/adaptor/reversed.hpp>
|
|
|
|
|
2017-06-26 00:11:32 +00:00
|
|
|
#include <Common/ArenaWithFreeLists.h>
|
2017-05-02 21:08:37 +00:00
|
|
|
#include <Common/UInt128.h>
|
|
|
|
#include <Common/HashTable/Hash.h>
|
|
|
|
#include <Common/HashTable/HashMap.h>
|
|
|
|
|
|
|
|
#include <IO/WriteBuffer.h>
|
|
|
|
#include <IO/WriteHelpers.h>
|
|
|
|
#include <IO/ReadBuffer.h>
|
|
|
|
#include <IO/ReadHelpers.h>
|
|
|
|
#include <IO/VarInt.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Implementation of the Filtered Space-Saving for TopK streaming analysis.
|
|
|
|
* http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf
|
|
|
|
* It implements suggested reduce-and-combine algorithm from Parallel Space Saving:
|
|
|
|
* https://arxiv.org/pdf/1401.0702.pdf
|
|
|
|
*/
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2017-06-26 00:11:32 +00:00
|
|
|
/*
 * Arena interface to allow specialized storage of keys.
 * POD keys do not require additional storage, so this interface is empty:
 * emplace() hands the key back by value and free() has nothing to release.
 */
template <typename TKey> struct SpaceSavingArena
{
    SpaceSavingArena() = default;

    // Returns a copy of the key; no arena storage is involved for generic keys.
    const TKey emplace(const TKey & key) { return key; }

    // Nothing was allocated by emplace(), so there is nothing to release.
    // The parameter is left unnamed to avoid unused-parameter warnings.
    void free(const TKey & /*key*/) {}
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Specialized storage for StringRef with a freelist arena.
|
|
|
|
* Keys of this type that are retained on insertion must be serialised into local storage,
|
|
|
|
* otherwise the reference would be invalid after the processed block is released.
|
|
|
|
*/
|
|
|
|
template <> struct SpaceSavingArena<StringRef>
|
|
|
|
{
|
|
|
|
const StringRef emplace(const StringRef & key)
|
|
|
|
{
|
|
|
|
auto ptr = arena.alloc(key.size);
|
|
|
|
std::copy(key.data, key.data + key.size, ptr);
|
|
|
|
return StringRef{ptr, key.size};
|
|
|
|
}
|
|
|
|
|
|
|
|
void free(const StringRef & key)
|
|
|
|
{
|
|
|
|
if (key.data)
|
|
|
|
arena.free(const_cast<char *>(key.data), key.size);
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
ArenaWithFreeLists arena;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2017-05-10 00:17:45 +00:00
|
|
|
template
|
|
|
|
<
|
|
|
|
typename TKey,
|
2017-06-26 00:11:32 +00:00
|
|
|
typename Hash = DefaultHash<TKey>,
|
2017-05-10 00:17:45 +00:00
|
|
|
typename Grower = HashTableGrower<>,
|
|
|
|
typename Allocator = HashTableAllocator
|
|
|
|
>
|
2017-05-02 21:08:37 +00:00
|
|
|
class SpaceSaving
|
|
|
|
{
|
2017-05-10 00:17:45 +00:00
|
|
|
private:
|
|
|
|
// Suggested constants in the paper "Finding top-k elements in data streams", chap 6. equation (24)
// The result is always a power of two, so binning can use a mask instead of modulo:
// it is 2^b, where b is the number of significant bits of x * 6
// (i.e. the smallest power of two strictly greater than x * 6).
// Declared static because it does not touch any object state.
static constexpr uint64_t nextAlphaSize(uint64_t x)
{
    constexpr uint64_t ALPHA_MAP_ELEMENTS_PER_COUNTER = 6;
    const uint64_t scaled = x * ALPHA_MAP_ELEMENTS_PER_COUNTER;

    // __builtin_clzll(0) is undefined; a zero capacity gets a single bucket
    // so that `size - 1` is still a valid mask.
    if (scaled == 0)
        return 1;

    return 1ULL << (sizeof(uint64_t) * 8 - __builtin_clzll(scaled));
}
|
|
|
|
|
2017-05-02 21:08:37 +00:00
|
|
|
public:
|
2017-06-26 00:11:32 +00:00
|
|
|
using Self = SpaceSaving<TKey, Hash, Grower, Allocator>;
|
2017-05-05 21:17:04 +00:00
|
|
|
|
|
|
|
struct Counter
|
|
|
|
{
|
2017-05-02 21:08:37 +00:00
|
|
|
Counter() {}
|
|
|
|
|
2017-05-10 00:17:45 +00:00
|
|
|
Counter(const TKey & k, UInt64 c = 0, UInt64 e = 0, size_t h = 0)
|
|
|
|
: key(k), slot(0), hash(h), count(c), error(e) {}
|
2017-05-02 21:08:37 +00:00
|
|
|
|
2017-05-05 21:17:04 +00:00
|
|
|
void write(WriteBuffer & wb) const
|
2017-05-02 21:08:37 +00:00
|
|
|
{
|
2017-05-05 21:17:04 +00:00
|
|
|
writeBinary(key, wb);
|
|
|
|
writeVarUInt(count, wb);
|
|
|
|
writeVarUInt(error, wb);
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
2017-05-05 21:17:04 +00:00
|
|
|
void read(ReadBuffer & rb)
|
2017-05-02 21:08:37 +00:00
|
|
|
{
|
2017-05-05 21:17:04 +00:00
|
|
|
readBinary(key, rb);
|
|
|
|
readVarUInt(count, rb);
|
|
|
|
readVarUInt(error, rb);
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// greater() taking slot error into account
|
2017-05-05 21:17:04 +00:00
|
|
|
bool operator> (const Counter & b) const
|
2017-05-02 21:08:37 +00:00
|
|
|
{
|
|
|
|
return (count > b.count) || (count == b.count && error < b.error);
|
|
|
|
}
|
|
|
|
|
|
|
|
TKey key;
|
2017-05-10 00:17:45 +00:00
|
|
|
size_t slot, hash;
|
2017-05-05 21:17:04 +00:00
|
|
|
UInt64 count;
|
|
|
|
UInt64 error;
|
2017-05-02 21:08:37 +00:00
|
|
|
};
|
|
|
|
|
2017-05-10 00:17:45 +00:00
|
|
|
SpaceSaving(size_t c = 10) : alpha_map(nextAlphaSize(c)), m_capacity(c) {}
|
2017-06-26 00:11:32 +00:00
|
|
|
|
2017-05-02 21:08:37 +00:00
|
|
|
~SpaceSaving() { destroyElements(); }
|
|
|
|
|
|
|
|
inline size_t size() const
|
|
|
|
{
|
2017-05-05 21:17:04 +00:00
|
|
|
return counter_list.size();
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
inline size_t capacity() const
|
|
|
|
{
|
2017-05-05 21:17:04 +00:00
|
|
|
return m_capacity;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
2017-05-05 21:17:04 +00:00
|
|
|
void resize(size_t new_capacity)
|
2017-05-02 21:08:37 +00:00
|
|
|
{
|
2017-05-05 21:17:04 +00:00
|
|
|
counter_list.reserve(new_capacity);
|
2017-05-10 00:17:45 +00:00
|
|
|
alpha_map.resize(nextAlphaSize(new_capacity));
|
2017-05-05 21:17:04 +00:00
|
|
|
m_capacity = new_capacity;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
2017-05-05 23:36:02 +00:00
|
|
|
void insert(const TKey & key, UInt64 increment = 1, UInt64 error = 0)
|
2017-05-02 21:08:37 +00:00
|
|
|
{
|
|
|
|
// Increase weight of a key that already exists
|
|
|
|
// It uses hashtable for both value mapping as a presence test (c_i != 0)
|
2017-05-05 21:17:04 +00:00
|
|
|
auto hash = counter_map.hash(key);
|
|
|
|
auto it = counter_map.find(key, hash);
|
|
|
|
if (it != counter_map.end())
|
|
|
|
{
|
2017-05-02 21:08:37 +00:00
|
|
|
auto c = it->second;
|
|
|
|
c->count += increment;
|
|
|
|
c->error += error;
|
|
|
|
percolate(c);
|
2017-05-05 23:36:02 +00:00
|
|
|
return;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
// Key doesn't exist, but can fit in the top K
|
2017-05-10 00:17:45 +00:00
|
|
|
else if (unlikely(size() < capacity()))
|
2017-05-05 21:17:04 +00:00
|
|
|
{
|
2017-06-26 00:11:32 +00:00
|
|
|
auto c = new Counter(arena.emplace(key), increment, error, hash);
|
2017-05-02 21:08:37 +00:00
|
|
|
push(c);
|
2017-05-05 23:36:02 +00:00
|
|
|
return;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
2017-05-05 21:17:04 +00:00
|
|
|
auto min = counter_list.back();
|
2017-05-10 00:17:45 +00:00
|
|
|
const size_t alpha_mask = alpha_map.size() - 1;
|
|
|
|
auto & alpha = alpha_map[hash & alpha_mask];
|
2017-05-05 21:17:04 +00:00
|
|
|
if (alpha + increment < min->count)
|
|
|
|
{
|
2017-05-02 21:08:37 +00:00
|
|
|
alpha += increment;
|
2017-05-05 23:36:02 +00:00
|
|
|
return;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Erase the current minimum element
|
2017-05-10 00:17:45 +00:00
|
|
|
alpha_map[min->hash & alpha_mask] = min->count;
|
|
|
|
it = counter_map.find(min->key, min->hash);
|
2017-05-02 21:08:37 +00:00
|
|
|
|
|
|
|
// Replace minimum with newly inserted element
|
2017-05-10 00:17:45 +00:00
|
|
|
if (it != counter_map.end())
|
2017-05-05 21:17:04 +00:00
|
|
|
{
|
2017-06-26 00:11:32 +00:00
|
|
|
arena.free(min->key);
|
2017-05-10 00:17:45 +00:00
|
|
|
min->hash = hash;
|
2017-06-26 00:11:32 +00:00
|
|
|
min->key = arena.emplace(key);
|
2017-05-02 21:08:37 +00:00
|
|
|
min->count = alpha + increment;
|
|
|
|
min->error = alpha + error;
|
|
|
|
percolate(min);
|
2017-05-10 00:17:45 +00:00
|
|
|
|
|
|
|
it->second = min;
|
2017-06-26 00:11:32 +00:00
|
|
|
it->first = min->key;
|
2017-05-10 00:17:45 +00:00
|
|
|
counter_map.reinsert(it, hash);
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parallel Space Saving reduction and combine step from:
|
|
|
|
* https://arxiv.org/pdf/1401.0702.pdf
|
|
|
|
*/
|
2017-05-05 21:17:04 +00:00
|
|
|
void merge(const Self & rhs)
|
2017-05-02 21:08:37 +00:00
|
|
|
{
|
2017-05-05 21:17:04 +00:00
|
|
|
UInt64 m1 = 0;
|
|
|
|
UInt64 m2 = 0;
|
|
|
|
|
|
|
|
if (size() == capacity())
|
|
|
|
{
|
|
|
|
m1 = counter_list.back()->count;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
2017-05-05 21:17:04 +00:00
|
|
|
|
|
|
|
if (rhs.size() == rhs.capacity())
|
|
|
|
{
|
|
|
|
m2 = rhs.counter_list.back()->count;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Updated algorithm to mutate current table in place
|
|
|
|
* without mutating rhs table or creating new one
|
|
|
|
* in the first step we expect that no elements overlap
|
|
|
|
* and in the second sweep we correct the error if they do.
|
|
|
|
*/
|
2017-05-05 21:17:04 +00:00
|
|
|
if (m2 > 0)
|
|
|
|
{
|
|
|
|
for (auto counter : counter_list)
|
|
|
|
{
|
|
|
|
counter->count += m2;
|
|
|
|
counter->error += m2;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// The list is sorted in descending order, we have to scan in reverse
|
2017-05-05 21:17:04 +00:00
|
|
|
for (auto counter : boost::adaptors::reverse(rhs.counter_list))
|
|
|
|
{
|
|
|
|
if (counter_map.find(counter->key) != counter_map.end())
|
|
|
|
{
|
2017-05-02 21:08:37 +00:00
|
|
|
// Subtract m2 previously added, guaranteed not negative
|
2017-05-05 21:17:04 +00:00
|
|
|
insert(counter->key, counter->count - m2, counter->error - m2);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2017-05-02 21:08:37 +00:00
|
|
|
// Counters not monitored in S1
|
2017-05-05 21:17:04 +00:00
|
|
|
insert(counter->key, counter->count + m1, counter->error + m1);
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<Counter> topK(size_t k) const
|
|
|
|
{
|
|
|
|
std::vector<Counter> res;
|
2017-05-05 21:17:04 +00:00
|
|
|
for (auto counter : counter_list)
|
|
|
|
{
|
|
|
|
res.push_back(*counter);
|
|
|
|
if (res.size() == k)
|
2017-05-02 21:08:37 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2017-05-05 21:17:04 +00:00
|
|
|
// Serializes the structure: the number of counters, then every counter in
// list order, then every alpha map bucket. The alpha map size itself is not written.
void write(WriteBuffer & wb) const
{
    writeVarUInt(size(), wb);

    for (const Counter * entry : counter_list)
    {
        entry->write(wb);
    }

    for (const UInt64 bucket : alpha_map)
    {
        writeVarUInt(bucket, wb);
    }
}
|
|
|
|
|
2017-05-05 21:17:04 +00:00
|
|
|
// Replaces the current contents with a state produced by write().
// The number of alpha buckets is not part of the stream; it is derived from
// the current capacity, so the capacity must be set before calling this.
void read(ReadBuffer & rb)
{
    destroyElements();

    size_t count = 0;
    readVarUInt(count, rb);

    for (size_t i = 0; i < count; ++i)
    {
        auto counter = new Counter();
        try
        {
            counter->read(rb);
            counter->hash = counter_map.hash(counter->key);
        }
        catch (...)
        {
            // The counter is not yet owned by counter_list; release it before propagating.
            delete counter;
            throw;
        }
        push(counter);
    }

    // Computed once instead of on every loop iteration.
    const size_t alpha_size = nextAlphaSize(m_capacity);
    alpha_map.reserve(alpha_size);
    for (size_t i = 0; i < alpha_size; ++i)
    {
        UInt64 alpha = 0;
        readVarUInt(alpha, rb);
        alpha_map.push_back(alpha);
    }
}
|
|
|
|
|
|
|
|
protected:
|
2017-05-05 21:17:04 +00:00
|
|
|
// Appends `counter` at the tail of the ordered list, registers it in the
// key map, and then moves it up to the position its count/error deserves.
void push(Counter * counter)
{
    const size_t tail = counter_list.size();
    counter->slot = tail;

    counter_list.push_back(counter);
    counter_map[counter->key] = counter;

    percolate(counter);
}
|
|
|
|
|
|
|
|
// This is equivallent to one step of bubble sort
|
2017-05-05 21:17:04 +00:00
|
|
|
void percolate(Counter * counter)
|
|
|
|
{
|
|
|
|
while (counter->slot > 0)
|
|
|
|
{
|
|
|
|
auto next = counter_list[counter->slot - 1];
|
|
|
|
if (*counter > *next)
|
|
|
|
{
|
|
|
|
std::swap(next->slot, counter->slot);
|
|
|
|
std::swap(counter_list[next->slot], counter_list[counter->slot]);
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
2017-05-05 21:17:04 +00:00
|
|
|
else
|
|
|
|
break;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
2017-05-05 21:17:04 +00:00
|
|
|
void destroyElements()
|
|
|
|
{
|
|
|
|
for (auto counter : counter_list)
|
|
|
|
delete counter;
|
|
|
|
|
|
|
|
counter_map.clear();
|
|
|
|
counter_list.clear();
|
|
|
|
alpha_map.clear();
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
2017-06-26 00:11:32 +00:00
|
|
|
HashMap<TKey, Counter *, Hash, Grower, Allocator> counter_map;
|
2017-05-05 21:17:04 +00:00
|
|
|
std::vector<Counter *> counter_list;
|
|
|
|
std::vector<UInt64> alpha_map;
|
2017-06-26 00:11:32 +00:00
|
|
|
SpaceSavingArena<TKey> arena;
|
2017-05-05 21:17:04 +00:00
|
|
|
size_t m_capacity;
|
2017-05-02 21:08:37 +00:00
|
|
|
};
|
|
|
|
|
2017-05-05 21:17:04 +00:00
|
|
|
};
|