2017-05-02 21:08:37 +00:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <bit>
#include <iostream>
#include <memory>
#include <vector>

#include <boost/range/adaptor/reversed.hpp>

#include <base/sort.h>

#include <Common/AllocatorWithMemoryTracking.h>
#include <Common/ArenaWithFreeLists.h>
#include <Common/ArenaUtils.h>
#include <Common/HashTable/Hash.h>
#include <Common/HashTable/HashMap.h>

#include <IO/WriteBuffer.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBuffer.h>
#include <IO/ReadHelpers.h>
#include <IO/VarInt.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Implementation of the Filtered Space-Saving for TopK streaming analysis.
|
|
|
|
* http://www.l2f.inesc-id.pt/~fmmb/wiki/uploads/Work/misnis.ref0a.pdf
|
|
|
|
* It implements suggested reduce-and-combine algorithm from Parallel Space Saving:
|
|
|
|
* https://arxiv.org/pdf/1401.0702.pdf
|
|
|
|
*/
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2017-06-26 00:11:32 +00:00
|
|
|
/*
 * Key storage policy used by SpaceSaving.
 * The generic version is a pass-through: emplace() hands the key back unchanged
 * and free() does nothing, so keys that need no additional storage cost nothing.
 */
template <typename TKey>
struct SpaceSavingArena
{
    SpaceSavingArena() = default;

    // Returns a copy of the key; nothing is allocated.
    TKey emplace(const TKey & key)
    {
        return key;
    }

    // No-op counterpart of emplace().
    void free(const TKey &)
    {
    }
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Specialized storage for StringRef with a freelist arena.
|
2019-01-22 19:56:53 +00:00
|
|
|
* Keys of this type that are retained on insertion must be serialized into local storage,
|
2017-06-26 00:11:32 +00:00
|
|
|
* otherwise the reference would be invalid after the processed block is released.
|
|
|
|
*/
|
2017-10-08 22:53:38 +00:00
|
|
|
template <>
|
|
|
|
struct SpaceSavingArena<StringRef>
|
2017-06-26 00:11:32 +00:00
|
|
|
{
|
2022-07-15 11:15:46 +00:00
|
|
|
StringRef emplace(StringRef key)
|
2017-06-26 00:11:32 +00:00
|
|
|
{
|
2022-03-11 21:47:28 +00:00
|
|
|
return copyStringInArena(arena, key);
|
2017-06-26 00:11:32 +00:00
|
|
|
}
|
|
|
|
|
2022-07-15 11:15:46 +00:00
|
|
|
void free(StringRef key)
|
2017-06-26 00:11:32 +00:00
|
|
|
{
|
|
|
|
if (key.data)
|
|
|
|
arena.free(const_cast<char *>(key.data), key.size);
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
|
|
|
ArenaWithFreeLists arena;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2017-05-10 00:17:45 +00:00
|
|
|
template
|
|
|
|
<
|
|
|
|
typename TKey,
|
2020-05-15 16:23:31 +00:00
|
|
|
typename Hash = DefaultHash<TKey>
|
2017-05-10 00:17:45 +00:00
|
|
|
>
|
2017-05-02 21:08:37 +00:00
|
|
|
class SpaceSaving
|
|
|
|
{
|
2017-05-10 00:17:45 +00:00
|
|
|
private:
|
|
|
|
// Suggested constants in the paper "Finding top-k elements in data streams", chap 6. equation (24)
|
|
|
|
// Round to nearest power of 2 for cheaper binning without modulo
|
2017-12-01 23:14:53 +00:00
|
|
|
constexpr uint64_t nextAlphaSize(uint64_t x)
|
2017-05-10 00:17:45 +00:00
|
|
|
{
|
2022-03-11 21:47:28 +00:00
|
|
|
constexpr uint64_t alpha_map_elements_per_counter = 6;
|
2022-07-31 14:34:05 +00:00
|
|
|
return 1ULL << (sizeof(uint64_t) * 8 - std::countl_zero(x * alpha_map_elements_per_counter));
|
2017-05-10 00:17:45 +00:00
|
|
|
}
|
|
|
|
|
2017-05-02 21:08:37 +00:00
|
|
|
public:
|
2018-09-13 14:59:03 +00:00
|
|
|
using Self = SpaceSaving;
|
2017-05-05 21:17:04 +00:00
|
|
|
|
|
|
|
struct Counter
|
|
|
|
{
|
2023-02-19 22:15:09 +00:00
|
|
|
Counter() = default;
|
2017-05-02 21:08:37 +00:00
|
|
|
|
2022-03-11 21:47:28 +00:00
|
|
|
explicit Counter(const TKey & k, UInt64 c = 0, UInt64 e = 0, size_t h = 0)
|
2017-05-10 00:17:45 +00:00
|
|
|
: key(k), slot(0), hash(h), count(c), error(e) {}
|
2017-05-02 21:08:37 +00:00
|
|
|
|
2017-05-05 21:17:04 +00:00
|
|
|
void write(WriteBuffer & wb) const
|
2017-05-02 21:08:37 +00:00
|
|
|
{
|
2017-05-05 21:17:04 +00:00
|
|
|
writeBinary(key, wb);
|
|
|
|
writeVarUInt(count, wb);
|
|
|
|
writeVarUInt(error, wb);
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
2017-05-05 21:17:04 +00:00
|
|
|
void read(ReadBuffer & rb)
|
2017-05-02 21:08:37 +00:00
|
|
|
{
|
2017-05-05 21:17:04 +00:00
|
|
|
readBinary(key, rb);
|
|
|
|
readVarUInt(count, rb);
|
|
|
|
readVarUInt(error, rb);
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// greater() taking slot error into account
|
2017-05-05 21:17:04 +00:00
|
|
|
bool operator> (const Counter & b) const
|
2017-05-02 21:08:37 +00:00
|
|
|
{
|
|
|
|
return (count > b.count) || (count == b.count && error < b.error);
|
|
|
|
}
|
|
|
|
|
|
|
|
TKey key;
|
2019-08-09 10:11:50 +00:00
|
|
|
size_t slot;
|
2019-08-07 10:15:25 +00:00
|
|
|
size_t hash;
|
2017-05-05 21:17:04 +00:00
|
|
|
UInt64 count;
|
|
|
|
UInt64 error;
|
2017-05-02 21:08:37 +00:00
|
|
|
};
|
|
|
|
|
2022-03-11 21:47:28 +00:00
|
|
|
// Creates an empty structure with capacity `c`; the alpha map gets nextAlphaSize(c) zeroed entries.
explicit SpaceSaving(size_t c = 10)
    : alpha_map(nextAlphaSize(c))
    , m_capacity(c)
{
}
|
2017-06-26 00:11:32 +00:00
|
|
|
|
2017-05-02 21:08:37 +00:00
|
|
|
// Releases every counter together with its key storage.
~SpaceSaving()
{
    destroyElements();
}
|
|
|
|
|
|
|
|
inline size_t size() const
|
|
|
|
{
|
2017-05-05 21:17:04 +00:00
|
|
|
return counter_list.size();
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
inline size_t capacity() const
|
|
|
|
{
|
2017-05-05 21:17:04 +00:00
|
|
|
return m_capacity;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
2017-09-28 07:01:54 +00:00
|
|
|
void clear()
|
|
|
|
{
|
|
|
|
return destroyElements();
|
|
|
|
}
|
|
|
|
|
2017-05-05 21:17:04 +00:00
|
|
|
void resize(size_t new_capacity)
|
2017-05-02 21:08:37 +00:00
|
|
|
{
|
2017-05-05 21:17:04 +00:00
|
|
|
counter_list.reserve(new_capacity);
|
2017-05-10 00:17:45 +00:00
|
|
|
alpha_map.resize(nextAlphaSize(new_capacity));
|
2017-05-05 21:17:04 +00:00
|
|
|
m_capacity = new_capacity;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
2017-05-05 23:36:02 +00:00
|
|
|
void insert(const TKey & key, UInt64 increment = 1, UInt64 error = 0)
|
2017-05-02 21:08:37 +00:00
|
|
|
{
|
|
|
|
// Increase weight of a key that already exists
|
2017-05-05 21:17:04 +00:00
|
|
|
auto hash = counter_map.hash(key);
|
2020-09-04 01:05:57 +00:00
|
|
|
|
2021-05-02 22:42:01 +00:00
|
|
|
if (auto * counter = findCounter(key, hash); counter)
|
2017-05-05 21:17:04 +00:00
|
|
|
{
|
2019-08-07 10:15:25 +00:00
|
|
|
counter->count += increment;
|
|
|
|
counter->error += error;
|
|
|
|
percolate(counter);
|
2017-05-05 23:36:02 +00:00
|
|
|
return;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
2020-09-04 01:05:57 +00:00
|
|
|
|
2017-05-02 21:08:37 +00:00
|
|
|
// Key doesn't exist, but can fit in the top K
|
2020-09-04 01:05:57 +00:00
|
|
|
if (unlikely(size() < capacity()))
|
2017-05-05 21:17:04 +00:00
|
|
|
{
|
2021-05-02 22:42:01 +00:00
|
|
|
auto * c = new Counter(arena.emplace(key), increment, error, hash);
|
2017-05-02 21:08:37 +00:00
|
|
|
push(c);
|
2017-05-05 23:36:02 +00:00
|
|
|
return;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
2021-05-02 22:42:01 +00:00
|
|
|
auto * min = counter_list.back();
|
2019-07-05 07:16:39 +00:00
|
|
|
// The key doesn't exist and cannot fit in the current top K, but
|
|
|
|
// the new key has a bigger weight and is virtually more present
|
|
|
|
// compared to the element who is less present on the set. This part
|
|
|
|
// of the code is useful for the function topKWeighted
|
2019-07-03 11:36:06 +00:00
|
|
|
if (increment > min->count)
|
|
|
|
{
|
|
|
|
destroyLastElement();
|
|
|
|
push(new Counter(arena.emplace(key), increment, error, hash));
|
|
|
|
return;
|
|
|
|
}
|
2019-08-07 10:15:25 +00:00
|
|
|
|
2017-05-10 00:17:45 +00:00
|
|
|
const size_t alpha_mask = alpha_map.size() - 1;
|
|
|
|
auto & alpha = alpha_map[hash & alpha_mask];
|
2017-05-05 21:17:04 +00:00
|
|
|
if (alpha + increment < min->count)
|
|
|
|
{
|
2017-05-02 21:08:37 +00:00
|
|
|
alpha += increment;
|
2017-05-05 23:36:02 +00:00
|
|
|
return;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Erase the current minimum element
|
2017-05-10 00:17:45 +00:00
|
|
|
alpha_map[min->hash & alpha_mask] = min->count;
|
2019-08-08 12:55:08 +00:00
|
|
|
destroyLastElement();
|
2019-08-07 10:15:25 +00:00
|
|
|
|
|
|
|
push(new Counter(arena.emplace(key), alpha + increment, alpha + error, hash));
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Parallel Space Saving reduction and combine step from:
|
|
|
|
* https://arxiv.org/pdf/1401.0702.pdf
|
|
|
|
*/
|
2017-05-05 21:17:04 +00:00
|
|
|
void merge(const Self & rhs)
|
2017-05-02 21:08:37 +00:00
|
|
|
{
|
2017-05-05 21:17:04 +00:00
|
|
|
UInt64 m1 = 0;
|
|
|
|
UInt64 m2 = 0;
|
|
|
|
|
|
|
|
if (size() == capacity())
|
|
|
|
{
|
|
|
|
m1 = counter_list.back()->count;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
2017-05-05 21:17:04 +00:00
|
|
|
|
|
|
|
if (rhs.size() == rhs.capacity())
|
|
|
|
{
|
|
|
|
m2 = rhs.counter_list.back()->count;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Updated algorithm to mutate current table in place
|
|
|
|
* without mutating rhs table or creating new one
|
|
|
|
* in the first step we expect that no elements overlap
|
|
|
|
* and in the second sweep we correct the error if they do.
|
|
|
|
*/
|
2017-05-05 21:17:04 +00:00
|
|
|
if (m2 > 0)
|
|
|
|
{
|
2021-05-02 22:42:01 +00:00
|
|
|
for (auto * counter : counter_list)
|
2017-05-05 21:17:04 +00:00
|
|
|
{
|
|
|
|
counter->count += m2;
|
|
|
|
counter->error += m2;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// The list is sorted in descending order, we have to scan in reverse
|
2021-05-02 22:42:01 +00:00
|
|
|
for (auto * counter : boost::adaptors::reverse(rhs.counter_list))
|
2017-05-05 21:17:04 +00:00
|
|
|
{
|
2019-08-09 10:11:50 +00:00
|
|
|
size_t hash = counter_map.hash(counter->key);
|
2021-05-02 22:42:01 +00:00
|
|
|
if (auto * current = findCounter(counter->key, hash))
|
2017-05-05 21:17:04 +00:00
|
|
|
{
|
2017-05-02 21:08:37 +00:00
|
|
|
// Subtract m2 previously added, guaranteed not negative
|
2019-08-09 10:11:50 +00:00
|
|
|
current->count += (counter->count - m2);
|
|
|
|
current->error += (counter->error - m2);
|
2017-05-05 21:17:04 +00:00
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2017-05-02 21:08:37 +00:00
|
|
|
// Counters not monitored in S1
|
2019-08-09 10:11:50 +00:00
|
|
|
counter_list.push_back(new Counter(arena.emplace(counter->key), counter->count + m1, counter->error + m1, hash));
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
}
|
2019-08-09 10:11:50 +00:00
|
|
|
|
2022-01-30 19:49:48 +00:00
|
|
|
::sort(counter_list.begin(), counter_list.end(), [](Counter * l, Counter * r) { return *l > *r; });
|
2019-08-09 10:11:50 +00:00
|
|
|
|
|
|
|
if (counter_list.size() > m_capacity)
|
|
|
|
{
|
|
|
|
for (size_t i = m_capacity; i < counter_list.size(); ++i)
|
|
|
|
{
|
|
|
|
arena.free(counter_list[i]->key);
|
|
|
|
delete counter_list[i];
|
|
|
|
}
|
|
|
|
counter_list.resize(m_capacity);
|
|
|
|
}
|
|
|
|
|
|
|
|
for (size_t i = 0; i < counter_list.size(); ++i)
|
|
|
|
counter_list[i]->slot = i;
|
|
|
|
rebuildCounterMap();
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
std::vector<Counter> topK(size_t k) const
|
|
|
|
{
|
|
|
|
std::vector<Counter> res;
|
2021-05-02 22:42:01 +00:00
|
|
|
for (auto * counter : counter_list)
|
2017-05-05 21:17:04 +00:00
|
|
|
{
|
|
|
|
res.push_back(*counter);
|
|
|
|
if (res.size() == k)
|
2017-05-02 21:08:37 +00:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2017-05-05 21:17:04 +00:00
|
|
|
// Serializes the state: the number of counters, each counter in list order,
// then the number of alpha map entries followed by the entries themselves.
void write(WriteBuffer & wb) const
{
    writeVarUInt(size(), wb);
    for (const Counter * counter : counter_list)
    {
        counter->write(wb);
    }

    writeVarUInt(alpha_map.size(), wb);
    for (const UInt64 alpha : alpha_map)
    {
        writeVarUInt(alpha, wb);
    }
}
|
|
|
|
|
2017-05-05 21:17:04 +00:00
|
|
|
// Replaces the current content with the state produced by write():
// the counters are read and pushed one by one, then the alpha map is read.
void read(ReadBuffer & rb)
{
    destroyElements();

    size_t count = 0;
    readVarUInt(count, rb);

    for (size_t i = 0; i < count; ++i)
    {
        // Owned by the unique_ptr until push() takes over, so a failing read
        // (e.g. a truncated buffer) does not leak the counter.
        auto counter = std::make_unique<Counter>();
        counter->read(rb);
        // The hash is not part of the serialized form.
        counter->hash = counter_map.hash(counter->key);
        push(counter.release());
    }

    readAlphaMap(rb);
}
|
|
|
|
|
|
|
|
// Reads an entry count followed by that many values and appends them to the alpha map.
// Existing entries are kept: the values are pushed back, not assigned.
void readAlphaMap(ReadBuffer & rb)
{
    size_t alpha_size = 0;
    readVarUInt(alpha_size, rb);

    for (size_t remaining = alpha_size; remaining > 0; --remaining)
    {
        UInt64 value = 0;
        readVarUInt(value, rb);
        alpha_map.push_back(value);
    }
}
|
|
|
|
|
|
|
|
protected:
|
2017-05-05 21:17:04 +00:00
|
|
|
// Takes ownership of `counter`: appends it to the list, registers it in the key index and
// moves it up to its place in the ordering.
// If the list or the index fails to grow, the counter is removed again, its key is given
// back to the arena (as destroyElements() does for every counter) and the exception is
// rethrown, so the caller never has to clean up.
void push(Counter * counter)
{
    counter->slot = counter_list.size();

    bool in_list = false;
    try
    {
        counter_list.push_back(counter);
        in_list = true;
        counter_map[counter->key] = counter;
    }
    catch (...)
    {
        if (in_list)
            counter_list.pop_back();
        arena.free(counter->key);
        delete counter;
        throw;
    }

    percolate(counter);
}
|
|
|
|
|
|
|
|
// This is equivallent to one step of bubble sort
|
2017-05-05 21:17:04 +00:00
|
|
|
void percolate(Counter * counter)
|
|
|
|
{
|
|
|
|
while (counter->slot > 0)
|
|
|
|
{
|
2021-05-02 22:42:01 +00:00
|
|
|
auto * next = counter_list[counter->slot - 1];
|
2017-05-05 21:17:04 +00:00
|
|
|
if (*counter > *next)
|
|
|
|
{
|
|
|
|
std::swap(next->slot, counter->slot);
|
|
|
|
std::swap(counter_list[next->slot], counter_list[counter->slot]);
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
2017-05-05 21:17:04 +00:00
|
|
|
else
|
|
|
|
break;
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private:
|
2017-05-05 21:17:04 +00:00
|
|
|
void destroyElements()
|
|
|
|
{
|
2021-05-02 22:42:01 +00:00
|
|
|
for (auto * counter : counter_list)
|
2019-08-09 10:11:50 +00:00
|
|
|
{
|
|
|
|
arena.free(counter->key);
|
2017-05-05 21:17:04 +00:00
|
|
|
delete counter;
|
2019-08-09 10:11:50 +00:00
|
|
|
}
|
2017-05-05 21:17:04 +00:00
|
|
|
|
|
|
|
counter_map.clear();
|
|
|
|
counter_list.clear();
|
|
|
|
alpha_map.clear();
|
2017-05-02 21:08:37 +00:00
|
|
|
}
|
|
|
|
|
2019-07-03 11:36:06 +00:00
|
|
|
void destroyLastElement()
|
|
|
|
{
|
|
|
|
auto last_element = counter_list.back();
|
2020-12-09 12:47:22 +00:00
|
|
|
counter_map.erase(last_element->key);
|
2019-08-09 10:11:50 +00:00
|
|
|
arena.free(last_element->key);
|
|
|
|
delete last_element;
|
2019-07-03 11:36:06 +00:00
|
|
|
counter_list.pop_back();
|
2019-08-07 10:15:25 +00:00
|
|
|
|
|
|
|
++removed_keys;
|
|
|
|
if (removed_keys * 2 > counter_map.size())
|
|
|
|
rebuildCounterMap();
|
|
|
|
}
|
|
|
|
|
|
|
|
// Looks `key` up in the key index using its precomputed `hash`.
// Returns the counter, or nullptr when the key is not monitored.
Counter * findCounter(const TKey & key, size_t hash)
{
    if (auto cell = counter_map.find(key, hash))
        return cell->getMapped();

    return nullptr;
}
|
|
|
|
|
2019-08-07 10:15:25 +00:00
|
|
|
void rebuildCounterMap()
|
|
|
|
{
|
|
|
|
removed_keys = 0;
|
|
|
|
counter_map.clear();
|
2021-05-02 22:42:01 +00:00
|
|
|
for (auto * counter : counter_list)
|
2019-08-07 10:15:25 +00:00
|
|
|
counter_map[counter->key] = counter;
|
|
|
|
}
|
|
|
|
|
2020-05-26 05:54:04 +00:00
|
|
|
using CounterMap = HashMapWithStackMemory<TKey, Counter *, Hash, 4>;
|
2019-08-07 10:15:25 +00:00
|
|
|
|
|
|
|
CounterMap counter_map;
|
2021-06-14 02:26:05 +00:00
|
|
|
std::vector<Counter *, AllocatorWithMemoryTracking<Counter *>> counter_list;
|
|
|
|
std::vector<UInt64, AllocatorWithMemoryTracking<UInt64>> alpha_map;
|
2017-06-26 00:11:32 +00:00
|
|
|
SpaceSavingArena<TKey> arena;
|
2017-05-05 21:17:04 +00:00
|
|
|
size_t m_capacity;
|
2019-08-07 10:15:25 +00:00
|
|
|
size_t removed_keys = 0;
|
2017-05-02 21:08:37 +00:00
|
|
|
};
|
|
|
|
|
2018-08-10 04:02:56 +00:00
|
|
|
}
|