SpaceSaving: internal storage for StringRef{}

SpaceSaving now has specialised storage for
some key types, which copies only the keys that
are to be retained in the structure, rather than every inserted key.

Most POD key types implement this interface as an empty pass-through,
so there shouldn’t be any extra cost.
This commit is contained in:
Marek Vavruša 2017-06-25 17:11:32 -07:00 committed by alexey-milovidov
parent 106a979ac2
commit e189c39056
2 changed files with 50 additions and 13 deletions

View File

@ -31,7 +31,6 @@ struct AggregateFunctionTopKData
{
using Set = SpaceSaving
<
T,
T,
HashCRC32<T>,
HashTableGrower<4>,
@ -129,7 +128,6 @@ struct AggregateFunctionTopKGenericData
{
using Set = SpaceSaving
<
std::string,
StringRef,
StringRefHash,
HashTableGrower<4>,
@ -199,12 +197,12 @@ public:
size_t count = 0;
readVarUInt(count, buf);
for (size_t i = 0; i < count; ++i) {
std::string key_string;
readStringBinary(key_string, buf);
auto ref = readStringBinaryInto(*arena, buf);
UInt64 count, error;
readVarUInt(count, buf);
readVarUInt(error, buf);
set.insert(key_string, count, error);
set.insert(ref, count, error);
arena->rollback(ref.size);
}
}
@ -216,7 +214,7 @@ public:
}
StringRef str_serialized = column.getDataAt(row_num);
set.insert(str_serialized.toString());
set.insert(str_serialized);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override

View File

@ -5,6 +5,7 @@
#include <boost/range/adaptor/reversed.hpp>
#include <Common/ArenaWithFreeLists.h>
#include <Common/UInt128.h>
#include <Common/HashTable/Hash.h>
#include <Common/HashTable/HashMap.h>
@ -25,11 +26,46 @@
namespace DB
{
/*
 * Key storage hook that lets a key type supply its own storage.
 * The generic version owns no storage: POD keys need none, so
 * emplace() simply hands the key back and free() does nothing.
 */
template <typename TKey>
struct SpaceSavingArena
{
    SpaceSavingArena() {}

    /// Returns a copy of `key`; nothing is allocated.
    const TKey emplace(const TKey & key)
    {
        return key;
    }

    /// No-op: the generic version never allocates anything for a key.
    void free(const TKey & /*key*/)
    {
    }
};
/*
 * StringRef keys get their own storage, backed by a freelist arena.
 * A key that is retained on insertion has to be copied into this local storage;
 * otherwise the reference would become invalid once the processed block is released.
 */
template <>
struct SpaceSavingArena<StringRef>
{
    /// Copies the bytes referenced by `key` into the arena and returns a StringRef to the copy.
    const StringRef emplace(const StringRef & key)
    {
        const auto length = key.size;
        auto storage = arena.alloc(length);
        std::copy_n(key.data, length, storage);
        return StringRef{storage, length};
    }

    /// Returns the key's buffer to the arena; a key with a null data pointer is ignored.
    void free(const StringRef & key)
    {
        if (!key.data)
            return;

        /// The arena's free() is called with a mutable pointer, hence the const_cast.
        arena.free(const_cast<char *>(key.data), key.size);
    }

private:
    ArenaWithFreeLists arena;
};
template
<
typename TKey,
typename HashKey = TKey,
typename Hash = DefaultHash<HashKey>,
typename Hash = DefaultHash<TKey>,
typename Grower = HashTableGrower<>,
typename Allocator = HashTableAllocator
>
@ -45,7 +81,7 @@ private:
}
public:
using Self = SpaceSaving<TKey, HashKey, Hash, Grower, Allocator>;
using Self = SpaceSaving<TKey, Hash, Grower, Allocator>;
struct Counter
{
@ -81,6 +117,7 @@ public:
};
SpaceSaving(size_t c = 10) : alpha_map(nextAlphaSize(c)), m_capacity(c) {}
~SpaceSaving() { destroyElements(); }
inline size_t size() const
@ -117,7 +154,7 @@ public:
// Key doesn't exist, but can fit in the top K
else if (unlikely(size() < capacity()))
{
auto c = new Counter(key, increment, error, hash);
auto c = new Counter(arena.emplace(key), increment, error, hash);
push(c);
return;
}
@ -138,14 +175,15 @@ public:
// Replace minimum with newly inserted element
if (it != counter_map.end())
{
arena.free(min->key);
min->hash = hash;
min->key = key;
min->key = arena.emplace(key);
min->count = alpha + increment;
min->error = alpha + error;
percolate(min);
it->second = min;
it->first = key;
it->first = min->key;
counter_map.reinsert(it, hash);
}
}
@ -279,9 +317,10 @@ private:
alpha_map.clear();
}
HashMap<HashKey, Counter *, Hash, Grower, Allocator> counter_map;
HashMap<TKey, Counter *, Hash, Grower, Allocator> counter_map;
std::vector<Counter *> counter_list;
std::vector<UInt64> alpha_map;
SpaceSavingArena<TKey> arena;
size_t m_capacity;
};