SpaceSaving: internal storage for StringRef{}

SpaceSaving now has specialised storage for
some key types (such as StringRef): only the keys that
are actually retained in the structure are copied, not every inserted key.

For POD keys this storage interface is an empty pass-through,
so there shouldn’t be any extra cost.
This commit is contained in:
Marek Vavruša 2017-06-25 17:11:32 -07:00 committed by alexey-milovidov
parent 106a979ac2
commit e189c39056
2 changed files with 50 additions and 13 deletions

View File

@ -31,7 +31,6 @@ struct AggregateFunctionTopKData
{ {
using Set = SpaceSaving using Set = SpaceSaving
< <
T,
T, T,
HashCRC32<T>, HashCRC32<T>,
HashTableGrower<4>, HashTableGrower<4>,
@ -129,7 +128,6 @@ struct AggregateFunctionTopKGenericData
{ {
using Set = SpaceSaving using Set = SpaceSaving
< <
std::string,
StringRef, StringRef,
StringRefHash, StringRefHash,
HashTableGrower<4>, HashTableGrower<4>,
@ -199,12 +197,12 @@ public:
size_t count = 0; size_t count = 0;
readVarUInt(count, buf); readVarUInt(count, buf);
for (size_t i = 0; i < count; ++i) { for (size_t i = 0; i < count; ++i) {
std::string key_string; auto ref = readStringBinaryInto(*arena, buf);
readStringBinary(key_string, buf);
UInt64 count, error; UInt64 count, error;
readVarUInt(count, buf); readVarUInt(count, buf);
readVarUInt(error, buf); readVarUInt(error, buf);
set.insert(key_string, count, error); set.insert(ref, count, error);
arena->rollback(ref.size);
} }
} }
@ -216,7 +214,7 @@ public:
} }
StringRef str_serialized = column.getDataAt(row_num); StringRef str_serialized = column.getDataAt(row_num);
set.insert(str_serialized.toString()); set.insert(str_serialized);
} }
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override

View File

@ -5,6 +5,7 @@
#include <boost/range/adaptor/reversed.hpp> #include <boost/range/adaptor/reversed.hpp>
#include <Common/ArenaWithFreeLists.h>
#include <Common/UInt128.h> #include <Common/UInt128.h>
#include <Common/HashTable/Hash.h> #include <Common/HashTable/Hash.h>
#include <Common/HashTable/HashMap.h> #include <Common/HashTable/HashMap.h>
@ -25,11 +26,46 @@
namespace DB namespace DB
{ {
/*
 * Arena interface to allow specialized storage of keys.
 * POD keys do not require additional storage, so this primary template is a
 * pass-through: emplace() returns a copy of the key it was given and free()
 * does nothing. The struct has no data members.
 */
template <typename TKey>
struct SpaceSavingArena
{
    // Defaulted rather than user-provided: there is nothing to initialise.
    SpaceSavingArena() = default;

    // Returns the key by value; no separate storage is allocated for it.
    const TKey emplace(const TKey & key) { return key; }

    // No-op counterpart of emplace(). The parameter name is commented out so
    // that builds with -Wunused-parameter stay free of warnings.
    void free(const TKey & /*key*/) {}
};
/*
 * Storage for StringRef keys, backed by a freelist arena.
 * A key that is retained on insertion has to be copied into storage owned by
 * this object; otherwise the reference would become invalid once the
 * processed block is released.
 */
template <>
struct SpaceSavingArena<StringRef>
{
    // Copies key.size bytes starting at key.data into memory obtained from the
    // arena and returns a StringRef that points at that copy.
    const StringRef emplace(const StringRef & key)
    {
        const auto length = key.size;
        auto storage = arena.alloc(length);
        std::copy_n(key.data, length, storage);
        return StringRef{storage, length};
    }

    // Gives the bytes of a key back to the arena. Keys whose data pointer is
    // null are ignored.
    void free(const StringRef & key)
    {
        if (!key.data)
            return;

        // The arena's free() is handed a mutable pointer, hence the cast.
        arena.free(const_cast<char *>(key.data), key.size);
    }

private:
    ArenaWithFreeLists arena;
};
template template
< <
typename TKey, typename TKey,
typename HashKey = TKey, typename Hash = DefaultHash<TKey>,
typename Hash = DefaultHash<HashKey>,
typename Grower = HashTableGrower<>, typename Grower = HashTableGrower<>,
typename Allocator = HashTableAllocator typename Allocator = HashTableAllocator
> >
@ -45,7 +81,7 @@ private:
} }
public: public:
using Self = SpaceSaving<TKey, HashKey, Hash, Grower, Allocator>; using Self = SpaceSaving<TKey, Hash, Grower, Allocator>;
struct Counter struct Counter
{ {
@ -81,6 +117,7 @@ public:
}; };
SpaceSaving(size_t c = 10) : alpha_map(nextAlphaSize(c)), m_capacity(c) {} SpaceSaving(size_t c = 10) : alpha_map(nextAlphaSize(c)), m_capacity(c) {}
~SpaceSaving() { destroyElements(); } ~SpaceSaving() { destroyElements(); }
inline size_t size() const inline size_t size() const
@ -117,7 +154,7 @@ public:
// Key doesn't exist, but can fit in the top K // Key doesn't exist, but can fit in the top K
else if (unlikely(size() < capacity())) else if (unlikely(size() < capacity()))
{ {
auto c = new Counter(key, increment, error, hash); auto c = new Counter(arena.emplace(key), increment, error, hash);
push(c); push(c);
return; return;
} }
@ -138,14 +175,15 @@ public:
// Replace minimum with newly inserted element // Replace minimum with newly inserted element
if (it != counter_map.end()) if (it != counter_map.end())
{ {
arena.free(min->key);
min->hash = hash; min->hash = hash;
min->key = key; min->key = arena.emplace(key);
min->count = alpha + increment; min->count = alpha + increment;
min->error = alpha + error; min->error = alpha + error;
percolate(min); percolate(min);
it->second = min; it->second = min;
it->first = key; it->first = min->key;
counter_map.reinsert(it, hash); counter_map.reinsert(it, hash);
} }
} }
@ -279,9 +317,10 @@ private:
alpha_map.clear(); alpha_map.clear();
} }
HashMap<HashKey, Counter *, Hash, Grower, Allocator> counter_map; HashMap<TKey, Counter *, Hash, Grower, Allocator> counter_map;
std::vector<Counter *> counter_list; std::vector<Counter *> counter_list;
std::vector<UInt64> alpha_map; std::vector<UInt64> alpha_map;
SpaceSavingArena<TKey> arena;
size_t m_capacity; size_t m_capacity;
}; };