Merge pull request #6729 from yandex/aku/key-holder

Key memory management for compound hash tables.
This commit is contained in:
alexey-milovidov 2019-09-07 03:09:18 +03:00 committed by GitHub
commit b7cbd33886
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 253 additions and 139 deletions

View File

@ -10,6 +10,7 @@
#include <Columns/ColumnArray.h>
#include <Common/HashTable/HashSet.h>
#include <Common/HashTable/HashTableKeyHolder.h>
#include <Common/assert_cast.h>
#include <AggregateFunctions/IAggregateFunction.h>
@ -132,11 +133,6 @@ struct AggregateFunctionGroupUniqArrayGenericData
Set value;
};
/// Helper function for deserialize and insert for the class AggregateFunctionGroupUniqArrayGeneric
template <bool is_plain_column>
static StringRef getSerializationImpl(const IColumn & column, size_t row_num, Arena & arena);
template <bool is_plain_column>
static void deserializeAndInsertImpl(StringRef str, IColumn & data_to);
@ -154,9 +150,18 @@ class AggregateFunctionGroupUniqArrayGeneric
using State = AggregateFunctionGroupUniqArrayGenericData;
static StringRef getSerialization(const IColumn & column, size_t row_num, Arena & arena)
static auto getKeyHolder(const IColumn & column, size_t row_num, Arena & arena)
{
return getSerializationImpl<is_plain_column>(column, row_num, arena);
if constexpr (is_plain_column)
{
return ArenaKeyHolder{column.getDataAt(row_num), arena};
}
else
{
const char * begin = nullptr;
StringRef serialized = column.serializeValueIntoArena(row_num, arena, begin);
return SerializedKeyHolder{serialized, arena};
}
}
static void deserializeAndInsert(StringRef str, IColumn & data_to)
@ -209,26 +214,13 @@ public:
void add(AggregateDataPtr place, const IColumn ** columns, size_t row_num, Arena * arena) const override
{
auto & set = this->data(place).value;
if (limit_num_elems && set.size() >= max_elems)
return;
bool inserted;
State::Set::iterator it;
if (limit_num_elems && set.size() >= max_elems)
return;
StringRef str_serialized = getSerialization(*columns[0], row_num, *arena);
set.emplace(str_serialized, it, inserted);
if constexpr (!is_plain_column)
{
if (!inserted)
arena->rollback(str_serialized.size);
}
else
{
if (inserted)
it->getValueMutable().data = arena->insert(str_serialized.data, str_serialized.size);
}
auto key_holder = getKeyHolder(*columns[0], row_num, *arena);
set.emplace(key_holder, it, inserted);
}
void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena * arena) const override
@ -241,15 +233,11 @@ public:
for (auto & rhs_elem : rhs_set)
{
if (limit_num_elems && cur_set.size() >= max_elems)
return ;
cur_set.emplace(rhs_elem.getValue(), it, inserted);
if (inserted)
{
if (it->getValue().size)
it->getValueMutable().data = arena->insert(it->getValue().data, it->getValue().size);
else
it->getValueMutable().data = nullptr;
}
return;
// We have to copy the keys to our arena.
assert(arena != nullptr);
cur_set.emplace(ArenaKeyHolder{rhs_elem.getValue(), *arena}, it, inserted);
}
}
@ -271,20 +259,6 @@ public:
const char * getHeaderFilePath() const override { return __FILE__; }
};
template <>
inline StringRef getSerializationImpl<false>(const IColumn & column, size_t row_num, Arena & arena)
{
const char * begin = nullptr;
return column.serializeValueIntoArena(row_num, arena, begin);
}
template <>
inline StringRef getSerializationImpl<true>(const IColumn & column, size_t row_num, Arena &)
{
return column.getDataAt(row_num);
}
template <>
inline void deserializeAndInsertImpl<false>(StringRef str, IColumn & data_to)
{

View File

@ -170,11 +170,14 @@ public:
/** Rollback just performed allocation.
* Must pass size not more than was just allocated.
* Return the resulting head pointer, so that the caller can assert that
* the allocation it intended to roll back was indeed the last one.
*/
void rollback(size_t size)
void * rollback(size_t size)
{
head->pos -= size;
ASAN_POISON_MEMORY_REGION(head->pos, size + pad_right);
return head->pos;
}
/** Begin or expand allocation of contiguous piece of memory without alignment.

View File

@ -1,6 +1,8 @@
#pragma once
#include <Common/HashTable/HashTable.h>
#include <Common/HashTable/HashTableKeyHolder.h>
#include <Common/ColumnsHashingImpl.h>
#include <Common/Arena.h>
#include <Common/LRUCache.h>
@ -57,7 +59,7 @@ struct HashMethodOneNumber
using Base::getHash; /// (const Data & data, size_t row, Arena & pool) -> size_t
/// Is used for default implementation in HashMethodBase.
FieldType getKey(size_t row, Arena &) const { return unalignedLoad<FieldType>(vec + row * sizeof(FieldType)); }
FieldType getKeyHolder(size_t row, Arena &) const { return unalignedLoad<FieldType>(vec + row * sizeof(FieldType)); }
/// Get StringRef from value which can be inserted into column.
static StringRef getValueRef(const Value & value)
@ -86,24 +88,24 @@ struct HashMethodString
chars = column_string.getChars().data();
}
auto getKey(ssize_t row, Arena &) const
auto getKeyHolder(ssize_t row, [[maybe_unused]] Arena & pool) const
{
return StringRef(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1);
StringRef key(chars + offsets[row - 1], offsets[row] - offsets[row - 1] - 1);
if constexpr (place_string_to_arena)
{
return ArenaKeyHolder{key, pool};
}
else
{
return key;
}
}
static StringRef getValueRef(const Value & value) { return value.first; }
protected:
friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
static ALWAYS_INLINE void onNewKey([[maybe_unused]] StringRef & key, [[maybe_unused]] Arena & pool)
{
if constexpr (place_string_to_arena)
{
if (key.size)
key.data = pool.insert(key.data, key.size);
}
}
};
@ -126,17 +128,24 @@ struct HashMethodFixedString
chars = &column_string.getChars();
}
StringRef getKey(size_t row, Arena &) const { return StringRef(&(*chars)[row * n], n); }
auto getKeyHolder(size_t row, [[maybe_unused]] Arena & pool) const
{
StringRef key(&(*chars)[row * n], n);
if constexpr (place_string_to_arena)
{
return ArenaKeyHolder{key, pool};
}
else
{
return key;
}
}
static StringRef getValueRef(const Value & value) { return value.first; }
protected:
friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, use_cache>;
static ALWAYS_INLINE void onNewKey([[maybe_unused]] StringRef & key, [[maybe_unused]] Arena & pool)
{
if constexpr (place_string_to_arena)
key.data = pool.insert(key.data, key.size);
}
};
@ -316,10 +325,10 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod
}
}
/// Get the key from the key columns for insertion into the hash table.
ALWAYS_INLINE auto getKey(size_t row, Arena & pool) const
/// Get the key holder from the key columns for insertion into the hash table.
ALWAYS_INLINE auto getKeyHolder(size_t row, Arena & pool) const
{
return Base::getKey(getIndexAt(row), pool);
return Base::getKeyHolder(getIndexAt(row), pool);
}
template <typename Data>
@ -347,30 +356,23 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod
return EmplaceResult(false);
}
auto key = getKey(row_, pool);
auto key_holder = getKeyHolder(row_, pool);
bool inserted = false;
typename Data::iterator it;
if (saved_hash)
data.emplace(key, it, inserted, saved_hash[row]);
data.emplace(key_holder, it, inserted, saved_hash[row]);
else
data.emplace(key, it, inserted);
data.emplace(key_holder, it, inserted);
visit_cache[row] = VisitValue::Found;
if (inserted)
{
if constexpr (has_mapped)
{
new(&it->getSecond()) Mapped();
Base::onNewKey(it->getFirstMutable(), pool);
}
else
Base::onNewKey(*it, pool);
}
if constexpr (has_mapped)
{
if (inserted)
{
new (&it->getSecond()) Mapped();
}
mapped_cache[row] = it->getSecond();
return EmplaceResult(it->getSecond(), mapped_cache[row], inserted);
}
@ -407,13 +409,13 @@ struct HashMethodSingleLowCardinalityColumn : public SingleColumnMethod
return FindResult(visit_cache[row] == VisitValue::Found);
}
auto key = getKey(row_, pool);
auto key_holder = getKeyHolder(row_, pool);
typename Data::iterator it;
/// Look the key up, reusing the saved hash when one is available.
/// A key holder is not pointer-like (it is a bare key or a struct such as ArenaKeyHolder),
/// so the key is extracted with keyHolderGetKey() rather than with operator*.
if (saved_hash)
it = data.find(key, saved_hash[row]);
it = data.find(keyHolderGetKey(key_holder), saved_hash[row]);
else
it = data.find(key);
it = data.find(keyHolderGetKey(key_holder));
bool found = it != data.end();
visit_cache[row] = found ? VisitValue::Found : VisitValue::NotFound;
@ -493,7 +495,7 @@ struct HashMethodKeysFixed
}
}
ALWAYS_INLINE Key getKey(size_t row, Arena &) const
ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
{
if constexpr (has_nullable_keys)
{
@ -532,12 +534,12 @@ struct HashMethodSerialized
protected:
friend class columns_hashing_impl::HashMethodBase<Self, Value, Mapped, false>;
ALWAYS_INLINE StringRef getKey(size_t row, Arena & pool) const
ALWAYS_INLINE SerializedKeyHolder getKeyHolder(size_t row, Arena & pool) const
{
return serializeKeysToPoolContiguous(row, keys_size, key_columns, pool);
return SerializedKeyHolder{
serializeKeysToPoolContiguous(row, keys_size, key_columns, pool),
pool};
}
static ALWAYS_INLINE void onExistingKey(StringRef & key, Arena & pool) { pool.rollback(key.size); }
};
/// For the case when there is one string key.
@ -554,7 +556,10 @@ struct HashMethodHashed
HashMethodHashed(ColumnRawPtrs key_columns_, const Sizes &, const HashMethodContextPtr &)
: key_columns(std::move(key_columns_)) {}
ALWAYS_INLINE Key getKey(size_t row, Arena &) const { return hash128(row, key_columns.size(), key_columns); }
ALWAYS_INLINE Key getKeyHolder(size_t row, Arena &) const
{
return hash128(row, key_columns.size(), key_columns);
}
static ALWAYS_INLINE StringRef getValueRef(const Value & value)
{

View File

@ -2,6 +2,7 @@
#include <Columns/IColumn.h>
#include <Common/assert_cast.h>
#include <Common/HashTable/HashTableKeyHolder.h>
#include <Interpreters/AggregationCommon.h>
@ -117,26 +118,22 @@ public:
template <typename Data>
ALWAYS_INLINE EmplaceResult emplaceKey(Data & data, size_t row, Arena & pool)
{
auto key = static_cast<Derived &>(*this).getKey(row, pool);
return emplaceKeyImpl(key, data, pool);
auto key_holder = static_cast<Derived &>(*this).getKeyHolder(row, pool);
return emplaceImpl(key_holder, data);
}
template <typename Data>
ALWAYS_INLINE FindResult findKey(Data & data, size_t row, Arena & pool)
{
auto key = static_cast<Derived &>(*this).getKey(row, pool);
auto res = findKeyImpl(key, data);
static_cast<Derived &>(*this).onExistingKey(key, pool);
return res;
auto key_holder = static_cast<Derived &>(*this).getKeyHolder(row, pool);
return findKeyImpl(keyHolderGetKey(key_holder), data);
}
template <typename Data>
ALWAYS_INLINE size_t getHash(const Data & data, size_t row, Arena & pool)
{
auto key = static_cast<Derived &>(*this).getKey(row, pool);
auto res = data.hash(key);
static_cast<Derived &>(*this).onExistingKey(key, pool);
return res;
auto key_holder = static_cast<Derived &>(*this).getKeyHolder(row, pool);
return data.hash(keyHolderGetKey(key_holder));
}
protected:
@ -157,20 +154,13 @@ protected:
}
}
template <typename Key>
static ALWAYS_INLINE void onNewKey(Key & /*key*/, Arena & /*pool*/) {}
template <typename Key>
static ALWAYS_INLINE void onExistingKey(Key & /*key*/, Arena & /*pool*/) {}
template <typename Data, typename Key>
ALWAYS_INLINE EmplaceResult emplaceKeyImpl(Key key, Data & data, Arena & pool)
template <typename Data, typename KeyHolder>
ALWAYS_INLINE EmplaceResult emplaceImpl(KeyHolder & key_holder, Data & data)
{
if constexpr (Cache::consecutive_keys_optimization)
{
if (cache.found && cache.check(key))
if (cache.found && cache.check(keyHolderGetKey(key_holder)))
{
static_cast<Derived &>(*this).onExistingKey(key, pool);
if constexpr (has_mapped)
return EmplaceResult(cache.value.second, cache.value.second, false);
else
@ -180,7 +170,7 @@ protected:
typename Data::iterator it;
bool inserted = false;
data.emplace(key, it, inserted);
data.emplace(key_holder, it, inserted);
[[maybe_unused]] Mapped * cached = nullptr;
if constexpr (has_mapped)
@ -191,13 +181,8 @@ protected:
if constexpr (has_mapped)
{
new(&it->getSecond()) Mapped();
static_cast<Derived &>(*this).onNewKey(it->getFirstMutable(), pool);
}
else
static_cast<Derived &>(*this).onNewKey(it->getValueMutable(), pool);
}
else
static_cast<Derived &>(*this).onExistingKey(key, pool);
if constexpr (consecutive_keys_optimization)
{

View File

@ -21,6 +21,7 @@
#include <IO/VarInt.h>
#include <Common/HashTable/HashTableAllocator.h>
#include <Common/HashTable/HashTableKeyHolder.h>
#ifdef DBMS_HASH_MAP_DEBUG_RESIZES
#include <iostream>
@ -630,6 +631,8 @@ protected:
/// If the key is zero, insert it into a special place and return true.
/// We don't have to persist a zero key, because it's not actually inserted.
/// That's why we just take a Key by value, and not a key holder.
bool ALWAYS_INLINE emplaceIfZero(Key x, iterator & it, bool & inserted, size_t hash_value)
{
/// If it is claimed that the zero key can not be inserted into the table.
@ -655,17 +658,23 @@ protected:
return false;
}
void ALWAYS_INLINE emplaceNonZeroImpl(size_t place_value, Key x, iterator & it, bool & inserted, size_t hash_value)
template <typename KeyHolder>
void ALWAYS_INLINE emplaceNonZeroImpl(size_t place_value, KeyHolder && key_holder,
iterator & it, bool & inserted, size_t hash_value)
{
it = iterator(this, &buf[place_value]);
if (!buf[place_value].isZero(*this))
{
keyHolderDiscardKey(key_holder);
inserted = false;
return;
}
new(&buf[place_value]) Cell(x, *this);
keyHolderPersistKey(key_holder);
const auto & key = keyHolderGetKey(key_holder);
new(&buf[place_value]) Cell(key, *this);
buf[place_value].setHash(hash_value);
inserted = true;
++m_size;
@ -687,19 +696,21 @@ protected:
throw;
}
it = find(x, hash_value);
it = find(keyHolderGetKey(key_holder), hash_value);
}
}
/// Only for non-zero keys. Find the right place, insert the key there, if it does not already exist. Set iterator to the cell in output parameter.
void ALWAYS_INLINE emplaceNonZero(Key x, iterator & it, bool & inserted, size_t hash_value)
template <typename KeyHolder>
void ALWAYS_INLINE emplaceNonZero(KeyHolder && key_holder, iterator & it,
bool & inserted, size_t hash_value)
{
size_t place_value = findCell(x, hash_value, grower.place(hash_value));
emplaceNonZeroImpl(place_value, x, it, inserted, hash_value);
const auto & key = keyHolderGetKey(key_holder);
size_t place_value = findCell(key, hash_value, grower.place(hash_value));
emplaceNonZeroImpl(place_value, key_holder, it, inserted, hash_value);
}
public:
/// Insert a value. In the case of any more complex values, it is better to use the `emplace` function.
std::pair<iterator, bool> ALWAYS_INLINE insert(const value_type & x)
@ -708,7 +719,9 @@ public:
size_t hash_value = hash(Cell::getKey(x));
if (!emplaceIfZero(Cell::getKey(x), res.first, res.second, hash_value))
{
emplaceNonZero(Cell::getKey(x), res.first, res.second, hash_value);
}
if (res.second)
res.first.ptr->setMapped(x);
@ -739,19 +752,20 @@ public:
* if (inserted)
* new(&it->second) Mapped(value);
*/
void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted)
template <typename KeyHolder>
void ALWAYS_INLINE emplace(KeyHolder && key_holder, iterator & it, bool & inserted)
{
size_t hash_value = hash(x);
if (!emplaceIfZero(x, it, inserted, hash_value))
emplaceNonZero(x, it, inserted, hash_value);
const auto & key = keyHolderGetKey(key_holder);
emplace(key_holder, it, inserted, hash(key));
}
/// Same, but with a precalculated value of hash function.
void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted, size_t hash_value)
template <typename KeyHolder>
void ALWAYS_INLINE emplace(KeyHolder && key_holder, iterator & it,
bool & inserted, size_t hash_value)
{
if (!emplaceIfZero(x, it, inserted, hash_value))
emplaceNonZero(x, it, inserted, hash_value);
const auto & key = keyHolderGetKey(key_holder);
if (!emplaceIfZero(key, it, inserted, hash_value))
emplaceNonZero(key_holder, it, inserted, hash_value);
}
/// Copy the cell from another hash table. It is assumed that the cell is not zero, and also that there was no such key in the table yet.

View File

@ -0,0 +1,130 @@
#pragma once
#include <Common/Arena.h>
/**
* In some aggregation scenarios, when adding a key to the hash table, we
* start with a temporary key object, and if it turns out to be a new key,
* we must make it persistent (e.g. copy to an Arena) and use the resulting
* persistent object as hash table key. This happens only for StringRef keys,
* because other key types are stored by value, but StringRef is a pointer-like
* type: the actual data are stored elsewhere. Even for StringRef, we don't
* make a persistent copy of the key in each of the following cases:
* 1) the aggregation method doesn't use temporary keys, so they're persistent
* from the start;
* 2) the key is already present in the hash table;
* 3) that particular key is stored by value, e.g. a short StringRef key in
* StringHashMap.
*
* In the past, the caller was responsible for making the key persistent after
* it was inserted. emplace() returned whether the key is new or not, so the
* caller only stored new keys (this is case (2) from the above list). However,
* now we are adding a compound hash table for StringRef keys, so case (3)
* appears. The decision about persistence now depends on some properties of
* the key, and the logic of this decision is tied to the particular hash table
* implementation. This means that the hash table user now doesn't have enough
* data and logic to make this decision by itself.
*
* To support these new requirements, we now manage key persistence by passing
* a special key holder to emplace(), which has the functions to make the key
* persistent or to discard it. emplace() then calls these functions at the
* appropriate moments.
*
* This approach has the following benefits:
* - no extra runtime branches in the caller to make the key persistent.
* - no additional data is stored in the hash table itself, which is important
* when it's used in aggregate function states.
* - no overhead when the key memory management isn't needed: we just pass the
* bare key without any wrapper to emplace(), and the default callbacks do
* nothing.
*
* This file defines the default key persistence functions, as well as two
* different key holders and corresponding functions for storing StringRef
* keys to Arena.
*/
/**
  * Default key accessor, selected when a bare key is passed in place of a
  * key holder: it hands back the very object it was given.
  * Holder-specific overloads may return the temporary key at first, but must
  * return the persistent key once keyHolderPersistKey() has been called.
  */
template <typename Key>
inline Key & ALWAYS_INLINE keyHolderGetKey(Key && key)
{
    return key;
}
/**
  * Default persistence callback for bare keys: intentionally does nothing.
  * Holder-specific overloads make the key persistent here, and
  * keyHolderGetKey() must return the persistent key after this call.
  */
template <typename Key>
inline void ALWAYS_INLINE keyHolderPersistKey(Key &&)
{
}
/**
  * Default discard callback for bare keys: intentionally does nothing.
  * Once a key has been discarded, the result of keyHolderGetKey() on it
  * is ill-defined.
  */
template <typename Key>
inline void ALWAYS_INLINE keyHolderDiscardKey(Key &&)
{
}
namespace DB
{
/**
* ArenaKeyHolder is a key holder for hash tables that copies a StringRef
* key to an Arena when the key has to be made persistent.
* keyHolderPersistKey() inserts the key bytes into `pool` and repoints `key`
* at that copy; keyHolderDiscardKey() does nothing for this holder.
* Callers brace-initialize it as {key, pool}, so the member order matters.
*/
struct ArenaKeyHolder
{
/// The key itself; refers to the caller's data until it is persisted.
StringRef key;
/// The Arena that receives the copy of the key data.
Arena & pool;
};
}
/// Key accessor for ArenaKeyHolder: returns a mutable reference to the held key
/// (the temporary one before keyHolderPersistKey(), the Arena copy after it).
inline StringRef & ALWAYS_INLINE keyHolderGetKey(DB::ArenaKeyHolder & holder)
{
    StringRef & held_key = holder.key;
    return held_key;
}
/// Persist the key of an ArenaKeyHolder: copy its bytes into the holder's Arena
/// and make the held StringRef point at that copy. The size is left unchanged.
inline void ALWAYS_INLINE keyHolderPersistKey(DB::ArenaKeyHolder & holder)
{
    // Hash table shouldn't ask us to persist a zero key
    assert(holder.key.size > 0);

    const auto persistent_data = holder.pool.insert(holder.key.data, holder.key.size);
    holder.key.data = persistent_data;
}
/// Discard callback for ArenaKeyHolder: nothing to undo, because the key is
/// copied to the Arena only in keyHolderPersistKey().
inline void ALWAYS_INLINE keyHolderDiscardKey(DB::ArenaKeyHolder &)
{
}
namespace DB
{
/**
* SerializedKeyHolder is a key holder for a StringRef key that is already
* serialized to an Arena. The key must be the last allocation in this Arena,
* and is discarded by rolling back the allocation.
* keyHolderPersistKey() does nothing for this holder; keyHolderDiscardKey()
* rolls `pool` back by the key size and resets `key` to an empty reference.
* Callers brace-initialize it as {key, pool}, so the member order matters.
*/
struct SerializedKeyHolder
{
/// The serialized key; its data is expected to lie at the end of `pool`.
StringRef key;
/// The Arena that the key was serialized into.
Arena & pool;
};
}
/// Key accessor for SerializedKeyHolder: returns a mutable reference to the held key.
inline StringRef & ALWAYS_INLINE keyHolderGetKey(DB::SerializedKeyHolder & holder)
{
    StringRef & held_key = holder.key;
    return held_key;
}
/// Persistence callback for SerializedKeyHolder: nothing to do, because the key
/// was serialized to the Arena before the holder was created.
inline void ALWAYS_INLINE keyHolderPersistKey(DB::SerializedKeyHolder &)
{
}
/// Discard the key of a SerializedKeyHolder: roll the Arena back by the key size
/// and reset the held key to an empty reference.
inline void ALWAYS_INLINE keyHolderDiscardKey(DB::SerializedKeyHolder & holder)
{
    const auto bytes_to_release = holder.key.size;

    // rollback() returns the resulting head pointer; it equals the start of the key
    // only if the key really was the last allocation in the Arena.
    [[maybe_unused]] void * new_head = holder.pool.rollback(bytes_to_release);
    assert(new_head == holder.key.data);

    holder.key.size = 0;
    holder.key.data = nullptr;
}

View File

@ -235,19 +235,22 @@ public:
* if (inserted)
* new(&it->second) Mapped(value);
*/
void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted)
template <typename KeyHolder>
void ALWAYS_INLINE emplace(KeyHolder && key_holder, iterator & it, bool & inserted)
{
size_t hash_value = hash(x);
emplace(x, it, inserted, hash_value);
size_t hash_value = hash(keyHolderGetKey(key_holder));
emplace(key_holder, it, inserted, hash_value);
}
/// Same, but with a precalculated value of hash function.
void ALWAYS_INLINE emplace(Key x, iterator & it, bool & inserted, size_t hash_value)
template <typename KeyHolder>
void ALWAYS_INLINE emplace(KeyHolder && key_holder, iterator & it,
bool & inserted, size_t hash_value)
{
size_t buck = getBucketFromHash(hash_value);
typename Impl::iterator impl_it;
impls[buck].emplace(x, impl_it, inserted, hash_value);
impls[buck].emplace(key_holder, impl_it, inserted, hash_value);
it = iterator(this, buck, impl_it);
}