#pragma once

#include <Common/HashTable/HashTable.h>


/** Two-level hash table.
  * Represents 256 (or 1ULL << BITS_FOR_BUCKET) small hash tables (buckets of the first level).
  * To determine which one to use, one of the bytes of the hash function is taken.
  *
  * Usually works a little slower than a simple hash table.
  * However, it has advantages in some cases:
  * - if you need to merge two hash tables together, then you can easily parallelize it by buckets;
  * - delay during resizes is amortized, since the small hash tables will be resized separately;
  * - in theory, resizes are cache-local in a larger range of sizes.
  */
template <size_t initial_size_degree = 8>
|
|
|
|
struct TwoLevelHashTableGrower : public HashTableGrower<initial_size_degree>
|
|
|
|
{
|
2017-05-07 20:25:26 +00:00
|
|
|
/// Increase the size of the hash table.
|
2017-04-01 07:20:54 +00:00
|
|
|
void increaseSize()
|
|
|
|
{
|
|
|
|
this->size_degree += this->size_degree >= 15 ? 1 : 2;
|
|
|
|
}
|
2016-06-07 08:23:15 +00:00
|
|
|
};
template
|
|
|
|
<
|
2017-04-01 07:20:54 +00:00
|
|
|
typename Key,
|
|
|
|
typename Cell,
|
|
|
|
typename Hash,
|
|
|
|
typename Grower,
|
|
|
|
typename Allocator, /// TODO WithStackMemory
|
|
|
|
typename ImplTable = HashTable<Key, Cell, Hash, Grower, Allocator>,
|
|
|
|
size_t BITS_FOR_BUCKET = 8
|
2016-06-07 08:23:15 +00:00
|
|
|
>
|
|
|
|
class TwoLevelHashTable :
|
2017-04-01 07:20:54 +00:00
|
|
|
private boost::noncopyable,
|
|
|
|
protected Hash /// empty base optimization
|
2016-06-07 08:23:15 +00:00
|
|
|
{
|
|
|
|
protected:
|
2017-04-01 07:20:54 +00:00
|
|
|
friend class const_iterator;
|
|
|
|
friend class iterator;
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
using HashValue = size_t;
|
2018-09-13 14:59:03 +00:00
|
|
|
using Self = TwoLevelHashTable;
|
2016-06-07 08:23:15 +00:00
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
using Impl = ImplTable;
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-08-10 23:25:51 +00:00
|
|
|
static constexpr size_t NUM_BUCKETS = 1ULL << BITS_FOR_BUCKET;
|
2017-04-01 07:20:54 +00:00
|
|
|
static constexpr size_t MAX_BUCKET = NUM_BUCKETS - 1;
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t hash(const Key & x) const { return Hash::operator()(x); }
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
/// NOTE Bad for hash tables with more than 2^32 cells.
|
2017-04-01 07:20:54 +00:00
|
|
|
static size_t getBucketFromHash(size_t hash_value) { return (hash_value >> (32 - BITS_FOR_BUCKET)) & MAX_BUCKET; }
|
2016-06-07 08:23:15 +00:00
|
|
|
|
|
|
|
protected:
|
2017-04-01 07:20:54 +00:00
|
|
|
typename Impl::iterator beginOfNextNonEmptyBucket(size_t & bucket)
|
|
|
|
{
|
|
|
|
while (bucket != NUM_BUCKETS && impls[bucket].empty())
|
|
|
|
++bucket;
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (bucket != NUM_BUCKETS)
|
|
|
|
return impls[bucket].begin();
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
--bucket;
|
|
|
|
return impls[MAX_BUCKET].end();
|
|
|
|
}
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
typename Impl::const_iterator beginOfNextNonEmptyBucket(size_t & bucket) const
|
|
|
|
{
|
|
|
|
while (bucket != NUM_BUCKETS && impls[bucket].empty())
|
|
|
|
++bucket;
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
if (bucket != NUM_BUCKETS)
|
|
|
|
return impls[bucket].begin();
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
--bucket;
|
|
|
|
return impls[MAX_BUCKET].end();
|
|
|
|
}
|
2016-06-07 08:23:15 +00:00
|
|
|
|
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
using key_type = typename Impl::key_type;
|
2019-10-29 15:16:51 +00:00
|
|
|
using mapped_type = typename Impl::mapped_type;
|
2017-04-01 07:20:54 +00:00
|
|
|
using value_type = typename Impl::value_type;
|
2019-10-29 15:16:51 +00:00
|
|
|
using cell_type = typename Impl::cell_type;
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2019-08-20 09:58:44 +00:00
|
|
|
using LookupResult = typename Impl::LookupResult;
|
|
|
|
using ConstLookupResult = typename Impl::ConstLookupResult;
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
Impl impls[NUM_BUCKETS];
|
2016-06-07 08:23:15 +00:00
|
|
|
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
TwoLevelHashTable() {}
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/// Copy the data from another (normal) hash table. It should have the same hash function.
|
2017-04-01 07:20:54 +00:00
|
|
|
template <typename Source>
|
|
|
|
TwoLevelHashTable(const Source & src)
|
|
|
|
{
|
|
|
|
typename Source::const_iterator it = src.begin();
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-05-10 04:00:19 +00:00
|
|
|
/// It is assumed that the zero key (stored separately) is first in iteration order.
|
2017-04-01 07:20:54 +00:00
|
|
|
if (it != src.end() && it.getPtr()->isZero(src))
|
|
|
|
{
|
2019-02-28 09:35:38 +00:00
|
|
|
insert(it->getValue());
|
2017-04-01 07:20:54 +00:00
|
|
|
++it;
|
|
|
|
}
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
for (; it != src.end(); ++it)
|
|
|
|
{
|
|
|
|
const Cell * cell = it.getPtr();
|
|
|
|
size_t hash_value = cell->getHash(src);
|
|
|
|
size_t buck = getBucketFromHash(hash_value);
|
|
|
|
impls[buck].insertUniqueNonZero(cell, hash_value);
|
|
|
|
}
|
|
|
|
}
|
2016-06-07 08:23:15 +00:00
|
|
|
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
class iterator
|
|
|
|
{
|
|
|
|
Self * container;
|
|
|
|
size_t bucket;
|
|
|
|
typename Impl::iterator current_it;
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
friend class TwoLevelHashTable;
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
iterator(Self * container_, size_t bucket_, typename Impl::iterator current_it_)
|
|
|
|
: container(container_), bucket(bucket_), current_it(current_it_) {}
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
public:
|
|
|
|
iterator() {}
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
bool operator== (const iterator & rhs) const { return bucket == rhs.bucket && current_it == rhs.current_it; }
|
|
|
|
bool operator!= (const iterator & rhs) const { return !(*this == rhs); }
|
2016-06-07 08:23:15 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
iterator & operator++()
|
|
|
|
{
|
|
|
|
++current_it;
|
|
|
|
if (current_it == container->impls[bucket].end())
|
|
|
|
{
|
|
|
|
++bucket;
|
|
|
|
current_it = container->beginOfNextNonEmptyBucket(bucket);
|
|
|
|
}
|
|
|
|
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
2019-02-28 09:35:38 +00:00
|
|
|
Cell & operator* () const { return *current_it; }
|
|
|
|
Cell * operator->() const { return current_it.getPtr(); }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
Cell * getPtr() const { return current_it.getPtr(); }
|
|
|
|
size_t getHash() const { return current_it.getHash(); }
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
class const_iterator
|
|
|
|
{
|
|
|
|
Self * container;
|
|
|
|
size_t bucket;
|
|
|
|
typename Impl::const_iterator current_it;
|
|
|
|
|
|
|
|
friend class TwoLevelHashTable;
|
|
|
|
|
|
|
|
const_iterator(Self * container_, size_t bucket_, typename Impl::const_iterator current_it_)
|
|
|
|
: container(container_), bucket(bucket_), current_it(current_it_) {}
|
|
|
|
|
|
|
|
public:
|
|
|
|
const_iterator() {}
|
|
|
|
const_iterator(const iterator & rhs) : container(rhs.container), bucket(rhs.bucket), current_it(rhs.current_it) {}
|
|
|
|
|
|
|
|
bool operator== (const const_iterator & rhs) const { return bucket == rhs.bucket && current_it == rhs.current_it; }
|
|
|
|
bool operator!= (const const_iterator & rhs) const { return !(*this == rhs); }
|
|
|
|
|
|
|
|
const_iterator & operator++()
|
|
|
|
{
|
|
|
|
++current_it;
|
|
|
|
if (current_it == container->impls[bucket].end())
|
|
|
|
{
|
|
|
|
++bucket;
|
|
|
|
current_it = container->beginOfNextNonEmptyBucket(bucket);
|
|
|
|
}
|
|
|
|
|
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
2019-02-28 09:35:38 +00:00
|
|
|
const Cell & operator* () const { return *current_it; }
|
|
|
|
const Cell * operator->() const { return current_it->getPtr(); }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
const Cell * getPtr() const { return current_it.getPtr(); }
|
|
|
|
size_t getHash() const { return current_it.getHash(); }
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
const_iterator begin() const
|
|
|
|
{
|
|
|
|
size_t buck = 0;
|
|
|
|
typename Impl::const_iterator impl_it = beginOfNextNonEmptyBucket(buck);
|
|
|
|
return { this, buck, impl_it };
|
|
|
|
}
|
|
|
|
|
|
|
|
iterator begin()
|
|
|
|
{
|
|
|
|
size_t buck = 0;
|
|
|
|
typename Impl::iterator impl_it = beginOfNextNonEmptyBucket(buck);
|
|
|
|
return { this, buck, impl_it };
|
|
|
|
}
|
|
|
|
|
|
|
|
const_iterator end() const { return { this, MAX_BUCKET, impls[MAX_BUCKET].end() }; }
|
|
|
|
iterator end() { return { this, MAX_BUCKET, impls[MAX_BUCKET].end() }; }
|
|
|
|
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/// Insert a value. In the case of any more complex values, it is better to use the `emplace` function.
|
2019-08-20 09:58:44 +00:00
|
|
|
std::pair<LookupResult, bool> ALWAYS_INLINE insert(const value_type & x)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
size_t hash_value = hash(Cell::getKey(x));
|
|
|
|
|
2019-08-20 09:58:44 +00:00
|
|
|
std::pair<LookupResult, bool> res;
|
2017-04-01 07:20:54 +00:00
|
|
|
emplace(Cell::getKey(x), res.first, res.second, hash_value);
|
|
|
|
|
|
|
|
if (res.second)
|
2019-10-29 15:16:51 +00:00
|
|
|
insertSetMapped(res.first->getMapped(), x);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/** Insert the key,
|
2017-05-10 04:00:19 +00:00
|
|
|
* return an iterator to a position that can be used for `placement new` of value,
|
2017-05-07 20:25:26 +00:00
|
|
|
* as well as the flag - whether a new key was inserted.
|
2017-04-01 07:20:54 +00:00
|
|
|
*
|
2017-05-09 19:07:35 +00:00
|
|
|
* You have to make `placement new` values if you inserted a new key,
|
2017-05-07 20:25:26 +00:00
|
|
|
* since when destroying a hash table, the destructor will be invoked for it!
|
2017-04-01 07:20:54 +00:00
|
|
|
*
|
2017-05-07 20:25:26 +00:00
|
|
|
* Example usage:
|
2017-04-01 07:20:54 +00:00
|
|
|
*
|
|
|
|
* Map::iterator it;
|
|
|
|
* bool inserted;
|
|
|
|
* map.emplace(key, it, inserted);
|
|
|
|
* if (inserted)
|
|
|
|
* new(&it->second) Mapped(value);
|
|
|
|
*/
|
2019-07-31 15:44:03 +00:00
|
|
|
template <typename KeyHolder>
|
2019-08-20 09:58:44 +00:00
|
|
|
void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2019-07-31 15:44:03 +00:00
|
|
|
size_t hash_value = hash(keyHolderGetKey(key_holder));
|
|
|
|
emplace(key_holder, it, inserted, hash_value);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/// Same, but with a precalculated values of hash function.
|
2019-07-31 15:44:03 +00:00
|
|
|
template <typename KeyHolder>
|
2019-08-20 09:58:44 +00:00
|
|
|
void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it,
|
2019-07-31 15:44:03 +00:00
|
|
|
bool & inserted, size_t hash_value)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
size_t buck = getBucketFromHash(hash_value);
|
2019-08-20 09:58:44 +00:00
|
|
|
impls[buck].emplace(key_holder, it, inserted, hash_value);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
2019-08-20 09:58:44 +00:00
|
|
|
LookupResult ALWAYS_INLINE find(Key x, size_t hash_value)
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
size_t buck = getBucketFromHash(hash_value);
|
2019-08-20 09:58:44 +00:00
|
|
|
return impls[buck].find(x, hash_value);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
2019-08-20 09:58:44 +00:00
|
|
|
ConstLookupResult ALWAYS_INLINE find(Key x, size_t hash_value) const
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
2019-08-20 09:58:44 +00:00
|
|
|
return const_cast<std::decay_t<decltype(*this)> *>(this)->find(x, hash_value);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
2019-08-20 09:58:44 +00:00
|
|
|
LookupResult ALWAYS_INLINE find(Key x) { return find(x, hash(x)); }
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2019-08-20 09:58:44 +00:00
|
|
|
ConstLookupResult ALWAYS_INLINE find(Key x) const { return find(x, hash(x)); }
|
2018-08-23 13:22:03 +00:00
|
|
|
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
void write(DB::WriteBuffer & wb) const
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
|
|
|
impls[i].write(wb);
|
|
|
|
}
|
|
|
|
|
|
|
|
void writeText(DB::WriteBuffer & wb) const
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
|
|
|
{
|
|
|
|
if (i != 0)
|
|
|
|
DB::writeChar(',', wb);
|
|
|
|
impls[i].writeText(wb);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
void read(DB::ReadBuffer & rb)
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
|
|
|
impls[i].read(rb);
|
|
|
|
}
|
|
|
|
|
|
|
|
void readText(DB::ReadBuffer & rb)
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
|
|
|
{
|
|
|
|
if (i != 0)
|
|
|
|
DB::assertChar(',', rb);
|
|
|
|
impls[i].readText(rb);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
size_t size() const
|
|
|
|
{
|
|
|
|
size_t res = 0;
|
|
|
|
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
|
|
|
res += impls[i].size();
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
|
|
|
bool empty() const
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
|
|
|
if (!impls[i].empty())
|
|
|
|
return false;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t getBufferSizeInBytes() const
|
|
|
|
{
|
|
|
|
size_t res = 0;
|
|
|
|
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
|
|
|
res += impls[i].getBufferSizeInBytes();
|
|
|
|
|
|
|
|
return res;
|
|
|
|
}
|
2016-06-07 08:23:15 +00:00
|
|
|
};
|