mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-21 01:00:48 +00:00
Merge pull request #6243 from ClickHouse/aku/hashtables
Introduce String Hash Map to speed up aggregation over short string keys.
This commit is contained in:
commit
1a609c27bd
@ -358,6 +358,12 @@ protected:
|
||||
template <typename, typename, typename, typename, typename, typename, size_t>
|
||||
friend class TwoLevelHashTable;
|
||||
|
||||
template <typename, typename, size_t>
|
||||
friend class TwoLevelStringHashTable;
|
||||
|
||||
template <typename SubMaps>
|
||||
friend class StringHashTable;
|
||||
|
||||
using HashValue = size_t;
|
||||
using Self = HashTable;
|
||||
using cell_type = Cell;
|
||||
|
180
dbms/src/Common/HashTable/StringHashMap.h
Normal file
180
dbms/src/Common/HashTable/StringHashMap.h
Normal file
@ -0,0 +1,180 @@
|
||||
#pragma once
|
||||
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/HashTable/HashTableAllocator.h>
|
||||
#include <Common/HashTable/StringHashTable.h>
|
||||
|
||||
template <typename Key, typename TMapped>
|
||||
struct StringHashMapCell : public HashMapCell<Key, TMapped, StringHashTableHash, HashTableNoState>
|
||||
{
|
||||
using Base = HashMapCell<Key, TMapped, StringHashTableHash, HashTableNoState>;
|
||||
using Base::Base;
|
||||
static constexpr bool need_zero_value_storage = false;
|
||||
};
|
||||
|
||||
template<typename Key, typename Mapped>
|
||||
auto lookupResultGetMapped(StringHashMapCell<Key, Mapped> * cell) { return &cell->getSecond(); }
|
||||
|
||||
template <typename TMapped>
|
||||
struct StringHashMapCell<StringKey16, TMapped> : public HashMapCell<StringKey16, TMapped, StringHashTableHash, HashTableNoState>
|
||||
{
|
||||
using Base = HashMapCell<StringKey16, TMapped, StringHashTableHash, HashTableNoState>;
|
||||
using Base::Base;
|
||||
static constexpr bool need_zero_value_storage = false;
|
||||
bool isZero(const HashTableNoState & state) const { return isZero(this->value.first, state); }
|
||||
// Assuming String does not contain zero bytes. NOTE: Cannot be used in serialized method
|
||||
static bool isZero(const StringKey16 & key, const HashTableNoState & /*state*/) { return key.low == 0; }
|
||||
void setZero() { this->value.first.low = 0; }
|
||||
};
|
||||
|
||||
template <typename TMapped>
|
||||
struct StringHashMapCell<StringKey24, TMapped> : public HashMapCell<StringKey24, TMapped, StringHashTableHash, HashTableNoState>
|
||||
{
|
||||
using Base = HashMapCell<StringKey24, TMapped, StringHashTableHash, HashTableNoState>;
|
||||
using Base::Base;
|
||||
static constexpr bool need_zero_value_storage = false;
|
||||
bool isZero(const HashTableNoState & state) const { return isZero(this->value.first, state); }
|
||||
// Assuming String does not contain zero bytes. NOTE: Cannot be used in serialized method
|
||||
static bool isZero(const StringKey24 & key, const HashTableNoState & /*state*/) { return key.a == 0; }
|
||||
void setZero() { this->value.first.a = 0; }
|
||||
};
|
||||
|
||||
template <typename TMapped>
|
||||
struct StringHashMapCell<StringRef, TMapped> : public HashMapCellWithSavedHash<StringRef, TMapped, StringHashTableHash, HashTableNoState>
|
||||
{
|
||||
using Base = HashMapCellWithSavedHash<StringRef, TMapped, StringHashTableHash, HashTableNoState>;
|
||||
using Base::Base;
|
||||
static constexpr bool need_zero_value_storage = false;
|
||||
};
|
||||
|
||||
template <typename TMapped, typename Allocator>
|
||||
struct StringHashMapSubMaps
|
||||
{
|
||||
using T0 = StringHashTableEmpty<StringHashMapCell<StringRef, TMapped>>;
|
||||
using T1 = HashMapTable<StringKey8, StringHashMapCell<StringKey8, TMapped>, StringHashTableHash, StringHashTableGrower<>, Allocator>;
|
||||
using T2 = HashMapTable<StringKey16, StringHashMapCell<StringKey16, TMapped>, StringHashTableHash, StringHashTableGrower<>, Allocator>;
|
||||
using T3 = HashMapTable<StringKey24, StringHashMapCell<StringKey24, TMapped>, StringHashTableHash, StringHashTableGrower<>, Allocator>;
|
||||
using Ts = HashMapTable<StringRef, StringHashMapCell<StringRef, TMapped>, StringHashTableHash, StringHashTableGrower<>, Allocator>;
|
||||
};
|
||||
|
||||
template <typename TMapped, typename Allocator = HashTableAllocator>
|
||||
class StringHashMap : public StringHashTable<StringHashMapSubMaps<TMapped, Allocator>>
|
||||
{
|
||||
public:
|
||||
using Base = StringHashTable<StringHashMapSubMaps<TMapped, Allocator>>;
|
||||
using Self = StringHashMap;
|
||||
using Key = StringRef;
|
||||
using key_type = StringRef;
|
||||
using mapped_type = TMapped;
|
||||
using value_type = typename Base::Ts::value_type;
|
||||
using LookupResult = mapped_type *;
|
||||
|
||||
using Base::Base;
|
||||
|
||||
/// Merge every cell's value of current map into the destination map.
|
||||
/// Func should have signature void(Mapped & dst, Mapped & src, bool emplaced).
|
||||
/// Each filled cell in current map will invoke func once. If that map doesn't
|
||||
/// have a key equals to the given cell, a new cell gets emplaced into that map,
|
||||
/// and func is invoked with the third argument emplaced set to true. Otherwise
|
||||
/// emplaced is set to false.
|
||||
template <typename Func>
|
||||
void ALWAYS_INLINE mergeToViaEmplace(Self & that, Func && func)
|
||||
{
|
||||
if (this->m0.hasZero())
|
||||
{
|
||||
const bool emplace_new_zero = !that.m0.hasZero();
|
||||
if (emplace_new_zero)
|
||||
{
|
||||
that.m0.setHasZero();
|
||||
}
|
||||
|
||||
func(that.m0.zeroValue()->getSecond(), this->m0.zeroValue()->getSecond(),
|
||||
emplace_new_zero);
|
||||
}
|
||||
|
||||
this->m1.mergeToViaEmplace(that.m1, func);
|
||||
this->m2.mergeToViaEmplace(that.m2, func);
|
||||
this->m3.mergeToViaEmplace(that.m3, func);
|
||||
this->ms.mergeToViaEmplace(that.ms, func);
|
||||
}
|
||||
|
||||
/// Merge every cell's value of current map into the destination map via find.
|
||||
/// Func should have signature void(Mapped & dst, Mapped & src, bool exist).
|
||||
/// Each filled cell in current map will invoke func once. If that map doesn't
|
||||
/// have a key equals to the given cell, func is invoked with the third argument
|
||||
/// exist set to false. Otherwise exist is set to true.
|
||||
template <typename Func>
|
||||
void ALWAYS_INLINE mergeToViaFind(Self & that, Func && func)
|
||||
{
|
||||
if (this->m0.hasZero())
|
||||
{
|
||||
if (that.m0.hasZero())
|
||||
{
|
||||
func(that.m0.zeroValue()->getSecond(), this->m0.zeroValue()->getSecond(), true);
|
||||
}
|
||||
else
|
||||
{
|
||||
func(this->m0.zeroValue()->getSecond(), this->m0.zeroValue()->getSecond(), false);
|
||||
}
|
||||
}
|
||||
|
||||
this->m1.mergeToViaFind(that.m1, func);
|
||||
this->m2.mergeToViaFind(that.m2, func);
|
||||
this->m3.mergeToViaFind(that.m3, func);
|
||||
this->ms.mergeToViaFind(that.ms, func);
|
||||
}
|
||||
|
||||
mapped_type & ALWAYS_INLINE operator[](Key x)
|
||||
{
|
||||
bool inserted;
|
||||
LookupResult it = nullptr;
|
||||
emplace(x, it, inserted);
|
||||
if (inserted)
|
||||
new (it) mapped_type();
|
||||
return *it;
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
void ALWAYS_INLINE forEachValue(Func && func)
|
||||
{
|
||||
if (this->m0.size())
|
||||
{
|
||||
func(StringRef{}, this->m0.zeroValue()->getSecond());
|
||||
}
|
||||
|
||||
for (auto & v : this->m1)
|
||||
{
|
||||
func(toStringRef(v.getFirst()), v.getSecond());
|
||||
}
|
||||
|
||||
for (auto & v : this->m2)
|
||||
{
|
||||
func(toStringRef(v.getFirst()), v.getSecond());
|
||||
}
|
||||
|
||||
for (auto & v : this->m3)
|
||||
{
|
||||
func(toStringRef(v.getFirst()), v.getSecond());
|
||||
}
|
||||
|
||||
for (auto & v : this->ms)
|
||||
{
|
||||
func(v.getFirst(), v.getSecond());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
void ALWAYS_INLINE forEachMapped(Func && func)
|
||||
{
|
||||
if (this->m0.size())
|
||||
func(this->m0.zeroValue()->getSecond());
|
||||
for (auto & v : this->m1)
|
||||
func(v.getSecond());
|
||||
for (auto & v : this->m2)
|
||||
func(v.getSecond());
|
||||
for (auto & v : this->m3)
|
||||
func(v.getSecond());
|
||||
for (auto & v : this->ms)
|
||||
func(v.getSecond());
|
||||
}
|
||||
};
|
371
dbms/src/Common/HashTable/StringHashTable.h
Normal file
371
dbms/src/Common/HashTable/StringHashTable.h
Normal file
@ -0,0 +1,371 @@
|
||||
#pragma once
|
||||
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/HashTable/HashTable.h>
|
||||
|
||||
struct StringKey0
|
||||
{
|
||||
};
|
||||
|
||||
using StringKey8 = UInt64;
|
||||
using StringKey16 = DB::UInt128;
|
||||
struct StringKey24
|
||||
{
|
||||
UInt64 a;
|
||||
UInt64 b;
|
||||
UInt64 c;
|
||||
|
||||
bool operator==(const StringKey24 rhs) const { return a == rhs.a && b == rhs.b && c == rhs.c; }
|
||||
};
|
||||
|
||||
inline StringRef ALWAYS_INLINE toStringRef(const StringKey8 & n)
|
||||
{
|
||||
return {reinterpret_cast<const char *>(&n), 8ul - (__builtin_clzll(n) >> 3)};
|
||||
}
|
||||
inline StringRef ALWAYS_INLINE toStringRef(const StringKey16 & n)
|
||||
{
|
||||
return {reinterpret_cast<const char *>(&n), 16ul - (__builtin_clzll(n.high) >> 3)};
|
||||
}
|
||||
inline StringRef ALWAYS_INLINE toStringRef(const StringKey24 & n)
|
||||
{
|
||||
return {reinterpret_cast<const char *>(&n), 24ul - (__builtin_clzll(n.c) >> 3)};
|
||||
}
|
||||
|
||||
struct StringHashTableHash
|
||||
{
|
||||
#if defined(__SSE4_2__)
|
||||
size_t ALWAYS_INLINE operator()(StringKey8 key) const
|
||||
{
|
||||
size_t res = -1ULL;
|
||||
res = _mm_crc32_u64(res, key);
|
||||
return res;
|
||||
}
|
||||
size_t ALWAYS_INLINE operator()(StringKey16 key) const
|
||||
{
|
||||
size_t res = -1ULL;
|
||||
res = _mm_crc32_u64(res, key.low);
|
||||
res = _mm_crc32_u64(res, key.high);
|
||||
return res;
|
||||
}
|
||||
size_t ALWAYS_INLINE operator()(StringKey24 key) const
|
||||
{
|
||||
size_t res = -1ULL;
|
||||
res = _mm_crc32_u64(res, key.a);
|
||||
res = _mm_crc32_u64(res, key.b);
|
||||
res = _mm_crc32_u64(res, key.c);
|
||||
return res;
|
||||
}
|
||||
#else
|
||||
size_t ALWAYS_INLINE operator()(StringKey8 key) const
|
||||
{
|
||||
return CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&key), 8);
|
||||
}
|
||||
size_t ALWAYS_INLINE operator()(StringKey16 key) const
|
||||
{
|
||||
return CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&key), 16);
|
||||
}
|
||||
size_t ALWAYS_INLINE operator()(StringKey24 key) const
|
||||
{
|
||||
return CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&key), 24);
|
||||
}
|
||||
#endif
|
||||
size_t ALWAYS_INLINE operator()(StringRef key) const
|
||||
{
|
||||
return StringRefHash()(key);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename Cell>
|
||||
struct StringHashTableEmpty
|
||||
{
|
||||
using Self = StringHashTableEmpty;
|
||||
|
||||
bool has_zero = false;
|
||||
std::aligned_storage_t<sizeof(Cell), alignof(Cell)> zero_value_storage; /// Storage of element with zero key.
|
||||
|
||||
public:
|
||||
bool hasZero() const { return has_zero; }
|
||||
|
||||
void setHasZero()
|
||||
{
|
||||
has_zero = true;
|
||||
new (zeroValue()) Cell();
|
||||
}
|
||||
|
||||
void setHasZero(const Cell & other)
|
||||
{
|
||||
has_zero = true;
|
||||
new (zeroValue()) Cell(other);
|
||||
}
|
||||
|
||||
void clearHasZero()
|
||||
{
|
||||
has_zero = false;
|
||||
if (!std::is_trivially_destructible_v<Cell>)
|
||||
zeroValue()->~Cell();
|
||||
}
|
||||
|
||||
Cell * zeroValue() { return reinterpret_cast<Cell *>(&zero_value_storage); }
|
||||
const Cell * zeroValue() const { return reinterpret_cast<const Cell *>(&zero_value_storage); }
|
||||
|
||||
using LookupResult = Cell *;
|
||||
using ConstLookupResult = const Cell *;
|
||||
|
||||
template <typename KeyHolder>
|
||||
void ALWAYS_INLINE emplace(KeyHolder &&, LookupResult & it, bool & inserted, size_t /* hash */)
|
||||
{
|
||||
if (!hasZero())
|
||||
{
|
||||
setHasZero();
|
||||
inserted = true;
|
||||
}
|
||||
else
|
||||
inserted = false;
|
||||
it = zeroValue();
|
||||
}
|
||||
|
||||
template <typename Key>
|
||||
LookupResult ALWAYS_INLINE find(Key, size_t /* hash */)
|
||||
{
|
||||
return hasZero() ? zeroValue() : nullptr;
|
||||
}
|
||||
|
||||
|
||||
void write(DB::WriteBuffer & wb) const { zeroValue()->write(wb); }
|
||||
void writeText(DB::WriteBuffer & wb) const { zeroValue()->writeText(wb); }
|
||||
void read(DB::ReadBuffer & rb) { zeroValue()->read(rb); }
|
||||
void readText(DB::ReadBuffer & rb) { zeroValue()->readText(rb); }
|
||||
size_t size() const { return hasZero() ? 1 : 0; }
|
||||
bool empty() const { return !hasZero(); }
|
||||
size_t getBufferSizeInBytes() const { return sizeof(Cell); }
|
||||
size_t getCollisions() const { return 0; }
|
||||
};
|
||||
|
||||
template <size_t initial_size_degree = 8>
|
||||
struct StringHashTableGrower : public HashTableGrower<initial_size_degree>
|
||||
{
|
||||
// Smooth growing for string maps
|
||||
void increaseSize() { this->size_degree += 1; }
|
||||
};
|
||||
|
||||
template <typename SubMaps>
|
||||
class StringHashTable : private boost::noncopyable
|
||||
{
|
||||
protected:
|
||||
static constexpr size_t NUM_MAPS = 5;
|
||||
// Map for storing empty string
|
||||
using T0 = typename SubMaps::T0;
|
||||
|
||||
// Short strings are stored as numbers
|
||||
using T1 = typename SubMaps::T1;
|
||||
using T2 = typename SubMaps::T2;
|
||||
using T3 = typename SubMaps::T3;
|
||||
|
||||
// Long strings are stored as StringRef along with saved hash
|
||||
using Ts = typename SubMaps::Ts;
|
||||
using Self = StringHashTable;
|
||||
|
||||
template <typename, typename, size_t>
|
||||
friend class TwoLevelStringHashTable;
|
||||
|
||||
T0 m0;
|
||||
T1 m1;
|
||||
T2 m2;
|
||||
T3 m3;
|
||||
Ts ms;
|
||||
|
||||
public:
|
||||
using Key = StringRef;
|
||||
using key_type = Key;
|
||||
using value_type = typename Ts::value_type;
|
||||
using LookupResult = typename Ts::mapped_type *;
|
||||
|
||||
StringHashTable() {}
|
||||
|
||||
StringHashTable(size_t reserve_for_num_elements)
|
||||
: m1{reserve_for_num_elements / 4}
|
||||
, m2{reserve_for_num_elements / 4}
|
||||
, m3{reserve_for_num_elements / 4}
|
||||
, ms{reserve_for_num_elements / 4}
|
||||
{
|
||||
}
|
||||
|
||||
StringHashTable(StringHashTable && rhs) { *this = std::move(rhs); }
|
||||
~StringHashTable() {}
|
||||
|
||||
public:
|
||||
// Dispatch is written in a way that maximizes the performance:
|
||||
// 1. Always memcpy 8 times bytes
|
||||
// 2. Use switch case extension to generate fast dispatching table
|
||||
// 3. Funcs are named callables that can be force_inlined
|
||||
// NOTE: It relies on Little Endianness
|
||||
template <typename KeyHolder, typename Func>
|
||||
decltype(auto) ALWAYS_INLINE dispatch(KeyHolder && key_holder, Func && func)
|
||||
{
|
||||
const StringRef & x = keyHolderGetKey(key_holder);
|
||||
const size_t sz = x.size;
|
||||
if (sz == 0)
|
||||
{
|
||||
static constexpr StringKey0 key0{};
|
||||
keyHolderDiscardKey(key_holder);
|
||||
return func(m0, key0, 0);
|
||||
}
|
||||
|
||||
const char * p = x.data;
|
||||
// pending bits that needs to be shifted out
|
||||
const char s = (-sz & 7) * 8;
|
||||
union
|
||||
{
|
||||
StringKey8 k8;
|
||||
StringKey16 k16;
|
||||
StringKey24 k24;
|
||||
UInt64 n[3];
|
||||
};
|
||||
StringHashTableHash hash;
|
||||
switch ((sz - 1) >> 3)
|
||||
{
|
||||
case 0: // 1..8 bytes
|
||||
{
|
||||
// first half page
|
||||
if ((reinterpret_cast<uintptr_t>(p) & 2048) == 0)
|
||||
{
|
||||
memcpy(&n[0], p, 8);
|
||||
n[0] &= -1ul >> s;
|
||||
}
|
||||
else
|
||||
{
|
||||
const char * lp = x.data + x.size - 8;
|
||||
memcpy(&n[0], lp, 8);
|
||||
n[0] >>= s;
|
||||
}
|
||||
keyHolderDiscardKey(key_holder);
|
||||
return func(m1, k8, hash(k8));
|
||||
}
|
||||
case 1: // 9..16 bytes
|
||||
{
|
||||
memcpy(&n[0], p, 8);
|
||||
const char * lp = x.data + x.size - 8;
|
||||
memcpy(&n[1], lp, 8);
|
||||
n[1] >>= s;
|
||||
keyHolderDiscardKey(key_holder);
|
||||
return func(m2, k16, hash(k16));
|
||||
}
|
||||
case 2: // 17..24 bytes
|
||||
{
|
||||
memcpy(&n[0], p, 16);
|
||||
const char * lp = x.data + x.size - 8;
|
||||
memcpy(&n[2], lp, 8);
|
||||
n[2] >>= s;
|
||||
keyHolderDiscardKey(key_holder);
|
||||
return func(m3, k24, hash(k24));
|
||||
}
|
||||
default: // >= 25 bytes
|
||||
{
|
||||
return func(ms, std::forward<KeyHolder>(key_holder), hash(x));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct EmplaceCallable
|
||||
{
|
||||
LookupResult & mapped;
|
||||
bool & inserted;
|
||||
|
||||
EmplaceCallable(LookupResult & mapped_, bool & inserted_)
|
||||
: mapped(mapped_), inserted(inserted_) {}
|
||||
|
||||
template <typename Map, typename KeyHolder>
|
||||
void ALWAYS_INLINE operator()(Map & map, KeyHolder && key_holder, size_t hash)
|
||||
{
|
||||
typename Map::LookupResult result;
|
||||
map.emplace(key_holder, result, inserted, hash);
|
||||
mapped = lookupResultGetMapped(result);
|
||||
}
|
||||
};
|
||||
|
||||
template <typename KeyHolder>
|
||||
void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted)
|
||||
{
|
||||
this->dispatch(key_holder, EmplaceCallable(it, inserted));
|
||||
}
|
||||
|
||||
struct FindCallable
|
||||
{
|
||||
// find() doesn't need any key memory management, so we don't work with
|
||||
// any key holders here, only with normal keys. The key type is still
|
||||
// different for every subtable, this is why it is a template parameter.
|
||||
template <typename Submap, typename SubmapKey>
|
||||
LookupResult ALWAYS_INLINE operator()(Submap & map, const SubmapKey & key, size_t hash)
|
||||
{
|
||||
return lookupResultGetMapped(map.find(key, hash));
|
||||
}
|
||||
};
|
||||
|
||||
LookupResult ALWAYS_INLINE find(Key x)
|
||||
{
|
||||
return dispatch(x, FindCallable{});
|
||||
}
|
||||
|
||||
void write(DB::WriteBuffer & wb) const
|
||||
{
|
||||
m0.write(wb);
|
||||
m1.write(wb);
|
||||
m2.write(wb);
|
||||
m3.write(wb);
|
||||
ms.write(wb);
|
||||
}
|
||||
|
||||
void writeText(DB::WriteBuffer & wb) const
|
||||
{
|
||||
m0.writeText(wb);
|
||||
DB::writeChar(',', wb);
|
||||
m1.writeText(wb);
|
||||
DB::writeChar(',', wb);
|
||||
m2.writeText(wb);
|
||||
DB::writeChar(',', wb);
|
||||
m3.writeText(wb);
|
||||
DB::writeChar(',', wb);
|
||||
ms.writeText(wb);
|
||||
}
|
||||
|
||||
void read(DB::ReadBuffer & rb)
|
||||
{
|
||||
m0.read(rb);
|
||||
m1.read(rb);
|
||||
m2.read(rb);
|
||||
m3.read(rb);
|
||||
ms.read(rb);
|
||||
}
|
||||
|
||||
void readText(DB::ReadBuffer & rb)
|
||||
{
|
||||
m0.readText(rb);
|
||||
DB::assertChar(',', rb);
|
||||
m1.readText(rb);
|
||||
DB::assertChar(',', rb);
|
||||
m2.readText(rb);
|
||||
DB::assertChar(',', rb);
|
||||
m3.readText(rb);
|
||||
DB::assertChar(',', rb);
|
||||
ms.readText(rb);
|
||||
}
|
||||
|
||||
size_t size() const { return m0.size() + m1.size() + m2.size() + m3.size() + ms.size(); }
|
||||
|
||||
bool empty() const { return m0.empty() && m1.empty() && m2.empty() && m3.empty() && ms.empty(); }
|
||||
|
||||
size_t getBufferSizeInBytes() const
|
||||
{
|
||||
return m0.getBufferSizeInBytes() + m1.getBufferSizeInBytes() + m2.getBufferSizeInBytes() + m3.getBufferSizeInBytes()
|
||||
+ ms.getBufferSizeInBytes();
|
||||
}
|
||||
|
||||
void clearAndShrink()
|
||||
{
|
||||
m1.clearHasZero();
|
||||
m1.clearAndShrink();
|
||||
m2.clearAndShrink();
|
||||
m3.clearAndShrink();
|
||||
ms.clearAndShrink();
|
||||
}
|
||||
};
|
37
dbms/src/Common/HashTable/TwoLevelStringHashMap.h
Normal file
37
dbms/src/Common/HashTable/TwoLevelStringHashMap.h
Normal file
@ -0,0 +1,37 @@
|
||||
#pragma once
|
||||
|
||||
#include <Common/HashTable/StringHashMap.h>
|
||||
#include <Common/HashTable/TwoLevelStringHashTable.h>
|
||||
|
||||
template <typename TMapped, typename Allocator = HashTableAllocator, template <typename...> typename ImplTable = StringHashMap>
|
||||
class TwoLevelStringHashMap : public TwoLevelStringHashTable<StringHashMapSubMaps<TMapped, Allocator>, ImplTable<TMapped, Allocator>>
|
||||
{
|
||||
public:
|
||||
using Key = StringRef;
|
||||
using key_type = Key;
|
||||
using Self = TwoLevelStringHashMap;
|
||||
using Base = TwoLevelStringHashTable<StringHashMapSubMaps<TMapped, Allocator>, StringHashMap<TMapped, Allocator>>;
|
||||
using Base::Base;
|
||||
using typename Base::Impl;
|
||||
using mapped_type = TMapped;
|
||||
using value_type = typename Base::value_type;
|
||||
|
||||
using LookupResult = typename Base::LookupResult;
|
||||
|
||||
template <typename Func>
|
||||
void ALWAYS_INLINE forEachMapped(Func && func)
|
||||
{
|
||||
for (auto i = 0u; i < this->NUM_BUCKETS; ++i)
|
||||
return this->impls[i].forEachMapped(func);
|
||||
}
|
||||
|
||||
mapped_type & ALWAYS_INLINE operator[](Key x)
|
||||
{
|
||||
bool inserted;
|
||||
LookupResult it;
|
||||
emplace(x, it, inserted);
|
||||
if (inserted)
|
||||
new (lookupResultGetMapped(it)) mapped_type();
|
||||
return *lookupResultGetMapped(it);
|
||||
}
|
||||
};
|
218
dbms/src/Common/HashTable/TwoLevelStringHashTable.h
Normal file
218
dbms/src/Common/HashTable/TwoLevelStringHashTable.h
Normal file
@ -0,0 +1,218 @@
|
||||
#pragma once
|
||||
|
||||
#include <Common/HashTable/StringHashTable.h>
|
||||
|
||||
template <typename SubMaps, typename ImplTable = StringHashTable<SubMaps>, size_t BITS_FOR_BUCKET = 8>
|
||||
class TwoLevelStringHashTable : private boost::noncopyable
|
||||
{
|
||||
protected:
|
||||
using HashValue = size_t;
|
||||
using Self = TwoLevelStringHashTable;
|
||||
|
||||
public:
|
||||
using Key = StringRef;
|
||||
using Impl = ImplTable;
|
||||
|
||||
static constexpr size_t NUM_BUCKETS = 1ULL << BITS_FOR_BUCKET;
|
||||
static constexpr size_t MAX_BUCKET = NUM_BUCKETS - 1;
|
||||
|
||||
// TODO: currently hashing contains redundant computations when doing distributed or external aggregations
|
||||
size_t hash(const Key & x) const
|
||||
{
|
||||
return const_cast<Self &>(*this).dispatch(x,
|
||||
[&](const auto &, const auto &, size_t hash) { return hash; });
|
||||
}
|
||||
|
||||
size_t operator()(const Key & x) const { return hash(x); }
|
||||
|
||||
/// NOTE Bad for hash tables with more than 2^32 cells.
|
||||
static size_t getBucketFromHash(size_t hash_value) { return (hash_value >> (32 - BITS_FOR_BUCKET)) & MAX_BUCKET; }
|
||||
|
||||
public:
|
||||
using key_type = typename Impl::key_type;
|
||||
using value_type = typename Impl::value_type;
|
||||
using LookupResult = typename Impl::LookupResult;
|
||||
|
||||
Impl impls[NUM_BUCKETS];
|
||||
|
||||
TwoLevelStringHashTable() {}
|
||||
|
||||
template <typename Source>
|
||||
TwoLevelStringHashTable(const Source & src)
|
||||
{
|
||||
if (src.m0.hasZero())
|
||||
impls[0].m0.setHasZero(*src.m0.zeroValue());
|
||||
|
||||
for (auto & v : src.m1)
|
||||
{
|
||||
size_t hash_value = v.getHash(src.m1);
|
||||
size_t buck = getBucketFromHash(hash_value);
|
||||
impls[buck].m1.insertUniqueNonZero(&v, hash_value);
|
||||
}
|
||||
for (auto & v : src.m2)
|
||||
{
|
||||
size_t hash_value = v.getHash(src.m2);
|
||||
size_t buck = getBucketFromHash(hash_value);
|
||||
impls[buck].m2.insertUniqueNonZero(&v, hash_value);
|
||||
}
|
||||
for (auto & v : src.m3)
|
||||
{
|
||||
size_t hash_value = v.getHash(src.m3);
|
||||
size_t buck = getBucketFromHash(hash_value);
|
||||
impls[buck].m3.insertUniqueNonZero(&v, hash_value);
|
||||
}
|
||||
for (auto & v : src.ms)
|
||||
{
|
||||
size_t hash_value = v.getHash(src.ms);
|
||||
size_t buck = getBucketFromHash(hash_value);
|
||||
impls[buck].ms.insertUniqueNonZero(&v, hash_value);
|
||||
}
|
||||
}
|
||||
|
||||
// This function is mostly the same as StringHashTable::dispatch, but with
|
||||
// added bucket computation. See the comments there.
|
||||
template <typename Func, typename KeyHolder>
|
||||
decltype(auto) ALWAYS_INLINE dispatch(KeyHolder && key_holder, Func && func)
|
||||
{
|
||||
const StringRef & x = keyHolderGetKey(key_holder);
|
||||
const size_t sz = x.size;
|
||||
if (sz == 0)
|
||||
{
|
||||
static constexpr StringKey0 key0{};
|
||||
keyHolderDiscardKey(key_holder);
|
||||
return func(impls[0].m0, key0, 0);
|
||||
}
|
||||
|
||||
const char * p = x.data;
|
||||
// pending bits that needs to be shifted out
|
||||
const char s = (-sz & 7) * 8;
|
||||
union
|
||||
{
|
||||
StringKey8 k8;
|
||||
StringKey16 k16;
|
||||
StringKey24 k24;
|
||||
UInt64 n[3];
|
||||
};
|
||||
StringHashTableHash hash;
|
||||
switch ((sz - 1) >> 3)
|
||||
{
|
||||
case 0:
|
||||
{
|
||||
// first half page
|
||||
if ((reinterpret_cast<uintptr_t>(p) & 2048) == 0)
|
||||
{
|
||||
memcpy(&n[0], p, 8);
|
||||
n[0] &= -1ul >> s;
|
||||
}
|
||||
else
|
||||
{
|
||||
const char * lp = x.data + x.size - 8;
|
||||
memcpy(&n[0], lp, 8);
|
||||
n[0] >>= s;
|
||||
}
|
||||
auto res = hash(k8);
|
||||
auto buck = getBucketFromHash(res);
|
||||
keyHolderDiscardKey(key_holder);
|
||||
return func(impls[buck].m1, k8, res);
|
||||
}
|
||||
case 1:
|
||||
{
|
||||
memcpy(&n[0], p, 8);
|
||||
const char * lp = x.data + x.size - 8;
|
||||
memcpy(&n[1], lp, 8);
|
||||
n[1] >>= s;
|
||||
auto res = hash(k16);
|
||||
auto buck = getBucketFromHash(res);
|
||||
keyHolderDiscardKey(key_holder);
|
||||
return func(impls[buck].m2, k16, res);
|
||||
}
|
||||
case 2:
|
||||
{
|
||||
memcpy(&n[0], p, 16);
|
||||
const char * lp = x.data + x.size - 8;
|
||||
memcpy(&n[2], lp, 8);
|
||||
n[2] >>= s;
|
||||
auto res = hash(k24);
|
||||
auto buck = getBucketFromHash(res);
|
||||
keyHolderDiscardKey(key_holder);
|
||||
return func(impls[buck].m3, k24, res);
|
||||
}
|
||||
default:
|
||||
{
|
||||
auto res = hash(x);
|
||||
auto buck = getBucketFromHash(res);
|
||||
return func(impls[buck].ms, std::forward<KeyHolder>(key_holder), res);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename KeyHolder>
|
||||
void ALWAYS_INLINE emplace(KeyHolder && key_holder, LookupResult & it, bool & inserted)
|
||||
{
|
||||
dispatch(key_holder, typename Impl::EmplaceCallable{it, inserted});
|
||||
}
|
||||
|
||||
LookupResult ALWAYS_INLINE find(Key x)
|
||||
{
|
||||
return dispatch(x, typename Impl::FindCallable{});
|
||||
}
|
||||
|
||||
void write(DB::WriteBuffer & wb) const
|
||||
{
|
||||
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
||||
impls[i].write(wb);
|
||||
}
|
||||
|
||||
void writeText(DB::WriteBuffer & wb) const
|
||||
{
|
||||
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
||||
{
|
||||
if (i != 0)
|
||||
DB::writeChar(',', wb);
|
||||
impls[i].writeText(wb);
|
||||
}
|
||||
}
|
||||
|
||||
void read(DB::ReadBuffer & rb)
|
||||
{
|
||||
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
||||
impls[i].read(rb);
|
||||
}
|
||||
|
||||
void readText(DB::ReadBuffer & rb)
|
||||
{
|
||||
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
||||
{
|
||||
if (i != 0)
|
||||
DB::assertChar(',', rb);
|
||||
impls[i].readText(rb);
|
||||
}
|
||||
}
|
||||
|
||||
size_t size() const
|
||||
{
|
||||
size_t res = 0;
|
||||
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
||||
res += impls[i].size();
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
bool empty() const
|
||||
{
|
||||
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
||||
if (!impls[i].empty())
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t getBufferSizeInBytes() const
|
||||
{
|
||||
size_t res = 0;
|
||||
for (size_t i = 0; i < NUM_BUCKETS; ++i)
|
||||
res += impls[i].getBufferSizeInBytes();
|
||||
|
||||
return res;
|
||||
}
|
||||
};
|
@ -11,6 +11,9 @@
|
||||
#include <Common/HashTable/FixedHashMap.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/HashTable/TwoLevelHashMap.h>
|
||||
#include <Common/HashTable/StringHashMap.h>
|
||||
#include <Common/HashTable/TwoLevelStringHashMap.h>
|
||||
|
||||
#include <Common/ThreadPool.h>
|
||||
#include <Common/UInt128.h>
|
||||
#include <Common/LRUCache.h>
|
||||
@ -69,12 +72,20 @@ using AggregatedDataWithUInt8Key = FixedHashMap<UInt8, AggregateDataPtr>;
|
||||
using AggregatedDataWithUInt16Key = FixedHashMap<UInt16, AggregateDataPtr>;
|
||||
|
||||
using AggregatedDataWithUInt64Key = HashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>>;
|
||||
|
||||
using AggregatedDataWithShortStringKey = StringHashMap<AggregateDataPtr>;
|
||||
|
||||
using AggregatedDataWithStringKey = HashMapWithSavedHash<StringRef, AggregateDataPtr>;
|
||||
|
||||
using AggregatedDataWithKeys128 = HashMap<UInt128, AggregateDataPtr, UInt128HashCRC32>;
|
||||
using AggregatedDataWithKeys256 = HashMap<UInt256, AggregateDataPtr, UInt256HashCRC32>;
|
||||
|
||||
using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>>;
|
||||
|
||||
using AggregatedDataWithShortStringKeyTwoLevel = TwoLevelStringHashMap<AggregateDataPtr>;
|
||||
|
||||
using AggregatedDataWithStringKeyTwoLevel = TwoLevelHashMapWithSavedHash<StringRef, AggregateDataPtr>;
|
||||
|
||||
using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap<UInt128, AggregateDataPtr, UInt128HashCRC32>;
|
||||
using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap<UInt256, AggregateDataPtr, UInt256HashCRC32>;
|
||||
|
||||
@ -139,6 +150,8 @@ struct AggregationDataWithNullKeyTwoLevel : public Base
|
||||
|
||||
template <typename ... Types>
|
||||
using HashTableWithNullKey = AggregationDataWithNullKey<HashMapTable<Types ...>>;
|
||||
template <typename ... Types>
|
||||
using StringHashTableWithNullKey = AggregationDataWithNullKey<StringHashMap<Types ...>>;
|
||||
|
||||
using AggregatedDataWithNullableUInt8Key = AggregationDataWithNullKey<AggregatedDataWithUInt8Key>;
|
||||
using AggregatedDataWithNullableUInt16Key = AggregationDataWithNullKey<AggregatedDataWithUInt16Key>;
|
||||
@ -149,6 +162,10 @@ using AggregatedDataWithNullableStringKey = AggregationDataWithNullKey<Aggregate
|
||||
using AggregatedDataWithNullableUInt64KeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
|
||||
TwoLevelHashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>,
|
||||
TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>;
|
||||
|
||||
using AggregatedDataWithNullableShortStringKeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
|
||||
TwoLevelStringHashMap<AggregateDataPtr, HashTableAllocator, StringHashTableWithNullKey>>;
|
||||
|
||||
using AggregatedDataWithNullableStringKeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
|
||||
TwoLevelHashMapWithSavedHash<StringRef, AggregateDataPtr, DefaultHash<StringRef>,
|
||||
TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>;
|
||||
@ -216,6 +233,32 @@ struct AggregationMethodString
|
||||
};
|
||||
|
||||
|
||||
/// Same as above but without cache
|
||||
template <typename TData>
|
||||
struct AggregationMethodStringNoCache
|
||||
{
|
||||
using Data = TData;
|
||||
using Key = typename Data::key_type;
|
||||
using Mapped = typename Data::mapped_type;
|
||||
|
||||
Data data;
|
||||
|
||||
AggregationMethodStringNoCache() {}
|
||||
|
||||
template <typename Other>
|
||||
AggregationMethodStringNoCache(const Other & other) : data(other.data) {}
|
||||
|
||||
using State = ColumnsHashing::HashMethodString<typename Data::value_type, Mapped, true, false>;
|
||||
|
||||
static const bool low_cardinality_optimization = false;
|
||||
|
||||
static void insertKeyIntoColumns(const StringRef & key, MutableColumns & key_columns, const Sizes &)
|
||||
{
|
||||
key_columns[0]->insertData(key.data, key.size);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// For the case where there is one fixed-length string key.
|
||||
template <typename TData>
|
||||
struct AggregationMethodFixedString
|
||||
@ -241,6 +284,32 @@ struct AggregationMethodFixedString
|
||||
}
|
||||
};
|
||||
|
||||
/// Same as above but without cache
|
||||
template <typename TData>
|
||||
struct AggregationMethodFixedStringNoCache
|
||||
{
|
||||
using Data = TData;
|
||||
using Key = typename Data::key_type;
|
||||
using Mapped = typename Data::mapped_type;
|
||||
|
||||
Data data;
|
||||
|
||||
AggregationMethodFixedStringNoCache() {}
|
||||
|
||||
template <typename Other>
|
||||
AggregationMethodFixedStringNoCache(const Other & other) : data(other.data) {}
|
||||
|
||||
using State = ColumnsHashing::HashMethodFixedString<typename Data::value_type, Mapped, true, false>;
|
||||
|
||||
static const bool low_cardinality_optimization = false;
|
||||
|
||||
static void insertKeyIntoColumns(const StringRef & key, MutableColumns & key_columns, const Sizes &)
|
||||
{
|
||||
key_columns[0]->insertData(key.data, key.size);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/// Single low cardinality column.
|
||||
template <typename SingleColumnMethod>
|
||||
struct AggregationMethodSingleLowCardinalityColumn : public SingleColumnMethod
|
||||
@ -434,16 +503,16 @@ struct AggregatedDataVariants : private boost::noncopyable
|
||||
|
||||
std::unique_ptr<AggregationMethodOneNumber<UInt32, AggregatedDataWithUInt64Key>> key32;
|
||||
std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64Key>> key64;
|
||||
std::unique_ptr<AggregationMethodString<AggregatedDataWithStringKey>> key_string;
|
||||
std::unique_ptr<AggregationMethodFixedString<AggregatedDataWithStringKey>> key_fixed_string;
|
||||
std::unique_ptr<AggregationMethodStringNoCache<AggregatedDataWithShortStringKey>> key_string;
|
||||
std::unique_ptr<AggregationMethodFixedStringNoCache<AggregatedDataWithShortStringKey>> key_fixed_string;
|
||||
std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128>> keys128;
|
||||
std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256>> keys256;
|
||||
std::unique_ptr<AggregationMethodSerialized<AggregatedDataWithStringKey>> serialized;
|
||||
|
||||
std::unique_ptr<AggregationMethodOneNumber<UInt32, AggregatedDataWithUInt64KeyTwoLevel>> key32_two_level;
|
||||
std::unique_ptr<AggregationMethodOneNumber<UInt64, AggregatedDataWithUInt64KeyTwoLevel>> key64_two_level;
|
||||
std::unique_ptr<AggregationMethodString<AggregatedDataWithStringKeyTwoLevel>> key_string_two_level;
|
||||
std::unique_ptr<AggregationMethodFixedString<AggregatedDataWithStringKeyTwoLevel>> key_fixed_string_two_level;
|
||||
std::unique_ptr<AggregationMethodStringNoCache<AggregatedDataWithShortStringKeyTwoLevel>> key_string_two_level;
|
||||
std::unique_ptr<AggregationMethodFixedStringNoCache<AggregatedDataWithShortStringKeyTwoLevel>> key_fixed_string_two_level;
|
||||
std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys128TwoLevel>> keys128_two_level;
|
||||
std::unique_ptr<AggregationMethodKeysFixed<AggregatedDataWithKeys256TwoLevel>> keys256_two_level;
|
||||
std::unique_ptr<AggregationMethodSerialized<AggregatedDataWithStringKeyTwoLevel>> serialized_two_level;
|
||||
|
@ -37,6 +37,12 @@ add_executable (hash_map_string_small hash_map_string_small.cpp)
|
||||
target_include_directories (hash_map_string_small SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR})
|
||||
target_link_libraries (hash_map_string_small PRIVATE dbms)
|
||||
|
||||
add_executable (string_hash_map string_hash_map.cpp)
|
||||
target_link_libraries (string_hash_map PRIVATE dbms)
|
||||
|
||||
add_executable (string_hash_map_aggregation string_hash_map.cpp)
|
||||
target_link_libraries (string_hash_map_aggregation PRIVATE dbms)
|
||||
|
||||
add_executable (two_level_hash_map two_level_hash_map.cpp)
|
||||
target_include_directories (two_level_hash_map SYSTEM BEFORE PRIVATE ${SPARSEHASH_INCLUDE_DIR})
|
||||
target_link_libraries (two_level_hash_map PRIVATE dbms)
|
||||
|
246
dbms/src/Interpreters/tests/string_hash_map.cpp
Normal file
246
dbms/src/Interpreters/tests/string_hash_map.cpp
Normal file
@ -0,0 +1,246 @@
|
||||
#include <iomanip>
|
||||
#include <iostream>
|
||||
#include <vector>
|
||||
#include <Compression/CompressedReadBuffer.h>
|
||||
#include <Core/Types.h>
|
||||
#include <IO/ReadBufferFromFile.h>
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <Interpreters/AggregationCommon.h>
|
||||
#include <Common/HashTable/HashMap.h>
|
||||
#include <Common/HashTable/HashTableKeyHolder.h>
|
||||
#include <Common/HashTable/StringHashMap.h>
|
||||
#include <Common/Stopwatch.h>
|
||||
#include <common/StringRef.h>
|
||||
|
||||
/**
|
||||
|
||||
#include <fstream>
|
||||
#include <random>
|
||||
|
||||
using namespace std;
|
||||
|
||||
int main()
|
||||
{
|
||||
std::string s;
|
||||
std::random_device dev;
|
||||
std::mt19937 rng(dev());
|
||||
std::uniform_int_distribution<std::mt19937::result_type> dist(0, 25);
|
||||
std::binomial_distribution<std::mt19937::result_type> binomial1(100, 0.01);
|
||||
std::binomial_distribution<std::mt19937::result_type> binomial2(100, 0.02);
|
||||
std::binomial_distribution<std::mt19937::result_type> binomial4(100, 0.04);
|
||||
std::binomial_distribution<std::mt19937::result_type> binomial8(100, 0.08);
|
||||
std::binomial_distribution<std::mt19937::result_type> binomial16(100, 0.16);
|
||||
std::binomial_distribution<std::mt19937::result_type> binomial24(100, 0.24);
|
||||
std::binomial_distribution<std::mt19937::result_type> binomial48(100, 0.48);
|
||||
// 11GB
|
||||
std::ofstream f("/tmp/terms.csv");
|
||||
size_t l1, l2, l4, l8, l16, l24, l48;
|
||||
for (auto n = 0ul; n < 1e8; ++n)
|
||||
{
|
||||
l1 = binomial1(rng) + 1;
|
||||
l2 = binomial2(rng) + l1 + 1;
|
||||
l4 = binomial4(rng) + l2 + 1;
|
||||
l8 = binomial8(rng) + l4 + 1;
|
||||
l16 = binomial16(rng) + l8 + 1;
|
||||
l24 = binomial24(rng) + l16 + 1;
|
||||
l48 = binomial48(rng) + l24 + 1;
|
||||
s.resize(l48);
|
||||
for (auto i = 0ul; i < l48 - 1; ++i)
|
||||
s[i] = 'a' + dist(rng);
|
||||
s[l1 - 1] = ',';
|
||||
s[l2 - 1] = ',';
|
||||
s[l4 - 1] = ',';
|
||||
s[l8 - 1] = ',';
|
||||
s[l16 - 1] = ',';
|
||||
s[l24 - 1] = ',';
|
||||
s[l48 - 1] = '\n';
|
||||
f << s;
|
||||
}
|
||||
f.close();
|
||||
return 0;
|
||||
}
|
||||
|
||||
create table terms (term1 String, term2 String, term4 String, term8 String, term16 String, term24 String, term48 String) engine TinyLog;
|
||||
insert into terms select * from file('/tmp/terms.csv', CSV, 'a String, b String, c String, d String, e String, f String, g String');
|
||||
|
||||
NOTE: for reliable test results, try isolating cpu cores and do python -m perf tune. Also bind numa nodes if any.
|
||||
# isolate cpu 18
|
||||
dir=/home/amos/git/chorigin/data/data/default/terms
|
||||
for file in term1 term2 term4 term8 term16 term24 term48; do
|
||||
for size in 30000000 50000000 80000000 100000000; do
|
||||
BEST_METHOD=0
|
||||
BEST_RESULT=0
|
||||
for method in {1..2}; do
|
||||
echo -ne $file $size $method ''
|
||||
numactl --membind=0 taskset -c 18 ./string_hash_map $size $method <"$dir"/"$file".bin 2>&1 | perl -nE 'say /([0-9\.]+) elem/g if /HashMap/' | tee /tmp/string_hash_map_res
|
||||
CUR_RESULT=$(cat /tmp/string_hash_map_res | tr -d '.')
|
||||
if [[ $CUR_RESULT -gt $BEST_RESULT ]]; then
|
||||
BEST_METHOD=$method
|
||||
BEST_RESULT=$CUR_RESULT
|
||||
fi
|
||||
done
|
||||
echo Best: $BEST_METHOD - $BEST_RESULT
|
||||
done
|
||||
done
|
||||
|
||||
---------------------------
|
||||
|
||||
term1 30000000 1 68785770.85 term2 30000000 1 42531788.83 term4 30000000 1 14759901.41 term8 30000000 1 8072903.47
|
||||
term1 30000000 2 40812128.16 term2 30000000 2 21352402.10 term4 30000000 2 9008907.80 term8 30000000 2 5822641.82
|
||||
Best: 1 - 6878577085 Best: 1 - 4253178883 Best: 1 - 1475990141 Best: 1 - 807290347
|
||||
term1 50000000 1 68027542.41 term2 50000000 1 40493742.80 term4 50000000 1 16827650.85 term8 50000000 1 7405230.14
|
||||
term1 50000000 2 37589806.02 term2 50000000 2 19362975.09 term4 50000000 2 8278094.11 term8 50000000 2 5106810.80
|
||||
Best: 1 - 6802754241 Best: 1 - 4049374280 Best: 1 - 1682765085 Best: 1 - 740523014
|
||||
term1 80000000 1 68651875.88 term2 80000000 1 38253695.50 term4 80000000 1 15847177.93 term8 80000000 1 7536319.25
|
||||
term1 80000000 2 38092189.20 term2 80000000 2 20287003.01 term4 80000000 2 9322770.34 term8 80000000 2 4355572.15
|
||||
Best: 1 - 6865187588 Best: 1 - 3825369550 Best: 1 - 1584717793 Best: 1 - 753631925
|
||||
term1 100000000 1 68641941.59 term2 100000000 1 39120834.79 term4 100000000 1 16773904.90 term8 100000000 1 7147146.55
|
||||
term1 100000000 2 38358006.72 term2 100000000 2 20629363.17 term4 100000000 2 9665201.92 term8 100000000 2 4728255.07
|
||||
Best: 1 - 6864194159 Best: 1 - 3912083479 Best: 1 - 1677390490 Best: 1 - 714714655
|
||||
|
||||
|
||||
term16 30000000 1 6823029.35 term24 30000000 1 5706271.14 term48 30000000 1 4695716.47
|
||||
term16 30000000 2 5672283.33 term24 30000000 2 5498855.56 term48 30000000 2 4860537.26
|
||||
Best: 1 - 682302935 Best: 1 - 570627114 Best: 2 - 486053726
|
||||
term16 50000000 1 6214581.25 term24 50000000 1 5249785.66 term48 50000000 1 4282606.12
|
||||
term16 50000000 2 4990361.44 term24 50000000 2 4855552.24 term48 50000000 2 4348923.29
|
||||
Best: 1 - 621458125 Best: 1 - 524978566 Best: 2 - 434892329
|
||||
term16 80000000 1 5382855.70 term24 80000000 1 4580133.04 term48 80000000 1 3779436.15
|
||||
term16 80000000 2 4282192.79 term24 80000000 2 4178791.09 term48 80000000 2 3788409.72
|
||||
Best: 1 - 538285570 Best: 1 - 458013304 Best: 2 - 378840972
|
||||
term16 100000000 1 5930103.42 term24 100000000 1 5030621.52 term48 100000000 1 4084666.73
|
||||
term16 100000000 2 4621719.60 term24 100000000 2 4499866.83 term48 100000000 2 4067029.31
|
||||
Best: 1 - 593010342 Best: 1 - 503062152 Best: 1 - 408466673
|
||||
|
||||
*/
|
||||
|
||||
|
||||
using Value = uint64_t;
|
||||
|
||||
template <typename Map>
|
||||
void NO_INLINE bench(const std::vector<StringRef> & data, DB::Arena &, const char * name)
|
||||
{
|
||||
// warm up
|
||||
/*
|
||||
{
|
||||
Map map;
|
||||
typename Map::LookupResult it;
|
||||
bool inserted;
|
||||
|
||||
for (size_t i = 0, size = data.size(); i < size; ++i)
|
||||
{
|
||||
auto key_holder = DB::ArenaKeyHolder{data[i], pool};
|
||||
map.emplace(key_holder, it, inserted);
|
||||
if (inserted)
|
||||
it->getSecond() = 0;
|
||||
++it->getSecond();
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
std::cerr << "method " << name << std::endl;
|
||||
for (auto t = 0ul; t < 7; ++t)
|
||||
{
|
||||
DB::Arena pool(128 * 1024 * 1024);
|
||||
Stopwatch watch;
|
||||
Map map;
|
||||
typename Map::LookupResult it;
|
||||
bool inserted;
|
||||
|
||||
for (size_t i = 0, size = data.size(); i < size; ++i)
|
||||
{
|
||||
map.emplace(DB::ArenaKeyHolder{data[i], pool}, it, inserted);
|
||||
if (inserted)
|
||||
*lookupResultGetMapped(it) = 0;
|
||||
++*lookupResultGetMapped(it);
|
||||
}
|
||||
watch.stop();
|
||||
|
||||
std::cerr << "arena-memory " << pool.size() + map.getBufferSizeInBytes() << std::endl;
|
||||
std::cerr << "single-run " << std::setprecision(3)
|
||||
<< watch.elapsedSeconds() << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
template <typename Map>
|
||||
runFromFile()
|
||||
{
|
||||
DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO);
|
||||
DB::CompressedReadBuffer in2(in1);
|
||||
|
||||
Map map;
|
||||
DB::Arena pool(128 * 1024 * 1024);
|
||||
for (size_t i = 0; i < n && !in2.eof(); ++i)
|
||||
{
|
||||
auto key = DB::readStringBinaryInto(pool, in2);
|
||||
|
||||
bool inserted;
|
||||
Map::LookupResult mapped;
|
||||
map.emplaceKeyHolder(DB::SerializedKeyHolder(key, pool), mapped, inserted);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename Map>
|
||||
benchFromFile()
|
||||
{
|
||||
double best_time = -1.;
|
||||
for (auto t = 0ul; t < 50; ++t)
|
||||
{
|
||||
Stopwatch watch;
|
||||
runFromFile();
|
||||
watch.stop();
|
||||
|
||||
if (best_time < 0 || best_time > watch.elapsedSeconds())
|
||||
{
|
||||
best_time = watch.elapsedSeconds();
|
||||
}
|
||||
}
|
||||
|
||||
std::cerr << std::fixed << std::setprecision(2) << "HashMap (" << name << "). Elapsed: " << best_time << " (" << data.size() / best_time
|
||||
<< " elem/sec.)" << std::endl;
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
int main(int argc, char ** argv)
|
||||
{
|
||||
if (argc < 3)
|
||||
{
|
||||
std::cerr << "Usage: program n m\n";
|
||||
return 1;
|
||||
}
|
||||
|
||||
size_t n = atoi(argv[1]);
|
||||
size_t m = atoi(argv[2]);
|
||||
|
||||
DB::Arena pool(128 * 1024 * 1024);
|
||||
std::vector<StringRef> data(n);
|
||||
|
||||
std::cerr << "sizeof(Key) = " << sizeof(StringRef) << ", sizeof(Value) = " << sizeof(Value) << std::endl;
|
||||
|
||||
{
|
||||
Stopwatch watch;
|
||||
DB::ReadBufferFromFileDescriptor in1(STDIN_FILENO);
|
||||
DB::CompressedReadBuffer in2(in1);
|
||||
|
||||
std::string tmp;
|
||||
for (size_t i = 0; i < n && !in2.eof(); ++i)
|
||||
{
|
||||
DB::readStringBinary(tmp, in2);
|
||||
data[i] = StringRef(pool.insert(tmp.data(), tmp.size()), tmp.size());
|
||||
}
|
||||
|
||||
watch.stop();
|
||||
std::cerr << std::fixed << std::setprecision(2) << "Vector. Size: " << n << ", elapsed: " << watch.elapsedSeconds() << " ("
|
||||
<< n / watch.elapsedSeconds() << " elem/sec.)" << std::endl;
|
||||
}
|
||||
|
||||
if (!m || m == 1)
|
||||
bench<StringHashMap<Value>>(data, pool, "StringHashMap");
|
||||
if (!m || m == 2)
|
||||
bench<HashMapWithSavedHash<StringRef, Value>>(data, pool, "HashMapWithSavedHash");
|
||||
if (!m || m == 3)
|
||||
bench<HashMap<StringRef, Value>>(data, pool, "HashMap");
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user