diff --git a/dbms/include/DB/Common/HashTable/HashTable.h b/dbms/include/DB/Common/HashTable/HashTable.h index 4b7be434a0b..c4b797c7439 100644 --- a/dbms/include/DB/Common/HashTable/HashTable.h +++ b/dbms/include/DB/Common/HashTable/HashTable.h @@ -189,7 +189,7 @@ struct ZeroValueStorage { private: bool has_zero = false; - char zero_value_storage[sizeof(Cell)]; /// Кусок памяти для элемента с ключём 0. + char zero_value_storage[sizeof(Cell)] __attribute__((__aligned__(__alignof__(Cell)))); /// Кусок памяти для элемента с ключём 0. public: bool hasZero() const { return has_zero; } diff --git a/dbms/include/DB/Common/HashTable/TwoLevelHashTable.h b/dbms/include/DB/Common/HashTable/TwoLevelHashTable.h index 51a54590d04..d80d8cae97f 100644 --- a/dbms/include/DB/Common/HashTable/TwoLevelHashTable.h +++ b/dbms/include/DB/Common/HashTable/TwoLevelHashTable.h @@ -40,34 +40,28 @@ protected: size_t m_size = 0; /// Количество элементов size_t hash(const Key & x) const { return Hash::operator()(x); } - size_t bucket(size_t hash_value) const { return hash_value >> 24; } /// TODO: брать не настолько младший байт. + size_t bucket(size_t hash_value) const { return hash_value >> 56; } typename Impl::iterator beginOfNextNonEmptyBucket(size_t & bucket) { - do - { + while (bucket != NUM_BUCKETS && impls[bucket].empty()) ++bucket; - } - while (bucket != NUM_BUCKETS && !impls[bucket].empty()); if (bucket != NUM_BUCKETS) return impls[bucket].begin(); - return impls[NUM_BUCKETS - 1].end(); + return impls[MAX_BUCKET].end(); } typename Impl::const_iterator beginOfNextNonEmptyBucket(size_t & bucket) const { - do - { + while (bucket != NUM_BUCKETS && impls[bucket].empty()) ++bucket; - } - while (bucket != NUM_BUCKETS && !impls[bucket].empty()); if (bucket != NUM_BUCKETS) return impls[bucket].begin(); - return impls[NUM_BUCKETS - 1].end(); + return impls[MAX_BUCKET].end(); } public: @@ -75,19 +69,20 @@ public: typedef typename Impl::value_type value_type; static constexpr size_t NUM_BUCKETS = 256; + static constexpr size_t MAX_BUCKET = NUM_BUCKETS - 1; Impl impls[NUM_BUCKETS]; class iterator { - Impl * impls; + Self * container; size_t bucket; typename Impl::iterator current_it; friend class TwoLevelHashTable; - iterator(Impl * impls_, size_t bucket_, typename Impl::iterator & current_it_) - : impls(impls_), bucket(bucket_), current_it(current_it_) {} + iterator(Self * container_, size_t bucket_, typename Impl::iterator current_it_) + : container(container_), bucket(bucket_), current_it(current_it_) {} public: iterator() {} @@ -98,8 +93,11 @@ public: iterator & operator++() { ++current_it; - if (current_it == impls[bucket].end()) - current_it = beginOfNextNonEmptyBucket(bucket); + if (current_it == container->impls[bucket].end()) + { + ++bucket; + current_it = container->beginOfNextNonEmptyBucket(bucket); + } return *this; } @@ -111,18 +109,18 @@ public: class const_iterator { - Impl * impls; + Self * container; size_t bucket; typename Impl::const_iterator current_it; friend class TwoLevelHashTable; - const_iterator(Impl * impls_, size_t bucket_, typename Impl::const_iterator & current_it_) - : impls(impls_), bucket(bucket_), current_it(current_it_) {} + const_iterator(Self * container_, size_t bucket_, typename Impl::const_iterator current_it_) + : container(container_), bucket(bucket_), current_it(current_it_) {} public: const_iterator() {} - const_iterator(const iterator & rhs) : impls(rhs.impls), current_it(rhs.current_it), bucket(rhs.bucket) {} + const_iterator(const iterator & rhs) : container(rhs.container), bucket(rhs.bucket), current_it(rhs.current_it) {} bool operator== (const const_iterator & rhs) const { return current_it == rhs.current_it; } bool operator!= (const const_iterator & rhs) const { return current_it != rhs.current_it; } @@ -130,8 +128,11 @@ public: const_iterator & operator++() { ++current_it; - if (current_it == impls[bucket].end()) - current_it = beginOfNextNonEmptyBucket(bucket); + if (current_it == container->impls[bucket].end()) + { + ++bucket; + current_it = container->beginOfNextNonEmptyBucket(bucket); + } return *this; } @@ -144,17 +145,19 @@ public: const_iterator begin() const { size_t buck = 0; - return beginOfNextNonEmptyBucket(buck); + typename Impl::const_iterator impl_it = beginOfNextNonEmptyBucket(buck); + return { this, buck, impl_it }; } iterator begin() { size_t buck = 0; - return beginOfNextNonEmptyBucket(buck); + typename Impl::iterator impl_it = beginOfNextNonEmptyBucket(buck); + return { this, buck, impl_it }; } - const_iterator end() const { return impls[NUM_BUCKETS - 1].end(); } - iterator end() { return impls[NUM_BUCKETS - 1].end(); } + const_iterator end() const { return { this, MAX_BUCKET, impls[MAX_BUCKET].end() }; } + iterator end() { return { this, MAX_BUCKET, impls[MAX_BUCKET].end() }; } /// Вставить значение. В случае хоть сколько-нибудь сложных значений, лучше используйте функцию emplace. @@ -196,7 +199,7 @@ public: size_t buck = bucket(hash_value); typename Impl::iterator impl_it; impls[buck].emplace(x, impl_it, inserted); - it = iterator(impls, buck, impl_it); + it = iterator(this, buck, impl_it); if (inserted) ++m_size; @@ -210,7 +213,7 @@ public: typename Impl::iterator found = impls[buck].find(x); return found != impls[buck].end() - ? iterator(impls, buck, found) + ? iterator(this, buck, found) : end(); } @@ -222,7 +225,7 @@ public: typename Impl::const_iterator found = impls[buck].find(x); return found != impls[buck].end() - ? const_iterator(impls, buck, found) + ? const_iterator(this, buck, found) : end(); } diff --git a/dbms/src/Interpreters/tests/hash_map_string.cpp b/dbms/src/Interpreters/tests/hash_map_string.cpp index ed8a655bf43..1c3b5d33bf7 100644 --- a/dbms/src/Interpreters/tests/hash_map_string.cpp +++ b/dbms/src/Interpreters/tests/hash_map_string.cpp @@ -81,35 +81,160 @@ struct DefaultHash }; +#define mix(h) ({ \ + (h) ^= (h) >> 23; \ + (h) *= 0x2127599bf4325c37ULL; \ + (h) ^= (h) >> 47; }) + +struct FastHash64 +{ + size_t operator() (CompactStringRef x) const + { + const char * buf = x.data(); + size_t len = x.size; + + const uint64_t m = 0x880355f21e6d1965ULL; + const uint64_t *pos = (const uint64_t *)buf; + const uint64_t *end = pos + (len / 8); + const unsigned char *pos2; + uint64_t h = len * m; + uint64_t v; + + while (pos != end) { + v = *pos++; + h ^= mix(v); + h *= m; + } + + pos2 = (const unsigned char*)pos; + v = 0; + + switch (len & 7) { + case 7: v ^= (uint64_t)pos2[6] << 48; + case 6: v ^= (uint64_t)pos2[5] << 40; + case 5: v ^= (uint64_t)pos2[4] << 32; + case 4: v ^= (uint64_t)pos2[3] << 24; + case 3: v ^= (uint64_t)pos2[2] << 16; + case 2: v ^= (uint64_t)pos2[1] << 8; + case 1: v ^= (uint64_t)pos2[0]; + h ^= mix(v); + h *= m; + } + + return mix(h); + } +}; + + +struct CrapWow +{ + size_t operator() (CompactStringRef x) const + { + const char * key = x.data(); + size_t len = x.size; + size_t seed = 0; + + const UInt64 m = 0x95b47aa3355ba1a1, n = 0x8a970be7488fda55; + UInt64 hash; + // 3 = m, 4 = n + // r12 = h, r13 = k, ecx = seed, r12 = key + asm( + "leaq (%%rcx,%4), %%r13\n" + "movq %%rdx, %%r14\n" + "movq %%rcx, %%r15\n" + "movq %%rcx, %%r12\n" + "addq %%rax, %%r13\n" + "andq $0xfffffffffffffff0, %%rcx\n" + "jz QW%=\n" + "addq %%rcx, %%r14\n\n" + "negq %%rcx\n" + "XW%=:\n" + "movq %4, %%rax\n" + "mulq (%%r14,%%rcx)\n" + "xorq %%rax, %%r12\n" + "xorq %%rdx, %%r13\n" + "movq %3, %%rax\n" + "mulq 8(%%r14,%%rcx)\n" + "xorq %%rdx, %%r12\n" + "xorq %%rax, %%r13\n" + "addq $16, %%rcx\n" + "jnz XW%=\n" + "QW%=:\n" + "movq %%r15, %%rcx\n" + "andq $8, %%r15\n" + "jz B%=\n" + "movq %4, %%rax\n" + "mulq (%%r14)\n" + "addq $8, %%r14\n" + "xorq %%rax, %%r12\n" + "xorq %%rdx, %%r13\n" + "B%=:\n" + "andq $7, %%rcx\n" + "jz F%=\n" + "movq $1, %%rdx\n" + "shlq $3, %%rcx\n" + "movq %3, %%rax\n" + "shlq %%cl, %%rdx\n" + "addq $-1, %%rdx\n" + "andq (%%r14), %%rdx\n" + "mulq %%rdx\n" + "xorq %%rdx, %%r12\n" + "xorq %%rax, %%r13\n" + "F%=:\n" + "leaq (%%r13,%4), %%rax\n" + "xorq %%r12, %%rax\n" + "mulq %4\n" + "xorq %%rdx, %%rax\n" + "xorq %%r12, %%rax\n" + "xorq %%r13, %%rax\n" + : "=a"(hash), "=c"(key), "=d"(key) + : "r"(m), "r"(n), "a"(seed), "c"(len), "d"(key) + : "%r12", "%r13", "%r14", "%r15", "cc" + ); + return hash; + } +}; + + +struct SimpleHash +{ + size_t operator() (CompactStringRef x) const + { + const char * pos = x.data(); + size_t size = x.size; + + const char * end = pos + size; + + size_t res = 0; + + if (size == 0) + return 0; + + if (size < 8) + { + memcpy(reinterpret_cast(&res), pos, size); + return intHash64(res); + } + + while (pos + 8 < end) + { + UInt64 word = *reinterpret_cast(pos); + res = intHash64(word ^ res); + + pos += 8; + } + + UInt64 word = *reinterpret_cast(end - 8); + res = intHash64(word ^ res); + + return res; + } +}; + + typedef CompactStringRef Key; typedef UInt64 Value; -struct CellWithSavedHash : public HashMapCell > -{ - size_t saved_hash; - - CellWithSavedHash() : HashMapCell() {} - CellWithSavedHash(const Key & key_, const State & state) : HashMapCell(key_, state) {} - CellWithSavedHash(const value_type & value_, const State & state) : HashMapCell(value_, state) {} - -/* static bool equals(const StringRef & lhs, const StringRef & rhs) - { - if (lhs.size != rhs.size) - return false; - - for (size_t pos = lhs.size - 1; pos < lhs.size; --pos) - if (lhs.data[pos] != rhs.data[pos]) - return false; - - return true; - }*/ - - bool keyEquals(const Key & key_) const { return value.first == key_; } - bool keyEquals(const CellWithSavedHash & other) const { return saved_hash == other.saved_hash && value.first == other.value.first; } - - void setHash(size_t hash_value) { saved_hash = hash_value; } - size_t getHash(const DefaultHash & hash) const { return saved_hash; } -}; struct Grower : public HashTableGrower<> { @@ -188,7 +313,7 @@ int main(int argc, char ** argv) //typedef HashMap Map; /// Сохранение хэша ускоряет ресайзы примерно в 2 раза, и общую производительность - на 6-8%. - typedef HashMapTable, Grower> Map; + typedef HashMapWithSavedHash, Grower> Map; Map map; Map::iterator it; @@ -204,7 +329,7 @@ int main(int argc, char ** argv) watch.stop(); std::cerr << std::fixed << std::setprecision(2) - << "HashMap. Size: " << map.size() + << "HashMap (CityHash64). Size: " << map.size() << ", elapsed: " << watch.elapsedSeconds() << " (" << n / watch.elapsedSeconds() << " elem/sec.)" #ifdef DBMS_HASH_MAP_COUNT_COLLISIONS @@ -217,6 +342,93 @@ int main(int argc, char ** argv) { Stopwatch watch; + typedef HashMapWithSavedHash Map; + + Map map; + Map::iterator it; + bool inserted; + + for (size_t i = 0; i < n; ++i) + { + map.emplace(data[i], it, inserted); + if (inserted) + it->second = 0; + ++it->second; + } + + watch.stop(); + std::cerr << std::fixed << std::setprecision(2) + << "HashMap (FastHash64). Size: " << map.size() + << ", elapsed: " << watch.elapsedSeconds() + << " (" << n / watch.elapsedSeconds() << " elem/sec.)" +#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS + << ", collisions: " << map.getCollisions() +#endif + << std::endl; + } + + if (!m || m == 3) + { + Stopwatch watch; + + typedef HashMapWithSavedHash Map; + + Map map; + Map::iterator it; + bool inserted; + + for (size_t i = 0; i < n; ++i) + { + map.emplace(data[i], it, inserted); + if (inserted) + it->second = 0; + ++it->second; + } + + watch.stop(); + std::cerr << std::fixed << std::setprecision(2) + << "HashMap (CrapWow). Size: " << map.size() + << ", elapsed: " << watch.elapsedSeconds() + << " (" << n / watch.elapsedSeconds() << " elem/sec.)" +#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS + << ", collisions: " << map.getCollisions() +#endif + << std::endl; + } + + if (!m || m == 4) + { + Stopwatch watch; + + typedef HashMapWithSavedHash Map; + + Map map; + Map::iterator it; + bool inserted; + + for (size_t i = 0; i < n; ++i) + { + map.emplace(data[i], it, inserted); + if (inserted) + it->second = 0; + ++it->second; + } + + watch.stop(); + std::cerr << std::fixed << std::setprecision(2) + << "HashMap (SimpleHash). Size: " << map.size() + << ", elapsed: " << watch.elapsedSeconds() + << " (" << n / watch.elapsedSeconds() << " elem/sec.)" +#ifdef DBMS_HASH_MAP_COUNT_COLLISIONS + << ", collisions: " << map.getCollisions() +#endif + << std::endl; + } + + if (!m || m == 5) + { + Stopwatch watch; + std::unordered_map > map; for (size_t i = 0; i < n; ++i) ++map[data[i]]; @@ -229,7 +441,7 @@ int main(int argc, char ** argv) << std::endl; } - if (!m || m == 3) + if (!m || m == 6) { Stopwatch watch; @@ -246,7 +458,7 @@ int main(int argc, char ** argv) << std::endl; } - if (!m || m == 4) + if (!m || m == 7) { Stopwatch watch; diff --git a/dbms/src/Interpreters/tests/two_level_hash_map.cpp b/dbms/src/Interpreters/tests/two_level_hash_map.cpp index 41b921c7e50..26df5c372d7 100644 --- a/dbms/src/Interpreters/tests/two_level_hash_map.cpp +++ b/dbms/src/Interpreters/tests/two_level_hash_map.cpp @@ -7,10 +7,11 @@ #include #include -#include - //#define DBMS_HASH_MAP_DEBUG_RESIZES +#include +#include + #include #include #include @@ -48,6 +49,8 @@ int main(int argc, char ** argv) { Stopwatch watch; + std::cerr << sizeof(HashMapCell >) << std::endl; + typedef TwoLevelHashTable >, DefaultHash, HashTableGrower<8>, HashTableAllocator> Map; Map map; @@ -68,6 +71,58 @@ int main(int argc, char ** argv) << ", elapsed: " << watch.elapsedSeconds() << " (" << n / watch.elapsedSeconds() << " elem/sec.)" << std::endl; + + size_t sum_counts = 0; + size_t elems = 0; + for (const auto & kv : map) + { + sum_counts += kv.second; + ++elems; + } + + std::cerr << "sum_counts: " << sum_counts << ", elems: " << elems << std::endl; + } + + { + Stopwatch watch; + + typedef TwoLevelHashTable >, DefaultHash, HashTableGrower<8>, HashTableAllocator> Map; + //typedef HashMap Map; + + Map map; + Map::iterator it; + bool inserted; + + for (size_t i = 0; i < n; ++i) + { + map.emplace(i, it, inserted); + if (inserted) + it->second = 0; + ++it->second; + } + + watch.stop(); + std::cerr << std::fixed << std::setprecision(2) + << "HashMap. Size: " << map.size() + << ", elapsed: " << watch.elapsedSeconds() + << " (" << n / watch.elapsedSeconds() << " elem/sec.)" + << std::endl; + + size_t sum_counts = 0; + size_t elems = 0; + for (const auto & kv : map) + { + sum_counts += kv.second; + ++elems; + + if (kv.first > n) + std::cerr << kv.first << std::endl; + } + + std::cerr << "sum_counts: " << sum_counts << ", elems: " << elems << std::endl; + + if (sum_counts != n) + std::cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" << std::endl; } return 0;