From e16891713da01f841318dc0d6e4f9b2bd39db5fe Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 10 Dec 2021 23:45:36 +0300 Subject: [PATCH 1/2] Fix sparse_hashed dict performance with sequential keys (wrong hash function) In #27152 the hash function for sparse_hash_map had been changed to std::hash<> switch it back to DefaultHash<> (ClickHouse builtin), since std::hash<> for numeric keys returns itself and this does not works great with sparse_hash_map. I've tried the example from #32480 and using some hash fixes the performance of sparse_hashed layout. Fixes: #32480 v2: Add comments for SparseHashMap --- src/Common/SparseHashMap.h | 10 ++++++++++ src/Dictionaries/HashedDictionary.h | 15 +++++++++++++-- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/src/Common/SparseHashMap.h b/src/Common/SparseHashMap.h index 3f38d52a2b8..b2f29b75933 100644 --- a/src/Common/SparseHashMap.h +++ b/src/Common/SparseHashMap.h @@ -4,6 +4,16 @@ #include +/// Hash function for sparse hash table is very important, for example: +/// +/// - DefaultHash<> (from HashTable/Hash.h> works better then std::hash<> +/// in case of sequential set of keys, but with random access to this set, i.e. +/// +/// SELECT number FROM numbers(3000000) ORDER BY rand() +/// +/// - but std::hash<> works good enough (and sometimes better) for generic cases +/// +/// So std::hash<> by default is preferred. template , class EqualKey = std::equal_to, class Alloc = google::libc_allocator_with_realloc>> diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index 23919c009c5..2c871c38075 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -124,11 +124,22 @@ private: HashMap, HashMapWithSavedHash>>; + /// Here we use SparseHashMap with DefaultHash<> for the following reasons: + /// + /// - DefaultHash<> is used for HashMap + /// - DefaultHash<> (from HashTable/Hash.h> works better then std::hash<> + /// in case of sequential set of keys, but with random access to this set, i.e. + /// + /// SELECT number FROM numbers(3000000) ORDER BY rand() + /// + /// And even though std::hash<> works better in some other cases, + /// DefaultHash<> is preferred since the difference for this particular + /// case is significant, i.e. it can be 10x+. template using CollectionTypeSparse = std::conditional_t< dictionary_key_type == DictionaryKeyType::Simple, - SparseHashMap, - SparseHashMap>; + SparseHashMap>, + SparseHashMap>>; template using CollectionType = std::conditional_t, CollectionTypeNonSparse>; From a7dc6f309f608c21c0b913691e2f3b440dc22197 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 14 Dec 2021 09:12:42 +0300 Subject: [PATCH 2/2] Remove SparseHashMap.h It was added only for arcadia build, and used only in one place, no need to have a separate typedef for it. --- src/Common/SparseHashMap.h | 20 -------------------- src/Dictionaries/HashedArrayDictionary.h | 2 -- src/Dictionaries/HashedDictionary.h | 9 ++++----- 3 files changed, 4 insertions(+), 27 deletions(-) delete mode 100644 src/Common/SparseHashMap.h diff --git a/src/Common/SparseHashMap.h b/src/Common/SparseHashMap.h deleted file mode 100644 index b2f29b75933..00000000000 --- a/src/Common/SparseHashMap.h +++ /dev/null @@ -1,20 +0,0 @@ -#pragma once - -/// SparseHashMap is a wrapper for google::sparse_hash_map. - -#include - -/// Hash function for sparse hash table is very important, for example: -/// -/// - DefaultHash<> (from HashTable/Hash.h> works better then std::hash<> -/// in case of sequential set of keys, but with random access to this set, i.e. -/// -/// SELECT number FROM numbers(3000000) ORDER BY rand() -/// -/// - but std::hash<> works good enough (and sometimes better) for generic cases -/// -/// So std::hash<> by default is preferred. -template , - class EqualKey = std::equal_to, - class Alloc = google::libc_allocator_with_realloc>> -using SparseHashMap = google::sparse_hash_map; diff --git a/src/Dictionaries/HashedArrayDictionary.h b/src/Dictionaries/HashedArrayDictionary.h index 5ad1efeb056..ca5d7cb1bf6 100644 --- a/src/Dictionaries/HashedArrayDictionary.h +++ b/src/Dictionaries/HashedArrayDictionary.h @@ -5,8 +5,6 @@ #include #include -#include - #include #include #include diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index 2c871c38075..16be4e4c73e 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -4,8 +4,7 @@ #include #include #include - -#include +#include #include #include @@ -124,7 +123,7 @@ private: HashMap, HashMapWithSavedHash>>; - /// Here we use SparseHashMap with DefaultHash<> for the following reasons: + /// Here we use sparse_hash_map with DefaultHash<> for the following reasons: /// /// - DefaultHash<> is used for HashMap /// - DefaultHash<> (from HashTable/Hash.h> works better then std::hash<> @@ -138,8 +137,8 @@ private: template using CollectionTypeSparse = std::conditional_t< dictionary_key_type == DictionaryKeyType::Simple, - SparseHashMap>, - SparseHashMap>>; + google::sparse_hash_map>, + google::sparse_hash_map>>; template using CollectionType = std::conditional_t, CollectionTypeNonSparse>;