diff --git a/src/Common/SparseHashMap.h b/src/Common/SparseHashMap.h index 3f38d52a2b8..b2f29b75933 100644 --- a/src/Common/SparseHashMap.h +++ b/src/Common/SparseHashMap.h @@ -4,6 +4,16 @@ #include +/// Hash function for sparse hash table is very important, for example: +/// +/// - DefaultHash<> (from HashTable/Hash.h) works better than std::hash<> +/// in case of sequential set of keys, but with random access to this set, i.e. +/// +/// SELECT number FROM numbers(3000000) ORDER BY rand() +/// +/// - but std::hash<> works well enough (and sometimes better) for generic cases +/// +/// So std::hash<> by default is preferred. template , class EqualKey = std::equal_to, class Alloc = google::libc_allocator_with_realloc>> diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index 23919c009c5..2c871c38075 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -124,11 +124,22 @@ private: HashMap, HashMapWithSavedHash>>; + /// Here we use SparseHashMap with DefaultHash<> for the following reasons: + /// + /// - DefaultHash<> is used for HashMap + /// - DefaultHash<> (from HashTable/Hash.h) works better than std::hash<> + /// in case of sequential set of keys, but with random access to this set, i.e. + /// + /// SELECT number FROM numbers(3000000) ORDER BY rand() + /// + /// And even though std::hash<> works better in some other cases, + /// DefaultHash<> is preferred since the difference for this particular + /// case is significant, i.e. it can be 10x+. template using CollectionTypeSparse = std::conditional_t< dictionary_key_type == DictionaryKeyType::Simple, - SparseHashMap, - SparseHashMap>; + SparseHashMap>, + SparseHashMap>>; template using CollectionType = std::conditional_t, CollectionTypeNonSparse>;