#pragma once #include /** Hash functions that are better than the trivial function std::hash. * * Example: when aggregated by the visitor ID, the performance increase is more than 5 times. * This is because of following reasons: * - in Yandex, visitor identifier is an integer that has timestamp with seconds resolution in lower bits; * - in typical implementation of standard library, hash function for integers is trivial and just use lower bits; * - traffic is non-uniformly distributed across a day; * - we are using open-addressing linear probing hash tables that are most critical to hash function quality, * and trivial hash function gives disasterous results. */ /** Taken from MurmurHash. This is Murmur finalizer. * Faster than intHash32 when inserting into the hash table UInt64 -> UInt64, where the key is the visitor ID. */ inline DB::UInt64 intHash64(DB::UInt64 x) { x ^= x >> 33; x *= 0xff51afd7ed558ccdULL; x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL; x ^= x >> 33; return x; } /** CRC32C is not very high-quality as a hash function, * according to avalanche and bit independence tests (see SMHasher software), as well as a small number of bits, * but can behave well when used in hash tables, * due to high speed (latency 3 + 1 clock cycle, throughput 1 clock cycle). * Works only with SSE 4.2 support. */ #if __SSE4_2__ #include #endif inline DB::UInt64 intHashCRC32(DB::UInt64 x) { #if __SSE4_2__ return _mm_crc32_u64(-1ULL, x); #else /// On other platforms we do not have CRC32. NOTE This can be confusing. return intHash64(x); #endif } template struct DefaultHash; template inline size_t DefaultHash64(T key) { union { T in; DB::UInt64 out; } u; u.out = 0; u.in = key; return intHash64(u.out); } #define DEFINE_HASH(T) \ template <> struct DefaultHash\ {\ size_t operator() (T key) const\ {\ return DefaultHash64(key);\ }\ }; DEFINE_HASH(DB::UInt8) DEFINE_HASH(DB::UInt16) DEFINE_HASH(DB::UInt32) DEFINE_HASH(DB::UInt64) DEFINE_HASH(DB::Int8) DEFINE_HASH(DB::Int16) DEFINE_HASH(DB::Int32) DEFINE_HASH(DB::Int64) DEFINE_HASH(DB::Float32) DEFINE_HASH(DB::Float64) #undef DEFINE_HASH template struct HashCRC32; template inline size_t hashCRC32(T key) { union { T in; DB::UInt64 out; } u; u.out = 0; u.in = key; return intHashCRC32(u.out); } #define DEFINE_HASH(T) \ template <> struct HashCRC32\ {\ size_t operator() (T key) const\ {\ return hashCRC32(key);\ }\ }; DEFINE_HASH(DB::UInt8) DEFINE_HASH(DB::UInt16) DEFINE_HASH(DB::UInt32) DEFINE_HASH(DB::UInt64) DEFINE_HASH(DB::Int8) DEFINE_HASH(DB::Int16) DEFINE_HASH(DB::Int32) DEFINE_HASH(DB::Int64) DEFINE_HASH(DB::Float32) DEFINE_HASH(DB::Float64) #undef DEFINE_HASH /// It is reasonable to use for UInt8, UInt16 with sufficient hash table size. struct TrivialHash { template size_t operator() (T key) const { return key; } }; /** A relatively good non-cryptographic hash function from UInt64 to UInt32. * But worse (both in quality and speed) than just cutting intHash64. * Taken from here: http://www.concentric.net/~ttwang/tech/inthash.htm * * Slightly changed compared to the function by link: shifts to the right are accidentally replaced by a cyclic shift to the right. * This change did not affect the smhasher test results. * * It is recommended to use different salt for different tasks. * That was the case that in the database values were sorted by hash (for low-quality pseudo-random spread), * and in another place, in the aggregate function, the same hash was used in the hash table, * as a result, this aggregate function was monstrously slowed due to collisions. * * NOTE Salting is far from perfect, because it commutes with first steps of calculation. * * NOTE As mentioned, this function is slower than intHash64. * But occasionaly, it is faster, when written in a loop and loop is vectorized. */ template inline DB::UInt32 intHash32(DB::UInt64 key) { key ^= salt; key = (~key) + (key << 18); key = key ^ ((key >> 31) | (key << 33)); key = key * 21; key = key ^ ((key >> 11) | (key << 53)); key = key + (key << 6); key = key ^ ((key >> 22) | (key << 42)); return key; } /// For containers. template struct IntHash32 { size_t operator() (const T & key) const { return intHash32(key); } };