2014-03-17 02:01:03 +00:00
|
|
|
#pragma once
|
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Core/Types.h>
|
2017-11-09 23:37:57 +00:00
|
|
|
#include <Common/UInt128.h>
|
2014-03-17 02:01:03 +00:00
|
|
|
|
2019-08-07 21:53:50 +00:00
|
|
|
#include <cstring>
#include <type_traits>
|
|
|
|
|
2014-03-17 02:01:03 +00:00
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/** Hash functions that are better than the trivial function std::hash.
|
2017-05-09 19:07:35 +00:00
|
|
|
*
|
2017-05-10 04:00:19 +00:00
|
|
|
* Example: when we do aggregation by the visitor ID, the performance increase is more than 5 times.
|
2017-05-09 19:07:35 +00:00
|
|
|
* This is because of the following reasons:
|
|
|
|
* - in Yandex, visitor identifier is an integer that has timestamp with seconds resolution in lower bits;
|
|
|
|
* - in a typical implementation of the standard library, the hash function for integers is trivial and just uses the lower bits;
|
|
|
|
* - traffic is non-uniformly distributed across a day;
|
|
|
|
* - we are using open-addressing linear probing hash tables, which are the most sensitive to hash function quality,
|
2019-01-22 19:56:53 +00:00
|
|
|
* and trivial hash function gives disastrous results.
|
2014-03-17 02:01:03 +00:00
|
|
|
*/
|
2014-05-10 05:17:08 +00:00
|
|
|
|
2017-05-09 19:07:35 +00:00
|
|
|
/** Taken from MurmurHash. This is Murmur finalizer.
|
2017-05-07 20:25:26 +00:00
|
|
|
* Faster than intHash32 when inserting into the hash table UInt64 -> UInt64, where the key is the visitor ID.
|
2014-05-10 05:17:08 +00:00
|
|
|
*/
|
|
|
|
inline DB::UInt64 intHash64(DB::UInt64 x)
{
    /// Multipliers of the Murmur finalizer.
    constexpr DB::UInt64 first_multiplier = 0xff51afd7ed558ccdULL;
    constexpr DB::UInt64 second_multiplier = 0xc4ceb9fe1a85ec53ULL;

    /// Three xor-shift steps (by 33 bits) interleaved with two multiplications.
    DB::UInt64 mixed = (x ^ (x >> 33)) * first_multiplier;
    mixed = (mixed ^ (mixed >> 33)) * second_multiplier;
    return mixed ^ (mixed >> 33);
}
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/** CRC32C is not very high-quality as a hash function,
|
2017-05-09 19:07:35 +00:00
|
|
|
* according to avalanche and bit independence tests (see SMHasher software), and it produces only a small number of bits,
|
2017-05-07 20:25:26 +00:00
|
|
|
* but can behave well when used in hash tables,
|
|
|
|
* due to high speed (latency 3 + 1 clock cycle, throughput 1 clock cycle).
|
|
|
|
* Works only with SSE 4.2 support.
|
2014-12-26 03:41:31 +00:00
|
|
|
*/
|
2019-01-04 12:10:00 +00:00
|
|
|
#ifdef __SSE4_2__
|
2017-05-09 19:07:35 +00:00
|
|
|
#include <nmmintrin.h>
|
|
|
|
#endif
|
|
|
|
|
2019-01-04 12:10:00 +00:00
|
|
|
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
|
2018-06-15 05:21:12 +00:00
|
|
|
#include <arm_acle.h>
|
|
|
|
#include <arm_neon.h>
|
|
|
|
#endif
|
|
|
|
|
2014-12-26 03:41:31 +00:00
|
|
|
inline DB::UInt64 intHashCRC32(DB::UInt64 x)
{
#if defined(__SSE4_2__)
    /// x86 with SSE 4.2: a single CRC32 instruction, started from an all-ones value.
    constexpr unsigned long long initial_crc = -1ULL;
    return _mm_crc32_u64(initial_crc, x);
#elif defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    /// AArch64 with the CRC32 extension: the same idea with a 32-bit all-ones initial value.
    constexpr unsigned initial_crc = -1U;
    return __crc32cd(initial_crc, x);
#else
    /// No hardware CRC32 on this platform: fall back to intHash64.
    /// NOTE This can be confusing: despite the function name, the result is then not a CRC.
    return intHash64(x);
#endif
}
|
|
|
|
|
|
|
|
|
2014-03-17 02:01:03 +00:00
|
|
|
template <typename T>
|
|
|
|
inline size_t DefaultHash64(T key)
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
union
|
|
|
|
{
|
|
|
|
T in;
|
|
|
|
DB::UInt64 out;
|
|
|
|
} u;
|
|
|
|
u.out = 0;
|
|
|
|
u.in = key;
|
|
|
|
return intHash64(u.out);
|
2014-03-17 02:01:03 +00:00
|
|
|
}
|
|
|
|
|
2019-08-07 21:53:50 +00:00
|
|
|
template <typename T, typename Enable = void>
|
|
|
|
struct DefaultHash;
|
2014-03-17 02:01:03 +00:00
|
|
|
|
2019-08-07 21:53:50 +00:00
|
|
|
template <typename T>
|
2019-11-02 05:55:06 +00:00
|
|
|
struct DefaultHash<T, std::enable_if_t<is_arithmetic_v<T>>>
|
2019-08-07 21:53:50 +00:00
|
|
|
{
|
|
|
|
size_t operator() (T key) const
|
|
|
|
{
|
|
|
|
return DefaultHash64<T>(key);
|
|
|
|
}
|
|
|
|
};
|
2014-12-26 03:41:31 +00:00
|
|
|
|
|
|
|
|
|
|
|
/// Hash functor built on hashCRC32. Only declared here;
/// explicit specializations for concrete key types are provided separately.
template <typename T>
struct HashCRC32;
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
inline size_t hashCRC32(T key)
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
union
|
|
|
|
{
|
|
|
|
T in;
|
|
|
|
DB::UInt64 out;
|
|
|
|
} u;
|
|
|
|
u.out = 0;
|
|
|
|
u.in = key;
|
|
|
|
return intHashCRC32(u.out);
|
2014-12-26 03:41:31 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
#define DEFINE_HASH(T) \
|
|
|
|
template <> struct HashCRC32<T>\
|
|
|
|
{\
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t operator() (T key) const\
|
|
|
|
{\
|
|
|
|
return hashCRC32<T>(key);\
|
|
|
|
}\
|
2014-12-26 03:41:31 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
DEFINE_HASH(DB::UInt8)
|
|
|
|
DEFINE_HASH(DB::UInt16)
|
|
|
|
DEFINE_HASH(DB::UInt32)
|
|
|
|
DEFINE_HASH(DB::UInt64)
|
2017-11-09 23:37:57 +00:00
|
|
|
DEFINE_HASH(DB::UInt128)
|
2014-12-26 03:41:31 +00:00
|
|
|
DEFINE_HASH(DB::Int8)
|
|
|
|
DEFINE_HASH(DB::Int16)
|
|
|
|
DEFINE_HASH(DB::Int32)
|
|
|
|
DEFINE_HASH(DB::Int64)
|
|
|
|
DEFINE_HASH(DB::Float32)
|
|
|
|
DEFINE_HASH(DB::Float64)
|
|
|
|
|
|
|
|
#undef DEFINE_HASH
|
2015-02-13 01:17:44 +00:00
|
|
|
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/// It is reasonable to use for UInt8, UInt16 with sufficient hash table size.
|
2015-02-13 01:17:44 +00:00
|
|
|
struct TrivialHash
{
    /// Identity "hash": the key itself, implicitly converted to size_t, is the result.
    template <typename T>
    size_t operator() (T key) const
    {
        const size_t hash = key;
        return hash;
    }
};
|
2015-11-15 09:06:53 +00:00
|
|
|
|
|
|
|
|
2017-05-09 19:07:35 +00:00
|
|
|
/** A relatively good non-cryptographic hash function from UInt64 to UInt32.
|
2017-05-07 20:25:26 +00:00
|
|
|
* But worse (both in quality and speed) than just cutting intHash64.
|
|
|
|
* Taken from here: http://www.concentric.net/~ttwang/tech/inthash.htm
|
2015-11-15 09:06:53 +00:00
|
|
|
*
|
2017-05-07 20:25:26 +00:00
|
|
|
* Slightly changed compared to the function at the link: shifts to the right were accidentally replaced by cyclic shifts to the right.
|
|
|
|
* This change did not affect the smhasher test results.
|
2015-11-15 09:06:53 +00:00
|
|
|
*
|
2017-05-07 20:25:26 +00:00
|
|
|
* It is recommended to use different salt for different tasks.
|
2017-05-09 19:07:35 +00:00
|
|
|
* There was a case when values in the database were sorted by hash (for low-quality pseudo-random spread),
|
2017-05-07 20:25:26 +00:00
|
|
|
* and in another place, in the aggregate function, the same hash was used in the hash table,
|
|
|
|
* as a result, this aggregate function was monstrously slowed due to collisions.
|
2017-05-09 19:07:35 +00:00
|
|
|
*
|
|
|
|
* NOTE Salting is far from perfect, because it commutes with first steps of calculation.
|
|
|
|
*
|
|
|
|
* NOTE As mentioned, this function is slower than intHash64.
|
2019-01-22 19:56:53 +00:00
|
|
|
* But occasionally, it is faster, when written in a loop and loop is vectorized.
|
2015-11-15 09:06:53 +00:00
|
|
|
*/
|
2015-11-15 09:17:11 +00:00
|
|
|
template <DB::UInt64 salt>
|
|
|
|
inline DB::UInt32 intHash32(DB::UInt64 key)
|
2015-11-15 09:06:53 +00:00
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
key ^= salt;
|
2015-11-15 09:06:53 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
key = (~key) + (key << 18);
|
|
|
|
key = key ^ ((key >> 31) | (key << 33));
|
|
|
|
key = key * 21;
|
|
|
|
key = key ^ ((key >> 11) | (key << 53));
|
|
|
|
key = key + (key << 6);
|
|
|
|
key = key ^ ((key >> 22) | (key << 42));
|
2015-11-15 09:06:53 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
return key;
|
2015-11-15 09:06:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/// For containers.
|
2015-11-15 09:17:11 +00:00
|
|
|
/// Functor wrapper around intHash32<salt>; the key is passed to intHash32 as is,
/// so it must be implicitly convertible to DB::UInt64.
template <typename T, DB::UInt64 salt = 0>
struct IntHash32
{
    size_t operator() (const T & key) const
    {
        const DB::UInt32 hash = intHash32<salt>(key);
        return hash;
    }
};
|