ClickHouse/dbms/Common/SipHash.h

207 lines
5.2 KiB
C++
Raw Normal View History

#pragma once
2017-05-07 20:25:26 +00:00
/** SipHash is a fast cryptographic hash function for short strings.
 * Taken from here: https://www.131002.net/siphash/
 *
 * This is the SipHash 2-4 variant.
 *
 * Two changes are made:
 * - it can also return 128 bits, not only 64;
 * - it is streaming (the hash can be calculated in parts).
 *
 * On short strings (URLs, search phrases) it is more than 3 times faster than MD5 from OpenSSL.
 * (~ 700 MB/sec, 15 million strings per second)
 */
#include <common/types.h>
2018-09-03 10:14:05 +00:00
#include <common/unaligned.h>
#include <string>
2018-03-03 15:36:20 +00:00
#include <type_traits>
2019-02-01 10:14:17 +00:00
#include <Core/Defines.h>
/// Rotates the 64-bit value `x` left by `b` bits.
/// `b` must be in the range 1..63 (a shift by 64 bits would be undefined); `x` is evaluated twice.
#define ROTL(x, b) static_cast<UInt64>(((x) << (b)) | ((x) >> (64 - (b))))

/// One SipHash round. Mixes the four 64-bit state variables v0, v1, v2, v3,
/// which must be visible (and modifiable) at the point where the macro is expanded.
/// Wrapped in do { ... } while (0) so that `SIPROUND;` behaves as a single statement.
#define SIPROUND               \
    do                         \
    {                          \
        v0 += v1;              \
        v1 = ROTL(v1, 13);     \
        v1 ^= v0;              \
        v0 = ROTL(v0, 32);     \
                               \
        v2 += v3;              \
        v3 = ROTL(v3, 16);     \
        v3 ^= v2;              \
                               \
        v0 += v3;              \
        v3 = ROTL(v3, 21);     \
        v3 ^= v0;              \
                               \
        v2 += v1;              \
        v1 = ROTL(v1, 17);     \
        v1 ^= v2;              \
        v2 = ROTL(v2, 32);     \
    } while (0)
class SipHash
{
private:
/// State.
UInt64 v0;
UInt64 v1;
UInt64 v2;
UInt64 v3;
2017-05-07 20:25:26 +00:00
/// How many bytes have been processed.
UInt64 cnt;
2017-05-07 20:25:26 +00:00
/// The current 8 bytes of input data.
union
{
UInt64 current_word;
UInt8 current_bytes[8];
};
2019-02-01 10:14:17 +00:00
ALWAYS_INLINE void finalize()
{
2017-05-07 20:25:26 +00:00
/// In the last free byte, we write the remainder of the division by 256.
current_bytes[7] = cnt;
v3 ^= current_word;
SIPROUND;
SIPROUND;
v0 ^= current_word;
v2 ^= 0xff;
SIPROUND;
SIPROUND;
SIPROUND;
SIPROUND;
}
public:
2017-05-07 20:25:26 +00:00
/// Arguments - seed.
SipHash(UInt64 k0 = 0, UInt64 k1 = 0)
{
2017-05-07 20:25:26 +00:00
/// Initialize the state with some random bytes and seed.
v0 = 0x736f6d6570736575ULL ^ k0;
v1 = 0x646f72616e646f6dULL ^ k1;
v2 = 0x6c7967656e657261ULL ^ k0;
v3 = 0x7465646279746573ULL ^ k1;
cnt = 0;
current_word = 0;
}
void update(const char * data, UInt64 size)
{
const char * end = data + size;
2017-05-07 20:25:26 +00:00
/// We'll finish to process the remainder of the previous update, if any.
if (cnt & 7)
{
while (cnt & 7 && data < end)
{
current_bytes[cnt & 7] = *data;
++data;
++cnt;
}
/// If we still do not have enough bytes to an 8-byte word.
if (cnt & 7)
return;
v3 ^= current_word;
SIPROUND;
SIPROUND;
v0 ^= current_word;
}
cnt += end - data;
while (data + 8 <= end)
{
2018-09-03 10:14:05 +00:00
current_word = unalignedLoad<UInt64>(data);
v3 ^= current_word;
SIPROUND;
SIPROUND;
v0 ^= current_word;
data += 8;
}
2017-05-07 20:25:26 +00:00
/// Pad the remainder, which is missing up to an 8-byte word.
current_word = 0;
switch (end - data)
{
2017-12-02 02:58:25 +00:00
case 7: current_bytes[6] = data[6]; [[fallthrough]];
case 6: current_bytes[5] = data[5]; [[fallthrough]];
case 5: current_bytes[4] = data[4]; [[fallthrough]];
case 4: current_bytes[3] = data[3]; [[fallthrough]];
case 3: current_bytes[2] = data[2]; [[fallthrough]];
case 2: current_bytes[1] = data[1]; [[fallthrough]];
case 1: current_bytes[0] = data[0]; [[fallthrough]];
case 0: break;
}
}
2018-03-03 15:36:20 +00:00
/// NOTE: std::has_unique_object_representations is only available since clang 6. As of Mar 2017 we still use clang 5 sometimes.
template <typename T>
std::enable_if_t<std::/*has_unique_object_representations_v*/is_standard_layout_v<T>, void> update(const T & x)
{
update(reinterpret_cast<const char *>(&x), sizeof(x));
}
void update(const std::string & x)
{
2018-09-03 10:14:05 +00:00
update(x.data(), x.length());
}
2017-05-07 20:25:26 +00:00
/// Get the result in some form. This can only be done once!
void get128(char * out)
{
finalize();
unalignedStore<UInt64>(out, v0 ^ v1);
unalignedStore<UInt64>(out + 8, v2 ^ v3);
}
/// template for avoiding 'unsigned long long' vs 'unsigned long' problem on old poco in macos
template <typename T>
2019-02-01 10:14:17 +00:00
ALWAYS_INLINE void get128(T & lo, T & hi)
{
static_assert(sizeof(T) == 8);
finalize();
lo = v0 ^ v1;
hi = v2 ^ v3;
}
UInt64 get64()
{
finalize();
return v0 ^ v1 ^ v2 ^ v3;
}
};
/// The helper macros are implementation details of the SipHash class;
/// undefine them so that they do not leak into code that includes this header.
#undef ROTL
#undef SIPROUND
#include <cstddef>
inline void sipHash128(const char * data, const size_t size, char * out)
{
SipHash hash;
hash.update(data, size);
hash.get128(out);
}
/// Calculates the 64-bit SipHash (with the default zero seed) of `size` bytes starting at `data`.
inline UInt64 sipHash64(const char * data, const size_t size)
{
    SipHash hasher;
    hasher.update(data, size);
    return hasher.get64();
}
2018-03-03 15:36:20 +00:00
template <typename T>
std::enable_if_t<std::/*has_unique_object_representations_v*/is_standard_layout_v<T>, UInt64> sipHash64(const T & x)
{
SipHash hash;
hash.update(x);
return hash.get64();
}
/// Calculates the 64-bit SipHash (with the default zero seed) of the characters of the string.
inline UInt64 sipHash64(const std::string & s)
{
    SipHash hasher;
    hasher.update(s);
    return hasher.get64();
}