ClickHouse/src/Common/SipHash.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

309 lines
8.2 KiB
C++
Raw Normal View History

#pragma once
2017-05-07 20:25:26 +00:00
/** SipHash is a fast cryptographic hash function for short strings.
  * Taken from here: https://www.131002.net/siphash/
  *
  * This is the SipHash 2-4 variant.
  *
  * Two changes are made:
  * - it can also return 128 bits, not only 64;
  * - it works in streaming mode (the hash can be calculated in parts).
  *
  * On short strings (URL, search phrases) it is more than 3 times faster than MD5 from OpenSSL.
  * (~ 700 MB/sec, 15 million strings per second)
  */
2023-04-21 10:38:45 +00:00
#include <array>
#include <bit>
#include <cstring>
#include <string>
#include <type_traits>

#include <Core/Defines.h>
#include <base/extended_types.h>
#include <base/types.h>
#include <base/unaligned.h>
#include <base/hex.h>
#include <Common/Exception.h>
#include <Common/transformEndianness.h>

#include <city.h>
2023-04-21 10:38:45 +00:00
2023-11-06 02:49:55 +00:00
/// Declaration of the error code used by SipHash::get128Reference() when it is called on
/// an object that was not constructed in reference-128 mode. Only declared here (extern).
namespace DB
{
namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
}
}
/// One SipRound: the add-rotate-xor mixing step applied to the four state words.
/// Must be expanded in a scope where `v0`, `v1`, `v2` and `v3` are unsigned integer lvalues
/// (the UInt64 state members of SipHash); all four are modified in place.
/// Wrapped in do { } while (0) so that `SIPROUND;` behaves as a single statement.
/// Every line of the definition must end with a line continuation, otherwise the macro is truncated.
#define SIPROUND \
    do \
    { \
        v0 += v1; v1 = std::rotl(v1, 13); v1 ^= v0; v0 = std::rotl(v0, 32); \
        v2 += v3; v3 = std::rotl(v3, 16); v3 ^= v2; \
        v0 += v3; v3 = std::rotl(v3, 21); v3 ^= v0; \
        v2 += v1; v1 = std::rotl(v1, 17); v1 ^= v2; v2 = std::rotl(v2, 32); \
    } while (0)
2022-09-18 02:48:08 +00:00
/// CURRENT_BYTES_IDX(i) maps the logical position `i` of a byte inside the 8-byte word
/// (0 = least significant byte) to its index in the `current_bytes` array, so that the bytes
/// end up in the same place of `current_word` on little- and big-endian machines.
/// The argument is parenthesized so that expressions such as `cnt & 7` expand as intended.
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
#define CURRENT_BYTES_IDX(i) (7 - (i))
#else
#define CURRENT_BYTES_IDX(i) (i)
#endif
class SipHash
{
private:
/// State.
UInt64 v0;
UInt64 v1;
UInt64 v2;
UInt64 v3;
2017-05-07 20:25:26 +00:00
/// How many bytes have been processed.
UInt64 cnt;
/// Whether it should use the reference algo for 128-bit or CH's version
bool is_reference_128;
2017-05-07 20:25:26 +00:00
/// The current 8 bytes of input data.
union
{
UInt64 current_word;
UInt8 current_bytes[8];
};
2019-02-01 10:14:17 +00:00
ALWAYS_INLINE void finalize()
2013-08-28 17:13:43 +00:00
{
2017-05-07 20:25:26 +00:00
/// In the last free byte, we write the remainder of the division by 256.
2022-09-15 13:25:23 +00:00
current_bytes[CURRENT_BYTES_IDX(7)] = static_cast<UInt8>(cnt);
2013-08-28 17:13:43 +00:00
v3 ^= current_word;
SIPROUND;
SIPROUND;
v0 ^= current_word;
if (is_reference_128)
v2 ^= 0xee;
else
v2 ^= 0xff;
2013-08-28 17:13:43 +00:00
SIPROUND;
SIPROUND;
SIPROUND;
SIPROUND;
}
public:
2017-05-07 20:25:26 +00:00
/// Arguments - seed.
SipHash(UInt64 key0 = 0, UInt64 key1 = 0, bool is_reference_128_ = false) /// NOLINT
{
2017-05-07 20:25:26 +00:00
/// Initialize the state with some random bytes and seed.
v0 = 0x736f6d6570736575ULL ^ key0;
v1 = 0x646f72616e646f6dULL ^ key1;
v2 = 0x6c7967656e657261ULL ^ key0;
v3 = 0x7465646279746573ULL ^ key1;
is_reference_128 = is_reference_128_;
if (is_reference_128)
v1 ^= 0xee;
cnt = 0;
current_word = 0;
}
2022-02-13 11:08:06 +00:00
ALWAYS_INLINE void update(const char * data, UInt64 size)
{
const char * end = data + size;
2017-05-07 20:25:26 +00:00
/// We'll finish to process the remainder of the previous update, if any.
if (cnt & 7)
{
while (cnt & 7 && data < end)
{
2022-09-15 13:25:23 +00:00
current_bytes[CURRENT_BYTES_IDX(cnt & 7)] = *data;
++data;
++cnt;
}
/// If we still do not have enough bytes to an 8-byte word.
if (cnt & 7)
return;
v3 ^= current_word;
SIPROUND;
SIPROUND;
v0 ^= current_word;
}
cnt += end - data;
while (data + 8 <= end)
{
2023-04-21 10:38:45 +00:00
current_word = unalignedLoadLittleEndian<UInt64>(data);
v3 ^= current_word;
SIPROUND;
SIPROUND;
v0 ^= current_word;
data += 8;
}
2017-05-07 20:25:26 +00:00
/// Pad the remainder, which is missing up to an 8-byte word.
current_word = 0;
switch (end - data)
{
2022-09-15 13:25:23 +00:00
case 7: current_bytes[CURRENT_BYTES_IDX(6)] = data[6]; [[fallthrough]];
case 6: current_bytes[CURRENT_BYTES_IDX(5)] = data[5]; [[fallthrough]];
case 5: current_bytes[CURRENT_BYTES_IDX(4)] = data[4]; [[fallthrough]];
case 4: current_bytes[CURRENT_BYTES_IDX(3)] = data[3]; [[fallthrough]];
case 3: current_bytes[CURRENT_BYTES_IDX(2)] = data[2]; [[fallthrough]];
case 2: current_bytes[CURRENT_BYTES_IDX(1)] = data[1]; [[fallthrough]];
case 1: current_bytes[CURRENT_BYTES_IDX(0)] = data[0]; [[fallthrough]];
case 0: break;
}
}
template <typename Transform = void, typename T>
2022-02-13 11:08:06 +00:00
ALWAYS_INLINE void update(const T & x)
{
if constexpr (std::endian::native == std::endian::big)
{
auto transformed_x = x;
if constexpr (!std::is_same_v<Transform, void>)
transformed_x = Transform()(x);
else
DB::transformEndianness<std::endian::little>(transformed_x);
update(reinterpret_cast<const char *>(&transformed_x), sizeof(transformed_x)); /// NOLINT
}
else
update(reinterpret_cast<const char *>(&x), sizeof(x)); /// NOLINT
}
ALWAYS_INLINE void update(const std::string & x) { update(x.data(), x.length()); }
ALWAYS_INLINE void update(const std::string_view x) { update(x.data(), x.size()); }
ALWAYS_INLINE void update(const char * s) { update(std::string_view(s)); }
2022-05-13 13:43:42 +00:00
ALWAYS_INLINE UInt64 get64()
{
2013-08-28 17:13:43 +00:00
finalize();
return v0 ^ v1 ^ v2 ^ v3;
}
template <typename T>
requires (sizeof(T) == 8)
2019-02-01 10:14:17 +00:00
ALWAYS_INLINE void get128(T & lo, T & hi)
2013-08-28 17:13:43 +00:00
{
finalize();
lo = v0 ^ v1;
hi = v2 ^ v3;
}
ALWAYS_INLINE UInt128 get128()
{
UInt128 res;
get128(res.items[UInt128::_impl::little(0)], res.items[UInt128::_impl::little(1)]);
return res;
}
UInt128 get128Reference()
{
if (!is_reference_128)
throw DB::Exception(
2024-02-21 10:33:08 +00:00
DB::ErrorCodes::LOGICAL_ERROR, "Logical error: can't call get128Reference when is_reference_128 is not set");
finalize();
const auto lo = v0 ^ v1 ^ v2 ^ v3;
v1 ^= 0xdd;
SIPROUND;
SIPROUND;
SIPROUND;
SIPROUND;
const auto hi = v0 ^ v1 ^ v2 ^ v3;
2023-04-21 10:38:45 +00:00
UInt128 res = hi;
res <<= 64;
res |= lo;
return res;
}
};
/// SIPROUND is only meant to be used inside this header: do not leak it to includers.
/// (This file does not define ROTL, so there is nothing else to undefine.)
#undef SIPROUND
#include <cstddef>
/// Finalizes `sip_hash` and returns the 16 bytes of its 128-bit result (in the in-memory
/// layout of UInt128) as a char array.
/// The bytes are copied with memcpy: the char array has no alignment guarantee suitable for
/// UInt128, so writing a UInt128 through a casted pointer would be undefined behaviour.
inline std::array<char, 16> getSipHash128AsArray(SipHash & sip_hash)
{
    static_assert(sizeof(UInt128) == 16, "UInt128 is expected to occupy exactly 16 bytes");

    const UInt128 hash = sip_hash.get128();

    std::array<char, 16> arr;
    std::memcpy(arr.data(), &hash, arr.size());
    return arr;
}
2023-07-31 13:48:50 +00:00
/// Finalizes `sip_hash` and returns its 128-bit result as a CityHash uint128:
/// the low half goes to `low64`, the high half to `high64`.
inline CityHash_v1_0_2::uint128 getSipHash128AsPair(SipHash & sip_hash)
{
    CityHash_v1_0_2::uint128 halves;
    sip_hash.get128(halves.low64, halves.high64);
    return halves;
}
/// One-shot 128-bit SipHash of the `size` bytes at `data`, seeded with (key0, key1).
inline UInt128 sipHash128Keyed(UInt64 key0, UInt64 key1, const char * data, const size_t size)
{
    SipHash hasher{key0, key1};
    hasher.update(data, size);
    return hasher.get128();
}
/// One-shot 128-bit SipHash of the `size` bytes at `data` with the default (zero) seed.
inline UInt128 sipHash128(const char * data, const size_t size)
{
    return sipHash128Keyed(/*key0=*/ 0, /*key1=*/ 0, data, size);
}
/// 128-bit SipHash (zero seed) of the `size` bytes at `data`, formatted by getHexUIntLowercase.
inline String sipHash128String(const char * data, const size_t size)
{
    const auto digest = sipHash128(data, size);
    return getHexUIntLowercase(digest);
}
/// Same as sipHash128String(data, size), applied to the contents of `str`.
inline String sipHash128String(const String & str)
{
    const char * bytes = str.data();
    const size_t length = str.size();
    return sipHash128String(bytes, length);
}
/// One-shot 128-bit SipHash computed by the reference 128-bit algorithm
/// over the `size` bytes at `data`, seeded with (key0, key1).
inline UInt128 sipHash128ReferenceKeyed(UInt64 key0, UInt64 key1, const char * data, const size_t size)
{
    SipHash hasher{key0, key1, /*is_reference_128_=*/ true};
    hasher.update(data, size);
    return hasher.get128Reference();
}
/// Reference-algorithm 128-bit SipHash of the `size` bytes at `data` with the default (zero) seed.
inline UInt128 sipHash128Reference(const char * data, const size_t size)
{
    return sipHash128ReferenceKeyed(/*key0=*/ 0, /*key1=*/ 0, data, size);
}
/// One-shot 64-bit SipHash of the `size` bytes at `data`, seeded with (key0, key1).
inline UInt64 sipHash64Keyed(UInt64 key0, UInt64 key1, const char * data, const size_t size)
{
    SipHash hasher{key0, key1};
    hasher.update(data, size);
    return hasher.get64();
}
/// One-shot 64-bit SipHash of the `size` bytes at `data` with the default (zero) seed.
inline UInt64 sipHash64(const char * data, const size_t size)
{
    return sipHash64Keyed(/*key0=*/ 0, /*key1=*/ 0, data, size);
}
2018-03-03 15:36:20 +00:00
template <typename T>
2023-06-23 15:22:21 +00:00
inline UInt64 sipHash64(const T & x)
{
SipHash hash;
hash.update(x);
return hash.get64();
}
/// 64-bit SipHash (zero seed) of the characters of `s`.
inline UInt64 sipHash64(const std::string & s)
{
    const char * bytes = s.data();
    const size_t length = s.size();
    return sipHash64(bytes, length);
}
2022-09-18 02:48:08 +00:00
#undef CURRENT_BYTES_IDX