128-bit hash functions accepting an arbitrary list of arguments

Add a sipHash128 overload that returns UInt128.

better test.

better combineHashes

better test

Fix build
This commit is contained in:
nicelulu 2021-09-13 16:20:20 +08:00 committed by zhangxiao871
parent b8d3994765
commit 7d74f26ce9
5 changed files with 79 additions and 17 deletions

View File

@ -18,6 +18,7 @@
#include <string>
#include <type_traits>
#include <Core/Defines.h>
#include <base/extended_types.h>
#define ROTL(x, b) static_cast<UInt64>(((x) << (b)) | ((x) >> (64 - (b))))
@ -191,6 +192,15 @@ inline void sipHash128(const char * data, const size_t size, char * out)
hash.get128(out);
}
/// Computes the 128-bit SipHash of the `size` bytes starting at `data`
/// and returns the digest by value as a UInt128.
inline UInt128 sipHash128(const char * data, const size_t size)
{
    SipHash hasher;
    hasher.update(data, size);

    UInt128 digest;
    hasher.get128(digest);
    return digest;
}
inline UInt64 sipHash64(const char * data, const size_t size)
{
SipHash hash;

View File

@ -105,6 +105,14 @@ struct IntHash64Impl
}
};
/// Combines two hash values of the same type T into one.
/// Both values are laid out back to back in a small buffer, and the raw bytes of
/// that buffer (2 * sizeof(T) of them) are passed to HashFunction::apply.
template <typename T, typename HashFunction>
T combineHashesFunc(T t1, T t2)
{
    const T pair[2] = {t1, t2};
    const char * raw_bytes = reinterpret_cast<const char *>(pair);
    return HashFunction::apply(raw_bytes, sizeof(pair));
}
#if USE_SSL
struct HalfMD5Impl
{
@ -250,8 +258,7 @@ struct SipHash64Impl
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
{
UInt64 hashes[] = {h1, h2};
return apply(reinterpret_cast<const char *>(hashes), 16);
return combineHashesFunc<UInt64, SipHash64Impl>(h1, h2);
}
static constexpr bool use_int_hash_for_pods = false;
@ -260,12 +267,20 @@ struct SipHash64Impl
struct SipHash128Impl
{
static constexpr auto name = "sipHash128";
enum { length = 16 };
static void apply(const char * begin, const size_t size, unsigned char * out_char_data)
using ReturnType = UInt128;
static UInt128 combineHashes(UInt128 h1, UInt128 h2)
{
sipHash128(begin, size, reinterpret_cast<char*>(out_char_data));
return combineHashesFunc<UInt128, SipHash128Impl>(h1, h2);
}
/// Hashes the `size` bytes starting at `data` by forwarding to sipHash128
/// and hands the resulting UInt128 back to the caller.
static UInt128 apply(const char * data, const size_t size)
{
    const UInt128 digest = sipHash128(data, size);
    return digest;
}
static constexpr bool use_int_hash_for_pods = false;
};
#if !defined(ARCADIA_BUILD)
@ -350,7 +365,7 @@ struct MurmurHash3Impl32
static UInt32 combineHashes(UInt32 h1, UInt32 h2)
{
return IntHash32Impl::apply(h1) ^ h2;
return combineHashesFunc<UInt32, MurmurHash3Impl32>(h1, h2);
}
static constexpr bool use_int_hash_for_pods = false;
@ -374,7 +389,7 @@ struct MurmurHash3Impl64
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
{
return IntHash64Impl::apply(h1) ^ h2;
return combineHashesFunc<UInt64, MurmurHash3Impl64>(h1, h2);
}
static constexpr bool use_int_hash_for_pods = false;
@ -383,12 +398,22 @@ struct MurmurHash3Impl64
struct MurmurHash3Impl128
{
static constexpr auto name = "murmurHash3_128";
enum { length = 16 };
static void apply(const char * begin, const size_t size, unsigned char * out_char_data)
using ReturnType = UInt128;
static UInt128 combineHashes(UInt128 h1, UInt128 h2)
{
MurmurHash3_x64_128(begin, size, 0, out_char_data);
return combineHashesFunc<UInt128, MurmurHash3Impl128>(h1, h2);
}
/// Hashes the `size` bytes starting at `data` with MurmurHash3_x64_128 (seed 0)
/// and returns the 16-byte digest as a UInt128.
static UInt128 apply(const char * data, const size_t size)
{
    /// Let the hash write straight into the result object. A plain char[16] carries no
    /// alignment guarantee for UInt128, so reading it back through a casted pointer
    /// is undefined behaviour; the bytes produced here are the same.
    UInt128 res;
    MurmurHash3_x64_128(data, size, 0, &res);
    return res;
}
static constexpr bool use_int_hash_for_pods = false;
};
#endif
@ -1097,7 +1122,12 @@ public:
DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override
{
return std::make_shared<DataTypeNumber<ToType>>();
if constexpr (std::is_same_v<ToType, UInt128>) /// backward-compatible
{
return std::make_shared<DataTypeFixedString>(sizeof(UInt128));
}
else
return std::make_shared<DataTypeNumber<ToType>>();
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
@ -1119,6 +1149,13 @@ public:
for (const auto & col : arguments)
executeForArgument(col.type.get(), col.column.get(), vec_to, is_first_argument);
if constexpr (std::is_same_v<ToType, UInt128>) /// backward-compatible
{
auto col_to_fixed_string = ColumnFixedString::create(sizeof(UInt128));
col_to_fixed_string->getChars() = std::move(*reinterpret_cast<ColumnFixedString::Chars *>(&col_to->getData()));
return col_to_fixed_string;
}
return col_to;
}
};
@ -1363,7 +1400,7 @@ using FunctionSHA256 = FunctionStringHashFixedString<SHA256Impl>;
using FunctionSHA384 = FunctionStringHashFixedString<SHA384Impl>;
using FunctionSHA512 = FunctionStringHashFixedString<SHA512Impl>;
#endif
using FunctionSipHash128 = FunctionStringHashFixedString<SipHash128Impl>;
using FunctionSipHash128 = FunctionAnyHash<SipHash128Impl>;
using FunctionCityHash64 = FunctionAnyHash<ImplCityHash64>;
using FunctionFarmFingerprint64 = FunctionAnyHash<ImplFarmFingerprint64>;
using FunctionFarmHash64 = FunctionAnyHash<ImplFarmHash64>;
@ -1375,7 +1412,7 @@ using FunctionMurmurHash2_64 = FunctionAnyHash<MurmurHash2Impl64>;
using FunctionGccMurmurHash = FunctionAnyHash<GccMurmurHashImpl>;
using FunctionMurmurHash3_32 = FunctionAnyHash<MurmurHash3Impl32>;
using FunctionMurmurHash3_64 = FunctionAnyHash<MurmurHash3Impl64>;
using FunctionMurmurHash3_128 = FunctionStringHashFixedString<MurmurHash3Impl128>;
using FunctionMurmurHash3_128 = FunctionAnyHash<MurmurHash3Impl128>;
#endif
using FunctionJavaHash = FunctionAnyHash<JavaHashImpl>;

View File

@ -34,4 +34,3 @@ SELECT gccMurmurHash(1);
SELECT hex(murmurHash3_128('foo'));
SELECT hex(murmurHash3_128('\x01'));

View File

@ -1,6 +1,11 @@
12940785793559895259
17926972817233444501
7456555839952096623
CC45107CC4B79F62D831BEF2103C7CBF
DF2EC2F0669B000EDFF6ADEE264E7D68
4CD1C30C38AB935D418B5269EF197B9E
9D78134EE48654D753CCA1B76185CF8E
389D16428D2AADEC9713905572F42864
955237314186186656
8175794665478042155
9325786087413524176
@ -10,9 +15,11 @@
13951512892560982617
4952008279444388047
15509665835504406222
8163029322371165472
8788309436660676487
236561483980029756
17115680070506536913
8310530586050490380
9559168348687039888
8DD5527CC43D76F4760D26BE0F641F7E
F8F7AD9B6CD4CF117A71E277E2EC2931
12384823029245979431
4507350192761038840
1188926775431157506

View File

@ -4,6 +4,12 @@ SELECT sipHash64(1, 2, 3);
SELECT sipHash64(1, 3, 2);
SELECT sipHash64(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2))));
SELECT hex(sipHash128('foo'));
SELECT hex(sipHash128('\x01'));
SELECT hex(sipHash128('foo', 'foo'));
SELECT hex(sipHash128('foo', 'foo', 'foo'));
SELECT hex(sipHash128(1, 2, 3));
SELECT halfMD5(1, 2, 3);
SELECT halfMD5(1, 3, 2);
SELECT halfMD5(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2))));
@ -20,6 +26,9 @@ SELECT murmurHash3_64(1, 2, 3);
SELECT murmurHash3_64(1, 3, 2);
SELECT murmurHash3_64(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2))));
SELECT hex(murmurHash3_128('foo', 'foo'));
SELECT hex(murmurHash3_128('foo', 'foo', 'foo'));
SELECT gccMurmurHash(1, 2, 3);
SELECT gccMurmurHash(1, 3, 2);
SELECT gccMurmurHash(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2))));