Merge pull request #28965 from nicelulu/issues_28774

128-bit hash functions accepting an arbitrary list of arguments
This commit is contained in:
Dmitry Novik 2021-11-09 14:40:25 +03:00 committed by GitHub
commit 406bb4d997
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 74 additions and 12 deletions

View File

@ -18,6 +18,7 @@
#include <string>
#include <type_traits>
#include <Core/Defines.h>
#include <base/extended_types.h>
/// Rotates `x` left by `b` bits and casts the result to UInt64.
/// The rotation width is hard-coded to 64, so `x` is expected to be a 64-bit unsigned value
/// and `b` to lie in 1..63 (with b == 0 the right shift would be by the full 64 bits).
#define ROTL(x, b) static_cast<UInt64>(((x) << (b)) | ((x) >> (64 - (b))))
@ -191,6 +192,15 @@ inline void sipHash128(const char * data, const size_t size, char * out)
hash.get128(out);
}
/// Feeds `size` bytes starting at `data` into a fresh SipHash state and
/// returns the 128-bit result by value.
inline UInt128 sipHash128(const char * data, const size_t size)
{
    SipHash state;
    state.update(data, size);

    UInt128 digest;
    state.get128(digest);
    return digest;
}
inline UInt64 sipHash64(const char * data, const size_t size)
{
SipHash hash;

View File

@ -103,6 +103,14 @@ struct IntHash64Impl
}
};
/// Combines two hash values of type T into one: both values are laid out back to back
/// (t1 first, then t2) and the resulting 2 * sizeof(T) bytes are passed to HashFunction::apply.
template <typename T, typename HashFunction>
T combineHashesFunc(T t1, T t2)
{
    const T pair[2] = {t1, t2};
    const char * raw_bytes = reinterpret_cast<const char *>(pair);
    return HashFunction::apply(raw_bytes, sizeof(pair));
}
#if USE_SSL
struct HalfMD5Impl
{
@ -248,8 +256,7 @@ struct SipHash64Impl
/// Folds two 64-bit hash values into one by delegating to combineHashesFunc,
/// instantiated with this implementation as the hash function.
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
{
    return combineHashesFunc<UInt64, SipHash64Impl>(h1, h2);
}
static constexpr bool use_int_hash_for_pods = false;
@ -258,12 +265,20 @@ struct SipHash64Impl
/// sipHash128 implementation in the shape used by FunctionAnyHash:
/// a value-returning `apply` plus `combineHashes` for folding per-argument hashes.
struct SipHash128Impl
{
    static constexpr auto name = "sipHash128";

    using ReturnType = UInt128;

    /// Folds two 128-bit hash values into one via combineHashesFunc.
    static UInt128 combineHashes(UInt128 h1, UInt128 h2)
    {
        return combineHashesFunc<UInt128, SipHash128Impl>(h1, h2);
    }

    /// Returns sipHash128 of the `size` bytes starting at `data`.
    static UInt128 apply(const char * data, const size_t size)
    {
        return sipHash128(data, size);
    }

    static constexpr bool use_int_hash_for_pods = false;
};
/** Why we need MurmurHash2?
@ -380,12 +395,22 @@ struct MurmurHash3Impl64
/// murmurHash3_128 implementation in the shape used by FunctionAnyHash:
/// a value-returning `apply` plus `combineHashes` for folding per-argument hashes.
struct MurmurHash3Impl128
{
    static constexpr auto name = "murmurHash3_128";

    using ReturnType = UInt128;

    /// Returns MurmurHash3_x64_128 (seed 0) of the `size` bytes starting at `data`.
    static UInt128 apply(const char * data, const size_t size)
    {
        /// The hash is written straight into the result object. A local `char[16]` buffer
        /// carries no alignment guarantee for UInt128, and reading it back through a
        /// `UInt128 *` is type punning.
        /// NOTE(review): relies on the output parameter being a `void *` to 16 writable bytes — confirm.
        UInt128 result;
        MurmurHash3_x64_128(data, size, 0, &result);
        return result;
    }

    /// Folds two 128-bit hash values into one via combineHashesFunc.
    static UInt128 combineHashes(UInt128 h1, UInt128 h2)
    {
        return combineHashesFunc<UInt128, MurmurHash3Impl128>(h1, h2);
    }

    static constexpr bool use_int_hash_for_pods = false;
};
/// http://hg.openjdk.java.net/jdk8u/jdk8u/jdk/file/478a4add975b/src/share/classes/java/lang/String.java#l1452
@ -1093,6 +1118,11 @@ public:
/// Result type of the function: the numeric data type for ToType, except for UInt128,
/// where a FixedString of sizeof(UInt128) bytes is returned (backward-compatible).
DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override
{
    if constexpr (!std::is_same_v<ToType, UInt128>)
    {
        return std::make_shared<DataTypeNumber<ToType>>();
    }
    else
    {
        /// backward-compatible
        return std::make_shared<DataTypeFixedString>(sizeof(UInt128));
    }
}
@ -1115,6 +1145,13 @@ public:
for (const auto & col : arguments)
executeForArgument(col.type.get(), col.column.get(), vec_to, is_first_argument);
if constexpr (std::is_same_v<ToType, UInt128>) /// backward-compatible
{
auto col_to_fixed_string = ColumnFixedString::create(sizeof(UInt128));
col_to_fixed_string->getChars() = std::move(*reinterpret_cast<ColumnFixedString::Chars *>(&col_to->getData()));
return col_to_fixed_string;
}
return col_to;
}
};
@ -1359,7 +1396,7 @@ using FunctionSHA256 = FunctionStringHashFixedString<SHA256Impl>;
using FunctionSHA384 = FunctionStringHashFixedString<SHA384Impl>;
using FunctionSHA512 = FunctionStringHashFixedString<SHA512Impl>;
#endif
/// Hash functions exposed through the generic FunctionAnyHash wrapper.
using FunctionSipHash128 = FunctionAnyHash<SipHash128Impl>;
using FunctionCityHash64 = FunctionAnyHash<ImplCityHash64>;
using FunctionFarmFingerprint64 = FunctionAnyHash<ImplFarmFingerprint64>;
using FunctionFarmHash64 = FunctionAnyHash<ImplFarmHash64>;
@ -1370,7 +1407,7 @@ using FunctionMurmurHash2_64 = FunctionAnyHash<MurmurHash2Impl64>;
/// Murmur-family and Java hash functions exposed through the generic FunctionAnyHash wrapper.
using FunctionGccMurmurHash = FunctionAnyHash<GccMurmurHashImpl>;
using FunctionMurmurHash3_32 = FunctionAnyHash<MurmurHash3Impl32>;
using FunctionMurmurHash3_64 = FunctionAnyHash<MurmurHash3Impl64>;
using FunctionMurmurHash3_128 = FunctionAnyHash<MurmurHash3Impl128>;
using FunctionJavaHash = FunctionAnyHash<JavaHashImpl>;
using FunctionJavaHashUTF16LE = FunctionAnyHash<JavaHashUTF16LEImpl>;

View File

@ -34,4 +34,3 @@ SELECT gccMurmurHash(1);
-- murmurHash3_128 of a short string and of the single byte 0x01; the result is printed through hex().
SELECT hex(murmurHash3_128('foo'));
SELECT hex(murmurHash3_128('\x01'));

View File

@ -1,6 +1,11 @@
12940785793559895259
17926972817233444501
7456555839952096623
CC45107CC4B79F62D831BEF2103C7CBF
DF2EC2F0669B000EDFF6ADEE264E7D68
4CD1C30C38AB935D418B5269EF197B9E
9D78134EE48654D753CCA1B76185CF8E
389D16428D2AADEC9713905572F42864
955237314186186656
8175794665478042155
9325786087413524176
@ -13,6 +18,8 @@
8163029322371165472
8788309436660676487
236561483980029756
8DD5527CC43D76F4760D26BE0F641F7E
F8F7AD9B6CD4CF117A71E277E2EC2931
12384823029245979431
4507350192761038840
1188926775431157506

View File

@ -4,6 +4,12 @@ SELECT sipHash64(1, 2, 3);
SELECT sipHash64(1, 3, 2);
SELECT sipHash64(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2))));
-- sipHash128 called with one string, the single byte 0x01, two and three strings, and three integers;
-- each result is printed through hex().
SELECT hex(sipHash128('foo'));
SELECT hex(sipHash128('\x01'));
SELECT hex(sipHash128('foo', 'foo'));
SELECT hex(sipHash128('foo', 'foo', 'foo'));
SELECT hex(sipHash128(1, 2, 3));
-- halfMD5 with the same integer arguments in two different orders, then with a nested tuple.
SELECT halfMD5(1, 2, 3);
SELECT halfMD5(1, 3, 2);
SELECT halfMD5(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2))));
@ -20,6 +26,9 @@ SELECT murmurHash3_64(1, 2, 3);
SELECT murmurHash3_64(1, 3, 2);
SELECT murmurHash3_64(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2))));
-- murmurHash3_128 called with two and three string arguments; each result is printed through hex().
SELECT hex(murmurHash3_128('foo', 'foo'));
SELECT hex(murmurHash3_128('foo', 'foo', 'foo'));
-- gccMurmurHash with the same integer arguments in two different orders, then with a nested tuple.
SELECT gccMurmurHash(1, 2, 3);
SELECT gccMurmurHash(1, 3, 2);
SELECT gccMurmurHash(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2))));