From 7d74f26ce993261d8e99d9e5799bc1fa4a220300 Mon Sep 17 00:00:00 2001 From: nicelulu <821008736@qq.com> Date: Mon, 13 Sep 2021 16:20:20 +0800 Subject: [PATCH 1/3] 128bit hash-functions accepting arbitrary list of arguments add sipHash128 return UInt128. better test. better combineHashes better test Fix build --- src/Common/SipHash.h | 10 +++ src/Functions/FunctionsHashing.h | 63 +++++++++++++++---- .../queries/0_stateless/00678_murmurhash.sql | 1 - .../00746_hashing_tuples.reference | 13 +++- .../0_stateless/00746_hashing_tuples.sql | 9 +++ 5 files changed, 79 insertions(+), 17 deletions(-) diff --git a/src/Common/SipHash.h b/src/Common/SipHash.h index 559744abe72..e0d0b5ed631 100644 --- a/src/Common/SipHash.h +++ b/src/Common/SipHash.h @@ -18,6 +18,7 @@ #include #include #include +#include #define ROTL(x, b) static_cast(((x) << (b)) | ((x) >> (64 - (b)))) @@ -191,6 +192,15 @@ inline void sipHash128(const char * data, const size_t size, char * out) hash.get128(out); } +inline UInt128 sipHash128(const char * data, const size_t size) +{ + SipHash hash; + hash.update(data, size); + UInt128 res; + hash.get128(res); + return res; +} + inline UInt64 sipHash64(const char * data, const size_t size) { SipHash hash; diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 0e9f54229a5..0e0758272a3 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -105,6 +105,14 @@ struct IntHash64Impl } }; +template +T combineHashesFunc(T t1, T t2) +{ + T hashes[] = {t1, t2}; + return HashFunction::apply(reinterpret_cast(hashes), 2 * sizeof(T)); +} + + #if USE_SSL struct HalfMD5Impl { @@ -250,8 +258,7 @@ struct SipHash64Impl static UInt64 combineHashes(UInt64 h1, UInt64 h2) { - UInt64 hashes[] = {h1, h2}; - return apply(reinterpret_cast(hashes), 16); + return combineHashesFunc(h1, h2); } static constexpr bool use_int_hash_for_pods = false; @@ -260,12 +267,20 @@ struct SipHash64Impl struct SipHash128Impl { static constexpr auto name = "sipHash128"; - enum { length = 16 }; - static void apply(const char * begin, const size_t size, unsigned char * out_char_data) + using ReturnType = UInt128; + + static UInt128 combineHashes(UInt128 h1, UInt128 h2) { - sipHash128(begin, size, reinterpret_cast(out_char_data)); + return combineHashesFunc(h1, h2); } + + static UInt128 apply(const char * data, const size_t size) + { + return sipHash128(data, size); + } + + static constexpr bool use_int_hash_for_pods = false; }; #if !defined(ARCADIA_BUILD) @@ -350,7 +365,7 @@ struct MurmurHash3Impl32 static UInt32 combineHashes(UInt32 h1, UInt32 h2) { - return IntHash32Impl::apply(h1) ^ h2; + return combineHashesFunc(h1, h2); } static constexpr bool use_int_hash_for_pods = false; @@ -374,7 +389,7 @@ struct MurmurHash3Impl64 static UInt64 combineHashes(UInt64 h1, UInt64 h2) { - return IntHash64Impl::apply(h1) ^ h2; + return combineHashesFunc(h1, h2); } static constexpr bool use_int_hash_for_pods = false; @@ -383,12 +398,22 @@ struct MurmurHash3Impl64 struct MurmurHash3Impl128 { static constexpr auto name = "murmurHash3_128"; - enum { length = 16 }; - static void apply(const char * begin, const size_t size, unsigned char * out_char_data) + using ReturnType = UInt128; + + static UInt128 combineHashes(UInt128 h1, UInt128 h2) { - MurmurHash3_x64_128(begin, size, 0, out_char_data); + return combineHashesFunc(h1, h2); } + + static UInt128 apply(const char * data, const size_t size) + { + char bytes[16]; + MurmurHash3_x64_128(data, size, 0, bytes); + return *reinterpret_cast(bytes); + } + + static constexpr bool use_int_hash_for_pods = false; }; #endif @@ -1097,7 +1122,12 @@ public: DataTypePtr getReturnTypeImpl(const DataTypes & /*arguments*/) const override { - return std::make_shared>(); + if constexpr (std::is_same_v) /// backward-compatible + { + return std::make_shared(sizeof(UInt128)); + } + else + return std::make_shared>(); } ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override @@ -1119,6 +1149,13 @@ public: for (const auto & col : arguments) executeForArgument(col.type.get(), col.column.get(), vec_to, is_first_argument); + if constexpr (std::is_same_v) /// backward-compatible + { + auto col_to_fixed_string = ColumnFixedString::create(sizeof(UInt128)); + col_to_fixed_string->getChars() = std::move(*reinterpret_cast(&col_to->getData())); + return col_to_fixed_string; + } + return col_to; } }; @@ -1363,7 +1400,7 @@ using FunctionSHA256 = FunctionStringHashFixedString; using FunctionSHA384 = FunctionStringHashFixedString; using FunctionSHA512 = FunctionStringHashFixedString; #endif -using FunctionSipHash128 = FunctionStringHashFixedString; +using FunctionSipHash128 = FunctionAnyHash; using FunctionCityHash64 = FunctionAnyHash; using FunctionFarmFingerprint64 = FunctionAnyHash; using FunctionFarmHash64 = FunctionAnyHash; @@ -1375,7 +1412,7 @@ using FunctionMurmurHash2_64 = FunctionAnyHash; using FunctionGccMurmurHash = FunctionAnyHash; using FunctionMurmurHash3_32 = FunctionAnyHash; using FunctionMurmurHash3_64 = FunctionAnyHash; -using FunctionMurmurHash3_128 = FunctionStringHashFixedString; +using FunctionMurmurHash3_128 = FunctionAnyHash; #endif using FunctionJavaHash = FunctionAnyHash; diff --git a/tests/queries/0_stateless/00678_murmurhash.sql b/tests/queries/0_stateless/00678_murmurhash.sql index 91b4deef9b3..705c62480a0 100644 --- a/tests/queries/0_stateless/00678_murmurhash.sql +++ b/tests/queries/0_stateless/00678_murmurhash.sql @@ -34,4 +34,3 @@ SELECT gccMurmurHash(1); SELECT hex(murmurHash3_128('foo')); SELECT hex(murmurHash3_128('\x01')); - diff --git a/tests/queries/0_stateless/00746_hashing_tuples.reference b/tests/queries/0_stateless/00746_hashing_tuples.reference index ebb03034add..9a28f560cce 100644 --- a/tests/queries/0_stateless/00746_hashing_tuples.reference +++ b/tests/queries/0_stateless/00746_hashing_tuples.reference @@ -1,6 +1,11 @@ 12940785793559895259 17926972817233444501 7456555839952096623 +CC45107CC4B79F62D831BEF2103C7CBF +DF2EC2F0669B000EDFF6ADEE264E7D68 +4CD1C30C38AB935D418B5269EF197B9E +9D78134EE48654D753CCA1B76185CF8E +389D16428D2AADEC9713905572F42864 955237314186186656 8175794665478042155 9325786087413524176 @@ -10,9 +15,11 @@ 13951512892560982617 4952008279444388047 15509665835504406222 -8163029322371165472 -8788309436660676487 -236561483980029756 +17115680070506536913 +8310530586050490380 +9559168348687039888 +8DD5527CC43D76F4760D26BE0F641F7E +F8F7AD9B6CD4CF117A71E277E2EC2931 12384823029245979431 4507350192761038840 1188926775431157506 diff --git a/tests/queries/0_stateless/00746_hashing_tuples.sql b/tests/queries/0_stateless/00746_hashing_tuples.sql index fe6c7e373b4..466a2184c65 100644 --- a/tests/queries/0_stateless/00746_hashing_tuples.sql +++ b/tests/queries/0_stateless/00746_hashing_tuples.sql @@ -4,6 +4,12 @@ SELECT sipHash64(1, 2, 3); SELECT sipHash64(1, 3, 2); SELECT sipHash64(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2)))); +SELECT hex(sipHash128('foo')); +SELECT hex(sipHash128('\x01')); +SELECT hex(sipHash128('foo', 'foo')); +SELECT hex(sipHash128('foo', 'foo', 'foo')); +SELECT hex(sipHash128(1, 2, 3)); + SELECT halfMD5(1, 2, 3); SELECT halfMD5(1, 3, 2); SELECT halfMD5(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2)))); @@ -20,6 +26,9 @@ SELECT murmurHash3_64(1, 2, 3); SELECT murmurHash3_64(1, 3, 2); SELECT murmurHash3_64(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2)))); +SELECT hex(murmurHash3_128('foo', 'foo')); +SELECT hex(murmurHash3_128('foo', 'foo', 'foo')); + SELECT gccMurmurHash(1, 2, 3); SELECT gccMurmurHash(1, 3, 2); SELECT gccMurmurHash(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2)))); \ No newline at end of file From e1c3f2dd481d2d6b8d53928668da9893d119ab8a Mon Sep 17 00:00:00 2001 From: zhangxiao871 <821008736@qq.com> Date: Mon, 11 Oct 2021 11:50:43 +0800 Subject: [PATCH 2/3] RollBack murmurHash3_32 murmurHash3_64 --- src/Functions/FunctionsHashing.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Functions/FunctionsHashing.h b/src/Functions/FunctionsHashing.h index 0e0758272a3..b39158d864d 100644 --- a/src/Functions/FunctionsHashing.h +++ b/src/Functions/FunctionsHashing.h @@ -365,7 +365,7 @@ struct MurmurHash3Impl32 static UInt32 combineHashes(UInt32 h1, UInt32 h2) { - return combineHashesFunc(h1, h2); + return IntHash32Impl::apply(h1) ^ h2; } static constexpr bool use_int_hash_for_pods = false; @@ -389,7 +389,7 @@ struct MurmurHash3Impl64 static UInt64 combineHashes(UInt64 h1, UInt64 h2) { - return combineHashesFunc(h1, h2); + return IntHash64Impl::apply(h1) ^ h2; } static constexpr bool use_int_hash_for_pods = false; @@ -401,11 +401,6 @@ struct MurmurHash3Impl128 using ReturnType = UInt128; - static UInt128 combineHashes(UInt128 h1, UInt128 h2) - { - return combineHashesFunc(h1, h2); - } - static UInt128 apply(const char * data, const size_t size) { char bytes[16]; @@ -413,6 +408,11 @@ struct MurmurHash3Impl128 return *reinterpret_cast(bytes); } + static UInt128 combineHashes(UInt128 h1, UInt128 h2) + { + return combineHashesFunc(h1, h2); + } + static constexpr bool use_int_hash_for_pods = false; }; #endif From 1da8a019adce8065f62e2503dbe5fb88eb87064a Mon Sep 17 00:00:00 2001 From: zhangxiao871 <821008736@qq.com> Date: Mon, 11 Oct 2021 15:47:28 +0800 Subject: [PATCH 3/3] gen test --- tests/queries/0_stateless/00746_hashing_tuples.reference | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/00746_hashing_tuples.reference b/tests/queries/0_stateless/00746_hashing_tuples.reference index 9a28f560cce..71d45be5a54 100644 --- a/tests/queries/0_stateless/00746_hashing_tuples.reference +++ b/tests/queries/0_stateless/00746_hashing_tuples.reference @@ -15,9 +15,9 @@ DF2EC2F0669B000EDFF6ADEE264E7D68 13951512892560982617 4952008279444388047 15509665835504406222 -17115680070506536913 -8310530586050490380 -9559168348687039888 +8163029322371165472 +8788309436660676487 +236561483980029756 8DD5527CC43D76F4760D26BE0F641F7E F8F7AD9B6CD4CF117A71E277E2EC2931 12384823029245979431