mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-29 02:52:13 +00:00
Merge pull request #3519 from yandex/better-behaviour-of-hash-functions
Fixed idiosyncrasy with hash functions introduced in #3451
This commit is contained in:
commit
cce68f5b40
@ -113,6 +113,10 @@ struct HalfMD5Impl
|
||||
UInt64 hashes[] = {h1, h2};
|
||||
return apply(reinterpret_cast<const char *>(hashes), 16);
|
||||
}
|
||||
|
||||
/// If true, it will use intHash32 or intHash64 to hash POD types. This behaviour is intended for better performance of some functions.
|
||||
/// Otherwise it will hash bytes in memory as a string using corresponding hash function.
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
};
|
||||
|
||||
struct MD5Impl
|
||||
@ -186,8 +190,9 @@ struct SipHash64Impl
|
||||
UInt64 hashes[] = {h1, h2};
|
||||
return apply(reinterpret_cast<const char *>(hashes), 16);
|
||||
}
|
||||
};
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
};
|
||||
|
||||
struct SipHash128Impl
|
||||
{
|
||||
@ -201,6 +206,156 @@ struct SipHash128Impl
|
||||
};
|
||||
|
||||
|
||||
/** Why do we need MurmurHash2?
|
||||
* MurmurHash2 is an outdated hash function, superseded by MurmurHash3 and subsequently by CityHash, xxHash, HighwayHash.
|
||||
* Usually there is no reason to use MurmurHash.
|
||||
* It is needed for the cases when you already have MurmurHash in some applications and you want to reproduce it
|
||||
* in ClickHouse as is. For example, it is needed to reproduce the behaviour
|
||||
* for NGINX a/b testing module: https://nginx.ru/en/docs/http/ngx_http_split_clients_module.html
|
||||
*/
|
||||
struct MurmurHash2Impl32
|
||||
{
|
||||
static constexpr auto name = "murmurHash2_32";
|
||||
|
||||
using ReturnType = UInt32;
|
||||
|
||||
static UInt32 apply(const char * data, const size_t size)
|
||||
{
|
||||
return MurmurHash2(data, size, 0);
|
||||
}
|
||||
|
||||
static UInt32 combineHashes(UInt32 h1, UInt32 h2)
|
||||
{
|
||||
return IntHash32Impl::apply(h1) ^ h2;
|
||||
}
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
};
|
||||
|
||||
struct MurmurHash2Impl64
|
||||
{
|
||||
static constexpr auto name = "murmurHash2_64";
|
||||
using ReturnType = UInt64;
|
||||
|
||||
static UInt64 apply(const char * data, const size_t size)
|
||||
{
|
||||
return MurmurHash64A(data, size, 0);
|
||||
}
|
||||
|
||||
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
|
||||
{
|
||||
return IntHash64Impl::apply(h1) ^ h2;
|
||||
}
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
};
|
||||
|
||||
struct MurmurHash3Impl32
|
||||
{
|
||||
static constexpr auto name = "murmurHash3_32";
|
||||
using ReturnType = UInt32;
|
||||
|
||||
static UInt32 apply(const char * data, const size_t size)
|
||||
{
|
||||
union
|
||||
{
|
||||
UInt32 h;
|
||||
char bytes[sizeof(h)];
|
||||
};
|
||||
MurmurHash3_x86_32(data, size, 0, bytes);
|
||||
return h;
|
||||
}
|
||||
|
||||
static UInt32 combineHashes(UInt32 h1, UInt32 h2)
|
||||
{
|
||||
return IntHash32Impl::apply(h1) ^ h2;
|
||||
}
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
};
|
||||
|
||||
struct MurmurHash3Impl64
|
||||
{
|
||||
static constexpr auto name = "murmurHash3_64";
|
||||
using ReturnType = UInt64;
|
||||
|
||||
static UInt64 apply(const char * data, const size_t size)
|
||||
{
|
||||
union
|
||||
{
|
||||
UInt64 h[2];
|
||||
char bytes[16];
|
||||
};
|
||||
MurmurHash3_x64_128(data, size, 0, bytes);
|
||||
return h[0] ^ h[1];
|
||||
}
|
||||
|
||||
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
|
||||
{
|
||||
return IntHash64Impl::apply(h1) ^ h2;
|
||||
}
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
};
|
||||
|
||||
struct MurmurHash3Impl128
|
||||
{
|
||||
static constexpr auto name = "murmurHash3_128";
|
||||
enum { length = 16 };
|
||||
|
||||
static void apply(const char * begin, const size_t size, unsigned char * out_char_data)
|
||||
{
|
||||
MurmurHash3_x64_128(begin, size, 0, out_char_data);
|
||||
}
|
||||
};
|
||||
|
||||
struct ImplCityHash64
|
||||
{
|
||||
static constexpr auto name = "cityHash64";
|
||||
using ReturnType = UInt64;
|
||||
using uint128_t = CityHash_v1_0_2::uint128;
|
||||
|
||||
static auto combineHashes(UInt64 h1, UInt64 h2) { return CityHash_v1_0_2::Hash128to64(uint128_t(h1, h2)); }
|
||||
static auto apply(const char * s, const size_t len) { return CityHash_v1_0_2::CityHash64(s, len); }
|
||||
static constexpr bool use_int_hash_for_pods = true;
|
||||
};
|
||||
|
||||
// see farmhash.h for definition of NAMESPACE_FOR_HASH_FUNCTIONS
|
||||
struct ImplFarmHash64
|
||||
{
|
||||
static constexpr auto name = "farmHash64";
|
||||
using ReturnType = UInt64;
|
||||
using uint128_t = NAMESPACE_FOR_HASH_FUNCTIONS::uint128_t;
|
||||
|
||||
static auto combineHashes(UInt64 h1, UInt64 h2) { return NAMESPACE_FOR_HASH_FUNCTIONS::Hash128to64(uint128_t(h1, h2)); }
|
||||
static auto apply(const char * s, const size_t len) { return NAMESPACE_FOR_HASH_FUNCTIONS::Hash64(s, len); }
|
||||
static constexpr bool use_int_hash_for_pods = true;
|
||||
};
|
||||
|
||||
struct ImplMetroHash64
|
||||
{
|
||||
static constexpr auto name = "metroHash64";
|
||||
using ReturnType = UInt64;
|
||||
using uint128_t = CityHash_v1_0_2::uint128;
|
||||
|
||||
static auto combineHashes(UInt64 h1, UInt64 h2) { return CityHash_v1_0_2::Hash128to64(uint128_t(h1, h2)); }
|
||||
static auto apply(const char * s, const size_t len)
|
||||
{
|
||||
union
|
||||
{
|
||||
UInt64 u64;
|
||||
UInt8 u8[sizeof(u64)];
|
||||
};
|
||||
|
||||
metrohash64_1(reinterpret_cast<const UInt8 *>(s), len, 0, u8);
|
||||
|
||||
return u64;
|
||||
}
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = true;
|
||||
};
|
||||
|
||||
|
||||
template <typename Impl>
|
||||
class FunctionStringHashFixedString : public IFunction
|
||||
{
|
||||
@ -259,12 +414,6 @@ public:
|
||||
};
|
||||
|
||||
|
||||
inline bool allowIntHash(const IDataType * data_type)
|
||||
{
|
||||
return data_type->isValueRepresentedByNumber();
|
||||
}
|
||||
|
||||
|
||||
template <typename Impl, typename Name>
|
||||
class FunctionIntHash : public IFunction
|
||||
{
|
||||
@ -308,7 +457,7 @@ public:
|
||||
|
||||
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
|
||||
{
|
||||
if (!allowIntHash(arguments[0].get()))
|
||||
if (!arguments[0]->isValueRepresentedByNumber())
|
||||
throw Exception("Illegal type " + arguments[0]->getName() + " of argument of function " + getName(),
|
||||
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
|
||||
|
||||
@ -359,10 +508,19 @@ private:
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
ToType h;
|
||||
if constexpr (std::is_same_v<ToType, UInt64>)
|
||||
h = IntHash64Impl::apply(ext::bit_cast<UInt64>(vec_from[i]));
|
||||
|
||||
if constexpr (Impl::use_int_hash_for_pods)
|
||||
{
|
||||
if constexpr (std::is_same_v<ToType, UInt64>)
|
||||
h = IntHash64Impl::apply(ext::bit_cast<UInt64>(vec_from[i]));
|
||||
else
|
||||
h = IntHash32Impl::apply(ext::bit_cast<UInt32>(vec_from[i]));
|
||||
}
|
||||
else
|
||||
h = IntHash32Impl::apply(ext::bit_cast<UInt32>(vec_from[i]));
|
||||
{
|
||||
h = Impl::apply(reinterpret_cast<const char *>(&vec_from[i]), sizeof(vec_from[i]));
|
||||
}
|
||||
|
||||
if (first)
|
||||
vec_to[i] = h;
|
||||
else
|
||||
@ -610,102 +768,6 @@ public:
|
||||
};
|
||||
|
||||
|
||||
/** Why do we need MurmurHash2?
|
||||
* MurmurHash2 is an outdated hash function, superseded by MurmurHash3 and subsequently by CityHash, xxHash, HighwayHash.
|
||||
* Usually there is no reason to use MurmurHash.
|
||||
* It is needed for the cases when you already have MurmurHash in some applications and you want to reproduce it
|
||||
* in ClickHouse as is. For example, it is needed to reproduce the behaviour
|
||||
* for NGINX a/b testing module: https://nginx.ru/en/docs/http/ngx_http_split_clients_module.html
|
||||
*/
|
||||
struct MurmurHash2Impl32
|
||||
{
|
||||
static constexpr auto name = "murmurHash2_32";
|
||||
|
||||
using ReturnType = UInt32;
|
||||
|
||||
static UInt32 apply(const char * data, const size_t size)
|
||||
{
|
||||
return MurmurHash2(data, size, 0);
|
||||
}
|
||||
|
||||
static UInt32 combineHashes(UInt32 h1, UInt32 h2)
|
||||
{
|
||||
return IntHash32Impl::apply(h1) ^ h2;
|
||||
}
|
||||
};
|
||||
|
||||
struct MurmurHash2Impl64
|
||||
{
|
||||
static constexpr auto name = "murmurHash2_64";
|
||||
using ReturnType = UInt64;
|
||||
|
||||
static UInt64 apply(const char * data, const size_t size)
|
||||
{
|
||||
return MurmurHash64A(data, size, 0);
|
||||
}
|
||||
|
||||
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
|
||||
{
|
||||
return IntHash64Impl::apply(h1) ^ h2;
|
||||
}
|
||||
};
|
||||
|
||||
struct MurmurHash3Impl32
|
||||
{
|
||||
static constexpr auto name = "murmurHash3_32";
|
||||
using ReturnType = UInt32;
|
||||
|
||||
static UInt32 apply(const char * data, const size_t size)
|
||||
{
|
||||
union
|
||||
{
|
||||
UInt32 h;
|
||||
char bytes[sizeof(h)];
|
||||
};
|
||||
MurmurHash3_x86_32(data, size, 0, bytes);
|
||||
return h;
|
||||
}
|
||||
|
||||
static UInt32 combineHashes(UInt32 h1, UInt32 h2)
|
||||
{
|
||||
return IntHash32Impl::apply(h1) ^ h2;
|
||||
}
|
||||
};
|
||||
|
||||
struct MurmurHash3Impl64
|
||||
{
|
||||
static constexpr auto name = "murmurHash3_64";
|
||||
using ReturnType = UInt64;
|
||||
|
||||
static UInt64 apply(const char * data, const size_t size)
|
||||
{
|
||||
union
|
||||
{
|
||||
UInt64 h[2];
|
||||
char bytes[16];
|
||||
};
|
||||
MurmurHash3_x64_128(data, size, 0, bytes);
|
||||
return h[0] ^ h[1];
|
||||
}
|
||||
|
||||
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
|
||||
{
|
||||
return IntHash64Impl::apply(h1) ^ h2;
|
||||
}
|
||||
};
|
||||
|
||||
struct MurmurHash3Impl128
|
||||
{
|
||||
static constexpr auto name = "murmurHash3_128";
|
||||
enum { length = 16 };
|
||||
|
||||
static void apply(const char * begin, const size_t size, unsigned char * out_char_data)
|
||||
{
|
||||
MurmurHash3_x64_128(begin, size, 0, out_char_data);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
struct URLHashImpl
|
||||
{
|
||||
static UInt64 apply(const char * data, const size_t size)
|
||||
@ -899,48 +961,6 @@ struct NameIntHash32 { static constexpr auto name = "intHash32"; };
|
||||
/// Holds the function name string "intHash64" as a compile-time constant.
struct NameIntHash64
{
    static constexpr auto name = "intHash64";
};
|
||||
|
||||
|
||||
struct ImplCityHash64
|
||||
{
|
||||
static constexpr auto name = "cityHash64";
|
||||
using ReturnType = UInt64;
|
||||
using uint128_t = CityHash_v1_0_2::uint128;
|
||||
|
||||
static auto combineHashes(UInt64 h1, UInt64 h2) { return CityHash_v1_0_2::Hash128to64(uint128_t(h1, h2)); }
|
||||
static auto apply(const char * s, const size_t len) { return CityHash_v1_0_2::CityHash64(s, len); }
|
||||
};
|
||||
|
||||
// see farmhash.h for definition of NAMESPACE_FOR_HASH_FUNCTIONS
|
||||
struct ImplFarmHash64
|
||||
{
|
||||
static constexpr auto name = "farmHash64";
|
||||
using ReturnType = UInt64;
|
||||
using uint128_t = NAMESPACE_FOR_HASH_FUNCTIONS::uint128_t;
|
||||
|
||||
static auto combineHashes(UInt64 h1, UInt64 h2) { return NAMESPACE_FOR_HASH_FUNCTIONS::Hash128to64(uint128_t(h1, h2)); }
|
||||
static auto apply(const char * s, const size_t len) { return NAMESPACE_FOR_HASH_FUNCTIONS::Hash64(s, len); }
|
||||
};
|
||||
|
||||
struct ImplMetroHash64
|
||||
{
|
||||
static constexpr auto name = "metroHash64";
|
||||
using ReturnType = UInt64;
|
||||
using uint128_t = CityHash_v1_0_2::uint128;
|
||||
|
||||
static auto combineHashes(UInt64 h1, UInt64 h2) { return CityHash_v1_0_2::Hash128to64(uint128_t(h1, h2)); }
|
||||
static auto apply(const char * s, const size_t len)
|
||||
{
|
||||
union
|
||||
{
|
||||
UInt64 u64;
|
||||
UInt8 u8[sizeof(u64)];
|
||||
};
|
||||
|
||||
metrohash64_1(reinterpret_cast<const UInt8 *>(s), len, 0, u8);
|
||||
|
||||
return u64;
|
||||
}
|
||||
};
|
||||
|
||||
/// Concrete function types: each alias plugs one hash implementation into a function class template.
using FunctionHalfMD5 = FunctionAnyHash<HalfMD5Impl>;
|
||||
using FunctionSipHash64 = FunctionAnyHash<SipHash64Impl>;
|
||||
/// FunctionIntHash additionally takes a separate type that carries the function name.
using FunctionIntHash32 = FunctionIntHash<IntHash32Impl, NameIntHash32>;
|
||||
|
@ -1,25 +1,26 @@
|
||||
3012058918
|
||||
1298551497
|
||||
864444010
|
||||
367840556
|
||||
623211862
|
||||
3533626746
|
||||
2388617433
|
||||
2708309598
|
||||
2414502773
|
||||
670491991
|
||||
1343103100
|
||||
0
|
||||
0
|
||||
0
|
||||
0
|
||||
1343103100
|
||||
1996614413
|
||||
0
|
||||
0
|
||||
0
|
||||
1
|
||||
1
|
||||
14834356025302342401
|
||||
12725806677685968135
|
||||
10577349846663553072
|
||||
12725806677685968135
|
||||
4138058784
|
||||
3831157163
|
||||
1343103100
|
||||
3831157163
|
||||
11303473983767132390
|
||||
956517343494314387
|
||||
10577349846663553072
|
||||
956517343494314387
|
||||
6145F501578671E2877DBA2BE487AF7E
|
||||
16FE7483905CCE7A85670E43E4678877
|
||||
|
@ -13,6 +13,7 @@ SELECT murmurHash2_32('\x03\0\0');
|
||||
SELECT murmurHash2_32(1);
|
||||
SELECT murmurHash2_32(toUInt16(2));
|
||||
|
||||
SELECT murmurHash2_32(2) = bitXor(toUInt32(0x5bd1e995 * bitXor(toUInt32(3 * 0x5bd1e995) AS a, bitShiftRight(a, 13))) AS b, bitShiftRight(b, 15));
|
||||
SELECT murmurHash2_32('\x02') = bitXor(toUInt32(0x5bd1e995 * bitXor(toUInt32(3 * 0x5bd1e995) AS a, bitShiftRight(a, 13))) AS b, bitShiftRight(b, 15));
|
||||
|
||||
SELECT murmurHash2_64('foo');
|
||||
|
@ -1,15 +1,15 @@
|
||||
8732148587615156034
|
||||
3856459458360415155
|
||||
1993857991550209231
|
||||
5465424717626995012
|
||||
15495040516566687427
|
||||
13266110974878256384
|
||||
617416965
|
||||
3293554683
|
||||
4210800467
|
||||
6847376565456338547
|
||||
15499510486101262177
|
||||
13552202417419166072
|
||||
6847376565456338547
|
||||
15499510486101262177
|
||||
14474638290107799038
|
||||
12940785793559895259
|
||||
17926972817233444501
|
||||
7456555839952096623
|
||||
955237314186186656
|
||||
8175794665478042155
|
||||
9325786087413524176
|
||||
2822869866
|
||||
1460833561
|
||||
222444531
|
||||
13951512892560982617
|
||||
4952008279444388047
|
||||
15509665835504406222
|
||||
8163029322371165472
|
||||
8788309436660676487
|
||||
236561483980029756
|
||||
|
@ -0,0 +1,7 @@
|
||||
1452224150530656417
|
||||
11717965186011240346
|
||||
13379111408315310133
|
||||
13379111408315310133
|
||||
13379111408315310133
|
||||
623211862
|
||||
9052087431341907723
|
7
dbms/tests/queries/0_stateless/00751_hashing_ints.sql
Normal file
7
dbms/tests/queries/0_stateless/00751_hashing_ints.sql
Normal file
@ -0,0 +1,7 @@
|
||||
-- Each statement hashes the same integer literal (123456) with a different hash function.
SELECT halfMD5(123456);
|
||||
SELECT sipHash64(123456);
|
||||
SELECT cityHash64(123456);
|
||||
SELECT farmHash64(123456);
|
||||
SELECT metroHash64(123456);
|
||||
SELECT murmurHash2_32(123456);
|
||||
SELECT murmurHash2_64(123456);
|
Loading…
Reference in New Issue
Block a user