mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-23 08:02:02 +00:00
Merge pull request #52712 from ClibMouse/feature/hashing-big-endian-support
This commit is contained in:
commit
078eadc473
@ -17,7 +17,8 @@
|
||||
#ifndef METROHASH_PLATFORM_H
|
||||
#define METROHASH_PLATFORM_H
|
||||
|
||||
#include <stdint.h>
|
||||
#include <bit>
|
||||
#include <cstdint>
|
||||
#include <cstring>
|
||||
|
||||
// rotate right idiom recognized by most compilers
|
||||
@ -33,6 +34,11 @@ inline static uint64_t read_u64(const void * const ptr)
|
||||
// so we use memcpy(), which is the most portable. clang & gcc usually translate `memcpy()` into a single `load` instruction
|
||||
// when hardware supports it, so using memcpy() is efficient too.
|
||||
memcpy(&result, ptr, sizeof(result));
|
||||
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
result = std::byteswap(result);
|
||||
#endif
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -40,6 +46,11 @@ inline static uint64_t read_u32(const void * const ptr)
|
||||
{
|
||||
uint32_t result;
|
||||
memcpy(&result, ptr, sizeof(result));
|
||||
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
result = std::byteswap(result);
|
||||
#endif
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -47,6 +58,11 @@ inline static uint64_t read_u16(const void * const ptr)
|
||||
{
|
||||
uint16_t result;
|
||||
memcpy(&result, ptr, sizeof(result));
|
||||
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
result = std::byteswap(result);
|
||||
#endif
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
@ -153,15 +153,10 @@ struct IntHash64Impl
|
||||
template<typename T, typename HashFunction>
|
||||
T combineHashesFunc(T t1, T t2)
|
||||
{
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
T tmp;
|
||||
reverseMemcpy(&tmp, &t1, sizeof(T));
|
||||
t1 = tmp;
|
||||
reverseMemcpy(&tmp, &t2, sizeof(T));
|
||||
t2 = tmp;
|
||||
#endif
|
||||
T hashes[] = {t1, t2};
|
||||
return HashFunction::apply(reinterpret_cast<const char *>(hashes), 2 * sizeof(T));
|
||||
transformEndianness<std::endian::little>(t1);
|
||||
transformEndianness<std::endian::little>(t2);
|
||||
const T hashes[] {t1, t2};
|
||||
return HashFunction::apply(reinterpret_cast<const char *>(hashes), sizeof(hashes));
|
||||
}
|
||||
|
||||
|
||||
@ -184,21 +179,14 @@ struct HalfMD5Impl
|
||||
MD5_Update(&ctx, reinterpret_cast<const unsigned char *>(begin), size);
|
||||
MD5_Final(buf.char_data, &ctx);
|
||||
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
return buf.uint64_data; /// No need to flip bytes on big endian machines
|
||||
#else
|
||||
return std::byteswap(buf.uint64_data); /// Compatibility with existing code. Cast need for old poco AND macos where UInt64 != uint64_t
|
||||
#endif
|
||||
/// Compatibility with existing code. Cast is needed for old Poco and macOS, where UInt64 != uint64_t
|
||||
transformEndianness<std::endian::big>(buf.uint64_data);
|
||||
return buf.uint64_data;
|
||||
}
|
||||
|
||||
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
|
||||
{
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
h1 = std::byteswap(h1);
|
||||
h2 = std::byteswap(h2);
|
||||
#endif
|
||||
UInt64 hashes[] = {h1, h2};
|
||||
return apply(reinterpret_cast<const char *>(hashes), 16);
|
||||
return combineHashesFunc<UInt64, HalfMD5Impl>(h1, h2);
|
||||
}
|
||||
|
||||
/// If true, it will use intHash32 or intHash64 to hash POD types. This behaviour is intended for better performance of some functions.
|
||||
@ -311,15 +299,8 @@ struct SipHash64Impl
|
||||
static constexpr auto name = "sipHash64";
|
||||
using ReturnType = UInt64;
|
||||
|
||||
static UInt64 apply(const char * begin, size_t size)
|
||||
{
|
||||
return sipHash64(begin, size);
|
||||
}
|
||||
|
||||
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
|
||||
{
|
||||
return combineHashesFunc<UInt64, SipHash64Impl>(h1, h2);
|
||||
}
|
||||
static UInt64 apply(const char * begin, size_t size) { return sipHash64(begin, size); }
|
||||
static UInt64 combineHashes(UInt64 h1, UInt64 h2) { return combineHashesFunc<UInt64, SipHash64Impl>(h1, h2); }
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
};
|
||||
@ -336,12 +317,10 @@ struct SipHash64KeyedImpl
|
||||
|
||||
static UInt64 combineHashesKeyed(const Key & key, UInt64 h1, UInt64 h2)
|
||||
{
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
h1 = std::byteswap(h1);
|
||||
h2 = std::byteswap(h2);
|
||||
#endif
|
||||
UInt64 hashes[] = {h1, h2};
|
||||
return applyKeyed(key, reinterpret_cast<const char *>(hashes), 2 * sizeof(UInt64));
|
||||
transformEndianness<std::endian::little>(h1);
|
||||
transformEndianness<std::endian::little>(h2);
|
||||
const UInt64 hashes[]{h1, h2};
|
||||
return applyKeyed(key, reinterpret_cast<const char *>(hashes), sizeof(hashes));
|
||||
}
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
@ -353,15 +332,8 @@ struct SipHash128Impl
|
||||
|
||||
using ReturnType = UInt128;
|
||||
|
||||
static UInt128 combineHashes(UInt128 h1, UInt128 h2)
|
||||
{
|
||||
return combineHashesFunc<UInt128, SipHash128Impl>(h1, h2);
|
||||
}
|
||||
|
||||
static UInt128 apply(const char * data, const size_t size)
|
||||
{
|
||||
return sipHash128(data, size);
|
||||
}
|
||||
static UInt128 combineHashes(UInt128 h1, UInt128 h2) { return combineHashesFunc<UInt128, SipHash128Impl>(h1, h2); }
|
||||
static UInt128 apply(const char * data, const size_t size) { return sipHash128(data, size); }
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
};
|
||||
@ -378,15 +350,10 @@ struct SipHash128KeyedImpl
|
||||
|
||||
static UInt128 combineHashesKeyed(const Key & key, UInt128 h1, UInt128 h2)
|
||||
{
|
||||
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
|
||||
UInt128 tmp;
|
||||
reverseMemcpy(&tmp, &h1, sizeof(UInt128));
|
||||
h1 = tmp;
|
||||
reverseMemcpy(&tmp, &h2, sizeof(UInt128));
|
||||
h2 = tmp;
|
||||
#endif
|
||||
UInt128 hashes[] = {h1, h2};
|
||||
return applyKeyed(key, reinterpret_cast<const char *>(hashes), 2 * sizeof(UInt128));
|
||||
transformEndianness<std::endian::little>(h1);
|
||||
transformEndianness<std::endian::little>(h2);
|
||||
const UInt128 hashes[]{h1, h2};
|
||||
return applyKeyed(key, reinterpret_cast<const char *>(hashes), sizeof(hashes));
|
||||
}
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
@ -531,10 +498,7 @@ struct MurmurHash3Impl64
|
||||
return h[0] ^ h[1];
|
||||
}
|
||||
|
||||
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
|
||||
{
|
||||
return IntHash64Impl::apply(h1) ^ h2;
|
||||
}
|
||||
static UInt64 combineHashes(UInt64 h1, UInt64 h2) { return IntHash64Impl::apply(h1) ^ h2; }
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
};
|
||||
@ -552,10 +516,7 @@ struct MurmurHash3Impl128
|
||||
return *reinterpret_cast<UInt128 *>(bytes);
|
||||
}
|
||||
|
||||
static UInt128 combineHashes(UInt128 h1, UInt128 h2)
|
||||
{
|
||||
return combineHashesFunc<UInt128, MurmurHash3Impl128>(h1, h2);
|
||||
}
|
||||
static UInt128 combineHashes(UInt128 h1, UInt128 h2) { return combineHashesFunc<UInt128, MurmurHash3Impl128>(h1, h2); }
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
};
|
||||
@ -1040,11 +1001,10 @@ private:
|
||||
if (const ColVecType * col_from = checkAndGetColumn<ColVecType>(column))
|
||||
{
|
||||
const typename ColVecType::Container & vec_from = col_from->getData();
|
||||
size_t size = vec_from.size();
|
||||
const size_t size = vec_from.size();
|
||||
for (size_t i = 0; i < size; ++i)
|
||||
{
|
||||
ToType hash;
|
||||
|
||||
if constexpr (Impl::use_int_hash_for_pods)
|
||||
{
|
||||
if constexpr (std::is_same_v<ToType, UInt64>)
|
||||
@ -1058,13 +1018,8 @@ private:
|
||||
hash = JavaHashImpl::apply(vec_from[i]);
|
||||
else
|
||||
{
|
||||
FromType value = vec_from[i];
|
||||
if constexpr (std::endian::native == std::endian::big)
|
||||
{
|
||||
FromType value_reversed;
|
||||
reverseMemcpy(&value_reversed, &value, sizeof(value));
|
||||
value = value_reversed;
|
||||
}
|
||||
auto value = vec_from[i];
|
||||
transformEndianness<std::endian::little>(value);
|
||||
hash = apply(key, reinterpret_cast<const char *>(&value), sizeof(value));
|
||||
}
|
||||
}
|
||||
@ -1078,8 +1033,8 @@ private:
|
||||
else if (auto col_from_const = checkAndGetColumnConst<ColVecType>(column))
|
||||
{
|
||||
auto value = col_from_const->template getValue<FromType>();
|
||||
ToType hash;
|
||||
|
||||
ToType hash;
|
||||
if constexpr (Impl::use_int_hash_for_pods)
|
||||
{
|
||||
if constexpr (std::is_same_v<ToType, UInt64>)
|
||||
@ -1093,17 +1048,12 @@ private:
|
||||
hash = JavaHashImpl::apply(value);
|
||||
else
|
||||
{
|
||||
if constexpr (std::endian::native == std::endian::big)
|
||||
{
|
||||
FromType value_reversed;
|
||||
reverseMemcpy(&value_reversed, &value, sizeof(value));
|
||||
value = value_reversed;
|
||||
}
|
||||
transformEndianness<std::endian::little>(value);
|
||||
hash = apply(key, reinterpret_cast<const char *>(&value), sizeof(value));
|
||||
}
|
||||
}
|
||||
|
||||
size_t size = vec_to.size();
|
||||
const size_t size = vec_to.size();
|
||||
if constexpr (first)
|
||||
vec_to.assign(size, hash);
|
||||
else
|
||||
@ -1120,6 +1070,16 @@ private:
|
||||
{
|
||||
using ColVecType = ColumnVectorOrDecimal<FromType>;
|
||||
|
||||
static const auto to_little_endian = [](auto & value)
|
||||
{
|
||||
// IPv6 addresses are parsed into four 32-bit components in big-endian ordering on both platforms, so no change is necessary.
|
||||
// Reference: `parseIPv6orIPv4` in src/Common/formatIPv6.h.
|
||||
if constexpr (std::endian::native == std::endian::big && std::is_same_v<std::remove_reference_t<decltype(value)>, IPv6>)
|
||||
return;
|
||||
|
||||
transformEndianness<std::endian::little>(value);
|
||||
};
|
||||
|
||||
if (const ColVecType * col_from = checkAndGetColumn<ColVecType>(column))
|
||||
{
|
||||
const typename ColVecType::Container & vec_from = col_from->getData();
|
||||
@ -1131,9 +1091,10 @@ private:
|
||||
hash = apply(key, reinterpret_cast<const char *>(&vec_from[i]), sizeof(vec_from[i]));
|
||||
else
|
||||
{
|
||||
char tmp_buffer[sizeof(vec_from[i])];
|
||||
reverseMemcpy(tmp_buffer, &vec_from[i], sizeof(vec_from[i]));
|
||||
hash = apply(key, reinterpret_cast<const char *>(tmp_buffer), sizeof(vec_from[i]));
|
||||
auto value = vec_from[i];
|
||||
to_little_endian(value);
|
||||
|
||||
hash = apply(key, reinterpret_cast<const char *>(&value), sizeof(value));
|
||||
}
|
||||
if constexpr (first)
|
||||
vec_to[i] = hash;
|
||||
@ -1144,17 +1105,10 @@ private:
|
||||
else if (auto col_from_const = checkAndGetColumnConst<ColVecType>(column))
|
||||
{
|
||||
auto value = col_from_const->template getValue<FromType>();
|
||||
to_little_endian(value);
|
||||
|
||||
ToType hash;
|
||||
if constexpr (std::endian::native == std::endian::little)
|
||||
hash = apply(key, reinterpret_cast<const char *>(&value), sizeof(value));
|
||||
else
|
||||
{
|
||||
char tmp_buffer[sizeof(value)];
|
||||
reverseMemcpy(tmp_buffer, &value, sizeof(value));
|
||||
hash = apply(key, reinterpret_cast<const char *>(tmp_buffer), sizeof(value));
|
||||
}
|
||||
size_t size = vec_to.size();
|
||||
const auto hash = apply(key, reinterpret_cast<const char *>(&value), sizeof(value));
|
||||
const size_t size = vec_to.size();
|
||||
if constexpr (first)
|
||||
vec_to.assign(size, hash);
|
||||
else
|
||||
@ -1423,6 +1377,9 @@ public:
|
||||
|
||||
if constexpr (std::is_same_v<ToType, UInt128>) /// backward-compatible
|
||||
{
|
||||
if (std::endian::native == std::endian::big)
|
||||
std::ranges::for_each(col_to->getData(), transformEndianness<std::endian::little, ToType>);
|
||||
|
||||
auto col_to_fixed_string = ColumnFixedString::create(sizeof(UInt128));
|
||||
const auto & data = col_to->getData();
|
||||
auto & chars = col_to_fixed_string->getChars();
|
||||
@ -1676,21 +1633,8 @@ struct ImplWyHash64
|
||||
static constexpr auto name = "wyHash64";
|
||||
using ReturnType = UInt64;
|
||||
|
||||
static UInt64 apply(const char * s, const size_t len)
|
||||
{
|
||||
return wyhash(s, len, 0, _wyp);
|
||||
}
|
||||
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
|
||||
{
|
||||
union
|
||||
{
|
||||
UInt64 u64[2];
|
||||
char chars[16];
|
||||
};
|
||||
u64[0] = h1;
|
||||
u64[1] = h2;
|
||||
return apply(chars, 16);
|
||||
}
|
||||
static UInt64 apply(const char * s, const size_t len) { return wyhash(s, len, 0, _wyp); }
|
||||
static UInt64 combineHashes(UInt64 h1, UInt64 h2) { return combineHashesFunc<UInt64, ImplWyHash64>(h1, h2); }
|
||||
|
||||
static constexpr bool use_int_hash_for_pods = false;
|
||||
};
|
||||
|
@ -25,5 +25,5 @@
|
||||
9631199822919835226
|
||||
4334672815104069193
|
||||
4334672815104069193
|
||||
1
|
||||
1
|
||||
6145F501578671E2877DBA2BE487AF7E
|
||||
16FE7483905CCE7A85670E43E4678877
|
||||
|
@ -32,7 +32,5 @@ SELECT gccMurmurHash('foo');
|
||||
SELECT gccMurmurHash('\x01');
|
||||
SELECT gccMurmurHash(1);
|
||||
|
||||
-- Comparison with reverse for big endian
|
||||
SELECT hex(murmurHash3_128('foo')) = hex(reverse(unhex('6145F501578671E2877DBA2BE487AF7E'))) or hex(murmurHash3_128('foo')) = '6145F501578671E2877DBA2BE487AF7E';
|
||||
-- Comparison with reverse for big endian
|
||||
SELECT hex(murmurHash3_128('\x01')) = hex(reverse(unhex('16FE7483905CCE7A85670E43E4678877'))) or hex(murmurHash3_128('\x01')) = '16FE7483905CCE7A85670E43E4678877';
|
||||
SELECT hex(murmurHash3_128('foo'));
|
||||
SELECT hex(murmurHash3_128('\x01'));
|
||||
|
@ -1,11 +1,11 @@
|
||||
12940785793559895259
|
||||
17926972817233444501
|
||||
7456555839952096623
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
1
|
||||
CC45107CC4B79F62D831BEF2103C7CBF
|
||||
DF2EC2F0669B000EDFF6ADEE264E7D68
|
||||
4CD1C30C38AB935D418B5269EF197B9E
|
||||
9D78134EE48654D753CCA1B76185CF8E
|
||||
389D16428D2AADEC9713905572F42864
|
||||
955237314186186656
|
||||
8175794665478042155
|
||||
9325786087413524176
|
||||
@ -18,8 +18,8 @@
|
||||
8163029322371165472
|
||||
8788309436660676487
|
||||
236561483980029756
|
||||
1
|
||||
1
|
||||
8DD5527CC43D76F4760D26BE0F641F7E
|
||||
F8F7AD9B6CD4CF117A71E277E2EC2931
|
||||
12384823029245979431
|
||||
4507350192761038840
|
||||
1188926775431157506
|
||||
|
@ -4,11 +4,11 @@ SELECT sipHash64(1, 2, 3);
|
||||
SELECT sipHash64(1, 3, 2);
|
||||
SELECT sipHash64(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2))));
|
||||
|
||||
SELECT hex(sipHash128('foo')) = hex(reverse(unhex('CC45107CC4B79F62D831BEF2103C7CBF'))) or hex(sipHash128('foo')) = 'CC45107CC4B79F62D831BEF2103C7CBF';
|
||||
SELECT hex(sipHash128('\x01')) = hex(reverse(unhex('DF2EC2F0669B000EDFF6ADEE264E7D68'))) or hex(sipHash128('\x01')) = 'DF2EC2F0669B000EDFF6ADEE264E7D68';
|
||||
SELECT hex(sipHash128('foo', 'foo')) = hex(reverse(unhex('4CD1C30C38AB935D418B5269EF197B9E'))) or hex(sipHash128('foo', 'foo')) = '4CD1C30C38AB935D418B5269EF197B9E';
|
||||
SELECT hex(sipHash128('foo', 'foo', 'foo')) = hex(reverse(unhex('9D78134EE48654D753CCA1B76185CF8E'))) or hex(sipHash128('foo', 'foo', 'foo')) = '9D78134EE48654D753CCA1B76185CF8E';
|
||||
SELECT hex(sipHash128(1, 2, 3)) = hex(reverse(unhex('389D16428D2AADEC9713905572F42864'))) or hex(sipHash128(1, 2, 3)) = '389D16428D2AADEC9713905572F42864';
|
||||
SELECT hex(sipHash128('foo'));
|
||||
SELECT hex(sipHash128('\x01'));
|
||||
SELECT hex(sipHash128('foo', 'foo'));
|
||||
SELECT hex(sipHash128('foo', 'foo', 'foo'));
|
||||
SELECT hex(sipHash128(1, 2, 3));
|
||||
|
||||
SELECT halfMD5(1, 2, 3);
|
||||
SELECT halfMD5(1, 3, 2);
|
||||
@ -26,8 +26,8 @@ SELECT murmurHash3_64(1, 2, 3);
|
||||
SELECT murmurHash3_64(1, 3, 2);
|
||||
SELECT murmurHash3_64(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2))));
|
||||
|
||||
SELECT hex(murmurHash3_128('foo', 'foo')) = hex(reverse(unhex('8DD5527CC43D76F4760D26BE0F641F7E'))) or hex(murmurHash3_128('foo', 'foo')) = '8DD5527CC43D76F4760D26BE0F641F7E';
|
||||
SELECT hex(murmurHash3_128('foo', 'foo', 'foo')) = hex(reverse(unhex('F8F7AD9B6CD4CF117A71E277E2EC2931'))) or hex(murmurHash3_128('foo', 'foo', 'foo')) = 'F8F7AD9B6CD4CF117A71E277E2EC2931';
|
||||
SELECT hex(murmurHash3_128('foo', 'foo'));
|
||||
SELECT hex(murmurHash3_128('foo', 'foo', 'foo'));
|
||||
|
||||
SELECT gccMurmurHash(1, 2, 3);
|
||||
SELECT gccMurmurHash(1, 3, 2);
|
||||
|
@ -191,9 +191,9 @@ E51B38608EF25F57
|
||||
1
|
||||
1
|
||||
E28DBDE7FE22E41C
|
||||
1
|
||||
1CE422FEE7BD8DE20000000000000000
|
||||
E28DBDE7FE22E41C
|
||||
1
|
||||
1CE422FEE7BD8DE20000000000000000
|
||||
Check bug with hashing of const integer values
|
||||
11862823756610506724
|
||||
11862823756610506724
|
||||
|
@ -269,9 +269,9 @@ select sipHash64Keyed(toUInt64(0), '1'); -- { serverError 48 }
|
||||
select sipHash128Keyed(toUInt64(0), '1'); -- { serverError 48 }
|
||||
|
||||
select hex(sipHash64());
|
||||
SELECT hex(sipHash128()) = hex(reverse(unhex('1CE422FEE7BD8DE20000000000000000'))) or hex(sipHash128()) = '1CE422FEE7BD8DE20000000000000000';
|
||||
SELECT hex(sipHash128());
|
||||
select hex(sipHash64Keyed());
|
||||
SELECT hex(sipHash128Keyed()) = hex(reverse(unhex('1CE422FEE7BD8DE20000000000000000'))) or hex(sipHash128Keyed()) = '1CE422FEE7BD8DE20000000000000000';
|
||||
SELECT hex(sipHash128Keyed());
|
||||
|
||||
SELECT 'Check bug with hashing of const integer values';
|
||||
DROP TABLE IF EXISTS tab;
|
||||
|
@ -1 +1 @@
|
||||
1
|
||||
1CE422FEE7BD8DE20000000000000000
|
||||
|
@ -1 +1 @@
|
||||
SELECT hex(sipHash128Reference()) = hex(reverse(unhex('1CE422FEE7BD8DE20000000000000000'))) or hex(sipHash128()) = '1CE422FEE7BD8DE20000000000000000';
|
||||
SELECT hex(sipHash128Reference());
|
||||
|
Loading…
Reference in New Issue
Block a user