Merge pull request #52712 from ClibMouse/feature/hashing-big-endian-support

This commit is contained in:
vdimir 2023-08-02 11:09:03 +02:00 committed by GitHub
commit 078eadc473
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 91 additions and 133 deletions

View File

@ -17,7 +17,8 @@
#ifndef METROHASH_PLATFORM_H
#define METROHASH_PLATFORM_H
#include <stdint.h>
#include <bit>
#include <cstdint>
#include <cstring>
// rotate right idiom recognized by most compilers
@ -33,6 +34,11 @@ inline static uint64_t read_u64(const void * const ptr)
// so we use memcpy(), which is the most portable. clang & gcc usually translate `memcpy()` into a single `load` instruction
// when hardware supports it, so using memcpy() is efficient too.
memcpy(&result, ptr, sizeof(result));
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
result = std::byteswap(result);
#endif
return result;
}
@ -40,6 +46,11 @@ inline static uint64_t read_u32(const void * const ptr)
{
// Read 4 bytes from `ptr` via memcpy into a local uint32_t.
uint32_t result;
memcpy(&result, ptr, sizeof(result));
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
// On big-endian targets, reverse the bytes of the loaded value so the function
// yields the same number a little-endian host would produce for the same input bytes.
result = std::byteswap(result);
#endif
// The 32-bit value is widened to the function's uint64_t return type.
return result;
}
@ -47,6 +58,11 @@ inline static uint64_t read_u16(const void * const ptr)
{
// Read 2 bytes from `ptr` via memcpy into a local uint16_t.
uint16_t result;
memcpy(&result, ptr, sizeof(result));
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
// On big-endian targets, reverse the bytes of the loaded value so the function
// yields the same number a little-endian host would produce for the same input bytes.
result = std::byteswap(result);
#endif
// The 16-bit value is widened to the function's uint64_t return type.
return result;
}

View File

@ -153,15 +153,10 @@ struct IntHash64Impl
template<typename T, typename HashFunction>
T combineHashesFunc(T t1, T t2)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
T tmp;
reverseMemcpy(&tmp, &t1, sizeof(T));
t1 = tmp;
reverseMemcpy(&tmp, &t2, sizeof(T));
t2 = tmp;
#endif
T hashes[] = {t1, t2};
return HashFunction::apply(reinterpret_cast<const char *>(hashes), 2 * sizeof(T));
transformEndianness<std::endian::little>(t1);
transformEndianness<std::endian::little>(t2);
const T hashes[] {t1, t2};
return HashFunction::apply(reinterpret_cast<const char *>(hashes), sizeof(hashes));
}
@ -184,21 +179,14 @@ struct HalfMD5Impl
MD5_Update(&ctx, reinterpret_cast<const unsigned char *>(begin), size);
MD5_Final(buf.char_data, &ctx);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
return buf.uint64_data; /// No need to flip bytes on big endian machines
#else
return std::byteswap(buf.uint64_data); /// Compatibility with existing code. Cast need for old poco AND macos where UInt64 != uint64_t
#endif
/// Compatibility with existing code. Cast needed for old Poco and macOS, where UInt64 != uint64_t
transformEndianness<std::endian::big>(buf.uint64_data);
return buf.uint64_data;
}
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
h1 = std::byteswap(h1);
h2 = std::byteswap(h2);
#endif
UInt64 hashes[] = {h1, h2};
return apply(reinterpret_cast<const char *>(hashes), 16);
return combineHashesFunc<UInt64, HalfMD5Impl>(h1, h2);
}
/// If true, it will use intHash32 or intHash64 to hash POD types. This behaviour is intended for better performance of some functions.
@ -311,15 +299,8 @@ struct SipHash64Impl
static constexpr auto name = "sipHash64";
using ReturnType = UInt64;
static UInt64 apply(const char * begin, size_t size)
{
return sipHash64(begin, size);
}
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
{
return combineHashesFunc<UInt64, SipHash64Impl>(h1, h2);
}
static UInt64 apply(const char * begin, size_t size) { return sipHash64(begin, size); }
static UInt64 combineHashes(UInt64 h1, UInt64 h2) { return combineHashesFunc<UInt64, SipHash64Impl>(h1, h2); }
static constexpr bool use_int_hash_for_pods = false;
};
@ -336,12 +317,10 @@ struct SipHash64KeyedImpl
static UInt64 combineHashesKeyed(const Key & key, UInt64 h1, UInt64 h2)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
h1 = std::byteswap(h1);
h2 = std::byteswap(h2);
#endif
UInt64 hashes[] = {h1, h2};
return applyKeyed(key, reinterpret_cast<const char *>(hashes), 2 * sizeof(UInt64));
transformEndianness<std::endian::little>(h1);
transformEndianness<std::endian::little>(h2);
const UInt64 hashes[]{h1, h2};
return applyKeyed(key, reinterpret_cast<const char *>(hashes), sizeof(hashes));
}
static constexpr bool use_int_hash_for_pods = false;
@ -353,15 +332,8 @@ struct SipHash128Impl
using ReturnType = UInt128;
static UInt128 combineHashes(UInt128 h1, UInt128 h2)
{
return combineHashesFunc<UInt128, SipHash128Impl>(h1, h2);
}
static UInt128 apply(const char * data, const size_t size)
{
return sipHash128(data, size);
}
static UInt128 combineHashes(UInt128 h1, UInt128 h2) { return combineHashesFunc<UInt128, SipHash128Impl>(h1, h2); }
static UInt128 apply(const char * data, const size_t size) { return sipHash128(data, size); }
static constexpr bool use_int_hash_for_pods = false;
};
@ -378,15 +350,10 @@ struct SipHash128KeyedImpl
static UInt128 combineHashesKeyed(const Key & key, UInt128 h1, UInt128 h2)
{
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
UInt128 tmp;
reverseMemcpy(&tmp, &h1, sizeof(UInt128));
h1 = tmp;
reverseMemcpy(&tmp, &h2, sizeof(UInt128));
h2 = tmp;
#endif
UInt128 hashes[] = {h1, h2};
return applyKeyed(key, reinterpret_cast<const char *>(hashes), 2 * sizeof(UInt128));
transformEndianness<std::endian::little>(h1);
transformEndianness<std::endian::little>(h2);
const UInt128 hashes[]{h1, h2};
return applyKeyed(key, reinterpret_cast<const char *>(hashes), sizeof(hashes));
}
static constexpr bool use_int_hash_for_pods = false;
@ -531,10 +498,7 @@ struct MurmurHash3Impl64
return h[0] ^ h[1];
}
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
{
return IntHash64Impl::apply(h1) ^ h2;
}
static UInt64 combineHashes(UInt64 h1, UInt64 h2) { return IntHash64Impl::apply(h1) ^ h2; }
static constexpr bool use_int_hash_for_pods = false;
};
@ -552,10 +516,7 @@ struct MurmurHash3Impl128
return *reinterpret_cast<UInt128 *>(bytes);
}
static UInt128 combineHashes(UInt128 h1, UInt128 h2)
{
return combineHashesFunc<UInt128, MurmurHash3Impl128>(h1, h2);
}
static UInt128 combineHashes(UInt128 h1, UInt128 h2) { return combineHashesFunc<UInt128, MurmurHash3Impl128>(h1, h2); }
static constexpr bool use_int_hash_for_pods = false;
};
@ -1040,11 +1001,10 @@ private:
if (const ColVecType * col_from = checkAndGetColumn<ColVecType>(column))
{
const typename ColVecType::Container & vec_from = col_from->getData();
size_t size = vec_from.size();
const size_t size = vec_from.size();
for (size_t i = 0; i < size; ++i)
{
ToType hash;
if constexpr (Impl::use_int_hash_for_pods)
{
if constexpr (std::is_same_v<ToType, UInt64>)
@ -1058,13 +1018,8 @@ private:
hash = JavaHashImpl::apply(vec_from[i]);
else
{
FromType value = vec_from[i];
if constexpr (std::endian::native == std::endian::big)
{
FromType value_reversed;
reverseMemcpy(&value_reversed, &value, sizeof(value));
value = value_reversed;
}
auto value = vec_from[i];
transformEndianness<std::endian::little>(value);
hash = apply(key, reinterpret_cast<const char *>(&value), sizeof(value));
}
}
@ -1078,8 +1033,8 @@ private:
else if (auto col_from_const = checkAndGetColumnConst<ColVecType>(column))
{
auto value = col_from_const->template getValue<FromType>();
ToType hash;
ToType hash;
if constexpr (Impl::use_int_hash_for_pods)
{
if constexpr (std::is_same_v<ToType, UInt64>)
@ -1093,17 +1048,12 @@ private:
hash = JavaHashImpl::apply(value);
else
{
if constexpr (std::endian::native == std::endian::big)
{
FromType value_reversed;
reverseMemcpy(&value_reversed, &value, sizeof(value));
value = value_reversed;
}
transformEndianness<std::endian::little>(value);
hash = apply(key, reinterpret_cast<const char *>(&value), sizeof(value));
}
}
size_t size = vec_to.size();
const size_t size = vec_to.size();
if constexpr (first)
vec_to.assign(size, hash);
else
@ -1120,6 +1070,16 @@ private:
{
using ColVecType = ColumnVectorOrDecimal<FromType>;
// Helper applied to a value before its bytes are passed to the hash function:
// calls transformEndianness<std::endian::little> on `value` (taken by reference),
// except for IPv6 values on big-endian hosts, which are left untouched.
static const auto to_little_endian = [](auto & value)
{
// IPv6 addresses are parsed into four 32-bit components in big-endian ordering on both platforms, so no change is necessary.
// Reference: `parseIPv6orIPv4` in src/Common/formatIPv6.h.
if constexpr (std::endian::native == std::endian::big && std::is_same_v<std::remove_reference_t<decltype(value)>, IPv6>)
// Early exit: the transformEndianness call below is not executed for this case.
return;
transformEndianness<std::endian::little>(value);
};
if (const ColVecType * col_from = checkAndGetColumn<ColVecType>(column))
{
const typename ColVecType::Container & vec_from = col_from->getData();
@ -1131,9 +1091,10 @@ private:
hash = apply(key, reinterpret_cast<const char *>(&vec_from[i]), sizeof(vec_from[i]));
else
{
char tmp_buffer[sizeof(vec_from[i])];
reverseMemcpy(tmp_buffer, &vec_from[i], sizeof(vec_from[i]));
hash = apply(key, reinterpret_cast<const char *>(tmp_buffer), sizeof(vec_from[i]));
auto value = vec_from[i];
to_little_endian(value);
hash = apply(key, reinterpret_cast<const char *>(&value), sizeof(value));
}
if constexpr (first)
vec_to[i] = hash;
@ -1144,17 +1105,10 @@ private:
else if (auto col_from_const = checkAndGetColumnConst<ColVecType>(column))
{
auto value = col_from_const->template getValue<FromType>();
to_little_endian(value);
ToType hash;
if constexpr (std::endian::native == std::endian::little)
hash = apply(key, reinterpret_cast<const char *>(&value), sizeof(value));
else
{
char tmp_buffer[sizeof(value)];
reverseMemcpy(tmp_buffer, &value, sizeof(value));
hash = apply(key, reinterpret_cast<const char *>(tmp_buffer), sizeof(value));
}
size_t size = vec_to.size();
const auto hash = apply(key, reinterpret_cast<const char *>(&value), sizeof(value));
const size_t size = vec_to.size();
if constexpr (first)
vec_to.assign(size, hash);
else
@ -1423,6 +1377,9 @@ public:
if constexpr (std::is_same_v<ToType, UInt128>) /// backward-compatible
{
if (std::endian::native == std::endian::big)
std::ranges::for_each(col_to->getData(), transformEndianness<std::endian::little, ToType>);
auto col_to_fixed_string = ColumnFixedString::create(sizeof(UInt128));
const auto & data = col_to->getData();
auto & chars = col_to_fixed_string->getChars();
@ -1676,21 +1633,8 @@ struct ImplWyHash64
static constexpr auto name = "wyHash64";
using ReturnType = UInt64;
static UInt64 apply(const char * s, const size_t len)
{
return wyhash(s, len, 0, _wyp);
}
static UInt64 combineHashes(UInt64 h1, UInt64 h2)
{
union
{
UInt64 u64[2];
char chars[16];
};
u64[0] = h1;
u64[1] = h2;
return apply(chars, 16);
}
static UInt64 apply(const char * s, const size_t len) { return wyhash(s, len, 0, _wyp); }
static UInt64 combineHashes(UInt64 h1, UInt64 h2) { return combineHashesFunc<UInt64, ImplWyHash64>(h1, h2); }
static constexpr bool use_int_hash_for_pods = false;
};

View File

@ -25,5 +25,5 @@
9631199822919835226
4334672815104069193
4334672815104069193
1
1
6145F501578671E2877DBA2BE487AF7E
16FE7483905CCE7A85670E43E4678877

View File

@ -32,7 +32,5 @@ SELECT gccMurmurHash('foo');
SELECT gccMurmurHash('\x01');
SELECT gccMurmurHash(1);
-- Comparison with reverse for big endian
SELECT hex(murmurHash3_128('foo')) = hex(reverse(unhex('6145F501578671E2877DBA2BE487AF7E'))) or hex(murmurHash3_128('foo')) = '6145F501578671E2877DBA2BE487AF7E';
-- Comparison with reverse for big endian
SELECT hex(murmurHash3_128('\x01')) = hex(reverse(unhex('16FE7483905CCE7A85670E43E4678877'))) or hex(murmurHash3_128('\x01')) = '16FE7483905CCE7A85670E43E4678877';
SELECT hex(murmurHash3_128('foo'));
SELECT hex(murmurHash3_128('\x01'));

View File

@ -1,11 +1,11 @@
12940785793559895259
17926972817233444501
7456555839952096623
1
1
1
1
1
CC45107CC4B79F62D831BEF2103C7CBF
DF2EC2F0669B000EDFF6ADEE264E7D68
4CD1C30C38AB935D418B5269EF197B9E
9D78134EE48654D753CCA1B76185CF8E
389D16428D2AADEC9713905572F42864
955237314186186656
8175794665478042155
9325786087413524176
@ -18,8 +18,8 @@
8163029322371165472
8788309436660676487
236561483980029756
1
1
8DD5527CC43D76F4760D26BE0F641F7E
F8F7AD9B6CD4CF117A71E277E2EC2931
12384823029245979431
4507350192761038840
1188926775431157506

View File

@ -4,11 +4,11 @@ SELECT sipHash64(1, 2, 3);
SELECT sipHash64(1, 3, 2);
SELECT sipHash64(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2))));
SELECT hex(sipHash128('foo')) = hex(reverse(unhex('CC45107CC4B79F62D831BEF2103C7CBF'))) or hex(sipHash128('foo')) = 'CC45107CC4B79F62D831BEF2103C7CBF';
SELECT hex(sipHash128('\x01')) = hex(reverse(unhex('DF2EC2F0669B000EDFF6ADEE264E7D68'))) or hex(sipHash128('\x01')) = 'DF2EC2F0669B000EDFF6ADEE264E7D68';
SELECT hex(sipHash128('foo', 'foo')) = hex(reverse(unhex('4CD1C30C38AB935D418B5269EF197B9E'))) or hex(sipHash128('foo', 'foo')) = '4CD1C30C38AB935D418B5269EF197B9E';
SELECT hex(sipHash128('foo', 'foo', 'foo')) = hex(reverse(unhex('9D78134EE48654D753CCA1B76185CF8E'))) or hex(sipHash128('foo', 'foo', 'foo')) = '9D78134EE48654D753CCA1B76185CF8E';
SELECT hex(sipHash128(1, 2, 3)) = hex(reverse(unhex('389D16428D2AADEC9713905572F42864'))) or hex(sipHash128(1, 2, 3)) = '389D16428D2AADEC9713905572F42864';
SELECT hex(sipHash128('foo'));
SELECT hex(sipHash128('\x01'));
SELECT hex(sipHash128('foo', 'foo'));
SELECT hex(sipHash128('foo', 'foo', 'foo'));
SELECT hex(sipHash128(1, 2, 3));
SELECT halfMD5(1, 2, 3);
SELECT halfMD5(1, 3, 2);
@ -26,8 +26,8 @@ SELECT murmurHash3_64(1, 2, 3);
SELECT murmurHash3_64(1, 3, 2);
SELECT murmurHash3_64(('a', [1, 2, 3], 4, (4, ['foo', 'bar'], 1, (1, 2))));
SELECT hex(murmurHash3_128('foo', 'foo')) = hex(reverse(unhex('8DD5527CC43D76F4760D26BE0F641F7E'))) or hex(murmurHash3_128('foo', 'foo')) = '8DD5527CC43D76F4760D26BE0F641F7E';
SELECT hex(murmurHash3_128('foo', 'foo', 'foo')) = hex(reverse(unhex('F8F7AD9B6CD4CF117A71E277E2EC2931'))) or hex(murmurHash3_128('foo', 'foo', 'foo')) = 'F8F7AD9B6CD4CF117A71E277E2EC2931';
SELECT hex(murmurHash3_128('foo', 'foo'));
SELECT hex(murmurHash3_128('foo', 'foo', 'foo'));
SELECT gccMurmurHash(1, 2, 3);
SELECT gccMurmurHash(1, 3, 2);

View File

@ -191,9 +191,9 @@ E51B38608EF25F57
1
1
E28DBDE7FE22E41C
1
1CE422FEE7BD8DE20000000000000000
E28DBDE7FE22E41C
1
1CE422FEE7BD8DE20000000000000000
Check bug with hashing of const integer values
11862823756610506724
11862823756610506724

View File

@ -269,9 +269,9 @@ select sipHash64Keyed(toUInt64(0), '1'); -- { serverError 48 }
select sipHash128Keyed(toUInt64(0), '1'); -- { serverError 48 }
select hex(sipHash64());
SELECT hex(sipHash128()) = hex(reverse(unhex('1CE422FEE7BD8DE20000000000000000'))) or hex(sipHash128()) = '1CE422FEE7BD8DE20000000000000000';
SELECT hex(sipHash128());
select hex(sipHash64Keyed());
SELECT hex(sipHash128Keyed()) = hex(reverse(unhex('1CE422FEE7BD8DE20000000000000000'))) or hex(sipHash128Keyed()) = '1CE422FEE7BD8DE20000000000000000';
SELECT hex(sipHash128Keyed());
SELECT 'Check bug with hashing of const integer values';
DROP TABLE IF EXISTS tab;

View File

@ -1 +1 @@
1
1CE422FEE7BD8DE20000000000000000

View File

@ -1 +1 @@
SELECT hex(sipHash128Reference()) = hex(reverse(unhex('1CE422FEE7BD8DE20000000000000000'))) or hex(sipHash128()) = '1CE422FEE7BD8DE20000000000000000';
SELECT hex(sipHash128Reference());