Merge pull request #52329 from ClibMouse/feature/mergetree-checksum-on-big-endian

Implement endianness-indepedent support for MergeTree checksums
This commit is contained in:
Yakov Olkhovskiy 2023-08-13 23:59:05 -04:00 committed by GitHub
commit 300399d1f5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 66 additions and 39 deletions

View File

@ -3,23 +3,25 @@
#include <base/Decimal_fwd.h>
#include <base/extended_types.h>
#include <city.h>
#include <utility>
namespace DB
{
template <std::endian endian, typename T>
template <std::endian ToEndian, std::endian FromEndian = std::endian::native, typename T>
requires std::is_integral_v<T>
inline void transformEndianness(T & value)
{
if constexpr (endian != std::endian::native)
if constexpr (ToEndian != FromEndian)
value = std::byteswap(value);
}
template <std::endian endian, typename T>
template <std::endian ToEndian, std::endian FromEndian = std::endian::native, typename T>
requires is_big_int_v<T>
inline void transformEndianness(T & x)
{
if constexpr (std::endian::native != endian)
if constexpr (ToEndian != FromEndian)
{
auto & items = x.items;
std::transform(std::begin(items), std::end(items), std::begin(items), [](auto & item) { return std::byteswap(item); });
@ -27,42 +29,49 @@ inline void transformEndianness(T & x)
}
}
template <std::endian endian, typename T>
template <std::endian ToEndian, std::endian FromEndian = std::endian::native, typename T>
requires is_decimal<T>
inline void transformEndianness(T & x)
{
transformEndianness<endian>(x.value);
transformEndianness<ToEndian, FromEndian>(x.value);
}
template <std::endian endian, typename T>
template <std::endian ToEndian, std::endian FromEndian = std::endian::native, typename T>
requires std::is_floating_point_v<T>
inline void transformEndianness(T & value)
{
if constexpr (std::endian::native != endian)
if constexpr (ToEndian != FromEndian)
{
auto * start = reinterpret_cast<std::byte *>(&value);
std::reverse(start, start + sizeof(T));
}
}
template <std::endian endian, typename T>
template <std::endian ToEndian, std::endian FromEndian = std::endian::native, typename T>
requires std::is_scoped_enum_v<T>
inline void transformEndianness(T & x)
{
using UnderlyingType = std::underlying_type_t<T>;
transformEndianness<endian>(reinterpret_cast<UnderlyingType &>(x));
transformEndianness<ToEndian, FromEndian>(reinterpret_cast<UnderlyingType &>(x));
}
template <std::endian endian, typename A, typename B>
template <std::endian ToEndian, std::endian FromEndian = std::endian::native, typename A, typename B>
inline void transformEndianness(std::pair<A, B> & pair)
{
transformEndianness<endian>(pair.first);
transformEndianness<endian>(pair.second);
transformEndianness<ToEndian, FromEndian>(pair.first);
transformEndianness<ToEndian, FromEndian>(pair.second);
}
template <std::endian endian, typename T, typename Tag>
template <std::endian ToEndian, std::endian FromEndian = std::endian::native, typename T, typename Tag>
inline void transformEndianness(StrongTypedef<T, Tag> & x)
{
transformEndianness<endian>(x.toUnderType());
transformEndianness<ToEndian, FromEndian>(x.toUnderType());
}
template <std::endian ToEndian, std::endian FromEndian = std::endian::native>
inline void transformEndianness(CityHash_v1_0_2::uint128 & x)
{
transformEndianness<ToEndian, FromEndian>(x.low64);
transformEndianness<ToEndian, FromEndian>(x.high64);
}
}

View File

@ -10,6 +10,8 @@
#include <Formats/ProtobufReader.h>
#include <Core/Field.h>
#include <ranges>
namespace DB
{
@ -135,13 +137,25 @@ template <typename T>
void SerializationNumber<T>::serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, size_t offset, size_t limit) const
{
const typename ColumnVector<T>::Container & x = typeid_cast<const ColumnVector<T> &>(column).getData();
size_t size = x.size();
if (limit == 0 || offset + limit > size)
if (const size_t size = x.size(); limit == 0 || offset + limit > size)
limit = size - offset;
if (limit)
if (limit == 0)
return;
if constexpr (std::endian::native == std::endian::big && sizeof(T) >= 2)
{
static constexpr auto to_little_endian = [](auto i)
{
transformEndianness<std::endian::little>(i);
return i;
};
std::ranges::for_each(
x | std::views::drop(offset) | std::views::take(limit) | std::views::transform(to_little_endian),
[&ostr](const auto & i) { ostr.write(reinterpret_cast<const char *>(&i), sizeof(typename ColumnVector<T>::ValueType)); });
}
else
ostr.write(reinterpret_cast<const char *>(&x[offset]), sizeof(typename ColumnVector<T>::ValueType) * limit);
}
@ -149,10 +163,13 @@ template <typename T>
void SerializationNumber<T>::deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double /*avg_value_size_hint*/) const
{
typename ColumnVector<T>::Container & x = typeid_cast<ColumnVector<T> &>(column).getData();
size_t initial_size = x.size();
const size_t initial_size = x.size();
x.resize(initial_size + limit);
size_t size = istr.readBig(reinterpret_cast<char*>(&x[initial_size]), sizeof(typename ColumnVector<T>::ValueType) * limit);
const size_t size = istr.readBig(reinterpret_cast<char*>(&x[initial_size]), sizeof(typename ColumnVector<T>::ValueType) * limit);
x.resize(initial_size + size / sizeof(typename ColumnVector<T>::ValueType));
if constexpr (std::endian::native == std::endian::big && sizeof(T) >= 2)
std::ranges::for_each(x | std::views::drop(initial_size), [](auto & i) { transformEndianness<std::endian::big, std::endian::little>(i); });
}
template class SerializationNumber<UInt8>;

View File

@ -1374,8 +1374,8 @@ public:
if constexpr (std::is_same_v<ToType, UInt128>) /// backward-compatible
{
if (std::endian::native == std::endian::big)
std::ranges::for_each(col_to->getData(), transformEndianness<std::endian::little, ToType>);
if constexpr (std::endian::native == std::endian::big)
std::ranges::for_each(col_to->getData(), transformEndianness<std::endian::little, std::endian::native, ToType>);
auto col_to_fixed_string = ColumnFixedString::create(sizeof(UInt128));
const auto & data = col_to->getData();

View File

@ -187,15 +187,15 @@ bool MergeTreeDataPartChecksums::readV3(ReadBuffer & in)
String name;
Checksum sum;
readBinary(name, in);
readStringBinary(name, in);
readVarUInt(sum.file_size, in);
readPODBinary(sum.file_hash, in);
readBinary(sum.is_compressed, in);
readBinaryLittleEndian(sum.file_hash, in);
readBinaryLittleEndian(sum.is_compressed, in);
if (sum.is_compressed)
{
readVarUInt(sum.uncompressed_size, in);
readPODBinary(sum.uncompressed_hash, in);
readBinaryLittleEndian(sum.uncompressed_hash, in);
}
files.emplace(std::move(name), sum);
@ -223,15 +223,15 @@ void MergeTreeDataPartChecksums::write(WriteBuffer & to) const
const String & name = it.first;
const Checksum & sum = it.second;
writeBinary(name, out);
writeStringBinary(name, out);
writeVarUInt(sum.file_size, out);
writePODBinary(sum.file_hash, out);
writeBinary(sum.is_compressed, out);
writeBinaryLittleEndian(sum.file_hash, out);
writeBinaryLittleEndian(sum.is_compressed, out);
if (sum.is_compressed)
{
writeVarUInt(sum.uncompressed_size, out);
writePODBinary(sum.uncompressed_hash, out);
writeBinaryLittleEndian(sum.uncompressed_hash, out);
}
}
}
@ -339,9 +339,9 @@ void MinimalisticDataPartChecksums::serializeWithoutHeader(WriteBuffer & to) con
writeVarUInt(num_compressed_files, to);
writeVarUInt(num_uncompressed_files, to);
writePODBinary(hash_of_all_files, to);
writePODBinary(hash_of_uncompressed_files, to);
writePODBinary(uncompressed_hash_of_compressed_files, to);
writeBinaryLittleEndian(hash_of_all_files, to);
writeBinaryLittleEndian(hash_of_uncompressed_files, to);
writeBinaryLittleEndian(uncompressed_hash_of_compressed_files, to);
}
String MinimalisticDataPartChecksums::getSerializedString() const
@ -382,9 +382,9 @@ void MinimalisticDataPartChecksums::deserializeWithoutHeader(ReadBuffer & in)
readVarUInt(num_compressed_files, in);
readVarUInt(num_uncompressed_files, in);
readPODBinary(hash_of_all_files, in);
readPODBinary(hash_of_uncompressed_files, in);
readPODBinary(uncompressed_hash_of_compressed_files, in);
readBinaryLittleEndian(hash_of_all_files, in);
readBinaryLittleEndian(hash_of_uncompressed_files, in);
readBinaryLittleEndian(uncompressed_hash_of_compressed_files, in);
}
void MinimalisticDataPartChecksums::computeTotalChecksums(const MergeTreeDataPartChecksums & full_checksums_)

View File

@ -365,8 +365,9 @@ void MergeTreeDataPartWriterCompact::addToChecksums(MergeTreeDataPartChecksums &
{
uncompressed_size += stream->hashing_buf.count();
auto stream_hash = stream->hashing_buf.getHash();
transformEndianness<std::endian::little>(stream_hash);
uncompressed_hash = CityHash_v1_0_2::CityHash128WithSeed(
reinterpret_cast<char *>(&stream_hash), sizeof(stream_hash), uncompressed_hash);
reinterpret_cast<const char *>(&stream_hash), sizeof(stream_hash), uncompressed_hash);
}
checksums.files[data_file_name].is_compressed = true;