Fix multiple codecs and add test

This commit is contained in:
Amos Bird 2024-10-30 09:41:18 +08:00
parent c5d6acf5e3
commit 10ee24d9a0
No known key found for this signature in database
GPG Key ID: 80D430DCBECFEDB4
5 changed files with 49 additions and 26 deletions

View File

@ -1,6 +1,3 @@
/// For magic_enum to properly get enum name of DB::CompressionMethodByte
#define MAGIC_ENUM_RANGE_MAX 256
#include <iostream>
#include <optional>
#include <boost/program_options.hpp>
@ -14,9 +11,12 @@
#include <Compression/CompressedWriteBuffer.h>
#include <Compression/CompressedReadBuffer.h>
#include <Compression/CompressedReadBufferFromFile.h>
#include <Compression/getCompressionCodecForFile.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <IO/copyData.h>
#include <Parsers/parseQuery.h>
#include <Parsers/queryToString.h>
#include <Parsers/ExpressionElementParsers.h>
#include <Compression/CompressionFactory.h>
#include <Common/TerminalSize.h>
@ -41,31 +41,19 @@ void checkAndWriteHeader(DB::ReadBuffer & in, DB::WriteBuffer & out)
{
while (!in.eof())
{
in.ignore(16); /// checksum
char header[COMPRESSED_BLOCK_HEADER_SIZE];
in.readStrict(header, COMPRESSED_BLOCK_HEADER_SIZE);
UInt32 size_compressed = unalignedLoad<UInt32>(&header[1]);
UInt32 size_compressed;
UInt32 size_decompressed;
auto codec = DB::getCompressionCodecForFile(in, size_compressed, size_decompressed, true /* skip_to_next_block */);
if (size_compressed > DBMS_MAX_COMPRESSED_SIZE)
throw DB::Exception(DB::ErrorCodes::TOO_LARGE_SIZE_COMPRESSED, "Too large size_compressed. Most likely corrupted data.");
UInt32 size_decompressed = unalignedLoad<UInt32>(&header[5]);
auto method_byte = static_cast<uint8_t>(header[0]);
auto method = magic_enum::enum_cast<DB::CompressionMethodByte>(method_byte);
if (method)
DB::writeText(magic_enum::enum_name(*method), out);
else
DB::writeText(fmt::format("UNKNOWN({})", method_byte), out);
DB::writeText(queryToString(codec->getFullCodecDesc()), out);
DB::writeChar('\t', out);
DB::writeText(size_decompressed, out);
DB::writeChar('\t', out);
DB::writeText(size_compressed, out);
DB::writeChar('\n', out);
in.ignore(size_compressed - COMPRESSED_BLOCK_HEADER_SIZE);
}
}

View File

@ -10,33 +10,50 @@
namespace DB
{
using Checksum = CityHash_v1_0_2::uint128;
CompressionCodecPtr getCompressionCodecForFile(const IDataPartStorage & data_part_storage, const String & relative_path)
CompressionCodecPtr
getCompressionCodecForFile(ReadBuffer & read_buffer, UInt32 & size_compressed, UInt32 & size_decompressed, bool skip_to_next_block)
{
auto read_buffer = data_part_storage.readFile(relative_path, {}, std::nullopt, std::nullopt);
read_buffer->ignore(sizeof(Checksum));
read_buffer.ignore(sizeof(Checksum));
UInt8 header_size = ICompressionCodec::getHeaderSize();
size_t starting_bytes = read_buffer.count();
PODArray<char> compressed_buffer;
compressed_buffer.resize(header_size);
read_buffer->readStrict(compressed_buffer.data(), header_size);
read_buffer.readStrict(compressed_buffer.data(), header_size);
uint8_t method = ICompressionCodec::readMethod(compressed_buffer.data());
size_compressed = unalignedLoad<UInt32>(&compressed_buffer[1]);
size_decompressed = unalignedLoad<UInt32>(&compressed_buffer[5]);
if (method == static_cast<uint8_t>(CompressionMethodByte::Multiple))
{
compressed_buffer.resize(1);
read_buffer->readStrict(compressed_buffer.data(), 1);
read_buffer.readStrict(compressed_buffer.data(), 1);
compressed_buffer.resize(1 + compressed_buffer[0]);
read_buffer->readStrict(compressed_buffer.data() + 1, compressed_buffer[0]);
read_buffer.readStrict(compressed_buffer.data() + 1, compressed_buffer[0]);
auto codecs_bytes = CompressionCodecMultiple::getCodecsBytesFromData(compressed_buffer.data());
Codecs codecs;
for (auto byte : codecs_bytes)
codecs.push_back(CompressionCodecFactory::instance().get(byte));
if (skip_to_next_block)
read_buffer.ignore(size_compressed - (read_buffer.count() - starting_bytes));
return std::make_shared<CompressionCodecMultiple>(codecs);
}
if (skip_to_next_block)
read_buffer.ignore(size_compressed - (read_buffer.count() - starting_bytes));
return CompressionCodecFactory::instance().get(method);
}
CompressionCodecPtr getCompressionCodecForFile(const IDataPartStorage & data_part_storage, const String & relative_path)
{
auto read_buffer = data_part_storage.readFile(relative_path, {}, std::nullopt, std::nullopt);
UInt32 size_compressed;
UInt32 size_decompressed;
return getCompressionCodecForFile(*read_buffer, size_compressed, size_decompressed, false);
}
}

View File

@ -13,4 +13,8 @@ namespace DB
/// from metadata.
CompressionCodecPtr getCompressionCodecForFile(const IDataPartStorage & data_part_storage, const String & relative_path);
/// Same as above which is used by clickhouse-compressor to print compression statistics of each data block.
CompressionCodecPtr
getCompressionCodecForFile(ReadBuffer & read_buffer, UInt32 & size_compressed, UInt32 & size_decompressed, bool skip_to_next_block);
}

View File

@ -0,0 +1 @@
CODEC(Delta(1), LZ4) 14 48

View File

@ -0,0 +1,13 @@
#!/usr/bin/env bash
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
echo "Hello, World!" > 03260_test_data
$CLICKHOUSE_COMPRESSOR --codec 'Delta' --codec 'LZ4' --input '03260_test_data' --output '03260_test_out'
$CLICKHOUSE_COMPRESSOR --stat '03260_test_out'
rm -f 03260_test_data 03260_test_out