ClickHouse/src/Compression/ICompressionCodec.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

139 lines
4.5 KiB
C++
Raw Normal View History

2018-12-28 18:15:26 +00:00
#include "ICompressionCodec.h"
2020-07-09 01:00:16 +00:00
#include <cassert>
2020-08-26 08:59:02 +00:00
#include <Parsers/ASTFunction.h>
2021-10-02 07:13:14 +00:00
#include <base/unaligned.h>
2020-07-09 01:00:16 +00:00
#include <Common/Exception.h>
#include <Parsers/queryToString.h>
2020-09-14 19:15:25 +00:00
#include <Parsers/ASTIdentifier.h>
2020-09-18 11:37:58 +00:00
#include <Compression/CompressionCodecMultiple.h>
2018-10-11 02:57:48 +00:00
2019-12-19 19:23:49 +00:00
2018-10-11 02:57:48 +00:00
namespace DB
{
/// Error codes thrown from this translation unit. They are only declared here;
/// the definitions live elsewhere in the project.
namespace ErrorCodes
{
    extern const int CANNOT_DECOMPRESS; /// Block's codec byte does not match this codec.
    extern const int CORRUPTED_DATA;    /// Header is truncated or carries a zero size.
    extern const int LOGICAL_ERROR;     /// Codec description requested before it was prepared.
}
2020-09-14 19:15:25 +00:00
void ICompressionCodec::setCodecDescription(const String & codec_name, const ASTs & arguments)
2020-08-26 08:59:02 +00:00
{
std::shared_ptr<ASTFunction> result = std::make_shared<ASTFunction>();
result->name = "CODEC";
2020-09-16 09:08:39 +00:00
2020-10-27 11:04:03 +00:00
/// Special case for codec Multiple, which doesn't have name. It's just list
2020-09-16 08:18:42 +00:00
/// of other codecs.
2020-09-14 19:15:25 +00:00
if (codec_name.empty())
{
2020-09-14 19:15:25 +00:00
ASTPtr codec_desc = std::make_shared<ASTExpressionList>();
for (const auto & argument : arguments)
codec_desc->children.push_back(argument);
result->arguments = codec_desc;
}
else
{
2020-09-14 19:15:25 +00:00
ASTPtr codec_desc;
2020-09-16 08:18:42 +00:00
if (arguments.empty()) /// Codec without arguments is just ASTIdentifier
2020-09-14 19:15:25 +00:00
codec_desc = std::make_shared<ASTIdentifier>(codec_name);
2020-09-16 08:18:42 +00:00
else /// Codec with arguments represented as ASTFunction
2020-09-14 19:15:25 +00:00
codec_desc = makeASTFunction(codec_name, arguments);
result->arguments = std::make_shared<ASTExpressionList>();
result->arguments->children.push_back(codec_desc);
}
2020-09-14 19:15:25 +00:00
result->children.push_back(result->arguments);
2020-09-14 19:15:25 +00:00
full_codec_desc = result;
}
/// Returns the full `CODEC(...)` description previously stored by setCodecDescription().
/// Throws LOGICAL_ERROR if no description has been stored yet.
ASTPtr ICompressionCodec::getFullCodecDesc() const
{
    if (full_codec_desc != nullptr)
        return full_codec_desc;

    throw Exception(ErrorCodes::LOGICAL_ERROR, "Codec description is not prepared");
}
/// Returns the codec description without the enclosing `CODEC(...)` function:
/// the single element itself when the argument list has exactly one entry,
/// otherwise the whole expression list.
/// Throws (via getFullCodecDesc) if the description has not been prepared.
ASTPtr ICompressionCodec::getCodecDesc() const
{
    ASTPtr codec_list = getFullCodecDesc()->as<ASTFunction>()->arguments;

    /// Anything other than exactly one entry is returned as the expression list.
    if (codec_list->children.size() != 1)
        return codec_list;

    /// Exactly one entry: it is a single codec, so return it directly.
    return codec_list->children[0];
}
2018-12-19 17:20:18 +00:00
/// Returns a 64-bit hash of this codec: feeds a fresh SipHash state through
/// updateHash() and returns its 64-bit digest.
UInt64 ICompressionCodec::getHash() const
{
    SipHash hash_state;
    updateHash(hash_state);
    return hash_state.get64();
}
/// Compresses `source_size` bytes from `source` into `dest`, preceded by a header.
///
/// What this function writes to the header:
///   dest[0]     - method byte of this codec;
///   dest[1..4]  - little-endian UInt32: size of compressed data including the header;
///   dest[5..8]  - little-endian UInt32: size of the original (uncompressed) data.
/// The compressed payload starts at offset getHeaderSize().
///
/// @return Total number of bytes written to `dest` (header + payload).
UInt32 ICompressionCodec::compress(const char * source, UInt32 source_size, char * dest) const
{
    assert(source != nullptr && dest != nullptr);

    dest[0] = getMethodByte();

    const UInt8 header_size = getHeaderSize();
    char * const payload = dest + header_size;

    /// The payload is produced first; the sizes in the header are filled in afterwards.
    const UInt32 payload_size = doCompressData(source, source_size, payload);
    const UInt32 total_size = payload_size + header_size;

    unalignedStoreLittleEndian<UInt32>(dest + 1, total_size);
    unalignedStoreLittleEndian<UInt32>(dest + 5, source_size);

    return total_size;
}
2022-07-06 22:34:31 +00:00
/// Decompresses one block (header + payload) located at `source` into `dest`.
///
/// @param source       Start of the compressed block, including its header.
/// @param source_size  Size of the compressed block in bytes, including the header.
/// @param dest         Output buffer for the decompressed data.
/// @return Decompressed size as recorded in the block header.
///
/// Throws CORRUPTED_DATA if `source_size` is smaller than the header size (or the
/// recorded decompressed size is zero, see readDecompressedBlockSize), and
/// CANNOT_DECOMPRESS if the block's method byte is not the one of this codec.
UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, char * dest) const
{
    assert(source != nullptr && dest != nullptr);

    const UInt8 header_size = getHeaderSize();
    if (source_size < header_size)
        throw Exception(ErrorCodes::CORRUPTED_DATA,
            "Can't decompress data: the compressed data size ({}, this should include header size) "
            "is less than the header size ({})", source_size, static_cast<size_t>(header_size));

    /// The first byte of the header identifies the codec that produced the block.
    uint8_t expected_method = getMethodByte();
    uint8_t actual_method = source[0];
    if (actual_method != expected_method)
        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Can't decompress data with codec byte {} using codec with byte {}", actual_method, expected_method);

    const UInt32 decompressed_size = readDecompressedBlockSize(source);
    doDecompressData(source + header_size, source_size - header_size, dest, decompressed_size);

    return decompressed_size;
}
2018-12-19 17:20:18 +00:00
/// Reads the compressed block size from a block header: a little-endian UInt32
/// stored at offset 1 of `source`.
/// Throws CORRUPTED_DATA if the stored value is zero.
UInt32 ICompressionCodec::readCompressedBlockSize(const char * source)
{
    const UInt32 size_in_header = unalignedLoadLittleEndian<UInt32>(source + 1);
    if (size_in_header != 0)
        return size_in_header;

    throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: header is corrupt with compressed block size 0");
}
2018-12-19 17:20:18 +00:00
/// Reads the decompressed block size from a block header: a little-endian UInt32
/// stored at offset 5 of `source`.
/// Throws CORRUPTED_DATA if the stored value is zero.
UInt32 ICompressionCodec::readDecompressedBlockSize(const char * source)
{
    const UInt32 size_in_header = unalignedLoadLittleEndian<UInt32>(source + 5);
    if (size_in_header != 0)
        return size_in_header;

    throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: header is corrupt with decompressed block size 0");
}
2020-01-03 14:39:24 +00:00
/// Returns the method byte of a compressed block, i.e. the first byte at `source`.
uint8_t ICompressionCodec::readMethod(const char * source)
{
    const char method_byte = *source;
    return static_cast<uint8_t>(method_byte);
}
2018-10-11 02:57:48 +00:00
}