mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-18 04:12:19 +00:00
Added Codec-file and shit-coded CompressionMethodByte
This commit is contained in:
parent
7886e06217
commit
181e214a8a
251
src/Compression/CompressionCodecGCD.cpp
Normal file
251
src/Compression/CompressionCodecGCD.cpp
Normal file
@ -0,0 +1,251 @@
|
|||||||
|
#include <Compression/ICompressionCodec.h>
|
||||||
|
#include <Compression/CompressionInfo.h>
|
||||||
|
#include <Compression/CompressionFactory.h>
|
||||||
|
#include <base/unaligned.h>
|
||||||
|
#include <Parsers/IAST.h>
|
||||||
|
#include <Parsers/ASTLiteral.h>
|
||||||
|
#include <Parsers/ASTFunction.h>
|
||||||
|
#include <IO/WriteHelpers.h>
|
||||||
|
#include "Common/Exception.h"
|
||||||
|
#include "base/types.h"
|
||||||
|
#include "config.h"
|
||||||
|
|
||||||
|
#include <numeric>
|
||||||
|
|
||||||
|
namespace DB
|
||||||
|
{
|
||||||
|
|
||||||
|
class CompressionCodecGCD : public ICompressionCodec
|
||||||
|
{
|
||||||
|
public:
|
||||||
|
explicit CompressionCodecGCD(UInt8 gcd_bytes_size_);
|
||||||
|
|
||||||
|
uint8_t getMethodByte() const override;
|
||||||
|
|
||||||
|
void updateHash(SipHash & hash) const override;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;
|
||||||
|
void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override;
|
||||||
|
UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override { return uncompressed_size + 2; }
|
||||||
|
|
||||||
|
bool isCompression() const override { return false; }
|
||||||
|
bool isGenericCompression() const override { return false; }
|
||||||
|
|
||||||
|
private:
|
||||||
|
const UInt8 gcd_bytes_size;
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
namespace ErrorCodes
|
||||||
|
{
|
||||||
|
extern const int CANNOT_COMPRESS;
|
||||||
|
extern const int CANNOT_DECOMPRESS;
|
||||||
|
extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE;
|
||||||
|
extern const int ILLEGAL_CODEC_PARAMETER;
|
||||||
|
extern const int BAD_ARGUMENTS;
|
||||||
|
}
|
||||||
|
|
||||||
|
CompressionCodecGCD::CompressionCodecGCD(UInt8 gcd_bytes_size_)
|
||||||
|
: gcd_bytes_size(gcd_bytes_size_)
|
||||||
|
{
|
||||||
|
setCodecDescription("GCD", {std::make_shared<ASTLiteral>(static_cast<UInt64>(gcd_bytes_size))});
|
||||||
|
}
|
||||||
|
|
||||||
|
uint8_t CompressionCodecGCD::getMethodByte() const
|
||||||
|
{
|
||||||
|
return static_cast<uint8_t>(CompressionMethodByte::GCD);
|
||||||
|
}
|
||||||
|
|
||||||
|
void CompressionCodecGCD::updateHash(SipHash & hash) const
|
||||||
|
{
|
||||||
|
getCodecDesc()->updateTreeHash(hash);
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void compressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||||
|
{
|
||||||
|
if (source_size % sizeof(T) != 0)
|
||||||
|
throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot GCD compress, data size {} is not aligned to {}", source_size, sizeof(T));
|
||||||
|
|
||||||
|
const char * const source_end = source + source_size;
|
||||||
|
|
||||||
|
T gcd{};
|
||||||
|
const auto * cur_source = source;
|
||||||
|
while (cur_source < source_end)
|
||||||
|
{
|
||||||
|
if (cur_source == source)
|
||||||
|
{
|
||||||
|
gcd = unalignedLoad<T>(cur_source);
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
gcd = std::gcd(gcd, unalignedLoad<T>(cur_source));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
unalignedStore<T>(dest, gcd);
|
||||||
|
dest += sizeof(T);
|
||||||
|
|
||||||
|
cur_source = source;
|
||||||
|
while (cur_source < source_end)
|
||||||
|
{
|
||||||
|
unalignedStore<T>(dest, unalignedLoad<T>(cur_source) / gcd);
|
||||||
|
cur_source += sizeof(T);
|
||||||
|
dest += sizeof(T);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename T>
|
||||||
|
void decompressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 output_size)
|
||||||
|
{
|
||||||
|
const char * const output_end = dest + output_size;
|
||||||
|
|
||||||
|
if (source_size % sizeof(T) != 0)
|
||||||
|
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot GCD decompress, data size {} is not aligned to {}", source_size, sizeof(T));
|
||||||
|
|
||||||
|
if (source_size < sizeof(T)) {
|
||||||
|
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot GCD decompress, data size {} is less than {}", source_size, sizeof(T));
|
||||||
|
}
|
||||||
|
|
||||||
|
const char * const source_end = source + source_size;
|
||||||
|
const T gcd = unalignedLoad<T>(source);
|
||||||
|
source += sizeof(T);
|
||||||
|
while (source < source_end)
|
||||||
|
{
|
||||||
|
if (dest + sizeof(T) > output_end) [[unlikely]]
|
||||||
|
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress the data");
|
||||||
|
unalignedStore<T>(dest, unalignedLoad<T>(source) * gcd);
|
||||||
|
|
||||||
|
source += sizeof(T);
|
||||||
|
dest += sizeof(T);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
UInt32 CompressionCodecGCD::doCompressData(const char * source, UInt32 source_size, char * dest) const
|
||||||
|
{
|
||||||
|
UInt8 bytes_to_skip = source_size % gcd_bytes_size;
|
||||||
|
dest[0] = gcd_bytes_size;
|
||||||
|
dest[1] = bytes_to_skip; /// unused (backward compatibility)
|
||||||
|
memcpy(&dest[2], source, bytes_to_skip);
|
||||||
|
size_t start_pos = 2 + bytes_to_skip;
|
||||||
|
switch (gcd_bytes_size)
|
||||||
|
{
|
||||||
|
case 1:
|
||||||
|
compressDataForType<UInt8>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
compressDataForType<UInt16>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
compressDataForType<UInt32>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
|
||||||
|
break;
|
||||||
|
case 8:
|
||||||
|
compressDataForType<UInt64>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return 1 + 1 + source_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
void CompressionCodecGCD::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const
|
||||||
|
{
|
||||||
|
if (source_size < 2)
|
||||||
|
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress. File has wrong header");
|
||||||
|
|
||||||
|
if (uncompressed_size == 0)
|
||||||
|
return;
|
||||||
|
|
||||||
|
UInt8 bytes_size = source[0];
|
||||||
|
|
||||||
|
if (!(bytes_size == 1 || bytes_size == 2 || bytes_size == 4 || bytes_size == 8))
|
||||||
|
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress. File has wrong header");
|
||||||
|
|
||||||
|
UInt8 bytes_to_skip = uncompressed_size % bytes_size;
|
||||||
|
UInt32 output_size = uncompressed_size - bytes_to_skip;
|
||||||
|
|
||||||
|
if (static_cast<UInt32>(2 + bytes_to_skip) > source_size)
|
||||||
|
throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress. File has wrong header");
|
||||||
|
|
||||||
|
memcpy(dest, &source[2], bytes_to_skip);
|
||||||
|
UInt32 source_size_no_header = source_size - bytes_to_skip - 2;
|
||||||
|
switch (bytes_size)
|
||||||
|
{
|
||||||
|
case 1:
|
||||||
|
decompressDataForType<UInt8>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], output_size);
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
decompressDataForType<UInt16>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], output_size);
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
decompressDataForType<UInt32>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], output_size);
|
||||||
|
break;
|
||||||
|
case 8:
|
||||||
|
decompressDataForType<UInt64>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], output_size);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
namespace
|
||||||
|
{
|
||||||
|
|
||||||
|
UInt8 getGCDBytesSize(const IDataType * column_type)
|
||||||
|
{
|
||||||
|
if (!column_type->isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion())
|
||||||
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec GCD is not applicable for {} because the data type is not of fixed size",
|
||||||
|
column_type->getName());
|
||||||
|
|
||||||
|
size_t max_size = column_type->getSizeOfValueInMemory();
|
||||||
|
if (max_size == 1 || max_size == 2 || max_size == 4 || max_size == 8)
|
||||||
|
return static_cast<UInt8>(max_size);
|
||||||
|
else
|
||||||
|
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec GCD is only applicable for data types of size 1, 2, 4, 8 bytes. Given type {}",
|
||||||
|
column_type->getName());
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
void registerCodecGCD(CompressionCodecFactory & factory)
|
||||||
|
{
|
||||||
|
UInt8 method_code = static_cast<UInt8>(CompressionMethodByte::GCD);
|
||||||
|
auto codec_builder = [&](const ASTPtr & arguments, const IDataType * column_type) -> CompressionCodecPtr
|
||||||
|
{
|
||||||
|
/// Default bytes size is 1.
|
||||||
|
UInt8 gcd_bytes_size = 1;
|
||||||
|
|
||||||
|
if (arguments && !arguments->children.empty())
|
||||||
|
{
|
||||||
|
if (arguments->children.size() > 1)
|
||||||
|
throw Exception(ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE, "GCD codec must have 1 parameter, given {}", arguments->children.size());
|
||||||
|
|
||||||
|
const auto children = arguments->children;
|
||||||
|
const auto * literal = children[0]->as<ASTLiteral>();
|
||||||
|
if (!literal || literal->value.getType() != Field::Types::Which::UInt64)
|
||||||
|
throw Exception(ErrorCodes::ILLEGAL_CODEC_PARAMETER, "GCD codec argument must be unsigned integer");
|
||||||
|
|
||||||
|
size_t user_bytes_size = literal->value.safeGet<UInt64>();
|
||||||
|
if (user_bytes_size != 1 && user_bytes_size != 2 && user_bytes_size != 4 && user_bytes_size != 8)
|
||||||
|
throw Exception(ErrorCodes::ILLEGAL_CODEC_PARAMETER, "GCD value for GCD codec can be 1, 2, 4 or 8, given {}", user_bytes_size);
|
||||||
|
gcd_bytes_size = static_cast<UInt8>(user_bytes_size);
|
||||||
|
}
|
||||||
|
else if (column_type)
|
||||||
|
{
|
||||||
|
gcd_bytes_size = getGCDBytesSize(column_type);
|
||||||
|
}
|
||||||
|
|
||||||
|
return std::make_shared<CompressionCodecGCD>(gcd_bytes_size);
|
||||||
|
};
|
||||||
|
factory.registerCompressionCodecWithType("GCD", method_code, codec_builder);
|
||||||
|
}
|
||||||
|
|
||||||
|
CompressionCodecPtr getCompressionCodecGCD(UInt8 gcd_bytes_size)
|
||||||
|
{
|
||||||
|
return std::make_shared<CompressionCodecGCD>(gcd_bytes_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
@ -47,6 +47,7 @@ enum class CompressionMethodByte : uint8_t
|
|||||||
AES_256_GCM_SIV = 0x97,
|
AES_256_GCM_SIV = 0x97,
|
||||||
FPC = 0x98,
|
FPC = 0x98,
|
||||||
DeflateQpl = 0x99,
|
DeflateQpl = 0x99,
|
||||||
|
GCD = 0x69, // TODO: Edit this shit
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user