Merge 55d66a0c50 into e0f8b8d351 (commit 5d378419ee)

src/Compression/CompressionCodecChimp.cpp (new file, 502 lines)
@ -0,0 +1,502 @@
#pragma clang diagnostic ignored "-Wreserved-identifier"

#include <Compression/ICompressionCodec.h>
#include <Compression/CompressionInfo.h>
#include <Compression/CompressionFactory.h>
#include <base/unaligned.h>
#include <Parsers/IAST_fwd.h>
#include <Parsers/ASTLiteral.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/BitHelpers.h>

#include <bitset>
#include <cstring>
#include <algorithm>
#include <type_traits>


namespace DB
{

/** Chimp column codec implementation.
 *
 * Implementation of the Chimp128 algorithm proposed in: Panagiotis Liakos, Katia Papakonstantinopoulou, Yannis Kotidis:
 * Chimp: Efficient Lossless Floating Point Compression for Time Series Databases. Proc. VLDB Endow. 15(11): 3058-3070 (2022).
 * Available at: https://dl.acm.org/doi/abs/10.14778/3551793.3551852
 */
class CompressionCodecChimp : public ICompressionCodec
{
public:
    explicit CompressionCodecChimp(UInt8 data_bytes_size_);

    uint8_t getMethodByte() const override;

    void updateHash(SipHash & hash) const override;

protected:

    UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;

    void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override;

    UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override;

    bool isCompression() const override { return true; }
    bool isGenericCompression() const override { return false; }
    bool isFloatingPointTimeSeriesCodec() const override { return true; }

private:
    const UInt8 data_bytes_size;
};

namespace LeadingZero
{
static const auto BIT_LENGTH = 3;
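/// The tables below bucket a raw leading-zero count (0..64) into one of the eight
/// representable values {0, 8, 12, 16, 18, 20, 22, 24} so that it fits into
/// BIT_LENGTH = 3 bits: `round` maps a count to its bucketed value,
/// `binaryRepresentation` maps a count to its 3-bit code, and
/// `reverseBinaryRepresentation` maps a 3-bit code back to the bucketed count.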
static const short round[65] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    8, 8, 8, 8, 12, 12, 12, 12,
    16, 16, 18, 18, 20, 20, 22, 22,
    24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24,
    24, 24, 24, 24, 24, 24, 24, 24, 24
};
static const short binaryRepresentation[65] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    1, 1, 1, 1, 2, 2, 2, 2,
    3, 3, 4, 4, 5, 5, 6, 6,
    7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7,
    7, 7, 7, 7, 7, 7, 7, 7, 7
};
static const short reverseBinaryRepresentation[8] = {0, 8, 12, 16, 18, 20, 22, 24};
}

namespace ErrorCodes
{
    extern const int CANNOT_COMPRESS;
    extern const int CANNOT_DECOMPRESS;
    extern const int BAD_ARGUMENTS;
    extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE;
    extern const int ILLEGAL_CODEC_PARAMETER;
}

namespace
{

constexpr UInt8 getBitLengthOfLength(UInt8 data_bytes_size)
{
    // A 4-byte value is 32 bits, and we need 5 bits to represent 32 values;
    // an 8-byte value is 64 bits => 6 bits.
    const UInt8 bit_lengths[] = {0, 0, 0, 0, 5, 0, 0, 0, 6};
    assert(data_bytes_size >= 1 && data_bytes_size < sizeof(bit_lengths) && bit_lengths[data_bytes_size] != 0);
    return bit_lengths[data_bytes_size];
}

UInt32 getCompressedHeaderSize(UInt8 data_bytes_size)
{
    constexpr UInt8 items_count_size = 4;
    return items_count_size + data_bytes_size;
}

UInt32 getCompressedDataSize(UInt8 data_bytes_size, UInt32 uncompressed_size)
{
    const UInt32 items_count = uncompressed_size / data_bytes_size;
    static const auto DATA_BIT_LENGTH = getBitLengthOfLength(data_bytes_size);
    static const short LOG_NO_PREVIOUS_VALUES = static_cast<short>(std::log2(data_bytes_size * 16));
    // worst case (for a 32-bit value):
    // 2 bits (flag) + 6 bits (previous values index) + 3 bits (number of leading zeroes) + 5 bits (data bit-size) + non-zero data bits.
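    // e.g. 2 + 6 + 3 + 5 + 32 = 48 bits per 32-bit item; for 64-bit items: 2 + 7 + 3 + 6 + 64 = 82 bits.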
    const UInt32 max_item_size_bits = 2 + LOG_NO_PREVIOUS_VALUES + LeadingZero::BIT_LENGTH + DATA_BIT_LENGTH + data_bytes_size * 8;
    // + 8 is to round up to the next byte.
    return (items_count * max_item_size_bits + 8) / 8;
}

struct BinaryValueInfo
{
    UInt8 leading_zero_bits;
    UInt8 data_bits;
    UInt8 trailing_zero_bits;
};

template <typename T>
BinaryValueInfo getBinaryValueInfo(const T & value)
{
    constexpr UInt8 bit_size = sizeof(T) * 8;
    const UInt8 lz = LeadingZero::round[getLeadingZeroBits(value)];
    const UInt8 tz = getTrailingZeroBits(value);
    const UInt8 data_size = value == 0 ? 0 : static_cast<UInt8>(bit_size - lz - tz);
    return {lz, data_size, tz};
}

template <typename T>
UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 dest_size)
{
    if (source_size % sizeof(T) != 0)
        throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress with Chimp codec, data size {} is not aligned to {}", source_size, sizeof(T));

    const char * const source_end = source + source_size;
    const char * const dest_start = dest;
    const char * const dest_end = dest + dest_size;

    const UInt32 items_count = source_size / sizeof(T);

    static const short NO_PREVIOUS_VALUES = sizeof(T) * 16;
    T stored_values[NO_PREVIOUS_VALUES];
    for (int i = 0; i < NO_PREVIOUS_VALUES; i++)
    {
        stored_values[i] = 0;
    }
    static const short LOG_NO_PREVIOUS_VALUES = static_cast<short>(std::log2(NO_PREVIOUS_VALUES));
    static const short THRESHOLD = 6 + LOG_NO_PREVIOUS_VALUES;
    static const int ARRAY_SIZE = static_cast<int>(std::pow(2, THRESHOLD + 1));
    int indices[ARRAY_SIZE];
    for (int i = 0; i < ARRAY_SIZE; i++)
    {
        indices[i] = 0;
    }
    static const short setLsb = ARRAY_SIZE - 1;
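    // indices[] maps the low THRESHOLD + 1 bits of a value to the position of its most recent
    // occurrence among the stored previous values; setLsb is the corresponding bit mask.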

    unalignedStoreLittleEndian<UInt32>(dest, items_count);
    dest += sizeof(items_count);

    T prev_value = 0;
    // This causes the first XORed value to be written in full.
    BinaryValueInfo prev_xored_info{0, 0, 0};

    if (source < source_end)
    {
        prev_value = unalignedLoadLittleEndian<T>(source);
        unalignedStoreLittleEndian<T>(dest, prev_value);

        source += sizeof(prev_value);
        dest += sizeof(prev_value);
        stored_values[0] = prev_value;
    }

    BitWriter writer(dest, dest_end - dest);

    static const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T));

    int total = 0;
    int previous_index = 0;
    int current_index = 0;

    while (source < source_end)
    {
        const T curr_value = unalignedLoadLittleEndian<T>(source);
        source += sizeof(curr_value);

        // find the best matching previous value
        T xored_data;
        BinaryValueInfo curr_xored_info;
        int match_key = static_cast<int>(curr_value & setLsb);
        int match_index = indices[match_key];
        if ((total - match_index) < NO_PREVIOUS_VALUES)
        {
            T tempXor = curr_value ^ stored_values[match_index % NO_PREVIOUS_VALUES];
            curr_xored_info = getBinaryValueInfo(tempXor);
            // if the match is good enough, use it
            if (curr_xored_info.trailing_zero_bits > THRESHOLD)
            {
                previous_index = match_index % NO_PREVIOUS_VALUES;
                xored_data = tempXor;
            }
            // otherwise use the immediately previous value
            else
            {
                previous_index = total % NO_PREVIOUS_VALUES;
                xored_data = curr_value ^ stored_values[previous_index];
                curr_xored_info = getBinaryValueInfo(xored_data);
            }
        }
        // if the match is outside of the range, use the immediately previous value
        else
        {
            previous_index = total % NO_PREVIOUS_VALUES;
            xored_data = curr_value ^ stored_values[previous_index];
            curr_xored_info = getBinaryValueInfo(xored_data);
        }

        // encode
        // 0b00 prefix
        if (xored_data == 0)
        {
            writer.writeBits(2, 0b00);
            writer.writeBits(LOG_NO_PREVIOUS_VALUES, previous_index);
            curr_xored_info.leading_zero_bits = 255; // max value so it can't be used
        }
        // 0b01 prefix
        else if (curr_xored_info.trailing_zero_bits > THRESHOLD)
        {
            writer.writeBits(2, 0b01);
            writer.writeBits(LOG_NO_PREVIOUS_VALUES, previous_index);
            writer.writeBits(LeadingZero::BIT_LENGTH, LeadingZero::binaryRepresentation[curr_xored_info.leading_zero_bits]);
            writer.writeBits(DATA_BIT_LENGTH, curr_xored_info.data_bits);
            writer.writeBits(curr_xored_info.data_bits, xored_data >> curr_xored_info.trailing_zero_bits);
            curr_xored_info.leading_zero_bits = 255; // max value so it can't be used
        }
        // 0b10 prefix
        else if (prev_xored_info.leading_zero_bits == curr_xored_info.leading_zero_bits)
        {
            writer.writeBits(2, 0b10);
            writer.writeBits(curr_xored_info.data_bits + curr_xored_info.trailing_zero_bits, xored_data);
        }
        // 0b11 prefix
        else
        {
            writer.writeBits(2, 0b11);
            writer.writeBits(LeadingZero::BIT_LENGTH, LeadingZero::binaryRepresentation[curr_xored_info.leading_zero_bits]);
            writer.writeBits(curr_xored_info.data_bits + curr_xored_info.trailing_zero_bits, xored_data);
        }

        // update stored previous values and indices
        prev_xored_info = curr_xored_info;
        prev_value = curr_value;
        current_index = (current_index + 1) % NO_PREVIOUS_VALUES;
        stored_values[current_index] = curr_value;
        total++;
        indices[match_key] = total;
    }
    writer.flush();

    return static_cast<UInt32>((dest - dest_start) + (writer.count() + 7) / 8);
}

template <typename T>
void decompressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 dest_size)
{
    static const short NO_PREVIOUS_VALUES = sizeof(T) * 16;
    static const short LOG_NO_PREVIOUS_VALUES = static_cast<short>(std::log2(NO_PREVIOUS_VALUES));
    int current_index = 0;
    T stored_values[NO_PREVIOUS_VALUES];
    for (int i = 0; i < NO_PREVIOUS_VALUES; i++)
    {
        stored_values[i] = 0;
    }

    const char * const source_end = source + source_size;

    if (source + sizeof(UInt32) > source_end)
        return;


    const UInt32 items_count = unalignedLoadLittleEndian<UInt32>(source);
    source += sizeof(items_count);

    T prev_value = 0;

    // decoding the first item
    if (source + sizeof(T) > source_end || items_count < 1)
        return;

    if (static_cast<UInt64>(items_count) * sizeof(T) > dest_size)
        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data: corrupted input data.");

    prev_value = unalignedLoadLittleEndian<T>(source);
    unalignedStoreLittleEndian<T>(dest, prev_value);

    source += sizeof(prev_value);
    dest += sizeof(prev_value);
    stored_values[0] = prev_value;

    BitReader reader(source, source_size - sizeof(items_count) - sizeof(prev_value));

    BinaryValueInfo prev_xored_info{0, 0, 0};

    static const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T));

    // Since the data is tightly packed (a value may occupy only a few bits) and the last byte is padded with zeroes,
    // we have to keep track of the number of items to avoid reading more than there is.
    for (UInt32 items_read = 1; items_read < items_count && !reader.eof(); ++items_read)
    {
        T curr_value = prev_value;
        BinaryValueInfo curr_xored_info = prev_xored_info;
        T xored_data = 0;
        UInt64 match_index;
        UInt8 flag = reader.readBits(2);
        switch (flag)
        {
            // 0b11 prefix
            case 3:
                curr_xored_info.leading_zero_bits = LeadingZero::reverseBinaryRepresentation[reader.readBits(LeadingZero::BIT_LENGTH)];
                curr_xored_info.data_bits = sizeof(T) * 8 - curr_xored_info.leading_zero_bits;
                xored_data = static_cast<T>(reader.readBits(curr_xored_info.data_bits));
                curr_value = prev_value ^ xored_data;
                break;
            // 0b10 prefix
            case 2:
                curr_xored_info.leading_zero_bits = prev_xored_info.leading_zero_bits;
                curr_xored_info.data_bits = sizeof(T) * 8 - curr_xored_info.leading_zero_bits;
                xored_data = static_cast<T>(reader.readBits(curr_xored_info.data_bits));
                curr_value = prev_value ^ xored_data;
                break;
            // 0b01 prefix
            case 1:
                match_index = reader.readBits(LOG_NO_PREVIOUS_VALUES);
                prev_value = stored_values[match_index];
                curr_xored_info.leading_zero_bits = LeadingZero::reverseBinaryRepresentation[reader.readBits(LeadingZero::BIT_LENGTH)];
                curr_xored_info.data_bits = reader.readBits(DATA_BIT_LENGTH);
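                // A stored bit length of 0 is interpreted as a full-width value: sizeof(T) * 8 cannot be represented in DATA_BIT_LENGTH bits.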
                if (curr_xored_info.data_bits == 0)
                {
                    curr_xored_info.data_bits = sizeof(T) * 8;
                }
                curr_xored_info.trailing_zero_bits = sizeof(T) * 8 - curr_xored_info.leading_zero_bits - curr_xored_info.data_bits;
                xored_data = static_cast<T>(reader.readBits(curr_xored_info.data_bits));
                xored_data <<= curr_xored_info.trailing_zero_bits;
                curr_value = prev_value ^ xored_data;
                break;
            // 0b00 prefix
            case 0:
                match_index = reader.readBits(LOG_NO_PREVIOUS_VALUES);
                prev_value = stored_values[match_index];
                curr_value = prev_value;
                break;
        }
        unalignedStoreLittleEndian<T>(dest, curr_value);
        dest += sizeof(curr_value);

        current_index = (current_index + 1) % NO_PREVIOUS_VALUES;
        stored_values[current_index] = curr_value;
        prev_xored_info = curr_xored_info;
        prev_value = curr_value;
    }
}

UInt8 getDataBytesSize(const IDataType * column_type)
{
    if (!column_type->isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion())
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec Chimp is not applicable for {} because the data type is not of fixed size",
            column_type->getName());

    size_t max_size = column_type->getSizeOfValueInMemory();
    if (max_size == 1 || max_size == 2 || max_size == 4 || max_size == 8)
        return static_cast<UInt8>(max_size);
    else
        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec Chimp is only applicable for data types of size 1, 2, 4, 8 bytes. Given type {}",
            column_type->getName());
}

}


CompressionCodecChimp::CompressionCodecChimp(UInt8 data_bytes_size_)
    : data_bytes_size(data_bytes_size_)
{
    setCodecDescription("Chimp");
}

uint8_t CompressionCodecChimp::getMethodByte() const
{
    return static_cast<uint8_t>(CompressionMethodByte::Chimp);
}

void CompressionCodecChimp::updateHash(SipHash & hash) const
{
    getCodecDesc()->updateTreeHash(hash, /*ignore_aliases=*/ true);
    hash.update(data_bytes_size);
}

UInt32 CompressionCodecChimp::getMaxCompressedDataSize(UInt32 uncompressed_size) const
{
    const auto result = 2 // common header
        + data_bytes_size // max bytes skipped if source is not properly aligned.
        + getCompressedHeaderSize(data_bytes_size) // data-specific header
        + getCompressedDataSize(data_bytes_size, uncompressed_size);
    return result;
}

UInt32 CompressionCodecChimp::doCompressData(const char * source, UInt32 source_size, char * dest) const
{
    UInt8 bytes_to_skip = source_size % data_bytes_size;
    dest[0] = data_bytes_size;
    dest[1] = bytes_to_skip; /// unused (backward compatibility)
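    // Bytes at the start of the source that do not form a whole value are stored verbatim right after the two header bytes.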
    memcpy(&dest[2], source, bytes_to_skip);
    size_t start_pos = 2 + bytes_to_skip;
    UInt32 result_size = 0;

    const UInt32 compressed_size = getMaxCompressedDataSize(source_size);
    switch (data_bytes_size) // NOLINT(bugprone-switch-missing-default-case)
    {
        case 4:
            result_size = compressDataForType<UInt32>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos], compressed_size);
            break;
        case 8:
            result_size = compressDataForType<UInt64>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos], compressed_size);
            break;
    }
    return 2 + bytes_to_skip + result_size;
}

void CompressionCodecChimp::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const
{
    if (source_size < 2)
        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data. File has wrong header");

    UInt8 bytes_size = source[0];

    if (bytes_size == 0)
        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data. File has wrong header");

    UInt8 bytes_to_skip = uncompressed_size % bytes_size;

    if (static_cast<UInt32>(2 + bytes_to_skip) > source_size)
        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data. File has wrong header");

    if (bytes_to_skip >= uncompressed_size)
        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data. File has wrong header");

    memcpy(dest, &source[2], bytes_to_skip);
    UInt32 source_size_no_header = source_size - bytes_to_skip - 2;
    UInt32 uncompressed_size_left = uncompressed_size - bytes_to_skip;
    switch (bytes_size) // NOLINT(bugprone-switch-missing-default-case)
    {
        case 4:
            decompressDataForType<UInt32>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], uncompressed_size_left);
            break;
        case 8:
            decompressDataForType<UInt64>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], uncompressed_size_left);
            break;
        default:
            throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data. File has wrong header");
    }
}

void registerCodecChimp(CompressionCodecFactory & factory)
{
    UInt8 method_code = static_cast<UInt8>(CompressionMethodByte::Chimp);
    auto codec_builder = [&](const ASTPtr & arguments, const IDataType * column_type) -> CompressionCodecPtr
    {
        /// Default bytes size is 1
        UInt8 data_bytes_size = 1;
        if (arguments && !arguments->children.empty())
        {
            if (arguments->children.size() > 1)
                throw Exception(ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE, "Chimp codec must have 1 parameter, given {}", arguments->children.size());

            const auto children = arguments->children;
            const auto * literal = children[0]->as<ASTLiteral>();
            if (!literal || literal->value.getType() != Field::Types::Which::UInt64)
                throw Exception(ErrorCodes::ILLEGAL_CODEC_PARAMETER, "Chimp codec argument must be unsigned integer");

            size_t user_bytes_size = literal->value.safeGet<UInt64>();
            if (user_bytes_size != 4 && user_bytes_size != 8)
                throw Exception(ErrorCodes::ILLEGAL_CODEC_PARAMETER, "Argument value for Chimp codec can be 4 or 8, given {}", user_bytes_size);
            data_bytes_size = static_cast<UInt8>(user_bytes_size);
        }
        else if (column_type)
        {
            data_bytes_size = getDataBytesSize(column_type);
        }

        return std::make_shared<CompressionCodecChimp>(data_bytes_size);
    };
    factory.registerCompressionCodecWithType("Chimp", method_code, codec_builder);
}

}
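
For illustration, here is a minimal standalone sketch of how a single 64-bit value is classified into one of the four Chimp prefixes. It mirrors the branches of compressDataForType above, but it is a deliberate simplification and not part of the codec: it assumes a C++20 compiler (for <bit>), compares against one given previous value instead of the ring buffer of candidates, and skips the leading-zero rounding done by the LeadingZero tables.

#include <bit>
#include <cstdint>
#include <cstdio>

// 64-bit case: THRESHOLD = 6 + log2(128) = 13 trailing zero bits.
static const int THRESHOLD = 13;

// prev_leading_zero_bits is the leading-zero count of the previously encoded XOR.
const char * chimpPrefix(uint64_t prev, uint64_t curr, int prev_leading_zero_bits)
{
    const uint64_t xored = prev ^ curr;
    if (xored == 0)
        return "0b00: same value, store only the index of the matching previous value";
    if (std::countr_zero(xored) > THRESHOLD)
        return "0b01: many trailing zeros, store index + leading zeros + length + significant bits";
    if (std::countl_zero(xored) == prev_leading_zero_bits)
        return "0b10: same leading-zero count as the previous XOR, store the XOR without a length field";
    return "0b11: store leading zeros + the XOR";
}

int main()
{
    const uint64_t a = std::bit_cast<uint64_t>(2.718281828459045);
    const uint64_t b = std::bit_cast<uint64_t>(2.7182818284590455);
    std::printf("%s\n", chimpPrefix(a, a, 0)); // identical value -> 0b00
    std::printf("%s\n", chimpPrefix(a, b, 0)); // nearby value -> one of 0b01 / 0b10 / 0b11
    return 0;
}

Which of the non-zero branches fires for the second call depends only on the trailing- and leading-zero counts of the XOR of the two bit patterns, exactly as in the encoder above.
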
@ -186,6 +186,7 @@ void registerCodecGorilla(CompressionCodecFactory & factory);
void registerCodecEncrypted(CompressionCodecFactory & factory);
void registerCodecFPC(CompressionCodecFactory & factory);
void registerCodecGCD(CompressionCodecFactory & factory);
void registerCodecChimp(CompressionCodecFactory & factory);

CompressionCodecFactory::CompressionCodecFactory()
{
@ -203,6 +204,7 @@ CompressionCodecFactory::CompressionCodecFactory()
    registerCodecGorilla(*this);
    registerCodecEncrypted(*this);
    registerCodecFPC(*this);
    registerCodecChimp(*this);
    registerCodecGCD(*this);

    default_codec = get("LZ4", {});
@ -48,6 +48,7 @@ enum class CompressionMethodByte : uint8_t
    FPC = 0x98,
    GCD = 0x9a,
    ZSTD_QPL = 0x9b,
    Chimp = 0x9c,
};

}
@ -520,12 +520,13 @@ public:

TEST_P(CodecTest, TranscodingWithDataType)
{
    /// Gorilla can only be applied to floating point columns
    /// Gorilla and Chimp can only be applied to floating point columns
    bool codec_is_gorilla = std::get<0>(GetParam()).codec_statement.find("Gorilla") != std::string::npos;
    bool codec_is_chimp = std::get<0>(GetParam()).codec_statement.find("Chimp") != std::string::npos;
    WhichDataType which(std::get<1>(GetParam()).data_type.get());
    bool data_is_float = which.isFloat();
    if (codec_is_gorilla && !data_is_float)
        GTEST_SKIP() << "Skipping Gorilla-compressed non-float column";
    if ((codec_is_gorilla || codec_is_chimp) && !data_is_float)
        GTEST_SKIP() << "Skipping Gorilla/Chimp-compressed non-float column";

    const auto codec = makeCodec(CODEC_WITH_DATA_TYPE);
    testTranscoding(*codec);
@ -808,7 +809,10 @@ const auto DefaultCodecsToTest = ::testing::Values(
    Codec("DoubleDelta, ZSTD"),
    Codec("Gorilla"),
    Codec("Gorilla, LZ4"),
    Codec("Gorilla, ZSTD")
    Codec("Gorilla, ZSTD"),
    Codec("Chimp"),
    Codec("Chimp, LZ4"),
    Codec("Chimp, ZSTD")
);

///////////////////////////////////////////////////////////////////////////////////////////////////
@ -13,6 +13,7 @@
            <value>DoubleDelta</value>
            <value>Gorilla</value>
            <value>FPC</value>
            <value>Chimp</value>
        </values>
    </substitution>
    <substitution>

@ -13,6 +13,7 @@
            <value>DoubleDelta</value>
            <value>Gorilla</value>
            <value>FPC</value>
            <value>Chimp</value>
        </values>
    </substitution>
    <substitution>
@ -0,0 +1,2 @@
F64
F32

tests/queries/0_stateless/00950_test_chimp_codec.sql (new file, 63 lines)
@ -0,0 +1,63 @@
DROP TABLE IF EXISTS codecTest;

SET cross_to_inner_join_rewrite = 1;

CREATE TABLE codecTest (
    key UInt64,
    name String,
    ref_valueF64 Float64,
    ref_valueF32 Float32,
    valueF64 Float64 CODEC(Chimp),
    valueF32 Float32 CODEC(Chimp)
) Engine = MergeTree ORDER BY key;

-- best case - same value
INSERT INTO codecTest (key, name, ref_valueF64, valueF64, ref_valueF32, valueF32)
    SELECT number AS n, 'e()', e() AS v, v, v, v FROM system.numbers LIMIT 1, 100;

-- good case - values that grow insignificantly
INSERT INTO codecTest (key, name, ref_valueF64, valueF64, ref_valueF32, valueF32)
    SELECT number AS n, 'log2(n)', log2(n) AS v, v, v, v FROM system.numbers LIMIT 101, 100;

-- bad case - values differ significantly
INSERT INTO codecTest (key, name, ref_valueF64, valueF64, ref_valueF32, valueF32)
    SELECT number AS n, 'n*sqrt(n)', n*sqrt(n) AS v, v, v, v FROM system.numbers LIMIT 201, 100;

-- worst case - almost random values
INSERT INTO codecTest (key, name, ref_valueF64, valueF64, ref_valueF32, valueF32)
    SELECT number AS n, 'sin(n*n*n)*n', sin(n * n * n * n * n) AS v, v, v, v FROM system.numbers LIMIT 301, 100;


-- These floating-point values are expected to be BINARY equal, so comparing by value is OK here.

-- referencing the previous row's key, value, and case name to simplify debugging.
SELECT 'F64';
SELECT
    c1.key, c1.name,
    c1.ref_valueF64, c1.valueF64, c1.ref_valueF64 - c1.valueF64 AS dF64,
    'prev:',
    c2.key, c2.ref_valueF64
FROM
    codecTest as c1, codecTest as c2
WHERE
    dF64 != 0
AND
    c2.key = c1.key - 1
LIMIT 10;


SELECT 'F32';
SELECT
    c1.key, c1.name,
    c1.ref_valueF32, c1.valueF32, c1.ref_valueF32 - c1.valueF32 AS dF32,
    'prev:',
    c2.key, c2.ref_valueF32
FROM
    codecTest as c1, codecTest as c2
WHERE
    dF32 != 0
AND
    c2.key = c1.key - 1
LIMIT 10;

DROP TABLE IF EXISTS codecTest;
@ -8,24 +8,33 @@ CREATE TABLE tbl (
    -- Nullable
    v1_gor Nullable(Float64) CODEC(Gorilla),
    v1_fpc Nullable(Float64) CODEC(FPC),
    v1_chi Nullable(Float64) CODEC(Chimp),
    -- Array
    v2_gor Array(Float64) CODEC(Gorilla),
    v2_fpc Array(Float64) CODEC(FPC),
    v2_chi Array(Float64) CODEC(Chimp),
    v3_gor Array(Array(Float64)) CODEC(Gorilla),
    v3_fpc Array(Array(Float64)) CODEC(FPC),
    v3_chi Array(Array(Float64)) CODEC(Chimp),
    v4_gor Array(Nullable(Float64)) CODEC(Gorilla),
    v4_fpc Array(Nullable(Float64)) CODEC(FPC),
    v4_chi Array(Nullable(Float64)) CODEC(Chimp),
    v5_gor Array(Tuple(Float64)) CODEC(Gorilla),
    v5_fpc Array(Tuple(Float64)) CODEC(FPC),
    v5_chi Array(Tuple(Float64)) CODEC(Chimp),
    -- Tuple
    v6_gor Tuple(Float64) CODEC(Gorilla),
    v6_fpc Tuple(Float64) CODEC(FPC),
    v6_chi Tuple(Float64) CODEC(Chimp),
    v7_gor Tuple(Tuple(Float64)) CODEC(Gorilla),
    v7_fpc Tuple(Tuple(Float64)) CODEC(FPC),
    v7_chi Tuple(Tuple(Float64)) CODEC(Chimp),
    v8_gor Tuple(Nullable(Float64)) CODEC(Gorilla),
    v8_fpc Tuple(Nullable(Float64)) CODEC(FPC),
    v8_chi Tuple(Nullable(Float64)) CODEC(Chimp),
    v9_gor Tuple(Array(Float64)) CODEC(Gorilla),
    v9_fpc Tuple(Array(Float64)) CODEC(FPC),
    v9_chi Tuple(Array(Float64)) CODEC(Chimp),
) Engine = MergeTree ORDER BY tuple();

DROP TABLE IF EXISTS tbl;
@ -7,3 +7,6 @@
1
1
1
1
1
1
@ -4,31 +4,39 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

echo "Hello, World!" > 02584_test_data
DATA_FILE=$(mktemp -q 02584_test_data_XXXXXX)
OUT_FILE=$(mktemp -q 02584_test_out_XXXXXX)

$CLICKHOUSE_COMPRESSOR --codec 'Delta' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'
$CLICKHOUSE_COMPRESSOR --codec 'Delta(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'Delta([1,2])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'Delta(4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out';
echo "Hello, World!" > $DATA_FILE;

$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'
$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta([1,2])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta(4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out';
$CLICKHOUSE_COMPRESSOR --codec 'Delta' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE;
$CLICKHOUSE_COMPRESSOR --codec 'Delta(5)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'Delta([1,2])' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'Delta(4)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE;

$CLICKHOUSE_COMPRESSOR --codec 'Gorilla' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'
$CLICKHOUSE_COMPRESSOR --codec 'Gorilla(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'Gorilla([1,2])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'Gorilla(4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out';
$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE;
$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta(5)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta([1,2])' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta(4)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE;

$CLICKHOUSE_COMPRESSOR --codec 'FPC' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out';
$CLICKHOUSE_COMPRESSOR --codec 'FPC(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out';
$CLICKHOUSE_COMPRESSOR --codec 'FPC(5, 1)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'FPC([1,2,3])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'FPC(5, 4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out';
$CLICKHOUSE_COMPRESSOR --codec 'Gorilla' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE;
$CLICKHOUSE_COMPRESSOR --codec 'Gorilla(5)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'Gorilla([1,2])' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'Gorilla(4)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE;

$CLICKHOUSE_COMPRESSOR --codec 'Chimp(1)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'Chimp(5)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'Chimp([1,2])' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'Chimp(4)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE;

$CLICKHOUSE_COMPRESSOR --codec 'FPC' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE;
$CLICKHOUSE_COMPRESSOR --codec 'FPC(5)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE;
$CLICKHOUSE_COMPRESSOR --codec 'FPC(5, 1)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'FPC([1,2,3])' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER";
$CLICKHOUSE_COMPRESSOR --codec 'FPC(5, 4)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE;


$CLICKHOUSE_COMPRESSOR --codec 'T64' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "CANNOT_COMPRESS";
$CLICKHOUSE_COMPRESSOR --codec 'T64' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "CANNOT_COMPRESS";

rm 02584_test_data 02584_test_out
rm $DATA_FILE $OUT_FILE