diff --git a/src/Compression/CompressionCodecChimp.cpp b/src/Compression/CompressionCodecChimp.cpp
new file mode 100644
index 00000000000..70074e221fb
--- /dev/null
+++ b/src/Compression/CompressionCodecChimp.cpp
@@ -0,0 +1,557 @@
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+/** Chimp column codec implementation.
+ *
+ * Implementation of Chimp128 algorithm proposed in: Panagiotis Liakos, Katia Papakonstantinopoulou, Yannis Kotidis:
+ * Chimp: Efficient Lossless Floating Point Compression for Time Series Databases. Proc. VLDB Endow. 15(11): 3058-3070 (2022)
+ * Available in: https://dl.acm.org/doi/abs/10.14778/3551793.3551852
+ *
+ * Compressed layout produced by this codec:
+ *   byte 0         - size of a single value in bytes (4 or 8);
+ *   byte 1         - number of leading source bytes stored verbatim (source size modulo value size);
+ *   N bytes        - those leading bytes, copied as is;
+ *   4 bytes        - number of encoded values, little endian;
+ *   sizeof(value)  - the first value, little endian;
+ *   bit stream     - the remaining values, each XORed with one of the previous values (see compressDataForType).
+ */
+class CompressionCodecChimp : public ICompressionCodec
+{
+public:
+    explicit CompressionCodecChimp(UInt8 data_bytes_size_);
+
+    uint8_t getMethodByte() const override;
+
+    void updateHash(SipHash & hash) const override;
+
+protected:
+
+    UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override;
+
+    void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override;
+
+    UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override;
+
+    bool isCompression() const override { return true; }
+    bool isGenericCompression() const override { return false; }
+    bool isFloatingPointTimeSeriesCodec() const override { return true; }
+
+private:
+    /// Width of a single value in bytes. Only 4 and 8 can be compressed.
+    const UInt8 data_bytes_size;
+};
+
+/// Lookup tables for the 3-bit representation of the (rounded down) number of leading zero bits.
+namespace LeadingZero
+{
+    /// Number of bits used to store the leading zero count.
+    static const auto BIT_LENGTH = 3;
+    /// Actual leading zero count (index) -> the nearest representable count that does not exceed it.
+    static const short round[65] =
+    {
+        0, 0, 0, 0, 0, 0, 0, 0,
+        8, 8, 8, 8, 12, 12, 12, 12,
+        16, 16, 18, 18, 20, 20, 22, 22,
+        24, 24, 24, 24, 24, 24, 24, 24,
+        24, 24, 24, 24, 24, 24, 24, 24,
+        24, 24, 24, 24, 24, 24, 24, 24,
+        24, 24, 24, 24, 24, 24, 24, 24,
+        24, 24, 24, 24, 24, 24, 24, 24, 24
+    };
+    /// Actual leading zero count (index) -> 3-bit code of the rounded count.
+    static const short binaryRepresentation[65] =
+    {
+        0, 0, 0, 0, 0, 0, 0, 0,
+        1, 1, 1, 1, 2, 2, 2, 2,
+        3, 3, 4, 4, 5, 5, 6, 6,
+        7, 7, 7, 7, 7, 7, 7, 7,
+        7, 7, 7, 7, 7, 7, 7, 7,
+        7, 7, 7, 7, 7, 7, 7, 7,
+        7, 7, 7, 7, 7, 7, 7, 7,
+        7, 7, 7, 7, 7, 7, 7, 7, 7
+    };
+    /// 3-bit code (index) -> rounded leading zero count.
+    static const short reverseBinaryRepresentation[8] = {0, 8, 12, 16, 18, 20, 22, 24};
+}
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_COMPRESS;
+    extern const int CANNOT_DECOMPRESS;
+    extern const int BAD_ARGUMENTS;
+    extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE;
+    extern const int ILLEGAL_CODEC_PARAMETER;
+}
+
+namespace
+{
+
+/// Returns true for the value widths (in bytes) the codec is able to encode.
+constexpr bool isSupportedDataBytesSize(size_t data_bytes_size)
+{
+    return data_bytes_size == 4 || data_bytes_size == 8;
+}
+
+/// Integer floor(log2(value)) usable in constant expressions; value must be non-zero.
+constexpr UInt8 floorLog2(UInt32 value)
+{
+    UInt8 result = 0;
+    while (value >>= 1)
+        ++result;
+    return result;
+}
+
+/// Number of bits needed to store the count of meaningful bits of a value of the given width.
+constexpr UInt8 getBitLengthOfLength(UInt8 data_bytes_size)
+{
+    // 4-byte value is 32 bits, and we need 5 bits to represent 32 values
+    // 8-byte 64 bits => 6
+    const UInt8 bit_lengths[] = {0, 0, 0, 0, 5, 0, 0, 0, 6};
+    assert(data_bytes_size >= 1 && data_bytes_size < sizeof(bit_lengths) && bit_lengths[data_bytes_size] != 0);
+    return bit_lengths[data_bytes_size];
+}
+
+/// Size of the per-block header written by compressDataForType: the item count followed by the first value.
+UInt32 getCompressedHeaderSize(UInt8 data_bytes_size)
+{
+    constexpr UInt8 items_count_size = 4;
+    return items_count_size + data_bytes_size;
+}
+
+/// Upper bound, in bytes, of the bit stream produced for uncompressed_size bytes of values.
+UInt32 getCompressedDataSize(UInt8 data_bytes_size, UInt32 uncompressed_size)
+{
+    /// Compression is rejected for any other width (see doCompressData), so no payload space is needed.
+    if (!isSupportedDataBytesSize(data_bytes_size))
+        return 0;
+
+    /// These depend on the argument and therefore must not be cached in function-local statics:
+    /// a static would keep the values computed for whichever width happened to be requested first.
+    const UInt64 items_count = uncompressed_size / data_bytes_size;
+    const UInt32 data_bit_length = getBitLengthOfLength(data_bytes_size);
+    const UInt32 log_no_previous_values = floorLog2(data_bytes_size * 16);
+    // worst case (for 32-bit value):
+    // 2 bits (flag) + 6 bits (previous values index) + 3 bits (no of leading zeroes) + 5 bits(data bit-size) + non-zero data bits.
+    const UInt64 max_item_size_bits = 2 + log_no_previous_values + LeadingZero::BIT_LENGTH + data_bit_length + data_bytes_size * 8;
+    // + 8 is to round up to next byte. The product is computed in 64 bits to avoid intermediate overflow.
+    return static_cast<UInt32>((items_count * max_item_size_bits + 8) / 8);
+}
+
+/// Bit layout of a value: leading zeroes (rounded down to a representable count), payload bits, trailing zeroes.
+struct BinaryValueInfo
+{
+    UInt8 leading_zero_bits;
+    UInt8 data_bits;
+    UInt8 trailing_zero_bits;
+};
+
+/// Splits value into leading zero, data and trailing zero bit counts. For a zero value data_bits is 0.
+template <typename T>
+BinaryValueInfo getBinaryValueInfo(const T & value)
+{
+    constexpr UInt8 bit_size = sizeof(T) * 8;
+    const UInt8 lz = LeadingZero::round[getLeadingZeroBits(value)];
+    const UInt8 tz = getTrailingZeroBits(value);
+    const UInt8 data_size = value == 0 ? 0 : static_cast<UInt8>(bit_size - lz - tz);
+    return {lz, data_size, tz};
+}
+
+/** Encodes source_size bytes of T values from source into dest; returns the number of bytes written.
+  *
+  * Output: item count (UInt32), the first value verbatim, then for every following value a 2-bit flag and:
+  *   00 - index of an identical previous value;
+  *   01 - index of a previous value, leading zero code, data bit count, data bits (trailing zeroes are dropped);
+  *   10 - XOR with the preceding value; the leading zero count is the same as for the previous item;
+  *   11 - leading zero code, then the XOR with the preceding value.
+  * dest_size is the number of bytes available starting at dest.
+  */
+template <typename T>
+UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 dest_size)
+{
+    if (source_size % sizeof(T) != 0)
+        throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress with Chimp codec, data size {} is not aligned to {}", source_size, sizeof(T));
+
+    const char * const source_end = source + source_size;
+    const char * const dest_start = dest;
+    const char * const dest_end = dest + dest_size;
+
+    const UInt32 items_count = source_size / sizeof(T);
+
+    /// Compile-time constants, so that the arrays below have a fixed size instead of being variable-length arrays.
+    static constexpr short NO_PREVIOUS_VALUES = sizeof(T) * 16;
+    static constexpr short LOG_NO_PREVIOUS_VALUES = floorLog2(NO_PREVIOUS_VALUES);
+    static constexpr short THRESHOLD = 6 + LOG_NO_PREVIOUS_VALUES;
+    static constexpr int ARRAY_SIZE = 1 << (THRESHOLD + 1);
+    static constexpr short setLsb = ARRAY_SIZE - 1;
+
+    /// Ring buffer with the most recent values.
+    T stored_values[NO_PREVIOUS_VALUES] = {};
+    /// Low bits of a value -> sequence number of the last value that had the same low bits.
+    int indices[ARRAY_SIZE] = {};
+
+    unalignedStoreLittleEndian<UInt32>(dest, items_count);
+    dest += sizeof(items_count);
+
+    T prev_value = 0;
+    // That would cause first XORed value to be written in-full.
+    BinaryValueInfo prev_xored_info{0, 0, 0};
+
+    if (source < source_end)
+    {
+        prev_value = unalignedLoadLittleEndian<T>(source);
+        unalignedStoreLittleEndian<T>(dest, prev_value);
+
+        source += sizeof(prev_value);
+        dest += sizeof(prev_value);
+        stored_values[0] = prev_value;
+    }
+
+    BitWriter writer(dest, dest_end - dest);
+
+    static const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T));
+
+    int total = 0;
+    int previous_index = 0;
+    int current_index = 0;
+
+    while (source < source_end)
+    {
+        const T curr_value = unalignedLoadLittleEndian<T>(source);
+        source += sizeof(curr_value);
+
+        // find best matching previous value
+        T xored_data;
+        BinaryValueInfo curr_xored_info;
+        int match_key = static_cast<int>(curr_value & setLsb);
+        int match_index = indices[match_key];
+        if ((total - match_index) < NO_PREVIOUS_VALUES)
+        {
+            T tempXor = curr_value ^ stored_values[match_index % NO_PREVIOUS_VALUES];
+            curr_xored_info = getBinaryValueInfo(tempXor);
+            // if match is good enough, use it
+            if (curr_xored_info.trailing_zero_bits > THRESHOLD)
+            {
+                previous_index = match_index % NO_PREVIOUS_VALUES;
+                xored_data = tempXor;
+            }
+            // otherwise use immediately previous value
+            else
+            {
+                previous_index = total % NO_PREVIOUS_VALUES;
+                xored_data = curr_value ^ stored_values[previous_index];
+                curr_xored_info = getBinaryValueInfo(xored_data);
+            }
+        }
+        // if match is outside of range, use immediately previous value
+        else
+        {
+            previous_index = total % NO_PREVIOUS_VALUES;
+            xored_data = curr_value ^ stored_values[previous_index];
+            curr_xored_info = getBinaryValueInfo(xored_data);
+        }
+
+        // encode
+        // 0b00 prefix
+        if (xored_data == 0)
+        {
+            writer.writeBits(2, 0b00);
+            writer.writeBits(LOG_NO_PREVIOUS_VALUES, previous_index);
+            curr_xored_info.leading_zero_bits = 255; // max value so it can't be used
+        }
+        // 0b01 prefix
+        else if (curr_xored_info.trailing_zero_bits > THRESHOLD)
+        {
+            writer.writeBits(2, 0b01);
+            writer.writeBits(LOG_NO_PREVIOUS_VALUES, previous_index);
+            writer.writeBits(LeadingZero::BIT_LENGTH, LeadingZero::binaryRepresentation[curr_xored_info.leading_zero_bits]);
+            writer.writeBits(DATA_BIT_LENGTH, curr_xored_info.data_bits);
+            writer.writeBits(curr_xored_info.data_bits, xored_data >> curr_xored_info.trailing_zero_bits);
+            curr_xored_info.leading_zero_bits = 255; // max value so it can't be used
+        }
+        // 0b10 prefix
+        else if (prev_xored_info.leading_zero_bits == curr_xored_info.leading_zero_bits)
+        {
+            writer.writeBits(2, 0b10);
+            writer.writeBits(curr_xored_info.data_bits + curr_xored_info.trailing_zero_bits, xored_data);
+        }
+        // 0b11 prefix
+        else
+        {
+            writer.writeBits(2, 0b11);
+            writer.writeBits(LeadingZero::BIT_LENGTH, LeadingZero::binaryRepresentation[curr_xored_info.leading_zero_bits]);
+            writer.writeBits(curr_xored_info.data_bits + curr_xored_info.trailing_zero_bits, xored_data);
+        }
+
+        // update stored previous values and indices
+        prev_xored_info = curr_xored_info;
+        prev_value = curr_value;
+        current_index = (current_index + 1) % NO_PREVIOUS_VALUES;
+        stored_values[current_index] = curr_value;
+        total++;
+        indices[match_key] = total;
+    }
+    writer.flush();
+
+    return static_cast<UInt32>((dest - dest_start) + (writer.count() + 7) / 8);
+}
+
+/** Decodes data produced by compressDataForType<T> from source into dest.
+  * No more than dest_size bytes are written; CANNOT_DECOMPRESS is thrown for inconsistent input.
+  */
+template <typename T>
+void decompressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 dest_size)
+{
+    static constexpr short NO_PREVIOUS_VALUES = sizeof(T) * 16;
+    static constexpr short LOG_NO_PREVIOUS_VALUES = floorLog2(NO_PREVIOUS_VALUES);
+    int current_index = 0;
+    T stored_values[NO_PREVIOUS_VALUES] = {};
+
+    const char * const source_end = source + source_size;
+
+    if (source + sizeof(UInt32) > source_end)
+        return;
+
+    const UInt32 items_count = unalignedLoadLittleEndian<UInt32>(source);
+    source += sizeof(items_count);
+
+    T prev_value = 0;
+
+    // decoding first item
+    if (source + sizeof(T) > source_end || items_count < 1)
+        return;
+
+    if (static_cast<UInt64>(items_count) * sizeof(T) > dest_size)
+        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data: corrupted input data.");
+
+    prev_value = unalignedLoadLittleEndian<T>(source);
+    unalignedStoreLittleEndian<T>(dest, prev_value);
+
+    source += sizeof(prev_value);
+    dest += sizeof(prev_value);
+    stored_values[0] = prev_value;
+
+    BitReader reader(source, source_size - sizeof(items_count) - sizeof(prev_value));
+
+    BinaryValueInfo prev_xored_info{0, 0, 0};
+
+    static const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T));
+
+    // since data is tightly packed and the last byte is padded with zeroes,
+    // we have to keep track of items to avoid reading more than there is.
+    for (UInt32 items_read = 1; items_read < items_count && !reader.eof(); ++items_read)
+    {
+        T curr_value = prev_value;
+        BinaryValueInfo curr_xored_info = prev_xored_info;
+        T xored_data = 0;
+        UInt64 match_index;
+        UInt8 flag = reader.readBits(2);
+        switch (flag)
+        {
+            // 0b11 prefix
+            case 3:
+                curr_xored_info.leading_zero_bits = LeadingZero::reverseBinaryRepresentation[reader.readBits(LeadingZero::BIT_LENGTH)];
+                curr_xored_info.data_bits = sizeof(T) * 8 - curr_xored_info.leading_zero_bits;
+                xored_data = static_cast<T>(reader.readBits(curr_xored_info.data_bits));
+                curr_value = prev_value ^ xored_data;
+                break;
+            // 0b10 prefix
+            case 2:
+                curr_xored_info.leading_zero_bits = prev_xored_info.leading_zero_bits;
+                curr_xored_info.data_bits = sizeof(T) * 8 - curr_xored_info.leading_zero_bits;
+                xored_data = static_cast<T>(reader.readBits(curr_xored_info.data_bits));
+                curr_value = prev_value ^ xored_data;
+                break;
+            // 0b01 prefix
+            case 1:
+                match_index = reader.readBits(LOG_NO_PREVIOUS_VALUES);
+                prev_value = stored_values[match_index];
+                curr_xored_info.leading_zero_bits = LeadingZero::reverseBinaryRepresentation[reader.readBits(LeadingZero::BIT_LENGTH)];
+                curr_xored_info.data_bits = reader.readBits(DATA_BIT_LENGTH);
+                if (curr_xored_info.data_bits == 0)
+                {
+                    curr_xored_info.data_bits = sizeof(T) * 8;
+                }
+                /// The encoder never produces such a combination. Without this check the trailing zero count
+                /// below would wrap around and the shift amount would be out of range.
+                if (static_cast<size_t>(curr_xored_info.leading_zero_bits) + curr_xored_info.data_bits > sizeof(T) * 8)
+                    throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data: corrupted input data.");
+                curr_xored_info.trailing_zero_bits = sizeof(T) * 8 - curr_xored_info.leading_zero_bits - curr_xored_info.data_bits;
+                xored_data = static_cast<T>(reader.readBits(curr_xored_info.data_bits));
+                xored_data <<= curr_xored_info.trailing_zero_bits;
+                curr_value = prev_value ^ xored_data;
+                break;
+            // 0b00 prefix
+            case 0:
+                match_index = reader.readBits(LOG_NO_PREVIOUS_VALUES);
+                prev_value = stored_values[match_index];
+                curr_value = prev_value;
+                break;
+        }
+        unalignedStoreLittleEndian<T>(dest, curr_value);
+        dest += sizeof(curr_value);
+
+        current_index = (current_index + 1) % NO_PREVIOUS_VALUES;
+        stored_values[current_index] = curr_value;
+        prev_xored_info = curr_xored_info;
+        prev_value = curr_value;
+    }
+}
+
+/// Returns the width in bytes of values of column_type; throws BAD_ARGUMENTS if the codec cannot be applied to it.
+UInt8 getDataBytesSize(const IDataType * column_type)
+{
+    if (!column_type->isValueUnambiguouslyRepresentedInFixedSizeContiguousMemoryRegion())
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec Chimp is not applicable for {} because the data type is not of fixed size",
+            column_type->getName());
+
+    /// Only the widths that doCompressData can actually encode are accepted.
+    const size_t max_size = column_type->getSizeOfValueInMemory();
+    if (!isSupportedDataBytesSize(max_size))
+        throw Exception(ErrorCodes::BAD_ARGUMENTS, "Codec Chimp is only applicable for data types of size 4, 8 bytes. Given type {}",
+            column_type->getName());
+
+    return static_cast<UInt8>(max_size);
+}
+
+}
+
+
+CompressionCodecChimp::CompressionCodecChimp(UInt8 data_bytes_size_)
+    : data_bytes_size(data_bytes_size_)
+{
+    setCodecDescription("Chimp");
+}
+
+uint8_t CompressionCodecChimp::getMethodByte() const
+{
+    return static_cast<uint8_t>(CompressionMethodByte::Chimp);
+}
+
+void CompressionCodecChimp::updateHash(SipHash & hash) const
+{
+    getCodecDesc()->updateTreeHash(hash, /*ignore_aliases=*/ true);
+    hash.update(data_bytes_size);
+}
+
+UInt32 CompressionCodecChimp::getMaxCompressedDataSize(UInt32 uncompressed_size) const
+{
+    const auto result = 2 // common header
+        + data_bytes_size // max bytes skipped if source is not properly aligned.
+        + getCompressedHeaderSize(data_bytes_size) // data-specific header
+        + getCompressedDataSize(data_bytes_size, uncompressed_size);
+    return result;
+}
+
+UInt32 CompressionCodecChimp::doCompressData(const char * source, UInt32 source_size, char * dest) const
+{
+    /// Without this check the switch below would match no case for unsupported widths
+    /// and the data would be silently dropped from the output.
+    if (!isSupportedDataBytesSize(data_bytes_size))
+        throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress with Chimp codec: unsupported data size {}, expected 4 or 8 bytes", static_cast<UInt32>(data_bytes_size));
+
+    UInt8 bytes_to_skip = source_size % data_bytes_size;
+    dest[0] = data_bytes_size;
+    dest[1] = bytes_to_skip; /// unused (backward compatibility)
+    memcpy(&dest[2], source, bytes_to_skip);
+    size_t start_pos = 2 + bytes_to_skip;
+    UInt32 result_size = 0;
+
+    /// Limit for the encoder: the maximum compressed size minus what has already been written at the start of dest.
+    const UInt32 compressed_size = getMaxCompressedDataSize(source_size) - static_cast<UInt32>(start_pos);
+    switch (data_bytes_size) // NOLINT(bugprone-switch-missing-default-case)
+    {
+        case 4:
+            result_size = compressDataForType<UInt32>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos], compressed_size);
+            break;
+        case 8:
+            result_size = compressDataForType<UInt64>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos], compressed_size);
+            break;
+    }
+    return 2 + bytes_to_skip + result_size;
+}
+
+void CompressionCodecChimp::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const
+{
+    if (source_size < 2)
+        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data. File has wrong header");
+
+    UInt8 bytes_size = source[0];
+
+    if (bytes_size == 0)
+        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data. File has wrong header");
+
+    UInt8 bytes_to_skip = uncompressed_size % bytes_size;
+
+    if (static_cast<UInt32>(2 + bytes_to_skip) > source_size)
+        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data. File has wrong header");
+
+    if (bytes_to_skip >= uncompressed_size)
+        throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data. File has wrong header");
+
+    memcpy(dest, &source[2], bytes_to_skip);
+    UInt32 source_size_no_header = source_size - bytes_to_skip - 2;
+    UInt32 uncompressed_size_left = uncompressed_size - bytes_to_skip;
+    switch (bytes_size) // NOLINT(bugprone-switch-missing-default-case)
+    {
+        case 4:
+            decompressDataForType<UInt32>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], uncompressed_size_left);
+            break;
+        case 8:
+            decompressDataForType<UInt64>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip], uncompressed_size_left);
+            break;
+        default:
+            throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress Chimp-encoded data. File has wrong header");
+    }
+}
+
+void registerCodecChimp(CompressionCodecFactory & factory)
+{
+    UInt8 method_code = static_cast<UInt8>(CompressionMethodByte::Chimp);
+    auto codec_builder = [&](const ASTPtr & arguments, const IDataType * column_type) -> CompressionCodecPtr
+    {
+        /// Without an argument or a column type the value width is unknown and 1 is used as a placeholder.
+        /// Such a codec can still decompress (the width is read from the data) but refuses to compress.
+        UInt8 data_bytes_size = 1;
+        if (arguments && !arguments->children.empty())
+        {
+            if (arguments->children.size() > 1)
+                throw Exception(ErrorCodes::ILLEGAL_SYNTAX_FOR_CODEC_TYPE, "Chimp codec must have 1 parameter, given {}", arguments->children.size());
+
+            const auto & children = arguments->children;
+            const auto * literal = children[0]->as<ASTLiteral>();
+            if (!literal || literal->value.getType() != Field::Types::Which::UInt64)
+                throw Exception(ErrorCodes::ILLEGAL_CODEC_PARAMETER, "Chimp codec argument must be unsigned integer");
+
+            size_t user_bytes_size = literal->value.safeGet<UInt64>();
+            if (user_bytes_size != 4 && user_bytes_size != 8)
+                throw Exception(ErrorCodes::ILLEGAL_CODEC_PARAMETER, "Argument value for Chimp codec can be 4 or 8, given {}", user_bytes_size);
+            data_bytes_size = static_cast<UInt8>(user_bytes_size);
+        }
+        else if (column_type)
+        {
+            data_bytes_size = getDataBytesSize(column_type);
+        }
+
+        return std::make_shared<CompressionCodecChimp>(data_bytes_size);
+    };
+    factory.registerCompressionCodecWithType("Chimp", method_code, codec_builder);
+}
+}
diff --git a/src/Compression/CompressionFactory.cpp b/src/Compression/CompressionFactory.cpp
index c8ad3d71376..ec5b0c84628 100644
--- a/src/Compression/CompressionFactory.cpp
+++ b/src/Compression/CompressionFactory.cpp
@@ -186,6 +186,7 @@ void registerCodecGorilla(CompressionCodecFactory & factory);
 void registerCodecEncrypted(CompressionCodecFactory & factory);
 void registerCodecFPC(CompressionCodecFactory & factory);
 void registerCodecGCD(CompressionCodecFactory & factory);
+void registerCodecChimp(CompressionCodecFactory & factory);
 
 CompressionCodecFactory::CompressionCodecFactory()
 {
@@ -203,6 +204,7 @@
CompressionCodecFactory::CompressionCodecFactory() registerCodecGorilla(*this); registerCodecEncrypted(*this); registerCodecFPC(*this); + registerCodecChimp(*this); registerCodecGCD(*this); default_codec = get("LZ4", {}); diff --git a/src/Compression/CompressionInfo.h b/src/Compression/CompressionInfo.h index f01661cbe1d..fa9765fb59e 100644 --- a/src/Compression/CompressionInfo.h +++ b/src/Compression/CompressionInfo.h @@ -48,6 +48,7 @@ enum class CompressionMethodByte : uint8_t FPC = 0x98, GCD = 0x9a, ZSTD_QPL = 0x9b, + Chimp = 0x9c, }; } diff --git a/src/Compression/tests/gtest_compressionCodec.cpp b/src/Compression/tests/gtest_compressionCodec.cpp index 8265ba63fc2..f8ff227db75 100644 --- a/src/Compression/tests/gtest_compressionCodec.cpp +++ b/src/Compression/tests/gtest_compressionCodec.cpp @@ -520,12 +520,13 @@ public: TEST_P(CodecTest, TranscodingWithDataType) { - /// Gorilla can only be applied to floating point columns + /// Gorilla and Chimp can only be applied to floating point columns bool codec_is_gorilla = std::get<0>(GetParam()).codec_statement.find("Gorilla") != std::string::npos; + bool codec_is_chimp = std::get<0>(GetParam()).codec_statement.find("Chimp") != std::string::npos; WhichDataType which(std::get<1>(GetParam()).data_type.get()); bool data_is_float = which.isFloat(); - if (codec_is_gorilla && !data_is_float) - GTEST_SKIP() << "Skipping Gorilla-compressed non-float column"; + if ((codec_is_gorilla || codec_is_chimp) && !data_is_float) + GTEST_SKIP() << "Skipping Gorilla/Chimp-compressed non-float column"; const auto codec = makeCodec(CODEC_WITH_DATA_TYPE); testTranscoding(*codec); @@ -808,7 +809,10 @@ const auto DefaultCodecsToTest = ::testing::Values( Codec("DoubleDelta, ZSTD"), Codec("Gorilla"), Codec("Gorilla, LZ4"), - Codec("Gorilla, ZSTD") + Codec("Gorilla, ZSTD"), + Codec("Chimp"), + Codec("Chimp, LZ4"), + Codec("Chimp, ZSTD") ); /////////////////////////////////////////////////////////////////////////////////////////////////// diff 
--git a/tests/performance/codecs_float_insert.xml b/tests/performance/codecs_float_insert.xml index 64325d30189..b9ce71a4122 100644 --- a/tests/performance/codecs_float_insert.xml +++ b/tests/performance/codecs_float_insert.xml @@ -13,6 +13,7 @@ DoubleDelta Gorilla FPC + Chimp diff --git a/tests/performance/codecs_float_select.xml b/tests/performance/codecs_float_select.xml index 325c140d9a0..0a3e421a5d3 100644 --- a/tests/performance/codecs_float_select.xml +++ b/tests/performance/codecs_float_select.xml @@ -13,6 +13,7 @@ DoubleDelta Gorilla FPC + Chimp diff --git a/tests/queries/0_stateless/00950_test_chimp_codec.reference b/tests/queries/0_stateless/00950_test_chimp_codec.reference new file mode 100644 index 00000000000..5e871ea0329 --- /dev/null +++ b/tests/queries/0_stateless/00950_test_chimp_codec.reference @@ -0,0 +1,2 @@ +F64 +F32 diff --git a/tests/queries/0_stateless/00950_test_chimp_codec.sql b/tests/queries/0_stateless/00950_test_chimp_codec.sql new file mode 100644 index 00000000000..c6115dfd4fd --- /dev/null +++ b/tests/queries/0_stateless/00950_test_chimp_codec.sql @@ -0,0 +1,63 @@ +DROP TABLE IF EXISTS codecTest; + +SET cross_to_inner_join_rewrite = 1; + +CREATE TABLE codecTest ( + key UInt64, + name String, + ref_valueF64 Float64, + ref_valueF32 Float32, + valueF64 Float64 CODEC(Chimp), + valueF32 Float32 CODEC(Chimp) +) Engine = MergeTree ORDER BY key; + +-- best case - same value +INSERT INTO codecTest (key, name, ref_valueF64, valueF64, ref_valueF32, valueF32) + SELECT number AS n, 'e()', e() AS v, v, v, v FROM system.numbers LIMIT 1, 100; + +-- good case - values that grow insignificantly +INSERT INTO codecTest (key, name, ref_valueF64, valueF64, ref_valueF32, valueF32) + SELECT number AS n, 'log2(n)', log2(n) AS v, v, v, v FROM system.numbers LIMIT 101, 100; + +-- bad case - values differ significantly +INSERT INTO codecTest (key, name, ref_valueF64, valueF64, ref_valueF32, valueF32) + SELECT number AS n, 'n*sqrt(n)', n*sqrt(n) AS v, v, v, v 
FROM system.numbers LIMIT 201, 100; + +-- worst case - almost like a random values +INSERT INTO codecTest (key, name, ref_valueF64, valueF64, ref_valueF32, valueF32) + SELECT number AS n, 'sin(n*n*n)*n', sin(n * n * n * n* n) AS v, v, v, v FROM system.numbers LIMIT 301, 100; + + +-- These floating-point values are expected to be BINARY equal, so comparing by-value is Ok here. + +-- referencing previous row key, value, and case name to simplify debugging. +SELECT 'F64'; +SELECT + c1.key, c1.name, + c1.ref_valueF64, c1.valueF64, c1.ref_valueF64 - c1.valueF64 AS dF64, + 'prev:', + c2.key, c2.ref_valueF64 +FROM + codecTest as c1, codecTest as c2 +WHERE + dF64 != 0 +AND + c2.key = c1.key - 1 +LIMIT 10; + + +SELECT 'F32'; +SELECT + c1.key, c1.name, + c1.ref_valueF32, c1.valueF32, c1.ref_valueF32 - c1.valueF32 AS dF32, + 'prev:', + c2.key, c2.ref_valueF32 +FROM + codecTest as c1, codecTest as c2 +WHERE + dF32 != 0 +AND + c2.key = c1.key - 1 +LIMIT 10; + +DROP TABLE IF EXISTS codecTest; diff --git a/tests/queries/0_stateless/02538_nullable_array_tuple_timeseries.sql b/tests/queries/0_stateless/02538_nullable_array_tuple_timeseries.sql index 26451c93ed9..cfaf94b0f7b 100644 --- a/tests/queries/0_stateless/02538_nullable_array_tuple_timeseries.sql +++ b/tests/queries/0_stateless/02538_nullable_array_tuple_timeseries.sql @@ -8,24 +8,33 @@ CREATE TABLE tbl ( -- Nullable v1_gor Nullable(Float64) CODEC(Gorilla), v1_fpc Nullable(Float64) CODEC(FPC), + v1_chi Nullable(Float64) CODEC(Chimp), -- Array v2_gor Array(Float64) CODEC(Gorilla), v2_fpc Array(Float64) CODEC(FPC), + v2_chi Array(Float64) CODEC(Chimp), v3_gor Array(Array(Float64)) CODEC(Gorilla), v3_fpc Array(Array(Float64)) CODEC(FPC), + v3_chi Array(Array(Float64)) CODEC(Chimp), v4_gor Array(Nullable(Float64)) CODEC(Gorilla), v4_fpc Array(Nullable(Float64)) CODEC(FPC), + v4_chi Array(Nullable(Float64)) CODEC(Chimp), v5_gor Array(Tuple(Float64)) CODEC(Gorilla), v5_fpc Array(Tuple(Float64)) CODEC(FPC), + v5_chi 
Array(Tuple(Float64)) CODEC(Chimp), -- Tuple v6_gor Tuple(Float64) CODEC(Gorilla), v6_fpc Tuple(Float64) CODEC(FPC), + v6_chi Tuple(Float64) CODEC(Chimp), v7_gor Tuple(Tuple(Float64)) CODEC(Gorilla), v7_fpc Tuple(Tuple(Float64)) CODEC(FPC), + v7_chi Tuple(Tuple(Float64)) CODEC(Chimp), v8_gor Tuple(Nullable(Float64)) CODEC(Gorilla), v8_fpc Tuple(Nullable(Float64)) CODEC(FPC), + v8_chi Tuple(Nullable(Float64)) CODEC(Chimp), v9_gor Tuple(Array(Float64)) CODEC(Gorilla), v9_fpc Tuple(Array(Float64)) CODEC(FPC), + v9_chi Tuple(Array(Float64)) CODEC(Chimp), ) Engine = MergeTree ORDER BY tuple(); DROP TABLE IF EXISTS tbl; diff --git a/tests/queries/0_stateless/02584_compressor_codecs.reference b/tests/queries/0_stateless/02584_compressor_codecs.reference index bb0850568bb..4dff9ef38ef 100644 --- a/tests/queries/0_stateless/02584_compressor_codecs.reference +++ b/tests/queries/0_stateless/02584_compressor_codecs.reference @@ -7,3 +7,6 @@ 1 1 1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02584_compressor_codecs.sh b/tests/queries/0_stateless/02584_compressor_codecs.sh index fad6847b792..69d6041b8f3 100755 --- a/tests/queries/0_stateless/02584_compressor_codecs.sh +++ b/tests/queries/0_stateless/02584_compressor_codecs.sh @@ -4,31 +4,39 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -echo "Hello, World!" 
> 02584_test_data +DATA_FILE=$(mktemp -q 02584_test_data_XXXXXX) +OUT_FILE=$(mktemp -q 02584_test_out_XXXXXX) -$CLICKHOUSE_COMPRESSOR --codec 'Delta' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' -$CLICKHOUSE_COMPRESSOR --codec 'Delta(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'Delta([1,2])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'Delta(4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; +echo "Hello, World!" > $DATA_FILE; -$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' -$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta([1,2])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta(4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; +$CLICKHOUSE_COMPRESSOR --codec 'Delta' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE; +$CLICKHOUSE_COMPRESSOR --codec 'Delta(5)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'Delta([1,2])' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'Delta(4)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE; -$CLICKHOUSE_COMPRESSOR --codec 'Gorilla' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' -$CLICKHOUSE_COMPRESSOR --codec 'Gorilla(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'Gorilla([1,2])' --codec 
'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'Gorilla(4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; +$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE; +$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta(5)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta([1,2])' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'DoubleDelta(4)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE; -$CLICKHOUSE_COMPRESSOR --codec 'FPC' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; -$CLICKHOUSE_COMPRESSOR --codec 'FPC(5)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; -$CLICKHOUSE_COMPRESSOR --codec 'FPC(5, 1)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'FPC([1,2,3])' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; -$CLICKHOUSE_COMPRESSOR --codec 'FPC(5, 4)' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out'; +$CLICKHOUSE_COMPRESSOR --codec 'Gorilla' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE; +$CLICKHOUSE_COMPRESSOR --codec 'Gorilla(5)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'Gorilla([1,2])' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'Gorilla(4)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE; + +$CLICKHOUSE_COMPRESSOR --codec 'Chimp(1)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'Chimp(5)' --codec 
'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'Chimp([1,2])' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'Chimp(4)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE; + +$CLICKHOUSE_COMPRESSOR --codec 'FPC' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE; +$CLICKHOUSE_COMPRESSOR --codec 'FPC(5)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE; +$CLICKHOUSE_COMPRESSOR --codec 'FPC(5, 1)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'FPC([1,2,3])' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "ILLEGAL_CODEC_PARAMETER"; +$CLICKHOUSE_COMPRESSOR --codec 'FPC(5, 4)' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE; -$CLICKHOUSE_COMPRESSOR --codec 'T64' --codec 'LZ4' --input '02584_test_data' --output '02584_test_out' 2>&1 | grep -c "CANNOT_COMPRESS"; +$CLICKHOUSE_COMPRESSOR --codec 'T64' --codec 'LZ4' --input $DATA_FILE --output $OUT_FILE 2>&1 | grep -c "CANNOT_COMPRESS"; -rm 02584_test_data 02584_test_out +rm $DATA_FILE $OUT_FILE