ClickHouse/dbms/src/Compression/CompressionCodecGorilla.cpp

#include <Compression/CompressionCodecGorilla.h>
#include <Compression/CompressionInfo.h>
#include <Compression/CompressionFactory.h>
#include <common/unaligned.h>
#include <Parsers/IAST_fwd.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadBufferFromMemory.h>
#include <IO/BitHelpers.h>

#include <string.h>
#include <algorithm>
#include <cstdlib>
#include <type_traits>

#include <bitset>

namespace DB
{

namespace ErrorCodes
{
extern const int CANNOT_COMPRESS;
extern const int CANNOT_DECOMPRESS;
extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE;
extern const int ILLEGAL_CODEC_PARAMETER;
}

namespace
{

/// Returns the width, in bits, of the field that stores "number of significant data bits"
/// for values that are `data_bytes_size` bytes wide:
///   1-byte value is  8 bits, and 4 bits are needed to represent 8 (0b1000);
///   2-byte value is 16 bits => 5;
///   4-byte value is 32 bits => 6;
///   8-byte value is 64 bits => 7.
/// Only widths 1, 2, 4 and 8 are supported. Any other width trips the assert in debug builds
/// and yields 0 when asserts are disabled (instead of indexing a lookup table out of bounds).
constexpr inline UInt8 getBitLengthOfLength(UInt8 data_bytes_size)
{
    UInt8 bit_length = 0;

    switch (data_bytes_size)
    {
    case 1:
        bit_length = 4;
        break;
    case 2:
        bit_length = 5;
        break;
    case 4:
        bit_length = 6;
        break;
    case 8:
        bit_length = 7;
        break;
    default:
        break;
    }

    assert(bit_length != 0);

    return bit_length;
}


/// Size in bytes of the data-specific header: a 4-byte items count plus `data_bytes_size` more bytes.
UInt32 getCompressedHeaderSize(UInt8 data_bytes_size)
{
    constexpr UInt32 items_count_bytes = 4;
    const UInt32 header_bytes = items_count_bytes + data_bytes_size;

    return header_bytes;
}

/// Upper bound, in bytes, for the bit-packed payload produced from `uncompressed_size` bytes
/// of values that are `data_bytes_size` bytes wide.
/// The result is narrowed to UInt32, matching the return type; the intermediate product is
/// computed in 64 bits so it cannot wrap before the division.
UInt32 getCompressedDataSize(UInt8 data_bytes_size, UInt32 uncompressed_size)
{
    const UInt32 items_count = uncompressed_size / data_bytes_size;

    // Deliberately NOT `static`: these depend on the runtime argument. A function-local static
    // would be initialized by the first call only and then be reused for every other width.
    const auto data_bit_length = getBitLengthOfLength(data_bytes_size);
    // -1 since there must be at least 1 non-zero bit.
    const auto leading_zeroes_bit_length = data_bit_length - 1;

    // worst case per item (e.g. for a 32-bit value):
    // 2 prefix bits + 5 bits of leading-zeroes count + 6 bits of data bit-size + 32 data bits.
    const UInt64 max_item_size_bits = 2 + leading_zeroes_bit_length + data_bit_length + data_bytes_size * 8;

    // + 8 is to round up to next byte.
    return static_cast<UInt32>((items_count * max_item_size_bits + 8) / 8);
}

/// Describes where the significant bits of a value sit: how many zero bits precede them,
/// how many bits they span, and how many zero bits follow them.
/// Kept as a plain aggregate: it is brace-initialized in field order by the code below.
struct binary_value_info
{
    UInt8 leading_zero_bits;  // count of zero bits before the significant bits
    UInt8 data_bits;          // width of the significant part; 0 is used for "nothing recorded yet"
    UInt8 trailing_zero_bits; // count of zero bits after the significant bits
};

/// Builds a binary_value_info for `value`: leading/trailing zero counts come from the
/// getLeadingZeroBits/getTrailingZeroBits helpers, and the data width is whatever remains
/// of the sizeof(T) * 8 bits. A zero value always gets a data width of 0.
template <typename T>
binary_value_info getLeadingAndTrailingBits(const T & value)
{
    binary_value_info info{};

    info.leading_zero_bits = getLeadingZeroBits(value);
    info.trailing_zero_bits = getTrailingZeroBits(value);

    if (value != 0)
        info.data_bits = static_cast<UInt8>(sizeof(T) * 8 - info.leading_zero_bits - info.trailing_zero_bits);

    return info;
}

/// Encodes `source_size` bytes of T-sized values from `source` into `dest` (capacity `dest_size` bytes).
///
/// Output layout:
///   UInt32 items count,
///   the first value stored verbatim (a zero placeholder when there are no items, so that the
///   bytes counted in the returned size are always initialized),
///   then, for every following value, the XOR with its predecessor written through BitWriter as
///     '0'                                              - XOR is zero;
///     '10' + data bits                                 - significant bits fit the previously written window;
///     '11' + leading zeroes count + data bit count + data bits - a new window is written.
///
/// Returns the number of bytes occupied in `dest`:
/// sizeof(UInt32) + sizeof(T) + whole bytes reported by the bit writer.
/// Throws CANNOT_COMPRESS if `source_size` is not a multiple of sizeof(T) or if `dest_size`
/// cannot hold even the fixed-size header.
template <typename T>
UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 dest_size)
{
    const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T));
    // -1 since there must be at least 1 non-zero bit.
    const auto LEADING_ZEROES_BIT_LENGTH = DATA_BIT_LENGTH - 1;

    if (source_size % sizeof(T) != 0)
        throw Exception("Cannot compress, data size " + toString(source_size) + " is not aligned to " + toString(sizeof(T)), ErrorCodes::CANNOT_COMPRESS);

    // The items count and the first-value slot are written unconditionally below.
    if (dest_size < sizeof(UInt32) + sizeof(T))
        throw Exception("Cannot compress, destination size " + toString(dest_size) + " is too small for the header", ErrorCodes::CANNOT_COMPRESS);

    const char * source_end = source + source_size;
    const char * dest_end = dest + dest_size;

    const UInt32 items_count = source_size / sizeof(T);

    unalignedStore<UInt32>(dest, items_count);
    dest += sizeof(items_count);

    T prev_value{};
    // That would cause first XORed value to be written in-full.
    binary_value_info prev_xored_info{0, 0, 0};

    if (source < source_end)
    {
        prev_value = unalignedLoad<T>(source);
        source += sizeof(prev_value);
    }

    // Always fill the first-value slot: the returned size includes it even for empty input,
    // so leaving it untouched would put uninitialized memory into the compressed stream.
    unalignedStore<T>(dest, prev_value);
    dest += sizeof(prev_value);

    BitWriter writer(dest, dest_end - dest);

    while (source < source_end)
    {
        const T curr_value = unalignedLoad<T>(source);
        source += sizeof(curr_value);

        const auto xored_data = curr_value ^ prev_value;
        const binary_value_info curr_xored_info = getLeadingAndTrailingBits(xored_data);

        if (xored_data == 0)
        {
            writer.writeBits(1, 0);
        }
        else if (prev_xored_info.data_bits != 0
                && prev_xored_info.leading_zero_bits <= curr_xored_info.leading_zero_bits
                && prev_xored_info.trailing_zero_bits <= curr_xored_info.trailing_zero_bits)
        {
            // Significant bits fit into the window written last time: reuse it.
            writer.writeBits(2, 0b10);
            writer.writeBits(prev_xored_info.data_bits, xored_data >> prev_xored_info.trailing_zero_bits);
        }
        else
        {
            // Describe a new window and remember it for the following values.
            writer.writeBits(2, 0b11);
            writer.writeBits(LEADING_ZEROES_BIT_LENGTH, curr_xored_info.leading_zero_bits);
            writer.writeBits(DATA_BIT_LENGTH, curr_xored_info.data_bits);
            writer.writeBits(curr_xored_info.data_bits, xored_data >> curr_xored_info.trailing_zero_bits);
            prev_xored_info = curr_xored_info;
        }

        prev_value = curr_value;
    }

    writer.flush();

    return sizeof(items_count) + sizeof(prev_value) + writer.count() / 8;
}

template <typename T>
void decompressDataForType(const char * source, UInt32 source_size, char * dest)
{
    static const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T));
    // -1 since there must be at least 1 non-zero bit.
    static const auto LEADING_ZEROES_BIT_LENGTH = DATA_BIT_LENGTH - 1;

    const char * source_end = source + source_size;

    const UInt32 items_count = unalignedLoad<UInt32>(source);
    source += sizeof(items_count);

    T prev_value{};

    if (source < source_end)
    {
        prev_value = unalignedLoad<T>(source);
        unalignedStore<T>(dest, prev_value);

        source += sizeof(prev_value);
        dest += sizeof(prev_value);
    }

    BitReader reader(source, source_size - sizeof(items_count) - sizeof(prev_value));

    binary_value_info prev_xored_info{0, 0, 0};

    // since data is tightly packed, up to 1 bit per value, and last byte is padded with zeroes,
    // we have to keep track of items to avoid reading more that there is.
    for (UInt32 items_read = 1; items_read < items_count && !reader.eof(); ++items_read)
    {
        T curr_value = prev_value;
        binary_value_info curr_xored_info = prev_xored_info;
        T xored_data{};

        if (reader.readBit() == 1)
        {
            if (reader.readBit() == 1)
            {
                // 0b11 prefix
                curr_xored_info.leading_zero_bits = reader.readBits(LEADING_ZEROES_BIT_LENGTH);
                curr_xored_info.data_bits = reader.readBits(DATA_BIT_LENGTH);
                curr_xored_info.trailing_zero_bits = sizeof(T) * 8 - curr_xored_info.leading_zero_bits - curr_xored_info.data_bits;
            }
            // else: 0b10 prefix - use prev_xored_info

            if (curr_xored_info.leading_zero_bits == 0
                && curr_xored_info.data_bits == 0
                && curr_xored_info.trailing_zero_bits == 0)
            {
                throw Exception("Cannot decompress gorilla-encoded data: corrupted input data.",
                        ErrorCodes::CANNOT_DECOMPRESS);
            }

            xored_data = reader.readBits(curr_xored_info.data_bits);
            xored_data <<= curr_xored_info.trailing_zero_bits;
            curr_value = prev_value ^ xored_data;
        }
        // else: 0b0 prefix - use prev_value

        unalignedStore<T>(dest, curr_value);
        dest += sizeof(curr_value);

        prev_xored_info = curr_xored_info;
        prev_value = curr_value;
    }
}

/// Picks the value width the codec works with for `column_type`.
/// Yields the in-memory value size when the type reports a maximum value size and that size
/// is 1, 2, 4 or 8 bytes; yields 1 in every other case, including a null `column_type`.
UInt8 getDataBytesSize(DataTypePtr column_type)
{
    constexpr UInt8 fallback_width = 1;

    if (!column_type || !column_type->haveMaximumSizeOfValue())
        return fallback_width;

    const size_t value_size = column_type->getSizeOfValueInMemory();
    switch (value_size)
    {
    case 1:
    case 2:
    case 4:
    case 8:
        return static_cast<UInt8>(value_size);
    default:
        return fallback_width;
    }
}

}


/// Stores the width, in bytes, of the values this codec instance will process.
/// The argument is taken as is; no validation is performed here.
CompressionCodecGorilla::CompressionCodecGorilla(UInt8 data_bytes_size_)
    : data_bytes_size(data_bytes_size_)
{
}

/// Returns the CompressionMethodByte::Gorilla identifier as a raw byte.
UInt8 CompressionCodecGorilla::getMethodByte() const
{
    const auto method = CompressionMethodByte::Gorilla;
    return static_cast<UInt8>(method);
}

/// Returns the textual name of this codec.
String CompressionCodecGorilla::getCodecDesc() const
{
    const char * const codec_name = "Gorilla";
    return codec_name;
}

/// Worst-case number of bytes doCompressData may need for `uncompressed_size` input bytes:
/// the 2-byte codec header, room for the unaligned prefix copied verbatim,
/// the data-specific header and the bit-packed payload estimate.
UInt32 CompressionCodecGorilla::getMaxCompressedDataSize(UInt32 uncompressed_size) const
{
    const UInt32 common_header_bytes = 2;
    // max bytes skipped if source is not properly aligned.
    const UInt32 max_skipped_bytes = data_bytes_size;
    const UInt32 data_header_bytes = getCompressedHeaderSize(data_bytes_size);
    const UInt32 payload_bytes = getCompressedDataSize(data_bytes_size, uncompressed_size);

    return common_header_bytes + max_skipped_bytes + data_header_bytes + payload_bytes;
}

/// Compresses `source_size` bytes from `source` into `dest` and returns the number of bytes written.
///
/// Output layout:
///   dest[0]                   - value width in bytes (data_bytes_size);
///   dest[1]                   - number of leading bytes copied verbatim (source_size % data_bytes_size);
///   dest[2 .. 2 + skipped)    - those leading bytes;
///   the rest                  - output of compressDataForType for the remaining, aligned part.
///
/// `dest` is expected to provide at least getMaxCompressedDataSize(source_size) bytes.
/// Throws CANNOT_COMPRESS when data_bytes_size is not 1, 2, 4 or 8.
UInt32 CompressionCodecGorilla::doCompressData(const char * source, UInt32 source_size, char * dest) const
{
    // Reject unsupported widths up front: zero would divide by zero below, and any other
    // unexpected value would fall through the switch and silently drop the data.
    if (data_bytes_size != 1 && data_bytes_size != 2 && data_bytes_size != 4 && data_bytes_size != 8)
        throw Exception("Cannot compress, unsupported data size " + toString(static_cast<UInt32>(data_bytes_size)), ErrorCodes::CANNOT_COMPRESS);

    const UInt8 bytes_to_skip = source_size % data_bytes_size;
    dest[0] = data_bytes_size;
    dest[1] = bytes_to_skip; /// unused (backward compatibility)
    memcpy(&dest[2], source, bytes_to_skip);
    const UInt32 start_pos = 2 + bytes_to_skip;

    // Capacity left after the bytes written above. start_pos is at most 2 + data_bytes_size - 1,
    // which getMaxCompressedDataSize already accounts for, so this cannot underflow.
    const UInt32 remaining_capacity = getMaxCompressedDataSize(source_size) - start_pos;

    const char * aligned_source = &source[bytes_to_skip];
    const UInt32 aligned_size = source_size - bytes_to_skip;
    char * payload = &dest[start_pos];

    UInt32 result_size = 0;
    switch (data_bytes_size)
    {
    case 1:
        result_size = compressDataForType<UInt8>(aligned_source, aligned_size, payload, remaining_capacity);
        break;
    case 2:
        result_size = compressDataForType<UInt16>(aligned_source, aligned_size, payload, remaining_capacity);
        break;
    case 4:
        result_size = compressDataForType<UInt32>(aligned_source, aligned_size, payload, remaining_capacity);
        break;
    case 8:
        result_size = compressDataForType<UInt64>(aligned_source, aligned_size, payload, remaining_capacity);
        break;
    }

    // The verbatim prefix is part of the output and doDecompressData subtracts it from the
    // compressed size, so it has to be counted here as well.
    return start_pos + result_size;
}

void CompressionCodecGorilla::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const
{
    if (source_size < 2)
        throw Exception("Cannot decompress. File has wrong header", ErrorCodes::CANNOT_DECOMPRESS);

    UInt8 bytes_size = source[0];
    UInt8 bytes_to_skip = uncompressed_size % bytes_size;

    if (UInt32(2 + bytes_to_skip) > source_size)
        throw Exception("Cannot decompress. File has wrong header", ErrorCodes::CANNOT_DECOMPRESS);

    memcpy(dest, &source[2], bytes_to_skip);
    UInt32 source_size_no_header = source_size - bytes_to_skip - 2;
    switch (bytes_size)
    {
    case 1:
        decompressDataForType<UInt8>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
        break;
    case 2:
        decompressDataForType<UInt16>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
        break;
    case 4:
        decompressDataForType<UInt32>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
        break;
    case 8:
        decompressDataForType<UInt64>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);
        break;
    }
}

/// Re-derives the value width used by this codec from `data_type` (see getDataBytesSize).
void CompressionCodecGorilla::useInfoAboutType(DataTypePtr data_type)
{
    const UInt8 width_for_type = getDataBytesSize(data_type);
    data_bytes_size = width_for_type;
}

/// Registers the "Gorilla" codec in `factory` under the CompressionMethodByte::Gorilla code.
/// The registered creator ignores codec arguments and sizes the codec from the column type.
void registerCodecGorilla(CompressionCodecFactory & factory)
{
    const UInt8 method_code = UInt8(CompressionMethodByte::Gorilla);

    // The creator is handed to the factory and may be invoked after this function returns,
    // so it must not capture locals by reference; it needs no captures at all.
    factory.registerCompressionCodecWithType("Gorilla", method_code, [](const ASTPtr &, DataTypePtr column_type) -> CompressionCodecPtr
    {
        const UInt8 data_bytes_size = getDataBytesSize(column_type);
        return std::make_shared<CompressionCodecGorilla>(data_bytes_size);
    });
}
}
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`#include <Compression/CompressionCodecGorilla.h>`
			`#include <Compression/CompressionInfo.h>`
			`#include <Compression/CompressionFactory.h>`
			`#include <common/unaligned.h>`
			`#include <Parsers/IAST_fwd.h>`
			`#include <IO/WriteHelpers.h>`
			`#include <IO/ReadBufferFromMemory.h>`
			`#include <IO/BitHelpers.h>`

			`#include <string.h>`
			`#include <algorithm>`
			`#include <cstdlib>`
			`#include <type_traits>`

			`#include <bitset>`

			`namespace DB`
			`{`

			`namespace ErrorCodes`
			`{`
			`extern const int CANNOT_COMPRESS;`
			`extern const int CANNOT_DECOMPRESS;`
			`extern const int ILLEGAL_SYNTAX_FOR_CODEC_TYPE;`
			`extern const int ILLEGAL_CODEC_PARAMETER;`
			`}`

			`namespace`
			`{`

			`constexpr inline UInt8 getBitLengthOfLength(UInt8 data_bytes_size)`
			`{`
			`// 1-byte value is 8 bits, and we need 4 bits to represent 8 : 1000,`
			`// 2-byte 16 bits => 5`
			`// 4-byte 32 bits => 6`
			`// 8-byte 64 bits => 7`
			`const UInt8 bit_lengths[] = {0, 4, 5, 0, 6, 0, 0, 0, 7};`
			`assert(data_bytes_size >= 1 && data_bytes_size < sizeof(bit_lengths) && bit_lengths[data_bytes_size] != 0);`

			`return bit_lengths[data_bytes_size];`
			`}`


			`UInt32 getCompressedHeaderSize(UInt8 data_bytes_size)`
			`{`
			`const UInt8 items_count_size = 4;`

			`return items_count_size + data_bytes_size;`
			`}`

			`UInt32 getCompressedDataSize(UInt8 data_bytes_size, UInt32 uncompressed_size)`
			`{`
			`const UInt32 items_count = uncompressed_size / data_bytes_size;`
			`static const auto DATA_BIT_LENGTH = getBitLengthOfLength(data_bytes_size);`
			`// -1 since there must be at least 1 non-zero bit.`
			`static const auto LEADING_ZEROES_BIT_LENGTH = DATA_BIT_LENGTH - 1;`

			`// worst case (for 32-bit value):`
			`// 11 + 5 bits of leading zeroes bit-size + 5 bits of data bit-size + non-zero data bits.`
			`const UInt32 max_item_size_bits = 2 + LEADING_ZEROES_BIT_LENGTH + DATA_BIT_LENGTH + data_bytes_size * 8;`

			`// + 8 is to round up to next byte.`
			`return (items_count * max_item_size_bits + 8) / 8;`
			`}`

			`struct binary_value_info`
			`{`
			`UInt8 leading_zero_bits;`
			`UInt8 data_bits;`
			`UInt8 trailing_zero_bits;`
			`};`

			`template <typename T>`
			`binary_value_info getLeadingAndTrailingBits(const T & value)`
			`{`
			`constexpr UInt8 bit_size = sizeof(T) * 8;`

			`const UInt8 lz = getLeadingZeroBits(value);`
			`const UInt8 tz = getTrailingZeroBits(value);`
			`const UInt8 data_size = value == 0 ? 0 : static_cast<UInt8>(bit_size - lz - tz);`
Fixed Gorilla encoding error on small sequences. Added test cases for small sequences; Refurbished test cases for codecs; 2019-08-11 08:01:02 +00:00
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`return binary_value_info{lz, data_size, tz};`
			`}`

			`template <typename T>`
Fixed Gorilla encoding error on small sequences. Added test cases for small sequences; Refurbished test cases for codecs; 2019-08-11 08:01:02 +00:00			`UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 dest_size)`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`{`
			`static const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T));`
			`// -1 since there must be at least 1 non-zero bit.`
			`static const auto LEADING_ZEROES_BIT_LENGTH = DATA_BIT_LENGTH - 1;`

			`if (source_size % sizeof(T) != 0)`
			`throw Exception("Cannot compress, data size " + toString(source_size) + " is not aligned to " + toString(sizeof(T)), ErrorCodes::CANNOT_COMPRESS);`
			`const char * source_end = source + source_size;`
Fixed Gorilla encoding error on small sequences. Added test cases for small sequences; Refurbished test cases for codecs; 2019-08-11 08:01:02 +00:00			`const char * dest_end = dest + dest_size;`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00
			`const UInt32 items_count = source_size / sizeof(T);`

Require explicit type in unalignedStore This is a follow-up to PR #5786, which fixed a segfault caused by an unexpected deduced type for unalignedStore. To prevent future errors of this kind, require a caller to specify the stored type explicitly. 2019-06-28 16:21:05 +00:00			`unalignedStore<UInt32>(dest, items_count);`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`dest += sizeof(items_count);`

			`T prev_value{};`
			`// That would cause first XORed value to be written in-full.`
			`binary_value_info prev_xored_info{0, 0, 0};`

			`if (source < source_end)`
			`{`
			`prev_value = unalignedLoad<T>(source);`
Require explicit type in unalignedStore This is a follow-up to PR #5786, which fixed a segfault caused by an unexpected deduced type for unalignedStore. To prevent future errors of this kind, require a caller to specify the stored type explicitly. 2019-06-28 16:21:05 +00:00			`unalignedStore<T>(dest, prev_value);`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00
			`source += sizeof(prev_value);`
			`dest += sizeof(prev_value);`
			`}`

Performance improvement for Gorilla and DoubleDelta: through BitReader and BitWriter BitWriter and BitReader are now operating on memory directly rather than via Read/Write buffers. 2019-11-29 12:29:32 +00:00			`BitWriter writer(dest, dest_end - dest);`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00
			`while (source < source_end)`
			`{`
			`const T curr_value = unalignedLoad<T>(source);`
			`source += sizeof(curr_value);`

			`const auto xored_data = curr_value ^ prev_value;`
			`const binary_value_info curr_xored_info = getLeadingAndTrailingBits(xored_data);`

			`if (xored_data == 0)`
			`{`
			`writer.writeBits(1, 0);`
			`}`
			`else if (prev_xored_info.data_bits != 0`
			`&& prev_xored_info.leading_zero_bits <= curr_xored_info.leading_zero_bits`
			`&& prev_xored_info.trailing_zero_bits <= curr_xored_info.trailing_zero_bits)`
			`{`
			`writer.writeBits(2, 0b10);`
			`writer.writeBits(prev_xored_info.data_bits, xored_data >> prev_xored_info.trailing_zero_bits);`
			`}`
			`else`
			`{`
			`writer.writeBits(2, 0b11);`
			`writer.writeBits(LEADING_ZEROES_BIT_LENGTH, curr_xored_info.leading_zero_bits);`
			`writer.writeBits(DATA_BIT_LENGTH, curr_xored_info.data_bits);`
			`writer.writeBits(curr_xored_info.data_bits, xored_data >> curr_xored_info.trailing_zero_bits);`
			`prev_xored_info = curr_xored_info;`
			`}`

			`prev_value = curr_value;`
			`}`

			`writer.flush();`

Performance improvement for Gorilla and DoubleDelta: through BitReader and BitWriter BitWriter and BitReader are now operating on memory directly rather than via Read/Write buffers. 2019-11-29 12:29:32 +00:00			`return sizeof(items_count) + sizeof(prev_value) + writer.count() / 8;`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`}`

			`template <typename T>`
			`void decompressDataForType(const char * source, UInt32 source_size, char * dest)`
			`{`
			`static const auto DATA_BIT_LENGTH = getBitLengthOfLength(sizeof(T));`
			`// -1 since there must be at least 1 non-zero bit.`
			`static const auto LEADING_ZEROES_BIT_LENGTH = DATA_BIT_LENGTH - 1;`

			`const char * source_end = source + source_size;`

			`const UInt32 items_count = unalignedLoad<UInt32>(source);`
			`source += sizeof(items_count);`

			`T prev_value{};`

			`if (source < source_end)`
			`{`
			`prev_value = unalignedLoad<T>(source);`
Require explicit type in unalignedStore This is a follow-up to PR #5786, which fixed a segfault caused by an unexpected deduced type for unalignedStore. To prevent future errors of this kind, require a caller to specify the stored type explicitly. 2019-06-28 16:21:05 +00:00			`unalignedStore<T>(dest, prev_value);`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00
			`source += sizeof(prev_value);`
			`dest += sizeof(prev_value);`
			`}`

Performance improvement for Gorilla and DoubleDelta: through BitReader and BitWriter BitWriter and BitReader are now operating on memory directly rather than via Read/Write buffers. 2019-11-29 12:29:32 +00:00			`BitReader reader(source, source_size - sizeof(items_count) - sizeof(prev_value));`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00
			`binary_value_info prev_xored_info{0, 0, 0};`

			`// since data is tightly packed, up to 1 bit per value, and last byte is padded with zeroes,`
			`// we have to keep track of items to avoid reading more that there is.`
			`for (UInt32 items_read = 1; items_read < items_count && !reader.eof(); ++items_read)`
			`{`
Post-PR fixes: * BitHelpers.cpp was removed, corresponding code was moved to the header * BitIO test as GTest-based test binary * gtest-based unit test for DoubleDelta and Gorilla codecs * getLeadingZeroBits from SFINAE to consexpr if * removed couple of unneeded if's * Fixed sql-test to use one table 2019-06-13 14:04:38 +00:00			`T curr_value = prev_value;`
			`binary_value_info curr_xored_info = prev_xored_info;`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`T xored_data{};`

Post-PR fixes: * BitHelpers.cpp was removed, corresponding code was moved to the header * BitIO test as GTest-based test binary * gtest-based unit test for DoubleDelta and Gorilla codecs * getLeadingZeroBits from SFINAE to consexpr if * removed couple of unneeded if's * Fixed sql-test to use one table 2019-06-13 14:04:38 +00:00			`if (reader.readBit() == 1)`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`{`
Post-PR fixes: * BitHelpers.cpp was removed, corresponding code was moved to the header * BitIO test as GTest-based test binary * gtest-based unit test for DoubleDelta and Gorilla codecs * getLeadingZeroBits from SFINAE to consexpr if * removed couple of unneeded if's * Fixed sql-test to use one table 2019-06-13 14:04:38 +00:00			`if (reader.readBit() == 1)`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`{`
			`// 0b11 prefix`
			`curr_xored_info.leading_zero_bits = reader.readBits(LEADING_ZEROES_BIT_LENGTH);`
			`curr_xored_info.data_bits = reader.readBits(DATA_BIT_LENGTH);`
			`curr_xored_info.trailing_zero_bits = sizeof(T) * 8 - curr_xored_info.leading_zero_bits - curr_xored_info.data_bits;`
			`}`
Post-PR fixes: * BitHelpers.cpp was removed, corresponding code was moved to the header * BitIO test as GTest-based test binary * gtest-based unit test for DoubleDelta and Gorilla codecs * getLeadingZeroBits from SFINAE to consexpr if * removed couple of unneeded if's * Fixed sql-test to use one table 2019-06-13 14:04:38 +00:00			`// else: 0b10 prefix - use prev_xored_info`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00
			`if (curr_xored_info.leading_zero_bits == 0`
			`&& curr_xored_info.data_bits == 0`
			`&& curr_xored_info.trailing_zero_bits == 0)`
			`{`
			`throw Exception("Cannot decompress gorilla-encoded data: corrupted input data.",`
			`ErrorCodes::CANNOT_DECOMPRESS);`
			`}`

			`xored_data = reader.readBits(curr_xored_info.data_bits);`
			`xored_data <<= curr_xored_info.trailing_zero_bits;`
			`curr_value = prev_value ^ xored_data;`
			`}`
Post-PR fixes: * BitHelpers.cpp was removed, corresponding code was moved to the header * BitIO test as GTest-based test binary * gtest-based unit test for DoubleDelta and Gorilla codecs * getLeadingZeroBits from SFINAE to consexpr if * removed couple of unneeded if's * Fixed sql-test to use one table 2019-06-13 14:04:38 +00:00			`// else: 0b0 prefix - use prev_value`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00
Require explicit type in unalignedStore This is a follow-up to PR #5786, which fixed a segfault caused by an unexpected deduced type for unalignedStore. To prevent future errors of this kind, require a caller to specify the stored type explicitly. 2019-06-28 16:21:05 +00:00			`unalignedStore<T>(dest, curr_value);`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`dest += sizeof(curr_value);`

			`prev_xored_info = curr_xored_info;`
			`prev_value = curr_value;`
			`}`
			`}`

			`UInt8 getDataBytesSize(DataTypePtr column_type)`
			`{`
			`UInt8 delta_bytes_size = 1;`
			`if (column_type && column_type->haveMaximumSizeOfValue())`
			`{`
			`size_t max_size = column_type->getSizeOfValueInMemory();`
			`if (max_size == 1 \|\| max_size == 2 \|\| max_size == 4 \|\| max_size == 8)`
			`delta_bytes_size = static_cast<UInt8>(max_size);`
			`}`
			`return delta_bytes_size;`
			`}`

Post-PR fixes #2 Fixed style issues and build for clang-7; 64-bit buffer for BitReader and BitWriter; Fixed overflow and writing more bytes on flushing; Added maskLowBits() and tests for it. 2019-06-17 03:27:42 +00:00			`}`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00

			`CompressionCodecGorilla::CompressionCodecGorilla(UInt8 data_bytes_size_)`
			`: data_bytes_size(data_bytes_size_)`
			`{`
			`}`

			`UInt8 CompressionCodecGorilla::getMethodByte() const`
			`{`
			`return static_cast<UInt8>(CompressionMethodByte::Gorilla);`
			`}`

			`String CompressionCodecGorilla::getCodecDesc() const`
			`{`
			`return "Gorilla";`
			`}`

			`UInt32 CompressionCodecGorilla::getMaxCompressedDataSize(UInt32 uncompressed_size) const`
			`{`
			`const auto result = 2 // common header`
			`+ data_bytes_size // max bytes skipped if source is not properly aligned.`
			`+ getCompressedHeaderSize(data_bytes_size) // data-specific header`
			`+ getCompressedDataSize(data_bytes_size, uncompressed_size);`

			`return result;`
			`}`

			`UInt32 CompressionCodecGorilla::doCompressData(const char * source, UInt32 source_size, char * dest) const`
			`{`
			`UInt8 bytes_to_skip = source_size % data_bytes_size;`
			`dest[0] = data_bytes_size;`
fix vulnerabilities 2019-08-26 14:39:49 +00:00			`dest[1] = bytes_to_skip; /// unused (backward compatibility)`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`memcpy(&dest[2], source, bytes_to_skip);`
			`size_t start_pos = 2 + bytes_to_skip;`
Fixed Gorilla encoding error on small sequences. Added test cases for small sequences; Refurbished test cases for codecs; 2019-08-11 08:01:02 +00:00			`UInt32 result_size = 0;`

			`const UInt32 compressed_size = getMaxCompressedDataSize(source_size);`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`switch (data_bytes_size)`
			`{`
			`case 1:`
Fixed Gorilla encoding error on small sequences. Added test cases for small sequences; Refurbished test cases for codecs; 2019-08-11 08:01:02 +00:00			`result_size = compressDataForType<UInt8>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos], compressed_size);`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`break;`
			`case 2:`
Fixed Gorilla encoding error on small sequences. Added test cases for small sequences; Refurbished test cases for codecs; 2019-08-11 08:01:02 +00:00			`result_size = compressDataForType<UInt16>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos], compressed_size);`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`break;`
			`case 4:`
Fixed Gorilla encoding error on small sequences. Added test cases for small sequences; Refurbished test cases for codecs; 2019-08-11 08:01:02 +00:00			`result_size = compressDataForType<UInt32>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos], compressed_size);`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`break;`
			`case 8:`
Fixed Gorilla encoding error on small sequences. Added test cases for small sequences; Refurbished test cases for codecs; 2019-08-11 08:01:02 +00:00			`result_size = compressDataForType<UInt64>(&source[bytes_to_skip], source_size - bytes_to_skip, &dest[start_pos], compressed_size);`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`break;`
			`}`

Fixed Gorilla encoding error on small sequences. Added test cases for small sequences; Refurbished test cases for codecs; 2019-08-11 08:01:02 +00:00			`return 1 + 1 + result_size;`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`}`

fix vulnerabilities 2019-08-26 14:39:49 +00:00			`void CompressionCodecGorilla::doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`{`
fix vulnerabilities 2019-08-26 14:39:49 +00:00			`if (source_size < 2)`
			`throw Exception("Cannot decompress. File has wrong header", ErrorCodes::CANNOT_DECOMPRESS);`

Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00			`UInt8 bytes_size = source[0];`
fix vulnerabilities 2019-08-26 14:39:49 +00:00			`UInt8 bytes_to_skip = uncompressed_size % bytes_size;`

fix build 2019-08-26 16:58:40 +00:00			`if (UInt32(2 + bytes_to_skip) > source_size)`
fix vulnerabilities 2019-08-26 14:39:49 +00:00			`throw Exception("Cannot decompress. File has wrong header", ErrorCodes::CANNOT_DECOMPRESS);`
Gorilla column encoding Added Gorilla column encoding for any fixed-width type; Added tests for Float32 and Float64 values. 2019-06-12 17:12:08 +00:00
			`memcpy(dest, &source[2], bytes_to_skip);`
			`UInt32 source_size_no_header = source_size - bytes_to_skip - 2;`
			`switch (bytes_size)`
			`{`
			`case 1:`
			`decompressDataForType<UInt8>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);`
			`break;`
			`case 2:`
			`decompressDataForType<UInt16>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);`
			`break;`
			`case 4:`
			`decompressDataForType<UInt32>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);`
			`break;`
			`case 8:`
			`decompressDataForType<UInt64>(&source[2 + bytes_to_skip], source_size_no_header, &dest[bytes_to_skip]);`
			`break;`
			`}`
			`}`

			`void CompressionCodecGorilla::useInfoAboutType(DataTypePtr data_type)`
			`{`
			`data_bytes_size = getDataBytesSize(data_type);`
			`}`

			`void registerCodecGorilla(CompressionCodecFactory & factory)`
			`{`
			`UInt8 method_code = UInt8(CompressionMethodByte::Gorilla);`
			`factory.registerCompressionCodecWithType("Gorilla", method_code, [&](const ASTPtr &, DataTypePtr column_type) -> CompressionCodecPtr`
			`{`
			`UInt8 delta_bytes_size = getDataBytesSize(column_type);`
			`return std::make_shared<CompressionCodecGorilla>(delta_bytes_size);`
			`});`
			`}`
			`}`