#pragma once #include namespace DB { /** Gorilla column codec implementation. * * Based on Gorilla paper: http://www.vldb.org/pvldb/vol8/p1816-teller.pdf * * This codec is best used against monotonic floating sequences, like CPU usage percentage * or any other gauge. * * Given input sequence a: [a0, a1, ... an] * * First, write number of items (sizeof(int32)*8 bits): n * Then write first item as is (sizeof(a[0])*8 bits): a[0] * Loop over remaining items and calculate xor_diff: * xor_diff = a[i] ^ a[i - 1] (e.g. 00000011'10110100) * Write it in compact binary form with `BitWriter` * if xor_diff == 0: * write 1 bit: 0 * else: * calculate leading zero bits (lzb) * and trailing zero bits (tzb) of xor_diff, * compare to lzb and tzb of previous xor_diff * (X = sizeof(a[i]) * 8, e.g. X = 16, lzb = 6, tzb = 2) * if lzb >= prev_lzb && tzb >= prev_tzb: * (e.g. prev_lzb=4, prev_tzb=1) * write 2 bit prefix: 0b10 * write xor_diff >> prev_tzb (X - prev_lzb - prev_tzb bits):0b00111011010 * (where X = sizeof(a[i]) * 8, e.g. 16) * else: * write 2 bit prefix: 0b11 * write 5 bits of lzb: 0b00110 * write 6 bits of (X - lzb - tzb)=(16-6-2)=8: 0b001000 * write (X - lzb - tzb) non-zero bits of xor_diff: 0b11101101 * prev_lzb = lzb * prev_tzb = tzb * * @example sequence of Float32 values [0.1, 0.1, 0.11, 0.2, 0.1] is encoded as: * * .- 4-byte little endian sequence length: 5 : 0x00000005 * | .- 4 byte (sizeof(Float32) a[0] as UInt32 : -10 : 0xcdcccc3d * | | .- 4 encoded xor diffs (see below) * v_______________ v______________ v__________________________________________________ * \x05\x00\x00\x00\xcd\xcc\xcc\x3d\x6a\x5a\xd8\xb6\x3c\xcd\x75\xb1\x6c\x77\x00\x00\x00 * * 4 binary encoded xor diffs (\x6a\x5a\xd8\xb6\x3c\xcd\x75\xb1\x6c\x77\x00\x00\x00): * * ........................................... * a[i-1] = 00111101110011001100110011001101 * a[i] = 00111101110011001100110011001101 * xor_diff = 00000000000000000000000000000000 * .- 1-bit prefix : 0b0 * | * | ........................................... * | a[i-1] = 00111101110011001100110011001101 * ! a[i] = 00111101111000010100011110101110 * | xor_diff = 00000000001011011000101101100011 * | lzb = 10 * | tzb = 0 * |.- 2-bit prefix : 0b11 * || .- lzb (10) : 0b1010 * || | .- data length (32-10-0): 22 : 0b010110 * || | | .- data : 0b1011011000101101100011 * || | | | * || | | | ........................................... * || | | | a[i-1] = 00111101111000010100011110101110 * || | | | a[i] = 00111110010011001100110011001101 * || | | | xor_diff = 00000011101011011000101101100011 * || | | | .- 2-bit prefix : 0b11 * || | | | | .- lzb = 6 : 0b00110 * || | | | | | .- data length = (32 - 6) = 26 : 0b011010 * || | | | | | | .- data : 0b11101011011000101101100011 * || | | | | | | | * || | | | | | | | ........................................... * || | | | | | | | a[i-1] = 00111110010011001100110011001101 * || | | | | | | | a[i] = 00111101110011001100110011001101 * || | | | | | | | xor_diff = 00000011100000000000000000000000 * || | | | | | | | .- 2-bit prefix : 0b10 * || | | | | | | | | .- data : 0b11100000000000000000000000 * VV_v____ v_____v________________________V_v_____v______v____________________________V_v_____________________________ * 01101010 01011010 11011000 10110110 00111100 11001101 01110101 10110001 01101100 01110111 00000000 00000000 00000000 * * Please also see unit tests for: * * Examples on what output `BitWriter` produces on predefined input. * * Compatibility tests solidifying encoded binary output on set of predefined sequences. */ class CompressionCodecGorilla : public ICompressionCodec { public: CompressionCodecGorilla(UInt8 data_bytes_size_); uint8_t getMethodByte() const override; String getCodecDesc() const override; void useInfoAboutType(DataTypePtr data_type) override; protected: UInt32 doCompressData(const char * source, UInt32 source_size, char * dest) const override; void doDecompressData(const char * source, UInt32 source_size, char * dest, UInt32 uncompressed_size) const override; UInt32 getMaxCompressedDataSize(UInt32 uncompressed_size) const override; private: UInt8 data_bytes_size; }; class CompressionCodecFactory; void registerCodecGorilla(CompressionCodecFactory & factory); }