mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-12 10:34:21 +00:00
Detailed docs with examples on DoubleDelta and Gorilla codecs
This commit is contained in:
parent
7183c3a7b0
commit
08059ac640
@ -5,6 +5,92 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/** DoubleDelta column codec implementation.
|
||||
*
|
||||
* Based on Gorilla paper: http://www.vldb.org/pvldb/vol8/p1816-teller.pdf, which was extended
|
||||
* to support 64bit types. The drawback is 1 extra bit for 32-byte wide deltas: 5-bit prefix
|
||||
* instead of 4-bit prefix.
|
||||
*
|
||||
* This codec is best used against monotonic integer sequences with constant (or almost contant)
|
||||
* stride, like event timestamp for some monitoring application.
|
||||
*
|
||||
* Given input sequence a: [a0, a1, ... an]:
|
||||
*
|
||||
* First, write number of items (sizeof(int32)*8 bits): n
|
||||
* Then write first item as is (sizeof(a[0])*8 bits): a[0]
|
||||
* Second item is written as delta (sizeof(a[0])*8 bits): a[1] - a[0]
|
||||
* Loop over remaining items and calculate double delta:
|
||||
* double_delta = a[i] - 2 * a[i - 1] + a[i - 2]
|
||||
* Write it in compact binary form with `BitWriter`
|
||||
* if double_delta == 0:
|
||||
* write 1bit: 0
|
||||
* else if -63 < double_delta < 64:
|
||||
* write 2 bit prefix: 10
|
||||
* write sign bit (1 if signed): x
|
||||
* write 7-1 bits of abs(double_delta - 1): xxxxxx
|
||||
* else if -255 < double_delta < 256:
|
||||
* write 3 bit prefix: 110
|
||||
* write sign bit (1 if signed): x
|
||||
* write 9-1 bits of abs(double_delta - 1): xxxxxxxx
|
||||
* else if -2047 < double_delta < 2048:
|
||||
* write 4 bit prefix: 1110
|
||||
* write sign bit (1 if signed): x
|
||||
* write 12-1 bits of abs(double_delta - 1): xxxxxxxxxxx
|
||||
* else if double_delta fits into 32-bit int:
|
||||
* write 5 bit prefix: 11110
|
||||
* write sign bit (1 if signed): x
|
||||
* write 32-1 bits of abs(double_delta - 1): xxxxxxxxxxx...
|
||||
* else
|
||||
* write 5 bit prefix: 11111
|
||||
* write sign bit (1 if signed): x
|
||||
* write 64-1 bits of abs(double_delta - 1): xxxxxxxxxxx...
|
||||
*
|
||||
* @example sequence of UInt8 values [1, 2, 3, 4, 5, 6, 7, 8, 9 10] is encoded as (codec header is ommited):
|
||||
*
|
||||
* .- 4-byte little-endian sequence length (10 == 0xa)
|
||||
* | .- 1 byte (sizeof(UInt8) a[0] : 0x01
|
||||
* | | .- 1 byte of delta: a[1] - a[0] = 2 - 1 = 1 : 0x01
|
||||
* | | | .- 8 zero bits since double delta for remaining 8 elements was 0 : 0x00
|
||||
* v_______________v___v___v___
|
||||
* \x0a\x00\x00\x00\x01\x01\x00
|
||||
*
|
||||
* @example sequence of Int16 values [-10, 10, -20, 20, -40, 40] is encoded as:
|
||||
*
|
||||
* .- 4-byte little endian sequence length = 6 : 0x00000006
|
||||
* | .- 2 bytes (sizeof(Int16) a[0] as UInt16 = -10 : 0xfff6
|
||||
* | | .- 2 bytes of delta: a[1] - a[0] = 10 - (-10) = 20 : 0x0014
|
||||
* | | | .- 4 encoded double deltas (see below)
|
||||
* v_______________ v______ v______ v______________________
|
||||
* \x06\x00\x00\x00\xf6\xff\x14\x00\xb8\xe2\x2e\xb1\xe4\x58
|
||||
*
|
||||
* 4 binary encoded double deltas (\xb8\xe2\x2e\xb1\xe4\x58):
|
||||
* double_delta (DD) = -20 - 2 * 10 + (-10) = -50
|
||||
* .- 2-bit prefix : 0b10
|
||||
* | .- sign-bit : 0b1
|
||||
* | |.- abs(DD - 1) = 49 : 0b110001
|
||||
* | ||
|
||||
* | || DD = 20 - 2 * (-20) + 10 = 70
|
||||
* | || .- 3-bit prefix : 0b110
|
||||
* | || | .- sign bit : 0b0
|
||||
* | || | |.- abs(DD - 1) = 69 : 0b1000101
|
||||
* | || | ||
|
||||
* | || | || DD = -40 - 2 * 20 + (-20) = -100
|
||||
* | || | || .- 3-bit prefix : 0b110
|
||||
* | || | || | .- sign-bit : 0b0
|
||||
* | || | || | |.- abs(DD - 1) = 99 : 0b1100011
|
||||
* | || | || | ||
|
||||
* | || | || | || DD = 40 - 2 * (-40) + 20 = 140
|
||||
* | || | || | || .- 3-bit prefix : 0b110
|
||||
* | || | || | || | .- sign bit : 0b0
|
||||
* | || | || | || | |.- abs(DD - 1) = 139 : 0b10001011
|
||||
* | || | || | || | ||
|
||||
* V_vv______V__vv________V____vv_______V__vv________,- padding bits
|
||||
* 10111000 11100010 00101110 10110001 11100100 01011000
|
||||
*
|
||||
* Please also see unit tests for:
|
||||
* * Examples on what output `BitWriter` produces on predefined input.
|
||||
* * Compatibility tests solidifying encoded binary output on set of predefined sequences.
|
||||
*/
|
||||
class CompressionCodecDoubleDelta : public ICompressionCodec
|
||||
{
|
||||
public:
|
||||
|
@ -5,6 +5,89 @@
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/** Gorilla column codec implementation.
|
||||
*
|
||||
* Based on Gorilla paper: http://www.vldb.org/pvldb/vol8/p1816-teller.pdf
|
||||
*
|
||||
* This codec is best used against monotonic floating sequences, like CPU usage percentage
|
||||
* or any other gauge.
|
||||
*
|
||||
* Given input sequence a: [a0, a1, ... an]
|
||||
*
|
||||
* First, write number of items (sizeof(int32)*8 bits): n
|
||||
* Then write first item as is (sizeof(a[0])*8 bits): a[0]
|
||||
* Loop over remaining items and calculate xor_diff:
|
||||
* xor_diff = a[i] ^ a[i - 1] (e.g. 00000011'10110100)
|
||||
* Write it in compact binary form with `BitWriter`
|
||||
* if xor_diff == 0:
|
||||
* write 1 bit: 0
|
||||
* else:
|
||||
* calculate leading zero bits (lzb)
|
||||
* and trailing zero bits (tzb) of xor_diff,
|
||||
* compare to lzb and tzb of previous xor_diff
|
||||
* (X = sizeof(a[i]) * 8, e.g. X = 16, lzb = 6, tzb = 2)
|
||||
* if lzb >= prev_lzb && tzb >= prev_tzb:
|
||||
* (e.g. prev_lzb=4, prev_tzb=1)
|
||||
* write 2 bit prefix: 0b10
|
||||
* write xor_diff >> prev_tzb (X - prev_lzb - prev_tzb bits):0b00111011010
|
||||
* (where X = sizeof(a[i]) * 8, e.g. 16)
|
||||
* else:
|
||||
* write 2 bit prefix: 0b11
|
||||
* write 5 bits of lzb: 0b00110
|
||||
* write 6 bits of (X - lzb - tzb)=(16-6-2)=8: 0b001000
|
||||
* write (X - lzb - tzb) non-zero bits of xor_diff: 0b11101101
|
||||
* prev_lzb = lzb
|
||||
* prev_tzb = tzb
|
||||
*
|
||||
* @example sequence of Float32 values [0.1, 0.1, 0.11, 0.2, 0.1] is encoded as:
|
||||
*
|
||||
* .- 4-byte little endian sequence length: 5 : 0x00000005
|
||||
* | .- 4 byte (sizeof(Float32) a[0] as UInt32 : -10 : 0xcdcccc3d
|
||||
* | | .- 4 encoded xor diffs (see below)
|
||||
* v_______________ v______________ v__________________________________________________
|
||||
* \x05\x00\x00\x00\xcd\xcc\xcc\x3d\x6a\x5a\xd8\xb6\x3c\xcd\x75\xb1\x6c\x77\x00\x00\x00
|
||||
*
|
||||
* 4 binary encoded xor diffs (\x6a\x5a\xd8\xb6\x3c\xcd\x75\xb1\x6c\x77\x00\x00\x00):
|
||||
*
|
||||
* ...........................................
|
||||
* a[i-1] = 00111101110011001100110011001101
|
||||
* a[i] = 00111101110011001100110011001101
|
||||
* xor_diff = 00000000000000000000000000000000
|
||||
* .- 1-bit prefix : 0b0
|
||||
* |
|
||||
* | ...........................................
|
||||
* | a[i-1] = 00111101110011001100110011001101
|
||||
* ! a[i] = 00111101111000010100011110101110
|
||||
* | xor_diff = 00000000001011011000101101100011
|
||||
* | lzb = 10
|
||||
* | tzb = 0
|
||||
* |.- 2-bit prefix : 0b11
|
||||
* || .- lzb (10) : 0b1010
|
||||
* || | .- data length (32-10-0): 22 : 0b010110
|
||||
* || | | .- data : 0b1011011000101101100011
|
||||
* || | | |
|
||||
* || | | | ...........................................
|
||||
* || | | | a[i-1] = 00111101111000010100011110101110
|
||||
* || | | | a[i] = 00111110010011001100110011001101
|
||||
* || | | | xor_diff = 00000011101011011000101101100011
|
||||
* || | | | .- 2-bit prefix : 0b11
|
||||
* || | | | | .- lzb = 6 : 0b00110
|
||||
* || | | | | | .- data length = (32 - 6) = 26 : 0b011010
|
||||
* || | | | | | | .- data : 0b11101011011000101101100011
|
||||
* || | | | | | | |
|
||||
* || | | | | | | | ...........................................
|
||||
* || | | | | | | | a[i-1] = 00111110010011001100110011001101
|
||||
* || | | | | | | | a[i] = 00111101110011001100110011001101
|
||||
* || | | | | | | | xor_diff = 00000011100000000000000000000000
|
||||
* || | | | | | | | .- 2-bit prefix : 0b10
|
||||
* || | | | | | | | | .- data : 0b11100000000000000000000000
|
||||
* VV_v____ v_____v________________________V_v_____v______v____________________________V_v_____________________________
|
||||
* 01101010 01011010 11011000 10110110 00111100 11001101 01110101 10110001 01101100 01110111 00000000 00000000 00000000
|
||||
*
|
||||
* Please also see unit tests for:
|
||||
* * Examples on what output `BitWriter` produces on predefined input.
|
||||
* * Compatibility tests solidifying encoded binary output on set of predefined sequences.
|
||||
*/
|
||||
class CompressionCodecGorilla : public ICompressionCodec
|
||||
{
|
||||
public:
|
||||
|
@ -1170,7 +1170,7 @@ auto DDCompatibilityTestSequence()
|
||||
auto ret = generateSeq<ValueType>(G(SameValueGenerator(42)), 0, 3);
|
||||
|
||||
// These values are from DoubleDelta paper (and implementation) and represent points at which DD encoded length is changed.
|
||||
// DD value less that this point are encoded shorter (bigger -> longer) binary form.
|
||||
// DD value less that this point is encoded in shorter binary form (bigger - longer binary).
|
||||
const Int64 dd_corner_points[] = {-63, 64, -255, 256, -2047, 2048, std::numeric_limits<Int32>::min(), std::numeric_limits<Int32>::max()};
|
||||
for (const auto & p : dd_corner_points)
|
||||
{
|
||||
@ -1179,8 +1179,8 @@ auto DDCompatibilityTestSequence()
|
||||
break;
|
||||
}
|
||||
|
||||
// - 4 is to allow DD value to settle before transitioning through important point, since DD depends on 2 previous values of data.
|
||||
// + 2 is arbitrary.
|
||||
// - 4 is to allow DD value to settle before transitioning through important point,
|
||||
// since DD depends on 2 previous values of data, + 2 is arbitrary.
|
||||
ret.append(generateSeq<ValueType>(G(ddGenerator), p - 4, p + 2));
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user