Detailed docs with examples on DoubleDelta and Gorilla codecs

2024-11-26 17:41:59 +00:00 · 2020-01-03 08:18:38 +03:00 · 2020-01-03 08:18:38 +03:00 · 08059ac640
commit 08059ac640
parent 7183c3a7b0
3 changed files with 172 additions and 3 deletions
--- a/dbms/src/Compression/CompressionCodecDoubleDelta.h
+++ b/dbms/src/Compression/CompressionCodecDoubleDelta.h
@ -5,6 +5,92 @@
 namespace DB
 {

+/** DoubleDelta column codec implementation.
+ *
+ * Based on Gorilla paper: http://www.vldb.org/pvldb/vol8/p1816-teller.pdf, which was extended
+ * to support 64bit types. The drawback is 1 extra bit for 32-bit wide deltas: 5-bit prefix
+ * instead of 4-bit prefix.
+ *
+ * This codec is best used against monotonic integer sequences with constant (or almost constant)
+ * stride, like event timestamp for some monitoring application.
+ *
+ * Given input sequence a: [a0, a1, ... an]:
+ *
+ * First, write number of items (sizeof(int32)*8 bits):                n
+ * Then write first item as is (sizeof(a[0])*8 bits):                  a[0]
+ * Second item is written as delta (sizeof(a[0])*8 bits):              a[1] - a[0]
+ * Loop over remaining items and calculate double delta:
+ *   double_delta = a[i] - 2 * a[i - 1] + a[i - 2]
+ *   Write it in compact binary form with `BitWriter`
+ *   if double_delta == 0:
+ *      write 1bit:                                                    0
+ *   else if -63 < double_delta < 64:
+ *      write 2 bit prefix:                                            10
+ *      write sign bit (1 if negative):                                x
+ *      write 7-1 bits of abs(double_delta) - 1:                       xxxxxx
+ *   else if -255 < double_delta < 256:
+ *      write 3 bit prefix:                                            110
+ *      write sign bit (1 if negative):                                x
+ *      write 9-1 bits of abs(double_delta) - 1:                       xxxxxxxx
+ *   else if -2047 < double_delta < 2048:
+ *      write 4 bit prefix:                                            1110
+ *      write sign bit (1 if negative):                                x
+ *      write 12-1 bits of abs(double_delta) - 1:                      xxxxxxxxxxx
+ *   else if double_delta fits into 32-bit int:
+ *      write 5 bit prefix:                                            11110
+ *      write sign bit (1 if negative):                                x
+ *      write 32-1 bits of abs(double_delta) - 1:                      xxxxxxxxxxx...
+ *   else
+ *      write 5 bit prefix:                                            11111
+ *      write sign bit (1 if negative):                                x
+ *      write 64-1 bits of abs(double_delta) - 1:                      xxxxxxxxxxx...
+ *
+ * @example sequence of UInt8 values [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] is encoded as (codec header is omitted):
+ *
+ * .- 4-byte little-endian sequence length (10 == 0xa)
+ * |               .- 1 byte (sizeof(UInt8)) a[0]                                           : 0x01
+ * |               |   .- 1 byte of delta: a[1] - a[0] = 2 - 1 = 1                          : 0x01
+ * |               |   |   .- 8 zero bits since double delta for remaining 8 elements was 0 : 0x00
+ * v_______________v___v___v___
+ * \x0a\x00\x00\x00\x01\x01\x00
+ *
+ * @example sequence of Int16 values [-10, 10, -20, 20, -40, 40] is encoded as:
+ *
+ * .- 4-byte little endian sequence length = 6                                 : 0x00000006
+ * |                .- 2 bytes (sizeof(Int16)) a[0] as UInt16 = -10            : 0xfff6
+ * |                |       .- 2 bytes of delta: a[1] - a[0] = 10 - (-10) = 20 : 0x0014
+ * |                |       |       .- 4 encoded double deltas (see below)
+ * v_______________ v______ v______ v______________________
+ * \x06\x00\x00\x00\xf6\xff\x14\x00\xb8\xe2\x2e\xb1\xe4\x58
+ *
+ * 4 binary encoded double deltas (\xb8\xe2\x2e\xb1\xe4\x58):
+ * double_delta (DD) = -20 - 2 * 10 + (-10) = -50
+ * .- 2-bit prefix                                                         : 0b10
+ * | .- sign-bit                                                           : 0b1
+ * | |.- abs(DD) - 1 = 49                                                  : 0b110001
+ * | ||
+ * | ||      DD = 20 - 2 * (-20) + 10 = 70
+ * | ||      .- 3-bit prefix                                               : 0b110
+ * | ||      |  .- sign bit                                                : 0b0
+ * | ||      |  |.- abs(DD) - 1 = 69                                       : 0b1000101
+ * | ||      |  ||
+ * | ||      |  ||        DD = -40 - 2 * 20 + (-20) = -100
+ * | ||      |  ||        .- 3-bit prefix                                  : 0b110
+ * | ||      |  ||        |    .- sign-bit                                 : 0b1
+ * | ||      |  ||        |    |.- abs(DD) - 1 = 99                        : 0b1100011
+ * | ||      |  ||        |    ||
+ * | ||      |  ||        |    ||       DD = 40 - 2 * (-40) + 20 = 140
+ * | ||      |  ||        |    ||       .- 3-bit prefix                    : 0b110
+ * | ||      |  ||        |    ||       |  .- sign bit                     : 0b0
+ * | ||      |  ||        |    ||       |  |.- abs(DD) - 1 = 139           : 0b10001011
+ * | ||      |  ||        |    ||       |  ||
+ * V_vv______V__vv________V____vv_______V__vv________,- padding bits
+ * 10111000 11100010 00101110 10110001 11100100 01011000
+ *
+ * Please also see unit tests for:
+ *   * Examples on what output `BitWriter` produces on predefined input.
+ *   * Compatibility tests solidifying encoded binary output on set of predefined sequences.
+ */
 class CompressionCodecDoubleDelta : public ICompressionCodec
 {
 public:
--- a/dbms/src/Compression/CompressionCodecGorilla.h
+++ b/dbms/src/Compression/CompressionCodecGorilla.h
@ -5,6 +5,89 @@
 namespace DB
 {

+/** Gorilla column codec implementation.
+ *
+ * Based on Gorilla paper: http://www.vldb.org/pvldb/vol8/p1816-teller.pdf
+ *
+ * This codec is best used against monotonic floating-point sequences, like CPU usage percentage
+ * or any other gauge.
+ *
+ * Given input sequence a: [a0, a1, ... an]
+ *
+ * First, write number of items (sizeof(int32)*8 bits):                n
+ * Then write first item as is (sizeof(a[0])*8 bits):                  a[0]
+ * Loop over remaining items and calculate xor_diff:
+ *   xor_diff = a[i] ^ a[i - 1] (e.g. 00000011'10110100)
+ *   Write it in compact binary form with `BitWriter`
+ *   if xor_diff == 0:
+ *       write 1 bit:                                                  0
+ *   else:
+ *       calculate leading zero bits (lzb)
+ *       and trailing zero bits (tzb) of xor_diff,
+ *       compare to lzb and tzb of previous xor_diff
+ *       (X = sizeof(a[i]) * 8, e.g. X = 16, lzb = 6, tzb = 2)
+ *       if lzb >= prev_lzb && tzb >= prev_tzb:
+ *           (e.g. prev_lzb=4, prev_tzb=1)
+ *           write 2 bit prefix:                                       0b10
+ *           write xor_diff >> prev_tzb (X - prev_lzb - prev_tzb bits):0b00111011010
+ *           (where X = sizeof(a[i]) * 8, e.g. 16)
+ *       else:
+ *           write 2 bit prefix:                                       0b11
+ *           write 5 bits of lzb:                                      0b00110
+ *           write 6 bits of (X - lzb - tzb)=(16-6-2)=8:               0b001000
+ *           write (X - lzb - tzb) non-zero bits of xor_diff:          0b11101101
+ *           prev_lzb = lzb
+ *           prev_tzb = tzb
+ *
+ * @example sequence of Float32 values [0.1, 0.1, 0.11, 0.2, 0.1] is encoded as:
+ *
+ * .- 4-byte little endian sequence length: 5                                 : 0x00000005
+ * |                .- 4 bytes (sizeof(Float32)) a[0] as UInt32 = 0.1         : 0x3dcccccd
+ * |                |               .- 4 encoded xor diffs (see below)
+ * v_______________ v______________ v__________________________________________________
+ * \x05\x00\x00\x00\xcd\xcc\xcc\x3d\x6a\x5a\xd8\xb6\x3c\xcd\x75\xb1\x6c\x77\x00\x00\x00
+ *
+ * 4 binary encoded xor diffs (\x6a\x5a\xd8\xb6\x3c\xcd\x75\xb1\x6c\x77\x00\x00\x00):
+ *
+ * ...........................................
+ * a[i-1]   = 00111101110011001100110011001101
+ * a[i]     = 00111101110011001100110011001101
+ * xor_diff = 00000000000000000000000000000000
+ * .- 1-bit prefix                                                           : 0b0
+ * |
+ * | ...........................................
+ * | a[i-1]   = 00111101110011001100110011001101
+ * | a[i]     = 00111101111000010100011110101110
+ * | xor_diff = 00000000001011011000101101100011
+ * | lzb = 10
+ * | tzb = 0
+ * |.- 2-bit prefix                                                          : 0b11
+ * || .- lzb (10)                                                            : 0b01010
+ * || |     .- data length (32-10-0): 22                                     : 0b010110
+ * || |     |     .- data                                                    : 0b1011011000101101100011
+ * || |     |     |
+ * || |     |     |                        ...........................................
+ * || |     |     |                        a[i-1]   = 00111101111000010100011110101110
+ * || |     |     |                        a[i]     = 00111110010011001100110011001101
+ * || |     |     |                        xor_diff = 00000011101011011000101101100011
+ * || |     |     |                        .- 2-bit prefix                            : 0b11
+ * || |     |     |                        | .- lzb = 6                               : 0b00110
+ * || |     |     |                        | |     .- data length = (32 - 6) = 26     : 0b011010
+ * || |     |     |                        | |     |      .- data                     : 0b11101011011000101101100011
+ * || |     |     |                        | |     |      |
+ * || |     |     |                        | |     |      |                            ...........................................
+ * || |     |     |                        | |     |      |                            a[i-1]   = 00111110010011001100110011001101
+ * || |     |     |                        | |     |      |                            a[i]     = 00111101110011001100110011001101
+ * || |     |     |                        | |     |      |                            xor_diff = 00000011100000000000000000000000
+ * || |     |     |                        | |     |      |                            .- 2-bit prefix                            : 0b10
+ * || |     |     |                        | |     |      |                            | .- data                                  : 0b11100000000000000000000000
+ * VV_v____ v_____v________________________V_v_____v______v____________________________V_v_____________________________
+ * 01101010 01011010 11011000 10110110 00111100 11001101 01110101 10110001 01101100 01110111 00000000 00000000 00000000
+ *
+ * Please also see unit tests for:
+ *   * Examples on what output `BitWriter` produces on predefined input.
+ *   * Compatibility tests solidifying encoded binary output on set of predefined sequences.
+ */
 class CompressionCodecGorilla : public ICompressionCodec
 {
 public:
--- a/dbms/src/Compression/tests/gtest_compressionCodec.cpp
+++ b/dbms/src/Compression/tests/gtest_compressionCodec.cpp
@ -1170,7 +1170,7 @@ auto DDCompatibilityTestSequence()
    auto ret = generateSeq<ValueType>(G(SameValueGenerator(42)), 0, 3);

    // These values are from DoubleDelta paper (and implementation) and represent points at which DD encoded length is changed.
-    // DD value less that this point are encoded shorter (bigger -> longer) binary form.
+    // DD values less than this point are encoded in a shorter binary form (bigger ones - in a longer one).
    const Int64 dd_corner_points[] = {-63, 64, -255, 256, -2047, 2048, std::numeric_limits<Int32>::min(), std::numeric_limits<Int32>::max()};
    for (const auto & p : dd_corner_points)
    {
@ -1179,8 +1179,8 @@ auto DDCompatibilityTestSequence()
            break;
        }

-        // - 4 is to allow DD value to settle before transitioning through important point, since DD depends on 2 previous values of data.
-        // + 2 is arbitrary.
+        // - 4 is to allow DD value to settle before transitioning through important point,
+        // since DD depends on 2 previous values of data, + 2 is arbitrary.
        ret.append(generateSeq<ValueType>(G(ddGenerator), p - 4, p + 2));
    }