Fixed DoubleDelta codec edge case

Caused by mistreating a negative double-delta value as a HUGE unsigned value, which crippled the compression ratio.
This commit is contained in:
Vasily Nemkov 2019-07-01 17:11:24 +03:00
parent 14d3efc058
commit c5b2ba2a25
3 changed files with 29 additions and 10 deletions

View File

@ -83,6 +83,7 @@ WriteSpec getWriteSpec(const T & value)
template <typename T, typename DeltaType>
UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
{
static_assert(std::is_unsigned_v<T> && std::is_signed_v<DeltaType>, "T must be unsigned, while DeltaType must be signed integer type.");
using UnsignedDeltaType = typename std::make_unsigned<DeltaType>::type;
if (source_size % sizeof(T) != 0)
@ -109,7 +110,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
{
const T curr_value = unalignedLoad<T>(source);
prev_delta = static_cast<DeltaType>(curr_value - prev_value);
unalignedStore<T>(dest, prev_delta);
unalignedStore<DeltaType>(dest, prev_delta);
source += sizeof(curr_value);
dest += sizeof(prev_delta);
@ -123,8 +124,8 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
{
const T curr_value = unalignedLoad<T>(source);
const auto delta = curr_value - prev_value;
const DeltaType double_delta = static_cast<DeltaType>(delta - static_cast<T>(prev_delta));
const DeltaType delta = static_cast<DeltaType>(curr_value - prev_value);
const DeltaType double_delta = delta - prev_delta;
prev_delta = delta;
prev_value = curr_value;
@ -153,6 +154,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
template <typename T, typename DeltaType>
void decompressDataForType(const char * source, UInt32 source_size, char * dest)
{
static_assert(std::is_unsigned_v<T> && std::is_signed_v<DeltaType>, "T must be unsigned, while DeltaType must be signed integer type.");
const char * source_end = source + source_size;
const UInt32 items_count = unalignedLoad<UInt32>(source);
@ -173,7 +175,7 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest)
if (source < source_end)
{
prev_delta = unalignedLoad<DeltaType>(source);
prev_value = static_cast<T>(prev_value + prev_delta);
prev_value = prev_value + static_cast<T>(prev_delta);
unalignedStore<T>(dest, prev_value);
source += sizeof(prev_delta);
@ -208,11 +210,11 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest)
}
// else if first bit is zero, no need to read more data.
const T curr_value = static_cast<T>(prev_value + prev_delta + double_delta);
const T curr_value = prev_value + static_cast<T>(prev_delta + double_delta);
unalignedStore<T>(dest, curr_value);
dest += sizeof(curr_value);
prev_delta = curr_value - prev_value;
prev_delta = static_cast<DeltaType>(curr_value - prev_value);
prev_value = curr_value;
}
}

View File

@ -8,3 +8,4 @@ I16
I8
DT
D
Compression:

View File

@ -29,23 +29,23 @@ CREATE TABLE codecTest (
-- checking for overflow
INSERT INTO codecTest (key, ref_valueU64, valueU64, ref_valueI64, valueI64)
VALUES (101, 18446744073709551615, 18446744073709551615, 9223372036854775807, 9223372036854775807), (202, 0, 0, -9223372036854775808, -9223372036854775808), (203, 18446744073709551615, 18446744073709551615, 9223372036854775807, 9223372036854775807);
VALUES (1, 18446744073709551615, 18446744073709551615, 9223372036854775807, 9223372036854775807), (2, 0, 0, -9223372036854775808, -9223372036854775808), (3, 18446744073709551615, 18446744073709551615, 9223372036854775807, 9223372036854775807);
-- n^3 covers all double delta storage cases, from small difference between neighbouring values (stride) to big.
INSERT INTO codecTest (key, ref_valueU64, valueU64, ref_valueU32, valueU32, ref_valueU16, valueU16, ref_valueU8, valueU8, ref_valueI64, valueI64, ref_valueI32, valueI32, ref_valueI16, valueI16, ref_valueI8, valueI8, ref_valueDT, valueDT, ref_valueD, valueD)
SELECT number as n, n * n * n as v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, toDateTime(v), toDateTime(v), toDate(v), toDate(v)
FROM system.numbers LIMIT 101, 100;
FROM system.numbers LIMIT 101, 1000;
-- best case - constant stride
INSERT INTO codecTest (key, ref_valueU64, valueU64, ref_valueU32, valueU32, ref_valueU16, valueU16, ref_valueU8, valueU8, ref_valueI64, valueI64, ref_valueI32, valueI32, ref_valueI16, valueI16, ref_valueI8, valueI8, ref_valueDT, valueDT, ref_valueD, valueD)
SELECT number as n, n as v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, toDateTime(v), toDateTime(v), toDate(v), toDate(v)
FROM system.numbers LIMIT 201, 100;
FROM system.numbers LIMIT 2001, 1000;
-- worst case - random stride
INSERT INTO codecTest (key, ref_valueU64, valueU64, ref_valueU32, valueU32, ref_valueU16, valueU16, ref_valueU8, valueU8, ref_valueI64, valueI64, ref_valueI32, valueI32, ref_valueI16, valueI16, ref_valueI8, valueI8, ref_valueDT, valueDT, ref_valueD, valueD)
SELECT number as n, n + (rand64() - 9223372036854775807)/1000 as v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, toDateTime(v), toDateTime(v), toDate(v), toDate(v)
FROM system.numbers LIMIT 301, 100;
FROM system.numbers LIMIT 3001, 1000;
SELECT 'U64';
@ -147,5 +147,21 @@ WHERE
dD != 0
LIMIT 10;
SELECT 'Compression:';
SELECT
table, name, type,
compression_codec,
data_uncompressed_bytes u,
data_compressed_bytes c,
round(u/c,3) ratio
FROM system.columns
WHERE
table == 'codecTest'
AND
compression_codec != ''
AND
ratio <= 1
ORDER BY
table, name, type;
DROP TABLE IF EXISTS codecTest;