mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-24 08:32:02 +00:00
Fixed DoubleDelta codec edge case
Caused by mistreating a negative double-delta value as a HUGE unsigned value, crippling the compression ratio.
This commit is contained in:
parent
14d3efc058
commit
c5b2ba2a25
@ -83,6 +83,7 @@ WriteSpec getWriteSpec(const T & value)
|
||||
template <typename T, typename DeltaType>
|
||||
UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
{
|
||||
static_assert(std::is_unsigned_v<T> && std::is_signed_v<DeltaType>, "T must be unsigned, while DeltaType must be signed integer type.");
|
||||
using UnsignedDeltaType = typename std::make_unsigned<DeltaType>::type;
|
||||
|
||||
if (source_size % sizeof(T) != 0)
|
||||
@ -109,7 +110,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
{
|
||||
const T curr_value = unalignedLoad<T>(source);
|
||||
prev_delta = static_cast<DeltaType>(curr_value - prev_value);
|
||||
unalignedStore<T>(dest, prev_delta);
|
||||
unalignedStore<DeltaType>(dest, prev_delta);
|
||||
|
||||
source += sizeof(curr_value);
|
||||
dest += sizeof(prev_delta);
|
||||
@ -123,8 +124,8 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
{
|
||||
const T curr_value = unalignedLoad<T>(source);
|
||||
|
||||
const auto delta = curr_value - prev_value;
|
||||
const DeltaType double_delta = static_cast<DeltaType>(delta - static_cast<T>(prev_delta));
|
||||
const DeltaType delta = static_cast<DeltaType>(curr_value - prev_value);
|
||||
const DeltaType double_delta = delta - prev_delta;
|
||||
|
||||
prev_delta = delta;
|
||||
prev_value = curr_value;
|
||||
@ -153,6 +154,7 @@ UInt32 compressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
template <typename T, typename DeltaType>
|
||||
void decompressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
{
|
||||
static_assert(std::is_unsigned_v<T> && std::is_signed_v<DeltaType>, "T must be unsigned, while DeltaType must be signed integer type.");
|
||||
const char * source_end = source + source_size;
|
||||
|
||||
const UInt32 items_count = unalignedLoad<UInt32>(source);
|
||||
@ -173,7 +175,7 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
if (source < source_end)
|
||||
{
|
||||
prev_delta = unalignedLoad<DeltaType>(source);
|
||||
prev_value = static_cast<T>(prev_value + prev_delta);
|
||||
prev_value = prev_value + static_cast<T>(prev_delta);
|
||||
unalignedStore<T>(dest, prev_value);
|
||||
|
||||
source += sizeof(prev_delta);
|
||||
@ -208,11 +210,11 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest)
|
||||
}
|
||||
// else if first bit is zero, no need to read more data.
|
||||
|
||||
const T curr_value = static_cast<T>(prev_value + prev_delta + double_delta);
|
||||
const T curr_value = prev_value + static_cast<T>(prev_delta + double_delta);
|
||||
unalignedStore<T>(dest, curr_value);
|
||||
dest += sizeof(curr_value);
|
||||
|
||||
prev_delta = curr_value - prev_value;
|
||||
prev_delta = static_cast<DeltaType>(curr_value - prev_value);
|
||||
prev_value = curr_value;
|
||||
}
|
||||
}
|
||||
|
@ -8,3 +8,4 @@ I16
|
||||
I8
|
||||
DT
|
||||
D
|
||||
Compression:
|
||||
|
@ -29,23 +29,23 @@ CREATE TABLE codecTest (
|
||||
|
||||
-- checking for overflow
|
||||
INSERT INTO codecTest (key, ref_valueU64, valueU64, ref_valueI64, valueI64)
|
||||
VALUES (101, 18446744073709551615, 18446744073709551615, 9223372036854775807, 9223372036854775807), (202, 0, 0, -9223372036854775808, -9223372036854775808), (203, 18446744073709551615, 18446744073709551615, 9223372036854775807, 9223372036854775807);
|
||||
VALUES (1, 18446744073709551615, 18446744073709551615, 9223372036854775807, 9223372036854775807), (2, 0, 0, -9223372036854775808, -9223372036854775808), (3, 18446744073709551615, 18446744073709551615, 9223372036854775807, 9223372036854775807);
|
||||
|
||||
-- n^3 covers all double delta storage cases, from small difference between neighbouring values (stride) to big.
|
||||
INSERT INTO codecTest (key, ref_valueU64, valueU64, ref_valueU32, valueU32, ref_valueU16, valueU16, ref_valueU8, valueU8, ref_valueI64, valueI64, ref_valueI32, valueI32, ref_valueI16, valueI16, ref_valueI8, valueI8, ref_valueDT, valueDT, ref_valueD, valueD)
|
||||
SELECT number as n, n * n * n as v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, toDateTime(v), toDateTime(v), toDate(v), toDate(v)
|
||||
FROM system.numbers LIMIT 101, 100;
|
||||
FROM system.numbers LIMIT 101, 1000;
|
||||
|
||||
-- best case - constant stride
|
||||
INSERT INTO codecTest (key, ref_valueU64, valueU64, ref_valueU32, valueU32, ref_valueU16, valueU16, ref_valueU8, valueU8, ref_valueI64, valueI64, ref_valueI32, valueI32, ref_valueI16, valueI16, ref_valueI8, valueI8, ref_valueDT, valueDT, ref_valueD, valueD)
|
||||
SELECT number as n, n as v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, toDateTime(v), toDateTime(v), toDate(v), toDate(v)
|
||||
FROM system.numbers LIMIT 201, 100;
|
||||
FROM system.numbers LIMIT 2001, 1000;
|
||||
|
||||
|
||||
-- worst case - random stride
|
||||
INSERT INTO codecTest (key, ref_valueU64, valueU64, ref_valueU32, valueU32, ref_valueU16, valueU16, ref_valueU8, valueU8, ref_valueI64, valueI64, ref_valueI32, valueI32, ref_valueI16, valueI16, ref_valueI8, valueI8, ref_valueDT, valueDT, ref_valueD, valueD)
|
||||
SELECT number as n, n + (rand64() - 9223372036854775807)/1000 as v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, toDateTime(v), toDateTime(v), toDate(v), toDate(v)
|
||||
FROM system.numbers LIMIT 301, 100;
|
||||
FROM system.numbers LIMIT 3001, 1000;
|
||||
|
||||
|
||||
SELECT 'U64';
|
||||
@ -147,5 +147,21 @@ WHERE
|
||||
dD != 0
|
||||
LIMIT 10;
|
||||
|
||||
SELECT 'Compression:';
|
||||
SELECT
|
||||
table, name, type,
|
||||
compression_codec,
|
||||
data_uncompressed_bytes u,
|
||||
data_compressed_bytes c,
|
||||
round(u/c,3) ratio
|
||||
FROM system.columns
|
||||
WHERE
|
||||
table == 'codecTest'
|
||||
AND
|
||||
compression_codec != ''
|
||||
AND
|
||||
ratio <= 1
|
||||
ORDER BY
|
||||
table, name, type;
|
||||
|
||||
DROP TABLE IF EXISTS codecTest;
|
Loading…
Reference in New Issue
Block a user