From d36edea9e2da0c5e42a0b269fd6de576b4419378 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 5 Sep 2023 16:51:54 +0000 Subject: [PATCH 1/3] Consolidate GCD codec tests (follow up to #53149) --- .../sql-reference/statements/create/table.md | 8 ++-- src/Compression/CompressionCodecDelta.cpp | 1 - src/Compression/CompressionCodecGCD.cpp | 17 +++---- ...cimal_with_default_precision_and_scale.sql | 2 +- .../02868_gcd_codec_test_data.reference | 1 - .../0_stateless/02868_gcd_codec_test_data.sql | 15 ------- ...2870_gcd_codec_test_single_codec.reference | 0 .../02870_gcd_codec_test_single_codec.sql | 2 - ...es.reference => 02872_gcd_codec.reference} | 4 ++ ...upported_types.sql => 02872_gcd_codec.sql} | 45 +++++++++++++++---- ..._gcd_codec_test_only_zero_values.reference | 3 -- .../02873_gcd_codec_test_only_zero_values.sql | 3 -- 12 files changed, 51 insertions(+), 50 deletions(-) delete mode 100644 tests/queries/0_stateless/02868_gcd_codec_test_data.reference delete mode 100644 tests/queries/0_stateless/02868_gcd_codec_test_data.sql delete mode 100644 tests/queries/0_stateless/02870_gcd_codec_test_single_codec.reference delete mode 100644 tests/queries/0_stateless/02870_gcd_codec_test_single_codec.sql rename tests/queries/0_stateless/{02872_gcd_codec_test_supported_types.reference => 02872_gcd_codec.reference} (99%) rename tests/queries/0_stateless/{02872_gcd_codec_test_supported_types.sql => 02872_gcd_codec.sql} (75%) delete mode 100644 tests/queries/0_stateless/02873_gcd_codec_test_only_zero_values.reference delete mode 100644 tests/queries/0_stateless/02873_gcd_codec_test_only_zero_values.sql diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index a50874cdb95..2a0b70fb19b 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -391,19 +391,19 @@ DEFLATE_QPL is not available in ClickHouse Cloud. 
### Specialized Codecs -These codecs are designed to make compression more effective by using specific features of data. Some of these codecs do not compress data themself. Instead, they prepare the data for a common purpose codec, which compresses it better than without this preparation. +These codecs are designed to make compression more effective by exploiting specific features of the data. Some of these codecs do not compress data themselves; instead, they preprocess the data such that a second compression stage using a general-purpose codec can achieve a higher data compression rate. #### Delta -`Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1. Delta is a data preparation codec, i.e. cannot be used stand-alone. +`Delta(delta_bytes)` — Compression approach in which raw values are replaced by the difference of two neighboring values, except for the first value that stays unchanged. Up to `delta_bytes` are used for storing delta values, so `delta_bytes` is the maximum size of raw values. Possible `delta_bytes` values: 1, 2, 4, 8. The default value for `delta_bytes` is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1. Delta is a data preparation codec, i.e. it cannot be used stand-alone. #### DoubleDelta -`DoubleDelta(bytes_size)` — Calculates delta of deltas and writes it in compact binary form. Possible `bytes_size` values: 1, 2, 4, 8, the default value is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. 
Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-bit deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). DoubleDelta is a data preparation codec, i.e. cannot be used stand-alone. +`DoubleDelta(bytes_size)` — Calculates delta of deltas and writes it in compact binary form. Possible `bytes_size` values: 1, 2, 4, 8, the default value is `sizeof(type)` if equal to 1, 2, 4, or 8. In all other cases, it’s 1. Optimal compression rates are achieved for monotonic sequences with a constant stride, such as time series data. Can be used with any fixed-width type. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. Uses 1 extra bit for 32-bit deltas: 5-bit prefixes instead of 4-bit prefixes. For additional information, see Compressing Time Stamps in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). DoubleDelta is a data preparation codec, i.e. it cannot be used stand-alone. #### GCD -`GCD()` - - Calculates the greatest common denominator (GCD) of the values in the column, then divides each value by the GCD. Can be used with integer, decimal and date/time columns. A viable use case are timestamps or monetary values with high precision. GCD is a data preparation codec, i.e. cannot be used stand-alone. +`GCD()` - - Calculates the greatest common denominator (GCD) of the values in the column, then divides each value by the GCD. Can be used with integer, decimal and date/time columns. A viable use case are values which change (increase or decrease) in multiples of the GCD, e.g. 24, 28, 16, 24, 8, 24 (GCD = 4). GCD is a data preparation codec, i.e. it cannot be used stand-alone. 
#### Gorilla diff --git a/src/Compression/CompressionCodecDelta.cpp b/src/Compression/CompressionCodecDelta.cpp index 51472aedaf0..26beedb411f 100644 --- a/src/Compression/CompressionCodecDelta.cpp +++ b/src/Compression/CompressionCodecDelta.cpp @@ -5,7 +5,6 @@ #include #include #include -#include namespace DB diff --git a/src/Compression/CompressionCodecGCD.cpp b/src/Compression/CompressionCodecGCD.cpp index 7c6e4d86f6c..cb71f252ff5 100644 --- a/src/Compression/CompressionCodecGCD.cpp +++ b/src/Compression/CompressionCodecGCD.cpp @@ -3,14 +3,8 @@ #include #include #include -#include -#include -#include #include "Common/Exception.h" #include "DataTypes/IDataType.h" -#include "base/Decimal_fwd.h" -#include "base/types.h" -#include "config.h" #include #include @@ -84,7 +78,7 @@ void compressDataForType(const char * source, UInt32 source_size, char * dest) const char * const source_end = source + source_size; - T gcd_divider{}; + T gcd_divider = 0; const auto * cur_source = source; while (gcd_divider != T(1) && cur_source < source_end) { @@ -100,7 +94,7 @@ void compressDataForType(const char * source, UInt32 source_size, char * dest) if constexpr (sizeof(T) <= 8) { - /// libdivide support only UInt32 and UInt64. + /// libdivide supports only UInt32 and UInt64. 
using LibdivideT = std::conditional_t; libdivide::divider divider(static_cast(gcd_divider)); cur_source = source; @@ -126,8 +120,6 @@ void compressDataForType(const char * source, UInt32 source_size, char * dest) template void decompressDataForType(const char * source, UInt32 source_size, char * dest, UInt32 output_size) { - const char * const output_end = dest + output_size; - if (source_size % sizeof(T) != 0) throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot GCD decompress, data size {} is not aligned to {}", source_size, sizeof(T)); @@ -135,11 +127,14 @@ void decompressDataForType(const char * source, UInt32 source_size, char * dest, throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot GCD decompress, data size {} is less than {}", source_size, sizeof(T)); const char * const source_end = source + source_size; + const char * const dest_end = dest + output_size; + const T gcd_multiplier = unalignedLoad(source); source += sizeof(T); + while (source < source_end) { - if (dest + sizeof(T) > output_end) [[unlikely]] + if (dest + sizeof(T) > dest_end) [[unlikely]] throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress the data"); unalignedStore(dest, unalignedLoad(source) * gcd_multiplier); diff --git a/tests/queries/0_stateless/00700_decimal_with_default_precision_and_scale.sql b/tests/queries/0_stateless/00700_decimal_with_default_precision_and_scale.sql index 4d6048ed7ae..5132b593bcc 100644 --- a/tests/queries/0_stateless/00700_decimal_with_default_precision_and_scale.sql +++ b/tests/queries/0_stateless/00700_decimal_with_default_precision_and_scale.sql @@ -12,7 +12,7 @@ ORDER BY (d2, d3); INSERT INTO decimal (d1, d2, d3) VALUES (4.2, 4.2, 4.2); -SELECT type FROM system.columns WHERE table = 'decimal' AND database = currentDatabase(); +SELECT type FROM system.columns WHERE table = 'decimal' AND database = currentDatabase() ORDER BY type; SELECT toTypeName(d2), toTypeName(d3) FROM decimal LIMIT 1; diff --git 
a/tests/queries/0_stateless/02868_gcd_codec_test_data.reference b/tests/queries/0_stateless/02868_gcd_codec_test_data.reference deleted file mode 100644 index 573541ac970..00000000000 --- a/tests/queries/0_stateless/02868_gcd_codec_test_data.reference +++ /dev/null @@ -1 +0,0 @@ -0 diff --git a/tests/queries/0_stateless/02868_gcd_codec_test_data.sql b/tests/queries/0_stateless/02868_gcd_codec_test_data.sql deleted file mode 100644 index 1039ebcecb8..00000000000 --- a/tests/queries/0_stateless/02868_gcd_codec_test_data.sql +++ /dev/null @@ -1,15 +0,0 @@ -DROP TABLE IF EXISTS table_none; -CREATE TABLE table_none (id UInt64, ui UInt256 CODEC(LZ4)) ENGINE = Memory; -INSERT INTO table_none SELECT * FROM generateRandom() LIMIT 50; - -DROP TABLE IF EXISTS table_gcd_codec; -CREATE TABLE table_gcd_codec (id UInt64, ui UInt256 CODEC(GCD, LZ4)) ENGINE = Memory; -INSERT INTO table_gcd_codec SELECT * FROM table_none; - -SELECT COUNT(*) -FROM ( - SELECT table_none.id, table_none.ui AS ui1, table_gcd_codec.id, table_gcd_codec.ui AS ui2 - FROM table_none - JOIN table_gcd_codec ON table_none.id = table_gcd_codec.id -) -WHERE ui1 != ui2; diff --git a/tests/queries/0_stateless/02870_gcd_codec_test_single_codec.reference b/tests/queries/0_stateless/02870_gcd_codec_test_single_codec.reference deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/queries/0_stateless/02870_gcd_codec_test_single_codec.sql b/tests/queries/0_stateless/02870_gcd_codec_test_single_codec.sql deleted file mode 100644 index d101884e379..00000000000 --- a/tests/queries/0_stateless/02870_gcd_codec_test_single_codec.sql +++ /dev/null @@ -1,2 +0,0 @@ -DROP TABLE IF EXISTS table_gcd_codec; -CREATE TABLE table_gcd_codec (str UInt64 CODEC(GCD)) ENGINE = Memory; -- { serverError 36 } diff --git a/tests/queries/0_stateless/02872_gcd_codec_test_supported_types.reference b/tests/queries/0_stateless/02872_gcd_codec.reference similarity index 99% rename from 
tests/queries/0_stateless/02872_gcd_codec_test_supported_types.reference rename to tests/queries/0_stateless/02872_gcd_codec.reference index 2eb7ed1e475..1dd1b67e047 100644 --- a/tests/queries/0_stateless/02872_gcd_codec_test_supported_types.reference +++ b/tests/queries/0_stateless/02872_gcd_codec.reference @@ -1,4 +1,5 @@ 0 +0 1 2 3 @@ -998,3 +999,6 @@ 1970-01-01 02:00:47.000 1970-01-01 02:00:48.000 1970-01-01 02:00:49.000 +0 +0 +0 diff --git a/tests/queries/0_stateless/02872_gcd_codec_test_supported_types.sql b/tests/queries/0_stateless/02872_gcd_codec.sql similarity index 75% rename from tests/queries/0_stateless/02872_gcd_codec_test_supported_types.sql rename to tests/queries/0_stateless/02872_gcd_codec.sql index 3b6d27a7683..245a1211052 100644 --- a/tests/queries/0_stateless/02872_gcd_codec_test_supported_types.sql +++ b/tests/queries/0_stateless/02872_gcd_codec.sql @@ -1,4 +1,28 @@ --- Int +-- GCD codec can't be used stand-alone +CREATE TEMPORARY TABLE table_gcd_codec (n UInt64 CODEC(GCD)) ENGINE = Memory; -- { serverError BAD_ARGUMENTS } + +-- GCD codec rejects non-integer/decimal/datetime types +CREATE TEMPORARY TABLE table_gcd_codec (str String CODEC(GCD, LZ4)) ENGINE = Memory; -- { serverError BAD_ARGUMENTS } + +-- Basic random-based correctness test +CREATE TEMPORARY TABLE table_lz4 (id UInt64, ui UInt256 CODEC(LZ4)) ENGINE = Memory; +INSERT INTO table_lz4 SELECT * FROM generateRandom() LIMIT 50; + +CREATE TEMPORARY TABLE table_gcd (id UInt64, ui UInt256 CODEC(GCD, LZ4)) ENGINE = Memory; +INSERT INTO table_gcd SELECT * FROM table_lz4; + +SELECT COUNT(*) +FROM ( + SELECT table_lz4.id, table_lz4.ui AS ui1, table_gcd.id, table_gcd.ui AS ui2 + FROM table_lz4 JOIN table_gcd + ON table_lz4.id = table_gcd.id +) +WHERE ui1 != ui2; + +------------------------------------------------------------------------------------------- +-- Compression/decompression works for all data types supported by GCD codec + +-- UInt* CREATE TEMPORARY TABLE table_gcd_codec_uint8 (n 
UInt8 CODEC(GCD, LZ4)) ENGINE = Memory; CREATE TEMPORARY TABLE table_gcd_codec_uint16 (n UInt16 CODEC(GCD, LZ4)) ENGINE = Memory; CREATE TEMPORARY TABLE table_gcd_codec_uint32 (n UInt32 CODEC(GCD, LZ4)) ENGINE = Memory; @@ -20,8 +44,7 @@ SELECT * FROM table_gcd_codec_uint64; SELECT * FROM table_gcd_codec_uint128; SELECT * FROM table_gcd_codec_uint256; - --- UInt +-- Int* CREATE TEMPORARY TABLE table_gcd_codec_int8 (n Int8 CODEC(GCD, LZ4)) ENGINE = Memory; CREATE TEMPORARY TABLE table_gcd_codec_int16 (n Int16 CODEC(GCD, LZ4)) ENGINE = Memory; CREATE TEMPORARY TABLE table_gcd_codec_int32 (n Int32 CODEC(GCD, LZ4)) ENGINE = Memory; @@ -43,8 +66,7 @@ SELECT * FROM table_gcd_codec_int64; SELECT * FROM table_gcd_codec_int128; SELECT * FROM table_gcd_codec_int256; - --- Decimal +-- Decimal* CREATE TEMPORARY TABLE table_gcd_codec_decimal32 (n Decimal32(1) CODEC(GCD, LZ4)) ENGINE = Memory; CREATE TEMPORARY TABLE table_gcd_codec_decimal64 (n Decimal64(1) CODEC(GCD, LZ4)) ENGINE = Memory; CREATE TEMPORARY TABLE table_gcd_codec_decimal128 (n Decimal128(1) CODEC(GCD, LZ4)) ENGINE = Memory; @@ -60,8 +82,7 @@ SELECT * FROM table_gcd_codec_decimal64; SELECT * FROM table_gcd_codec_decimal128; SELECT * FROM table_gcd_codec_decimal256; - --- Date +-- Date[32] CREATE TEMPORARY TABLE table_gcd_codec_date (n Date CODEC(GCD, LZ4)) ENGINE = Memory; CREATE TEMPORARY TABLE table_gcd_codec_date32 (n Date32 CODEC(GCD, LZ4)) ENGINE = Memory; @@ -71,8 +92,7 @@ INSERT INTO table_gcd_codec_date32 SELECT number FROM system.numbers LIMIT 50; SELECT * FROM table_gcd_codec_date; SELECT * FROM table_gcd_codec_date32; - --- DateTime +-- DateTime[64] CREATE TEMPORARY TABLE table_gcd_codec_datetime (n DateTime('Asia/Istanbul') CODEC(GCD, LZ4)) ENGINE = Memory; CREATE TEMPORARY TABLE table_gcd_codec_datetime64 (n DateTime64(3, 'Asia/Istanbul') CODEC(GCD, LZ4)) ENGINE = Memory; @@ -81,3 +101,10 @@ INSERT INTO table_gcd_codec_datetime64 SELECT number FROM system.numbers LIMIT 5 SELECT * FROM 
table_gcd_codec_datetime; SELECT * FROM table_gcd_codec_datetime64; + + +-- A column with all 0 values can be compressed/decompressed + +CREATE TEMPORARY TABLE table_gcd_codec_only_zero_values (n UInt8 CODEC(GCD, LZ4)) ENGINE = Memory; +INSERT INTO table_gcd_codec_only_zero_values VALUES (0), (0), (0); +SELECT * FROM table_gcd_codec_only_zero_values; diff --git a/tests/queries/0_stateless/02873_gcd_codec_test_only_zero_values.reference b/tests/queries/0_stateless/02873_gcd_codec_test_only_zero_values.reference deleted file mode 100644 index bb0b1cf658d..00000000000 --- a/tests/queries/0_stateless/02873_gcd_codec_test_only_zero_values.reference +++ /dev/null @@ -1,3 +0,0 @@ -0 -0 -0 diff --git a/tests/queries/0_stateless/02873_gcd_codec_test_only_zero_values.sql b/tests/queries/0_stateless/02873_gcd_codec_test_only_zero_values.sql deleted file mode 100644 index 7200e0c5d84..00000000000 --- a/tests/queries/0_stateless/02873_gcd_codec_test_only_zero_values.sql +++ /dev/null @@ -1,3 +0,0 @@ -CREATE TABLE table_gcd_codec_only_zero_values (n UInt8 CODEC(GCD, LZ4)) ENGINE = Memory; -INSERT INTO table_gcd_codec_only_zero_values VALUES (0), (0), (0); -SELECT * FROM table_gcd_codec_only_zero_values; From b41a4d5c7a1df2c5b03d155f5475bb8cecb24fae Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 6 Sep 2023 08:43:11 +0000 Subject: [PATCH 2/3] Fix test --- .../00700_decimal_with_default_precision_and_scale.reference | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/00700_decimal_with_default_precision_and_scale.reference b/tests/queries/0_stateless/00700_decimal_with_default_precision_and_scale.reference index 79e219b89fd..e4db0ebf5ef 100644 --- a/tests/queries/0_stateless/00700_decimal_with_default_precision_and_scale.reference +++ b/tests/queries/0_stateless/00700_decimal_with_default_precision_and_scale.reference @@ -1,4 +1,4 @@ -Decimal(9, 8) -Decimal(18, 0) Decimal(10, 0) +Decimal(18, 0) +Decimal(9, 8) Decimal(18, 0) 
Decimal(10, 0) From f2f24610d002a98d535072a57458956488e9c54b Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 6 Sep 2023 15:04:18 +0200 Subject: [PATCH 3/3] Improve English grammar by 8.3% --- docs/en/sql-reference/statements/create/table.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 2a0b70fb19b..5e94a5fdc6f 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -403,7 +403,7 @@ These codecs are designed to make compression more effective by exploiting speci #### GCD -`GCD()` - - Calculates the greatest common denominator (GCD) of the values in the column, then divides each value by the GCD. Can be used with integer, decimal and date/time columns. A viable use case are values which change (increase or decrease) in multiples of the GCD, e.g. 24, 28, 16, 24, 8, 24 (GCD = 4). GCD is a data preparation codec, i.e. it cannot be used stand-alone. +`GCD()` — Calculates the greatest common divisor (GCD) of the values in the column, then divides each value by the GCD. Can be used with integer, decimal and date/time columns. The codec is well suited for columns with values that change (increase or decrease) in multiples of the GCD, e.g. 24, 28, 16, 24, 8, 24 (GCD = 4). GCD is a data preparation codec, i.e. it cannot be used stand-alone. #### Gorilla