From e6167d6b3632a8cfbe5e3a1f0010da0d30093d88 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 20 Jan 2023 14:30:53 +0000 Subject: [PATCH] Deprecate Gorilla compression of non-float columns Reasons: 1. The original Gorilla paper proposed a compression schema for pairs of time stamps and double-precision FP values. ClickHouse's Gorilla codec only implements compression of the latter and it does not impose any data type restrictions. - Data types != Float* or (U)Int* (e.g. Decimal, Point etc.) are definitely not supposed to be used with Gorilla. - (U)Int* types are debatable. The paper only considers integers-stored-as-FP-values, a practical use case for which Gorilla works well. Standalone integers are not considered which makes them at least suspicious. 2. Achieve consistency with FPC, another specialized floating-point timeseries codec, which rejects non-float data. 3. On practical datasets, ZSTD is often "good enough" (**) so it should be okay to disincentive non-ZSTD codecs a little bit. If needed, Delta and DoubleDelta codecs are viable alternative for slowly changing (time-series-like) integer sequences. Since on-prem and hosted users may still have Gorilla-compressed non-float data, this combination is only deprecated for now. No warning or error will be emitted. Users are encouraged to migrate Gorilla-compressed non-float data to an alternative codec. It is planned to treat Gorilla-compressed non-float columns as "suspicious" six months after this commit (i.e. in v23.6). Even then, it will still be possible to set "allow_suspicious_codecs = true" and read and write Gorilla-compressed non-float data. (*) Sec. 4.1.2, "Gorilla restricts the value element in its tuple to a double floating point type.", https://doi.org/10.14778/2824032.2824078 (**) https://clickhouse.com/blog/optimize-clickhouse-codecs-compression-schema --- .../sql-reference/statements/create/table.md | 4 +- src/Client/Connection.cpp | 2 +- src/Compression/CompressionCodecFPC.cpp | 1 + src/Compression/CompressionCodecGorilla.cpp | 7 +- src/Compression/CompressionFactory.h | 6 +- .../CompressionFactoryAdditions.cpp | 43 ++++++---- src/Compression/ICompressionCodec.h | 2 + .../tests/gtest_compressionCodec.cpp | 83 +++++-------------- src/Core/Settings.h | 4 + src/Interpreters/InterpreterCreateQuery.cpp | 3 +- src/Server/TCPHandler.cpp | 2 +- src/Storages/AlterCommands.cpp | 10 ++- src/Storages/ColumnsDescription.cpp | 2 +- src/Storages/Distributed/DistributedSink.cpp | 2 +- src/Storages/TTLDescription.cpp | 2 +- tests/performance/codecs_int_insert.xml | 1 - tests/performance/codecs_int_select.xml | 1 - .../0_stateless/01272_suspicious_codecs.sql | 18 ++-- .../02533_gorilla_on_nonfloat.reference | 0 .../0_stateless/02533_gorilla_on_nonfloat.sql | 34 ++++++++ 20 files changed, 122 insertions(+), 105 deletions(-) create mode 100644 tests/queries/0_stateless/02533_gorilla_on_nonfloat.reference create mode 100644 tests/queries/0_stateless/02533_gorilla_on_nonfloat.sql diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 68fb968c609..fdd479b5da4 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -293,7 +293,9 @@ These codecs are designed to make compression more effective by using specific f #### Gorilla -`Gorilla` — Calculates XOR between current and previous value and writes it in compact binary form. Efficient when storing a series of floating point values that change slowly, because the best compression rate is achieved when neighboring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. For additional information, see Compressing Values in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](http://www.vldb.org/pvldb/vol8/p1816-teller.pdf). +`Gorilla` — Calculates XOR between current and previous value and writes it in compact binary form. Efficient when storing a series of floating point values that change slowly, because the best compression rate is achieved when neighboring values are binary equal. Implements the algorithm used in Gorilla TSDB, extending it to support 64-bit types. For additional information, see section 4.1 in [Gorilla: A Fast, Scalable, In-Memory Time Series Database](https://doi.org/10.14778/2824032.2824078). + +It is currently possible to apply Gorilla compression to columns of non-floating-point type but this practice is strongly discouraged and will be removed in future. #### FPC diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 6a8acace238..e7840feeeab 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -541,7 +541,7 @@ void Connection::sendQuery( if (method == "ZSTD") level = settings->network_zstd_compression_level; - CompressionCodecFactory::instance().validateCodec(method, level, !settings->allow_suspicious_codecs, settings->allow_experimental_codecs); + CompressionCodecFactory::instance().validateCodec(method, level, !settings->allow_suspicious_codecs, settings->allow_experimental_codecs, settings->enable_gorilla_codec_for_non_float_data); compression_codec = CompressionCodecFactory::instance().get(method, level); } else diff --git a/src/Compression/CompressionCodecFPC.cpp b/src/Compression/CompressionCodecFPC.cpp index 48eba210b60..0468adc4610 100644 --- a/src/Compression/CompressionCodecFPC.cpp +++ b/src/Compression/CompressionCodecFPC.cpp @@ -39,6 +39,7 @@ protected: bool isCompression() const override { return true; } bool isGenericCompression() const override { return false; } + bool isFloatingPointTimeSeries() const override { return true; } private: static constexpr UInt32 HEADER_SIZE = 2; diff --git a/src/Compression/CompressionCodecGorilla.cpp b/src/Compression/CompressionCodecGorilla.cpp index 88b8c2bc3bb..93e6fba16f9 100644 --- a/src/Compression/CompressionCodecGorilla.cpp +++ b/src/Compression/CompressionCodecGorilla.cpp @@ -123,6 +123,7 @@ protected: bool isCompression() const override { return true; } bool isGenericCompression() const override { return false; } + bool isFloatingPointTimeSeries() const override { return true; } private: UInt8 data_bytes_size; @@ -444,14 +445,14 @@ void CompressionCodecGorilla::doDecompressData(const char * source, UInt32 sourc void registerCodecGorilla(CompressionCodecFactory & factory) { UInt8 method_code = static_cast(CompressionMethodByte::Gorilla); - factory.registerCompressionCodecWithType("Gorilla", method_code, - [&](const ASTPtr & arguments, const IDataType * column_type) -> CompressionCodecPtr + auto codec_builder = [&](const ASTPtr & arguments, const IDataType * column_type) -> CompressionCodecPtr { if (arguments) throw Exception("Codec Gorilla does not accept any arguments", ErrorCodes::BAD_ARGUMENTS); UInt8 data_bytes_size = column_type ? getDataBytesSize(column_type) : 0; return std::make_shared(data_bytes_size); - }); + }; + factory.registerCompressionCodecWithType("Gorilla", method_code, codec_builder); } } diff --git a/src/Compression/CompressionFactory.h b/src/Compression/CompressionFactory.h index a4451f9ed2e..a85676d5964 100644 --- a/src/Compression/CompressionFactory.h +++ b/src/Compression/CompressionFactory.h @@ -40,10 +40,12 @@ public: CompressionCodecPtr getDefaultCodec() const; /// Validate codecs AST specified by user and parses codecs description (substitute default parameters) - ASTPtr validateCodecAndGetPreprocessedAST(const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs) const; + /// + /// Note: enable_gorilla_coded_for_non_float_data is a transitory parameter and can be removed after v23.6 (being implicitly false then), see comments in Core/Settings.h. + ASTPtr validateCodecAndGetPreprocessedAST(const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_gorilla_coded_for_non_float_data) const; /// Validate codecs AST specified by user - void validateCodec(const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs) const; + void validateCodec(const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_gorilla_coded_for_non_float_data) const; /// Get codec by AST and possible column_type. Some codecs can use /// information about type to improve inner settings, but every codec should diff --git a/src/Compression/CompressionFactoryAdditions.cpp b/src/Compression/CompressionFactoryAdditions.cpp index 3e215076871..d909a3dc70f 100644 --- a/src/Compression/CompressionFactoryAdditions.cpp +++ b/src/Compression/CompressionFactoryAdditions.cpp @@ -32,7 +32,7 @@ namespace ErrorCodes void CompressionCodecFactory::validateCodec( - const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs) const + const String & family_name, std::optional level, bool sanity_check, bool allow_experimental_codecs, bool enable_gorilla_codec_for_non_float_data) const { if (family_name.empty()) throw Exception("Compression codec name cannot be empty", ErrorCodes::BAD_ARGUMENTS); @@ -41,33 +41,34 @@ void CompressionCodecFactory::validateCodec( { auto literal = std::make_shared(static_cast(*level)); validateCodecAndGetPreprocessedAST(makeASTFunction("CODEC", makeASTFunction(Poco::toUpper(family_name), literal)), - {}, sanity_check, allow_experimental_codecs); + {}, sanity_check, allow_experimental_codecs, enable_gorilla_codec_for_non_float_data); } else { auto identifier = std::make_shared(Poco::toUpper(family_name)); validateCodecAndGetPreprocessedAST(makeASTFunction("CODEC", identifier), - {}, sanity_check, allow_experimental_codecs); + {}, sanity_check, allow_experimental_codecs, enable_gorilla_codec_for_non_float_data); } } ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( - const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs) const + const ASTPtr & ast, const DataTypePtr & column_type, bool sanity_check, bool allow_experimental_codecs, bool enable_gorilla_codec_for_non_float_data) const { if (const auto * func = ast->as()) { ASTPtr codecs_descriptions = std::make_shared(); - bool is_compression = false; - bool has_none = false; + bool with_compression_codec = false; + bool with_none_codec = false; + bool with_floating_point_timeseries_codec = false; std::optional generic_compression_codec_pos; - std::set encryption_codecs; + std::set encryption_codecs_pos; bool can_substitute_codec_arguments = true; for (size_t i = 0, size = func->arguments->children.size(); i < size; ++i) { - const auto & inner_codec_ast = func->arguments->children[i]; + const ASTPtr & inner_codec_ast = func->arguments->children[i]; String codec_family_name; ASTPtr codec_arguments; if (const auto * family_name = inner_codec_ast->as()) @@ -136,21 +137,22 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( codecs_descriptions->children.emplace_back(result_codec->getCodecDesc()); } - is_compression |= result_codec->isCompression(); - has_none |= result_codec->isNone(); + with_compression_codec |= result_codec->isCompression(); + with_none_codec |= result_codec->isNone(); + with_floating_point_timeseries_codec |= result_codec->isFloatingPointTimeSeries(); if (!generic_compression_codec_pos && result_codec->isGenericCompression()) generic_compression_codec_pos = i; if (result_codec->isEncryption()) - encryption_codecs.insert(i); + encryption_codecs_pos.insert(i); } String codec_description = queryToString(codecs_descriptions); if (sanity_check) { - if (codecs_descriptions->children.size() > 1 && has_none) + if (codecs_descriptions->children.size() > 1 && with_none_codec) throw Exception( "It does not make sense to have codec NONE along with other compression codecs: " + codec_description + ". (Note: you can enable setting 'allow_suspicious_codecs' to skip this check).", @@ -159,7 +161,7 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( /// Allow to explicitly specify single NONE codec if user don't want any compression. /// But applying other transformations solely without compression (e.g. Delta) does not make sense. /// It's okay to apply encryption codecs solely without anything else. - if (!is_compression && !has_none && encryption_codecs.size() != codecs_descriptions->children.size()) + if (!with_compression_codec && !with_none_codec && encryption_codecs_pos.size() != codecs_descriptions->children.size()) throw Exception( "Compression codec " + codec_description + " does not compress anything." @@ -171,17 +173,26 @@ ASTPtr CompressionCodecFactory::validateCodecAndGetPreprocessedAST( /// It does not make sense to apply any non-encryption codecs /// after encryption one. - if (!encryption_codecs.empty() && - *encryption_codecs.begin() != codecs_descriptions->children.size() - encryption_codecs.size()) + if (!encryption_codecs_pos.empty() && + *encryption_codecs_pos.begin() != codecs_descriptions->children.size() - encryption_codecs_pos.size()) throw Exception("The combination of compression codecs " + codec_description + " is meaningless," " because it does not make sense to apply any non-post-processing codecs after" " post-processing ones. (Note: you can enable setting 'allow_suspicious_codecs'" " to skip this check).", ErrorCodes::BAD_ARGUMENTS); + if (column_type) + if (with_floating_point_timeseries_codec && + !WhichDataType(*column_type).isFloat() && + !enable_gorilla_codec_for_non_float_data) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "The combination of compression codecs {} is meaningless," + " because it does not make sense to apply a floating-point time series codec to non-floating-point columns" + " (Note: you can enable setting 'allow_suspicious_codecs' to skip this check).", codec_description); + /// It does not make sense to apply any transformations after generic compression algorithm /// So, generic compression can be only one and only at the end. if (generic_compression_codec_pos && - *generic_compression_codec_pos != codecs_descriptions->children.size() - 1 - encryption_codecs.size()) + *generic_compression_codec_pos != codecs_descriptions->children.size() - 1 - encryption_codecs_pos.size()) throw Exception("The combination of compression codecs " + codec_description + " is meaningless," " because it does not make sense to apply any transformations after generic compression algorithm." " (Note: you can enable setting 'allow_suspicious_codecs' to skip this check).", ErrorCodes::BAD_ARGUMENTS); diff --git a/src/Compression/ICompressionCodec.h b/src/Compression/ICompressionCodec.h index f40404a84f3..05507d99863 100644 --- a/src/Compression/ICompressionCodec.h +++ b/src/Compression/ICompressionCodec.h @@ -113,6 +113,8 @@ public: /// If it does nothing. virtual bool isNone() const { return false; } + virtual bool isFloatingPointTimeSeries() const { return false; } + protected: /// This is used for fuzz testing friend int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size); diff --git a/src/Compression/tests/gtest_compressionCodec.cpp b/src/Compression/tests/gtest_compressionCodec.cpp index b4c29bf9ce6..a843144c7c4 100644 --- a/src/Compression/tests/gtest_compressionCodec.cpp +++ b/src/Compression/tests/gtest_compressionCodec.cpp @@ -529,6 +529,13 @@ public: TEST_P(CodecTest, TranscodingWithDataType) { + /// Gorilla can only be applied to floating point columns + bool codec_is_gorilla = std::get<0>(GetParam()).codec_statement.find("Gorilla") != std::string::npos; + WhichDataType which(std::get<1>(GetParam()).data_type.get()); + bool data_is_float = which.isFloat(); + if (codec_is_gorilla && !data_is_float) + GTEST_SKIP() << "Skipping Gorilla-compressed integer column"; + const auto codec = makeCodec(CODEC_WITH_DATA_TYPE); testTranscoding(*codec); } @@ -1204,68 +1211,20 @@ auto DDperformanceTestSequence() } // prime numbers in ascending order with some random repetitions hit all the cases of Gorilla. -auto PrimesWithMultiplierGenerator = [](int multiplier = 1) -{ - return [multiplier](auto i) - { - static const int vals[] = { - 2, 3, 5, 7, 11, 11, 13, 17, 19, 23, 29, 29, 31, 37, 41, 43, - 47, 47, 53, 59, 61, 61, 67, 71, 73, 79, 83, 89, 89, 97, 101, 103, - 107, 107, 109, 113, 113, 127, 127, 127 - }; - static const size_t count = sizeof(vals)/sizeof(vals[0]); - - return static_cast(vals[i % count]) * multiplier; - }; -}; - -template -auto GCompatibilityTestSequence() -{ - // Also multiply result by some factor to test large values on types that can hold those. - return generateSeq(G(PrimesWithMultiplierGenerator(intExp10(sizeof(ValueType)))), 0, 42); -} - -INSTANTIATE_TEST_SUITE_P(Gorilla, - CodecTestCompatibility, - ::testing::Combine( - ::testing::Values(Codec("Gorilla")), - ::testing::ValuesIn(std::initializer_list>{ - { - GCompatibilityTestSequence(), - BIN_STR("\x95\x35\x00\x00\x00\x2a\x00\x00\x00\x01\x00\x2a\x00\x00\x00\x14\xe1\xdd\x25\xe5\x7b\x29\x86\xee\x2a\x16\x5a\xc5\x0b\x23\x75\x1b\x3c\xb1\x97\x8b\x5f\xcb\x43\xd9\xc5\x48\xab\x23\xaf\x62\x93\x71\x4a\x73\x0f\xc6\x0a") - }, - { - GCompatibilityTestSequence(), - BIN_STR("\x95\x35\x00\x00\x00\x2a\x00\x00\x00\x01\x00\x2a\x00\x00\x00\x14\xe1\xdd\x25\xe5\x7b\x29\x86\xee\x2a\x16\x5a\xc5\x0b\x23\x75\x1b\x3c\xb1\x97\x8b\x5f\xcb\x43\xd9\xc5\x48\xab\x23\xaf\x62\x93\x71\x4a\x73\x0f\xc6\x0a") - }, - { - GCompatibilityTestSequence(), - BIN_STR("\x95\x52\x00\x00\x00\x54\x00\x00\x00\x02\x00\x2a\x00\x00\x00\xc8\x00\xdc\xfe\x66\xdb\x1f\x4e\xa7\xde\xdc\xd5\xec\x6e\xf7\x37\x3a\x23\xe7\x63\xf5\x6a\x8e\x99\x37\x34\xf9\xf8\x2e\x76\x35\x2d\x51\xbb\x3b\xc3\x6d\x13\xbf\x86\x53\x9e\x25\xe4\xaf\xaf\x63\xd5\x6a\x6e\x76\x35\x3a\x27\xd3\x0f\x91\xae\x6b\x33\x57\x6e\x64\xcc\x55\x81\xe4") - }, - { - GCompatibilityTestSequence(), - BIN_STR("\x95\x52\x00\x00\x00\x54\x00\x00\x00\x02\x00\x2a\x00\x00\x00\xc8\x00\xdc\xfe\x66\xdb\x1f\x4e\xa7\xde\xdc\xd5\xec\x6e\xf7\x37\x3a\x23\xe7\x63\xf5\x6a\x8e\x99\x37\x34\xf9\xf8\x2e\x76\x35\x2d\x51\xbb\x3b\xc3\x6d\x13\xbf\x86\x53\x9e\x25\xe4\xaf\xaf\x63\xd5\x6a\x6e\x76\x35\x3a\x27\xd3\x0f\x91\xae\x6b\x33\x57\x6e\x64\xcc\x55\x81\xe4") - }, - { - GCompatibilityTestSequence(), - BIN_STR("\x95\x65\x00\x00\x00\xa8\x00\x00\x00\x04\x00\x2a\x00\x00\x00\x20\x4e\x00\x00\xe4\x57\x63\xc0\xbb\x67\xbc\xce\x91\x97\x99\x15\x9e\xe3\x36\x3f\x89\x5f\x8e\xf2\xec\x8e\xd3\xbf\x75\x43\x58\xc4\x7e\xcf\x93\x43\x38\xc6\x91\x36\x1f\xe7\xb6\x11\x6f\x02\x73\x46\xef\xe0\xec\x50\xfb\x79\xcb\x9c\x14\xfa\x13\xea\x8d\x66\x43\x48\xa0\xde\x3a\xcf\xff\x26\xe0\x5f\x93\xde\x5e\x7f\x6e\x36\x5e\xe6\xb4\x66\x5d\xb0\x0e\xc4") - }, - { - GCompatibilityTestSequence(), - BIN_STR("\x95\x65\x00\x00\x00\xa8\x00\x00\x00\x04\x00\x2a\x00\x00\x00\x20\x4e\x00\x00\xe4\x57\x63\xc0\xbb\x67\xbc\xce\x91\x97\x99\x15\x9e\xe3\x36\x3f\x89\x5f\x8e\xf2\xec\x8e\xd3\xbf\x75\x43\x58\xc4\x7e\xcf\x93\x43\x38\xc6\x91\x36\x1f\xe7\xb6\x11\x6f\x02\x73\x46\xef\xe0\xec\x50\xfb\x79\xcb\x9c\x14\xfa\x13\xea\x8d\x66\x43\x48\xa0\xde\x3a\xcf\xff\x26\xe0\x5f\x93\xde\x5e\x7f\x6e\x36\x5e\xe6\xb4\x66\x5d\xb0\x0e\xc4") - }, - { - GCompatibilityTestSequence(), - BIN_STR("\x95\x91\x00\x00\x00\x50\x01\x00\x00\x08\x00\x2a\x00\x00\x00\x00\xc2\xeb\x0b\x00\x00\x00\x00\xe3\x2b\xa0\xa6\x19\x85\x98\xdc\x45\x74\x74\x43\xc2\x57\x41\x4c\x6e\x42\x79\xd9\x8f\x88\xa5\x05\xf3\xf1\x94\xa3\x62\x1e\x02\xdf\x05\x10\xf1\x15\x97\x35\x2a\x50\x71\x0f\x09\x6c\x89\xf7\x65\x1d\x11\xb7\xcc\x7d\x0b\x70\xc1\x86\x88\x48\x47\x87\xb6\x32\x26\xa7\x86\x87\x88\xd3\x93\x3d\xfc\x28\x68\x85\x05\x0b\x13\xc6\x5f\xd4\x70\xe1\x5e\x76\xf1\x9f\xf3\x33\x2a\x14\x14\x5e\x40\xc1\x5c\x28\x3f\xec\x43\x03\x05\x11\x91\xe8\xeb\x8e\x0a\x0e\x27\x21\x55\xcb\x39\xbc\x6a\xff\x11\x5d\x81\xa0\xa6\x10") - }, - { - GCompatibilityTestSequence(), - BIN_STR("\x95\x91\x00\x00\x00\x50\x01\x00\x00\x08\x00\x2a\x00\x00\x00\x00\xc2\xeb\x0b\x00\x00\x00\x00\xe3\x2b\xa0\xa6\x19\x85\x98\xdc\x45\x74\x74\x43\xc2\x57\x41\x4c\x6e\x42\x79\xd9\x8f\x88\xa5\x05\xf3\xf1\x94\xa3\x62\x1e\x02\xdf\x05\x10\xf1\x15\x97\x35\x2a\x50\x71\x0f\x09\x6c\x89\xf7\x65\x1d\x11\xb7\xcc\x7d\x0b\x70\xc1\x86\x88\x48\x47\x87\xb6\x32\x26\xa7\x86\x87\x88\xd3\x93\x3d\xfc\x28\x68\x85\x05\x0b\x13\xc6\x5f\xd4\x70\xe1\x5e\x76\xf1\x9f\xf3\x33\x2a\x14\x14\x5e\x40\xc1\x5c\x28\x3f\xec\x43\x03\x05\x11\x91\xe8\xeb\x8e\x0a\x0e\x27\x21\x55\xcb\x39\xbc\x6a\xff\x11\x5d\x81\xa0\xa6\x10") - }, - }) - ) -); +// auto PrimesWithMultiplierGenerator = [](int multiplier = 1) +// { +// return [multiplier](auto i) +// { +// static const int vals[] = { +// 2, 3, 5, 7, 11, 11, 13, 17, 19, 23, 29, 29, 31, 37, 41, 43, +// 47, 47, 53, 59, 61, 61, 67, 71, 73, 79, 83, 89, 89, 97, 101, 103, +// 107, 107, 109, 113, 113, 127, 127, 127 +// }; +// static const size_t count = sizeof(vals)/sizeof(vals[0]); +// +// return static_cast(vals[i % count]) * multiplier; +// }; +// }; // These 'tests' try to measure performance of encoding and decoding and hence only make sense to be run locally, // also they require pretty big data to run against and generating this data slows down startup of unit test process. diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 1ade4ba2868..35172883c96 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -627,6 +627,10 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(Bool, load_marks_asynchronously, false, "Load MergeTree marks asynchronously", 0) \ \ + /* Transitory setting needed for ca. six months. After v23.6, the setting can be moved into the obsolete section below and assumed as false. */ \ + /* Gorilla is a float-codec which happend to be enabled for non-float datatypes. Users (self-hosted or on-premise) might have non-float Gorilla-compressed data so allow some time for migration. */ \ + M(Bool, enable_gorilla_codec_for_non_float_data, true, "Enable Gorilla compression of columns with non-float data type", 0) \ + \ M(UInt64, use_structure_from_insertion_table_in_table_functions, 2, "Use structure from insertion table instead of schema inference from data. Possible values: 0 - disabled, 1 - enabled, 2 - auto", 0) \ \ M(UInt64, http_max_tries, 10, "Max attempts to read via http.", 0) \ diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index bea88885d20..28b3d91c094 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -556,6 +556,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( bool sanity_check_compression_codecs = !attach && !context_->getSettingsRef().allow_suspicious_codecs; bool allow_experimental_codecs = attach || context_->getSettingsRef().allow_experimental_codecs; + bool enable_gorilla_codec_for_non_float_data = context_->getSettingsRef().enable_gorilla_codec_for_non_float_data; ColumnsDescription res; auto name_type_it = column_names_and_types.begin(); @@ -617,7 +618,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( if (col_decl.default_specifier == "ALIAS") throw Exception{"Cannot specify codec for column type ALIAS", ErrorCodes::BAD_ARGUMENTS}; column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST( - col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs); + col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs, enable_gorilla_codec_for_non_float_data); } if (col_decl.ttl) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 5b3d18c66f6..97bec765e77 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1692,7 +1692,7 @@ void TCPHandler::initBlockOutput(const Block & block) if (state.compression == Protocol::Compression::Enable) { - CompressionCodecFactory::instance().validateCodec(method, level, !query_settings.allow_suspicious_codecs, query_settings.allow_experimental_codecs); + CompressionCodecFactory::instance().validateCodec(method, level, !query_settings.allow_suspicious_codecs, query_settings.allow_experimental_codecs, query_settings.enable_gorilla_codec_for_non_float_data); state.maybe_compressed_out = std::make_shared( *out, CompressionCodecFactory::instance().get(method, level)); diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 1d4df05c723..49d95c09d74 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -389,7 +389,7 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) column.comment = *comment; if (codec) - column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(codec, data_type, false, true); + column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(codec, data_type, false, true, true); column.ttl = ttl; @@ -430,7 +430,7 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) else { if (codec) - column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(codec, data_type ? data_type : column.type, false, true); + column.codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(codec, data_type ? data_type : column.type, false, true, true); if (comment) column.comment = *comment; @@ -1080,7 +1080,8 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const ErrorCodes::ILLEGAL_COLUMN}; if (command.codec) - CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs); + CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, + !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_gorilla_codec_for_non_float_data); all_columns.add(ColumnDescription(column_name, command.data_type)); } @@ -1104,7 +1105,8 @@ void AlterCommands::validate(const StoragePtr & table, ContextPtr context) const ErrorCodes::NOT_IMPLEMENTED}; if (command.codec) - CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs); + CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(command.codec, command.data_type, + !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_gorilla_codec_for_non_float_data); auto column_default = all_columns.getDefault(column_name); if (column_default) { diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 0fdb21e064f..22641573c63 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -130,7 +130,7 @@ void ColumnDescription::readText(ReadBuffer & buf) comment = col_ast->comment->as().value.get(); if (col_ast->codec) - codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(col_ast->codec, type, false, true); + codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST(col_ast->codec, type, false, true, true); if (col_ast->ttl) ttl = col_ast->ttl; diff --git a/src/Storages/Distributed/DistributedSink.cpp b/src/Storages/Distributed/DistributedSink.cpp index 8cee3e9ee91..cb38a2e2df5 100644 --- a/src/Storages/Distributed/DistributedSink.cpp +++ b/src/Storages/Distributed/DistributedSink.cpp @@ -701,7 +701,7 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const if (compression_method == "ZSTD") compression_level = settings.network_zstd_compression_level; - CompressionCodecFactory::instance().validateCodec(compression_method, compression_level, !settings.allow_suspicious_codecs, settings.allow_experimental_codecs); + CompressionCodecFactory::instance().validateCodec(compression_method, compression_level, !settings.allow_suspicious_codecs, settings.allow_experimental_codecs, settings.enable_gorilla_codec_for_non_float_data); CompressionCodecPtr compression_codec = CompressionCodecFactory::instance().get(compression_method, compression_level); /// tmp directory is used to ensure atomicity of transactions diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index 2971d977099..4f6df4ab95a 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -291,7 +291,7 @@ TTLDescription TTLDescription::getTTLFromAST( { result.recompression_codec = CompressionCodecFactory::instance().validateCodecAndGetPreprocessedAST( - ttl_element->recompression_codec, {}, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs); + ttl_element->recompression_codec, {}, !context->getSettingsRef().allow_suspicious_codecs, context->getSettingsRef().allow_experimental_codecs, context->getSettingsRef().enable_gorilla_codec_for_non_float_data); } } diff --git a/tests/performance/codecs_int_insert.xml b/tests/performance/codecs_int_insert.xml index caefaba3725..618e20160f8 100644 --- a/tests/performance/codecs_int_insert.xml +++ b/tests/performance/codecs_int_insert.xml @@ -13,7 +13,6 @@ Delta T64 DoubleDelta - Gorilla diff --git a/tests/performance/codecs_int_select.xml b/tests/performance/codecs_int_select.xml index 7d47cd300d8..62c1ee16e7b 100644 --- a/tests/performance/codecs_int_select.xml +++ b/tests/performance/codecs_int_select.xml @@ -13,7 +13,6 @@ Delta T64 DoubleDelta - Gorilla diff --git a/tests/queries/0_stateless/01272_suspicious_codecs.sql b/tests/queries/0_stateless/01272_suspicious_codecs.sql index 5baa30e3cf4..7776e511725 100644 --- a/tests/queries/0_stateless/01272_suspicious_codecs.sql +++ b/tests/queries/0_stateless/01272_suspicious_codecs.sql @@ -11,7 +11,7 @@ CREATE TABLE codecs c Float32 CODEC(Gorilla), d UInt8 CODEC(Delta, LZ4), e Float64 CODEC(Gorilla, ZSTD), - f UInt32 CODEC(Delta, Delta, Gorilla), + f UInt32 CODEC(Delta, Delta, T64), g DateTime CODEC(DoubleDelta), h DateTime64 CODEC(DoubleDelta, LZ4), i String CODEC(NONE) @@ -21,14 +21,14 @@ DROP TABLE codecs; -- test what should not work -CREATE TABLE codecs (a UInt8 CODEC(NONE, NONE)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError 36 } -CREATE TABLE codecs (a UInt8 CODEC(NONE, LZ4)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError 36 } -CREATE TABLE codecs (a UInt8 CODEC(LZ4, NONE)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError 36 } -CREATE TABLE codecs (a UInt8 CODEC(LZ4, LZ4)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError 36 } -CREATE TABLE codecs (a UInt8 CODEC(LZ4, ZSTD)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError 36 } -CREATE TABLE codecs (a UInt8 CODEC(Delta)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError 36 } -CREATE TABLE codecs (a UInt8 CODEC(Delta, Delta)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError 36 } -CREATE TABLE codecs (a UInt8 CODEC(LZ4, Delta)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError 36 } +CREATE TABLE codecs (a UInt8 CODEC(NONE, NONE)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError BAD_ARGUMENTS } +CREATE TABLE codecs (a UInt8 CODEC(NONE, LZ4)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError BAD_ARGUMENTS } +CREATE TABLE codecs (a UInt8 CODEC(LZ4, NONE)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError BAD_ARGUMENTS } +CREATE TABLE codecs (a UInt8 CODEC(LZ4, LZ4)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError BAD_ARGUMENTS } +CREATE TABLE codecs (a UInt8 CODEC(LZ4, ZSTD)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError BAD_ARGUMENTS } +CREATE TABLE codecs (a UInt8 CODEC(Delta)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError BAD_ARGUMENTS } +CREATE TABLE codecs (a UInt8 CODEC(Delta, Delta)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError BAD_ARGUMENTS } +CREATE TABLE codecs (a UInt8 CODEC(LZ4, Delta)) ENGINE = MergeTree ORDER BY tuple(); -- { serverError BAD_ARGUMENTS } -- test that sanity check is not performed in ATTACH query diff --git a/tests/queries/0_stateless/02533_gorilla_on_nonfloat.reference b/tests/queries/0_stateless/02533_gorilla_on_nonfloat.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02533_gorilla_on_nonfloat.sql b/tests/queries/0_stateless/02533_gorilla_on_nonfloat.sql new file mode 100644 index 00000000000..d64c04cdbf9 --- /dev/null +++ b/tests/queries/0_stateless/02533_gorilla_on_nonfloat.sql @@ -0,0 +1,34 @@ +-- Welcome visitor from the future! If it is >= July 2023 and your intention is to adjust the test because "enable_gorilla_codec_for_non_float_data" +-- is now obsolete, then please also extend 01272_suspicious_codecs.sql with new tests cases for Gorilla on non-float data. + +DROP TABLE IF EXISTS test; + +-- current default behavior is to enable non-float Gorilla compressed data + +CREATE TABLE test (id UInt64, val Decimal(15,5) CODEC (Gorilla)) ENGINE = MergeTree() ORDER BY id; +DROP TABLE IF EXISTS test; + +CREATE TABLE test (id UInt64, val FixedString(2) CODEC (Gorilla)) ENGINE = MergeTree() ORDER BY id; +DROP TABLE IF EXISTS test; + +CREATE TABLE test (id UInt64, val UInt64 CODEC (Gorilla)) ENGINE = MergeTree() ORDER BY id; +DROP TABLE IF EXISTS test; + +-- this can be changed (and it is planned to be changed by default in future) with a setting +SET enable_gorilla_codec_for_non_float_data = false; + +CREATE TABLE test (id UInt64, val Decimal(15,5) CODEC (Gorilla)) ENGINE = MergeTree() ORDER BY id; -- { serverError BAD_ARGUMENTS } +CREATE TABLE test (id UInt64, val FixedString(2) CODEC (Gorilla)) ENGINE = MergeTree() ORDER BY id; -- { serverError BAD_ARGUMENTS } +CREATE TABLE test (id UInt64, val UInt64 CODEC (Gorilla)) ENGINE = MergeTree() ORDER BY id; -- { serverError BAD_ARGUMENTS } + +-- even with above setting, it will still be possible to create non-float Gorilla-compressed data using allow_suspicious_codecs +SET allow_suspicious_codecs = true; + +CREATE TABLE test (id UInt64, val Decimal(15,5) CODEC (Gorilla)) ENGINE = MergeTree() ORDER BY id; +DROP TABLE IF EXISTS test; + +CREATE TABLE test (id UInt64, val FixedString(2) CODEC (Gorilla)) ENGINE = MergeTree() ORDER BY id; +DROP TABLE IF EXISTS test; + +CREATE TABLE test (id UInt64, val UInt64 CODEC (Gorilla)) ENGINE = MergeTree() ORDER BY id; +DROP TABLE IF EXISTS test;