From f2c328aa7a9c0ac7cc7a13f5d99201d46dc79361 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 6 Dec 2024 13:11:00 +0100 Subject: [PATCH 1/4] Revert "Merge pull request #72836 from ClickHouse/revert-72770-more_insistent_compress_in_memory_eng" This reverts commit f752c0b89d1a84fc92bfbb7fc510a092b5aa4c84, reversing changes made to 65d895a6dbd5f07383b871430f3cee5ce1b984a3. --- docs/en/engines/table-engines/special/memory.md | 2 ++ src/Columns/ColumnArray.cpp | 6 +++--- src/Columns/ColumnArray.h | 2 +- src/Columns/ColumnCompressed.cpp | 5 +++-- src/Columns/ColumnCompressed.h | 2 +- src/Columns/ColumnDecimal.cpp | 4 ++-- src/Columns/ColumnDecimal.h | 2 +- src/Columns/ColumnDynamic.cpp | 4 ++-- src/Columns/ColumnDynamic.h | 2 +- src/Columns/ColumnFixedString.cpp | 4 ++-- src/Columns/ColumnFixedString.h | 2 +- src/Columns/ColumnMap.cpp | 4 ++-- src/Columns/ColumnMap.h | 2 +- src/Columns/ColumnNullable.cpp | 6 +++--- src/Columns/ColumnNullable.h | 2 +- src/Columns/ColumnObject.cpp | 8 ++++---- src/Columns/ColumnObject.h | 2 +- src/Columns/ColumnSparse.cpp | 6 +++--- src/Columns/ColumnSparse.h | 2 +- src/Columns/ColumnString.cpp | 6 +++--- src/Columns/ColumnString.h | 2 +- src/Columns/ColumnTuple.cpp | 4 ++-- src/Columns/ColumnTuple.h | 2 +- src/Columns/ColumnVariant.cpp | 8 ++++---- src/Columns/ColumnVariant.h | 2 +- src/Columns/ColumnVector.cpp | 4 ++-- src/Columns/ColumnVector.h | 2 +- src/Columns/IColumn.h | 3 ++- src/Core/Block.cpp | 2 +- src/Interpreters/Cache/QueryCache.cpp | 2 +- src/Storages/StorageMemory.cpp | 7 +++---- 31 files changed, 57 insertions(+), 54 deletions(-) diff --git a/docs/en/engines/table-engines/special/memory.md b/docs/en/engines/table-engines/special/memory.md index f28157ebde2..3eb3e617ff9 100644 --- a/docs/en/engines/table-engines/special/memory.md +++ b/docs/en/engines/table-engines/special/memory.md @@ -36,6 +36,8 @@ Upper and lower bounds can be specified to limit Memory engine table size, effec - Requires `max_rows_to_keep` - 
`max_rows_to_keep` — Maximum rows to keep within memory table where oldest rows are deleted on each insertion (i.e circular buffer). Max rows can exceed the stated limit if the oldest batch of rows to remove falls under the `min_rows_to_keep` limit when adding a large block. - Default value: `0` +- `compress` - Whether to compress data in memory. + - Default value: `false` ## Usage {#usage} diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 3f88ca93a97..013821db2c9 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -1024,10 +1024,10 @@ void ColumnArray::updatePermutationWithCollation(const Collator & collator, Perm DefaultPartialSort()); } -ColumnPtr ColumnArray::compress() const +ColumnPtr ColumnArray::compress(bool force_compression) const { - ColumnPtr data_compressed = data->compress(); - ColumnPtr offsets_compressed = offsets->compress(); + ColumnPtr data_compressed = data->compress(force_compression); + ColumnPtr offsets_compressed = offsets->compress(force_compression); size_t byte_size = data_compressed->byteSize() + offsets_compressed->byteSize(); diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h index a66f9041213..dee6ae931f2 100644 --- a/src/Columns/ColumnArray.h +++ b/src/Columns/ColumnArray.h @@ -159,7 +159,7 @@ public: /// For example, `getDataInRange(0, size())` is the same as `getDataPtr()->clone()`. 
MutableColumnPtr getDataInRange(size_t start, size_t length) const; - ColumnPtr compress() const override; + ColumnPtr compress(bool force_compression) const override; ColumnCheckpointPtr getCheckpoint() const override; void updateCheckpoint(ColumnCheckpoint & checkpoint) const override; diff --git a/src/Columns/ColumnCompressed.cpp b/src/Columns/ColumnCompressed.cpp index 3bdc514d6d8..adb2a5f391d 100644 --- a/src/Columns/ColumnCompressed.cpp +++ b/src/Columns/ColumnCompressed.cpp @@ -16,7 +16,7 @@ namespace ErrorCodes } -std::shared_ptr> ColumnCompressed::compressBuffer(const void * data, size_t data_size, bool always_compress) +std::shared_ptr> ColumnCompressed::compressBuffer(const void * data, size_t data_size, bool force_compression) { size_t max_dest_size = LZ4_COMPRESSBOUND(data_size); @@ -35,7 +35,8 @@ std::shared_ptr> ColumnCompressed::compressBuffer(const void * data, si throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column"); /// If compression is inefficient. - if (!always_compress && static_cast(compressed_size) * 2 > data_size) + const size_t threshold = force_compression ? 1 : 2; + if (static_cast(compressed_size) * threshold > data_size) return {}; /// Shrink to fit. diff --git a/src/Columns/ColumnCompressed.h b/src/Columns/ColumnCompressed.h index c4270e8216b..b030e762acd 100644 --- a/src/Columns/ColumnCompressed.h +++ b/src/Columns/ColumnCompressed.h @@ -72,7 +72,7 @@ public: /// If data is not worth to be compressed and not 'always_compress' - returns nullptr. /// Note: shared_ptr is to allow to be captured by std::function. 
- static std::shared_ptr> compressBuffer(const void * data, size_t data_size, bool always_compress); + static std::shared_ptr> compressBuffer(const void * data, size_t data_size, bool force_compression); static void decompressBuffer( const void * compressed_data, void * decompressed_data, size_t compressed_size, size_t decompressed_size); diff --git a/src/Columns/ColumnDecimal.cpp b/src/Columns/ColumnDecimal.cpp index 73366150e7d..c286c54198a 100644 --- a/src/Columns/ColumnDecimal.cpp +++ b/src/Columns/ColumnDecimal.cpp @@ -478,7 +478,7 @@ ColumnPtr ColumnDecimal::replicate(const IColumn::Offsets & offsets) const } template -ColumnPtr ColumnDecimal::compress() const +ColumnPtr ColumnDecimal::compress(bool force_compression) const { const size_t data_size = data.size(); const size_t source_size = data_size * sizeof(T); @@ -487,7 +487,7 @@ ColumnPtr ColumnDecimal::compress() const if (source_size < 4096) /// A wild guess. return ColumnCompressed::wrap(this->getPtr()); - auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size, false); + auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size, force_compression); if (!compressed) return ColumnCompressed::wrap(this->getPtr()); diff --git a/src/Columns/ColumnDecimal.h b/src/Columns/ColumnDecimal.h index 690549e4a56..3e5c189b731 100644 --- a/src/Columns/ColumnDecimal.h +++ b/src/Columns/ColumnDecimal.h @@ -140,7 +140,7 @@ public: return false; } - ColumnPtr compress() const override; + ColumnPtr compress(bool force_compression) const override; void insertValue(const T value) { data.push_back(value); } Container & getData() { return data; } diff --git a/src/Columns/ColumnDynamic.cpp b/src/Columns/ColumnDynamic.cpp index a4c932eafdd..2d05701c57b 100644 --- a/src/Columns/ColumnDynamic.cpp +++ b/src/Columns/ColumnDynamic.cpp @@ -991,9 +991,9 @@ void ColumnDynamic::updatePermutation(IColumn::PermutationSortDirection directio updatePermutationImpl(limit, res, equal_ranges, 
ComparatorDescendingStable(*this, nan_direction_hint), comparator_equal, DefaultSort(), DefaultPartialSort()); } -ColumnPtr ColumnDynamic::compress() const +ColumnPtr ColumnDynamic::compress(bool force_compression) const { - ColumnPtr variant_compressed = variant_column_ptr->compress(); + ColumnPtr variant_compressed = variant_column_ptr->compress(force_compression); size_t byte_size = variant_compressed->byteSize(); return ColumnCompressed::create(size(), byte_size, [my_variant_compressed = std::move(variant_compressed), my_variant_info = variant_info, my_max_dynamic_types = max_dynamic_types, my_global_max_dynamic_types = global_max_dynamic_types, my_statistics = statistics]() mutable diff --git a/src/Columns/ColumnDynamic.h b/src/Columns/ColumnDynamic.h index bdbad99519f..093aaaf2793 100644 --- a/src/Columns/ColumnDynamic.h +++ b/src/Columns/ColumnDynamic.h @@ -335,7 +335,7 @@ public: return false; } - ColumnPtr compress() const override; + ColumnPtr compress(bool force_compression) const override; double getRatioOfDefaultRows(double sample_ratio) const override { diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp index 04e894ee5ab..f076f904768 100644 --- a/src/Columns/ColumnFixedString.cpp +++ b/src/Columns/ColumnFixedString.cpp @@ -419,7 +419,7 @@ void ColumnFixedString::getExtremes(Field & min, Field & max) const get(max_idx, max); } -ColumnPtr ColumnFixedString::compress() const +ColumnPtr ColumnFixedString::compress(bool force_compression) const { size_t source_size = chars.size(); @@ -427,7 +427,7 @@ ColumnPtr ColumnFixedString::compress() const if (source_size < 4096) /// A wild guess. 
return ColumnCompressed::wrap(this->getPtr()); - auto compressed = ColumnCompressed::compressBuffer(chars.data(), source_size, false); + auto compressed = ColumnCompressed::compressBuffer(chars.data(), source_size, force_compression); if (!compressed) return ColumnCompressed::wrap(this->getPtr()); diff --git a/src/Columns/ColumnFixedString.h b/src/Columns/ColumnFixedString.h index 8cf0a6a57da..f55fb60a976 100644 --- a/src/Columns/ColumnFixedString.h +++ b/src/Columns/ColumnFixedString.h @@ -175,7 +175,7 @@ public: ColumnPtr replicate(const Offsets & offsets) const override; - ColumnPtr compress() const override; + ColumnPtr compress(bool force_compression) const override; void reserve(size_t size) override { diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp index a5511dfeeb4..fb9c8c9fbaf 100644 --- a/src/Columns/ColumnMap.cpp +++ b/src/Columns/ColumnMap.cpp @@ -352,9 +352,9 @@ bool ColumnMap::dynamicStructureEquals(const IColumn & rhs) const return false; } -ColumnPtr ColumnMap::compress() const +ColumnPtr ColumnMap::compress(bool force_compression) const { - auto compressed = nested->compress(); + auto compressed = nested->compress(force_compression); const auto byte_size = compressed->byteSize(); /// The order of evaluation of function arguments is unspecified /// and could cause interacting with object in moved-from state diff --git a/src/Columns/ColumnMap.h b/src/Columns/ColumnMap.h index 8dfa5bb5845..31404a3e152 100644 --- a/src/Columns/ColumnMap.h +++ b/src/Columns/ColumnMap.h @@ -120,7 +120,7 @@ public: const ColumnTuple & getNestedData() const { return assert_cast(getNestedColumn().getData()); } ColumnTuple & getNestedData() { return assert_cast(getNestedColumn().getData()); } - ColumnPtr compress() const override; + ColumnPtr compress(bool force_compression) const override; bool hasDynamicStructure() const override { return nested->hasDynamicStructure(); } bool dynamicStructureEquals(const IColumn & rhs) const override; diff --git 
a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index 6e8bd3fc70c..640550fcf9a 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -773,10 +773,10 @@ void ColumnNullable::protect() getNullMapColumn().protect(); } -ColumnPtr ColumnNullable::compress() const +ColumnPtr ColumnNullable::compress(bool force_compression) const { - ColumnPtr nested_compressed = nested_column->compress(); - ColumnPtr null_map_compressed = null_map->compress(); + ColumnPtr nested_compressed = nested_column->compress(force_compression); + ColumnPtr null_map_compressed = null_map->compress(force_compression); size_t byte_size = nested_column->byteSize() + null_map->byteSize(); diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index 32ce66c5965..3a0be008cc2 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -141,7 +141,7 @@ public: // Special function for nullable minmax index void getExtremesNullLast(Field & min, Field & max) const; - ColumnPtr compress() const override; + ColumnPtr compress(bool force_compression) const override; ColumnCheckpointPtr getCheckpoint() const override; void updateCheckpoint(ColumnCheckpoint & checkpoint) const override; diff --git a/src/Columns/ColumnObject.cpp b/src/Columns/ColumnObject.cpp index dae7b4c36b9..f110e8b442b 100644 --- a/src/Columns/ColumnObject.cpp +++ b/src/Columns/ColumnObject.cpp @@ -1225,14 +1225,14 @@ bool ColumnObject::structureEquals(const IColumn & rhs) const return true; } -ColumnPtr ColumnObject::compress() const +ColumnPtr ColumnObject::compress(bool force_compression) const { std::unordered_map compressed_typed_paths; compressed_typed_paths.reserve(typed_paths.size()); size_t byte_size = 0; for (const auto & [path, column] : typed_paths) { - auto compressed_column = column->compress(); + auto compressed_column = column->compress(force_compression); byte_size += compressed_column->byteSize(); compressed_typed_paths[path] = 
std::move(compressed_column); } @@ -1241,12 +1241,12 @@ ColumnPtr ColumnObject::compress() const compressed_dynamic_paths.reserve(dynamic_paths_ptrs.size()); for (const auto & [path, column] : dynamic_paths_ptrs) { - auto compressed_column = column->compress(); + auto compressed_column = column->compress(force_compression); byte_size += compressed_column->byteSize(); compressed_dynamic_paths[path] = std::move(compressed_column); } - auto compressed_shared_data = shared_data->compress(); + auto compressed_shared_data = shared_data->compress(force_compression); byte_size += compressed_shared_data->byteSize(); auto decompress = diff --git a/src/Columns/ColumnObject.h b/src/Columns/ColumnObject.h index 7b8a381d571..3160b66cd20 100644 --- a/src/Columns/ColumnObject.h +++ b/src/Columns/ColumnObject.h @@ -171,7 +171,7 @@ public: bool structureEquals(const IColumn & rhs) const override; - ColumnPtr compress() const override; + ColumnPtr compress(bool force_compression) const override; void finalize() override; bool isFinalized() const override; diff --git a/src/Columns/ColumnSparse.cpp b/src/Columns/ColumnSparse.cpp index a0e47e65fc6..b7d82ed8a09 100644 --- a/src/Columns/ColumnSparse.cpp +++ b/src/Columns/ColumnSparse.cpp @@ -774,10 +774,10 @@ UInt64 ColumnSparse::getNumberOfDefaultRows() const return _size - offsets->size(); } -ColumnPtr ColumnSparse::compress() const +ColumnPtr ColumnSparse::compress(bool force_compression) const { - auto values_compressed = values->compress(); - auto offsets_compressed = offsets->compress(); + auto values_compressed = values->compress(force_compression); + auto offsets_compressed = offsets->compress(force_compression); size_t byte_size = values_compressed->byteSize() + offsets_compressed->byteSize(); diff --git a/src/Columns/ColumnSparse.h b/src/Columns/ColumnSparse.h index 619dce63c1e..f95752cd546 100644 --- a/src/Columns/ColumnSparse.h +++ b/src/Columns/ColumnSparse.h @@ -147,7 +147,7 @@ public: double getRatioOfDefaultRows(double 
sample_ratio) const override; UInt64 getNumberOfDefaultRows() const override; - ColumnPtr compress() const override; + ColumnPtr compress(bool force_compression) const override; ColumnCheckpointPtr getCheckpoint() const override; void updateCheckpoint(ColumnCheckpoint & checkpoint) const override; diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index 9569e9ec252..4bdc253bfc4 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -628,7 +628,7 @@ void ColumnString::getExtremes(Field & min, Field & max) const get(max_idx, max); } -ColumnPtr ColumnString::compress() const +ColumnPtr ColumnString::compress(bool force_compression) const { const size_t source_chars_size = chars.size(); const size_t source_offsets_elements = offsets.size(); @@ -638,13 +638,13 @@ ColumnPtr ColumnString::compress() const if (source_chars_size < 4096) /// A wild guess. return ColumnCompressed::wrap(this->getPtr()); - auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size, false); + auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size, force_compression); /// Return original column if not compressible. 
if (!chars_compressed) return ColumnCompressed::wrap(this->getPtr()); - auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, true); + auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, /*force_compression=*/true); const size_t chars_compressed_size = chars_compressed->size(); const size_t offsets_compressed_size = offsets_compressed->size(); diff --git a/src/Columns/ColumnString.h b/src/Columns/ColumnString.h index 062315219b5..4bf24217383 100644 --- a/src/Columns/ColumnString.h +++ b/src/Columns/ColumnString.h @@ -272,7 +272,7 @@ public: ColumnPtr replicate(const Offsets & replicate_offsets) const override; - ColumnPtr compress() const override; + ColumnPtr compress(bool force_compression) const override; void reserve(size_t n) override; size_t capacity() const override; diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index 28e5f03cc3c..9bb377f56ae 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -796,7 +796,7 @@ void ColumnTuple::takeDynamicStructureFromSourceColumns(const Columns & source_c } -ColumnPtr ColumnTuple::compress() const +ColumnPtr ColumnTuple::compress(bool force_compression) const { if (columns.empty()) { @@ -812,7 +812,7 @@ ColumnPtr ColumnTuple::compress() const compressed.reserve(columns.size()); for (const auto & column : columns) { - auto compressed_column = column->compress(); + auto compressed_column = column->compress(force_compression); byte_size += compressed_column->byteSize(); compressed.emplace_back(std::move(compressed_column)); } diff --git a/src/Columns/ColumnTuple.h b/src/Columns/ColumnTuple.h index d5eee911edc..b8b3697b84d 100644 --- a/src/Columns/ColumnTuple.h +++ b/src/Columns/ColumnTuple.h @@ -125,7 +125,7 @@ public: void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override; bool structureEquals(const IColumn & rhs) const override; bool isCollationSupported() const 
override; - ColumnPtr compress() const override; + ColumnPtr compress(bool force_compression) const override; void finalize() override; bool isFinalized() const override; diff --git a/src/Columns/ColumnVariant.cpp b/src/Columns/ColumnVariant.cpp index 2fa59b8e33c..38d3bac3c10 100644 --- a/src/Columns/ColumnVariant.cpp +++ b/src/Columns/ColumnVariant.cpp @@ -1426,16 +1426,16 @@ bool ColumnVariant::dynamicStructureEquals(const IColumn & rhs) const return true; } -ColumnPtr ColumnVariant::compress() const +ColumnPtr ColumnVariant::compress(bool force_compression) const { - ColumnPtr local_discriminators_compressed = local_discriminators->compress(); - ColumnPtr offsets_compressed = offsets->compress(); + ColumnPtr local_discriminators_compressed = local_discriminators->compress(force_compression); + ColumnPtr offsets_compressed = offsets->compress(force_compression); size_t byte_size = local_discriminators_compressed->byteSize() + offsets_compressed->byteSize(); Columns compressed; compressed.reserve(variants.size()); for (const auto & variant : variants) { - auto compressed_variant = variant->compress(); + auto compressed_variant = variant->compress(force_compression); byte_size += compressed_variant->byteSize(); compressed.emplace_back(std::move(compressed_variant)); } diff --git a/src/Columns/ColumnVariant.h b/src/Columns/ColumnVariant.h index a68a961169c..c7e37517004 100644 --- a/src/Columns/ColumnVariant.h +++ b/src/Columns/ColumnVariant.h @@ -254,7 +254,7 @@ public: void forEachSubcolumn(MutableColumnCallback callback) override; void forEachSubcolumnRecursively(RecursiveMutableColumnCallback callback) override; bool structureEquals(const IColumn & rhs) const override; - ColumnPtr compress() const override; + ColumnPtr compress(bool force_compression) const override; double getRatioOfDefaultRows(double sample_ratio) const override; UInt64 getNumberOfDefaultRows() const override; void getIndicesOfNonDefaultRows(Offsets & indices, size_t from, size_t limit) const 
override; diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index 3c7727f37c4..62f6c23c4f8 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -951,7 +951,7 @@ void ColumnVector::getExtremes(Field & min, Field & max) const } template -ColumnPtr ColumnVector::compress() const +ColumnPtr ColumnVector::compress(bool force_compression) const { const size_t data_size = data.size(); const size_t source_size = data_size * sizeof(T); @@ -960,7 +960,7 @@ ColumnPtr ColumnVector::compress() const if (source_size < 4096) /// A wild guess. return ColumnCompressed::wrap(this->getPtr()); - auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size, false); + auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size, force_compression); if (!compressed) return ColumnCompressed::wrap(this->getPtr()); diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h index 5247bfdf972..e5ece863a3b 100644 --- a/src/Columns/ColumnVector.h +++ b/src/Columns/ColumnVector.h @@ -287,7 +287,7 @@ public: ColumnPtr createWithOffsets(const IColumn::Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const override; - ColumnPtr compress() const override; + ColumnPtr compress(bool force_compression) const override; /// Replace elements that match the filter with zeroes. If inverted replaces not matched elements. void applyZeroMap(const IColumn::Filter & filt, bool inverted = false); diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 9d1b42d2bc1..e2099ac34b9 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -601,7 +601,8 @@ public: /// Compress column in memory to some representation that allows to decompress it back. /// Return itself if compression is not applicable for this column type. 
- [[nodiscard]] virtual Ptr compress() const + /// With `force_compression` set, the compressed form is kept whenever it is not larger than the original data; without it, compression must save at least half of the size to be kept. + [[nodiscard]] virtual Ptr compress([[maybe_unused]] bool force_compression) const { /// No compression by default. return getPtr(); diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index 02176a6b77a..0efb4596dcd 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -608,7 +608,7 @@ Block Block::compress() const size_t num_columns = data.size(); Columns new_columns(num_columns); for (size_t i = 0; i < num_columns; ++i) - new_columns[i] = data[i].column->compress(); + new_columns[i] = data[i].column->compress(/*force_compression=*/false); return cloneWithColumns(new_columns); } diff --git a/src/Interpreters/Cache/QueryCache.cpp b/src/Interpreters/Cache/QueryCache.cpp index 7dbee567c5b..de3d720fc35 100644 --- a/src/Interpreters/Cache/QueryCache.cpp +++ b/src/Interpreters/Cache/QueryCache.cpp @@ -469,7 +469,7 @@ void QueryCache::Writer::finalizeWrite() Columns compressed_columns; for (const auto & column : columns) { - auto compressed_column = column->compress(); + auto compressed_column = column->compress(/*force_compression=*/false); compressed_columns.push_back(compressed_column); } Chunk compressed_chunk(compressed_columns, chunk.getNumRows()); diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index b56d6d98680..fcc24a06881 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -91,8 +91,7 @@ public: { Block compressed_block; for (const auto & elem : block) - compressed_block.insert({ elem.column->compress(), elem.type, elem.name }); - + compressed_block.insert({elem.column->compress(/*force_compression=*/true), elem.type, elem.name}); new_blocks.push_back(std::move(compressed_block)); } else @@ -259,7 +258,7 @@ void StorageMemory::mutate(const MutationCommands & commands, ContextPtr context { if
((*memory_settings)[MemorySetting::compress]) for (auto & elem : block) - elem.column = elem.column->compress(); + elem.column = elem.column->compress(/*force_compression=*/true); out.push_back(block); } @@ -574,7 +573,7 @@ void StorageMemory::restoreDataImpl(const BackupPtr & backup, const String & dat { Block compressed_block; for (const auto & elem : block) - compressed_block.insert({ elem.column->compress(), elem.type, elem.name }); + compressed_block.insert({elem.column->compress(/*force_compression=*/true), elem.type, elem.name}); new_blocks.push_back(std::move(compressed_block)); } From 82da99cfd317f4087696548f5173736eb6eba60f Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 6 Dec 2024 15:34:39 +0100 Subject: [PATCH 2/4] fix --- src/Columns/ColumnCompressed.h | 2 +- src/Columns/ColumnString.cpp | 43 ++++++++++---- src/Columns/ColumnString.h | 2 + src/Columns/tests/gtest_column_string.cpp | 71 +++++++++++++++++++++++ 4 files changed, 107 insertions(+), 11 deletions(-) create mode 100644 src/Columns/tests/gtest_column_string.cpp diff --git a/src/Columns/ColumnCompressed.h b/src/Columns/ColumnCompressed.h index b030e762acd..7d7970cce8a 100644 --- a/src/Columns/ColumnCompressed.h +++ b/src/Columns/ColumnCompressed.h @@ -70,7 +70,7 @@ public: /// Helper methods for compression. - /// If data is not worth to be compressed and not 'always_compress' - returns nullptr. + /// If data is not worth to be compressed and not `force_compression` - returns nullptr. /// Note: shared_ptr is to allow to be captured by std::function. 
static std::shared_ptr> compressBuffer(const void * data, size_t data_size, bool force_compression); diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index 4bdc253bfc4..3c73a005673 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -635,26 +635,39 @@ ColumnPtr ColumnString::compress(bool force_compression) const const size_t source_offsets_size = source_offsets_elements * sizeof(Offset); /// Don't compress small blocks. - if (source_chars_size < 4096) /// A wild guess. + if (source_chars_size < min_size_to_compress) + { return ColumnCompressed::wrap(this->getPtr()); + } auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size, force_compression); /// Return original column if not compressible. if (!chars_compressed) + { return ColumnCompressed::wrap(this->getPtr()); + } auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, /*force_compression=*/true); + const bool offsets_were_compressed = !!offsets_compressed; + + /// Offsets are not compressible. Use the source data. 
+ if (!offsets_compressed) + { + offsets_compressed = std::make_shared>(source_offsets_size); + memcpy(offsets_compressed->data(), offsets.data(), source_offsets_size); + } const size_t chars_compressed_size = chars_compressed->size(); const size_t offsets_compressed_size = offsets_compressed->size(); - return ColumnCompressed::create(source_offsets_elements, chars_compressed_size + offsets_compressed_size, - [ - my_chars_compressed = std::move(chars_compressed), - my_offsets_compressed = std::move(offsets_compressed), - source_chars_size, - source_offsets_elements - ] + return ColumnCompressed::create( + source_offsets_elements, + chars_compressed_size + offsets_compressed_size, + [my_chars_compressed = std::move(chars_compressed), + my_offsets_compressed = std::move(offsets_compressed), + source_chars_size, + source_offsets_elements, + offsets_were_compressed] { auto res = ColumnString::create(); @@ -664,8 +677,18 @@ ColumnPtr ColumnString::compress(bool force_compression) const ColumnCompressed::decompressBuffer( my_chars_compressed->data(), res->getChars().data(), my_chars_compressed->size(), source_chars_size); - ColumnCompressed::decompressBuffer( - my_offsets_compressed->data(), res->getOffsets().data(), my_offsets_compressed->size(), source_offsets_elements * sizeof(Offset)); + if (offsets_were_compressed) + { + ColumnCompressed::decompressBuffer( + my_offsets_compressed->data(), + res->getOffsets().data(), + my_offsets_compressed->size(), + source_offsets_elements * sizeof(Offset)); + } + else + { + memcpy(res->getOffsets().data(), my_offsets_compressed->data(), my_offsets_compressed->size()); + } return res; }); diff --git a/src/Columns/ColumnString.h b/src/Columns/ColumnString.h index 4bf24217383..245164ca31b 100644 --- a/src/Columns/ColumnString.h +++ b/src/Columns/ColumnString.h @@ -29,6 +29,8 @@ public: using Char = UInt8; using Chars = PaddedPODArray; + static constexpr size_t min_size_to_compress = 4096; + private: friend class COWHelper, 
ColumnString>; diff --git a/src/Columns/tests/gtest_column_string.cpp b/src/Columns/tests/gtest_column_string.cpp new file mode 100644 index 00000000000..13a29616802 --- /dev/null +++ b/src/Columns/tests/gtest_column_string.cpp @@ -0,0 +1,71 @@ +#include + +#include + +#include +#include + +using namespace DB; + +static pcg64 rng(randomSeed()); + +constexpr size_t bytes_per_string = sizeof(size_t) + 1; +/// Column should have enough bytes to be compressed +constexpr size_t column_size = ColumnString::min_size_to_compress / bytes_per_string + 42; + +TEST(ColumnString, Incompressible) +{ + auto col = ColumnString::create(); + auto & chars = col->getChars(); + auto & offsets = col->getOffsets(); + chars.resize(column_size * bytes_per_string); + for (size_t i = 0; i < column_size; ++i) + { + auto value = rng(); + memcpy(&chars[i * bytes_per_string], &value, sizeof(size_t)); + chars[i * bytes_per_string + sizeof(size_t)] = '\0'; + offsets.push_back((i + 1) * bytes_per_string); + } + + auto compressed = col->compress(true); + auto decompressed = compressed->decompress(); + ASSERT_EQ(decompressed.get(), col.get()); +} + +TEST(ColumnString, CompressibleCharsAndIncompressibleOffsets) +{ + auto col = ColumnString::create(); + auto & chars = col->getChars(); + auto & offsets = col->getOffsets(); + chars.resize(column_size * bytes_per_string); + for (size_t i = 0; i < column_size; ++i) + { + static const size_t value = 42; + memcpy(&chars[i * bytes_per_string], &value, sizeof(size_t)); + chars[i * bytes_per_string + sizeof(size_t)] = '\0'; + } + offsets.push_back(chars.size()); + + auto compressed = col->compress(true); + auto decompressed = compressed->decompress(); + ASSERT_NE(decompressed.get(), col.get()); +} + +TEST(ColumnString, CompressibleCharsAndCompressibleOffsets) +{ + auto col = ColumnString::create(); + auto & chars = col->getChars(); + auto & offsets = col->getOffsets(); + chars.resize(column_size * bytes_per_string); + for (size_t i = 0; i < column_size; ++i) + 
{ + static const size_t value = 42; + memcpy(&chars[i * bytes_per_string], &value, sizeof(size_t)); + chars[i * bytes_per_string + sizeof(size_t)] = '\0'; + offsets.push_back((i + 1) * bytes_per_string); + } + + auto compressed = col->compress(true); + auto decompressed = compressed->decompress(); + ASSERT_NE(decompressed.get(), col.get()); +} From 5a9062d1200c9f9c3bae5c9601f4ab4805be59b8 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 6 Dec 2024 16:21:07 +0100 Subject: [PATCH 3/4] better --- src/Columns/tests/gtest_column_string.cpp | 37 +++++++++++++++++------ 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/src/Columns/tests/gtest_column_string.cpp b/src/Columns/tests/gtest_column_string.cpp index 13a29616802..4a0de2b5515 100644 --- a/src/Columns/tests/gtest_column_string.cpp +++ b/src/Columns/tests/gtest_column_string.cpp @@ -9,7 +9,7 @@ using namespace DB; static pcg64 rng(randomSeed()); -constexpr size_t bytes_per_string = sizeof(size_t) + 1; +constexpr size_t bytes_per_string = sizeof(uint64_t) + 1; /// Column should have enough bytes to be compressed constexpr size_t column_size = ColumnString::min_size_to_compress / bytes_per_string + 42; @@ -21,15 +21,20 @@ TEST(ColumnString, Incompressible) chars.resize(column_size * bytes_per_string); for (size_t i = 0; i < column_size; ++i) { - auto value = rng(); - memcpy(&chars[i * bytes_per_string], &value, sizeof(size_t)); - chars[i * bytes_per_string + sizeof(size_t)] = '\0'; + const uint64_t value = rng(); + memcpy(&chars[i * bytes_per_string], &value, sizeof(uint64_t)); + chars[i * bytes_per_string + sizeof(uint64_t)] = '\0'; offsets.push_back((i + 1) * bytes_per_string); } auto compressed = col->compress(true); auto decompressed = compressed->decompress(); + // When the column is incompressible, the original column is returned wrapped in `ColumnCompressed` ASSERT_EQ(decompressed.get(), col.get()); + ASSERT_EQ(compressed->size(), col->size()); + ASSERT_EQ(compressed->allocatedBytes(),
col->allocatedBytes()); + ASSERT_EQ(decompressed->size(), col->size()); + ASSERT_EQ(decompressed->allocatedBytes(), col->allocatedBytes()); } TEST(ColumnString, CompressibleCharsAndIncompressibleOffsets) @@ -40,15 +45,21 @@ TEST(ColumnString, CompressibleCharsAndIncompressibleOffsets) chars.resize(column_size * bytes_per_string); for (size_t i = 0; i < column_size; ++i) { - static const size_t value = 42; - memcpy(&chars[i * bytes_per_string], &value, sizeof(size_t)); - chars[i * bytes_per_string + sizeof(size_t)] = '\0'; + static const uint64_t value = 42; + memcpy(&chars[i * bytes_per_string], &value, sizeof(uint64_t)); + chars[i * bytes_per_string + sizeof(uint64_t)] = '\0'; } offsets.push_back(chars.size()); auto compressed = col->compress(true); auto decompressed = compressed->decompress(); + // For actually compressed column only compressed `chars` and `offsets` arrays are stored. + // Upon decompression, a new column is created. ASSERT_NE(decompressed.get(), col.get()); + ASSERT_EQ(compressed->size(), col->size()); + ASSERT_LE(compressed->allocatedBytes(), col->allocatedBytes()); + ASSERT_EQ(decompressed->size(), col->size()); + ASSERT_LE(decompressed->allocatedBytes(), col->allocatedBytes()); } TEST(ColumnString, CompressibleCharsAndCompressibleOffsets) @@ -59,13 +70,19 @@ TEST(ColumnString, CompressibleCharsAndCompressibleOffsets) chars.resize(column_size * bytes_per_string); for (size_t i = 0; i < column_size; ++i) { - static const size_t value = 42; - memcpy(&chars[i * bytes_per_string], &value, sizeof(size_t)); - chars[i * bytes_per_string + sizeof(size_t)] = '\0'; + static const uint64_t value = 42; + memcpy(&chars[i * bytes_per_string], &value, sizeof(uint64_t)); + chars[i * bytes_per_string + sizeof(uint64_t)] = '\0'; offsets.push_back((i + 1) * bytes_per_string); } auto compressed = col->compress(true); auto decompressed = compressed->decompress(); + // For actually compressed column only compressed `chars` and `offsets` arrays are stored. 
+ // Upon decompression, a new column is created. ASSERT_NE(decompressed.get(), col.get()); + ASSERT_EQ(compressed->size(), col->size()); + ASSERT_LE(compressed->allocatedBytes(), col->allocatedBytes()); + ASSERT_EQ(decompressed->size(), col->size()); + ASSERT_LE(decompressed->allocatedBytes(), col->allocatedBytes()); } From 646d44e30ddcfc7d4193854c765a93b2853aa887 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 9 Dec 2024 15:14:58 +0100 Subject: [PATCH 4/4] fix comment --- src/Columns/ColumnCompressed.h | 4 +++- src/Columns/ColumnString.cpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Columns/ColumnCompressed.h b/src/Columns/ColumnCompressed.h index 7d7970cce8a..48f3717291b 100644 --- a/src/Columns/ColumnCompressed.h +++ b/src/Columns/ColumnCompressed.h @@ -70,7 +70,9 @@ public: /// Helper methods for compression. - /// If data is not worth to be compressed and not `force_compression` - returns nullptr. + /// If data is not worth to be compressed - returns nullptr. + /// By default it requires that compressed data is at least 50% smaller than original. + /// With `force_compression` set to true, it requires compressed data to be not larger than the source data. /// Note: shared_ptr is to allow to be captured by std::function. 
static std::shared_ptr> compressBuffer(const void * data, size_t data_size, bool force_compression); diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index 3c73a005673..4785f0ce28d 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -648,7 +648,7 @@ ColumnPtr ColumnString::compress(bool force_compression) const return ColumnCompressed::wrap(this->getPtr()); } - auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, /*force_compression=*/true); + auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, force_compression); const bool offsets_were_compressed = !!offsets_compressed; /// Offsets are not compressible. Use the source data.