From 82da99cfd317f4087696548f5173736eb6eba60f Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 6 Dec 2024 15:34:39 +0100 Subject: [PATCH] fix --- src/Columns/ColumnCompressed.h | 2 +- src/Columns/ColumnString.cpp | 43 ++++++++++---- src/Columns/ColumnString.h | 2 + src/Columns/tests/gtest_column_string.cpp | 71 +++++++++++++++++++++++ 4 files changed, 107 insertions(+), 11 deletions(-) create mode 100644 src/Columns/tests/gtest_column_string.cpp diff --git a/src/Columns/ColumnCompressed.h b/src/Columns/ColumnCompressed.h index b030e762acd..7d7970cce8a 100644 --- a/src/Columns/ColumnCompressed.h +++ b/src/Columns/ColumnCompressed.h @@ -70,7 +70,7 @@ public: /// Helper methods for compression. - /// If data is not worth to be compressed and not 'always_compress' - returns nullptr. + /// If data is not worth to be compressed and not `force_compression` - returns nullptr. /// Note: shared_ptr is to allow to be captured by std::function. static std::shared_ptr> compressBuffer(const void * data, size_t data_size, bool force_compression); diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index 4bdc253bfc4..3c73a005673 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -635,26 +635,39 @@ ColumnPtr ColumnString::compress(bool force_compression) const const size_t source_offsets_size = source_offsets_elements * sizeof(Offset); /// Don't compress small blocks. - if (source_chars_size < 4096) /// A wild guess. + if (source_chars_size < min_size_to_compress) + { return ColumnCompressed::wrap(this->getPtr()); + } auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size, force_compression); /// Return original column if not compressible. if (!chars_compressed) + { return ColumnCompressed::wrap(this->getPtr()); + } auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, /*force_compression=*/true); + const bool offsets_were_compressed = !!offsets_compressed; + + /// Offsets are not compressible. Use the source data. + if (!offsets_compressed) + { + offsets_compressed = std::make_shared>(source_offsets_size); + memcpy(offsets_compressed->data(), offsets.data(), source_offsets_size); + } const size_t chars_compressed_size = chars_compressed->size(); const size_t offsets_compressed_size = offsets_compressed->size(); - return ColumnCompressed::create(source_offsets_elements, chars_compressed_size + offsets_compressed_size, - [ - my_chars_compressed = std::move(chars_compressed), - my_offsets_compressed = std::move(offsets_compressed), - source_chars_size, - source_offsets_elements - ] + return ColumnCompressed::create( + source_offsets_elements, + chars_compressed_size + offsets_compressed_size, + [my_chars_compressed = std::move(chars_compressed), + my_offsets_compressed = std::move(offsets_compressed), + source_chars_size, + source_offsets_elements, + offsets_were_compressed] { auto res = ColumnString::create(); @@ -664,8 +677,18 @@ ColumnPtr ColumnString::compress(bool force_compression) const ColumnCompressed::decompressBuffer( my_chars_compressed->data(), res->getChars().data(), my_chars_compressed->size(), source_chars_size); - ColumnCompressed::decompressBuffer( - my_offsets_compressed->data(), res->getOffsets().data(), my_offsets_compressed->size(), source_offsets_elements * sizeof(Offset)); + if (offsets_were_compressed) + { + ColumnCompressed::decompressBuffer( + my_offsets_compressed->data(), + res->getOffsets().data(), + my_offsets_compressed->size(), + source_offsets_elements * sizeof(Offset)); + } + else + { + memcpy(res->getOffsets().data(), my_offsets_compressed->data(), my_offsets_compressed->size()); + } return res; }); diff --git a/src/Columns/ColumnString.h b/src/Columns/ColumnString.h index 4bf24217383..245164ca31b 100644 --- a/src/Columns/ColumnString.h +++ b/src/Columns/ColumnString.h @@ -29,6 +29,8 @@ public: using Char = UInt8; using Chars = PaddedPODArray; + static constexpr size_t min_size_to_compress = 4096; + private: friend class COWHelper, ColumnString>; diff --git a/src/Columns/tests/gtest_column_string.cpp b/src/Columns/tests/gtest_column_string.cpp new file mode 100644 index 00000000000..13a29616802 --- /dev/null +++ b/src/Columns/tests/gtest_column_string.cpp @@ -0,0 +1,71 @@ +#include + +#include + +#include +#include + +using namespace DB; + +static pcg64 rng(randomSeed()); + +constexpr size_t bytes_per_string = sizeof(size_t) + 1; +/// Column should have enough bytes to be compressed +constexpr size_t column_size = ColumnString::min_size_to_compress / bytes_per_string + 42; + +TEST(ColumnString, Incompressible) +{ + auto col = ColumnString::create(); + auto & chars = col->getChars(); + auto & offsets = col->getOffsets(); + chars.resize(column_size * bytes_per_string); + for (size_t i = 0; i < column_size; ++i) + { + auto value = rng(); + memcpy(&chars[i * bytes_per_string], &value, sizeof(size_t)); + chars[i * bytes_per_string + sizeof(size_t)] = '\0'; + offsets.push_back((i + 1) * bytes_per_string); + } + + auto compressed = col->compress(true); + auto decompressed = compressed->decompress(); + ASSERT_EQ(decompressed.get(), col.get()); +} + +TEST(ColumnString, CompressibleCharsAndIncompressibleOffsets) +{ + auto col = ColumnString::create(); + auto & chars = col->getChars(); + auto & offsets = col->getOffsets(); + chars.resize(column_size * bytes_per_string); + for (size_t i = 0; i < column_size; ++i) + { + static const size_t value = 42; + memcpy(&chars[i * bytes_per_string], &value, sizeof(size_t)); + chars[i * bytes_per_string + sizeof(size_t)] = '\0'; + } + offsets.push_back(chars.size()); + + auto compressed = col->compress(true); + auto decompressed = compressed->decompress(); + ASSERT_NE(decompressed.get(), col.get()); +} + +TEST(ColumnString, CompressibleCharsAndCompressibleOffsets) +{ + auto col = ColumnString::create(); + auto & chars = col->getChars(); + auto & offsets = col->getOffsets(); + chars.resize(column_size * bytes_per_string); + for (size_t i = 0; i < column_size; ++i) + { + static const size_t value = 42; + memcpy(&chars[i * bytes_per_string], &value, sizeof(size_t)); + chars[i * bytes_per_string + sizeof(size_t)] = '\0'; + offsets.push_back((i + 1) * bytes_per_string); + } + + auto compressed = col->compress(true); + auto decompressed = compressed->decompress(); + ASSERT_NE(decompressed.get(), col.get()); +}