This commit is contained in:
Nikita Taranov 2024-12-06 15:34:39 +01:00
parent f2c328aa7a
commit 82da99cfd3
4 changed files with 107 additions and 11 deletions

View File

@ -70,7 +70,7 @@ public:
/// Helper methods for compression.
/// If data is not worth to be compressed and not 'always_compress' - returns nullptr.
/// If the data is not worth compressing and `force_compression` is not set, returns nullptr.
/// Note: shared_ptr is to allow to be captured by std::function.
static std::shared_ptr<Memory<>> compressBuffer(const void * data, size_t data_size, bool force_compression);

View File

@ -635,26 +635,39 @@ ColumnPtr ColumnString::compress(bool force_compression) const
const size_t source_offsets_size = source_offsets_elements * sizeof(Offset);
/// Don't compress small blocks.
if (source_chars_size < 4096) /// A wild guess.
if (source_chars_size < min_size_to_compress)
{
return ColumnCompressed::wrap(this->getPtr());
}
auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size, force_compression);
/// Return original column if not compressible.
if (!chars_compressed)
{
return ColumnCompressed::wrap(this->getPtr());
}
auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, /*force_compression=*/true);
const bool offsets_were_compressed = !!offsets_compressed;
/// Offsets are not compressible. Use the source data.
if (!offsets_compressed)
{
offsets_compressed = std::make_shared<Memory<>>(source_offsets_size);
memcpy(offsets_compressed->data(), offsets.data(), source_offsets_size);
}
const size_t chars_compressed_size = chars_compressed->size();
const size_t offsets_compressed_size = offsets_compressed->size();
return ColumnCompressed::create(source_offsets_elements, chars_compressed_size + offsets_compressed_size,
[
my_chars_compressed = std::move(chars_compressed),
my_offsets_compressed = std::move(offsets_compressed),
source_chars_size,
source_offsets_elements
]
return ColumnCompressed::create(
source_offsets_elements,
chars_compressed_size + offsets_compressed_size,
[my_chars_compressed = std::move(chars_compressed),
my_offsets_compressed = std::move(offsets_compressed),
source_chars_size,
source_offsets_elements,
offsets_were_compressed]
{
auto res = ColumnString::create();
@ -664,8 +677,18 @@ ColumnPtr ColumnString::compress(bool force_compression) const
ColumnCompressed::decompressBuffer(
my_chars_compressed->data(), res->getChars().data(), my_chars_compressed->size(), source_chars_size);
ColumnCompressed::decompressBuffer(
my_offsets_compressed->data(), res->getOffsets().data(), my_offsets_compressed->size(), source_offsets_elements * sizeof(Offset));
if (offsets_were_compressed)
{
ColumnCompressed::decompressBuffer(
my_offsets_compressed->data(),
res->getOffsets().data(),
my_offsets_compressed->size(),
source_offsets_elements * sizeof(Offset));
}
else
{
memcpy(res->getOffsets().data(), my_offsets_compressed->data(), my_offsets_compressed->size());
}
return res;
});

View File

@ -29,6 +29,8 @@ public:
using Char = UInt8;
using Chars = PaddedPODArray<UInt8>;
static constexpr size_t min_size_to_compress = 4096;
private:
friend class COWHelper<IColumnHelper<ColumnString>, ColumnString>;

View File

@ -0,0 +1,71 @@
#include <gtest/gtest.h>
#include <Columns/ColumnString.h>
#include <Common/randomSeed.h>
#include <Common/thread_local_rng.h>
using namespace DB;
/// Generator used by the Incompressible test to produce random string payloads; seeded from randomSeed().
static pcg64 rng(randomSeed());
/// Every test string takes sizeof(size_t) payload bytes plus one terminating zero byte.
constexpr size_t bytes_per_string = sizeof(size_t) + 1;
/// Number of strings written into each test column. Chosen so that the total chars size
/// (column_size * bytes_per_string) exceeds ColumnString::min_size_to_compress,
/// i.e. the column has enough bytes to be considered for compression.
constexpr size_t column_size = ColumnString::min_size_to_compress / bytes_per_string + 42;
/// Chars are filled with random bytes, which are expected to be incompressible.
/// In that case compress() is expected to return the original column wrapped
/// into ColumnCompressed, so decompress() must hand back the very same object.
TEST(ColumnString, Incompressible)
{
    auto col = ColumnString::create();
    auto & chars = col->getChars();
    auto & offsets = col->getOffsets();
    chars.resize(column_size * bytes_per_string);
    for (size_t i = 0; i < column_size; ++i)
    {
        /// One string = sizeof(size_t) random bytes followed by the terminating zero.
        auto value = rng();
        memcpy(&chars[i * bytes_per_string], &value, sizeof(size_t));
        chars[i * bytes_per_string + sizeof(size_t)] = '\0';
        offsets.push_back((i + 1) * bytes_per_string);
    }

    auto compressed = col->compress(true);
    auto decompressed = compressed->decompress();

    /// Nothing was re-encoded: the decompressed column is the original object.
    ASSERT_EQ(decompressed.get(), col.get());
    /// The wrapper must still report the original number of rows.
    ASSERT_EQ(compressed->size(), col->size());
}
/// Chars consist of one repeated pattern (compressible), while the column holds a
/// single offset entry, which is expected to be too small to compress.
/// Exercises the path where offsets are stored as a raw copy and restored with memcpy.
TEST(ColumnString, CompressibleCharsAndIncompressibleOffsets)
{
    auto col = ColumnString::create();
    auto & chars = col->getChars();
    auto & offsets = col->getOffsets();
    chars.resize(column_size * bytes_per_string);
    for (size_t i = 0; i < column_size; ++i)
    {
        /// The same constant payload in every slot makes the chars highly repetitive.
        static const size_t value = 42;
        memcpy(&chars[i * bytes_per_string], &value, sizeof(size_t));
        chars[i * bytes_per_string + sizeof(size_t)] = '\0';
    }
    /// A single offset covering all chars: the column contains exactly one (long) string.
    offsets.push_back(chars.size());

    auto compressed = col->compress(true);
    auto decompressed = compressed->decompress();

    /// Chars were really compressed, so decompression builds a new column object.
    ASSERT_NE(decompressed.get(), col.get());
    /// Both the compressed wrapper and the restored column must keep the row count.
    ASSERT_EQ(compressed->size(), col->size());
    ASSERT_EQ(decompressed->size(), col->size());
}
/// Chars consist of one repeated pattern and offsets grow by a constant step,
/// so both arrays are expected to be compressible.
/// Exercises the path where chars and offsets are both compressed and decompressed.
TEST(ColumnString, CompressibleCharsAndCompressibleOffsets)
{
    auto col = ColumnString::create();
    auto & chars = col->getChars();
    auto & offsets = col->getOffsets();
    chars.resize(column_size * bytes_per_string);
    for (size_t i = 0; i < column_size; ++i)
    {
        /// One string = the constant payload followed by the terminating zero.
        static const size_t value = 42;
        memcpy(&chars[i * bytes_per_string], &value, sizeof(size_t));
        chars[i * bytes_per_string + sizeof(size_t)] = '\0';
        offsets.push_back((i + 1) * bytes_per_string);
    }

    auto compressed = col->compress(true);
    auto decompressed = compressed->decompress();

    /// Data was really compressed, so decompression builds a new column object.
    ASSERT_NE(decompressed.get(), col.get());
    /// Both the compressed wrapper and the restored column must keep the row count.
    ASSERT_EQ(compressed->size(), col->size());
    ASSERT_EQ(decompressed->size(), col->size());
}