Merge pull request #56226 from HarryLeeIBM/hlee-ftsearch-compress

Add compression of posting lists for inverted index
This commit is contained in:
Robert Schulze 2023-11-02 15:56:30 +01:00 committed by GitHub
commit 99dd3ed060
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 27 additions and 8 deletions

View File

@@ -1,6 +1,7 @@
#include <Storages/MergeTree/GinIndexStore.h>
#include <Columns/ColumnString.h>
#include <Common/FST.h>
#include <Compression/CompressionFactory.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
@@ -81,14 +82,22 @@ UInt64 GinIndexPostingsBuilder::serialize(WriteBuffer & buffer)
{
rowid_bitmap.runOptimize();
auto size = rowid_bitmap.getSizeInBytes();
auto buf = std::make_unique<char[]>(size);
rowid_bitmap.write(buf.get());
auto codec = CompressionCodecFactory::instance().get(GIN_COMPRESSION_CODEC, GIN_COMPRESSION_LEVEL);
Memory<> memory;
memory.resize(codec->getCompressedReserveSize(static_cast<UInt32>(size)));
auto compressed_size = codec->compress(buf.get(), static_cast<UInt32>(size), memory.data());
writeVarUInt(size, buffer);
written_bytes += getLengthOfVarUInt(size);
auto buf = std::make_unique<char[]>(size);
rowid_bitmap.write(buf.get());
buffer.write(buf.get(), size);
written_bytes += size;
writeVarUInt(compressed_size, buffer);
written_bytes += getLengthOfVarUInt(compressed_size);
buffer.write(memory.data(), compressed_size);
written_bytes += compressed_size;
}
else
{
@@ -110,11 +119,18 @@ GinIndexPostingsListPtr GinIndexPostingsBuilder::deserialize(ReadBuffer & buffer
if (postings_list_size == USES_BIT_MAP)
{
size_t size = 0;
size_t compressed_size = 0;
readVarUInt(size, buffer);
auto buf = std::make_unique<char[]>(size);
buffer.readStrict(reinterpret_cast<char *>(buf.get()), size);
readVarUInt(compressed_size, buffer);
auto buf = std::make_unique<char[]>(compressed_size);
buffer.readStrict(reinterpret_cast<char *>(buf.get()), compressed_size);
GinIndexPostingsListPtr postings_list = std::make_shared<GinIndexPostingsList>(GinIndexPostingsList::read(buf.get()));
Memory<> memory;
memory.resize(size);
auto codec = CompressionCodecFactory::instance().get(GIN_COMPRESSION_CODEC, GIN_COMPRESSION_LEVEL);
codec->decompress(buf.get(), static_cast<UInt32>(compressed_size), memory.data());
GinIndexPostingsListPtr postings_list = std::make_shared<GinIndexPostingsList>(GinIndexPostingsList::read(memory.data()));
return postings_list;
}

View File

@@ -61,6 +61,9 @@ public:
private:
constexpr static int MIN_SIZE_FOR_ROARING_ENCODING = 16;
static constexpr auto GIN_COMPRESSION_CODEC = "ZSTD";
static constexpr auto GIN_COMPRESSION_LEVEL = 1;
/// When the list length is no greater than MIN_SIZE_FOR_ROARING_ENCODING, array 'rowid_lst' is used
/// As a special case, rowid_lst[0] == CONTAINS_ALL encodes that all rowids are set.
std::array<UInt32, MIN_SIZE_FOR_ROARING_ENCODING> rowid_lst;
@@ -211,7 +214,7 @@ private:
v1 = 1, /// Initial version
};
static constexpr auto CURRENT_GIN_FILE_FORMAT_VERSION = Format::v0;
static constexpr auto CURRENT_GIN_FILE_FORMAT_VERSION = Format::v1;
};
using GinIndexStorePtr = std::shared_ptr<GinIndexStore>;