mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 16:50:48 +00:00
Merge pull request #56226 from HarryLeeIBM/hlee-ftsearch-compress
Add compression of posting lists for inverted index
This commit is contained in:
commit
99dd3ed060
@ -1,6 +1,7 @@
|
||||
#include <Storages/MergeTree/GinIndexStore.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <Common/FST.h>
|
||||
#include <Compression/CompressionFactory.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
@ -81,14 +82,22 @@ UInt64 GinIndexPostingsBuilder::serialize(WriteBuffer & buffer)
|
||||
{
|
||||
rowid_bitmap.runOptimize();
|
||||
auto size = rowid_bitmap.getSizeInBytes();
|
||||
auto buf = std::make_unique<char[]>(size);
|
||||
rowid_bitmap.write(buf.get());
|
||||
|
||||
auto codec = CompressionCodecFactory::instance().get(GIN_COMPRESSION_CODEC, GIN_COMPRESSION_LEVEL);
|
||||
Memory<> memory;
|
||||
memory.resize(codec->getCompressedReserveSize(static_cast<UInt32>(size)));
|
||||
auto compressed_size = codec->compress(buf.get(), static_cast<UInt32>(size), memory.data());
|
||||
|
||||
writeVarUInt(size, buffer);
|
||||
written_bytes += getLengthOfVarUInt(size);
|
||||
|
||||
auto buf = std::make_unique<char[]>(size);
|
||||
rowid_bitmap.write(buf.get());
|
||||
buffer.write(buf.get(), size);
|
||||
written_bytes += size;
|
||||
writeVarUInt(compressed_size, buffer);
|
||||
written_bytes += getLengthOfVarUInt(compressed_size);
|
||||
|
||||
buffer.write(memory.data(), compressed_size);
|
||||
written_bytes += compressed_size;
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -110,11 +119,18 @@ GinIndexPostingsListPtr GinIndexPostingsBuilder::deserialize(ReadBuffer & buffer
|
||||
if (postings_list_size == USES_BIT_MAP)
|
||||
{
|
||||
size_t size = 0;
|
||||
size_t compressed_size = 0;
|
||||
readVarUInt(size, buffer);
|
||||
auto buf = std::make_unique<char[]>(size);
|
||||
buffer.readStrict(reinterpret_cast<char *>(buf.get()), size);
|
||||
readVarUInt(compressed_size, buffer);
|
||||
auto buf = std::make_unique<char[]>(compressed_size);
|
||||
buffer.readStrict(reinterpret_cast<char *>(buf.get()), compressed_size);
|
||||
|
||||
GinIndexPostingsListPtr postings_list = std::make_shared<GinIndexPostingsList>(GinIndexPostingsList::read(buf.get()));
|
||||
Memory<> memory;
|
||||
memory.resize(size);
|
||||
auto codec = CompressionCodecFactory::instance().get(GIN_COMPRESSION_CODEC, GIN_COMPRESSION_LEVEL);
|
||||
codec->decompress(buf.get(), static_cast<UInt32>(compressed_size), memory.data());
|
||||
|
||||
GinIndexPostingsListPtr postings_list = std::make_shared<GinIndexPostingsList>(GinIndexPostingsList::read(memory.data()));
|
||||
|
||||
return postings_list;
|
||||
}
|
||||
|
@ -61,6 +61,9 @@ public:
|
||||
private:
|
||||
constexpr static int MIN_SIZE_FOR_ROARING_ENCODING = 16;
|
||||
|
||||
static constexpr auto GIN_COMPRESSION_CODEC = "ZSTD";
|
||||
static constexpr auto GIN_COMPRESSION_LEVEL = 1;
|
||||
|
||||
/// When the list length is no greater than MIN_SIZE_FOR_ROARING_ENCODING, array 'rowid_lst' is used
|
||||
/// As a special case, rowid_lst[0] == CONTAINS_ALL encodes that all rowids are set.
|
||||
std::array<UInt32, MIN_SIZE_FOR_ROARING_ENCODING> rowid_lst;
|
||||
@ -211,7 +214,7 @@ private:
|
||||
v1 = 1, /// Initial version
|
||||
};
|
||||
|
||||
static constexpr auto CURRENT_GIN_FILE_FORMAT_VERSION = Format::v0;
|
||||
static constexpr auto CURRENT_GIN_FILE_FORMAT_VERSION = Format::v1;
|
||||
};
|
||||
|
||||
using GinIndexStorePtr = std::shared_ptr<GinIndexStore>;
|
||||
|
Loading…
Reference in New Issue
Block a user