mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-18 12:22:12 +00:00
123 lines
4.2 KiB
C++
123 lines
4.2 KiB
C++
#pragma once
|
|
|
|
#include <tuple>
|
|
|
|
#include <IO/WriteHelpers.h>
|
|
#include <base/types.h>
|
|
#include <Common/PODArray.h>
|
|
|
|
|
|
namespace DB
|
|
{
|
|
|
|
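/// Workaround for a clang bug: the defaulted three-way comparison operator below
/// triggers -Wzero-as-null-pointer-constant, so that warning is disabled around the struct.
/// https://github.com/llvm/llvm-project/issues/55919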
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant"
|
|
|
|
/** Mark is the position in the compressed file. The compressed file consists of adjacent compressed blocks.
|
|
* Mark is a tuple - the offset in the file to the start of the compressed block, the offset in the decompressed block to the start of the data.
|
|
*/
|
|
struct MarkInCompressedFile
|
|
{
|
|
size_t offset_in_compressed_file;
|
|
size_t offset_in_decompressed_block;
|
|
|
|
auto operator<=>(const MarkInCompressedFile &) const = default;
|
|
|
|
auto asTuple() const { return std::make_tuple(offset_in_compressed_file, offset_in_decompressed_block); }
|
|
|
|
String toString() const
|
|
{
|
|
return "(" + DB::toString(offset_in_compressed_file) + "," + DB::toString(offset_in_decompressed_block) + ")";
|
|
}
|
|
|
|
String toStringWithRows(size_t rows_num) const
|
|
{
|
|
return "(" + DB::toString(offset_in_compressed_file) + "," + DB::toString(offset_in_decompressed_block) + ","
|
|
+ DB::toString(rows_num) + ")";
|
|
}
|
|
};
|
|
|
|
#pragma clang diagnostic pop
|
|
|
|
/**
|
|
* In-memory representation of an array of marks.
|
|
*
|
|
* Uses an ad-hoc compression scheme that decreases memory usage while allowing
|
|
* random access in O(1) time.
|
|
* This is independent from the marks *file* format, which may be uncompressed
|
|
* or use a different compression method.
|
|
*
|
|
* Typical memory usage:
|
|
* * ~3 bytes/mark for integer columns
|
|
* * ~5 bytes/mark for string columns
|
|
* * ~0.3 bytes/mark for trivial marks in auxiliary dict files of LowCardinality columns
|
|
*/
|
|
class MarksInCompressedFile
|
|
{
|
|
public:
|
|
using PlainArray = PODArray<MarkInCompressedFile>;
|
|
|
|
explicit MarksInCompressedFile(const PlainArray & marks);
|
|
|
|
MarkInCompressedFile get(size_t idx) const;
|
|
|
|
size_t approximateMemoryUsage() const;
|
|
|
|
private:
|
|
/** Throughout this class:
|
|
* * "x" stands for offset_in_compressed_file,
|
|
* * "y" stands for offset_in_decompressed_block.
|
|
*/
|
|
|
|
/** We need to store a sequence of marks, each consisting of two 64-bit integers:
|
|
* offset_in_compressed_file and offset_in_decompressed_block. We'll call them x and y for
|
|
* convenience, since compression doesn't care what they mean. The compression exploits the
|
|
* following regularities:
|
|
* * y is usually zero.
|
|
* * x usually increases steadily.
|
|
* * Differences between x values in nearby marks usually fit in much fewer than 64 bits.
|
|
*
|
|
* We split the sequence of marks into blocks, each containing MARKS_PER_BLOCK marks.
|
|
* (Not to be confused with data blocks.)
|
|
* For each mark, we store the difference [value] - [min value in the block], for each of the
|
|
* two values in the mark. Each block specifies the number of bits to use for these differences
|
|
* for all marks in this block.
|
|
* The smaller the blocks the fewer bits are required, but the bigger the relative overhead of
|
|
* block headers.
|
|
*
|
|
* Packed marks and block headers all live in one contiguous array.
|
|
*/
|
|
|
|
struct BlockInfo
|
|
{
|
|
// Min offset_in_compressed_file and offset_in_decompressed_block, correspondingly.
|
|
size_t min_x = UINT64_MAX;
|
|
size_t min_y = UINT64_MAX;
|
|
|
|
// Place in `packed` where this block start.
|
|
size_t bit_offset_in_packed_array;
|
|
|
|
// How many bits each mark takes. These numbers are bit-packed in the `packed` array.
|
|
// Can be zero. (Especially for y, which is typically all zeroes.)
|
|
UInt8 bits_for_x;
|
|
UInt8 bits_for_y;
|
|
// The `y` values should be <<'ed by this amount.
|
|
// Useful for integer columns when marks granularity is a power of 2; in this case all
|
|
// offset_in_decompressed_block values are divisible by 2^15 or so.
|
|
UInt8 trailing_zero_bits_in_y = 63;
|
|
};
|
|
|
|
static constexpr size_t MARKS_PER_BLOCK = 256;
|
|
|
|
size_t num_marks;
|
|
PODArray<BlockInfo> blocks;
|
|
PODArray<UInt64> packed;
|
|
|
|
// Mark idx -> {block info, bit offset in `packed`}.
|
|
std::tuple<const BlockInfo *, size_t> lookUpMark(size_t idx) const;
|
|
};
|
|
|
|
}
|