2015-10-23 22:43:06 +00:00
|
|
|
#pragma once
|
|
|
|
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <IO/ReadBuffer.h>
|
|
|
|
#include <IO/WriteBuffer.h>
|
|
|
|
#include <IO/ReadHelpers.h>
|
|
|
|
#include <IO/WriteHelpers.h>
|
|
|
|
#include <Core/Defines.h>
|
2015-10-23 22:43:06 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2016-01-11 21:46:36 +00:00
|
|
|
namespace ErrorCodes
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
extern const int NO_AVAILABLE_DATA;
|
2016-01-11 21:46:36 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/** Compact array for storing values whose size, `content_width` bits, is
  * less than one byte. Instead of storing each value in a separate
  * byte, which wastes 37.5% of the space for content_width = 5, CompactArray packs
  * adjacent `content_width`-bit values into a byte array; in effect, CompactArray
  * simulates an array of `content_width`-bit values.
  */
|
|
|
|
template <typename BucketIndex, UInt8 content_width, size_t bucket_count>
|
2019-01-04 13:32:08 +00:00
|
|
|
class CompactArray final
|
2015-10-23 22:43:06 +00:00
|
|
|
{
|
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
class Reader;
|
|
|
|
class Locus;
|
2015-10-23 22:43:06 +00:00
|
|
|
|
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
CompactArray() = default;
|
|
|
|
|
|
|
|
UInt8 ALWAYS_INLINE operator[](BucketIndex bucket_index) const
|
|
|
|
{
|
|
|
|
Locus locus(bucket_index);
|
|
|
|
|
|
|
|
if (locus.index_l == locus.index_r)
|
|
|
|
return locus.read(bitset[locus.index_l]);
|
|
|
|
else
|
|
|
|
return locus.read(bitset[locus.index_l], bitset[locus.index_r]);
|
|
|
|
}
|
|
|
|
|
|
|
|
Locus ALWAYS_INLINE operator[](BucketIndex bucket_index)
|
|
|
|
{
|
|
|
|
Locus locus(bucket_index);
|
|
|
|
|
|
|
|
locus.content_l = &bitset[locus.index_l];
|
|
|
|
|
|
|
|
if (locus.index_l == locus.index_r)
|
|
|
|
locus.content_r = locus.content_l;
|
|
|
|
else
|
|
|
|
locus.content_r = &bitset[locus.index_r];
|
|
|
|
|
|
|
|
return locus;
|
|
|
|
}
|
|
|
|
|
2019-03-07 16:26:22 +00:00
|
|
|
/// Used only in arcadia/metrika
|
|
|
|
void readText(ReadBuffer & in)
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < BITSET_SIZE; ++i)
|
|
|
|
{
|
|
|
|
if (i != 0)
|
|
|
|
assertChar(',', in);
|
|
|
|
readIntText(bitset[i], in);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Used only in arcadia/metrika
|
|
|
|
void writeText(WriteBuffer & out) const
|
|
|
|
{
|
|
|
|
for (size_t i = 0; i < BITSET_SIZE; ++i)
|
|
|
|
{
|
|
|
|
if (i != 0)
|
|
|
|
writeCString(",", out);
|
|
|
|
writeIntText(bitset[i], out);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-23 22:43:06 +00:00
|
|
|
private:
|
2017-05-07 20:25:26 +00:00
|
|
|
/// number of bytes in bitset
|
2017-04-01 07:20:54 +00:00
|
|
|
static constexpr size_t BITSET_SIZE = (static_cast<size_t>(bucket_count) * content_width + 7) / 8;
|
|
|
|
UInt8 bitset[BITSET_SIZE] = { 0 };
|
2015-10-23 22:43:06 +00:00
|
|
|
};
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/** A class for sequentially reading the cells of a compact array stored on disk
  * (the data is accessed through a ReadBuffer).
  */
|
|
|
|
template <typename BucketIndex, UInt8 content_width, size_t bucket_count>
|
|
|
|
class CompactArray<BucketIndex, content_width, bucket_count>::Reader final
|
|
|
|
{
|
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
Reader(ReadBuffer & in_)
|
|
|
|
: in(in_)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
Reader(const Reader &) = delete;
|
|
|
|
Reader & operator=(const Reader &) = delete;
|
|
|
|
|
|
|
|
bool next()
|
|
|
|
{
|
|
|
|
if (current_bucket_index == bucket_count)
|
|
|
|
{
|
|
|
|
is_eof = true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
locus.init(current_bucket_index);
|
|
|
|
|
|
|
|
if (current_bucket_index == 0)
|
|
|
|
{
|
|
|
|
in.readStrict(reinterpret_cast<char *>(&value_l), 1);
|
|
|
|
++read_count;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
value_l = value_r;
|
|
|
|
|
|
|
|
if (locus.index_l != locus.index_r)
|
|
|
|
{
|
|
|
|
if (read_count == BITSET_SIZE)
|
|
|
|
fits_in_byte = true;
|
|
|
|
else
|
|
|
|
{
|
|
|
|
fits_in_byte = false;
|
|
|
|
in.readStrict(reinterpret_cast<char *>(&value_r), 1);
|
|
|
|
++read_count;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
fits_in_byte = true;
|
|
|
|
value_r = value_l;
|
|
|
|
}
|
|
|
|
|
|
|
|
++current_bucket_index;
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/** Return the current cell number and the corresponding content.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
inline std::pair<BucketIndex, UInt8> get() const
|
|
|
|
{
|
|
|
|
if ((current_bucket_index == 0) || is_eof)
|
|
|
|
throw Exception("No available data.", ErrorCodes::NO_AVAILABLE_DATA);
|
|
|
|
|
|
|
|
if (fits_in_byte)
|
|
|
|
return std::make_pair(current_bucket_index - 1, locus.read(value_l));
|
|
|
|
else
|
|
|
|
return std::make_pair(current_bucket_index - 1, locus.read(value_l, value_r));
|
|
|
|
}
|
2015-10-23 22:43:06 +00:00
|
|
|
|
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
ReadBuffer & in;
|
2017-05-07 20:25:26 +00:00
|
|
|
/// The physical location of the current cell.
|
2017-04-01 07:20:54 +00:00
|
|
|
Locus locus;
|
2017-05-07 20:25:26 +00:00
|
|
|
/// The current position in the file as a cell number.
|
2017-04-01 07:20:54 +00:00
|
|
|
BucketIndex current_bucket_index = 0;
|
2017-05-07 20:25:26 +00:00
|
|
|
/// The number of bytes read.
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t read_count = 0;
|
2017-05-07 20:25:26 +00:00
|
|
|
/// The content in the current position.
|
2017-04-01 07:20:54 +00:00
|
|
|
UInt8 value_l;
|
|
|
|
UInt8 value_r;
|
|
|
|
///
|
|
|
|
bool is_eof = false;
|
2017-05-07 20:25:26 +00:00
|
|
|
/// Does the cell fully fit into one byte?
|
2017-04-01 07:20:54 +00:00
|
|
|
bool fits_in_byte;
|
2015-10-23 22:43:06 +00:00
|
|
|
};
|
|
|
|
|
2018-12-27 19:33:27 +00:00
|
|
|
/** TODO This code looks very suboptimal.
  *
  * The `Locus` structure contains the information necessary to find, for each cell,
  * the corresponding byte and the offset, in bits, within that byte. Since, in the general
  * case, the size of one byte is not divisible by the size of one cell, it is possible
  * for one cell to overlap two bytes. Therefore, the `Locus` structure contains two
  * pairs (index, offset).
  */
|
|
|
|
template <typename BucketIndex, UInt8 content_width, size_t bucket_count>
|
|
|
|
class CompactArray<BucketIndex, content_width, bucket_count>::Locus final
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
friend class CompactArray;
|
|
|
|
friend class CompactArray::Reader;
|
2015-10-23 22:43:06 +00:00
|
|
|
|
|
|
|
public:
|
2017-04-01 07:20:54 +00:00
|
|
|
ALWAYS_INLINE operator UInt8() const
|
|
|
|
{
|
|
|
|
if (content_l == content_r)
|
|
|
|
return read(*content_l);
|
|
|
|
else
|
|
|
|
return read(*content_l, *content_r);
|
|
|
|
}
|
|
|
|
|
|
|
|
Locus ALWAYS_INLINE & operator=(UInt8 content)
|
|
|
|
{
|
|
|
|
if ((index_l == index_r) || (index_l == (BITSET_SIZE - 1)))
|
|
|
|
{
|
2017-05-07 20:25:26 +00:00
|
|
|
/// The cell completely fits into one byte.
|
2017-04-01 07:20:54 +00:00
|
|
|
*content_l &= ~(((1 << content_width) - 1) << offset_l);
|
|
|
|
*content_l |= content << offset_l;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
2017-05-07 20:25:26 +00:00
|
|
|
/// The cell overlaps two bytes.
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t left = 8 - offset_l;
|
|
|
|
|
|
|
|
*content_l &= ~(((1 << left) - 1) << offset_l);
|
|
|
|
*content_l |= (content & ((1 << left) - 1)) << offset_l;
|
|
|
|
|
|
|
|
*content_r &= ~((1 << offset_r) - 1);
|
|
|
|
*content_r |= content >> left;
|
|
|
|
}
|
|
|
|
|
|
|
|
return *this;
|
|
|
|
}
|
2015-10-23 22:43:06 +00:00
|
|
|
|
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
Locus() = default;
|
|
|
|
|
|
|
|
Locus(BucketIndex bucket_index)
|
|
|
|
{
|
|
|
|
init(bucket_index);
|
|
|
|
}
|
|
|
|
|
|
|
|
void ALWAYS_INLINE init(BucketIndex bucket_index)
|
|
|
|
{
|
2018-12-27 19:33:27 +00:00
|
|
|
/// offset in bits to the leftmost bit
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t l = static_cast<size_t>(bucket_index) * content_width;
|
|
|
|
|
2018-12-27 19:33:27 +00:00
|
|
|
/// offset of byte that contains the leftmost bit
|
|
|
|
index_l = l / 8;
|
|
|
|
|
|
|
|
/// offset in bits to the leftmost bit at that byte
|
|
|
|
offset_l = l % 8;
|
|
|
|
|
|
|
|
/// offset of byte that contains the rightmost bit
|
|
|
|
index_r = (l + content_width - 1) / 8;
|
|
|
|
|
|
|
|
/// offset in bits to the next to the rightmost bit at that byte; or zero if the rightmost bit is the rightmost bit in that byte.
|
|
|
|
offset_r = (l + content_width) % 8;
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
UInt8 ALWAYS_INLINE read(UInt8 value_l) const
|
|
|
|
{
|
2017-05-07 20:25:26 +00:00
|
|
|
/// The cell completely fits into one byte.
|
2017-04-01 07:20:54 +00:00
|
|
|
return (value_l >> offset_l) & ((1 << content_width) - 1);
|
|
|
|
}
|
|
|
|
|
|
|
|
UInt8 ALWAYS_INLINE read(UInt8 value_l, UInt8 value_r) const
|
|
|
|
{
|
2017-05-07 20:25:26 +00:00
|
|
|
/// The cell overlaps two bytes.
|
2017-04-01 07:20:54 +00:00
|
|
|
return ((value_l >> offset_l) & ((1 << (8 - offset_l)) - 1))
|
|
|
|
| ((value_r & ((1 << offset_r) - 1)) << (8 - offset_l));
|
|
|
|
}
|
2015-10-23 22:43:06 +00:00
|
|
|
|
|
|
|
private:
|
2017-04-01 07:20:54 +00:00
|
|
|
size_t index_l;
|
|
|
|
size_t offset_l;
|
|
|
|
size_t index_r;
|
|
|
|
size_t offset_r;
|
2015-10-23 22:43:06 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
UInt8 * content_l;
|
|
|
|
UInt8 * content_r;
|
2015-10-23 22:43:06 +00:00
|
|
|
|
2017-05-07 20:25:26 +00:00
|
|
|
/// Checks
|
2017-04-01 07:20:54 +00:00
|
|
|
static_assert((content_width > 0) && (content_width < 8), "Invalid parameter value");
|
|
|
|
static_assert(bucket_count <= (std::numeric_limits<size_t>::max() / content_width), "Invalid parameter value");
|
2015-10-23 22:43:06 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|
|
|
|
|