#pragma once

#if defined(__linux__) || defined(__FreeBSD__)

#include <chrono>

#include <pcg_random.hpp>
#include <filesystem>
#include <city.h>
#include <fcntl.h>
#include <absl/container/flat_hash_map.h>
#include <absl/container/flat_hash_set.h>

#include <common/unaligned.h>
#include <Common/randomSeed.h>
#include <Common/Arena.h>
#include <Common/ArenaWithFreeLists.h>
#include <Common/MemorySanitizer.h>
#include <Common/HashTable/HashMap.h>
#include <IO/AIO.h>
#include <Dictionaries/DictionaryStructure.h>
#include <Dictionaries/ICacheDictionaryStorage.h>
#include <Dictionaries/DictionaryHelpers.h>

namespace ProfileEvents
{
    extern const Event FileOpen;
    extern const Event AIOWrite;
    extern const Event AIOWriteBytes;
    extern const Event AIORead;
    extern const Event AIOReadBytes;
}

namespace DB
{

namespace ErrorCodes
{
    extern const int AIO_READ_ERROR;
    extern const int AIO_WRITE_ERROR;
    extern const int CANNOT_ALLOCATE_MEMORY;
    extern const int CANNOT_CREATE_DIRECTORY;
    extern const int CANNOT_FSYNC;
    extern const int CANNOT_IO_GETEVENTS;
    extern const int CANNOT_IO_SUBMIT;
    extern const int CANNOT_OPEN_FILE;
    extern const int CORRUPTED_DATA;
    extern const int FILE_DOESNT_EXIST;
    extern const int UNSUPPORTED_METHOD;
    extern const int NOT_IMPLEMENTED;
}

struct SSDCacheDictionaryStorageConfiguration
{
    const size_t strict_max_lifetime_seconds;
    const DictionaryLifetime lifetime;
    const std::string file_path;
    const size_t max_partitions_count;
    const size_t block_size;
    const size_t file_blocks_size;
    const size_t read_buffer_blocks_size;
    const size_t write_buffer_blocks_size;
};

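/** Example: constructing a storage configuration. A minimal sketch; all values
  * below are illustrative assumptions, not defaults shipped with ClickHouse.
  *
  *     DictionaryLifetime lifetime;    /// min_sec/max_sec filled from dictionary config
  *
  *     SSDCacheDictionaryStorageConfiguration configuration
  *     {
  *         3600,       /// strict_max_lifetime_seconds
  *         lifetime,   /// lifetime
  *         "/var/lib/clickhouse/dictionaries/ssd_cache",   /// file_path
  *         8,          /// max_partitions_count
  *         4096,       /// block_size
  *         512,        /// file_blocks_size
  *         16,         /// read_buffer_blocks_size
  *         16          /// write_buffer_blocks_size
  *     };
  */
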
/** Simple key is serialized in a block with the following structure
    key     | data_size | data
    8 bytes | 8 bytes   | data_size bytes

    Complex key is serialized in a block with the following structure
    key_size | key_data       | data_size | data
    8 bytes  | key_size bytes | 8 bytes   | data_size bytes
*/
template <typename TKeyType>
struct SSDCacheKey final
{
    using KeyType = TKeyType;

    SSDCacheKey(KeyType key_, size_t size_, const char * data_)
        : key(key_)
        , size(size_)
        , data(data_)
    {}

    KeyType key;
    size_t size;
    const char * data;
};

using SSDCacheSimpleKey = SSDCacheKey<UInt64>;
using SSDCacheComplexKey = SSDCacheKey<StringRef>;

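/** Example: how much block space a simple key occupies. A sketch under the
  * layout above; the 16-byte payload is an illustrative assumption.
  *
  *     const char * serialized_columns = ...;   /// 16 bytes of serialized column data
  *     SSDCacheSimpleKey cache_key(42, 16, serialized_columns);
  *
  * In the block body this key takes sizeof(UInt64) + sizeof(size_t) + 16 = 32 bytes;
  * canBeWrittenInEmptyBlock additionally accounts for the 16-byte block header.
  */
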
/** Block is serialized with the following structure
    check_sum | keys_size | [keys]
    8 bytes   | 8 bytes   |
*/
class SSDCacheBlock final
{
    static constexpr size_t block_header_check_sum_size = sizeof(size_t);
    static constexpr size_t block_header_keys_size = sizeof(size_t);
public:

    /// Block header size
    static constexpr size_t block_header_size = block_header_check_sum_size + block_header_keys_size;

    explicit SSDCacheBlock(size_t block_size_)
        : block_size(block_size_)
    {}

    /// Checks if simple key can be written in empty block with block_size
    static bool canBeWrittenInEmptyBlock(SSDCacheSimpleKey & simple_key, size_t block_size)
    {
        static constexpr size_t simple_key_size = sizeof(simple_key.key);

        return (block_header_size + simple_key_size + sizeof(simple_key.size) + simple_key.size) <= block_size;
    }

    /// Checks if complex key can be written in empty block with block_size
    static bool canBeWrittenInEmptyBlock(SSDCacheComplexKey & complex_key, size_t block_size)
    {
        StringRef & key = complex_key.key;
        size_t complex_key_size = sizeof(key.size) + key.size;

        return (block_header_size + complex_key_size + sizeof(complex_key.size) + complex_key.size) <= block_size;
    }

    /// Reset block with new block_data
    /// block_data must be filled with zeroes if it is a new block
    inline void reset(char * new_block_data)
    {
        block_data = new_block_data;
        current_block_offset = block_header_size;
        keys_size = unalignedLoad<size_t>(new_block_data + block_header_check_sum_size);
    }

    /// Check if there is enough place to write key in block
    inline bool enoughPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const
    {
        return (current_block_offset + (sizeof(cache_key.key) + sizeof(cache_key.size) + cache_key.size)) <= block_size;
    }

    /// Check if there is enough place to write key in block
    inline bool enoughPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const
    {
        const StringRef & key = cache_key.key;
        size_t complex_key_size = sizeof(key.size) + key.size;

        return (current_block_offset + (complex_key_size + sizeof(cache_key.size) + cache_key.size)) <= block_size;
    }

    /// Write key and return the offset in the SSD cache block where its data is written.
    /// Returns true if the key was written and false if there was not enough place in the block.
    inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block)
    {
        assert(cache_key.size > 0);

        if (!enoughPlaceToWriteKey(cache_key))
            return false;

        char * current_block_offset_data = block_data + current_block_offset;

        /// Write simple key
        memcpy(reinterpret_cast<void *>(current_block_offset_data), reinterpret_cast<const void *>(&cache_key.key), sizeof(cache_key.key));
        current_block_offset_data += sizeof(cache_key.key);
        current_block_offset += sizeof(cache_key.key);

        /// Write serialized columns size
        memcpy(reinterpret_cast<void *>(current_block_offset_data), reinterpret_cast<const void *>(&cache_key.size), sizeof(cache_key.size));
        current_block_offset_data += sizeof(cache_key.size);
        current_block_offset += sizeof(cache_key.size);

        offset_in_block = current_block_offset;

        memcpy(reinterpret_cast<void *>(current_block_offset_data), reinterpret_cast<const void *>(cache_key.data), cache_key.size);
        current_block_offset += cache_key.size;

        ++keys_size;

        return true;
    }

    inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block)
    {
        assert(cache_key.size > 0);

        if (!enoughPlaceToWriteKey(cache_key))
            return false;

        char * current_block_offset_data = block_data + current_block_offset;

        const StringRef & key = cache_key.key;

        /// Write complex key
        memcpy(reinterpret_cast<void *>(current_block_offset_data), reinterpret_cast<const void *>(&key.size), sizeof(key.size));
        current_block_offset_data += sizeof(key.size);
        current_block_offset += sizeof(key.size);

        memcpy(reinterpret_cast<void *>(current_block_offset_data), reinterpret_cast<const void *>(key.data), key.size);
        current_block_offset_data += key.size;
        current_block_offset += key.size;

        /// Write serialized columns size
        memcpy(reinterpret_cast<void *>(current_block_offset_data), reinterpret_cast<const void *>(&cache_key.size), sizeof(cache_key.size));
        current_block_offset_data += sizeof(cache_key.size);
        current_block_offset += sizeof(cache_key.size);

        offset_in_block = current_block_offset;

        memcpy(reinterpret_cast<void *>(current_block_offset_data), reinterpret_cast<const void *>(cache_key.data), cache_key.size);
        current_block_offset += cache_key.size;

        ++keys_size;

        return true;
    }

    inline size_t getKeysSize() const { return keys_size; }

    /// Write keys size into block header
    inline void writeKeysSize()
    {
        char * keys_size_offset_data = block_data + block_header_check_sum_size;
        std::memcpy(keys_size_offset_data, &keys_size, sizeof(size_t));
    }

    /// Get check sum from block header
    inline size_t getCheckSum() const { return unalignedLoad<size_t>(block_data); }

    /// Calculate check sum of block
    inline size_t calculateCheckSum() const
    {
        size_t calculated_check_sum = static_cast<size_t>(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size));

        return calculated_check_sum;
    }

    /// Check if check sum from block header matches check sum calculated from block data
    inline bool checkCheckSum() const
    {
        size_t calculated_check_sum = calculateCheckSum();
        size_t check_sum = getCheckSum();

        return calculated_check_sum == check_sum;
    }

    /// Write check sum into block header
    inline void writeCheckSum()
    {
        size_t check_sum = static_cast<size_t>(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size));
        std::memcpy(block_data, &check_sum, sizeof(size_t));
    }

    inline size_t getBlockSize() const { return block_size; }

    /// Returns block data
    inline char * getBlockData() const { return block_data; }

    /// Read keys that were serialized in block.
    /// It is the client's responsibility to ensure that simple or complex keys were written in block.
    void readSimpleKeys(PaddedPODArray<UInt64> & simple_keys) const
    {
        char * block_start = block_data + block_header_size;
        char * block_end = block_data + block_size;

        static constexpr size_t key_prefix_size = sizeof(UInt64) + sizeof(size_t);

        while (block_start + key_prefix_size < block_end)
        {
            UInt64 key = unalignedLoad<UInt64>(block_start);
            block_start += sizeof(UInt64);

            size_t allocated_size = unalignedLoad<size_t>(block_start);
            block_start += sizeof(size_t);

            /// If we read an empty allocated size, it means the end of the block
            if (allocated_size == 0)
                break;

            simple_keys.emplace_back(key);
            block_start += allocated_size;
        }
    }

    void readComplexKeys(PaddedPODArray<StringRef> & complex_keys) const
    {
        char * block_start = block_data + block_header_size;
        char * block_end = block_data + block_size;

        static constexpr size_t key_prefix_size = sizeof(size_t) + sizeof(size_t);

        while (block_start + key_prefix_size < block_end)
        {
            size_t key_size = unalignedLoad<size_t>(block_start);
            block_start += sizeof(key_size);

            StringRef complex_key(block_start, key_size);

            block_start += key_size;

            size_t allocated_size = unalignedLoad<size_t>(block_start);
            block_start += sizeof(size_t);

            /// If we read an empty allocated size, it means the end of the block
            if (allocated_size == 0)
                break;

            complex_keys.emplace_back(complex_key);
            block_start += allocated_size;
        }
    }

private:
    size_t block_size;
    char * block_data = nullptr;

    size_t current_block_offset = block_header_size;
    size_t keys_size = 0;
};

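/** Example: a round trip through one block. A minimal sketch, assuming a zeroed
  * 4096-byte buffer and a prepared SSDCacheSimpleKey; the names are illustrative.
  *
  *     std::vector<char> zeroed_buffer(4096, 0);
  *     SSDCacheBlock block(4096);
  *     block.reset(zeroed_buffer.data());
  *
  *     size_t offset_in_block = 0;
  *     if (block.writeKey(cache_key, offset_in_block))
  *     {
  *         block.writeKeysSize();      /// finalize header: keys count
  *         block.writeCheckSum();      /// finalize header: CityHash64 of the body
  *     }
  *
  *     PaddedPODArray<UInt64> keys;
  *     if (block.checkCheckSum())
  *         block.readSimpleKeys(keys); /// recovers the key written above
  */
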
struct SSDCacheIndex
{
    SSDCacheIndex(size_t block_index_, size_t offset_in_block_)
        : block_index(block_index_)
        , offset_in_block(offset_in_block_)
    {}

    SSDCacheIndex() = default;

    size_t block_index = 0;
    size_t offset_in_block = 0;
};

inline bool operator==(const SSDCacheIndex & lhs, const SSDCacheIndex & rhs)
{
    return lhs.block_index == rhs.block_index && lhs.offset_in_block == rhs.offset_in_block;
}

/** Logically represents memory_buffer_blocks_size SSDCacheBlocks and the current write block.
  * If a key cannot be written into the current write block, the current block's keys size and check sum
  * are written into its header and the buffer advances to the next write block.
  * After the last block is full (current block index == memory_buffer_blocks_size), writeKey always returns false.
  * If reset is called, the current block index is set to 0.
  */
template <typename SSDCacheKeyType>
class SSDCacheMemoryBuffer
{
public:
    using KeyType = typename SSDCacheKeyType::KeyType;

    explicit SSDCacheMemoryBuffer(size_t block_size_, size_t memory_buffer_blocks_size_)
        : block_size(block_size_)
        , partition_blocks_size(memory_buffer_blocks_size_)
        , buffer(block_size * partition_blocks_size, 4096)
        , current_write_block(block_size)
    {
        current_write_block.reset(buffer.m_data);
    }

    bool writeKey(const SSDCacheKeyType & key, SSDCacheIndex & index)
    {
        if (current_block_index == partition_blocks_size)
            return false;

        size_t block_offset = 0;
        bool write_in_current_block = current_write_block.writeKey(key, block_offset);

        if (write_in_current_block)
        {
            index.block_index = current_block_index;
            index.offset_in_block = block_offset;
            return true;
        }

        current_write_block.writeKeysSize();
        current_write_block.writeCheckSum();

        ++current_block_index;

        if (current_block_index == partition_blocks_size)
            return false;

        current_write_block.reset(buffer.m_data + (block_size * current_block_index));

        write_in_current_block = current_write_block.writeKey(key, block_offset);
        assert(write_in_current_block);

        index.block_index = current_block_index;
        index.offset_in_block = block_offset;

        return write_in_current_block;
    }

    void writeKeysSizeAndCheckSumForCurrentWriteBlock()
    {
        current_write_block.writeKeysSize();
        current_write_block.writeCheckSum();
    }

    inline char * getPlace(SSDCacheIndex index) const
    {
        return buffer.m_data + index.block_index * block_size + index.offset_in_block;
    }

    inline size_t getCurrentBlockIndex() const { return current_block_index; }

    inline const char * getData() const { return buffer.m_data; }

    inline size_t getSizeInBytes() const { return block_size * partition_blocks_size; }

    void readKeys(PaddedPODArray<KeyType> & keys) const
    {
        SSDCacheBlock block(block_size);

        for (size_t block_index = 0; block_index < partition_blocks_size; ++block_index)
        {
            block.reset(buffer.m_data + (block_index * block_size));

            if constexpr (std::is_same_v<KeyType, UInt64>)
                block.readSimpleKeys(keys);
            else
                block.readComplexKeys(keys);
        }
    }

    inline void reset()
    {
        current_block_index = 0;
        current_write_block.reset(buffer.m_data);
    }

    const size_t block_size;

    const size_t partition_blocks_size;

private:
    Memory<Allocator<true>> buffer;

    SSDCacheBlock current_write_block;

    size_t current_block_index = 0;
};

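/** Example: filling one in-memory partition. A minimal sketch, assuming
  * prepared SSDCacheSimpleKey values; block/buffer sizes are illustrative.
  *
  *     SSDCacheMemoryBuffer<SSDCacheSimpleKey> partition(4096, 16);
  *
  *     SSDCacheIndex index;
  *     if (!partition.writeKey(cache_key, index))
  *     {
  *         /// Partition is full: flush partition.getData() to disk
  *         /// (see SSDCacheFileBuffer::writeBuffer below), then reuse it.
  *         partition.reset();
  *     }
  */
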
/// Logically represents multiple file_blocks_size SSDCacheBlocks on the file system
template <typename SSDCacheKeyType>
class SSDCacheFileBuffer : private boost::noncopyable
{
    static constexpr auto BIN_FILE_EXT = ".bin";

public:

    using KeyType = typename SSDCacheKeyType::KeyType;

    explicit SSDCacheFileBuffer(
        const std::string & file_path_,
        size_t block_size_,
        size_t file_blocks_size_)
        : file_path(file_path_ + BIN_FILE_EXT)
        , block_size(block_size_)
        , file_blocks_size(file_blocks_size_)
    {
        auto path = std::filesystem::path{file_path};
        auto parent_path_directory = path.parent_path();

        /// If the cache file is in a directory that does not exist, create it
        if (!std::filesystem::exists(parent_path_directory))
            if (!std::filesystem::create_directories(parent_path_directory))
                throw Exception(ErrorCodes::CANNOT_CREATE_DIRECTORY, "Failed to create directories.");

        ProfileEvents::increment(ProfileEvents::FileOpen);

        file.fd = ::open(file_path.c_str(), O_RDWR | O_CREAT | O_TRUNC | O_DIRECT, 0666);
        if (file.fd == -1)
        {
            auto error_code = (errno == ENOENT) ? ErrorCodes::FILE_DOESNT_EXIST : ErrorCodes::CANNOT_OPEN_FILE;
            throwFromErrnoWithPath("Cannot open file " + file_path, file_path, error_code);
        }

        allocateSizeForNextPartition();
    }

    void allocateSizeForNextPartition()
    {
        if (preallocateDiskSpace(file.fd, current_blocks_size * block_size, block_size * file_blocks_size) < 0)
            throwFromErrnoWithPath("Cannot preallocate space for the file " + file_path, file_path, ErrorCodes::CANNOT_ALLOCATE_MEMORY);

        current_blocks_size += file_blocks_size;
    }

    bool writeBuffer(const char * buffer, size_t buffer_size_in_blocks)
    {
        if (current_block_index + buffer_size_in_blocks > current_blocks_size)
            return false;

        AIOContext aio_context{1};

        iocb write_request{};
        iocb * write_request_ptr{&write_request};

#if defined(__FreeBSD__)
        write_request.aio.aio_lio_opcode = LIO_WRITE;
        write_request.aio.aio_fildes = file.fd;
        write_request.aio.aio_buf = reinterpret_cast<volatile void *>(const_cast<char *>(buffer));
        write_request.aio.aio_nbytes = block_size * buffer_size_in_blocks;
        write_request.aio.aio_offset = current_block_index * block_size;
#else
        write_request.aio_lio_opcode = IOCB_CMD_PWRITE;
        write_request.aio_fildes = file.fd;
        write_request.aio_buf = reinterpret_cast<UInt64>(buffer);
        write_request.aio_nbytes = block_size * buffer_size_in_blocks;
        write_request.aio_offset = current_block_index * block_size;
#endif

        while (io_submit(aio_context.ctx, 1, &write_request_ptr) < 0)
        {
            if (errno != EINTR)
                throw Exception(ErrorCodes::CANNOT_IO_SUBMIT, "Cannot submit request for asynchronous IO on file {}", file_path);
        }

        // CurrentMetrics::Increment metric_increment_write{CurrentMetrics::Write};

        io_event event;

        while (io_getevents(aio_context.ctx, 1, 1, &event, nullptr) < 0)
        {
            if (errno != EINTR)
                throw Exception(ErrorCodes::CANNOT_IO_GETEVENTS, "Failed to wait for asynchronous IO completion on file {}", file_path);
        }

        // Unpoison the memory returned from an uninstrumented system function.
        __msan_unpoison(&event, sizeof(event));

        auto bytes_written = eventResult(event);

        ProfileEvents::increment(ProfileEvents::AIOWrite);
        ProfileEvents::increment(ProfileEvents::AIOWriteBytes, bytes_written);

        if (bytes_written != static_cast<decltype(bytes_written)>(block_size * buffer_size_in_blocks))
            throw Exception(ErrorCodes::AIO_WRITE_ERROR,
                "Not all data was written for asynchronous IO on file {}. returned: {}",
                file_path,
                std::to_string(bytes_written));

        if (::fsync(file.fd) < 0)
            throwFromErrnoWithPath("Cannot fsync " + file_path, file_path, ErrorCodes::CANNOT_FSYNC);

        current_block_index += buffer_size_in_blocks;

        return true;
    }

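    /** Note on alignment: the file is opened with O_DIRECT, so the buffer passed
      * to writeBuffer must be suitably aligned and its length a multiple of the
      * device's logical block size. SSDCacheMemoryBuffer satisfies this by
      * allocating with 4096-byte alignment; a sketch of the same idea in
      * isolation (the 4096 constant is an assumption about the device):
      *
      *     Memory<Allocator<true>> aligned_buffer(block_size * blocks_count, 4096);
      *     file_buffer.writeBuffer(aligned_buffer.data(), blocks_count);
      */
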
    bool readKeys(size_t block_start, size_t blocks_length, PaddedPODArray<KeyType> & out) const
    {
        if (block_start + blocks_length > current_blocks_size)
            return false;

        size_t buffer_size_in_bytes = blocks_length * block_size;

        Memory read_buffer_memory(block_size * blocks_length, block_size);

        iocb request{};
        iocb * request_ptr = &request;

#if defined(__FreeBSD__)
        request.aio.aio_lio_opcode = LIO_READ;
        request.aio.aio_fildes = file.fd;
        request.aio.aio_buf = reinterpret_cast<volatile void *>(reinterpret_cast<UInt64>(read_buffer_memory.data()));
        request.aio.aio_nbytes = buffer_size_in_bytes;
        request.aio.aio_offset = block_start * block_size;
        request.aio_data = 0;
#else
        request.aio_lio_opcode = IOCB_CMD_PREAD;
        request.aio_fildes = file.fd;
        request.aio_buf = reinterpret_cast<UInt64>(read_buffer_memory.data());
        request.aio_nbytes = buffer_size_in_bytes;
        request.aio_offset = block_start * block_size;
        request.aio_data = 0;
#endif

        io_event event{};
        AIOContext aio_context(1);

        while (io_submit(aio_context.ctx, 1, &request_ptr) != 1)
        {
            if (errno != EINTR)
                throwFromErrno("io_submit: Failed to submit a request for asynchronous IO", ErrorCodes::CANNOT_IO_SUBMIT);
        }

        while (io_getevents(aio_context.ctx, 1, 1, &event, nullptr) != 1)
        {
            if (errno != EINTR)
                throwFromErrno("io_getevents: Failed to get an event for asynchronous IO", ErrorCodes::CANNOT_IO_GETEVENTS);
        }

        auto read_bytes = eventResult(event);

        if (read_bytes != static_cast<ssize_t>(buffer_size_in_bytes))
            throw Exception(ErrorCodes::AIO_READ_ERROR,
                "GC: AIO failed to read file {}. Expected bytes {}. Actual bytes {}",
                file_path,
                buffer_size_in_bytes,
                read_bytes);

        ProfileEvents::increment(ProfileEvents::AIORead);
        ProfileEvents::increment(ProfileEvents::AIOReadBytes, read_bytes);

        SSDCacheBlock block(block_size);

        for (size_t i = 0; i < blocks_length; ++i)
        {
            block.reset(read_buffer_memory.data() + (i * block_size));

            if constexpr (std::is_same_v<SSDCacheKeyType, SSDCacheSimpleKey>)
                block.readSimpleKeys(out);
            else
                block.readComplexKeys(out);
        }

        return true;
    }

    template <typename FetchBlockFunc>
    void fetchBlocks(size_t read_from_file_buffer_blocks_size, const PaddedPODArray<size_t> & blocks_to_fetch, FetchBlockFunc && func) const
    {
        if (blocks_to_fetch.empty())
            return;

        Memory<Allocator<true>> read_buffer(read_from_file_buffer_blocks_size * block_size, 4096);

        size_t blocks_to_fetch_size = blocks_to_fetch.size();

        PaddedPODArray<iocb> requests;
        PaddedPODArray<iocb *> pointers;

        requests.reserve(blocks_to_fetch_size);
        pointers.reserve(blocks_to_fetch_size);

        for (size_t block_to_fetch_index = 0; block_to_fetch_index < blocks_to_fetch_size; ++block_to_fetch_index)
        {
            iocb request{};

            char * buffer_place = read_buffer.data() + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size);

#if defined(__FreeBSD__)
            request.aio.aio_lio_opcode = LIO_READ;
            request.aio.aio_fildes = file.fd;
            request.aio.aio_buf = reinterpret_cast<volatile void *>(reinterpret_cast<UInt64>(buffer_place));
            request.aio.aio_nbytes = block_size;
            request.aio.aio_offset = block_size * blocks_to_fetch[block_to_fetch_index];
            request.aio_data = block_to_fetch_index;
#else
            request.aio_lio_opcode = IOCB_CMD_PREAD;
            request.aio_fildes = file.fd;
            request.aio_buf = reinterpret_cast<UInt64>(buffer_place);
            request.aio_nbytes = block_size;
            request.aio_offset = block_size * blocks_to_fetch[block_to_fetch_index];
            request.aio_data = block_to_fetch_index;
#endif

            requests.push_back(request);
            pointers.push_back(&requests.back());
        }

        AIOContext aio_context(read_from_file_buffer_blocks_size);

        PaddedPODArray<bool> processed(requests.size(), false);
        PaddedPODArray<io_event> events;
        events.resize_fill(requests.size());

        size_t to_push = 0;
        size_t to_pop = 0;

        while (to_pop < requests.size())
        {
            int popped = 0;

            while (to_pop < to_push && (popped = io_getevents(aio_context.ctx, to_push - to_pop, to_push - to_pop, &events[to_pop], nullptr)) <= 0)
            {
                if (errno != EINTR)
                    throwFromErrno("io_getevents: Failed to get an event for asynchronous IO", ErrorCodes::CANNOT_IO_GETEVENTS);
            }

            for (size_t i = to_pop; i < to_pop + popped; ++i)
            {
                size_t block_to_fetch_index = events[i].data;
                const auto & request = requests[block_to_fetch_index];

                const ssize_t read_bytes = eventResult(events[i]);

                if (read_bytes != static_cast<ssize_t>(block_size))
                    throw Exception(ErrorCodes::AIO_READ_ERROR,
                        "GC: AIO failed to read file ({}). Expected bytes ({}). Actual bytes ({})", file_path, block_size, read_bytes);

                ProfileEvents::increment(ProfileEvents::AIORead);
                ProfileEvents::increment(ProfileEvents::AIOReadBytes, read_bytes);

                char * request_buffer = getRequestBuffer(request);

                // Unpoison the memory returned from an uninstrumented system function.
                __msan_unpoison(request_buffer, block_size);

                SSDCacheBlock block(block_size);
                block.reset(request_buffer);

                if (!block.checkCheckSum())
                {
                    std::string calculated_check_sum = std::to_string(block.calculateCheckSum());
                    std::string check_sum = std::to_string(block.getCheckSum());
                    throw Exception(ErrorCodes::CORRUPTED_DATA,
                        "Cache data corrupted. Checksum validation failed. Calculated {}, stored in block {}",
                        calculated_check_sum,
                        check_sum);
                }

                std::forward<FetchBlockFunc>(func)(blocks_to_fetch[block_to_fetch_index], block.getBlockData());

                processed[block_to_fetch_index] = true;
            }

            while (to_pop < requests.size() && processed[to_pop])
                ++to_pop;

            /// Add new IO tasks
            const int new_tasks_count = std::min(read_from_file_buffer_blocks_size - (to_push - to_pop), requests.size() - to_push);

            int pushed = 0;
            while (new_tasks_count > 0 && (pushed = io_submit(aio_context.ctx, new_tasks_count, &pointers[to_push])) <= 0)
            {
                if (errno != EINTR)
                    throwFromErrno("io_submit: Failed to submit a request for asynchronous IO", ErrorCodes::CANNOT_IO_SUBMIT);
            }

            to_push += pushed;
        }
    }

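    /** Example: draining a set of blocks with fetchBlocks. A minimal sketch; the
      * callback signature (block index, pointer to the block body) follows the
      * invocation above, and the container contents are illustrative.
      *
      *     PaddedPODArray<size_t> blocks_to_fetch = {3, 7, 8};
      *     std::sort(blocks_to_fetch.begin(), blocks_to_fetch.end());  /// fetch in offset order
      *
      *     file_buffer.fetchBlocks(16, blocks_to_fetch, [](size_t block_index, char * block_data)
      *     {
      *         /// block_data points at a checksum-validated block of block_size bytes
      *     });
      */
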
    inline size_t getCurrentBlockIndex() const { return current_block_index; }

    inline void reset()
    {
        current_block_index = 0;
    }

private:
    struct FileDescriptor : private boost::noncopyable
    {
        FileDescriptor() = default;

        FileDescriptor(FileDescriptor && rhs) : fd(rhs.fd) { rhs.fd = -1; }

        FileDescriptor & operator=(FileDescriptor && rhs)
        {
            if (this == &rhs)
                return *this;

            close(fd);

            fd = rhs.fd;
            rhs.fd = -1;

            return *this;
        }

        ~FileDescriptor()
        {
            if (fd != -1)
                close(fd);
        }

        int fd = -1;
    };

    inline static int preallocateDiskSpace(int fd, size_t offset, size_t len)
    {
#if defined(__FreeBSD__)
        return posix_fallocate(fd, offset, len);
#else
        return fallocate(fd, 0, offset, len);
#endif
    }

    inline static char * getRequestBuffer(const iocb & request)
    {
        char * result = nullptr;

#if defined(__FreeBSD__)
        result = reinterpret_cast<char *>(reinterpret_cast<UInt64>(request.aio.aio_buf));
#else
        result = reinterpret_cast<char *>(request.aio_buf);
#endif

        return result;
    }

    inline static ssize_t eventResult(io_event & event)
    {
        ssize_t bytes_written;

#if defined(__FreeBSD__)
        bytes_written = aio_return(reinterpret_cast<struct aiocb *>(event.udata));
#else
        bytes_written = event.res;
#endif

        return bytes_written;
    }

    String file_path;
    size_t block_size;
    size_t file_blocks_size;
    FileDescriptor file;

    size_t current_block_index = 0;
    size_t current_blocks_size = 0;
};

/** ICacheDictionaryStorage implementation that keeps column data serialized in a memory index and in disk partitions.
  * Data is first written into a memory buffer.
  * When the memory buffer is full, it is flushed to a disk partition.
  * If the memory buffer cannot be flushed to the associated disk partition, then, if a new partition
  * can be allocated (current partition index < max_partitions_count), the storage allocates a new partition; otherwise old partitions are reused.
  * The index maps a key to a partition block and offset.
  */
template <DictionaryKeyType dictionary_key_type>
class SSDCacheDictionaryStorage final : public ICacheDictionaryStorage
{
public:
    using SSDCacheKeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::Simple, SSDCacheSimpleKey, SSDCacheComplexKey>;
    using KeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::Simple, UInt64, StringRef>;

    explicit SSDCacheDictionaryStorage(const SSDCacheDictionaryStorageConfiguration & configuration_)
        : configuration(configuration_)
        , file_buffer(configuration_.file_path, configuration.block_size, configuration.file_blocks_size)
        , rnd_engine(randomSeed())
    {
        memory_buffer_partitions.emplace_back(configuration.block_size, configuration.write_buffer_blocks_size);
    }

    bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return false; }

    String getName() const override
    {
        if (dictionary_key_type == DictionaryKeyType::Simple)
            return "SSDCache";
        else
            return "SSDComplexKeyCache";
    }

    bool supportsSimpleKeys() const override { return dictionary_key_type == DictionaryKeyType::Simple; }

    SimpleKeysStorageFetchResult fetchColumnsForKeys(
        const PaddedPODArray<UInt64> & keys,
        const DictionaryStorageFetchRequest & fetch_request) override
    {
        if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
            return fetchColumnsForKeysImpl<SimpleKeysStorageFetchResult>(keys, fetch_request);
        else
            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method fetchColumnsForKeys is not supported for complex key storage");
    }

    void insertColumnsForKeys(const PaddedPODArray<UInt64> & keys, Columns columns) override
    {
        if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
            insertColumnsForKeysImpl(keys, columns);
        else
            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertColumnsForKeys is not supported for complex key storage");
    }

    void insertDefaultKeys(const PaddedPODArray<UInt64> & keys) override
    {
        if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
            insertDefaultKeysImpl(keys);
        else
            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertDefaultKeys is not supported for complex key storage");
    }

    PaddedPODArray<UInt64> getCachedSimpleKeys() const override
    {
        if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
            return getCachedKeysImpl();
        else
            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getCachedSimpleKeys is not supported for complex key storage");
    }

    bool supportsComplexKeys() const override { return dictionary_key_type == DictionaryKeyType::Complex; }

    ComplexKeysStorageFetchResult fetchColumnsForKeys(
        const PaddedPODArray<StringRef> & keys,
        const DictionaryStorageFetchRequest & fetch_request) override
    {
        if constexpr (dictionary_key_type == DictionaryKeyType::Complex)
            return fetchColumnsForKeysImpl<ComplexKeysStorageFetchResult>(keys, fetch_request);
        else
            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method fetchColumnsForKeys is not supported for simple key storage");
    }

    void insertColumnsForKeys(const PaddedPODArray<StringRef> & keys, Columns columns) override
    {
        if constexpr (dictionary_key_type == DictionaryKeyType::Complex)
            insertColumnsForKeysImpl(keys, columns);
        else
            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertColumnsForKeys is not supported for simple key storage");
    }

    void insertDefaultKeys(const PaddedPODArray<StringRef> & keys) override
    {
        if constexpr (dictionary_key_type == DictionaryKeyType::Complex)
            insertDefaultKeysImpl(keys);
        else
            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method insertDefaultKeys is not supported for simple key storage");
    }

    PaddedPODArray<StringRef> getCachedComplexKeys() const override
    {
        if constexpr (dictionary_key_type == DictionaryKeyType::Complex)
            return getCachedKeysImpl();
        else
            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getCachedComplexKeys is not supported for simple key storage");
    }

    size_t getSize() const override { return index.size(); }

    double getLoadFactor() const override
    {
        size_t partitions_size = memory_buffer_partitions.size();

        if (partitions_size == configuration.max_partitions_count)
            return 1.0;

        auto & current_memory_partition = memory_buffer_partitions[current_partition_index];

        size_t full_partitions = partitions_size - 1;
        size_t blocks_in_memory = (full_partitions * configuration.write_buffer_blocks_size) + current_memory_partition.getCurrentBlockIndex();
        size_t blocks_on_disk = file_buffer.getCurrentBlockIndex();

        size_t max_blocks_size = (configuration.file_blocks_size + configuration.write_buffer_blocks_size) * configuration.max_partitions_count;

        double load_factor = static_cast<double>(blocks_in_memory + blocks_on_disk) / max_blocks_size;
        return load_factor;
    }

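    /** Example: a worked load factor computation under the illustrative configuration
      * max_partitions_count = 8, write_buffer_blocks_size = 16, file_blocks_size = 512.
      * With 2 partitions allocated, the current one holding 4 in-memory blocks, and
      * 512 blocks already on disk:
      *
      *     blocks_in_memory = (2 - 1) * 16 + 4 = 20
      *     blocks_on_disk   = 512
      *     max_blocks_size  = (512 + 16) * 8 = 4224
      *     load_factor      = (20 + 512) / 4224.0 ≈ 0.126
      */
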
    size_t getBytesAllocated() const override
    {
        size_t memory_partitions_bytes_size = memory_buffer_partitions.size() * configuration.write_buffer_blocks_size * configuration.block_size;
        size_t file_partitions_bytes_size = memory_buffer_partitions.size() * configuration.file_blocks_size * configuration.block_size;

        return index.getBufferSizeInBytes() + memory_partitions_bytes_size + file_partitions_bytes_size;
    }

private:

    using TimePoint = std::chrono::system_clock::time_point;

    struct Cell
    {
        enum CellState
        {
            in_memory,
            on_disk,
            default_value
        };

        time_t deadline;
        SSDCacheIndex index;
        size_t in_memory_partition_index;
        CellState state;

        inline bool isInMemory() const { return state == in_memory; }
        inline bool isOnDisk() const { return state == on_disk; }
        inline bool isDefaultValue() const { return state == default_value; }
    };

    struct KeyToBlockOffset
    {
        KeyToBlockOffset(size_t key_index_, size_t offset_in_block_)
            : key_index(key_index_), offset_in_block(offset_in_block_)
        {}

        size_t key_index = 0;
        size_t offset_in_block = 0;
    };

    template <typename Result>
    Result fetchColumnsForKeysImpl(
        const PaddedPODArray<KeyType> & keys,
        const DictionaryStorageFetchRequest & fetch_request) const
    {
        Result result;

        result.fetched_columns = fetch_request.makeAttributesResultColumns();
        result.key_index_to_state.resize_fill(keys.size());

        const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());

        size_t fetched_columns_index = 0;

        using BlockIndexToKeysMap = absl::flat_hash_map<size_t, PaddedPODArray<KeyToBlockOffset>, DefaultHash<size_t>>;
        BlockIndexToKeysMap block_to_keys_map;
        absl::flat_hash_set<size_t, DefaultHash<size_t>> unique_blocks_to_request;
        PaddedPODArray<size_t> blocks_to_request;

        time_t strict_max_lifetime_seconds = static_cast<time_t>(configuration.strict_max_lifetime_seconds);
        size_t keys_size = keys.size();

        for (size_t attribute_size = 0; attribute_size < fetch_request.attributesSize(); ++attribute_size)
            if (fetch_request.shouldFillResultColumnWithIndex(attribute_size))
                result.fetched_columns[attribute_size]->reserve(keys_size);

        for (size_t key_index = 0; key_index < keys_size; ++key_index)
        {
            auto key = keys[key_index];

            const auto * it = index.find(key);

            if (!it)
            {
                ++result.not_found_keys_size;
                continue;
            }

            const auto & cell = it->getMapped();

            if (unlikely(now > cell.deadline + strict_max_lifetime_seconds))
            {
                ++result.not_found_keys_size;
                continue;
            }

            bool cell_is_expired = false;
            KeyState::State key_state = KeyState::found;

            if (now > cell.deadline)
            {
                cell_is_expired = true;
                key_state = KeyState::expired;
            }

            result.expired_keys_size += static_cast<size_t>(cell_is_expired);
            result.found_keys_size += static_cast<size_t>(!cell_is_expired);

            switch (cell.state)
            {
                case Cell::in_memory:
                {
                    result.key_index_to_state[key_index] = {key_state, fetched_columns_index};
                    ++fetched_columns_index;

                    const auto & partition = memory_buffer_partitions[cell.in_memory_partition_index];
                    char * serialized_columns_place = partition.getPlace(cell.index);
                    deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, serialized_columns_place);
                    break;
                }
                case Cell::on_disk:
                {
                    PaddedPODArray<KeyToBlockOffset> & keys_block = block_to_keys_map[cell.index.block_index];
                    keys_block.emplace_back(key_index, cell.index.offset_in_block);

                    KeyState::State state = cell_is_expired ? KeyState::expired : KeyState::found;

                    /// Fetched column index will be set later, during the fetch of blocks
                    result.key_index_to_state[key_index] = {state, 0};

                    auto insert_result = unique_blocks_to_request.insert(cell.index.block_index);
                    bool was_inserted = insert_result.second;

                    if (was_inserted)
                        blocks_to_request.emplace_back(cell.index.block_index);

                    break;
                }
                case Cell::default_value:
                {
                    result.key_index_to_state[key_index] = {key_state, fetched_columns_index};
                    result.key_index_to_state[key_index].setDefault();
                    ++fetched_columns_index;
                    ++result.default_keys_size;

                    insertDefaultValuesIntoColumns(result.fetched_columns, fetch_request, key_index);
                    break;
                }
            }
        }

        /// Sort blocks by offset before starting asynchronous IO requests
        std::sort(blocks_to_request.begin(), blocks_to_request.end());

        file_buffer.fetchBlocks(configuration.read_buffer_blocks_size, blocks_to_request, [&](size_t block_index, char * block_data)
        {
            auto & keys_in_block = block_to_keys_map[block_index];

            for (auto & key_in_block : keys_in_block)
            {
                char * key_data = block_data + key_in_block.offset_in_block;
                deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, key_data);

                result.key_index_to_state[key_in_block.key_index].setFetchedColumnIndex(fetched_columns_index);

                ++fetched_columns_index;
            }
        });

        return result;
    }

    void insertColumnsForKeysImpl(const PaddedPODArray<KeyType> & keys, Columns columns)
    {
        size_t columns_to_serialize_size = columns.size();
        PaddedPODArray<StringRef> temporary_column_data(columns_to_serialize_size);

        Arena temporary_values_pool;

        const auto now = std::chrono::system_clock::now();

        for (size_t key_index = 0; key_index < keys.size(); ++key_index)
        {
            size_t allocated_size_for_columns = 0;
            const char * block_start = nullptr;

            auto key = keys[key_index];

            for (size_t column_index = 0; column_index < columns_to_serialize_size; ++column_index)
            {
                auto & column = columns[column_index];
                temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_values_pool, block_start);
                allocated_size_for_columns += temporary_column_data[column_index].size;
            }

            SSDCacheKeyType ssd_cache_key { key, allocated_size_for_columns, block_start };

            if (!SSDCacheBlock::canBeWrittenInEmptyBlock(ssd_cache_key, configuration.block_size))
                throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Serialized columns size is greater than the allowed block size minus metadata");

            /// We cannot reuse a place that is already allocated in the file or memory cache, so we erase the key from the index
            eraseKeyFromIndex(key);

            Cell cell;
            setCellDeadline(cell, now);

            if constexpr (dictionary_key_type == DictionaryKeyType::Complex)
            {
                /// Copy complex key into arena and put it in cache
                size_t key_size = key.size;
                char * place_for_key = complex_key_arena.alloc(key_size);
                memcpy(reinterpret_cast<void *>(place_for_key), reinterpret_cast<const void *>(key.data), key_size);
                KeyType updated_key{place_for_key, key_size};
                ssd_cache_key.key = updated_key;
            }

            insertCell(ssd_cache_key, cell);

            temporary_values_pool.rollback(allocated_size_for_columns);
        }
    }

    void insertDefaultKeysImpl(const PaddedPODArray<KeyType> & keys)
    {
        const auto now = std::chrono::system_clock::now();

        for (auto key : keys)
        {
            eraseKeyFromIndex(key);

            Cell cell;

            setCellDeadline(cell, now);
            cell.index = {0, 0};
            cell.in_memory_partition_index = 0;
            cell.state = Cell::default_value;

            if constexpr (dictionary_key_type == DictionaryKeyType::Complex)
            {
                /// Copy complex key into arena and put it in cache
                size_t key_size = key.size;
                char * place_for_key = complex_key_arena.alloc(key_size);
                memcpy(reinterpret_cast<void *>(place_for_key), reinterpret_cast<const void *>(key.data), key_size);
                KeyType updated_key{place_for_key, key_size};
                key = updated_key;
            }

            index[key] = cell;
        }
    }

    PaddedPODArray<KeyType> getCachedKeysImpl() const
    {
        PaddedPODArray<KeyType> result;
        result.reserve(index.size());

        for (auto & node : index)
        {
            auto & cell = node.getMapped();

            if (cell.state == Cell::default_value)
                continue;

            result.emplace_back(node.getKey());
        }

        return result;
    }

    void insertCell(SSDCacheKeyType & ssd_cache_key, Cell & cell)
    {
        /** insertCell has the following flow:

            1. We try to write the key into the current memory buffer; if the write succeeded, return.
            2. If we could not write the key into the current memory buffer, we try to flush the current memory buffer
            to disk.

            If the flush succeeded, reset the current memory buffer, write the key into it and return.
            If the flush failed, the current partition on disk is full, and we need to allocate a new partition
            or start reusing old ones.

            Retry from step 1.
        */

        SSDCacheIndex cache_index {0, 0};

        while (true)
        {
            bool started_reusing_old_partitions = memory_buffer_partitions.size() == configuration.max_partitions_count;

            auto & current_memory_buffer_partition = memory_buffer_partitions[current_partition_index];

            bool write_into_memory_buffer_result = current_memory_buffer_partition.writeKey(ssd_cache_key, cache_index);

            if (write_into_memory_buffer_result)
            {
                cell.state = Cell::in_memory;
                cell.index = cache_index;
                cell.in_memory_partition_index = current_partition_index;

                index[ssd_cache_key.key] = cell;
                break;
            }
            else
            {
                /// The partition memory write buffer is full; flush it to disk and retry
                size_t block_index_in_file_before_write = file_buffer.getCurrentBlockIndex();

                if (started_reusing_old_partitions)
                {
                    /// If we start reusing old partitions, we need to remove old on-disk keys from the index before writing the buffer
                    PaddedPODArray<KeyType> old_keys;
                    file_buffer.readKeys(block_index_in_file_before_write, configuration.write_buffer_blocks_size, old_keys);

                    size_t file_read_end_block_index = block_index_in_file_before_write + configuration.write_buffer_blocks_size;

                    for (auto old_key : old_keys)
                    {
                        auto * it = index.find(old_key);

                        if (it)
                        {
                            const Cell & old_key_cell = it->getMapped();

                            size_t old_key_block = old_key_cell.index.block_index;

                            /// Check if the key in the index is a key from the old partition blocks
                            if (old_key_cell.isOnDisk() &&
                                old_key_block >= block_index_in_file_before_write &&
                                old_key_block < file_read_end_block_index)
                                eraseKeyFromIndex(old_key);
                        }
                    }
                }

                const char * partition_data = current_memory_buffer_partition.getData();

                bool flush_to_file_result = file_buffer.writeBuffer(partition_data, configuration.write_buffer_blocks_size);

                if (flush_to_file_result)
                {
                    /// Update the offsets and block indexes of index cells
                    PaddedPODArray<KeyType> keys_to_update;
                    current_memory_buffer_partition.readKeys(keys_to_update);

                    absl::flat_hash_set<KeyType, DefaultHash<KeyType>> updated_keys;

                    Int64 keys_to_update_size = static_cast<Int64>(keys_to_update.size());

                    /// Iterate from last to first because there can be multiple entries for the same key in the partition.
                    /// The valid entry is the latest one.
                    for (Int64 i = keys_to_update_size - 1; i >= 0; --i)
                    {
                        auto key_to_update = keys_to_update[i];
                        auto * it = index.find(key_to_update);

                        /// If there is no key to update, or the key to update is not in memory
                        if (!it || it->getMapped().state != Cell::in_memory)
                            continue;

                        /// If there were duplicated keys in the memory buffer partition
                        if (updated_keys.contains(it->getKey()))
                            continue;

                        updated_keys.insert(key_to_update);

                        Cell & cell_to_update = it->getMapped();

                        cell_to_update.state = Cell::on_disk;
                        cell_to_update.index.block_index += block_index_in_file_before_write;
                    }

                    /// The memory buffer partition was flushed to disk; start reusing it
                    current_memory_buffer_partition.reset();
                    memset(const_cast<char *>(current_memory_buffer_partition.getData()), 0, current_memory_buffer_partition.getSizeInBytes());

                    write_into_memory_buffer_result = current_memory_buffer_partition.writeKey(ssd_cache_key, cache_index);
                    assert(write_into_memory_buffer_result);

                    cell.state = Cell::in_memory;
                    cell.index = cache_index;
                    cell.in_memory_partition_index = current_partition_index;

                    index[ssd_cache_key.key] = cell;
                    break;
                }
                else
                {
                    /// The partition is full; need to try the next partition

                    if (memory_buffer_partitions.size() < configuration.max_partitions_count)
                    {
                        /// Try to create the next partition without reusing old partitions
                        ++current_partition_index;
                        file_buffer.allocateSizeForNextPartition();
                        memory_buffer_partitions.emplace_back(configuration.block_size, configuration.write_buffer_blocks_size);
                    }
                    else
                    {
                        /// Start reusing old partitions
                        current_partition_index = (current_partition_index + 1) % memory_buffer_partitions.size();
                        file_buffer.reset();
                    }
                }
            }
        }
    }

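    /** Example: how partitions evolve during inserts, under the illustrative
      * configuration max_partitions_count = 2. The sequence below traces the
      * branches of insertCell:
      *
      *     partition 0 memory buffer full -> flushed to partition 0 on disk
      *     partition 0 on disk full       -> partition 1 allocated (1 < max_partitions_count)
      *     partition 1 on disk full       -> current_partition_index = (1 + 1) % 2 = 0,
      *                                       file_buffer.reset(), and keys of the
      *                                       overwritten blocks are erased from the index
      */
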
    inline void setCellDeadline(Cell & cell, TimePoint now)
    {
        if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0)
        {
            auto deadline = std::chrono::time_point<std::chrono::system_clock>::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds);
            cell.deadline = std::chrono::system_clock::to_time_t(deadline);
            return;
        }

        size_t min_sec_lifetime = configuration.lifetime.min_sec;
        size_t max_sec_lifetime = configuration.lifetime.max_sec;

        std::uniform_int_distribution<UInt64> distribution{min_sec_lifetime, max_sec_lifetime};
        auto deadline = now + std::chrono::seconds(distribution(rnd_engine));
        cell.deadline = std::chrono::system_clock::to_time_t(deadline);
    }

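    /** Example: with the illustrative lifetime {min_sec = 300, max_sec = 360}, each
      * cell gets deadline = now + uniform(300, 360) seconds, so entries expire spread
      * over a minute instead of all at once. With lifetime {0, 0} the deadline is
      * effectively "never" (time_point::max() minus a safety margin).
      */
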
    inline void eraseKeyFromIndex(KeyType key)
    {
        auto it = index.find(key);

        if (it == nullptr)
            return;

        /// In case of a complex key, the key data is allocated in the arena; take the key from the hash table before erasing
        KeyType key_copy = it->getKey();

        index.erase(key);

        if constexpr (std::is_same_v<KeyType, StringRef>)
            complex_key_arena.free(const_cast<char *>(key_copy.data), key_copy.size);
    }

    SSDCacheDictionaryStorageConfiguration configuration;

    SSDCacheFileBuffer<SSDCacheKeyType> file_buffer;

    std::vector<SSDCacheMemoryBuffer<SSDCacheKeyType>> memory_buffer_partitions;

    pcg64 rnd_engine;

    using SimpleKeyHashMap = HashMap<UInt64, Cell>;
    using ComplexKeyHashMap = HashMapWithSavedHash<StringRef, Cell>;

    using CacheMap = std::conditional_t<
        dictionary_key_type == DictionaryKeyType::Simple,
        SimpleKeyHashMap,
        ComplexKeyHashMap>;

    ArenaWithFreeLists complex_key_arena;

    CacheMap index;

    size_t current_partition_index = 0;
};

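/** Example: end-to-end use of the storage. A minimal sketch, assuming the
  * configuration from the example above and prepared keys/columns/fetch_request;
  * error and expiry handling is omitted.
  *
  *     SSDCacheDictionaryStorage<DictionaryKeyType::Simple> storage(configuration);
  *
  *     PaddedPODArray<UInt64> keys = {1, 2, 3};
  *     storage.insertColumnsForKeys(keys, columns);    /// columns: one Column per attribute
  *
  *     auto fetch_result = storage.fetchColumnsForKeys(keys, fetch_request);
  *     /// fetch_result.key_index_to_state tells, per key, whether it was
  *     /// found, expired, defaulted, or not found.
  */
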
}

#endif