mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-10 01:25:21 +00:00
codes refactor
This commit is contained in:
parent
2d226e287a
commit
d205f9ecab
@ -57,7 +57,7 @@
|
||||
#include <Storages/StorageReplicatedMergeTree.h>
|
||||
#include <Storages/System/attachSystemTables.h>
|
||||
#include <Storages/System/attachInformationSchemaTables.h>
|
||||
#include <Storages/RemoteReadBufferCache.h>
|
||||
#include <Storages/Cache/ExternalDataSourceCache.h>
|
||||
#include <Storages/RemoteFileMetadataFactory.h>
|
||||
#include <AggregateFunctions/registerAggregateFunctions.h>
|
||||
#include <Functions/registerFunctions.h>
|
||||
@ -529,7 +529,7 @@ if (ThreadFuzzer::instance().isEffective())
|
||||
UInt64 limit_size = config().getUInt64("local_cache_for_remote_fs.limit_size");
|
||||
UInt64 bytes_read_before_flush
|
||||
= config().getUInt64("local_cache_for_remote_fs.bytes_read_before_flush", DBMS_DEFAULT_BUFFER_SIZE);
|
||||
RemoteReadBufferCache::instance().initOnce(global_context, root_dir, limit_size, bytes_read_before_flush);
|
||||
ExternalDataSourceCache::instance().initOnce(global_context, root_dir, limit_size, bytes_read_before_flush);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -115,6 +115,7 @@ if (USE_HDFS)
|
||||
add_headers_and_sources(dbms Disks/HDFS)
|
||||
endif()
|
||||
|
||||
add_headers_and_sources(dbms Storages/Cache)
|
||||
if (USE_HIVE)
|
||||
add_headers_and_sources(dbms Storages/Hive)
|
||||
endif()
|
||||
|
@ -606,6 +606,10 @@
|
||||
M(650, SNAPPY_UNCOMPRESS_FAILED) \
|
||||
M(651, SNAPPY_COMPRESS_FAILED) \
|
||||
M(652, NO_HIVEMETASTORE) \
|
||||
M(653, NOT_INIT)\
|
||||
M(654, DISK_OVERFLOW)\
|
||||
M(655, FILE_BROKEN)\
|
||||
M(656, END_OF_FILE)\
|
||||
\
|
||||
M(999, KEEPER_EXCEPTION) \
|
||||
M(1000, POCO_EXCEPTION) \
|
||||
|
@ -25,12 +25,14 @@ struct TrivialWeightFunction
|
||||
template <typename T>
|
||||
struct TrivialLRUCacheEvitPolicy
|
||||
{
|
||||
inline bool canRelease(const T &) const
|
||||
// To note that the arg maybe is null
|
||||
inline bool canRelease(std::shared_ptr<T>) const
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
inline void release(T &)
|
||||
// To note that the arg is null
|
||||
inline void release(std::shared_ptr<T>)
|
||||
{
|
||||
}
|
||||
};
|
||||
@ -89,10 +91,8 @@ public:
|
||||
template <typename LoadFunc>
|
||||
std::pair<MappedPtr, bool> getOrSet(const Key & key, LoadFunc && load_func)
|
||||
{
|
||||
MappedPtr value = nullptr;
|
||||
bool is_value_loaded = false, is_value_updated = false;
|
||||
std::tie(value, is_value_loaded, is_value_updated) = getOrTrySet(key, std::move(load_func));
|
||||
return std::make_pair(value, is_value_loaded);
|
||||
auto [value, is_loaded, _] = getOrTrySet(key, std::move(load_func));
|
||||
return std::make_pair(value, is_loaded);
|
||||
}
|
||||
|
||||
/// If the value for the key is in the cache, returns it. If it is not, calls load_func() to
|
||||
@ -165,19 +165,17 @@ public:
|
||||
}
|
||||
|
||||
/// If key is not in cache or the element can be released, return is true. otherwise, return is false
|
||||
bool tryDel(const Key & key)
|
||||
bool tryRemove(const Key & key)
|
||||
{
|
||||
std::lock_guard loc(mutex);
|
||||
auto it = cells.find(key);
|
||||
if (it == cells.end())
|
||||
return true;
|
||||
auto & cell = it->second;
|
||||
if (cell.value)
|
||||
{
|
||||
if (!evict_policy.canRelease(*cell.value))
|
||||
return false;
|
||||
evict_policy.release(*cell.value);
|
||||
}
|
||||
if (!evict_policy.canRelease(cell.value))
|
||||
return false;
|
||||
evict_policy.release(cell.value);
|
||||
|
||||
current_size -= cell.size;
|
||||
cells.erase(it);
|
||||
queue.erase(cell.queue_iterator);
|
||||
@ -365,14 +363,13 @@ private:
|
||||
}
|
||||
else
|
||||
{
|
||||
if (cell.value && !evict_policy.canRelease(*cell.value))
|
||||
if (!evict_policy.canRelease(cell.value))
|
||||
{
|
||||
// the old value is refered by someone, cannot release now
|
||||
// in default policy, it is always true.
|
||||
return false;
|
||||
}
|
||||
if (cell.value)
|
||||
evict_policy.release(*cell.value); // release the old value. this action is empty in default policy.
|
||||
evict_policy.release(cell.value); // release the old value. this action is empty in default policy.
|
||||
current_size -= cell.size;
|
||||
queue.splice(queue.end(), queue, cell.queue_iterator);
|
||||
}
|
||||
@ -389,10 +386,9 @@ private:
|
||||
size_t current_weight_lost = 0;
|
||||
size_t queue_size = cells.size();
|
||||
auto key_it = queue.begin();
|
||||
|
||||
while ((current_size + required_size_to_remove > max_size || (max_elements_size != 0 && queue_size > max_elements_size))
|
||||
&& (queue_size > 1)
|
||||
&& (key_it != queue.end()))
|
||||
auto is_overflow = [&] { return (current_size + required_size_to_remove > max_size || (max_elements_size != 0 && queue_size > max_elements_size));
|
||||
};
|
||||
while (is_overflow() && (queue_size > 1) && (key_it != queue.end()))
|
||||
{
|
||||
const Key & key = *key_it;
|
||||
|
||||
@ -404,15 +400,11 @@ private:
|
||||
}
|
||||
|
||||
const auto & cell = it->second;
|
||||
bool can_evict = true;
|
||||
if (cell.value)
|
||||
can_evict = evict_policy.canRelease(*cell.value);// in default, it is true
|
||||
if (can_evict)
|
||||
if (evict_policy.canRelease(cell.value))// in default, it is true
|
||||
{
|
||||
// always call release() before erasing an element
|
||||
// in default, it's an empty action
|
||||
if (cell.value)
|
||||
evict_policy.release(*cell.value);
|
||||
evict_policy.release(cell.value);
|
||||
|
||||
current_size -= cell.size;
|
||||
current_weight_lost += cell.size;
|
||||
@ -434,7 +426,7 @@ private:
|
||||
LOG_ERROR(&Poco::Logger::get("LRUCache"), "LRUCache became inconsistent. There must be a bug in it.");
|
||||
abort();
|
||||
}
|
||||
return !(current_size + required_size_to_remove > max_size || (max_elements_size != 0 && queue_size > max_elements_size));
|
||||
return !is_overflow();
|
||||
}
|
||||
|
||||
/// Override this method if you want to track how much weight was lost in removeOverflow method.
|
||||
|
261
src/Storages/Cache/ExternalDataSourceCache.cpp
Normal file
261
src/Storages/Cache/ExternalDataSourceCache.cpp
Normal file
@ -0,0 +1,261 @@
|
||||
#include <memory>
|
||||
#include <unistd.h>
|
||||
#include <functional>
|
||||
#include <Core/BackgroundSchedulePool.h>
|
||||
#include <Poco/Logger.h>
|
||||
#include <base/logger_useful.h>
|
||||
#include <base/sleep.h>
|
||||
#include <base/errnoToString.h>
|
||||
#include <Common/ProfileEvents.h>
|
||||
#include <Common/SipHash.h>
|
||||
#include <Common/hex.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <Storages/Cache/ExternalDataSourceCache.h>
|
||||
#include <Storages/RemoteFileMetadataFactory.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
namespace ProfileEvents
|
||||
{
|
||||
extern const Event ExternalDataSourceLocalCacheReadBytes;
|
||||
}
|
||||
namespace DB
|
||||
{
|
||||
namespace fs = std::filesystem;
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int OK;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int NOT_INIT;
|
||||
extern const int DISK_OVERFLOW;
|
||||
extern const int FILE_BROKEN;
|
||||
extern const int END_OF_FILE;
|
||||
}
|
||||
|
||||
|
||||
RemoteReadBuffer::RemoteReadBuffer(size_t buff_size) : BufferWithOwnMemory<SeekableReadBufferWithSize>(buff_size)
|
||||
{
|
||||
}
|
||||
|
||||
RemoteReadBuffer::~RemoteReadBuffer()
|
||||
{
|
||||
if (file_cache_controller)
|
||||
file_cache_controller->deallocFile(std::move(file_buffer));
|
||||
}
|
||||
|
||||
std::unique_ptr<ReadBuffer> RemoteReadBuffer::create(ContextPtr context, IRemoteFileMetadataPtr remote_file_metadata, std::unique_ptr<ReadBuffer> read_buffer)
|
||||
{
|
||||
auto * log = &Poco::Logger::get("RemoteReadBuffer");
|
||||
size_t buff_size = DBMS_DEFAULT_BUFFER_SIZE;
|
||||
if (read_buffer)
|
||||
buff_size = read_buffer->internalBuffer().size();
|
||||
/*
|
||||
* in the new implement of ReadBufferFromHDFS, buffer size is 0.
|
||||
*
|
||||
* in the common case, we don't read bytes from readbuffer directly, so set buff_size = DBMS_DEFAULT_BUFFER_SIZE
|
||||
* is OK.
|
||||
*
|
||||
* we need be careful with the case without local file reader.
|
||||
*/
|
||||
if (buff_size == 0)
|
||||
buff_size = DBMS_DEFAULT_BUFFER_SIZE;
|
||||
|
||||
auto remote_path = remote_file_metadata->remote_path;
|
||||
auto remote_read_buffer = std::make_unique<RemoteReadBuffer>(buff_size);
|
||||
ErrorCodes::ErrorCode error;
|
||||
|
||||
std::tie(remote_read_buffer->file_cache_controller, read_buffer, error) = ExternalDataSourceCache::instance().createReader(context, remote_file_metadata, read_buffer);
|
||||
if (remote_read_buffer->file_cache_controller == nullptr)
|
||||
{
|
||||
LOG_ERROR(log, "Failed to allocate local file for remote path: {}, reason: {}.", remote_path, error);
|
||||
// read_buffer is the input one.
|
||||
return read_buffer;
|
||||
}
|
||||
else
|
||||
{
|
||||
remote_read_buffer->file_buffer = remote_read_buffer->file_cache_controller->allocFile();
|
||||
if (!remote_read_buffer->file_buffer)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Create file readbuffer failed. {}",
|
||||
remote_read_buffer->file_cache_controller->getLocalPath().string());
|
||||
}
|
||||
remote_read_buffer->remote_file_size = remote_file_metadata->file_size;
|
||||
return remote_read_buffer;
|
||||
}
|
||||
|
||||
bool RemoteReadBuffer::nextImpl()
|
||||
{
|
||||
auto start_offset = file_buffer->getPosition();
|
||||
auto end_offset = start_offset + file_buffer->internalBuffer().size();
|
||||
file_cache_controller->waitMoreData(start_offset, end_offset);
|
||||
|
||||
auto status = file_buffer->next();
|
||||
if (status)
|
||||
{
|
||||
BufferBase::set(file_buffer->buffer().begin(),
|
||||
file_buffer->buffer().size(),
|
||||
file_buffer->offset());
|
||||
ProfileEvents::increment(ProfileEvents::ExternalDataSourceLocalCacheReadBytes, file_buffer->available());
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
off_t RemoteReadBuffer::seek(off_t offset, int whence)
|
||||
{
|
||||
if (!file_buffer)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot call seek() in this buffer. It's a bug!");
|
||||
/*
|
||||
* Need to wait here. For example, the current file has been download at position X, but here we try to seek to
|
||||
* postition Y (Y > X), it would fail.
|
||||
*/
|
||||
file_cache_controller->waitMoreData(offset, offset + file_buffer->internalBuffer().size());
|
||||
auto ret = file_buffer->seek(offset, whence);
|
||||
BufferBase::set(file_buffer->buffer().begin(),
|
||||
file_buffer->buffer().size(),
|
||||
file_buffer->offset());
|
||||
return ret;
|
||||
}
|
||||
|
||||
off_t RemoteReadBuffer::getPosition()
|
||||
{
|
||||
return file_buffer->getPosition();
|
||||
}
|
||||
|
||||
ExternalDataSourceCache::ExternalDataSourceCache() = default;
|
||||
|
||||
ExternalDataSourceCache::~ExternalDataSourceCache() = default;
|
||||
|
||||
ExternalDataSourceCache & ExternalDataSourceCache::instance()
|
||||
{
|
||||
static ExternalDataSourceCache instance;
|
||||
return instance;
|
||||
}
|
||||
|
||||
void ExternalDataSourceCache::recoverCachedFilesMetadata(
|
||||
const fs::path & current_path,
|
||||
size_t current_depth,
|
||||
size_t max_depth)
|
||||
{
|
||||
if (current_depth >= max_depth)
|
||||
{
|
||||
std::vector<fs::path> invalid_paths;
|
||||
for (auto const & dir : fs::directory_iterator{current_path})
|
||||
{
|
||||
String path = dir.path();
|
||||
auto cache_controller = RemoteCacheController::recover(path);
|
||||
if (!cache_controller)
|
||||
{
|
||||
invalid_paths.emplace_back(path);
|
||||
continue;
|
||||
}
|
||||
if (!lru_caches->set(path, cache_controller))
|
||||
{
|
||||
invalid_paths.emplace_back(path);
|
||||
}
|
||||
}
|
||||
for (auto & path : invalid_paths)
|
||||
{
|
||||
fs::remove_all(path);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto const & dir : fs::directory_iterator{current_path})
|
||||
{
|
||||
recoverCachedFilesMetadata(dir.path(), current_depth + 1, max_depth);
|
||||
}
|
||||
}
|
||||
|
||||
void ExternalDataSourceCache::recoverTask()
|
||||
{
|
||||
recoverCachedFilesMetadata(root_dir, 1, 2);
|
||||
initialized = true;
|
||||
LOG_INFO(log, "Recovered from directory:{}", root_dir);
|
||||
}
|
||||
|
||||
void ExternalDataSourceCache::initOnce(
|
||||
ContextPtr context,
|
||||
const String & root_dir_, size_t limit_size_, size_t bytes_read_before_flush_)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
if (isInitialized())
|
||||
{
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot initialize ExternalDataSourceCache twice");
|
||||
}
|
||||
LOG_INFO(
|
||||
log, "Initializing local cache for remote data sources. Local cache root path: {}, cache size limit: {}", root_dir_, limit_size_);
|
||||
root_dir = root_dir_;
|
||||
local_cache_bytes_read_before_flush = bytes_read_before_flush_;
|
||||
lru_caches = std::make_unique<CacheType>(limit_size_);
|
||||
|
||||
/// create if root_dir not exists
|
||||
if (!fs::exists(fs::path(root_dir)))
|
||||
{
|
||||
fs::create_directories(fs::path(root_dir));
|
||||
}
|
||||
|
||||
recover_task_holder = context->getSchedulePool().createTask("recover local cache metadata for remote files", [this]{ recoverTask(); });
|
||||
recover_task_holder->activateAndSchedule();
|
||||
}
|
||||
|
||||
String ExternalDataSourceCache::calculateLocalPath(IRemoteFileMetadataPtr metadata) const
|
||||
{
|
||||
// add version into the full_path, and not block to read the new version
|
||||
String full_path = metadata->getName() + ":" + metadata->remote_path
|
||||
+ ":" + metadata->getVersion();
|
||||
UInt128 hashcode = sipHash128(full_path.c_str(), full_path.size());
|
||||
String hashcode_str = getHexUIntLowercase(hashcode);
|
||||
return fs::path(root_dir) / hashcode_str.substr(0, 3) / hashcode_str;
|
||||
}
|
||||
|
||||
std::tuple<RemoteCacheControllerPtr, std::unique_ptr<ReadBuffer>, ErrorCodes::ErrorCode>
|
||||
ExternalDataSourceCache::createReader(ContextPtr context, IRemoteFileMetadataPtr remote_file_metadata, std::unique_ptr<ReadBuffer> & read_buffer)
|
||||
{
|
||||
// If something is wrong on startup, rollback to read from the original ReadBuffer
|
||||
if (!isInitialized())
|
||||
{
|
||||
LOG_ERROR(log, "ExternalDataSourceCache has not been initialized");
|
||||
return {nullptr, std::move(read_buffer), ErrorCodes::NOT_INIT};
|
||||
}
|
||||
|
||||
auto remote_path = remote_file_metadata->remote_path;
|
||||
const auto & last_modification_timestamp = remote_file_metadata->last_modification_timestamp;
|
||||
auto local_path = calculateLocalPath(remote_file_metadata);
|
||||
std::lock_guard lock(mutex);
|
||||
auto cache = lru_caches->get(local_path);
|
||||
if (cache)
|
||||
{
|
||||
// the remote file has been updated, need to redownload
|
||||
if (!cache->isValid() || cache->isModified(remote_file_metadata))
|
||||
{
|
||||
LOG_TRACE(
|
||||
log,
|
||||
"Remote file ({}) has been updated. Last saved modification time: {}, actual last modification time: {}",
|
||||
remote_path,
|
||||
std::to_string(cache->getLastModificationTimestamp()),
|
||||
std::to_string(last_modification_timestamp));
|
||||
cache->markInvalid();
|
||||
}
|
||||
else
|
||||
{
|
||||
return {cache, nullptr, ErrorCodes::OK};
|
||||
}
|
||||
}
|
||||
|
||||
if (!fs::exists(local_path))
|
||||
fs::create_directories(local_path);
|
||||
|
||||
// cache is not found or is invalid
|
||||
auto new_cache = std::make_shared<RemoteCacheController>(remote_file_metadata, local_path, local_cache_bytes_read_before_flush);
|
||||
if (!lru_caches->set(local_path, new_cache))
|
||||
{
|
||||
LOG_ERROR(log, "Insert the new cache failed. new file size:{}, current total size:{}",
|
||||
remote_file_metadata->file_size,
|
||||
lru_caches->weight());
|
||||
return {nullptr, std::move(read_buffer), ErrorCodes::DISK_OVERFLOW};
|
||||
}
|
||||
new_cache->startBackgroundDownload(std::move(read_buffer), context->getSchedulePool());
|
||||
return {new_cache, nullptr, ErrorCodes::OK};
|
||||
}
|
||||
|
||||
}
|
97
src/Storages/Cache/ExternalDataSourceCache.h
Normal file
97
src/Storages/Cache/ExternalDataSourceCache.h
Normal file
@ -0,0 +1,97 @@
|
||||
#pragma once
|
||||
#include <mutex>
|
||||
#include <list>
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <filesystem>
|
||||
#include <Core/BackgroundSchedulePool.h>
|
||||
#include <Poco/Logger.h>
|
||||
#include <Common/LRUCache.h>
|
||||
#include <Common/ErrorCodes.h>
|
||||
#include <Common/ThreadPool.h>
|
||||
#include <IO/ReadBuffer.h>
|
||||
#include <IO/BufferWithOwnMemory.h>
|
||||
#include <IO/createReadBufferFromFileBase.h>
|
||||
#include <IO/WriteBufferFromFile.h>
|
||||
#include <IO/WriteBufferFromFileBase.h>
|
||||
#include <IO/ReadBufferFromFileBase.h>
|
||||
#include <IO/ReadSettings.h>
|
||||
#include <IO/SeekableReadBuffer.h>
|
||||
#include <Storages/IRemoteFileMetadata.h>
|
||||
#include <condition_variable>
|
||||
#include <Interpreters/Context.h>
|
||||
#include <boost/core/noncopyable.hpp>
|
||||
#include <Storages/Cache/RemoteCacheController.h>
|
||||
#include <Storages/Cache/RemoteFileCachePolicy.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
/*
|
||||
* FIXME:RemoteReadBuffer derive from SeekableReadBufferWithSize may cause some risks, since it's not seekable in some cases
|
||||
* But SeekableReadBuffer is not a interface which make it hard to fixup.
|
||||
*/
|
||||
class RemoteReadBuffer : public BufferWithOwnMemory<SeekableReadBufferWithSize>
|
||||
{
|
||||
public:
|
||||
explicit RemoteReadBuffer(size_t buff_size);
|
||||
~RemoteReadBuffer() override;
|
||||
static std::unique_ptr<ReadBuffer> create(ContextPtr contex, IRemoteFileMetadataPtr remote_file_metadata, std::unique_ptr<ReadBuffer> read_buffer);
|
||||
|
||||
bool nextImpl() override;
|
||||
off_t seek(off_t off, int whence) override;
|
||||
off_t getPosition() override;
|
||||
std::optional<size_t> getTotalSize() override { return remote_file_size; }
|
||||
|
||||
private:
|
||||
std::shared_ptr<RemoteCacheController> file_cache_controller;
|
||||
std::unique_ptr<ReadBufferFromFileBase> file_buffer;
|
||||
size_t remote_file_size = 0;
|
||||
};
|
||||
|
||||
class ExternalDataSourceCache : private boost::noncopyable
|
||||
{
|
||||
public:
|
||||
using CacheType = LRUCache<String, RemoteCacheController, std::hash<String>,
|
||||
RemoteFileCacheWeightFunction, RemoteFileCacheEvictPolicy>;
|
||||
~ExternalDataSourceCache();
|
||||
// global instance
|
||||
static ExternalDataSourceCache & instance();
|
||||
|
||||
void initOnce(ContextPtr context, const String & root_dir_, size_t limit_size_, size_t bytes_read_before_flush_);
|
||||
|
||||
inline bool isInitialized() const { return initialized; }
|
||||
|
||||
std::tuple<RemoteCacheControllerPtr, std::unique_ptr<ReadBuffer>, ErrorCodes::ErrorCode>
|
||||
createReader(ContextPtr context, IRemoteFileMetadataPtr remote_file_metadata, std::unique_ptr<ReadBuffer> & read_buffer);
|
||||
|
||||
void updateTotalSize(size_t size) { total_size += size; }
|
||||
|
||||
protected:
|
||||
ExternalDataSourceCache();
|
||||
|
||||
private:
|
||||
// root directory of local cache for remote filesystem
|
||||
String root_dir;
|
||||
size_t local_cache_bytes_read_before_flush = 0;
|
||||
|
||||
std::atomic<bool> initialized = false;
|
||||
std::atomic<size_t> total_size;
|
||||
std::mutex mutex;
|
||||
std::unique_ptr<CacheType> lru_caches;
|
||||
|
||||
Poco::Logger * log = &Poco::Logger::get("ExternalDataSourceCache");
|
||||
|
||||
String calculateLocalPath(IRemoteFileMetadataPtr meta) const;
|
||||
|
||||
BackgroundSchedulePool::TaskHolder recover_task_holder;
|
||||
void recoverTask();
|
||||
void recoverCachedFilesMetadata(
|
||||
const std::filesystem::path & current_path,
|
||||
size_t current_depth,
|
||||
size_t max_depth);
|
||||
};
|
||||
|
||||
}
|
249
src/Storages/Cache/RemoteCacheController.cpp
Normal file
249
src/Storages/Cache/RemoteCacheController.cpp
Normal file
@ -0,0 +1,249 @@
|
||||
#include <Storages/Cache/RemoteCacheController.h>
|
||||
#include <Storages/Cache/ExternalDataSourceCache.h>
|
||||
#include <Storages/RemoteFileMetadataFactory.h>
|
||||
#include <IO/ReadBuffer.h>
|
||||
#include <IO/ReadSettings.h>
|
||||
#include <Poco/JSON/JSON.h>
|
||||
#include <Poco/JSON/Parser.h>
|
||||
#include <fstream>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace fs = std::filesystem;
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int OK;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int LOGICAL_ERROR;
|
||||
extern const int END_OF_FILE;
|
||||
}
|
||||
|
||||
bool RemoteCacheController::loadInnerInformation(const fs::path & file_path)
|
||||
{
|
||||
if (!fs::exists(file_path))
|
||||
return false;
|
||||
std::ifstream info_file(file_path);
|
||||
Poco::JSON::Parser info_parser;
|
||||
auto info_json = info_parser.parse(info_file).extract<Poco::JSON::Object::Ptr>();
|
||||
file_status = static_cast<LocalFileStatus>(info_json->get("file_status").convert<Int32>());
|
||||
metadata_class = info_json->get("metadata_class").convert<String>();
|
||||
info_file.close();
|
||||
return true;
|
||||
}
|
||||
|
||||
std::shared_ptr<RemoteCacheController> RemoteCacheController::recover(const std::filesystem::path & local_path_)
|
||||
{
|
||||
auto * log = &Poco::Logger::get("RemoteCacheController");
|
||||
|
||||
if (!std::filesystem::exists(local_path_ / "data.bin"))
|
||||
{
|
||||
LOG_TRACE(log, "Invalid cached directory:{}", local_path_.string());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto cache_controller = std::make_shared<RemoteCacheController>(nullptr, local_path_, 0);
|
||||
if (!cache_controller->loadInnerInformation(local_path_ / "info.txt")
|
||||
|| cache_controller->file_status != DOWNLOADED)
|
||||
{
|
||||
LOG_INFO(log, "Recover cached file failed. local path:{}", local_path_.string());
|
||||
return nullptr;
|
||||
}
|
||||
try
|
||||
{
|
||||
cache_controller->file_metadata_ptr = RemoteFileMetadataFactory::instance().get(cache_controller->metadata_class);
|
||||
}
|
||||
catch(...)
|
||||
{
|
||||
LOG_ERROR(log, "Get metadata class failed for {}", cache_controller->metadata_class);
|
||||
cache_controller->file_metadata_ptr = nullptr;
|
||||
}
|
||||
if (!cache_controller->file_metadata_ptr)
|
||||
{
|
||||
// do not load this invalid cached file and clear it. the clear action is in
|
||||
// ExternalDataSourceCache::recoverCachedFilesMetadata(), because deleting directories during iteration will
|
||||
// cause unexpected behaviors
|
||||
LOG_ERROR(log, "Cannot create the metadata class : {}. The cached file is invalid and will be remove. path:{}",
|
||||
cache_controller->metadata_class,
|
||||
local_path_.string());
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid metadata class:{}", cache_controller->metadata_class);
|
||||
}
|
||||
std::ifstream metadata_file(local_path_ / "metadata.txt");
|
||||
if (!cache_controller->file_metadata_ptr->fromString(std::string((std::istreambuf_iterator<char>(metadata_file)),
|
||||
std::istreambuf_iterator<char>())))
|
||||
{
|
||||
LOG_ERROR(log, "Cannot load the metadata. The cached file is invalid and will be remove. path:{}",
|
||||
local_path_.string());
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid metadata file({}) for meta class {}", local_path_.string(), cache_controller->metadata_class);
|
||||
}
|
||||
|
||||
cache_controller->current_offset = fs::file_size(local_path_ / "data.bin");
|
||||
|
||||
ExternalDataSourceCache::instance().updateTotalSize(cache_controller->file_metadata_ptr->file_size);
|
||||
return cache_controller;
|
||||
}
|
||||
|
||||
RemoteCacheController::RemoteCacheController(
|
||||
IRemoteFileMetadataPtr file_metadata_,
|
||||
const std::filesystem::path & local_path_,
|
||||
size_t cache_bytes_before_flush_)
|
||||
: file_metadata_ptr(file_metadata_)
|
||||
, local_path(local_path_)
|
||||
, valid(true)
|
||||
, local_cache_bytes_read_before_flush(cache_bytes_before_flush_)
|
||||
, current_offset(0)
|
||||
{
|
||||
// on recover, file_metadata_ptr is null, but it will be allocated after loading from metadata.txt
|
||||
// when we allocate a whole new file cache , file_metadata_ptr must not be null.
|
||||
if (file_metadata_ptr)
|
||||
{
|
||||
metadata_class = file_metadata_ptr->getName();
|
||||
auto metadata_file_writer = std::make_unique<WriteBufferFromFile>((local_path_ / "metadata.txt").string());
|
||||
auto str_buf = file_metadata_ptr->toString();
|
||||
metadata_file_writer->write(str_buf.c_str(), str_buf.size());
|
||||
metadata_file_writer->close();
|
||||
}
|
||||
}
|
||||
|
||||
ErrorCodes::ErrorCode RemoteCacheController::waitMoreData(size_t start_offset_, size_t end_offset_)
|
||||
{
|
||||
std::unique_lock lock{mutex};
|
||||
if (file_status == DOWNLOADED)
|
||||
{
|
||||
// finish reading
|
||||
if (start_offset_ >= current_offset)
|
||||
{
|
||||
lock.unlock();
|
||||
return ErrorCodes::END_OF_FILE;
|
||||
}
|
||||
}
|
||||
else // block until more data is ready
|
||||
{
|
||||
if (current_offset >= end_offset_)
|
||||
{
|
||||
lock.unlock();
|
||||
return ErrorCodes::OK;
|
||||
}
|
||||
else
|
||||
more_data_signal.wait(lock, [this, end_offset_] { return file_status == DOWNLOADED || current_offset >= end_offset_; });
|
||||
}
|
||||
lock.unlock();
|
||||
return ErrorCodes::OK;
|
||||
}
|
||||
|
||||
bool RemoteCacheController::isModified(IRemoteFileMetadataPtr file_metadata_)
|
||||
{
|
||||
return !(file_metadata_ptr->getVersion() == file_metadata_->getVersion());
|
||||
}
|
||||
|
||||
void RemoteCacheController::startBackgroundDownload(std::unique_ptr<ReadBuffer> in_readbuffer_, BackgroundSchedulePool & thread_pool)
|
||||
{
|
||||
data_file_writer = std::make_unique<WriteBufferFromFile>((fs::path(local_path) / "data.bin").string());
|
||||
flush(true);
|
||||
ReadBufferPtr in_readbuffer(in_readbuffer_.release());
|
||||
download_task_holder = thread_pool.createTask("download remote file",
|
||||
[this, in_readbuffer]{ backgroundDownload(in_readbuffer); });
|
||||
download_task_holder->activateAndSchedule();
|
||||
}
|
||||
|
||||
void RemoteCacheController::backgroundDownload(ReadBufferPtr remote_read_buffer)
|
||||
{
|
||||
file_status = DOWNLOADING;
|
||||
size_t before_unflush_bytes = 0;
|
||||
size_t total_bytes = 0;
|
||||
while (!remote_read_buffer->eof())
|
||||
{
|
||||
size_t bytes = remote_read_buffer->available();
|
||||
|
||||
data_file_writer->write(remote_read_buffer->position(), bytes);
|
||||
remote_read_buffer->position() += bytes;
|
||||
total_bytes += bytes;
|
||||
before_unflush_bytes += bytes;
|
||||
if (before_unflush_bytes >= local_cache_bytes_read_before_flush)
|
||||
{
|
||||
std::unique_lock lock(mutex);
|
||||
current_offset += total_bytes;
|
||||
total_bytes = 0;
|
||||
flush();
|
||||
lock.unlock();
|
||||
more_data_signal.notify_all();
|
||||
before_unflush_bytes = 0;
|
||||
}
|
||||
}
|
||||
std::unique_lock lock(mutex);
|
||||
current_offset += total_bytes;
|
||||
file_status = DOWNLOADED;
|
||||
flush(true);
|
||||
data_file_writer.reset();
|
||||
lock.unlock();
|
||||
more_data_signal.notify_all();
|
||||
ExternalDataSourceCache::instance().updateTotalSize(file_metadata_ptr->file_size);
|
||||
LOG_TRACE(log, "Finish download into local path: {}, file metadata:{} ", local_path.string(), file_metadata_ptr->toString());
|
||||
}
|
||||
|
||||
void RemoteCacheController::flush(bool need_flush_status)
|
||||
{
|
||||
if (data_file_writer)
|
||||
{
|
||||
data_file_writer->sync();
|
||||
}
|
||||
if (need_flush_status)
|
||||
{
|
||||
auto file_writer = std::make_unique<WriteBufferFromFile>(local_path / "info.txt");
|
||||
Poco::JSON::Object jobj;
|
||||
jobj.set("file_status", static_cast<Int32>(file_status));
|
||||
jobj.set("metadata_class", metadata_class);
|
||||
std::stringstream buf; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
|
||||
jobj.stringify(buf);
|
||||
file_writer->write(buf.str().c_str(), buf.str().size());
|
||||
file_writer->close();
|
||||
}
|
||||
}
|
||||
|
||||
RemoteCacheController::~RemoteCacheController()
|
||||
{
|
||||
if (download_task_holder)
|
||||
download_task_holder->deactivate();
|
||||
}
|
||||
|
||||
void RemoteCacheController::close()
|
||||
{
|
||||
// delete directory
|
||||
LOG_TRACE(log, "Removing the local cache. local path: {}", local_path.string());
|
||||
std::filesystem::remove_all(local_path);
|
||||
}
|
||||
|
||||
std::unique_ptr<ReadBufferFromFileBase> RemoteCacheController::allocFile()
|
||||
{
|
||||
ReadSettings settings;
|
||||
//settings.local_fs_method = LocalFSReadMethod::read;
|
||||
auto file_buffer = createReadBufferFromFileBase((local_path / "data.bin").string(), settings);
|
||||
|
||||
if (file_buffer)
|
||||
{
|
||||
std::lock_guard lock{mutex};
|
||||
opened_file_buffer_refs.insert(reinterpret_cast<uintptr_t>(file_buffer.get()));
|
||||
}
|
||||
return file_buffer;
|
||||
}
|
||||
|
||||
void RemoteCacheController::deallocFile(std::unique_ptr<ReadBufferFromFileBase> file_buffer)
|
||||
{
|
||||
if (!file_buffer)
|
||||
{
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Try to release a null file buffer for {}", local_path.string());
|
||||
}
|
||||
auto buffer_ref = reinterpret_cast<uintptr_t>(file_buffer.get());
|
||||
std::lock_guard lock{mutex};
|
||||
auto it = opened_file_buffer_refs.find(buffer_ref);
|
||||
if (it == opened_file_buffer_refs.end())
|
||||
{
|
||||
throw Exception(
|
||||
ErrorCodes::BAD_ARGUMENTS,
|
||||
"Try to deallocate file with invalid handler remote path: {}, local path: {}",
|
||||
file_metadata_ptr->remote_path,
|
||||
local_path.string());
|
||||
}
|
||||
opened_file_buffer_refs.erase(it);
|
||||
}
|
||||
|
||||
}
|
114
src/Storages/Cache/RemoteCacheController.h
Normal file
114
src/Storages/Cache/RemoteCacheController.h
Normal file
@ -0,0 +1,114 @@
|
||||
#pragma once
|
||||
#include <mutex>
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <filesystem>
|
||||
#include <Core/BackgroundSchedulePool.h>
|
||||
#include <Common/ErrorCodes.h>
|
||||
#include <Storages/IRemoteFileMetadata.h>
|
||||
#include <IO/ReadBufferFromFileBase.h>
|
||||
#include <IO/WriteBufferFromFile.h>
|
||||
#include <IO/WriteBufferFromFileBase.h>
|
||||
#include <Poco/Logger.h>
|
||||
#include <base/logger_useful.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
class RemoteCacheController
|
||||
{
|
||||
public:
|
||||
enum LocalFileStatus
|
||||
{
|
||||
TO_DOWNLOAD = 0,
|
||||
DOWNLOADING = 1,
|
||||
DOWNLOADED = 2,
|
||||
};
|
||||
|
||||
RemoteCacheController(
|
||||
IRemoteFileMetadataPtr file_metadata_,
|
||||
const std::filesystem::path & local_path_,
|
||||
size_t cache_bytes_before_flush_);
|
||||
~RemoteCacheController();
|
||||
|
||||
// recover from local disk
|
||||
static std::shared_ptr<RemoteCacheController>
|
||||
recover(const std::filesystem::path & local_path);
|
||||
|
||||
/**
|
||||
* Called by LocalCachedFileReader, must be used in pair
|
||||
* The second value of the return tuple is the local_path to store file.
|
||||
*/
|
||||
std::unique_ptr<ReadBufferFromFileBase> allocFile();
|
||||
void deallocFile(std::unique_ptr<ReadBufferFromFileBase> buffer);
|
||||
|
||||
/**
|
||||
* when allocFile be called, count++. deallocFile be called, count--.
|
||||
* the local file could be deleted only count==0
|
||||
*/
|
||||
inline bool closable()
|
||||
{
|
||||
std::lock_guard lock{mutex};
|
||||
return opened_file_buffer_refs.empty();
|
||||
}
|
||||
void close();
|
||||
|
||||
/**
|
||||
* called in LocalCachedFileReader read(), the reading process would be blocked until
|
||||
* enough data be downloaded.
|
||||
* If the file has finished download, the process would unblocked
|
||||
*/
|
||||
ErrorCodes::ErrorCode waitMoreData(size_t start_offset_, size_t end_offset_);
|
||||
|
||||
inline size_t size() const { return current_offset; }
|
||||
|
||||
inline const std::filesystem::path & getLocalPath() { return local_path; }
|
||||
inline String getRemotePath() const { return file_metadata_ptr->remote_path; }
|
||||
|
||||
inline UInt64 getLastModificationTimestamp() const { return file_metadata_ptr->last_modification_timestamp; }
|
||||
bool isModified(IRemoteFileMetadataPtr file_metadata_);
|
||||
inline void markInvalid()
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
valid = false;
|
||||
}
|
||||
inline bool isValid()
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
return valid;
|
||||
}
|
||||
IRemoteFileMetadataPtr getFileMetadata() { return file_metadata_ptr; }
|
||||
inline size_t getFileSize() const { return file_metadata_ptr->file_size; }
|
||||
|
||||
void startBackgroundDownload(std::unique_ptr<ReadBuffer> in_readbuffer_, BackgroundSchedulePool & thread_pool);
|
||||
|
||||
private:
|
||||
// flush file and status information
|
||||
void flush(bool need_flush_status = false);
|
||||
bool loadInnerInformation(const std::filesystem::path & file_path);
|
||||
|
||||
BackgroundSchedulePool::TaskHolder download_task_holder;
|
||||
void backgroundDownload(ReadBufferPtr remote_read_buffer);
|
||||
|
||||
std::mutex mutex;
|
||||
std::condition_variable more_data_signal;
|
||||
|
||||
std::set<uintptr_t> opened_file_buffer_refs; // refer to a buffer address
|
||||
|
||||
String metadata_class;
|
||||
LocalFileStatus file_status = TO_DOWNLOAD; // for tracking download process
|
||||
IRemoteFileMetadataPtr file_metadata_ptr;
|
||||
std::filesystem::path local_path;
|
||||
|
||||
bool valid;
|
||||
size_t local_cache_bytes_read_before_flush;
|
||||
size_t current_offset;
|
||||
|
||||
//std::shared_ptr<ReadBuffer> remote_read_buffer;
|
||||
std::unique_ptr<WriteBufferFromFileBase> data_file_writer;
|
||||
|
||||
Poco::Logger * log = &Poco::Logger::get("RemoteCacheController");
|
||||
};
|
||||
using RemoteCacheControllerPtr = std::shared_ptr<RemoteCacheController>;
|
||||
|
||||
}
|
25
src/Storages/Cache/RemoteFileCachePolicy.h
Normal file
25
src/Storages/Cache/RemoteFileCachePolicy.h
Normal file
@ -0,0 +1,25 @@
|
||||
#pragma once
|
||||
namespace DB
|
||||
{
|
||||
struct RemoteFileCacheWeightFunction
|
||||
{
|
||||
size_t operator()(const RemoteCacheController & cache) const
|
||||
{
|
||||
return cache.getFileSize();
|
||||
}
|
||||
};
|
||||
|
||||
struct RemoteFileCacheEvictPolicy
|
||||
{
|
||||
bool canRelease(std::shared_ptr<RemoteCacheController> cache) const
|
||||
{
|
||||
return (!cache || cache->closable());
|
||||
}
|
||||
void release(std::shared_ptr<RemoteCacheController> cache)
|
||||
{
|
||||
if (cache)
|
||||
cache->close();
|
||||
}
|
||||
};
|
||||
|
||||
}
|
@ -112,7 +112,7 @@ void HiveMetastoreClient::clearTableMetadata(const String & db_name, const Strin
|
||||
HiveTableMetadataPtr metadata = table_metadata_cache.get(cache_key);
|
||||
if (metadata)
|
||||
{
|
||||
if (!table_metadata_cache.tryDel(cache_key))
|
||||
if (!table_metadata_cache.tryRemove(cache_key))
|
||||
{
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Try to clear table metadata failed.");
|
||||
}
|
||||
|
@ -19,7 +19,7 @@
|
||||
#include <Interpreters/ExpressionAnalyzer.h>
|
||||
#include <Interpreters/TreeRewriter.h>
|
||||
#include <IO/ReadBufferFromString.h>
|
||||
#include <Storages/RemoteReadBufferCache.h>
|
||||
#include <Storages/Cache/ExternalDataSourceCache.h>
|
||||
#include <Parsers/ASTExpressionList.h>
|
||||
#include <Parsers/ASTCreateQuery.h>
|
||||
#include <Parsers/ASTFunction.h>
|
||||
@ -160,7 +160,7 @@ public:
|
||||
|
||||
/// Use local cache for remote storage if enabled.
|
||||
std::unique_ptr<ReadBuffer> remote_read_buf;
|
||||
if (RemoteReadBufferCache::instance().isInitialized() && getContext()->getSettingsRef().use_local_cache_for_remote_storage)
|
||||
if (ExternalDataSourceCache::instance().isInitialized() && getContext()->getSettingsRef().use_local_cache_for_remote_storage)
|
||||
remote_read_buf = RemoteReadBuffer::create(getContext(),
|
||||
std::make_shared<StorageHiveMetadata>("Hive", getNameNodeCluster(hdfs_namenode_url), uri_with_path, curr_file->getSize(), curr_file->getLastModTs()),
|
||||
std::move(raw_read_buf));
|
||||
|
@ -38,7 +38,7 @@ bool StorageHiveMetadata::fromString(const String &buf)
|
||||
|
||||
String StorageHiveMetadata::getVersion() const
|
||||
{
|
||||
return std::to_string(getLastModificationTimestamp());
|
||||
return std::to_string(last_modification_timestamp);
|
||||
}
|
||||
|
||||
void registerStorageHiveMetadataCreator()
|
||||
|
@ -10,8 +10,11 @@ public:
|
||||
const String & cluster_,
|
||||
const String & remote_path_,
|
||||
size_t file_size_,
|
||||
UInt64 last_modification_timestamp_):
|
||||
IRemoteFileMetadata(remote_path_, file_size_, last_modification_timestamp_),schema(schema_), cluster(cluster_){}
|
||||
UInt64 last_modification_timestamp_) : schema(schema_), cluster(cluster_){
|
||||
remote_path = remote_path_;
|
||||
file_size = file_size_;
|
||||
last_modification_timestamp = last_modification_timestamp_;
|
||||
}
|
||||
~StorageHiveMetadata() override;
|
||||
|
||||
String getName() const override { return "StorageHiveMetadata"; }
|
||||
@ -19,7 +22,7 @@ public:
|
||||
String getCluster() const { return cluster; }
|
||||
|
||||
String toString() const override;
|
||||
bool fromString(const String &buf) override;
|
||||
bool fromString(const String & buf) override;
|
||||
String getVersion() const override;
|
||||
private:
|
||||
String schema;
|
||||
|
@ -10,21 +10,8 @@ namespace DB
|
||||
class IRemoteFileMetadata
|
||||
{
|
||||
public:
|
||||
IRemoteFileMetadata() = default;
|
||||
IRemoteFileMetadata(const String & remote_path_,
|
||||
size_t file_size_,
|
||||
UInt64 last_modification_timestamp_):
|
||||
remote_path(remote_path_)
|
||||
,file_size(file_size_)
|
||||
,last_modification_timestamp(last_modification_timestamp_)
|
||||
{
|
||||
}
|
||||
virtual ~IRemoteFileMetadata() = default;
|
||||
virtual String getName() const = 0; //class name
|
||||
// methods for basic information
|
||||
inline size_t getFileSize() const { return file_size; }
|
||||
inline String getRemotePath() const { return remote_path; }
|
||||
inline UInt64 getLastModificationTimestamp() const { return last_modification_timestamp; }
|
||||
|
||||
// deserialize
|
||||
virtual bool fromString(const String & buf) = 0;
|
||||
@ -33,7 +20,7 @@ public:
|
||||
|
||||
// used for comparing two file metadatas are the same or not.
|
||||
virtual String getVersion() const = 0;
|
||||
protected:
|
||||
|
||||
String remote_path;
|
||||
size_t file_size = 0;
|
||||
UInt64 last_modification_timestamp = 0;
|
||||
|
@ -1,485 +0,0 @@
|
||||
#include <fstream>
|
||||
#include <memory>
|
||||
#include <unistd.h>
|
||||
#include <functional>
|
||||
#include <Core/BackgroundSchedulePool.h>
|
||||
#include <Poco/Logger.h>
|
||||
#include <Poco/JSON/JSON.h>
|
||||
#include <Poco/JSON/Parser.h>
|
||||
#include <base/logger_useful.h>
|
||||
#include <base/sleep.h>
|
||||
#include <base/errnoToString.h>
|
||||
#include <Common/ProfileEvents.h>
|
||||
#include <Common/SipHash.h>
|
||||
#include <Common/hex.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <Storages/RemoteReadBufferCache.h>
|
||||
#include <Storages/RemoteFileMetadataFactory.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
|
||||
namespace fs = std::filesystem;
|
||||
namespace ProfileEvents
|
||||
{
|
||||
extern const Event ExternalDataSourceLocalCacheReadBytes;
|
||||
}
|
||||
namespace DB
|
||||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
bool RemoteCacheController::loadInnerInformation(const fs::path & file_path)
|
||||
{
|
||||
if (!fs::exists(file_path))
|
||||
return false;
|
||||
std::ifstream info_file(file_path);
|
||||
Poco::JSON::Parser info_parser;
|
||||
auto info_json = info_parser.parse(info_file).extract<Poco::JSON::Object::Ptr>();
|
||||
file_status = static_cast<LocalFileStatus>(info_json->get("file_status").convert<Int32>());
|
||||
metadata_class = info_json->get("metadata_class").convert<String>();
|
||||
info_file.close();
|
||||
return true;
|
||||
}
|
||||
|
||||
std::shared_ptr<RemoteCacheController> RemoteCacheController::recover(const std::filesystem::path & local_path_)
|
||||
{
|
||||
auto * log = &Poco::Logger::get("RemoteCacheController");
|
||||
|
||||
if (!std::filesystem::exists(local_path_ / "data.bin"))
|
||||
{
|
||||
LOG_TRACE(log, "Invalid cached directory:{}", local_path_.string());
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
auto cache_controller = std::make_shared<RemoteCacheController>(nullptr, local_path_, 0);
|
||||
if (!cache_controller->loadInnerInformation(local_path_ / "info.txt")
|
||||
|| cache_controller->file_status != DOWNLOADED)
|
||||
{
|
||||
LOG_INFO(log, "Recover cached file failed. local path:{}", local_path_.string());
|
||||
return nullptr;
|
||||
}
|
||||
try
|
||||
{
|
||||
cache_controller->file_metadata_ptr = RemoteFileMetadataFactory::instance().get(cache_controller->metadata_class);
|
||||
}
|
||||
catch(...)
|
||||
{
|
||||
LOG_ERROR(log, "Get metadata class failed for {}", cache_controller->metadata_class);
|
||||
cache_controller->file_metadata_ptr = nullptr;
|
||||
}
|
||||
if (!cache_controller->file_metadata_ptr)
|
||||
{
|
||||
// do not load this invalid cached file and clear it. the clear action is in
|
||||
// RemoteReadBufferCache::recoverCachedFilesMetadata(), because deleting directories during iteration will
|
||||
// cause unexpected behaviors
|
||||
LOG_ERROR(log, "Cannot create the metadata class : {}. The cached file is invalid and will be remove. path:{}",
|
||||
cache_controller->metadata_class,
|
||||
local_path_.string());
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Invalid metadata class:{}", cache_controller->metadata_class);
|
||||
}
|
||||
std::ifstream metadata_file(local_path_ / "metadata.txt");
|
||||
if (!cache_controller->file_metadata_ptr->fromString(std::string((std::istreambuf_iterator<char>(metadata_file)),
|
||||
std::istreambuf_iterator<char>())))
|
||||
{
|
||||
LOG_ERROR(log, "Cannot load the metadata. The cached file is invalid and will be remove. path:{}",
|
||||
local_path_.string());
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid metadata file({}) for meta class {}", local_path_.string(), cache_controller->metadata_class);
|
||||
}
|
||||
|
||||
cache_controller->current_offset = fs::file_size(local_path_ / "data.bin");
|
||||
|
||||
RemoteReadBufferCache::instance().updateTotalSize(cache_controller->file_metadata_ptr->getFileSize());
|
||||
return cache_controller;
|
||||
}
|
||||
|
||||
RemoteCacheController::RemoteCacheController(
|
||||
IRemoteFileMetadataPtr file_metadata_,
|
||||
const std::filesystem::path & local_path_,
|
||||
size_t cache_bytes_before_flush_)
|
||||
: file_metadata_ptr(file_metadata_)
|
||||
, local_path(local_path_)
|
||||
, valid(true)
|
||||
, local_cache_bytes_read_before_flush(cache_bytes_before_flush_)
|
||||
, current_offset(0)
|
||||
{
|
||||
// on recover, file_metadata_ptr is null, but it will be allocated after loading from metadata.txt
|
||||
// when we allocate a whole new file cache , file_metadata_ptr must not be null.
|
||||
if (file_metadata_ptr)
|
||||
{
|
||||
metadata_class = file_metadata_ptr->getName();
|
||||
auto metadata_file_writer = std::make_unique<WriteBufferFromFile>((local_path_ / "metadata.txt").string());
|
||||
auto str_buf = file_metadata_ptr->toString();
|
||||
metadata_file_writer->write(str_buf.c_str(), str_buf.size());
|
||||
metadata_file_writer->close();
|
||||
}
|
||||
}
|
||||
|
||||
RemoteReadBufferCacheError RemoteCacheController::waitMoreData(size_t start_offset_, size_t end_offset_)
|
||||
{
|
||||
std::unique_lock lock{mutex};
|
||||
if (file_status == DOWNLOADED)
|
||||
{
|
||||
// finish reading
|
||||
if (start_offset_ >= current_offset)
|
||||
{
|
||||
lock.unlock();
|
||||
return RemoteReadBufferCacheError::END_OF_FILE;
|
||||
}
|
||||
}
|
||||
else // block until more data is ready
|
||||
{
|
||||
if (current_offset >= end_offset_)
|
||||
{
|
||||
lock.unlock();
|
||||
return RemoteReadBufferCacheError::OK;
|
||||
}
|
||||
else
|
||||
more_data_signal.wait(lock, [this, end_offset_] { return file_status == DOWNLOADED || current_offset >= end_offset_; });
|
||||
}
|
||||
lock.unlock();
|
||||
return RemoteReadBufferCacheError::OK;
|
||||
}
|
||||
|
||||
bool RemoteCacheController::checkFileChanged(IRemoteFileMetadataPtr file_metadata_)
|
||||
{
|
||||
return !(file_metadata_ptr->getVersion() == file_metadata_->getVersion());
|
||||
}
|
||||
|
||||
void RemoteCacheController::startBackgroundDownload(std::unique_ptr<ReadBuffer> in_readbuffer_, BackgroundSchedulePool & thread_pool)
|
||||
{
|
||||
data_file_writer = std::make_unique<WriteBufferFromFile>((fs::path(local_path) / "data.bin").string());
|
||||
flush(true);
|
||||
ReadBufferPtr in_readbuffer(in_readbuffer_.release());
|
||||
download_task_holder = thread_pool.createTask("download remote file",
|
||||
[this, in_readbuffer]{ backgroundDownload(in_readbuffer); });
|
||||
download_task_holder->activateAndSchedule();
|
||||
}
|
||||
|
||||
void RemoteCacheController::backgroundDownload(ReadBufferPtr remote_read_buffer)
|
||||
{
|
||||
file_status = DOWNLOADING;
|
||||
size_t before_unflush_bytes = 0;
|
||||
size_t total_bytes = 0;
|
||||
while (!remote_read_buffer->eof())
|
||||
{
|
||||
size_t bytes = remote_read_buffer->available();
|
||||
|
||||
data_file_writer->write(remote_read_buffer->position(), bytes);
|
||||
remote_read_buffer->position() += bytes;
|
||||
total_bytes += bytes;
|
||||
before_unflush_bytes += bytes;
|
||||
if (before_unflush_bytes >= local_cache_bytes_read_before_flush)
|
||||
{
|
||||
std::unique_lock lock(mutex);
|
||||
current_offset += total_bytes;
|
||||
total_bytes = 0;
|
||||
flush();
|
||||
lock.unlock();
|
||||
more_data_signal.notify_all();
|
||||
before_unflush_bytes = 0;
|
||||
}
|
||||
}
|
||||
std::unique_lock lock(mutex);
|
||||
current_offset += total_bytes;
|
||||
file_status = DOWNLOADED;
|
||||
flush(true);
|
||||
data_file_writer.reset();
|
||||
lock.unlock();
|
||||
more_data_signal.notify_all();
|
||||
RemoteReadBufferCache::instance().updateTotalSize(file_metadata_ptr->getFileSize());
|
||||
LOG_TRACE(log, "Finish download into local path: {}, file metadata:{} ", local_path.string(), file_metadata_ptr->toString());
|
||||
}
|
||||
|
||||
void RemoteCacheController::flush(bool need_flush_status)
|
||||
{
|
||||
if (data_file_writer)
|
||||
{
|
||||
data_file_writer->sync();
|
||||
}
|
||||
if (need_flush_status)
|
||||
{
|
||||
auto file_writer = std::make_unique<WriteBufferFromFile>(local_path / "info.txt");
|
||||
Poco::JSON::Object jobj;
|
||||
jobj.set("file_status", static_cast<Int32>(file_status));
|
||||
jobj.set("metadata_class", metadata_class);
|
||||
std::stringstream buf; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
|
||||
jobj.stringify(buf);
|
||||
file_writer->write(buf.str().c_str(), buf.str().size());
|
||||
file_writer->close();
|
||||
}
|
||||
}
|
||||
|
||||
RemoteCacheController::~RemoteCacheController()
|
||||
{
|
||||
if (download_task_holder)
|
||||
download_task_holder->deactivate();
|
||||
}
|
||||
|
||||
void RemoteCacheController::close()
|
||||
{
|
||||
// delete directory
|
||||
LOG_TRACE(log, "Removing the local cache. local path: {}", local_path.string());
|
||||
std::filesystem::remove_all(local_path);
|
||||
}
|
||||
|
||||
std::unique_ptr<ReadBufferFromFileBase> RemoteCacheController::allocFile()
|
||||
{
|
||||
ReadSettings settings;
|
||||
//settings.local_fs_method = LocalFSReadMethod::read;
|
||||
auto file_buffer = createReadBufferFromFileBase((local_path / "data.bin").string(), settings);
|
||||
|
||||
if (file_buffer)
|
||||
{
|
||||
std::lock_guard lock{mutex};
|
||||
opened_file_buffer_refs.insert(reinterpret_cast<uintptr_t>(file_buffer.get()));
|
||||
}
|
||||
return file_buffer;
|
||||
}
|
||||
|
||||
void RemoteCacheController::deallocFile(std::unique_ptr<ReadBufferFromFileBase> file_buffer)
|
||||
{
|
||||
if (!file_buffer)
|
||||
{
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Try to release a null file buffer for {}", local_path.string());
|
||||
}
|
||||
auto buffer_ref = reinterpret_cast<uintptr_t>(file_buffer.get());
|
||||
std::lock_guard lock{mutex};
|
||||
auto it = opened_file_buffer_refs.find(buffer_ref);
|
||||
if (it == opened_file_buffer_refs.end())
|
||||
{
|
||||
throw Exception(
|
||||
ErrorCodes::BAD_ARGUMENTS,
|
||||
"Try to deallocate file with invalid handler remote path: {}, local path: {}",
|
||||
file_metadata_ptr->getRemotePath(),
|
||||
local_path.string());
|
||||
}
|
||||
opened_file_buffer_refs.erase(it);
|
||||
}
|
||||
|
||||
RemoteReadBuffer::RemoteReadBuffer(size_t buff_size) : BufferWithOwnMemory<SeekableReadBufferWithSize>(buff_size)
|
||||
{
|
||||
}
|
||||
|
||||
RemoteReadBuffer::~RemoteReadBuffer()
|
||||
{
|
||||
if (file_cache_controller)
|
||||
file_cache_controller->deallocFile(std::move(file_buffer));
|
||||
}
|
||||
|
||||
std::unique_ptr<ReadBuffer> RemoteReadBuffer::create(ContextPtr context, IRemoteFileMetadataPtr remote_file_metadata, std::unique_ptr<ReadBuffer> read_buffer)
|
||||
{
|
||||
auto * log = &Poco::Logger::get("RemoteReadBuffer");
|
||||
size_t buff_size = DBMS_DEFAULT_BUFFER_SIZE;
|
||||
if (read_buffer)
|
||||
buff_size = read_buffer->internalBuffer().size();
|
||||
/*
|
||||
* in the new implement of ReadBufferFromHDFS, buffer size is 0.
|
||||
*
|
||||
* in the common case, we don't read bytes from readbuffer directly, so set buff_size = DBMS_DEFAULT_BUFFER_SIZE
|
||||
* is OK.
|
||||
*
|
||||
* we need be careful with the case without local file reader.
|
||||
*/
|
||||
if (buff_size == 0)
|
||||
buff_size = DBMS_DEFAULT_BUFFER_SIZE;
|
||||
|
||||
auto remote_path = remote_file_metadata->getRemotePath();
|
||||
auto remote_read_buffer = std::make_unique<RemoteReadBuffer>(buff_size);
|
||||
RemoteReadBufferCacheError error;
|
||||
|
||||
std::tie(remote_read_buffer->file_cache_controller, read_buffer, error) = RemoteReadBufferCache::instance().createReader(context, remote_file_metadata, read_buffer);
|
||||
if (remote_read_buffer->file_cache_controller == nullptr)
|
||||
{
|
||||
LOG_ERROR(log, "Failed to allocate local file for remote path: {}, reason: {}.", remote_path, error);
|
||||
// read_buffer is the input one.
|
||||
return read_buffer;
|
||||
}
|
||||
else
|
||||
{
|
||||
remote_read_buffer->file_buffer = remote_read_buffer->file_cache_controller->allocFile();
|
||||
if (!remote_read_buffer->file_buffer)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Create file readbuffer failed. {}",
|
||||
remote_read_buffer->file_cache_controller->getLocalPath().string());
|
||||
}
|
||||
remote_read_buffer->remote_file_size = remote_file_metadata->getFileSize();
|
||||
return remote_read_buffer;
|
||||
}
|
||||
|
||||
bool RemoteReadBuffer::nextImpl()
|
||||
{
|
||||
auto start_offset = file_buffer->getPosition();
|
||||
auto end_offset = start_offset + file_buffer->internalBuffer().size();
|
||||
file_cache_controller->waitMoreData(start_offset, end_offset);
|
||||
|
||||
auto status = file_buffer->next();
|
||||
if (status)
|
||||
{
|
||||
BufferBase::set(file_buffer->buffer().begin(),
|
||||
file_buffer->buffer().size(),
|
||||
file_buffer->offset());
|
||||
ProfileEvents::increment(ProfileEvents::ExternalDataSourceLocalCacheReadBytes, file_buffer->available());
|
||||
}
|
||||
return status;
|
||||
}
|
||||
|
||||
off_t RemoteReadBuffer::seek(off_t offset, int whence)
|
||||
{
|
||||
if (!file_buffer)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot call seek() in this buffer. It's a bug!");
|
||||
/*
|
||||
* Need to wait here. For example, the current file has been download at position X, but here we try to seek to
|
||||
* postition Y (Y > X), it would fail.
|
||||
*/
|
||||
file_cache_controller->waitMoreData(offset, offset + file_buffer->internalBuffer().size());
|
||||
auto ret = file_buffer->seek(offset, whence);
|
||||
BufferBase::set(file_buffer->buffer().begin(),
|
||||
file_buffer->buffer().size(),
|
||||
file_buffer->offset());
|
||||
return ret;
|
||||
}
|
||||
|
||||
off_t RemoteReadBuffer::getPosition()
|
||||
{
|
||||
return file_buffer->getPosition();
|
||||
}
|
||||
|
||||
RemoteReadBufferCache::RemoteReadBufferCache() = default;
|
||||
|
||||
RemoteReadBufferCache::~RemoteReadBufferCache() = default;
|
||||
|
||||
RemoteReadBufferCache & RemoteReadBufferCache::instance()
|
||||
{
|
||||
static RemoteReadBufferCache instance;
|
||||
return instance;
|
||||
}
|
||||
|
||||
void RemoteReadBufferCache::recoverCachedFilesMetadata(
|
||||
const fs::path & current_path,
|
||||
size_t current_depth,
|
||||
size_t max_depth)
|
||||
{
|
||||
if (current_depth >= max_depth)
|
||||
{
|
||||
std::vector<fs::path> invalid_paths;
|
||||
for (auto const & dir : fs::directory_iterator{current_path})
|
||||
{
|
||||
String path = dir.path();
|
||||
auto cache_controller = RemoteCacheController::recover(path);
|
||||
if (!cache_controller)
|
||||
{
|
||||
invalid_paths.emplace_back(path);
|
||||
continue;
|
||||
}
|
||||
if (!lru_caches->set(path, cache_controller))
|
||||
{
|
||||
invalid_paths.emplace_back(path);
|
||||
}
|
||||
}
|
||||
for (auto & path : invalid_paths)
|
||||
{
|
||||
fs::remove_all(path);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
for (auto const & dir : fs::directory_iterator{current_path})
|
||||
{
|
||||
recoverCachedFilesMetadata(dir.path(), current_depth + 1, max_depth);
|
||||
}
|
||||
}
|
||||
|
||||
void RemoteReadBufferCache::recoverTask()
|
||||
{
|
||||
recoverCachedFilesMetadata(root_dir, 1, 2);
|
||||
initialized = true;
|
||||
LOG_INFO(log, "Recovered from directory:{}", root_dir);
|
||||
}
|
||||
|
||||
void RemoteReadBufferCache::initOnce(
|
||||
ContextPtr context,
|
||||
const String & root_dir_, size_t limit_size_, size_t bytes_read_before_flush_)
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
if (isInitialized())
|
||||
{
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot initialize RemoteReadBufferCache twice");
|
||||
}
|
||||
LOG_INFO(
|
||||
log, "Initializing local cache for remote data sources. Local cache root path: {}, cache size limit: {}", root_dir_, limit_size_);
|
||||
root_dir = root_dir_;
|
||||
local_cache_bytes_read_before_flush = bytes_read_before_flush_;
|
||||
lru_caches = std::make_unique<CacheType>(limit_size_);
|
||||
|
||||
/// create if root_dir not exists
|
||||
if (!fs::exists(fs::path(root_dir)))
|
||||
{
|
||||
fs::create_directories(fs::path(root_dir));
|
||||
}
|
||||
|
||||
recover_task_holder = context->getSchedulePool().createTask("recover local cache metadata for remote files", [this]{ recoverTask(); });
|
||||
recover_task_holder->activateAndSchedule();
|
||||
}
|
||||
|
||||
String RemoteReadBufferCache::calculateLocalPath(IRemoteFileMetadataPtr metadata) const
|
||||
{
|
||||
// add version into the full_path, and not block to read the new version
|
||||
String full_path = metadata->getName() + ":" + metadata->getRemotePath()
|
||||
+ ":" + metadata->getVersion();
|
||||
UInt128 hashcode = sipHash128(full_path.c_str(), full_path.size());
|
||||
String hashcode_str = getHexUIntLowercase(hashcode);
|
||||
return fs::path(root_dir) / hashcode_str.substr(0, 3) / hashcode_str;
|
||||
}
|
||||
|
||||
std::tuple<RemoteCacheControllerPtr, std::unique_ptr<ReadBuffer>, RemoteReadBufferCacheError>
|
||||
RemoteReadBufferCache::createReader(ContextPtr context, IRemoteFileMetadataPtr remote_file_metadata, std::unique_ptr<ReadBuffer> & read_buffer)
|
||||
{
|
||||
// If something is wrong on startup, rollback to read from the original ReadBuffer
|
||||
if (!isInitialized())
|
||||
{
|
||||
LOG_ERROR(log, "RemoteReadBufferCache has not been initialized");
|
||||
return {nullptr, std::move(read_buffer), RemoteReadBufferCacheError::NOT_INIT};
|
||||
}
|
||||
|
||||
auto remote_path = remote_file_metadata->getRemotePath();
|
||||
const auto & last_modification_timestamp = remote_file_metadata->getLastModificationTimestamp();
|
||||
auto local_path = calculateLocalPath(remote_file_metadata);
|
||||
std::lock_guard lock(mutex);
|
||||
auto cache = lru_caches->get(local_path);
|
||||
if (cache)
|
||||
{
|
||||
// the remote file has been updated, need to redownload
|
||||
if (!cache->isValid() || cache->checkFileChanged(remote_file_metadata))
|
||||
{
|
||||
LOG_TRACE(
|
||||
log,
|
||||
"Remote file ({}) has been updated. Last saved modification time: {}, actual last modification time: {}",
|
||||
remote_path,
|
||||
std::to_string(cache->getLastModificationTimestamp()),
|
||||
std::to_string(last_modification_timestamp));
|
||||
cache->markInvalid();
|
||||
}
|
||||
else
|
||||
{
|
||||
return {cache, nullptr, RemoteReadBufferCacheError::OK};
|
||||
}
|
||||
}
|
||||
|
||||
if (!fs::exists(local_path))
|
||||
fs::create_directories(local_path);
|
||||
|
||||
// cache is not found or is invalid
|
||||
auto new_cache = std::make_shared<RemoteCacheController>(remote_file_metadata, local_path, local_cache_bytes_read_before_flush);
|
||||
if (!lru_caches->set(local_path, new_cache))
|
||||
{
|
||||
LOG_ERROR(log, "Insert the new cache failed. new file size:{}, current total size:{}",
|
||||
remote_file_metadata->getFileSize(),
|
||||
lru_caches->weight());
|
||||
return {nullptr, std::move(read_buffer), RemoteReadBufferCacheError::DISK_FULL};
|
||||
}
|
||||
new_cache->startBackgroundDownload(std::move(read_buffer), context->getSchedulePool());
|
||||
return {new_cache, nullptr, RemoteReadBufferCacheError::OK};
|
||||
}
|
||||
|
||||
}
|
@ -1,219 +0,0 @@
|
||||
#pragma once
|
||||
#include <mutex>
|
||||
#include <list>
|
||||
#include <set>
|
||||
#include <map>
|
||||
#include <memory>
|
||||
#include <filesystem>
|
||||
#include <Core/BackgroundSchedulePool.h>
|
||||
#include <Poco/Logger.h>
|
||||
#include <Common/LRUCache.h>
|
||||
#include <Common/ThreadPool.h>
|
||||
#include <IO/ReadBuffer.h>
|
||||
#include <IO/BufferWithOwnMemory.h>
|
||||
#include <IO/createReadBufferFromFileBase.h>
|
||||
#include <IO/WriteBufferFromFile.h>
|
||||
#include <IO/WriteBufferFromFileBase.h>
|
||||
#include <IO/ReadBufferFromFileBase.h>
|
||||
#include <IO/ReadSettings.h>
|
||||
#include <IO/SeekableReadBuffer.h>
|
||||
#include <Storages/IRemoteFileMetadata.h>
|
||||
#include <condition_variable>
|
||||
#include <Interpreters/Context.h>
|
||||
#include <boost/core/noncopyable.hpp>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
enum class RemoteReadBufferCacheError : int8_t
|
||||
{
|
||||
OK,
|
||||
NOT_INIT = 10,
|
||||
DISK_FULL = 11,
|
||||
FILE_INVALID = 12,
|
||||
END_OF_FILE = 20,
|
||||
};
|
||||
|
||||
class RemoteCacheController
|
||||
{
|
||||
public:
|
||||
enum LocalFileStatus
|
||||
{
|
||||
TO_DOWNLOAD = 0,
|
||||
DOWNLOADING = 1,
|
||||
DOWNLOADED = 2,
|
||||
};
|
||||
|
||||
RemoteCacheController(
|
||||
IRemoteFileMetadataPtr file_metadata_,
|
||||
const std::filesystem::path & local_path_,
|
||||
size_t cache_bytes_before_flush_);
|
||||
~RemoteCacheController();
|
||||
|
||||
// recover from local disk
|
||||
static std::shared_ptr<RemoteCacheController>
|
||||
recover(const std::filesystem::path & local_path);
|
||||
|
||||
/**
|
||||
* Called by LocalCachedFileReader, must be used in pair
|
||||
* The second value of the return tuple is the local_path to store file.
|
||||
*/
|
||||
std::unique_ptr<ReadBufferFromFileBase> allocFile();
|
||||
void deallocFile(std::unique_ptr<ReadBufferFromFileBase> buffer);
|
||||
|
||||
/**
|
||||
* when allocFile be called, count++. deallocFile be called, count--.
|
||||
* the local file could be deleted only count==0
|
||||
*/
|
||||
inline bool closable()
|
||||
{
|
||||
std::lock_guard lock{mutex};
|
||||
//return opened_file_streams.empty() && remote_read_buffer == nullptr;
|
||||
return opened_file_buffer_refs.empty() && file_status == DOWNLOADED;
|
||||
}
|
||||
void close();
|
||||
|
||||
/**
|
||||
* called in LocalCachedFileReader read(), the reading process would be blocked until
|
||||
* enough data be downloaded.
|
||||
* If the file has finished download, the process would unblocked
|
||||
*/
|
||||
RemoteReadBufferCacheError waitMoreData(size_t start_offset_, size_t end_offset_);
|
||||
|
||||
inline size_t size() const { return current_offset; }
|
||||
|
||||
inline const std::filesystem::path & getLocalPath() { return local_path; }
|
||||
inline String getRemotePath() const { return file_metadata_ptr->getRemotePath(); }
|
||||
|
||||
inline UInt64 getLastModificationTimestamp() const { return file_metadata_ptr->getLastModificationTimestamp(); }
|
||||
bool checkFileChanged(IRemoteFileMetadataPtr file_metadata_);
|
||||
inline void markInvalid()
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
valid = false;
|
||||
}
|
||||
inline bool isValid()
|
||||
{
|
||||
std::lock_guard lock(mutex);
|
||||
return valid;
|
||||
}
|
||||
IRemoteFileMetadataPtr getFileMetadata() { return file_metadata_ptr; }
|
||||
inline size_t getFileSize() const { return file_metadata_ptr->getFileSize(); }
|
||||
|
||||
void startBackgroundDownload(std::unique_ptr<ReadBuffer> in_readbuffer_, BackgroundSchedulePool & thread_pool);
|
||||
|
||||
private:
|
||||
// flush file and status information
|
||||
void flush(bool need_flush_status = false);
|
||||
bool loadInnerInformation(const std::filesystem::path & file_path);
|
||||
|
||||
BackgroundSchedulePool::TaskHolder download_task_holder;
|
||||
void backgroundDownload(ReadBufferPtr remote_read_buffer);
|
||||
|
||||
std::mutex mutex;
|
||||
std::condition_variable more_data_signal;
|
||||
|
||||
std::set<uintptr_t> opened_file_buffer_refs; // refer to a buffer address
|
||||
|
||||
String metadata_class;
|
||||
LocalFileStatus file_status = TO_DOWNLOAD; // for tracking download process
|
||||
IRemoteFileMetadataPtr file_metadata_ptr;
|
||||
std::filesystem::path local_path;
|
||||
|
||||
bool valid;
|
||||
size_t local_cache_bytes_read_before_flush;
|
||||
size_t current_offset;
|
||||
|
||||
//std::shared_ptr<ReadBuffer> remote_read_buffer;
|
||||
std::unique_ptr<WriteBufferFromFileBase> data_file_writer;
|
||||
|
||||
Poco::Logger * log = &Poco::Logger::get("RemoteCacheController");
|
||||
};
|
||||
using RemoteCacheControllerPtr = std::shared_ptr<RemoteCacheController>;
|
||||
|
||||
/*
|
||||
* FIXME:RemoteReadBuffer derive from SeekableReadBufferWithSize may cause some risks, since it's not seekable in some cases
|
||||
* But SeekableReadBuffer is not a interface which make it hard to fixup.
|
||||
*/
|
||||
class RemoteReadBuffer : public BufferWithOwnMemory<SeekableReadBufferWithSize>
|
||||
{
|
||||
public:
|
||||
explicit RemoteReadBuffer(size_t buff_size);
|
||||
~RemoteReadBuffer() override;
|
||||
static std::unique_ptr<ReadBuffer> create(ContextPtr contex, IRemoteFileMetadataPtr remote_file_metadata, std::unique_ptr<ReadBuffer> read_buffer);
|
||||
|
||||
bool nextImpl() override;
|
||||
off_t seek(off_t off, int whence) override;
|
||||
off_t getPosition() override;
|
||||
std::optional<size_t> getTotalSize() override { return remote_file_size; }
|
||||
|
||||
private:
|
||||
std::shared_ptr<RemoteCacheController> file_cache_controller;
|
||||
std::unique_ptr<ReadBufferFromFileBase> file_buffer;
|
||||
size_t remote_file_size = 0;
|
||||
};
|
||||
|
||||
struct RemoteFileCacheWeightFunction
|
||||
{
|
||||
size_t operator()(const RemoteCacheController & cache) const
|
||||
{
|
||||
return cache.getFileSize();
|
||||
}
|
||||
};
|
||||
|
||||
struct RemoteFileCacheEvictPolicy
|
||||
{
|
||||
bool canRelease(RemoteCacheController & cache) const
|
||||
{
|
||||
return cache.closable();
|
||||
}
|
||||
void release(RemoteCacheController & cache)
|
||||
{
|
||||
cache.close();
|
||||
}
|
||||
};
|
||||
|
||||
class RemoteReadBufferCache : private boost::noncopyable
|
||||
{
|
||||
public:
|
||||
using CacheType = LRUCache<String, RemoteCacheController, std::hash<String>,
|
||||
RemoteFileCacheWeightFunction, RemoteFileCacheEvictPolicy>;
|
||||
~RemoteReadBufferCache();
|
||||
// global instance
|
||||
static RemoteReadBufferCache & instance();
|
||||
|
||||
void initOnce(ContextPtr context, const String & root_dir_, size_t limit_size_, size_t bytes_read_before_flush_);
|
||||
|
||||
inline bool isInitialized() const { return initialized; }
|
||||
|
||||
std::tuple<RemoteCacheControllerPtr, std::unique_ptr<ReadBuffer>, RemoteReadBufferCacheError>
|
||||
createReader(ContextPtr context, IRemoteFileMetadataPtr remote_file_metadata, std::unique_ptr<ReadBuffer> & read_buffer);
|
||||
|
||||
void updateTotalSize(size_t size) { total_size += size; }
|
||||
|
||||
protected:
|
||||
RemoteReadBufferCache();
|
||||
|
||||
private:
|
||||
// root directory of local cache for remote filesystem
|
||||
String root_dir;
|
||||
size_t local_cache_bytes_read_before_flush = 0;
|
||||
|
||||
std::atomic<bool> initialized = false;
|
||||
std::atomic<size_t> total_size;
|
||||
std::mutex mutex;
|
||||
std::unique_ptr<CacheType> lru_caches;
|
||||
|
||||
Poco::Logger * log = &Poco::Logger::get("RemoteReadBufferCache");
|
||||
|
||||
String calculateLocalPath(IRemoteFileMetadataPtr meta) const;
|
||||
|
||||
BackgroundSchedulePool::TaskHolder recover_task_holder;
|
||||
void recoverTask();
|
||||
void recoverCachedFilesMetadata(
|
||||
const std::filesystem::path & current_path,
|
||||
size_t current_depth,
|
||||
size_t max_depth);
|
||||
};
|
||||
|
||||
}
|
Loading…
Reference in New Issue
Block a user