Unite readBuffers

This commit is contained in:
kssenii 2021-05-20 06:38:18 +00:00
parent 9a5a6f39b7
commit 550a4e33f9
6 changed files with 197 additions and 238 deletions

View File

@ -2,9 +2,9 @@
#include <Storages/HDFS/ReadBufferFromHDFS.h>
#include <Storages/HDFS/WriteBufferFromHDFS.h>
#include "ReadIndirectBufferFromHDFS.h"
#include "WriteIndirectBufferFromHDFS.h"
#include <IO/SeekAvoidingReadBuffer.h>
#include <IO/ReadIndirectBufferFromRemoteFS.h>
#include <Common/checkStackSize.h>
#include <Common/quoteString.h>
#include <common/logger_useful.h>
@ -22,6 +22,36 @@ namespace ErrorCodes
extern const int UNKNOWN_FORMAT;
}
/// Reads data from HDFS using stored paths in metadata.
///
/// Seeking and chunk iteration are inherited from ReadIndirectBufferFromRemoteFS;
/// this class only supplies createReadBuffer(), i.e. how to open one remote object.
class ReadIndirectBufferFromHDFS final : public ReadIndirectBufferFromRemoteFS<ReadBufferFromHDFS>
{
public:
    /// config_    - forwarded to every ReadBufferFromHDFS that gets created. Stored by
    ///              reference, so it has to outlive this object.
    /// hdfs_uri_  - full URI of the disk root; split below into the server part and
    ///              the directory part.
    /// metadata_  - description of the remote objects; ownership is handed to the base class.
    /// buf_size_  - buffer size passed to every ReadBufferFromHDFS.
    ReadIndirectBufferFromHDFS(
        const Poco::Util::AbstractConfiguration & config_,
        const String & hdfs_uri_,
        DiskHDFS::Metadata metadata_,
        size_t buf_size_)
        /// metadata_ is a by-value sink parameter: move it on instead of copying it again.
        : ReadIndirectBufferFromRemoteFS<ReadBufferFromHDFS>(std::move(metadata_))
        , config(config_)
        , buf_size(buf_size_)
    {
        /// Split at the first '/' that follows the "//" of the scheme:
        /// everything before it is kept in hdfs_uri, the rest in hdfs_directory.
        /// NOTE(review): a URI without such a '/' makes substr() throw std::out_of_range
        /// - presumably the URI is validated by the caller; confirm.
        const size_t begin_of_path = hdfs_uri_.find('/', hdfs_uri_.find("//") + 2);
        hdfs_directory = hdfs_uri_.substr(begin_of_path);
        hdfs_uri = hdfs_uri_.substr(0, begin_of_path);
    }

    /// Opens the remote object `path`, addressed relative to the disk directory.
    std::unique_ptr<ReadBufferFromHDFS> createReadBuffer(const String & path) override
    {
        return std::make_unique<ReadBufferFromHDFS>(hdfs_uri, hdfs_directory + path, config, buf_size);
    }

private:
    const Poco::Util::AbstractConfiguration & config;
    String hdfs_uri;        /// Part of the URI before the directory path.
    String hdfs_directory;  /// Directory path, prepended to every object path.
    size_t buf_size;
};
DiskHDFS::DiskHDFS(
const String & disk_name_,

View File

@ -1,142 +0,0 @@
#pragma once
#include <IO/ReadBufferFromFile.h>
#include <Disks/IDiskRemote.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_SEEK_THROUGH_FILE;
}
/// Reads data from HDFS using stored paths in metadata.
/// The objects listed in metadata.remote_fs_objects are read one after another, so
/// together they are presented as a single seekable stream.
class ReadIndirectBufferFromHDFS final : public ReadBufferFromFileBase
{
public:
/// config_ is stored by reference; metadata_ is moved into this object.
/// hdfs_uri_ is split into the part before the directory path and the directory path itself.
ReadIndirectBufferFromHDFS(
const Poco::Util::AbstractConfiguration & config_,
const String & hdfs_uri_,
DiskHDFS::Metadata metadata_,
size_t buf_size_)
: config(config_)
, metadata(std::move(metadata_))
, buf_size(buf_size_)
{
/// Split at the first '/' that follows the "//" of the scheme.
const size_t begin_of_path = hdfs_uri_.find('/', hdfs_uri_.find("//") + 2);
hdfs_directory = hdfs_uri_.substr(begin_of_path);
hdfs_uri = hdfs_uri_.substr(0, begin_of_path);
}
/// Changes the read position. Only SEEK_SET and SEEK_CUR are accepted, any other
/// whence throws CANNOT_SEEK_THROUGH_FILE. A target inside the current working
/// buffer only moves pos; otherwise a new underlying buffer is opened via initialize().
off_t seek(off_t offset_, int whence) override
{
if (whence == SEEK_CUR)
{
/// If position within current working buffer - shift pos.
/// NOTE(review): only the upper bound is checked here, a large negative offset_
/// is not compared against the beginning of the working buffer.
if (working_buffer.size() && size_t(getPosition() + offset_) < absolute_position)
{
pos += offset_;
return getPosition();
}
else
{
/// NOTE(review): offset_ is added to absolute_position (the end of the working
/// buffer), not to getPosition() - confirm this is intended for SEEK_CUR.
absolute_position += offset_;
}
}
else if (whence == SEEK_SET)
{
/// If position within current working buffer - shift pos.
if (working_buffer.size() && size_t(offset_) >= absolute_position - working_buffer.size()
&& size_t(offset_) < absolute_position)
{
pos = working_buffer.end() - (absolute_position - offset_);
return getPosition();
}
else
{
absolute_position = offset_;
}
}
else
throw Exception("Only SEEK_SET or SEEK_CUR modes are allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
/// Open the object that contains the new position and mark the working buffer as consumed.
current_buf = initialize();
pos = working_buffer.end();
return absolute_position;
}
/// absolute_position is the offset of the end of the working buffer, so the bytes
/// still available in it are subtracted.
off_t getPosition() override { return absolute_position - available(); }
std::string getFileName() const override { return metadata.metadata_file_path; }
private:
/// Finds the object that contains absolute_position, opens it and seeks to the
/// offset inside that object. Returns nullptr if no object contains the position.
/// Updates current_buf_idx while iterating.
std::unique_ptr<ReadBufferFromHDFS> initialize()
{
size_t offset = absolute_position;
for (size_t i = 0; i < metadata.remote_fs_objects.size(); ++i)
{
current_buf_idx = i;
const auto & [file_name, size] = metadata.remote_fs_objects[i];
if (size > offset)
{
auto buf = std::make_unique<ReadBufferFromHDFS>(hdfs_uri, hdfs_directory + file_name, config, buf_size);
buf->seek(offset, SEEK_SET);
return buf;
}
/// The position is past this object: make the offset relative to the next one.
offset -= size;
}
return nullptr;
}
/// Exposes the next chunk of data as working_buffer and advances absolute_position
/// by its size. Returns false when there is no object left to read.
bool nextImpl() override
{
/// Find first available buffer that fits to given offset.
if (!current_buf)
current_buf = initialize();
/// If current buffer has remaining data - use it.
if (current_buf && current_buf->next())
{
working_buffer = current_buf->buffer();
absolute_position += working_buffer.size();
return true;
}
/// If there is no available buffers - nothing to read.
if (current_buf_idx + 1 >= metadata.remote_fs_objects.size())
return false;
/// Current object is exhausted: switch to the next one.
++current_buf_idx;
const auto & file_name = metadata.remote_fs_objects[current_buf_idx].first;
current_buf = std::make_unique<ReadBufferFromHDFS>(hdfs_uri, hdfs_directory + file_name, config, buf_size);
/// NOTE(review): the result of next() is ignored, true is returned even if the
/// new object yielded no data.
current_buf->next();
working_buffer = current_buf->buffer();
absolute_position += working_buffer.size();
return true;
}
const Poco::Util::AbstractConfiguration & config;
/// Part of the URI before the directory path.
String hdfs_uri;
/// Directory path, prepended to every object name.
String hdfs_directory;
DiskHDFS::Metadata metadata;
size_t buf_size;
/// Offset (in the whole stream) of the end of the working buffer.
size_t absolute_position = 0;
/// Index in metadata.remote_fs_objects of the object behind current_buf.
size_t current_buf_idx = 0;
std::unique_ptr<ReadBufferFromHDFS> current_buf;
};
}

View File

@ -7,8 +7,8 @@
#include <utility>
#include <IO/ReadBufferFromString.h>
#include <Interpreters/Context.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/ReadBufferFromS3.h>
#include <IO/ReadIndirectBufferFromRemoteFS.h>
#include <IO/ReadHelpers.h>
#include <IO/SeekAvoidingReadBuffer.h>
#include <IO/WriteBufferFromFile.h>
@ -69,7 +69,7 @@ void throwIfError(const Aws::Utils::Outcome<Result, Error> & response)
}
/// Reads data from S3 using stored paths in metadata.
class ReadIndirectBufferFromS3 final : public ReadBufferFromFileBase
class ReadIndirectBufferFromS3 final : public ReadIndirectBufferFromRemoteFS<ReadBufferFromS3>
{
public:
ReadIndirectBufferFromS3(
@ -78,112 +78,24 @@ public:
DiskS3::Metadata metadata_,
size_t s3_max_single_read_retries_,
size_t buf_size_)
: client_ptr(std::move(client_ptr_))
: ReadIndirectBufferFromRemoteFS<ReadBufferFromS3>(metadata_)
, client_ptr(std::move(client_ptr_))
, bucket(bucket_)
, metadata(std::move(metadata_))
, s3_max_single_read_retries(s3_max_single_read_retries_)
, buf_size(buf_size_)
{
}
off_t seek(off_t offset_, int whence) override
std::unique_ptr<ReadBufferFromS3> createReadBuffer(const String & path) override
{
if (whence == SEEK_CUR)
{
/// If position within current working buffer - shift pos.
if (!working_buffer.empty() && size_t(getPosition() + offset_) < absolute_position)
{
pos += offset_;
return getPosition();
}
else
{
absolute_position += offset_;
}
}
else if (whence == SEEK_SET)
{
/// If position within current working buffer - shift pos.
if (!working_buffer.empty() && size_t(offset_) >= absolute_position - working_buffer.size()
&& size_t(offset_) < absolute_position)
{
pos = working_buffer.end() - (absolute_position - offset_);
return getPosition();
}
else
{
absolute_position = offset_;
}
}
else
throw Exception("Only SEEK_SET or SEEK_CUR modes are allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);
current_buf = initialize();
pos = working_buffer.end();
return absolute_position;
return std::make_unique<ReadBufferFromS3>(client_ptr, bucket, metadata.remote_fs_root_path + path, s3_max_single_read_retries, buf_size);
}
off_t getPosition() override { return absolute_position - available(); }
std::string getFileName() const override { return metadata.metadata_file_path; }
private:
std::unique_ptr<ReadBufferFromS3> initialize()
{
size_t offset = absolute_position;
for (size_t i = 0; i < metadata.remote_fs_objects.size(); ++i)
{
current_buf_idx = i;
const auto & [path, size] = metadata.remote_fs_objects[i];
if (size > offset)
{
auto buf = std::make_unique<ReadBufferFromS3>(client_ptr, bucket, metadata.remote_fs_root_path + path, s3_max_single_read_retries, buf_size);
buf->seek(offset, SEEK_SET);
return buf;
}
offset -= size;
}
return nullptr;
}
bool nextImpl() override
{
/// Find first available buffer that fits to given offset.
if (!current_buf)
current_buf = initialize();
/// If current buffer has remaining data - use it.
if (current_buf && current_buf->next())
{
working_buffer = current_buf->buffer();
absolute_position += working_buffer.size();
return true;
}
/// If there is no available buffers - nothing to read.
if (current_buf_idx + 1 >= metadata.remote_fs_objects.size())
return false;
++current_buf_idx;
const auto & path = metadata.remote_fs_objects[current_buf_idx].first;
current_buf = std::make_unique<ReadBufferFromS3>(client_ptr, bucket, metadata.remote_fs_root_path + path, s3_max_single_read_retries, buf_size);
current_buf->next();
working_buffer = current_buf->buffer();
absolute_position += working_buffer.size();
return true;
}
std::shared_ptr<Aws::S3::S3Client> client_ptr;
const String & bucket;
DiskS3::Metadata metadata;
size_t s3_max_single_read_retries;
size_t buf_size;
size_t absolute_position = 0;
size_t current_buf_idx = 0;
std::unique_ptr<ReadBufferFromS3> current_buf;
};
/// Stores data in S3 and adds the object key (S3 path) and object size to metadata file on local FS.

View File

@ -4,7 +4,7 @@
#include "gtest_disk.h"
/// To enable tests set to 1. It is set to 0, because there is no HDFS instance in CI.
#define RUN_HDFS_TEST 0
#define RUN_HDFS_TEST 1
#if RUN_HDFS_TEST

View File

@ -0,0 +1,120 @@
#include "ReadIndirectBufferFromRemoteFS.h"
#include <IO/ReadBufferFromS3.h>
#include <Storages/HDFS/ReadBufferFromHDFS.h>
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_SEEK_THROUGH_FILE;
}
/// Stores the metadata that lists the remote objects forming the file.
/// No remote object is opened here.
template <typename RemoteReadBuffer>
ReadIndirectBufferFromRemoteFS<RemoteReadBuffer>::ReadIndirectBufferFromRemoteFS(IDiskRemote::Metadata metadata_)
    : metadata(std::move(metadata_))
{
}
/// Moves the read position of the whole (multi-object) stream.
///
/// offset_ - target offset: absolute for SEEK_SET, relative to getPosition() for SEEK_CUR.
/// whence  - SEEK_SET or SEEK_CUR; anything else throws CANNOT_SEEK_THROUGH_FILE.
/// Returns the new position.
///
/// A target inside the current working buffer only moves `pos`. Any other target
/// drops the working buffer and opens the remote object containing the target.
template<typename T>
off_t ReadIndirectBufferFromRemoteFS<T>::seek(off_t offset_, int whence)
{
    /// Resolve the request to an absolute offset first, so that both modes share
    /// the same bounds checks. SEEK_CUR is relative to the current read position,
    /// not to absolute_position (which is the end of the working buffer).
    size_t new_position;
    if (whence == SEEK_CUR)
        new_position = static_cast<size_t>(getPosition() + offset_);
    else if (whence == SEEK_SET)
        new_position = static_cast<size_t>(offset_);
    else
        throw Exception("Only SEEK_SET or SEEK_CUR modes are allowed.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE);

    /// The working buffer holds [absolute_position - working_buffer.size(), absolute_position).
    /// Both bounds are checked; the lower one is written as an addition to avoid unsigned underflow.
    if (!working_buffer.empty()
        && new_position + working_buffer.size() >= absolute_position
        && new_position < absolute_position)
    {
        pos = working_buffer.end() - (absolute_position - new_position);
        return getPosition();
    }

    /// Target is outside of the buffered range. A negative target has wrapped to a very
    /// large value at this point, so initialize() will not find an object for it.
    absolute_position = new_position;
    current_buf = initialize();

    /// The working buffer pointed into the memory of the previous current_buf and
    /// describes data before the jump. Make it empty, so that a following seek() cannot
    /// treat it as valid data for the new position.
    working_buffer.resize(0);
    pos = working_buffer.end();

    return absolute_position;
}
/// Opens the remote object that contains absolute_position and seeks inside it
/// to the matching offset. Returns nullptr if the position is not inside any object.
/// current_buf_idx is left at the index of the last object that was examined.
template <typename T>
std::unique_ptr<T> ReadIndirectBufferFromRemoteFS<T>::initialize()
{
    const auto & objects = metadata.remote_fs_objects;

    /// Offset still to be skipped, relative to the beginning of the examined object.
    size_t remaining = absolute_position;

    for (size_t idx = 0; idx != objects.size(); ++idx)
    {
        current_buf_idx = idx;
        const auto & object = objects[idx];

        /// Position lies behind this object: skip it completely.
        if (!(object.second > remaining))
        {
            remaining -= object.second;
            continue;
        }

        auto reader = createReadBuffer(object.first);
        reader->seek(remaining, SEEK_SET);
        return reader;
    }

    return nullptr;
}
/// Exposes the next chunk of data as working_buffer and advances absolute_position
/// by the size of that chunk.
/// Returns true only if the new working buffer was filled by a successful next() of the
/// underlying buffer; returns false when all remote objects are exhausted.
template<typename T>
bool ReadIndirectBufferFromRemoteFS<T>::nextImpl()
{
    /// Find first available buffer that fits to given offset.
    if (!current_buf)
        current_buf = initialize();

    /// If current buffer has remaining data - use it.
    if (current_buf && current_buf->next())
    {
        working_buffer = current_buf->buffer();
        absolute_position += working_buffer.size();
        return true;
    }

    /// Current object is exhausted: move on to the following objects. An object that
    /// yields no data is skipped instead of being reported as a successful read.
    while (current_buf_idx + 1 < metadata.remote_fs_objects.size())
    {
        ++current_buf_idx;
        const auto & path = metadata.remote_fs_objects[current_buf_idx].first;
        current_buf = createReadBuffer(path);

        if (current_buf->next())
        {
            working_buffer = current_buf->buffer();
            absolute_position += working_buffer.size();
            return true;
        }
    }

    /// If there is no available buffers - nothing to read.
    return false;
}
/// The member functions of the template are defined in this source file, so the
/// template is explicitly instantiated here for the S3 and HDFS read buffers.
template
class ReadIndirectBufferFromRemoteFS<ReadBufferFromS3>;
template
class ReadIndirectBufferFromRemoteFS<ReadBufferFromHDFS>;
}

View File

@ -0,0 +1,39 @@
#include <IO/ReadBufferFromFile.h>
#include <Disks/IDiskRemote.h>
#include <utility>
namespace DB
{
/// Reads data from S3/HDFS using stored paths in metadata.
///
/// The remote objects listed in the metadata are read one after another and presented
/// as a single seekable stream. T is the read buffer type used for one remote object;
/// derived classes implement createReadBuffer() to open such a buffer.
template <typename T>
class ReadIndirectBufferFromRemoteFS : public ReadBufferFromFileBase
{
public:
    /// explicit: metadata must not be converted into a read buffer implicitly.
    explicit ReadIndirectBufferFromRemoteFS(IDiskRemote::Metadata metadata_);

    /// Supports SEEK_SET and SEEK_CUR only.
    off_t seek(off_t offset_, int whence) override;

    /// absolute_position is the offset of the end of the working buffer,
    /// so the bytes still available in it are subtracted.
    off_t getPosition() override { return absolute_position - available(); }

    String getFileName() const override { return metadata.metadata_file_path; }

    /// Opens a read buffer for one remote object. `path` is taken from
    /// metadata.remote_fs_objects as is.
    virtual std::unique_ptr<T> createReadBuffer(const String & path) = 0;

protected:
    IDiskRemote::Metadata metadata;

private:
    /// Opens the object containing absolute_position, or returns nullptr if there is none.
    std::unique_ptr<T> initialize();

    bool nextImpl() override;

    /// Offset (in the whole stream) of the end of the working buffer.
    size_t absolute_position = 0;

    /// Index in metadata.remote_fs_objects of the object behind current_buf.
    size_t current_buf_idx = 0;

    std::unique_ptr<T> current_buf;
};
}