ClickHouse/src/IO/WriteBufferFromS3.h

127 lines
3.8 KiB
C++
Raw Normal View History

2019-05-31 10:58:43 +00:00
#pragma once
2019-12-06 14:37:21 +00:00
#include <Common/config.h>
#if USE_AWS_S3
#include <memory>
#include <vector>
#include <list>
2022-05-14 12:26:04 +00:00
#include <base/types.h>
#include <Common/logger_useful.h>
#include <Common/ThreadPool.h>
#include <IO/BufferWithOwnMemory.h>
#include <IO/WriteBuffer.h>
#include <IO/WriteSettings.h>
2022-04-03 22:33:59 +00:00
#include <Storages/StorageS3Settings.h>
#include <aws/core/utils/memory/stl/AWSStringStream.h>
2019-05-31 10:58:43 +00:00
2022-05-14 12:26:04 +00:00
2019-12-11 14:21:48 +00:00
/// Forward declaration of the AWS SDK S3 client.
/// This header only refers to the client through std::shared_ptr, so an incomplete type is sufficient here.
namespace Aws::S3
{
class S3Client;
}
2019-05-31 10:58:43 +00:00
/// Forward declarations of the AWS SDK request types that this header
/// only mentions by reference in member function signatures.
namespace Aws::S3::Model
{
class PutObjectRequest;   /// Taken by reference in fillPutRequest().
class UploadPartRequest;  /// Taken by reference in fillUploadRequest().
}
2019-05-31 10:58:43 +00:00
namespace DB
{
2022-04-07 16:46:46 +00:00
/// Forward declaration only; the type is not otherwise referenced in the visible part of this header.
class WriteBufferFromFile;

/// Callable that is handed a task (a void() callable).
/// NOTE(review): presumably the callee runs the task asynchronously, e.g. on a thread pool - confirm at call sites.
using ScheduleFunc = std::function<void(std::function<void()>)>;
/**
 * Buffer to write data to an S3 object with the specified bucket and key.
 * If the size of the data written to the buffer is less than 'max_single_part_upload_size',
 * the write is performed using a singlepart upload.
 * Otherwise a multipart upload is used:
 * Data is divided into chunks with size greater than 'minimum_upload_part_size'.
 * The last chunk can be smaller than this threshold.
 * Each chunk is written as a part to S3.
 */
class WriteBufferFromS3 final : public BufferWithOwnMemory<WriteBuffer>
{
public:
    /// client_ptr_      - S3 client used for the upload.
    /// bucket_, key_    - destination object.
    /// s3_settings_     - read/write settings for S3
    ///                    (presumably the source of the thresholds named in the class comment - confirm in the .cpp).
    /// object_metadata_ - optional key/value metadata for the object.
    /// buffer_size_     - size passed on for the working buffer; defaults to DBMS_DEFAULT_BUFFER_SIZE.
    /// schedule_        - optional scheduler for background uploads; empty by default.
    /// write_settings_  - generic write settings; default-constructed if omitted.
    WriteBufferFromS3(
        std::shared_ptr<const Aws::S3::S3Client> client_ptr_,
        const String & bucket_,
        const String & key_,
        const S3Settings::ReadWriteSettings & s3_settings_,
        std::optional<std::map<String, String>> object_metadata_ = std::nullopt,
        size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE,
        ScheduleFunc schedule_ = {},
        const WriteSettings & write_settings_ = {});

    ~WriteBufferFromS3() override;

    void nextImpl() override;

    void preFinalize() override;

private:
    void allocateBuffer();

    /// Steps of the upload; the definitions are not part of this header.
    void createMultipartUpload();
    void writePart();
    void completeMultipartUpload();

    void makeSinglepartUpload();

    /// Receives response from the server after sending all data.
    void finalizeImpl() override;

    /// Task types are defined outside of this header.
    struct UploadPartTask;
    void fillUploadRequest(Aws::S3::Model::UploadPartRequest & req, int part_number);
    void processUploadRequest(UploadPartTask & task);

    struct PutObjectTask;
    void fillPutRequest(Aws::S3::Model::PutObjectRequest & req);
    void processPutRequest(const PutObjectTask & task);

    void waitForReadyBackGroundTasks();
    void waitForAllBackGroundTasks();
    /// Variant that takes a std::unique_lock supplied by the caller.
    void waitForAllBackGroundTasksUnlocked(std::unique_lock<std::mutex> & bg_tasks_lock);

    const String bucket;
    const String key;
    const S3Settings::ReadWriteSettings s3_settings;
    const std::shared_ptr<const Aws::S3::S3Client> client_ptr;
    const std::optional<std::map<String, String>> object_metadata;

    size_t upload_part_size = 0;
    std::shared_ptr<Aws::StringStream> temporary_buffer; /// Buffer to accumulate data.
    size_t last_part_size = 0;
    std::atomic<size_t> total_parts_uploaded = 0;

    /// Upload in S3 is made in parts.
    /// We initiate upload, then upload each part and get ETag as a response, and then finalizeImpl() upload with listing all our parts.
    String multipart_upload_id;
    std::vector<String> TSA_GUARDED_BY(bg_tasks_mutex) part_tags;

    bool is_prefinalized = false;

    /// Following fields are for background uploads in thread pool (if specified).
    /// We use std::function to avoid dependency of Interpreters
    const ScheduleFunc schedule;

    std::unique_ptr<PutObjectTask> put_object_task; /// Does not need protection by mutex because of the logic around is_finished field.
    std::list<UploadPartTask> TSA_GUARDED_BY(bg_tasks_mutex) upload_object_tasks;
    size_t num_added_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0;
    size_t num_finished_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0;

    std::mutex bg_tasks_mutex;
    std::condition_variable bg_tasks_condvar;

    Poco::Logger * log = &Poco::Logger::get("WriteBufferFromS3");

    WriteSettings write_settings;
};
}
2019-12-06 14:37:21 +00:00
#endif