2019-05-31 10:58:43 +00:00
|
|
|
#pragma once
|
|
|
|
|
2019-12-06 14:37:21 +00:00
|
|
|
#include <Common/config.h>
|
|
|
|
|
|
|
|
#if USE_AWS_S3
|
|
|
|
|
2020-01-28 13:05:37 +00:00
|
|
|
# include <atomic>
# include <condition_variable>
# include <functional>
# include <list>
# include <map>
# include <memory>
# include <mutex>
# include <optional>
# include <vector>

# include <base/logger_useful.h>
# include <base/types.h>

# include <IO/BufferWithOwnMemory.h>
# include <IO/WriteBuffer.h>

# include <aws/core/utils/memory/stl/AWSStringStream.h>
|
2019-05-31 10:58:43 +00:00
|
|
|
|
2019-12-11 14:21:48 +00:00
|
|
|
/// Forward declaration of the AWS SDK S3 client.
/// This header only holds the client through std::shared_ptr, so the complete type is not required here.
namespace Aws::S3
{
class S3Client;
}
|
2019-05-31 10:58:43 +00:00
|
|
|
|
2022-02-01 10:36:51 +00:00
|
|
|
/// Forward declarations of the AWS SDK request types.
/// This header only mentions them as reference parameters, so the complete types are not required here.
namespace Aws::S3::Model
{
class UploadPartRequest;
class PutObjectRequest;
}
|
|
|
|
|
2019-05-31 10:58:43 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
2020-12-09 14:09:04 +00:00
|
|
|
|
2022-02-01 08:19:26 +00:00
|
|
|
/// Callback that schedules a job for execution: it receives the job (a callable taking no arguments
/// and returning nothing) and returns nothing itself.
using ScheduleFunc = std::function<void(std::function<void()>)>;
|
|
|
|
|
2020-12-09 14:09:04 +00:00
|
|
|
/**
|
|
|
|
* Buffer to write a data to a S3 object with specified bucket and key.
|
|
|
|
* If data size written to the buffer is less than 'max_single_part_upload_size' write is performed using singlepart upload.
|
|
|
|
* In another case multipart upload is used:
|
|
|
|
* Data is divided on chunks with size greater than 'minimum_upload_part_size'. Last chunk can be less than this threshold.
|
|
|
|
* Each chunk is written as a part to S3.
|
2019-05-31 10:58:43 +00:00
|
|
|
*/
|
2022-02-01 10:36:51 +00:00
|
|
|
class WriteBufferFromS3 final : public BufferWithOwnMemory<WriteBuffer>
|
2019-05-31 10:58:43 +00:00
|
|
|
{
|
|
|
|
public:
|
2020-01-28 13:05:37 +00:00
|
|
|
explicit WriteBufferFromS3(
|
|
|
|
std::shared_ptr<Aws::S3::S3Client> client_ptr_,
|
|
|
|
const String & bucket_,
|
|
|
|
const String & key_,
|
|
|
|
size_t minimum_upload_part_size_,
|
2022-02-08 16:38:04 +00:00
|
|
|
size_t upload_part_size_multiply_factor_,
|
|
|
|
size_t upload_part_size_multiply_threshold_,
|
2020-12-09 14:09:04 +00:00
|
|
|
size_t max_single_part_upload_size_,
|
2020-12-08 18:31:57 +00:00
|
|
|
std::optional<std::map<String, String>> object_metadata_ = std::nullopt,
|
2022-02-01 10:36:51 +00:00
|
|
|
size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE,
|
2022-02-02 17:48:41 +00:00
|
|
|
ScheduleFunc schedule_ = {});
|
2019-05-31 10:58:43 +00:00
|
|
|
|
2021-11-10 22:58:56 +00:00
|
|
|
~WriteBufferFromS3() override;
|
2019-06-17 00:06:14 +00:00
|
|
|
|
2021-11-10 22:58:56 +00:00
|
|
|
void nextImpl() override;
|
2019-06-17 00:06:14 +00:00
|
|
|
|
2022-02-01 10:36:51 +00:00
|
|
|
void preFinalize() override;
|
|
|
|
|
2019-06-17 00:42:47 +00:00
|
|
|
private:
|
2021-03-17 14:20:55 +00:00
|
|
|
void allocateBuffer();
|
|
|
|
|
2020-12-09 14:09:04 +00:00
|
|
|
void createMultipartUpload();
|
|
|
|
void writePart();
|
|
|
|
void completeMultipartUpload();
|
|
|
|
|
|
|
|
void makeSinglepartUpload();
|
2020-09-30 13:24:36 +00:00
|
|
|
|
2021-11-10 22:58:56 +00:00
|
|
|
/// Receives response from the server after sending all data.
|
|
|
|
void finalizeImpl() override;
|
|
|
|
|
2022-02-01 10:36:51 +00:00
|
|
|
struct UploadPartTask;
|
|
|
|
void fillUploadRequest(Aws::S3::Model::UploadPartRequest & req, int part_number);
|
|
|
|
void processUploadRequest(UploadPartTask & task);
|
|
|
|
|
|
|
|
struct PutObjectTask;
|
|
|
|
void fillPutRequest(Aws::S3::Model::PutObjectRequest & req);
|
|
|
|
void processPutRequest(PutObjectTask & task);
|
|
|
|
|
|
|
|
void waitForReadyBackGroundTasks();
|
|
|
|
void waitForAllBackGroundTasks();
|
|
|
|
|
2021-11-10 22:58:56 +00:00
|
|
|
String bucket;
|
|
|
|
String key;
|
|
|
|
std::optional<std::map<String, String>> object_metadata;
|
|
|
|
std::shared_ptr<Aws::S3::S3Client> client_ptr;
|
2022-02-08 16:38:04 +00:00
|
|
|
size_t upload_part_size;
|
|
|
|
const size_t upload_part_size_multiply_factor;
|
|
|
|
const size_t upload_part_size_multiply_threshold;
|
|
|
|
const size_t max_single_part_upload_size;
|
2021-11-10 22:58:56 +00:00
|
|
|
/// Buffer to accumulate data.
|
|
|
|
std::shared_ptr<Aws::StringStream> temporary_buffer;
|
2022-02-08 16:38:04 +00:00
|
|
|
size_t last_part_size = 0;
|
|
|
|
std::atomic<size_t> total_parts_uploaded = 0;
|
2021-11-10 22:58:56 +00:00
|
|
|
|
|
|
|
/// Upload in S3 is made in parts.
|
|
|
|
/// We initiate upload, then upload each part and get ETag as a response, and then finalizeImpl() upload with listing all our parts.
|
|
|
|
String multipart_upload_id;
|
|
|
|
std::vector<String> part_tags;
|
|
|
|
|
2022-02-01 10:36:51 +00:00
|
|
|
bool is_prefinalized = false;
|
|
|
|
|
|
|
|
/// Following fields are for background uploads in thread pool (if specified).
|
2022-02-01 08:19:26 +00:00
|
|
|
/// We use std::function to avoid dependency of Interpreters
|
|
|
|
ScheduleFunc schedule;
|
2022-02-01 10:36:51 +00:00
|
|
|
std::unique_ptr<PutObjectTask> put_object_task;
|
|
|
|
std::list<UploadPartTask> upload_object_tasks;
|
|
|
|
size_t num_added_bg_tasks = 0;
|
|
|
|
size_t num_finished_bg_tasks = 0;
|
|
|
|
std::mutex bg_tasks_mutex;
|
|
|
|
std::condition_variable bg_tasks_condvar;
|
|
|
|
|
2021-11-10 22:58:56 +00:00
|
|
|
Poco::Logger * log = &Poco::Logger::get("WriteBufferFromS3");
|
2019-05-31 10:58:43 +00:00
|
|
|
};
|
|
|
|
|
|
|
|
}
|
2019-12-06 14:37:21 +00:00
|
|
|
|
2019-12-09 12:36:06 +00:00
|
|
|
#endif
|