ClickHouse/src/IO/WriteBufferFromS3.cpp

#include <Common/config.h>
#if USE_AWS_S3
# include <IO/WriteBufferFromS3.h>
# include <IO/WriteHelpers.h>
# include <Common/MemoryTracker.h>
# include <aws/s3/S3Client.h>
# include <aws/s3/model/CreateMultipartUploadRequest.h>
# include <aws/s3/model/CompleteMultipartUploadRequest.h>
# include <aws/s3/model/PutObjectRequest.h>
# include <aws/s3/model/UploadPartRequest.h>
# include <common/logger_useful.h>
# include <utility>
namespace ProfileEvents
{
extern const Event S3WriteBytes;
}
namespace DB
{
// The S3 protocol does not allow a multipart upload to have more than 10000 parts.
// If the server does not return an error when that number is exceeded, we only print a warning,
// because a custom S3 implementation may relax this requirement.
const int S3_WARN_MAX_PARTS = 10000;
namespace ErrorCodes
{
extern const int S3_ERROR;
}
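
/// Illustrative usage sketch (commentary only, not part of the original source). The bucket, key
/// and size limits below are placeholders, `client` is assumed to be an already configured
/// std::shared_ptr<Aws::S3::S3Client>, and buffer_size is left at whatever default the header declares:
///
///     WriteBufferFromS3 out(client, "some-bucket", "some/key",
///         /* minimum_upload_part_size = */ 16 * 1024 * 1024,
///         /* max_single_part_upload_size = */ 32 * 1024 * 1024,
///         /* object_metadata = */ std::nullopt);
///     writeString(String("payload"), out);
///     out.finalize();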
WriteBufferFromS3::WriteBufferFromS3(
std::shared_ptr<Aws::S3::S3Client> client_ptr_,
const String & bucket_,
const String & key_,
size_t minimum_upload_part_size_,
size_t max_single_part_upload_size_,
std::optional<std::map<String, String>> object_metadata_,
size_t buffer_size_)
: BufferWithOwnMemory<WriteBuffer>(buffer_size_, nullptr, 0)
, bucket(bucket_)
, key(key_)
, object_metadata(std::move(object_metadata_))
, client_ptr(std::move(client_ptr_))
, minimum_upload_part_size(minimum_upload_part_size_)
, max_single_part_upload_size(max_single_part_upload_size_)
{
allocateBuffer();
}
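
/// Flush hook called by WriteBuffer::next(): appends the working buffer to the in-memory staging
/// stream, switches to a multipart upload once the accumulated data exceeds
/// max_single_part_upload_size, and flushes a part once it exceeds minimum_upload_part_size.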
void WriteBufferFromS3::nextImpl()
{
if (!offset())
return;
temporary_buffer->write(working_buffer.begin(), offset());
ProfileEvents::increment(ProfileEvents::S3WriteBytes, offset());
last_part_size += offset();
/// Data size exceeds singlepart upload threshold, need to use multipart upload.
if (multipart_upload_id.empty() && last_part_size > max_single_part_upload_size)
createMultipartUpload();
if (!multipart_upload_id.empty() && last_part_size > minimum_upload_part_size)
{
writePart();
allocateBuffer();
}
}
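
/// Replaces temporary_buffer with a fresh Aws::StringStream and resets last_part_size.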
void WriteBufferFromS3::allocateBuffer()
{
temporary_buffer = Aws::MakeShared<Aws::StringStream>("temporary buffer");
temporary_buffer->exceptions(std::ios::badbit);
last_part_size = 0;
}
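
/// Public entry point for the final flush: memory-tracker exceptions are suppressed for the
/// duration of the flush, and the actual work is delegated to finalizeImpl().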
void WriteBufferFromS3::finalize()
{
/// FIXME move final flush into the caller
MemoryTracker::LockExceptionInThread lock(VariableContext::Global);
finalizeImpl();
}
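
/// Flushes the remaining data exactly once: if the multipart upload was never started, the whole
/// object is sent with a single PutObject request; otherwise the tail is written as the last part
/// and the multipart upload is completed.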
void WriteBufferFromS3::finalizeImpl()
{
if (finalized)
return;
next();
if (multipart_upload_id.empty())
{
makeSinglepartUpload();
}
else
{
/// Write rest of the data as last part.
writePart();
completeMultipartUpload();
}
finalized = true;
}
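
/// Initiates the multipart upload (attaching object metadata, if any) and remembers the returned
/// upload id; throws S3_ERROR if the request fails.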
void WriteBufferFromS3::createMultipartUpload()
{
Aws::S3::Model::CreateMultipartUploadRequest req;
req.SetBucket(bucket);
req.SetKey(key);
if (object_metadata.has_value())
req.SetMetadata(object_metadata.value());
auto outcome = client_ptr->CreateMultipartUpload(req);
if (outcome.IsSuccess())
{
multipart_upload_id = outcome.GetResult().GetUploadId();
LOG_DEBUG(log, "Multipart upload has created. Bucket: {}, Key: {}, Upload id: {}", bucket, key, multipart_upload_id);
}
else
throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
}
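
/// Uploads the current contents of temporary_buffer as the next part and records the returned
/// ETag. Empty buffers are skipped; exceeding S3_WARN_MAX_PARTS only logs a warning, leaving the
/// final decision to the server.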
void WriteBufferFromS3::writePart()
{
auto size = temporary_buffer->tellp();
LOG_DEBUG(log, "Writing part. Bucket: {}, Key: {}, Upload_id: {}, Size: {}", bucket, key, multipart_upload_id, size);
    if (size < 0)
        throw Exception("Failed to write part. Buffer is in an invalid state.", ErrorCodes::S3_ERROR);
if (size == 0)
{
LOG_DEBUG(log, "Skipping writing part. Buffer is empty.");
return;
}
if (part_tags.size() == S3_WARN_MAX_PARTS)
{
        // Don't throw an exception here ourselves; leave the decision to the S3 server.
        LOG_WARNING(log, "Maximum part number in S3 protocol has been reached (too many parts). Server may not accept this whole upload.");
}
Aws::S3::Model::UploadPartRequest req;
req.SetBucket(bucket);
req.SetKey(key);
req.SetPartNumber(part_tags.size() + 1);
req.SetUploadId(multipart_upload_id);
req.SetContentLength(size);
req.SetBody(temporary_buffer);
auto outcome = client_ptr->UploadPart(req);
if (outcome.IsSuccess())
{
auto etag = outcome.GetResult().GetETag();
part_tags.push_back(etag);
LOG_DEBUG(log, "Writing part finished. Bucket: {}, Key: {}, Upload_id: {}, Etag: {}, Parts: {}", bucket, key, multipart_upload_id, etag, part_tags.size());
}
else
throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
}
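
/// Sends the collected part ETags (with 1-based part numbers) to finish the multipart upload;
/// throws S3_ERROR if no parts were uploaded or the request fails.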
void WriteBufferFromS3::completeMultipartUpload()
{
LOG_DEBUG(log, "Completing multipart upload. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, part_tags.size());
if (part_tags.empty())
throw Exception("Failed to complete multipart upload. No parts have uploaded", ErrorCodes::S3_ERROR);
Aws::S3::Model::CompleteMultipartUploadRequest req;
req.SetBucket(bucket);
req.SetKey(key);
req.SetUploadId(multipart_upload_id);
Aws::S3::Model::CompletedMultipartUpload multipart_upload;
for (size_t i = 0; i < part_tags.size(); ++i)
{
Aws::S3::Model::CompletedPart part;
multipart_upload.AddParts(part.WithETag(part_tags[i]).WithPartNumber(i + 1));
}
req.SetMultipartUpload(multipart_upload);
auto outcome = client_ptr->CompleteMultipartUpload(req);
if (outcome.IsSuccess())
LOG_DEBUG(log, "Multipart upload has completed. Bucket: {}, Key: {}, Upload_id: {}, Parts: {}", bucket, key, multipart_upload_id, part_tags.size());
else
throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
}
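
/// Uploads the whole buffered object with a single PutObject request; used when the total size
/// never exceeded max_single_part_upload_size.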
void WriteBufferFromS3::makeSinglepartUpload()
{
auto size = temporary_buffer->tellp();
LOG_DEBUG(log, "Making single part upload. Bucket: {}, Key: {}, Size: {}", bucket, key, size);
    if (size < 0)
        throw Exception("Failed to make single part upload. Buffer is in an invalid state", ErrorCodes::S3_ERROR);
if (size == 0)
{
LOG_DEBUG(log, "Skipping single part upload. Buffer is empty.");
return;
}
Aws::S3::Model::PutObjectRequest req;
req.SetBucket(bucket);
req.SetKey(key);
req.SetContentLength(size);
req.SetBody(temporary_buffer);
if (object_metadata.has_value())
req.SetMetadata(object_metadata.value());
auto outcome = client_ptr->PutObject(req);
if (outcome.IsSuccess())
LOG_DEBUG(log, "Single part upload has completed. Bucket: {}, Key: {}, Object size: {}", bucket, key, req.GetContentLength());
else
throw Exception(outcome.GetError().GetMessage(), ErrorCodes::S3_ERROR);
}
}
#endif