Add function copyS3FileFromDisk().

Commit 101aa6eff0 (parent 69114cb550)
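In short: the native server-side copy logic that used to live in BackupWriterS3::copyFileFromDisk() is extracted into a new free function, copyS3FileFromDisk(). The new function performs a native S3-to-S3 copy via copyS3File() when the source disk is backed by the same S3 endpoint and the file maps to a single blob (getBlobPath() returns a value), and otherwise falls back to streaming the data with copyDataToS3File(). Symmetrically, copyS3FileToDisk() gains a native-copy path for S3-backed destination disks via writeFileUsingBlobWritingFunction(). The integration tests are updated to grep for the new log tags.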
@@ -176,27 +176,9 @@ DataSourceDescription BackupWriterS3::getDataSourceDescription() const
 
 void BackupWriterS3::copyFileFromDisk(DiskPtr src_disk, const String & src_file_name, UInt64 src_offset, UInt64 src_size, const String & dest_file_name)
 {
-    /// copyS3File() can copy to another S3 bucket, but it requires the same S3 URI endpoint.
-    /// We don't check `has_throttling` here (compare with BackupWriterDisk::copyFileFromDisk()) because
-    /// copyS3File() almost doesn't use network so the throttling is not needed.
-    if (getDataSourceDescription() == src_disk->getDataSourceDescription())
-    {
-        /// getBlobPath() can return std::nullopt if the file is stored as multiple objects in S3 bucket.
-        /// In this case we can't use the native copy.
-        if (auto blob_path = src_disk->getBlobPath(src_file_name))
-        {
-            /// Use more optimal way.
-            LOG_TRACE(log, "Copying file {} using native copy", src_file_name);
-            const auto & [src_bucket, src_key] = *blob_path;
-            auto dest_key = fs::path(s3_uri.key) / dest_file_name;
-            copyS3File(client, src_bucket, src_key, src_offset, src_size, s3_uri.bucket, dest_key, request_settings, {},
-                       threadPoolCallbackRunner<void>(BackupsIOThreadPool::get(), "BackupWriterS3"));
-            return;
-        }
-    }
-
-    /// Fallback to BackupWriterS3::copyDataToFile().
-    IBackupWriter::copyFileFromDisk(src_disk, src_file_name, src_offset, src_size, dest_file_name);
+    copyS3FileFromDisk(src_disk, src_file_name, src_offset, src_size,
+                       client, s3_uri.bucket, fs::path(s3_uri.key) / dest_file_name, read_settings, request_settings,
+                       threadPoolCallbackRunner<void>(BackupsIOThreadPool::get(), "BackupWriterS3"));
 }
 
 void BackupWriterS3::copyDataToFile(
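The backup writer now simply delegates to the new free function; the endpoint comparison and the getBlobPath() probe shown in the deleted lines happen inside copyS3FileFromDisk() (see the hunks below).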
@@ -13,7 +13,7 @@ namespace DB
 {
 
 void copyS3FileToDisk(
-    const std::shared_ptr<const S3::Client> & s3_client,
+    const std::shared_ptr<const S3::Client> & src_s3_client,
     const String & src_bucket,
     const String & src_key,
     const std::optional<String> & version_id,
@@ -31,47 +31,85 @@ void copyS3FileToDisk(
         src_offset = 0;
 
     if (!src_size)
-        src_size = S3::getObjectSize(*s3_client, src_bucket, src_key, version_id.value_or(""), request_settings) - *src_offset;
+        src_size = S3::getObjectSize(*src_s3_client, src_bucket, src_key, version_id.value_or(""), request_settings) - *src_offset;
 
     auto destination_data_source_description = destination_disk->getDataSourceDescription();
-    if (destination_data_source_description != DataSourceDescription{DataSourceType::S3, s3_client->getInitialEndpoint(), false, false})
+    if (destination_data_source_description == DataSourceDescription{DataSourceType::S3, src_s3_client->getInitialEndpoint(), false, false})
     {
-        LOG_TRACE(&Poco::Logger::get("copyS3FileToDisk"), "Copying {} to disk {} through buffers", src_key, destination_disk->getName());
-        ReadBufferFromS3 read_buffer{s3_client, src_bucket, src_key, {}, request_settings, read_settings};
-        if (*src_offset)
-            read_buffer.seek(*src_offset, SEEK_SET);
-        auto write_buffer = destination_disk->writeFile(destination_path, std::min<size_t>(*src_size, DBMS_DEFAULT_BUFFER_SIZE), write_mode, write_settings);
-        copyData(read_buffer, *write_buffer, *src_size);
-        write_buffer->finalize();
+        /// Use native copy, the more optimal way.
+        LOG_TRACE(&Poco::Logger::get("copyS3FileToDisk"), "Copying {} to disk {} using native copy", src_key, destination_disk->getName());
+        auto write_blob_function = [&](const std::pair<String, String> & blob_path_, WriteMode write_mode_, const std::optional<ObjectAttributes> & object_attributes_) -> size_t
+        {
+            /// Object storage always uses mode `Rewrite` because it simulates append using metadata and different files.
+            chassert(write_mode_ == WriteMode::Rewrite);
+
+            copyS3File(
+                src_s3_client,
+                src_bucket,
+                src_key,
+                *src_offset,
+                *src_size,
+                /* dest_bucket= */ blob_path_.first,
+                /* dest_key= */ blob_path_.second,
+                request_settings,
+                object_attributes_,
+                scheduler,
+                /* for_disk_s3= */ true);
+
+            return *src_size;
+        };
+
+        destination_disk->writeFileUsingBlobWritingFunction(destination_path, write_mode, write_blob_function);
         return;
     }
 
-    LOG_TRACE(&Poco::Logger::get("copyS3FileToDisk"), "Copying {} to disk {} using native copy", src_key, destination_disk->getName());
-
-    String dest_bucket = destination_disk->getObjectStorage()->getObjectsNamespace();
-
-    auto write_blob_function = [&](const std::pair<String, String> & blob_path_, WriteMode write_mode_, const std::optional<ObjectAttributes> & object_attributes_) -> size_t
-    {
-        /// Object storage always uses mode `Rewrite` because it simulates append using metadata and different files.
-        chassert(write_mode_ == WriteMode::Rewrite);
-        copyS3File(
-            s3_client,
-            src_bucket,
-            src_key,
-            *src_offset,
-            *src_size,
-            /* dest_bucket= */ blob_path_.first,
-            /* dest_key= */ blob_path_.second,
-            request_settings,
-            object_attributes_,
-            scheduler,
-            /* for_disk_s3= */ true);
-
-        return *src_size;
-    };
-
-    destination_disk->writeFileUsingBlobWritingFunction(destination_path, write_mode, write_blob_function);
+    /// Fallback to copy through buffers.
+    LOG_TRACE(&Poco::Logger::get("copyS3FileToDisk"), "Copying {} to disk {} through buffers", src_key, destination_disk->getName());
+    ReadBufferFromS3 read_buffer{src_s3_client, src_bucket, src_key, {}, request_settings, read_settings};
+    if (*src_offset)
+        read_buffer.seek(*src_offset, SEEK_SET);
+    auto write_buffer = destination_disk->writeFile(destination_path, std::min<size_t>(*src_size, DBMS_DEFAULT_BUFFER_SIZE), write_mode, write_settings);
+    copyData(read_buffer, *write_buffer, *src_size);
+    write_buffer->finalize();
+}
+
+void copyS3FileFromDisk(
+    DiskPtr src_disk,
+    const String & src_path,
+    std::optional<size_t> src_offset,
+    std::optional<size_t> src_size,
+    const std::shared_ptr<const S3::Client> & dest_s3_client,
+    const String & dest_bucket,
+    const String & dest_key,
+    const ReadSettings & read_settings,
+    const S3Settings::RequestSettings & request_settings,
+    ThreadPoolCallbackRunner<void> scheduler)
+{
+    if (!src_offset)
+        src_offset = 0;
+
+    if (!src_size)
+        src_size = src_disk->getFileSize(src_path) - *src_offset;
+
+    auto source_data_source_description = src_disk->getDataSourceDescription();
+    if (source_data_source_description == DataSourceDescription{DataSourceType::S3, dest_s3_client->getInitialEndpoint(), false, false})
+    {
+        /// getBlobPath() can return std::nullopt if the file is stored as multiple objects in S3 bucket.
+        /// In this case we can't use native copy.
+        if (auto blob_path = src_disk->getBlobPath(src_path))
+        {
+            /// Use native copy, the more optimal way.
+            LOG_TRACE(&Poco::Logger::get("copyS3FileFromDisk"), "Copying file {} to S3 using native copy", src_path);
+            const auto & [src_bucket, src_key] = *blob_path;
+            copyS3File(dest_s3_client, src_bucket, src_key, *src_offset, *src_size, dest_bucket, dest_key, request_settings, {}, scheduler);
+            return;
+        }
+    }
+
+    /// Fallback to copy through buffers.
+    LOG_TRACE(&Poco::Logger::get("copyS3FileFromDisk"), "Copying {} to S3 through buffers", src_path);
+    auto create_read_buffer = [src_disk, &src_path, &read_settings] { return src_disk->readFile(src_path, read_settings); };
+    copyDataToS3File(create_read_buffer, *src_offset, *src_size, dest_s3_client, dest_bucket, dest_key, request_settings, {}, scheduler);
 }
 
 }
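Note the destination-disk native path above: the disk supplies the target blob (bucket, key) through writeFileUsingBlobWritingFunction(), and the lambda asserts WriteMode::Rewrite because object storage simulates append with metadata and separate files, so only whole-object rewrites reach this code.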
@@ -13,11 +13,11 @@ namespace DB
 {
 
 /// Copies an object from S3 bucket to a disk of any type.
-/// Depending on the disk the function can either do copying though buffers
+/// Depending on the disk the function can either do copying through buffers
 /// (i.e. download the object by portions and then write those portions to the specified disk),
 /// or perform a server-side copy.
 void copyS3FileToDisk(
-    const std::shared_ptr<const S3::Client> & s3_client,
+    const std::shared_ptr<const S3::Client> & src_s3_client,
     const String & src_bucket,
     const String & src_key,
     const std::optional<String> & version_id,
@@ -31,6 +31,22 @@ void copyS3FileToDisk(
     const S3Settings::RequestSettings & request_settings = {},
     ThreadPoolCallbackRunner<void> scheduler = {});
 
+/// Copies an object from a disk of any type to S3 bucket.
+/// Depending on the disk the function can either do copying through buffers
+/// (i.e. read the object by portions and then upload those portions to the specified disk),
+/// or perform a server-side copy.
+void copyS3FileFromDisk(
+    DiskPtr src_disk,
+    const String & src_path,
+    std::optional<size_t> src_offset,
+    std::optional<size_t> src_size,
+    const std::shared_ptr<const S3::Client> & dest_s3_client,
+    const String & dest_bucket,
+    const String & dest_key,
+    const ReadSettings & read_settings = {},
+    const S3Settings::RequestSettings & request_settings = {},
+    ThreadPoolCallbackRunner<void> scheduler = {});
+
 }
 
 #endif
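For orientation, a minimal usage sketch of the new declaration follows. It is not part of the commit: the disk and client are assumed to be configured elsewhere in the ClickHouse codebase, the header path and all names (backupFileToS3, the path, bucket, and key literals) are hypothetical, and the trailing settings/scheduler arguments rely on the defaults declared above.

#include <IO/S3/copyS3FileToDisk.h>  // assumed location of the declarations above

using namespace DB;

/// Hypothetical helper: push one file from a ClickHouse disk into an S3 bucket.
void backupFileToS3(DiskPtr disk, const std::shared_ptr<const S3::Client> & client)
{
    copyS3FileFromDisk(
        disk,
        /* src_path= */ "store/abc/data.bin",  // placeholder path on the disk
        /* src_offset= */ {},                  // nullopt -> function uses offset 0
        /* src_size= */ {},                    // nullopt -> function derives the remaining file size
        client,
        /* dest_bucket= */ "my-backups",       // placeholder bucket
        /* dest_key= */ "backups/data.bin");   // placeholder key
    /// Whether this becomes a native copyS3File() call or a buffered
    /// copyDataToS3File() upload is decided inside copyS3FileFromDisk().
}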
@@ -141,7 +141,7 @@ def test_backup_to_s3_native_copy():
         f"S3('http://minio1:9001/root/data/backups/{backup_name}', 'minio', 'minio123')"
     )
     check_backup_and_restore(storage_policy, backup_destination)
-    assert node.contains_in_log("BackupWriterS3.*using native copy")
+    assert node.contains_in_log("copyS3FileFromDisk.*using native copy")
     assert node.contains_in_log("copyS3FileToDisk.*using native copy")
     assert node.contains_in_log(
         f"copyS3File: Single operation copy has completed. Bucket: root, Key: data/backups/{backup_name}"
@@ -155,7 +155,7 @@ def test_backup_to_s3_native_copy_other_bucket():
         f"S3('http://minio1:9001/root/data/backups/{backup_name}', 'minio', 'minio123')"
     )
     check_backup_and_restore(storage_policy, backup_destination)
-    assert node.contains_in_log("BackupWriterS3.*using native copy")
+    assert node.contains_in_log("copyS3FileFromDisk.*using native copy")
     assert node.contains_in_log("copyS3FileToDisk.*using native copy")
     assert node.contains_in_log(
         f"copyS3File: Single operation copy has completed. Bucket: root, Key: data/backups/{backup_name}"
@@ -167,7 +167,7 @@ def test_backup_to_s3_native_copy_multipart():
     backup_name = new_backup_name()
     backup_destination = f"S3('http://minio1:9001/root/data/backups/multipart/{backup_name}', 'minio', 'minio123')"
     check_backup_and_restore(storage_policy, backup_destination, size=1000000)
-    assert node.contains_in_log("BackupWriterS3.*using native copy")
+    assert node.contains_in_log("copyS3FileFromDisk.*using native copy")
     assert node.contains_in_log("copyS3FileToDisk.*using native copy")
     assert node.contains_in_log(
         f"copyS3File: Multipart upload has completed. Bucket: root, Key: data/backups/multipart/{backup_name}/"
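The test changes are mechanical: the "using native copy" trace message for the disk-to-S3 direction is now logged by copyS3FileFromDisk rather than BackupWriterS3, so all three native-copy tests grep for the new tag.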