From 9a520879898ec6deb026722ea6cec0194f383e6d Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 16 Jan 2023 20:14:39 +0100 Subject: [PATCH] More complex logic: GetObjectAttributes requests will be used only if the endpoint is "*.amazonaws.com", otherwise HeadObject requests will be used. --- src/IO/S3Common.cpp | 115 ++++++++++++++++++++++++++++++++++---------- src/IO/S3Common.h | 11 +---- 2 files changed, 91 insertions(+), 35 deletions(-) diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 297ab46017a..a18fcf70566 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -29,6 +29,7 @@ # include # include # include +# include # include # include @@ -42,9 +43,11 @@ namespace ProfileEvents { extern const Event S3GetObjectAttributes; - extern const Event DiskS3GetObjectAttributes; extern const Event S3GetObjectMetadata; + extern const Event S3HeadObject; + extern const Event DiskS3GetObjectAttributes; extern const Event DiskS3GetObjectMetadata; + extern const Event DiskS3HeadObject; } namespace DB @@ -702,26 +705,90 @@ public: } }; -/// Performs a GetObjectAttributes request. -Aws::S3::Model::GetObjectAttributesOutcome getObjectAttributes( +/// Extracts the endpoint from a constructed S3 client. +String getEndpoint(const Aws::S3::S3Client & client) +{ + const auto * endpoint_provider = dynamic_cast(const_cast(client).accessEndpointProvider().get()); + if (!endpoint_provider) + return {}; + String endpoint; + endpoint_provider->GetBuiltInParameters().GetParameter("Endpoint").GetString(endpoint); + return endpoint; +} + +/// Performs a request to get the size and last modification time of an object. +/// The function performs either HeadObject or GetObjectAttributes request depending on the endpoint. +std::pair, Aws::S3::S3Error> tryGetObjectInfo( const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3) { - ProfileEvents::increment(ProfileEvents::S3GetObjectAttributes); - if (for_disk_s3) - ProfileEvents::increment(ProfileEvents::DiskS3GetObjectAttributes); + auto endpoint = getEndpoint(client); + bool use_get_object_attributes_request = (endpoint.find(".amazonaws.com") != String::npos); - /// We must not use the `HeadObject` request, see the comment about `HeadObjectRequest` in S3Common.h. + if (use_get_object_attributes_request) + { + /// It's better not to use `HeadObject` requests for AWS S3 because they don't work well with the global region. + /// Details: `HeadObject` request never returns a response body (even if there is an error) however + /// if the request was sent without specifying a region in the endpoint (i.e. for example "https://test.s3.amazonaws.com/mydata.csv" + /// instead of "https://test.s3-us-west-2.amazonaws.com/mydata.csv") then that response body is one of the main ways + /// to determine the correct region and try to repeat the request again with the correct region. + /// For any other request type (`GetObject`, `ListObjects`, etc.) AWS SDK does that because they have response bodies, + /// but for `HeadObject` there is no response body so this way doesn't work. That's why we use `GetObjectAttributes` request instead. + /// See https://github.com/aws/aws-sdk-cpp/issues/1558 and also the function S3ErrorMarshaller::ExtractRegion() for more information. - Aws::S3::Model::GetObjectAttributesRequest req; - req.SetBucket(bucket); - req.SetKey(key); + ProfileEvents::increment(ProfileEvents::S3GetObjectAttributes); + if (for_disk_s3) + ProfileEvents::increment(ProfileEvents::DiskS3GetObjectAttributes); - if (!version_id.empty()) - req.SetVersionId(version_id); + Aws::S3::Model::GetObjectAttributesRequest req; + req.SetBucket(bucket); + req.SetKey(key); - req.SetObjectAttributes({Aws::S3::Model::ObjectAttributes::ObjectSize}); + if (!version_id.empty()) + req.SetVersionId(version_id); - return client.GetObjectAttributes(req); + req.SetObjectAttributes({Aws::S3::Model::ObjectAttributes::ObjectSize}); + + auto outcome = client.GetObjectAttributes(req); + if (outcome.IsSuccess()) + { + const auto & result = outcome.GetResult(); + DB::S3::ObjectInfo object_info; + object_info.size = static_cast(result.GetObjectSize()); + object_info.last_modification_time = result.GetLastModified().Millis() / 1000; + return {object_info, {}}; + } + + return {std::nullopt, outcome.GetError()}; + } + else + { + /// By default we use `HeadObject` requests. + /// We cannot just use `GetObjectAttributes` requests always because some S3 providers (e.g. Minio) + /// don't support `GetObjectAttributes` requests. + + ProfileEvents::increment(ProfileEvents::S3HeadObject); + if (for_disk_s3) + ProfileEvents::increment(ProfileEvents::DiskS3HeadObject); + + Aws::S3::Model::HeadObjectRequest req; + req.SetBucket(bucket); + req.SetKey(key); + + if (!version_id.empty()) + req.SetVersionId(version_id); + + auto outcome = client.HeadObject(req); + if (outcome.IsSuccess()) + { + const auto & result = outcome.GetResult(); + DB::S3::ObjectInfo object_info; + object_info.size = static_cast(result.GetContentLength()); + object_info.last_modification_time = result.GetLastModified().Millis() / 1000; + return {object_info, {}}; + } + + return {std::nullopt, outcome.GetError()}; + } } } @@ -919,17 +986,15 @@ namespace S3 return error == Aws::S3::S3Errors::RESOURCE_NOT_FOUND || error == Aws::S3::S3Errors::NO_SUCH_KEY; } - S3::ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, bool throw_on_error) + ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, bool throw_on_error) { - auto outcome = getObjectAttributes(client, bucket, key, version_id, for_disk_s3); - if (outcome.IsSuccess()) + auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, for_disk_s3); + if (object_info) { - const auto & result = outcome.GetResult(); - return {.size = static_cast(result.GetObjectSize()), .last_modification_time = result.GetLastModified().Millis() / 1000}; + return *object_info; } else if (throw_on_error) { - const auto & error = outcome.GetError(); throw DB::Exception(ErrorCodes::S3_ERROR, "Failed to get object attributes: {}. HTTP response code: {}", error.GetMessage(), static_cast(error.GetResponseCode())); @@ -944,11 +1009,10 @@ namespace S3 bool objectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3) { - auto outcome = getObjectAttributes(client, bucket, key, version_id, for_disk_s3); - if (outcome.IsSuccess()) + auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, for_disk_s3); + if (object_info) return true; - const auto & error = outcome.GetError(); if (isNotFoundError(error.GetErrorType())) return false; @@ -959,10 +1023,9 @@ namespace S3 void checkObjectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, std::string_view description) { - auto outcome = getObjectAttributes(client, bucket, key, version_id, for_disk_s3); - if (outcome.IsSuccess()) + auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, for_disk_s3); + if (object_info) return; - const auto & error = outcome.GetError(); throw S3Exception(error.GetErrorType(), "{}Object {} in bucket {} suddenly disappeared: {}", (description.empty() ? "" : (String(description) + ": ")), key, bucket, error.GetMessage()); } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 473ed3f4bc3..69ae1cbb4f4 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -122,14 +122,7 @@ struct URI }; /// WARNING: Don't use `HeadObjectRequest`! Use the functions below instead. -/// Explanation: The `HeadObject` request never returns a response body (even if there is an error) however -/// if the request was sent without specifying a region in the endpoint (i.e. for example "https://test.s3.amazonaws.com/mydata.csv" -/// instead of "https://test.s3-us-west-2.amazonaws.com/mydata.csv") then that response body is one of the main ways -/// to determine the correct region and try to repeat the request again with the correct region. -/// For any other request type (`GetObject`, `ListObjects`, etc.) AWS SDK does that because they have response bodies, -/// but for `HeadObject` there is no response body so this way doesn't work. -/// That's why it's better to avoid using `HeadObject` requests at all. -/// See https://github.com/aws/aws-sdk-cpp/issues/1558 and also the function S3ErrorMarshaller::ExtractRegion() for more details. +/// For explanation see the comment about `HeadObject` request in the function tryGetObjectInfo(). struct ObjectInfo { @@ -137,7 +130,7 @@ struct ObjectInfo time_t last_modification_time = 0; }; -S3::ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, bool throw_on_error = true); +ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, bool throw_on_error = true); size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, bool throw_on_error = true);