From 6fe63a80bc6addbb063074f665f71f2a7842612c Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 20 Aug 2021 16:12:30 +0300 Subject: [PATCH] Fix validateKey/Bucket for S3 --- src/IO/S3Common.cpp | 74 ++++++++++--------- src/IO/S3Common.h | 3 +- src/Storages/StorageS3.cpp | 21 ++---- .../0_stateless/01944_insert_partition_by.sql | 4 +- 4 files changed, 50 insertions(+), 52 deletions(-) diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 50acd40bc23..ff06d6777e7 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -617,51 +617,55 @@ namespace S3 uri = uri_; storage_name = S3; - try + if (uri.getHost().empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI."); + + String name; + String endpoint_authority_from_uri; + + if (re2::RE2::FullMatch(uri.getAuthority(), virtual_hosted_style_pattern, &bucket, &name, &endpoint_authority_from_uri)) { - if (uri.getHost().empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI."); + is_virtual_hosted_style = true; + endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri; + validateBucket(bucket, uri); - String name; - String endpoint_authority_from_uri; - - if (re2::RE2::FullMatch(uri.getAuthority(), virtual_hosted_style_pattern, &bucket, &name, &endpoint_authority_from_uri)) + if (!uri.getPath().empty()) { - is_virtual_hosted_style = true; - endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri; - - if (!uri.getPath().empty()) - { - /// Remove leading '/' from path to extract key. - key = uri.getPath().substr(1); - } - - boost::to_upper(name); - if (name != S3 && name != COS) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {}", quoteString(name)); - } - if (name == S3) - { - storage_name = name; - } - else - { - storage_name = COSN; - } + /// Remove leading '/' from path to extract key. + key = uri.getPath().substr(1); } - else if (re2::RE2::PartialMatch(uri.getPath(), path_style_pattern, &bucket, &key)) + + boost::to_upper(name); + if (name != S3 && name != COS) { - is_virtual_hosted_style = false; - endpoint = uri.getScheme() + "://" + uri.getAuthority(); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {}", quoteString(name)); + } + if (name == S3) + { + storage_name = name; } else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI."); + { + storage_name = COSN; + } } - catch (const Exception & e) + else if (re2::RE2::PartialMatch(uri.getPath(), path_style_pattern, &bucket, &key)) { - throw Exception(e.code(), "{} ({})", e.message(), uri.toString()); + is_virtual_hosted_style = false; + endpoint = uri.getScheme() + "://" + uri.getAuthority(); + validateBucket(bucket, uri); } + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI."); + } + + void URI::validateBucket(const String & bucket, const Poco::URI & uri) + { + /// S3 specification requires at least 3 and at most 63 characters in bucket name. + /// https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html + if (bucket.length() < 3 || bucket.length() > 63) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket name length is out of bounds in virtual hosted style S3 URI: {}{}", + quoteString(bucket), !uri.empty() ? " (" + uri.toString() + ")" : ""); } } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 613696344a0..20ec982138a 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -75,8 +75,7 @@ struct URI explicit URI(const Poco::URI & uri_); - static void validateBucket(const String & bucket); - static void validateKey(const String & bucket); + static void validateBucket(const String & bucket, const Poco::URI & uri); }; } diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index f4ac256e069..0e84b7a26e7 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -68,6 +68,7 @@ namespace DB namespace ErrorCodes { extern const int CANNOT_PARSE_TEXT; + extern const int BAD_ARGUMENTS; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int S3_ERROR; extern const int UNEXPECTED_EXPRESSION; @@ -478,32 +479,26 @@ private: return it->second; } - static void validateBucket(const StringRef & str) + static void validateBucket(const String & str) { - /// See: - /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html - /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#createbucket + S3::URI::validateBucket(str, {}); - if (str.size < 3 || 222 < str.size) - throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, - "Bucket name length is out of bounds in virtual hosted style S3 URI: {}", quoteString(str)); - - if (!DB::UTF8::isValidUTF8(reinterpret_cast(str.data), str.size)) + if (!DB::UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in bucket name"); validatePartitionKey(str, false); } - static void validateKey(const StringRef & str) + static void validateKey(const String & str) { /// See: /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject - if (str.size < 1 || 1024 < str.size) - throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect key length (min - 2, max - 1023 characters), got: {}", str.size); + if (str.empty() || str.size() > 1024) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect key length (not empty, max 1023 characters), got: {}", str.size()); - if (!DB::UTF8::isValidUTF8(reinterpret_cast(str.data), str.size)) + if (!DB::UTF8::isValidUTF8(reinterpret_cast(str.data()), str.size())) throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key"); validatePartitionKey(str, true); diff --git a/tests/queries/0_stateless/01944_insert_partition_by.sql b/tests/queries/0_stateless/01944_insert_partition_by.sql index 52c7319022f..1c7d96d7762 100644 --- a/tests/queries/0_stateless/01944_insert_partition_by.sql +++ b/tests/queries/0_stateless/01944_insert_partition_by.sql @@ -4,6 +4,6 @@ INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.cs INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc\xc3\x28abc'); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc}{abc'); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc*abc'); -- { serverError CANNOT_PARSE_TEXT } -INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/{_partition_id}', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError CANNOT_PARSE_TEXT } -INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError CANNOT_PARSE_TEXT } +INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/{_partition_id}', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError BAD_ARGUMENTS } +INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError BAD_ARGUMENTS } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'aa/bb'); -- { serverError CANNOT_PARSE_TEXT }