Fix validateKey/Bucket for S3

This commit is contained in:
vdimir 2021-08-20 16:12:30 +03:00 committed by Vladimir Chebotarev
parent 295c8d5686
commit 6fe63a80bc
4 changed files with 50 additions and 52 deletions

View File

@ -617,51 +617,55 @@ namespace S3
uri = uri_; uri = uri_;
storage_name = S3; storage_name = S3;
try if (uri.getHost().empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI.");
String name;
String endpoint_authority_from_uri;
if (re2::RE2::FullMatch(uri.getAuthority(), virtual_hosted_style_pattern, &bucket, &name, &endpoint_authority_from_uri))
{ {
if (uri.getHost().empty()) is_virtual_hosted_style = true;
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI."); endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri;
validateBucket(bucket, uri);
String name; if (!uri.getPath().empty())
String endpoint_authority_from_uri;
if (re2::RE2::FullMatch(uri.getAuthority(), virtual_hosted_style_pattern, &bucket, &name, &endpoint_authority_from_uri))
{ {
is_virtual_hosted_style = true; /// Remove leading '/' from path to extract key.
endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri; key = uri.getPath().substr(1);
if (!uri.getPath().empty())
{
/// Remove leading '/' from path to extract key.
key = uri.getPath().substr(1);
}
boost::to_upper(name);
if (name != S3 && name != COS)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {}", quoteString(name));
}
if (name == S3)
{
storage_name = name;
}
else
{
storage_name = COSN;
}
} }
else if (re2::RE2::PartialMatch(uri.getPath(), path_style_pattern, &bucket, &key))
boost::to_upper(name);
if (name != S3 && name != COS)
{ {
is_virtual_hosted_style = false; throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {}", quoteString(name));
endpoint = uri.getScheme() + "://" + uri.getAuthority(); }
if (name == S3)
{
storage_name = name;
} }
else else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI."); {
storage_name = COSN;
}
} }
catch (const Exception & e) else if (re2::RE2::PartialMatch(uri.getPath(), path_style_pattern, &bucket, &key))
{ {
throw Exception(e.code(), "{} ({})", e.message(), uri.toString()); is_virtual_hosted_style = false;
endpoint = uri.getScheme() + "://" + uri.getAuthority();
validateBucket(bucket, uri);
} }
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI.");
}
/// Checks that the length of `bucket` is within the range accepted for an S3 bucket name
/// (at least 3 and at most 63 characters, both bounds inclusive).
///
/// Throws Exception with ErrorCodes::BAD_ARGUMENTS when the length is out of range.
/// The quoted bucket name is always part of the message; if `uri` is not empty,
/// its string form is appended in parentheses so the offending URI is visible to the user.
///
/// NOTE(review): the message text says "virtual hosted style", but the path-style branch
/// of the URI constructor calls this function as well - confirm the wording is intended.
void URI::validateBucket(const String & bucket, const Poco::URI & uri)
{
/// S3 specification requires at least 3 and at most 63 characters in bucket name.
/// https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html
if (bucket.length() < 3 || bucket.length() > 63)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket name length is out of bounds in virtual hosted style S3 URI: {}{}",
/// The second placeholder receives " (<uri>)" only when a URI was supplied, otherwise an empty string.
quoteString(bucket), !uri.empty() ? " (" + uri.toString() + ")" : "");
} }
} }

View File

@ -75,8 +75,7 @@ struct URI
explicit URI(const Poco::URI & uri_); explicit URI(const Poco::URI & uri_);
static void validateBucket(const String & bucket); static void validateBucket(const String & bucket, const Poco::URI & uri);
static void validateKey(const String & bucket);
}; };
} }

View File

@ -68,6 +68,7 @@ namespace DB
namespace ErrorCodes namespace ErrorCodes
{ {
extern const int CANNOT_PARSE_TEXT; extern const int CANNOT_PARSE_TEXT;
extern const int BAD_ARGUMENTS;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int S3_ERROR; extern const int S3_ERROR;
extern const int UNEXPECTED_EXPRESSION; extern const int UNEXPECTED_EXPRESSION;
@ -478,32 +479,26 @@ private:
return it->second; return it->second;
} }
static void validateBucket(const StringRef & str) static void validateBucket(const String & str)
{ {
/// See: S3::URI::validateBucket(str, {});
/// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html
/// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#createbucket
if (str.size < 3 || 222 < str.size) if (!DB::UTF8::isValidUTF8(reinterpret_cast<const UInt8 *>(str.data()), str.size()))
throw Exception(ErrorCodes::CANNOT_PARSE_TEXT,
"Bucket name length is out of bounds in virtual hosted style S3 URI: {}", quoteString(str));
if (!DB::UTF8::isValidUTF8(reinterpret_cast<const UInt8 *>(str.data), str.size))
throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in bucket name"); throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in bucket name");
validatePartitionKey(str, false); validatePartitionKey(str, false);
} }
static void validateKey(const StringRef & str) static void validateKey(const String & str)
{ {
/// See: /// See:
/// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html
/// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject
if (str.size < 1 || 1024 < str.size) if (str.empty() || str.size() > 1024)
throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect key length (min - 2, max - 1023 characters), got: {}", str.size); throw Exception(ErrorCodes::BAD_ARGUMENTS, "Incorrect key length (not empty, max 1023 characters), got: {}", str.size());
if (!DB::UTF8::isValidUTF8(reinterpret_cast<const UInt8 *>(str.data), str.size)) if (!DB::UTF8::isValidUTF8(reinterpret_cast<const UInt8 *>(str.data()), str.size()))
throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key"); throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Incorrect non-UTF8 sequence in key");
validatePartitionKey(str, true); validatePartitionKey(str, true);

View File

@ -4,6 +4,6 @@ INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.cs
INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc\xc3\x28abc'); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc\xc3\x28abc'); -- { serverError CANNOT_PARSE_TEXT }
INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc}{abc'); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc}{abc'); -- { serverError CANNOT_PARSE_TEXT }
INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc*abc'); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/test_{_partition_id}.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'abc*abc'); -- { serverError CANNOT_PARSE_TEXT }
INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/{_partition_id}', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/foo/{_partition_id}', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError BAD_ARGUMENTS }
INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, ''); -- { serverError BAD_ARGUMENTS }
INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'aa/bb'); -- { serverError CANNOT_PARSE_TEXT } INSERT INTO TABLE FUNCTION s3('http://localhost:9001/{_partition_id}/key.csv', 'admin', 'admin', 'CSV', 'id Int32, val String') PARTITION BY val VALUES (1, 'aa/bb'); -- { serverError CANNOT_PARSE_TEXT }