Improved validation of S3 buckets and keys.

This commit is contained in:
Vladimir Chebotarev 2021-07-25 21:46:31 +03:00
parent 6e10d28c78
commit 0479edd47e
4 changed files with 125 additions and 66 deletions

View File

@ -1,3 +1,5 @@
#include <Functions/isValidUTF8.h>
#include <DataTypes/DataTypeString.h> #include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringOrArrayToT.h> #include <Functions/FunctionStringOrArrayToT.h>
@ -10,15 +12,17 @@
# include <tmmintrin.h> # include <tmmintrin.h>
#endif #endif
namespace DB namespace DB
{ {
namespace ErrorCodes
{ namespace ErrorCodes
extern const int ILLEGAL_TYPE_OF_ARGUMENT; {
} extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
/// inspired by https://github.com/cyb70289/utf8/ /// inspired by https://github.com/cyb70289/utf8/
struct ValidUTF8Impl
{
/* /*
MIT License MIT License
@ -71,7 +75,7 @@ SOFTWARE.
* +--------------------+------------+-------------+------------+-------------+ * +--------------------+------------+-------------+------------+-------------+
*/ */
static inline UInt8 isValidUTF8Naive(const UInt8 * data, UInt64 len) inline UInt8 ValidUTF8Impl::isValidUTF8Naive(const UInt8 * data, UInt64 len)
{ {
while (len) while (len)
{ {
@ -139,9 +143,9 @@ SOFTWARE.
} }
#ifndef __SSE4_1__ #ifndef __SSE4_1__
/// Build without SSE4.1: the generic entry point simply forwards to the scalar checker isValidUTF8Naive.
static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) { return isValidUTF8Naive(data, len); } inline UInt8 ValidUTF8Impl::isValidUTF8(const UInt8 * data, UInt64 len) { return isValidUTF8Naive(data, len); }
#else #else
static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) inline UInt8 ValidUTF8Impl::isValidUTF8(const UInt8 * data, UInt64 len)
{ {
/* /*
* Map high nibble of "First Byte" to legal character length minus 1 * Map high nibble of "First Byte" to legal character length minus 1
@ -291,9 +295,7 @@ SOFTWARE.
} }
#endif #endif
static constexpr bool is_fixed_to_constant = false; void ValidUTF8Impl::vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt8> & res)
static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt8> & res)
{ {
size_t size = offsets.size(); size_t size = offsets.size();
size_t prev_offset = 0; size_t prev_offset = 0;
@ -304,35 +306,34 @@ SOFTWARE.
} }
} }
/// Deliberately empty: all three arguments are ignored and res is left untouched.
/// is_fixed_to_constant is false for this implementation, so presumably this overload is never used — confirm against the caller.
static void vectorFixedToConstant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt8 & /*res*/) {} void ValidUTF8Impl::vectorFixedToConstant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt8 & /*res*/) {}
/// Treats data as data.size() / n consecutive chunks of n bytes each and stores, for chunk i,
/// the result of isValidUTF8 on that chunk into res[i].
/// NOTE(review): n == 0 would divide by zero and res is indexed without being resized here —
/// presumably the caller guarantees n > 0 and a pre-sized res; confirm.
static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt8> & res) void ValidUTF8Impl::vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt8> & res)
{ {
size_t size = data.size() / n; size_t size = data.size() / n;
for (size_t i = 0; i < size; ++i) for (size_t i = 0; i < size; ++i)
res[i] = isValidUTF8(data.data() + i * n, n); res[i] = isValidUTF8(data.data() + i * n, n);
} }
/// Array arguments are not supported: never returns, always throws ILLEGAL_TYPE_OF_ARGUMENT.
[[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<UInt8> &) [[noreturn]] void ValidUTF8Impl::array(const ColumnString::Offsets &, PaddedPODArray<UInt8> &)
{ {
throw Exception("Cannot apply function isValidUTF8 to Array argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); throw Exception("Cannot apply function isValidUTF8 to Array argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
} }
/// UUID arguments are not supported: never returns, always throws ILLEGAL_TYPE_OF_ARGUMENT.
[[noreturn]] static void uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray<UInt8> &) [[noreturn]] void ValidUTF8Impl::uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray<UInt8> &)
{ {
throw Exception("Cannot apply function isValidUTF8 to UUID argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); throw Exception("Cannot apply function isValidUTF8 to UUID argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
} }
};
/// Name tag: carries the function name "isValidUTF8" as a compile-time constant;
/// used as a template argument of FunctionStringOrArrayToT.
struct NameIsValidUTF8 struct NameIsValidUTF8
{ {
static constexpr auto name = "isValidUTF8"; static constexpr auto name = "isValidUTF8";
}; };
using FunctionValidUTF8 = FunctionStringOrArrayToT<ValidUTF8Impl, NameIsValidUTF8, UInt8>; using FunctionValidUTF8 = FunctionStringOrArrayToT<ValidUTF8Impl, NameIsValidUTF8, UInt8>;
/// Registers FunctionValidUTF8 in the given function factory.
void registerFunctionIsValidUTF8(FunctionFactory & factory) void registerFunctionIsValidUTF8(FunctionFactory & factory)
{ {
factory.registerFunction<FunctionValidUTF8>(); factory.registerFunction<FunctionValidUTF8>();
} }
} }

View File

@ -0,0 +1,26 @@
#pragma once
#include <Core/Types.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
namespace DB
{
/// UTF-8 validation routines. The struct is used as the implementation argument of
/// FunctionStringOrArrayToT for the isValidUTF8 function, and is declared in a header
/// so that other code can include it.
struct ValidUTF8Impl
{
/// Scalar validity check of the len bytes starting at data.
/// NOTE(review): this and isValidUTF8 are declared `inline` here, but no definition is present in this header;
/// the definitions are written out of line in the .cpp. A translation unit that includes only this header
/// and calls them would have no definition available — confirm before calling from other files,
/// and consider dropping `inline` on both the declarations and the definitions together.
static inline UInt8 isValidUTF8Naive(const UInt8 * data, UInt64 len);
/// Validity check of the len bytes starting at data; without SSE4.1 it forwards to isValidUTF8Naive.
static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len);
/// Compile-time flag; false for this implementation.
static constexpr bool is_fixed_to_constant = false;
/// Fills res from the strings described by data and offsets.
static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt8> & res);
/// Defined as an empty function: arguments are ignored and res is not written.
static void vectorFixedToConstant(const ColumnString::Chars & data, size_t n, UInt8 & res);
/// Checks each consecutive n-byte chunk of data and writes one result per chunk into res.
static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt8> & res);
/// Not supported for arrays: always throws ILLEGAL_TYPE_OF_ARGUMENT.
[[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<UInt8> &);
/// Not supported for UUIDs: always throws ILLEGAL_TYPE_OF_ARGUMENT.
[[noreturn]] static void uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray<UInt8> &);
};
}

View File

@ -2,9 +2,11 @@
#if USE_AWS_S3 #if USE_AWS_S3
# include <IO/S3Common.h>
# include <Common/quoteString.h> # include <Common/quoteString.h>
# include <IO/S3Common.h> # include <Functions/isValidUTF8.h>
# include <IO/WriteBufferFromString.h> # include <IO/WriteBufferFromString.h>
# include <Storages/StorageS3Settings.h> # include <Storages/StorageS3Settings.h>
@ -616,55 +618,82 @@ namespace S3
uri = uri_; uri = uri_;
storage_name = S3; storage_name = S3;
if (uri.getHost().empty()) try
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI: {}", uri.toString());
String name;
String endpoint_authority_from_uri;
if (re2::RE2::FullMatch(uri.getAuthority(), virtual_hosted_style_pattern, &bucket, &name, &endpoint_authority_from_uri))
{ {
is_virtual_hosted_style = true; if (uri.getHost().empty())
endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri; throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI.");
/// S3 specification requires at least 3 and at most 63 characters in bucket name. String name;
/// https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html String endpoint_authority_from_uri;
if (bucket.length() < 3 || bucket.length() > 63)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Bucket name length is out of bounds in virtual hosted style S3 URI: {} ({})", quoteString(bucket), uri.toString());
if (!uri.getPath().empty()) if (re2::RE2::FullMatch(uri.getAuthority(), virtual_hosted_style_pattern, &bucket, &name, &endpoint_authority_from_uri))
{ {
/// Remove leading '/' from path to extract key. is_virtual_hosted_style = true;
key = uri.getPath().substr(1); endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri;
if (!uri.getPath().empty())
{
/// Remove leading '/' from path to extract key.
key = uri.getPath().substr(1);
}
boost::to_upper(name);
if (name != S3 && name != COS)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {}", quoteString(name));
}
if (name == S3)
{
storage_name = name;
}
else
{
storage_name = COSN;
}
} }
else if (re2::RE2::PartialMatch(uri.getPath(), path_style_pattern, &bucket, &key))
boost::to_upper(name);
if (name != S3 && name != COS)
{ {
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {} ({})", quoteString(name), uri.toString()); is_virtual_hosted_style = false;
} endpoint = uri.getScheme() + "://" + uri.getAuthority();
if (name == S3)
{
storage_name = name;
} }
else else
{ throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI.");
storage_name = COSN;
}
}
else if (re2::RE2::PartialMatch(uri.getPath(), path_style_pattern, &bucket, &key))
{
is_virtual_hosted_style = false;
endpoint = uri.getScheme() + "://" + uri.getAuthority();
/// S3 specification requires at least 3 and at most 63 characters in bucket name. validateBucket(bucket);
/// https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html validateKey(key);
if (bucket.length() < 3 || bucket.length() > 63)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket name length is out of bounds in virtual hosted style S3 URI: {} ({})", quoteString(bucket), uri.toString());
} }
else catch(const Exception & e)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI: {}", uri.toString()); {
throw Exception(e.code(), "{} ({})", e.message(), uri.toString());
}
}
/// Checks that a bucket name extracted from an S3 URI has an acceptable length.
/// The URI constructor calls this after both the virtual hosted style and the path style branch,
/// so the error message intentionally does not name a particular URI style.
/// Throws Exception with BAD_ARGUMENTS when bucket.length() is outside [3, 222].
void URI::validateBucket(const String & bucket)
{
    /// See:
    /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html
    /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#createbucket
    static constexpr size_t min_bucket_length = 3;
    static constexpr size_t max_bucket_length = 222;

    const size_t length = bucket.length();
    if (length < min_bucket_length || length > max_bucket_length)
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Bucket name length is out of bounds in S3 URI: {}", quoteString(bucket));
}
/// Placeholder for object key validation. Currently a no-op: the parameter is unnamed and every check
/// below is commented out, so any key (including an empty one) is accepted.
/// The commented-out checks differ between AWS S3 and IBM COS, which is presumably why none is enforced — confirm.
void URI::validateKey(const String & /*key*/)
{
/// See:
/// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html
/// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject
/// The following is valid for AWS S3:
/// if (key.length() > 1024)
/// throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key name is too long (showing first 1024 characters): {}", quoteString(key.substr(0, 1024) + "..."));
/// if (!ValidUTF8Impl::isValidUTF8(key.data(), key.size()))
/// throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key name must be valid UTF-8 string: {}", quoteString(key));
/// NOTE(review): before enabling the UTF-8 check above, note that isValidUTF8 is declared to take `const UInt8 *`,
/// so key.data() would likely need a cast, and that isValidUTF8 is declared `inline` in its header
/// without a definition there — confirm it is callable from this file.
/// The following is valid for IBM COS:
/// if (key.length() < 1)
/// throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key name is too short (0 bytes long)");
} }
} }

View File

@ -74,6 +74,9 @@ struct URI
bool is_virtual_hosted_style; bool is_virtual_hosted_style;
explicit URI(const Poco::URI & uri_); explicit URI(const Poco::URI & uri_);
/// Validates a bucket name taken from an S3 URI; throws BAD_ARGUMENTS when its length is out of bounds.
static void validateBucket(const String & bucket);
/// Validates an object key taken from an S3 URI (the definition currently performs no checks).
static void validateKey(const String & key);
}; };
} }