Improved validation of S3 buckets and keys.

This commit is contained in:
Vladimir Chebotarev 2021-07-25 21:46:31 +03:00
parent 6e10d28c78
commit 0479edd47e
4 changed files with 125 additions and 66 deletions

View File

@ -1,3 +1,5 @@
#include <Functions/isValidUTF8.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionStringOrArrayToT.h>
@ -10,15 +12,17 @@
# include <tmmintrin.h>
#endif
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
/// inspired by https://github.com/cyb70289/utf8/
struct ValidUTF8Impl
{
/*
MIT License
@ -71,7 +75,7 @@ SOFTWARE.
* +--------------------+------------+-------------+------------+-------------+
*/
static inline UInt8 isValidUTF8Naive(const UInt8 * data, UInt64 len)
inline UInt8 ValidUTF8Impl::isValidUTF8Naive(const UInt8 * data, UInt64 len)
{
while (len)
{
@ -139,9 +143,9 @@ SOFTWARE.
}
#ifndef __SSE4_1__
static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) { return isValidUTF8Naive(data, len); }
/// Variant compiled when __SSE4_1__ is not defined: validation simply delegates to the scalar isValidUTF8Naive.
inline UInt8 ValidUTF8Impl::isValidUTF8(const UInt8 * data, UInt64 len) { return isValidUTF8Naive(data, len); }
#else
static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len)
inline UInt8 ValidUTF8Impl::isValidUTF8(const UInt8 * data, UInt64 len)
{
/*
* Map high nibble of "First Byte" to legal character length minus 1
@ -291,9 +295,7 @@ SOFTWARE.
}
#endif
static constexpr bool is_fixed_to_constant = false;
static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt8> & res)
void ValidUTF8Impl::vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt8> & res)
{
size_t size = offsets.size();
size_t prev_offset = 0;
@ -304,35 +306,34 @@ SOFTWARE.
}
}
static void vectorFixedToConstant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt8 & /*res*/) {}
/// Deliberately empty: all three parameters are unnamed and nothing is written to the result.
/// NOTE(review): presumably never invoked because is_fixed_to_constant is false - confirm against FunctionStringOrArrayToT.
void ValidUTF8Impl::vectorFixedToConstant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt8 & /*res*/) {}
static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt8> & res)
/// Validates fixed-width values stored back to back in `data`, `n` bytes per value.
/// The number of values is data.size() / n (so `n` must be non-zero); the verdict for
/// value number k is stored in res[k]. `res` is indexed directly and is not resized here.
void ValidUTF8Impl::vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt8> & res)
{
    const size_t rows = data.size() / n;
    const auto * row_begin = data.data();

    /// Walk the buffer one fixed-width value at a time.
    for (size_t row = 0; row < rows; ++row, row_begin += n)
        res[row] = isValidUTF8(row_begin, n);
}
[[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<UInt8> &)
/// Array arguments are not supported by isValidUTF8: unconditionally throws
/// an Exception with code ILLEGAL_TYPE_OF_ARGUMENT and never returns.
[[noreturn]] void ValidUTF8Impl::array(const ColumnString::Offsets &, PaddedPODArray<UInt8> &)
{
throw Exception("Cannot apply function isValidUTF8 to Array argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
[[noreturn]] static void uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray<UInt8> &)
/// UUID arguments are not supported by isValidUTF8: unconditionally throws
/// an Exception with code ILLEGAL_TYPE_OF_ARGUMENT and never returns.
[[noreturn]] void ValidUTF8Impl::uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray<UInt8> &)
{
throw Exception("Cannot apply function isValidUTF8 to UUID argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
};
struct NameIsValidUTF8
{
static constexpr auto name = "isValidUTF8";
};
using FunctionValidUTF8 = FunctionStringOrArrayToT<ValidUTF8Impl, NameIsValidUTF8, UInt8>;
/// Name tag type: supplies the function name string "isValidUTF8" as the Name
/// template argument of FunctionStringOrArrayToT (see the FunctionValidUTF8 alias).
struct NameIsValidUTF8
{
static constexpr auto name = "isValidUTF8";
};
using FunctionValidUTF8 = FunctionStringOrArrayToT<ValidUTF8Impl, NameIsValidUTF8, UInt8>;
void registerFunctionIsValidUTF8(FunctionFactory & factory)
{
factory.registerFunction<FunctionValidUTF8>();
}
/// Registers FunctionValidUTF8 in the supplied function factory.
void registerFunctionIsValidUTF8(FunctionFactory & factory)
{
factory.registerFunction<FunctionValidUTF8>();
}
}

View File

@ -0,0 +1,26 @@
#pragma once
#include <Core/Types.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
namespace DB
{
/// Collection of static helpers behind the isValidUTF8 function.
/// Only declarations live here; no definition is provided in this header.
///
/// NOTE(review): isValidUTF8Naive and isValidUTF8 are declared `inline` although their
/// definitions are not in this header. An inline function has to be defined in every
/// translation unit that odr-uses it, so calling them from a file that only includes this
/// header will presumably fail to link - confirm, and either drop `inline` on both the
/// declarations and the definitions or move the definitions into the header.
struct ValidUTF8Impl
{
/// Checks `len` bytes starting at `data`; scalar implementation.
/// NOTE(review): presumably returns 1 for valid UTF-8 and 0 otherwise - body not visible here, confirm.
static inline UInt8 isValidUTF8Naive(const UInt8 * data, UInt64 len);
/// Checks `len` bytes starting at `data`. When __SSE4_1__ is not defined the definition
/// forwards to isValidUTF8Naive; otherwise a separate implementation is compiled.
static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len);
/// Constant flag exposed to the template that consumes this struct.
static constexpr bool is_fixed_to_constant = false;
/// Handles a variable-length string column given as a character buffer plus offsets; fills `res`.
static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray<UInt8> & res);
/// Defined with an empty body; leaves `res` untouched.
static void vectorFixedToConstant(const ColumnString::Chars & data, size_t n, UInt8 & res);
/// Handles fixed-width values of `n` bytes each, writing one result per value into `res`.
static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray<UInt8> & res);
/// Always throws: the function cannot be applied to Array arguments.
[[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray<UInt8> &);
/// Always throws: the function cannot be applied to UUID arguments.
[[noreturn]] static void uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray<UInt8> &);
};
}

View File

@ -2,9 +2,11 @@
#if USE_AWS_S3
# include <IO/S3Common.h>
# include <Common/quoteString.h>
# include <IO/S3Common.h>
# include <Functions/isValidUTF8.h>
# include <IO/WriteBufferFromString.h>
# include <Storages/StorageS3Settings.h>
@ -616,55 +618,82 @@ namespace S3
uri = uri_;
storage_name = S3;
if (uri.getHost().empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI: {}", uri.toString());
String name;
String endpoint_authority_from_uri;
if (re2::RE2::FullMatch(uri.getAuthority(), virtual_hosted_style_pattern, &bucket, &name, &endpoint_authority_from_uri))
try
{
is_virtual_hosted_style = true;
endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri;
if (uri.getHost().empty())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI.");
/// S3 specification requires at least 3 and at most 63 characters in bucket name.
/// https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html
if (bucket.length() < 3 || bucket.length() > 63)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Bucket name length is out of bounds in virtual hosted style S3 URI: {} ({})", quoteString(bucket), uri.toString());
String name;
String endpoint_authority_from_uri;
if (!uri.getPath().empty())
if (re2::RE2::FullMatch(uri.getAuthority(), virtual_hosted_style_pattern, &bucket, &name, &endpoint_authority_from_uri))
{
/// Remove leading '/' from path to extract key.
key = uri.getPath().substr(1);
is_virtual_hosted_style = true;
endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri;
if (!uri.getPath().empty())
{
/// Remove leading '/' from path to extract key.
key = uri.getPath().substr(1);
}
boost::to_upper(name);
if (name != S3 && name != COS)
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {}", quoteString(name));
}
if (name == S3)
{
storage_name = name;
}
else
{
storage_name = COSN;
}
}
boost::to_upper(name);
if (name != S3 && name != COS)
else if (re2::RE2::PartialMatch(uri.getPath(), path_style_pattern, &bucket, &key))
{
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {} ({})", quoteString(name), uri.toString());
}
if (name == S3)
{
storage_name = name;
is_virtual_hosted_style = false;
endpoint = uri.getScheme() + "://" + uri.getAuthority();
}
else
{
storage_name = COSN;
}
}
else if (re2::RE2::PartialMatch(uri.getPath(), path_style_pattern, &bucket, &key))
{
is_virtual_hosted_style = false;
endpoint = uri.getScheme() + "://" + uri.getAuthority();
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI.");
/// S3 specification requires at least 3 and at most 63 characters in bucket name.
/// https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html
if (bucket.length() < 3 || bucket.length() > 63)
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket name length is out of bounds in virtual hosted style S3 URI: {} ({})", quoteString(bucket), uri.toString());
validateBucket(bucket);
validateKey(key);
}
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI: {}", uri.toString());
catch(const Exception & e)
{
throw Exception(e.code(), "{} ({})", e.message(), uri.toString());
}
}
/// Checks that the length of a bucket name is within the accepted bounds.
///
/// See:
/// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html
/// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#createbucket
///
/// Throws Exception with code BAD_ARGUMENTS when the name is shorter than 3 or longer than 222 bytes.
/// The message does not mention the addressing style, because this check is applied to
/// bucket names taken from both virtual hosted style and path style URIs.
///
/// NOTE(review): AWS S3 itself limits bucket names to 63 characters; the 222 upper bound is
/// presumably there to accommodate other S3-compatible stores - confirm against the linked docs.
void URI::validateBucket(const String & bucket)
{
    constexpr size_t min_bucket_length = 3;
    constexpr size_t max_bucket_length = 222;

    const size_t length = bucket.length();
    if (length < min_bucket_length || length > max_bucket_length)
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Bucket name length is out of bounds in S3 URI: {}", quoteString(bucket));
}
/// Validates an object key. Currently a no-op: every check in the body is commented out,
/// which is also why the parameter name is commented out (it would otherwise be unused).
/// NOTE(review): the checks are presumably disabled because the rules differ between
/// providers and the provider is not known at this point - confirm before enabling any of them.
void URI::validateKey(const String & /*key*/)
{
/// See:
/// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html
/// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject
/// The following is valid for AWS S3:
/// if (key.length() > 1024)
/// throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key name is too long (showing first 1024 characters): {}", quoteString(key.substr(0, 1024) + "..."));
/// if (!ValidUTF8Impl::isValidUTF8(key.data(), key.size()))
/// throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key name must be valid UTF-8 string: {}", quoteString(key));
/// NOTE(review): ValidUTF8Impl::isValidUTF8 is declared with a `const UInt8 *` first parameter,
/// so key.data() would presumably need a cast if the check above is ever enabled - confirm.
/// The following is valid for IBM COS:
/// if (key.length() < 1)
/// throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key name is too short (0 bytes long)");
}
}

View File

@ -74,6 +74,9 @@ struct URI
bool is_virtual_hosted_style;
explicit URI(const Poco::URI & uri_);
static void validateBucket(const String & bucket);
/// Validates an object key name.
static void validateKey(const String & key);
};
}