// Review notes for this patch (commentary only; every diff token below is unchanged).
//
// This is a unified diff over four files whose line breaks have been collapsed.
// NOTE(review): #include targets and template argument lists appear to be missing in
// this copy (bare `#include`, `PaddedPODArray & res`, `FunctionStringOrArrayToT;`,
// `factory.registerFunction();`) - presumably stripped during extraction; confirm
// against the original commit before trying to apply it.
//
// src/Functions/isValidUTF8.cpp
//   - Removes the in-class `struct ValidUTF8Impl { ... };` wrapper and turns each static
//     member shown in the hunks into an out-of-class `ValidUTF8Impl::` definition.
//   - Removes `is_fixed_to_constant` from this file (the new header declares it).
//   - The ErrorCodes namespace, NameIsValidUTF8, FunctionValidUTF8 and
//     registerFunctionIsValidUTF8 are removed and re-added with the same tokens;
//     presumably an indentation-only change - confirm against the original commit.
// src/Functions/isValidUTF8.h (new file)
//   - Declares `struct ValidUTF8Impl` with the static members that the .cpp defines.
//   - NOTE(review): isValidUTF8Naive and isValidUTF8 are declared `static inline` here but
//     are defined only in the .cpp; a caller in another translation unit would not
//     see a definition - confirm this is intended before enabling such a call.
// src/IO/S3Common.cpp
//   - Wraps the URI parsing shown in the hunk in try/catch. The handler throws a new
//     Exception built from e.code() and the format "{} ({})" with e.message() and
//     uri.toString(); the individual throw sites no longer format the URI themselves.
//   - Moves the bucket length check out of both match branches into
//     URI::validateBucket, which is called once after the branches; the upper bound
//     changes from 63 to 222.
//   - Adds URI::validateKey, whose parameter name is commented out and whose body holds
//     only commented-out checks, so it performs no validation as written.
//   - NOTE(review): validateBucket's message still says "virtual hosted style" although
//     the call now also runs after the path-style branch.
// src/IO/S3Common.h
//   - Declares the two static validators on `struct URI`.
//   - NOTE(review): validateKey's parameter is named `bucket` in this declaration.
diff --git a/src/Functions/isValidUTF8.cpp b/src/Functions/isValidUTF8.cpp index abdda53990d..5332396fe10 100644 --- a/src/Functions/isValidUTF8.cpp +++ b/src/Functions/isValidUTF8.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -10,15 +12,17 @@ # include #endif + namespace DB { -namespace ErrorCodes -{ - extern const int ILLEGAL_TYPE_OF_ARGUMENT; -} + + namespace ErrorCodes + { + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + } + + /// inspired by https://github.com/cyb70289/utf8/ -struct ValidUTF8Impl -{ /* MIT License @@ -71,7 +75,7 @@ SOFTWARE. * +--------------------+------------+-------------+------------+-------------+ */ - static inline UInt8 isValidUTF8Naive(const UInt8 * data, UInt64 len) + inline UInt8 ValidUTF8Impl::isValidUTF8Naive(const UInt8 * data, UInt64 len) { while (len) { @@ -139,9 +143,9 @@ SOFTWARE. } #ifndef __SSE4_1__ - static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) { return isValidUTF8Naive(data, len); } + inline UInt8 ValidUTF8Impl::isValidUTF8(const UInt8 * data, UInt64 len) { return isValidUTF8Naive(data, len); } #else - static inline UInt8 isValidUTF8(const UInt8 * data, UInt64 len) + inline UInt8 ValidUTF8Impl::isValidUTF8(const UInt8 * data, UInt64 len) { /* * Map high nibble of "First Byte" to legal character length minus 1 @@ -291,9 +295,7 @@ SOFTWARE. } #endif - static constexpr bool is_fixed_to_constant = false; - - static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res) + void ValidUTF8Impl::vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res) { size_t size = offsets.size(); size_t prev_offset = 0; @@ -304,35 +306,34 @@ SOFTWARE. 
} } - static void vectorFixedToConstant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt8 & /*res*/) {} + void ValidUTF8Impl::vectorFixedToConstant(const ColumnString::Chars & /*data*/, size_t /*n*/, UInt8 & /*res*/) {} - static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray & res) + void ValidUTF8Impl::vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray & res) { size_t size = data.size() / n; for (size_t i = 0; i < size; ++i) res[i] = isValidUTF8(data.data() + i * n, n); } - [[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray &) + [[noreturn]] void ValidUTF8Impl::array(const ColumnString::Offsets &, PaddedPODArray &) { throw Exception("Cannot apply function isValidUTF8 to Array argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); } - [[noreturn]] static void uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray &) + [[noreturn]] void ValidUTF8Impl::uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray &) { throw Exception("Cannot apply function isValidUTF8 to UUID argument", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); } -}; -struct NameIsValidUTF8 -{ - static constexpr auto name = "isValidUTF8"; -}; -using FunctionValidUTF8 = FunctionStringOrArrayToT; + struct NameIsValidUTF8 + { + static constexpr auto name = "isValidUTF8"; + }; + using FunctionValidUTF8 = FunctionStringOrArrayToT; -void registerFunctionIsValidUTF8(FunctionFactory & factory) -{ - factory.registerFunction(); -} + void registerFunctionIsValidUTF8(FunctionFactory & factory) + { + factory.registerFunction(); + } } diff --git a/src/Functions/isValidUTF8.h b/src/Functions/isValidUTF8.h new file mode 100644 index 00000000000..7f71ab88adb --- /dev/null +++ b/src/Functions/isValidUTF8.h @@ -0,0 +1,26 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +struct ValidUTF8Impl +{ + static inline UInt8 isValidUTF8Naive(const UInt8 * data, UInt64 len); + static inline UInt8 
isValidUTF8(const UInt8 * data, UInt64 len); + + static constexpr bool is_fixed_to_constant = false; + + static void vector(const ColumnString::Chars & data, const ColumnString::Offsets & offsets, PaddedPODArray & res); + static void vectorFixedToConstant(const ColumnString::Chars & data, size_t n, UInt8 & res); + static void vectorFixedToVector(const ColumnString::Chars & data, size_t n, PaddedPODArray & res); + + [[noreturn]] static void array(const ColumnString::Offsets &, PaddedPODArray &); + [[noreturn]] static void uuid(const ColumnUUID::Container &, size_t &, PaddedPODArray &); +}; + +} diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 74c328661c4..8c83f05f8de 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -2,9 +2,11 @@ #if USE_AWS_S3 +# include + # include -# include +# include # include # include @@ -616,55 +618,82 @@ namespace S3 uri = uri_; storage_name = S3; - if (uri.getHost().empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI: {}", uri.toString()); - - String name; - String endpoint_authority_from_uri; - - if (re2::RE2::FullMatch(uri.getAuthority(), virtual_hosted_style_pattern, &bucket, &name, &endpoint_authority_from_uri)) + try { - is_virtual_hosted_style = true; - endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri; + if (uri.getHost().empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Host is empty in S3 URI."); - /// S3 specification requires at least 3 and at most 63 characters in bucket name. 
- /// https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html - if (bucket.length() < 3 || bucket.length() > 63) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Bucket name length is out of bounds in virtual hosted style S3 URI: {} ({})", quoteString(bucket), uri.toString()); + String name; + String endpoint_authority_from_uri; - if (!uri.getPath().empty()) + if (re2::RE2::FullMatch(uri.getAuthority(), virtual_hosted_style_pattern, &bucket, &name, &endpoint_authority_from_uri)) { - /// Remove leading '/' from path to extract key. - key = uri.getPath().substr(1); + is_virtual_hosted_style = true; + endpoint = uri.getScheme() + "://" + name + endpoint_authority_from_uri; + + if (!uri.getPath().empty()) + { + /// Remove leading '/' from path to extract key. + key = uri.getPath().substr(1); + } + + boost::to_upper(name); + if (name != S3 && name != COS) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {}", quoteString(name)); + } + if (name == S3) + { + storage_name = name; + } + else + { + storage_name = COSN; + } } - - boost::to_upper(name); - if (name != S3 && name != COS) + else if (re2::RE2::PartialMatch(uri.getPath(), path_style_pattern, &bucket, &key)) { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {} ({})", quoteString(name), uri.toString()); - } - if (name == S3) - { - storage_name = name; + is_virtual_hosted_style = false; + endpoint = uri.getScheme() + "://" + uri.getAuthority(); } else - { - storage_name = COSN; - } - } - else if (re2::RE2::PartialMatch(uri.getPath(), path_style_pattern, &bucket, &key)) - { - is_virtual_hosted_style = false; - endpoint = uri.getScheme() + "://" + uri.getAuthority(); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI."); - /// S3 specification requires at least 3 and at most 63 characters in 
bucket name. - /// https://docs.aws.amazon.com/awscloudtrail/latest/userguide/cloudtrail-s3-bucket-naming-requirements.html - if (bucket.length() < 3 || bucket.length() > 63) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket name length is out of bounds in virtual hosted style S3 URI: {} ({})", quoteString(bucket), uri.toString()); + validateBucket(bucket); + validateKey(key); } - else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bucket or key name are invalid in S3 URI: {}", uri.toString()); + catch(const Exception & e) + { + throw Exception(e.code(), "{} ({})", e.message(), uri.toString()); + } + } + + void URI::validateBucket(const String & bucket) + { + /// See: + /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/bucketnamingrules.html + /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#createbucket + + if (bucket.length() < 3 || bucket.length() > 222) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Bucket name length is out of bounds in virtual hosted style S3 URI: {}", quoteString(bucket)); + } + + void URI::validateKey(const String & /*key*/) + { + /// See: + /// - https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-keys.html + /// - https://cloud.ibm.com/apidocs/cos/cos-compatibility#putobject + + /// The following is valid for AWS S3: + /// if (key.length() > 1024) + /// throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key name is too long (showing first 1024 characters): {}", quoteString(key.substr(0, 1024) + "...")); + /// if (!ValidUTF8Impl::isValidUTF8(key.data(), key.size())) + /// throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key name must be valid UTF-8 string: {}", quoteString(key)); + + /// The following is valid for IBM COS: + /// if (key.length() < 1) + /// throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key name is too short (0 bytes long)"); } } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 886230e36c6..613696344a0 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -74,6 +74,9 @@ struct URI bool 
is_virtual_hosted_style; explicit URI(const Poco::URI & uri_); + + static void validateBucket(const String & bucket); + static void validateKey(const String & bucket); }; }