From 94684c07bf2615671a8bab61bcc98dfcf4faf530 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 4 Nov 2019 22:57:03 +0300 Subject: [PATCH 01/19] Added syntax for `access_key_id` and `secret_access_key` in S3 table function and storage. --- dbms/src/IO/ReadBufferFromS3.cpp | 23 ++++--- dbms/src/IO/ReadBufferFromS3.h | 11 ++-- dbms/src/IO/WriteBufferFromS3.cpp | 34 +++------- dbms/src/IO/WriteBufferFromS3.h | 9 +-- dbms/src/Storages/StorageS3.cpp | 56 ++++++++++++---- dbms/src/Storages/StorageS3.h | 8 ++- dbms/src/TableFunctions/TableFunctionS3.cpp | 71 ++++++++++++++++++++- dbms/src/TableFunctions/TableFunctionS3.h | 15 +++-- 8 files changed, 160 insertions(+), 67 deletions(-) diff --git a/dbms/src/IO/ReadBufferFromS3.cpp b/dbms/src/IO/ReadBufferFromS3.cpp index ae09f0fb189..718f9d21f8a 100644 --- a/dbms/src/IO/ReadBufferFromS3.cpp +++ b/dbms/src/IO/ReadBufferFromS3.cpp @@ -10,15 +10,18 @@ namespace DB const int DEFAULT_S3_MAX_FOLLOW_GET_REDIRECT = 2; -ReadBufferFromS3::ReadBufferFromS3(Poco::URI uri_, - const ConnectionTimeouts & timeouts, - const Poco::Net::HTTPBasicCredentials & credentials, - size_t buffer_size_) +ReadBufferFromS3::ReadBufferFromS3(const Poco::URI & uri_, + const String & access_key_id_, + const String & secret_access_key_, + const ConnectionTimeouts & timeouts) : ReadBuffer(nullptr, 0) , uri {uri_} - , method {Poco::Net::HTTPRequest::HTTP_GET} + , access_key_id {access_key_id_} + , secret_access_key {secret_access_key_} , session {makeHTTPSession(uri_, timeouts)} { + /// FIXME: Implement rest of S3 authorization. + Poco::Net::HTTPResponse response; std::unique_ptr request; @@ -28,12 +31,12 @@ ReadBufferFromS3::ReadBufferFromS3(Poco::URI uri_, if (uri.getPath().empty()) uri.setPath("/"); - request = std::make_unique(method, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); + request = std::make_unique( + Poco::Net::HTTPRequest::HTTP_GET, + uri.getPathAndQuery(), + Poco::Net::HTTPRequest::HTTP_1_1); request->setHost(uri.getHost()); // use original, not resolved host name in header - if (!credentials.getUsername().empty()) - credentials.authenticate(*request); - LOG_TRACE((&Logger::get("ReadBufferFromS3")), "Sending request to " << uri.toString()); session->sendRequest(*request); @@ -54,7 +57,7 @@ ReadBufferFromS3::ReadBufferFromS3(Poco::URI uri_, } assertResponseIsOk(*request, response, *istr); - impl = std::make_unique(*istr, buffer_size_); + impl = std::make_unique(*istr, DBMS_DEFAULT_BUFFER_SIZE); } diff --git a/dbms/src/IO/ReadBufferFromS3.h b/dbms/src/IO/ReadBufferFromS3.h index ffc0c5c0ab1..e787828f0e7 100644 --- a/dbms/src/IO/ReadBufferFromS3.h +++ b/dbms/src/IO/ReadBufferFromS3.h @@ -17,17 +17,18 @@ class ReadBufferFromS3 : public ReadBuffer { protected: Poco::URI uri; - std::string method; + String access_key_id; + String secret_access_key; HTTPSessionPtr session; std::istream * istr; /// owned by session std::unique_ptr impl; public: - explicit ReadBufferFromS3(Poco::URI uri_, - const ConnectionTimeouts & timeouts = {}, - const Poco::Net::HTTPBasicCredentials & credentials = {}, - size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE); + explicit ReadBufferFromS3(const Poco::URI & uri_, + const String & access_key_id_, + const String & secret_access_key_, + const ConnectionTimeouts & timeouts = {}); bool nextImpl() override; }; diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 9604b6ce199..977a5b22fdc 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -30,22 +30,22 @@ namespace ErrorCodes WriteBufferFromS3::WriteBufferFromS3( const Poco::URI & uri_, + const String & access_key_id_, + const String & secret_access_key_, size_t minimum_upload_part_size_, - const ConnectionTimeouts & timeouts_, - const Poco::Net::HTTPBasicCredentials & credentials, size_t buffer_size_ -) - : BufferWithOwnMemory(buffer_size_, nullptr, 0) + const ConnectionTimeouts & timeouts_) + : BufferWithOwnMemory(DBMS_DEFAULT_BUFFER_SIZE, nullptr, 0) , uri {uri_} + , access_key_id {access_key_id_} + , secret_access_key {secret_access_key_} , minimum_upload_part_size {minimum_upload_part_size_} , timeouts {timeouts_} - , auth_request {Poco::Net::HTTPRequest::HTTP_PUT, uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1} , temporary_buffer {std::make_unique(buffer_string)} , last_part_size {0} { - if (!credentials.getUsername().empty()) - credentials.authenticate(auth_request); - initiate(); + + /// FIXME: Implement rest of S3 authorization. } @@ -113,12 +113,6 @@ void WriteBufferFromS3::initiate() request_ptr = std::make_unique(Poco::Net::HTTPRequest::HTTP_POST, initiate_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request_ptr->setHost(initiate_uri.getHost()); // use original, not resolved host name in header - if (auth_request.hasCredentials()) - { - Poco::Net::HTTPBasicCredentials credentials(auth_request); - credentials.authenticate(*request_ptr); - } - request_ptr->setContentLength(0); LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << initiate_uri.toString()); @@ -179,12 +173,6 @@ void WriteBufferFromS3::writePart(const String & data) request_ptr = std::make_unique(Poco::Net::HTTPRequest::HTTP_PUT, part_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request_ptr->setHost(part_uri.getHost()); // use original, not resolved host name in header - if (auth_request.hasCredentials()) - { - Poco::Net::HTTPBasicCredentials credentials(auth_request); - credentials.authenticate(*request_ptr); - } - request_ptr->setExpectContinue(true); request_ptr->setContentLength(data.size()); @@ -252,12 +240,6 @@ void WriteBufferFromS3::complete() request_ptr = std::make_unique(Poco::Net::HTTPRequest::HTTP_POST, complete_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request_ptr->setHost(complete_uri.getHost()); // use original, not resolved host name in header - if (auth_request.hasCredentials()) - { - Poco::Net::HTTPBasicCredentials credentials(auth_request); - credentials.authenticate(*request_ptr); - } - request_ptr->setExpectContinue(true); request_ptr->setContentLength(data.size()); diff --git a/dbms/src/IO/WriteBufferFromS3.h b/dbms/src/IO/WriteBufferFromS3.h index 9a619f8c8bc..6f89f7c36ec 100644 --- a/dbms/src/IO/WriteBufferFromS3.h +++ b/dbms/src/IO/WriteBufferFromS3.h @@ -21,9 +21,10 @@ class WriteBufferFromS3 : public BufferWithOwnMemory { private: Poco::URI uri; + String access_key_id; + String secret_access_key; size_t minimum_upload_part_size; ConnectionTimeouts timeouts; - Poco::Net::HTTPRequest auth_request; String buffer_string; std::unique_ptr temporary_buffer; size_t last_part_size; @@ -35,10 +36,10 @@ private: public: explicit WriteBufferFromS3(const Poco::URI & uri, + const String & access_key_id, + const String & secret_access_key, size_t minimum_upload_part_size_, - const ConnectionTimeouts & timeouts = {}, - const Poco::Net::HTTPBasicCredentials & credentials = {}, - size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE); + const ConnectionTimeouts & timeouts = {}); void nextImpl() override; diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index ed9173c52ec..0813f39d51f 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -32,6 +32,8 @@ namespace { public: StorageS3BlockInputStream(const Poco::URI & uri, + const String & access_key_id, + const String & secret_access_key, const String & format, const String & name_, const Block & sample_block, @@ -41,7 +43,7 @@ namespace const CompressionMethod compression_method) : name(name_) { - read_buf = getReadBuffer(compression_method, uri, timeouts); + read_buf = getReadBuffer(compression_method, uri, access_key_id, secret_access_key, timeouts); reader = FormatFactory::instance().getInput(format, *read_buf, sample_block, context, max_block_size); } @@ -80,6 +82,8 @@ namespace { public: StorageS3BlockOutputStream(const Poco::URI & uri, + const String & access_key_id, + const String & secret_access_key, const String & format, UInt64 min_upload_part_size, const Block & sample_block_, @@ -88,7 +92,13 @@ namespace const CompressionMethod compression_method) : sample_block(sample_block_) { - write_buf = getWriteBuffer(compression_method, uri, min_upload_part_size, timeouts); + write_buf = getWriteBuffer( + compression_method, + uri, + access_key_id, + secret_access_key, + min_upload_part_size, + timeouts); writer = FormatFactory::instance().getOutput(format, *write_buf, sample_block, context); } @@ -124,6 +134,8 @@ namespace StorageS3::StorageS3( const Poco::URI & uri_, + const String & access_key_id_, + const String & secret_access_key_, const std::string & database_name_, const std::string & table_name_, const String & format_name_, @@ -134,6 +146,8 @@ StorageS3::StorageS3( const String & compression_method_ = "") : IStorage(columns_) , uri(uri_) + , access_key_id(access_key_id_) + , secret_access_key(secret_access_key_) , context_global(context_) , format_name(format_name_) , database_name(database_name_) @@ -156,6 +170,8 @@ BlockInputStreams StorageS3::read( { BlockInputStreamPtr block_input = std::make_shared( uri, + access_key_id, + secret_access_key, format_name, getName(), getHeaderBlock(column_names), @@ -179,7 +195,13 @@ void StorageS3::rename(const String & /*new_path_to_db*/, const String & new_dat BlockOutputStreamPtr StorageS3::write(const ASTPtr & /*query*/, const Context & /*context*/) { return std::make_shared( - uri, format_name, min_upload_part_size, getSampleBlock(), context_global, + uri, + access_key_id, + secret_access_key, + format_name, + min_upload_part_size, + getSampleBlock(), + context_global, ConnectionTimeouts::getHTTPTimeouts(context_global), IStorage::chooseCompressionMethod(uri.toString(), compression_method)); } @@ -190,29 +212,35 @@ void registerStorageS3(StorageFactory & factory) { ASTs & engine_args = args.engine_args; - if (engine_args.size() != 2 && engine_args.size() != 3) + if (engine_args.size() < 2 || engine_args.size() > 5) throw Exception( - "Storage S3 requires 2 or 3 arguments: url, name of used format and compression_method.", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + "Storage S3 requires 2 to 5 arguments: url, [access_key_id, secret_access_key], name of used format and [compression_method].", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - engine_args[0] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[0], args.local_context); + for (size_t i = 0; i < engine_args.size(); ++i) + engine_args[i] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[i], args.local_context); String url = engine_args[0]->as().value.safeGet(); Poco::URI uri(url); - engine_args[1] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[1], args.local_context); + String format_name = engine_args[engine_args.size()-1]->as().value.safeGet(); - String format_name = engine_args[1]->as().value.safeGet(); + String access_key_id; + String secret_access_key; + if (engine_args.size() >= 4) + { + access_key_id = engine_args[1]->as().value.safeGet(); + secret_access_key = engine_args[2]->as().value.safeGet(); + } UInt64 min_upload_part_size = args.local_context.getSettingsRef().s3_min_upload_part_size; String compression_method; - if (engine_args.size() == 3) - { - engine_args[2] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[2], args.local_context); - compression_method = engine_args[2]->as().value.safeGet(); - } else compression_method = "auto"; + if (engine_args.size() == 3 || engine_args.size() == 5) + compression_method = engine_args.back()->as().value.safeGet(); + else + compression_method = "auto"; - return StorageS3::create(uri, args.database_name, args.table_name, format_name, min_upload_part_size, args.columns, args.constraints, args.context); + return StorageS3::create(uri, access_key_id, secret_access_key, args.database_name, args.table_name, format_name, min_upload_part_size, args.columns, args.constraints, args.context); }); } } diff --git a/dbms/src/Storages/StorageS3.h b/dbms/src/Storages/StorageS3.h index 88b470ac2ac..4a5288271a2 100644 --- a/dbms/src/Storages/StorageS3.h +++ b/dbms/src/Storages/StorageS3.h @@ -18,8 +18,10 @@ class StorageS3 : public ext::shared_ptr_helper, public IStorage public: StorageS3( const Poco::URI & uri_, - const std::string & database_name_, - const std::string & table_name_, + const String & access_key_id, + const String & secret_access_key, + const String & database_name_, + const String & table_name_, const String & format_name_, UInt64 min_upload_part_size_, const ColumnsDescription & columns_, @@ -56,6 +58,8 @@ public: private: Poco::URI uri; + String access_key_id; + String secret_access_key; const Context & context_global; String format_name; diff --git a/dbms/src/TableFunctions/TableFunctionS3.cpp b/dbms/src/TableFunctions/TableFunctionS3.cpp index a9ee5ebf691..d203801d9c1 100644 --- a/dbms/src/TableFunctions/TableFunctionS3.cpp +++ b/dbms/src/TableFunctions/TableFunctionS3.cpp @@ -1,17 +1,84 @@ #include +#include #include #include +#include +#include #include namespace DB { +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + +StoragePtr TableFunctionS3::executeImpl(const ASTPtr & ast_function, const Context & context, const std::string & table_name) const +{ + /// Parse args + ASTs & args_func = ast_function->children; + + if (args_func.size() != 1) + throw Exception("Table function '" + getName() + "' must have arguments.", ErrorCodes::LOGICAL_ERROR); + + ASTs & args = args_func.at(0)->children; + + if (args.size() < 3 || args.size() > 6) + throw Exception("Table function '" + getName() + "' requires 3 to 6 arguments: url, [access_key_id, secret_access_key,] format, structure and [compression_method].", + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + for (size_t i = 0; i < args.size(); ++i) + args[i] = evaluateConstantExpressionOrIdentifierAsLiteral(args[i], context); + + String filename = args[0]->as().value.safeGet(); + String format; + String structure; + String access_key_id; + String secret_access_key; + + if (args.size() < 5) + { + format = args[1]->as().value.safeGet(); + structure = args[2]->as().value.safeGet(); + } + else + { + access_key_id = args[1]->as().value.safeGet(); + secret_access_key = args[2]->as().value.safeGet(); + format = args[3]->as().value.safeGet(); + structure = args[4]->as().value.safeGet(); + } + + String compression_method; + if (args.size() == 4 || args.size() == 6) + compression_method = args.back()->as().value.safeGet(); + else + compression_method = "auto"; + + ColumnsDescription columns = parseColumnsListFromString(structure, context); + + /// Create table + StoragePtr storage = getStorage(filename, access_key_id, secret_access_key, format, columns, const_cast(context), table_name, compression_method); + + storage->startup(); + + return storage; +} + StoragePtr TableFunctionS3::getStorage( - const String & source, const String & format, const ColumnsDescription & columns, Context & global_context, const std::string & table_name, const String & compression_method) const + const String & source, + const String & access_key_id, + const String & secret_access_key, + const String & format, + const ColumnsDescription & columns, + Context & global_context, + const std::string & table_name, + const String & compression_method) const { Poco::URI uri(source); UInt64 min_upload_part_size = global_context.getSettingsRef().s3_min_upload_part_size; - return StorageS3::create(uri, getDatabaseName(), table_name, format, min_upload_part_size, columns, ConstraintsDescription{}, global_context, compression_method); + return StorageS3::create(uri, access_key_id, secret_access_key, getDatabaseName(), table_name, format, min_upload_part_size, columns, ConstraintsDescription{}, global_context, compression_method); } void registerTableFunctionS3(TableFunctionFactory & factory) diff --git a/dbms/src/TableFunctions/TableFunctionS3.h b/dbms/src/TableFunctions/TableFunctionS3.h index 2f14e0319d4..0c81e0ed2a7 100644 --- a/dbms/src/TableFunctions/TableFunctionS3.h +++ b/dbms/src/TableFunctions/TableFunctionS3.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB @@ -8,9 +8,9 @@ namespace DB class Context; -/* s3(source, format, structure) - creates a temporary storage for a file in S3 +/* s3(source, [access_key_id, secret_access_key,] format, structure) - creates a temporary storage for a file in S3 */ -class TableFunctionS3 : public ITableFunctionFileLike +class TableFunctionS3 : public ITableFunction { public: static constexpr auto name = "s3"; @@ -20,13 +20,20 @@ public: } private: + StoragePtr executeImpl( + const ASTPtr & ast_function, + const Context & context, + const std::string & table_name) const override; + StoragePtr getStorage( const String & source, + const String & access_key_id, + const String & secret_access_key, const String & format, const ColumnsDescription & columns, Context & global_context, const std::string & table_name, - const String & compression_method) const override; + const String & compression_method) const; }; } From 0a19b4fbd68f1c6673146a8e76f3406f62b15d68 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Tue, 5 Nov 2019 10:54:13 +0300 Subject: [PATCH 02/19] Attempt to add S3 authentication. --- dbms/src/IO/ReadBufferFromS3.cpp | 7 +++-- dbms/src/IO/ReadBufferFromS3.h | 3 --- dbms/src/IO/S3Common.cpp | 44 +++++++++++++++++++++++++++++++ dbms/src/IO/S3Common.h | 18 +++++++++++++ dbms/src/IO/WriteBufferFromS3.cpp | 7 +++++ 5 files changed, 72 insertions(+), 7 deletions(-) create mode 100644 dbms/src/IO/S3Common.cpp create mode 100644 dbms/src/IO/S3Common.h diff --git a/dbms/src/IO/ReadBufferFromS3.cpp b/dbms/src/IO/ReadBufferFromS3.cpp index 718f9d21f8a..b26a8b8c316 100644 --- a/dbms/src/IO/ReadBufferFromS3.cpp +++ b/dbms/src/IO/ReadBufferFromS3.cpp @@ -1,6 +1,7 @@ #include #include +#include #include @@ -16,12 +17,8 @@ ReadBufferFromS3::ReadBufferFromS3(const Poco::URI & uri_, const ConnectionTimeouts & timeouts) : ReadBuffer(nullptr, 0) , uri {uri_} - , access_key_id {access_key_id_} - , secret_access_key {secret_access_key_} , session {makeHTTPSession(uri_, timeouts)} { - /// FIXME: Implement rest of S3 authorization. - Poco::Net::HTTPResponse response; std::unique_ptr request; @@ -37,6 +34,8 @@ ReadBufferFromS3::ReadBufferFromS3(const Poco::URI & uri_, Poco::Net::HTTPRequest::HTTP_1_1); request->setHost(uri.getHost()); // use original, not resolved host name in header + S3Helper::authenticateRequest(*request, access_key_id_, secret_access_key_); + LOG_TRACE((&Logger::get("ReadBufferFromS3")), "Sending request to " << uri.toString()); session->sendRequest(*request); diff --git a/dbms/src/IO/ReadBufferFromS3.h b/dbms/src/IO/ReadBufferFromS3.h index e787828f0e7..071ee7802a2 100644 --- a/dbms/src/IO/ReadBufferFromS3.h +++ b/dbms/src/IO/ReadBufferFromS3.h @@ -17,9 +17,6 @@ class ReadBufferFromS3 : public ReadBuffer { protected: Poco::URI uri; - String access_key_id; - String secret_access_key; - HTTPSessionPtr session; std::istream * istr; /// owned by session std::unique_ptr impl; diff --git a/dbms/src/IO/S3Common.cpp b/dbms/src/IO/S3Common.cpp new file mode 100644 index 00000000000..dea2f888662 --- /dev/null +++ b/dbms/src/IO/S3Common.cpp @@ -0,0 +1,44 @@ +#include + +#include +#include + +#include +#include +#include +#include + + +namespace DB +{ + +void S3Helper::authenticateRequest(Poco::Net::HTTPRequest & request, + const String & access_key_id, + const String & secret_access_key) +{ + /// See https://docs.aws.amazon.com/AmazonS3/latest/dev/RESTAuthentication.html + + if (access_key_id.empty()) + return; + + /// Limitations: + /// 1. Virtual hosted-style requests are not supported. + /// 2. AMZ headers are not supported (TODO). + + String string_to_sign = request.getMethod() + "\n" + + request.get("Content-MD5", "") + "\n" + + request.get("Content-Type", "") + "\n" + + request.get("Date", "") + "\n" + + Poco::URI(request.getURI()).getPath(); + + Poco::HMACEngine engine(secret_access_key); + engine.update(string_to_sign); + auto digest = engine.digest(); + std::ostringstream signature; + Poco::Base64Encoder encoder(signature); + std::copy(digest.begin(), digest.end(), std::ostream_iterator(encoder)); + + request.set("Authorization", "AWS " + access_key_id + ":" + signature.str()); +} + +} \ No newline at end of file diff --git a/dbms/src/IO/S3Common.h b/dbms/src/IO/S3Common.h new file mode 100644 index 00000000000..2f18021c93e --- /dev/null +++ b/dbms/src/IO/S3Common.h @@ -0,0 +1,18 @@ +#pragma once + +#include + +#include + + +namespace DB +{ + +struct S3Helper +{ + static void authenticateRequest(Poco::Net::HTTPRequest & request, + const String & access_key_id, + const String & secret_access_key); +}; + +} diff --git a/dbms/src/IO/WriteBufferFromS3.cpp b/dbms/src/IO/WriteBufferFromS3.cpp index 977a5b22fdc..4154db48282 100644 --- a/dbms/src/IO/WriteBufferFromS3.cpp +++ b/dbms/src/IO/WriteBufferFromS3.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -113,6 +114,8 @@ void WriteBufferFromS3::initiate() request_ptr = std::make_unique(Poco::Net::HTTPRequest::HTTP_POST, initiate_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request_ptr->setHost(initiate_uri.getHost()); // use original, not resolved host name in header + S3Helper::authenticateRequest(*request_ptr, access_key_id, secret_access_key); + request_ptr->setContentLength(0); LOG_TRACE((&Logger::get("WriteBufferFromS3")), "Sending request to " << initiate_uri.toString()); @@ -173,6 +176,8 @@ void WriteBufferFromS3::writePart(const String & data) request_ptr = std::make_unique(Poco::Net::HTTPRequest::HTTP_PUT, part_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request_ptr->setHost(part_uri.getHost()); // use original, not resolved host name in header + S3Helper::authenticateRequest(*request_ptr, access_key_id, secret_access_key); + request_ptr->setExpectContinue(true); request_ptr->setContentLength(data.size()); @@ -240,6 +245,8 @@ void WriteBufferFromS3::complete() request_ptr = std::make_unique(Poco::Net::HTTPRequest::HTTP_POST, complete_uri.getPathAndQuery(), Poco::Net::HTTPRequest::HTTP_1_1); request_ptr->setHost(complete_uri.getHost()); // use original, not resolved host name in header + S3Helper::authenticateRequest(*request_ptr, access_key_id, secret_access_key); + request_ptr->setExpectContinue(true); request_ptr->setContentLength(data.size()); From d0760bc1690dd3ff542821a1b7f6301b29454cb3 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Sun, 1 Dec 2019 14:24:55 +0300 Subject: [PATCH 03/19] Fixed tests and logic of authorization in S3. --- dbms/src/IO/S3Common.cpp | 19 +++- .../tests/integration/test_storage_s3/test.py | 91 ++++++++++++++----- 2 files changed, 82 insertions(+), 28 deletions(-) diff --git a/dbms/src/IO/S3Common.cpp b/dbms/src/IO/S3Common.cpp index dea2f888662..92dc9559614 100644 --- a/dbms/src/IO/S3Common.cpp +++ b/dbms/src/IO/S3Common.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -22,14 +23,23 @@ void S3Helper::authenticateRequest(Poco::Net::HTTPRequest & request, return; /// Limitations: - /// 1. Virtual hosted-style requests are not supported. + /// 1. Virtual hosted-style requests are not supported (e.g. `http://johnsmith.net.s3.amazonaws.com/homepage.html`). /// 2. AMZ headers are not supported (TODO). + if (!request.has("Date")) + { + char buffer[1024]; + time_t now = std::time(nullptr); + struct tm *my_tm = std::gmtime(&now); + std::strftime(buffer, sizeof(buffer), "%a, %d %b %Y %H:%M:%S GMT", my_tm); /// See RFC 1123. + request.set("Date", buffer); + } + String string_to_sign = request.getMethod() + "\n" + request.get("Content-MD5", "") + "\n" + request.get("Content-Type", "") + "\n" - + request.get("Date", "") + "\n" - + Poco::URI(request.getURI()).getPath(); + + request.get("Date") + "\n" + + Poco::URI(request.getURI()).getPathAndQuery(); Poco::HMACEngine engine(secret_access_key); engine.update(string_to_sign); @@ -37,8 +47,9 @@ void S3Helper::authenticateRequest(Poco::Net::HTTPRequest & request, std::ostringstream signature; Poco::Base64Encoder encoder(signature); std::copy(digest.begin(), digest.end(), std::ostream_iterator(encoder)); + encoder.close(); request.set("Authorization", "AWS " + access_key_id + ":" + signature.str()); } -} \ No newline at end of file +} diff --git a/dbms/tests/integration/test_storage_s3/test.py b/dbms/tests/integration/test_storage_s3/test.py index 1db472e3019..ed447274e86 100644 --- a/dbms/tests/integration/test_storage_s3/test.py +++ b/dbms/tests/integration/test_storage_s3/test.py @@ -5,6 +5,9 @@ import pytest from helpers.cluster import ClickHouseCluster, ClickHouseInstance +import helpers.client + + logging.getLogger().setLevel(logging.INFO) logging.getLogger().addHandler(logging.StreamHandler()) @@ -53,12 +56,18 @@ def prepare_s3_bucket(cluster): minio_client.set_bucket_policy(cluster.minio_bucket, json.dumps(bucket_read_write_policy)) + cluster.minio_restricted_bucket = "{}-with-auth".format(cluster.minio_bucket) + if minio_client.bucket_exists(cluster.minio_restricted_bucket): + minio_client.remove_bucket(cluster.minio_restricted_bucket) + + minio_client.make_bucket(cluster.minio_restricted_bucket) + # Returns content of given S3 file as string. -def get_s3_file_content(cluster, filename): +def get_s3_file_content(cluster, bucket, filename): # type: (ClickHouseCluster, str) -> str - data = cluster.minio_client.get_object(cluster.minio_bucket, filename) + data = cluster.minio_client.get_object(bucket, filename) data_str = "" for chunk in data.stream(): data_str += chunk @@ -101,53 +110,76 @@ def run_query(instance, query, stdin=None, settings=None): # Test simple put. -def test_put(cluster): +@pytest.mark.parametrize("maybe_auth,positive", [ + ("",True), + ("'minio','minio123',",True), + ("'wrongid','wrongkey',",False) +]) +def test_put(cluster, maybe_auth, positive): # type: (ClickHouseCluster) -> None + bucket = cluster.minio_bucket if not maybe_auth else cluster.minio_restricted_bucket instance = cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)" values_csv = "1,2,3\n3,2,1\n78,43,45\n" filename = "test.csv" - put_query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format( - cluster.minio_host, cluster.minio_port, cluster.minio_bucket, filename, table_format, values) - run_query(instance, put_query) + put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') values {}".format( + cluster.minio_host, cluster.minio_port, bucket, filename, maybe_auth, table_format, values) - assert values_csv == get_s3_file_content(cluster, filename) + try: + run_query(instance, put_query) + except helpers.client.QueryRuntimeException: + assert not positive + else: + assert positive + assert values_csv == get_s3_file_content(cluster, bucket, filename) # Test put values in CSV format. -def test_put_csv(cluster): +@pytest.mark.parametrize("maybe_auth,positive", [ + ("",True), + ("'minio','minio123',",True), + ("'wrongid','wrongkey',",False) +]) +def test_put_csv(cluster, maybe_auth, positive): # type: (ClickHouseCluster) -> None + bucket = cluster.minio_bucket if not maybe_auth else cluster.minio_restricted_bucket instance = cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" filename = "test.csv" - put_query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') format CSV".format( - cluster.minio_host, cluster.minio_port, cluster.minio_bucket, filename, table_format) + put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format( + cluster.minio_host, cluster.minio_port, bucket, filename, maybe_auth, table_format) csv_data = "8,9,16\n11,18,13\n22,14,2\n" - run_query(instance, put_query, stdin=csv_data) - assert csv_data == get_s3_file_content(cluster, filename) + try: + run_query(instance, put_query, stdin=csv_data) + except helpers.client.QueryRuntimeException: + assert not positive + else: + assert positive + assert csv_data == get_s3_file_content(cluster, bucket, filename) # Test put and get with S3 server redirect. def test_put_get_with_redirect(cluster): # type: (ClickHouseCluster) -> None + bucket = cluster.minio_bucket instance = cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" values = "(1, 1, 1), (1, 1, 1), (11, 11, 11)" values_csv = "1,1,1\n1,1,1\n11,11,11\n" filename = "test.csv" query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format( - cluster.minio_redirect_host, cluster.minio_redirect_port, cluster.minio_bucket, filename, table_format, values) + cluster.minio_redirect_host, cluster.minio_redirect_port, bucket, filename, table_format, values) run_query(instance, query) - assert values_csv == get_s3_file_content(cluster, filename) + assert values_csv == get_s3_file_content(cluster, bucket, filename) query = "select *, column1*column2*column3 from s3('http://{}:{}/{}/{}', 'CSV', '{}')".format( - cluster.minio_redirect_host, cluster.minio_redirect_port, cluster.minio_bucket, filename, table_format) + cluster.minio_redirect_host, cluster.minio_redirect_port, bucket, filename, table_format) stdout = run_query(instance, query) assert list(map(str.split, stdout.splitlines())) == [ @@ -158,9 +190,15 @@ def test_put_get_with_redirect(cluster): # Test multipart put. -def test_multipart_put(cluster): +@pytest.mark.parametrize("maybe_auth,positive", [ + ("",True), + ("'minio','minio123',",True), + ("'wrongid','wrongkey',",False) +]) +def test_multipart_put(cluster, maybe_auth, positive): # type: (ClickHouseCluster) -> None + bucket = cluster.minio_bucket if not maybe_auth else cluster.minio_restricted_bucket instance = cluster.instances["dummy"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" @@ -178,14 +216,19 @@ def test_multipart_put(cluster): assert len(csv_data) > min_part_size_bytes filename = "test_multipart.csv" - put_query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') format CSV".format( - cluster.minio_redirect_host, cluster.minio_redirect_port, cluster.minio_bucket, filename, table_format) + put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format( + cluster.minio_redirect_host, cluster.minio_redirect_port, bucket, filename, maybe_auth, table_format) - run_query(instance, put_query, stdin=csv_data, settings={'s3_min_upload_part_size': min_part_size_bytes}) + try: + run_query(instance, put_query, stdin=csv_data, settings={'s3_min_upload_part_size': min_part_size_bytes}) + except helpers.client.QueryRuntimeException: + assert not positive + else: + assert positive - # Use Nginx access logs to count number of parts uploaded to Minio. - nginx_logs = get_nginx_access_logs() - uploaded_parts = filter(lambda log_line: log_line.find(filename) >= 0 and log_line.find("PUT") >= 0, nginx_logs) - assert uploaded_parts > 1 + # Use Nginx access logs to count number of parts uploaded to Minio. + nginx_logs = get_nginx_access_logs() + uploaded_parts = filter(lambda log_line: log_line.find(filename) >= 0 and log_line.find("PUT") >= 0, nginx_logs) + assert uploaded_parts > 1 - assert csv_data == get_s3_file_content(cluster, filename) + assert csv_data == get_s3_file_content(cluster, bucket, filename) From 03ec61fff3226aec27818fe47bfda804fb35fa46 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Sun, 1 Dec 2019 15:35:04 +0300 Subject: [PATCH 04/19] Fixed `01030_storage_s3_syntax` stateless test. --- dbms/tests/queries/0_stateless/01030_storage_s3_syntax.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/tests/queries/0_stateless/01030_storage_s3_syntax.sql b/dbms/tests/queries/0_stateless/01030_storage_s3_syntax.sql index 6579984f57d..44cd149dd51 100644 --- a/dbms/tests/queries/0_stateless/01030_storage_s3_syntax.sql +++ b/dbms/tests/queries/0_stateless/01030_storage_s3_syntax.sql @@ -2,7 +2,7 @@ drop table if exists test_table_s3_syntax ; create table test_table_s3_syntax (id UInt32) ENGINE = S3('') ; -- { serverError 42 } -create table test_table_s3_syntax (id UInt32) ENGINE = S3('','','','') +create table test_table_s3_syntax (id UInt32) ENGINE = S3('','','','','','') ; -- { serverError 42 } drop table if exists test_table_s3_syntax ; From fe05565cece98e85dade22769efc4e557ca55f8a Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Tue, 3 Dec 2019 03:03:44 +0300 Subject: [PATCH 05/19] Update StorageS3.cpp --- dbms/src/Storages/StorageS3.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Storages/StorageS3.cpp b/dbms/src/Storages/StorageS3.cpp index 0813f39d51f..df7313805d9 100644 --- a/dbms/src/Storages/StorageS3.cpp +++ b/dbms/src/Storages/StorageS3.cpp @@ -222,7 +222,7 @@ void registerStorageS3(StorageFactory & factory) String url = engine_args[0]->as().value.safeGet(); Poco::URI uri(url); - String format_name = engine_args[engine_args.size()-1]->as().value.safeGet(); + String format_name = engine_args[engine_args.size() - 1]->as().value.safeGet(); String access_key_id; String secret_access_key; From 8fa05a212b0e4794ccffe5d249b055a67d60b3a1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 3 Dec 2019 04:22:25 +0300 Subject: [PATCH 06/19] Fixed bug in #7623 --- dbms/src/Common/ErrorCodes.cpp | 1 + dbms/src/Functions/formatDateTime.cpp | 14 +--- dbms/src/IO/S3Common.cpp | 19 +++-- dbms/src/IO/S3Common.h | 5 +- dbms/src/IO/WriteHelpers.h | 108 +++++++++++++++----------- 5 files changed, 79 insertions(+), 68 deletions(-) diff --git a/dbms/src/Common/ErrorCodes.cpp b/dbms/src/Common/ErrorCodes.cpp index 4f49ca92df4..0ea287a01e9 100644 --- a/dbms/src/Common/ErrorCodes.cpp +++ b/dbms/src/Common/ErrorCodes.cpp @@ -464,6 +464,7 @@ namespace ErrorCodes extern const int CANNOT_GET_CREATE_DICTIONARY_QUERY = 487; extern const int UNKNOWN_DICTIONARY = 488; extern const int INCORRECT_DICTIONARY_DEFINITION = 489; + extern const int CANNOT_FORMAT_DATETIME = 490; extern const int KEEPER_EXCEPTION = 999; extern const int POCO_EXCEPTION = 1000; diff --git a/dbms/src/Functions/formatDateTime.cpp b/dbms/src/Functions/formatDateTime.cpp index 8cecdb69717..c7150515935 100644 --- a/dbms/src/Functions/formatDateTime.cpp +++ b/dbms/src/Functions/formatDateTime.cpp @@ -91,19 +91,7 @@ private: template static inline void writeNumber2(char * p, T v) { - static const char digits[201] = - "00010203040506070809" - "10111213141516171819" - "20212223242526272829" - "30313233343536373839" - "40414243444546474849" - "50515253545556575859" - "60616263646566676869" - "70717273747576777879" - "80818283848586878889" - "90919293949596979899"; - - memcpy(p, &digits[v * 2], 2); + memcpy(p, &digits100[v * 2], 2); } template diff --git a/dbms/src/IO/S3Common.cpp b/dbms/src/IO/S3Common.cpp index 92dc9559614..1233bae38e1 100644 --- a/dbms/src/IO/S3Common.cpp +++ b/dbms/src/IO/S3Common.cpp @@ -1,6 +1,7 @@ #include +#include +#include -#include #include #include @@ -13,7 +14,13 @@ namespace DB { -void S3Helper::authenticateRequest(Poco::Net::HTTPRequest & request, +namespace ErrorCodes +{ + extern const int CANNOT_FORMAT_DATETIME; +} + +void S3Helper::authenticateRequest( + Poco::Net::HTTPRequest & request, const String & access_key_id, const String & secret_access_key) { @@ -28,11 +35,9 @@ void S3Helper::authenticateRequest(Poco::Net::HTTPRequest & request, if (!request.has("Date")) { - char buffer[1024]; - time_t now = std::time(nullptr); - struct tm *my_tm = std::gmtime(&now); - std::strftime(buffer, sizeof(buffer), "%a, %d %b %Y %H:%M:%S GMT", my_tm); /// See RFC 1123. - request.set("Date", buffer); + WriteBufferFromOwnString out; + writeDateTimeTextRFC1123(time(nullptr), out, DateLUT::instance("UTC")); + request.set("Date", out.str()); } String string_to_sign = request.getMethod() + "\n" diff --git a/dbms/src/IO/S3Common.h b/dbms/src/IO/S3Common.h index 2f18021c93e..b68f5c9b536 100644 --- a/dbms/src/IO/S3Common.h +++ b/dbms/src/IO/S3Common.h @@ -8,9 +8,10 @@ namespace DB { -struct S3Helper +namespace S3Helper { - static void authenticateRequest(Poco::Net::HTTPRequest & request, + void authenticateRequest( + Poco::Net::HTTPRequest & request, const String & access_key_id, const String & secret_access_key); }; diff --git a/dbms/src/IO/WriteHelpers.h b/dbms/src/IO/WriteHelpers.h index 0163a3c3740..573a03519e0 100644 --- a/dbms/src/IO/WriteHelpers.h +++ b/dbms/src/IO/WriteHelpers.h @@ -568,45 +568,46 @@ inline void writeUUIDText(const UUID & uuid, WriteBuffer & buf) buf.write(s, sizeof(s)); } + +static const char digits100[201] = + "00010203040506070809" + "10111213141516171819" + "20212223242526272829" + "30313233343536373839" + "40414243444546474849" + "50515253545556575859" + "60616263646566676869" + "70717273747576777879" + "80818283848586878889" + "90919293949596979899"; + /// in YYYY-MM-DD format template inline void writeDateText(const LocalDate & date, WriteBuffer & buf) { - static const char digits[201] = - "00010203040506070809" - "10111213141516171819" - "20212223242526272829" - "30313233343536373839" - "40414243444546474849" - "50515253545556575859" - "60616263646566676869" - "70717273747576777879" - "80818283848586878889" - "90919293949596979899"; - if (buf.position() + 10 <= buf.buffer().end()) { - memcpy(buf.position(), &digits[date.year() / 100 * 2], 2); + memcpy(buf.position(), &digits100[date.year() / 100 * 2], 2); buf.position() += 2; - memcpy(buf.position(), &digits[date.year() % 100 * 2], 2); + memcpy(buf.position(), &digits100[date.year() % 100 * 2], 2); buf.position() += 2; *buf.position() = delimiter; ++buf.position(); - memcpy(buf.position(), &digits[date.month() * 2], 2); + memcpy(buf.position(), &digits100[date.month() * 2], 2); buf.position() += 2; *buf.position() = delimiter; ++buf.position(); - memcpy(buf.position(), &digits[date.day() * 2], 2); + memcpy(buf.position(), &digits100[date.day() * 2], 2); buf.position() += 2; } else { - buf.write(&digits[date.year() / 100 * 2], 2); - buf.write(&digits[date.year() % 100 * 2], 2); + buf.write(&digits100[date.year() / 100 * 2], 2); + buf.write(&digits100[date.year() % 100 * 2], 2); buf.write(delimiter); - buf.write(&digits[date.month() * 2], 2); + buf.write(&digits100[date.month() * 2], 2); buf.write(delimiter); - buf.write(&digits[date.day() * 2], 2); + buf.write(&digits100[date.day() * 2], 2); } } @@ -628,59 +629,47 @@ inline void writeDateText(DayNum date, WriteBuffer & buf) template inline void writeDateTimeText(const LocalDateTime & datetime, WriteBuffer & buf) { - static const char digits[201] = - "00010203040506070809" - "10111213141516171819" - "20212223242526272829" - "30313233343536373839" - "40414243444546474849" - "50515253545556575859" - "60616263646566676869" - "70717273747576777879" - "80818283848586878889" - "90919293949596979899"; - if (buf.position() + 19 <= buf.buffer().end()) { - memcpy(buf.position(), &digits[datetime.year() / 100 * 2], 2); + memcpy(buf.position(), &digits100[datetime.year() / 100 * 2], 2); buf.position() += 2; - memcpy(buf.position(), &digits[datetime.year() % 100 * 2], 2); + memcpy(buf.position(), &digits100[datetime.year() % 100 * 2], 2); buf.position() += 2; *buf.position() = date_delimeter; ++buf.position(); - memcpy(buf.position(), &digits[datetime.month() * 2], 2); + memcpy(buf.position(), &digits100[datetime.month() * 2], 2); buf.position() += 2; *buf.position() = date_delimeter; ++buf.position(); - memcpy(buf.position(), &digits[datetime.day() * 2], 2); + memcpy(buf.position(), &digits100[datetime.day() * 2], 2); buf.position() += 2; *buf.position() = between_date_time_delimiter; ++buf.position(); - memcpy(buf.position(), &digits[datetime.hour() * 2], 2); + memcpy(buf.position(), &digits100[datetime.hour() * 2], 2); buf.position() += 2; *buf.position() = time_delimeter; ++buf.position(); - memcpy(buf.position(), &digits[datetime.minute() * 2], 2); + memcpy(buf.position(), &digits100[datetime.minute() * 2], 2); buf.position() += 2; *buf.position() = time_delimeter; ++buf.position(); - memcpy(buf.position(), &digits[datetime.second() * 2], 2); + memcpy(buf.position(), &digits100[datetime.second() * 2], 2); buf.position() += 2; } else { - buf.write(&digits[datetime.year() / 100 * 2], 2); - buf.write(&digits[datetime.year() % 100 * 2], 2); + buf.write(&digits100[datetime.year() / 100 * 2], 2); + buf.write(&digits100[datetime.year() % 100 * 2], 2); buf.write(date_delimeter); - buf.write(&digits[datetime.month() * 2], 2); + buf.write(&digits100[datetime.month() * 2], 2); buf.write(date_delimeter); - buf.write(&digits[datetime.day() * 2], 2); + buf.write(&digits100[datetime.day() * 2], 2); buf.write(between_date_time_delimiter); - buf.write(&digits[datetime.hour() * 2], 2); + buf.write(&digits100[datetime.hour() * 2], 2); buf.write(time_delimeter); - buf.write(&digits[datetime.minute() * 2], 2); + buf.write(&digits100[datetime.minute() * 2], 2); buf.write(time_delimeter); - buf.write(&digits[datetime.second() * 2], 2); + buf.write(&digits100[datetime.second() * 2], 2); } } @@ -707,6 +696,33 @@ inline void writeDateTimeText(time_t datetime, WriteBuffer & buf, const DateLUTI } +/// In the RFC 1123 format: "Tue, 03 Dec 2019 00:11:50 GMT". You must provide GMT DateLUT. +/// This is needed for HTTP requests. +inline void writeDateTimeTextRFC1123(time_t datetime, WriteBuffer & buf, const DateLUTImpl & date_lut) +{ + const auto & values = date_lut.getValues(datetime); + + static const char week_days[3 * 7 + 1] = "Mon" "Tue" "Wed" "Thu" "Fri" "Sat" "Sun"; + static const char months[3 * 12 + 1] = "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"; + + buf.write(&week_days[values.day_of_week * 3], 3); + buf.write(", ", 2); + buf.write(&digits100[values.day_of_month * 2], 2); + buf.write(' '); + buf.write(&months[values.month * 3], 3); + buf.write(' '); + buf.write(&digits100[values.year / 100 * 2], 2); + buf.write(&digits100[values.year % 100 * 2], 2); + buf.write(' '); + buf.write(&digits100[date_lut.toHour(datetime) * 2], 2); + buf.write(':'); + buf.write(&digits100[date_lut.toMinute(datetime) * 2], 2); + buf.write(':'); + buf.write(&digits100[date_lut.toSecond(datetime) * 2], 2); + buf.write(" GMT", 4); +} + + /// Methods for output in binary format. template inline std::enable_if_t, void> From 199a209c82b349613460c3cca51fa2e44533d4eb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 3 Dec 2019 04:23:17 +0300 Subject: [PATCH 07/19] Added a test --- dbms/src/IO/tests/gtest_rfc1123.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 dbms/src/IO/tests/gtest_rfc1123.cpp diff --git a/dbms/src/IO/tests/gtest_rfc1123.cpp b/dbms/src/IO/tests/gtest_rfc1123.cpp new file mode 100644 index 00000000000..255f3f7ed07 --- /dev/null +++ b/dbms/src/IO/tests/gtest_rfc1123.cpp @@ -0,0 +1,13 @@ +#include + +#include +#include +#include + + +TEST(RFC1123, Test) +{ + WriteBufferFromOwnString out; + writeDateTimeTextRFC1123(1111111111, out, DateLUT::instance("UTC")); + ASSERT_EQ(out.str(), "Fri, 18 Mar 2005 01:58:31 GMT"); +} From 93ef18501f898d75c2d47e3b120ca68355e37601 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 3 Dec 2019 04:27:48 +0300 Subject: [PATCH 08/19] Added a test --- dbms/src/IO/tests/gtest_rfc1123.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/dbms/src/IO/tests/gtest_rfc1123.cpp b/dbms/src/IO/tests/gtest_rfc1123.cpp index 255f3f7ed07..66d7484de1f 100644 --- a/dbms/src/IO/tests/gtest_rfc1123.cpp +++ b/dbms/src/IO/tests/gtest_rfc1123.cpp @@ -1,12 +1,13 @@ #include -#include +#include #include #include TEST(RFC1123, Test) { + using namespace DB; WriteBufferFromOwnString out; writeDateTimeTextRFC1123(1111111111, out, DateLUT::instance("UTC")); ASSERT_EQ(out.str(), "Fri, 18 Mar 2005 01:58:31 GMT"); From 49d25e06a8cdff61de8377126922befd4525975b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 3 Dec 2019 04:29:19 +0300 Subject: [PATCH 09/19] Added a test --- dbms/src/IO/WriteHelpers.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/IO/WriteHelpers.h b/dbms/src/IO/WriteHelpers.h index 573a03519e0..509c37257ad 100644 --- a/dbms/src/IO/WriteHelpers.h +++ b/dbms/src/IO/WriteHelpers.h @@ -702,8 +702,8 @@ inline void writeDateTimeTextRFC1123(time_t datetime, WriteBuffer & buf, const D { const auto & values = date_lut.getValues(datetime); - static const char week_days[3 * 7 + 1] = "Mon" "Tue" "Wed" "Thu" "Fri" "Sat" "Sun"; - static const char months[3 * 12 + 1] = "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"; + static const char week_days[3 * 8 + 1] = "XXX" "Mon" "Tue" "Wed" "Thu" "Fri" "Sat" "Sun"; + static const char months[3 * 13 + 1] = "XXX" "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov" "Dec"; buf.write(&week_days[values.day_of_week * 3], 3); buf.write(", ", 2); From 2474cdfa09f04b5b5cf16a127341ea3677916996 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 3 Dec 2019 04:55:46 +0300 Subject: [PATCH 10/19] Removed tons of garbage from "greatCircleDistance" function. But algorithm is still unclear. --- dbms/src/Columns/ColumnConst.h | 5 + dbms/src/Columns/ColumnLowCardinality.h | 1 + dbms/src/Columns/ColumnUnique.h | 1 + dbms/src/Columns/ColumnVector.cpp | 6 + dbms/src/Columns/ColumnVector.h | 1 + dbms/src/Columns/IColumn.h | 5 + dbms/src/Functions/greatCircleDistance.cpp | 183 ++++++++------------- 7 files changed, 84 insertions(+), 118 deletions(-) diff --git a/dbms/src/Columns/ColumnConst.h b/dbms/src/Columns/ColumnConst.h index 5fdf9db1ab2..0b8ca38e823 100644 --- a/dbms/src/Columns/ColumnConst.h +++ b/dbms/src/Columns/ColumnConst.h @@ -105,6 +105,11 @@ public: return data->getFloat64(0); } + Float32 getFloat32(size_t) const override + { + return data->getFloat32(0); + } + bool isNullAt(size_t) const override { return data->isNullAt(0); diff --git a/dbms/src/Columns/ColumnLowCardinality.h b/dbms/src/Columns/ColumnLowCardinality.h index 74ea04cb08f..c69e5fc039d 100644 --- a/dbms/src/Columns/ColumnLowCardinality.h +++ b/dbms/src/Columns/ColumnLowCardinality.h @@ -59,6 +59,7 @@ public: UInt64 getUInt(size_t n) const override { return getDictionary().getUInt(getIndexes().getUInt(n)); } Int64 getInt(size_t n) const override { return getDictionary().getInt(getIndexes().getUInt(n)); } Float64 getFloat64(size_t n) const override { return getDictionary().getInt(getIndexes().getFloat64(n)); } + Float32 getFloat32(size_t n) const override { return getDictionary().getInt(getIndexes().getFloat32(n)); } bool getBool(size_t n) const override { return getDictionary().getInt(getIndexes().getBool(n)); } bool isNullAt(size_t n) const override { return getDictionary().isNullAt(getIndexes().getUInt(n)); } ColumnPtr cut(size_t start, size_t length) const override diff --git a/dbms/src/Columns/ColumnUnique.h b/dbms/src/Columns/ColumnUnique.h index 62a468e5821..5b53f515001 100644 --- a/dbms/src/Columns/ColumnUnique.h +++ b/dbms/src/Columns/ColumnUnique.h @@ -66,6 +66,7 @@ public: UInt64 getUInt(size_t n) const override { return getNestedColumn()->getUInt(n); } Int64 getInt(size_t n) const override { return getNestedColumn()->getInt(n); } Float64 getFloat64(size_t n) const override { return getNestedColumn()->getFloat64(n); } + Float32 getFloat32(size_t n) const override { return getNestedColumn()->getFloat32(n); } bool getBool(size_t n) const override { return getNestedColumn()->getBool(n); } bool isNullAt(size_t n) const override { return is_nullable && n == getNullValueIndex(); } StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; diff --git a/dbms/src/Columns/ColumnVector.cpp b/dbms/src/Columns/ColumnVector.cpp index 5ec436cd28b..9d56281ea1c 100644 --- a/dbms/src/Columns/ColumnVector.cpp +++ b/dbms/src/Columns/ColumnVector.cpp @@ -222,6 +222,12 @@ Float64 ColumnVector::getFloat64(size_t n) const return static_cast(data[n]); } +template +Float32 ColumnVector::getFloat32(size_t n) const +{ + return static_cast(data[n]); +} + template void ColumnVector::insertRangeFrom(const IColumn & src, size_t start, size_t length) { diff --git a/dbms/src/Columns/ColumnVector.h b/dbms/src/Columns/ColumnVector.h index 28307cb33f0..072f9b48960 100644 --- a/dbms/src/Columns/ColumnVector.h +++ b/dbms/src/Columns/ColumnVector.h @@ -205,6 +205,7 @@ public: UInt64 get64(size_t n) const override; Float64 getFloat64(size_t n) const override; + Float32 getFloat32(size_t n) const override; UInt64 getUInt(size_t n) const override { diff --git a/dbms/src/Columns/IColumn.h b/dbms/src/Columns/IColumn.h index 2b340a84783..7478083ff70 100644 --- a/dbms/src/Columns/IColumn.h +++ b/dbms/src/Columns/IColumn.h @@ -100,6 +100,11 @@ public: throw Exception("Method getFloat64 is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + virtual Float32 getFloat32(size_t /*n*/) const + { + throw Exception("Method getFloat32 is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); + } + /** If column is numeric, return value of n-th element, casted to UInt64. * For NULL values of Nullable column it is allowed to return arbitrary value. * Otherwise throw an exception. diff --git a/dbms/src/Functions/greatCircleDistance.cpp b/dbms/src/Functions/greatCircleDistance.cpp index 2d1c310cd40..eb305109689 100644 --- a/dbms/src/Functions/greatCircleDistance.cpp +++ b/dbms/src/Functions/greatCircleDistance.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include @@ -21,19 +21,32 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +/** https://en.wikipedia.org/wiki/Great-circle_distance + * + * The function calculates distance in meters between two points on Earth specified by longitude and latitude in degrees. + * The function uses great circle distance formula https://en.wikipedia.org/wiki/Great-circle_distance . + * Throws exception when one or several input values are not within reasonable bounds. + * Latitude must be in [-90, 90], longitude must be [-180, 180]. + * Original code of this implementation of this function is here https://github.com/sphinxsearch/sphinx/blob/409f2c2b5b2ff70b04e38f92b6b1a890326bad65/src/sphinxexpr.cpp#L3825. + * Andrey Aksenov, the author of original code, permitted to use this code in ClickHouse under the Apache 2.0 license. + * Presentation about this code from Highload++ Siberia 2019 is here https://github.com/ClickHouse/ClickHouse/files/3324740/1_._._GEODIST_._.pdf + * The main idea of this implementation is optimisations based on Taylor series, trigonometric identity and calculated constants once for cosine, arcsine(sqrt) and look up table. + */ + namespace { -const double PI = 3.14159265358979323846; -const float TO_RADF = static_cast(PI / 180.0); -const float TO_RADF2 = static_cast(PI / 360.0); -const int GEODIST_TABLE_COS = 1024; // maxerr 0.00063% -const int GEODIST_TABLE_ASIN = 512; -const int GEODIST_TABLE_K = 1024; +constexpr double PI = 3.14159265358979323846; +constexpr float TO_RADF = static_cast(PI / 180.0); +constexpr float TO_RADF2 = static_cast(PI / 360.0); + +constexpr size_t GEODIST_TABLE_COS = 1024; // maxerr 0.00063% +constexpr size_t GEODIST_TABLE_ASIN = 512; +constexpr size_t GEODIST_TABLE_K = 1024; float g_GeoCos[GEODIST_TABLE_COS + 1]; /// cos(x) table float g_GeoAsin[GEODIST_TABLE_ASIN + 1]; /// asin(sqrt(x)) table -float g_GeoFlatK[GEODIST_TABLE_K + 1][2]; /// geodistAdaptive() flat ellipsoid method k1,k2 coeffs table +float g_GeoFlatK[GEODIST_TABLE_K + 1][2]; /// geodistAdaptive() flat ellipsoid method k1, k2 coeffs table inline double sqr(double v) { @@ -48,7 +61,7 @@ inline float fsqr(float v) void geodistInit() { for (size_t i = 0; i <= GEODIST_TABLE_COS; ++i) - g_GeoCos[i] = static_cast(cos(2 * PI * i / GEODIST_TABLE_COS)); // [0, 2pi] -> [0, COSTABLE] + g_GeoCos[i] = static_cast(cos(2 * PI * i / GEODIST_TABLE_COS)); // [0, 2 * pi] -> [0, COSTABLE] for (size_t i = 0; i <= GEODIST_TABLE_ASIN; ++i) g_GeoAsin[i] = static_cast(asin( @@ -56,7 +69,7 @@ void geodistInit() for (size_t i = 0; i <= GEODIST_TABLE_K; ++i) { - double x = PI * i / GEODIST_TABLE_K - PI * 0.5; // [-pi/2, pi/2] -> [0, KTABLE] + double x = PI * i / GEODIST_TABLE_K - PI * 0.5; // [-pi / 2, pi / 2] -> [0, KTABLE] g_GeoFlatK[i][0] = static_cast(sqr(111132.09 - 566.05 * cos(2 * x) + 1.20 * cos(4 * x))); g_GeoFlatK[i][1] = static_cast(sqr(111415.13 * cos(x) - 94.55 * cos(3 * x) + 0.12 * cos(5 * x))); } @@ -86,11 +99,10 @@ inline float geodistFastSin(float x) float y = static_cast(fabs(x) * GEODIST_TABLE_COS / PI / 2); int i = static_cast(y); y -= i; - i = (i - GEODIST_TABLE_COS / 4) & (GEODIST_TABLE_COS - 1); // cos(x-pi/2)=sin(x), costable/4=pi/2 + i = (i - GEODIST_TABLE_COS / 4) & (GEODIST_TABLE_COS - 1); // cos(x - pi / 2) = sin(x), costable / 4 = pi / 2 return g_GeoCos[i] + (g_GeoCos[i + 1] - g_GeoCos[i]) * y; } - /// fast implementation of asin(sqrt(x)) /// max error in floats 0.00369%, in doubles 0.00072% inline float geodistFastAsinSqrt(float x) @@ -110,17 +122,10 @@ inline float geodistFastAsinSqrt(float x) } return static_cast(asin(sqrt(x))); // distance over 17083km, just compute honestly } + } -/** - * The function calculates distance in meters between two points on Earth specified by longitude and latitude in degrees. - * The function uses great circle distance formula https://en.wikipedia.org/wiki/Great-circle_distance . - * Throws exception when one or several input values are not within reasonable bounds. - * Latitude must be in [-90, 90], longitude must be [-180, 180]. - * Original code of this implementation of this function is here https://github.com/sphinxsearch/sphinx/blob/409f2c2b5b2ff70b04e38f92b6b1a890326bad65/src/sphinxexpr.cpp#L3825. - * Andrey Aksenov, the author of original code, permitted to use this code in ClickHouse under the Apache 2.0 license. - * Presentation about this code from Highload++ Siberia 2019 is here https://github.com/ClickHouse/ClickHouse/files/3324740/1_._._GEODIST_._.pdf - * The main idea of this implementation is optimisations based on Taylor series, trigonometric identity and calculated constants once for cosine, arcsine(sqrt) and look up table. - */ + + class FunctionGreatCircleDistance : public IFunction { public: @@ -128,134 +133,76 @@ public: static FunctionPtr create(const Context &) { return std::make_shared(); } private: - - enum class instr_type : uint8_t - { - get_float_64, - get_const_float_64 - }; - - using instr_t = std::pair; - using instrs_t = std::array; - String getName() const override { return name; } - size_t getNumberOfArguments() const override { return 4; } + bool useDefaultImplementationForConstants() const override { return true; } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { for (const auto arg_idx : ext::range(0, arguments.size())) { const auto arg = arguments[arg_idx].get(); - if (!WhichDataType(arg).isFloat64()) + if (!WhichDataType(arg).isFloat()) throw Exception( "Illegal type " + arg->getName() + " of argument " + std::to_string(arg_idx + 1) + " of function " + getName() + ". Must be Float64", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); } - return std::make_shared(); + return std::make_shared(); } - instrs_t getInstructions(const Block & block, const ColumnNumbers & arguments, bool & out_const) + Float32 greatCircleDistance(Float32 lon1deg, Float32 lat1deg, Float32 lon2deg, Float32 lat2deg) { - instrs_t result; - out_const = true; - - for (const auto arg_idx : ext::range(0, arguments.size())) - { - const auto column = block.getByPosition(arguments[arg_idx]).column.get(); - - if (const auto col = checkAndGetColumn>(column)) - { - out_const = false; - result[arg_idx] = instr_t{instr_type::get_float_64, col}; - } - else if (const auto col_const = checkAndGetColumnConst>(column)) - { - result[arg_idx] = instr_t{instr_type::get_const_float_64, col_const}; - } - else - throw Exception("Illegal column " + column->getName() + " of argument of function " + getName(), - ErrorCodes::ILLEGAL_COLUMN); - } - - return result; - } - - /// https://en.wikipedia.org/wiki/Great-circle_distance - Float64 greatCircleDistance(Float64 lon1Deg, Float64 lat1Deg, Float64 lon2Deg, Float64 lat2Deg) - { - if (lon1Deg < -180 || lon1Deg > 180 || - lon2Deg < -180 || lon2Deg > 180 || - lat1Deg < -90 || lat1Deg > 90 || - lat2Deg < -90 || lat2Deg > 90) + if (lon1deg < -180 || lon1deg > 180 || + lon2deg < -180 || lon2deg > 180 || + lat1deg < -90 || lat1deg > 90 || + lat2deg < -90 || lat2deg > 90) { throw Exception("Arguments values out of bounds for function " + getName(), ErrorCodes::ARGUMENT_OUT_OF_BOUND); } - float dlat = geodistDegDiff(lat1Deg - lat2Deg); - float dlon = geodistDegDiff(lon1Deg - lon2Deg); + float lat_diff = geodistDegDiff(lat1deg - lat2deg); + float lon_diff = geodistDegDiff(lon1deg - lon2deg); - if (dlon < 13) + if (lon_diff < 13) { // points are close enough; use flat ellipsoid model // interpolate sqr(k1), sqr(k2) coefficients using latitudes midpoint - float m = (lat1Deg + lat2Deg + 180) * GEODIST_TABLE_K / 360; // [-90, 90] degrees -> [0, KTABLE] indexes - int i = static_cast(m); - i &= (GEODIST_TABLE_K - 1); + float m = (lat1deg + lat2deg + 180) * GEODIST_TABLE_K / 360; // [-90, 90] degrees -> [0, KTABLE] indexes + size_t i = static_cast(m) & (GEODIST_TABLE_K - 1); float kk1 = g_GeoFlatK[i][0] + (g_GeoFlatK[i + 1][0] - g_GeoFlatK[i][0]) * (m - i); float kk2 = g_GeoFlatK[i][1] + (g_GeoFlatK[i + 1][1] - g_GeoFlatK[i][1]) * (m - i); - return static_cast(sqrt(kk1 * dlat * dlat + kk2 * dlon * dlon)); - } - // points too far away; use haversine - static const float D = 2 * 6371000; - float a = fsqr(geodistFastSin(dlat * TO_RADF2)) + - geodistFastCos(lat1Deg * TO_RADF) * geodistFastCos(lat2Deg * TO_RADF) * - fsqr(geodistFastSin(dlon * TO_RADF2)); - return static_cast(D * geodistFastAsinSqrt(a)); - } - - - void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override - { - const auto size = input_rows_count; - - bool result_is_const{}; - auto instrs = getInstructions(block, arguments, result_is_const); - - if (result_is_const) - { - const auto & colLon1 = assert_cast(block.getByPosition(arguments[0]).column.get())->getValue(); - const auto & colLat1 = assert_cast(block.getByPosition(arguments[1]).column.get())->getValue(); - const auto & colLon2 = assert_cast(block.getByPosition(arguments[2]).column.get())->getValue(); - const auto & colLat2 = assert_cast(block.getByPosition(arguments[3]).column.get())->getValue(); - - Float64 res = greatCircleDistance(colLon1, colLat1, colLon2, colLat2); - block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(size, res); + return static_cast(sqrt(kk1 * lat_diff * lat_diff + kk2 * lon_diff * lon_diff)); } else { - auto dst = ColumnVector::create(); - auto & dst_data = dst->getData(); - dst_data.resize(size); - Float64 vals[instrs.size()]; - for (const auto row : ext::range(0, size)) - { - for (const auto idx : ext::range(0, instrs.size())) - { - if (instr_type::get_float_64 == instrs[idx].first) - vals[idx] = assert_cast *>(instrs[idx].second)->getData()[row]; - else if (instr_type::get_const_float_64 == instrs[idx].first) - vals[idx] = assert_cast(instrs[idx].second)->getValue(); - else - throw Exception{"Unknown instruction type in implementation of greatCircleDistance function", ErrorCodes::LOGICAL_ERROR}; - } - dst_data[row] = greatCircleDistance(vals[0], vals[1], vals[2], vals[3]); - } - block.getByPosition(result).column = std::move(dst); + // points too far away; use haversine + static const float d = 2 * 6371000; + float a = fsqr(geodistFastSin(lat_diff * TO_RADF2)) + + geodistFastCos(lat1deg * TO_RADF) * geodistFastCos(lat2deg * TO_RADF) * + fsqr(geodistFastSin(lon_diff * TO_RADF2)); + return static_cast(d * geodistFastAsinSqrt(a)); } } + + void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t input_rows_count) override + { + auto dst = ColumnVector::create(); + auto & dst_data = dst->getData(); + dst_data.resize(input_rows_count); + + const IColumn & col_lon1 = *block.getByPosition(arguments[0]).column; + const IColumn & col_lat1 = *block.getByPosition(arguments[1]).column; + const IColumn & col_lon2 = *block.getByPosition(arguments[2]).column; + const IColumn & col_lat2 = *block.getByPosition(arguments[3]).column; + + for (size_t row_num = 0; row_num < input_rows_count; ++row_num) + dst_data[row_num] = greatCircleDistance(col_lon1.getFloat32(), col_lat1.getFloat32(), col_lon2.getFloat32(), col_lat2.getFloat32()); + + block.getByPosition(result).column = std::move(dst); + } }; From 40b0f127557a7e8c5c107914b0d714cca04ede2c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 3 Dec 2019 05:25:23 +0300 Subject: [PATCH 11/19] Removed tons of garbage from "greatCircleDistance" function. But algorithm is still unclear. --- dbms/src/Functions/greatCircleDistance.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/dbms/src/Functions/greatCircleDistance.cpp b/dbms/src/Functions/greatCircleDistance.cpp index eb305109689..ec852b06251 100644 --- a/dbms/src/Functions/greatCircleDistance.cpp +++ b/dbms/src/Functions/greatCircleDistance.cpp @@ -199,7 +199,9 @@ private: const IColumn & col_lat2 = *block.getByPosition(arguments[3]).column; for (size_t row_num = 0; row_num < input_rows_count; ++row_num) - dst_data[row_num] = greatCircleDistance(col_lon1.getFloat32(), col_lat1.getFloat32(), col_lon2.getFloat32(), col_lat2.getFloat32()); + dst_data[row_num] = greatCircleDistance( + col_lon1.getFloat32(row_num), col_lat1.getFloat32(row_num), + col_lon2.getFloat32(row_num), col_lat2.getFloat32(row_num)); block.getByPosition(result).column = std::move(dst); } From d9607c50fe3ee2401a0b57f39fb98f8cee71ced4 Mon Sep 17 00:00:00 2001 From: litao91 Date: Tue, 3 Dec 2019 16:58:00 +0800 Subject: [PATCH 12/19] Fix a minor typo on formatting UNION ALL AST --- dbms/src/Parsers/ASTSelectWithUnionQuery.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Parsers/ASTSelectWithUnionQuery.cpp b/dbms/src/Parsers/ASTSelectWithUnionQuery.cpp index a590891db98..96cac839c58 100644 --- a/dbms/src/Parsers/ASTSelectWithUnionQuery.cpp +++ b/dbms/src/Parsers/ASTSelectWithUnionQuery.cpp @@ -28,7 +28,7 @@ void ASTSelectWithUnionQuery::formatQueryImpl(const FormatSettings & settings, F if (it != list_of_selects->children.begin()) settings.ostr << settings.nl_or_ws << indent_str << (settings.hilite ? hilite_keyword : "") - << "UNION ALL" << (settings.hilite ? hilite_keyword : "") + << "UNION ALL" << (settings.hilite ? hilite_none : "") << settings.nl_or_ws; (*it)->formatImpl(settings, state, frame); From f2ff3b3dddb096c190bb7b8a7bf44c9ce88a0e4a Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 3 Dec 2019 12:59:41 +0300 Subject: [PATCH 13/19] Added argument --print-time to tests. --- dbms/tests/clickhouse-test | 42 +++++++++++++++++++++++++++++--------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/dbms/tests/clickhouse-test b/dbms/tests/clickhouse-test index eab850d3e48..5f3a2933732 100755 --- a/dbms/tests/clickhouse-test +++ b/dbms/tests/clickhouse-test @@ -1,4 +1,5 @@ #!/usr/bin/env python +from __future__ import print_function import sys import os import os.path @@ -72,6 +73,8 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std while (datetime.now() - start_time).total_seconds() < args.timeout and proc.poll() is None: sleep(0.01) + total_time = (datetime.now() - start_time).total_seconds() + # Normalize randomized database names in stdout, stderr files. os.system("sed -i -e 's/{test_db}/default/g' {file}".format(test_db=args.database, file=stdout_file)) os.system("sed -i -e 's/{test_db}/default/g' {file}".format(test_db=args.database, file=stderr_file)) @@ -81,7 +84,7 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std stderr = open(stderr_file, 'r').read() if os.path.exists(stderr_file) else '' stderr = unicode(stderr, errors='replace', encoding='utf-8') - return proc, stdout, stderr + return proc, stdout, stderr, total_time def need_retry(stderr): @@ -149,6 +152,10 @@ def run_tests_array(all_tests_with_params): client_options = get_additional_client_options(args) + def print_test_time(test_time): + if args.print_time: + print(" {0:.2f} sec.".format(test_time), end='') + if len(all_tests): print("\nRunning {} {} tests.".format(len(all_tests), suite) + "\n") @@ -194,7 +201,7 @@ def run_tests_array(all_tests_with_params): stdout_file = os.path.join(suite_tmp_dir, name) + '.stdout' stderr_file = os.path.join(suite_tmp_dir, name) + '.stderr' - proc, stdout, stderr = run_single_test(args, ext, server_logs_level, client_options, case_file, stdout_file, stderr_file) + proc, stdout, stderr, total_time = run_single_test(args, ext, server_logs_level, client_options, case_file, stdout_file, stderr_file) if proc.returncode is None: try: proc.kill() @@ -203,11 +210,13 @@ def run_tests_array(all_tests_with_params): raise failures += 1 - print("{0} - Timeout!".format(MSG_FAIL)) + print(MSG_FAIL, end='') + print_test_time(total_time) + print(" - Timeout!") else: counter = 1 while proc.returncode != 0 and need_retry(stderr): - proc, stdout, stderr = run_single_test(args, ext, server_logs_level, client_options, case_file, stdout_file, stderr_file) + proc, stdout, stderr, total_time = run_single_test(args, ext, server_logs_level, client_options, case_file, stdout_file, stderr_file) sleep(2**counter) counter += 1 if counter > 6: @@ -216,7 +225,9 @@ def run_tests_array(all_tests_with_params): if proc.returncode != 0: failures += 1 failures_chain += 1 - print("{0} - return code {1}".format(MSG_FAIL, proc.returncode)) + print(MSG_FAIL, end='') + print_test_time(total_time) + print(" - return code {}".format(proc.returncode)) if stderr: print(stderr.encode('utf-8')) @@ -227,24 +238,34 @@ def run_tests_array(all_tests_with_params): elif stderr: failures += 1 failures_chain += 1 - print("{0} - having stderror:\n{1}".format(MSG_FAIL, stderr.encode('utf-8'))) + print(MSG_FAIL, end='') + print_test_time(total_time) + print(" - having stderror:\n{}".format(stderr.encode('utf-8'))) elif 'Exception' in stdout: failures += 1 failures_chain += 1 - print("{0} - having exception:\n{1}".format(MSG_FAIL, stdout.encode('utf-8'))) + print(MSG_FAIL, end='') + print_test_time(total_time) + print(" - having exception:\n{}".format(stdout.encode('utf-8'))) elif not os.path.isfile(reference_file): - print("{0} - no reference file".format(MSG_UNKNOWN)) + print(MSG_UNKNOWN, end='') + print_test_time(total_time) + print(" - no reference file") else: result_is_different = subprocess.call(['diff', '-q', reference_file, stdout_file], stdout = PIPE) if result_is_different: diff = Popen(['diff', '-U', str(args.unified), reference_file, stdout_file], stdout = PIPE).communicate()[0] failures += 1 - print("{0} - result differs with reference:\n{1}".format(MSG_FAIL, diff)) + print(MSG_FAIL, end='') + print_test_time(total_time) + print(" - result differs with reference:\n{}".format(diff)) else: passed_total += 1 failures_chain = 0 - print(MSG_OK) + print(MSG_OK, end='') + print_test_time(total_time) + print() if os.path.exists(stdout_file): os.remove(stdout_file) if os.path.exists(stderr_file): @@ -503,6 +524,7 @@ if __name__ == '__main__': parser.add_argument('--skip', nargs='+', help="Skip these tests") parser.add_argument('--no-long', action='store_false', dest='no_long', help='Do not run long tests') parser.add_argument('--client-option', nargs='+', help='Specify additional client argument') + parser.add_argument('--print-time', action='store_true', dest='print_time', help='Print test time') group=parser.add_mutually_exclusive_group(required=False) group.add_argument('--zookeeper', action='store_true', default=None, dest='zookeeper', help='Run zookeeper related tests') group.add_argument('--no-zookeeper', action='store_false', default=None, dest='zookeeper', help='Do not run zookeeper related tests') From 448d755a3b8ef2ddf616e02d9cece9fe0cd09d2b Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 3 Dec 2019 14:56:51 +0300 Subject: [PATCH 14/19] Add forgotten rows number check to chunk constructor. --- dbms/src/Processors/Chunk.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dbms/src/Processors/Chunk.cpp b/dbms/src/Processors/Chunk.cpp index d9d0574d3b8..554ccab2af3 100644 --- a/dbms/src/Processors/Chunk.cpp +++ b/dbms/src/Processors/Chunk.cpp @@ -35,11 +35,13 @@ static Columns unmuteColumns(MutableColumns && mut_columns) Chunk::Chunk(MutableColumns columns_, UInt64 num_rows_) : columns(unmuteColumns(std::move(columns_))), num_rows(num_rows_) { + checkNumRowsIsConsistent(); } Chunk::Chunk(MutableColumns columns_, UInt64 num_rows_, ChunkInfoPtr chunk_info_) : columns(unmuteColumns(std::move(columns_))), num_rows(num_rows_), chunk_info(std::move(chunk_info_)) { + checkNumRowsIsConsistent(); } Chunk Chunk::clone() const From 828f3ac3b2574baafcd251c628cb46f377f8ff21 Mon Sep 17 00:00:00 2001 From: CurtizJ Date: Tue, 3 Dec 2019 16:28:07 +0300 Subject: [PATCH 15/19] add perf test for function and --- dbms/tests/performance/and_function.xml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) create mode 100644 dbms/tests/performance/and_function.xml diff --git a/dbms/tests/performance/and_function.xml b/dbms/tests/performance/and_function.xml new file mode 100644 index 00000000000..08fd07ea7e5 --- /dev/null +++ b/dbms/tests/performance/and_function.xml @@ -0,0 +1,22 @@ + + loop + + + + 3 + 10000 + + + 5 + 60000 + + + + + + + + select count() from numbers(10000000) where number != 96594 AND number != 18511 AND number != 98085 AND number != 84177 AND number != 70314 AND number != 28083 AND number != 54202 AND number != 66522 AND number != 66939 AND number != 99469 AND number != 65776 AND number != 22876 AND number != 42151 AND number != 19924 AND number != 66681 AND number != 63022 AND number != 17487 AND number != 83914 AND number != 59754 AND number != 968 AND number != 73334 AND number != 68569 AND number != 49853 AND number != 33155 AND number != 31777 AND number != 99698 AND number != 26708 AND number != 76409 AND number != 42191 AND number != 55397 AND number != 25724 AND number != 39170 AND number != 22728 AND number != 98238 AND number != 86052 AND number != 12756 AND number != 13948 AND number != 57774 AND number != 82511 AND number != 11337 AND number != 23506 AND number != 11875 AND number != 58536 AND number != 56919 AND number != 25986 AND number != 80710 AND number != 61797 AND number != 99244 AND number != 11665 AND number != 15758 AND number != 82899 AND number != 63150 AND number != 7198 AND number != 40071 AND number != 46310 AND number != 78488 AND number != 9273 AND number != 91878 AND number != 57904 AND number != 53941 AND number != 75675 AND number != 12093 AND number != 50090 AND number != 59675 AND number != 41632 AND number != 81448 AND number != 46821 AND number != 51919 AND number != 49028 AND number != 71059 AND number != 15673 AND number != 6132 AND number != 15473 AND number != 32527 AND number != 63842 AND number != 33121 AND number != 53271 AND number != 86033 AND number != 96807 AND number != 4791 AND number != 80089 AND number != 51616 AND number != 46311 AND number != 82844 AND number != 59353 AND number != 63538 AND number != 64857 AND number != 58471 AND number != 29870 AND number != 80209 AND number != 61000 AND number != 75991 AND number != 44506 AND number != 11283 AND number != 6335 AND number != 73502 AND number != 22354 AND number != 72816 AND number != 66399 AND number != 61703 + + select count() from numbers(10000000) where number != 96594 AND number != 18511 AND number != 98085 AND number != 84177 AND number != 70314 AND number != 28083 AND number != 54202 AND number != 66522 AND number != 66939 AND number != 99469 + From fa4894bd47f2348da553584bf5262a6aa629f9e4 Mon Sep 17 00:00:00 2001 From: Ivan Blinkov Date: Tue, 3 Dec 2019 16:34:19 +0300 Subject: [PATCH 16/19] Fix toc_en.yml error --- docs/toc_en.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/toc_en.yml b/docs/toc_en.yml index d2b50c7f421..c598dacf5d8 100644 --- a/docs/toc_en.yml +++ b/docs/toc_en.yml @@ -215,7 +215,8 @@ nav: - 'Overview of ClickHouse Architecture': 'development/architecture.md' - 'How to Build ClickHouse on Linux': 'development/build.md' - 'How to Build ClickHouse on Mac OS X': 'development/build_osx.md' - - 'How to Build ClickHouse on Linux for Mac OS X': 'development/build_cross.md' + - 'How to Build ClickHouse on Linux for Mac OS X': 'development/build_cross_osx.md' + - 'How to Build ClickHouse on Linux for AARCH64 (ARM64)': 'development/build_cross_arm.md' - 'How to Write C++ Code': 'development/style.md' - 'How to Run ClickHouse Tests': 'development/tests.md' - 'The Beginner ClickHouse Developer Instruction': 'development/developer_instruction.md' From 108c792fd9ff8cdc50fa01b00f3da15f037ce451 Mon Sep 17 00:00:00 2001 From: Alexander Burmak Date: Thu, 7 Nov 2019 14:44:02 +0300 Subject: [PATCH 17/19] Fixed typos in ExternalLoader.h and CrossToInnerJoinVisitor.cpp --- dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp | 2 +- dbms/src/Interpreters/ExternalLoader.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp b/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp index 4fd67c2031b..61e57c4d490 100644 --- a/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/dbms/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -167,7 +167,7 @@ private: size_t canMoveEqualsToJoinOn(const ASTFunction & node) { if (!node.arguments) - throw Exception("Logical error: function requires argiment", ErrorCodes::LOGICAL_ERROR); + throw Exception("Logical error: function requires arguments", ErrorCodes::LOGICAL_ERROR); if (node.arguments->children.size() != 2) return false; diff --git a/dbms/src/Interpreters/ExternalLoader.h b/dbms/src/Interpreters/ExternalLoader.h index 67be8fc5076..19570d897a5 100644 --- a/dbms/src/Interpreters/ExternalLoader.h +++ b/dbms/src/Interpreters/ExternalLoader.h @@ -27,7 +27,7 @@ struct ExternalLoaderConfigSettings }; -/** Iterface for manage user-defined objects. +/** Interface for manage user-defined objects. * Monitors configuration file and automatically reloads objects in separate threads. * The monitoring thread wakes up every 'check_period_sec' seconds and checks * modification time of objects' configuration file. If said time is greater than From 31d5c66867f4a6e6027797fc099cc51ff08b790d Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 3 Dec 2019 16:43:40 +0300 Subject: [PATCH 18/19] Fix num rows in IRowInputFormat --- dbms/src/Processors/Formats/IRowInputFormat.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Processors/Formats/IRowInputFormat.cpp b/dbms/src/Processors/Formats/IRowInputFormat.cpp index fc9bbe146d3..66e53378071 100644 --- a/dbms/src/Processors/Formats/IRowInputFormat.cpp +++ b/dbms/src/Processors/Formats/IRowInputFormat.cpp @@ -43,7 +43,6 @@ Chunk IRowInputFormat::generate() size_t num_columns = header.columns(); MutableColumns columns = header.cloneEmptyColumns(); - size_t prev_rows = total_rows; ///auto chunk_missing_values = std::make_unique(); block_missing_values.clear(); @@ -149,7 +148,8 @@ Chunk IRowInputFormat::generate() return {}; } - Chunk chunk(std::move(columns), total_rows - prev_rows); + auto num_rows = columns.front()->size(); + Chunk chunk(std::move(columns), num_rows); //chunk.setChunkInfo(std::move(chunk_missing_values)); return chunk; } From 52d684614c5cd10157c3c19ba6c9b0128034864d Mon Sep 17 00:00:00 2001 From: BayoNet Date: Wed, 4 Dec 2019 09:46:31 +0300 Subject: [PATCH 19/19] DOCS-265: Docs for the max_http_get_redirects setting (#7684) * CLICKHOUSEDOCS-265: The max_http_get_redirects setting description. * CLICKHOUSEDOCS-265: Clarification. * CLICKHOUSEDOCS-265: Clarification. --- docs/en/operations/settings/settings.md | 11 +++++++++++ docs/en/operations/table_engines/url.md | 2 ++ 2 files changed, 13 insertions(+) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 7928f2bfe5b..1886cf8f0a0 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -130,6 +130,17 @@ Possible values: Default value: 0. +## max_http_get_redirects {#setting-max_http_get_redirects} + +Limits the maximum number of HTTP GET redirect hops for [URL](../table_engines/url.md)-engine tables. The setting applies to the both types of tables: created by [CREATE TABLE](../../query_language/create/#create-table-query) query and by [url](../../query_language/table_functions/url.md) table function. + +Possible values: + +- Positive integer number of hops. +- 0 — Unlimited number of hops. + +Default value: 0. + ## input_format_allow_errors_num {#settings-input_format_allow_errors_num} Sets the maximum number of acceptable errors when reading from text formats (CSV, TSV, etc.). diff --git a/docs/en/operations/table_engines/url.md b/docs/en/operations/table_engines/url.md index 6521604171c..cb7b57b35c3 100644 --- a/docs/en/operations/table_engines/url.md +++ b/docs/en/operations/table_engines/url.md @@ -17,6 +17,8 @@ additional headers for getting a response from the server. respectively. For processing `POST` requests, the remote server must support [Chunked transfer encoding](https://en.wikipedia.org/wiki/Chunked_transfer_encoding). +You can limit the maximum number of HTTP GET redirect hops by the [max_http_get_redirects](../settings/settings.md#setting-max_http_get_redirects) setting. + **Example:** **1.** Create a `url_engine_table` table on the server :