From d7b34a80bbbf98ea11e0d679eeede076421748f1 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 22 Mar 2024 16:09:14 +0000 Subject: [PATCH 001/142] stash --- .../Net/include/Poco/Net/HTTPServerSession.h | 5 +++++ base/poco/Net/src/HTTPServerSession.cpp | 22 ++++++++++--------- .../library-bridge/LibraryBridgeHandlers.cpp | 16 +++++--------- .../library-bridge/LibraryBridgeHandlers.h | 6 ++--- programs/odbc-bridge/PingHandler.cpp | 2 +- programs/server/Server.cpp | 1 + src/Core/ServerSettings.h | 1 + src/IO/HTTPCommon.cpp | 15 +++++++++---- src/IO/HTTPCommon.h | 2 +- src/Server/HTTP/HTTPServer.cpp | 3 ++- src/Server/HTTP/HTTPServerResponse.h | 2 ++ .../WriteBufferFromHTTPServerResponse.cpp | 4 +--- .../HTTP/WriteBufferFromHTTPServerResponse.h | 2 -- src/Server/HTTPHandler.cpp | 3 +-- src/Server/InterserverIOHTTPHandler.cpp | 3 +-- src/Server/PrometheusRequestHandler.cpp | 10 ++------- src/Server/PrometheusRequestHandler.h | 7 +----- src/Server/ReplicasStatusHandler.cpp | 3 +-- src/Server/StaticRequestHandler.cpp | 10 ++++----- src/Server/WebUIRequestHandler.cpp | 9 +++----- .../00408_http_keep_alive.reference | 6 ++--- .../0_stateless/00501_http_head.reference | 4 ++-- 22 files changed, 63 insertions(+), 73 deletions(-) diff --git a/base/poco/Net/include/Poco/Net/HTTPServerSession.h b/base/poco/Net/include/Poco/Net/HTTPServerSession.h index ec928af304f..192d71962bc 100644 --- a/base/poco/Net/include/Poco/Net/HTTPServerSession.h +++ b/base/poco/Net/include/Poco/Net/HTTPServerSession.h @@ -56,10 +56,15 @@ namespace Net SocketAddress serverAddress(); /// Returns the server's address. + size_t getKeepAliveTimeout() const { return _params->getKeepAliveTimeout().totalSeconds(); } + + size_t getMaxKeepAliveRequests() const { return _params->getMaxKeepAliveRequests(); } + private: bool _firstRequest; Poco::Timespan _keepAliveTimeout; int _maxKeepAliveRequests; + HTTPServerParams::Ptr _params; }; diff --git a/base/poco/Net/src/HTTPServerSession.cpp b/base/poco/Net/src/HTTPServerSession.cpp index d4f2b24879e..3ea689cb0cf 100644 --- a/base/poco/Net/src/HTTPServerSession.cpp +++ b/base/poco/Net/src/HTTPServerSession.cpp @@ -19,11 +19,12 @@ namespace Poco { namespace Net { -HTTPServerSession::HTTPServerSession(const StreamSocket& socket, HTTPServerParams::Ptr pParams): - HTTPSession(socket, pParams->getKeepAlive()), - _firstRequest(true), - _keepAliveTimeout(pParams->getKeepAliveTimeout()), - _maxKeepAliveRequests(pParams->getMaxKeepAliveRequests()) +HTTPServerSession::HTTPServerSession(const StreamSocket & socket, HTTPServerParams::Ptr pParams) + : HTTPSession(socket, pParams->getKeepAlive()) + , _firstRequest(true) + , _keepAliveTimeout(pParams->getKeepAliveTimeout()) + , _maxKeepAliveRequests(pParams->getMaxKeepAliveRequests()) + , _params(pParams) { setTimeout(pParams->getTimeout()); } @@ -46,11 +47,12 @@ bool HTTPServerSession::hasMoreRequests() } else if (_maxKeepAliveRequests != 0 && getKeepAlive()) { - if (_maxKeepAliveRequests > 0) - --_maxKeepAliveRequests; - return buffered() > 0 || socket().poll(_keepAliveTimeout, Socket::SELECT_READ); - } - else return false; + if (_maxKeepAliveRequests > 0) + --_maxKeepAliveRequests; + return buffered() > 0 || socket().poll(_keepAliveTimeout, Socket::SELECT_READ); + } + else + return false; } diff --git a/programs/library-bridge/LibraryBridgeHandlers.cpp b/programs/library-bridge/LibraryBridgeHandlers.cpp index 26d887cfc98..094cef6716d 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.cpp +++ 
b/programs/library-bridge/LibraryBridgeHandlers.cpp @@ -374,10 +374,8 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ } -ExternalDictionaryLibraryBridgeExistsHandler::ExternalDictionaryLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , keep_alive_timeout(keep_alive_timeout_) - , log(getLogger("ExternalDictionaryLibraryBridgeExistsHandler")) +ExternalDictionaryLibraryBridgeExistsHandler::ExternalDictionaryLibraryBridgeExistsHandler(ContextPtr context_) + : WithContext(context_), log(getLogger("ExternalDictionaryLibraryBridgeExistsHandler")) { } @@ -401,7 +399,7 @@ void ExternalDictionaryLibraryBridgeExistsHandler::handleRequest(HTTPServerReque String res = library_handler ? "1" : "0"; - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); LOG_TRACE(log, "Sending ping response: {} (dictionary id: {})", res, dictionary_id); response.sendBuffer(res.data(), res.size()); } @@ -617,10 +615,8 @@ void CatBoostLibraryBridgeRequestHandler::handleRequest(HTTPServerRequest & requ } -CatBoostLibraryBridgeExistsHandler::CatBoostLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , keep_alive_timeout(keep_alive_timeout_) - , log(getLogger("CatBoostLibraryBridgeExistsHandler")) +CatBoostLibraryBridgeExistsHandler::CatBoostLibraryBridgeExistsHandler(ContextPtr context_) + : WithContext(context_), log(getLogger("CatBoostLibraryBridgeExistsHandler")) { } @@ -634,7 +630,7 @@ void CatBoostLibraryBridgeExistsHandler::handleRequest(HTTPServerRequest & reque String res = "1"; - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); LOG_TRACE(log, "Sending ping response: {}", res); response.sendBuffer(res.data(), res.size()); } diff --git a/programs/library-bridge/LibraryBridgeHandlers.h b/programs/library-bridge/LibraryBridgeHandlers.h index 1db71eb24cb..83bca24ce1f 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.h +++ b/programs/library-bridge/LibraryBridgeHandlers.h @@ -34,12 +34,11 @@ private: class ExternalDictionaryLibraryBridgeExistsHandler : public HTTPRequestHandler, WithContext { public: - ExternalDictionaryLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_); + ExternalDictionaryLibraryBridgeExistsHandler(ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: - const size_t keep_alive_timeout; LoggerPtr log; }; @@ -77,12 +76,11 @@ private: class CatBoostLibraryBridgeExistsHandler : public HTTPRequestHandler, WithContext { public: - CatBoostLibraryBridgeExistsHandler(size_t keep_alive_timeout_, ContextPtr context_); + CatBoostLibraryBridgeExistsHandler(ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: - const size_t keep_alive_timeout; LoggerPtr log; }; diff --git a/programs/odbc-bridge/PingHandler.cpp b/programs/odbc-bridge/PingHandler.cpp index 80d0e2bf4a9..e5d094fb7eb 100644 --- a/programs/odbc-bridge/PingHandler.cpp +++ b/programs/odbc-bridge/PingHandler.cpp @@ -10,7 +10,7 @@ void PingHandler::handleRequest(HTTPServerRequest & /* request */, HTTPServerRes { try { - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); const char * data = "Ok.\n"; response.sendBuffer(data, strlen(data)); } 
diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index b67a4eccd15..b741cd7f644 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -2251,6 +2251,7 @@ void Server::createServers( Poco::Net::HTTPServerParams::Ptr http_params = new Poco::Net::HTTPServerParams; http_params->setTimeout(settings.http_receive_timeout); http_params->setKeepAliveTimeout(global_context->getServerSettings().keep_alive_timeout); + http_params->setMaxKeepAliveRequests(static_cast(global_context->getServerSettings().max_keep_alive_requests)); Poco::Util::AbstractConfiguration::Keys protocols; config.keys("protocols", protocols); diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index da82cdea5a4..7480d94e81d 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -113,6 +113,7 @@ namespace DB M(Bool, async_load_databases, false, "Enable asynchronous loading of databases and tables to speedup server startup. Queries to not yet loaded entity will be blocked until load is finished.", 0) \ M(Bool, display_secrets_in_show_and_select, false, "Allow showing secrets in SHOW and SELECT queries via a format setting and a grant", 0) \ M(Seconds, keep_alive_timeout, DEFAULT_HTTP_KEEP_ALIVE_TIMEOUT, "The number of seconds that ClickHouse waits for incoming requests before closing the connection.", 0) \ + M(UInt64, max_keep_alive_requests, 10000, "The number of seconds that ClickHouse waits for incoming requests before closing the connection.", 0) \ M(Seconds, replicated_fetches_http_connection_timeout, 0, "HTTP connection timeout for part fetch requests. Inherited from default profile `http_connection_timeout` if not set explicitly.", 0) \ M(Seconds, replicated_fetches_http_send_timeout, 0, "HTTP send timeout for part fetch requests. Inherited from default profile `http_send_timeout` if not set explicitly.", 0) \ M(Seconds, replicated_fetches_http_receive_timeout, 0, "HTTP receive timeout for fetch part requests. 
Inherited from default profile `http_receive_timeout` if not set explicitly.", 0) \ diff --git a/src/IO/HTTPCommon.cpp b/src/IO/HTTPCommon.cpp index 09f7724d613..2b3f7f062bc 100644 --- a/src/IO/HTTPCommon.cpp +++ b/src/IO/HTTPCommon.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -33,14 +34,20 @@ namespace ErrorCodes extern const int RECEIVED_ERROR_TOO_MANY_REQUESTS; } -void setResponseDefaultHeaders(HTTPServerResponse & response, size_t keep_alive_timeout) +void setResponseDefaultHeaders(HTTPServerResponse & response) { if (!response.getKeepAlive()) return; - Poco::Timespan timeout(keep_alive_timeout, 0); - if (timeout.totalSeconds()) - response.set("Keep-Alive", "timeout=" + std::to_string(timeout.totalSeconds())); + const size_t keep_alive_timeout = response.getSession().getKeepAliveTimeout(); + const size_t keep_alive_max_requests = response.getSession().getMaxKeepAliveRequests(); + if (keep_alive_timeout) + { + if (keep_alive_max_requests) + response.set("Keep-Alive", fmt::format("timeout={}, max={}", keep_alive_timeout, keep_alive_max_requests)); + else + response.set("Keep-Alive", fmt::format("timeout={}", keep_alive_timeout)); + } } HTTPSessionPtr makeHTTPSession( diff --git a/src/IO/HTTPCommon.h b/src/IO/HTTPCommon.h index 63dffcf6878..fa6086224f5 100644 --- a/src/IO/HTTPCommon.h +++ b/src/IO/HTTPCommon.h @@ -54,7 +54,7 @@ private: using HTTPSessionPtr = std::shared_ptr; -void setResponseDefaultHeaders(HTTPServerResponse & response, size_t keep_alive_timeout); +void setResponseDefaultHeaders(HTTPServerResponse & response); /// Create session object to perform requests and set required parameters. HTTPSessionPtr makeHTTPSession( diff --git a/src/Server/HTTP/HTTPServer.cpp b/src/Server/HTTP/HTTPServer.cpp index 90bdebf6451..9b8feae3e26 100644 --- a/src/Server/HTTP/HTTPServer.cpp +++ b/src/Server/HTTP/HTTPServer.cpp @@ -13,7 +13,8 @@ HTTPServer::HTTPServer( Poco::Net::HTTPServerParams::Ptr params, const ProfileEvents::Event & read_event, const ProfileEvents::Event & write_event) - : TCPServer(new HTTPServerConnectionFactory(context, params, factory_, read_event, write_event), thread_pool, socket_, params), factory(factory_) + : TCPServer(new HTTPServerConnectionFactory(context, params, factory_, read_event, write_event), thread_pool, socket_, params) + , factory(factory_) { } diff --git a/src/Server/HTTP/HTTPServerResponse.h b/src/Server/HTTP/HTTPServerResponse.h index 8edb785e7c5..9793fc8b24b 100644 --- a/src/Server/HTTP/HTTPServerResponse.h +++ b/src/Server/HTTP/HTTPServerResponse.h @@ -245,6 +245,8 @@ public: void attachRequest(HTTPServerRequest * request_) { request = request_; } + const Poco::Net::HTTPServerSession & getSession() const { return session; } + private: Poco::Net::HTTPServerSession & session; HTTPServerRequest * request = nullptr; diff --git a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp index 8098671a903..a39f6de51d0 100644 --- a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp +++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp @@ -30,7 +30,7 @@ void WriteBufferFromHTTPServerResponse::startSendHeaders() if (add_cors_header) response.set("Access-Control-Allow-Origin", "*"); - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); std::stringstream header; //STYLE_CHECK_ALLOW_STD_STRING_STREAM response.beginWrite(header); @@ -119,12 +119,10 @@ void WriteBufferFromHTTPServerResponse::nextImpl() 
WriteBufferFromHTTPServerResponse::WriteBufferFromHTTPServerResponse( HTTPServerResponse & response_, bool is_http_method_head_, - UInt64 keep_alive_timeout_, const ProfileEvents::Event & write_event_) : HTTPWriteBuffer(response_.getSocket(), write_event_) , response(response_) , is_http_method_head(is_http_method_head_) - , keep_alive_timeout(keep_alive_timeout_) { } diff --git a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h index a3952b7c553..f0c80f24582 100644 --- a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h +++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h @@ -29,7 +29,6 @@ public: WriteBufferFromHTTPServerResponse( HTTPServerResponse & response_, bool is_http_method_head_, - UInt64 keep_alive_timeout_, const ProfileEvents::Event & write_event_ = ProfileEvents::end()); ~WriteBufferFromHTTPServerResponse() override; @@ -91,7 +90,6 @@ private: bool is_http_method_head; bool add_cors_header = false; - size_t keep_alive_timeout = 0; bool initialized = false; diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index c112eefec6c..ac6c9d6a0a5 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -621,7 +621,6 @@ void HTTPHandler::processQuery( std::make_shared( response, request.getMethod() == HTTPRequest::HTTP_HEAD, - context->getServerSettings().keep_alive_timeout.totalSeconds(), write_event); used_output.out = used_output.out_holder; used_output.out_maybe_compressed = used_output.out_holder; @@ -926,7 +925,7 @@ try if (!used_output.out_holder && !used_output.exception_is_written) { /// If nothing was sent yet and we don't even know if we must compress the response. - WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD, DEFAULT_HTTP_KEEP_ALIVE_TIMEOUT).writeln(s); + WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD).writeln(s); } else if (used_output.out_maybe_compressed) { diff --git a/src/Server/InterserverIOHTTPHandler.cpp b/src/Server/InterserverIOHTTPHandler.cpp index 28045380cd7..9a87992731c 100644 --- a/src/Server/InterserverIOHTTPHandler.cpp +++ b/src/Server/InterserverIOHTTPHandler.cpp @@ -87,9 +87,8 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe response.setChunkedTransferEncoding(true); Output used_output; - const auto keep_alive_timeout = server.context()->getServerSettings().keep_alive_timeout.totalSeconds(); used_output.out = std::make_shared( - response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout, write_event); + response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, write_event); auto finalize_output = [&] { diff --git a/src/Server/PrometheusRequestHandler.cpp b/src/Server/PrometheusRequestHandler.cpp index dff960f7031..0ad5f907467 100644 --- a/src/Server/PrometheusRequestHandler.cpp +++ b/src/Server/PrometheusRequestHandler.cpp @@ -18,21 +18,15 @@ void PrometheusRequestHandler::handleRequest(HTTPServerRequest & request, HTTPSe { try { - /// Raw config reference is used here to avoid dependency on Context and ServerSettings. - /// This is painful, because this class is also used in a build with CLICKHOUSE_KEEPER_STANDALONE_BUILD=1 - /// And there ordinary Context is replaced with a tiny clone. - const auto & config = server.config(); - unsigned keep_alive_timeout = config.getUInt("keep_alive_timeout", DEFAULT_HTTP_KEEP_ALIVE_TIMEOUT); - /// In order to make keep-alive works. 
if (request.getVersion() == HTTPServerRequest::HTTP_1_1) response.setChunkedTransferEncoding(true); - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); response.setContentType("text/plain; version=0.0.4; charset=UTF-8"); - WriteBufferFromHTTPServerResponse wb(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout, write_event); + WriteBufferFromHTTPServerResponse wb(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, write_event); try { metrics_writer->write(wb); diff --git a/src/Server/PrometheusRequestHandler.h b/src/Server/PrometheusRequestHandler.h index d120752c8c5..cc7848d1dd0 100644 --- a/src/Server/PrometheusRequestHandler.h +++ b/src/Server/PrometheusRequestHandler.h @@ -12,15 +12,10 @@ class IServer; class PrometheusRequestHandler : public HTTPRequestHandler { private: - IServer & server; PrometheusMetricsWriterPtr metrics_writer; public: - PrometheusRequestHandler(IServer & server_, PrometheusMetricsWriterPtr metrics_writer_) - : server(server_) - , metrics_writer(std::move(metrics_writer_)) - { - } + PrometheusRequestHandler(IServer &, PrometheusMetricsWriterPtr metrics_writer_) : metrics_writer(std::move(metrics_writer_)) { } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; }; diff --git a/src/Server/ReplicasStatusHandler.cpp b/src/Server/ReplicasStatusHandler.cpp index 91c6bd722d3..964e3834037 100644 --- a/src/Server/ReplicasStatusHandler.cpp +++ b/src/Server/ReplicasStatusHandler.cpp @@ -84,8 +84,7 @@ void ReplicasStatusHandler::handleRequest(HTTPServerRequest & request, HTTPServe } } - const auto & server_settings = getContext()->getServerSettings(); - setResponseDefaultHeaders(response, server_settings.keep_alive_timeout.totalSeconds()); + setResponseDefaultHeaders(response); if (!ok) { diff --git a/src/Server/StaticRequestHandler.cpp b/src/Server/StaticRequestHandler.cpp index 67bf3875de4..3d618031875 100644 --- a/src/Server/StaticRequestHandler.cpp +++ b/src/Server/StaticRequestHandler.cpp @@ -33,10 +33,9 @@ namespace ErrorCodes extern const int INVALID_CONFIG_PARAMETER; } -static inline std::unique_ptr -responseWriteBuffer(HTTPServerRequest & request, HTTPServerResponse & response, UInt64 keep_alive_timeout) +static inline std::unique_ptr responseWriteBuffer(HTTPServerRequest & request, HTTPServerResponse & response) { - auto buf = std::unique_ptr(new WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD, keep_alive_timeout)); + auto buf = std::unique_ptr(new WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD)); /// The client can pass a HTTP header indicating supported compression method (gzip or deflate). 
String http_response_compression_methods = request.get("Accept-Encoding", ""); @@ -89,8 +88,7 @@ static inline void trySendExceptionToClient( void StaticRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & /*write_event*/) { - auto keep_alive_timeout = server.context()->getServerSettings().keep_alive_timeout.totalSeconds(); - auto out = responseWriteBuffer(request, response, keep_alive_timeout); + auto out = responseWriteBuffer(request, response); try { @@ -105,7 +103,7 @@ void StaticRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServer "The Transfer-Encoding is not chunked and there " "is no Content-Length header for POST request"); - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); response.setStatusAndReason(Poco::Net::HTTPResponse::HTTPStatus(status)); writeResponse(*out); } diff --git a/src/Server/WebUIRequestHandler.cpp b/src/Server/WebUIRequestHandler.cpp index 68d3ff0b325..faad9d57519 100644 --- a/src/Server/WebUIRequestHandler.cpp +++ b/src/Server/WebUIRequestHandler.cpp @@ -29,18 +29,15 @@ DashboardWebUIRequestHandler::DashboardWebUIRequestHandler(IServer & server_) : BinaryWebUIRequestHandler::BinaryWebUIRequestHandler(IServer & server_) : server(server_) {} JavaScriptWebUIRequestHandler::JavaScriptWebUIRequestHandler(IServer & server_) : server(server_) {} -static void handle(const IServer & server, HTTPServerRequest & request, HTTPServerResponse & response, std::string_view html) +static void handle(const IServer &, HTTPServerRequest & request, HTTPServerResponse & response, std::string_view html) { - auto keep_alive_timeout = server.context()->getServerSettings().keep_alive_timeout.totalSeconds(); - response.setContentType("text/html; charset=UTF-8"); if (request.getVersion() == HTTPServerRequest::HTTP_1_1) response.setChunkedTransferEncoding(true); - setResponseDefaultHeaders(response, keep_alive_timeout); + setResponseDefaultHeaders(response); response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_OK); - WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD, keep_alive_timeout).write(html.data(), html.size()); - + WriteBufferFromHTTPServerResponse(response, request.getMethod() == HTTPRequest::HTTP_HEAD).write(html.data(), html.size()); } void PlayWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) diff --git a/tests/queries/0_stateless/00408_http_keep_alive.reference b/tests/queries/0_stateless/00408_http_keep_alive.reference index 17a7fd690a8..d5d7dacce9e 100644 --- a/tests/queries/0_stateless/00408_http_keep_alive.reference +++ b/tests/queries/0_stateless/00408_http_keep_alive.reference @@ -1,6 +1,6 @@ < Connection: Keep-Alive -< Keep-Alive: timeout=10 +< Keep-Alive: timeout=10, max=10000 < Connection: Keep-Alive -< Keep-Alive: timeout=10 +< Keep-Alive: timeout=10, max=10000 < Connection: Keep-Alive -< Keep-Alive: timeout=10 +< Keep-Alive: timeout=10, max=10000 diff --git a/tests/queries/0_stateless/00501_http_head.reference b/tests/queries/0_stateless/00501_http_head.reference index 8351327b356..db82132b145 100644 --- a/tests/queries/0_stateless/00501_http_head.reference +++ b/tests/queries/0_stateless/00501_http_head.reference @@ -2,11 +2,11 @@ HTTP/1.1 200 OK Connection: Keep-Alive Content-Type: text/tab-separated-values; charset=UTF-8 Transfer-Encoding: chunked -Keep-Alive: timeout=10 +Keep-Alive: timeout=10, max=10000 HTTP/1.1 200 
OK Connection: Keep-Alive Content-Type: text/tab-separated-values; charset=UTF-8 Transfer-Encoding: chunked -Keep-Alive: timeout=10 +Keep-Alive: timeout=10, max=10000 From 7aace4d876173ce18ade57ec1bdc332efff7ce80 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 22 Mar 2024 17:24:25 +0000 Subject: [PATCH 002/142] add test --- .../test_server_keep_alive/__init__.py | 0 .../configs/keep_alive_settings.xml | 4 ++ .../test_server_keep_alive/test.py | 46 +++++++++++++++++++ 3 files changed, 50 insertions(+) create mode 100644 tests/integration/test_server_keep_alive/__init__.py create mode 100644 tests/integration/test_server_keep_alive/configs/keep_alive_settings.xml create mode 100644 tests/integration/test_server_keep_alive/test.py diff --git a/tests/integration/test_server_keep_alive/__init__.py b/tests/integration/test_server_keep_alive/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_server_keep_alive/configs/keep_alive_settings.xml b/tests/integration/test_server_keep_alive/configs/keep_alive_settings.xml new file mode 100644 index 00000000000..06e68044817 --- /dev/null +++ b/tests/integration/test_server_keep_alive/configs/keep_alive_settings.xml @@ -0,0 +1,4 @@ +<clickhouse> + <keep_alive_timeout>3600</keep_alive_timeout> + <max_keep_alive_requests>5</max_keep_alive_requests> +</clickhouse> diff --git a/tests/integration/test_server_keep_alive/test.py b/tests/integration/test_server_keep_alive/test.py new file mode 100644 index 00000000000..0f88fe47673 --- /dev/null +++ b/tests/integration/test_server_keep_alive/test.py @@ -0,0 +1,46 @@ +import logging +import pytest +import requests + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance("node", main_configs=["configs/keep_alive_settings.xml"]) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def test_requests_with_keep_alive(start_cluster): + # In this test we have `keep_alive_timeout` set to one hour to never trigger connection reset by timeout, `max_keep_alive_requests` is set to 5. + # We expect server to close connection after each 5 requests. We detect connection reset by change in src port. + # So the first 5 requests should come from the same port, the following 5 requests should come from another port.
+ session = requests.Session() + for i in range(10): + session.get( + f"http://{node.ip_address}:8123/?query=select%201&log_comment=test_requests_with_keep_alive_{i}" + ) + + ports = node.query( + """ + SYSTEM FLUSH LOGS; + + SELECT port + FROM system.query_log + WHERE log_comment like 'test_requests_with_keep_alive_%' AND type = 'QueryFinish' + ORDER BY log_comment + """ + ).split("\n")[:-1] + + expected = 5 * [ports[0]] + [ports[5]] * 5 + + assert ports == expected From 146d7603388ca161ee3340ab1f582971a4e45a03 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 22 Mar 2024 17:38:06 +0000 Subject: [PATCH 003/142] rm more --- programs/keeper/Keeper.cpp | 2 +- src/IO/HTTPCommon.cpp | 1 - src/Server/HTTP/HTTPServer.cpp | 3 +-- src/Server/HTTPHandlerFactory.cpp | 10 ++++------ src/Server/HTTPHandlerFactory.h | 8 ++------ src/Server/PrometheusRequestHandler.cpp | 13 +++---------- src/Server/PrometheusRequestHandler.h | 2 +- src/Server/WebUIRequestHandler.cpp | 14 +++++++------- 8 files changed, 19 insertions(+), 34 deletions(-) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index a558ed64bf9..238964fb25e 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -507,7 +507,7 @@ try "Prometheus: http://" + address.toString(), std::make_unique( std::move(my_http_context), - createPrometheusMainHandlerFactory(*this, config_getter(), metrics_writer, "PrometheusHandler-factory"), + createPrometheusMainHandlerFactory(config_getter(), metrics_writer, "PrometheusHandler-factory"), server_pool, socket, http_params)); diff --git a/src/IO/HTTPCommon.cpp b/src/IO/HTTPCommon.cpp index 2b3f7f062bc..56226941228 100644 --- a/src/IO/HTTPCommon.cpp +++ b/src/IO/HTTPCommon.cpp @@ -1,4 +1,3 @@ -#include #include #include diff --git a/src/Server/HTTP/HTTPServer.cpp b/src/Server/HTTP/HTTPServer.cpp index 9b8feae3e26..90bdebf6451 100644 --- a/src/Server/HTTP/HTTPServer.cpp +++ b/src/Server/HTTP/HTTPServer.cpp @@ -13,8 +13,7 @@ HTTPServer::HTTPServer( Poco::Net::HTTPServerParams::Ptr params, const ProfileEvents::Event & read_event, const ProfileEvents::Event & write_event) - : TCPServer(new HTTPServerConnectionFactory(context, params, factory_, read_event, write_event), thread_pool, socket_, params) - , factory(factory_) + : TCPServer(new HTTPServerConnectionFactory(context, params, factory_, read_event, write_event), thread_pool, socket_, params), factory(factory_) { } diff --git a/src/Server/HTTPHandlerFactory.cpp b/src/Server/HTTPHandlerFactory.cpp index 9a67e576345..23d4c081d2d 100644 --- a/src/Server/HTTPHandlerFactory.cpp +++ b/src/Server/HTTPHandlerFactory.cpp @@ -123,7 +123,7 @@ static inline auto createHandlersFactoryFromConfig( } else if (handler_type == "prometheus") { - main_handler_factory->addHandler(createPrometheusHandlerFactory(server, config, async_metrics, prefix + "." + key)); + main_handler_factory->addHandler(createPrometheusHandlerFactory(config, async_metrics, prefix + "." 
+ key)); } else if (handler_type == "replicas_status") { @@ -202,7 +202,7 @@ HTTPRequestHandlerFactoryPtr createHandlerFactory(IServer & server, const Poco:: else if (name == "PrometheusHandler-factory") { auto metrics_writer = std::make_shared(config, "prometheus", async_metrics); - return createPrometheusMainHandlerFactory(server, config, metrics_writer, name); + return createPrometheusMainHandlerFactory(config, metrics_writer, name); } throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown HTTP handler factory name."); @@ -294,10 +294,8 @@ void addDefaultHandlersFactory( if (config.has("prometheus") && config.getInt("prometheus.port", 0) == 0) { auto writer = std::make_shared(config, "prometheus", async_metrics); - auto creator = [&server, writer] () -> std::unique_ptr - { - return std::make_unique(server, writer); - }; + auto creator + = [writer]() -> std::unique_ptr { return std::make_unique(writer); }; auto prometheus_handler = std::make_shared>(std::move(creator)); prometheus_handler->attachStrictPath(config.getString("prometheus.endpoint", "/metrics")); prometheus_handler->allowGetAndHeadRequest(); diff --git a/src/Server/HTTPHandlerFactory.h b/src/Server/HTTPHandlerFactory.h index ac18c36e6c9..5c1a12d9e06 100644 --- a/src/Server/HTTPHandlerFactory.h +++ b/src/Server/HTTPHandlerFactory.h @@ -126,14 +126,10 @@ HTTPRequestHandlerFactoryPtr createReplicasStatusHandlerFactory(IServer & server const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix); -HTTPRequestHandlerFactoryPtr -createPrometheusHandlerFactory(IServer & server, - const Poco::Util::AbstractConfiguration & config, - AsynchronousMetrics & async_metrics, - const std::string & config_prefix); +HTTPRequestHandlerFactoryPtr createPrometheusHandlerFactory( + const Poco::Util::AbstractConfiguration & config, AsynchronousMetrics & async_metrics, const std::string & config_prefix); HTTPRequestHandlerFactoryPtr createPrometheusMainHandlerFactory( - IServer & server, const Poco::Util::AbstractConfiguration & config, PrometheusMetricsWriterPtr metrics_writer, const std::string & name); diff --git a/src/Server/PrometheusRequestHandler.cpp b/src/Server/PrometheusRequestHandler.cpp index 0ad5f907467..1a04311116f 100644 --- a/src/Server/PrometheusRequestHandler.cpp +++ b/src/Server/PrometheusRequestHandler.cpp @@ -44,16 +44,12 @@ void PrometheusRequestHandler::handleRequest(HTTPServerRequest & request, HTTPSe } HTTPRequestHandlerFactoryPtr createPrometheusHandlerFactory( - IServer & server, const Poco::Util::AbstractConfiguration & config, AsynchronousMetrics & async_metrics, const std::string & config_prefix) { auto writer = std::make_shared(config, config_prefix + ".handler", async_metrics); - auto creator = [&server, writer]() -> std::unique_ptr - { - return std::make_unique(server, writer); - }; + auto creator = [writer]() -> std::unique_ptr { return std::make_unique(writer); }; auto factory = std::make_shared>(std::move(creator)); factory->addFiltersFromConfig(config, config_prefix); @@ -61,13 +57,10 @@ HTTPRequestHandlerFactoryPtr createPrometheusHandlerFactory( } HTTPRequestHandlerFactoryPtr createPrometheusMainHandlerFactory( - IServer & server, const Poco::Util::AbstractConfiguration & config, PrometheusMetricsWriterPtr metrics_writer, const std::string & name) + const Poco::Util::AbstractConfiguration & config, PrometheusMetricsWriterPtr metrics_writer, const std::string & name) { auto factory = std::make_shared(name); - auto creator = [&server, metrics_writer] - { - return std::make_unique(server, 
metrics_writer); - }; + auto creator = [metrics_writer] { return std::make_unique(metrics_writer); }; auto handler = std::make_shared>(std::move(creator)); handler->attachStrictPath(config.getString("prometheus.endpoint", "/metrics")); diff --git a/src/Server/PrometheusRequestHandler.h b/src/Server/PrometheusRequestHandler.h index cc7848d1dd0..7f4d3c14f62 100644 --- a/src/Server/PrometheusRequestHandler.h +++ b/src/Server/PrometheusRequestHandler.h @@ -15,7 +15,7 @@ private: PrometheusMetricsWriterPtr metrics_writer; public: - PrometheusRequestHandler(IServer &, PrometheusMetricsWriterPtr metrics_writer_) : metrics_writer(std::move(metrics_writer_)) { } + PrometheusRequestHandler(PrometheusMetricsWriterPtr metrics_writer_) : metrics_writer(std::move(metrics_writer_)) { } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; }; diff --git a/src/Server/WebUIRequestHandler.cpp b/src/Server/WebUIRequestHandler.cpp index faad9d57519..e43412550f9 100644 --- a/src/Server/WebUIRequestHandler.cpp +++ b/src/Server/WebUIRequestHandler.cpp @@ -29,7 +29,7 @@ DashboardWebUIRequestHandler::DashboardWebUIRequestHandler(IServer & server_) : BinaryWebUIRequestHandler::BinaryWebUIRequestHandler(IServer & server_) : server(server_) {} JavaScriptWebUIRequestHandler::JavaScriptWebUIRequestHandler(IServer & server_) : server(server_) {} -static void handle(const IServer &, HTTPServerRequest & request, HTTPServerResponse & response, std::string_view html) +static void handle(HTTPServerRequest & request, HTTPServerResponse & response, std::string_view html) { response.setContentType("text/html; charset=UTF-8"); if (request.getVersion() == HTTPServerRequest::HTTP_1_1) @@ -42,7 +42,7 @@ static void handle(const IServer &, HTTPServerRequest & request, HTTPServerRespo void PlayWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) { - handle(server, request, response, {reinterpret_cast(gresource_play_htmlData), gresource_play_htmlSize}); + handle(request, response, {reinterpret_cast(gresource_play_htmlData), gresource_play_htmlSize}); } void DashboardWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) @@ -60,23 +60,23 @@ void DashboardWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HT static re2::RE2 lz_string_url = R"(https://[^\s"'`]+lz-string[^\s"'`]*\.js)"; RE2::Replace(&html, lz_string_url, "/js/lz-string.js"); - handle(server, request, response, html); + handle(request, response, html); } void BinaryWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) { - handle(server, request, response, {reinterpret_cast(gresource_binary_htmlData), gresource_binary_htmlSize}); + handle(request, response, {reinterpret_cast(gresource_binary_htmlData), gresource_binary_htmlSize}); } void JavaScriptWebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event &) { if (request.getURI() == "/js/uplot.js") { - handle(server, request, response, {reinterpret_cast(gresource_uplot_jsData), gresource_uplot_jsSize}); + handle(request, response, {reinterpret_cast(gresource_uplot_jsData), gresource_uplot_jsSize}); } else if (request.getURI() == "/js/lz-string.js") { - handle(server, request, response, {reinterpret_cast(gresource_lz_string_jsData), gresource_lz_string_jsSize}); + 
handle(request, response, {reinterpret_cast(gresource_lz_string_jsData), gresource_lz_string_jsSize}); } else { @@ -84,7 +84,7 @@ void JavaScriptWebUIRequestHandler::handleRequest(HTTPServerRequest & request, H *response.send() << "Not found.\n"; } - handle(server, request, response, {reinterpret_cast(gresource_binary_htmlData), gresource_binary_htmlSize}); + handle(request, response, {reinterpret_cast(gresource_binary_htmlData), gresource_binary_htmlSize}); } } From bd04fc5346d83e8450fa98578e325923a609abda Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 22 Mar 2024 17:41:53 +0000 Subject: [PATCH 004/142] rename test --- tests/integration/test_server_keep_alive/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_server_keep_alive/test.py b/tests/integration/test_server_keep_alive/test.py index 0f88fe47673..96f08a37adb 100644 --- a/tests/integration/test_server_keep_alive/test.py +++ b/tests/integration/test_server_keep_alive/test.py @@ -20,7 +20,7 @@ def start_cluster(): cluster.shutdown() -def test_requests_with_keep_alive(start_cluster): +def test_max_keep_alive_requests_on_user_side(start_cluster): # In this test we have `keep_alive_timeout` set to one hour to never trigger connection reset by timeout, `max_keep_alive_requests` is set to 5. # We expect server to close connection after each 5 requests. We detect connection reset by change in src port. # So the first 5 requests should come from the same port, the following 5 requests should come from another port. From c153fae0b8377aeca5e636a7ad0370e6ada42688 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 22 Mar 2024 19:10:06 +0000 Subject: [PATCH 005/142] add docs --- .../server-configuration-parameters/settings.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 07c9a2b88ab..0efb5a9e6e4 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1298,6 +1298,16 @@ The number of seconds that ClickHouse waits for incoming requests before closing <keep_alive_timeout>10</keep_alive_timeout> ``` +## max_keep_alive_requests {#max-keep-alive-requests} + +Maximal number of requests through a single keep-alive connection until it will be closed by ClickHouse server. Default to 10000. + +**Example** + +``` xml +<max_keep_alive_requests>10</max_keep_alive_requests> + ``` + ## listen_host {#listen_host} Restriction on hosts that requests can come from. If you want the server to answer all of them, specify `::`. 
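Illustrative aside to the `max_keep_alive_requests` documentation above, not taken from the patches themselves: a minimal Python sketch of how a client can observe the new behaviour. Assumptions are labelled in the comments: a locally running server with the HTTP interface on port 8123 (the default, as used by the integration test) and the `requests` library installed. After this series the server answers with a header of the form `Keep-Alive: timeout=<keep_alive_timeout>, max=<max_keep_alive_requests>`.

```python
# Minimal sketch, not part of the patch series; assumes ClickHouse's HTTP
# interface is reachable on localhost:8123 and that `requests` is installed.
import requests

# A Session reuses one TCP connection, so the server's per-connection
# keep-alive request counter applies across iterations.
session = requests.Session()
for i in range(3):
    r = session.get("http://localhost:8123/", params={"query": "SELECT 1"})
    # Expected header shape after this series, e.g. "timeout=10, max=10000".
    print(i, r.status_code, r.headers.get("Keep-Alive"))
```

Once the configured maximum is reached the server closes the connection and the session transparently opens a new one, which is what the integration test above detects through a change of client source port.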
From b93f483a0e2f312d50685fd499d2f52717b83925 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 22 Mar 2024 20:07:12 +0000 Subject: [PATCH 006/142] fix build --- programs/library-bridge/LibraryBridge.cpp | 2 +- .../LibraryBridgeHandlerFactory.cpp | 10 ++++------ .../LibraryBridgeHandlerFactory.h | 2 -- .../library-bridge/LibraryBridgeHandlers.cpp | 17 ++++++----------- programs/library-bridge/LibraryBridgeHandlers.h | 6 ++---- programs/odbc-bridge/ColumnInfoHandler.cpp | 5 +---- programs/odbc-bridge/ColumnInfoHandler.h | 8 +------- programs/odbc-bridge/IdentifierQuoteHandler.cpp | 2 +- programs/odbc-bridge/IdentifierQuoteHandler.h | 8 +------- programs/odbc-bridge/MainHandler.cpp | 2 +- programs/odbc-bridge/MainHandler.h | 3 --- programs/odbc-bridge/ODBCHandlerFactory.cpp | 10 +++++----- programs/odbc-bridge/SchemaAllowedHandler.cpp | 2 +- programs/odbc-bridge/SchemaAllowedHandler.h | 8 +------- 14 files changed, 25 insertions(+), 60 deletions(-) diff --git a/programs/library-bridge/LibraryBridge.cpp b/programs/library-bridge/LibraryBridge.cpp index 8a07ca57104..f86e469a307 100644 --- a/programs/library-bridge/LibraryBridge.cpp +++ b/programs/library-bridge/LibraryBridge.cpp @@ -25,7 +25,7 @@ std::string LibraryBridge::bridgeName() const LibraryBridge::HandlerFactoryPtr LibraryBridge::getHandlerFactoryPtr(ContextPtr context) const { - return std::make_shared("LibraryRequestHandlerFactory", keep_alive_timeout, context); + return std::make_shared("LibraryRequestHandlerFactory", context); } } diff --git a/programs/library-bridge/LibraryBridgeHandlerFactory.cpp b/programs/library-bridge/LibraryBridgeHandlerFactory.cpp index e5ab22f2d40..234904c6265 100644 --- a/programs/library-bridge/LibraryBridgeHandlerFactory.cpp +++ b/programs/library-bridge/LibraryBridgeHandlerFactory.cpp @@ -9,12 +9,10 @@ namespace DB { LibraryBridgeHandlerFactory::LibraryBridgeHandlerFactory( const std::string & name_, - size_t keep_alive_timeout_, ContextPtr context_) : WithContext(context_) , log(getLogger(name_)) , name(name_) - , keep_alive_timeout(keep_alive_timeout_) { } @@ -26,17 +24,17 @@ std::unique_ptr LibraryBridgeHandlerFactory::createRequestHa if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) { if (uri.getPath() == "/extdict_ping") - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); else if (uri.getPath() == "/catboost_ping") - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); } if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST) { if (uri.getPath() == "/extdict_request") - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); else if (uri.getPath() == "/catboost_request") - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); } return nullptr; diff --git a/programs/library-bridge/LibraryBridgeHandlerFactory.h b/programs/library-bridge/LibraryBridgeHandlerFactory.h index 5b0f088bc29..c65394efa3b 100644 --- a/programs/library-bridge/LibraryBridgeHandlerFactory.h +++ b/programs/library-bridge/LibraryBridgeHandlerFactory.h @@ -13,7 +13,6 @@ class LibraryBridgeHandlerFactory : public HTTPRequestHandlerFactory, WithContex public: LibraryBridgeHandlerFactory( const std::string & name_, - size_t keep_alive_timeout_, ContextPtr context_); std::unique_ptr createRequestHandler(const HTTPServerRequest & request) override; @@ -21,7 +20,6 @@ public: private: LoggerPtr log; const std::string name; - 
const size_t keep_alive_timeout; }; } diff --git a/programs/library-bridge/LibraryBridgeHandlers.cpp b/programs/library-bridge/LibraryBridgeHandlers.cpp index 094cef6716d..bd8faf76188 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.cpp +++ b/programs/library-bridge/LibraryBridgeHandlers.cpp @@ -86,10 +86,8 @@ static void writeData(Block data, OutputFormatPtr format) } -ExternalDictionaryLibraryBridgeRequestHandler::ExternalDictionaryLibraryBridgeRequestHandler(size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , keep_alive_timeout(keep_alive_timeout_) - , log(getLogger("ExternalDictionaryLibraryBridgeRequestHandler")) +ExternalDictionaryLibraryBridgeRequestHandler::ExternalDictionaryLibraryBridgeRequestHandler(ContextPtr context_) + : WithContext(context_), log(getLogger("ExternalDictionaryLibraryBridgeRequestHandler")) { } @@ -136,7 +134,7 @@ void ExternalDictionaryLibraryBridgeRequestHandler::handleRequest(HTTPServerRequ const String & dictionary_id = params.get("dictionary_id"); LOG_TRACE(log, "Library method: '{}', dictionary id: {}", method, dictionary_id); - WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD); try { @@ -410,11 +408,8 @@ void ExternalDictionaryLibraryBridgeExistsHandler::handleRequest(HTTPServerReque } -CatBoostLibraryBridgeRequestHandler::CatBoostLibraryBridgeRequestHandler( - size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , keep_alive_timeout(keep_alive_timeout_) - , log(getLogger("CatBoostLibraryBridgeRequestHandler")) +CatBoostLibraryBridgeRequestHandler::CatBoostLibraryBridgeRequestHandler(ContextPtr context_) + : WithContext(context_), log(getLogger("CatBoostLibraryBridgeRequestHandler")) { } @@ -453,7 +448,7 @@ void CatBoostLibraryBridgeRequestHandler::handleRequest(HTTPServerRequest & requ const String & method = params.get("method"); LOG_TRACE(log, "Library method: '{}'", method); - WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD); try { diff --git a/programs/library-bridge/LibraryBridgeHandlers.h b/programs/library-bridge/LibraryBridgeHandlers.h index 83bca24ce1f..70e3c9c78da 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.h +++ b/programs/library-bridge/LibraryBridgeHandlers.h @@ -18,14 +18,13 @@ namespace DB class ExternalDictionaryLibraryBridgeRequestHandler : public HTTPRequestHandler, WithContext { public: - ExternalDictionaryLibraryBridgeRequestHandler(size_t keep_alive_timeout_, ContextPtr context_); + ExternalDictionaryLibraryBridgeRequestHandler(ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: static constexpr inline auto FORMAT = "RowBinary"; - const size_t keep_alive_timeout; LoggerPtr log; }; @@ -62,12 +61,11 @@ private: class CatBoostLibraryBridgeRequestHandler : public HTTPRequestHandler, WithContext { public: - CatBoostLibraryBridgeRequestHandler(size_t keep_alive_timeout_, ContextPtr context_); + CatBoostLibraryBridgeRequestHandler(ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: - 
const size_t keep_alive_timeout; LoggerPtr log; }; diff --git a/programs/odbc-bridge/ColumnInfoHandler.cpp b/programs/odbc-bridge/ColumnInfoHandler.cpp index 4cb15de3b2c..438062e8169 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.cpp +++ b/programs/odbc-bridge/ColumnInfoHandler.cpp @@ -200,10 +200,7 @@ void ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServ if (columns.empty()) throw Exception(ErrorCodes::UNKNOWN_TABLE, "Columns definition was not returned"); - WriteBufferFromHTTPServerResponse out( - response, - request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, - keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD); try { writeStringBinary(columns.toString(), out); diff --git a/programs/odbc-bridge/ColumnInfoHandler.h b/programs/odbc-bridge/ColumnInfoHandler.h index ca7044fdf32..f16e09ec3f9 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.h +++ b/programs/odbc-bridge/ColumnInfoHandler.h @@ -16,18 +16,12 @@ namespace DB class ODBCColumnsInfoHandler : public HTTPRequestHandler, WithContext { public: - ODBCColumnsInfoHandler(size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , log(getLogger("ODBCColumnsInfoHandler")) - , keep_alive_timeout(keep_alive_timeout_) - { - } + ODBCColumnsInfoHandler(ContextPtr context_) : WithContext(context_), log(getLogger("ODBCColumnsInfoHandler")) { } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: LoggerPtr log; - size_t keep_alive_timeout; }; } diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.cpp b/programs/odbc-bridge/IdentifierQuoteHandler.cpp index cf5acdc4534..0bd1e8758cd 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.cpp +++ b/programs/odbc-bridge/IdentifierQuoteHandler.cpp @@ -73,7 +73,7 @@ void IdentifierQuoteHandler::handleRequest(HTTPServerRequest & request, HTTPServ auto identifier = getIdentifierQuote(std::move(connection)); - WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD); try { writeStringBinary(identifier, out); diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.h b/programs/odbc-bridge/IdentifierQuoteHandler.h index 7b78c5b4b93..c0e07795ea5 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.h +++ b/programs/odbc-bridge/IdentifierQuoteHandler.h @@ -14,18 +14,12 @@ namespace DB class IdentifierQuoteHandler : public HTTPRequestHandler, WithContext { public: - IdentifierQuoteHandler(size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , log(getLogger("IdentifierQuoteHandler")) - , keep_alive_timeout(keep_alive_timeout_) - { - } + IdentifierQuoteHandler(ContextPtr context_) : WithContext(context_), log(getLogger("IdentifierQuoteHandler")) { } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: LoggerPtr log; - size_t keep_alive_timeout; }; } diff --git a/programs/odbc-bridge/MainHandler.cpp b/programs/odbc-bridge/MainHandler.cpp index e350afa2b10..b086397446e 100644 --- a/programs/odbc-bridge/MainHandler.cpp +++ b/programs/odbc-bridge/MainHandler.cpp @@ -131,7 +131,7 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse return; } - WriteBufferFromHTTPServerResponse 
out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD); try { diff --git a/programs/odbc-bridge/MainHandler.h b/programs/odbc-bridge/MainHandler.h index ed0c6b2e28c..0fcad61d274 100644 --- a/programs/odbc-bridge/MainHandler.h +++ b/programs/odbc-bridge/MainHandler.h @@ -20,12 +20,10 @@ class ODBCHandler : public HTTPRequestHandler, WithContext { public: ODBCHandler( - size_t keep_alive_timeout_, ContextPtr context_, const String & mode_) : WithContext(context_) , log(getLogger("ODBCHandler")) - , keep_alive_timeout(keep_alive_timeout_) , mode(mode_) { } @@ -35,7 +33,6 @@ public: private: LoggerPtr log; - size_t keep_alive_timeout; String mode; static inline std::mutex mutex; diff --git a/programs/odbc-bridge/ODBCHandlerFactory.cpp b/programs/odbc-bridge/ODBCHandlerFactory.cpp index eebb0c24c7a..7f095666447 100644 --- a/programs/odbc-bridge/ODBCHandlerFactory.cpp +++ b/programs/odbc-bridge/ODBCHandlerFactory.cpp @@ -30,26 +30,26 @@ std::unique_ptr ODBCBridgeHandlerFactory::createRequestHandl if (uri.getPath() == "/columns_info") #if USE_ODBC - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); #else return nullptr; #endif else if (uri.getPath() == "/identifier_quote") #if USE_ODBC - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); #else return nullptr; #endif else if (uri.getPath() == "/schema_allowed") #if USE_ODBC - return std::make_unique(keep_alive_timeout, getContext()); + return std::make_unique(getContext()); #else return nullptr; #endif else if (uri.getPath() == "/write") - return std::make_unique(keep_alive_timeout, getContext(), "write"); + return std::make_unique(getContext(), "write"); else - return std::make_unique(keep_alive_timeout, getContext(), "read"); + return std::make_unique(getContext(), "read"); } return nullptr; } diff --git a/programs/odbc-bridge/SchemaAllowedHandler.cpp b/programs/odbc-bridge/SchemaAllowedHandler.cpp index c7025ca4311..5dc0cb3aa2b 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.cpp +++ b/programs/odbc-bridge/SchemaAllowedHandler.cpp @@ -86,7 +86,7 @@ void SchemaAllowedHandler::handleRequest(HTTPServerRequest & request, HTTPServer bool result = isSchemaAllowed(std::move(connection)); - WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD); try { writeBoolText(result, out); diff --git a/programs/odbc-bridge/SchemaAllowedHandler.h b/programs/odbc-bridge/SchemaAllowedHandler.h index 8dc725dbb33..e73c0a2cb26 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.h +++ b/programs/odbc-bridge/SchemaAllowedHandler.h @@ -17,18 +17,12 @@ class Context; class SchemaAllowedHandler : public HTTPRequestHandler, WithContext { public: - SchemaAllowedHandler(size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , log(getLogger("SchemaAllowedHandler")) - , keep_alive_timeout(keep_alive_timeout_) - { - } + SchemaAllowedHandler(ContextPtr context_) : WithContext(context_), log(getLogger("SchemaAllowedHandler")) { } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; private: LoggerPtr log; - size_t keep_alive_timeout; }; } From 
c33511dcb9314da6b64b11cf21d231c9d3896dad Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 22 Mar 2024 20:24:53 +0000 Subject: [PATCH 007/142] remove more --- programs/odbc-bridge/ODBCBridge.cpp | 2 +- programs/odbc-bridge/ODBCHandlerFactory.cpp | 9 +++------ programs/odbc-bridge/ODBCHandlerFactory.h | 3 +-- programs/odbc-bridge/PingHandler.h | 4 ---- 4 files changed, 5 insertions(+), 13 deletions(-) diff --git a/programs/odbc-bridge/ODBCBridge.cpp b/programs/odbc-bridge/ODBCBridge.cpp index e91cc3158df..2cde5bbf9f5 100644 --- a/programs/odbc-bridge/ODBCBridge.cpp +++ b/programs/odbc-bridge/ODBCBridge.cpp @@ -25,7 +25,7 @@ std::string ODBCBridge::bridgeName() const ODBCBridge::HandlerFactoryPtr ODBCBridge::getHandlerFactoryPtr(ContextPtr context) const { - return std::make_shared("ODBCRequestHandlerFactory-factory", keep_alive_timeout, context); + return std::make_shared("ODBCRequestHandlerFactory-factory", context); } } diff --git a/programs/odbc-bridge/ODBCHandlerFactory.cpp b/programs/odbc-bridge/ODBCHandlerFactory.cpp index 7f095666447..b5d0be908f4 100644 --- a/programs/odbc-bridge/ODBCHandlerFactory.cpp +++ b/programs/odbc-bridge/ODBCHandlerFactory.cpp @@ -9,11 +9,8 @@ namespace DB { -ODBCBridgeHandlerFactory::ODBCBridgeHandlerFactory(const std::string & name_, size_t keep_alive_timeout_, ContextPtr context_) - : WithContext(context_) - , log(getLogger(name_)) - , name(name_) - , keep_alive_timeout(keep_alive_timeout_) +ODBCBridgeHandlerFactory::ODBCBridgeHandlerFactory(const std::string & name_, ContextPtr context_) + : WithContext(context_), log(getLogger(name_)), name(name_) { } @@ -23,7 +20,7 @@ std::unique_ptr ODBCBridgeHandlerFactory::createRequestHandl LOG_TRACE(log, "Request URI: {}", uri.toString()); if (uri.getPath() == "/ping" && request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) - return std::make_unique(keep_alive_timeout); + return std::make_unique(); if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST) { diff --git a/programs/odbc-bridge/ODBCHandlerFactory.h b/programs/odbc-bridge/ODBCHandlerFactory.h index 4aaf1b55453..f4a2717dc9f 100644 --- a/programs/odbc-bridge/ODBCHandlerFactory.h +++ b/programs/odbc-bridge/ODBCHandlerFactory.h @@ -17,14 +17,13 @@ namespace DB class ODBCBridgeHandlerFactory : public HTTPRequestHandlerFactory, WithContext { public: - ODBCBridgeHandlerFactory(const std::string & name_, size_t keep_alive_timeout_, ContextPtr context_); + ODBCBridgeHandlerFactory(const std::string & name_, ContextPtr context_); std::unique_ptr createRequestHandler(const HTTPServerRequest & request) override; private: LoggerPtr log; std::string name; - size_t keep_alive_timeout; }; } diff --git a/programs/odbc-bridge/PingHandler.h b/programs/odbc-bridge/PingHandler.h index c5447107e0c..4c557bd3cf6 100644 --- a/programs/odbc-bridge/PingHandler.h +++ b/programs/odbc-bridge/PingHandler.h @@ -9,11 +9,7 @@ namespace DB class PingHandler : public HTTPRequestHandler { public: - explicit PingHandler(size_t keep_alive_timeout_) : keep_alive_timeout(keep_alive_timeout_) {} void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; - -private: - size_t keep_alive_timeout; }; } From 1115fa4bc74d840e8fc3230908310cb7311dd0d0 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 25 Mar 2024 15:13:13 +0000 Subject: [PATCH 008/142] fix tidy --- src/Server/PrometheusRequestHandler.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Server/PrometheusRequestHandler.h 
b/src/Server/PrometheusRequestHandler.h index 7f4d3c14f62..a1bd18b394a 100644 --- a/src/Server/PrometheusRequestHandler.h +++ b/src/Server/PrometheusRequestHandler.h @@ -15,7 +15,7 @@ private: PrometheusMetricsWriterPtr metrics_writer; public: - PrometheusRequestHandler(PrometheusMetricsWriterPtr metrics_writer_) : metrics_writer(std::move(metrics_writer_)) { } + explicit PrometheusRequestHandler(PrometheusMetricsWriterPtr metrics_writer_) : metrics_writer(std::move(metrics_writer_)) { } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; }; From 1cfbc548bb415e89e294a22da0eea59302269c37 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 25 Mar 2024 17:28:51 +0100 Subject: [PATCH 009/142] Fix copy-paste Co-authored-by: Michael Lex --- src/Core/ServerSettings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index 7480d94e81d..4a22082cdda 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -113,7 +113,7 @@ namespace DB M(Bool, async_load_databases, false, "Enable asynchronous loading of databases and tables to speedup server startup. Queries to not yet loaded entity will be blocked until load is finished.", 0) \ M(Bool, display_secrets_in_show_and_select, false, "Allow showing secrets in SHOW and SELECT queries via a format setting and a grant", 0) \ M(Seconds, keep_alive_timeout, DEFAULT_HTTP_KEEP_ALIVE_TIMEOUT, "The number of seconds that ClickHouse waits for incoming requests before closing the connection.", 0) \ - M(UInt64, max_keep_alive_requests, 10000, "The number of seconds that ClickHouse waits for incoming requests before closing the connection.", 0) \ + M(UInt64, max_keep_alive_requests, 10000, "The maximum number of requests handled via a single http keepalive connection before the server closes this connection.", 0) \ M(Seconds, replicated_fetches_http_connection_timeout, 0, "HTTP connection timeout for part fetch requests. Inherited from default profile `http_connection_timeout` if not set explicitly.", 0) \ M(Seconds, replicated_fetches_http_send_timeout, 0, "HTTP send timeout for part fetch requests. Inherited from default profile `http_send_timeout` if not set explicitly.", 0) \ M(Seconds, replicated_fetches_http_receive_timeout, 0, "HTTP receive timeout for fetch part requests. 
Inherited from default profile `http_receive_timeout` if not set explicitly.", 0) \ From 3c2915934f31d82176b65e1e998eca030671d872 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Tue, 26 Mar 2024 04:29:34 +0000 Subject: [PATCH 010/142] update fuzzers --- ..._function_state_deserialization_fuzzer.cpp | 23 +++++++++++++++++++ src/Client/ClientBase.cpp | 2 +- src/Core/fuzzers/names_and_types_fuzzer.cpp | 22 ++++++++++++++++++ .../data_type_deserialization_fuzzer.cpp | 22 ++++++++++++++++++ src/Formats/fuzzers/format_fuzzer.cpp | 20 ++++++++++++++++ .../fuzzers/codegen_fuzzer/CMakeLists.txt | 2 +- .../codegen_fuzzer/codegen_select_fuzzer.cpp | 2 +- src/Parsers/fuzzers/create_parser_fuzzer.cpp | 2 +- .../fuzzers/columns_description_fuzzer.cpp | 22 ++++++++++++++++++ 9 files changed, 113 insertions(+), 4 deletions(-) diff --git a/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp b/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp index 425364efb9c..9d490432c60 100644 --- a/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp +++ b/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp @@ -12,10 +12,33 @@ #include +#include + #include #include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +class IFunctionBase; +using FunctionBasePtr = std::shared_ptr; + +FunctionBasePtr createFunctionBaseCast( + ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); +} + +} + + extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) { try diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 767a9b2b9f9..bdee2233b27 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -2729,7 +2729,7 @@ void ClientBase::runLibFuzzer() for (auto & arg : fuzzer_args_holder) fuzzer_args.emplace_back(arg.data()); - int fuzzer_argc = fuzzer_args.size(); + int fuzzer_argc = static_cast(fuzzer_args.size()); char ** fuzzer_argv = fuzzer_args.data(); LLVMFuzzerRunDriver(&fuzzer_argc, &fuzzer_argv, [](const uint8_t * data, size_t size) diff --git a/src/Core/fuzzers/names_and_types_fuzzer.cpp b/src/Core/fuzzers/names_and_types_fuzzer.cpp index 6fdd8703014..bc8cb7af61f 100644 --- a/src/Core/fuzzers/names_and_types_fuzzer.cpp +++ b/src/Core/fuzzers/names_and_types_fuzzer.cpp @@ -1,7 +1,29 @@ +#include +#include #include #include +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +class IFunctionBase; +using FunctionBasePtr = std::shared_ptr; + +FunctionBasePtr createFunctionBaseCast( + ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); +} + +} + + extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) { try diff --git a/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp b/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp index 0ae325871fb..f1b03147929 100644 --- a/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp +++ b/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp @@ -8,11 +8,33 @@ #include #include +#include + #include #include +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + 
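
[Editor's sketch, not part of the patch series] The `createFunctionBaseCast` stubs added to these fuzzers all throw NOT_IMPLEMENTED, presumably so each small fuzzer binary links without pulling in the full CAST machinery. For readers unfamiliar with the targets, every touched file follows the standard libFuzzer entry-point contract; a bare, generic harness skeleton looks roughly like this (illustrative only, not one of the patched files):

#include <cstdint>
#include <cstddef>
#include <string>

/// Bare shape of a libFuzzer target: consume an arbitrary byte blob, treat expected
/// parse/validation failures as non-events, and return 0 so the fuzzer keeps mutating
/// inputs. Only crashes and sanitizer reports count as findings.
extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size)
{
    try
    {
        std::string input(reinterpret_cast<const char *>(data), size);
        /// ... hand `input` to the component under test here ...
        (void)input;
    }
    catch (...)
    {
        /// Expected exceptions from malformed input are swallowed on purpose.
    }
    return 0;
}
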
+class IFunctionBase; +using FunctionBasePtr = std::shared_ptr; + +FunctionBasePtr createFunctionBaseCast( + ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); +} + +} + + extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) { try diff --git a/src/Formats/fuzzers/format_fuzzer.cpp b/src/Formats/fuzzers/format_fuzzer.cpp index 46661e4828c..4426301b6e7 100644 --- a/src/Formats/fuzzers/format_fuzzer.cpp +++ b/src/Formats/fuzzers/format_fuzzer.cpp @@ -21,6 +21,26 @@ #include +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +class IFunctionBase; +using FunctionBasePtr = std::shared_ptr; + +FunctionBasePtr createFunctionBaseCast( + ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); +} + +} + + extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) { try diff --git a/src/Parsers/fuzzers/codegen_fuzzer/CMakeLists.txt b/src/Parsers/fuzzers/codegen_fuzzer/CMakeLists.txt index 20fd951d390..74fdcff79f7 100644 --- a/src/Parsers/fuzzers/codegen_fuzzer/CMakeLists.txt +++ b/src/Parsers/fuzzers/codegen_fuzzer/CMakeLists.txt @@ -39,7 +39,7 @@ set(CMAKE_INCLUDE_CURRENT_DIR TRUE) clickhouse_add_executable(codegen_select_fuzzer ${FUZZER_SRCS}) -set_source_files_properties("${PROTO_SRCS}" "out.cpp" PROPERTIES COMPILE_FLAGS "-Wno-reserved-identifier") +set_source_files_properties("${PROTO_SRCS}" "out.cpp" PROPERTIES COMPILE_FLAGS "-Wno-reserved-identifier -Wno-extra-semi-stmt -Wno-used-but-marked-unused") # contrib/libprotobuf-mutator/src/libfuzzer/libfuzzer_macro.h:143:44: error: no newline at end of file [-Werror,-Wnewline-eof] target_compile_options (codegen_select_fuzzer PRIVATE -Wno-newline-eof) diff --git a/src/Parsers/fuzzers/codegen_fuzzer/codegen_select_fuzzer.cpp b/src/Parsers/fuzzers/codegen_fuzzer/codegen_select_fuzzer.cpp index 9310d7d59f7..55daa370651 100644 --- a/src/Parsers/fuzzers/codegen_fuzzer/codegen_select_fuzzer.cpp +++ b/src/Parsers/fuzzers/codegen_fuzzer/codegen_select_fuzzer.cpp @@ -27,7 +27,7 @@ DEFINE_BINARY_PROTO_FUZZER(const Sentence& main) DB::ParserQueryWithOutput parser(input.data() + input.size()); try { - DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, 0); + DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, 0, 0); DB::WriteBufferFromOStream out(std::cerr, 4096); DB::formatAST(*ast, out); diff --git a/src/Parsers/fuzzers/create_parser_fuzzer.cpp b/src/Parsers/fuzzers/create_parser_fuzzer.cpp index 854885ad33b..1d5c3e27232 100644 --- a/src/Parsers/fuzzers/create_parser_fuzzer.cpp +++ b/src/Parsers/fuzzers/create_parser_fuzzer.cpp @@ -14,7 +14,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) std::string input = std::string(reinterpret_cast(data), size); DB::ParserCreateQuery parser; - DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, 1000); + DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, 1000, 0); const UInt64 max_ast_depth = 1000; ast->checkDepth(max_ast_depth); diff --git a/src/Storages/fuzzers/columns_description_fuzzer.cpp b/src/Storages/fuzzers/columns_description_fuzzer.cpp index b703a1e7051..cb0c6168225 
100644 --- a/src/Storages/fuzzers/columns_description_fuzzer.cpp +++ b/src/Storages/fuzzers/columns_description_fuzzer.cpp @@ -1,4 +1,26 @@ +#include #include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +class IFunctionBase; +using FunctionBasePtr = std::shared_ptr; + +FunctionBasePtr createFunctionBaseCast( + ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); +} + +} extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) From cb40fd7d0c9096a8df8d5e7c2e9924f66d51e061 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Tue, 26 Mar 2024 15:27:01 +0000 Subject: [PATCH 011/142] minor fixes --- .../fuzzers/aggregate_function_state_deserialization_fuzzer.cpp | 2 +- src/Core/fuzzers/names_and_types_fuzzer.cpp | 2 +- src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp | 2 +- src/Formats/fuzzers/format_fuzzer.cpp | 2 +- src/Parsers/fuzzers/codegen_fuzzer/codegen_select_fuzzer.cpp | 2 +- src/Parsers/fuzzers/create_parser_fuzzer.cpp | 2 +- src/Storages/fuzzers/columns_description_fuzzer.cpp | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp b/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp index 9d490432c60..a956d9906bc 100644 --- a/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp +++ b/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp @@ -33,7 +33,7 @@ using FunctionBasePtr = std::shared_ptr; FunctionBasePtr createFunctionBaseCast( ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for aggregate_function_state_deserialization_fuzzer"); } } diff --git a/src/Core/fuzzers/names_and_types_fuzzer.cpp b/src/Core/fuzzers/names_and_types_fuzzer.cpp index bc8cb7af61f..74debedf2a3 100644 --- a/src/Core/fuzzers/names_and_types_fuzzer.cpp +++ b/src/Core/fuzzers/names_and_types_fuzzer.cpp @@ -18,7 +18,7 @@ using FunctionBasePtr = std::shared_ptr; FunctionBasePtr createFunctionBaseCast( ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for names_and_types_fuzzer"); } } diff --git a/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp b/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp index f1b03147929..7d9a0513d18 100644 --- a/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp +++ b/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp @@ -29,7 +29,7 @@ using FunctionBasePtr = std::shared_ptr; FunctionBasePtr createFunctionBaseCast( ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for 
data_type_deserialization_fuzzer"); } } diff --git a/src/Formats/fuzzers/format_fuzzer.cpp b/src/Formats/fuzzers/format_fuzzer.cpp index 4426301b6e7..2c1ec65e54d 100644 --- a/src/Formats/fuzzers/format_fuzzer.cpp +++ b/src/Formats/fuzzers/format_fuzzer.cpp @@ -35,7 +35,7 @@ using FunctionBasePtr = std::shared_ptr; FunctionBasePtr createFunctionBaseCast( ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for format_fuzzer"); } } diff --git a/src/Parsers/fuzzers/codegen_fuzzer/codegen_select_fuzzer.cpp b/src/Parsers/fuzzers/codegen_fuzzer/codegen_select_fuzzer.cpp index 55daa370651..6b25b581532 100644 --- a/src/Parsers/fuzzers/codegen_fuzzer/codegen_select_fuzzer.cpp +++ b/src/Parsers/fuzzers/codegen_fuzzer/codegen_select_fuzzer.cpp @@ -27,7 +27,7 @@ DEFINE_BINARY_PROTO_FUZZER(const Sentence& main) DB::ParserQueryWithOutput parser(input.data() + input.size()); try { - DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, 0, 0); + DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, 0, DB::DBMS_DEFAULT_MAX_PARSER_BACKTRACKS); DB::WriteBufferFromOStream out(std::cerr, 4096); DB::formatAST(*ast, out); diff --git a/src/Parsers/fuzzers/create_parser_fuzzer.cpp b/src/Parsers/fuzzers/create_parser_fuzzer.cpp index 1d5c3e27232..bab8db5671d 100644 --- a/src/Parsers/fuzzers/create_parser_fuzzer.cpp +++ b/src/Parsers/fuzzers/create_parser_fuzzer.cpp @@ -14,7 +14,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) std::string input = std::string(reinterpret_cast(data), size); DB::ParserCreateQuery parser; - DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, 1000, 0); + DB::ASTPtr ast = parseQuery(parser, input.data(), input.data() + input.size(), "", 0, 1000, DB::DBMS_DEFAULT_MAX_PARSER_BACKTRACKS); const UInt64 max_ast_depth = 1000; ast->checkDepth(max_ast_depth); diff --git a/src/Storages/fuzzers/columns_description_fuzzer.cpp b/src/Storages/fuzzers/columns_description_fuzzer.cpp index cb0c6168225..ac285ea50f7 100644 --- a/src/Storages/fuzzers/columns_description_fuzzer.cpp +++ b/src/Storages/fuzzers/columns_description_fuzzer.cpp @@ -17,7 +17,7 @@ using FunctionBasePtr = std::shared_ptr; FunctionBasePtr createFunctionBaseCast( ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for columns_description_fuzzer"); } } From 64e6c6a2fcf2f7017ec3749ad05eed2daeeb4b42 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 26 Mar 2024 22:31:40 +0000 Subject: [PATCH 012/142] fix tidy --- programs/library-bridge/LibraryBridgeHandlers.h | 8 ++++---- programs/odbc-bridge/ColumnInfoHandler.h | 2 +- programs/odbc-bridge/IdentifierQuoteHandler.h | 2 +- programs/odbc-bridge/SchemaAllowedHandler.h | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/programs/library-bridge/LibraryBridgeHandlers.h b/programs/library-bridge/LibraryBridgeHandlers.h index 70e3c9c78da..582619e174e 100644 --- a/programs/library-bridge/LibraryBridgeHandlers.h +++ b/programs/library-bridge/LibraryBridgeHandlers.h @@ 
-18,7 +18,7 @@ namespace DB class ExternalDictionaryLibraryBridgeRequestHandler : public HTTPRequestHandler, WithContext { public: - ExternalDictionaryLibraryBridgeRequestHandler(ContextPtr context_); + explicit ExternalDictionaryLibraryBridgeRequestHandler(ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; @@ -33,7 +33,7 @@ private: class ExternalDictionaryLibraryBridgeExistsHandler : public HTTPRequestHandler, WithContext { public: - ExternalDictionaryLibraryBridgeExistsHandler(ContextPtr context_); + explicit ExternalDictionaryLibraryBridgeExistsHandler(ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; @@ -61,7 +61,7 @@ private: class CatBoostLibraryBridgeRequestHandler : public HTTPRequestHandler, WithContext { public: - CatBoostLibraryBridgeRequestHandler(ContextPtr context_); + explicit CatBoostLibraryBridgeRequestHandler(ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; @@ -74,7 +74,7 @@ private: class CatBoostLibraryBridgeExistsHandler : public HTTPRequestHandler, WithContext { public: - CatBoostLibraryBridgeExistsHandler(ContextPtr context_); + explicit CatBoostLibraryBridgeExistsHandler(ContextPtr context_); void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; diff --git a/programs/odbc-bridge/ColumnInfoHandler.h b/programs/odbc-bridge/ColumnInfoHandler.h index f16e09ec3f9..bbbf0da218b 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.h +++ b/programs/odbc-bridge/ColumnInfoHandler.h @@ -16,7 +16,7 @@ namespace DB class ODBCColumnsInfoHandler : public HTTPRequestHandler, WithContext { public: - ODBCColumnsInfoHandler(ContextPtr context_) : WithContext(context_), log(getLogger("ODBCColumnsInfoHandler")) { } + explicit ODBCColumnsInfoHandler(ContextPtr context_) : WithContext(context_), log(getLogger("ODBCColumnsInfoHandler")) { } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.h b/programs/odbc-bridge/IdentifierQuoteHandler.h index c0e07795ea5..a85b56a9f6a 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.h +++ b/programs/odbc-bridge/IdentifierQuoteHandler.h @@ -14,7 +14,7 @@ namespace DB class IdentifierQuoteHandler : public HTTPRequestHandler, WithContext { public: - IdentifierQuoteHandler(ContextPtr context_) : WithContext(context_), log(getLogger("IdentifierQuoteHandler")) { } + explicit IdentifierQuoteHandler(ContextPtr context_) : WithContext(context_), log(getLogger("IdentifierQuoteHandler")) { } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; diff --git a/programs/odbc-bridge/SchemaAllowedHandler.h b/programs/odbc-bridge/SchemaAllowedHandler.h index e73c0a2cb26..59022151b53 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.h +++ b/programs/odbc-bridge/SchemaAllowedHandler.h @@ -17,7 +17,7 @@ class Context; class SchemaAllowedHandler : public HTTPRequestHandler, WithContext { public: - SchemaAllowedHandler(ContextPtr context_) : WithContext(context_), log(getLogger("SchemaAllowedHandler")) { } + explicit SchemaAllowedHandler(ContextPtr context_) : 
WithContext(context_), log(getLogger("SchemaAllowedHandler")) { } void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event) override; From 8357bc7b1b2d48e808b63cc0aa6fb7c7aa36e98b Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sun, 31 Mar 2024 23:33:35 +0000 Subject: [PATCH 013/142] fix build --- base/base/CMakeLists.txt | 2 +- cmake/sanitize.cmake | 2 +- programs/CMakeLists.txt | 2 +- src/CMakeLists.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/base/base/CMakeLists.txt b/base/base/CMakeLists.txt index 27aa0bd6baf..7b1da9ab4ad 100644 --- a/base/base/CMakeLists.txt +++ b/base/base/CMakeLists.txt @@ -1,4 +1,4 @@ -add_compile_options($<$,$>:${COVERAGE_FLAGS}>) +add_compile_options("$<$,$>:${COVERAGE_FLAGS}>") if (USE_CLANG_TIDY) set (CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_PATH}") diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 9d53b2004b4..227d96357b5 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -64,7 +64,7 @@ option(WITH_COVERAGE "Instrumentation for code coverage with default implementat if (WITH_COVERAGE) message (STATUS "Enabled instrumentation for code coverage") - set(COVERAGE_FLAGS "-fprofile-instr-generate -fcoverage-mapping") + set(COVERAGE_FLAGS -fprofile-instr-generate -fcoverage-mapping) endif() option (SANITIZE_COVERAGE "Instrumentation for code coverage with custom callbacks" OFF) diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index 0d91de2dad8..aa7781498c8 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -1,4 +1,4 @@ -add_compile_options($<$,$>:${COVERAGE_FLAGS}>) +add_compile_options("$<$,$>:${COVERAGE_FLAGS}>") if (USE_CLANG_TIDY) set (CMAKE_CXX_CLANG_TIDY "${CLANG_TIDY_PATH}") diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 73aa409e995..bd603c9f15e 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,4 +1,4 @@ -add_compile_options($<$,$>:${COVERAGE_FLAGS}>) +add_compile_options("$<$,$>:${COVERAGE_FLAGS}>") if (USE_INCLUDE_WHAT_YOU_USE) set (CMAKE_CXX_INCLUDE_WHAT_YOU_USE ${IWYU_PATH}) From 99e25d762c2db3c544dd5590726fc039b1828d16 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Mon, 1 Apr 2024 12:28:51 +0000 Subject: [PATCH 014/142] remove WITH_COVERAGE for fuzzers build --- docker/packager/packager | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index 23fc26bc1a4..355149df38c 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -276,7 +276,6 @@ def parse_env_variables( elif package_type == "fuzzers": cmake_flags.append("-DENABLE_FUZZING=1") cmake_flags.append("-DENABLE_PROTOBUF=1") - cmake_flags.append("-DWITH_COVERAGE=1") # Reduce linking and building time by avoid *install/all dependencies cmake_flags.append("-DCMAKE_SKIP_INSTALL_ALL_DEPENDENCY=ON") From 8d667ad5a34d1ba3d9008a5a6308598483281b35 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Mon, 1 Apr 2024 22:55:51 +0000 Subject: [PATCH 015/142] fix build.sh --- docker/packager/binary-builder/build.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/packager/binary-builder/build.sh b/docker/packager/binary-builder/build.sh index 032aceb0af3..cbd14b1eac2 100755 --- a/docker/packager/binary-builder/build.sh +++ b/docker/packager/binary-builder/build.sh @@ -108,7 +108,8 @@ if [ -n "$MAKE_DEB" ]; then bash -x /build/packages/build fi -mv ./programs/clickhouse* /output || mv ./programs/*_fuzzer /output +mv ./programs/clickhouse* /output 
||: +mv ./programs/*_fuzzer /output ||: [ -x ./programs/self-extracting/clickhouse ] && mv ./programs/self-extracting/clickhouse /output [ -x ./programs/self-extracting/clickhouse-stripped ] && mv ./programs/self-extracting/clickhouse-stripped /output mv ./src/unit_tests_dbms /output ||: # may not exist for some binary builds From db3d923d4cae57254cadcef7f6997f3912d46515 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Wed, 3 Apr 2024 20:25:29 +0000 Subject: [PATCH 016/142] return WITH_COVERAGE, fix build --- cmake/sanitize.cmake | 3 ++- docker/packager/packager | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 227d96357b5..9f4fa7081c6 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -64,7 +64,8 @@ option(WITH_COVERAGE "Instrumentation for code coverage with default implementat if (WITH_COVERAGE) message (STATUS "Enabled instrumentation for code coverage") - set(COVERAGE_FLAGS -fprofile-instr-generate -fcoverage-mapping) + set (COVERAGE_FLAGS -fprofile-instr-generate -fcoverage-mapping) + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fprofile-instr-generate -fcoverage-mapping") endif() option (SANITIZE_COVERAGE "Instrumentation for code coverage with custom callbacks" OFF) diff --git a/docker/packager/packager b/docker/packager/packager index 355149df38c..23fc26bc1a4 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -276,6 +276,7 @@ def parse_env_variables( elif package_type == "fuzzers": cmake_flags.append("-DENABLE_FUZZING=1") cmake_flags.append("-DENABLE_PROTOBUF=1") + cmake_flags.append("-DWITH_COVERAGE=1") # Reduce linking and building time by avoid *install/all dependencies cmake_flags.append("-DCMAKE_SKIP_INSTALL_ALL_DEPENDENCY=ON") From 04b8b1e76c467ae527202a75141ac8981a1c4ac5 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Tue, 2 Jul 2024 14:01:19 +0000 Subject: [PATCH 017/142] initial commit for Hive-style partitioning --- src/Core/Settings.h | 5 + src/Core/SettingsChangesHistory.h | 5 + .../ObjectStorage/StorageObjectStorage.cpp | 32 ++- .../StorageObjectStorageSource.cpp | 14 +- src/Storages/StorageFile.cpp | 39 +++- src/Storages/StorageURL.cpp | 16 +- src/Storages/VirtualColumnUtils.cpp | 52 ++++- src/Storages/VirtualColumnUtils.h | 7 +- .../__init__.py | 0 .../configs/cluster_azure.xml | 39 ++++ .../configs/cluster_hdfs.xml | 33 +++ .../configs/disable_profilers_azure.xml | 9 + .../configs/macro_hdfs.xml | 5 + .../configs/named_collections_azure.xml | 14 ++ .../configs/schema_cache_azure.xml | 3 + .../configs/schema_cache_hdfs.xml | 3 + .../configs/users_azure.xml | 9 + .../test_azure.py | 204 ++++++++++++++++++ .../test_hdfs.py | 81 +++++++ .../03203_hive_style_partitioning.reference | 96 +++++++++ .../03203_hive_style_partitioning.sh | 93 ++++++++ .../column1=Gordon/sample.parquet | Bin 0 -> 1308 bytes .../column1=Schmidt/sample.parquet | Bin 0 -> 1308 bytes .../column0=Elizabeth/sample.parquet | Bin 0 -> 1308 bytes .../sample.parquet | Bin 0 -> 1308 bytes .../column1=Gordon/sample.parquet | Bin 0 -> 1308 bytes .../column1=Schmidt/sample.parquet | Bin 0 -> 1308 bytes .../coumn0=Elizabeth/sample.parquet | Bin 0 -> 1308 bytes .../sample.parquet | Bin 0 -> 1308 bytes 29 files changed, 749 insertions(+), 10 deletions(-) create mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/__init__.py create mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_azure.xml create mode 100644 
tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_hdfs.xml create mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/disable_profilers_azure.xml create mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/macro_hdfs.xml create mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/named_collections_azure.xml create mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_azure.xml create mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_hdfs.xml create mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/users_azure.xml create mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/test_azure.py create mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/test_hdfs.py create mode 100644 tests/queries/0_stateless/03203_hive_style_partitioning.reference create mode 100755 tests/queries/0_stateless/03203_hive_style_partitioning.sh create mode 100644 tests/queries/0_stateless/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet create mode 100644 tests/queries/0_stateless/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet create mode 100644 tests/queries/0_stateless/data_hive/partitioning/column0=Elizabeth/sample.parquet create mode 100644 tests/queries/0_stateless/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet create mode 100644 tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/column1=Gordon/sample.parquet create mode 100644 tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/column1=Schmidt/sample.parquet create mode 100644 tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/sample.parquet create mode 100644 tests/queries/0_stateless/data_minio/hive_partitioning/non_existing_column=Elizabeth/sample.parquet diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 14fe0924b40..738c0129d2d 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1106,6 +1106,11 @@ class IColumn; M(Bool, input_format_tsv_skip_trailing_empty_lines, false, "Skip trailing empty lines in TSV format", 0) \ M(Bool, input_format_custom_skip_trailing_empty_lines, false, "Skip trailing empty lines in CustomSeparated format", 0) \ M(Bool, input_format_tsv_crlf_end_of_line, false, "If it is set true, file function will read TSV format with \\r\\n instead of \\n.", 0) \ + M(Bool, file_hive_partitioning, false, "Allows to use hive partitioning for file format", 0)\ + M(Bool, url_hive_partitioning, false, "Allows to use hive partitioning for url format", 0)\ + M(Bool, s3_hive_partitioning, false, "Allows to use hive partitioning for s3 format", 0)\ + M(Bool, azure_blob_storage_hive_partitioning, false, "Allows to use hive partitioning for AzureBlobStorage format", 0)\ + M(Bool, hdfs_hive_partitioning, false, "Allows to use hive partitioning for hdfs format", 0)\ \ M(Bool, input_format_native_allow_types_conversion, true, "Allow data types conversion in Native input format", 0) \ \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 4ac25a649b7..dd778149674 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -118,6 +118,11 @@ static const std::map +#include #include #include @@ -32,6 +33,19 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } + +bool checkIfHiveSettingEnabled(const ContextPtr 
& context, const std::string & storage_type_name) +{ + if (storage_type_name == "s3") + return context->getSettings().s3_hive_partitioning; + else if (storage_type_name == "hdfs") + return context->getSettings().hdfs_hive_partitioning; + else if (storage_type_name == "azure") + return context->getSettings().azure_blob_storage_hive_partitioning; + else + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported storage type: {}", storage_type_name); +} + StorageObjectStorage::StorageObjectStorage( ConfigurationPtr configuration_, ObjectStoragePtr object_storage_, @@ -60,7 +74,23 @@ StorageObjectStorage::StorageObjectStorage( metadata.setConstraints(constraints_); metadata.setComment(comment); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns())); + auto file_iterator = StorageObjectStorageSource::createFileIterator( + configuration, + object_storage, + distributed_processing_, + context, + {}, // predicate + metadata.getColumns().getAll(), // virtual_columns + nullptr, // read_keys + {} // file_progress_callback + ); + + Strings paths; + + if (checkIfHiveSettingEnabled(context, configuration->getTypeName())) + if (auto file = file_iterator->next(0)) + paths = {file->getPath()}; + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), paths)); setInMemoryMetadata(metadata); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index aef783fc3c4..2741cfecf6b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -13,6 +13,7 @@ #include #include #include +#include namespace fs = std::filesystem; @@ -195,13 +196,24 @@ Chunk StorageObjectStorageSource::generate() const auto & object_info = reader.getObjectInfo(); const auto & filename = object_info->getFileName(); chassert(object_info->metadata); + + auto hive_map = VirtualColumnUtils::parsePartitionMapFromPath(object_info->getPath()); + bool contains_virtual_column = std::any_of(hive_map.begin(), hive_map.end(), + [&](const auto& pair) { + return read_from_format_info.requested_virtual_columns.contains(pair.first); + }); + + if (!contains_virtual_column) + hive_map.clear(); // If we cannot find any virual column in requested, we don't add any of them to chunk + VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( chunk, read_from_format_info.requested_virtual_columns, { .path = getUniqueStoragePathIdentifier(*configuration, *object_info, false), .size = object_info->metadata->size_bytes, .filename = &filename, - .last_modified = object_info->metadata->last_modified + .last_modified = object_info->metadata->last_modified, + .hive_partitioning_map = hive_map }); return chunk; } diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 7f39ff615f0..0c32f29cb34 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -52,6 +52,7 @@ #include #include #include +#include #include #include @@ -1095,7 +1096,11 @@ void StorageFile::setStorageMetadata(CommonArguments args) storage_metadata.setConstraints(args.constraints); storage_metadata.setComment(args.comment); setInMemoryMetadata(storage_metadata); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns())); + + Strings paths_for_virtuals; + if (args.getContext()->getSettingsRef().file_hive_partitioning) + paths_for_virtuals = paths; + 
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), paths_for_virtuals)); } @@ -1437,6 +1442,15 @@ Chunk StorageFileSource::generate() chunk_size = input_format->getApproxBytesReadForChunk(); progress(num_rows, chunk_size ? chunk_size : chunk.bytes()); + std::map hive_map; + if (getContext()->getSettingsRef().file_hive_partitioning) + { + hive_map = VirtualColumnUtils::parsePartitionMapFromPath(current_path); + + for (const auto& item : hive_map) + requested_virtual_columns.push_back(NameAndTypePair(item.first, std::make_shared())); + } + /// Enrich with virtual columns. VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( chunk, requested_virtual_columns, @@ -1444,7 +1458,8 @@ Chunk StorageFileSource::generate() .path = current_path, .size = current_file_size, .filename = (filename_override.has_value() ? &filename_override.value() : nullptr), - .last_modified = current_file_last_modified + .last_modified = current_file_last_modified, + .hive_partitioning_map = hive_map }); return chunk; @@ -1621,6 +1636,16 @@ void ReadFromFile::createIterator(const ActionsDAG::Node * predicate) storage->distributed_processing); } +void addPartitionColumnsToInfoHeader(Strings paths, ReadFromFormatInfo & info) +{ + for (const auto& path : paths) + { + auto map = VirtualColumnUtils::parsePartitionMapFromPath(path); + for (const auto& item : map) + info.source_header.insertUnique(ColumnWithTypeAndName(std::make_shared(), item.first)); + } +} + void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) { createIterator(nullptr); @@ -1628,10 +1653,20 @@ void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const Bui size_t num_streams = max_num_streams; size_t files_to_read = 0; + Strings paths; if (storage->archive_info) + { files_to_read = storage->archive_info->paths_to_archives.size(); + paths = storage->archive_info->paths_to_archives; + } else + { files_to_read = storage->paths.size(); + paths = storage->paths; + } + + if (getContext()->getSettingsRef().file_hive_partitioning) + addPartitionColumnsToInfoHeader(paths, info); if (max_num_streams > files_to_read) num_streams = files_to_read; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 895da028fc2..f6374701fc2 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -151,7 +152,11 @@ IStorageURLBase::IStorageURLBase( storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns())); + + Strings uri_for_partitioning; + if (context_->getSettingsRef().url_hive_partitioning) + uri_for_partitioning = {uri}; + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), uri_for_partitioning)); } @@ -410,12 +415,17 @@ Chunk StorageURLSource::generate() size_t chunk_size = 0; if (input_format) chunk_size = input_format->getApproxBytesReadForChunk(); + std::map hive_map; + if (getContext()->getSettingsRef().url_hive_partitioning) + hive_map = VirtualColumnUtils::parsePartitionMapFromPath(curr_uri.getPath()); + progress(num_rows, chunk_size ? 
chunk_size : chunk.bytes()); VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( chunk, requested_virtual_columns, { .path = curr_uri.getPath(), - .size = current_file_size + .size = current_file_size, + .hive_partitioning_map = hive_map }); return chunk; } @@ -1170,6 +1180,7 @@ void ReadFromURL::createIterator(const ActionsDAG::Node * predicate) void ReadFromURL::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) { createIterator(nullptr); + const auto & settings = context->getSettingsRef(); if (is_empty_glob) { @@ -1180,7 +1191,6 @@ void ReadFromURL::initializePipeline(QueryPipelineBuilder & pipeline, const Buil Pipes pipes; pipes.reserve(num_streams); - const auto & settings = context->getSettingsRef(); const size_t max_parsing_threads = num_streams >= settings.max_parsing_threads ? 1 : (settings.max_parsing_threads / num_streams); for (size_t i = 0; i < num_streams; ++i) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 778c9e13adb..0b79e3b7a16 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -37,6 +36,7 @@ #include #include +#include #include #include "Functions/FunctionsLogical.h" #include "Functions/IFunction.h" @@ -115,7 +115,22 @@ NameSet getVirtualNamesForFileLikeStorage() return {"_path", "_file", "_size", "_time"}; } -VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns) +Strings parseVirtualColumnNameFromPath(const std::string & path) +{ + std::string pattern = "/([^/]+)=([^/]+)"; + // Map to store the key-value pairs + std::map key_values; + + re2::StringPiece input_piece(path); + std::string key; + Strings result; + while (RE2::FindAndConsume(&input_piece, pattern, &key)) + result.push_back(key); + + return result; +} + +VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, Strings paths) { VirtualColumnsDescription desc; @@ -132,6 +147,13 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription add_virtual("_size", makeNullable(std::make_shared())); add_virtual("_time", makeNullable(std::make_shared())); + for (const auto& path : paths) + { + auto names = parseVirtualColumnNameFromPath(path); + for (const auto& name : names) + add_virtual("_" + name, std::make_shared(std::make_shared())); + } + return desc; } @@ -178,6 +200,8 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const { if (column.name == "_file" || column.name == "_path") block.insert({column.type->createColumn(), column.type, column.name}); + if (!getVirtualNamesForFileLikeStorage().contains(column.name)) + block.insert({column.type->createColumn(), column.type, column.name}); } block.insert({ColumnUInt64::create(), std::make_shared(), "_idx"}); @@ -189,6 +213,21 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const return block.getByName("_idx").column; } +std::map parsePartitionMapFromPath(const std::string & path) +{ + std::string pattern = "/([^/]+)=([^/]+)"; // Regex to capture key=value pairs + // Map to store the key-value pairs + std::map key_values; + + re2::StringPiece input_piece(path); + std::string key; + std::string value; + while (RE2::FindAndConsume(&input_piece, pattern, &key, &value)) + key_values["_" + key] = value; + + return key_values; +} + void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & 
requested_virtual_columns, VirtualsForFileLikeStorage virtual_values) @@ -226,6 +265,15 @@ void addRequestedFileLikeStorageVirtualsToChunk( else chunk.addColumn(virtual_column.type->createColumnConstWithDefaultValue(chunk.getNumRows())->convertToFullColumnIfConst()); } + else + { + auto it = virtual_values.hive_partitioning_map.find(virtual_column.getNameInStorage()); + if (it != virtual_values.hive_partitioning_map.end()) + { + chunk.addColumn(virtual_column.getTypeInStorage()->createColumnConst(chunk.getNumRows(), it->second)->convertToFullColumnIfConst()); + virtual_values.hive_partitioning_map.erase(it); + } + } } } diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index fbfbdd6c6cc..a03d4c7447f 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -6,6 +6,8 @@ #include #include +#include +#include #include @@ -47,7 +49,7 @@ auto extractSingleValueFromBlock(const Block & block, const String & name) } NameSet getVirtualNamesForFileLikeStorage(); -VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns); +VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, Strings paths = {}); ActionsDAGPtr createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns); @@ -74,9 +76,12 @@ struct VirtualsForFileLikeStorage std::optional size { std::nullopt }; const String * filename { nullptr }; std::optional last_modified { std::nullopt }; + std::map hive_partitioning_map; }; +std::map parsePartitionMapFromPath(const std::string & path); + void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, VirtualsForFileLikeStorage virtual_values); diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/__init__.py b/tests/integration/test_hive_style_partitioning_hdfs_azure/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_azure.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_azure.xml new file mode 100644 index 00000000000..ffa4673c9ee --- /dev/null +++ b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_azure.xml @@ -0,0 +1,39 @@ + + + + + + node_0 + 9000 + + + node_1 + 9000 + + + node_2 + 9000 + + + + + + + + node_0 + 9000 + + + + + node_1 + 19000 + + + + + + + simple_cluster + + \ No newline at end of file diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_hdfs.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_hdfs.xml new file mode 100644 index 00000000000..b99b21ea40b --- /dev/null +++ b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_hdfs.xml @@ -0,0 +1,33 @@ + + + + + + node1 + 9000 + + + + + node1 + 19000 + + + + + + + + 127.0.0.1 + 9000 + + + + + 127.0.0.2 + 9000 + + + + + diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/disable_profilers_azure.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/disable_profilers_azure.xml new file mode 100644 index 00000000000..a39badbf8ec --- /dev/null +++ b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/disable_profilers_azure.xml @@ -0,0 +1,9 @@ + + + + + 0 + 0 + + + diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/macro_hdfs.xml 
b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/macro_hdfs.xml new file mode 100644 index 00000000000..c2e11b47a5e --- /dev/null +++ b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/macro_hdfs.xml @@ -0,0 +1,5 @@ + + + test_cluster_two_shards + + \ No newline at end of file diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/named_collections_azure.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/named_collections_azure.xml new file mode 100644 index 00000000000..bd7f9ff97f1 --- /dev/null +++ b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/named_collections_azure.xml @@ -0,0 +1,14 @@ + + + + cont + test_simple_write_named.csv + key UInt64, data String + CSV + + + devstoreaccount1 + Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== + + + diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_azure.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_azure.xml new file mode 100644 index 00000000000..e2168ecd06d --- /dev/null +++ b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_azure.xml @@ -0,0 +1,3 @@ + + 2 + \ No newline at end of file diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_hdfs.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_hdfs.xml new file mode 100644 index 00000000000..37639649b5f --- /dev/null +++ b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_hdfs.xml @@ -0,0 +1,3 @@ + + 2 + \ No newline at end of file diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/users_azure.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/users_azure.xml new file mode 100644 index 00000000000..4b6ba057ecb --- /dev/null +++ b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/users_azure.xml @@ -0,0 +1,9 @@ + + + + + default + 1 + + + diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/test_azure.py b/tests/integration/test_hive_style_partitioning_hdfs_azure/test_azure.py new file mode 100644 index 00000000000..c9b2c9fec2e --- /dev/null +++ b/tests/integration/test_hive_style_partitioning_hdfs_azure/test_azure.py @@ -0,0 +1,204 @@ +#!/usr/bin/env python3 + +import pytest +import time + +from helpers.cluster import ClickHouseCluster, is_arm +import re + +from azure.storage.blob import BlobServiceClient +from helpers.cluster import ClickHouseCluster, ClickHouseInstance + +if is_arm(): + pytestmark = pytest.mark.skip + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node", + main_configs=["configs/named_collections_azure.xml", "configs/schema_cache_azure.xml"], + user_configs=["configs/disable_profilers_azure.xml", "configs/users_azure.xml"], + with_azurite=True, + ) + cluster.start() + container_client = cluster.blob_service_client.get_container_client("cont") + container_client.create_container() + yield cluster + finally: + cluster.shutdown() + + +def azure_query( + node, query, expect_error=False, try_num=10, settings={}, query_on_retry=None +): + for i in range(try_num): + try: + if expect_error: + return node.query_and_get_error(query, settings=settings) + else: + return node.query(query, settings=settings) + except Exception as ex: + retriable_errors = [ + "DB::Exception: 
Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response", + "DB::Exception: Azure::Core::Http::TransportException: Connection closed before getting full response or response is less than expected", + "DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response", + "DB::Exception: Azure::Core::Http::TransportException: Error while polling for socket ready read", + "Azure::Core::Http::TransportException, e.what() = Connection was closed by the server while trying to read a response", + "Azure::Core::Http::TransportException, e.what() = Connection closed before getting full response or response is less than expected", + "Azure::Core::Http::TransportException, e.what() = Connection was closed by the server while trying to read a response", + "Azure::Core::Http::TransportException, e.what() = Error while polling for socket ready read", + ] + retry = False + for error in retriable_errors: + if error in str(ex): + retry = True + print(f"Try num: {i}. Having retriable error: {ex}") + time.sleep(i) + break + if not retry or i == try_num - 1: + raise Exception(ex) + if query_on_retry is not None: + node.query(query_on_retry) + continue + + +def get_azure_file_content(filename, port): + container_name = "cont" + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + blob_service_client = BlobServiceClient.from_connection_string( + str(connection_string) + ) + container_client = blob_service_client.get_container_client(container_name) + blob_client = container_client.get_blob_client(filename) + download_stream = blob_client.download_blob() + return download_stream.readall().decode("utf-8") + + +@pytest.fixture(autouse=True, scope="function") +def delete_all_files(cluster): + port = cluster.env_variables["AZURITE_PORT"] + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + containers = blob_service_client.list_containers() + for container in containers: + container_client = blob_service_client.get_container_client(container) + blob_list = container_client.list_blobs() + for blob in blob_list: + print(blob) + blob_client = container_client.get_blob_client(blob) + blob_client.delete_blob() + + assert len(list(container_client.list_blobs())) == 0 + + yield + + +def test_azure_partitioning_with_one_parameter(cluster): + # type: (ClickHouseCluster) -> None + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 String, column2 String" + values = f"('Elizabeth', 'Gordon')" + path = "a/column1=Elizabeth/sample.csv" + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}", + ) + + query = ( + f"SELECT column1, column2, _file, _path, _column1 FROM azureBlobStorage(azure_conf2, " + f"storage_account_url = 
'{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " + f"blob_path='{path}', format='CSV', structure='{table_format}')" + ) + assert azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 1}).splitlines() == [ + "Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth".format( + bucket="cont", max_path=path + ) + ] + + query = ( + f"SELECT column2 FROM azureBlobStorage(azure_conf2, " + f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " + f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" + ) + assert azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 1}).splitlines() == [ + "Gordon" + ] + +def test_azure_partitioning_with_two_parameters(cluster): + # type: (ClickHouseCluster) -> None + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 String, column2 String" + values_1 = f"('Elizabeth', 'Gordon')" + values_2 = f"('Emilia', 'Gregor')" + path = "a/column1=Elizabeth/column2=Gordon/sample.csv" + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}", + ) + + query = ( + f"SELECT column1, column2, _file, _path, _column1, _column2 FROM azureBlobStorage(azure_conf2, " + f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " + f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" + ) + assert azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 1}).splitlines() == [ + "Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth\tGordon".format( + bucket="cont", max_path=path + ) + ] + + query = ( + f"SELECT column1 FROM azureBlobStorage(azure_conf2, " + f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " + f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2;" + ) + assert azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 1}).splitlines() == [ + "Elizabeth" + ] + + query = ( + f"SELECT column1 FROM azureBlobStorage(azure_conf2, " + f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " + f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2 AND column1=_column1;" + ) + assert azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 1}).splitlines() == [ + "Elizabeth" + ] + +def test_azure_partitioning_without_setting(cluster): + # type: (ClickHouseCluster) -> None + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 String, column2 String" + values_1 = f"('Elizabeth', 'Gordon')" + values_2 = f"('Emilia', 'Gregor')" + path = "a/column1=Elizabeth/column2=Gordon/sample.csv" + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}", + ) + + query = ( + f"SELECT column1, column2, _file, _path, _column1, _column2 FROM azureBlobStorage(azure_conf2, " + f"storage_account_url = 
'{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " + f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" + ) + pattern = re.compile(r"DB::Exception: Unknown expression identifier '.*' in scope.*", re.DOTALL) + + with pytest.raises(Exception, match=pattern): + azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 0}) diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/test_hdfs.py b/tests/integration/test_hive_style_partitioning_hdfs_azure/test_hdfs.py new file mode 100644 index 00000000000..38641b63960 --- /dev/null +++ b/tests/integration/test_hive_style_partitioning_hdfs_azure/test_hdfs.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python3 + +import pytest + +from helpers.client import QueryRuntimeException +from helpers.cluster import ClickHouseCluster, is_arm +import re + +from helpers.cluster import ClickHouseCluster + +if is_arm(): + pytestmark = pytest.mark.skip + +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance( + "node1", + main_configs=[ + "configs/macro_hdfs.xml", + "configs/schema_cache_hdfs.xml", + "configs/cluster_hdfs.xml", + ], + with_hdfs=True, +) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + +def test_hdfs_partitioning_with_one_parameter(started_cluster): + hdfs_api = started_cluster.hdfs_api + hdfs_api.write_data( + f"/column0=Elizabeth/parquet_1", f"Elizabeth\tGordon\n" + ) + assert ( + hdfs_api.read_data(f"/column0=Elizabeth/parquet_1") + == f"Elizabeth\tGordon\n" + ) + + r = node1.query( + "SELECT _column0 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/parquet_1', 'TSV')", settings={"hdfs_hive_partitioning": 1} + ) + assert (r == f"Elizabeth\n") + +def test_hdfs_partitioning_with_two_parameters(started_cluster): + hdfs_api = started_cluster.hdfs_api + hdfs_api.write_data( + f"/column0=Elizabeth/column1=Gordon/parquet_2", f"Elizabeth\tGordon\n" + ) + assert ( + hdfs_api.read_data(f"/column0=Elizabeth/column1=Gordon/parquet_2") + == f"Elizabeth\tGordon\n" + ) + + r = node1.query( + "SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');", settings={"hdfs_hive_partitioning": 1} + ) + assert (r == f"Gordon\n") + +def test_hdfs_partitioning_without_setting(started_cluster): + hdfs_api = started_cluster.hdfs_api + hdfs_api.write_data( + f"/column0=Elizabeth/column1=Gordon/parquet_2", f"Elizabeth\tGordon\n" + ) + assert ( + hdfs_api.read_data(f"/column0=Elizabeth/column1=Gordon/parquet_2") + == f"Elizabeth\tGordon\n" + ) + pattern = re.compile(r"DB::Exception: Unknown expression identifier '.*' in scope.*", re.DOTALL) + + with pytest.raises(QueryRuntimeException, match=pattern): + node1.query(f"SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');", settings={"hdfs_hive_partitioning": 0}) + +if __name__ == "__main__": + cluster.start() + input("Cluster created, press any key to destroy...") + cluster.shutdown() diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference new file mode 100644 index 00000000000..6ef1fcdf652 --- /dev/null +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference @@ -0,0 +1,96 @@ +TESTING THE FILE HIVE PARTITIONING +first last Elizabeth +Jorge Frank Elizabeth +Hunter Moreno Elizabeth +Esther Guzman Elizabeth +Dennis Stephens Elizabeth +Nettie Franklin Elizabeth 
+Stanley Gibson Elizabeth +Eugenia Greer Elizabeth +Jeffery Delgado Elizabeth +Clara Cross Elizabeth +Elizabeth Gordon Elizabeth +Eva Schmidt Elizabeth Schmidt +Samuel Schmidt Elizabeth Schmidt +Eva Schmidt Elizabeth +Samuel Schmidt Elizabeth +Elizabeth Gordon Elizabeth Gordon +Elizabeth Gordon Elizabeth +Elizabeth Gordon Elizabeth Gordon +Elizabeth Gordon Elizabeth +first last Elizabeth +Jorge Frank Elizabeth +Hunter Moreno Elizabeth +Esther Guzman Elizabeth +Dennis Stephens Elizabeth +Nettie Franklin Elizabeth +Stanley Gibson Elizabeth +Eugenia Greer Elizabeth +Jeffery Delgado Elizabeth +Clara Cross Elizabeth +Elizabeth Gordon Elizabeth +1 +TESTING THE URL PARTITIONING +first last Elizabeth +Jorge Frank Elizabeth +Hunter Moreno Elizabeth +Esther Guzman Elizabeth +Dennis Stephens Elizabeth +Nettie Franklin Elizabeth +Stanley Gibson Elizabeth +Eugenia Greer Elizabeth +Jeffery Delgado Elizabeth +Clara Cross Elizabeth +Elizabeth Gordon Elizabeth +Eva Schmidt Elizabeth Schmidt +Samuel Schmidt Elizabeth Schmidt +Eva Schmidt Elizabeth +Samuel Schmidt Elizabeth +Elizabeth Gordon Elizabeth Gordon +Elizabeth Gordon Elizabeth +Elizabeth Gordon Elizabeth Gordon +Elizabeth Gordon Elizabeth +first last Elizabeth +Jorge Frank Elizabeth +Hunter Moreno Elizabeth +Esther Guzman Elizabeth +Dennis Stephens Elizabeth +Nettie Franklin Elizabeth +Stanley Gibson Elizabeth +Eugenia Greer Elizabeth +Jeffery Delgado Elizabeth +Clara Cross Elizabeth +Elizabeth Gordon Elizabeth +1 +TESTING THE S3 PARTITIONING +first last Elizabeth +Jorge Frank Elizabeth +Hunter Moreno Elizabeth +Esther Guzman Elizabeth +Dennis Stephens Elizabeth +Nettie Franklin Elizabeth +Stanley Gibson Elizabeth +Eugenia Greer Elizabeth +Jeffery Delgado Elizabeth +Clara Cross Elizabeth +Elizabeth Gordon Elizabeth +Eva Schmidt Elizabeth Schmidt +Samuel Schmidt Elizabeth Schmidt +Eva Schmidt Elizabeth +Samuel Schmidt Elizabeth +Elizabeth Gordon Elizabeth Gordon +Elizabeth Gordon Elizabeth +Elizabeth Gordon Elizabeth Gordon +Elizabeth Gordon Elizabeth +first last Elizabeth +Jorge Frank Elizabeth +Hunter Moreno Elizabeth +Esther Guzman Elizabeth +Dennis Stephens Elizabeth +Nettie Franklin Elizabeth +Stanley Gibson Elizabeth +Eugenia Greer Elizabeth +Jeffery Delgado Elizabeth +Clara Cross Elizabeth +Elizabeth Gordon Elizabeth +1 diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh new file mode 100755 index 00000000000..a5d4c85a33b --- /dev/null +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -0,0 +1,93 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +$CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE FILE HIVE PARTITIONING'" + + +$CLICKHOUSE_LOCAL -n -q """set file_hive_partitioning = 1; + +SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; + +SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0; + +SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; + +SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; + +SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; + +SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; + +SELECT *, _non_existing_column FROM file('$CURDIR/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;""" + +$CLICKHOUSE_LOCAL -n -q """set file_hive_partitioning = 0; + +SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;""" 2>&1 | grep -c "UNKNOWN_IDENTIFIER" + + +$CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE URL PARTITIONING'" + + +$CLICKHOUSE_LOCAL -n -q """set url_hive_partitioning = 1; + +SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; + +SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0; + +SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; + +SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; + +SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; + +SELECT *, _column0, _column1 FROM 
url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; + +SELECT *, _non_existing_column FROM url('http://localhost:11111/test/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;""" + +$CLICKHOUSE_LOCAL -n -q """set url_hive_partitioning = 0; + +SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;""" 2>&1 | grep -c "UNKNOWN_IDENTIFIER" + + +$CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3 PARTITIONING'" + + +$CLICKHOUSE_LOCAL -n -q """set s3_hive_partitioning = 1; + +SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; + +SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0; + +SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; + +SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; + +SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; + +SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; + +SELECT *, _non_existing_column FROM s3('http://localhost:11111/test/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;""" + +$CLICKHOUSE_LOCAL -n -q """set s3_hive_partitioning = 0; + +SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;""" 2>&1 | grep -c "UNKNOWN_IDENTIFIER" + diff --git a/tests/queries/0_stateless/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ 
zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_hive/partitioning/column0=Elizabeth/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/column0=Elizabeth/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? 
literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/column1=Gordon/sample.parquet b/tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/column1=Gordon/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? 
literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/column1=Schmidt/sample.parquet b/tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/column1=Schmidt/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/sample.parquet b/tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? 
literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_minio/hive_partitioning/non_existing_column=Elizabeth/sample.parquet b/tests/queries/0_stateless/data_minio/hive_partitioning/non_existing_column=Elizabeth/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? literal 0 HcmV?d00001 From 83462d743e76dcfa8fd35b8b30335682f86d9374 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 2 Jul 2024 16:13:44 +0200 Subject: [PATCH 018/142] enhance SettingsChangesHistory.cpp --- src/Core/SettingsChangesHistory.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index b0725340f46..607f9b6d858 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -59,6 +59,11 @@ static std::initializer_list Date: Tue, 2 Jul 2024 14:43:45 +0000 Subject: [PATCH 019/142] style check --- .../ObjectStorage/StorageObjectStorage.cpp | 2 +- .../StorageObjectStorageSource.cpp | 7 +-- .../test_azure.py | 43 +++++++++++++------ .../test_hdfs.py | 32 ++++++++------ .../03203_hive_style_partitioning.sh | 24 +++++++---- 5 files changed, 69 insertions(+), 39 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index b169f02940e..ae7c211330c 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -86,7 +86,7 @@ StorageObjectStorage::StorageObjectStorage( ); Strings paths; - + if (checkIfHiveSettingEnabled(context, configuration->getTypeName())) if (auto file = file_iterator->next(0)) paths = {file->getPath()}; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 2741cfecf6b..afb23961312 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -198,13 +198,14 @@ Chunk StorageObjectStorageSource::generate() chassert(object_info->metadata); auto hive_map = VirtualColumnUtils::parsePartitionMapFromPath(object_info->getPath()); - bool contains_virtual_column = std::any_of(hive_map.begin(), hive_map.end(), - [&](const auto& pair) { + bool contains_virtual_column = std::any_of(hive_map.begin(), hive_map.end(), + [&](const auto& pair) + { return 
read_from_format_info.requested_virtual_columns.contains(pair.first); }); if (!contains_virtual_column) - hive_map.clear(); // If we cannot find any virual column in requested, we don't add any of them to chunk + hive_map.clear(); // If we cannot find any virtual column in requested, we don't add any of them to chunk VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( chunk, read_from_format_info.requested_virtual_columns, diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/test_azure.py b/tests/integration/test_hive_style_partitioning_hdfs_azure/test_azure.py index c9b2c9fec2e..0be697821f0 100644 --- a/tests/integration/test_hive_style_partitioning_hdfs_azure/test_azure.py +++ b/tests/integration/test_hive_style_partitioning_hdfs_azure/test_azure.py @@ -12,14 +12,21 @@ from helpers.cluster import ClickHouseCluster, ClickHouseInstance if is_arm(): pytestmark = pytest.mark.skip + @pytest.fixture(scope="module") def cluster(): try: cluster = ClickHouseCluster(__file__) cluster.add_instance( "node", - main_configs=["configs/named_collections_azure.xml", "configs/schema_cache_azure.xml"], - user_configs=["configs/disable_profilers_azure.xml", "configs/users_azure.xml"], + main_configs=[ + "configs/named_collections_azure.xml", + "configs/schema_cache_azure.xml", + ], + user_configs=[ + "configs/disable_profilers_azure.xml", + "configs/users_azure.xml", + ], with_azurite=True, ) cluster.start() @@ -121,7 +128,9 @@ def test_azure_partitioning_with_one_parameter(cluster): f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " f"blob_path='{path}', format='CSV', structure='{table_format}')" ) - assert azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 1}).splitlines() == [ + assert azure_query( + node, query, settings={"azure_blob_storage_hive_partitioning": 1} + ).splitlines() == [ "Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth".format( bucket="cont", max_path=path ) @@ -132,9 +141,10 @@ def test_azure_partitioning_with_one_parameter(cluster): f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" ) - assert azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 1}).splitlines() == [ - "Gordon" - ] + assert azure_query( + node, query, settings={"azure_blob_storage_hive_partitioning": 1} + ).splitlines() == ["Gordon"] + def test_azure_partitioning_with_two_parameters(cluster): # type: (ClickHouseCluster) -> None @@ -155,7 +165,9 @@ def test_azure_partitioning_with_two_parameters(cluster): f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" ) - assert azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 1}).splitlines() == [ + assert azure_query( + node, query, settings={"azure_blob_storage_hive_partitioning": 1} + ).splitlines() == [ "Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth\tGordon".format( bucket="cont", max_path=path ) @@ -166,18 +178,19 @@ def test_azure_partitioning_with_two_parameters(cluster): f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2;" ) - assert azure_query(node, query, 
settings={"azure_blob_storage_hive_partitioning": 1}).splitlines() == [ - "Elizabeth" - ] + assert azure_query( + node, query, settings={"azure_blob_storage_hive_partitioning": 1} + ).splitlines() == ["Elizabeth"] query = ( f"SELECT column1 FROM azureBlobStorage(azure_conf2, " f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2 AND column1=_column1;" ) - assert azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 1}).splitlines() == [ - "Elizabeth" - ] + assert azure_query( + node, query, settings={"azure_blob_storage_hive_partitioning": 1} + ).splitlines() == ["Elizabeth"] + def test_azure_partitioning_without_setting(cluster): # type: (ClickHouseCluster) -> None @@ -198,7 +211,9 @@ def test_azure_partitioning_without_setting(cluster): f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" ) - pattern = re.compile(r"DB::Exception: Unknown expression identifier '.*' in scope.*", re.DOTALL) + pattern = re.compile( + r"DB::Exception: Unknown expression identifier '.*' in scope.*", re.DOTALL + ) with pytest.raises(Exception, match=pattern): azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 0}) diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/test_hdfs.py b/tests/integration/test_hive_style_partitioning_hdfs_azure/test_hdfs.py index 38641b63960..4667d18688a 100644 --- a/tests/integration/test_hive_style_partitioning_hdfs_azure/test_hdfs.py +++ b/tests/integration/test_hive_style_partitioning_hdfs_azure/test_hdfs.py @@ -31,20 +31,18 @@ def started_cluster(): finally: cluster.shutdown() + def test_hdfs_partitioning_with_one_parameter(started_cluster): hdfs_api = started_cluster.hdfs_api - hdfs_api.write_data( - f"/column0=Elizabeth/parquet_1", f"Elizabeth\tGordon\n" - ) - assert ( - hdfs_api.read_data(f"/column0=Elizabeth/parquet_1") - == f"Elizabeth\tGordon\n" - ) + hdfs_api.write_data(f"/column0=Elizabeth/parquet_1", f"Elizabeth\tGordon\n") + assert hdfs_api.read_data(f"/column0=Elizabeth/parquet_1") == f"Elizabeth\tGordon\n" r = node1.query( - "SELECT _column0 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/parquet_1', 'TSV')", settings={"hdfs_hive_partitioning": 1} + "SELECT _column0 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/parquet_1', 'TSV')", + settings={"hdfs_hive_partitioning": 1}, ) - assert (r == f"Elizabeth\n") + assert r == f"Elizabeth\n" + def test_hdfs_partitioning_with_two_parameters(started_cluster): hdfs_api = started_cluster.hdfs_api @@ -57,9 +55,11 @@ def test_hdfs_partitioning_with_two_parameters(started_cluster): ) r = node1.query( - "SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');", settings={"hdfs_hive_partitioning": 1} + "SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');", + settings={"hdfs_hive_partitioning": 1}, ) - assert (r == f"Gordon\n") + assert r == f"Gordon\n" + def test_hdfs_partitioning_without_setting(started_cluster): hdfs_api = started_cluster.hdfs_api @@ -70,10 +70,16 @@ def test_hdfs_partitioning_without_setting(started_cluster): hdfs_api.read_data(f"/column0=Elizabeth/column1=Gordon/parquet_2") == f"Elizabeth\tGordon\n" ) - pattern = re.compile(r"DB::Exception: Unknown expression identifier '.*' in scope.*", re.DOTALL) 
+ pattern = re.compile( + r"DB::Exception: Unknown expression identifier '.*' in scope.*", re.DOTALL + ) with pytest.raises(QueryRuntimeException, match=pattern): - node1.query(f"SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');", settings={"hdfs_hive_partitioning": 0}) + node1.query( + f"SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');", + settings={"hdfs_hive_partitioning": 0}, + ) + if __name__ == "__main__": cluster.start() diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index a5d4c85a33b..83a8f87a813 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -8,7 +8,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE FILE HIVE PARTITIONING'" -$CLICKHOUSE_LOCAL -n -q """set file_hive_partitioning = 1; +$CLICKHOUSE_LOCAL -n -q """ +set file_hive_partitioning = 1; SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; @@ -31,13 +32,15 @@ SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.pa $CLICKHOUSE_LOCAL -n -q """set file_hive_partitioning = 0; -SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;""" 2>&1 | grep -c "UNKNOWN_IDENTIFIER" +SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; +""" 2>&1 | grep -c "UNKNOWN_IDENTIFIER" $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE URL PARTITIONING'" -$CLICKHOUSE_LOCAL -n -q """set url_hive_partitioning = 1; +$CLICKHOUSE_LOCAL -n -q """ +set url_hive_partitioning = 1; SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; @@ -60,13 +63,15 @@ SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=*/ $CLICKHOUSE_LOCAL -n -q """set url_hive_partitioning = 0; -SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;""" 2>&1 | grep -c "UNKNOWN_IDENTIFIER" +SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; +""" 2>&1 | grep -c "UNKNOWN_IDENTIFIER" $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3 PARTITIONING'" -$CLICKHOUSE_LOCAL -n -q """set s3_hive_partitioning = 1; +$CLICKHOUSE_LOCAL -n -q """ +set s3_hive_partitioning = 1; SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; @@ -85,9 +90,12 @@ SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/partitioning/c SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; SELECT *, _non_existing_column FROM s3('http://localhost:11111/test/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; -SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;""" +SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=*/sample.parquet') WHERE column0 = _column0; +""" -$CLICKHOUSE_LOCAL -n -q """set s3_hive_partitioning = 0; +$CLICKHOUSE_LOCAL -n -q """ +set s3_hive_partitioning = 0; -SELECT *, _column0 FROM 
s3('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10;""" 2>&1 | grep -c "UNKNOWN_IDENTIFIER" +SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; +""" 2>&1 | grep -c "UNKNOWN_IDENTIFIER" From cd5cdcc124f95204a6f63e8a1ce4d7148d8fec7f Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 2 Jul 2024 17:00:59 +0200 Subject: [PATCH 020/142] Shellcheck fix --- tests/queries/0_stateless/03203_hive_style_partitioning.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index 83a8f87a813..98c039f3454 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -30,7 +30,8 @@ SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/c SELECT *, _non_existing_column FROM file('$CURDIR/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;""" -$CLICKHOUSE_LOCAL -n -q """set file_hive_partitioning = 0; +$CLICKHOUSE_LOCAL -n -q """ +set file_hive_partitioning = 0; SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -c "UNKNOWN_IDENTIFIER" @@ -61,7 +62,8 @@ SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=El SELECT *, _non_existing_column FROM url('http://localhost:11111/test/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;""" -$CLICKHOUSE_LOCAL -n -q """set url_hive_partitioning = 0; +$CLICKHOUSE_LOCAL -n -q """ +set url_hive_partitioning = 0; SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -c "UNKNOWN_IDENTIFIER" @@ -98,4 +100,3 @@ set s3_hive_partitioning = 0; SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -c "UNKNOWN_IDENTIFIER" - From dc9dc1676d8f8af74c20173927c6027623cc788c Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 2 Jul 2024 17:37:47 +0200 Subject: [PATCH 021/142] add default for map in VirtualsForFileLikeStorage --- src/Storages/VirtualColumnUtils.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index a03d4c7447f..f9b49cc48ed 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -76,7 +76,7 @@ struct VirtualsForFileLikeStorage std::optional size { std::nullopt }; const String * filename { nullptr }; std::optional last_modified { std::nullopt }; - std::map hive_partitioning_map; + std::map hive_partitioning_map {}; }; From 942f7d7532059cf931242ce5c94a39ea0344b50b Mon Sep 17 00:00:00 2001 From: yariks5s Date: Fri, 5 Jul 2024 12:44:31 +0000 Subject: [PATCH 022/142] fixes after review --- programs/obfuscator/Obfuscator.cpp | 4 +- src/Core/Settings.h | 6 +- src/Core/SettingsChangesHistory.cpp | 6 +- src/Formats/ReadSchemaUtils.cpp | 23 +- src/Formats/ReadSchemaUtils.h | 6 +- 
.../DataLakes/IStorageDataLake.h | 3 +- .../ObjectStorage/ReadBufferIterator.cpp | 2 +- .../ObjectStorage/ReadBufferIterator.h | 2 +- .../ObjectStorage/StorageObjectStorage.cpp | 60 ++--- .../ObjectStorage/StorageObjectStorage.h | 6 + .../StorageObjectStorageCluster.cpp | 3 +- .../StorageObjectStorageSource.cpp | 13 +- src/Storages/ObjectStorage/Utils.cpp | 7 +- src/Storages/ObjectStorage/Utils.h | 1 + .../StorageObjectStorageQueue.cpp | 3 +- src/Storages/StorageFile.cpp | 53 ++--- src/Storages/StorageURL.cpp | 24 +- src/Storages/VirtualColumnUtils.cpp | 63 +++-- src/Storages/VirtualColumnUtils.h | 9 +- src/TableFunctions/TableFunctionFormat.cpp | 10 +- .../TableFunctionObjectStorage.cpp | 3 +- .../__init__.py | 0 .../configs/cluster_azure.xml | 39 ---- .../configs/cluster_hdfs.xml | 33 --- .../configs/disable_profilers_azure.xml | 9 - .../configs/macro_hdfs.xml | 5 - .../configs/named_collections_azure.xml | 14 -- .../configs/schema_cache_azure.xml | 3 - .../configs/schema_cache_hdfs.xml | 3 - .../configs/users_azure.xml | 9 - .../test_azure.py | 219 ------------------ .../test_hdfs.py | 87 ------- .../test_storage_azure_blob_storage/test.py | 110 +++++++++ tests/integration/test_storage_hdfs/test.py | 49 ++++ 34 files changed, 304 insertions(+), 583 deletions(-) delete mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/__init__.py delete mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_azure.xml delete mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_hdfs.xml delete mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/disable_profilers_azure.xml delete mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/macro_hdfs.xml delete mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/named_collections_azure.xml delete mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_azure.xml delete mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_hdfs.xml delete mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/configs/users_azure.xml delete mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/test_azure.py delete mode 100644 tests/integration/test_hive_style_partitioning_hdfs_azure/test_hdfs.py diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 688ae1a1143..11e85bc1302 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -1307,7 +1307,9 @@ try throw ErrnoException(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Input must be seekable file (it will be read twice)"); SingleReadBufferIterator read_buffer_iterator(std::move(file)); - schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, context_const); + + std::string sample_string; + schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, sample_string, context_const); } else { diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 8399d3925db..65b93b893b6 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -935,6 +935,7 @@ class IColumn; M(Bool, iceberg_engine_ignore_schema_evolution, false, "Ignore schema evolution in Iceberg table engine and read all data using latest schema saved on table creation. 
Note that it can lead to incorrect result", 0) \ M(Bool, allow_deprecated_error_prone_window_functions, false, "Allow usage of deprecated error prone window functions (neighbor, runningAccumulate, runningDifferenceStartingWithFirstValue, runningDifference)", 0) \ M(Bool, allow_deprecated_snowflake_conversion_functions, false, "Enables deprecated functions snowflakeToDateTime[64] and dateTime[64]ToSnowflake.", 0) \ + M(Bool, use_hive_partitioning, false, "Allows to use hive partitioning for File, URL, S3, AzureBlobStorage and HDFS engines.", 0)\ // End of COMMON_SETTINGS // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS, move obsolete settings to OBSOLETE_SETTINGS and obsolete format settings to OBSOLETE_FORMAT_SETTINGS. @@ -1106,11 +1107,6 @@ class IColumn; M(Bool, input_format_tsv_skip_trailing_empty_lines, false, "Skip trailing empty lines in TSV format", 0) \ M(Bool, input_format_custom_skip_trailing_empty_lines, false, "Skip trailing empty lines in CustomSeparated format", 0) \ M(Bool, input_format_tsv_crlf_end_of_line, false, "If it is set true, file function will read TSV format with \\r\\n instead of \\n.", 0) \ - M(Bool, file_hive_partitioning, false, "Allows to use hive partitioning for file format", 0)\ - M(Bool, url_hive_partitioning, false, "Allows to use hive partitioning for url format", 0)\ - M(Bool, s3_hive_partitioning, false, "Allows to use hive partitioning for s3 format", 0)\ - M(Bool, azure_blob_storage_hive_partitioning, false, "Allows to use hive partitioning for AzureBlobStorage format", 0)\ - M(Bool, hdfs_hive_partitioning, false, "Allows to use hive partitioning for hdfs format", 0)\ \ M(Bool, input_format_native_allow_types_conversion, true, "Allow data types conversion in Native input format", 0) \ \ diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 607f9b6d858..b676cd85ce6 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -59,11 +59,7 @@ static std::initializer_list readSchemaFromFormatImpl( std::optional format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, + std::string & sample_path, const ContextPtr & context) try { @@ -143,6 +144,10 @@ try { iterator_data = read_buffer_iterator.next(); + /// Extracting the File path for hive-style partitioning + if (sample_path.empty()) + sample_path = read_buffer_iterator.getLastFilePath(); + /// Read buffer iterator can determine the data format if it's unknown. /// For example by scanning schema cache or by finding new file with format extension. if (!format_name && iterator_data.format_name) @@ -163,7 +168,7 @@ try return {*iterator_data.cached_columns, *format_name}; } - schemas_for_union_mode.emplace_back(iterator_data.cached_columns->getAll(), read_buffer_iterator.getLastFileName()); + schemas_for_union_mode.emplace_back(iterator_data.cached_columns->getAll(), read_buffer_iterator.getLastFilePath()); continue; } @@ -249,7 +254,7 @@ try if (!names_and_types.empty()) read_buffer_iterator.setSchemaToLastFile(ColumnsDescription(names_and_types)); - schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFileName()); + schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFilePath()); } catch (...) { @@ -410,7 +415,7 @@ try throw Exception(ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files. 
You can specify the format manually"); read_buffer_iterator.setSchemaToLastFile(ColumnsDescription(names_and_types)); - schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFileName()); + schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFilePath()); } if (format_name && mode == SchemaInferenceMode::DEFAULT) @@ -526,9 +531,9 @@ try } catch (Exception & e) { - auto file_name = read_buffer_iterator.getLastFileName(); - if (!file_name.empty()) - e.addMessage(fmt::format("(in file/uri {})", file_name)); + auto file_path = read_buffer_iterator.getLastFilePath(); + if (!file_path.empty()) + e.addMessage(fmt::format("(in file/uri {})", file_path)); throw; } @@ -536,17 +541,19 @@ ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, + std::string & sample_path, const ContextPtr & context) { - return readSchemaFromFormatImpl(format_name, format_settings, read_buffer_iterator, context).first; + return readSchemaFromFormatImpl(format_name, format_settings, read_buffer_iterator, sample_path, context).first; } std::pair detectFormatAndReadSchema( const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, + std::string & sample_path, const ContextPtr & context) { - return readSchemaFromFormatImpl(std::nullopt, format_settings, read_buffer_iterator, context); + return readSchemaFromFormatImpl(std::nullopt, format_settings, read_buffer_iterator, sample_path, context); } SchemaCache::Key getKeyForSchemaCache( diff --git a/src/Formats/ReadSchemaUtils.h b/src/Formats/ReadSchemaUtils.h index bb5e068f696..6c562a06bf0 100644 --- a/src/Formats/ReadSchemaUtils.h +++ b/src/Formats/ReadSchemaUtils.h @@ -56,8 +56,8 @@ struct IReadBufferIterator /// Set auto detected format name. virtual void setFormatName(const String & /*format_name*/) {} - /// Get last processed file name for better exception messages. - virtual String getLastFileName() const { return ""; } + /// Get last processed file path for better exception messages. + virtual String getLastFilePath() const { return ""; } /// Return true if method recreateLastReadBuffer is implemented. virtual bool supportsLastReadBufferRecreation() const { return false; } @@ -122,6 +122,7 @@ ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, + std::string & sample_path, const ContextPtr & context); /// Try to detect the format of the data and it's schema. 
@@ -131,6 +132,7 @@ ColumnsDescription readSchemaFromFormat( std::pair detectFormatAndReadSchema( const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, + std::string & sample_path, const ContextPtr & context); SchemaCache::Key getKeyForSchemaCache(const String & source, const String & format, const std::optional & format_settings, const ContextPtr & context); diff --git a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index 83865c47eb8..5c40cda442b 100644 --- a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -89,8 +89,9 @@ public: { ConfigurationPtr configuration = base_configuration->clone(); configuration->setPaths(metadata->getDataFiles()); + std::string sample_string; return Storage::resolveSchemaFromData( - object_storage_, configuration, format_settings_, local_context); + object_storage_, configuration, format_settings_, sample_string, local_context); } } diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.cpp b/src/Storages/ObjectStorage/ReadBufferIterator.cpp index 78cdc442f64..a47049791ae 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.cpp +++ b/src/Storages/ObjectStorage/ReadBufferIterator.cpp @@ -131,7 +131,7 @@ void ReadBufferIterator::setFormatName(const String & format_name) format = format_name; } -String ReadBufferIterator::getLastFileName() const +String ReadBufferIterator::getLastFilePath() const { if (current_object_info) return current_object_info->getPath(); diff --git a/src/Storages/ObjectStorage/ReadBufferIterator.h b/src/Storages/ObjectStorage/ReadBufferIterator.h index 6eeb52ec2ed..b81aebb7b07 100644 --- a/src/Storages/ObjectStorage/ReadBufferIterator.h +++ b/src/Storages/ObjectStorage/ReadBufferIterator.h @@ -33,7 +33,7 @@ public: void setResultingSchema(const ColumnsDescription & columns) override; - String getLastFileName() const override; + String getLastFilePath() const override; void setFormatName(const String & format_name) override; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index ae7c211330c..717f48983f3 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -33,17 +33,22 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } - -bool checkIfHiveSettingEnabled(const ContextPtr & context, const std::string & storage_type_name) +std::string StorageObjectStorage::getPathSample(StorageInMemoryMetadata metadata, ContextPtr context) { - if (storage_type_name == "s3") - return context->getSettings().s3_hive_partitioning; - else if (storage_type_name == "hdfs") - return context->getSettings().hdfs_hive_partitioning; - else if (storage_type_name == "azure") - return context->getSettings().azure_blob_storage_hive_partitioning; - else - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported storage type: {}", storage_type_name); + auto file_iterator = StorageObjectStorageSource::createFileIterator( + configuration, + object_storage, + distributed_processing, + context, + {}, // predicate + metadata.getColumns().getAll(), // virtual_columns + nullptr, // read_keys + {} // file_progress_callback + ); + + if (auto file = file_iterator->next(0)) + return file->getPath(); + return ""; } StorageObjectStorage::StorageObjectStorage( @@ -66,7 +71,9 @@ StorageObjectStorage::StorageObjectStorage( , log(getLogger(fmt::format("Storage{}({})", 
configuration->getEngineName(), table_id_.getFullTableName()))) { ColumnsDescription columns{columns_}; - resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, format_settings, context); + + std::string sample_path; + resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, format_settings, sample_path, context); configuration->check(context); StorageInMemoryMetadata metadata; @@ -74,23 +81,13 @@ StorageObjectStorage::StorageObjectStorage( metadata.setConstraints(constraints_); metadata.setComment(comment); - auto file_iterator = StorageObjectStorageSource::createFileIterator( - configuration, - object_storage, - distributed_processing_, - context, - {}, // predicate - metadata.getColumns().getAll(), // virtual_columns - nullptr, // read_keys - {} // file_progress_callback - ); + + if (sample_path.empty() && context->getSettings().use_hive_partitioning) + sample_path = getPathSample(metadata, context); + else if (!context->getSettings().use_hive_partitioning) + sample_path = ""; - Strings paths; - - if (checkIfHiveSettingEnabled(context, configuration->getTypeName())) - if (auto file = file_iterator->next(0)) - paths = {file->getPath()}; - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), paths)); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), sample_path)); setInMemoryMetadata(metadata); } @@ -386,33 +383,36 @@ ColumnsDescription StorageObjectStorage::resolveSchemaFromData( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, + std::string & sample_path, const ContextPtr & context) { ObjectInfos read_keys; auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); - return readSchemaFromFormat(configuration->format, format_settings, *iterator, context); + return readSchemaFromFormat(configuration->format, format_settings, *iterator, sample_path, context); } std::string StorageObjectStorage::resolveFormatFromData( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, + std::string & sample_path, const ContextPtr & context) { ObjectInfos read_keys; auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); - return detectFormatAndReadSchema(format_settings, *iterator, context).second; + return detectFormatAndReadSchema(format_settings, *iterator, sample_path, context).second; } std::pair StorageObjectStorage::resolveSchemaAndFormatFromData( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, + std::string & sample_path, const ContextPtr & context) { ObjectInfos read_keys; auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); - auto [columns, format] = detectFormatAndReadSchema(format_settings, *iterator, context); + auto [columns, format] = detectFormatAndReadSchema(format_settings, *iterator, sample_path, context); configuration->format = format; return std::pair(columns, format); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index cf8ec113653..dd7ec31c970 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -42,6 +42,7 @@ public: size_t list_object_keys_size; bool 
throw_on_zero_files_match; bool ignore_non_existent_file; + bool use_hive_partitioning; }; StorageObjectStorage( @@ -100,23 +101,28 @@ public: const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, + std::string & sample_path, const ContextPtr & context); static std::string resolveFormatFromData( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, + std::string & sample_path, const ContextPtr & context); static std::pair resolveSchemaAndFormatFromData( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, const std::optional & format_settings, + std::string & sample_path, const ContextPtr & context); protected: virtual void updateConfiguration(ContextPtr local_context); + std::string getPathSample(StorageInMemoryMetadata metadata, ContextPtr context); + static std::unique_ptr createReadBufferIterator( const ObjectStoragePtr & object_storage, const ConfigurationPtr & configuration, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 78f568d8ae2..0dc4b845a47 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -33,7 +33,8 @@ StorageObjectStorageCluster::StorageObjectStorageCluster( , object_storage(object_storage_) { ColumnsDescription columns{columns_}; - resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, {}, context_); + std::string sample_path; + resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, {}, sample_path, context_); configuration->check(context_); StorageInMemoryMetadata metadata; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index afb23961312..ecb3ff9d856 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -197,16 +197,6 @@ Chunk StorageObjectStorageSource::generate() const auto & filename = object_info->getFileName(); chassert(object_info->metadata); - auto hive_map = VirtualColumnUtils::parsePartitionMapFromPath(object_info->getPath()); - bool contains_virtual_column = std::any_of(hive_map.begin(), hive_map.end(), - [&](const auto& pair) - { - return read_from_format_info.requested_virtual_columns.contains(pair.first); - }); - - if (!contains_virtual_column) - hive_map.clear(); // If we cannot find any virtual column in requested, we don't add any of them to chunk - VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( chunk, read_from_format_info.requested_virtual_columns, { @@ -214,8 +204,7 @@ Chunk StorageObjectStorageSource::generate() .size = object_info->metadata->size_bytes, .filename = &filename, .last_modified = object_info->metadata->last_modified, - .hive_partitioning_map = hive_map - }); + }, object_info->getPath()); return chunk; } diff --git a/src/Storages/ObjectStorage/Utils.cpp b/src/Storages/ObjectStorage/Utils.cpp index e49e14d2a0c..73410d959e0 100644 --- a/src/Storages/ObjectStorage/Utils.cpp +++ b/src/Storages/ObjectStorage/Utils.cpp @@ -49,19 +49,20 @@ void resolveSchemaAndFormat( ObjectStoragePtr object_storage, const StorageObjectStorage::ConfigurationPtr & configuration, std::optional format_settings, + std::string & sample_path, const ContextPtr & context) { if (columns.empty()) 
{ if (format == "auto") std::tie(columns, format) = - StorageObjectStorage::resolveSchemaAndFormatFromData(object_storage, configuration, format_settings, context); + StorageObjectStorage::resolveSchemaAndFormatFromData(object_storage, configuration, format_settings, sample_path, context); else - columns = StorageObjectStorage::resolveSchemaFromData(object_storage, configuration, format_settings, context); + columns = StorageObjectStorage::resolveSchemaFromData(object_storage, configuration, format_settings, sample_path, context); } else if (format == "auto") { - format = StorageObjectStorage::resolveFormatFromData(object_storage, configuration, format_settings, context); + format = StorageObjectStorage::resolveFormatFromData(object_storage, configuration, format_settings, sample_path, context); } if (!columns.hasOnlyOrdinary()) diff --git a/src/Storages/ObjectStorage/Utils.h b/src/Storages/ObjectStorage/Utils.h index 2077999df41..7ee14f50979 100644 --- a/src/Storages/ObjectStorage/Utils.h +++ b/src/Storages/ObjectStorage/Utils.h @@ -19,6 +19,7 @@ void resolveSchemaAndFormat( ObjectStoragePtr object_storage, const StorageObjectStorage::ConfigurationPtr & configuration, std::optional format_settings, + std::string & sample_path, const ContextPtr & context); } diff --git a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp index 95265cde9ea..c12cdddeec7 100644 --- a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp +++ b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp @@ -160,7 +160,8 @@ StorageObjectStorageQueue::StorageObjectStorageQueue( configuration->check(context_); ColumnsDescription columns{columns_}; - resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, format_settings, context_); + std::string sample_path; + resolveSchemaAndFormat(columns, configuration->format, object_storage, configuration, format_settings, sample_path, context_); configuration->check(context_); StorageInMemoryMetadata storage_metadata; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 0c32f29cb34..9751d596fff 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -502,7 +502,7 @@ namespace StorageFile::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } - String getLastFileName() const override + String getLastFilePath() const override { if (current_index != 0) return paths[current_index - 1]; @@ -777,7 +777,7 @@ namespace format = format_name; } - String getLastFileName() const override + String getLastFilePath() const override { return last_read_file_path; } @@ -880,10 +880,11 @@ std::pair StorageFile::getTableStructureAndFormatFro auto read_buffer_iterator = SingleReadBufferIterator(std::move(read_buf)); ColumnsDescription columns; + std::string sample_path; if (format) - columns = readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context); + columns = readSchemaFromFormat(*format, format_settings, read_buffer_iterator, sample_path, context); else - std::tie(columns, format) = detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); + std::tie(columns, format) = detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context); peekable_read_buffer_from_fd = read_buffer_iterator.releaseBuffer(); if (peekable_read_buffer_from_fd) @@ -928,20 +929,21 @@ std::pair StorageFile::getTableStructureAndFormatFro } + std::string sample_path; if (archive_info) { 
ReadBufferFromArchiveIterator read_buffer_iterator(*archive_info, format, format_settings, context); if (format) - return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, sample_path, context), *format}; - return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context); } ReadBufferFromFileIterator read_buffer_iterator(paths, format, compression_method, format_settings, context); if (format) - return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, sample_path, context), *format}; - return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context); } ColumnsDescription StorageFile::getTableStructureFromFile( @@ -1097,10 +1099,10 @@ void StorageFile::setStorageMetadata(CommonArguments args) storage_metadata.setComment(args.comment); setInMemoryMetadata(storage_metadata); - Strings paths_for_virtuals; - if (args.getContext()->getSettingsRef().file_hive_partitioning) - paths_for_virtuals = paths; - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), paths_for_virtuals)); + std::string path_for_virtuals; + if (args.getContext()->getSettingsRef().use_hive_partitioning && !paths.empty()) + path_for_virtuals = paths[0]; + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), path_for_virtuals, format_settings.value_or(FormatSettings{}))); } @@ -1442,14 +1444,9 @@ Chunk StorageFileSource::generate() chunk_size = input_format->getApproxBytesReadForChunk(); progress(num_rows, chunk_size ? chunk_size : chunk.bytes()); - std::map hive_map; - if (getContext()->getSettingsRef().file_hive_partitioning) - { - hive_map = VirtualColumnUtils::parsePartitionMapFromPath(current_path); - - for (const auto& item : hive_map) - requested_virtual_columns.push_back(NameAndTypePair(item.first, std::make_shared())); - } + std::string hive_partitioning_path; + if (getContext()->getSettingsRef().use_hive_partitioning) + hive_partitioning_path = current_path; /// Enrich with virtual columns. VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( @@ -1459,8 +1456,7 @@ Chunk StorageFileSource::generate() .size = current_file_size, .filename = (filename_override.has_value() ? 
&filename_override.value() : nullptr), .last_modified = current_file_last_modified, - .hive_partitioning_map = hive_map - }); + }, hive_partitioning_path); return chunk; } @@ -1636,16 +1632,6 @@ void ReadFromFile::createIterator(const ActionsDAG::Node * predicate) storage->distributed_processing); } -void addPartitionColumnsToInfoHeader(Strings paths, ReadFromFormatInfo & info) -{ - for (const auto& path : paths) - { - auto map = VirtualColumnUtils::parsePartitionMapFromPath(path); - for (const auto& item : map) - info.source_header.insertUnique(ColumnWithTypeAndName(std::make_shared(), item.first)); - } -} - void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) { createIterator(nullptr); @@ -1665,9 +1651,6 @@ void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const Bui paths = storage->paths; } - if (getContext()->getSettingsRef().file_hive_partitioning) - addPartitionColumnsToInfoHeader(paths, info); - if (max_num_streams > files_to_read) num_streams = files_to_read; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index f6374701fc2..59c5465a381 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -153,10 +153,10 @@ IStorageURLBase::IStorageURLBase( storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); - Strings uri_for_partitioning; - if (context_->getSettingsRef().url_hive_partitioning) - uri_for_partitioning = {uri}; - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), uri_for_partitioning)); + std::string uri_for_partitioning; + if (context_->getSettingsRef().use_hive_partitioning) + uri_for_partitioning = uri; + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), uri_for_partitioning, format_settings.value_or(FormatSettings{}))); } @@ -415,9 +415,9 @@ Chunk StorageURLSource::generate() size_t chunk_size = 0; if (input_format) chunk_size = input_format->getApproxBytesReadForChunk(); - std::map hive_map; - if (getContext()->getSettingsRef().url_hive_partitioning) - hive_map = VirtualColumnUtils::parsePartitionMapFromPath(curr_uri.getPath()); + std::string hive_partitioning_path; + if (getContext()->getSettingsRef().use_hive_partitioning) + hive_partitioning_path = curr_uri.getPath(); progress(num_rows, chunk_size ? 
chunk_size : chunk.bytes()); VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( @@ -425,8 +425,7 @@ Chunk StorageURLSource::generate() { .path = curr_uri.getPath(), .size = current_file_size, - .hive_partitioning_map = hive_map - }); + }, hive_partitioning_path); return chunk; } @@ -859,7 +858,7 @@ namespace format = format_name; } - String getLastFileName() const override { return current_url_option; } + String getLastFilePath() const override { return current_url_option; } bool supportsLastReadBufferRecreation() const override { return true; } @@ -960,9 +959,10 @@ std::pair IStorageURLBase::getTableStructureAndForma urls_to_check = {uri}; ReadBufferIterator read_buffer_iterator(urls_to_check, format, compression_method, headers, format_settings, context); + std::string sample_path; if (format) - return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; - return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, sample_path, context), *format}; + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context); } ColumnsDescription IStorageURLBase::getTableStructureFromData( diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 0b79e3b7a16..379b14d8e51 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include "Functions/FunctionsLogical.h" #include "Functions/IFunction.h" #include "Functions/IFunctionAdaptors.h" @@ -115,22 +116,19 @@ NameSet getVirtualNamesForFileLikeStorage() return {"_path", "_file", "_size", "_time"}; } -Strings parseVirtualColumnNameFromPath(const std::string & path) +std::map parseFromPath(const std::string& path) { std::string pattern = "/([^/]+)=([^/]+)"; - // Map to store the key-value pairs - std::map key_values; - re2::StringPiece input_piece(path); - std::string key; - Strings result; - while (RE2::FindAndConsume(&input_piece, pattern, &key)) - result.push_back(key); - return result; + std::map key_values; + std::string key, value; + while (RE2::FindAndConsume(&input_piece, pattern, &key, &value)) + key_values["_" + key] = value; + return key_values; } -VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, Strings paths) +VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, std::string path, FormatSettings settings) { VirtualColumnsDescription desc; @@ -147,11 +145,13 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription add_virtual("_size", makeNullable(std::make_shared())); add_virtual("_time", makeNullable(std::make_shared())); - for (const auto& path : paths) + auto map = parseFromPath(path); + for (const auto& item : map) { - auto names = parseVirtualColumnNameFromPath(path); - for (const auto& name : names) - add_virtual("_" + name, std::make_shared(std::make_shared())); + auto type = tryInferDataTypeForSingleField(item.second, settings); + if (type == nullptr) + type = std::make_shared(); + add_virtual(item.first, std::make_shared(type)); } return desc; @@ -213,25 +213,11 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const return block.getByName("_idx").column; } -std::map parsePartitionMapFromPath(const std::string & path) -{ - std::string pattern = "/([^/]+)=([^/]+)"; // Regex to capture 
key=value pairs - // Map to store the key-value pairs - std::map key_values; - - re2::StringPiece input_piece(path); - std::string key; - std::string value; - while (RE2::FindAndConsume(&input_piece, pattern, &key, &value)) - key_values["_" + key] = value; - - return key_values; -} - void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, - VirtualsForFileLikeStorage virtual_values) + VirtualsForFileLikeStorage virtual_values, const std::string & hive_partitioning_path) { + auto hive_map = parseFromPath(hive_partitioning_path); for (const auto & virtual_column : requested_virtual_columns) { if (virtual_column.name == "_path") @@ -265,13 +251,22 @@ void addRequestedFileLikeStorageVirtualsToChunk( else chunk.addColumn(virtual_column.type->createColumnConstWithDefaultValue(chunk.getNumRows())->convertToFullColumnIfConst()); } - else + else if (!hive_map.empty()) { - auto it = virtual_values.hive_partitioning_map.find(virtual_column.getNameInStorage()); - if (it != virtual_values.hive_partitioning_map.end()) + bool contains_virtual_column = std::any_of(hive_map.begin(), hive_map.end(), + [&](const auto& pair) + { + return requested_virtual_columns.contains(pair.first); + }); + + if (!contains_virtual_column) + hive_map.clear(); // If we cannot find any virtual column in requested, we don't add any of them to chunk + + auto it = hive_map.find(virtual_column.getNameInStorage()); + if (it != hive_map.end()) { chunk.addColumn(virtual_column.getTypeInStorage()->createColumnConst(chunk.getNumRows(), it->second)->convertToFullColumnIfConst()); - virtual_values.hive_partitioning_map.erase(it); + hive_map.erase(it); } } } diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index f9b49cc48ed..72922be60bd 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -49,7 +50,7 @@ auto extractSingleValueFromBlock(const Block & block, const String & name) } NameSet getVirtualNamesForFileLikeStorage(); -VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, Strings paths = {}); +VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, std::string path = "", FormatSettings settings = FormatSettings()); ActionsDAGPtr createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns); @@ -76,15 +77,13 @@ struct VirtualsForFileLikeStorage std::optional size { std::nullopt }; const String * filename { nullptr }; std::optional last_modified { std::nullopt }; - std::map hive_partitioning_map {}; - }; -std::map parsePartitionMapFromPath(const std::string & path); +std::map parseFromPath(const std::string& path); void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, - VirtualsForFileLikeStorage virtual_values); + VirtualsForFileLikeStorage virtual_values, const std::string & hive_partitioning_path = ""); } } diff --git a/src/TableFunctions/TableFunctionFormat.cpp b/src/TableFunctions/TableFunctionFormat.cpp index ad2a142a140..66152cb0c91 100644 --- a/src/TableFunctions/TableFunctionFormat.cpp +++ b/src/TableFunctions/TableFunctionFormat.cpp @@ -85,9 +85,10 @@ ColumnsDescription TableFunctionFormat::getActualTableStructure(ContextPtr conte if (structure == "auto") { SingleReadBufferIterator read_buffer_iterator(std::make_unique(data)); + 
std::string sample_path; if (format == "auto") - return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, context).first; - return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, context); + return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, sample_path, context).first; + return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, sample_path, context); } return parseColumnsListFromString(structure, context); } @@ -131,11 +132,12 @@ StoragePtr TableFunctionFormat::executeImpl(const ASTPtr & /*ast_function*/, Con String format_name = format; if (structure == "auto") { + std::string sample_path; SingleReadBufferIterator read_buffer_iterator(std::make_unique(data)); if (format_name == "auto") - std::tie(columns, format_name) = detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, context); + std::tie(columns, format_name) = detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, sample_path, context); else - columns = readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, context); + columns = readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, sample_path, context); } else { diff --git a/src/TableFunctions/TableFunctionObjectStorage.cpp b/src/TableFunctions/TableFunctionObjectStorage.cpp index 550d9cc799b..39392a4c44c 100644 --- a/src/TableFunctions/TableFunctionObjectStorage.cpp +++ b/src/TableFunctions/TableFunctionObjectStorage.cpp @@ -84,7 +84,8 @@ ColumnsDescription TableFunctionObjectStorage< context->checkAccess(getSourceAccessType()); ColumnsDescription columns; auto storage = getObjectStorage(context, !is_insert_query); - resolveSchemaAndFormat(columns, configuration->format, storage, configuration, std::nullopt, context); + std::string sample_path; + resolveSchemaAndFormat(columns, configuration->format, storage, configuration, std::nullopt, sample_path, context); return columns; } else diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/__init__.py b/tests/integration/test_hive_style_partitioning_hdfs_azure/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_azure.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_azure.xml deleted file mode 100644 index ffa4673c9ee..00000000000 --- a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_azure.xml +++ /dev/null @@ -1,39 +0,0 @@ - - - - - - node_0 - 9000 - - - node_1 - 9000 - - - node_2 - 9000 - - - - - - - - node_0 - 9000 - - - - - node_1 - 19000 - - - - - - - simple_cluster - - \ No newline at end of file diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_hdfs.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_hdfs.xml deleted file mode 100644 index b99b21ea40b..00000000000 --- a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/cluster_hdfs.xml +++ /dev/null @@ -1,33 +0,0 @@ - - - - - - node1 - 9000 - - - - - node1 - 19000 - - - - - - - - 127.0.0.1 - 9000 - - - - - 127.0.0.2 - 9000 - - - - - diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/disable_profilers_azure.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/disable_profilers_azure.xml deleted file mode 100644 index a39badbf8ec..00000000000 --- a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/disable_profilers_azure.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - 0 - 
0 - - - diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/macro_hdfs.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/macro_hdfs.xml deleted file mode 100644 index c2e11b47a5e..00000000000 --- a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/macro_hdfs.xml +++ /dev/null @@ -1,5 +0,0 @@ - - - test_cluster_two_shards - - \ No newline at end of file diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/named_collections_azure.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/named_collections_azure.xml deleted file mode 100644 index bd7f9ff97f1..00000000000 --- a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/named_collections_azure.xml +++ /dev/null @@ -1,14 +0,0 @@ - - - - cont - test_simple_write_named.csv - key UInt64, data String - CSV - - - devstoreaccount1 - Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw== - - - diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_azure.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_azure.xml deleted file mode 100644 index e2168ecd06d..00000000000 --- a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_azure.xml +++ /dev/null @@ -1,3 +0,0 @@ - - 2 - \ No newline at end of file diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_hdfs.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_hdfs.xml deleted file mode 100644 index 37639649b5f..00000000000 --- a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/schema_cache_hdfs.xml +++ /dev/null @@ -1,3 +0,0 @@ - - 2 - \ No newline at end of file diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/users_azure.xml b/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/users_azure.xml deleted file mode 100644 index 4b6ba057ecb..00000000000 --- a/tests/integration/test_hive_style_partitioning_hdfs_azure/configs/users_azure.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - default - 1 - - - diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/test_azure.py b/tests/integration/test_hive_style_partitioning_hdfs_azure/test_azure.py deleted file mode 100644 index 0be697821f0..00000000000 --- a/tests/integration/test_hive_style_partitioning_hdfs_azure/test_azure.py +++ /dev/null @@ -1,219 +0,0 @@ -#!/usr/bin/env python3 - -import pytest -import time - -from helpers.cluster import ClickHouseCluster, is_arm -import re - -from azure.storage.blob import BlobServiceClient -from helpers.cluster import ClickHouseCluster, ClickHouseInstance - -if is_arm(): - pytestmark = pytest.mark.skip - - -@pytest.fixture(scope="module") -def cluster(): - try: - cluster = ClickHouseCluster(__file__) - cluster.add_instance( - "node", - main_configs=[ - "configs/named_collections_azure.xml", - "configs/schema_cache_azure.xml", - ], - user_configs=[ - "configs/disable_profilers_azure.xml", - "configs/users_azure.xml", - ], - with_azurite=True, - ) - cluster.start() - container_client = cluster.blob_service_client.get_container_client("cont") - container_client.create_container() - yield cluster - finally: - cluster.shutdown() - - -def azure_query( - node, query, expect_error=False, try_num=10, settings={}, query_on_retry=None -): - for i in range(try_num): - try: - if expect_error: - return node.query_and_get_error(query, 
settings=settings) - else: - return node.query(query, settings=settings) - except Exception as ex: - retriable_errors = [ - "DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response", - "DB::Exception: Azure::Core::Http::TransportException: Connection closed before getting full response or response is less than expected", - "DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response", - "DB::Exception: Azure::Core::Http::TransportException: Error while polling for socket ready read", - "Azure::Core::Http::TransportException, e.what() = Connection was closed by the server while trying to read a response", - "Azure::Core::Http::TransportException, e.what() = Connection closed before getting full response or response is less than expected", - "Azure::Core::Http::TransportException, e.what() = Connection was closed by the server while trying to read a response", - "Azure::Core::Http::TransportException, e.what() = Error while polling for socket ready read", - ] - retry = False - for error in retriable_errors: - if error in str(ex): - retry = True - print(f"Try num: {i}. Having retriable error: {ex}") - time.sleep(i) - break - if not retry or i == try_num - 1: - raise Exception(ex) - if query_on_retry is not None: - node.query(query_on_retry) - continue - - -def get_azure_file_content(filename, port): - container_name = "cont" - connection_string = ( - f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" - f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" - f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" - ) - blob_service_client = BlobServiceClient.from_connection_string( - str(connection_string) - ) - container_client = blob_service_client.get_container_client(container_name) - blob_client = container_client.get_blob_client(filename) - download_stream = blob_client.download_blob() - return download_stream.readall().decode("utf-8") - - -@pytest.fixture(autouse=True, scope="function") -def delete_all_files(cluster): - port = cluster.env_variables["AZURITE_PORT"] - connection_string = ( - f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" - f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" - f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" - ) - blob_service_client = BlobServiceClient.from_connection_string(connection_string) - containers = blob_service_client.list_containers() - for container in containers: - container_client = blob_service_client.get_container_client(container) - blob_list = container_client.list_blobs() - for blob in blob_list: - print(blob) - blob_client = container_client.get_blob_client(blob) - blob_client.delete_blob() - - assert len(list(container_client.list_blobs())) == 0 - - yield - - -def test_azure_partitioning_with_one_parameter(cluster): - # type: (ClickHouseCluster) -> None - node = cluster.instances["node"] # type: ClickHouseInstance - table_format = "column1 String, column2 String" - values = f"('Elizabeth', 'Gordon')" - path = "a/column1=Elizabeth/sample.csv" - - azure_query( - node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}", - ) - - query = ( - f"SELECT column1, 
column2, _file, _path, _column1 FROM azureBlobStorage(azure_conf2, " - f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " - f"blob_path='{path}', format='CSV', structure='{table_format}')" - ) - assert azure_query( - node, query, settings={"azure_blob_storage_hive_partitioning": 1} - ).splitlines() == [ - "Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth".format( - bucket="cont", max_path=path - ) - ] - - query = ( - f"SELECT column2 FROM azureBlobStorage(azure_conf2, " - f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " - f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" - ) - assert azure_query( - node, query, settings={"azure_blob_storage_hive_partitioning": 1} - ).splitlines() == ["Gordon"] - - -def test_azure_partitioning_with_two_parameters(cluster): - # type: (ClickHouseCluster) -> None - node = cluster.instances["node"] # type: ClickHouseInstance - table_format = "column1 String, column2 String" - values_1 = f"('Elizabeth', 'Gordon')" - values_2 = f"('Emilia', 'Gregor')" - path = "a/column1=Elizabeth/column2=Gordon/sample.csv" - - azure_query( - node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}", - ) - - query = ( - f"SELECT column1, column2, _file, _path, _column1, _column2 FROM azureBlobStorage(azure_conf2, " - f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " - f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" - ) - assert azure_query( - node, query, settings={"azure_blob_storage_hive_partitioning": 1} - ).splitlines() == [ - "Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth\tGordon".format( - bucket="cont", max_path=path - ) - ] - - query = ( - f"SELECT column1 FROM azureBlobStorage(azure_conf2, " - f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " - f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2;" - ) - assert azure_query( - node, query, settings={"azure_blob_storage_hive_partitioning": 1} - ).splitlines() == ["Elizabeth"] - - query = ( - f"SELECT column1 FROM azureBlobStorage(azure_conf2, " - f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " - f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2 AND column1=_column1;" - ) - assert azure_query( - node, query, settings={"azure_blob_storage_hive_partitioning": 1} - ).splitlines() == ["Elizabeth"] - - -def test_azure_partitioning_without_setting(cluster): - # type: (ClickHouseCluster) -> None - node = cluster.instances["node"] # type: ClickHouseInstance - table_format = "column1 String, column2 String" - values_1 = f"('Elizabeth', 'Gordon')" - values_2 = f"('Emilia', 'Gregor')" - path = "a/column1=Elizabeth/column2=Gordon/sample.csv" - - azure_query( - node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}", - ) - - query = ( - f"SELECT column1, 
column2, _file, _path, _column1, _column2 FROM azureBlobStorage(azure_conf2, " - f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " - f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" - ) - pattern = re.compile( - r"DB::Exception: Unknown expression identifier '.*' in scope.*", re.DOTALL - ) - - with pytest.raises(Exception, match=pattern): - azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 0}) diff --git a/tests/integration/test_hive_style_partitioning_hdfs_azure/test_hdfs.py b/tests/integration/test_hive_style_partitioning_hdfs_azure/test_hdfs.py deleted file mode 100644 index 4667d18688a..00000000000 --- a/tests/integration/test_hive_style_partitioning_hdfs_azure/test_hdfs.py +++ /dev/null @@ -1,87 +0,0 @@ -#!/usr/bin/env python3 - -import pytest - -from helpers.client import QueryRuntimeException -from helpers.cluster import ClickHouseCluster, is_arm -import re - -from helpers.cluster import ClickHouseCluster - -if is_arm(): - pytestmark = pytest.mark.skip - -cluster = ClickHouseCluster(__file__) -node1 = cluster.add_instance( - "node1", - main_configs=[ - "configs/macro_hdfs.xml", - "configs/schema_cache_hdfs.xml", - "configs/cluster_hdfs.xml", - ], - with_hdfs=True, -) - - -@pytest.fixture(scope="module") -def started_cluster(): - try: - cluster.start() - yield cluster - finally: - cluster.shutdown() - - -def test_hdfs_partitioning_with_one_parameter(started_cluster): - hdfs_api = started_cluster.hdfs_api - hdfs_api.write_data(f"/column0=Elizabeth/parquet_1", f"Elizabeth\tGordon\n") - assert hdfs_api.read_data(f"/column0=Elizabeth/parquet_1") == f"Elizabeth\tGordon\n" - - r = node1.query( - "SELECT _column0 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/parquet_1', 'TSV')", - settings={"hdfs_hive_partitioning": 1}, - ) - assert r == f"Elizabeth\n" - - -def test_hdfs_partitioning_with_two_parameters(started_cluster): - hdfs_api = started_cluster.hdfs_api - hdfs_api.write_data( - f"/column0=Elizabeth/column1=Gordon/parquet_2", f"Elizabeth\tGordon\n" - ) - assert ( - hdfs_api.read_data(f"/column0=Elizabeth/column1=Gordon/parquet_2") - == f"Elizabeth\tGordon\n" - ) - - r = node1.query( - "SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');", - settings={"hdfs_hive_partitioning": 1}, - ) - assert r == f"Gordon\n" - - -def test_hdfs_partitioning_without_setting(started_cluster): - hdfs_api = started_cluster.hdfs_api - hdfs_api.write_data( - f"/column0=Elizabeth/column1=Gordon/parquet_2", f"Elizabeth\tGordon\n" - ) - assert ( - hdfs_api.read_data(f"/column0=Elizabeth/column1=Gordon/parquet_2") - == f"Elizabeth\tGordon\n" - ) - pattern = re.compile( - r"DB::Exception: Unknown expression identifier '.*' in scope.*", re.DOTALL - ) - - with pytest.raises(QueryRuntimeException, match=pattern): - node1.query( - f"SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');", - settings={"hdfs_hive_partitioning": 0}, - ) - - -if __name__ == "__main__": - cluster.start() - input("Cluster created, press any key to destroy...") - cluster.shutdown() diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 20b004a7605..893df6d23aa 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -5,6 +5,7 @@ import json import logging import os import io 
+import re import random import threading import time @@ -1462,3 +1463,112 @@ def test_insert_create_new_file(cluster): assert TSV(res) == TSV( "test_create_new_file.csv\t1\ntest_create_new_file.1.csv\t2\n" ) + + +def test_hive_partitioning_with_one_parameter(cluster): + # type: (ClickHouseCluster) -> None + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 String, column2 String" + values = f"('Elizabeth', 'Gordon')" + path = "a/column1=Elizabeth/sample.csv" + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}", + ) + + query = ( + f"SELECT column1, column2, _file, _path, _column1 FROM azureBlobStorage(azure_conf2, " + f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " + f"blob_path='{path}', format='CSV', structure='{table_format}')" + ) + assert azure_query( + node, query, settings={"azure_blob_storage_hive_partitioning": 1} + ).splitlines() == [ + "Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth".format( + bucket="cont", max_path=path + ) + ] + + query = ( + f"SELECT column2 FROM azureBlobStorage(azure_conf2, " + f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " + f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" + ) + assert azure_query( + node, query, settings={"azure_blob_storage_hive_partitioning": 1} + ).splitlines() == ["Gordon"] + + +def test_hive_partitioning_with_two_parameters(cluster): + # type: (ClickHouseCluster) -> None + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 String, column2 String" + values_1 = f"('Elizabeth', 'Gordon')" + values_2 = f"('Emilia', 'Gregor')" + path = "a/column1=Elizabeth/column2=Gordon/sample.csv" + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}", + ) + + query = ( + f"SELECT column1, column2, _file, _path, _column1, _column2 FROM azureBlobStorage(azure_conf2, " + f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " + f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" + ) + assert azure_query( + node, query, settings={"azure_blob_storage_hive_partitioning": 1} + ).splitlines() == [ + "Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth\tGordon".format( + bucket="cont", max_path=path + ) + ] + + query = ( + f"SELECT column1 FROM azureBlobStorage(azure_conf2, " + f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " + f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2;" + ) + assert azure_query( + node, query, settings={"azure_blob_storage_hive_partitioning": 1} + ).splitlines() == ["Elizabeth"] + + query = ( + f"SELECT column1 FROM azureBlobStorage(azure_conf2, " + f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " + f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2 AND 
column1=_column1;" + ) + assert azure_query( + node, query, settings={"azure_blob_storage_hive_partitioning": 1} + ).splitlines() == ["Elizabeth"] + + +def test_hive_partitioning_without_setting(cluster): + # type: (ClickHouseCluster) -> None + node = cluster.instances["node"] # type: ClickHouseInstance + table_format = "column1 String, column2 String" + values_1 = f"('Elizabeth', 'Gordon')" + values_2 = f"('Emilia', 'Gregor')" + path = "a/column1=Elizabeth/column2=Gordon/sample.csv" + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}", + ) + + query = ( + f"SELECT column1, column2, _file, _path, _column1, _column2 FROM azureBlobStorage(azure_conf2, " + f"storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', container='cont', " + f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" + ) + pattern = re.compile( + r"DB::Exception: Unknown expression identifier '.*' in scope.*", re.DOTALL + ) + + with pytest.raises(Exception, match=pattern): + azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 0}) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 47d8f44c0b7..8071b520a4f 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -3,6 +3,7 @@ import os import pytest import time from helpers.cluster import ClickHouseCluster, is_arm +from helpers.client import QueryRuntimeException from helpers.test_tools import TSV from pyhdfs import HdfsClient @@ -1180,6 +1181,54 @@ def test_respect_object_existence_on_partitioned_write(started_cluster): assert int(result) == 44 +def test_hive_partitioning_with_one_parameter(started_cluster): + hdfs_api = started_cluster.hdfs_api + hdfs_api.write_data(f"/column0=Elizabeth/parquet_1", f"Elizabeth\tGordon\n") + assert hdfs_api.read_data(f"/column0=Elizabeth/parquet_1") == f"Elizabeth\tGordon\n" + + r = node1.query( + "SELECT _column0 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/parquet_1', 'TSV')", + settings={"hdfs_hive_partitioning": 1}, + ) + assert r == f"Elizabeth\n" + + +def test_hive_partitioning_with_two_parameters(started_cluster): + hdfs_api = started_cluster.hdfs_api + hdfs_api.write_data( + f"/column0=Elizabeth/column1=Gordon/parquet_2", f"Elizabeth\tGordon\n" + ) + assert ( + hdfs_api.read_data(f"/column0=Elizabeth/column1=Gordon/parquet_2") + == f"Elizabeth\tGordon\n" + ) + + r = node1.query( + "SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');", + settings={"hdfs_hive_partitioning": 1}, + ) + assert r == f"Gordon\n" + + +def test_hive_partitioning_without_setting(started_cluster): + hdfs_api = started_cluster.hdfs_api + hdfs_api.write_data( + f"/column0=Elizabeth/column1=Gordon/parquet_2", f"Elizabeth\tGordon\n" + ) + assert ( + hdfs_api.read_data(f"/column0=Elizabeth/column1=Gordon/parquet_2") + == f"Elizabeth\tGordon\n" + ) + pattern = re.compile( + r"DB::Exception: Unknown expression identifier '.*' in scope.*", re.DOTALL + ) + + with pytest.raises(QueryRuntimeException, match=pattern): + node1.query( + f"SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');", + settings={"hdfs_hive_partitioning": 0}, + ) + if 
__name__ == "__main__": cluster.start() input("Cluster created, press any key to destroy...") From 3fb50aa5d8560f3cd5f4fc9b35bfa47a60d2ca80 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Fri, 5 Jul 2024 13:28:47 +0000 Subject: [PATCH 023/142] style fix --- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 1 - tests/integration/test_storage_hdfs/test.py | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 717f48983f3..4b5b514e67d 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -81,7 +81,6 @@ StorageObjectStorage::StorageObjectStorage( metadata.setConstraints(constraints_); metadata.setComment(comment); - if (sample_path.empty() && context->getSettings().use_hive_partitioning) sample_path = getPathSample(metadata, context); else if (!context->getSettings().use_hive_partitioning) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 8071b520a4f..da46756841d 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -1229,6 +1229,7 @@ def test_hive_partitioning_without_setting(started_cluster): settings={"hdfs_hive_partitioning": 0}, ) + if __name__ == "__main__": cluster.start() input("Cluster created, press any key to destroy...") From 6c4c17f119fb95d66f894a92ce8c91fa6664ff5b Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 5 Jul 2024 16:44:26 +0200 Subject: [PATCH 024/142] remove use_hive_partitioning from query settings --- src/Storages/ObjectStorage/StorageObjectStorage.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 3fbfc3aacd7..f97d2620fe5 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -43,7 +43,6 @@ public: size_t list_object_keys_size; bool throw_on_zero_files_match; bool ignore_non_existent_file; - bool use_hive_partitioning; }; StorageObjectStorage( From 9064bb1b8389e301f45bc78a5365665292f51c6e Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 5 Jul 2024 19:36:59 +0200 Subject: [PATCH 025/142] fix settings in tests --- .../test_storage_azure_blob_storage/test.py | 12 ++++++------ tests/integration/test_storage_hdfs/test.py | 6 +++--- .../0_stateless/03203_hive_style_partitioning.sh | 12 ++++++------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index f2a1f9e35a9..6966abfee4f 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -1484,7 +1484,7 @@ def test_hive_partitioning_with_one_parameter(cluster): f"blob_path='{path}', format='CSV', structure='{table_format}')" ) assert azure_query( - node, query, settings={"azure_blob_storage_hive_partitioning": 1} + node, query, settings={"use_hive_partitioning": 1} ).splitlines() == [ "Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth".format( bucket="cont", max_path=path @@ -1497,7 +1497,7 @@ def test_hive_partitioning_with_one_parameter(cluster): f"blob_path='{path}', format='CSV', 
structure='{table_format}') WHERE column1=_column1;" ) assert azure_query( - node, query, settings={"azure_blob_storage_hive_partitioning": 1} + node, query, settings={"use_hive_partitioning": 1} ).splitlines() == ["Gordon"] @@ -1521,7 +1521,7 @@ def test_hive_partitioning_with_two_parameters(cluster): f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column1=_column1;" ) assert azure_query( - node, query, settings={"azure_blob_storage_hive_partitioning": 1} + node, query, settings={"use_hive_partitioning": 1} ).splitlines() == [ "Elizabeth\tGordon\tsample.csv\t{bucket}/{max_path}\tElizabeth\tGordon".format( bucket="cont", max_path=path @@ -1534,7 +1534,7 @@ def test_hive_partitioning_with_two_parameters(cluster): f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2;" ) assert azure_query( - node, query, settings={"azure_blob_storage_hive_partitioning": 1} + node, query, settings={"use_hive_partitioning": 1} ).splitlines() == ["Elizabeth"] query = ( @@ -1543,7 +1543,7 @@ def test_hive_partitioning_with_two_parameters(cluster): f"blob_path='{path}', format='CSV', structure='{table_format}') WHERE column2=_column2 AND column1=_column1;" ) assert azure_query( - node, query, settings={"azure_blob_storage_hive_partitioning": 1} + node, query, settings={"use_hive_partitioning": 1} ).splitlines() == ["Elizabeth"] @@ -1571,4 +1571,4 @@ def test_hive_partitioning_without_setting(cluster): ) with pytest.raises(Exception, match=pattern): - azure_query(node, query, settings={"azure_blob_storage_hive_partitioning": 0}) + azure_query(node, query, settings={"use_hive_partitioning": 0}) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index da46756841d..aa3efb8ba4a 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -1188,7 +1188,7 @@ def test_hive_partitioning_with_one_parameter(started_cluster): r = node1.query( "SELECT _column0 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/parquet_1', 'TSV')", - settings={"hdfs_hive_partitioning": 1}, + settings={"use_hive_partitioning": 1}, ) assert r == f"Elizabeth\n" @@ -1205,7 +1205,7 @@ def test_hive_partitioning_with_two_parameters(started_cluster): r = node1.query( "SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');", - settings={"hdfs_hive_partitioning": 1}, + settings={"use_hive_partitioning": 1}, ) assert r == f"Gordon\n" @@ -1226,7 +1226,7 @@ def test_hive_partitioning_without_setting(started_cluster): with pytest.raises(QueryRuntimeException, match=pattern): node1.query( f"SELECT _column1 FROM hdfs('hdfs://hdfs1:9000/column0=Elizabeth/column1=Gordon/parquet_2', 'TSV');", - settings={"hdfs_hive_partitioning": 0}, + settings={"use_hive_partitioning": 0}, ) diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index 98c039f3454..544fd17ffff 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -9,7 +9,7 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE FILE HIVE PARTITIONING'" $CLICKHOUSE_LOCAL -n -q """ -set file_hive_partitioning = 1; +set use_hive_partitioning = 1; SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; @@ -31,7 +31,7 @@ SELECT *, _non_existing_column FROM file('$CURDIR/data_hive/partitioning/non_exi SELECT *, _column0 FROM 
file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;""" $CLICKHOUSE_LOCAL -n -q """ -set file_hive_partitioning = 0; +set use_hive_partitioning = 0; SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -c "UNKNOWN_IDENTIFIER" @@ -41,7 +41,7 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE URL PARTITIONING'" $CLICKHOUSE_LOCAL -n -q """ -set url_hive_partitioning = 1; +set use_hive_partitioning = 1; SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; @@ -63,7 +63,7 @@ SELECT *, _non_existing_column FROM url('http://localhost:11111/test/partitionin SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;""" $CLICKHOUSE_LOCAL -n -q """ -set url_hive_partitioning = 0; +set use_hive_partitioning = 0; SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -c "UNKNOWN_IDENTIFIER" @@ -73,7 +73,7 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3 PARTITIONING'" $CLICKHOUSE_LOCAL -n -q """ -set s3_hive_partitioning = 1; +set use_hive_partitioning = 1; SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; @@ -96,7 +96,7 @@ SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=*/s """ $CLICKHOUSE_LOCAL -n -q """ -set s3_hive_partitioning = 0; +set use_hive_partitioning = 0; SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -c "UNKNOWN_IDENTIFIER" From 6e5e680797f5b2147455826e4e223c27be5039a6 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sat, 6 Jul 2024 05:42:21 +0000 Subject: [PATCH 026/142] bump libprotobuf-mutator, fix build --- contrib/libprotobuf-mutator | 2 +- src/AggregateFunctions/fuzzers/CMakeLists.txt | 2 +- ..._function_state_deserialization_fuzzer.cpp | 24 +------------------ src/Core/fuzzers/CMakeLists.txt | 2 +- src/Core/fuzzers/names_and_types_fuzzer.cpp | 22 ----------------- src/DataTypes/fuzzers/CMakeLists.txt | 2 +- .../data_type_deserialization_fuzzer.cpp | 22 ----------------- src/Formats/fuzzers/CMakeLists.txt | 2 +- src/Formats/fuzzers/format_fuzzer.cpp | 21 ---------------- src/Storages/fuzzers/CMakeLists.txt | 2 +- .../fuzzers/columns_description_fuzzer.cpp | 21 ---------------- 11 files changed, 7 insertions(+), 115 deletions(-) diff --git a/contrib/libprotobuf-mutator b/contrib/libprotobuf-mutator index a304ec48dcf..b922c8ab900 160000 --- a/contrib/libprotobuf-mutator +++ b/contrib/libprotobuf-mutator @@ -1 +1 @@ -Subproject commit a304ec48dcf15d942607032151f7e9ee504b5dcf +Subproject commit b922c8ab9004ef9944982e4f165e2747b13223fa diff --git a/src/AggregateFunctions/fuzzers/CMakeLists.txt b/src/AggregateFunctions/fuzzers/CMakeLists.txt index 907a275b4b3..1ce0c52feb7 100644 --- a/src/AggregateFunctions/fuzzers/CMakeLists.txt +++ b/src/AggregateFunctions/fuzzers/CMakeLists.txt @@ -1,2 +1,2 @@ clickhouse_add_executable(aggregate_function_state_deserialization_fuzzer aggregate_function_state_deserialization_fuzzer.cpp ${SRCS}) -target_link_libraries(aggregate_function_state_deserialization_fuzzer PRIVATE dbms clickhouse_aggregate_functions) +target_link_libraries(aggregate_function_state_deserialization_fuzzer PRIVATE clickhouse_functions clickhouse_aggregate_functions) diff --git 
a/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp b/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp index a956d9906bc..31fc93e4288 100644 --- a/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp +++ b/src/AggregateFunctions/fuzzers/aggregate_function_state_deserialization_fuzzer.cpp @@ -12,33 +12,11 @@ #include -#include - +#include #include #include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - -class IFunctionBase; -using FunctionBasePtr = std::shared_ptr; - -FunctionBasePtr createFunctionBaseCast( - ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) -{ - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for aggregate_function_state_deserialization_fuzzer"); -} - -} - - extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) { try diff --git a/src/Core/fuzzers/CMakeLists.txt b/src/Core/fuzzers/CMakeLists.txt index 51db6fa0b53..61d6b9629eb 100644 --- a/src/Core/fuzzers/CMakeLists.txt +++ b/src/Core/fuzzers/CMakeLists.txt @@ -1,2 +1,2 @@ clickhouse_add_executable (names_and_types_fuzzer names_and_types_fuzzer.cpp) -target_link_libraries (names_and_types_fuzzer PRIVATE dbms) +target_link_libraries (names_and_types_fuzzer PRIVATE clickhouse_functions) diff --git a/src/Core/fuzzers/names_and_types_fuzzer.cpp b/src/Core/fuzzers/names_and_types_fuzzer.cpp index 74debedf2a3..6fdd8703014 100644 --- a/src/Core/fuzzers/names_and_types_fuzzer.cpp +++ b/src/Core/fuzzers/names_and_types_fuzzer.cpp @@ -1,29 +1,7 @@ -#include -#include #include #include -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - -class IFunctionBase; -using FunctionBasePtr = std::shared_ptr; - -FunctionBasePtr createFunctionBaseCast( - ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) -{ - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for names_and_types_fuzzer"); -} - -} - - extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) { try diff --git a/src/DataTypes/fuzzers/CMakeLists.txt b/src/DataTypes/fuzzers/CMakeLists.txt index 939bf5f5e3f..e54ef0a860c 100644 --- a/src/DataTypes/fuzzers/CMakeLists.txt +++ b/src/DataTypes/fuzzers/CMakeLists.txt @@ -1,2 +1,2 @@ clickhouse_add_executable(data_type_deserialization_fuzzer data_type_deserialization_fuzzer.cpp ${SRCS}) -target_link_libraries(data_type_deserialization_fuzzer PRIVATE dbms clickhouse_aggregate_functions) +target_link_libraries(data_type_deserialization_fuzzer PRIVATE clickhouse_functions clickhouse_aggregate_functions) diff --git a/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp b/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp index 7d9a0513d18..0ae325871fb 100644 --- a/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp +++ b/src/DataTypes/fuzzers/data_type_deserialization_fuzzer.cpp @@ -8,33 +8,11 @@ #include #include -#include - #include #include -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - -class IFunctionBase; -using FunctionBasePtr = std::shared_ptr; - -FunctionBasePtr createFunctionBaseCast( - ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) -{ - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for 
data_type_deserialization_fuzzer"); -} - -} - - extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) { try diff --git a/src/Formats/fuzzers/CMakeLists.txt b/src/Formats/fuzzers/CMakeLists.txt index 38009aeec1d..b8a7e78b6e2 100644 --- a/src/Formats/fuzzers/CMakeLists.txt +++ b/src/Formats/fuzzers/CMakeLists.txt @@ -1,2 +1,2 @@ clickhouse_add_executable(format_fuzzer format_fuzzer.cpp ${SRCS}) -target_link_libraries(format_fuzzer PRIVATE dbms clickhouse_aggregate_functions) +target_link_libraries(format_fuzzer PRIVATE clickhouse_functions clickhouse_aggregate_functions) diff --git a/src/Formats/fuzzers/format_fuzzer.cpp b/src/Formats/fuzzers/format_fuzzer.cpp index 2c1ec65e54d..27f7d7b292f 100644 --- a/src/Formats/fuzzers/format_fuzzer.cpp +++ b/src/Formats/fuzzers/format_fuzzer.cpp @@ -3,7 +3,6 @@ #include #include -#include #include #include @@ -21,26 +20,6 @@ #include -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - -class IFunctionBase; -using FunctionBasePtr = std::shared_ptr; - -FunctionBasePtr createFunctionBaseCast( - ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) -{ - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for format_fuzzer"); -} - -} - - extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) { try diff --git a/src/Storages/fuzzers/CMakeLists.txt b/src/Storages/fuzzers/CMakeLists.txt index 719b9b77cd9..7bee2da2e26 100644 --- a/src/Storages/fuzzers/CMakeLists.txt +++ b/src/Storages/fuzzers/CMakeLists.txt @@ -4,4 +4,4 @@ clickhouse_add_executable (mergetree_checksum_fuzzer mergetree_checksum_fuzzer.c target_link_libraries (mergetree_checksum_fuzzer PRIVATE dbms) clickhouse_add_executable (columns_description_fuzzer columns_description_fuzzer.cpp) -target_link_libraries (columns_description_fuzzer PRIVATE dbms) +target_link_libraries (columns_description_fuzzer PRIVATE clickhouse_functions) diff --git a/src/Storages/fuzzers/columns_description_fuzzer.cpp b/src/Storages/fuzzers/columns_description_fuzzer.cpp index ac285ea50f7..e10e0cc52f5 100644 --- a/src/Storages/fuzzers/columns_description_fuzzer.cpp +++ b/src/Storages/fuzzers/columns_description_fuzzer.cpp @@ -1,28 +1,7 @@ -#include #include #include -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - -class IFunctionBase; -using FunctionBasePtr = std::shared_ptr; - -FunctionBasePtr createFunctionBaseCast( - ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) -{ - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for columns_description_fuzzer"); -} - -} - - extern "C" int LLVMFuzzerTestOneInput(const uint8_t * data, size_t size) { try From c13371166c9a9545155c9274d36c3b91a1ade5cb Mon Sep 17 00:00:00 2001 From: yariks5s Date: Sun, 7 Jul 2024 22:25:15 +0000 Subject: [PATCH 027/142] add import re to tests --- tests/integration/test_storage_hdfs/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index da46756841d..9a166cba2ab 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -2,6 +2,7 @@ import os import pytest import time +import re from helpers.cluster import ClickHouseCluster, is_arm from helpers.client import QueryRuntimeException from helpers.test_tools import TSV From 
a3c4cbfce257f171e66e04598ac9eae548c3836f Mon Sep 17 00:00:00 2001 From: yariks5s Date: Mon, 8 Jul 2024 13:57:54 +0000 Subject: [PATCH 028/142] clang-tidy, fix cp with minio --- docker/test/stateless/setup_minio.sh | 2 +- .../03203_hive_style_partitioning.sh | 52 +++++++++---------- utils/keeper-bench/Runner.cpp | 3 +- 3 files changed, 29 insertions(+), 28 deletions(-) diff --git a/docker/test/stateless/setup_minio.sh b/docker/test/stateless/setup_minio.sh index 2b9433edd20..aacb9d88a45 100755 --- a/docker/test/stateless/setup_minio.sh +++ b/docker/test/stateless/setup_minio.sh @@ -101,7 +101,7 @@ upload_data() { # shellcheck disable=SC2045 for file in $(ls "${data_path}"); do echo "${file}"; - ./mc cp "${data_path}"/"${file}" clickminio/test/"${file}"; + ./mc cp --recursive "${data_path}"/"${file}" clickminio/test/"${file}"; done } diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index 544fd17ffff..334bfef4f02 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -43,29 +43,29 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE URL PARTITIONING'" $CLICKHOUSE_LOCAL -n -q """ set use_hive_partitioning = 1; -SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; -SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0; +SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0; -SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; -SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; -SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; -SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; -SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; -SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0, _column1 FROM 
url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; -SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; -SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; -SELECT *, _non_existing_column FROM url('http://localhost:11111/test/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; -SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;""" +SELECT *, _non_existing_column FROM url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=*/sample.parquet') WHERE column0 = _column0;""" $CLICKHOUSE_LOCAL -n -q """ set use_hive_partitioning = 0; -SELECT *, _column0 FROM url('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -c "UNKNOWN_IDENTIFIER" @@ -75,28 +75,28 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3 PARTITIONING'" $CLICKHOUSE_LOCAL -n -q """ set use_hive_partitioning = 1; -SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; -SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0; +SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0; -SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; -SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; -SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; -SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0, _column1 FROM 
s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; -SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; -SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column1 = _column1; -SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; -SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0, _column1 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; -SELECT *, _non_existing_column FROM s3('http://localhost:11111/test/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; -SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=*/sample.parquet') WHERE column0 = _column0; +SELECT *, _non_existing_column FROM s3('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=*/sample.parquet') WHERE column0 = _column0; """ $CLICKHOUSE_LOCAL -n -q """ set use_hive_partitioning = 0; -SELECT *, _column0 FROM s3('http://localhost:11111/test/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -c "UNKNOWN_IDENTIFIER" diff --git a/utils/keeper-bench/Runner.cpp b/utils/keeper-bench/Runner.cpp index 587e015b340..f8a0e37d1a9 100644 --- a/utils/keeper-bench/Runner.cpp +++ b/utils/keeper-bench/Runner.cpp @@ -544,7 +544,8 @@ struct ZooKeeperRequestFromLogReader file_read_buf = DB::wrapReadBufferWithCompressionMethod(std::move(file_read_buf), compression_method); DB::SingleReadBufferIterator read_buffer_iterator(std::move(file_read_buf)); - auto [columns_description, format] = DB::detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); + std::string sample_path; + auto [columns_description, format] = DB::detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context); DB::ColumnsWithTypeAndName columns; columns.reserve(columns_description.size()); From 3cc6a133c64861f4493849905950abb5cc1fbaac Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 9 Jul 2024 12:38:16 +0200 Subject: [PATCH 029/142] Update setup_minio.sh --- 
docker/test/stateless/setup_minio.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/test/stateless/setup_minio.sh b/docker/test/stateless/setup_minio.sh index aacb9d88a45..0e344cbb9c4 100755 --- a/docker/test/stateless/setup_minio.sh +++ b/docker/test/stateless/setup_minio.sh @@ -101,7 +101,7 @@ upload_data() { # shellcheck disable=SC2045 for file in $(ls "${data_path}"); do echo "${file}"; - ./mc cp --recursive "${data_path}"/"${file}" clickminio/test/"${file}"; + ./mc cp --recursive "${data_path}"/ clickminio/test/; done } @@ -148,4 +148,4 @@ main() { setup_aws_credentials } -main "$@" \ No newline at end of file +main "$@" From 362bf4befcd55de5f49e2665c7c7f9483a700dc8 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 9 Jul 2024 17:30:08 +0200 Subject: [PATCH 030/142] Update setup_minio.sh --- docker/test/stateless/setup_minio.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/stateless/setup_minio.sh b/docker/test/stateless/setup_minio.sh index 0e344cbb9c4..8bd75f16321 100755 --- a/docker/test/stateless/setup_minio.sh +++ b/docker/test/stateless/setup_minio.sh @@ -101,7 +101,7 @@ upload_data() { # shellcheck disable=SC2045 for file in $(ls "${data_path}"); do echo "${file}"; - ./mc cp --recursive "${data_path}"/ clickminio/test/; + ./mc cp "${data_path}"/"${file}" clickminio/test/"${file}"; done } From 1761102b3a071143b40039ee1e83666ebffa88fb Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 9 Jul 2024 17:31:41 +0200 Subject: [PATCH 031/142] fix path --- .../column1=Gordon/sample.parquet | Bin .../column1=Schmidt/sample.parquet | Bin .../sample.parquet | Bin 3 files changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/data_minio/hive_partitioning/{coumn0=Elizabeth => column0=Elizabeth}/column1=Gordon/sample.parquet (100%) rename tests/queries/0_stateless/data_minio/hive_partitioning/{coumn0=Elizabeth => column0=Elizabeth}/column1=Schmidt/sample.parquet (100%) rename tests/queries/0_stateless/data_minio/hive_partitioning/{coumn0=Elizabeth => column0=Elizabeth}/sample.parquet (100%) diff --git a/tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/column1=Gordon/sample.parquet b/tests/queries/0_stateless/data_minio/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet similarity index 100% rename from tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/column1=Gordon/sample.parquet rename to tests/queries/0_stateless/data_minio/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet diff --git a/tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/column1=Schmidt/sample.parquet b/tests/queries/0_stateless/data_minio/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet similarity index 100% rename from tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/column1=Schmidt/sample.parquet rename to tests/queries/0_stateless/data_minio/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet diff --git a/tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/sample.parquet b/tests/queries/0_stateless/data_minio/hive_partitioning/column0=Elizabeth/sample.parquet similarity index 100% rename from tests/queries/0_stateless/data_minio/hive_partitioning/coumn0=Elizabeth/sample.parquet rename to 
tests/queries/0_stateless/data_minio/hive_partitioning/column0=Elizabeth/sample.parquet From d7f08ffdb74b4fce89eff3133e36a5f50fc4ef0b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 9 Jul 2024 21:01:37 +0200 Subject: [PATCH 032/142] Update setup_minio.sh --- docker/test/stateless/setup_minio.sh | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docker/test/stateless/setup_minio.sh b/docker/test/stateless/setup_minio.sh index 8bd75f16321..49837fdb1ac 100755 --- a/docker/test/stateless/setup_minio.sh +++ b/docker/test/stateless/setup_minio.sh @@ -99,10 +99,7 @@ upload_data() { # iterating over globs will cause redundant file variable to be # a path to a file, not a filename # shellcheck disable=SC2045 - for file in $(ls "${data_path}"); do - echo "${file}"; - ./mc cp "${data_path}"/"${file}" clickminio/test/"${file}"; - done + ./mc cp --recursive "${data_path}"/ clickminio/test/ } setup_aws_credentials() { From 7b58722c07e8feb6acca5f0762411a55b8c58915 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 10 Jul 2024 13:15:44 +0200 Subject: [PATCH 033/142] Update setup_minio.sh --- docker/test/stateless/setup_minio.sh | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/test/stateless/setup_minio.sh b/docker/test/stateless/setup_minio.sh index 49837fdb1ac..02e3d117de2 100755 --- a/docker/test/stateless/setup_minio.sh +++ b/docker/test/stateless/setup_minio.sh @@ -99,7 +99,9 @@ upload_data() { # iterating over globs will cause redundant file variable to be # a path to a file, not a filename # shellcheck disable=SC2045 - ./mc cp --recursive "${data_path}"/ clickminio/test/ + if [ -d "${data_path}" ]; then + ./mc cp --recursive "${data_path}"/ clickminio/test/ + fi } setup_aws_credentials() { From a751719a33e2691426bdb057eaf509a74e84753d Mon Sep 17 00:00:00 2001 From: yariks5s Date: Wed, 10 Jul 2024 17:44:06 +0000 Subject: [PATCH 034/142] fixes due to review --- docs/en/operations/settings/settings.md | 6 +++++ docs/en/sql-reference/table-functions/file.md | 17 ++++++++++++ docs/en/sql-reference/table-functions/hdfs.md | 17 ++++++++++++ docs/en/sql-reference/table-functions/s3.md | 17 ++++++++++++ programs/obfuscator/Obfuscator.cpp | 3 +-- src/Formats/ReadSchemaUtils.cpp | 11 ++------ src/Formats/ReadSchemaUtils.h | 2 -- src/Storages/Hive/StorageHive.cpp | 2 +- .../DataLakes/IStorageDataLake.h | 4 +-- .../ObjectStorage/StorageObjectStorage.cpp | 14 ++++++---- .../StorageObjectStorageCluster.cpp | 2 +- .../StorageObjectStorageSource.cpp | 11 +++++--- .../StorageObjectStorageSource.h | 3 ++- .../StorageObjectStorageQueue.cpp | 2 +- src/Storages/StorageFile.cpp | 20 +++++++------- src/Storages/StorageFileCluster.cpp | 2 +- src/Storages/StorageURL.cpp | 23 +++++++++++----- src/Storages/StorageURLCluster.cpp | 2 +- src/Storages/VirtualColumnUtils.cpp | 25 +++++++++++------- src/Storages/VirtualColumnUtils.h | 9 +++++-- src/TableFunctions/TableFunctionFormat.cpp | 10 +++---- .../03203_hive_style_partitioning.reference | 5 +++- .../03203_hive_style_partitioning.sh | 11 +++++--- .../array=[1,2,3]/float=42.42/sample.parquet | Bin 0 -> 1308 bytes .../number=42/date=2020-01-01/sample.parquet | Bin 0 -> 1308 bytes 25 files changed, 152 insertions(+), 66 deletions(-) create mode 100644 tests/queries/0_stateless/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet create mode 100644 
tests/queries/0_stateless/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 1d74a63b972..e100e0f27f7 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -5591,3 +5591,9 @@ Default value: `10000000`. Minimal size of block to compress in CROSS JOIN. Zero value means - disable this threshold. This block is compressed when any of the two thresholds (by rows or by bytes) are reached. Default value: `1GiB`. + +## use_hive_partitioning + +Allows the usage of Hive-style partitioning in queries. When enabled, ClickHouse interprets and maintains table partitions in a way that is consistent with the Hive partitioning scheme, which is commonly used in Hadoop ecosystems. + +Default value: `0`. diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index 3a3162dad9a..88af3663552 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -198,6 +198,23 @@ SELECT count(*) FROM file('big_dir/**/file002', 'CSV', 'name String, value UInt3 - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. +## Hive-style partitioning {#hive-style-patitioning} + +When setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtual columns due to Hive partitioning style if the path has the specific structure. + +**Example** + +Use virtual column, created with Hive-style partitioning + +``` sql +SET use_hive_patitioning = 1; +SELECT _specified_column from file('/specified_column=specified_data/file.txt'); +``` + +``` reference +specified_data +``` + ## Settings {#settings} - [engine_file_empty_if_not_exists](/docs/en/operations/settings/settings.md#engine-file-empty_if-not-exists) - allows to select empty data from a file that doesn't exist. Disabled by default. diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index 28cba5ccc6a..beb1ad12532 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -99,6 +99,23 @@ FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name Strin - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the size is unknown, the value is `NULL`. - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. +## Hive-style partitioning {#hive-style-patitioning} + +When setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtual columns due to Hive partitioning style if the path has the specific structure. + +**Example** + +Use virtual column, created with Hive-style partitioning + +``` sql +SET use_hive_patitioning = 1; +SELECT _specified_column from HDFS('hdfs://hdfs1:9000/specified_column=specified_data/file.txt'); +``` + +``` reference +specified_data +``` + ## Storage Settings {#storage-settings} - [hdfs_truncate_on_insert](/docs/en/operations/settings/settings.md#hdfs_truncate_on_insert) - allows to truncate file before insert into it. Disabled by default. 
diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 35e5d86034c..45c4caa1a13 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -274,6 +274,23 @@ FROM s3( - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. In case of archive shows uncompressed file size of the file inside the archive. - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. +## Hive-style partitioning {#hive-style-patitioning} + +When setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtual columns due to Hive partitioning style if the path has the specific structure. + +**Example** + +Use virtual column, created with Hive-style partitioning + +``` sql +SET use_hive_patitioning = 1; +SELECT _specified_column from HDFS('hdfs://hdfs1:9000/specified_column=specified_data/file.txt'); +``` + +``` reference +specified_data +``` + ## Storage Settings {#storage-settings} - [s3_truncate_on_insert](/docs/en/operations/settings/settings.md#s3_truncate_on_insert) - allows to truncate file before insert into it. Disabled by default. diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 11e85bc1302..4c3981c1d01 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -1308,8 +1308,7 @@ try SingleReadBufferIterator read_buffer_iterator(std::move(file)); - std::string sample_string; - schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, sample_string, context_const); + schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, context_const); } else { diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 1e70840f91f..1920459c378 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -94,7 +94,6 @@ std::pair readSchemaFromFormatImpl( std::optional format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - std::string & sample_path, const ContextPtr & context) try { @@ -144,10 +143,6 @@ try { iterator_data = read_buffer_iterator.next(); - /// Extracting the File path for hive-style partitioning - if (sample_path.empty()) - sample_path = read_buffer_iterator.getLastFilePath(); - /// Read buffer iterator can determine the data format if it's unknown. /// For example by scanning schema cache or by finding new file with format extension. 
if (!format_name && iterator_data.format_name) @@ -541,19 +536,17 @@ ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - std::string & sample_path, const ContextPtr & context) { - return readSchemaFromFormatImpl(format_name, format_settings, read_buffer_iterator, sample_path, context).first; + return readSchemaFromFormatImpl(format_name, format_settings, read_buffer_iterator, context).first; } std::pair detectFormatAndReadSchema( const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - std::string & sample_path, const ContextPtr & context) { - return readSchemaFromFormatImpl(std::nullopt, format_settings, read_buffer_iterator, sample_path, context); + return readSchemaFromFormatImpl(std::nullopt, format_settings, read_buffer_iterator, context); } SchemaCache::Key getKeyForSchemaCache( diff --git a/src/Formats/ReadSchemaUtils.h b/src/Formats/ReadSchemaUtils.h index 6c562a06bf0..7168e7f0817 100644 --- a/src/Formats/ReadSchemaUtils.h +++ b/src/Formats/ReadSchemaUtils.h @@ -122,7 +122,6 @@ ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - std::string & sample_path, const ContextPtr & context); /// Try to detect the format of the data and it's schema. @@ -132,7 +131,6 @@ ColumnsDescription readSchemaFromFormat( std::pair detectFormatAndReadSchema( const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - std::string & sample_path, const ContextPtr & context); SchemaCache::Key getKeyForSchemaCache(const String & source, const String & format, const std::optional & format_settings, const ContextPtr & context); diff --git a/src/Storages/Hive/StorageHive.cpp b/src/Storages/Hive/StorageHive.cpp index 28d8128e052..255dadea387 100644 --- a/src/Storages/Hive/StorageHive.cpp +++ b/src/Storages/Hive/StorageHive.cpp @@ -445,7 +445,7 @@ StorageHive::StorageHive( storage_metadata.partition_key = KeyDescription::getKeyFromAST(partition_by_ast, storage_metadata.columns, getContext()); setInMemoryMetadata(storage_metadata); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns())); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), getContext())); } void StorageHive::lazyInitialize() diff --git a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h index a651be6017f..ec8e740b1c9 100644 --- a/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h +++ b/src/Storages/ObjectStorage/DataLakes/IStorageDataLake.h @@ -89,9 +89,9 @@ public: { ConfigurationPtr configuration = base_configuration->clone(); configuration->setPaths(metadata->getDataFiles()); - std::string sample_string; + std::string sample_path; return Storage::resolveSchemaFromData( - object_storage_, configuration, format_settings_, sample_string, local_context); + object_storage_, configuration, format_settings_, sample_path, local_context); } } diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 8ab8b6b6881..48e9118e321 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -43,7 +43,8 @@ std::string StorageObjectStorage::getPathSample(StorageInMemoryMetadata metadata {}, // predicate 
metadata.getColumns().getAll(), // virtual_columns nullptr, // read_keys - {} // file_progress_callback + {}, // file_progress_callback + true // override_settings_for_hive_partitioning ); if (auto file = file_iterator->next(0)) @@ -86,7 +87,7 @@ StorageObjectStorage::StorageObjectStorage( else if (!context->getSettings().use_hive_partitioning) sample_path = ""; - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), sample_path)); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context, sample_path)); setInMemoryMetadata(metadata); } @@ -396,7 +397,8 @@ ColumnsDescription StorageObjectStorage::resolveSchemaFromData( { ObjectInfos read_keys; auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); - return readSchemaFromFormat(configuration->format, format_settings, *iterator, sample_path, context); + sample_path = iterator->getLastFilePath(); + return readSchemaFromFormat(configuration->format, format_settings, *iterator, context); } std::string StorageObjectStorage::resolveFormatFromData( @@ -408,7 +410,8 @@ std::string StorageObjectStorage::resolveFormatFromData( { ObjectInfos read_keys; auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); - return detectFormatAndReadSchema(format_settings, *iterator, sample_path, context).second; + sample_path = iterator->getLastFilePath(); + return detectFormatAndReadSchema(format_settings, *iterator, context).second; } std::pair StorageObjectStorage::resolveSchemaAndFormatFromData( @@ -420,7 +423,8 @@ std::pair StorageObjectStorage::resolveSchemaAn { ObjectInfos read_keys; auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); - auto [columns, format] = detectFormatAndReadSchema(format_settings, *iterator, sample_path, context); + sample_path = iterator->getLastFilePath(); + auto [columns, format] = detectFormatAndReadSchema(format_settings, *iterator, context); configuration->format = format; return std::pair(columns, format); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 0dc4b845a47..92327b4cde0 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -41,7 +41,7 @@ StorageObjectStorageCluster::StorageObjectStorageCluster( metadata.setColumns(columns); metadata.setConstraints(constraints_); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns())); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context_)); setInMemoryMetadata(metadata); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 88ae0a2319c..e5c9318de5d 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -105,7 +105,8 @@ std::shared_ptr StorageObjectStorageSourc const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, - std::function file_progress_callback) + std::function file_progress_callback, + bool override_settings_for_hive_partitioning) { if (distributed_processing) return std::make_shared( @@ -122,11 +123,14 @@ std::shared_ptr StorageObjectStorageSourc std::unique_ptr iterator; if 
(configuration->isPathWithGlobs()) { + bool throw_on_zero_files_match = settings.throw_on_zero_files_match; + if (override_settings_for_hive_partitioning) + throw_on_zero_files_match = false; /// Iterate through disclosed globs and make a source for each file iterator = std::make_unique( object_storage, configuration, predicate, virtual_columns, local_context, is_archive ? nullptr : read_keys, settings.list_object_keys_size, - settings.throw_on_zero_files_match, file_progress_callback); + throw_on_zero_files_match, file_progress_callback); } else { @@ -204,7 +208,8 @@ Chunk StorageObjectStorageSource::generate() .size = object_info->isArchive() ? object_info->fileSizeInArchive() : object_info->metadata->size_bytes, .filename = &filename, .last_modified = object_info->metadata->last_modified, - }, object_info->getPath()); + .hive_partitioning_path = object_info->getPath(), + }); const auto & partition_columns = configuration->getPartitionColumns(); if (!partition_columns.empty() && chunk_size && chunk.hasColumns()) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 271b38fa75c..a99bb068372 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -58,7 +58,8 @@ public: const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, - std::function file_progress_callback = {}); + std::function file_progress_callback = {}, + bool override_settings_for_hive_partitioning = false); static std::string getUniqueStoragePathIdentifier( const Configuration & configuration, diff --git a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp index 98f8cdc7e7a..a9239e3ad06 100644 --- a/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp +++ b/src/Storages/ObjectStorageQueue/StorageObjectStorageQueue.cpp @@ -168,7 +168,7 @@ StorageObjectStorageQueue::StorageObjectStorageQueue( storage_metadata.setColumns(columns); storage_metadata.setConstraints(constraints_); storage_metadata.setComment(comment); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns())); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context_)); setInMemoryMetadata(storage_metadata); LOG_INFO(log, "Using zookeeper path: {}", zk_path.string()); diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 9751d596fff..e6b9137444e 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -52,6 +52,7 @@ #include #include #include +#include "Formats/FormatSettings.h" #include #include @@ -880,11 +881,10 @@ std::pair StorageFile::getTableStructureAndFormatFro auto read_buffer_iterator = SingleReadBufferIterator(std::move(read_buf)); ColumnsDescription columns; - std::string sample_path; if (format) - columns = readSchemaFromFormat(*format, format_settings, read_buffer_iterator, sample_path, context); + columns = readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context); else - std::tie(columns, format) = detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context); + std::tie(columns, format) = detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); peekable_read_buffer_from_fd = read_buffer_iterator.releaseBuffer(); if (peekable_read_buffer_from_fd) @@ -929,21 +929,20 @@ std::pair 
StorageFile::getTableStructureAndFormatFro } - std::string sample_path; if (archive_info) { ReadBufferFromArchiveIterator read_buffer_iterator(*archive_info, format, format_settings, context); if (format) - return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, sample_path, context), *format}; + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; - return detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context); + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); } ReadBufferFromFileIterator read_buffer_iterator(paths, format, compression_method, format_settings, context); if (format) - return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, sample_path, context), *format}; + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; - return detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context); + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); } ColumnsDescription StorageFile::getTableStructureFromFile( @@ -1102,7 +1101,7 @@ void StorageFile::setStorageMetadata(CommonArguments args) std::string path_for_virtuals; if (args.getContext()->getSettingsRef().use_hive_partitioning && !paths.empty()) path_for_virtuals = paths[0]; - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), path_for_virtuals, format_settings.value_or(FormatSettings{}))); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), args.getContext(), path_for_virtuals, format_settings.value_or(FormatSettings{}))); } @@ -1456,7 +1455,8 @@ Chunk StorageFileSource::generate() .size = current_file_size, .filename = (filename_override.has_value() ? 
&filename_override.value() : nullptr), .last_modified = current_file_last_modified, - }, hive_partitioning_path); + .hive_partitioning_path = hive_partitioning_path, + }); return chunk; } diff --git a/src/Storages/StorageFileCluster.cpp b/src/Storages/StorageFileCluster.cpp index d43e242f70c..f7684182e79 100644 --- a/src/Storages/StorageFileCluster.cpp +++ b/src/Storages/StorageFileCluster.cpp @@ -61,7 +61,7 @@ StorageFileCluster::StorageFileCluster( storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns())); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context)); } void StorageFileCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 59c5465a381..5da42638b87 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -99,6 +99,17 @@ static ConnectionTimeouts getHTTPTimeouts(ContextPtr context) return ConnectionTimeouts::getHTTPTimeouts(context->getSettingsRef(), context->getServerSettings().keep_alive_timeout); } +String getSampleURI(String uri, ContextPtr context) +{ + if (urlWithGlobs(uri)) + { + auto uris = parseRemoteDescription(uri, 0, uri.size(), ',', context->getSettingsRef().glob_expansion_max_elements); + if (!uris.empty()) + return uris[0]; + } + return uri; +} + IStorageURLBase::IStorageURLBase( const String & uri_, const ContextPtr & context_, @@ -155,8 +166,8 @@ IStorageURLBase::IStorageURLBase( std::string uri_for_partitioning; if (context_->getSettingsRef().use_hive_partitioning) - uri_for_partitioning = uri; - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), uri_for_partitioning, format_settings.value_or(FormatSettings{}))); + uri_for_partitioning = getSampleURI(uri, context_); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context_, uri_for_partitioning, format_settings.value_or(FormatSettings{}))); } @@ -425,7 +436,8 @@ Chunk StorageURLSource::generate() { .path = curr_uri.getPath(), .size = current_file_size, - }, hive_partitioning_path); + .hive_partitioning_path = hive_partitioning_path, + }); return chunk; } @@ -959,10 +971,9 @@ std::pair IStorageURLBase::getTableStructureAndForma urls_to_check = {uri}; ReadBufferIterator read_buffer_iterator(urls_to_check, format, compression_method, headers, format_settings, context); - std::string sample_path; if (format) - return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, sample_path, context), *format}; - return detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context); + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); } ColumnsDescription IStorageURLBase::getTableStructureFromData( diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index 2e7c63d0097..592bd71f546 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -75,7 +75,7 @@ StorageURLCluster::StorageURLCluster( storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); - 
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns())); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context)); } void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 379b14d8e51..b7669c65992 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -39,10 +39,13 @@ #include #include #include +#include +#include #include "Functions/FunctionsLogical.h" #include "Functions/IFunction.h" #include "Functions/IFunctionAdaptors.h" #include "Functions/indexHint.h" +#include #include #include #include @@ -116,7 +119,7 @@ NameSet getVirtualNamesForFileLikeStorage() return {"_path", "_file", "_size", "_time"}; } -std::map parseFromPath(const std::string& path) +std::map parseHivePartitioningKeysAndValues(const std::string& path) { std::string pattern = "/([^/]+)=([^/]+)"; re2::StringPiece input_piece(path); @@ -128,7 +131,7 @@ std::map parseFromPath(const std::string& path) return key_values; } -VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, std::string path, FormatSettings settings) +VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, const ContextPtr & context, std::string path, std::optional format_settings_) { VirtualColumnsDescription desc; @@ -145,13 +148,17 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription add_virtual("_size", makeNullable(std::make_shared())); add_virtual("_time", makeNullable(std::make_shared())); - auto map = parseFromPath(path); - for (const auto& item : map) + auto map = parseHivePartitioningKeysAndValues(path); + for (auto& item : map) { - auto type = tryInferDataTypeForSingleField(item.second, settings); + auto format_settings = format_settings_ ? 
*format_settings_ : getFormatSettings(context); + auto type = tryInferDataTypeByEscapingRule(item.second, format_settings, FormatSettings::EscapingRule::Raw); if (type == nullptr) type = std::make_shared(); - add_virtual(item.first, std::make_shared(type)); + if (type->canBeInsideLowCardinality()) + add_virtual(item.first, std::make_shared(type)); + else + add_virtual(item.first, type); } return desc; @@ -215,9 +222,9 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, - VirtualsForFileLikeStorage virtual_values, const std::string & hive_partitioning_path) + VirtualsForFileLikeStorage virtual_values) { - auto hive_map = parseFromPath(hive_partitioning_path); + auto hive_map = parseHivePartitioningKeysAndValues(virtual_values.hive_partitioning_path); for (const auto & virtual_column : requested_virtual_columns) { if (virtual_column.name == "_path") @@ -265,7 +272,7 @@ void addRequestedFileLikeStorageVirtualsToChunk( auto it = hive_map.find(virtual_column.getNameInStorage()); if (it != hive_map.end()) { - chunk.addColumn(virtual_column.getTypeInStorage()->createColumnConst(chunk.getNumRows(), it->second)->convertToFullColumnIfConst()); + chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), convertFieldToType(Field(it->second), *virtual_column.type))->convertToFullColumnIfConst()); hive_map.erase(it); } } diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index 72922be60bd..594253a32c1 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -50,7 +50,11 @@ auto extractSingleValueFromBlock(const Block & block, const String & name) } NameSet getVirtualNamesForFileLikeStorage(); -VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, std::string path = "", FormatSettings settings = FormatSettings()); +VirtualColumnsDescription getVirtualsForFileLikeStorage( + const ColumnsDescription & storage_columns, + const ContextPtr & context, + std::string sample_path = "", + std::optional format_settings_ = std::nullopt); ActionsDAGPtr createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns); @@ -77,13 +81,14 @@ struct VirtualsForFileLikeStorage std::optional size { std::nullopt }; const String * filename { nullptr }; std::optional last_modified { std::nullopt }; + const String & hive_partitioning_path = ""; }; std::map parseFromPath(const std::string& path); void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, - VirtualsForFileLikeStorage virtual_values, const std::string & hive_partitioning_path = ""); + VirtualsForFileLikeStorage virtual_values); } } diff --git a/src/TableFunctions/TableFunctionFormat.cpp b/src/TableFunctions/TableFunctionFormat.cpp index 66152cb0c91..ad2a142a140 100644 --- a/src/TableFunctions/TableFunctionFormat.cpp +++ b/src/TableFunctions/TableFunctionFormat.cpp @@ -85,10 +85,9 @@ ColumnsDescription TableFunctionFormat::getActualTableStructure(ContextPtr conte if (structure == "auto") { SingleReadBufferIterator read_buffer_iterator(std::make_unique(data)); - std::string sample_path; if (format == "auto") - return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, sample_path, context).first; - return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, sample_path, context); + 
return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, context).first; + return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, context); } return parseColumnsListFromString(structure, context); } @@ -132,12 +131,11 @@ StoragePtr TableFunctionFormat::executeImpl(const ASTPtr & /*ast_function*/, Con String format_name = format; if (structure == "auto") { - std::string sample_path; SingleReadBufferIterator read_buffer_iterator(std::make_unique(data)); if (format_name == "auto") - std::tie(columns, format_name) = detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, sample_path, context); + std::tie(columns, format_name) = detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, context); else - columns = readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, sample_path, context); + columns = readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, context); } else { diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference index 6ef1fcdf652..e0f46caf1c8 100644 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.reference +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference @@ -60,7 +60,10 @@ Stanley Gibson Elizabeth Eugenia Greer Elizabeth Jeffery Delgado Elizabeth Clara Cross Elizabeth -Elizabeth Gordon Elizabeth +42 2020-01-01 +[1,2,3] 42.42 +Array(Int64) LowCardinality(Float64) +101 1 TESTING THE S3 PARTITIONING first last Elizabeth diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index 334bfef4f02..9d805b39b8a 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -28,7 +28,13 @@ SELECT *, _column0, _column1 FROM file('$CURDIR/data_hive/partitioning/column0=E SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; SELECT *, _non_existing_column FROM file('$CURDIR/data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; -SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = _column0;""" +SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = _column0; + +SELECT _number, _date FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') LIMIT 1; +SELECT _array, _float FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1; +SELECT toTypeName(_array), toTypeName(_float) FROM file('$CURDIR/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet') LIMIT 1; +SELECT count(*) FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') WHERE _number = 42; +""" $CLICKHOUSE_LOCAL -n -q """ set use_hive_partitioning = 0; @@ -59,8 +65,7 @@ SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/colum SELECT *, _column0, _column1 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Gordon/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; -SELECT *, _non_existing_column FROM 
url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10; -SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=*/sample.parquet') WHERE column0 = _column0;""" +SELECT *, _non_existing_column FROM url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;""" $CLICKHOUSE_LOCAL -n -q """ set use_hive_partitioning = 0; diff --git a/tests/queries/0_stateless/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/array=[1,2,3]/float=42.42/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? 
literal 0 HcmV?d00001 From fa65e374dcb66c2e927f52ea521e9a8586feef65 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 11 Jul 2024 11:57:33 +0200 Subject: [PATCH 035/142] fix docs --- docs/en/sql-reference/table-functions/file.md | 4 ++-- docs/en/sql-reference/table-functions/hdfs.md | 4 ++-- docs/en/sql-reference/table-functions/s3.md | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index d21f523ab8e..838a7ab61de 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -206,7 +206,7 @@ SELECT count(*) FROM file('big_dir/**/file002', 'CSV', 'name String, value UInt3 - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. -## Hive-style partitioning {#hive-style-patitioning} +## Hive-style partitioning {#hive-style-partitioning} When setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtual columns due to Hive partitioning style if the path has the specific structure. @@ -215,7 +215,7 @@ When setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtu Use virtual column, created with Hive-style partitioning ``` sql -SET use_hive_patitioning = 1; +SET use_hive_partitioning = 1; SELECT _specified_column from file('/specified_column=specified_data/file.txt'); ``` diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index beb1ad12532..fc84c431066 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -99,7 +99,7 @@ FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name Strin - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the size is unknown, the value is `NULL`. - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. -## Hive-style partitioning {#hive-style-patitioning} +## Hive-style partitioning {#hive-style-partitioning} When setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtual columns due to Hive partitioning style if the path has the specific structure. @@ -108,7 +108,7 @@ When setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtu Use virtual column, created with Hive-style partitioning ``` sql -SET use_hive_patitioning = 1; +SET use_hive_partitioning = 1; SELECT _specified_column from HDFS('hdfs://hdfs1:9000/specified_column=specified_data/file.txt'); ``` diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 45c4caa1a13..15074a77475 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -274,7 +274,7 @@ FROM s3( - `_size` — Size of the file in bytes. Type: `Nullable(UInt64)`. If the file size is unknown, the value is `NULL`. In case of archive shows uncompressed file size of the file inside the archive. - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. 
-## Hive-style partitioning {#hive-style-patitioning} +## Hive-style partitioning {#hive-style-partitioning} When setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtual columns due to Hive partitioning style if the path has the specific structure. @@ -283,7 +283,7 @@ When setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtu Use virtual column, created with Hive-style partitioning ``` sql -SET use_hive_patitioning = 1; +SET use_hive_partitioning = 1; SELECT _specified_column from HDFS('hdfs://hdfs1:9000/specified_column=specified_data/file.txt'); ``` From df104abcc60366df3946a23c52c2e67a92dcb545 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 11 Jul 2024 17:20:19 +0200 Subject: [PATCH 036/142] try to fix tests --- .../03203_hive_style_partitioning.reference | 8 ++++---- .../03203_hive_style_partitioning.sh | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference index e0f46caf1c8..0e6b6052946 100644 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.reference +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference @@ -29,6 +29,10 @@ Eugenia Greer Elizabeth Jeffery Delgado Elizabeth Clara Cross Elizabeth Elizabeth Gordon Elizabeth +42 2020-01-01 +[1,2,3] 42.42 +Array(Int64) LowCardinality(Float64) +101 1 TESTING THE URL PARTITIONING first last Elizabeth @@ -60,10 +64,6 @@ Stanley Gibson Elizabeth Eugenia Greer Elizabeth Jeffery Delgado Elizabeth Clara Cross Elizabeth -42 2020-01-01 -[1,2,3] 42.42 -Array(Int64) LowCardinality(Float64) -101 1 TESTING THE S3 PARTITIONING first last Elizabeth diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index 9d805b39b8a..e74f24bfd80 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -5,10 +5,10 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh -$CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE FILE HIVE PARTITIONING'" +$CLICKHOUSE_CLIENT -q "SELECT 'TESTING THE FILE HIVE PARTITIONING'" -$CLICKHOUSE_LOCAL -n -q """ +$CLICKHOUSE_CLIENT -n -q """ set use_hive_partitioning = 1; SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; @@ -36,17 +36,17 @@ SELECT toTypeName(_array), toTypeName(_float) FROM file('$CURDIR/data_hive/parti SELECT count(*) FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') WHERE _number = 42; """ -$CLICKHOUSE_LOCAL -n -q """ +$CLICKHOUSE_CLIENT -n -q """ set use_hive_partitioning = 0; SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -c "UNKNOWN_IDENTIFIER" -$CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE URL PARTITIONING'" +$CLICKHOUSE_CLIENT -q "SELECT 'TESTING THE URL PARTITIONING'" -$CLICKHOUSE_LOCAL -n -q """ +$CLICKHOUSE_CLIENT -n -q """ set use_hive_partitioning = 1; SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; @@ -67,17 +67,17 @@ SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/colum SELECT *, _non_existing_column FROM url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;""" -$CLICKHOUSE_LOCAL -n -q """ +$CLICKHOUSE_CLIENT -n -q """ set use_hive_partitioning = 0; SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -c "UNKNOWN_IDENTIFIER" -$CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3 PARTITIONING'" +$CLICKHOUSE_CLIENT -q "SELECT 'TESTING THE S3 PARTITIONING'" -$CLICKHOUSE_LOCAL -n -q """ +$CLICKHOUSE_CLIENT -n -q """ set use_hive_partitioning = 1; SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; @@ -100,7 +100,7 @@ SELECT *, _non_existing_column FROM s3('http://localhost:11111/test/hive_partiti SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=*/sample.parquet') WHERE column0 = _column0; """ -$CLICKHOUSE_LOCAL -n -q """ +$CLICKHOUSE_CLIENT -n -q """ set use_hive_partitioning = 0; SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; From 10dd4a9fe66914899ec5e5c89e5b9cc24096f64c Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 11 Jul 2024 21:16:47 +0200 Subject: [PATCH 037/142] debugging tests --- .../0_stateless/03203_hive_style_partitioning.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index e74f24bfd80..14b4a116596 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -5,10 +5,10 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh -$CLICKHOUSE_CLIENT -q "SELECT 'TESTING THE FILE HIVE PARTITIONING'" +$CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE FILE HIVE PARTITIONING'" -$CLICKHOUSE_CLIENT -n -q """ +$CLICKHOUSE_LOCAL -n -q """ set use_hive_partitioning = 1; SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; @@ -36,17 +36,17 @@ SELECT toTypeName(_array), toTypeName(_float) FROM file('$CURDIR/data_hive/parti SELECT count(*) FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') WHERE _number = 42; """ -$CLICKHOUSE_CLIENT -n -q """ +$CLICKHOUSE_LOCAL -n -q """ set use_hive_partitioning = 0; SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -c "UNKNOWN_IDENTIFIER" -$CLICKHOUSE_CLIENT -q "SELECT 'TESTING THE URL PARTITIONING'" +$CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE URL PARTITIONING'" -$CLICKHOUSE_CLIENT -n -q """ +$CLICKHOUSE_LOCAL -n -q """ set use_hive_partitioning = 1; SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; @@ -67,14 +67,14 @@ SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/colum SELECT *, _non_existing_column FROM url('http://localhost:11111/test/hive_partitioning/non_existing_column=Elizabeth/sample.parquet') LIMIT 10;""" -$CLICKHOUSE_CLIENT -n -q """ +$CLICKHOUSE_LOCAL -n -q """ set use_hive_partitioning = 0; SELECT *, _column0 FROM url('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -c "UNKNOWN_IDENTIFIER" -$CLICKHOUSE_CLIENT -q "SELECT 'TESTING THE S3 PARTITIONING'" +$CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3 PARTITIONING'" $CLICKHOUSE_CLIENT -n -q """ From 119777cd7346a32f45588d069bf6a89efe091867 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 12 Jul 2024 12:03:25 +0200 Subject: [PATCH 038/142] update reference --- .../0_stateless/03203_hive_style_partitioning.reference | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference index 0e6b6052946..d187f4cdd2c 100644 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.reference +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference @@ -29,9 +29,9 @@ Eugenia Greer Elizabeth Jeffery Delgado Elizabeth Clara Cross Elizabeth Elizabeth Gordon Elizabeth -42 2020-01-01 -[1,2,3] 42.42 -Array(Int64) LowCardinality(Float64) +42 2020-01-01 +[1,2,3] 42.42 +Array(Int64) LowCardinality(Float64) 101 1 TESTING THE URL PARTITIONING From 0988e1deadf34736f126e9eb3a2162d3abbe1314 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 12 Jul 2024 14:59:43 +0200 Subject: [PATCH 039/142] update tests --- .../queries/0_stateless/03203_hive_style_partitioning.reference | 2 +- tests/queries/0_stateless/03203_hive_style_partitioning.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference index d187f4cdd2c..be43048dd01 100644 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.reference +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference @@ -96,4 +96,4 @@ Eugenia Greer Elizabeth 
Jeffery Delgado Elizabeth Clara Cross Elizabeth Elizabeth Gordon Elizabeth -1 +OK diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index 14b4a116596..58a74a3ca8f 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -104,4 +104,4 @@ $CLICKHOUSE_CLIENT -n -q """ set use_hive_partitioning = 0; SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; -""" 2>&1 | grep -c "UNKNOWN_IDENTIFIER" +""" 2>&1 | grep -F -q "UNKNOWN_IDENTIFIER" && echo "OK" || echo "FAIL"; From 0fc14520c821f22b493d32657fede6be10832d60 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sat, 13 Jul 2024 23:06:37 +0000 Subject: [PATCH 040/142] add server termination on exit --- programs/server/fuzzers/tcp_protocol_fuzzer.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/programs/server/fuzzers/tcp_protocol_fuzzer.cpp b/programs/server/fuzzers/tcp_protocol_fuzzer.cpp index 950ea09669a..7cebdc2ad65 100644 --- a/programs/server/fuzzers/tcp_protocol_fuzzer.cpp +++ b/programs/server/fuzzers/tcp_protocol_fuzzer.cpp @@ -10,6 +10,7 @@ #include #include +#include #include @@ -25,6 +26,12 @@ static int64_t port = 9000; using namespace std::chrono_literals; +void on_exit() +{ + BaseDaemon::terminate(); + main_app.wait(); +} + extern "C" int LLVMFuzzerInitialize(int * argc, char ***argv) { @@ -60,6 +67,8 @@ int LLVMFuzzerInitialize(int * argc, char ***argv) exit(-1); } + atexit(on_exit); + return 0; } From 3ccc2aed4c76eba20e0fc88768412bbfacafbb95 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sat, 13 Jul 2024 23:44:13 +0000 Subject: [PATCH 041/142] add fuzzer_arguments to fuzzer runner --- docker/test/libfuzzer/run_libfuzzer.py | 7 +++++++ tests/fuzz/tcp_protocol_fuzzer.options | 4 ++++ 2 files changed, 11 insertions(+) create mode 100644 tests/fuzz/tcp_protocol_fuzzer.options diff --git a/docker/test/libfuzzer/run_libfuzzer.py b/docker/test/libfuzzer/run_libfuzzer.py index 5ed019490d5..cdd09dfa3be 100755 --- a/docker/test/libfuzzer/run_libfuzzer.py +++ b/docker/test/libfuzzer/run_libfuzzer.py @@ -20,6 +20,7 @@ def run_fuzzer(fuzzer: str): options_file = f"{fuzzer}.options" custom_libfuzzer_options = "" + fuzzer_arguments = "" with Path(options_file) as path: if path.exists() and path.is_file(): @@ -47,6 +48,12 @@ def run_fuzzer(fuzzer: str): for key, value in parser["libfuzzer"].items() ) + if parser.has_section("fuzzer_arguments"): + fuzzer_arguments = " ".join( + ("%s" % key) if value == "" else ("%s=%s" % (key, value)) + for key, value in parser["fuzzer_arguments"].items() + ) + cmd_line = f"{DEBUGGER} ./{fuzzer} {FUZZER_ARGS} {corpus_dir}" if custom_libfuzzer_options: cmd_line += f" {custom_libfuzzer_options}" diff --git a/tests/fuzz/tcp_protocol_fuzzer.options b/tests/fuzz/tcp_protocol_fuzzer.options new file mode 100644 index 00000000000..4885669d91d --- /dev/null +++ b/tests/fuzz/tcp_protocol_fuzzer.options @@ -0,0 +1,4 @@ +[fuzzer_arguments] +--log-file=tcp_protocol_fuzzer.log +--= +--logging.terminal=0 From 7841a8b401d0ddb3f9bce9e5dc03049a65a1067a Mon Sep 17 00:00:00 2001 From: yariks5s Date: Mon, 15 Jul 2024 16:27:38 +0000 Subject: [PATCH 042/142] fixes after review --- docs/en/operations/settings/settings.md | 2 +- .../table-functions/azureBlobStorage.md | 13 ++++ docs/en/sql-reference/table-functions/file.md | 8 +-- docs/en/sql-reference/table-functions/hdfs.md | 4 +- 
docs/en/sql-reference/table-functions/s3.md | 8 +-- docs/en/sql-reference/table-functions/url.md | 13 ++++ .../ObjectStorage/StorageObjectStorage.cpp | 16 +++-- .../StorageObjectStorageCluster.cpp | 4 +- .../StorageObjectStorageSource.cpp | 17 ++--- .../StorageObjectStorageSource.h | 4 +- src/Storages/StorageFile.cpp | 20 ++---- src/Storages/StorageURL.cpp | 11 +-- src/Storages/VirtualColumnUtils.cpp | 64 +++++++++--------- src/Storages/VirtualColumnUtils.h | 7 +- .../03203_hive_style_partitioning.reference | 5 ++ .../03203_hive_style_partitioning.sh | 13 ++++ .../column0=Elizabeth/sample.parquet | Bin 0 -> 1308 bytes .../partitioning/identifier=2070/email.csv | 5 ++ 18 files changed, 114 insertions(+), 100 deletions(-) create mode 100644 tests/queries/0_stateless/data_hive/partitioning/column0=Elizabeth/column0=Elizabeth/sample.parquet create mode 100644 tests/queries/0_stateless/data_hive/partitioning/identifier=2070/email.csv diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 84912b4574f..fb076e76bdd 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -5611,6 +5611,6 @@ Default value: `1GiB`. ## use_hive_partitioning -Allows the usage of Hive-style partitioning in queries. When enabled, ClickHouse interprets and maintains table partitions in a way that is consistent with the Hive partitioning scheme, which is commonly used in Hadoop ecosystems. +When enabled, ClickHouse will detect Hive-style partitioning in path (`/name=value/`) in file-like table engines [File](../../engines/table-engines/special/file.md#hive-style-partitioning)/[S3](../../engines/table-engines/integrations/s3.md#hive-style-partitioning)/[URL](../../engines/table-engines/special/url.md#hive-style-partitioning)/[HDFS](../../engines/table-engines/integrations/hdfs.md#hive-style-partitioning)/[AzureBlobStorage](../../engines/table-engines/integrations/azureBlobStorage.md#hive-style-partitioning) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. Default value: `0`. diff --git a/docs/en/sql-reference/table-functions/azureBlobStorage.md b/docs/en/sql-reference/table-functions/azureBlobStorage.md index f59fedeb3a2..104ac4e26df 100644 --- a/docs/en/sql-reference/table-functions/azureBlobStorage.md +++ b/docs/en/sql-reference/table-functions/azureBlobStorage.md @@ -77,3 +77,16 @@ SELECT count(*) FROM azureBlobStorage('DefaultEndpointsProtocol=https;AccountNam **See Also** - [AzureBlobStorage Table Engine](/docs/en/engines/table-engines/integrations/azureBlobStorage.md) + +## Hive-style partitioning {#hive-style-partitioning} + +When setting `use_hive_partitioning` is set to 1, ClickHouse will detect Hive-style partitioning in the path (`/name=value/`) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. 
+ +**Example** + +Use virtual column, created with Hive-style partitioning + +``` sql +SET use_hive_partitioning = 1; +SELECT * from azureBlobStorage(config, storage_account_url='...', container='...', blob_path='http://data/path/date=*/country=*/code=*/*.parquet', format='Parquet', structure='Date DateTime64, Country String, Code UInt64') where _date > '2020-01-01' and _country = 'Netherlands' and code = 42; +``` diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index 838a7ab61de..0669609a22a 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -208,7 +208,7 @@ SELECT count(*) FROM file('big_dir/**/file002', 'CSV', 'name String, value UInt3 ## Hive-style partitioning {#hive-style-partitioning} -When setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtual columns due to Hive partitioning style if the path has the specific structure. +When setting `use_hive_partitioning` is set to 1, ClickHouse will detect Hive-style partitioning in the path (`/name=value/`) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. **Example** @@ -216,11 +216,7 @@ Use virtual column, created with Hive-style partitioning ``` sql SET use_hive_partitioning = 1; -SELECT _specified_column from file('/specified_column=specified_data/file.txt'); -``` - -``` reference -specified_data +SELECT * from file('data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and code = 42; ``` ## Settings {#settings} diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index fc84c431066..6963d4e4b79 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -101,7 +101,7 @@ FROM hdfs('hdfs://hdfs1:9000/big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name Strin ## Hive-style partitioning {#hive-style-partitioning} -When setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtual columns due to Hive partitioning style if the path has the specific structure. +When setting `use_hive_partitioning` is set to 1, ClickHouse will detect Hive-style partitioning in the path (`/name=value/`) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. **Example** @@ -109,7 +109,7 @@ Use virtual column, created with Hive-style partitioning ``` sql SET use_hive_partitioning = 1; -SELECT _specified_column from HDFS('hdfs://hdfs1:9000/specified_column=specified_data/file.txt'); +SELECT * from HDFS('hdfs://hdfs1:9000/data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and code = 42; ``` ``` reference diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 15074a77475..f3ee83afef4 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -276,7 +276,7 @@ FROM s3( ## Hive-style partitioning {#hive-style-partitioning} -When setting `use_hive_partitioning` is set to 1, ClickHouse can introduce virtual columns due to Hive partitioning style if the path has the specific structure. 
+When setting `use_hive_partitioning` is set to 1, ClickHouse will detect Hive-style partitioning in the path (`/name=value/`) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. **Example** @@ -284,11 +284,7 @@ Use virtual column, created with Hive-style partitioning ``` sql SET use_hive_partitioning = 1; -SELECT _specified_column from HDFS('hdfs://hdfs1:9000/specified_column=specified_data/file.txt'); -``` - -``` reference -specified_data +SELECT * from s3('s3://data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and code = 42; ``` ## Storage Settings {#storage-settings} diff --git a/docs/en/sql-reference/table-functions/url.md b/docs/en/sql-reference/table-functions/url.md index 3bb7aff53a7..596355e2577 100644 --- a/docs/en/sql-reference/table-functions/url.md +++ b/docs/en/sql-reference/table-functions/url.md @@ -55,6 +55,19 @@ Character `|` inside patterns is used to specify failover addresses. They are it - `_size` — Size of the resource in bytes. Type: `Nullable(UInt64)`. If the size is unknown, the value is `NULL`. - `_time` — Last modified time of the file. Type: `Nullable(DateTime)`. If the time is unknown, the value is `NULL`. +## Hive-style partitioning {#hive-style-partitioning} + +When setting `use_hive_partitioning` is set to 1, ClickHouse will detect Hive-style partitioning in the path (`/name=value/`) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. + +**Example** + +Use virtual column, created with Hive-style partitioning + +``` sql +SET use_hive_partitioning = 1; +SELECT * from url('http://data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and code = 42; +``` + ## Storage Settings {#storage-settings} - [engine_url_skip_empty_files](/docs/en/operations/settings/settings.md#engine_url_skip_empty_files) - allows to skip empty files while reading. Disabled by default. diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 48e9118e321..35cd1492642 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -35,16 +35,19 @@ namespace ErrorCodes std::string StorageObjectStorage::getPathSample(StorageInMemoryMetadata metadata, ContextPtr context) { + auto query_settings = configuration->getQuerySettings(context); + /// We don't want to throw an exception if there are no files with specified path. 
+ query_settings.throw_on_zero_files_match = false; auto file_iterator = StorageObjectStorageSource::createFileIterator( configuration, + query_settings, object_storage, distributed_processing, context, {}, // predicate metadata.getColumns().getAll(), // virtual_columns nullptr, // read_keys - {}, // file_progress_callback - true // override_settings_for_hive_partitioning + {} // file_progress_callback ); if (auto file = file_iterator->next(0)) @@ -82,12 +85,10 @@ StorageObjectStorage::StorageObjectStorage( metadata.setConstraints(constraints_); metadata.setComment(comment); - if (sample_path.empty() && context->getSettings().use_hive_partitioning) + if (sample_path.empty()) sample_path = getPathSample(metadata, context); - else if (!context->getSettings().use_hive_partitioning) - sample_path = ""; - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context, sample_path)); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context, sample_path, format_settings)); setInMemoryMetadata(metadata); } @@ -224,7 +225,7 @@ private: return; auto context = getContext(); iterator_wrapper = StorageObjectStorageSource::createFileIterator( - configuration, object_storage, distributed_processing, + configuration, configuration->getQuerySettings(context), object_storage, distributed_processing, context, predicate, virtual_columns, nullptr, context->getFileProgressCallback()); } }; @@ -376,6 +377,7 @@ std::unique_ptr StorageObjectStorage::createReadBufferIterat { auto file_iterator = StorageObjectStorageSource::createFileIterator( configuration, + configuration->getQuerySettings(context), object_storage, false/* distributed_processing */, context, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 92327b4cde0..a88532e1ea9 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -84,8 +84,8 @@ RemoteQueryExecutor::Extension StorageObjectStorageCluster::getTaskIteratorExten const ActionsDAG::Node * predicate, const ContextPtr & local_context) const { auto iterator = StorageObjectStorageSource::createFileIterator( - configuration, object_storage, /* distributed_processing */false, local_context, - predicate, virtual_columns, nullptr, local_context->getFileProgressCallback()); + configuration, configuration->getQuerySettings(local_context), object_storage, /* distributed_processing */false, + local_context, predicate, virtual_columns, nullptr, local_context->getFileProgressCallback()); auto callback = std::make_shared>([iterator]() mutable -> String { diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 2e5416d1ffd..707e7603368 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -99,14 +99,14 @@ std::string StorageObjectStorageSource::getUniqueStoragePathIdentifier( std::shared_ptr StorageObjectStorageSource::createFileIterator( ConfigurationPtr configuration, + const StorageObjectStorage::QuerySettings & query_settings, ObjectStoragePtr object_storage, bool distributed_processing, const ContextPtr & local_context, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, - std::function file_progress_callback, - bool override_settings_for_hive_partitioning) + 
std::function file_progress_callback) { if (distributed_processing) return std::make_shared( @@ -117,20 +117,16 @@ std::shared_ptr StorageObjectStorageSourc throw Exception(ErrorCodes::BAD_ARGUMENTS, "Expression can not have wildcards inside {} name", configuration->getNamespaceType()); - auto settings = configuration->getQuerySettings(local_context); const bool is_archive = configuration->isArchive(); std::unique_ptr iterator; if (configuration->isPathWithGlobs()) { - bool throw_on_zero_files_match = settings.throw_on_zero_files_match; - if (override_settings_for_hive_partitioning) - throw_on_zero_files_match = false; /// Iterate through disclosed globs and make a source for each file iterator = std::make_unique( object_storage, configuration, predicate, virtual_columns, - local_context, is_archive ? nullptr : read_keys, settings.list_object_keys_size, - throw_on_zero_files_match, file_progress_callback); + local_context, is_archive ? nullptr : read_keys, query_settings.list_object_keys_size, + query_settings.throw_on_zero_files_match, file_progress_callback); } else { @@ -149,7 +145,7 @@ std::shared_ptr StorageObjectStorageSourc iterator = std::make_unique( object_storage, copy_configuration, virtual_columns, is_archive ? nullptr : read_keys, - settings.ignore_non_existent_file, file_progress_callback); + query_settings.ignore_non_existent_file, file_progress_callback); } if (is_archive) @@ -208,8 +204,7 @@ Chunk StorageObjectStorageSource::generate() .size = object_info->isArchive() ? object_info->fileSizeInArchive() : object_info->metadata->size_bytes, .filename = &filename, .last_modified = object_info->metadata->last_modified, - .hive_partitioning_path = object_info->getPath(), - }); + }, read_from_format_info.columns_description, getContext()); const auto & partition_columns = configuration->getPartitionColumns(); if (!partition_columns.empty() && chunk_size && chunk.hasColumns()) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index a99bb068372..ff6d588b364 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -52,14 +52,14 @@ public: static std::shared_ptr createFileIterator( ConfigurationPtr configuration, + const StorageObjectStorage::QuerySettings & query_settings, ObjectStoragePtr object_storage, bool distributed_processing, const ContextPtr & local_context, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, ObjectInfos * read_keys, - std::function file_progress_callback = {}, - bool override_settings_for_hive_partitioning = false); + std::function file_progress_callback = {}); static std::string getUniqueStoragePathIdentifier( const Configuration & configuration, diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 047631cbc54..42e27a13ca9 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1108,9 +1108,9 @@ void StorageFile::setStorageMetadata(CommonArguments args) setInMemoryMetadata(storage_metadata); std::string path_for_virtuals; - if (args.getContext()->getSettingsRef().use_hive_partitioning && !paths.empty()) + if (!paths.empty()) path_for_virtuals = paths[0]; - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), args.getContext(), path_for_virtuals, format_settings.value_or(FormatSettings{}))); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), 
args.getContext(), path_for_virtuals, format_settings)); } @@ -1452,10 +1452,6 @@ Chunk StorageFileSource::generate() chunk_size = input_format->getApproxBytesReadForChunk(); progress(num_rows, chunk_size ? chunk_size : chunk.bytes()); - std::string hive_partitioning_path; - if (getContext()->getSettingsRef().use_hive_partitioning) - hive_partitioning_path = current_path; - /// Enrich with virtual columns. VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( chunk, requested_virtual_columns, @@ -1463,9 +1459,8 @@ Chunk StorageFileSource::generate() .path = current_path, .size = current_file_size, .filename = (filename_override.has_value() ? &filename_override.value() : nullptr), - .last_modified = current_file_last_modified, - .hive_partitioning_path = hive_partitioning_path, - }); + .last_modified = current_file_last_modified + }, columns_description, getContext()); return chunk; } @@ -1648,17 +1643,10 @@ void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const Bui size_t num_streams = max_num_streams; size_t files_to_read = 0; - Strings paths; if (storage->archive_info) - { files_to_read = storage->archive_info->paths_to_archives.size(); - paths = storage->archive_info->paths_to_archives; - } else - { files_to_read = storage->paths.size(); - paths = storage->paths; - } if (max_num_streams > files_to_read) num_streams = files_to_read; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 5da42638b87..f7560fa7910 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -164,10 +164,7 @@ IStorageURLBase::IStorageURLBase( storage_metadata.setComment(comment); setInMemoryMetadata(storage_metadata); - std::string uri_for_partitioning; - if (context_->getSettingsRef().use_hive_partitioning) - uri_for_partitioning = getSampleURI(uri, context_); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context_, uri_for_partitioning, format_settings.value_or(FormatSettings{}))); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context_, getSampleURI(uri, context_), format_settings)); } @@ -426,9 +423,6 @@ Chunk StorageURLSource::generate() size_t chunk_size = 0; if (input_format) chunk_size = input_format->getApproxBytesReadForChunk(); - std::string hive_partitioning_path; - if (getContext()->getSettingsRef().use_hive_partitioning) - hive_partitioning_path = curr_uri.getPath(); progress(num_rows, chunk_size ? 
chunk_size : chunk.bytes()); VirtualColumnUtils::addRequestedFileLikeStorageVirtualsToChunk( @@ -436,8 +430,7 @@ Chunk StorageURLSource::generate() { .path = curr_uri.getPath(), .size = current_file_size, - .hive_partitioning_path = hive_partitioning_path, - }); + }, columns_description, getContext()); return chunk; } diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 0efa9522ac6..fb5a345f424 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -119,15 +119,25 @@ NameSet getVirtualNamesForFileLikeStorage() return {"_path", "_file", "_size", "_time"}; } -std::map parseHivePartitioningKeysAndValues(const std::string& path) +std::unordered_map parseHivePartitioningKeysAndValues(const std::string& path, const ColumnsDescription & storage_columns) { std::string pattern = "/([^/]+)=([^/]+)"; re2::StringPiece input_piece(path); - std::map key_values; + std::unordered_map key_values; std::string key, value; + std::set used_keys; while (RE2::FindAndConsume(&input_piece, pattern, &key, &value)) - key_values["_" + key] = value; + { + if (used_keys.contains(key)) + throw Exception(ErrorCodes::INCORRECT_DATA, "Link to file with enabled hive-style partitioning contains duplicated key {}, only unique keys required", key); + used_keys.insert(key); + + auto col_name = "_" + key; + while (storage_columns.has(col_name)) + col_name = "_" + col_name; + key_values[col_name] = value; + } return key_values; } @@ -148,17 +158,20 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription add_virtual("_size", makeNullable(std::make_shared())); add_virtual("_time", makeNullable(std::make_shared())); - auto map = parseHivePartitioningKeysAndValues(path); - for (auto& item : map) + if (context->getSettingsRef().use_hive_partitioning) { + auto map = parseHivePartitioningKeysAndValues(path, storage_columns); auto format_settings = format_settings_ ? 
*format_settings_ : getFormatSettings(context); - auto type = tryInferDataTypeByEscapingRule(item.second, format_settings, FormatSettings::EscapingRule::Raw); - if (type == nullptr) - type = std::make_shared(); - if (type->canBeInsideLowCardinality()) - add_virtual(item.first, std::make_shared(type)); - else - add_virtual(item.first, type); + for (auto & item : map) + { + auto type = tryInferDataTypeByEscapingRule(item.second, format_settings, FormatSettings::EscapingRule::Raw); + if (type == nullptr) + type = std::make_shared(); + if (type->canBeInsideLowCardinality()) + add_virtual(item.first, std::make_shared(type)); + else + add_virtual(item.first, type); + } } return desc; @@ -207,8 +220,6 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const { if (column.name == "_file" || column.name == "_path") block.insert({column.type->createColumn(), column.type, column.name}); - if (!getVirtualNamesForFileLikeStorage().contains(column.name)) - block.insert({column.type->createColumn(), column.type, column.name}); } block.insert({ColumnUInt64::create(), std::make_shared(), "_idx"}); @@ -222,9 +233,12 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, - VirtualsForFileLikeStorage virtual_values) + VirtualsForFileLikeStorage virtual_values, ColumnsDescription columns, ContextPtr context) { - auto hive_map = parseHivePartitioningKeysAndValues(virtual_values.hive_partitioning_path); + std::unordered_map hive_map; + if (context->getSettingsRef().use_hive_partitioning) + hive_map = parseHivePartitioningKeysAndValues(virtual_values.path, columns); + for (const auto & virtual_column : requested_virtual_columns) { if (virtual_column.name == "_path") @@ -258,23 +272,9 @@ void addRequestedFileLikeStorageVirtualsToChunk( else chunk.addColumn(virtual_column.type->createColumnConstWithDefaultValue(chunk.getNumRows())->convertToFullColumnIfConst()); } - else if (!hive_map.empty()) + else if (auto it = hive_map.find(virtual_column.getNameInStorage()); it != hive_map.end()) { - bool contains_virtual_column = std::any_of(hive_map.begin(), hive_map.end(), - [&](const auto& pair) - { - return requested_virtual_columns.contains(pair.first); - }); - - if (!contains_virtual_column) - hive_map.clear(); // If we cannot find any virtual column in requested, we don't add any of them to chunk - - auto it = hive_map.find(virtual_column.getNameInStorage()); - if (it != hive_map.end()) - { - chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), convertFieldToType(Field(it->second), *virtual_column.type))->convertToFullColumnIfConst()); - hive_map.erase(it); - } + chunk.addColumn(virtual_column.type->createColumnConst(chunk.getNumRows(), convertFieldToType(Field(it->second), *virtual_column.type))->convertToFullColumnIfConst()); } } } diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index aa7d4c4605b..29ec32ab375 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -7,8 +7,6 @@ #include #include -#include -#include #include @@ -81,14 +79,11 @@ struct VirtualsForFileLikeStorage std::optional size { std::nullopt }; const String * filename { nullptr }; std::optional last_modified { std::nullopt }; - const String & hive_partitioning_path = ""; }; -std::map parseFromPath(const std::string& path); - void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const 
NamesAndTypesList & requested_virtual_columns, - VirtualsForFileLikeStorage virtual_values); + VirtualsForFileLikeStorage virtual_values, ColumnsDescription columns = {}, ContextPtr context = {}); } } diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference index be43048dd01..fc6da3a55c1 100644 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.reference +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference @@ -33,6 +33,11 @@ Elizabeth Gordon Elizabeth [1,2,3] 42.42 Array(Int64) LowCardinality(Float64) 101 +2070 +4081 +2070 +2070 +1 1 TESTING THE URL PARTITIONING first last Elizabeth diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index 58a74a3ca8f..a5f3e36763d 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -36,6 +36,19 @@ SELECT toTypeName(_array), toTypeName(_float) FROM file('$CURDIR/data_hive/parti SELECT count(*) FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01-01/sample.parquet') WHERE _number = 42; """ +$CLICKHOUSE_LOCAL -n -q """ +set use_hive_partitioning = 1; + +SELECT _identifier FROM file('$CURDIR/data_hive/partitioning/identitier=*/email.csv') LIMIT 2; +SELECT __identifier FROM file('$CURDIR/data_hive/partitioning/identitier=*/email.csv') LIMIT 2; +""" + +$CLICKHOUSE_LOCAL -n -q """ +set use_hive_partitioning = 1; + +SELECT *, _column0 FROM file('$CURDIR/data_hive/partitioning/column0=Elizabeth/column0=Elizabeth/sample.parquet') LIMIT 10; +""" 2>&1 | grep -c "INCORRECT_DATA" + $CLICKHOUSE_LOCAL -n -q """ set use_hive_partitioning = 0; diff --git a/tests/queries/0_stateless/data_hive/partitioning/column0=Elizabeth/column0=Elizabeth/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/column0=Elizabeth/column0=Elizabeth/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? 
literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_hive/partitioning/identifier=2070/email.csv b/tests/queries/0_stateless/data_hive/partitioning/identifier=2070/email.csv new file mode 100644 index 00000000000..936d995cc64 --- /dev/null +++ b/tests/queries/0_stateless/data_hive/partitioning/identifier=2070/email.csv @@ -0,0 +1,5 @@ +_login_email,_identifier,_first_name,_last_name +laura@example.com,2070,Laura,Grey +craig@example.com,4081,Craig,Johnson +mary@example.com,9346,Mary,Jenkins +jamie@example.com,5079,Jamie,Smith From 899c5a64e078820caf2e68cbaf892d5d39e0af06 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Mon, 15 Jul 2024 17:14:11 +0000 Subject: [PATCH 043/142] some more fixes (docs + storageObjectStorage) --- docs/en/sql-reference/table-functions/azureBlobStorage.md | 2 +- docs/en/sql-reference/table-functions/file.md | 2 +- docs/en/sql-reference/table-functions/hdfs.md | 2 +- docs/en/sql-reference/table-functions/s3.md | 2 +- docs/en/sql-reference/table-functions/url.md | 2 +- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 7 ++++--- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/en/sql-reference/table-functions/azureBlobStorage.md b/docs/en/sql-reference/table-functions/azureBlobStorage.md index 104ac4e26df..6936c807f96 100644 --- a/docs/en/sql-reference/table-functions/azureBlobStorage.md +++ b/docs/en/sql-reference/table-functions/azureBlobStorage.md @@ -88,5 +88,5 @@ Use virtual column, created with Hive-style partitioning ``` sql SET use_hive_partitioning = 1; -SELECT * from azureBlobStorage(config, storage_account_url='...', container='...', blob_path='http://data/path/date=*/country=*/code=*/*.parquet', format='Parquet', structure='Date DateTime64, Country String, Code UInt64') where _date > '2020-01-01' and _country = 'Netherlands' and code = 42; +SELECT * from azureBlobStorage(config, storage_account_url='...', container='...', blob_path='http://data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and _code = 42; ``` diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index 0669609a22a..7908a3cb934 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -216,7 +216,7 @@ Use virtual column, created with Hive-style partitioning ``` sql SET use_hive_partitioning = 1; -SELECT * from file('data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and code = 42; +SELECT * from file('data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and _code = 42; ``` ## Settings {#settings} diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index 6963d4e4b79..73fdc263d68 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -109,7 +109,7 @@ Use virtual column, created with Hive-style partitioning ``` sql SET use_hive_partitioning = 1; -SELECT * from HDFS('hdfs://hdfs1:9000/data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and code = 42; +SELECT * from HDFS('hdfs://hdfs1:9000/data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and _code = 42; ``` ``` reference diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 
f3ee83afef4..1bd9f38517e 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -284,7 +284,7 @@ Use virtual column, created with Hive-style partitioning ``` sql SET use_hive_partitioning = 1; -SELECT * from s3('s3://data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and code = 42; +SELECT * from s3('s3://data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and _code = 42; ``` ## Storage Settings {#storage-settings} diff --git a/docs/en/sql-reference/table-functions/url.md b/docs/en/sql-reference/table-functions/url.md index 596355e2577..b4027594e7c 100644 --- a/docs/en/sql-reference/table-functions/url.md +++ b/docs/en/sql-reference/table-functions/url.md @@ -65,7 +65,7 @@ Use virtual column, created with Hive-style partitioning ``` sql SET use_hive_partitioning = 1; -SELECT * from url('http://data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and code = 42; +SELECT * from url('http://data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and _code = 42; ``` ## Storage Settings {#storage-settings} diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 35cd1492642..d2cc73f14d7 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -85,7 +85,7 @@ StorageObjectStorage::StorageObjectStorage( metadata.setConstraints(constraints_); metadata.setComment(comment); - if (sample_path.empty()) + if (sample_path.empty() && context->getSettingsRef().use_hive_partitioning) sample_path = getPathSample(metadata, context); setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context, sample_path, format_settings)); @@ -412,8 +412,9 @@ std::string StorageObjectStorage::resolveFormatFromData( { ObjectInfos read_keys; auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); + auto format_and_schema = detectFormatAndReadSchema(format_settings, *iterator, context).second; sample_path = iterator->getLastFilePath(); - return detectFormatAndReadSchema(format_settings, *iterator, context).second; + return format_and_schema; } std::pair StorageObjectStorage::resolveSchemaAndFormatFromData( @@ -425,8 +426,8 @@ std::pair StorageObjectStorage::resolveSchemaAn { ObjectInfos read_keys; auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); - sample_path = iterator->getLastFilePath(); auto [columns, format] = detectFormatAndReadSchema(format_settings, *iterator, context); + sample_path = iterator->getLastFilePath(); configuration->format = format; return std::pair(columns, format); } From 7d39535c989e16d818370cbbd9cbea891b21d07a Mon Sep 17 00:00:00 2001 From: yariks5s Date: Mon, 15 Jul 2024 17:20:46 +0000 Subject: [PATCH 044/142] storageObjectStorage small fix --- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 8291327992c..ee1169d2c5c 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -400,8 +400,9 @@ ColumnsDescription 
StorageObjectStorage::resolveSchemaFromData( { ObjectInfos read_keys; auto iterator = createReadBufferIterator(object_storage, configuration, format_settings, read_keys, context); + auto schema = readSchemaFromFormat(configuration->format, format_settings, *iterator, context); sample_path = iterator->getLastFilePath(); - return readSchemaFromFormat(configuration->format, format_settings, *iterator, context); + return schema; } std::string StorageObjectStorage::resolveFormatFromData( From 12794601921e2d465b27e665b072267b658b8e4c Mon Sep 17 00:00:00 2001 From: yariks5s Date: Mon, 15 Jul 2024 22:09:30 +0000 Subject: [PATCH 045/142] fix after review --- .../ObjectStorage/StorageObjectStorage.cpp | 2 +- .../ObjectStorage/StorageObjectStorage.h | 2 +- .../StorageObjectStorageCluster.cpp | 29 ++++++++++++++++++- .../StorageObjectStorageCluster.h | 2 ++ src/Storages/StorageFile.cpp | 5 +--- src/Storages/VirtualColumnUtils.cpp | 6 ++-- src/Storages/VirtualColumnUtils.h | 2 +- .../03203_hive_style_partitioning.reference | 17 +++++++++++ .../03203_hive_style_partitioning.sh | 16 ++++++++++ 9 files changed, 70 insertions(+), 11 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index ee1169d2c5c..ca0ced8dcd3 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -34,7 +34,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -std::string StorageObjectStorage::getPathSample(StorageInMemoryMetadata metadata, ContextPtr context) +String StorageObjectStorage::getPathSample(StorageInMemoryMetadata metadata, ContextPtr context) { auto query_settings = configuration->getQuerySettings(context); /// We don't want to throw an exception if there are no files with specified path. diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index 6ee4ce0c16f..cae0db48f31 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -122,7 +122,7 @@ public: protected: virtual void updateConfiguration(ContextPtr local_context); - std::string getPathSample(StorageInMemoryMetadata metadata, ContextPtr context); + String getPathSample(StorageInMemoryMetadata metadata, ContextPtr context); virtual ReadFromFormatInfo prepareReadingFromFormat( const Strings & requested_columns, diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index a88532e1ea9..7f6b3338f9b 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -1,6 +1,8 @@ #include "Storages/ObjectStorage/StorageObjectStorageCluster.h" #include +#include +#include #include #include #include @@ -19,6 +21,28 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +String StorageObjectStorageCluster::getPathSample(StorageInMemoryMetadata metadata, ContextPtr context) +{ + auto query_settings = configuration->getQuerySettings(context); + /// We don't want to throw an exception if there are no files with specified path. 
+ query_settings.throw_on_zero_files_match = false; + auto file_iterator = StorageObjectStorageSource::createFileIterator( + configuration, + query_settings, + object_storage, + false, // distributed_processing + context, + {}, // predicate + metadata.getColumns().getAll(), // virtual_columns + nullptr, // read_keys + {} // file_progress_callback + ); + + if (auto file = file_iterator->next(0)) + return file->getPath(); + return ""; +} + StorageObjectStorageCluster::StorageObjectStorageCluster( const String & cluster_name_, ConfigurationPtr configuration_, @@ -41,7 +65,10 @@ StorageObjectStorageCluster::StorageObjectStorageCluster( metadata.setColumns(columns); metadata.setConstraints(constraints_); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context_)); + if (sample_path.empty() && context_->getSettingsRef().use_hive_partitioning) + sample_path = getPathSample(metadata, context_); + + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context_, sample_path, getFormatSettings(context_))); setInMemoryMetadata(metadata); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h index 108aa109616..0088ff28fc2 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.h @@ -27,6 +27,8 @@ public: RemoteQueryExecutor::Extension getTaskIteratorExtension( const ActionsDAG::Node * predicate, const ContextPtr & context) const override; + String getPathSample(StorageInMemoryMetadata metadata, ContextPtr context); + private: void updateQueryToSendIfNeeded( ASTPtr & query, diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index e2b010bf48f..b43fce370a1 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1109,10 +1109,7 @@ void StorageFile::setStorageMetadata(CommonArguments args) storage_metadata.setComment(args.comment); setInMemoryMetadata(storage_metadata); - std::string path_for_virtuals; - if (!paths.empty()) - path_for_virtuals = paths[0]; - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), args.getContext(), path_for_virtuals, format_settings)); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), args.getContext(), paths.empty() ? 
"" : paths[0], format_settings)); } diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index fb5a345f424..352fd0d7a76 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -119,18 +119,18 @@ NameSet getVirtualNamesForFileLikeStorage() return {"_path", "_file", "_size", "_time"}; } -std::unordered_map parseHivePartitioningKeysAndValues(const std::string& path, const ColumnsDescription & storage_columns) +std::unordered_map parseHivePartitioningKeysAndValues(const String & path, const ColumnsDescription & storage_columns) { std::string pattern = "/([^/]+)=([^/]+)"; re2::StringPiece input_piece(path); std::unordered_map key_values; std::string key, value; - std::set used_keys; + std::unordered_set used_keys; while (RE2::FindAndConsume(&input_piece, pattern, &key, &value)) { if (used_keys.contains(key)) - throw Exception(ErrorCodes::INCORRECT_DATA, "Link to file with enabled hive-style partitioning contains duplicated key {}, only unique keys required", key); + throw Exception(ErrorCodes::INCORRECT_DATA, "Path '{}' to file with enabled hive-style partitioning contains duplicated partition key {}, only unique keys are allowed", path, key); used_keys.insert(key); auto col_name = "_" + key; diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index 29ec32ab375..fef32b149ec 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -51,7 +51,7 @@ NameSet getVirtualNamesForFileLikeStorage(); VirtualColumnsDescription getVirtualsForFileLikeStorage( const ColumnsDescription & storage_columns, const ContextPtr & context, - std::string sample_path = "", + const std::string & sample_path = "", std::optional format_settings_ = std::nullopt); ActionsDAGPtr createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns); diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference index fc6da3a55c1..430a3582f65 100644 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.reference +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference @@ -102,3 +102,20 @@ Jeffery Delgado Elizabeth Clara Cross Elizabeth Elizabeth Gordon Elizabeth OK +TESTING THE S3CLUSTER PARTITIONING +first last Elizabeth +Jorge Frank Elizabeth +Hunter Moreno Elizabeth +Esther Guzman Elizabeth +Dennis Stephens Elizabeth +Nettie Franklin Elizabeth +Stanley Gibson Elizabeth +Eugenia Greer Elizabeth +Jeffery Delgado Elizabeth +Clara Cross Elizabeth +Elizabeth Gordon Elizabeth +Eva Schmidt Elizabeth Schmidt +Samuel Schmidt Elizabeth Schmidt +Eva Schmidt Elizabeth +Samuel Schmidt Elizabeth +Elizabeth Gordon Elizabeth Gordon diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index a5f3e36763d..d2b1f31c85f 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -118,3 +118,19 @@ set use_hive_partitioning = 0; SELECT *, _column0 FROM s3('http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; """ 2>&1 | grep -F -q "UNKNOWN_IDENTIFIER" && echo "OK" || echo "FAIL"; + +$CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3CLUSTER PARTITIONING'" + +$CLICKHOUSE_CLIENT -n -q """ +set use_hive_partitioning = 1; + +SELECT *, _column0 FROM 
s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; + +SELECT *, _column0 FROM s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0; + +SELECT *, _column0, _column1 FROM s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0 FROM s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; + +SELECT *, _column0, _column1 FROM s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0 FROM s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +""" From 4305a38e7394d7bbf3c3455c3b52b1dc9b86f3c9 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Mon, 15 Jul 2024 22:11:08 +0000 Subject: [PATCH 046/142] add include --- src/Storages/VirtualColumnUtils.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 352fd0d7a76..938972cffca 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -41,6 +41,7 @@ #include #include #include +#include #include "Functions/FunctionsLogical.h" #include "Functions/IFunction.h" #include "Functions/IFunctionAdaptors.h" From dcf14e68afbc741fdbf830fc7d01ba84817c0760 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Mon, 15 Jul 2024 22:45:21 +0000 Subject: [PATCH 047/142] small fix --- src/Storages/VirtualColumnUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 938972cffca..e84979833ab 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -142,7 +142,7 @@ std::unordered_map parseHivePartitioningKeysAndValues( return key_values; } -VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, const ContextPtr & context, std::string path, std::optional format_settings_) +VirtualColumnsDescription getVirtualsForFileLikeStorage(const ColumnsDescription & storage_columns, const ContextPtr & context, const std::string & path, std::optional format_settings_) { VirtualColumnsDescription desc; From 177d006307515e62bbe60082c293c03de4d4cc7c Mon Sep 17 00:00:00 2001 From: yariks5s Date: Mon, 15 Jul 2024 23:02:24 +0000 Subject: [PATCH 048/142] add errorcodes --- src/Storages/VirtualColumnUtils.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index e84979833ab..24d0b7160b2 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -56,6 +56,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int INCORRECT_DATA; +} + namespace VirtualColumnUtils { From 82d283357755e3b667074596ec254ca54598ee5d Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Mon, 15 Jul 2024 23:29:25 +0000 Subject: [PATCH 049/142] clang tidy fix --- src/Interpreters/Context.cpp | 1 - 1 
file changed, 1 deletion(-) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index fc1e87e7b7e..2602afd8b78 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include From 9690a5a334b4991eaa9dfa58ce804c91bbff4385 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Tue, 16 Jul 2024 13:37:59 +0000 Subject: [PATCH 050/142] fix --- tests/ci/libfuzzer_test_check.py | 2 +- {utils/libfuzzer => tests/fuzz}/runner.py | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename {utils/libfuzzer => tests/fuzz}/runner.py (100%) diff --git a/tests/ci/libfuzzer_test_check.py b/tests/ci/libfuzzer_test_check.py index d9e33229932..8f19dd7d023 100644 --- a/tests/ci/libfuzzer_test_check.py +++ b/tests/ci/libfuzzer_test_check.py @@ -75,7 +75,7 @@ def get_run_command( f"--volume={result_path}:/test_output " "--security-opt seccomp=unconfined " # required to issue io_uring sys-calls f"--cap-add=SYS_PTRACE {env_str} {additional_options_str} {image} " - "python3 ./utils/runner.py" + "python3 /usr/share/clickhouse-test/fuzz/runner.py" ) diff --git a/utils/libfuzzer/runner.py b/tests/fuzz/runner.py similarity index 100% rename from utils/libfuzzer/runner.py rename to tests/fuzz/runner.py From c9e02eee7a1d7b2e7aa85bfc87d6a54a3bcaedfa Mon Sep 17 00:00:00 2001 From: yariks5s Date: Tue, 16 Jul 2024 13:48:52 +0000 Subject: [PATCH 051/142] fix after review --- docs/en/sql-reference/table-functions/hdfs.md | 4 ---- .../StorageObjectStorageSource.cpp | 2 +- .../ObjectStorageQueueSource.cpp | 2 +- src/Storages/StorageFile.cpp | 2 +- src/Storages/StorageFileCluster.cpp | 2 +- src/Storages/StorageURL.cpp | 19 +------------------ src/Storages/StorageURL.h | 18 ++++++++++++++++++ src/Storages/StorageURLCluster.cpp | 3 +-- src/Storages/VirtualColumnUtils.cpp | 2 +- src/Storages/VirtualColumnUtils.h | 2 +- 10 files changed, 26 insertions(+), 30 deletions(-) diff --git a/docs/en/sql-reference/table-functions/hdfs.md b/docs/en/sql-reference/table-functions/hdfs.md index 73fdc263d68..60c2fd40e6a 100644 --- a/docs/en/sql-reference/table-functions/hdfs.md +++ b/docs/en/sql-reference/table-functions/hdfs.md @@ -112,10 +112,6 @@ SET use_hive_partitioning = 1; SELECT * from HDFS('hdfs://hdfs1:9000/data/path/date=*/country=*/code=*/*.parquet') where _date > '2020-01-01' and _country = 'Netherlands' and _code = 42; ``` -``` reference -specified_data -``` - ## Storage Settings {#storage-settings} - [hdfs_truncate_on_insert](/docs/en/operations/settings/settings.md#hdfs_truncate_on_insert) - allows to truncate file before insert into it. Disabled by default. diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 93f8eaacbc0..d29e33444b0 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -204,7 +204,7 @@ Chunk StorageObjectStorageSource::generate() .size = object_info->isArchive() ? 
object_info->fileSizeInArchive() : object_info->metadata->size_bytes, .filename = &filename, .last_modified = object_info->metadata->last_modified, - }, read_from_format_info.columns_description, getContext()); + }, getContext(), read_from_format_info.columns_description); const auto & partition_columns = configuration->getPartitionColumns(); if (!partition_columns.empty() && chunk_size && chunk.hasColumns()) diff --git a/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.cpp b/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.cpp index 4d921003e04..2634a7b2f1e 100644 --- a/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.cpp +++ b/src/Storages/ObjectStorageQueue/ObjectStorageQueueSource.cpp @@ -524,7 +524,7 @@ Chunk ObjectStorageQueueSource::generateImpl() { .path = path, .size = reader.getObjectInfo()->metadata->size_bytes - }); + }, getContext(), read_from_format_info.columns_description); return chunk; } diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index b43fce370a1..5cbc2b38887 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1459,7 +1459,7 @@ Chunk StorageFileSource::generate() .size = current_file_size, .filename = (filename_override.has_value() ? &filename_override.value() : nullptr), .last_modified = current_file_last_modified - }, columns_description, getContext()); + }, getContext(), columns_description); return chunk; } diff --git a/src/Storages/StorageFileCluster.cpp b/src/Storages/StorageFileCluster.cpp index f7684182e79..82ae0b761ae 100644 --- a/src/Storages/StorageFileCluster.cpp +++ b/src/Storages/StorageFileCluster.cpp @@ -61,7 +61,7 @@ StorageFileCluster::StorageFileCluster( storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context)); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context, paths.empty() ? 
"" : paths[0])); } void StorageFileCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 87911230819..6e7788cfc1d 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -39,7 +39,6 @@ #include #include -#include #include #include @@ -92,27 +91,11 @@ static const std::vector> optional_regex_keys = { std::make_shared(R"(headers.header\[[0-9]*\].value)"), }; -static bool urlWithGlobs(const String & uri) -{ - return (uri.find('{') != std::string::npos && uri.find('}') != std::string::npos) || uri.find('|') != std::string::npos; -} - static ConnectionTimeouts getHTTPTimeouts(ContextPtr context) { return ConnectionTimeouts::getHTTPTimeouts(context->getSettingsRef(), context->getServerSettings().keep_alive_timeout); } -String getSampleURI(String uri, ContextPtr context) -{ - if (urlWithGlobs(uri)) - { - auto uris = parseRemoteDescription(uri, 0, uri.size(), ',', context->getSettingsRef().glob_expansion_max_elements); - if (!uris.empty()) - return uris[0]; - } - return uri; -} - IStorageURLBase::IStorageURLBase( const String & uri_, const ContextPtr & context_, @@ -433,7 +416,7 @@ Chunk StorageURLSource::generate() { .path = curr_uri.getPath(), .size = current_file_size, - }, columns_description, getContext()); + }, getContext(), columns_description); return chunk; } diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index fa7cc6eeeef..a874ca9147c 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -12,6 +12,8 @@ #include #include #include +#include +#include namespace DB @@ -267,6 +269,22 @@ private: bool cancelled = false; }; +static bool urlWithGlobs(const String & uri) +{ + return (uri.find('{') != std::string::npos && uri.find('}') != std::string::npos) || uri.find('|') != std::string::npos; +} + +inline String getSampleURI(String uri, ContextPtr context) +{ + if (urlWithGlobs(uri)) + { + auto uris = parseRemoteDescription(uri, 0, uri.size(), ',', context->getSettingsRef().glob_expansion_max_elements); + if (!uris.empty()) + return uris[0]; + } + return uri; +} + class StorageURL : public IStorageURLBase { public: diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index 664f170c17e..e80f4ebcd06 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -3,7 +3,6 @@ #include #include -#include #include #include #include @@ -76,7 +75,7 @@ StorageURLCluster::StorageURLCluster( storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context)); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context, getSampleURI(uri, context), getFormatSettings(context))); } void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 24d0b7160b2..31cee485dde 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -239,7 +239,7 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, - VirtualsForFileLikeStorage virtual_values, 
ColumnsDescription columns, ContextPtr context) + VirtualsForFileLikeStorage virtual_values, ContextPtr context, const ColumnsDescription & columns) { std::unordered_map hive_map; if (context->getSettingsRef().use_hive_partitioning) diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index fef32b149ec..1bd74189559 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -83,7 +83,7 @@ struct VirtualsForFileLikeStorage void addRequestedFileLikeStorageVirtualsToChunk( Chunk & chunk, const NamesAndTypesList & requested_virtual_columns, - VirtualsForFileLikeStorage virtual_values, ColumnsDescription columns = {}, ContextPtr context = {}); + VirtualsForFileLikeStorage virtual_values, ContextPtr context, const ColumnsDescription & columns); } } From e7e62b358360083eda6d2ec983fb5a1b733d1eba Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Tue, 16 Jul 2024 14:17:51 +0000 Subject: [PATCH 052/142] fix style --- tests/fuzz/runner.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/fuzz/runner.py b/tests/fuzz/runner.py index bbe648dbbc2..0862ea29e42 100644 --- a/tests/fuzz/runner.py +++ b/tests/fuzz/runner.py @@ -11,7 +11,7 @@ FUZZER_ARGS = os.getenv("FUZZER_ARGS", "") def run_fuzzer(fuzzer: str): - logging.info(f"Running fuzzer {fuzzer}...") + logging.info("Running fuzzer %s...", fuzzer) corpus_dir = f"{fuzzer}.in" with Path(corpus_dir) as path: @@ -29,28 +29,28 @@ def run_fuzzer(fuzzer: str): if parser.has_section("asan"): os.environ["ASAN_OPTIONS"] = ( - f"{os.environ['ASAN_OPTIONS']}:{':'.join('%s=%s' % (key, value) for key, value in parser['asan'].items())}" + f"{os.environ['ASAN_OPTIONS']}:{':'.join(f"{key}={value}" for key, value in parser['asan'].items())}" ) if parser.has_section("msan"): os.environ["MSAN_OPTIONS"] = ( - f"{os.environ['MSAN_OPTIONS']}:{':'.join('%s=%s' % (key, value) for key, value in parser['msan'].items())}" + f"{os.environ['MSAN_OPTIONS']}:{':'.join(f"{key}={value}" for key, value in parser['msan'].items())}" ) if parser.has_section("ubsan"): os.environ["UBSAN_OPTIONS"] = ( - f"{os.environ['UBSAN_OPTIONS']}:{':'.join('%s=%s' % (key, value) for key, value in parser['ubsan'].items())}" + f"{os.environ['UBSAN_OPTIONS']}:{':'.join(f"{key}={value}" for key, value in parser['ubsan'].items())}" ) if parser.has_section("libfuzzer"): custom_libfuzzer_options = " ".join( - "-%s=%s" % (key, value) + f"-{key}={value}" for key, value in parser["libfuzzer"].items() ) if parser.has_section("fuzzer_arguments"): fuzzer_arguments = " ".join( - ("%s" % key) if value == "" else ("%s=%s" % (key, value)) + (f"{key}") if value == "" else (f"{key}={value}") for key, value in parser["fuzzer_arguments"].items() ) @@ -65,7 +65,7 @@ def run_fuzzer(fuzzer: str): cmd_line += " < /dev/null" - logging.info(f"...will execute: {cmd_line}") + logging.info("...will execute: %s", cmd_line) subprocess.check_call(cmd_line, shell=True) From c974430e68bff97986379410e9a94c1ea641d1bd Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Tue, 16 Jul 2024 15:01:43 +0000 Subject: [PATCH 053/142] fix --- tests/fuzz/runner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/fuzz/runner.py b/tests/fuzz/runner.py index 0862ea29e42..047a2245bfa 100644 --- a/tests/fuzz/runner.py +++ b/tests/fuzz/runner.py @@ -29,17 +29,17 @@ def run_fuzzer(fuzzer: str): if parser.has_section("asan"): os.environ["ASAN_OPTIONS"] = ( - f"{os.environ['ASAN_OPTIONS']}:{':'.join(f"{key}={value}" for key, value in 
parser['asan'].items())}" + f"{os.environ['ASAN_OPTIONS']}:{':'.join(f'{key}={value}' for key, value in parser['asan'].items())}" ) if parser.has_section("msan"): os.environ["MSAN_OPTIONS"] = ( - f"{os.environ['MSAN_OPTIONS']}:{':'.join(f"{key}={value}" for key, value in parser['msan'].items())}" + f"{os.environ['MSAN_OPTIONS']}:{':'.join(f'{key}={value}' for key, value in parser['msan'].items())}" ) if parser.has_section("ubsan"): os.environ["UBSAN_OPTIONS"] = ( - f"{os.environ['UBSAN_OPTIONS']}:{':'.join(f"{key}={value}" for key, value in parser['ubsan'].items())}" + f"{os.environ['UBSAN_OPTIONS']}:{':'.join(f'{key}={value}' for key, value in parser['ubsan'].items())}" ) if parser.has_section("libfuzzer"): From 8660aec5d79f7a16ab3bcac2aaab291e4bcf0c2d Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Tue, 16 Jul 2024 15:16:11 +0000 Subject: [PATCH 054/142] Automatic style fix --- tests/fuzz/runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/fuzz/runner.py b/tests/fuzz/runner.py index 047a2245bfa..44259228f60 100644 --- a/tests/fuzz/runner.py +++ b/tests/fuzz/runner.py @@ -44,8 +44,7 @@ def run_fuzzer(fuzzer: str): if parser.has_section("libfuzzer"): custom_libfuzzer_options = " ".join( - f"-{key}={value}" - for key, value in parser["libfuzzer"].items() + f"-{key}={value}" for key, value in parser["libfuzzer"].items() ) if parser.has_section("fuzzer_arguments"): From 3ae4211b3af575cf8d7186a4cc915f9ecb6b4182 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 16 Jul 2024 17:59:57 +0200 Subject: [PATCH 055/142] fix tests --- tests/queries/0_stateless/03203_hive_style_partitioning.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index d2b1f31c85f..0f687d532b0 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -39,8 +39,8 @@ SELECT count(*) FROM file('$CURDIR/data_hive/partitioning/number=42/date=2020-01 $CLICKHOUSE_LOCAL -n -q """ set use_hive_partitioning = 1; -SELECT _identifier FROM file('$CURDIR/data_hive/partitioning/identitier=*/email.csv') LIMIT 2; -SELECT __identifier FROM file('$CURDIR/data_hive/partitioning/identitier=*/email.csv') LIMIT 2; +SELECT _identifier FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') LIMIT 2; +SELECT __identifier FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') LIMIT 2; """ $CLICKHOUSE_LOCAL -n -q """ From d91cb40bbdbd18a2bef811002033d0c99fe693d3 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Tue, 16 Jul 2024 16:15:24 +0000 Subject: [PATCH 056/142] fix include and remove unused getFormatSettings --- src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp | 2 +- src/Storages/StorageFile.cpp | 1 - src/Storages/StorageURLCluster.cpp | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp index 7f6b3338f9b..c214665f7e0 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageCluster.cpp @@ -68,7 +68,7 @@ StorageObjectStorageCluster::StorageObjectStorageCluster( if (sample_path.empty() && context_->getSettingsRef().use_hive_partitioning) sample_path = getPathSample(metadata, context_); - 
setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context_, sample_path, getFormatSettings(context_))); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(metadata.getColumns(), context_, sample_path)); setInMemoryMetadata(metadata); } diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 5cbc2b38887..ed05f57b418 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -52,7 +52,6 @@ #include #include #include -#include "Formats/FormatSettings.h" #include #include diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index e80f4ebcd06..1522a18a083 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -75,7 +75,7 @@ StorageURLCluster::StorageURLCluster( storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); - setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context, getSampleURI(uri, context), getFormatSettings(context))); + setVirtuals(VirtualColumnUtils::getVirtualsForFileLikeStorage(storage_metadata.getColumns(), context, getSampleURI(uri, context))); } void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) From d11d1a42bc7101993e5c85f1c1c3298e6334dbf9 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Tue, 16 Jul 2024 16:49:33 +0000 Subject: [PATCH 057/142] fix for storageURL functions --- src/Storages/StorageURL.cpp | 16 ++++++++++++++++ src/Storages/StorageURL.h | 19 +++---------------- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 6e7788cfc1d..1d1deebf9f5 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -91,6 +91,22 @@ static const std::vector> optional_regex_keys = { std::make_shared(R"(headers.header\[[0-9]*\].value)"), }; +bool urlWithGlobs(const String & uri) +{ + return (uri.find('{') != std::string::npos && uri.find('}') != std::string::npos) || uri.find('|') != std::string::npos; +} + +String getSampleURI(String uri, ContextPtr context) +{ + if (urlWithGlobs(uri)) + { + auto uris = parseRemoteDescription(uri, 0, uri.size(), ',', context->getSettingsRef().glob_expansion_max_elements); + if (!uris.empty()) + return uris[0]; + } + return uri; +} + static ConnectionTimeouts getHTTPTimeouts(ContextPtr context) { return ConnectionTimeouts::getHTTPTimeouts(context->getSettingsRef(), context->getServerSettings().keep_alive_timeout); diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index a874ca9147c..cd48ecb767b 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -143,6 +143,9 @@ private: virtual Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const = 0; }; +bool urlWithGlobs(const String & uri); + +String getSampleURI(String uri, ContextPtr context); class StorageURLSource : public SourceWithKeyCondition, WithContext { @@ -269,22 +272,6 @@ private: bool cancelled = false; }; -static bool urlWithGlobs(const String & uri) -{ - return (uri.find('{') != std::string::npos && uri.find('}') != std::string::npos) || uri.find('|') != std::string::npos; -} - -inline String getSampleURI(String uri, ContextPtr context) -{ - if (urlWithGlobs(uri)) - { - auto uris = parseRemoteDescription(uri, 0, uri.size(), ',', context->getSettingsRef().glob_expansion_max_elements); - if (!uris.empty()) - 
return uris[0]; - } - return uri; -} - class StorageURL : public IStorageURLBase { public: From e7fc206069796f662bb31aab671cbf75b95984ec Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 16 Jul 2024 19:09:18 +0200 Subject: [PATCH 058/142] Update src/Storages/StorageURL.h Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- src/Storages/StorageURL.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index cd48ecb767b..1f3d63b4c85 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -12,8 +12,6 @@ #include #include #include -#include -#include namespace DB From 771b39fa2179a9a548580c41859bbecf0165000d Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 16 Jul 2024 19:44:10 +0200 Subject: [PATCH 059/142] Update StorageURLCluster.cpp --- src/Storages/StorageURLCluster.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index 1522a18a083..7c7a299c64e 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include From 9d332911fb0f4b25bbdb67a9c10c5f3b42db4ea6 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 16 Jul 2024 20:10:19 +0200 Subject: [PATCH 060/142] Update StorageURL.cpp --- src/Storages/StorageURL.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 1d1deebf9f5..4cf191f7e8a 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -39,6 +39,7 @@ #include #include +#include #include #include From 949e69c0573354417e21bd83d446e1ea085db04d Mon Sep 17 00:00:00 2001 From: Blargian Date: Tue, 16 Jul 2024 21:58:48 +0200 Subject: [PATCH 061/142] add documentation for getSubcolumn and getTypeSerializationStreams --- .../functions/other-functions.md | 91 +++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 260457b3be1..40f1b82562d 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -4055,3 +4055,94 @@ Result: │ 32 │ └─────────────────────────────┘ ``` + +## getSubcolumn + +Takes a table expression or identifier and a constant string with the name of the sub-column, and returns the requested sub-column extracted from the expression. + +**Syntax** + +```sql +getSubcolumn(col_name, subcol_name) +``` + +**Arguments** + +- `col_name` — Table expression or identifier. [Expression](../syntax.md/#expressions), [Identifier](../syntax.md/#identifiers). +- `subcol_name` — The name of the sub-column. [String](../data-types/string.md). + +**Returned value** + +- Returns the extracted sub-column. + +**Example** + +Query: + +```sql +CREATE TABLE t_arr (arr Array(Tuple(subcolumn1 UInt32, subcolumn2 String))) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO t_arr VALUES ([(1, 'Hello'), (2, 'World')]), ([(3, 'This'), (4, 'is'), (5, 'subcolumn')]); +SELECT getSubcolumn(arr, 'subcolumn1'), getSubcolumn(arr, 'subcolumn2') FROM t_arr; +``` + +Result: + +```response + ┌─getSubcolumn(arr, 'subcolumn1')─┬─getSubcolumn(arr, 'subcolumn2')─┐ +1. │ [1,2] │ ['Hello','World'] │ +2.
│ [3,4,5] │ ['This','is','subcolumn'] │ + └─────────────────────────────────┴─────────────────────────────────┘ +``` + +## getTypeSerializationStreams + +Enumerates stream paths of a data type. + +:::note +This function is intended for use by developers. +::: + +**Syntax** + +```sql +getTypeSerializationStreams(col) +``` + +**Arguments** + +- `col` — Column or string representation of a data-type from which the data type will be detected. + +**Returned value** + +- Returns an array with all the serialization sub-stream paths.[Array](../data-types/array.md)([String](../data-types/string.md)). + +**Examples** + +Query: + +```sql +SELECT getTypeSerializationStreams(tuple('a', 1, 'b', 2)); +``` + +Result: + +```response + ┌─getTypeSerializationStreams(('a', 1, 'b', 2))─────────────────────────────────────────────────────────────────────────┐ +1. │ ['{TupleElement(1), Regular}','{TupleElement(2), Regular}','{TupleElement(3), Regular}','{TupleElement(4), Regular}'] │ + └───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + +Query: + +```sql +SELECT getTypeSerializationStreams('Map(String, Int64)'); +``` + +Result: + +```response + ┌─getTypeSerializationStreams('Map(String, Int64)')────────────────────────────────────────────────────────────────┐ +1. │ ['{ArraySizes}','{ArrayElements, TupleElement(keys), Regular}','{ArrayElements, TupleElement(values), Regular}'] │ + └──────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ +``` + From 63936364b1abf345349403e656c4cf58c44715bc Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 17 Jul 2024 16:53:35 +0200 Subject: [PATCH 062/142] fixes of tests --- src/Storages/VirtualColumnUtils.cpp | 2 +- .../0_stateless/03203_hive_style_partitioning.sh | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 31cee485dde..87c1aecc3a7 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -127,7 +127,7 @@ NameSet getVirtualNamesForFileLikeStorage() std::unordered_map parseHivePartitioningKeysAndValues(const String & path, const ColumnsDescription & storage_columns) { - std::string pattern = "/([^/]+)=([^/]+)"; + std::string pattern = "([^/]+)=([^/]+)/"; re2::StringPiece input_piece(path); std::unordered_map key_values; diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.sh b/tests/queries/0_stateless/03203_hive_style_partitioning.sh index 0f687d532b0..db1f073d736 100755 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.sh +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.sh @@ -124,13 +124,13 @@ $CLICKHOUSE_LOCAL -q "SELECT 'TESTING THE S3CLUSTER PARTITIONING'" $CLICKHOUSE_CLIENT -n -q """ set use_hive_partitioning = 1; -SELECT *, _column0 FROM s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; +SELECT *, _column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') LIMIT 10; -SELECT *, _column0 FROM s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0; +SELECT *, _column0 FROM 
s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/sample.parquet') WHERE column0 = _column0; -SELECT *, _column0, _column1 FROM s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; -SELECT *, _column0 FROM s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0, _column1 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; +SELECT *, _column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column1 = _column1; -SELECT *, _column0, _column1 FROM s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; -SELECT *, _column0 FROM s3Cluster('test_cluster_two_shards_localhost', 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0, _column1 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; +SELECT *, _column0 FROM s3Cluster(test_cluster_one_shard_three_replicas_localhost, 'http://localhost:11111/test/hive_partitioning/column0=Elizabeth/column1=Schmidt/sample.parquet') WHERE column0 = _column0 AND column1 = _column1; """ From 938071cd55913c3bb2b8781750ef37bf6307acab Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Wed, 17 Jul 2024 14:56:00 +0000 Subject: [PATCH 063/142] add ci_include_fuzzer to PR body template --- .github/PULL_REQUEST_TEMPLATE.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index e045170561d..146542e980c 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -59,6 +59,8 @@ At a minimum, the following information should be added (but add more as needed) - [ ] Exclude: All with TSAN, MSAN, UBSAN, Coverage - [ ] Exclude: All with aarch64, release, debug --- +- [ ] Run only libFuzzer related jobs +--- - [ ] Do not test - [ ] Woolen Wolfdog - [ ] Upload binaries for special builds From bb01920370e1dd5faa7b17694f74175190537445 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Wed, 17 Jul 2024 16:19:02 +0000 Subject: [PATCH 064/142] add ci_exclude_ast to PR body template --- .github/PULL_REQUEST_TEMPLATE.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 146542e980c..8b6e957e1d8 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -59,7 +59,8 @@ At a minimum, the following information should be added (but add more as needed) - [ ] Exclude: All with TSAN, MSAN, UBSAN, Coverage - [ ] Exclude: All with aarch64, release, debug --- -- [ ] Run only libFuzzer related jobs +- [ ] Run only fuzzers related jobs (libFuzzer fuzzers, AST fuzzers, etc.) 
+- [ ] Exclude AST fuzzers --- - [ ] Do not test - [ ] Woolen Wolfdog From d5065a43ae4ae5ba0f068e3fdf5952dd5319f561 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Mon, 29 Jul 2024 17:55:01 +0200 Subject: [PATCH 065/142] Update StorageObjectStorage.cpp --- src/Storages/ObjectStorage/StorageObjectStorage.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index ca0ced8dcd3..d9c82d68791 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -39,11 +39,16 @@ String StorageObjectStorage::getPathSample(StorageInMemoryMetadata metadata, Con auto query_settings = configuration->getQuerySettings(context); /// We don't want to throw an exception if there are no files with specified path. query_settings.throw_on_zero_files_match = false; + + bool local_distributed_processing = distributed_processing; + if (context->getSettingsRef().use_hive_partitioning) + local_distributed_processing = false; + auto file_iterator = StorageObjectStorageSource::createFileIterator( configuration, query_settings, object_storage, - distributed_processing, + local_distributed_processing, context, {}, // predicate metadata.getColumns().getAll(), // virtual_columns From f9a5210bacc418e354ddcf8893fa8c5a291b46d4 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Mon, 29 Jul 2024 19:36:31 +0200 Subject: [PATCH 066/142] solve Alexey's review --- src/Storages/VirtualColumnUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 87c1aecc3a7..257a77547c0 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -127,7 +127,7 @@ NameSet getVirtualNamesForFileLikeStorage() std::unordered_map parseHivePartitioningKeysAndValues(const String & path, const ColumnsDescription & storage_columns) { - std::string pattern = "([^/]+)=([^/]+)/"; + std::string pattern = "([^/])=([^/]+)/"; re2::StringPiece input_piece(path); std::unordered_map key_values; From 0c9fa155d4993220c00e4b41c0354b20d3312f33 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 30 Jul 2024 11:58:35 +0200 Subject: [PATCH 067/142] revert last commit --- src/Storages/VirtualColumnUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 4edab01925d..f16eff7edb6 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -131,7 +131,7 @@ NameSet getVirtualNamesForFileLikeStorage() std::unordered_map parseHivePartitioningKeysAndValues(const String & path, const ColumnsDescription & storage_columns) { - std::string pattern = "([^/])=([^/]+)/"; + std::string pattern = "([^/]+)=([^/]+)/"; re2::StringPiece input_piece(path); std::unordered_map key_values; From 9d0608ce001b35e17ff81c00d4965dbe4938b56b Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 30 Jul 2024 16:55:37 +0200 Subject: [PATCH 068/142] Update Runner.cpp --- utils/keeper-bench/Runner.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/keeper-bench/Runner.cpp b/utils/keeper-bench/Runner.cpp index 
d99f2645a31..59761d827e1 100644 --- a/utils/keeper-bench/Runner.cpp +++ b/utils/keeper-bench/Runner.cpp @@ -545,8 +545,7 @@ struct ZooKeeperRequestFromLogReader file_read_buf = DB::wrapReadBufferWithCompressionMethod(std::move(file_read_buf), compression_method); DB::SingleReadBufferIterator read_buffer_iterator(std::move(file_read_buf)); - std::string sample_path; - auto [columns_description, format] = DB::detectFormatAndReadSchema(format_settings, read_buffer_iterator, sample_path, context); + auto [columns_description, format] = DB::detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); DB::ColumnsWithTypeAndName columns; columns.reserve(columns_description.size()); From 08ecf6c6642f7fba6f5503f5121c832b982de945 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Tue, 30 Jul 2024 20:07:37 +0000 Subject: [PATCH 069/142] Reset formatter after buffer is restarted The formatter can use the buffer to initialize some of its members. If the buffer is restarted after those members are initialized, then the buffer restart might invalidate pointers/references/iterators held into the buffer. --- src/Storages/MessageQueueSink.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MessageQueueSink.cpp b/src/Storages/MessageQueueSink.cpp index 36899011e33..d4dabd60ef8 100644 --- a/src/Storages/MessageQueueSink.cpp +++ b/src/Storages/MessageQueueSink.cpp @@ -60,12 +60,12 @@ void MessageQueueSink::consume(Chunk & chunk) row_format->writeRow(columns, row); } row_format->finalize(); - row_format->resetFormatter(); producer->produce(buffer->str(), i, columns, row - 1); /// Reallocate buffer if it's capacity is large then DBMS_DEFAULT_BUFFER_SIZE, /// because most likely in this case we serialized abnormally large row /// and won't need this large allocated buffer anymore.
buffer->restart(DBMS_DEFAULT_BUFFER_SIZE); + row_format->resetFormatter(); } } else @@ -73,8 +73,8 @@ void MessageQueueSink::consume(Chunk & chunk) format->write(getHeader().cloneWithColumns(chunk.detachColumns())); format->finalize(); producer->produce(buffer->str(), chunk.getNumRows(), columns, chunk.getNumRows() - 1); - format->resetFormatter(); buffer->restart(); + format->resetFormatter(); } } From 9fb610ae10da22f521a1ab2e4442c78766d5be37 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 31 Jul 2024 11:52:36 +0200 Subject: [PATCH 070/142] fix tests --- .../queries/0_stateless/03203_hive_style_partitioning.reference | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference index 430a3582f65..a4a2e48e046 100644 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.reference +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference @@ -118,4 +118,3 @@ Eva Schmidt Elizabeth Schmidt Samuel Schmidt Elizabeth Schmidt Eva Schmidt Elizabeth Samuel Schmidt Elizabeth -Elizabeth Gordon Elizabeth Gordon From e7fc206069796f662bb31aab671cbf75b95984ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Wed, 31 Jul 2024 17:20:01 +0000 Subject: [PATCH 071/142] Add test --- .../format_schemas/string_key_value.capnp | 6 + .../format_schemas/string_key_value.format | 1 + .../format_schemas/string_key_value.proto | 6 + .../clickhouse_path/format_schemas/test.capnp | 2 +- tests/integration/test_storage_kafka/test.py | 129 ++++++++++++++++++ 5 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 tests/integration/test_storage_kafka/clickhouse_path/format_schemas/string_key_value.capnp create mode 100644 tests/integration/test_storage_kafka/clickhouse_path/format_schemas/string_key_value.format create mode 100644 tests/integration/test_storage_kafka/clickhouse_path/format_schemas/string_key_value.proto diff --git a/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/string_key_value.capnp b/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/string_key_value.capnp new file mode 100644 index 00000000000..4f3eabe22f0 --- /dev/null +++ b/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/string_key_value.capnp @@ -0,0 +1,6 @@ +@0x99f75f775fe63dae; + +struct StringKeyValuePair { + key@0 : Text; + value@1 : Text; +} diff --git a/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/string_key_value.format b/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/string_key_value.format new file mode 100644 index 00000000000..83dff6ce401 --- /dev/null +++ b/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/string_key_value.format @@ -0,0 +1 @@ +(key = ${key:CSV}, value = ${value:CSV}) diff --git a/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/string_key_value.proto b/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/string_key_value.proto new file mode 100644 index 00000000000..71905c63bdf --- /dev/null +++ b/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/string_key_value.proto @@ -0,0 +1,6 @@ +syntax = "proto3"; + +message StringKeyValuePair { + string key = 1; + string value = 2; +} diff --git a/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/test.capnp 
b/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/test.capnp index 44f1961205b..247e7b9ceca 100644 --- a/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/test.capnp +++ b/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/test.capnp @@ -7,4 +7,4 @@ struct TestRecordStruct val1 @2 : Text; val2 @3 : Float32; val3 @4 : UInt8; -} \ No newline at end of file +} diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 8393e88db88..596933c1566 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -5097,6 +5097,135 @@ def test_multiple_read_in_materialized_views(kafka_cluster, max_retries=15): ) +def test_kafka_produce_http_interface_row_based_format(kafka_cluster): + # reproduction of #https://github.com/ClickHouse/ClickHouse/issues/61060 with validating the written messages + admin_client = KafkaAdminClient( + bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port) + ) + + + topic_prefix = "http_row_" + + # It is important to have: + # - long enough messages + # - enough messages + # I don't know the exact requirement for message sizes, but it doesn't reproduce with short messages + # For the number of messages it seems like at least 3 messages is necessary + expected_key = "01234567890123456789" + expected_value = "aaaaabbbbbccccc" + + insert_query_end = f"(key, value) VALUES ('{expected_key}', '{expected_value}'), ('{expected_key}', '{expected_value}'), ('{expected_key}', '{expected_value}')" + insert_query_template = "INSERT INTO {table_name} " + insert_query_end + + extra_settings = { + "Protobuf": ", kafka_schema = 'string_key_value.proto:StringKeyValuePair'", + "CapnProto": ", kafka_schema='string_key_value:StringKeyValuePair'", + "Template": ", format_template_row='string_key_value.format'" + } + + # Only the formats that can be used as both input and output format are tested + # Reasons to exclude following formats: + # - JSONStrings: not actually an input format + # - ProtobufSingle: I cannot make it work to parse the messages. Probably something is broken, + # because the producer can write multiple rows into the same message, which makes them impossible to parse properly. Should be added after #67549 is fixed.
+ # - ProtobufList: I didn't want to deal with the envelope and stuff + # - Npy: supports only single column + # - LineAsString: supports only single column + # - RawBLOB: supports only single column + formats_to_test = [ + "TabSeparated", + "TabSeparatedRaw", + "TabSeparatedWithNames", + "TabSeparatedWithNamesAndTypes", + "TabSeparatedRawWithNames", + "TabSeparatedRawWithNamesAndTypes", + "Template", + "CSV", + "CSVWithNames", + "CSVWithNamesAndTypes", + "CustomSeparated", + "CustomSeparatedWithNames", + "CustomSeparatedWithNamesAndTypes", + "Values", + "JSON", + "JSONColumns", + "JSONColumnsWithMetadata", + "JSONCompact", + "JSONCompactColumns", + "JSONEachRow", + "JSONStringsEachRow", + "JSONCompactEachRow", + "JSONCompactEachRowWithNames", + "JSONCompactEachRowWithNamesAndTypes", + "JSONCompactStringsEachRow", + "JSONCompactStringsEachRowWithNames", + "JSONCompactStringsEachRowWithNamesAndTypes", + "JSONObjectEachRow", + "BSONEachRow", + "TSKV", + "Protobuf", + "Avro", + "Parquet", + "Arrow", + "ArrowStream", + "ORC", + "RowBinary", + "RowBinaryWithNames", + "RowBinaryWithNamesAndTypes", + "Native", + "CapnProto", + "MsgPack", + ] + for format in formats_to_test: + logging.debug(f"Creating tables and writing messages to {format}") + topic = topic_prefix + format + kafka_create_topic(admin_client, topic) + + extra_setting = extra_settings.get(format, "") + + # kafka_max_rows_per_message is set to 2 to make sure every format produces at least 2 messages, thus increasing the chance of catching a bug + instance.query( + f""" + DROP TABLE IF EXISTS test.view_{topic}; + DROP TABLE IF EXISTS test.consumer_{topic}; + CREATE TABLE test.kafka_writer_{topic} (key String, value String) + ENGINE = Kafka + SETTINGS kafka_broker_list = 'kafka1:19092', + kafka_topic_list = '{topic}', + kafka_group_name = '{topic}', + kafka_format = '{format}', + kafka_max_rows_per_message = 2 {extra_setting}; + + CREATE TABLE test.kafka_{topic} (key String, value String) + ENGINE = Kafka + SETTINGS kafka_broker_list = 'kafka1:19092', + kafka_topic_list = '{topic}', + kafka_group_name = '{topic}', + kafka_format = '{format}' {extra_setting}; + + CREATE MATERIALIZED VIEW test.view_{topic} Engine=Log AS + SELECT key, value FROM test.kafka_{topic}; + """ + ) + + instance.http_query(insert_query_template.format(table_name="test.kafka_writer_"+topic), method="POST") + + expected = f"""\ +{expected_key}\t{expected_value} +{expected_key}\t{expected_value} +{expected_key}\t{expected_value} +""" + # give some times for the readers to read the messages + for format in formats_to_test: + logging.debug(f"Checking result for {format}") + topic = topic_prefix + format + + result = instance.query_with_retry(f"SELECT * FROM test.view_{topic}", check_callback=lambda res: res.count("\n") == 3) + + assert TSV(result) == TSV(expected) + + kafka_delete_topic(admin_client, topic) + if __name__ == "__main__": cluster.start() input("Cluster created, press any key to destroy...") From bada9ea9be4888ab4ef1b3f7fcdf015e11d994d1 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 31 Jul 2024 17:30:24 +0000 Subject: [PATCH 072/142] Automatic style fix --- tests/integration/test_storage_kafka/test.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 596933c1566..dd0bf1bf28f 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -5103,7 +5103,6 @@ def 
test_kafka_produce_http_interface_row_based_format(kafka_cluster): bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port) ) - topic_prefix = "http_row_" # It is important to have: @@ -5120,7 +5119,7 @@ def test_kafka_produce_http_interface_row_based_format(kafka_cluster): extra_settings = { "Protobuf": ", kafka_schema = 'string_key_value.proto:StringKeyValuePair'", "CapnProto": ", kafka_schema='string_key_value:StringKeyValuePair'", - "Template": ", format_template_row='string_key_value.format'" + "Template": ", format_template_row='string_key_value.format'", } # Only the formats that can be used both and input and output format are tested @@ -5208,7 +5207,10 @@ def test_kafka_produce_http_interface_row_based_format(kafka_cluster): """ ) - instance.http_query(insert_query_template.format(table_name="test.kafka_writer_"+topic), method="POST") + instance.http_query( + insert_query_template.format(table_name="test.kafka_writer_" + topic), + method="POST", + ) expected = f"""\ {expected_key}\t{expected_value} @@ -5220,12 +5222,16 @@ def test_kafka_produce_http_interface_row_based_format(kafka_cluster): logging.debug(f"Checking result for {format}") topic = topic_prefix + format - result = instance.query_with_retry(f"SELECT * FROM test.view_{topic}", check_callback=lambda res: res.count("\n") == 3) + result = instance.query_with_retry( + f"SELECT * FROM test.view_{topic}", + check_callback=lambda res: res.count("\n") == 3, + ) assert TSV(result) == TSV(expected) kafka_delete_topic(admin_client, topic) + if __name__ == "__main__": cluster.start() input("Cluster created, press any key to destroy...") From d2e0668d5129a4e60f462de5e5b683099f49bf4b Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 1 Aug 2024 13:51:35 +0200 Subject: [PATCH 073/142] fix settingsChangesHistory after merge with master --- src/Core/SettingsChangesHistory.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index a01c5faaf10..28a732c6177 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -81,9 +81,6 @@ static std::initializer_list Date: Thu, 1 Aug 2024 20:16:10 +0200 Subject: [PATCH 074/142] fix merge with master --- docs/en/operations/settings/settings.md | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index b880c42a45b..c621f2db5ae 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -5615,18 +5615,6 @@ Disable all insert and mutations (alter table update / alter table delete / alte Default value: `false`. -## restore_replace_external_engines_to_null - -For testing purposes. Replaces all external engines to Null to not initiate external connections. - -Default value: `False` - -## restore_replace_external_table_functions_to_null - -For testing purposes. Replaces all external table functions to Null to not initiate external connections. 
- -Default value: `False` - ## use_hive_partitioning When enabled, ClickHouse will detect Hive-style partitioning in path (`/name=value/`) in file-like table engines [File](../../engines/table-engines/special/file.md#hive-style-partitioning)/[S3](../../engines/table-engines/integrations/s3.md#hive-style-partitioning)/[URL](../../engines/table-engines/special/url.md#hive-style-partitioning)/[HDFS](../../engines/table-engines/integrations/hdfs.md#hive-style-partitioning)/[AzureBlobStorage](../../engines/table-engines/integrations/azureBlobStorage.md#hive-style-partitioning) and will allow to use partition columns as virtual columns in the query. These virtual columns will have the same names as in the partitioned path, but starting with `_`. From c50ef37a03438003c21076c5700d9c1f52c1c435 Mon Sep 17 00:00:00 2001 From: pufit Date: Fri, 2 Aug 2024 00:01:41 -0400 Subject: [PATCH 075/142] Fix inconsistent formatting for `GRANT CURRENT GRANTS` --- src/Parsers/Access/ASTGrantQuery.cpp | 21 +++------------------ 1 file changed, 3 insertions(+), 18 deletions(-) diff --git a/src/Parsers/Access/ASTGrantQuery.cpp b/src/Parsers/Access/ASTGrantQuery.cpp index f60fa7e4a23..eac88c75513 100644 --- a/src/Parsers/Access/ASTGrantQuery.cpp +++ b/src/Parsers/Access/ASTGrantQuery.cpp @@ -97,24 +97,9 @@ namespace void formatCurrentGrantsElements(const AccessRightsElements & elements, const IAST::FormatSettings & settings) { - for (size_t i = 0; i != elements.size(); ++i) - { - const auto & element = elements[i]; - - bool next_element_on_same_db_and_table = false; - if (i != elements.size() - 1) - { - const auto & next_element = elements[i + 1]; - if (element.sameDatabaseAndTableAndParameter(next_element)) - next_element_on_same_db_and_table = true; - } - - if (!next_element_on_same_db_and_table) - { - settings.ostr << " "; - formatONClause(element, settings); - } - } + settings.ostr << "("; + formatElementsWithoutOptions(elements, settings); + settings.ostr << ")"; } } From f816158dbc000c4fb02ba413f5b47349795995c4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 5 Aug 2024 09:11:55 +0000 Subject: [PATCH 076/142] Address review comments --- src/Storages/MessageQueueSink.cpp | 4 ++-- tests/integration/test_storage_kafka/test.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/MessageQueueSink.cpp b/src/Storages/MessageQueueSink.cpp index d4dabd60ef8..104e3506ade 100644 --- a/src/Storages/MessageQueueSink.cpp +++ b/src/Storages/MessageQueueSink.cpp @@ -46,6 +46,8 @@ void MessageQueueSink::consume(Chunk & chunk) if (columns.empty()) return; + /// The formatter might hold pointers to buffer (e.g. if PeekableWriteBuffer is used), which means the formatter + /// needs to be reset after buffer might reallocate its memory. In this exact case after restarting the buffer. 
if (row_format) { size_t row = 0; @@ -77,6 +79,4 @@ void MessageQueueSink::consume(Chunk & chunk) format->resetFormatter(); } } - - } diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index dd0bf1bf28f..8793ab72a16 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -5098,7 +5098,7 @@ def test_multiple_read_in_materialized_views(kafka_cluster, max_retries=15): def test_kafka_produce_http_interface_row_based_format(kafka_cluster): - # reproduction of #https://github.com/ClickHouse/ClickHouse/issues/61060 with validating the written messages + # reproduction of #61060 with validating the written messages admin_client = KafkaAdminClient( bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port) ) From fd1e354e8503a968eff9dbe614a47c1135f67bdd Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Mon, 5 Aug 2024 13:43:06 +0200 Subject: [PATCH 077/142] fix flaky check for integration tests --- .../test_storage_azure_blob_storage/test.py | 82 +++++++++++++------ .../test_cluster.py | 18 ++-- tests/integration/test_storage_hdfs/test.py | 47 +++++++---- 3 files changed, 94 insertions(+), 53 deletions(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 6966abfee4f..15a1f6db2c1 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -135,6 +135,7 @@ def test_create_table_connection_string(cluster): Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_create_connection_string', 'CSV') """, ) + azure_query(node, "DROP TABLE IF EXISTS test_create_table_conn_string") def test_create_table_account_string(cluster): @@ -144,6 +145,7 @@ def test_create_table_account_string(cluster): f"CREATE TABLE test_create_table_account_url (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," f"'cont', 'test_create_connection_string', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV')", ) + azure_query(node, "DROP TABLE IF EXISTS test_create_table_account_url") def test_simple_write_account_string(cluster): @@ -157,6 +159,7 @@ def test_simple_write_account_string(cluster): azure_query(node, "INSERT INTO test_simple_write VALUES (1, 'a')") print(get_azure_file_content("test_simple_write.csv", port)) assert get_azure_file_content("test_simple_write.csv", port) == '1,"a"\n' + azure_query(node, "DROP TABLE test_simple_write") def test_simple_write_connection_string(cluster): @@ -170,6 +173,7 @@ def test_simple_write_connection_string(cluster): azure_query(node, "INSERT INTO test_simple_write_connection_string VALUES (1, 'a')") print(get_azure_file_content("test_simple_write_c.csv", port)) assert get_azure_file_content("test_simple_write_c.csv", port) == '1,"a"\n' + azure_query(node, "DROP TABLE test_simple_write_connection_string") def test_simple_write_named_collection_1(cluster): @@ -185,7 +189,7 @@ def test_simple_write_named_collection_1(cluster): ) print(get_azure_file_content("test_simple_write_named.csv", port)) assert get_azure_file_content("test_simple_write_named.csv", port) == '1,"a"\n' - azure_query(node, "TRUNCATE TABLE test_simple_write_named_collection_1") + azure_query(node, "DROP TABLE 
test_simple_write_named_collection_1") def test_simple_write_named_collection_2(cluster): @@ -202,6 +206,7 @@ def test_simple_write_named_collection_2(cluster): ) print(get_azure_file_content("test_simple_write_named_2.csv", port)) assert get_azure_file_content("test_simple_write_named_2.csv", port) == '1,"a"\n' + azure_query(node, "DROP TABLE test_simple_write_named_collection_2") def test_partition_by(cluster): @@ -223,6 +228,7 @@ def test_partition_by(cluster): assert "1,2,3\n" == get_azure_file_content("test_3.csv", port) assert "3,2,1\n" == get_azure_file_content("test_1.csv", port) assert "78,43,45\n" == get_azure_file_content("test_45.csv", port) + azure_query(node, "DROP TABLE test_partitioned_write") def test_partition_by_string_column(cluster): @@ -243,6 +249,7 @@ def test_partition_by_string_column(cluster): assert '1,"foo/bar"\n' == get_azure_file_content("test_foo/bar.csv", port) assert '3,"йцук"\n' == get_azure_file_content("test_йцук.csv", port) assert '78,"你好"\n' == get_azure_file_content("test_你好.csv", port) + azure_query(node, "DROP TABLE test_partitioned_string_write") def test_partition_by_const_column(cluster): @@ -261,6 +268,7 @@ def test_partition_by_const_column(cluster): ) azure_query(node, f"INSERT INTO test_partitioned_const_write VALUES {values}") assert values_csv == get_azure_file_content("test_88.csv", port) + azure_query(node, "DROP TABLE test_partitioned_const_write") def test_truncate(cluster): @@ -276,6 +284,7 @@ def test_truncate(cluster): azure_query(node, "TRUNCATE TABLE test_truncate") with pytest.raises(Exception): print(get_azure_file_content("test_truncate.csv", port)) + azure_query(node, "DROP TABLE test_truncate") def test_simple_read_write(cluster): @@ -292,6 +301,7 @@ def test_simple_read_write(cluster): assert get_azure_file_content("test_simple_read_write.csv", port) == '1,"a"\n' print(azure_query(node, "SELECT * FROM test_simple_read_write")) assert azure_query(node, "SELECT * FROM test_simple_read_write") == "1\ta\n" + azure_query(node, "DROP TABLE test_simple_read_write") def test_create_new_files_on_insert(cluster): @@ -344,6 +354,7 @@ def test_overwrite(cluster): result = azure_query(node, f"select count() from test_overwrite") assert int(result) == 200 + azure_query(node, f"DROP TABLE test_overwrite") def test_insert_with_path_with_globs(cluster): @@ -356,6 +367,7 @@ def test_insert_with_path_with_globs(cluster): node.query_and_get_error( f"insert into table function test_insert_globs SELECT number, randomString(100) FROM numbers(500)" ) + azure_query(node, f"DROP TABLE test_insert_globs") def test_put_get_with_globs(cluster): @@ -364,6 +376,7 @@ def test_put_get_with_globs(cluster): node = cluster.instances["node"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" max_path = "" + used_names = [] for i in range(10): for j in range(10): path = "{}/{}_{}/{}.csv".format( @@ -372,6 +385,8 @@ def test_put_get_with_globs(cluster): max_path = max(path, max_path) values = f"({i},{j},{i + j})" + used_names.append(f"test_put_{i}_{j}") + azure_query( node, f"CREATE TABLE test_put_{i}_{j} ({table_format}) Engine = AzureBlobStorage(azure_conf2, " @@ -392,6 +407,9 @@ def test_put_get_with_globs(cluster): bucket="cont", max_path=max_path ) ] + azure_query(node, "DROP TABLE test_glob_select") + for name in used_names: + azure_query(node, f"DROP TABLE {name}") def test_azure_glob_scheherazade(cluster): @@ -400,12 +418,14 @@ def test_azure_glob_scheherazade(cluster): values = "(1, 1, 1)" nights_per_job = 1001 // 
30 jobs = [] + used_names = [] for night in range(0, 1001, nights_per_job): def add_tales(start, end): for i in range(start, end): path = "night_{}/tale.csv".format(i) unique_num = random.randint(1, 10000) + used_names.append(f"test_scheherazade_{i}_{unique_num}") azure_query( node, f"CREATE TABLE test_scheherazade_{i}_{unique_num} ({table_format}) Engine = AzureBlobStorage(azure_conf2, " @@ -433,6 +453,9 @@ def test_azure_glob_scheherazade(cluster): ) query = "select count(), sum(column1), sum(column2), sum(column3) from test_glob_select_scheherazade" assert azure_query(node, query).splitlines() == ["1001\t1001\t1001\t1001"] + azure_query(node, "DROP TABLE test_glob_select_scheherazade") + for name in used_names: + azure_query(node, f"DROP TABLE {name}") @pytest.mark.parametrize( @@ -506,6 +529,8 @@ def test_schema_inference_no_globs(cluster): assert azure_query(node, query).splitlines() == [ "499500\t2890\t332833500\ttest_schema_inference_no_globs.csv\tcont/test_schema_inference_no_globs.csv" ] + azure_query(node, f"DROP TABLE test_schema_inference_src") + azure_query(node, f"DROP TABLE test_select_inference") def test_schema_inference_from_globs(cluster): @@ -514,6 +539,7 @@ def test_schema_inference_from_globs(cluster): node = cluster.instances["node"] # type: ClickHouseInstance table_format = "column1 UInt32, column2 UInt32, column3 UInt32" max_path = "" + used_names = [] for i in range(10): for j in range(10): path = "{}/{}_{}/{}.csv".format( @@ -521,6 +547,7 @@ def test_schema_inference_from_globs(cluster): ) max_path = max(path, max_path) values = f"({i},{j},{i + j})" + used_names.append(f"test_schema_{i}_{j}") azure_query( node, @@ -546,6 +573,9 @@ def test_schema_inference_from_globs(cluster): bucket="cont", max_path=max_path ) ] + azure_query(node, "DROP TABLE test_glob_select_inference") + for name in used_names: + azure_query(node, f"DROP TABLE {name}") def test_simple_write_account_string_table_function(cluster): @@ -595,7 +625,7 @@ def test_simple_write_named_collection_1_table_function(cluster): azure_query( node, - "TRUNCATE TABLE drop_table", + "DROP TABLE drop_table", ) @@ -605,7 +635,7 @@ def test_simple_write_named_collection_2_table_function(cluster): azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='test_simple_write_named_2_tf.csv', format='CSV', structure='key UInt64, data String') VALUES (1, 'a')", + f" container='cont', blob_path='test_simple_write_named_2_tf.csv', format='CSV', structure='key UInt64, data String') VALUES (1, 'a') SETTINGS azure_truncate_on_insert=1", ) print(get_azure_file_content("test_simple_write_named_2_tf.csv", port)) assert get_azure_file_content("test_simple_write_named_2_tf.csv", port) == '1,"a"\n' @@ -628,7 +658,7 @@ def test_put_get_with_globs_tf(cluster): azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}", + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values} SETTINGS azure_truncate_on_insert=1", ) query = ( f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azureBlobStorage(azure_conf2, " @@ -649,7 +679,7 @@ def test_schema_inference_no_globs_tf(cluster): query = ( f"insert 
into table function azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', " f"container='cont', blob_path='test_schema_inference_no_globs_tf.csv', format='CSVWithNames', structure='{table_format}') " - f"SELECT number, toString(number), number * number FROM numbers(1000)" + f"SELECT number, toString(number), number * number FROM numbers(1000) SETTINGS azure_truncate_on_insert=1" ) azure_query(node, query) @@ -680,7 +710,7 @@ def test_schema_inference_from_globs_tf(cluster): query = ( f"insert into table function azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', " - f"container='cont', blob_path='{path}', format='CSVWithNames', structure='{table_format}') VALUES {values}" + f"container='cont', blob_path='{path}', format='CSVWithNames', structure='{table_format}') VALUES {values} SETTINGS azure_truncate_on_insert=1" ) azure_query(node, query) @@ -708,7 +738,7 @@ def test_partition_by_tf(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', " f"'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', " - f"'CSV', 'auto', '{table_format}') PARTITION BY {partition_by} VALUES {values}", + f"'CSV', 'auto', '{table_format}') PARTITION BY {partition_by} VALUES {values} SETTINGS azure_truncate_on_insert=1", ) assert "1,2,3\n" == get_azure_file_content("test_partition_tf_3.csv", port) @@ -727,7 +757,7 @@ def test_filter_using_file(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', 'cont', '{filename}', " f"'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', " - f"'{table_format}') PARTITION BY {partition_by} VALUES {values}", + f"'{table_format}') PARTITION BY {partition_by} VALUES {values} SETTINGS azure_truncate_on_insert=1", ) query = ( @@ -745,7 +775,7 @@ def test_read_subcolumns(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumns.tsv', " f"'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto'," - f" 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)", + f" 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) SETTINGS azure_truncate_on_insert=1", ) azure_query( @@ -795,7 +825,7 @@ def test_read_subcolumn_time(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumn_time.tsv', " f"'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto'," - f" 'a UInt32') select (42)", + f" 'a UInt32') select (42) SETTINGS azure_truncate_on_insert=1", ) res = node.query( @@ -825,7 +855,7 @@ def test_function_signatures(cluster): account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" azure_query( node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_signature.csv', '{account_name}', '{account_key}', 'CSV', 'auto', 'column1 UInt32') VALUES (1),(2),(3)", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_signature.csv', '{account_name}', '{account_key}', 'CSV', 'auto', 'column1 UInt32') VALUES (1),(2),(3) SETTINGS 
azure_truncate_on_insert=1", ) # " - connection_string, container_name, blobpath\n" @@ -939,12 +969,12 @@ def test_union_schema_inference_mode(cluster): account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" azure_query( node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference1.jsonl', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'a UInt32') VALUES (1)", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference1.jsonl', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'a UInt32') VALUES (1) SETTINGS azure_truncate_on_insert=1", ) azure_query( node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference2.jsonl', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'b UInt32') VALUES (2)", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference2.jsonl', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'b UInt32') VALUES (2) SETTINGS azure_truncate_on_insert=1", ) node.query("system drop schema cache for azure") @@ -981,7 +1011,7 @@ def test_union_schema_inference_mode(cluster): assert result == "a\tNullable(Int64)\n" "b\tNullable(Int64)\n" azure_query( node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference3.jsonl', '{account_name}', '{account_key}', 'CSV', 'auto', 's String') VALUES ('Error')", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference3.jsonl', '{account_name}', '{account_key}', 'CSV', 'auto', 's String') VALUES ('Error') SETTINGS azure_truncate_on_insert=1", ) error = azure_query( @@ -1003,7 +1033,7 @@ def test_schema_inference_cache(cluster): azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_cache0.jsonl', '{account_name}', '{account_key}') " - f"select * from numbers(100)", + f"select * from numbers(100) SETTINGS azure_truncate_on_insert=1", ) time.sleep(1) @@ -1210,19 +1240,19 @@ def test_filtering_by_file_or_path(cluster): azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}','cont', 'test_filter1.tsv', 'devstoreaccount1', " - f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 1", + f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 1 SETTINGS azure_truncate_on_insert=1", ) azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}','cont', 'test_filter2.tsv', 'devstoreaccount1', " - f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 2", + f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 2 SETTINGS azure_truncate_on_insert=1", ) azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_filter3.tsv', 'devstoreaccount1', " - f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 3", + f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 3 
SETTINGS azure_truncate_on_insert=1", ) node.query( @@ -1246,19 +1276,19 @@ def test_size_virtual_column(cluster): azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}','cont', 'test_size_virtual_column1.tsv', 'devstoreaccount1', " - f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 1", + f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 1 SETTINGS azure_truncate_on_insert=1", ) azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}','cont', 'test_size_virtual_column2.tsv', 'devstoreaccount1', " - f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 11", + f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 11 SETTINGS azure_truncate_on_insert=1", ) azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_size_virtual_column3.tsv', 'devstoreaccount1', " - f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 111", + f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto', 'x UInt64') select 111 SETTINGS azure_truncate_on_insert=1", ) result = azure_query( @@ -1281,7 +1311,7 @@ def test_format_detection(cluster): account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" azure_query( node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection0', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt64, y String') select number as x, 'str_' || toString(number) from numbers(0)", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection0', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt64, y String') select number as x, 'str_' || toString(number) from numbers(0) SETTINGS azure_truncate_on_insert=1", ) azure_query( @@ -1351,7 +1381,7 @@ def test_write_to_globbed_partitioned_path(cluster): account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" error = azure_query( node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_data_*_{{_partition_id}}', '{account_name}', '{account_key}', 'CSV', 'auto', 'x UInt64') partition by 42 select 42", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_data_*_{{_partition_id}}', '{account_name}', '{account_key}', 'CSV', 'auto', 'x UInt64') partition by 42 select 42 SETTINGS azure_truncate_on_insert=1", expect_error="true", ) @@ -1475,7 +1505,7 @@ def test_hive_partitioning_with_one_parameter(cluster): azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}", + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values} SETTINGS azure_truncate_on_insert=1", ) query = ( @@ -1512,7 +1542,7 @@ def test_hive_partitioning_with_two_parameters(cluster): 
azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}", + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2} SETTINGS azure_truncate_on_insert=1", ) query = ( @@ -1558,7 +1588,7 @@ def test_hive_partitioning_without_setting(cluster): azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}", + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2} SETTINGS azure_truncate_on_insert=1", ) query = ( diff --git a/tests/integration/test_storage_azure_blob_storage/test_cluster.py b/tests/integration/test_storage_azure_blob_storage/test_cluster.py index 6c5e2d20ca5..4d63016cf9a 100644 --- a/tests/integration/test_storage_azure_blob_storage/test_cluster.py +++ b/tests/integration/test_storage_azure_blob_storage/test_cluster.py @@ -71,7 +71,7 @@ def test_select_all(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_cluster_select_all.csv', 'devstoreaccount1'," f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') " - f"VALUES (1, 'a'), (2, 'b')", + f"VALUES (1, 'a'), (2, 'b') SETTINGS azure_truncate_on_insert=1", ) print(get_azure_file_content("test_cluster_select_all.csv", port)) @@ -100,7 +100,7 @@ def test_count(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_cluster_count.csv', 'devstoreaccount1', " f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', " - f"'auto', 'key UInt64') VALUES (1), (2)", + f"'auto', 'key UInt64') VALUES (1), (2) SETTINGS azure_truncate_on_insert=1", ) print(get_azure_file_content("test_cluster_count.csv", port)) @@ -128,7 +128,7 @@ def test_union_all(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_parquet_union_all', 'devstoreaccount1', " f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'Parquet', " - f"'auto', 'a Int32, b String') VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')", + f"'auto', 'a Int32, b String') VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd') SETTINGS azure_truncate_on_insert=1", ) pure_azure = azure_query( @@ -179,7 +179,7 @@ def test_skip_unavailable_shards(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_skip_unavailable.csv', 'devstoreaccount1', " f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', " - f"'auto', 'a UInt64') VALUES (1), (2)", + f"'auto', 'a UInt64') VALUES (1), (2) SETTINGS azure_truncate_on_insert=1", ) result = azure_query( node, @@ -199,7 +199,7 @@ def test_unset_skip_unavailable_shards(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_unset_skip_unavailable.csv', 'devstoreaccount1', " 
f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', " - f"'auto', 'a UInt64') VALUES (1), (2)", + f"'auto', 'a UInt64') VALUES (1), (2) SETTINGS azure_truncate_on_insert=1", ) result = azure_query( node, @@ -217,7 +217,7 @@ def test_cluster_with_named_collection(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_cluster_with_named_collection.csv', 'devstoreaccount1', " f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', " - f"'auto', 'a UInt64') VALUES (1), (2)", + f"'auto', 'a UInt64') VALUES (1), (2) SETTINGS azure_truncate_on_insert=1", ) pure_azure = azure_query( @@ -248,7 +248,7 @@ def test_partition_parallel_reading_with_cluster(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', '{filename}', 'devstoreaccount1', " f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', '{table_format}') " - f"PARTITION BY {partition_by} VALUES {values}", + f"PARTITION BY {partition_by} VALUES {values} SETTINGS azure_truncate_on_insert=1", ) assert "1,2,3\n" == get_azure_file_content("test_tf_3.csv", port) @@ -272,12 +272,12 @@ def test_format_detection(cluster): azure_query( node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection0', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt32, y String') select number as x, 'str_' || toString(number) from numbers(10)", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection0', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt32, y String') select number as x, 'str_' || toString(number) from numbers(10) SETTINGS azure_truncate_on_insert=1", ) azure_query( node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt32, y String') select number as x, 'str_' || toString(number) from numbers(10, 10)", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt32, y String') select number as x, 'str_' || toString(number) from numbers(10, 10) SETTINGS azure_truncate_on_insert=1", ) expected_desc_result = azure_query( diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 79914855782..ca072f59e4b 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -41,6 +41,7 @@ def test_read_write_storage(started_cluster): node1.query("insert into SimpleHDFSStorage values (1, 'Mark', 72.53)") assert hdfs_api.read_data("/simple_storage") == "1\tMark\t72.53\n" assert node1.query("select * from SimpleHDFSStorage") == "1\tMark\t72.53\n" + node1.query("drop table if exists SimpleHDFSStorage") def test_read_write_storage_with_globs(started_cluster): @@ -94,6 +95,11 @@ def test_read_write_storage_with_globs(started_cluster): print(ex) assert "in readonly mode" in str(ex) + node1.query("DROP TABLE HDFSStorageWithRange") + node1.query("DROP TABLE HDFSStorageWithEnum") + node1.query("DROP TABLE HDFSStorageWithQuestionMark") + node1.query("DROP TABLE HDFSStorageWithAsterisk") + def test_storage_with_multidirectory_glob(started_cluster): hdfs_api = started_cluster.hdfs_api @@ -335,6 
+341,7 @@ def test_virtual_columns(started_cluster): ) == expected ) + node1.query("DROP TABLE virual_cols") def test_read_files_with_spaces(started_cluster): @@ -356,6 +363,7 @@ def test_read_files_with_spaces(started_cluster): ) assert node1.query("select * from test order by id") == "1\n2\n3\n" fs.delete(dir, recursive=True) + node1.query("DROP TABLE test") def test_truncate_table(started_cluster): @@ -427,7 +435,7 @@ def test_seekable_formats(started_cluster): f"hdfs('hdfs://hdfs1:9000/parquet', 'Parquet', 'a Int32, b String')" ) node1.query( - f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)" + f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000) SETTINGS hdfs_truncate_on_insert=1" ) result = node1.query(f"SELECT count() FROM {table_function}") @@ -435,7 +443,7 @@ def test_seekable_formats(started_cluster): table_function = f"hdfs('hdfs://hdfs1:9000/orc', 'ORC', 'a Int32, b String')" node1.query( - f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)" + f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000) SETTINGS hdfs_truncate_on_insert=1" ) result = node1.query(f"SELECT count() FROM {table_function}") assert int(result) == 5000000 @@ -459,7 +467,7 @@ def test_read_table_with_default(started_cluster): def test_schema_inference(started_cluster): node1.query( - f"insert into table function hdfs('hdfs://hdfs1:9000/native', 'Native', 'a Int32, b String') SELECT number, randomString(100) FROM numbers(5000000)" + f"insert into table function hdfs('hdfs://hdfs1:9000/native', 'Native', 'a Int32, b String') SELECT number, randomString(100) FROM numbers(5000000) SETTINGS hdfs_truncate_on_insert=1" ) result = node1.query(f"desc hdfs('hdfs://hdfs1:9000/native', 'Native')") @@ -512,6 +520,7 @@ def test_hdfs_directory_not_exist(started_cluster): assert "" == node1.query( "select * from HDFSStorageWithNotExistDir settings hdfs_ignore_file_doesnt_exist=1" ) + node1.query("DROP TABLE HDFSStorageWithNotExistDir") def test_overwrite(started_cluster): @@ -531,6 +540,7 @@ def test_overwrite(started_cluster): result = node1.query(f"select count() from test_overwrite") assert int(result) == 10 + node1.query(f"DROP TABLE test_overwrite") def test_multiple_inserts(started_cluster): @@ -567,6 +577,7 @@ def test_multiple_inserts(started_cluster): result = node1.query(f"select count() from test_multiple_inserts") assert int(result) == 60 + node1.query(f"DROP TABLE test_multiple_inserts") def test_format_detection(started_cluster): @@ -580,10 +591,10 @@ def test_format_detection(started_cluster): def test_schema_inference_with_globs(started_cluster): node1.query( - f"insert into table function hdfs('hdfs://hdfs1:9000/data1.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL" + f"insert into table function hdfs('hdfs://hdfs1:9000/data1.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL SETTINGS hdfs_truncate_on_insert=1" ) node1.query( - f"insert into table function hdfs('hdfs://hdfs1:9000/data2.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select 0" + f"insert into table function hdfs('hdfs://hdfs1:9000/data2.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select 0 SETTINGS hdfs_truncate_on_insert=1" ) result = node1.query( @@ -597,7 +608,7 @@ def test_schema_inference_with_globs(started_cluster): assert sorted(result.split()) == 
["0", "\\N"] node1.query( - f"insert into table function hdfs('hdfs://hdfs1:9000/data3.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL" + f"insert into table function hdfs('hdfs://hdfs1:9000/data3.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL SETTINGS hdfs_truncate_on_insert=1" ) filename = "data{1,3}.jsoncompacteachrow" @@ -609,7 +620,7 @@ def test_schema_inference_with_globs(started_cluster): assert "All attempts to extract table structure from files failed" in result node1.query( - f"insert into table function hdfs('hdfs://hdfs1:9000/data0.jsoncompacteachrow', 'TSV', 'x String') select '[123;]'" + f"insert into table function hdfs('hdfs://hdfs1:9000/data0.jsoncompacteachrow', 'TSV', 'x String') select '[123;]' SETTINGS hdfs_truncate_on_insert=1" ) result = node1.query_and_get_error( @@ -621,7 +632,7 @@ def test_schema_inference_with_globs(started_cluster): def test_insert_select_schema_inference(started_cluster): node1.query( - f"insert into table function hdfs('hdfs://hdfs1:9000/test.native.zst') select toUInt64(1) as x" + f"insert into table function hdfs('hdfs://hdfs1:9000/test.native.zst') select toUInt64(1) as x SETTINGS hdfs_truncate_on_insert=1" ) result = node1.query(f"desc hdfs('hdfs://hdfs1:9000/test.native.zst')") @@ -664,7 +675,7 @@ def test_virtual_columns_2(started_cluster): table_function = ( f"hdfs('hdfs://hdfs1:9000/parquet_2', 'Parquet', 'a Int32, b String')" ) - node1.query(f"insert into table function {table_function} SELECT 1, 'kek'") + node1.query(f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1") result = node1.query(f"SELECT _path FROM {table_function}") assert result.strip() == "parquet_2" @@ -672,7 +683,7 @@ def test_virtual_columns_2(started_cluster): table_function = ( f"hdfs('hdfs://hdfs1:9000/parquet_3', 'Parquet', 'a Int32, _path String')" ) - node1.query(f"insert into table function {table_function} SELECT 1, 'kek'") + node1.query(f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1") result = node1.query(f"SELECT _path FROM {table_function}") assert result.strip() == "kek" @@ -969,11 +980,11 @@ def test_read_subcolumns(started_cluster): node = started_cluster.instances["node1"] node.query( - f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)" + f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) SETTINGS hdfs_truncate_on_insert=1" ) node.query( - f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)" + f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) SETTINGS hdfs_truncate_on_insert=1" ) res = node.query( @@ -1019,11 +1030,11 @@ def test_union_schema_inference_mode(started_cluster): node = started_cluster.instances["node1"] node.query( - "insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference1.jsonl') select 1 as a" + "insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference1.jsonl') select 1 as a SETTINGS hdfs_truncate_on_insert=1" ) node.query( - "insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference2.jsonl') select 2 as b" + "insert into function 
hdfs('hdfs://hdfs1:9000/test_union_schema_inference2.jsonl') select 2 as b SETTINGS hdfs_truncate_on_insert=1" ) node.query("system drop schema cache for hdfs") @@ -1055,7 +1066,7 @@ def test_union_schema_inference_mode(started_cluster): ) assert result == "a\tNullable(Int64)\n" "b\tNullable(Int64)\n" node.query( - f"insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference3.jsonl', TSV) select 'Error'" + f"insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference3.jsonl', TSV) select 'Error' SETTINGS hdfs_truncate_on_insert=1" ) error = node.query_and_get_error( @@ -1068,11 +1079,11 @@ def test_format_detection(started_cluster): node = started_cluster.instances["node1"] node.query( - "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection0', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(0)" + "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection0', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(0) SETTINGS hdfs_truncate_on_insert=1" ) node.query( - "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(10)" + "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(10) SETTINGS hdfs_truncate_on_insert=1" ) expected_desc_result = node.query( @@ -1136,7 +1147,7 @@ def test_write_to_globbed_partitioned_path(started_cluster): node = started_cluster.instances["node1"] error = node.query_and_get_error( - "insert into function hdfs('hdfs://hdfs1:9000/test_data_*_{_partition_id}.csv') partition by 42 select 42" + "insert into function hdfs('hdfs://hdfs1:9000/test_data_*_{_partition_id}.csv') partition by 42 select 42 SETTINGS hdfs_truncate_on_insert=1" ) assert "DATABASE_ACCESS_DENIED" in error From d080f863ea41420ecbb1c5d65769d74e21a46aba Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Mon, 5 Aug 2024 17:07:17 +0200 Subject: [PATCH 078/142] fix black --- tests/integration/test_storage_hdfs/test.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 9ee8ac4cdfd..856715f28c8 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -675,7 +675,9 @@ def test_virtual_columns_2(started_cluster): table_function = ( f"hdfs('hdfs://hdfs1:9000/parquet_2', 'Parquet', 'a Int32, b String')" ) - node1.query(f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1") + node1.query( + f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1" + ) result = node1.query(f"SELECT _path FROM {table_function}") assert result.strip() == "parquet_2" @@ -683,7 +685,9 @@ def test_virtual_columns_2(started_cluster): table_function = ( f"hdfs('hdfs://hdfs1:9000/parquet_3', 'Parquet', 'a Int32, _path String')" ) - node1.query(f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1") + node1.query( + f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1" + ) result = node1.query(f"SELECT _path FROM {table_function}") assert result.strip() == "kek" From d8aa219783b6715f5424772bf25c092a26be5e2d Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi 
<114298166+yariks5s@users.noreply.github.com> Date: Mon, 5 Aug 2024 17:43:29 +0200 Subject: [PATCH 079/142] fix build --- src/Core/SettingsChangesHistory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 03e0f4f0dc8..2080e8fbf0d 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -77,7 +77,7 @@ static std::initializer_list Date: Tue, 6 Aug 2024 13:23:12 +0800 Subject: [PATCH 080/142] Small refactors in ORC output format --- .../Formats/Impl/ORCBlockOutputFormat.cpp | 99 +++++++------------ 1 file changed, 33 insertions(+), 66 deletions(-) diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 6f543a05fba..bd89ae0fa86 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -203,25 +204,15 @@ template void ORCBlockOutputFormat::writeNumbers( orc::ColumnVectorBatch & orc_column, const IColumn & column, - const PaddedPODArray * null_bytemap, + const PaddedPODArray * /*null_bytemap*/, ConvertFunc convert) { NumberVectorBatch & number_orc_column = dynamic_cast(orc_column); const auto & number_column = assert_cast &>(column); - number_orc_column.resize(number_column.size()); + number_orc_column.data.resize(number_column.size()); for (size_t i = 0; i != number_column.size(); ++i) - { - if (null_bytemap && (*null_bytemap)[i]) - { - number_orc_column.notNull[i] = 0; - continue; - } - - number_orc_column.notNull[i] = 1; number_orc_column.data[i] = convert(number_column.getElement(i)); - } - number_orc_column.numElements = number_column.size(); } template @@ -229,7 +220,7 @@ void ORCBlockOutputFormat::writeDecimals( orc::ColumnVectorBatch & orc_column, const IColumn & column, DataTypePtr & type, - const PaddedPODArray * null_bytemap, + const PaddedPODArray * /*null_bytemap*/, ConvertFunc convert) { DecimalVectorBatch & decimal_orc_column = dynamic_cast(orc_column); @@ -238,71 +229,49 @@ void ORCBlockOutputFormat::writeDecimals( decimal_orc_column.precision = decimal_type->getPrecision(); decimal_orc_column.scale = decimal_type->getScale(); decimal_orc_column.resize(decimal_column.size()); - for (size_t i = 0; i != decimal_column.size(); ++i) - { - if (null_bytemap && (*null_bytemap)[i]) - { - decimal_orc_column.notNull[i] = 0; - continue; - } - decimal_orc_column.notNull[i] = 1; + decimal_orc_column.values.resize(decimal_column.size()); + for (size_t i = 0; i != decimal_column.size(); ++i) decimal_orc_column.values[i] = convert(decimal_column.getElement(i).value); - } - decimal_orc_column.numElements = decimal_column.size(); } template void ORCBlockOutputFormat::writeStrings( orc::ColumnVectorBatch & orc_column, const IColumn & column, - const PaddedPODArray * null_bytemap) + const PaddedPODArray * /*null_bytemap*/) { orc::StringVectorBatch & string_orc_column = dynamic_cast(orc_column); const auto & string_column = assert_cast(column); - string_orc_column.resize(string_column.size()); + string_orc_column.data.resize(string_column.size()); + string_orc_column.length.resize(string_column.size()); for (size_t i = 0; i != string_column.size(); ++i) { - if (null_bytemap && (*null_bytemap)[i]) - { - string_orc_column.notNull[i] = 0; - continue; - } - - string_orc_column.notNull[i] = 1; const std::string_view & string = 
string_column.getDataAt(i).toView(); string_orc_column.data[i] = const_cast(string.data()); string_orc_column.length[i] = string.size(); } - string_orc_column.numElements = string_column.size(); } template void ORCBlockOutputFormat::writeDateTimes( orc::ColumnVectorBatch & orc_column, const IColumn & column, - const PaddedPODArray * null_bytemap, + const PaddedPODArray * /*null_bytemap*/, GetSecondsFunc get_seconds, GetNanosecondsFunc get_nanoseconds) { orc::TimestampVectorBatch & timestamp_orc_column = dynamic_cast(orc_column); const auto & timestamp_column = assert_cast(column); - timestamp_orc_column.resize(timestamp_column.size()); + timestamp_orc_column.data.resize(timestamp_column.size()); + timestamp_orc_column.nanoseconds.resize(timestamp_column.size()); for (size_t i = 0; i != timestamp_column.size(); ++i) { - if (null_bytemap && (*null_bytemap)[i]) - { - timestamp_orc_column.notNull[i] = 0; - continue; - } - - timestamp_orc_column.notNull[i] = 1; timestamp_orc_column.data[i] = static_cast(get_seconds(timestamp_column.getElement(i))); timestamp_orc_column.nanoseconds[i] = static_cast(get_nanoseconds(timestamp_column.getElement(i))); } - timestamp_orc_column.numElements = timestamp_column.size(); } void ORCBlockOutputFormat::writeColumn( @@ -311,9 +280,19 @@ void ORCBlockOutputFormat::writeColumn( DataTypePtr & type, const PaddedPODArray * null_bytemap) { - orc_column.notNull.resize(column.size()); + orc_column.numElements = column.size(); if (null_bytemap) - orc_column.hasNulls = true; + { + orc_column.hasNulls = !memoryIsZero(null_bytemap->data(), 0, null_bytemap->size()); + if (orc_column.hasNulls) + { + orc_column.notNull.resize(null_bytemap->size()); + for (size_t i = 0; i < null_bytemap->size(); ++i) + orc_column.notNull[i] = !(*null_bytemap)[i]; + } + } + else + orc_column.hasNulls = false; /// ORC doesn't have unsigned types, so cast everything to signed and sign-extend to Int64 to /// make the ORC library calculate min and max correctly. @@ -471,6 +450,7 @@ void ORCBlockOutputFormat::writeColumn( } case TypeIndex::Nullable: { + chassert(!null_bytemap); const auto & nullable_column = assert_cast(column); const PaddedPODArray & new_null_bytemap = assert_cast &>(*nullable_column.getNullMapColumnPtr()).getData(); auto nested_type = removeNullable(type); @@ -485,19 +465,15 @@ void ORCBlockOutputFormat::writeColumn( const ColumnArray::Offsets & offsets = list_column.getOffsets(); size_t column_size = list_column.size(); - list_orc_column.resize(column_size); + list_orc_column.offsets.resize(column_size + 1); /// The length of list i in ListVectorBatch is offsets[i+1] - offsets[i]. 
list_orc_column.offsets[0] = 0; for (size_t i = 0; i != column_size; ++i) - { list_orc_column.offsets[i + 1] = offsets[i]; - list_orc_column.notNull[i] = 1; - } orc::ColumnVectorBatch & nested_orc_column = *list_orc_column.elements; - writeColumn(nested_orc_column, list_column.getData(), nested_type, null_bytemap); - list_orc_column.numElements = column_size; + writeColumn(nested_orc_column, list_column.getData(), nested_type, nullptr); break; } case TypeIndex::Tuple: @@ -505,10 +481,8 @@ void ORCBlockOutputFormat::writeColumn( orc::StructVectorBatch & struct_orc_column = dynamic_cast(orc_column); const auto & tuple_column = assert_cast(column); auto nested_types = assert_cast(type.get())->getElements(); - for (size_t i = 0; i != tuple_column.size(); ++i) - struct_orc_column.notNull[i] = 1; for (size_t i = 0; i != tuple_column.tupleSize(); ++i) - writeColumn(*struct_orc_column.fields[i], tuple_column.getColumn(i), nested_types[i], null_bytemap); + writeColumn(*struct_orc_column.fields[i], tuple_column.getColumn(i), nested_types[i], nullptr); break; } case TypeIndex::Map: @@ -520,25 +494,21 @@ void ORCBlockOutputFormat::writeColumn( size_t column_size = list_column.size(); - map_orc_column.resize(list_column.size()); + map_orc_column.offsets.resize(column_size + 1); /// The length of list i in ListVectorBatch is offsets[i+1] - offsets[i]. map_orc_column.offsets[0] = 0; for (size_t i = 0; i != column_size; ++i) - { map_orc_column.offsets[i + 1] = offsets[i]; - map_orc_column.notNull[i] = 1; - } + const auto nested_columns = assert_cast(list_column.getDataPtr().get())->getColumns(); orc::ColumnVectorBatch & keys_orc_column = *map_orc_column.keys; auto key_type = map_type.getKeyType(); - writeColumn(keys_orc_column, *nested_columns[0], key_type, null_bytemap); + writeColumn(keys_orc_column, *nested_columns[0], key_type, nullptr); orc::ColumnVectorBatch & values_orc_column = *map_orc_column.elements; auto value_type = map_type.getValueType(); - writeColumn(values_orc_column, *nested_columns[1], value_type, null_bytemap); - - map_orc_column.numElements = column_size; + writeColumn(values_orc_column, *nested_columns[1], value_type, nullptr); break; } default: @@ -575,10 +545,7 @@ void ORCBlockOutputFormat::consume(Chunk chunk) size_t columns_num = chunk.getNumColumns(); size_t rows_num = chunk.getNumRows(); - /// getMaxColumnSize is needed to write arrays. - /// The size of the batch must be no less than total amount of array elements - /// and no less than the number of rows (ORC writes a null bit for every row). 
- std::unique_ptr batch = writer->createRowBatch(getMaxColumnSize(chunk)); + std::unique_ptr batch = writer->createRowBatch(chunk.getNumRows()); orc::StructVectorBatch & root = dynamic_cast(*batch); auto columns = chunk.detachColumns(); From aa3d8086c32ce2b5a90fbe4788579cae970ec32f Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 6 Aug 2024 12:30:39 +0200 Subject: [PATCH 081/142] fix integration tests --- .../test_storage_azure_blob_storage/test.py | 37 ++++++++++++------- .../test_cluster.py | 21 +++++++---- tests/integration/test_storage_hdfs/test.py | 10 ++--- 3 files changed, 41 insertions(+), 27 deletions(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 15a1f6db2c1..092c124855c 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -635,7 +635,8 @@ def test_simple_write_named_collection_2_table_function(cluster): azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='test_simple_write_named_2_tf.csv', format='CSV', structure='key UInt64, data String') VALUES (1, 'a') SETTINGS azure_truncate_on_insert=1", + f" container='cont', blob_path='test_simple_write_named_2_tf.csv', format='CSV', structure='key UInt64, data String') VALUES (1, 'a')", + settings={"azure_truncate_on_insert": 1}, ) print(get_azure_file_content("test_simple_write_named_2_tf.csv", port)) assert get_azure_file_content("test_simple_write_named_2_tf.csv", port) == '1,"a"\n' @@ -658,7 +659,8 @@ def test_put_get_with_globs_tf(cluster): azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values} SETTINGS azure_truncate_on_insert=1", + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}", + settings={"azure_truncate_on_insert": 1}, ) query = ( f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azureBlobStorage(azure_conf2, " @@ -710,9 +712,9 @@ def test_schema_inference_from_globs_tf(cluster): query = ( f"insert into table function azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', " - f"container='cont', blob_path='{path}', format='CSVWithNames', structure='{table_format}') VALUES {values} SETTINGS azure_truncate_on_insert=1" + f"container='cont', blob_path='{path}', format='CSVWithNames', structure='{table_format}') VALUES {values}" ) - azure_query(node, query) + azure_query(node, query, settings={"azure_truncate_on_insert": 1}) query = ( f"select sum(column1), sum(column2), sum(column3), min(_file), max(_path) from azureBlobStorage(azure_conf2, " @@ -738,7 +740,8 @@ def test_partition_by_tf(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', " f"'cont', '{filename}', 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', " - f"'CSV', 'auto', '{table_format}') PARTITION BY {partition_by} VALUES {values} SETTINGS azure_truncate_on_insert=1", + f"'CSV', 'auto', 
'{table_format}') PARTITION BY {partition_by} VALUES {values}", + settings={"azure_truncate_on_insert": 1}, ) assert "1,2,3\n" == get_azure_file_content("test_partition_tf_3.csv", port) @@ -757,7 +760,8 @@ def test_filter_using_file(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}', 'cont', '{filename}', " f"'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', " - f"'{table_format}') PARTITION BY {partition_by} VALUES {values} SETTINGS azure_truncate_on_insert=1", + f"'{table_format}') PARTITION BY {partition_by} VALUES {values}", + settings={"azure_truncate_on_insert": 1}, ) query = ( @@ -855,7 +859,8 @@ def test_function_signatures(cluster): account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" azure_query( node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_signature.csv', '{account_name}', '{account_key}', 'CSV', 'auto', 'column1 UInt32') VALUES (1),(2),(3) SETTINGS azure_truncate_on_insert=1", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_signature.csv', '{account_name}', '{account_key}', 'CSV', 'auto', 'column1 UInt32') VALUES (1),(2),(3)", + settings={"azure_truncate_on_insert": 1}, ) # " - connection_string, container_name, blobpath\n" @@ -969,12 +974,14 @@ def test_union_schema_inference_mode(cluster): account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" azure_query( node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference1.jsonl', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'a UInt32') VALUES (1) SETTINGS azure_truncate_on_insert=1", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference1.jsonl', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'a UInt32') VALUES (1)", + settings={"azure_truncate_on_insert": 1}, ) azure_query( node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference2.jsonl', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'b UInt32') VALUES (2) SETTINGS azure_truncate_on_insert=1", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference2.jsonl', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'b UInt32') VALUES (2)", + settings={"azure_truncate_on_insert": 1}, ) node.query("system drop schema cache for azure") @@ -1011,7 +1018,8 @@ def test_union_schema_inference_mode(cluster): assert result == "a\tNullable(Int64)\n" "b\tNullable(Int64)\n" azure_query( node, - f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference3.jsonl', '{account_name}', '{account_key}', 'CSV', 'auto', 's String') VALUES ('Error') SETTINGS azure_truncate_on_insert=1", + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference3.jsonl', '{account_name}', '{account_key}', 'CSV', 'auto', 's String') VALUES ('Error')", + settings={"azure_truncate_on_insert": 1}, ) error = azure_query( @@ -1505,7 +1513,8 @@ def test_hive_partitioning_with_one_parameter(cluster): azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = 
'{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values} SETTINGS azure_truncate_on_insert=1", + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values}", + settings={"azure_truncate_on_insert": 1}, ) query = ( @@ -1542,7 +1551,8 @@ def test_hive_partitioning_with_two_parameters(cluster): azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2} SETTINGS azure_truncate_on_insert=1", + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}", + settings={"azure_truncate_on_insert": 1}, ) query = ( @@ -1588,7 +1598,8 @@ def test_hive_partitioning_without_setting(cluster): azure_query( node, f"INSERT INTO TABLE FUNCTION azureBlobStorage(azure_conf2, storage_account_url = '{cluster.env_variables['AZURITE_STORAGE_ACCOUNT_URL']}'," - f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2} SETTINGS azure_truncate_on_insert=1", + f" container='cont', blob_path='{path}', format='CSV', compression='auto', structure='{table_format}') VALUES {values_1}, {values_2}", + settings={"azure_truncate_on_insert": 1}, ) query = ( diff --git a/tests/integration/test_storage_azure_blob_storage/test_cluster.py b/tests/integration/test_storage_azure_blob_storage/test_cluster.py index 4d63016cf9a..04baf007c69 100644 --- a/tests/integration/test_storage_azure_blob_storage/test_cluster.py +++ b/tests/integration/test_storage_azure_blob_storage/test_cluster.py @@ -71,7 +71,8 @@ def test_select_all(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_cluster_select_all.csv', 'devstoreaccount1'," f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', 'key UInt64, data String') " - f"VALUES (1, 'a'), (2, 'b') SETTINGS azure_truncate_on_insert=1", + f"VALUES (1, 'a'), (2, 'b')", + settings={"azure_truncate_on_insert": 1}, ) print(get_azure_file_content("test_cluster_select_all.csv", port)) @@ -100,7 +101,8 @@ def test_count(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_cluster_count.csv', 'devstoreaccount1', " f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', " - f"'auto', 'key UInt64') VALUES (1), (2) SETTINGS azure_truncate_on_insert=1", + f"'auto', 'key UInt64') VALUES (1), (2)", + settings={"azure_truncate_on_insert": 1}, ) print(get_azure_file_content("test_cluster_count.csv", port)) @@ -128,7 +130,8 @@ def test_union_all(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_parquet_union_all', 'devstoreaccount1', " f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'Parquet', " - f"'auto', 'a Int32, b String') VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd') SETTINGS azure_truncate_on_insert=1", + f"'auto', 'a Int32, b String') VALUES (1, 'a'), (2, 'b'), (3, 'c'), (4, 'd')", + settings={"azure_truncate_on_insert": 1}, ) pure_azure = azure_query( @@ -179,7 +182,8 
@@ def test_skip_unavailable_shards(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_skip_unavailable.csv', 'devstoreaccount1', " f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', " - f"'auto', 'a UInt64') VALUES (1), (2) SETTINGS azure_truncate_on_insert=1", + f"'auto', 'a UInt64') VALUES (1), (2)", + settings={"azure_truncate_on_insert": 1}, ) result = azure_query( node, @@ -199,7 +203,8 @@ def test_unset_skip_unavailable_shards(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_unset_skip_unavailable.csv', 'devstoreaccount1', " f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', " - f"'auto', 'a UInt64') VALUES (1), (2) SETTINGS azure_truncate_on_insert=1", + f"'auto', 'a UInt64') VALUES (1), (2)", + settings={"azure_truncate_on_insert": 1}, ) result = azure_query( node, @@ -217,7 +222,8 @@ def test_cluster_with_named_collection(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_cluster_with_named_collection.csv', 'devstoreaccount1', " f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', " - f"'auto', 'a UInt64') VALUES (1), (2) SETTINGS azure_truncate_on_insert=1", + f"'auto', 'a UInt64') VALUES (1), (2)", + settings={"azure_truncate_on_insert": 1}, ) pure_azure = azure_query( @@ -248,7 +254,8 @@ def test_partition_parallel_reading_with_cluster(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', '{filename}', 'devstoreaccount1', " f"'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'CSV', 'auto', '{table_format}') " - f"PARTITION BY {partition_by} VALUES {values} SETTINGS azure_truncate_on_insert=1", + f"PARTITION BY {partition_by} VALUES {values}", + settings={"azure_truncate_on_insert": 1}, ) assert "1,2,3\n" == get_azure_file_content("test_tf_3.csv", port) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 856715f28c8..3fef6bc46cf 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -341,7 +341,7 @@ def test_virtual_columns(started_cluster): ) == expected ) - node1.query("DROP TABLE virual_cols") + node1.query("DROP TABLE virtual_cols") def test_read_files_with_spaces(started_cluster): @@ -675,9 +675,7 @@ def test_virtual_columns_2(started_cluster): table_function = ( f"hdfs('hdfs://hdfs1:9000/parquet_2', 'Parquet', 'a Int32, b String')" ) - node1.query( - f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1" - ) + node1.query(f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1") result = node1.query(f"SELECT _path FROM {table_function}") assert result.strip() == "parquet_2" @@ -685,9 +683,7 @@ def test_virtual_columns_2(started_cluster): table_function = ( f"hdfs('hdfs://hdfs1:9000/parquet_3', 'Parquet', 'a Int32, _path String')" ) - node1.query( - f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1" - ) + node1.query(f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1") result = node1.query(f"SELECT _path FROM {table_function}") assert result.strip() == "kek" From 27cdbb54d73f8f4b82d63850c1e6f6fd5669646e Mon Sep 
17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 6 Aug 2024 13:10:03 +0200 Subject: [PATCH 082/142] fix black --- tests/integration/test_storage_hdfs/test.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 3fef6bc46cf..77921b885b0 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -675,7 +675,9 @@ def test_virtual_columns_2(started_cluster): table_function = ( f"hdfs('hdfs://hdfs1:9000/parquet_2', 'Parquet', 'a Int32, b String')" ) - node1.query(f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1") + node1.query( + f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1" + ) result = node1.query(f"SELECT _path FROM {table_function}") assert result.strip() == "parquet_2" @@ -683,7 +685,9 @@ def test_virtual_columns_2(started_cluster): table_function = ( f"hdfs('hdfs://hdfs1:9000/parquet_3', 'Parquet', 'a Int32, _path String')" ) - node1.query(f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1") + node1.query( + f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1" + ) result = node1.query(f"SELECT _path FROM {table_function}") assert result.strip() == "kek" From 04438784e2178820537c65b66b5a8341f3d63b8d Mon Sep 17 00:00:00 2001 From: pufit Date: Tue, 6 Aug 2024 16:45:46 -0400 Subject: [PATCH 083/142] add a stateless test for `grant current grants` --- .../03215_grant_current_grants.reference | 2 ++ .../0_stateless/03215_grant_current_grants.sh | 26 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 tests/queries/0_stateless/03215_grant_current_grants.reference create mode 100755 tests/queries/0_stateless/03215_grant_current_grants.sh diff --git a/tests/queries/0_stateless/03215_grant_current_grants.reference b/tests/queries/0_stateless/03215_grant_current_grants.reference new file mode 100644 index 00000000000..e4f6850b806 --- /dev/null +++ b/tests/queries/0_stateless/03215_grant_current_grants.reference @@ -0,0 +1,2 @@ +GRANT SELECT, CREATE TABLE, CREATE VIEW ON default.* +GRANT SELECT ON default.* diff --git a/tests/queries/0_stateless/03215_grant_current_grants.sh b/tests/queries/0_stateless/03215_grant_current_grants.sh new file mode 100755 index 00000000000..68af4a62bba --- /dev/null +++ b/tests/queries/0_stateless/03215_grant_current_grants.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +user1="user03215_1_${CLICKHOUSE_DATABASE}_$RANDOM" +user2="user03215_2_${CLICKHOUSE_DATABASE}_$RANDOM" +user3="user03215_3_${CLICKHOUSE_DATABASE}_$RANDOM" +db=${CLICKHOUSE_DATABASE} + + +${CLICKHOUSE_CLIENT} --query "CREATE USER $user1, $user2, $user3;"; +${CLICKHOUSE_CLIENT} --query "GRANT SELECT, CREATE TABLE, CREATE VIEW ON $db.* TO $user1 WITH GRANT OPTION;"; + +${CLICKHOUSE_CLIENT} --query "GRANT CURRENT GRANTS ON $db.* TO $user2" --user $user1; +${CLICKHOUSE_CLIENT} --query "GRANT CURRENT GRANTS ON $db.* TO $user3" --user $user2; + +${CLICKHOUSE_CLIENT} --query "SHOW GRANTS FOR $user2" | sed 's/ TO.*//'; +${CLICKHOUSE_CLIENT} --query "SHOW GRANTS FOR $user3" | sed 's/ TO.*//'; + +${CLICKHOUSE_CLIENT} --query "GRANT CURRENT GRANTS(SELECT ON $db.*) TO $user3" --user $user1; +${CLICKHOUSE_CLIENT} --query "SHOW GRANTS FOR $user3" | sed 's/ TO.*//'; + +${CLICKHOUSE_CLIENT} --query "DROP USER IF EXISTS $user1, $user2, $user3"; From 5ae5cd35b5b263d14bdd62aa5cbaa1e22219208a Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 6 Aug 2024 21:50:31 +0100 Subject: [PATCH 084/142] update --- base/poco/Net/include/Poco/Net/HTTPServerSession.h | 4 ++-- src/Server/HTTP/sendExceptionToHTTPClient.cpp | 2 +- .../0_stateless/00408_http_keep_alive.reference | 6 +++--- tests/queries/0_stateless/00408_http_keep_alive.sh | 7 ++++--- tests/queries/0_stateless/00501_http_head.re | 12 ++++++++++++ tests/queries/0_stateless/00501_http_head.reference | 4 ++-- tests/queries/0_stateless/00501_http_head.sh | 5 +++-- 7 files changed, 27 insertions(+), 13 deletions(-) create mode 100644 tests/queries/0_stateless/00501_http_head.re diff --git a/base/poco/Net/include/Poco/Net/HTTPServerSession.h b/base/poco/Net/include/Poco/Net/HTTPServerSession.h index 93f31012336..54e7f2c8c50 100644 --- a/base/poco/Net/include/Poco/Net/HTTPServerSession.h +++ b/base/poco/Net/include/Poco/Net/HTTPServerSession.h @@ -57,10 +57,10 @@ namespace Net /// Returns the server's address. void setKeepAliveTimeout(Poco::Timespan keepAliveTimeout); - + size_t getKeepAliveTimeout() const { return _keepAliveTimeout.totalSeconds(); } - size_t getMaxKeepAliveRequests() const { return _maxKeepAliveRequests; } + size_t getMaxKeepAliveRequests() const { return _maxKeepAliveRequests; } private: bool _firstRequest; diff --git a/src/Server/HTTP/sendExceptionToHTTPClient.cpp b/src/Server/HTTP/sendExceptionToHTTPClient.cpp index 022a763a9a2..658b7a4707a 100644 --- a/src/Server/HTTP/sendExceptionToHTTPClient.cpp +++ b/src/Server/HTTP/sendExceptionToHTTPClient.cpp @@ -29,7 +29,7 @@ void sendExceptionToHTTPClient( if (!out) { /// If nothing was sent yet. - WriteBufferFromHTTPServerResponse out_for_message{response, request.getMethod() == HTTPRequest::HTTP_HEAD, DEFAULT_HTTP_KEEP_ALIVE_TIMEOUT}; + WriteBufferFromHTTPServerResponse out_for_message{response, request.getMethod() == HTTPRequest::HTTP_HEAD}; out_for_message.writeln(exception_message); out_for_message.finalize(); diff --git a/tests/queries/0_stateless/00408_http_keep_alive.reference b/tests/queries/0_stateless/00408_http_keep_alive.reference index d5d7dacce9e..5402036bfd7 100644 --- a/tests/queries/0_stateless/00408_http_keep_alive.reference +++ b/tests/queries/0_stateless/00408_http_keep_alive.reference @@ -1,6 +1,6 @@ < Connection: Keep-Alive -< Keep-Alive: timeout=10, max=10000 +< Keep-Alive: timeout=10, max=? < Connection: Keep-Alive -< Keep-Alive: timeout=10, max=10000 +< Keep-Alive: timeout=10, max=? 
< Connection: Keep-Alive -< Keep-Alive: timeout=10, max=10000 +< Keep-Alive: timeout=10, max=? diff --git a/tests/queries/0_stateless/00408_http_keep_alive.sh b/tests/queries/0_stateless/00408_http_keep_alive.sh index 4bd0e494eb8..4a1cb4ed712 100755 --- a/tests/queries/0_stateless/00408_http_keep_alive.sh +++ b/tests/queries/0_stateless/00408_http_keep_alive.sh @@ -6,9 +6,10 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) URL="${CLICKHOUSE_PORT_HTTP_PROTO}://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT_HTTP}/" -${CLICKHOUSE_CURL} -vsS "${URL}" --data-binary @- <<< "SELECT 1" 2>&1 | perl -lnE 'print if /Keep-Alive/'; -${CLICKHOUSE_CURL} -vsS "${URL}" --data-binary @- <<< " error here " 2>&1 | perl -lnE 'print if /Keep-Alive/'; -${CLICKHOUSE_CURL} -vsS "${URL}"ping 2>&1 | perl -lnE 'print if /Keep-Alive/'; +# the sed command here replaces the real number of left requests with a question mark, because it can vary and we don't really have control over it +${CLICKHOUSE_CURL} -vsS "${URL}" --data-binary @- <<< "SELECT 1" 2>&1 | sed -r 's/(keep-alive: timeout=10, max=)[0-9]+/\1?/I' | grep -i 'keep-alive'; +${CLICKHOUSE_CURL} -vsS "${URL}" --data-binary @- <<< " error here " 2>&1 | sed -r 's/(keep-alive: timeout=10, max=)[0-9]+/\1?/I' | grep -i 'keep-alive'; +${CLICKHOUSE_CURL} -vsS "${URL}"ping 2>&1 | perl -lnE 'print if /Keep-Alive/' | sed -r 's/(keep-alive: timeout=10, max=)[0-9]+/\1?/I' | grep -i 'keep-alive'; # no keep-alive: ${CLICKHOUSE_CURL} -vsS "${URL}"404/not/found/ 2>&1 | perl -lnE 'print if /Keep-Alive/'; diff --git a/tests/queries/0_stateless/00501_http_head.re b/tests/queries/0_stateless/00501_http_head.re new file mode 100644 index 00000000000..807bcd4922e --- /dev/null +++ b/tests/queries/0_stateless/00501_http_head.re @@ -0,0 +1,12 @@ +HTTP/1.1 200 OK +Connection: Keep-Alive +Content-Type: text/tab-separated-values; charset=UTF-8 +Transfer-Encoding: chunked +Keep-Alive: timeout=10, max=? + +HTTP/1.1 200 OK +Connection: Keep-Alive +Content-Type: text/tab-separated-values; charset=UTF-8 +Transfer-Encoding: chunked +Keep-Alive: timeout=10, max=? + diff --git a/tests/queries/0_stateless/00501_http_head.reference b/tests/queries/0_stateless/00501_http_head.reference index db82132b145..807bcd4922e 100644 --- a/tests/queries/0_stateless/00501_http_head.reference +++ b/tests/queries/0_stateless/00501_http_head.reference @@ -2,11 +2,11 @@ HTTP/1.1 200 OK Connection: Keep-Alive Content-Type: text/tab-separated-values; charset=UTF-8 Transfer-Encoding: chunked -Keep-Alive: timeout=10, max=10000 +Keep-Alive: timeout=10, max=? HTTP/1.1 200 OK Connection: Keep-Alive Content-Type: text/tab-separated-values; charset=UTF-8 Transfer-Encoding: chunked -Keep-Alive: timeout=10, max=10000 +Keep-Alive: timeout=10, max=? diff --git a/tests/queries/0_stateless/00501_http_head.sh b/tests/queries/0_stateless/00501_http_head.sh index 60283f26833..30da64c31f0 100755 --- a/tests/queries/0_stateless/00501_http_head.sh +++ b/tests/queries/0_stateless/00501_http_head.sh @@ -4,8 +4,9 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh -( ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=SELECT%201"; - ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=select+*+from+system.numbers+limit+1000000" ) | grep -v "Date:" | grep -v "X-ClickHouse-Server-Display-Name:" | grep -v "X-ClickHouse-Query-Id:" | grep -v "X-ClickHouse-Format:" | grep -v "X-ClickHouse-Timezone:" +# the sed command here replaces the real number of left requests with a question mark, because it can vary and we don't really have control over it +( ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=SELECT%201" | sed -r 's/(keep-alive: timeout=10, max=)[0-9]+/\1?/I'; + ${CLICKHOUSE_CURL} -s --head "${CLICKHOUSE_URL}&query=select+*+from+system.numbers+limit+1000000" ) | sed -r 's/(keep-alive: timeout=10, max=)[0-9]+/\1?/I' | grep -v "Date:" | grep -v "X-ClickHouse-Server-Display-Name:" | grep -v "X-ClickHouse-Query-Id:" | grep -v "X-ClickHouse-Format:" | grep -v "X-ClickHouse-Timezone:" if [[ $(${CLICKHOUSE_CURL} -sS -X POST -I "${CLICKHOUSE_URL}&query=SELECT+1" | grep -c '411 Length Required') -ne 1 ]]; then echo FAIL From 1f5c4101b2d74d7ccf798621083fb536bf35de18 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 6 Aug 2024 21:54:15 +0100 Subject: [PATCH 085/142] rm redundant file --- tests/queries/0_stateless/00501_http_head.re | 12 ------------ 1 file changed, 12 deletions(-) delete mode 100644 tests/queries/0_stateless/00501_http_head.re diff --git a/tests/queries/0_stateless/00501_http_head.re b/tests/queries/0_stateless/00501_http_head.re deleted file mode 100644 index 807bcd4922e..00000000000 --- a/tests/queries/0_stateless/00501_http_head.re +++ /dev/null @@ -1,12 +0,0 @@ -HTTP/1.1 200 OK -Connection: Keep-Alive -Content-Type: text/tab-separated-values; charset=UTF-8 -Transfer-Encoding: chunked -Keep-Alive: timeout=10, max=? - -HTTP/1.1 200 OK -Connection: Keep-Alive -Content-Type: text/tab-separated-values; charset=UTF-8 -Transfer-Encoding: chunked -Keep-Alive: timeout=10, max=? 
- From 49871bacc1d56fb82b78c70dbfc92d52003e2e99 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Wed, 7 Aug 2024 12:37:39 +0100 Subject: [PATCH 086/142] fix test --- .../poco/Net/include/Poco/Net/HTTPServerSession.h | 1 - base/poco/Net/src/HTTPServerSession.cpp | 1 - tests/integration/test_server_keep_alive/test.py | 15 ++++++++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/base/poco/Net/include/Poco/Net/HTTPServerSession.h b/base/poco/Net/include/Poco/Net/HTTPServerSession.h index 54e7f2c8c50..b0659ca405c 100644 --- a/base/poco/Net/include/Poco/Net/HTTPServerSession.h +++ b/base/poco/Net/include/Poco/Net/HTTPServerSession.h @@ -66,7 +66,6 @@ namespace Net bool _firstRequest; Poco::Timespan _keepAliveTimeout; int _maxKeepAliveRequests; - HTTPServerParams::Ptr _params; }; diff --git a/base/poco/Net/src/HTTPServerSession.cpp b/base/poco/Net/src/HTTPServerSession.cpp index 3093f215952..8eec3e14872 100644 --- a/base/poco/Net/src/HTTPServerSession.cpp +++ b/base/poco/Net/src/HTTPServerSession.cpp @@ -24,7 +24,6 @@ HTTPServerSession::HTTPServerSession(const StreamSocket & socket, HTTPServerPara , _firstRequest(true) , _keepAliveTimeout(pParams->getKeepAliveTimeout()) , _maxKeepAliveRequests(pParams->getMaxKeepAliveRequests()) - , _params(pParams) { setTimeout(pParams->getTimeout()); } diff --git a/tests/integration/test_server_keep_alive/test.py b/tests/integration/test_server_keep_alive/test.py index 96f08a37adb..e550319b6df 100644 --- a/tests/integration/test_server_keep_alive/test.py +++ b/tests/integration/test_server_keep_alive/test.py @@ -1,5 +1,6 @@ import logging import pytest +import random import requests from helpers.cluster import ClickHouseCluster @@ -24,19 +25,27 @@ def test_max_keep_alive_requests_on_user_side(start_cluster): # In this test we have `keep_alive_timeout` set to one hour to never trigger connection reset by timeout, `max_keep_alive_requests` is set to 5. # We expect server to close connection after each 5 requests. We detect connection reset by change in src port. # So the first 5 requests should come from the same port, the following 5 requests should come from another port. 
+ + log_comments = [] + for _ in range(10): + rand_id = random.randint(0, 1000000) + log_comment = f"test_requests_with_keep_alive_{rand_id}" + log_comments.append(log_comment) + log_comments = sorted(log_comments) + session = requests.Session() for i in range(10): session.get( - f"http://{node.ip_address}:8123/?query=select%201&log_comment=test_requests_with_keep_alive_{i}" + f"http://{node.ip_address}:8123/?query=select%201&log_comment={log_comments[i]}" ) ports = node.query( - """ + f""" SYSTEM FLUSH LOGS; SELECT port FROM system.query_log - WHERE log_comment like 'test_requests_with_keep_alive_%' AND type = 'QueryFinish' + WHERE log_comment IN ({", ".join(f"'{comment}'" for comment in log_comments)}) AND type = 'QueryFinish' ORDER BY log_comment """ ).split("\n")[:-1] From c6c0a44b93c382b384eb3ef83cf9da5102629de8 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 7 Aug 2024 23:57:19 +0200 Subject: [PATCH 087/142] fix flaky tests --- .../test_storage_azure_blob_storage/test.py | 2 +- tests/integration/test_storage_hdfs/test.py | 51 +++++++------------ 2 files changed, 19 insertions(+), 34 deletions(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 092c124855c..fbdc7f29f98 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -1272,7 +1272,7 @@ def test_filtering_by_file_or_path(cluster): node.query("SYSTEM FLUSH LOGS") result = node.query( - f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query ilike '%select%azure%test_filter%' AND type='QueryFinish'" + f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query ilike '%select%azure%test_filter%' AND type='QueryFinish' ORDER BY event_time_microseconds DESC LIMIT 1" ) assert int(result) == 1 diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 77921b885b0..c52e99b800e 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -41,7 +41,6 @@ def test_read_write_storage(started_cluster): node1.query("insert into SimpleHDFSStorage values (1, 'Mark', 72.53)") assert hdfs_api.read_data("/simple_storage") == "1\tMark\t72.53\n" assert node1.query("select * from SimpleHDFSStorage") == "1\tMark\t72.53\n" - node1.query("drop table if exists SimpleHDFSStorage") def test_read_write_storage_with_globs(started_cluster): @@ -95,11 +94,6 @@ def test_read_write_storage_with_globs(started_cluster): print(ex) assert "in readonly mode" in str(ex) - node1.query("DROP TABLE HDFSStorageWithRange") - node1.query("DROP TABLE HDFSStorageWithEnum") - node1.query("DROP TABLE HDFSStorageWithQuestionMark") - node1.query("DROP TABLE HDFSStorageWithAsterisk") - def test_storage_with_multidirectory_glob(started_cluster): hdfs_api = started_cluster.hdfs_api @@ -341,7 +335,6 @@ def test_virtual_columns(started_cluster): ) == expected ) - node1.query("DROP TABLE virtual_cols") def test_read_files_with_spaces(started_cluster): @@ -363,7 +356,6 @@ def test_read_files_with_spaces(started_cluster): ) assert node1.query("select * from test order by id") == "1\n2\n3\n" fs.delete(dir, recursive=True) - node1.query("DROP TABLE test") def test_truncate_table(started_cluster): @@ -435,7 +427,7 @@ def test_seekable_formats(started_cluster): f"hdfs('hdfs://hdfs1:9000/parquet', 'Parquet', 'a Int32, 
b String')" ) node1.query( - f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000) SETTINGS hdfs_truncate_on_insert=1" + f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)" ) result = node1.query(f"SELECT count() FROM {table_function}") @@ -443,7 +435,7 @@ def test_seekable_formats(started_cluster): table_function = f"hdfs('hdfs://hdfs1:9000/orc', 'ORC', 'a Int32, b String')" node1.query( - f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000) SETTINGS hdfs_truncate_on_insert=1" + f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)" ) result = node1.query(f"SELECT count() FROM {table_function}") assert int(result) == 5000000 @@ -467,7 +459,7 @@ def test_read_table_with_default(started_cluster): def test_schema_inference(started_cluster): node1.query( - f"insert into table function hdfs('hdfs://hdfs1:9000/native', 'Native', 'a Int32, b String') SELECT number, randomString(100) FROM numbers(5000000) SETTINGS hdfs_truncate_on_insert=1" + f"insert into table function hdfs('hdfs://hdfs1:9000/native', 'Native', 'a Int32, b String') SELECT number, randomString(100) FROM numbers(5000000)" ) result = node1.query(f"desc hdfs('hdfs://hdfs1:9000/native', 'Native')") @@ -520,7 +512,6 @@ def test_hdfs_directory_not_exist(started_cluster): assert "" == node1.query( "select * from HDFSStorageWithNotExistDir settings hdfs_ignore_file_doesnt_exist=1" ) - node1.query("DROP TABLE HDFSStorageWithNotExistDir") def test_overwrite(started_cluster): @@ -540,7 +531,6 @@ def test_overwrite(started_cluster): result = node1.query(f"select count() from test_overwrite") assert int(result) == 10 - node1.query(f"DROP TABLE test_overwrite") def test_multiple_inserts(started_cluster): @@ -577,7 +567,6 @@ def test_multiple_inserts(started_cluster): result = node1.query(f"select count() from test_multiple_inserts") assert int(result) == 60 - node1.query(f"DROP TABLE test_multiple_inserts") def test_format_detection(started_cluster): @@ -591,10 +580,10 @@ def test_format_detection(started_cluster): def test_schema_inference_with_globs(started_cluster): node1.query( - f"insert into table function hdfs('hdfs://hdfs1:9000/data1.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL SETTINGS hdfs_truncate_on_insert=1" + f"insert into table function hdfs('hdfs://hdfs1:9000/data1.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL" ) node1.query( - f"insert into table function hdfs('hdfs://hdfs1:9000/data2.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select 0 SETTINGS hdfs_truncate_on_insert=1" + f"insert into table function hdfs('hdfs://hdfs1:9000/data2.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select 0" ) result = node1.query( @@ -608,7 +597,7 @@ def test_schema_inference_with_globs(started_cluster): assert sorted(result.split()) == ["0", "\\N"] node1.query( - f"insert into table function hdfs('hdfs://hdfs1:9000/data3.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL SETTINGS hdfs_truncate_on_insert=1" + f"insert into table function hdfs('hdfs://hdfs1:9000/data3.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL" ) filename = "data{1,3}.jsoncompacteachrow" @@ -620,7 +609,7 @@ def test_schema_inference_with_globs(started_cluster): assert "All attempts to extract table structure from files failed" in 
result node1.query( - f"insert into table function hdfs('hdfs://hdfs1:9000/data0.jsoncompacteachrow', 'TSV', 'x String') select '[123;]' SETTINGS hdfs_truncate_on_insert=1" + f"insert into table function hdfs('hdfs://hdfs1:9000/data0.jsoncompacteachrow', 'TSV', 'x String') select '[123;]'" ) result = node1.query_and_get_error( @@ -632,7 +621,7 @@ def test_schema_inference_with_globs(started_cluster): def test_insert_select_schema_inference(started_cluster): node1.query( - f"insert into table function hdfs('hdfs://hdfs1:9000/test.native.zst') select toUInt64(1) as x SETTINGS hdfs_truncate_on_insert=1" + f"insert into table function hdfs('hdfs://hdfs1:9000/test.native.zst') select toUInt64(1) as x" ) result = node1.query(f"desc hdfs('hdfs://hdfs1:9000/test.native.zst')") @@ -675,9 +664,7 @@ def test_virtual_columns_2(started_cluster): table_function = ( f"hdfs('hdfs://hdfs1:9000/parquet_2', 'Parquet', 'a Int32, b String')" ) - node1.query( - f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1" - ) + node1.query(f"insert into table function {table_function} SELECT 1, 'kek'") result = node1.query(f"SELECT _path FROM {table_function}") assert result.strip() == "parquet_2" @@ -685,9 +672,7 @@ def test_virtual_columns_2(started_cluster): table_function = ( f"hdfs('hdfs://hdfs1:9000/parquet_3', 'Parquet', 'a Int32, _path String')" ) - node1.query( - f"insert into table function {table_function} SELECT 1, 'kek' SETTINGS hdfs_truncate_on_insert=1" - ) + node1.query(f"insert into table function {table_function} SELECT 1, 'kek'") result = node1.query(f"SELECT _path FROM {table_function}") assert result.strip() == "kek" @@ -984,11 +969,11 @@ def test_read_subcolumns(started_cluster): node = started_cluster.instances["node1"] node.query( - f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) SETTINGS hdfs_truncate_on_insert=1" + f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)" ) node.query( - f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) SETTINGS hdfs_truncate_on_insert=1" + f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)" ) res = node.query( @@ -1034,11 +1019,11 @@ def test_union_schema_inference_mode(started_cluster): node = started_cluster.instances["node1"] node.query( - "insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference1.jsonl') select 1 as a SETTINGS hdfs_truncate_on_insert=1" + "insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference1.jsonl') select 1 as a" ) node.query( - "insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference2.jsonl') select 2 as b SETTINGS hdfs_truncate_on_insert=1" + "insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference2.jsonl') select 2 as b" ) node.query("system drop schema cache for hdfs") @@ -1070,7 +1055,7 @@ def test_union_schema_inference_mode(started_cluster): ) assert result == "a\tNullable(Int64)\n" "b\tNullable(Int64)\n" node.query( - f"insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference3.jsonl', TSV) select 'Error' SETTINGS hdfs_truncate_on_insert=1" + f"insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference3.jsonl', TSV) 
select 'Error'" ) error = node.query_and_get_error( @@ -1083,11 +1068,11 @@ def test_format_detection(started_cluster): node = started_cluster.instances["node1"] node.query( - "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection0', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(0) SETTINGS hdfs_truncate_on_insert=1" + "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection0', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(0)" ) node.query( - "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(10) SETTINGS hdfs_truncate_on_insert=1" + "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(10)" ) expected_desc_result = node.query( @@ -1151,7 +1136,7 @@ def test_write_to_globbed_partitioned_path(started_cluster): node = started_cluster.instances["node1"] error = node.query_and_get_error( - "insert into function hdfs('hdfs://hdfs1:9000/test_data_*_{_partition_id}.csv') partition by 42 select 42 SETTINGS hdfs_truncate_on_insert=1" + "insert into function hdfs('hdfs://hdfs1:9000/test_data_*_{_partition_id}.csv') partition by 42 select 42" ) assert "DATABASE_ACCESS_DENIED" in error From c19ee360d1a4cf0bc7607923505ba1e2a3848132 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 8 Aug 2024 00:44:42 +0200 Subject: [PATCH 088/142] Update StorageObjectStorageSource.cpp --- src/Storages/ObjectStorage/StorageObjectStorageSource.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 810bad4788b..d8e26977e75 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -208,7 +208,6 @@ Chunk StorageObjectStorageSource::generate() .filename = &filename, .last_modified = object_info->metadata->last_modified, .etag = &(object_info->metadata->etag) - .last_modified = object_info->metadata->last_modified, }, getContext(), read_from_format_info.columns_description); const auto & partition_columns = configuration->getPartitionColumns(); From dccb6bdd88ef26244ddb1c9de8d1232140036294 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Fri, 9 Aug 2024 18:33:05 +0800 Subject: [PATCH 089/142] fix failed uts --- .../Formats/Impl/ORCBlockOutputFormat.cpp | 47 +++++++------------ .../Formats/Impl/ORCBlockOutputFormat.h | 5 -- 2 files changed, 17 insertions(+), 35 deletions(-) diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index bd89ae0fa86..4a7a23158ff 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -280,20 +280,28 @@ void ORCBlockOutputFormat::writeColumn( DataTypePtr & type, const PaddedPODArray * null_bytemap) { - orc_column.numElements = column.size(); + size_t rows = column.size(); + orc_column.resize(rows); + orc_column.numElements = rows; + + /// Calculate orc_column.hasNulls if (null_bytemap) - { orc_column.hasNulls = !memoryIsZero(null_bytemap->data(), 0, null_bytemap->size()); - if (orc_column.hasNulls) - { - orc_column.notNull.resize(null_bytemap->size()); - for (size_t i = 0; i < null_bytemap->size(); ++i) - 
orc_column.notNull[i] = !(*null_bytemap)[i]; - } - } else orc_column.hasNulls = false; + /// Fill orc_column.notNull + if (orc_column.hasNulls) + { + for (size_t i = 0; i < rows; ++i) + orc_column.notNull[i] = !(*null_bytemap)[i]; + } + else + { + for (size_t i = 0; i < rows; ++i) + orc_column.notNull[i] = 1; + } + /// ORC doesn't have unsigned types, so cast everything to signed and sign-extend to Int64 to /// make the ORC library calculate min and max correctly. switch (type->getTypeId()) @@ -516,27 +524,6 @@ void ORCBlockOutputFormat::writeColumn( } } -size_t ORCBlockOutputFormat::getColumnSize(const IColumn & column, DataTypePtr & type) -{ - if (type->getTypeId() == TypeIndex::Array) - { - auto nested_type = assert_cast(*type).getNestedType(); - const IColumn & nested_column = assert_cast(column).getData(); - return std::max(column.size(), getColumnSize(nested_column, nested_type)); - } - - return column.size(); -} - -size_t ORCBlockOutputFormat::getMaxColumnSize(Chunk & chunk) -{ - size_t columns_num = chunk.getNumColumns(); - size_t max_column_size = 0; - for (size_t i = 0; i != columns_num; ++i) - max_column_size = std::max(max_column_size, getColumnSize(*chunk.getColumns()[i], data_types[i])); - return max_column_size; -} - void ORCBlockOutputFormat::consume(Chunk chunk) { if (!writer) diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.h b/src/Processors/Formats/Impl/ORCBlockOutputFormat.h index 28837193d1a..06ecac9b820 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.h @@ -69,11 +69,6 @@ private: void writeColumn(orc::ColumnVectorBatch & orc_column, const IColumn & column, DataTypePtr & type, const PaddedPODArray * null_bytemap); - /// These two functions are needed to know maximum nested size of arrays to - /// create an ORC Batch with the appropriate size - size_t getColumnSize(const IColumn & column, DataTypePtr & type); - size_t getMaxColumnSize(Chunk & chunk); - void prepareWriter(); const FormatSettings format_settings; From 21be195dd8f841c3cfaf453bb7faba02627b9c0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Fri, 9 Aug 2024 13:14:09 +0000 Subject: [PATCH 090/142] Revert unnecessary change --- .../clickhouse_path/format_schemas/test.capnp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/test.capnp b/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/test.capnp index 247e7b9ceca..44f1961205b 100644 --- a/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/test.capnp +++ b/tests/integration/test_storage_kafka/clickhouse_path/format_schemas/test.capnp @@ -7,4 +7,4 @@ struct TestRecordStruct val1 @2 : Text; val2 @3 : Float32; val3 @4 : UInt8; -} +} \ No newline at end of file From b757522fc4ac545451acc398ab230323fb7c0fd3 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 9 Aug 2024 14:20:57 +0100 Subject: [PATCH 091/142] fix build --- programs/keeper/Keeper.cpp | 2 +- src/Server/HTTPHandlerFactory.cpp | 41 ++----------------------- src/Server/PrometheusRequestHandler.cpp | 4 +-- src/Server/PrometheusRequestHandler.h | 7 +++-- 4 files changed, 10 insertions(+), 44 deletions(-) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index ae51a62ff9c..a447a9e50f6 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -515,7 +515,7 @@ try "Prometheus: http://" + address.toString(), std::make_unique( 
std::move(my_http_context), - createKeeperPrometheusHandlerFactory(config_getter(), async_metrics, "PrometheusHandler-factory"), + createKeeperPrometheusHandlerFactory(*this, config_getter(), async_metrics, "PrometheusHandler-factory"), server_pool, socket, http_params)); diff --git a/src/Server/HTTPHandlerFactory.cpp b/src/Server/HTTPHandlerFactory.cpp index 0ee45783d52..fc31ad2874e 100644 --- a/src/Server/HTTPHandlerFactory.cpp +++ b/src/Server/HTTPHandlerFactory.cpp @@ -122,7 +122,8 @@ static inline auto createHandlersFactoryFromConfig( } else if (handler_type == "prometheus") { - main_handler_factory->addHandler(createPrometheusHandlerFactoryForHTTPRule(config, prefix + "." + key, async_metrics)); + main_handler_factory->addHandler( + createPrometheusHandlerFactoryForHTTPRule(server, config, prefix + "." + key, async_metrics)); } else if (handler_type == "replicas_status") { @@ -199,19 +200,7 @@ HTTPRequestHandlerFactoryPtr createHandlerFactory(IServer & server, const Poco:: else if (name == "InterserverIOHTTPHandler-factory" || name == "InterserverIOHTTPSHandler-factory") return createInterserverHTTPHandlerFactory(server, name); else if (name == "PrometheusHandler-factory") -<<<<<<< HEAD - { - auto metrics_writer = std::make_shared(config, "prometheus", async_metrics); - return createPrometheusMainHandlerFactory(config, metrics_writer, name); - } -||||||| 02b8d563e3a - { - auto metrics_writer = std::make_shared(config, "prometheus", async_metrics); - return createPrometheusMainHandlerFactory(server, config, metrics_writer, name); - } -======= return createPrometheusHandlerFactory(server, config, async_metrics, name); ->>>>>>> master throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown HTTP handler factory name."); } @@ -298,34 +287,8 @@ void addDefaultHandlersFactory( ); factory.addHandler(query_handler); -<<<<<<< HEAD - /// We check that prometheus handler will be served on current (default) port. - /// Otherwise it will be created separately, see createHandlerFactory(...). - if (config.has("prometheus") && config.getInt("prometheus.port", 0) == 0) - { - auto writer = std::make_shared(config, "prometheus", async_metrics); - auto creator - = [writer]() -> std::unique_ptr { return std::make_unique(writer); }; - auto prometheus_handler = std::make_shared>(std::move(creator)); - prometheus_handler->attachStrictPath(config.getString("prometheus.endpoint", "/metrics")); - prometheus_handler->allowGetAndHeadRequest(); -||||||| 02b8d563e3a - /// We check that prometheus handler will be served on current (default) port. - /// Otherwise it will be created separately, see createHandlerFactory(...). - if (config.has("prometheus") && config.getInt("prometheus.port", 0) == 0) - { - auto writer = std::make_shared(config, "prometheus", async_metrics); - auto creator = [&server, writer] () -> std::unique_ptr - { - return std::make_unique(server, writer); - }; - auto prometheus_handler = std::make_shared>(std::move(creator)); - prometheus_handler->attachStrictPath(config.getString("prometheus.endpoint", "/metrics")); - prometheus_handler->allowGetAndHeadRequest(); -======= /// createPrometheusHandlerFactoryForHTTPRuleDefaults() can return nullptr if prometheus protocols must not be served on http port. 
if (auto prometheus_handler = createPrometheusHandlerFactoryForHTTPRuleDefaults(server, config, async_metrics)) ->>>>>>> master factory.addHandler(prometheus_handler); } diff --git a/src/Server/PrometheusRequestHandler.cpp b/src/Server/PrometheusRequestHandler.cpp index 52cda92d9b4..ae1fb6d629e 100644 --- a/src/Server/PrometheusRequestHandler.cpp +++ b/src/Server/PrometheusRequestHandler.cpp @@ -95,7 +95,7 @@ public: class PrometheusRequestHandler::ImplWithContext : public Impl { public: - explicit ImplWithContext(PrometheusRequestHandler & parent) : Impl(parent), default_settings(parent.server.context()->getSettingsRef()) { } + explicit ImplWithContext(PrometheusRequestHandler & parent) : Impl(parent), default_settings(server().context()->getSettingsRef()) { } virtual void handlingRequestWithContext(HTTPServerRequest & request, HTTPServerResponse & response) = 0; @@ -353,7 +353,7 @@ void PrometheusRequestHandler::handleRequest(HTTPServerRequest & request, HTTPSe if (request.getVersion() == HTTPServerRequest::HTTP_1_1) response.setChunkedTransferEncoding(true); - setResponseDefaultHeaders(response); + setResponseDefaultHeaders(response); impl->beforeHandlingRequest(request); impl->handleRequest(request, response); diff --git a/src/Server/PrometheusRequestHandler.h b/src/Server/PrometheusRequestHandler.h index 7aeed11d6b8..281ecf5260e 100644 --- a/src/Server/PrometheusRequestHandler.h +++ b/src/Server/PrometheusRequestHandler.h @@ -15,8 +15,11 @@ class WriteBufferFromHTTPServerResponse; class PrometheusRequestHandler : public HTTPRequestHandler { public: - PrometheusRequestHandler(const PrometheusRequestHandlerConfig & config_, - const AsynchronousMetrics & async_metrics_, std::shared_ptr metrics_writer_); + PrometheusRequestHandler( + IServer & server_, + const PrometheusRequestHandlerConfig & config_, + const AsynchronousMetrics & async_metrics_, + std::shared_ptr metrics_writer_); ~PrometheusRequestHandler() override; void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response, const ProfileEvents::Event & write_event_) override; From 43a38fb5f0563f50d38cf5d988db3b181b64f606 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 9 Aug 2024 15:11:08 +0100 Subject: [PATCH 092/142] rm redundant file --- programs/server/config.d/listen.xml | 3 --- 1 file changed, 3 deletions(-) delete mode 100644 programs/server/config.d/listen.xml diff --git a/programs/server/config.d/listen.xml b/programs/server/config.d/listen.xml deleted file mode 100644 index f94e5c88568..00000000000 --- a/programs/server/config.d/listen.xml +++ /dev/null @@ -1,3 +0,0 @@ - - :: - From 556f66987897bd5065426e187bef4e0cba2a975c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Sat, 10 Aug 2024 21:52:55 +0000 Subject: [PATCH 093/142] Separate test into separate file to pass bugfix check --- tests/integration/parallel_skip.json | 4 + tests/integration/test_storage_kafka/test.py | 135 ---------- .../test_produce_http_interface.py | 243 ++++++++++++++++++ 3 files changed, 247 insertions(+), 135 deletions(-) create mode 100644 tests/integration/test_storage_kafka/test_produce_http_interface.py diff --git a/tests/integration/parallel_skip.json b/tests/integration/parallel_skip.json index 99fa626bd1e..fca2126d824 100644 --- a/tests/integration/parallel_skip.json +++ b/tests/integration/parallel_skip.json @@ -162,9 +162,13 @@ "test_storage_kafka/test.py::test_system_kafka_consumers_rebalance_mv", "test_storage_kafka/test.py::test_formats_errors", 
"test_storage_kafka/test.py::test_multiple_read_in_materialized_views", + "test_storage_kafka/test.py::test_kafka_null_message", + + "test_storage_kafka/test_produce_http_interface.py::test_kafka_produce_http_interface_row_based_format", "test_storage_kerberized_kafka/test.py::test_kafka_json_as_string", "test_storage_kerberized_kafka/test.py::test_kafka_json_as_string_request_new_ticket_after_expiration", "test_storage_kerberized_kafka/test.py::test_kafka_json_as_string_no_kdc", "test_storage_kerberized_kafka/test.py::test_kafka_config_from_sql_named_collection" + ] diff --git a/tests/integration/test_storage_kafka/test.py b/tests/integration/test_storage_kafka/test.py index 9b2f465c1b6..4b6c9922d74 100644 --- a/tests/integration/test_storage_kafka/test.py +++ b/tests/integration/test_storage_kafka/test.py @@ -5524,141 +5524,6 @@ def test_kafka_null_message(kafka_cluster, create_query_generator): ) -def test_kafka_produce_http_interface_row_based_format(kafka_cluster): - # reproduction of #61060 with validating the written messages - admin_client = KafkaAdminClient( - bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port) - ) - - topic_prefix = "http_row_" - - # It is important to have: - # - long enough messages - # - enough messages - # I don't know the exact requirement for message sizes, but it doesn't reproduce with short messages - # For the number of messages it seems like at least 3 messages is necessary - expected_key = "01234567890123456789" - expected_value = "aaaaabbbbbccccc" - - insert_query_end = f"(key, value) VALUES ('{expected_key}', '{expected_value}'), ('{expected_key}', '{expected_value}'), ('{expected_key}', '{expected_value}')" - insert_query_template = "INSERT INTO {table_name} " + insert_query_end - - extra_settings = { - "Protobuf": ", kafka_schema = 'string_key_value.proto:StringKeyValuePair'", - "CapnProto": ", kafka_schema='string_key_value:StringKeyValuePair'", - "Template": ", format_template_row='string_key_value.format'", - } - - # Only the formats that can be used both and input and output format are tested - # Reasons to exclude following formats: - # - JSONStrings: not actually an input format - # - ProtobufSingle: I cannot make it work to parse the messages. Probably something is broken, - # because the producer can write multiple rows into a same message, which makes them impossible to parse properly. Should added after #67549 is fixed. 
- # - ProtobufList: I didn't want to deal with the envelope and stuff - # - Npy: supports only single column - # - LineAsString: supports only single column - # - RawBLOB: supports only single column - formats_to_test = [ - "TabSeparated", - "TabSeparatedRaw", - "TabSeparatedWithNames", - "TabSeparatedWithNamesAndTypes", - "TabSeparatedRawWithNames", - "TabSeparatedRawWithNamesAndTypes", - "Template", - "CSV", - "CSVWithNames", - "CSVWithNamesAndTypes", - "CustomSeparated", - "CustomSeparatedWithNames", - "CustomSeparatedWithNamesAndTypes", - "Values", - "JSON", - "JSONColumns", - "JSONColumnsWithMetadata", - "JSONCompact", - "JSONCompactColumns", - "JSONEachRow", - "JSONStringsEachRow", - "JSONCompactEachRow", - "JSONCompactEachRowWithNames", - "JSONCompactEachRowWithNamesAndTypes", - "JSONCompactStringsEachRow", - "JSONCompactStringsEachRowWithNames", - "JSONCompactStringsEachRowWithNamesAndTypes", - "JSONObjectEachRow", - "BSONEachRow", - "TSKV", - "Protobuf", - "Avro", - "Parquet", - "Arrow", - "ArrowStream", - "ORC", - "RowBinary", - "RowBinaryWithNames", - "RowBinaryWithNamesAndTypes", - "Native", - "CapnProto", - "MsgPack", - ] - for format in formats_to_test: - logging.debug(f"Creating tables and writing messages to {format}") - topic = topic_prefix + format - kafka_create_topic(admin_client, topic) - - extra_setting = extra_settings.get(format, "") - - # kafka_max_rows_per_message is set to 2 to make sure every format produces at least 2 messages, thus increasing the chance of catching a bug - instance.query( - f""" - DROP TABLE IF EXISTS test.view_{topic}; - DROP TABLE IF EXISTS test.consumer_{topic}; - CREATE TABLE test.kafka_writer_{topic} (key String, value String) - ENGINE = Kafka - SETTINGS kafka_broker_list = 'kafka1:19092', - kafka_topic_list = '{topic}', - kafka_group_name = '{topic}', - kafka_format = '{format}', - kafka_max_rows_per_message = 2 {extra_setting}; - - CREATE TABLE test.kafka_{topic} (key String, value String) - ENGINE = Kafka - SETTINGS kafka_broker_list = 'kafka1:19092', - kafka_topic_list = '{topic}', - kafka_group_name = '{topic}', - kafka_format = '{format}' {extra_setting}; - - CREATE MATERIALIZED VIEW test.view_{topic} Engine=Log AS - SELECT key, value FROM test.kafka_{topic}; - """ - ) - - instance.http_query( - insert_query_template.format(table_name="test.kafka_writer_" + topic), - method="POST", - ) - - expected = f"""\ -{expected_key}\t{expected_value} -{expected_key}\t{expected_value} -{expected_key}\t{expected_value} -""" - # give some times for the readers to read the messages - for format in formats_to_test: - logging.debug(f"Checking result for {format}") - topic = topic_prefix + format - - result = instance.query_with_retry( - f"SELECT * FROM test.view_{topic}", - check_callback=lambda res: res.count("\n") == 3, - ) - - assert TSV(result) == TSV(expected) - - kafka_delete_topic(admin_client, topic) - - if __name__ == "__main__": cluster.start() input("Cluster created, press any key to destroy...") diff --git a/tests/integration/test_storage_kafka/test_produce_http_interface.py b/tests/integration/test_storage_kafka/test_produce_http_interface.py new file mode 100644 index 00000000000..fc10a07f239 --- /dev/null +++ b/tests/integration/test_storage_kafka/test_produce_http_interface.py @@ -0,0 +1,243 @@ +import time +import logging + +import pytest +from helpers.cluster import ClickHouseCluster, is_arm +from helpers.test_tools import TSV +from kafka import KafkaAdminClient +from kafka.admin import NewTopic + +if is_arm(): + pytestmark = 
pytest.mark.skip + +cluster = ClickHouseCluster(__file__) +instance = cluster.add_instance( + "instance", + main_configs=["configs/kafka.xml", "configs/named_collection.xml"], + user_configs=["configs/users.xml"], + with_kafka=True, + with_zookeeper=True, # For Replicated Table + macros={ + "kafka_broker": "kafka1", + "kafka_topic_old": "old", + "kafka_group_name_old": "old", + "kafka_topic_new": "new", + "kafka_group_name_new": "new", + "kafka_client_id": "instance", + "kafka_format_json_each_row": "JSONEachRow", + }, + clickhouse_path_dir="clickhouse_path", +) + + +@pytest.fixture(scope="module") +def kafka_cluster(): + try: + cluster.start() + kafka_id = instance.cluster.kafka_docker_id + print(("kafka_id is {}".format(kafka_id))) + yield cluster + finally: + cluster.shutdown() + + +@pytest.fixture(autouse=True) +def kafka_setup_teardown(): + instance.query("DROP DATABASE IF EXISTS test; CREATE DATABASE test;") + # logging.debug("kafka is available - running test") + yield # run test + + +def kafka_create_topic( + admin_client, + topic_name, + num_partitions=1, + replication_factor=1, + max_retries=50, + config=None, +): + logging.debug( + f"Kafka create topic={topic_name}, num_partitions={num_partitions}, replication_factor={replication_factor}" + ) + topics_list = [ + NewTopic( + name=topic_name, + num_partitions=num_partitions, + replication_factor=replication_factor, + topic_configs=config, + ) + ] + retries = 0 + while True: + try: + admin_client.create_topics(new_topics=topics_list, validate_only=False) + logging.debug("Admin client succeed") + return + except Exception as e: + retries += 1 + time.sleep(0.5) + if retries < max_retries: + logging.warning(f"Failed to create topic {e}") + else: + raise + + +def kafka_delete_topic(admin_client, topic, max_retries=50): + result = admin_client.delete_topics([topic]) + for topic, e in result.topic_error_codes: + if e == 0: + logging.debug(f"Topic {topic} deleted") + else: + logging.error(f"Failed to delete topic {topic}: {e}") + + retries = 0 + while True: + topics_listed = admin_client.list_topics() + logging.debug(f"TOPICS LISTED: {topics_listed}") + if topic not in topics_listed: + return + else: + retries += 1 + time.sleep(0.5) + if retries > max_retries: + raise Exception(f"Failed to delete topics {topic}, {result}") + + +def test_kafka_produce_http_interface_row_based_format(kafka_cluster): + # reproduction of #61060 with validating the written messages + admin_client = KafkaAdminClient( + bootstrap_servers="localhost:{}".format(kafka_cluster.kafka_port) + ) + + topic_prefix = "http_row_" + + # It is important to have: + # - long enough messages + # - enough messages + # I don't know the exact requirement for message sizes, but it doesn't reproduce with short messages + # For the number of messages it seems like at least 3 messages is necessary + expected_key = "01234567890123456789" + expected_value = "aaaaabbbbbccccc" + + insert_query_end = f"(key, value) VALUES ('{expected_key}', '{expected_value}'), ('{expected_key}', '{expected_value}'), ('{expected_key}', '{expected_value}')" + insert_query_template = "INSERT INTO {table_name} " + insert_query_end + + extra_settings = { + "Protobuf": ", kafka_schema = 'string_key_value.proto:StringKeyValuePair'", + "CapnProto": ", kafka_schema='string_key_value:StringKeyValuePair'", + "Template": ", format_template_row='string_key_value.format'", + } + + # Only the formats that can be used both and input and output format are tested + # Reasons to exclude following formats: + # - JSONStrings: 
not actually an input format + # - ProtobufSingle: I cannot make it work to parse the messages. Probably something is broken, + # because the producer can write multiple rows into a same message, which makes them impossible to parse properly. Should added after #67549 is fixed. + # - ProtobufList: I didn't want to deal with the envelope and stuff + # - Npy: supports only single column + # - LineAsString: supports only single column + # - RawBLOB: supports only single column + formats_to_test = [ + "TabSeparated", + "TabSeparatedRaw", + "TabSeparatedWithNames", + "TabSeparatedWithNamesAndTypes", + "TabSeparatedRawWithNames", + "TabSeparatedRawWithNamesAndTypes", + "Template", + "CSV", + "CSVWithNames", + "CSVWithNamesAndTypes", + "CustomSeparated", + "CustomSeparatedWithNames", + "CustomSeparatedWithNamesAndTypes", + "Values", + "JSON", + "JSONColumns", + "JSONColumnsWithMetadata", + "JSONCompact", + "JSONCompactColumns", + "JSONEachRow", + "JSONStringsEachRow", + "JSONCompactEachRow", + "JSONCompactEachRowWithNames", + "JSONCompactEachRowWithNamesAndTypes", + "JSONCompactStringsEachRow", + "JSONCompactStringsEachRowWithNames", + "JSONCompactStringsEachRowWithNamesAndTypes", + "JSONObjectEachRow", + "BSONEachRow", + "TSKV", + "Protobuf", + "Avro", + "Parquet", + "Arrow", + "ArrowStream", + "ORC", + "RowBinary", + "RowBinaryWithNames", + "RowBinaryWithNamesAndTypes", + "Native", + "CapnProto", + "MsgPack", + ] + for format in formats_to_test: + logging.debug(f"Creating tables and writing messages to {format}") + topic = topic_prefix + format + kafka_create_topic(admin_client, topic) + + extra_setting = extra_settings.get(format, "") + + # kafka_max_rows_per_message is set to 2 to make sure every format produces at least 2 messages, thus increasing the chance of catching a bug + instance.query( + f""" + DROP TABLE IF EXISTS test.view_{topic}; + DROP TABLE IF EXISTS test.consumer_{topic}; + CREATE TABLE test.kafka_writer_{topic} (key String, value String) + ENGINE = Kafka + SETTINGS kafka_broker_list = 'kafka1:19092', + kafka_topic_list = '{topic}', + kafka_group_name = '{topic}', + kafka_format = '{format}', + kafka_max_rows_per_message = 2 {extra_setting}; + + CREATE TABLE test.kafka_{topic} (key String, value String) + ENGINE = Kafka + SETTINGS kafka_broker_list = 'kafka1:19092', + kafka_topic_list = '{topic}', + kafka_group_name = '{topic}', + kafka_format = '{format}' {extra_setting}; + + CREATE MATERIALIZED VIEW test.view_{topic} Engine=Log AS + SELECT key, value FROM test.kafka_{topic}; + """ + ) + instance.http_query( + insert_query_template.format(table_name="test.kafka_writer_" + topic), + method="POST", + ) + + expected = f"""\ +{expected_key}\t{expected_value} +{expected_key}\t{expected_value} +{expected_key}\t{expected_value} +""" + # give some times for the readers to read the messages + for format in formats_to_test: + logging.debug(f"Checking result for {format}") + topic = topic_prefix + format + + result = instance.query_with_retry( + f"SELECT * FROM test.view_{topic}", + check_callback=lambda res: res.count("\n") == 3, + ) + + assert TSV(result) == TSV(expected) + + kafka_delete_topic(admin_client, topic) + + +if __name__ == "__main__": + cluster.start() + input("Cluster created, press any key to destroy...") + cluster.shutdown() From 8a48b3334433fe5e77c23ff6df10e454db2d3f82 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 11 Aug 2024 21:27:08 +0200 Subject: [PATCH 094/142] Fix settings/current_database in system.processes for async BACKUP/RESTORE Signed-off-by: Azat 
Khuzhin --- src/Backups/BackupsWorker.cpp | 4 ++++ src/Interpreters/ProcessList.h | 3 +++ 2 files changed, 7 insertions(+) diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp index 0b93ae6d547..8b45c816817 100644 --- a/src/Backups/BackupsWorker.cpp +++ b/src/Backups/BackupsWorker.cpp @@ -490,6 +490,8 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context /// process_list_element_holder is used to make an element in ProcessList live while BACKUP is working asynchronously. auto process_list_element = context_in_use->getProcessListElement(); + /// Update context to preserve query information in processlist (settings, current_database) + process_list_element->updateContext(context_in_use); thread_pool.scheduleOrThrowOnError( [this, @@ -853,6 +855,8 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt /// process_list_element_holder is used to make an element in ProcessList live while RESTORE is working asynchronously. auto process_list_element = context_in_use->getProcessListElement(); + /// Update context to preserve query information in processlist (settings, current_database) + process_list_element->updateContext(context_in_use); thread_pool.scheduleOrThrowOnError( [this, diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index accb73e12df..248ba947bc1 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -244,6 +244,9 @@ public: /// Same as checkTimeLimit but it never throws [[nodiscard]] bool checkTimeLimitSoft(); + /// Use this in case the query is left in the background to execute asynchronously + void updateContext(ContextWeakPtr weak_context) { context = std::move(weak_context); } + /// Get the reference for the start of the query. Used to synchronize with other Stopwatches UInt64 getQueryCPUStartTime() { return watch.getStart(); } }; From 957a0b6ea4c3e262a5c1fa664d81ab31d7e0d757 Mon Sep 17 00:00:00 2001 From: sakulali Date: Sun, 11 Aug 2024 00:12:36 +0800 Subject: [PATCH 095/142] Add a setting query_cache_tag --- docs/en/operations/query-cache.md | 10 ++++++ docs/en/operations/settings/settings.md | 11 +++++++ .../operations/system-tables/query_cache.md | 2 ++ src/Core/Settings.h | 1 + src/Core/SettingsChangesHistory.cpp | 1 + src/Interpreters/Cache/QueryCache.cpp | 16 +++++++--- src/Interpreters/Cache/QueryCache.h | 13 ++++++-- src/Interpreters/executeQuery.cpp | 5 +-- .../System/StorageSystemQueryCache.cpp | 5 ++- .../02494_query_cache_tag.reference | 14 ++++++++ .../0_stateless/02494_query_cache_tag.sql | 32 +++++++++++++++++++ 11 files changed, 100 insertions(+), 10 deletions(-) create mode 100644 tests/queries/0_stateless/02494_query_cache_tag.reference create mode 100644 tests/queries/0_stateless/02494_query_cache_tag.sql diff --git a/docs/en/operations/query-cache.md b/docs/en/operations/query-cache.md index 7a920671fc2..a6c4d74f4ac 100644 --- a/docs/en/operations/query-cache.md +++ b/docs/en/operations/query-cache.md @@ -143,6 +143,16 @@ value can be specified at session, profile or query level using setting [query_c Entries in the query cache are compressed by default. This reduces the overall memory consumption at the cost of slower writes into / reads from the query cache. To disable compression, use setting [query_cache_compress_entries](settings/settings.md#query-cache-compress-entries). +Entries in the query cache can be separated by tag, using the setting [query_cache_tag](settings/settings.md#query-cache-tag). 
Queries with different tags are considered different entries. For example, the results of the queries + +``` sql +SELECT 1 SETTINGS use_query_cache = true; +SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'one'; +SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'one diff'; +``` + +have different entries in the query cache. The specified tag can be found in the system table [system.query_cache](system-tables/query_cache.md). + ClickHouse reads table data in blocks of [max_block_size](settings/settings.md#setting-max_block_size) rows. Due to filtering, aggregation, etc., result blocks are typically much smaller than 'max_block_size' but there are also cases where they are much bigger. Setting [query_cache_squash_partial_results](settings/settings.md#query-cache-squash-partial-results) (enabled by default) controls if result blocks diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index e432f4e038f..7b855665efb 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1800,6 +1800,17 @@ Possible values: Default value: `0`. +## query_cache_tag {#query-cache-tag} + +An arbitrary string to separate entries in the [query cache](../query-cache.md). +Queries with different values of this setting are considered different. + +Possible values: + +- string: name of query cache tag + +Default value: `''`. + ## query_cache_max_size_in_bytes {#query-cache-max-size-in-bytes} The maximum amount of memory (in bytes) the current user may allocate in the [query cache](../query-cache.md). 0 means unlimited. diff --git a/docs/en/operations/system-tables/query_cache.md b/docs/en/operations/system-tables/query_cache.md index a9f86f5fc2b..393b37d3616 100644 --- a/docs/en/operations/system-tables/query_cache.md +++ b/docs/en/operations/system-tables/query_cache.md @@ -14,6 +14,7 @@ Columns: - `compressed` ([UInt8](../../sql-reference/data-types/int-uint.md)) — If the query cache entry is compressed. - `expires_at` ([DateTime](../../sql-reference/data-types/datetime.md)) — When the query cache entry becomes stale. - `key_hash` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — A hash of the query string, used as a key to find query cache entries. +- `tag` ([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md)) — An arbitrary string to separate entries in the query cache. **Example** @@ -31,6 +32,7 @@ shared: 0 compressed: 1 expires_at: 2023-10-13 13:35:45 key_hash: 12188185624808016954 +tag: 1 row in set. Elapsed: 0.004 sec. ``` diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 4559cc67b35..ed58f8041d0 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -676,6 +676,7 @@ class IColumn; M(Bool, query_cache_squash_partial_results, true, "Squash partial result blocks to blocks of size 'max_block_size'. Reduces performance of inserts into the query cache but improves the compressability of cache entries.", 0) \ M(Seconds, query_cache_ttl, 60, "After this time in seconds entries in the query cache become stale", 0) \ M(Bool, query_cache_share_between_users, false, "Allow other users to read entry in the query cache", 0) \ + M(String, query_cache_tag, "", "An arbitrary string to separate entries in the query cache. Queries with different values of this setting are considered different.", 0) \ M(Bool, enable_sharing_sets_for_mutations, true, "Allow sharing set objects build for IN subqueries between different tasks of the same mutation. 
This reduces memory usage and CPU consumption", 0) \ \ M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \ diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index c6392044f72..49a325b07b1 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -84,6 +84,7 @@ static std::initializer_list user_id_, const std::vector & current_user_roles_, bool is_shared_, std::chrono::time_point expires_at_, - bool is_compressed_) - : ast_hash(calculateAstHash(ast_, current_database, settings)) + bool is_compressed_, + const String & tag_) + : ast_hash(calculateAstHash(ast_, current_database, settings, tag_)) , header(header_) , user_id(user_id_) , current_user_roles(current_user_roles_) @@ -242,11 +247,12 @@ QueryCache::Key::Key( , expires_at(expires_at_) , is_compressed(is_compressed_) , query_string(queryStringFromAST(ast_)) + , tag(tag_) { } -QueryCache::Key::Key(ASTPtr ast_, const String & current_database, const Settings & settings, std::optional user_id_, const std::vector & current_user_roles_) - : QueryCache::Key(ast_, current_database, settings, {}, user_id_, current_user_roles_, false, std::chrono::system_clock::from_time_t(1), false) /// dummy values for everything != AST, current database, user name/roles +QueryCache::Key::Key(ASTPtr ast_, const String & current_database, const Settings & settings, std::optional user_id_, const std::vector & current_user_roles_, const String & tag_) + : QueryCache::Key(ast_, current_database, settings, {}, user_id_, current_user_roles_, false, std::chrono::system_clock::from_time_t(1), false, tag_) /// dummy values for everything != AST, current database, user name/roles { } diff --git a/src/Interpreters/Cache/QueryCache.h b/src/Interpreters/Cache/QueryCache.h index 461197cac32..54de5edb145 100644 --- a/src/Interpreters/Cache/QueryCache.h +++ b/src/Interpreters/Cache/QueryCache.h @@ -88,6 +88,10 @@ public: /// SYSTEM.QUERY_CACHE. const String query_string; + /// An arbitrary string to separate entries in the query cache. + /// Queries with different values of this setting are considered different. + const String tag; + /// Ctor to construct a Key for writing into query cache. Key(ASTPtr ast_, const String & current_database, @@ -96,10 +100,15 @@ public: std::optional user_id_, const std::vector & current_user_roles_, bool is_shared_, std::chrono::time_point expires_at_, - bool is_compressed); + bool is_compressed, + const String & tag_); /// Ctor to construct a Key for reading from query cache (this operation only needs the AST + user name). 
- Key(ASTPtr ast_, const String & current_database, const Settings & settings, std::optional user_id_, const std::vector & current_user_roles_); + Key(ASTPtr ast_, + const String & current_database, + const Settings & settings, + std::optional user_id_, const std::vector & current_user_roles_, + const String & tag_); bool operator==(const Key & other) const; }; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index fe87eed5570..6422d3128fa 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -1129,7 +1129,7 @@ static std::tuple executeQueryImpl( { if (can_use_query_cache && settings.enable_reads_from_query_cache) { - QueryCache::Key key(ast, context->getCurrentDatabase(), *settings_copy, context->getUserID(), context->getCurrentRoles()); + QueryCache::Key key(ast, context->getCurrentDatabase(), *settings_copy, context->getUserID(), context->getCurrentRoles(), settings.query_cache_tag); QueryCache::Reader reader = query_cache->createReader(key); if (reader.hasCacheEntryForKey()) { @@ -1258,7 +1258,8 @@ static std::tuple executeQueryImpl( context->getUserID(), context->getCurrentRoles(), settings.query_cache_share_between_users, std::chrono::system_clock::now() + std::chrono::seconds(settings.query_cache_ttl), - settings.query_cache_compress_entries); + settings.query_cache_compress_entries, + settings.query_cache_tag); const size_t num_query_runs = settings.query_cache_min_query_runs ? query_cache->recordQueryRun(key) : 1; /// try to avoid locking a mutex in recordQueryRun() if (num_query_runs <= settings.query_cache_min_query_runs) diff --git a/src/Storages/System/StorageSystemQueryCache.cpp b/src/Storages/System/StorageSystemQueryCache.cpp index 4c54d4ae16f..f81d50e8806 100644 --- a/src/Storages/System/StorageSystemQueryCache.cpp +++ b/src/Storages/System/StorageSystemQueryCache.cpp @@ -1,6 +1,7 @@ #include "StorageSystemQueryCache.h" #include #include +#include #include #include #include @@ -19,7 +20,8 @@ ColumnsDescription StorageSystemQueryCache::getColumnsDescription() {"shared", std::make_shared(), "If the query cache entry is shared between multiple users."}, {"compressed", std::make_shared(), "If the query cache entry is compressed."}, {"expires_at", std::make_shared(), "When the query cache entry becomes stale."}, - {"key_hash", std::make_shared(), "A hash of the query string, used as a key to find query cache entries."} + {"key_hash", std::make_shared(), "A hash of the query string, used as a key to find query cache entries."}, + {"tag", std::make_shared(std::make_shared()), "An arbitrary string to separate entries in the query cache."} }; } @@ -56,6 +58,7 @@ void StorageSystemQueryCache::fillData(MutableColumns & res_columns, ContextPtr res_columns[4]->insert(key.is_compressed); res_columns[5]->insert(std::chrono::system_clock::to_time_t(key.expires_at)); res_columns[6]->insert(key.ast_hash.low64); /// query cache considers aliases (issue #56258) + res_columns[7]->insert(key.tag); } } diff --git a/tests/queries/0_stateless/02494_query_cache_tag.reference b/tests/queries/0_stateless/02494_query_cache_tag.reference new file mode 100644 index 00000000000..055d3d4c5bb --- /dev/null +++ b/tests/queries/0_stateless/02494_query_cache_tag.reference @@ -0,0 +1,14 @@ +1 +1 +--- +1 +1 +1 +2 +--- +1 +1 +1 +2 +1 +3 diff --git a/tests/queries/0_stateless/02494_query_cache_tag.sql b/tests/queries/0_stateless/02494_query_cache_tag.sql new file mode 100644 index 00000000000..054607058e8 --- /dev/null +++ 
b/tests/queries/0_stateless/02494_query_cache_tag.sql @@ -0,0 +1,32 @@ +-- Tags: no-parallel +-- Tag no-parallel: Messes with internal cache + +SYSTEM DROP QUERY CACHE; + +-- Cache the query after the query invocation +SELECT 1 SETTINGS use_query_cache = true; +SELECT COUNT(*) FROM system.query_cache; + +SELECT '---'; + +SYSTEM DROP QUERY CACHE; + +-- Queries with tag value of this setting or not are considered different cache entries. +SELECT 1 SETTINGS use_query_cache = true; +SELECT COUNT(*) FROM system.query_cache; +SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'one'; +SELECT COUNT(*) FROM system.query_cache; + +SELECT '---'; + +SYSTEM DROP QUERY CACHE; + +-- Queries with different tags values of this setting are considered different cache entries. +SELECT 1 SETTINGS use_query_cache = true; +SELECT COUNT(*) FROM system.query_cache; +SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'one'; +SELECT COUNT(*) FROM system.query_cache; +SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'one diff'; +SELECT COUNT(*) FROM system.query_cache; + +SYSTEM DROP QUERY CACHE; From 5acf9f6f8160ee6de2845b13bed07d63832ca3fa Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 12 Aug 2024 07:01:52 +0200 Subject: [PATCH 096/142] Fix `test_cluster_all_replicas` --- tests/integration/test_cluster_all_replicas/test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_cluster_all_replicas/test.py b/tests/integration/test_cluster_all_replicas/test.py index d8bad180e1b..9797db7c498 100644 --- a/tests/integration/test_cluster_all_replicas/test.py +++ b/tests/integration/test_cluster_all_replicas/test.py @@ -21,14 +21,14 @@ def start_cluster(): def test_cluster(start_cluster): assert ( node1.query( - "SELECT hostName() FROM clusterAllReplicas('one_shard_two_nodes', system.one)" + "SELECT hostName() FROM clusterAllReplicas('one_shard_two_nodes', system.one) ORDER BY ALL" ) == "node1\nnode2\n" ) assert set( node1.query( - """SELECT hostName(), * FROM clusterAllReplicas("one_shard_two_nodes", system.one) ORDER BY dummy""" + """SELECT hostName(), * FROM clusterAllReplicas("one_shard_two_nodes", system.one) ORDER BY ALL""" ).splitlines() ) == {"node1\t0", "node2\t0"} @@ -48,7 +48,7 @@ def test_global_in(start_cluster): assert set( node1.query( - """SELECT hostName(), * FROM clusterAllReplicas("one_shard_two_nodes", system.one) where dummy GLOBAL IN u""" + """SELECT hostName(), * FROM clusterAllReplicas("one_shard_two_nodes", system.one) where dummy GLOBAL IN u ORDER BY ALL""" ).splitlines() ) == {"node1\t0", "node2\t0"} @@ -63,7 +63,7 @@ def test_global_in(start_cluster): def test_skip_unavailable_replica(start_cluster, cluster): assert ( node1.query( - f"SELECT hostName() FROM clusterAllReplicas('{cluster}', system.one) settings skip_unavailable_shards=1" + f"SELECT hostName() FROM clusterAllReplicas('{cluster}', system.one) ORDER BY ALL settings skip_unavailable_shards=1" ) == "node1\nnode2\n" ) @@ -81,5 +81,5 @@ def test_error_on_unavailable_replica(start_cluster, cluster): # so when skip_unavailable_shards=0 - any unavailable replica should lead to an error with pytest.raises(QueryRuntimeException): node1.query( - f"SELECT hostName() FROM clusterAllReplicas('{cluster}', system.one) settings skip_unavailable_shards=0" + f"SELECT hostName() FROM clusterAllReplicas('{cluster}', system.one) ORDER BY ALL settings skip_unavailable_shards=0" ) From 06ceaee50218507f49bfc714903240ad4b5d81a0 Mon Sep 17 00:00:00 2001 From: Pablo Marcos 
Date: Mon, 12 Aug 2024 11:09:45 +0000 Subject: [PATCH 097/142] Fix test 01903_correct_block_size_prediction_with_default - Don't allow random settings that affect the memory usage - Run two queries and compare the memory usage, rather than having an arbitrary hardcoded value --- ...ock_size_prediction_with_default.reference | 6 ++++ ...rect_block_size_prediction_with_default.sh | 36 +++++++++++++++++++ ...ect_block_size_prediction_with_default.sql | 13 ------- 3 files changed, 42 insertions(+), 13 deletions(-) create mode 100755 tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh delete mode 100644 tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sql diff --git a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.reference b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.reference index b70a1cb7c75..2c66db91737 100644 --- a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.reference +++ b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.reference @@ -1,3 +1,9 @@ 8 +8 +1 4 4 +1 +4 +4 +1 diff --git a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh new file mode 100755 index 00000000000..922dcb957e5 --- /dev/null +++ b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh @@ -0,0 +1,36 @@ +#!/usr/bin/env bash +# Tags: no-random-merge-tree-settings, no-random-settings + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +sql="toUInt16OrNull(arrayFirst((v, k) -> (k = '4Id'), arr[2], arr[1]))" + +# Create the table and fill it +$CLICKHOUSE_CLIENT -n --query=" + CREATE TABLE test_extract(str String, arr Array(Array(String)) ALIAS extractAllGroupsHorizontal(str, '\\W(\\w+)=(\"[^\"]*?\"|[^\",}]*)')) ENGINE=MergeTree() PARTITION BY tuple() ORDER BY tuple(); + INSERT INTO test_extract (str) WITH range(8) as range_arr, arrayMap(x-> concat(toString(x),'Id'), range_arr) as key, arrayMap(x -> rand() % 8, range_arr) as val, arrayStringConcat(arrayMap((x,y) -> concat(x,'=',toString(y)), key, val),',') as str SELECT str FROM numbers(500000); + ALTER TABLE test_extract ADD COLUMN 15Id Nullable(UInt16) DEFAULT $sql;" + +function test() +{ + # Execute two queries and compare if they have similar memory usage: + # The first query uses the default column value, while the second explicitly uses the same SQL as the default value. 
+ # Follow https://github.com/ClickHouse/ClickHouse/issues/17317 for more info about the issue + where=$1 + + uuid_1=$($CLICKHOUSE_CLIENT --query="SELECT generateUUIDv4()") + $CLICKHOUSE_CLIENT --query="SELECT uniq(15Id) FROM test_extract $where SETTINGS max_threads=1" --query_id=$uuid_1 + uuid_2=$($CLICKHOUSE_CLIENT --query="SELECT generateUUIDv4()") + $CLICKHOUSE_CLIENT --query="SELECT uniq($sql) FROM test_extract $where SETTINGS max_threads=1" --query_id=$uuid_2 + $CLICKHOUSE_CLIENT -n --query=" + SYSTEM FLUSH LOGS; + WITH memory_1 AS (SELECT memory_usage FROM system.query_log WHERE event_time > now() - INTERVAL 5 MINUTE AND query_id='$uuid_1' AND type = 'QueryFinish' as memory_1), + memory_2 AS (SELECT memory_usage FROM system.query_log WHERE event_time > now() - INTERVAL 5 MINUTE AND query_id='$uuid_2' AND type = 'QueryFinish' as memory_2) + SELECT memory_1.memory_usage <= 1.2 * memory_2.memory_usage FROM memory_1, memory_2;" +} + +test "" +test "PREWHERE 15Id < 4" +test "WHERE 15Id < 4" diff --git a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sql b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sql deleted file mode 100644 index 2eec08635eb..00000000000 --- a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sql +++ /dev/null @@ -1,13 +0,0 @@ --- Tags: no-random-merge-tree-settings - -CREATE TABLE test_extract(str String, arr Array(Array(String)) ALIAS extractAllGroupsHorizontal(str, '\\W(\\w+)=("[^"]*?"|[^",}]*)')) ENGINE=MergeTree() PARTITION BY tuple() ORDER BY tuple(); - -INSERT INTO test_extract (str) WITH range(8) as range_arr, arrayMap(x-> concat(toString(x),'Id'), range_arr) as key, arrayMap(x -> rand() % 8, range_arr) as val, arrayStringConcat(arrayMap((x,y) -> concat(x,'=',toString(y)), key, val),',') as str SELECT str FROM numbers(500000); - -ALTER TABLE test_extract ADD COLUMN `15Id` Nullable(UInt16) DEFAULT toUInt16OrNull(arrayFirst((v, k) -> (k = '4Id'), arr[2], arr[1])); - -SELECT uniq(15Id) FROM test_extract SETTINGS max_threads=1, max_memory_usage=100000000; - -SELECT uniq(15Id) FROM test_extract PREWHERE 15Id < 4 SETTINGS max_threads=1, max_memory_usage=100000000; - -SELECT uniq(15Id) FROM test_extract WHERE 15Id < 4 SETTINGS max_threads=1, max_memory_usage=100000000; From 34643ee16c5452a08feac63aaaea605a8c912b37 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 12 Aug 2024 13:30:25 +0000 Subject: [PATCH 098/142] Run test only from modified files --- tests/ci/integration_tests_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ci/integration_tests_runner.py b/tests/ci/integration_tests_runner.py index 84718462ab5..7b9e7d1f63e 100755 --- a/tests/ci/integration_tests_runner.py +++ b/tests/ci/integration_tests_runner.py @@ -69,9 +69,9 @@ def get_changed_tests_to_run(pr_info, repo_path): return [] for fpath in changed_files: - if "tests/integration/test_" in fpath: + if re.search("tests/integration/test_.*/test.*\.py", fpath) is not None: logging.info("File %s changed and seems like integration test", fpath) - result.add(fpath.split("/")[2]) + result.add("/".join(fpath.split("/")[2:])) return filter_existing_tests(result, repo_path) From f7c6eabb498b47b21c13dbf55efbda551902d09c Mon Sep 17 00:00:00 2001 From: Pablo Marcos Date: Mon, 12 Aug 2024 13:44:05 +0000 Subject: [PATCH 099/142] Small fix to filter by current_database in system.query_log --- ...01903_correct_block_size_prediction_with_default.sh | 10 +++++----- 1 
file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh index 922dcb957e5..e898a9d5ee8 100755 --- a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh +++ b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh @@ -20,15 +20,15 @@ function test() # Follow https://github.com/ClickHouse/ClickHouse/issues/17317 for more info about the issue where=$1 - uuid_1=$($CLICKHOUSE_CLIENT --query="SELECT generateUUIDv4()") + uuid_1=$(cat /proc/sys/kernel/random/uuid) $CLICKHOUSE_CLIENT --query="SELECT uniq(15Id) FROM test_extract $where SETTINGS max_threads=1" --query_id=$uuid_1 - uuid_2=$($CLICKHOUSE_CLIENT --query="SELECT generateUUIDv4()") + uuid_2=$(cat /proc/sys/kernel/random/uuid) $CLICKHOUSE_CLIENT --query="SELECT uniq($sql) FROM test_extract $where SETTINGS max_threads=1" --query_id=$uuid_2 $CLICKHOUSE_CLIENT -n --query=" SYSTEM FLUSH LOGS; - WITH memory_1 AS (SELECT memory_usage FROM system.query_log WHERE event_time > now() - INTERVAL 5 MINUTE AND query_id='$uuid_1' AND type = 'QueryFinish' as memory_1), - memory_2 AS (SELECT memory_usage FROM system.query_log WHERE event_time > now() - INTERVAL 5 MINUTE AND query_id='$uuid_2' AND type = 'QueryFinish' as memory_2) - SELECT memory_1.memory_usage <= 1.2 * memory_2.memory_usage FROM memory_1, memory_2;" + WITH memory_1 AS (SELECT memory_usage FROM system.query_log WHERE current_database = currentDatabase() AND query_id='$uuid_1' AND type = 'QueryFinish' as memory_1), + memory_2 AS (SELECT memory_usage FROM system.query_log WHERE current_database = currentDatabase() AND query_id='$uuid_2' AND type = 'QueryFinish' as memory_2) + SELECT memory_1.memory_usage <= 1.2 * memory_2.memory_usage FROM memory_1, memory_2;" } test "" From 6cde029ed9cd6a22937193e12863974f1fa8f160 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 12 Aug 2024 13:48:44 +0000 Subject: [PATCH 100/142] Fix style --- tests/ci/integration_tests_runner.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/integration_tests_runner.py b/tests/ci/integration_tests_runner.py index 7b9e7d1f63e..f5dbef4f6db 100755 --- a/tests/ci/integration_tests_runner.py +++ b/tests/ci/integration_tests_runner.py @@ -69,7 +69,7 @@ def get_changed_tests_to_run(pr_info, repo_path): return [] for fpath in changed_files: - if re.search("tests/integration/test_.*/test.*\.py", fpath) is not None: + if re.search(r"tests/integration/test_.*/test.*\.py", fpath) is not None: logging.info("File %s changed and seems like integration test", fpath) result.add("/".join(fpath.split("/")[2:])) return filter_existing_tests(result, repo_path) From da5b9582a990f7a2c05c1a3dede3739fb9cbfcae Mon Sep 17 00:00:00 2001 From: Pablo Marcos Date: Mon, 12 Aug 2024 13:54:17 +0000 Subject: [PATCH 101/142] Fix indent --- .../01903_correct_block_size_prediction_with_default.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh index e898a9d5ee8..075d9a1dacf 100755 --- a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh +++ b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh @@ -26,9 +26,9 @@ function test() $CLICKHOUSE_CLIENT --query="SELECT 
uniq($sql) FROM test_extract $where SETTINGS max_threads=1" --query_id=$uuid_2 $CLICKHOUSE_CLIENT -n --query=" SYSTEM FLUSH LOGS; - WITH memory_1 AS (SELECT memory_usage FROM system.query_log WHERE current_database = currentDatabase() AND query_id='$uuid_1' AND type = 'QueryFinish' as memory_1), - memory_2 AS (SELECT memory_usage FROM system.query_log WHERE current_database = currentDatabase() AND query_id='$uuid_2' AND type = 'QueryFinish' as memory_2) - SELECT memory_1.memory_usage <= 1.2 * memory_2.memory_usage FROM memory_1, memory_2;" + WITH memory_1 AS (SELECT memory_usage FROM system.query_log WHERE current_database = currentDatabase() AND query_id='$uuid_1' AND type = 'QueryFinish' as memory_1), + memory_2 AS (SELECT memory_usage FROM system.query_log WHERE current_database = currentDatabase() AND query_id='$uuid_2' AND type = 'QueryFinish' as memory_2) + SELECT memory_1.memory_usage <= 1.2 * memory_2.memory_usage FROM memory_1, memory_2;" } test "" From eb3ffb71847fc5a204af31665a4594b0918fc1d7 Mon Sep 17 00:00:00 2001 From: divanik Date: Mon, 12 Aug 2024 15:09:16 +0000 Subject: [PATCH 102/142] Add supportsReplication --- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 625b1281c61..f925cb773f5 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -2351,7 +2351,7 @@ size_t MergeTreeData::clearOldTemporaryDirectories(const String & root_path, siz /// We don't control the amount of refs for temporary parts so we cannot decide can we remove blobs /// or not. So we are not doing it bool keep_shared = false; - if (disk->supportZeroCopyReplication() && settings->allow_remote_fs_zero_copy_replication) + if (disk->supportZeroCopyReplication() && settings->allow_remote_fs_zero_copy_replication && supportsReplication()) { LOG_WARNING(log, "Since zero-copy replication is enabled we are not going to remove blobs from shared storage for {}", full_path); keep_shared = true; From a39b9cf643bff565728be4083eb024ff5254f363 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 11 May 2024 13:05:24 +0000 Subject: [PATCH 103/142] Un-screw usearch's build description No directory 'SimSIMD-map' exists, the build only worked because SimSIMD support in usearch was (accidentally?) disabled. This commit corrects the build description. SimSIMD support in usearch will be enabled by a later commit. 
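For reference, a condensed sketch of what the corrected build description amounts to (paths taken from the hunks below; the real contrib/usearch-cmake/CMakeLists.txt also wires up the FP16 and robin-map include directories, omitted here):

```cmake
# Point usearch's interface include path at the directory that actually exists
# (contrib/SimSIMD), not the nonexistent "SimSIMD-map", so the SimSIMD headers
# can really be found once SimSIMD support is switched on later.
set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD")
set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch")

add_library(_usearch INTERFACE)
target_include_directories(_usearch SYSTEM INTERFACE
    ${SIMSIMD_PROJECT_DIR}/include
    ${USEARCH_PROJECT_DIR}/include)

add_library(ch_contrib::usearch ALIAS _usearch)
```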
--- contrib/CMakeLists.txt | 2 +- contrib/usearch-cmake/CMakeLists.txt | 8 +++----- src/Storages/MergeTree/MergeTreeIndexUSearch.h | 1 - 3 files changed, 4 insertions(+), 7 deletions(-) diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index b33e7083e32..98b992e1080 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -207,7 +207,7 @@ if (ARCH_S390X) endif() add_contrib (annoy-cmake annoy) -option(ENABLE_USEARCH "Enable USearch (Approximate Neighborhood Search, HNSW) support" ${ENABLE_LIBRARIES}) +option(ENABLE_USEARCH "Enable USearch" ${ENABLE_LIBRARIES}) if (ENABLE_USEARCH) add_contrib (FP16-cmake FP16) add_contrib (robin-map-cmake robin-map) diff --git a/contrib/usearch-cmake/CMakeLists.txt b/contrib/usearch-cmake/CMakeLists.txt index 29fbe57106c..0b6f60e106b 100644 --- a/contrib/usearch-cmake/CMakeLists.txt +++ b/contrib/usearch-cmake/CMakeLists.txt @@ -1,9 +1,7 @@ -set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch") -set(USEARCH_SOURCE_DIR "${USEARCH_PROJECT_DIR}/include") - set(FP16_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/FP16") set(ROBIN_MAP_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/robin-map") -set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD-map") +set(SIMSIMD_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/SimSIMD") +set(USEARCH_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/usearch") add_library(_usearch INTERFACE) @@ -11,7 +9,7 @@ target_include_directories(_usearch SYSTEM INTERFACE ${FP16_PROJECT_DIR}/include ${ROBIN_MAP_PROJECT_DIR}/include ${SIMSIMD_PROJECT_DIR}/include - ${USEARCH_SOURCE_DIR}) + ${USEARCH_PROJECT_DIR}/include) add_library(ch_contrib::usearch ALIAS _usearch) target_compile_definitions(_usearch INTERFACE ENABLE_USEARCH) diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.h b/src/Storages/MergeTree/MergeTreeIndexUSearch.h index 41de94402c9..e6068790d22 100644 --- a/src/Storages/MergeTree/MergeTreeIndexUSearch.h +++ b/src/Storages/MergeTree/MergeTreeIndexUSearch.h @@ -113,4 +113,3 @@ private: #endif - From d7211f9d12d33c54929fb24991fe7e46939be67d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 9 Aug 2024 09:22:38 +0000 Subject: [PATCH 104/142] Fix CMake integration of usearch and annoy Registers usearch and annoy properly via configure_config.cmake and config.h.in like all other 3rd party libs, instead of (mis)using target_compile_definitions. 
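The registration pattern described above, condensed from the hunks that follow — a rough sketch covering only the usearch/annoy pieces:

```cmake
# src/configure_config.cmake: translate the presence of a contrib target into a USE_* flag.
if (TARGET ch_contrib::annoy)
    set(USE_ANNOY 1)
endif()
if (TARGET ch_contrib::usearch)
    set(USE_USEARCH 1)
endif()

# src/Common/config.h.in then materializes these flags via
#   #cmakedefine01 USE_ANNOY
#   #cmakedefine01 USE_USEARCH
# so C++ sources include "config.h" and gate on `#if USE_ANNOY` / `#if USE_USEARCH`
# instead of the old `#ifdef ENABLE_*` macros injected via target_compile_definitions.
```

With `#cmakedefine01` the macros are always defined to 0 or 1 by the generated config.h, which keeps the optional ANN indexes gated the same way as every other optional contrib library.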
--- contrib/annoy-cmake/CMakeLists.txt | 1 - contrib/usearch-cmake/CMakeLists.txt | 1 - src/Common/config.h.in | 2 ++ src/Processors/QueryPlan/ReadFromMergeTree.cpp | 6 ++++-- src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp | 4 ++-- src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 4 +++- src/Storages/MergeTree/MergeTreeIndexUSearch.cpp | 6 +++--- src/Storages/MergeTree/MergeTreeIndexUSearch.h | 8 ++++++-- src/Storages/MergeTree/MergeTreeIndices.cpp | 4 ++-- src/Storages/MergeTree/MergeTreeIndices.h | 5 +++-- src/configure_config.cmake | 6 ++++++ 11 files changed, 31 insertions(+), 16 deletions(-) diff --git a/contrib/annoy-cmake/CMakeLists.txt b/contrib/annoy-cmake/CMakeLists.txt index bdef7d92132..f6579c12412 100644 --- a/contrib/annoy-cmake/CMakeLists.txt +++ b/contrib/annoy-cmake/CMakeLists.txt @@ -20,5 +20,4 @@ add_library(_annoy INTERFACE) target_include_directories(_annoy SYSTEM INTERFACE ${ANNOY_SOURCE_DIR}) add_library(ch_contrib::annoy ALIAS _annoy) -target_compile_definitions(_annoy INTERFACE ENABLE_ANNOY) target_compile_definitions(_annoy INTERFACE ANNOYLIB_MULTITHREADED_BUILD) diff --git a/contrib/usearch-cmake/CMakeLists.txt b/contrib/usearch-cmake/CMakeLists.txt index 0b6f60e106b..6be622275ae 100644 --- a/contrib/usearch-cmake/CMakeLists.txt +++ b/contrib/usearch-cmake/CMakeLists.txt @@ -12,4 +12,3 @@ target_include_directories(_usearch SYSTEM INTERFACE ${USEARCH_PROJECT_DIR}/include) add_library(ch_contrib::usearch ALIAS _usearch) -target_compile_definitions(_usearch INTERFACE ENABLE_USEARCH) diff --git a/src/Common/config.h.in b/src/Common/config.h.in index e3f8882850f..0fa5f4313b2 100644 --- a/src/Common/config.h.in +++ b/src/Common/config.h.in @@ -58,6 +58,8 @@ #cmakedefine01 USE_FILELOG #cmakedefine01 USE_ODBC #cmakedefine01 USE_BLAKE3 +#cmakedefine01 USE_ANNOY +#cmakedefine01 USE_USEARCH #cmakedefine01 USE_SKIM #cmakedefine01 USE_PRQL #cmakedefine01 USE_ULID diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 901d7c61167..0ec7bde933c 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -52,6 +52,8 @@ #include #include +#include "config.h" + using namespace DB; namespace @@ -1476,11 +1478,11 @@ static void buildIndexes( MergeTreeIndexConditionPtr condition; if (index_helper->isVectorSearch()) { -#ifdef ENABLE_ANNOY +#if USE_ANNOY if (const auto * annoy = typeid_cast(index_helper.get())) condition = annoy->createIndexCondition(query_info, context); #endif -#ifdef ENABLE_USEARCH +#if USE_USEARCH if (const auto * usearch = typeid_cast(index_helper.get())) condition = usearch->createIndexCondition(query_info, context); #endif diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index b68e48eeb3a..cec0e0926f0 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -1,7 +1,7 @@ -#ifdef ENABLE_ANNOY - #include +#if USE_ANNOY + #include #include #include diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h index 282920c608e..8e0e0e621a0 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.h +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.h @@ -1,6 +1,8 @@ #pragma once -#ifdef ENABLE_ANNOY +#include "config.h" + +#if USE_ANNOY #include diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp index efd9bb754e1..5a532803d84 100644 --- 
a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp @@ -1,10 +1,10 @@ -#ifdef ENABLE_USEARCH +#include + +#if USE_USEARCH #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wpass-failed" -#include - #include #include #include diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.h b/src/Storages/MergeTree/MergeTreeIndexUSearch.h index e6068790d22..6923ef2f807 100644 --- a/src/Storages/MergeTree/MergeTreeIndexUSearch.h +++ b/src/Storages/MergeTree/MergeTreeIndexUSearch.h @@ -1,12 +1,16 @@ #pragma once -#ifdef ENABLE_USEARCH +#include "config.h" -#include +#if USE_USEARCH #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wpass-failed" + +#include + #include + #pragma clang diagnostic pop namespace DB diff --git a/src/Storages/MergeTree/MergeTreeIndices.cpp b/src/Storages/MergeTree/MergeTreeIndices.cpp index bded961db8e..32ac629e706 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.cpp +++ b/src/Storages/MergeTree/MergeTreeIndices.cpp @@ -127,12 +127,12 @@ MergeTreeIndexFactory::MergeTreeIndexFactory() registerCreator("hypothesis", hypothesisIndexCreator); registerValidator("hypothesis", hypothesisIndexValidator); -#ifdef ENABLE_ANNOY +#if USE_ANNOY registerCreator("annoy", annoyIndexCreator); registerValidator("annoy", annoyIndexValidator); #endif -#ifdef ENABLE_USEARCH +#if USE_USEARCH registerCreator("usearch", usearchIndexCreator); registerValidator("usearch", usearchIndexValidator); #endif diff --git a/src/Storages/MergeTree/MergeTreeIndices.h b/src/Storages/MergeTree/MergeTreeIndices.h index 1be73e1c811..355f1b69356 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.h +++ b/src/Storages/MergeTree/MergeTreeIndices.h @@ -15,6 +15,7 @@ #include #include +#include "config.h" constexpr auto INDEX_FILE_PREFIX = "skp_idx_"; @@ -230,12 +231,12 @@ void bloomFilterIndexValidator(const IndexDescription & index, bool attach); MergeTreeIndexPtr hypothesisIndexCreator(const IndexDescription & index); void hypothesisIndexValidator(const IndexDescription & index, bool attach); -#ifdef ENABLE_ANNOY +#if USE_ANNOY MergeTreeIndexPtr annoyIndexCreator(const IndexDescription & index); void annoyIndexValidator(const IndexDescription & index, bool attach); #endif -#ifdef ENABLE_USEARCH +#if USE_USEARCH MergeTreeIndexPtr usearchIndexCreator(const IndexDescription& index); void usearchIndexValidator(const IndexDescription& index, bool attach); #endif diff --git a/src/configure_config.cmake b/src/configure_config.cmake index 5b24f79ef6e..702875b1f40 100644 --- a/src/configure_config.cmake +++ b/src/configure_config.cmake @@ -164,6 +164,12 @@ endif() if (TARGET ch_contrib::bcrypt) set(USE_BCRYPT 1) endif() +if (TARGET ch_contrib::annoy) + set(USE_ANNOY 1) +endif() +if (TARGET ch_contrib::usearch) + set(USE_USEARCH 1) +endif() if (TARGET ch_contrib::ssh) set(USE_SSH 1) endif() From 7c419399216a714f9dcffe7835f951718851bceb Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 9 Aug 2024 09:36:39 +0000 Subject: [PATCH 105/142] Fix test results (no analyzer support yet ...) 
--- tests/queries/0_stateless/02354_vector_search_queries.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02354_vector_search_queries.sql b/tests/queries/0_stateless/02354_vector_search_queries.sql index 64051aa8544..87d27be0ea4 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.sql +++ b/tests/queries/0_stateless/02354_vector_search_queries.sql @@ -8,6 +8,8 @@ SET allow_experimental_annoy_index = 1; SET allow_experimental_usearch_index = 1; +SET enable_analyzer = 0; + SELECT 'ARRAY, 10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block'; DROP TABLE IF EXISTS tab_annoy; From 218421c255cadbe65406e6a040d05942cc4efc3e Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 9 Aug 2024 09:47:50 +0000 Subject: [PATCH 106/142] Remove Annoy indexes Annoy indexes fell out of favor in the community, at least when it comes to vector databases. Such indexes work okay-ish low dimensions but they suffers badly from a curse of dimensionality which makes them inapt for a high number of dimensions. Now that Annoy is gone, issue (*) also disappears and we can drop 'no-ubsan', 'no-cpu-aarch64', and 'no-asan' from tests. (*) spotify/annoy#456 --- .gitmodules | 3 - contrib/CMakeLists.txt | 1 - contrib/annoy | 1 - contrib/annoy-cmake/CMakeLists.txt | 23 - .../mergetree-family/annindexes.md | 87 +--- src/CMakeLists.txt | 4 - src/Common/config.h.in | 1 - src/Core/Settings.h | 6 +- src/Databases/DatabaseReplicated.cpp | 1 - src/Interpreters/InterpreterCreateQuery.cpp | 2 - src/Parsers/ASTIndexDeclaration.h | 1 - src/Parsers/ParserCreateIndexQuery.cpp | 4 +- src/Parsers/ParserCreateQuery.cpp | 4 +- .../QueryPlan/ReadFromMergeTree.cpp | 5 - .../MergeTree/MergeTreeIOSettings.cpp | 1 - src/Storages/MergeTree/MergeTreeIOSettings.h | 2 - .../MergeTree/MergeTreeIndexAnnoy.cpp | 416 ------------------ src/Storages/MergeTree/MergeTreeIndexAnnoy.h | 114 ----- src/Storages/MergeTree/MergeTreeIndices.cpp | 4 - src/Storages/MergeTree/MergeTreeIndices.h | 5 - src/configure_config.cmake | 3 - .../02354_vector_search_bugs.reference | 10 - .../0_stateless/02354_vector_search_bugs.sql | 75 +--- ...ector_search_default_granularity.reference | 6 +- ...2354_vector_search_default_granularity.sql | 21 +- ...r_search_index_creation_negative.reference | 2 +- ..._vector_search_index_creation_negative.sql | 22 +- .../02354_vector_search_queries.reference | 99 ----- .../02354_vector_search_queries.sql | 119 +---- 29 files changed, 32 insertions(+), 1010 deletions(-) delete mode 160000 contrib/annoy delete mode 100644 contrib/annoy-cmake/CMakeLists.txt delete mode 100644 src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp delete mode 100644 src/Storages/MergeTree/MergeTreeIndexAnnoy.h diff --git a/.gitmodules b/.gitmodules index 7fdfb1103c5..0a66031de8d 100644 --- a/.gitmodules +++ b/.gitmodules @@ -230,9 +230,6 @@ [submodule "contrib/minizip-ng"] path = contrib/minizip-ng url = https://github.com/zlib-ng/minizip-ng -[submodule "contrib/annoy"] - path = contrib/annoy - url = https://github.com/ClickHouse/annoy [submodule "contrib/qpl"] path = contrib/qpl url = https://github.com/intel/qpl diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 98b992e1080..dc2ad2a3150 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -205,7 +205,6 @@ add_contrib (morton-nd-cmake morton-nd) if (ARCH_S390X) add_contrib(crc32-s390x-cmake crc32-s390x) endif() -add_contrib (annoy-cmake annoy) option(ENABLE_USEARCH "Enable USearch" ${ENABLE_LIBRARIES}) if 
(ENABLE_USEARCH) diff --git a/contrib/annoy b/contrib/annoy deleted file mode 160000 index f2ac8e7b48f..00000000000 --- a/contrib/annoy +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f2ac8e7b48f9a9cf676d3b58286e5455aba8e956 diff --git a/contrib/annoy-cmake/CMakeLists.txt b/contrib/annoy-cmake/CMakeLists.txt deleted file mode 100644 index f6579c12412..00000000000 --- a/contrib/annoy-cmake/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ -option(ENABLE_ANNOY "Enable Annoy index support" ${ENABLE_LIBRARIES}) - -# Annoy index should be disabled with undefined sanitizer. Because of memory storage optimizations -# (https://github.com/ClickHouse/annoy/blob/9d8a603a4cd252448589e84c9846f94368d5a289/src/annoylib.h#L442-L463) -# UBSan fails and leads to crash. Simmilar issue is already opened in Annoy repo -# https://github.com/spotify/annoy/issues/456 -# Problem with aligment can lead to errors like -# (https://stackoverflow.com/questions/46790550/c-undefined-behavior-strict-aliasing-rule-or-incorrect-alignment) -# or will lead to crash on arm https://developer.arm.com/documentation/ka003038/latest -# This issues should be resolved before annoy became non-experimental (--> setting "allow_experimental_annoy_index") -if ((NOT ENABLE_ANNOY) OR (SANITIZE STREQUAL "undefined") OR (ARCH_AARCH64)) - message (STATUS "Not using annoy") - return() -endif() - -set(ANNOY_PROJECT_DIR "${ClickHouse_SOURCE_DIR}/contrib/annoy") -set(ANNOY_SOURCE_DIR "${ANNOY_PROJECT_DIR}/src") - -add_library(_annoy INTERFACE) -target_include_directories(_annoy SYSTEM INTERFACE ${ANNOY_SOURCE_DIR}) - -add_library(ch_contrib::annoy ALIAS _annoy) -target_compile_definitions(_annoy INTERFACE ANNOYLIB_MULTITHREADED_BUILD) diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 5a81313f62e..9a80542522e 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -126,81 +126,8 @@ was specified for ANN indexes, the default value is 100 million. # Available ANN Indexes {#available_ann_indexes} -- [Annoy](/docs/en/engines/table-engines/mergetree-family/annindexes.md#annoy-annoy) - - [USearch](/docs/en/engines/table-engines/mergetree-family/annindexes.md#usearch-usearch) -## Annoy {#annoy} - -Annoy indexes are currently experimental, to use them you first need to `SET allow_experimental_annoy_index = 1`. They are also currently -disabled on ARM due to memory safety problems with the algorithm. - -This type of ANN index is based on the [Annoy library](https://github.com/spotify/annoy) which recursively divides the space into random -linear surfaces (lines in 2D, planes in 3D etc.). - -
- -
- -Syntax to create an Annoy index over an [Array(Float32)](../../../sql-reference/data-types/array.md) column: - -```sql -CREATE TABLE table_with_annoy_index -( - id Int64, - vectors Array(Float32), - INDEX [ann_index_name] vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N] -) -ENGINE = MergeTree -ORDER BY id; -``` - -Annoy currently supports two distance functions: -- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space - ([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)). -- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors - ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)). - -For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no -distance function was specified during index creation, `L2Distance` is used as default. - -Parameter `NumTrees` is the number of trees which the algorithm creates (default if not specified: 100). Higher values of `NumTree` mean -more accurate search results but slower index creation / query times (approximately linearly) as well as larger index sizes. - -:::note -All arrays must have same length. To avoid errors, you can use a -[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT constraint_name_1 CHECK -length(vectors) = 256`. Also, empty `Arrays` and unspecified `Array` values in INSERT statements (i.e. default values) are not supported. -::: - -The creation of Annoy indexes (whenever a new part is build, e.g. at the end of a merge) is a relatively slow process. You can increase -setting `max_threads_for_annoy_index_creation` (default: 4) which controls how many threads are used to create an Annoy index. Please be -careful with this setting, it is possible that multiple indexes are created in parallel in which case there can be overparallelization. - -Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger -values mean more accurate results at the cost of longer query runtime: - -```sql -SELECT * -FROM table_name -ORDER BY L2Distance(vectors, Point) -LIMIT N -SETTINGS annoy_index_search_k_nodes=100; -``` - -:::note -The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see -[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml. -::: - ## USearch {#usearch} This type of ANN index is based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW @@ -211,6 +138,8 @@ that are expensive to load and compare. The library also has several hardware-sp distance computations on modern Arm (NEON and SVE) and x86 (AVX2 and AVX-512) CPUs and OS-specific optimizations to allow efficient navigation around immutable persistent files, without loading them into RAM. +USearch indexes are currently experimental, to use them you first need to `SET allow_experimental_usearch_index = 1`. +
-
- -Syntax to create an USearch index over an [Array](../../../sql-reference/data-types/array.md) column: - -```sql -CREATE TABLE table_with_usearch_index -( - id Int64, - vectors Array(Float32), - INDEX [ann_index_name] vectors TYPE usearch([Distance[, ScalarKind]]) [GRANULARITY N] -) -ENGINE = MergeTree -ORDER BY id; -``` - -USearch currently supports two distance functions: -- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space - ([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)). -- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors - ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)). - -USearch allows storing the vectors in reduced precision formats. Supported scalar kinds are `f64`, `f32`, `f16` or `i8`. If no scalar kind -was specified during index creation, `f16` is used as default. - -For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no -distance function was specified during index creation, `L2Distance` is used as default. - -:::note -All arrays must have same length. To avoid errors, you can use a -[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT constraint_name_1 CHECK -length(vectors) = 256`. Also, empty `Arrays` and unspecified `Array` values in INSERT statements (i.e. default values) are not supported. -::: - -:::note -The USearch index currently does not work with per-table, non-default `index_granularity` settings (see -[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml. -::: - diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 23ad12bb017..e9f3b95dbc1 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -907,9 +907,9 @@ class IColumn; M(Bool, allow_experimental_hash_functions, false, "Enable experimental hash functions", 0) \ M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \ M(Bool, allow_experimental_time_series_table, false, "Allows experimental TimeSeries table engine", 0) \ + M(Bool, allow_experimental_vector_similarity_index, false, "Allow experimental vector similarity index", 0) \ M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \ M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \ - M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \ M(Bool, allow_experimental_codecs, false, "If it is set to true, allow to specify experimental compression codecs (but we don't have those yet and this option does nothing).", 0) \ M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. 
Helps to prevent memory overflows in ANN search indexes.", 0) \ M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \ @@ -1036,6 +1036,7 @@ class IColumn; MAKE_OBSOLETE(M, Bool, allow_experimental_annoy_index, false) \ MAKE_OBSOLETE(M, UInt64, max_threads_for_annoy_index_creation, 4) \ MAKE_OBSOLETE(M, Int64, annoy_index_search_k_nodes, -1) \ + MAKE_OBSOLETE(M, Bool, allow_experimental_usearch_index, false) \ MAKE_OBSOLETE(M, Bool, optimize_move_functions_out_of_any, false) \ MAKE_OBSOLETE(M, Bool, allow_experimental_undrop_table_query, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_s3queue, true) \ diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 511723f1873..8fabd1ecf91 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -85,6 +85,7 @@ static std::initializer_listsetSetting("allow_experimental_object_type", 1); query_context->setSetting("allow_experimental_variant_type", 1); query_context->setSetting("allow_experimental_dynamic_type", 1); - query_context->setSetting("allow_experimental_usearch_index", 1); + query_context->setSetting("allow_experimental_vector_similarity_index", 1); query_context->setSetting("allow_experimental_bigint_types", 1); query_context->setSetting("allow_experimental_window_functions", 1); query_context->setSetting("allow_experimental_geo_types", 1); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index a1ffcf07588..95143031707 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -787,8 +787,8 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti if (index_desc.type == INVERTED_INDEX_NAME && !settings.allow_experimental_inverted_index) throw Exception(ErrorCodes::ILLEGAL_INDEX, "Please use index type 'full_text' instead of 'inverted'"); /// ---- - if (index_desc.type == "usearch" && !settings.allow_experimental_usearch_index) - throw Exception(ErrorCodes::INCORRECT_QUERY, "USearch index is disabled. Turn on allow_experimental_usearch_index"); + if (index_desc.type == "vector_similarity" && !settings.allow_experimental_vector_similarity_index) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index is disabled. 
Turn on allow_experimental_vector_similarity_index"); properties.indices.push_back(index_desc); } diff --git a/src/Parsers/ASTIndexDeclaration.h b/src/Parsers/ASTIndexDeclaration.h index 90645f12b7c..72f3f017a99 100644 --- a/src/Parsers/ASTIndexDeclaration.h +++ b/src/Parsers/ASTIndexDeclaration.h @@ -13,7 +13,7 @@ class ASTIndexDeclaration : public IAST { public: static const auto DEFAULT_INDEX_GRANULARITY = 1uz; - static const auto DEFAULT_USEARCH_INDEX_GRANULARITY = 100'000'000uz; + static const auto DEFAULT_VECTOR_SIMILARITY_INDEX_GRANULARITY = 100'000'000uz; ASTIndexDeclaration(ASTPtr expression, ASTPtr type, const String & name_); diff --git a/src/Parsers/ParserCreateIndexQuery.cpp b/src/Parsers/ParserCreateIndexQuery.cpp index e7cfd753f99..ed89b80edca 100644 --- a/src/Parsers/ParserCreateIndexQuery.cpp +++ b/src/Parsers/ParserCreateIndexQuery.cpp @@ -89,8 +89,8 @@ bool ParserCreateIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected else { auto index_type = index->getType(); - if (index_type && index_type->name == "usearch") - index->granularity = ASTIndexDeclaration::DEFAULT_USEARCH_INDEX_GRANULARITY; + if (index_type && index_type->name == "vector_similarity") + index->granularity = ASTIndexDeclaration::DEFAULT_VECTOR_SIMILARITY_INDEX_GRANULARITY; else index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY; } diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index b31fe21c4cc..cc4e02f46a3 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -214,8 +214,8 @@ bool ParserIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expe else { auto index_type = index->getType(); - if (index_type->name == "usearch") - index->granularity = ASTIndexDeclaration::DEFAULT_USEARCH_INDEX_GRANULARITY; + if (index_type->name == "vector_similarity") + index->granularity = ASTIndexDeclaration::DEFAULT_VECTOR_SIMILARITY_INDEX_GRANULARITY; else index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY; } diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 3324cc4e42a..1f30725b4d0 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include #include @@ -1478,8 +1478,8 @@ static void buildIndexes( if (index_helper->isVectorSimilarityIndex()) { #if USE_USEARCH - if (const auto * usearch_index = typeid_cast(index_helper.get())) - condition = usearch_index->createIndexCondition(query_info, context); + if (const auto * vector_similarity_index = typeid_cast(index_helper.get())) + condition = vector_similarity_index->createIndexCondition(query_info, context); #endif if (!condition) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown vector search index {}", index_helper->index.name); diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp similarity index 76% rename from src/Storages/MergeTree/MergeTreeIndexUSearch.cpp rename to src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp index 1aa6c9c14d4..6f3b1b043cd 100644 --- a/src/Storages/MergeTree/MergeTreeIndexUSearch.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp @@ -1,4 +1,4 @@ -#include +#include #if USE_USEARCH @@ -90,7 +90,7 @@ void USearchIndexWithSerialization::serialize(WriteBuffer & ostr) const auto result = Base::save_to_stream(callback); if (result.error) 
- throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not save USearch index, error: " + String(result.error.release())); + throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not save vector similarity index, error: " + String(result.error.release())); } void USearchIndexWithSerialization::deserialize(ReadBuffer & istr) @@ -104,7 +104,7 @@ void USearchIndexWithSerialization::deserialize(ReadBuffer & istr) auto result = Base::load_from_stream(callback); if (result.error) /// See the comment in MergeTreeIndexGranuleVectorSimilarity::deserializeBinary why we throw here - throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not load USearch index, error: " + String(result.error.release()) + " Please drop the index and create it again."); + throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not load vector similarity index, error: " + String(result.error.release()) + " Please drop the index and create it again."); } USearchIndexWithSerialization::Statistics USearchIndexWithSerialization::getStatistics() const @@ -121,16 +121,16 @@ USearchIndexWithSerialization::Statistics USearchIndexWithSerialization::getStat return statistics; } -MergeTreeIndexGranuleUSearch::MergeTreeIndexGranuleUSearch( +MergeTreeIndexGranuleVectorSimilarity::MergeTreeIndexGranuleVectorSimilarity( const String & index_name_, const Block & index_sample_block_, unum::usearch::metric_kind_t metric_kind_, unum::usearch::scalar_kind_t scalar_kind_) - : MergeTreeIndexGranuleUSearch(index_name_, index_sample_block_, metric_kind_, scalar_kind_, nullptr) + : MergeTreeIndexGranuleVectorSimilarity(index_name_, index_sample_block_, metric_kind_, scalar_kind_, nullptr) { } -MergeTreeIndexGranuleUSearch::MergeTreeIndexGranuleUSearch( +MergeTreeIndexGranuleVectorSimilarity::MergeTreeIndexGranuleVectorSimilarity( const String & index_name_, const Block & index_sample_block_, unum::usearch::metric_kind_t metric_kind_, @@ -144,7 +144,7 @@ MergeTreeIndexGranuleUSearch::MergeTreeIndexGranuleUSearch( { } -void MergeTreeIndexGranuleUSearch::serializeBinary(WriteBuffer & ostr) const +void MergeTreeIndexGranuleVectorSimilarity::serializeBinary(WriteBuffer & ostr) const { if (empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to write empty minmax index {}", backQuote(index_name)); @@ -158,18 +158,18 @@ void MergeTreeIndexGranuleUSearch::serializeBinary(WriteBuffer & ostr) const index->serialize(ostr); auto statistics = index->getStatistics(); - LOG_TRACE(logger, "Wrote USearch index: max_level = {}, connectivity = {}, size = {}, capacity = {}, memory_usage = {}", + LOG_TRACE(logger, "Wrote vector similarity index: max_level = {}, connectivity = {}, size = {}, capacity = {}, memory_usage = {}", statistics.max_level, statistics.connectivity, statistics.size, statistics.capacity, ReadableSize(statistics.memory_usage)); } -void MergeTreeIndexGranuleUSearch::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/) +void MergeTreeIndexGranuleVectorSimilarity::deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion /*version*/) { UInt64 file_version; readIntBinary(file_version, istr); if (file_version != FILE_FORMAT_VERSION) throw Exception( ErrorCodes::FORMAT_VERSION_TOO_OLD, - "USearch index could not be loaded because its version is too old (current version: {}, persisted version: {}). Please drop the index and create it again.", + "Vector similarity index could not be loaded because its version is too old (current version: {}, persisted version: {}). 
Please drop the index and create it again.", FILE_FORMAT_VERSION, file_version); /// More fancy error handling would be: Set a flag on the index that it failed to load. During usage return all granules, i.e. /// behave as if the index does not exist. Since format changes are expected to happen only rarely and it is "only" an index, keep it simple for now. @@ -181,11 +181,11 @@ void MergeTreeIndexGranuleUSearch::deserializeBinary(ReadBuffer & istr, MergeTre index->deserialize(istr); auto statistics = index->getStatistics(); - LOG_TRACE(logger, "Loaded USearch index: max_level = {}, connectivity = {}, size = {}, capacity = {}, memory_usage = {}", + LOG_TRACE(logger, "Loaded vector similarity index: max_level = {}, connectivity = {}, size = {}, capacity = {}, memory_usage = {}", statistics.max_level, statistics.connectivity, statistics.size, statistics.capacity, ReadableSize(statistics.memory_usage)); } -MergeTreeIndexAggregatorUSearch::MergeTreeIndexAggregatorUSearch( +MergeTreeIndexAggregatorVectorSimilarity::MergeTreeIndexAggregatorVectorSimilarity( const String & index_name_, const Block & index_sample_block_, unum::usearch::metric_kind_t metric_kind_, @@ -197,14 +197,14 @@ MergeTreeIndexAggregatorUSearch::MergeTreeIndexAggregatorUSearch( { } -MergeTreeIndexGranulePtr MergeTreeIndexAggregatorUSearch::getGranuleAndReset() +MergeTreeIndexGranulePtr MergeTreeIndexAggregatorVectorSimilarity::getGranuleAndReset() { - auto granule = std::make_shared(index_name, index_sample_block, metric_kind, scalar_kind, index); + auto granule = std::make_shared(index_name, index_sample_block, metric_kind, scalar_kind, index); index = nullptr; return granule; } -void MergeTreeIndexAggregatorUSearch::update(const Block & block, size_t * pos, size_t limit) +void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_t * pos, size_t limit) { if (*pos >= block.rows()) throw Exception( @@ -239,8 +239,8 @@ void MergeTreeIndexAggregatorUSearch::update(const Block & block, size_t * pos, if (column_array->empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Array is unexpectedly empty"); - /// The Usearch algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays - /// are INSERTed into an Usearch-indexed column or if no value was specified at all in which case the arrays take on their default + /// The vector similarity algorithm naturally assumes that the indexed vectors have dimension >= 1. This condition is violated if empty arrays + /// are INSERTed into an vector-similarity-indexed column or if no value was specified at all in which case the arrays take on their default /// values which is also empty. if (column_array->isDefaultAt(0)) throw Exception(ErrorCodes::INCORRECT_DATA, "The arrays in column '{}' must not be empty. 
Did you try to INSERT default values?", index_column_name); @@ -262,13 +262,13 @@ void MergeTreeIndexAggregatorUSearch::update(const Block & block, size_t * pos, /// Reserving space is mandatory if (!index->reserve(roundUpToPowerOfTwoOrZero(index->size() + num_rows))) - throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for usearch index"); + throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for vector similarity index"); for (size_t current_row = 0; current_row < num_rows; ++current_row) { auto rc = index->add(static_cast(index->size()), &column_array_data_float_data[column_array_offsets[current_row - 1]]); if (!rc) - throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not add data to USearch index, error: " + String(rc.error.release())); + throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index, error: " + String(rc.error.release())); ProfileEvents::increment(ProfileEvents::USearchAddCount); ProfileEvents::increment(ProfileEvents::USearchAddVisitedMembers, rc.visited_members); @@ -281,7 +281,7 @@ void MergeTreeIndexAggregatorUSearch::update(const Block & block, size_t * pos, *pos += rows_read; } -MergeTreeIndexConditionUSearch::MergeTreeIndexConditionUSearch( +MergeTreeIndexConditionVectorSimilarity::MergeTreeIndexConditionVectorSimilarity( const IndexDescription & /*index_description*/, const SelectQueryInfo & query, unum::usearch::metric_kind_t metric_kind_, @@ -291,12 +291,12 @@ MergeTreeIndexConditionUSearch::MergeTreeIndexConditionUSearch( { } -bool MergeTreeIndexConditionUSearch::mayBeTrueOnGranule(MergeTreeIndexGranulePtr) const +bool MergeTreeIndexConditionVectorSimilarity::mayBeTrueOnGranule(MergeTreeIndexGranulePtr) const { throw Exception(ErrorCodes::LOGICAL_ERROR, "mayBeTrueOnGranule is not supported for ANN skip indexes"); } -bool MergeTreeIndexConditionUSearch::alwaysUnknownOrTrue() const +bool MergeTreeIndexConditionVectorSimilarity::alwaysUnknownOrTrue() const { String index_distance_function; switch (metric_kind) @@ -308,14 +308,14 @@ bool MergeTreeIndexConditionUSearch::alwaysUnknownOrTrue() const return vector_similarity_condition.alwaysUnknownOrTrue(index_distance_function); } -std::vector MergeTreeIndexConditionUSearch::getUsefulRanges(MergeTreeIndexGranulePtr granule_) const +std::vector MergeTreeIndexConditionVectorSimilarity::getUsefulRanges(MergeTreeIndexGranulePtr granule_) const { const UInt64 limit = vector_similarity_condition.getLimit(); const UInt64 index_granularity = vector_similarity_condition.getIndexGranularity(); const std::vector reference_vector = vector_similarity_condition.getReferenceVector(); - const auto granule = std::dynamic_pointer_cast(granule_); + const auto granule = std::dynamic_pointer_cast(granule_); if (granule == nullptr) throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type"); @@ -328,7 +328,7 @@ std::vector MergeTreeIndexConditionUSearch::getUsefulRanges(MergeTreeInd auto result = index->search(reference_vector.data(), limit); if (result.error) - throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not search in USearch index, error: " + String(result.error.release())); + throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not search in vector similarity index, error: " + String(result.error.release())); ProfileEvents::increment(ProfileEvents::USearchSearchCount); ProfileEvents::increment(ProfileEvents::USearchSearchVisitedMembers, result.visited_members); @@ -350,34 
+350,34 @@ std::vector MergeTreeIndexConditionUSearch::getUsefulRanges(MergeTreeInd return granules; } -MergeTreeIndexUSearch::MergeTreeIndexUSearch(const IndexDescription & index_, unum::usearch::metric_kind_t metric_kind_, unum::usearch::scalar_kind_t scalar_kind_) +MergeTreeIndexVectorSimilarity::MergeTreeIndexVectorSimilarity(const IndexDescription & index_, unum::usearch::metric_kind_t metric_kind_, unum::usearch::scalar_kind_t scalar_kind_) : IMergeTreeIndex(index_) , metric_kind(metric_kind_) , scalar_kind(scalar_kind_) { } -MergeTreeIndexGranulePtr MergeTreeIndexUSearch::createIndexGranule() const +MergeTreeIndexGranulePtr MergeTreeIndexVectorSimilarity::createIndexGranule() const { - return std::make_shared(index.name, index.sample_block, metric_kind, scalar_kind); + return std::make_shared(index.name, index.sample_block, metric_kind, scalar_kind); } -MergeTreeIndexAggregatorPtr MergeTreeIndexUSearch::createIndexAggregator(const MergeTreeWriterSettings & /*settings*/) const +MergeTreeIndexAggregatorPtr MergeTreeIndexVectorSimilarity::createIndexAggregator(const MergeTreeWriterSettings & /*settings*/) const { - return std::make_shared(index.name, index.sample_block, metric_kind, scalar_kind); + return std::make_shared(index.name, index.sample_block, metric_kind, scalar_kind); } -MergeTreeIndexConditionPtr MergeTreeIndexUSearch::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const +MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const { - return std::make_shared(index, query, metric_kind, context); + return std::make_shared(index, query, metric_kind, context); }; -MergeTreeIndexConditionPtr MergeTreeIndexUSearch::createIndexCondition(const ActionsDAG *, ContextPtr) const +MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition(const ActionsDAG *, ContextPtr) const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MergeTreeIndexAnnoy cannot be created with ActionsDAG"); } -MergeTreeIndexPtr usearchIndexCreator(const IndexDescription & index) +MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index) { static constexpr auto default_metric_kind = unum::usearch::metric_kind_t::l2sq_k; auto metric_kind = default_metric_kind; @@ -389,25 +389,25 @@ MergeTreeIndexPtr usearchIndexCreator(const IndexDescription & index) if (index.arguments.size() > 1) scalar_kind = quantizationToScalarKind.at(index.arguments[1].safeGet()); - return std::make_shared(index, metric_kind, scalar_kind); + return std::make_shared(index, metric_kind, scalar_kind); } -void usearchIndexValidator(const IndexDescription & index, bool /* attach */) +void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* attach */) { - /// Check number and type of USearch index arguments: + /// Check number and type of index arguments: if (index.arguments.size() > 2) - throw Exception(ErrorCodes::INCORRECT_QUERY, "USearch index must not have more than one parameters"); + throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index must not have more than one parameters"); if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "First argument of USearch index (distance function) must be of type String"); + throw Exception(ErrorCodes::INCORRECT_QUERY, "First argument of vector similarity index (distance function) must be of type String"); if (index.arguments.size() > 1 && 
index.arguments[1].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Second argument of USearch index (scalar type) must be of type String"); + throw Exception(ErrorCodes::INCORRECT_QUERY, "Second argument of vector similarity index (scalar type) must be of type String"); /// Check that the index is created on a single column if (index.column_names.size() != 1 || index.data_types.size() != 1) - throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "USearch indexes must be created on a single column"); + throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Vector similarity indexes must be created on a single column"); /// Check that a supported metric was passed as first argument @@ -420,16 +420,15 @@ void usearchIndexValidator(const IndexDescription & index, bool /* attach */) throw Exception(ErrorCodes::INCORRECT_DATA, "Unrecognized scalar kind (second argument) for vector index. Supported kinds are: {}", keysAsString(quantizationToScalarKind)); /// Check data type of indexed column: - DataTypePtr data_type = index.sample_block.getDataTypes()[0]; if (const auto * data_type_array = typeid_cast(data_type.get())) { TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId(); if (!WhichDataType(nested_type_index).isFloat32()) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "USearch can only be created on columns of type Array(Float32)"); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity index can only be created on columns of type Array(Float32)"); } else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "USearch can only be created on columns of type Array(Float32)"); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity index can only be created on columns of type Array(Float32)"); } } diff --git a/src/Storages/MergeTree/MergeTreeIndexUSearch.h b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h similarity index 84% rename from src/Storages/MergeTree/MergeTreeIndexUSearch.h rename to src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h index d4df6658a90..95ea3cd5240 100644 --- a/src/Storages/MergeTree/MergeTreeIndexUSearch.h +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h @@ -48,22 +48,22 @@ public: using USearchIndexWithSerializationPtr = std::shared_ptr; -struct MergeTreeIndexGranuleUSearch final : public IMergeTreeIndexGranule +struct MergeTreeIndexGranuleVectorSimilarity final : public IMergeTreeIndexGranule { - MergeTreeIndexGranuleUSearch( + MergeTreeIndexGranuleVectorSimilarity( const String & index_name_, const Block & index_sample_block_, unum::usearch::metric_kind_t metric_kind_, unum::usearch::scalar_kind_t scalar_kind_); - MergeTreeIndexGranuleUSearch( + MergeTreeIndexGranuleVectorSimilarity( const String & index_name_, const Block & index_sample_block_, unum::usearch::metric_kind_t metric_kind_, unum::usearch::scalar_kind_t scalar_kind_, USearchIndexWithSerializationPtr index_); - ~MergeTreeIndexGranuleUSearch() override = default; + ~MergeTreeIndexGranuleVectorSimilarity() override = default; void serializeBinary(WriteBuffer & ostr) const override; void deserializeBinary(ReadBuffer & istr, MergeTreeIndexVersion version) override; @@ -76,7 +76,7 @@ struct MergeTreeIndexGranuleUSearch final : public IMergeTreeIndexGranule const unum::usearch::scalar_kind_t scalar_kind; USearchIndexWithSerializationPtr index; - LoggerPtr logger = getLogger("USearchIndex"); + LoggerPtr logger = getLogger("VectorSimilarityIndex"); private: /// The version of the persistence format of USearch index. 
Increment whenever you change the format. @@ -87,15 +87,15 @@ private: }; -struct MergeTreeIndexAggregatorUSearch final : IMergeTreeIndexAggregator +struct MergeTreeIndexAggregatorVectorSimilarity final : IMergeTreeIndexAggregator { - MergeTreeIndexAggregatorUSearch( + MergeTreeIndexAggregatorVectorSimilarity( const String & index_name_, const Block & index_sample_block, unum::usearch::metric_kind_t metric_kind_, unum::usearch::scalar_kind_t scalar_kind_); - ~MergeTreeIndexAggregatorUSearch() override = default; + ~MergeTreeIndexAggregatorVectorSimilarity() override = default; bool empty() const override { return !index || index->size() == 0; } MergeTreeIndexGranulePtr getGranuleAndReset() override; @@ -109,16 +109,16 @@ struct MergeTreeIndexAggregatorUSearch final : IMergeTreeIndexAggregator }; -class MergeTreeIndexConditionUSearch final : public IMergeTreeIndexCondition +class MergeTreeIndexConditionVectorSimilarity final : public IMergeTreeIndexCondition { public: - MergeTreeIndexConditionUSearch( + MergeTreeIndexConditionVectorSimilarity( const IndexDescription & index_description, const SelectQueryInfo & query, unum::usearch::metric_kind_t metric_kind_, ContextPtr context); - ~MergeTreeIndexConditionUSearch() override = default; + ~MergeTreeIndexConditionVectorSimilarity() override = default; bool alwaysUnknownOrTrue() const override; bool mayBeTrueOnGranule(MergeTreeIndexGranulePtr granule) const override; @@ -130,15 +130,15 @@ private: }; -class MergeTreeIndexUSearch : public IMergeTreeIndex +class MergeTreeIndexVectorSimilarity : public IMergeTreeIndex { public: - MergeTreeIndexUSearch( + MergeTreeIndexVectorSimilarity( const IndexDescription & index_, unum::usearch::metric_kind_t metric_kind_, unum::usearch::scalar_kind_t scalar_kind_); - ~MergeTreeIndexUSearch() override = default; + ~MergeTreeIndexVectorSimilarity() override = default; MergeTreeIndexGranulePtr createIndexGranule() const override; MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings & settings) const override; diff --git a/src/Storages/MergeTree/MergeTreeIndices.cpp b/src/Storages/MergeTree/MergeTreeIndices.cpp index f07449f762c..89aed7873a4 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.cpp +++ b/src/Storages/MergeTree/MergeTreeIndices.cpp @@ -129,8 +129,8 @@ MergeTreeIndexFactory::MergeTreeIndexFactory() registerValidator("hypothesis", hypothesisIndexValidator); #if USE_USEARCH - registerCreator("usearch", usearchIndexCreator); - registerValidator("usearch", usearchIndexValidator); + registerCreator("vector_similarity", vectorSimilarityIndexCreator); + registerValidator("vector_similarity", vectorSimilarityIndexValidator); #endif registerCreator("inverted", fullTextIndexCreator); diff --git a/src/Storages/MergeTree/MergeTreeIndices.h b/src/Storages/MergeTree/MergeTreeIndices.h index 3dee79aae85..48ef2a4739e 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.h +++ b/src/Storages/MergeTree/MergeTreeIndices.h @@ -239,8 +239,8 @@ MergeTreeIndexPtr hypothesisIndexCreator(const IndexDescription & index); void hypothesisIndexValidator(const IndexDescription & index, bool attach); #if USE_USEARCH -MergeTreeIndexPtr usearchIndexCreator(const IndexDescription & index); -void usearchIndexValidator(const IndexDescription & index, bool attach); +MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index); +void vectorSimilarityIndexValidator(const IndexDescription & index, bool attach); #endif MergeTreeIndexPtr fullTextIndexCreator(const IndexDescription & index); diff 
--git a/tests/queries/0_stateless/02354_vector_search_bugs.sql b/tests/queries/0_stateless/02354_vector_search_bugs.sql index de36683ede1..2ef75d0a7fe 100644 --- a/tests/queries/0_stateless/02354_vector_search_bugs.sql +++ b/tests/queries/0_stateless/02354_vector_search_bugs.sql @@ -2,21 +2,21 @@ -- Tests various bugs and special cases for vector indexes. -SET allow_experimental_usearch_index = 1; +SET allow_experimental_vector_similarity_index = 1; SET enable_analyzer = 1; -- 0 vs. 1 produce slightly different error codes, make it future-proof DROP TABLE IF EXISTS tab; SELECT 'Issue #52258: Empty Arrays or Arrays with default values are rejected'; -CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree() ORDER BY id; +CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree() ORDER BY id; INSERT INTO tab VALUES (1, []); -- { serverError INCORRECT_DATA } INSERT INTO tab (id) VALUES (1); -- { serverError INCORRECT_DATA } DROP TABLE tab; SELECT 'It is possible to create parts with different Array vector sizes but there will be an error at query time'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; SYSTEM STOP MERGES tab; INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2]); INSERT INTO tab values (2, [2.2, 2.3, 2.4]) (3, [3.1, 3.2, 3.3]); @@ -31,7 +31,7 @@ DROP TABLE tab; SELECT 'Correctness of index with > 1 mark'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity = 8192; -- disable adaptive granularity due to bug +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity = 8192; -- disable adaptive granularity due to bug INSERT INTO tab SELECT number, [toFloat32(number), 0.0] from numbers(10000); WITH [1.0, 0.0] AS reference_vec diff --git a/tests/queries/0_stateless/02354_vector_search_default_granularity.sql b/tests/queries/0_stateless/02354_vector_search_default_granularity.sql index ff659b56033..a19a0d17536 100644 --- a/tests/queries/0_stateless/02354_vector_search_default_granularity.sql +++ b/tests/queries/0_stateless/02354_vector_search_default_granularity.sql @@ -2,17 +2,17 @@ -- Tests that vector search indexes use a (non-standard) index granularity of 100 mio by default. 
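-- Illustrative sketch (not part of this test): the non-standard default GRANULARITY of 100 million
-- is large enough that a data part typically ends up with a single index granule. The granularity
-- can also be set explicitly per index; the table name 'tab_granularity_demo' and the value 1000
-- below are hypothetical, the index syntax mirrors the statements in this test:
SET allow_experimental_vector_similarity_index = 1;
CREATE TABLE tab_granularity_demo (id Int32, vec Array(Float32), INDEX idx(vec) TYPE vector_similarity GRANULARITY 1000) ENGINE = MergeTree ORDER BY id;
SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab_granularity_demo' AND name = 'idx';
DROP TABLE tab_granularity_demo;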
-SET allow_experimental_usearch_index = 1; +SET allow_experimental_vector_similarity_index = 1; -- After CREATE TABLE DROP TABLE IF EXISTS tab; -CREATE TABLE tab (id Int32, vec Array(Float32), INDEX idx(vec) TYPE usearch) ENGINE = MergeTree ORDER BY id; +CREATE TABLE tab (id Int32, vec Array(Float32), INDEX idx(vec) TYPE vector_similarity) ENGINE = MergeTree ORDER BY id; SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx'; -- After ALTER TABLE DROP TABLE tab; CREATE TABLE tab (id Int32, vec Array(Float32)) ENGINE = MergeTree ORDER BY id; -ALTER TABLE tab ADD INDEX idx(vec) TYPE usearch; +ALTER TABLE tab ADD INDEX idx(vec) TYPE vector_similarity; SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx'; DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_detach_attach.sql b/tests/queries/0_stateless/02354_vector_search_detach_attach.sql index 92e8efd918b..36241dfabf7 100644 --- a/tests/queries/0_stateless/02354_vector_search_detach_attach.sql +++ b/tests/queries/0_stateless/02354_vector_search_detach_attach.sql @@ -2,10 +2,10 @@ -- Tests that vector similarity indexes can be detached/attached. -SET allow_experimental_usearch_index = 1; +SET allow_experimental_vector_similarity_index = 1; DROP TABLE IF EXISTS tab; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); DETACH TABLE tab SYNC; diff --git a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql index 60bd54d1dbe..912f7d7fcae 100644 --- a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql +++ b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql @@ -2,36 +2,36 @@ -- Tests that various conditions are checked during creation of vector search indexes. 
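-- Illustrative sketch (not part of this test): for contrast with the negative cases below, a
-- definition that passes the checks exercised here - a single indexed column of type Array(Float32)
-- and String index arguments. The table name 'tab_valid_demo' is hypothetical; the argument list
-- mirrors the syntax accepted at this point of the patch series:
SET allow_experimental_vector_similarity_index = 1;
CREATE TABLE tab_valid_demo (id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('L2Distance', 'f16')) ENGINE = MergeTree ORDER BY id;
DROP TABLE tab_valid_demo;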
-SET allow_experimental_usearch_index = 1; +SET allow_experimental_vector_similarity_index = 1; DROP TABLE IF EXISTS tab; SELECT 'At most two index arguments'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch('too', 'many', 'args')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('too', 'many', 'args')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } SELECT '1st argument (distance function) must be String'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } SELECT 'Unsupported distance functions are rejected'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch('invalidDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('invalidDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } SELECT '2nd argument (scalar kind) must be String'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } SELECT 'Unsupported scalar kinds are rejected'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch('L2Distance', 'invalidKind')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('L2Distance', 'invalidKind')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } SELECT 'Must be created on single column'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } SELECT 'Must be created on Array(Float32) columns'; SET allow_suspicious_low_cardinality_types = 1; -CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, vec LowCardinality(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, vec Nullable(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vec LowCardinality(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, 
vec Nullable(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } SELECT 'Rejects INSERTs of Arrays with different sizes'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2, 3.3]); -- { serverError INCORRECT_DATA } DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_queries.reference b/tests/queries/0_stateless/02354_vector_search_queries.reference index 22ad46f802c..7c8e4c0ca59 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.reference +++ b/tests/queries/0_stateless/02354_vector_search_queries.reference @@ -1,9 +1,9 @@ 10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule, 1 indexed block -- Usearch: ORDER-BY-type +- ORDER-BY-type 5 [0,2] 0 6 [0,2.1] 0.09999990463256836 7 [0,2.2] 0.20000004768371582 -- Usearch: ORDER-BY-type, EXPLAIN +- ORDER-BY-type, EXPLAIN Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) @@ -16,15 +16,15 @@ Expression (Projection) Granules: 1/1 Skip Name: idx - Description: usearch GRANULARITY 100000000 + Description: vector_similarity GRANULARITY 100000000 Parts: 1/1 Granules: 1/1 12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block -- Usearch: ORDER-BY-type +- ORDER-BY-type 6 [0,2] 0 7 [0,2.1] 0.09999990463256836 8 [0,2.2] 0.20000004768371582 -- Usearch: ORDER-BY-type, EXPLAIN +- ORDER-BY-type, EXPLAIN Expression (Projection) Limit (preliminary LIMIT (without OFFSET)) Sorting (Sorting for ORDER BY) @@ -37,11 +37,11 @@ Expression (Projection) Granules: 4/4 Skip Name: idx - Description: usearch GRANULARITY 2 + Description: vector_similarity GRANULARITY 2 Parts: 1/1 Granules: 2/4 Special cases -- Usearch: ORDER-BY-type +- ORDER-BY-type 6 [1,9.3] 0.005731362878640178 1 [2,3.2] 0.15200169244542905 7 [5.5,4.7] 0.3503476876550442 diff --git a/tests/queries/0_stateless/02354_vector_search_queries.sql b/tests/queries/0_stateless/02354_vector_search_queries.sql index 555f47b364f..50537ad6244 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.sql +++ b/tests/queries/0_stateless/02354_vector_search_queries.sql @@ -2,7 +2,7 @@ -- Tests various simple approximate nearest neighborhood (ANN) queries that utilize vector search indexes. 
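-- Illustrative sketch (not part of this test): the query shape a vector similarity index can
-- accelerate is a top-k nearest-neighbor search - ORDER BY a supported distance function between
-- the indexed column and a constant reference vector, followed by LIMIT. The table name
-- 'tab_ann_demo' is hypothetical; the pattern mirrors the queries below:
SET allow_experimental_vector_similarity_index = 1;
CREATE TABLE tab_ann_demo (id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id;
INSERT INTO tab_ann_demo VALUES (0, [1.0, 0.0]), (1, [0.0, 1.0]), (2, [0.5, 0.5]);
WITH [0.0, 2.0] AS reference_vec
SELECT id, vec, L2Distance(vec, reference_vec)
FROM tab_ann_demo
ORDER BY L2Distance(vec, reference_vec)
LIMIT 3;
DROP TABLE tab_ann_demo;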
-SET allow_experimental_usearch_index = 1; +SET allow_experimental_vector_similarity_index = 1; SET enable_analyzer = 0; @@ -10,18 +10,18 @@ SELECT '10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule DROP TABLE IF EXISTS tab; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); -SELECT '- Usearch: ORDER-BY-type'; +SELECT '- ORDER-BY-type'; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) FROM tab ORDER BY L2Distance(vec, reference_vec) LIMIT 3; -SELECT '- Usearch: ORDER-BY-type, EXPLAIN'; +SELECT '- ORDER-BY-type, EXPLAIN'; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) @@ -34,17 +34,17 @@ DROP TABLE tab; SELECT '12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch() GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity() GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]); -SELECT '- Usearch: ORDER-BY-type'; +SELECT '- ORDER-BY-type'; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) FROM tab ORDER BY L2Distance(vec, reference_vec) LIMIT 3; -SELECT '- Usearch: ORDER-BY-type, EXPLAIN'; +SELECT '- ORDER-BY-type, EXPLAIN'; EXPLAIN indexes = 1 WITH [0.0, 2.0] AS reference_vec SELECT id, vec, L2Distance(vec, reference_vec) @@ -58,10 +58,10 @@ DROP TABLE tab; SELECT 'Special cases'; -- Not a systematic test, just to check that no bad things happen. -- Just for jun, use metric = 'cosineDistance', scalarKind = 'f64' -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE usearch('cosineDistance', 'f64') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cosineDistance', 'f64') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; INSERT INTO tab VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); -SELECT '- Usearch: ORDER-BY-type'; +SELECT '- ORDER-BY-type'; WITH [0.0, 2.0] AS reference_vec SELECT id, vec, cosineDistance(vec, reference_vec) FROM tab From cc5c64e1ede7284d91ada1f28edbb18a457f5894 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 19:48:51 +0000 Subject: [PATCH 131/142] Add migration helper for legacy 'annoy' and 'usearch' indexes types Index types 'annoy' and 'usearch' were removed and replaced by 'vector_similarity' indexes in an earlier commit. 
Unfortunately, this means that if customers have tables with these indexes and upgrade, their database might not start anymore - the system loads the metadata at startup, thinks something is wrong with such tables, and halts immediately. This commit adds support for loading and attaching such indexes back. Data inserts or index usage (search) return an error which recommends migrating to 'vector_similarity' indexes. The implementation is generally similar to what has recently been implemented for 'full_text' indexes [1, 2]. [1] https://github.com/ClickHouse/ClickHouse/pull/64656 [2] https://github.com/ClickHouse/ClickHouse/pull/64846
--- .../QueryPlan/ReadFromMergeTree.cpp | 3 ++ .../MergeTreeIndexLegacyVectorSimilarity.cpp | 45 +++++++++++++++++++ .../MergeTreeIndexLegacyVectorSimilarity.h | 26 +++++++++++ src/Storages/MergeTree/MergeTreeIndices.cpp | 10 +++++ src/Storages/MergeTree/MergeTreeIndices.h | 3 ++ ...earch_legacy_index_compatibility.reference | 2 + ...ctor_search_legacy_index_compatibility.sql | 43 ++++++++++++++++++ 7 files changed, 132 insertions(+) create mode 100644 src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.cpp create mode 100644 src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h create mode 100644 tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.reference create mode 100644 tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.sql
diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 1f30725b4d0..348019d7d10 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -25,6 +25,7 @@ #include #include #include +#include <Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h> #include #include #include @@ -1481,6 +1482,8 @@ static void buildIndexes( if (const auto * vector_similarity_index = typeid_cast<const MergeTreeIndexVectorSimilarity *>(index_helper.get())) condition = vector_similarity_index->createIndexCondition(query_info, context); #endif + if (const auto * legacy_vector_similarity_index = typeid_cast<const MergeTreeIndexLegacyVectorSimilarity *>(index_helper.get())) + condition = legacy_vector_similarity_index->createIndexCondition(query_info, context); if (!condition) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown vector search index {}", index_helper->index.name); }
diff --git a/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.cpp new file mode 100644 index 00000000000..29de109d4fc --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.cpp @@ -0,0 +1,45 @@ +#include <Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h> + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_INDEX; +} + +MergeTreeIndexLegacyVectorSimilarity::MergeTreeIndexLegacyVectorSimilarity(const IndexDescription & index_) + : IMergeTreeIndex(index_) +{ +} + +MergeTreeIndexGranulePtr MergeTreeIndexLegacyVectorSimilarity::createIndexGranule() const +{ + throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported. Please drop and recreate the index as type 'vector_similarity'"); +} + +MergeTreeIndexAggregatorPtr MergeTreeIndexLegacyVectorSimilarity::createIndexAggregator(const MergeTreeWriterSettings &) const +{ + throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported.
Please drop and recreate the index as type 'vector_similarity'"); +} + +MergeTreeIndexConditionPtr MergeTreeIndexLegacyVectorSimilarity::createIndexCondition(const SelectQueryInfo &, ContextPtr) const +{ + throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported. Please drop and recreate the index as type 'vector_similarity'"); +}; + +MergeTreeIndexConditionPtr MergeTreeIndexLegacyVectorSimilarity::createIndexCondition(const ActionsDAG *, ContextPtr) const +{ + throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'annoy' or 'usearch' are no longer supported. Please drop and recreate the index as type 'vector_similarity'"); +} + +MergeTreeIndexPtr legacyVectorSimilarityIndexCreator(const IndexDescription & index) +{ + return std::make_shared(index); +} + +void legacyVectorSimilarityIndexValidator(const IndexDescription &, bool) +{ +} + +} diff --git a/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h new file mode 100644 index 00000000000..1015401823d --- /dev/null +++ b/src/Storages/MergeTree/MergeTreeIndexLegacyVectorSimilarity.h @@ -0,0 +1,26 @@ +#pragma once + +#include + +/// Walking corpse implementation for removed skipping index of type "annoy" and "usearch". +/// Its only purpose is to allow loading old tables with indexes of these types. +/// Data insertion and index usage/search will throw an exception, suggesting to migrate to "vector_similarity" indexes. + +namespace DB +{ + +class MergeTreeIndexLegacyVectorSimilarity : public IMergeTreeIndex +{ +public: + explicit MergeTreeIndexLegacyVectorSimilarity(const IndexDescription & index_); + ~MergeTreeIndexLegacyVectorSimilarity() override = default; + + MergeTreeIndexGranulePtr createIndexGranule() const override; + MergeTreeIndexAggregatorPtr createIndexAggregator(const MergeTreeWriterSettings &) const override; + MergeTreeIndexConditionPtr createIndexCondition(const SelectQueryInfo &, ContextPtr) const; + MergeTreeIndexConditionPtr createIndexCondition(const ActionsDAG *, ContextPtr) const override; + + bool isVectorSimilarityIndex() const override { return true; } +}; + +} diff --git a/src/Storages/MergeTree/MergeTreeIndices.cpp b/src/Storages/MergeTree/MergeTreeIndices.cpp index 89aed7873a4..d2fc0e84b56 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.cpp +++ b/src/Storages/MergeTree/MergeTreeIndices.cpp @@ -132,6 +132,16 @@ MergeTreeIndexFactory::MergeTreeIndexFactory() registerCreator("vector_similarity", vectorSimilarityIndexCreator); registerValidator("vector_similarity", vectorSimilarityIndexValidator); #endif + /// ------ + /// TODO: remove this block at the end of 2024. + /// Index types 'annoy' and 'usearch' are no longer supported as of June 2024. Their successor is index type 'vector_similarity'. + /// To support loading tables with old indexes during a transition period, register dummy indexes which allow load/attaching but + /// throw an exception when the user attempts to use them. 
+ registerCreator("annoy", legacyVectorSimilarityIndexCreator); + registerValidator("annoy", legacyVectorSimilarityIndexValidator); + registerCreator("usearch", legacyVectorSimilarityIndexCreator); + registerValidator("usearch", legacyVectorSimilarityIndexValidator); + /// ------ registerCreator("inverted", fullTextIndexCreator); registerValidator("inverted", fullTextIndexValidator); diff --git a/src/Storages/MergeTree/MergeTreeIndices.h b/src/Storages/MergeTree/MergeTreeIndices.h index 48ef2a4739e..c52d7ffe131 100644 --- a/src/Storages/MergeTree/MergeTreeIndices.h +++ b/src/Storages/MergeTree/MergeTreeIndices.h @@ -243,6 +243,9 @@ MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index); void vectorSimilarityIndexValidator(const IndexDescription & index, bool attach); #endif +MergeTreeIndexPtr legacyVectorSimilarityIndexCreator(const IndexDescription & index); +void legacyVectorSimilarityIndexValidator(const IndexDescription & index, bool attach); + MergeTreeIndexPtr fullTextIndexCreator(const IndexDescription & index); void fullTextIndexValidator(const IndexDescription & index, bool attach); diff --git a/tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.reference b/tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.reference new file mode 100644 index 00000000000..030bfa9b1bd --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.reference @@ -0,0 +1,2 @@ +Annoy +Usearch diff --git a/tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.sql b/tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.sql new file mode 100644 index 00000000000..0889aa74f7a --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_legacy_index_compatibility.sql @@ -0,0 +1,43 @@ +-- Indexes of type 'annoy' or 'usearch' are no longer supported. +-- Test what happens when ClickHouse encounters tables with the old index type. + +DROP TABLE IF EXISTS tab; + +SELECT 'Annoy'; + +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX vec_idx vec TYPE annoy()) ENGINE = MergeTree ORDER BY id; + +INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); -- { serverError ILLEGAL_INDEX } + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, L2Distance(vec, reference_vec) +FROM tab +ORDER BY L2Distance(vec, reference_vec) +LIMIT 3; +-- (*) The search succeeds because the index contains no data (i.e. some shortcut) +-- If it had data (can't really test in SQL tests ...), this statement would also return an error, trust me. + +-- Detach and attach should work. +DETACH TABLE tab; +ATTACH TABLE tab; + +DROP TABLE tab; + +SELECT 'Usearch'; + +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX vec_idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id; + +INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); -- { serverError ILLEGAL_INDEX } + +WITH [0.0, 2.0] AS reference_vec +SELECT id, vec, L2Distance(vec, reference_vec) +FROM tab +ORDER BY L2Distance(vec, reference_vec) +LIMIT 3; +-- see above: (*) + +-- Detach and attach should work. 
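-- Illustrative sketch (not part of this test): the migration recommended by the ILLEGAL_INDEX error
-- message boils down to dropping the legacy index and recreating it as a 'vector_similarity' index,
-- then materializing it. Table and index names are hypothetical; the vector_similarity argument
-- list shown is just one possible choice:
SET allow_experimental_vector_similarity_index = 1;
CREATE TABLE tab_migration_demo (id Int32, vec Array(Float32), INDEX legacy_idx vec TYPE usearch()) ENGINE = MergeTree ORDER BY id;
ALTER TABLE tab_migration_demo DROP INDEX legacy_idx;
ALTER TABLE tab_migration_demo ADD INDEX new_idx vec TYPE vector_similarity('L2Distance');
ALTER TABLE tab_migration_demo MATERIALIZE INDEX new_idx;
DROP TABLE tab_migration_demo;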
+DETACH TABLE tab; +ATTACH TABLE tab; + +DROP TABLE tab;
From d2e79f0b92936eb3ec3f6409fe6db18a3091919d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 9 Aug 2024 15:28:38 +0000 Subject: [PATCH 132/142] Rework vector index parameters USearch (similar to FAISS) allows specifying the distance function, quantization, and various HNSW meta-parameters for index creation and search. Some users wished for greater configurability, so let's expose them. Index creation now requires either - 2 parameters (with the other 4 parameters taking on default values), or - 6 parameters for full control. This commit also removes quantization `f64` (that would be upsampling).
--- .../mergetree-family/annindexes.md | 12 +- .../MergeTreeIndexVectorSimilarity.cpp | 166 ++++++++----- .../MergeTreeIndexVectorSimilarity.h | 23 ++- .../0_stateless/02354_vector_search_bugs.sql | 6 +- ...2354_vector_search_default_granularity.sql | 4 +- .../02354_vector_search_detach_attach.sql | 2 +- ...r_search_index_creation_negative.reference | 12 +- ..._vector_search_index_creation_negative.sql | 48 +++-- ...4_vector_search_multiple_indexes.reference | 0 .../02354_vector_search_multiple_indexes.sql | 14 ++ .../02354_vector_search_queries.sql | 8 +- 11 files changed, 203 insertions(+), 92 deletions(-) create mode 100644 tests/queries/0_stateless/02354_vector_search_multiple_indexes.reference create mode 100644 tests/queries/0_stateless/02354_vector_search_multiple_indexes.sql
diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 63c061a0d46..354fac6ea74 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -43,12 +43,22 @@ CREATE TABLE table ( id Int64, vectors Array(Float32), - INDEX [index_name vectors TYPE vector_similarity([Distance[, ScalarKind]]) [GRANULARITY [N]] + INDEX index_name vectors TYPE vector_similarity(method, distance_function[, quantization, m, ef_construction, ef_search]) [GRANULARITY N] ) ENGINE = MergeTree ORDER BY id; ``` +Parameters: +- `method`: currently only `hnsw` is supported. +- `distance_function`: either `L2Distance` (the [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance) - the length of a + line between two points in Euclidean space), or `cosineDistance` (the [cosine + distance](https://en.wikipedia.org/wiki/Cosine_similarity#Cosine_distance) - the angle between two non-zero vectors). +- `quantization`: either `f32`, `f16`, or `i8` for storing the vector with reduced precision (optional, default: `f32`) +- `m`: the number of neighbors per graph node (optional, default: 16) +- `ef_construction`: the size of the candidate list used when building the HNSW graph (optional, default: 128) +- `ef_search`: the size of the candidate list used when searching the HNSW graph (optional, default: 64) + Vector similarity indexes are based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW algorithm](https://arxiv.org/abs/1603.09320), i.e., a hierarchical graph where each point represents a vector and the edges represent similarity. Such hierarchical structures can be very efficient on large collections.
They may often fetch 0.05% or less data from the diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp index 6f3b1b043cd..5b0793fa0c8 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.cpp @@ -45,6 +45,9 @@ namespace ErrorCodes namespace { +/// The only indexing method currently supported by USearch +std::set methods = {"hnsw"}; + /// Maps from user-facing name to internal name std::unordered_map distanceFunctionToMetricKind = { {"L2Distance", unum::usearch::metric_kind_t::l2sq_k}, @@ -52,22 +55,37 @@ std::unordered_map distanceFunctionToMetri /// Maps from user-facing name to internal name std::unordered_map quantizationToScalarKind = { - {"f64", unum::usearch::scalar_kind_t::f64_k}, {"f32", unum::usearch::scalar_kind_t::f32_k}, {"f16", unum::usearch::scalar_kind_t::f16_k}, {"i8", unum::usearch::scalar_kind_t::i8_k}}; +template +concept is_set = std::same_as>; + +template +concept is_unordered_map = std::same_as>; + template -String keysAsString(const T & t) +String joinByComma(const T & t) { - String result; - for (const auto & [k, _] : t) + if constexpr (is_set) { - if (!result.empty()) - result += ", "; - result += k; + return fmt::format("{}", fmt::join(t, ", ")); } - return result; + else if constexpr (is_unordered_map) + { + String joined_keys; + for (const auto & [k, _] : t) + { + if (!joined_keys.empty()) + joined_keys += ", "; + joined_keys += k; + } + return joined_keys; + } + /// TODO once our libcxx is recent enough, replace above by + /// return fmt::format("{}", fmt::join(std::views::keys(t)), ", ")); + std::unreachable(); } } @@ -75,8 +93,10 @@ String keysAsString(const T & t) USearchIndexWithSerialization::USearchIndexWithSerialization( size_t dimensions, unum::usearch::metric_kind_t metric_kind, - unum::usearch::scalar_kind_t scalar_kind) - : Base(Base::make(unum::usearch::metric_punned_t(dimensions, metric_kind, scalar_kind))) + unum::usearch::scalar_kind_t scalar_kind, + UsearchHnswParams usearch_hnsw_params) + : Base(Base::make(unum::usearch::metric_punned_t(dimensions, metric_kind, scalar_kind), + unum::usearch::index_dense_config_t(usearch_hnsw_params.m, usearch_hnsw_params.ef_construction, usearch_hnsw_params.ef_search))) { } @@ -125,8 +145,9 @@ MergeTreeIndexGranuleVectorSimilarity::MergeTreeIndexGranuleVectorSimilarity( const String & index_name_, const Block & index_sample_block_, unum::usearch::metric_kind_t metric_kind_, - unum::usearch::scalar_kind_t scalar_kind_) - : MergeTreeIndexGranuleVectorSimilarity(index_name_, index_sample_block_, metric_kind_, scalar_kind_, nullptr) + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_) + : MergeTreeIndexGranuleVectorSimilarity(index_name_, index_sample_block_, metric_kind_, scalar_kind_, usearch_hnsw_params_, nullptr) { } @@ -135,11 +156,13 @@ MergeTreeIndexGranuleVectorSimilarity::MergeTreeIndexGranuleVectorSimilarity( const Block & index_sample_block_, unum::usearch::metric_kind_t metric_kind_, unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_, USearchIndexWithSerializationPtr index_) : index_name(index_name_) , index_sample_block(index_sample_block_) , metric_kind(metric_kind_) , scalar_kind(scalar_kind_) + , usearch_hnsw_params(usearch_hnsw_params_) , index(std::move(index_)) { } @@ -153,8 +176,8 @@ void MergeTreeIndexGranuleVectorSimilarity::serializeBinary(WriteBuffer & ostr) /// Number of 
dimensions is required in the index constructor, /// so it must be written and read separately from the other part - writeIntBinary(static_cast(index->dimensions()), ostr); // write dimension - // + writeIntBinary(static_cast(index->dimensions()), ostr); + index->serialize(ostr); auto statistics = index->getStatistics(); @@ -176,7 +199,7 @@ void MergeTreeIndexGranuleVectorSimilarity::deserializeBinary(ReadBuffer & istr, UInt64 dimension; readIntBinary(dimension, istr); - index = std::make_shared(dimension, metric_kind, scalar_kind); + index = std::make_shared(dimension, metric_kind, scalar_kind, usearch_hnsw_params); index->deserialize(istr); @@ -189,17 +212,19 @@ MergeTreeIndexAggregatorVectorSimilarity::MergeTreeIndexAggregatorVectorSimilari const String & index_name_, const Block & index_sample_block_, unum::usearch::metric_kind_t metric_kind_, - unum::usearch::scalar_kind_t scalar_kind_) + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_) : index_name(index_name_) , index_sample_block(index_sample_block_) , metric_kind(metric_kind_) , scalar_kind(scalar_kind_) + , usearch_hnsw_params(usearch_hnsw_params_) { } MergeTreeIndexGranulePtr MergeTreeIndexAggregatorVectorSimilarity::getGranuleAndReset() { - auto granule = std::make_shared(index_name, index_sample_block, metric_kind, scalar_kind, index); + auto granule = std::make_shared(index_name, index_sample_block, metric_kind, scalar_kind, usearch_hnsw_params, index); index = nullptr; return granule; } @@ -258,15 +283,15 @@ void MergeTreeIndexAggregatorVectorSimilarity::update(const Block & block, size_ throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column '{}' must have equal length", index_column_name); if (!index) - index = std::make_shared(dimensions, metric_kind, scalar_kind); + index = std::make_shared(dimensions, metric_kind, scalar_kind, usearch_hnsw_params); /// Reserving space is mandatory if (!index->reserve(roundUpToPowerOfTwoOrZero(index->size() + num_rows))) throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not reserve memory for vector similarity index"); - for (size_t current_row = 0; current_row < num_rows; ++current_row) + for (size_t row = 0; row < num_rows; ++row) { - auto rc = index->add(static_cast(index->size()), &column_array_data_float_data[column_array_offsets[current_row - 1]]); + auto rc = index->add(static_cast(index->size()), &column_array_data_float_data[column_array_offsets[row - 1]]); if (!rc) throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could not add data to vector similarity index, error: " + String(rc.error.release())); @@ -313,8 +338,6 @@ std::vector MergeTreeIndexConditionVectorSimilarity::getUsefulRanges(Mer const UInt64 limit = vector_similarity_condition.getLimit(); const UInt64 index_granularity = vector_similarity_condition.getIndexGranularity(); - const std::vector reference_vector = vector_similarity_condition.getReferenceVector(); - const auto granule = std::dynamic_pointer_cast(granule_); if (granule == nullptr) throw Exception(ErrorCodes::LOGICAL_ERROR, "Granule has the wrong type"); @@ -326,6 +349,8 @@ std::vector MergeTreeIndexConditionVectorSimilarity::getUsefulRanges(Mer "does not match the dimension in the index ({})", vector_similarity_condition.getDimensions(), index->dimensions()); + const std::vector reference_vector = vector_similarity_condition.getReferenceVector(); + auto result = index->search(reference_vector.data(), limit); if (result.error) throw Exception::createRuntime(ErrorCodes::INCORRECT_DATA, "Could 
not search in vector similarity index, error: " + String(result.error.release())); @@ -350,21 +375,26 @@ std::vector MergeTreeIndexConditionVectorSimilarity::getUsefulRanges(Mer return granules; } -MergeTreeIndexVectorSimilarity::MergeTreeIndexVectorSimilarity(const IndexDescription & index_, unum::usearch::metric_kind_t metric_kind_, unum::usearch::scalar_kind_t scalar_kind_) +MergeTreeIndexVectorSimilarity::MergeTreeIndexVectorSimilarity( + const IndexDescription & index_, + unum::usearch::metric_kind_t metric_kind_, + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_) : IMergeTreeIndex(index_) , metric_kind(metric_kind_) , scalar_kind(scalar_kind_) + , usearch_hnsw_params(usearch_hnsw_params_) { } MergeTreeIndexGranulePtr MergeTreeIndexVectorSimilarity::createIndexGranule() const { - return std::make_shared(index.name, index.sample_block, metric_kind, scalar_kind); + return std::make_shared(index.name, index.sample_block, metric_kind, scalar_kind, usearch_hnsw_params); } MergeTreeIndexAggregatorPtr MergeTreeIndexVectorSimilarity::createIndexAggregator(const MergeTreeWriterSettings & /*settings*/) const { - return std::make_shared(index.name, index.sample_block, metric_kind, scalar_kind); + return std::make_shared(index.name, index.sample_block, metric_kind, scalar_kind, usearch_hnsw_params); } MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition(const SelectQueryInfo & query, ContextPtr context) const @@ -379,56 +409,82 @@ MergeTreeIndexConditionPtr MergeTreeIndexVectorSimilarity::createIndexCondition( MergeTreeIndexPtr vectorSimilarityIndexCreator(const IndexDescription & index) { - static constexpr auto default_metric_kind = unum::usearch::metric_kind_t::l2sq_k; - auto metric_kind = default_metric_kind; - if (!index.arguments.empty()) - metric_kind = distanceFunctionToMetricKind.at(index.arguments[0].safeGet()); + const bool has_six_args = (index.arguments.size() == 6); - static constexpr auto default_scalar_kind = unum::usearch::scalar_kind_t::f16_k; - auto scalar_kind = default_scalar_kind; - if (index.arguments.size() > 1) - scalar_kind = quantizationToScalarKind.at(index.arguments[1].safeGet()); + unum::usearch::metric_kind_t metric_kind = distanceFunctionToMetricKind.at(index.arguments[1].safeGet()); - return std::make_shared(index, metric_kind, scalar_kind); + /// use defaults for the other parameters + unum::usearch::scalar_kind_t scalar_kind = unum::usearch::scalar_kind_t::f32_k; + UsearchHnswParams usearch_hnsw_params; + + if (has_six_args) + { + scalar_kind = quantizationToScalarKind.at(index.arguments[2].safeGet()); + usearch_hnsw_params = {.m = index.arguments[3].safeGet(), + .ef_construction = index.arguments[4].safeGet(), + .ef_search = index.arguments[5].safeGet()}; + } + + return std::make_shared(index, metric_kind, scalar_kind, usearch_hnsw_params); } void vectorSimilarityIndexValidator(const IndexDescription & index, bool /* attach */) { - /// Check number and type of index arguments: + const bool has_two_args = (index.arguments.size() == 2); + const bool has_six_args = (index.arguments.size() == 6); - if (index.arguments.size() > 2) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index must not have more than one parameters"); + /// Check number and type of arguments + if (!has_two_args && !has_six_args) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Vector similarity index must have two or six arguments"); + if (index.arguments[0].getType() != Field::Types::String) + throw 
Exception(ErrorCodes::INCORRECT_QUERY, "First argument of vector similarity index (method) must be of type String"); + if (index.arguments[1].getType() != Field::Types::String) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Second argument of vector similarity index (metric) must be of type String"); + if (has_six_args) + { + if (index.arguments[2].getType() != Field::Types::String) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Third argument of vector similarity index (quantization) must be of type String"); + if (index.arguments[3].getType() != Field::Types::UInt64) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Fourth argument of vector similarity index (M) must be of type UInt64"); + if (index.arguments[4].getType() != Field::Types::UInt64) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Fifth argument of vector similarity index (ef_construction) must be of type UInt64"); + if (index.arguments[5].getType() != Field::Types::UInt64) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Sixth argument of vector similarity index (ef_search) must be of type UInt64"); + } - if (!index.arguments.empty() && index.arguments[0].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "First argument of vector similarity index (distance function) must be of type String"); - if (index.arguments.size() > 1 && index.arguments[1].getType() != Field::Types::String) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Second argument of vector similarity index (scalar type) must be of type String"); + /// Check that passed arguments are supported + if (!methods.contains(index.arguments[0].safeGet())) + throw Exception(ErrorCodes::INCORRECT_DATA, "First argument (method) of vector similarity index is not supported. Supported methods are: {}", joinByComma(methods)); + if (!distanceFunctionToMetricKind.contains(index.arguments[1].safeGet())) + throw Exception(ErrorCodes::INCORRECT_DATA, "Second argument (distance function) of vector similarity index is not supported. Supported distance function are: {}", joinByComma(distanceFunctionToMetricKind)); + if (has_six_args) + { + if (!quantizationToScalarKind.contains(index.arguments[2].safeGet())) + throw Exception(ErrorCodes::INCORRECT_DATA, "Third argument (quantization) of vector similarity index is not supported. Supported quantizations are: {}", joinByComma(quantizationToScalarKind)); + if (index.arguments[3].safeGet() < 2) + throw Exception(ErrorCodes::INCORRECT_DATA, "Fourth argument (M) of vector similarity index must be > 1"); + if (index.arguments[4].safeGet() < 1) + throw Exception(ErrorCodes::INCORRECT_DATA, "Fifth argument (ef_construction) of vector similarity index must be > 0"); + if (index.arguments[5].safeGet() < 1) + throw Exception(ErrorCodes::INCORRECT_DATA, "Sixth argument (ef_search) of vector similarity index must be > 0"); + } /// Check that the index is created on a single column - if (index.column_names.size() != 1 || index.data_types.size() != 1) throw Exception(ErrorCodes::INCORRECT_NUMBER_OF_COLUMNS, "Vector similarity indexes must be created on a single column"); - /// Check that a supported metric was passed as first argument - - if (!index.arguments.empty() && !distanceFunctionToMetricKind.contains(index.arguments[0].safeGet())) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unrecognized metric kind (first argument) for vector index. 
Supported kinds are: {}", keysAsString(distanceFunctionToMetricKind)); - - /// Check that a supported kind was passed as a second argument - - if (index.arguments.size() > 1 && !quantizationToScalarKind.contains(index.arguments[1].safeGet())) - throw Exception(ErrorCodes::INCORRECT_DATA, "Unrecognized scalar kind (second argument) for vector index. Supported kinds are: {}", keysAsString(quantizationToScalarKind)); - - /// Check data type of indexed column: + /// Check data type of the indexed column: DataTypePtr data_type = index.sample_block.getDataTypes()[0]; if (const auto * data_type_array = typeid_cast(data_type.get())) { TypeIndex nested_type_index = data_type_array->getNestedType()->getTypeId(); if (!WhichDataType(nested_type_index).isFloat32()) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity index can only be created on columns of type Array(Float32)"); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)"); } else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity index can only be created on columns of type Array(Float32)"); + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Vector similarity indexes can only be created on columns of type Array(Float32)"); + } } } diff --git a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h index 95ea3cd5240..f7098c1626c 100644 --- a/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h +++ b/src/Storages/MergeTree/MergeTreeIndexVectorSimilarity.h @@ -14,6 +14,13 @@ namespace DB { +struct UsearchHnswParams +{ + size_t m = unum::usearch::default_connectivity(); + size_t ef_construction = unum::usearch::default_expansion_add(); + size_t ef_search = unum::usearch::default_expansion_search(); +}; + using USearchIndex = unum::usearch::index_dense_gt; class USearchIndexWithSerialization : public USearchIndex @@ -24,7 +31,8 @@ public: USearchIndexWithSerialization( size_t dimensions, unum::usearch::metric_kind_t metric_kind, - unum::usearch::scalar_kind_t scalar_kind); + unum::usearch::scalar_kind_t scalar_kind, + UsearchHnswParams usearch_hnsw_params); void serialize(WriteBuffer & ostr) const; void deserialize(ReadBuffer & istr); @@ -54,13 +62,15 @@ struct MergeTreeIndexGranuleVectorSimilarity final : public IMergeTreeIndexGranu const String & index_name_, const Block & index_sample_block_, unum::usearch::metric_kind_t metric_kind_, - unum::usearch::scalar_kind_t scalar_kind_); + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_); MergeTreeIndexGranuleVectorSimilarity( const String & index_name_, const Block & index_sample_block_, unum::usearch::metric_kind_t metric_kind_, unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_, USearchIndexWithSerializationPtr index_); ~MergeTreeIndexGranuleVectorSimilarity() override = default; @@ -74,6 +84,7 @@ struct MergeTreeIndexGranuleVectorSimilarity final : public IMergeTreeIndexGranu const Block index_sample_block; const unum::usearch::metric_kind_t metric_kind; const unum::usearch::scalar_kind_t scalar_kind; + const UsearchHnswParams usearch_hnsw_params; USearchIndexWithSerializationPtr index; LoggerPtr logger = getLogger("VectorSimilarityIndex"); @@ -93,7 +104,8 @@ struct MergeTreeIndexAggregatorVectorSimilarity final : IMergeTreeIndexAggregato const String & index_name_, const Block & index_sample_block, unum::usearch::metric_kind_t metric_kind_, - 
unum::usearch::scalar_kind_t scalar_kind_); + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_); ~MergeTreeIndexAggregatorVectorSimilarity() override = default; @@ -105,6 +117,7 @@ struct MergeTreeIndexAggregatorVectorSimilarity final : IMergeTreeIndexAggregato const Block index_sample_block; const unum::usearch::metric_kind_t metric_kind; const unum::usearch::scalar_kind_t scalar_kind; + const UsearchHnswParams usearch_hnsw_params; USearchIndexWithSerializationPtr index; }; @@ -136,7 +149,8 @@ public: MergeTreeIndexVectorSimilarity( const IndexDescription & index_, unum::usearch::metric_kind_t metric_kind_, - unum::usearch::scalar_kind_t scalar_kind_); + unum::usearch::scalar_kind_t scalar_kind_, + UsearchHnswParams usearch_hnsw_params_); ~MergeTreeIndexVectorSimilarity() override = default; @@ -149,6 +163,7 @@ public: private: const unum::usearch::metric_kind_t metric_kind; const unum::usearch::scalar_kind_t scalar_kind; + const UsearchHnswParams usearch_hnsw_params; }; } diff --git a/tests/queries/0_stateless/02354_vector_search_bugs.sql b/tests/queries/0_stateless/02354_vector_search_bugs.sql index 2ef75d0a7fe..7c66b4b8e45 100644 --- a/tests/queries/0_stateless/02354_vector_search_bugs.sql +++ b/tests/queries/0_stateless/02354_vector_search_bugs.sql @@ -9,14 +9,14 @@ DROP TABLE IF EXISTS tab; SELECT 'Issue #52258: Empty Arrays or Arrays with default values are rejected'; -CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree() ORDER BY id; +CREATE TABLE tab (id UInt64, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree() ORDER BY id; INSERT INTO tab VALUES (1, []); -- { serverError INCORRECT_DATA } INSERT INTO tab (id) VALUES (1); -- { serverError INCORRECT_DATA } DROP TABLE tab; SELECT 'It is possible to create parts with different Array vector sizes but there will be an error at query time'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; SYSTEM STOP MERGES tab; INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2]); INSERT INTO tab values (2, [2.2, 2.3, 2.4]) (3, [3.1, 3.2, 3.3]); @@ -31,7 +31,7 @@ DROP TABLE tab; SELECT 'Correctness of index with > 1 mark'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity = 8192; -- disable adaptive granularity due to bug +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes = 0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, index_granularity = 8192; -- disable adaptive granularity due to bug INSERT INTO tab SELECT number, [toFloat32(number), 0.0] from numbers(10000); WITH [1.0, 0.0] AS reference_vec diff --git a/tests/queries/0_stateless/02354_vector_search_default_granularity.sql b/tests/queries/0_stateless/02354_vector_search_default_granularity.sql index a19a0d17536..acb69cb6ff8 100644 --- a/tests/queries/0_stateless/02354_vector_search_default_granularity.sql +++ b/tests/queries/0_stateless/02354_vector_search_default_granularity.sql @@ -6,13 +6,13 @@ SET 
allow_experimental_vector_similarity_index = 1; -- After CREATE TABLE DROP TABLE IF EXISTS tab; -CREATE TABLE tab (id Int32, vec Array(Float32), INDEX idx(vec) TYPE vector_similarity) ENGINE = MergeTree ORDER BY id; +CREATE TABLE tab (id Int32, vec Array(Float32), INDEX idx(vec) TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx'; -- After ALTER TABLE DROP TABLE tab; CREATE TABLE tab (id Int32, vec Array(Float32)) ENGINE = MergeTree ORDER BY id; -ALTER TABLE tab ADD INDEX idx(vec) TYPE vector_similarity; +ALTER TABLE tab ADD INDEX idx(vec) TYPE vector_similarity('hnsw', 'L2Distance'); SELECT granularity FROM system.data_skipping_indices WHERE database = currentDatabase() AND table = 'tab' AND name = 'idx'; DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_detach_attach.sql b/tests/queries/0_stateless/02354_vector_search_detach_attach.sql index 36241dfabf7..f92eaddbbed 100644 --- a/tests/queries/0_stateless/02354_vector_search_detach_attach.sql +++ b/tests/queries/0_stateless/02354_vector_search_detach_attach.sql @@ -5,7 +5,7 @@ SET allow_experimental_vector_similarity_index = 1; DROP TABLE IF EXISTS tab; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); DETACH TABLE tab SYNC; diff --git a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference index bee3236f436..b6d034208d0 100644 --- a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference +++ b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.reference @@ -1,8 +1,10 @@ -At most two index arguments -1st argument (distance function) must be String -Unsupported distance functions are rejected -2nd argument (scalar kind) must be String -Unsupported scalar kinds are rejected +Two or six index arguments +1st argument (method) must be String and hnsw +2nd argument (distance function) must be String and L2Distance or cosineDistance +3nd argument (quantization), if given, must be String and f32, f16, ... 
+4nd argument (M), if given, must be UInt64 and > 1 +5nd argument (ef_construction), if given, must be UInt64 and > 0 +6nd argument (ef_search), if given, must be UInt64 and > 0 Must be created on single column Must be created on Array(Float32) columns Rejects INSERTs of Arrays with different sizes diff --git a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql index 912f7d7fcae..7c2ddfe81fc 100644 --- a/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql +++ b/tests/queries/0_stateless/02354_vector_search_index_creation_negative.sql @@ -6,32 +6,46 @@ SET allow_experimental_vector_similarity_index = 1; DROP TABLE IF EXISTS tab; -SELECT 'At most two index arguments'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('too', 'many', 'args')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +SELECT 'Two or six index arguments'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant_have_one_arg')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant', 'have', 'three_args')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cant', 'have', 'more', 'than', 'six', 'args', '!')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } -SELECT '1st argument (distance function) must be String'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +SELECT '1st argument (method) must be String and hnsw'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity(3, 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('not_hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } -SELECT 'Unsupported distance functions are rejected'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('invalidDistance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +SELECT '2nd argument (distance function) must be String and L2Distance or cosineDistance'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'invalid_distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } -SELECT '2nd argument (scalar kind) must be String'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity(3)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } - -SELECT 'Unsupported scalar kinds are rejected'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('L2Distance', 'invalidKind')) ENGINE = 
MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +SELECT '3rd argument (quantization), if given, must be String and f32, f16, ...'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 1, 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'invalid', 2, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +SELECT '4th argument (M), if given, must be UInt64 and > 1'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 'invalid', 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 1, 1, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +SELECT '5th argument (ef_construction), if given, must be UInt64 and > 0'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 'invalid', 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 0, 1)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } +SELECT '6th argument (ef_search), if given, must be UInt64 and > 0'; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 1, 'invalid')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_QUERY } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance', 'f32', 2, 1, 0)) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_DATA } SELECT 'Must be created on single column'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx (vec, id) TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError INCORRECT_NUMBER_OF_COLUMNS } SELECT 'Must be created on Array(Float32) columns'; SET allow_suspicious_low_cardinality_types = 1; -CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, vec LowCardinality(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } -CREATE TABLE tab(id Int32, vec Nullable(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vec UInt64, INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vec Float32, INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vec Array(Float64), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id 
Int32, vec LowCardinality(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } +CREATE TABLE tab(id Int32, vec Nullable(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; -- { serverError ILLEGAL_COLUMN } SELECT 'Rejects INSERTs of Arrays with different sizes'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id; INSERT INTO tab values (0, [2.2, 2.3]) (1, [3.1, 3.2, 3.3]); -- { serverError INCORRECT_DATA } DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_multiple_indexes.reference b/tests/queries/0_stateless/02354_vector_search_multiple_indexes.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02354_vector_search_multiple_indexes.sql b/tests/queries/0_stateless/02354_vector_search_multiple_indexes.sql new file mode 100644 index 00000000000..f1cfc041233 --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_multiple_indexes.sql @@ -0,0 +1,14 @@ +-- Tags: no-fasttest, no-ordinary-database + +-- Tests that multiple vector similarity indexes can be created on the same column (even if that makes no sense) + +SET allow_experimental_vector_similarity_index = 1; + +DROP TABLE IF EXISTS tab; +CREATE TABLE tab (id Int32, vec Array(Float32), PRIMARY KEY id, INDEX vec_idx(vec) TYPE vector_similarity('hnsw', 'L2Distance')); + +ALTER TABLE tab ADD INDEX idx(vec) TYPE minmax; +ALTER TABLE tab ADD INDEX vec_idx1(vec) TYPE vector_similarity('hnsw', 'cosineDistance'); +ALTER TABLE tab ADD INDEX vec_idx2(vec) TYPE vector_similarity('hnsw', 'L2Distance'); -- silly but creating the same index also works for non-vector indexes ... 
+ +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02354_vector_search_queries.sql b/tests/queries/0_stateless/02354_vector_search_queries.sql index 50537ad6244..dbf0fca32ab 100644 --- a/tests/queries/0_stateless/02354_vector_search_queries.sql +++ b/tests/queries/0_stateless/02354_vector_search_queries.sql @@ -10,7 +10,7 @@ SELECT '10 rows, index_granularity = 8192, GRANULARITY = 1 million --> 1 granule DROP TABLE IF EXISTS tab; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance')) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 8192; INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [0.0, 2.0]), (6, [0.0, 2.1]), (7, [0.0, 2.2]), (8, [0.0, 2.3]), (9, [0.0, 2.4]); @@ -34,7 +34,7 @@ DROP TABLE tab; SELECT '12 rows, index_granularity = 3, GRANULARITY = 2 --> 4 granules, 2 indexed block'; -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity() GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'L2Distance') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; INSERT INTO tab VALUES (0, [1.0, 0.0]), (1, [1.1, 0.0]), (2, [1.2, 0.0]), (3, [1.3, 0.0]), (4, [1.4, 0.0]), (5, [1.5, 0.0]), (6, [0.0, 2.0]), (7, [0.0, 2.1]), (8, [0.0, 2.2]), (9, [0.0, 2.3]), (10, [0.0, 2.4]), (11, [0.0, 2.5]); SELECT '- ORDER-BY-type'; @@ -56,9 +56,9 @@ DROP TABLE tab; SELECT 'Special cases'; -- Not a systematic test, just to check that no bad things happen. 
--- Just for jun, use metric = 'cosineDistance', scalarKind = 'f64' +-- Test with non-default metric, M, ef_construction, ef_search -CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('cosineDistance', 'f64') GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; +CREATE TABLE tab(id Int32, vec Array(Float32), INDEX idx vec TYPE vector_similarity('hnsw', 'cosineDistance', 'f32', 42, 99, 66) GRANULARITY 2) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 3; INSERT INTO tab VALUES (0, [4.6, 2.3]), (1, [2.0, 3.2]), (2, [4.2, 3.4]), (3, [5.3, 2.9]), (4, [2.4, 5.2]), (5, [5.3, 2.3]), (6, [1.0, 9.3]), (7, [5.5, 4.7]), (8, [6.4, 3.5]), (9, [5.3, 2.5]), (10, [6.4, 3.4]), (11, [6.4, 3.2]); SELECT '- ORDER-BY-type'; From fb76cb90b1badef334b96b61d976136fd38d535d Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 11 Aug 2024 09:31:36 +0000 Subject: [PATCH 133/142] Allow un-quoted skip index parameters Previously, only this syntax to create a skip index worked: INDEX index_name column_name TYPE vector_similarity('hnsw', 'L2Distance') Now, this syntax will work as well: INDEX index_name column_name TYPE vector_similarity(hnsw, L2Distance) --- .../mergetree-family/annindexes.md | 15 +++++++++++- src/Storages/IndicesDescription.cpp | 12 +++++++--- ...search_unquoted_index_parameters.reference | 0 ...ector_search_unquoted_index_parameters.sql | 23 +++++++++++++++++++ 4 files changed, 46 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/02354_vector_search_unquoted_index_parameters.reference create mode 100644 tests/queries/0_stateless/02354_vector_search_unquoted_index_parameters.sql diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md index 354fac6ea74..e73d6f07a32 100644 --- a/docs/en/engines/table-engines/mergetree-family/annindexes.md +++ b/docs/en/engines/table-engines/mergetree-family/annindexes.md @@ -43,7 +43,7 @@ CREATE TABLE table ( id Int64, vectors Array(Float32), - INDEX index_name vec TYPE vector_similarity(method, distance_function[, quantization, connectivity, expansion_add, expansion_search]) [GRANULARITY N] + INDEX index_name vectors TYPE vector_similarity(method, distance_function[, quantization, connectivity, expansion_add, expansion_search]) [GRANULARITY N] ) ENGINE = MergeTree ORDER BY id; @@ -59,6 +59,19 @@ Parameters: - `ef_construction`: (optional, default: 128) - `ef_search`: (optional, default: 64) +Example: + +```sql +CREATE TABLE table +( + id Int64, + vectors Array(Float32), + INDEX idx vectors TYPE vector_similarity('hnsw', 'L2Distance') -- Alternative syntax: TYPE vector_similarity(hnsw, L2Distance) +) +ENGINE = MergeTree +ORDER BY id; +``` + Vector similarity indexes are based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW algorithm](https://arxiv.org/abs/1603.09320), i.e., a hierarchical graph where each point represents a vector and the edges represent similarity. Such hierarchical structures can be very efficient on large collections. 
They may often fetch 0.05% or less data from the diff --git a/src/Storages/IndicesDescription.cpp b/src/Storages/IndicesDescription.cpp index cef8fd85f97..753fbf1d635 100644 --- a/src/Storages/IndicesDescription.cpp +++ b/src/Storages/IndicesDescription.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -130,10 +131,15 @@ IndexDescription IndexDescription::getIndexFromAST(const ASTPtr & definition_ast { for (size_t i = 0; i < index_type->arguments->children.size(); ++i) { - const auto * argument = index_type->arguments->children[i]->as(); - if (!argument) + const auto & child = index_type->arguments->children[i]; + if (const auto * ast_literal = child->as(); ast_literal != nullptr) + /// E.g. INDEX index_name column_name TYPE vector_similarity('hnsw', 'f32') + result.arguments.emplace_back(ast_literal->value); + else if (const auto * ast_identifier = child->as(); ast_identifier != nullptr) + /// E.g. INDEX index_name column_name TYPE vector_similarity(hnsw, f32) + result.arguments.emplace_back(ast_identifier->name()); + else throw Exception(ErrorCodes::INCORRECT_QUERY, "Only literals can be skip index arguments"); - result.arguments.emplace_back(argument->value); } } diff --git a/tests/queries/0_stateless/02354_vector_search_unquoted_index_parameters.reference b/tests/queries/0_stateless/02354_vector_search_unquoted_index_parameters.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02354_vector_search_unquoted_index_parameters.sql b/tests/queries/0_stateless/02354_vector_search_unquoted_index_parameters.sql new file mode 100644 index 00000000000..da6494bf831 --- /dev/null +++ b/tests/queries/0_stateless/02354_vector_search_unquoted_index_parameters.sql @@ -0,0 +1,23 @@ +-- Tags: no-fasttest, no-ordinary-database + +SET allow_experimental_vector_similarity_index = 1; + +-- Tests that quoted and unquoted parameters can be passed to vector search indexes. 
+ +DROP TABLE IF EXISTS tab1; +DROP TABLE IF EXISTS tab2; + +CREATE TABLE tab1 (id Int32, vec Array(Float32), PRIMARY KEY id, INDEX vec_idx(vec) TYPE vector_similarity('hnsw', 'L2Distance')); +CREATE TABLE tab2 (id Int32, vec Array(Float32), PRIMARY KEY id, INDEX vec_idx(vec) TYPE vector_similarity(hnsw, L2Distance)); + +DROP TABLE tab1; +DROP TABLE tab2; + +CREATE TABLE tab1 (id Int32, vec Array(Float32), PRIMARY KEY id); +CREATE TABLE tab2 (id Int32, vec Array(Float32), PRIMARY KEY id); + +ALTER TABLE tab1 ADD INDEX idx1(vec) TYPE vector_similarity('hnsw', 'L2Distance'); +ALTER TABLE tab2 ADD INDEX idx2(vec) TYPE vector_similarity(hnsw, L2Distance); + +DROP TABLE tab1; +DROP TABLE tab2; From ea1cd665750f82bbeaf66f67b1d85e014afdec18 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Mon, 12 Aug 2024 17:32:43 +0200 Subject: [PATCH 134/142] fix tidy --- src/Storages/VirtualColumnUtils.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 1abac56d266..3143c7f78f6 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -46,15 +45,7 @@ #include #include #include -#include "Functions/FunctionsLogical.h" -#include "Functions/IFunction.h" -#include "Functions/IFunctionAdaptors.h" -#include "Functions/indexHint.h" #include -#include -#include -#include -#include namespace DB From 3a6e05eb43cbb9937cded286ac7259b2f7168057 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Mon, 12 Aug 2024 18:03:42 +0200 Subject: [PATCH 135/142] try to fix includes --- src/Storages/VirtualColumnUtils.cpp | 55 ++++++++++++++++------------- 1 file changed, 30 insertions(+), 25 deletions(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 3143c7f78f6..d932f5cc469 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -1,43 +1,40 @@ #include #include -#include -#include -#include -#include -#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include + #include -#include -#include -#include #include +#include +#include +#include #include -#include -#include + #include +#include #include +#include #include #include -#include -#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include #include #include -#include #include +#include #include + +#include #include #include #include @@ -45,7 +42,15 @@ #include #include #include +#include "Functions/FunctionsLogical.h" +#include "Functions/IFunction.h" +#include "Functions/IFunctionAdaptors.h" +#include "Functions/indexHint.h" #include +#include +#include +#include +#include namespace DB From 858b7e55d0df3db1412d538f701c30584b5783bf Mon Sep 17 00:00:00 2001 From: Pablo Marcos Date: Mon, 12 Aug 2024 16:16:50 +0000 Subject: [PATCH 136/142] Improve condition in case the default column consumes slightly more memory It never happened in the few hundreds of tests I ran successfully, but we'd rather be safe than sorry. 
--- .../01903_correct_block_size_prediction_with_default.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh index 075d9a1dacf..1482730af2c 100755 --- a/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh +++ b/tests/queries/0_stateless/01903_correct_block_size_prediction_with_default.sh @@ -28,7 +28,8 @@ function test() SYSTEM FLUSH LOGS; WITH memory_1 AS (SELECT memory_usage FROM system.query_log WHERE current_database = currentDatabase() AND query_id='$uuid_1' AND type = 'QueryFinish' as memory_1), memory_2 AS (SELECT memory_usage FROM system.query_log WHERE current_database = currentDatabase() AND query_id='$uuid_2' AND type = 'QueryFinish' as memory_2) - SELECT memory_1.memory_usage <= 1.2 * memory_2.memory_usage FROM memory_1, memory_2;" + SELECT memory_1.memory_usage <= 1.2 * memory_2.memory_usage OR + memory_2.memory_usage <= 1.2 * memory_1.memory_usage FROM memory_1, memory_2;" } test "" From c22265b889684b7fa34ba6816ce3910143ef7226 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 12 Aug 2024 17:11:11 +0000 Subject: [PATCH 137/142] Some fixups --- docs/en/operations/query-cache.md | 16 +++++----- docs/en/operations/settings/settings.md | 8 ++--- .../operations/system-tables/query_cache.md | 4 +-- src/Core/Settings.h | 2 +- src/Core/SettingsChangesHistory.cpp | 2 +- src/Interpreters/Cache/QueryCache.cpp | 25 ++++++++-------- src/Interpreters/Cache/QueryCache.h | 11 ++++--- src/Interpreters/executeQuery.cpp | 5 ++-- .../System/StorageSystemQueryCache.cpp | 16 +++++----- .../02494_query_cache_tag.reference | 12 ++++---- .../0_stateless/02494_query_cache_tag.sql | 30 ++++++++++--------- 11 files changed, 66 insertions(+), 65 deletions(-) diff --git a/docs/en/operations/query-cache.md b/docs/en/operations/query-cache.md index a6c4d74f4ac..384938e28f6 100644 --- a/docs/en/operations/query-cache.md +++ b/docs/en/operations/query-cache.md @@ -143,16 +143,18 @@ value can be specified at session, profile or query level using setting [query_c Entries in the query cache are compressed by default. This reduces the overall memory consumption at the cost of slower writes into / reads from the query cache. To disable compression, use setting [query_cache_compress_entries](settings/settings.md#query-cache-compress-entries). -Entries in the query cache can separate by tag, using setting [query_cache_tag](settings/settings.md#query-cache-tag). Queries with different tags are considered different entries. For example, the result of query +Sometimes it is useful to keep multiple results for the same query cached. This can be achieved using setting +[query_cache_tag](settings/settings.md#query-cache-tag) that acts as a label (or namespace) for query cache entries. The query cache +considers results of the same query with different tags different. 
-``` sql -SELECT 1 SETTINGS use_query_cache = true; -SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'one'; -SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'one diff'; +Example for creating three different query cache entries for the same query: + +```sql +SELECT 1 SETTINGS use_query_cache = true; -- query_cache_tag is implicitly '' (empty string) +SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'tag 1'; +SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'tag 2'; ``` -have different entries in the query cache, find the specified tag in system table [system.query_cache](system-tables/query_cache.md) - ClickHouse reads table data in blocks of [max_block_size](settings/settings.md#setting-max_block_size) rows. Due to filtering, aggregation, etc., result blocks are typically much smaller than 'max_block_size' but there are also cases where they are much bigger. Setting [query_cache_squash_partial_results](settings/settings.md#query-cache-squash-partial-results) (enabled by default) controls if result blocks diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 7b855665efb..e4a126249ca 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1802,14 +1802,14 @@ Default value: `0`. ## query_cache_tag {#query-cache-tag} -An arbitrary string to separate entries in the [query cache](../query-cache.md). -Queries with different values of this setting are considered different. +A string which acts as a label for [query cache](../query-cache.md) entries. +The same queries with different tags are considered different by the query cache. Possible values: -- string: name of query cache tag +- Any string -Default value: `''`. +Default value: `''` ## query_cache_max_size_in_bytes {#query-cache-max-size-in-bytes} diff --git a/docs/en/operations/system-tables/query_cache.md b/docs/en/operations/system-tables/query_cache.md index 393b37d3616..9c48574a329 100644 --- a/docs/en/operations/system-tables/query_cache.md +++ b/docs/en/operations/system-tables/query_cache.md @@ -9,12 +9,12 @@ Columns: - `query` ([String](../../sql-reference/data-types/string.md)) — Query string. - `result_size` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Size of the query cache entry. +- `tag` ([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md)) — Tag of the query cache entry. - `stale` ([UInt8](../../sql-reference/data-types/int-uint.md)) — If the query cache entry is stale. - `shared` ([UInt8](../../sql-reference/data-types/int-uint.md)) — If the query cache entry is shared between multiple users. - `compressed` ([UInt8](../../sql-reference/data-types/int-uint.md)) — If the query cache entry is compressed. - `expires_at` ([DateTime](../../sql-reference/data-types/datetime.md)) — When the query cache entry becomes stale. - `key_hash` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — A hash of the query string, used as a key to find query cache entries. -- `tag` ([LowCardinality(String)](../../sql-reference/data-types/lowcardinality.md)) — An arbitrary string to separate entries in the query cache. **Example** @@ -27,12 +27,12 @@ Row 1: ────── query: SELECT 1 SETTINGS use_query_cache = 1 result_size: 128 +tag: stale: 0 shared: 0 compressed: 1 expires_at: 2023-10-13 13:35:45 key_hash: 12188185624808016954 -tag: 1 row in set. Elapsed: 0.004 sec. 
``` diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 75579f20187..f9ffab0ea57 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -676,7 +676,7 @@ class IColumn; M(Bool, query_cache_squash_partial_results, true, "Squash partial result blocks to blocks of size 'max_block_size'. Reduces performance of inserts into the query cache but improves the compressability of cache entries.", 0) \ M(Seconds, query_cache_ttl, 60, "After this time in seconds entries in the query cache become stale", 0) \ M(Bool, query_cache_share_between_users, false, "Allow other users to read entry in the query cache", 0) \ - M(String, query_cache_tag, "", "An arbitrary string to separate entries in the query cache. Queries with different values of this setting are considered different.", 0) \ + M(String, query_cache_tag, "", "A string which acts as a label for query cache entries. The same queries with different tags are considered different by the query cache.", 0) \ M(Bool, enable_sharing_sets_for_mutations, true, "Allow sharing set objects build for IN subqueries between different tasks of the same mutation. This reduces memory usage and CPU consumption", 0) \ \ M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \ diff --git a/src/Core/SettingsChangesHistory.cpp b/src/Core/SettingsChangesHistory.cpp index 8fd16504e95..0528287e83e 100644 --- a/src/Core/SettingsChangesHistory.cpp +++ b/src/Core/SettingsChangesHistory.cpp @@ -82,9 +82,9 @@ static std::initializer_list user_id_, const std::vector & current_user_roles_, bool is_shared_, std::chrono::time_point expires_at_, - bool is_compressed_, - const String & tag_) - : ast_hash(calculateAstHash(ast_, current_database, settings, tag_)) + bool is_compressed_) + : ast_hash(calculateAstHash(ast_, current_database, settings)) , header(header_) , user_id(user_id_) , current_user_roles(current_user_roles_) @@ -247,12 +242,18 @@ QueryCache::Key::Key( , expires_at(expires_at_) , is_compressed(is_compressed_) , query_string(queryStringFromAST(ast_)) - , tag(tag_) + , tag(settings.query_cache_tag) { } -QueryCache::Key::Key(ASTPtr ast_, const String & current_database, const Settings & settings, std::optional user_id_, const std::vector & current_user_roles_, const String & tag_) - : QueryCache::Key(ast_, current_database, settings, {}, user_id_, current_user_roles_, false, std::chrono::system_clock::from_time_t(1), false, tag_) /// dummy values for everything != AST, current database, user name/roles +QueryCache::Key::Key( + ASTPtr ast_, + const String & current_database, + const Settings & settings, + std::optional user_id_, + const std::vector & current_user_roles_) + : QueryCache::Key(ast_, current_database, settings, {}, user_id_, current_user_roles_, false, std::chrono::system_clock::from_time_t(1), false) + /// ^^ dummy values for everything != AST, current database, user name/roles { } diff --git a/src/Interpreters/Cache/QueryCache.h b/src/Interpreters/Cache/QueryCache.h index 54de5edb145..c7ebaf4d26a 100644 --- a/src/Interpreters/Cache/QueryCache.h +++ b/src/Interpreters/Cache/QueryCache.h @@ -88,8 +88,9 @@ public: /// SYSTEM.QUERY_CACHE. const String query_string; - /// An arbitrary string to separate entries in the query cache. - /// Queries with different values of this setting are considered different. + /// A tag (namespace) for distinguishing multiple entries of the same query. 
+ /// This member has currently no use besides that SYSTEM.QUERY_CACHE can populate the 'tag' column conveniently without having to + /// compute the tag from the query AST. const String tag; /// Ctor to construct a Key for writing into query cache. @@ -100,15 +101,13 @@ public: std::optional user_id_, const std::vector & current_user_roles_, bool is_shared_, std::chrono::time_point expires_at_, - bool is_compressed, - const String & tag_); + bool is_compressed); /// Ctor to construct a Key for reading from query cache (this operation only needs the AST + user name). Key(ASTPtr ast_, const String & current_database, const Settings & settings, - std::optional user_id_, const std::vector & current_user_roles_, - const String & tag_); + std::optional user_id_, const std::vector & current_user_roles_); bool operator==(const Key & other) const; }; diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 6422d3128fa..fe87eed5570 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -1129,7 +1129,7 @@ static std::tuple executeQueryImpl( { if (can_use_query_cache && settings.enable_reads_from_query_cache) { - QueryCache::Key key(ast, context->getCurrentDatabase(), *settings_copy, context->getUserID(), context->getCurrentRoles(), settings.query_cache_tag); + QueryCache::Key key(ast, context->getCurrentDatabase(), *settings_copy, context->getUserID(), context->getCurrentRoles()); QueryCache::Reader reader = query_cache->createReader(key); if (reader.hasCacheEntryForKey()) { @@ -1258,8 +1258,7 @@ static std::tuple executeQueryImpl( context->getUserID(), context->getCurrentRoles(), settings.query_cache_share_between_users, std::chrono::system_clock::now() + std::chrono::seconds(settings.query_cache_ttl), - settings.query_cache_compress_entries, - settings.query_cache_tag); + settings.query_cache_compress_entries); const size_t num_query_runs = settings.query_cache_min_query_runs ? 
query_cache->recordQueryRun(key) : 1; /// try to avoid locking a mutex in recordQueryRun() if (num_query_runs <= settings.query_cache_min_query_runs) diff --git a/src/Storages/System/StorageSystemQueryCache.cpp b/src/Storages/System/StorageSystemQueryCache.cpp index f81d50e8806..b3532ba40a7 100644 --- a/src/Storages/System/StorageSystemQueryCache.cpp +++ b/src/Storages/System/StorageSystemQueryCache.cpp @@ -16,12 +16,12 @@ ColumnsDescription StorageSystemQueryCache::getColumnsDescription() { {"query", std::make_shared(), "Query string."}, {"result_size", std::make_shared(), "Size of the query cache entry."}, + {"tag", std::make_shared(std::make_shared()), "Tag of the query cache entry."}, {"stale", std::make_shared(), "If the query cache entry is stale."}, {"shared", std::make_shared(), "If the query cache entry is shared between multiple users."}, {"compressed", std::make_shared(), "If the query cache entry is compressed."}, {"expires_at", std::make_shared(), "When the query cache entry becomes stale."}, - {"key_hash", std::make_shared(), "A hash of the query string, used as a key to find query cache entries."}, - {"tag", std::make_shared(std::make_shared()), "An arbitrary string to separate entries in the query cache."} + {"key_hash", std::make_shared(), "A hash of the query string, used as a key to find query cache entries."} }; } @@ -53,12 +53,12 @@ void StorageSystemQueryCache::fillData(MutableColumns & res_columns, ContextPtr res_columns[0]->insert(key.query_string); /// approximates the original query string res_columns[1]->insert(QueryCache::QueryCacheEntryWeight()(*query_result)); - res_columns[2]->insert(key.expires_at < std::chrono::system_clock::now()); - res_columns[3]->insert(key.is_shared); - res_columns[4]->insert(key.is_compressed); - res_columns[5]->insert(std::chrono::system_clock::to_time_t(key.expires_at)); - res_columns[6]->insert(key.ast_hash.low64); /// query cache considers aliases (issue #56258) - res_columns[7]->insert(key.tag); + res_columns[2]->insert(key.tag); + res_columns[3]->insert(key.expires_at < std::chrono::system_clock::now()); + res_columns[4]->insert(key.is_shared); + res_columns[5]->insert(key.is_compressed); + res_columns[6]->insert(std::chrono::system_clock::to_time_t(key.expires_at)); + res_columns[7]->insert(key.ast_hash.low64); /// query cache considers aliases (issue #56258) } } diff --git a/tests/queries/0_stateless/02494_query_cache_tag.reference b/tests/queries/0_stateless/02494_query_cache_tag.reference index 055d3d4c5bb..f7be5c06ecf 100644 --- a/tests/queries/0_stateless/02494_query_cache_tag.reference +++ b/tests/queries/0_stateless/02494_query_cache_tag.reference @@ -1,14 +1,12 @@ 1 -1 +SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = \'abc\' abc --- 1 1 -1 -2 +SELECT 1 SETTINGS use_query_cache = true +SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = \'abc\' abc --- 1 1 -1 -2 -1 -3 +SELECT 1 SETTINGS use_query_cache = true abc +SELECT 1 SETTINGS use_query_cache = true def diff --git a/tests/queries/0_stateless/02494_query_cache_tag.sql b/tests/queries/0_stateless/02494_query_cache_tag.sql index 054607058e8..62d36f6ebe6 100644 --- a/tests/queries/0_stateless/02494_query_cache_tag.sql +++ b/tests/queries/0_stateless/02494_query_cache_tag.sql @@ -3,30 +3,32 @@ SYSTEM DROP QUERY CACHE; --- Cache the query after the query invocation -SELECT 1 SETTINGS use_query_cache = true; -SELECT COUNT(*) FROM system.query_cache; +-- Store the result of a single query with a tag in the query cache and check that the system table knows 
about the tag +SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'abc'; + +SELECT query, tag FROM system.query_cache; SELECT '---'; SYSTEM DROP QUERY CACHE; --- Queries with tag value of this setting or not are considered different cache entries. -SELECT 1 SETTINGS use_query_cache = true; -SELECT COUNT(*) FROM system.query_cache; -SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'one'; -SELECT COUNT(*) FROM system.query_cache; +-- Store the result of the same query with two different tags. The cache should store two entries. +SELECT 1 SETTINGS use_query_cache = true; -- default query_cache_tag = '' +SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'abc'; +SELECT query, tag FROM system.query_cache ORDER BY ALL; SELECT '---'; SYSTEM DROP QUERY CACHE; --- Queries with different tags values of this setting are considered different cache entries. +-- Like before but the tag is set standalone. + +SET query_cache_tag = 'abc'; SELECT 1 SETTINGS use_query_cache = true; -SELECT COUNT(*) FROM system.query_cache; -SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'one'; -SELECT COUNT(*) FROM system.query_cache; -SELECT 1 SETTINGS use_query_cache = true, query_cache_tag = 'one diff'; -SELECT COUNT(*) FROM system.query_cache; + +SET query_cache_tag = 'def'; +SELECT 1 SETTINGS use_query_cache = true; + +SELECT query, tag FROM system.query_cache ORDER BY ALL; SYSTEM DROP QUERY CACHE; From aa7a2bcb02f6c2f48bcc7acca3bcec2f1a16130b Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Mon, 12 Aug 2024 20:34:02 +0200 Subject: [PATCH 138/142] Fix typo --- docs/en/sql-reference/functions/other-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 21e4a6599ea..4f51dc6b8d3 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -4073,7 +4073,7 @@ getSubcolumn(col_name, subcol_name) **Returned value** -- Returns the extracted sub-colum. +- Returns the extracted sub-column. 
**Example** From eab8594570e703a766f2f91ae3d13b0ed640b554 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Mon, 12 Aug 2024 20:35:33 +0200 Subject: [PATCH 139/142] Update aspell-dict.txt --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 862f38976ce..51246d990fa 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1697,6 +1697,8 @@ getOSKernelVersion getServerPort getSetting getSizeOfEnumType +getSubcolumn +getTypeSerializationStreams getblockinfo getevents ghcnd From 45a14fa0ce3ae94a374bbf955ba0fb7109b7e678 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 12 Aug 2024 18:54:06 +0000 Subject: [PATCH 140/142] Fix spelling --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index ffd9fae7f45..03ec8e1752c 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -2115,6 +2115,7 @@ namenode namepassword nameprofile namequota +namespace namespaces natively nats From 5a683796a0dc8408ed2694af672675929352bf8f Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 12 Aug 2024 22:34:14 +0200 Subject: [PATCH 141/142] Update DatabaseReplicated.cpp --- src/Databases/DatabaseReplicated.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index f4aa925d6dd..6011b8e65e3 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -1584,6 +1584,8 @@ void DatabaseReplicated::dropTable(ContextPtr local_context, const String & tabl } auto table = tryGetTable(table_name, getContext()); + if (!table) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Table {} doesn't exist", table_name); if (table->getName() == "MaterializedView" || table->getName() == "WindowView") { /// Avoid recursive locking of metadata_mutex From a517bc90cd9e369a4385f367e9f5e9688520c8bb Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> Date: Mon, 12 Aug 2024 21:42:47 -0400 Subject: [PATCH 142/142] Update PULL_REQUEST_TEMPLATE.md --- .github/PULL_REQUEST_TEMPLATE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 8b6e957e1d8..3dcce68ab46 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -60,7 +60,7 @@ At a minimum, the following information should be added (but add more as needed) - [ ] Exclude: All with aarch64, release, debug --- - [ ] Run only fuzzers related jobs (libFuzzer fuzzers, AST fuzzers, etc.) -- [ ] Exclude AST fuzzers +- [ ] Exclude: AST fuzzers --- - [ ] Do not test - [ ] Woolen Wolfdog