From 6db8b7a2d18ad495a469d71d3243964f0ef9e0f4 Mon Sep 17 00:00:00 2001 From: HappenLee Date: Thu, 29 Feb 2024 20:35:18 +0800 Subject: [PATCH 001/215] Reduce unless virtual func call in ColumnNullable --- src/Columns/ColumnNullable.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index 08b598d6a3b..17bc8ffe1e6 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -53,7 +53,7 @@ public: std::string getName() const override { return "Nullable(" + nested_column->getName() + ")"; } TypeIndex getDataType() const override { return TypeIndex::Nullable; } MutableColumnPtr cloneResized(size_t size) const override; - size_t size() const override { return nested_column->size(); } + size_t size() const override { return assert_cast(*null_map).size(); } bool isNullAt(size_t n) const override { return assert_cast(*null_map).getData()[n] != 0;} Field operator[](size_t n) const override; void get(size_t n, Field & res) const override; From 545cc731dcc1c33a3e016b68d47450aaa5e388bd Mon Sep 17 00:00:00 2001 From: Philipp Schreiber Date: Fri, 1 Mar 2024 09:39:02 +0100 Subject: [PATCH 002/215] fix column ddl expression order documentation --- docs/en/sql-reference/statements/create/table.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index 0edf158e981..2967a11d35c 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -17,8 +17,8 @@ By default, tables are created only on the current server. Distributed DDL queri ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( - name1 [type1] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|EPHEMERAL|ALIAS expr1] [compression_codec] [TTL expr1] [COMMENT 'comment for column'], - name2 [type2] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|EPHEMERAL|ALIAS expr2] [compression_codec] [TTL expr2] [COMMENT 'comment for column'], + name1 [type1] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|EPHEMERAL|ALIAS expr1] [COMMENT 'comment for column'] [compression_codec] [TTL expr1], + name2 [type2] [NULL|NOT NULL] [DEFAULT|MATERIALIZED|EPHEMERAL|ALIAS expr2] [COMMENT 'comment for column'] [compression_codec] [TTL expr2], ... 
) ENGINE = engine COMMENT 'comment for table' From d52d13da22698239c6fed00fe9e4ec50e5661cb6 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 22 Apr 2024 09:38:33 -0300 Subject: [PATCH 003/215] Tmp --- .../EnvironmentProxyConfigurationResolver.cpp | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/src/Common/EnvironmentProxyConfigurationResolver.cpp b/src/Common/EnvironmentProxyConfigurationResolver.cpp index f2c60afa1a8..6a925f4ac00 100644 --- a/src/Common/EnvironmentProxyConfigurationResolver.cpp +++ b/src/Common/EnvironmentProxyConfigurationResolver.cpp @@ -1,4 +1,5 @@ #include "EnvironmentProxyConfigurationResolver.h" +#include #include #include @@ -34,6 +35,34 @@ namespace return std::getenv(PROXY_HTTPS_ENVIRONMENT_VARIABLE); // NOLINT(concurrency-mt-unsafe) } } + + std::vector getNoProxyHosts() + { + std::vector result; + + const char * no_proxy = std::getenv("NO_PROXY"); // NOLINT(concurrency-mt-unsafe) + if (!no_proxy) + { + return result; + } + + std::string no_proxy_str(no_proxy); + std::istringstream no_proxy_stream(no_proxy_str); + std::string host; + while (std::getline(no_proxy_stream, host, ',')) + { + try + { + result.emplace(host); + } + catch (const Poco::SyntaxException & e) + { + LOG_WARNING(getLogger("EnvironmentProxyConfigurationResolver"), "Failed to parse NO_PROXY host '{}': {}", host, e.displayText()); + } + } + + return result; + } } ProxyConfiguration EnvironmentProxyConfigurationResolver::resolve() From d5ff9de6fa995cf4c0ddd14799b0367d1d3db592 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 2 May 2024 17:29:23 -0300 Subject: [PATCH 004/215] env resolver impl --- .../EnvironmentProxyConfigurationResolver.cpp | 26 ++++++++++--------- src/Common/ProxyConfiguration.h | 1 + src/Common/tests/gtest_helper_functions.h | 10 ++++++- .../gtest_proxy_environment_configuration.cpp | 5 +++- 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/Common/EnvironmentProxyConfigurationResolver.cpp b/src/Common/EnvironmentProxyConfigurationResolver.cpp index 6a925f4ac00..bc46cd39bd6 100644 --- a/src/Common/EnvironmentProxyConfigurationResolver.cpp +++ b/src/Common/EnvironmentProxyConfigurationResolver.cpp @@ -1,7 +1,9 @@ #include "EnvironmentProxyConfigurationResolver.h" #include +#include #include +#include #include namespace DB @@ -13,6 +15,7 @@ namespace DB * */ static constexpr auto PROXY_HTTP_ENVIRONMENT_VARIABLE = "http_proxy"; static constexpr auto PROXY_HTTPS_ENVIRONMENT_VARIABLE = "https_proxy"; +static constexpr auto NO_PROXY_ENVIRONMENT_VARIABLE = "no_proxy"; EnvironmentProxyConfigurationResolver::EnvironmentProxyConfigurationResolver( Protocol request_protocol_, bool disable_tunneling_for_https_requests_over_http_proxy_) @@ -36,28 +39,26 @@ namespace } } - std::vector getNoProxyHosts() + std::vector getNoProxyHosts() { - std::vector result; + std::vector result; + + const char * no_proxy = std::getenv(NO_PROXY_ENVIRONMENT_VARIABLE); // NOLINT(concurrency-mt-unsafe) - const char * no_proxy = std::getenv("NO_PROXY"); // NOLINT(concurrency-mt-unsafe) if (!no_proxy) { return result; } - std::string no_proxy_str(no_proxy); - std::istringstream no_proxy_stream(no_proxy_str); + std::istringstream no_proxy_stream(no_proxy); std::string host; while (std::getline(no_proxy_stream, host, ',')) { - try + trim(host); + + if (!host.empty()) { - result.emplace(host); - } - catch (const Poco::SyntaxException & e) - { - LOG_WARNING(getLogger("EnvironmentProxyConfigurationResolver"), "Failed to parse NO_PROXY host '{}': {}", host, 
e.displayText()); + result.emplace_back(host); } } @@ -86,7 +87,8 @@ ProxyConfiguration EnvironmentProxyConfigurationResolver::resolve() ProxyConfiguration::protocolFromString(scheme), port, useTunneling(request_protocol, ProxyConfiguration::protocolFromString(scheme), disable_tunneling_for_https_requests_over_http_proxy), - request_protocol + request_protocol, + getNoProxyHosts() }; } diff --git a/src/Common/ProxyConfiguration.h b/src/Common/ProxyConfiguration.h index 289ae2b6813..be0750f1f82 100644 --- a/src/Common/ProxyConfiguration.h +++ b/src/Common/ProxyConfiguration.h @@ -49,6 +49,7 @@ struct ProxyConfiguration uint16_t port = 0; bool tunneling = false; Protocol original_request_protocol = Protocol::HTTP; + std::vector no_proxy_hosts = {}; bool isEmpty() const { return host.empty(); } }; diff --git a/src/Common/tests/gtest_helper_functions.h b/src/Common/tests/gtest_helper_functions.h index 54c9ae9170d..e3aeea407a1 100644 --- a/src/Common/tests/gtest_helper_functions.h +++ b/src/Common/tests/gtest_helper_functions.h @@ -76,7 +76,10 @@ inline std::string xmlNodeAsString(Poco::XML::Node *pNode) struct EnvironmentProxySetter { - EnvironmentProxySetter(const Poco::URI & http_proxy, const Poco::URI & https_proxy) + EnvironmentProxySetter( + const Poco::URI & http_proxy, + const Poco::URI & https_proxy, + const std::string & no_proxy = {}) { if (!http_proxy.empty()) { @@ -87,6 +90,11 @@ struct EnvironmentProxySetter { setenv("https_proxy", https_proxy.toString().c_str(), 1); // NOLINT(concurrency-mt-unsafe) } + + if (!no_proxy.empty()) + { + setenv("no_proxy", no_proxy.c_str(), 1); // NOLINT(concurrency-mt-unsafe) + } } ~EnvironmentProxySetter() diff --git a/src/Common/tests/gtest_proxy_environment_configuration.cpp b/src/Common/tests/gtest_proxy_environment_configuration.cpp index 377bef385f6..4956604f6d8 100644 --- a/src/Common/tests/gtest_proxy_environment_configuration.cpp +++ b/src/Common/tests/gtest_proxy_environment_configuration.cpp @@ -15,7 +15,8 @@ namespace TEST(EnvironmentProxyConfigurationResolver, TestHTTP) { - EnvironmentProxySetter setter(http_proxy_server, {}); + std::vector no_proxy_hosts = {"localhost", "127.0.0.1", "some_other_domain", "last_domain"}; + EnvironmentProxySetter setter(http_proxy_server, {}, "localhost,,127.0.0.1,some_other_domain,,,, last_domain,"); EnvironmentProxyConfigurationResolver resolver(ProxyConfiguration::Protocol::HTTP); @@ -24,6 +25,7 @@ TEST(EnvironmentProxyConfigurationResolver, TestHTTP) ASSERT_EQ(configuration.host, http_proxy_server.getHost()); ASSERT_EQ(configuration.port, http_proxy_server.getPort()); ASSERT_EQ(configuration.protocol, ProxyConfiguration::protocolFromString(http_proxy_server.getScheme())); + ASSERT_EQ(configuration.no_proxy_hosts, no_proxy_hosts); } TEST(EnvironmentProxyConfigurationResolver, TestHTTPNoEnv) @@ -35,6 +37,7 @@ TEST(EnvironmentProxyConfigurationResolver, TestHTTPNoEnv) ASSERT_EQ(configuration.host, ""); ASSERT_EQ(configuration.protocol, ProxyConfiguration::Protocol::HTTP); ASSERT_EQ(configuration.port, 0u); + ASSERT_TRUE(configuration.no_proxy_hosts.empty()); } TEST(EnvironmentProxyConfigurationResolver, TestHTTPs) From d30aef4b547cd67acdb1a916b77bfbc2a890123d Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 2 May 2024 18:03:32 -0300 Subject: [PATCH 005/215] pass it to poco http --- src/Common/HTTPConnectionPool.cpp | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/Common/HTTPConnectionPool.cpp b/src/Common/HTTPConnectionPool.cpp index 
167aeee68f3..f559fdc786e 100644 --- a/src/Common/HTTPConnectionPool.cpp +++ b/src/Common/HTTPConnectionPool.cpp @@ -70,6 +70,24 @@ namespace CurrentMetrics namespace { + std::string buildPocoNonProxyHosts(const DB::ProxyConfiguration & proxy_configuration) + { + bool first = true; + std::string ret; + + for (const auto & host : proxy_configuration.no_proxy_hosts) + { + if (!first) + { + ret.append("|"); + } + ret.append(host); + first = false; + } + + return ret; + } + Poco::Net::HTTPClientSession::ProxyConfig proxyConfigurationToPocoProxyConfig(const DB::ProxyConfiguration & proxy_configuration) { Poco::Net::HTTPClientSession::ProxyConfig poco_proxy_config; @@ -79,6 +97,7 @@ namespace poco_proxy_config.protocol = DB::ProxyConfiguration::protocolToString(proxy_configuration.protocol); poco_proxy_config.tunnel = proxy_configuration.tunneling; poco_proxy_config.originalRequestProtocol = DB::ProxyConfiguration::protocolToString(proxy_configuration.original_request_protocol); + poco_proxy_config.nonProxyHosts = buildPocoNonProxyHosts(proxy_configuration); return poco_proxy_config; } @@ -696,7 +715,8 @@ struct EndpointPoolKey proxy_config.port, proxy_config.protocol, proxy_config.tunneling, - proxy_config.original_request_protocol) + proxy_config.original_request_protocol, + proxy_config.no_proxy_hosts) == std::tie( rhs.connection_group, rhs.target_host, @@ -706,7 +726,8 @@ struct EndpointPoolKey rhs.proxy_config.port, rhs.proxy_config.protocol, rhs.proxy_config.tunneling, - rhs.proxy_config.original_request_protocol); + rhs.proxy_config.original_request_protocol, + rhs.proxy_config.no_proxy_hosts); } }; From f4da4f1eb0fe9a5034367d46f3c12710f6a8de7d Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 2 May 2024 20:29:05 -0300 Subject: [PATCH 006/215] env no proxy tests --- .../helpers/s3_url_proxy_tests_util.py | 22 ++++++++++++++----- .../test.py | 16 ++++++++++++++ 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/tests/integration/helpers/s3_url_proxy_tests_util.py b/tests/integration/helpers/s3_url_proxy_tests_util.py index 9059fda08ae..16df446b0f7 100644 --- a/tests/integration/helpers/s3_url_proxy_tests_util.py +++ b/tests/integration/helpers/s3_url_proxy_tests_util.py @@ -2,8 +2,8 @@ import os import time -def check_proxy_logs( - cluster, proxy_instance, protocol, bucket, http_methods={"POST", "PUT", "GET"} +def has_any_proxy_related_logs( +cluster, proxy_instance, protocol, bucket, http_methods={"POST", "PUT", "GET"} ): for i in range(10): logs = cluster.get_container_logs(proxy_instance) @@ -13,10 +13,10 @@ def check_proxy_logs( logs.find(http_method + f" {protocol}://minio1:9001/root/data/{bucket}") >= 0 ): - return + return True time.sleep(1) else: - assert False, f"{http_methods} method not found in logs of {proxy_instance}" + return False def wait_resolver(cluster): @@ -85,4 +85,16 @@ def simple_test(cluster, proxies, protocol, bucket): perform_simple_queries(node, minio_endpoint) for proxy in proxies: - check_proxy_logs(cluster, proxy, protocol, bucket) + has_proxy_logs = has_any_proxy_related_logs(cluster, proxy, protocol, bucket) + assert has_proxy_logs, f"Did not find any proxy related logs in {proxy}" + + +def simple_test_assert_no_proxy(cluster, proxies, protocol, bucket): + minio_endpoint = build_s3_endpoint(protocol, bucket) + node = cluster.instances[f"{bucket}"] + + perform_simple_queries(node, minio_endpoint) + + for proxy in proxies: + no_proxy_logs = not has_any_proxy_related_logs(cluster, proxy, protocol, bucket) + assert no_proxy_logs, f"Found proxy 
logs in {proxy} and it should not have found it" diff --git a/tests/integration/test_https_s3_table_function_with_http_proxy_no_tunneling/test.py b/tests/integration/test_https_s3_table_function_with_http_proxy_no_tunneling/test.py index ae872a33cd4..fefc98ea1f7 100644 --- a/tests/integration/test_https_s3_table_function_with_http_proxy_no_tunneling/test.py +++ b/tests/integration/test_https_s3_table_function_with_http_proxy_no_tunneling/test.py @@ -39,6 +39,19 @@ def cluster(): instance_env_variables=True, ) + cluster.add_instance( + "env_node_no_proxy", + main_configs=[ + "configs/config.d/proxy_env.xml", + ], + with_minio=True, + env_variables={ + "https_proxy": "http://proxy1", + "no_proxy": "not_important_host,, minio1 ," + }, + instance_env_variables=True, + ) + logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") @@ -61,3 +74,6 @@ def test_s3_with_https_remote_proxy(cluster): def test_s3_with_https_env_proxy(cluster): proxy_util.simple_test(cluster, ["proxy1"], "https", "env_node") + +def test_s3_with_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "https", "env_node_no_proxy") \ No newline at end of file From 326938086955ae4931f7b19abbedf22fc91b9afa Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 2 May 2024 20:35:21 -0300 Subject: [PATCH 007/215] update tests --- .../test.py | 4 ---- .../test.py | 14 ++++++++++++++ .../test.py | 17 +++++++++++++++++ 3 files changed, 31 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_https_s3_table_function_with_http_proxy_no_tunneling/test.py b/tests/integration/test_https_s3_table_function_with_http_proxy_no_tunneling/test.py index fefc98ea1f7..00f88cf4c14 100644 --- a/tests/integration/test_https_s3_table_function_with_http_proxy_no_tunneling/test.py +++ b/tests/integration/test_https_s3_table_function_with_http_proxy_no_tunneling/test.py @@ -47,7 +47,6 @@ def cluster(): with_minio=True, env_variables={ "https_proxy": "http://proxy1", - "no_proxy": "not_important_host,, minio1 ," }, instance_env_variables=True, ) @@ -74,6 +73,3 @@ def test_s3_with_https_remote_proxy(cluster): def test_s3_with_https_env_proxy(cluster): proxy_util.simple_test(cluster, ["proxy1"], "https", "env_node") - -def test_s3_with_no_proxy(cluster): - proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "https", "env_node_no_proxy") \ No newline at end of file diff --git a/tests/integration/test_s3_table_function_with_http_proxy/test.py b/tests/integration/test_s3_table_function_with_http_proxy/test.py index 1619b413099..cc0d59e51ba 100644 --- a/tests/integration/test_s3_table_function_with_http_proxy/test.py +++ b/tests/integration/test_s3_table_function_with_http_proxy/test.py @@ -36,6 +36,16 @@ def cluster(): instance_env_variables=True, ) + cluster.add_instance( + "env_node_no_proxy", + with_minio=True, + env_variables={ + "http_proxy": "http://proxy1", + "no_proxy": "not_important_host,, minio1 ,", + }, + instance_env_variables=True, + ) + logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") @@ -58,3 +68,7 @@ def test_s3_with_http_remote_proxy(cluster): def test_s3_with_http_env_proxy(cluster): proxy_util.simple_test(cluster, ["proxy1"], "http", "env_node") + + +def test_s3_with_http_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "http", "env_node_no_proxy") diff --git a/tests/integration/test_s3_table_function_with_https_proxy/test.py b/tests/integration/test_s3_table_function_with_https_proxy/test.py index 
83af407093c..87285415d85 100644 --- a/tests/integration/test_s3_table_function_with_https_proxy/test.py +++ b/tests/integration/test_s3_table_function_with_https_proxy/test.py @@ -44,6 +44,19 @@ def cluster(): instance_env_variables=True, ) + cluster.add_instance( + "env_node_no_proxy", + main_configs=[ + "configs/config.d/ssl.xml", + ], + with_minio=True, + env_variables={ + "https_proxy": "https://proxy1", + "no_proxy": "not_important_host,, minio1 ,", + }, + instance_env_variables=True, + ) + logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") @@ -66,3 +79,7 @@ def test_s3_with_https_remote_proxy(cluster): def test_s3_with_https_env_proxy(cluster): proxy_util.simple_test(cluster, ["proxy1"], "https", "env_node") + + +def test_s3_with_https_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "https", "env_node_no_proxy") From f2083a979b0baeff33108507b188863d087f15a7 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 2 May 2024 20:43:14 -0300 Subject: [PATCH 008/215] some comments --- src/Common/HTTPConnectionPool.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Common/HTTPConnectionPool.cpp b/src/Common/HTTPConnectionPool.cpp index f559fdc786e..a17ed770f19 100644 --- a/src/Common/HTTPConnectionPool.cpp +++ b/src/Common/HTTPConnectionPool.cpp @@ -70,6 +70,11 @@ namespace CurrentMetrics namespace { + /* + * ClickHouse holds a list of hosts, while Poco expects a regex. Build an or-based regex with all the hosts + * Favoring simplicity. https://about.gitlab.com/blog/2021/01/27/we-need-to-talk-no-proxy/ + * Open for discussions + * */ std::string buildPocoNonProxyHosts(const DB::ProxyConfiguration & proxy_configuration) { bool first = true; From 1b577a81b97c1f6f896e527598be9d8865dee105 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 3 May 2024 11:48:50 -0300 Subject: [PATCH 009/215] keep no_proxy string as is in clickhouse memory, convert to or-based regex with match anything wildcard for poco --- .../EnvironmentProxyConfigurationResolver.cpp | 22 +---- src/Common/HTTPConnectionPool.cpp | 39 +-------- src/Common/ProxyConfiguration.h | 2 +- .../proxyConfigurationToPocoProxyConfig.cpp | 80 +++++++++++++++++++ .../proxyConfigurationToPocoProxyConfig.h | 11 +++ ...oxy_configuration_to_poco_proxy_config.cpp | 25 ++++++ .../gtest_proxy_environment_configuration.cpp | 6 +- 7 files changed, 125 insertions(+), 60 deletions(-) create mode 100644 src/Common/proxyConfigurationToPocoProxyConfig.cpp create mode 100644 src/Common/proxyConfigurationToPocoProxyConfig.h create mode 100644 src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp diff --git a/src/Common/EnvironmentProxyConfigurationResolver.cpp b/src/Common/EnvironmentProxyConfigurationResolver.cpp index bc46cd39bd6..3abaa4ec9e3 100644 --- a/src/Common/EnvironmentProxyConfigurationResolver.cpp +++ b/src/Common/EnvironmentProxyConfigurationResolver.cpp @@ -39,30 +39,16 @@ namespace } } - std::vector getNoProxyHosts() + std::string getNoProxyHostsString() { - std::vector result; - const char * no_proxy = std::getenv(NO_PROXY_ENVIRONMENT_VARIABLE); // NOLINT(concurrency-mt-unsafe) if (!no_proxy) { - return result; + return ""; } - std::istringstream no_proxy_stream(no_proxy); - std::string host; - while (std::getline(no_proxy_stream, host, ',')) - { - trim(host); - - if (!host.empty()) - { - result.emplace_back(host); - } - } - - return result; + return no_proxy; } } @@ -88,7 +74,7 @@ ProxyConfiguration EnvironmentProxyConfigurationResolver::resolve() 
port, useTunneling(request_protocol, ProxyConfiguration::protocolFromString(scheme), disable_tunneling_for_https_requests_over_http_proxy), request_protocol, - getNoProxyHosts() + getNoProxyHostsString() }; } diff --git a/src/Common/HTTPConnectionPool.cpp b/src/Common/HTTPConnectionPool.cpp index a17ed770f19..f3ff09bc90a 100644 --- a/src/Common/HTTPConnectionPool.cpp +++ b/src/Common/HTTPConnectionPool.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -70,44 +71,6 @@ namespace CurrentMetrics namespace { - /* - * ClickHouse holds a list of hosts, while Poco expects a regex. Build an or-based regex with all the hosts - * Favoring simplicity. https://about.gitlab.com/blog/2021/01/27/we-need-to-talk-no-proxy/ - * Open for discussions - * */ - std::string buildPocoNonProxyHosts(const DB::ProxyConfiguration & proxy_configuration) - { - bool first = true; - std::string ret; - - for (const auto & host : proxy_configuration.no_proxy_hosts) - { - if (!first) - { - ret.append("|"); - } - ret.append(host); - first = false; - } - - return ret; - } - - Poco::Net::HTTPClientSession::ProxyConfig proxyConfigurationToPocoProxyConfig(const DB::ProxyConfiguration & proxy_configuration) - { - Poco::Net::HTTPClientSession::ProxyConfig poco_proxy_config; - - poco_proxy_config.host = proxy_configuration.host; - poco_proxy_config.port = proxy_configuration.port; - poco_proxy_config.protocol = DB::ProxyConfiguration::protocolToString(proxy_configuration.protocol); - poco_proxy_config.tunnel = proxy_configuration.tunneling; - poco_proxy_config.originalRequestProtocol = DB::ProxyConfiguration::protocolToString(proxy_configuration.original_request_protocol); - poco_proxy_config.nonProxyHosts = buildPocoNonProxyHosts(proxy_configuration); - - return poco_proxy_config; - } - - constexpr size_t roundUp(size_t x, size_t rounding) { chassert(rounding > 0); diff --git a/src/Common/ProxyConfiguration.h b/src/Common/ProxyConfiguration.h index be0750f1f82..78264d66cb7 100644 --- a/src/Common/ProxyConfiguration.h +++ b/src/Common/ProxyConfiguration.h @@ -49,7 +49,7 @@ struct ProxyConfiguration uint16_t port = 0; bool tunneling = false; Protocol original_request_protocol = Protocol::HTTP; - std::vector no_proxy_hosts = {}; + std::string no_proxy_hosts = ""; bool isEmpty() const { return host.empty(); } }; diff --git a/src/Common/proxyConfigurationToPocoProxyConfig.cpp b/src/Common/proxyConfigurationToPocoProxyConfig.cpp new file mode 100644 index 00000000000..ec2a5ba19bd --- /dev/null +++ b/src/Common/proxyConfigurationToPocoProxyConfig.cpp @@ -0,0 +1,80 @@ +#include + +#include + +#include + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" +#pragma clang diagnostic ignored "-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wnested-anon-types" +#pragma clang diagnostic ignored "-Wunused-parameter" +#pragma clang diagnostic ignored "-Wshadow-field-in-constructor" +#pragma clang diagnostic ignored "-Wdtor-name" +#include +#pragma clang diagnostic pop + +namespace DB +{ + +/* + * Even though there is not an RFC that defines NO_PROXY, it is usually a comma-separated list of domains. + * Different tools implement their own versions of `NO_PROXY` support. Some support CIDR blocks, some support wildcard etc. 
+ * Opting for a simple implementation that covers most use cases: + * * Support only single wildcard * (match anything) + * * No regex + * * No CIDR blocks + * * No leading dot strip + * * No fancy stuff about loopback IPs + * https://about.gitlab.com/blog/2021/01/27/we-need-to-talk-no-proxy/ + * Open for discussions + * */ +std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts) +{ + bool match_any_host = no_proxy_hosts.size() == 1 && no_proxy_hosts[0] == '*'; + + if (match_any_host) + { + return "(.*?)"; + } + + std::string host; + std::istringstream no_proxy_stream(no_proxy_hosts); + + bool first = true; + std::string result; + + while (std::getline(no_proxy_stream, host, ',')) + { + trim(host); + + if (!host.empty()) + { + if (!first) + { + result.append("|"); + } + + result.append(RE2::QuoteMeta(host)); + first = false; + } + } + + return result; +} + +Poco::Net::HTTPClientSession::ProxyConfig proxyConfigurationToPocoProxyConfig(const DB::ProxyConfiguration & proxy_configuration) +{ + Poco::Net::HTTPClientSession::ProxyConfig poco_proxy_config; + + poco_proxy_config.host = proxy_configuration.host; + poco_proxy_config.port = proxy_configuration.port; + poco_proxy_config.protocol = DB::ProxyConfiguration::protocolToString(proxy_configuration.protocol); + poco_proxy_config.tunnel = proxy_configuration.tunneling; + poco_proxy_config.originalRequestProtocol = DB::ProxyConfiguration::protocolToString(proxy_configuration.original_request_protocol); + poco_proxy_config.nonProxyHosts = buildPocoNonProxyHosts(proxy_configuration.no_proxy_hosts); + + return poco_proxy_config; +} + +} diff --git a/src/Common/proxyConfigurationToPocoProxyConfig.h b/src/Common/proxyConfigurationToPocoProxyConfig.h new file mode 100644 index 00000000000..d093b0f3521 --- /dev/null +++ b/src/Common/proxyConfigurationToPocoProxyConfig.h @@ -0,0 +1,11 @@ +#pragma once + +#include +#include + +namespace DB +{ + +Poco::Net::HTTPClientSession::ProxyConfig proxyConfigurationToPocoProxyConfig(const DB::ProxyConfiguration & proxy_configuration); + +} diff --git a/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp b/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp new file mode 100644 index 00000000000..d161bb9f63e --- /dev/null +++ b/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp @@ -0,0 +1,25 @@ +#include + +#include + +TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuild) +{ + DB::ProxyConfiguration proxy_configuration; + + proxy_configuration.no_proxy_hosts = "localhost,127.0.0.1,some_other_domain:8080,sub-domain.domain.com"; + + auto poco_proxy_configuration = DB::proxyConfigurationToPocoProxyConfig(proxy_configuration); + + ASSERT_EQ(poco_proxy_configuration.nonProxyHosts, "localhost|127\\.0\\.0\\.1|some_other_domain\\:8080|sub\\-domain\\.domain\\.com"); +} + +TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuildMatchAnything) +{ + DB::ProxyConfiguration proxy_configuration; + + proxy_configuration.no_proxy_hosts = "*"; + + auto poco_proxy_configuration = DB::proxyConfigurationToPocoProxyConfig(proxy_configuration); + + ASSERT_EQ(poco_proxy_configuration.nonProxyHosts, "(.*?)"); +} diff --git a/src/Common/tests/gtest_proxy_environment_configuration.cpp b/src/Common/tests/gtest_proxy_environment_configuration.cpp index 4956604f6d8..81388fd877f 100644 --- a/src/Common/tests/gtest_proxy_environment_configuration.cpp +++ b/src/Common/tests/gtest_proxy_environment_configuration.cpp @@ -15,8 +15,8 @@ namespace 
TEST(EnvironmentProxyConfigurationResolver, TestHTTP) { - std::vector no_proxy_hosts = {"localhost", "127.0.0.1", "some_other_domain", "last_domain"}; - EnvironmentProxySetter setter(http_proxy_server, {}, "localhost,,127.0.0.1,some_other_domain,,,, last_domain,"); + std::string no_proxy_string = "localhost,,127.0.0.1,some_other_domain,,,, last_domain,"; + EnvironmentProxySetter setter(http_proxy_server, {}, no_proxy_string); EnvironmentProxyConfigurationResolver resolver(ProxyConfiguration::Protocol::HTTP); @@ -25,7 +25,7 @@ TEST(EnvironmentProxyConfigurationResolver, TestHTTP) ASSERT_EQ(configuration.host, http_proxy_server.getHost()); ASSERT_EQ(configuration.port, http_proxy_server.getPort()); ASSERT_EQ(configuration.protocol, ProxyConfiguration::protocolFromString(http_proxy_server.getScheme())); - ASSERT_EQ(configuration.no_proxy_hosts, no_proxy_hosts); + ASSERT_EQ(configuration.no_proxy_hosts, no_proxy_string); } TEST(EnvironmentProxyConfigurationResolver, TestHTTPNoEnv) From a157d6abd61f27aa3ba35f22fcfd83e069d936f6 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 3 May 2024 11:49:12 -0300 Subject: [PATCH 010/215] remove unused node --- .../test.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/integration/test_https_s3_table_function_with_http_proxy_no_tunneling/test.py b/tests/integration/test_https_s3_table_function_with_http_proxy_no_tunneling/test.py index 00f88cf4c14..ae872a33cd4 100644 --- a/tests/integration/test_https_s3_table_function_with_http_proxy_no_tunneling/test.py +++ b/tests/integration/test_https_s3_table_function_with_http_proxy_no_tunneling/test.py @@ -39,18 +39,6 @@ def cluster(): instance_env_variables=True, ) - cluster.add_instance( - "env_node_no_proxy", - main_configs=[ - "configs/config.d/proxy_env.xml", - ], - with_minio=True, - env_variables={ - "https_proxy": "http://proxy1", - }, - instance_env_variables=True, - ) - logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") From 4103a730e2b3f644cb69c9b6e423a6a9a54c9dec Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 3 May 2024 13:51:02 -0300 Subject: [PATCH 011/215] add empty test --- src/Common/EnvironmentProxyConfigurationResolver.cpp | 3 --- ...gtest_proxy_configuration_to_poco_proxy_config.cpp | 11 +++++++++++ 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/Common/EnvironmentProxyConfigurationResolver.cpp b/src/Common/EnvironmentProxyConfigurationResolver.cpp index 3abaa4ec9e3..387674feaae 100644 --- a/src/Common/EnvironmentProxyConfigurationResolver.cpp +++ b/src/Common/EnvironmentProxyConfigurationResolver.cpp @@ -1,9 +1,6 @@ #include "EnvironmentProxyConfigurationResolver.h" -#include -#include #include -#include #include namespace DB diff --git a/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp b/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp index d161bb9f63e..4a8488d4055 100644 --- a/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp +++ b/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp @@ -23,3 +23,14 @@ TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuildMatchA ASSERT_EQ(poco_proxy_configuration.nonProxyHosts, "(.*?)"); } + +TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuildEmpty) +{ + DB::ProxyConfiguration proxy_configuration; + + proxy_configuration.no_proxy_hosts = ""; + + auto poco_proxy_configuration = DB::proxyConfigurationToPocoProxyConfig(proxy_configuration); + + 
ASSERT_EQ(poco_proxy_configuration.nonProxyHosts, ""); +} From c97f5cf3b7b33a06b86fa18049f069df46238860 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 3 May 2024 14:33:48 -0300 Subject: [PATCH 012/215] more basic tests --- .../ProxyConfigurationResolverProvider.cpp | 14 ++++++++++++-- src/Common/ProxyListConfigurationResolver.cpp | 10 +++++++--- src/Common/ProxyListConfigurationResolver.h | 7 ++++++- src/Common/RemoteProxyConfigurationResolver.cpp | 7 +++++-- src/Common/RemoteProxyConfigurationResolver.h | 4 +++- ...est_proxy_configuration_resolver_provider.cpp | 16 +++++++++++++++- 6 files changed, 48 insertions(+), 10 deletions(-) diff --git a/src/Common/ProxyConfigurationResolverProvider.cpp b/src/Common/ProxyConfigurationResolverProvider.cpp index d15b4d98615..360db80e360 100644 --- a/src/Common/ProxyConfigurationResolverProvider.cpp +++ b/src/Common/ProxyConfigurationResolverProvider.cpp @@ -17,6 +17,11 @@ namespace ErrorCodes namespace { + std::string getNoProxyHosts(const Poco::Util::AbstractConfiguration & configuration) + { + return configuration.getString("proxy.no_proxy", ""); + } + bool isTunnelingDisabledForHTTPSRequestsOverHTTPProxy( const Poco::Util::AbstractConfiguration & configuration) { @@ -49,7 +54,8 @@ namespace return std::make_shared( server_configuration, request_protocol, - isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration)); + isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration), + getNoProxyHosts(configuration)); } auto extractURIList(const String & config_prefix, const Poco::Util::AbstractConfiguration & configuration) @@ -87,7 +93,11 @@ namespace return uris.empty() ? nullptr - : std::make_shared(uris, request_protocol, isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration)); + : std::make_shared( + uris, + request_protocol, + isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration), + getNoProxyHosts(configuration)); } bool hasRemoteResolver(const String & config_prefix, const Poco::Util::AbstractConfiguration & configuration) diff --git a/src/Common/ProxyListConfigurationResolver.cpp b/src/Common/ProxyListConfigurationResolver.cpp index c9b8923929a..2560b3b8d1d 100644 --- a/src/Common/ProxyListConfigurationResolver.cpp +++ b/src/Common/ProxyListConfigurationResolver.cpp @@ -9,8 +9,11 @@ namespace DB ProxyListConfigurationResolver::ProxyListConfigurationResolver( std::vector proxies_, - Protocol request_protocol_, bool disable_tunneling_for_https_requests_over_http_proxy_) - : ProxyConfigurationResolver(request_protocol_, disable_tunneling_for_https_requests_over_http_proxy_), proxies(std::move(proxies_)) + Protocol request_protocol_, + bool disable_tunneling_for_https_requests_over_http_proxy_, + const std::string & no_proxy_hosts_) + : ProxyConfigurationResolver(request_protocol_, disable_tunneling_for_https_requests_over_http_proxy_), + proxies(std::move(proxies_)), no_proxy_hosts(no_proxy_hosts_) { } @@ -31,7 +34,8 @@ ProxyConfiguration ProxyListConfigurationResolver::resolve() ProxyConfiguration::protocolFromString(proxy.getScheme()), proxy.getPort(), useTunneling(request_protocol, ProxyConfiguration::protocolFromString(proxy.getScheme()), disable_tunneling_for_https_requests_over_http_proxy), - request_protocol + request_protocol, + no_proxy_hosts }; } diff --git a/src/Common/ProxyListConfigurationResolver.h b/src/Common/ProxyListConfigurationResolver.h index 95e0073d779..db69e7eb6c9 100644 --- a/src/Common/ProxyListConfigurationResolver.h +++ b/src/Common/ProxyListConfigurationResolver.h @@ -15,7 +15,11 
@@ namespace DB class ProxyListConfigurationResolver : public ProxyConfigurationResolver { public: - ProxyListConfigurationResolver(std::vector proxies_, Protocol request_protocol_, bool disable_tunneling_for_https_requests_over_http_proxy_ = false); + ProxyListConfigurationResolver( + std::vector proxies_, + Protocol request_protocol_, + bool disable_tunneling_for_https_requests_over_http_proxy_ = false, + const std::string & no_proxy_hosts_ = ""); ProxyConfiguration resolve() override; @@ -23,6 +27,7 @@ public: private: std::vector proxies; + std::string no_proxy_hosts; /// Access counter to get proxy using round-robin strategy. std::atomic access_counter; diff --git a/src/Common/RemoteProxyConfigurationResolver.cpp b/src/Common/RemoteProxyConfigurationResolver.cpp index ef972a8e318..064840d2e04 100644 --- a/src/Common/RemoteProxyConfigurationResolver.cpp +++ b/src/Common/RemoteProxyConfigurationResolver.cpp @@ -19,9 +19,11 @@ namespace ErrorCodes RemoteProxyConfigurationResolver::RemoteProxyConfigurationResolver( const RemoteServerConfiguration & remote_server_configuration_, Protocol request_protocol_, - bool disable_tunneling_for_https_requests_over_http_proxy_ + bool disable_tunneling_for_https_requests_over_http_proxy_, + const std::string & no_proxy_hosts_ ) -: ProxyConfigurationResolver(request_protocol_, disable_tunneling_for_https_requests_over_http_proxy_), remote_server_configuration(remote_server_configuration_) +: ProxyConfigurationResolver(request_protocol_, disable_tunneling_for_https_requests_over_http_proxy_), + remote_server_configuration(remote_server_configuration_), no_proxy_hosts(no_proxy_hosts_) { } @@ -105,6 +107,7 @@ ProxyConfiguration RemoteProxyConfigurationResolver::resolve() cached_config.port = proxy_port; cached_config.tunneling = use_tunneling_for_https_requests_over_http_proxy; cached_config.original_request_protocol = request_protocol; + cached_config.no_proxy_hosts = no_proxy_hosts; cache_timestamp = std::chrono::system_clock::now(); cache_valid = true; diff --git a/src/Common/RemoteProxyConfigurationResolver.h b/src/Common/RemoteProxyConfigurationResolver.h index 3275202215a..986fd233835 100644 --- a/src/Common/RemoteProxyConfigurationResolver.h +++ b/src/Common/RemoteProxyConfigurationResolver.h @@ -28,7 +28,8 @@ public: RemoteProxyConfigurationResolver( const RemoteServerConfiguration & remote_server_configuration_, Protocol request_protocol_, - bool disable_tunneling_for_https_requests_over_http_proxy_ = true); + bool disable_tunneling_for_https_requests_over_http_proxy_ = true, + const std::string & no_proxy_hosts_ = ""); ProxyConfiguration resolve() override; @@ -36,6 +37,7 @@ public: private: RemoteServerConfiguration remote_server_configuration; + std::string no_proxy_hosts; std::mutex cache_mutex; bool cache_valid = false; diff --git a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp index d5d6f86f661..246d0121255 100644 --- a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp +++ b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp @@ -31,9 +31,11 @@ Poco::URI https_env_proxy_server = Poco::URI("http://https_environment_proxy:312 Poco::URI http_list_proxy_server = Poco::URI("http://http_list_proxy:3128"); Poco::URI https_list_proxy_server = Poco::URI("http://https_list_proxy:3128"); +static std::string no_proxy_hosts = "localhost,,127.0.0.1,some_other_domain,,,, last_domain,"; + TEST_F(ProxyConfigurationResolverProviderTests, 
EnvironmentResolverShouldBeUsedIfNoSettings) { - EnvironmentProxySetter setter(http_env_proxy_server, https_env_proxy_server); + EnvironmentProxySetter setter(http_env_proxy_server, https_env_proxy_server, no_proxy_hosts); const auto & config = getContext().context->getConfigRef(); auto http_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, config)->resolve(); @@ -42,10 +44,12 @@ TEST_F(ProxyConfigurationResolverProviderTests, EnvironmentResolverShouldBeUsedI ASSERT_EQ(http_configuration.host, http_env_proxy_server.getHost()); ASSERT_EQ(http_configuration.port, http_env_proxy_server.getPort()); ASSERT_EQ(http_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_env_proxy_server.getScheme())); + ASSERT_EQ(http_configuration.no_proxy_hosts, no_proxy_hosts); ASSERT_EQ(https_configuration.host, https_env_proxy_server.getHost()); ASSERT_EQ(https_configuration.port, https_env_proxy_server.getPort()); ASSERT_EQ(https_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_env_proxy_server.getScheme())); + ASSERT_EQ(https_configuration.no_proxy_hosts, no_proxy_hosts); } TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPOnly) @@ -53,6 +57,7 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPOnly) ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); config->setString("proxy", ""); + config->setString("proxy.no_proxy", no_proxy_hosts); config->setString("proxy.http", ""); config->setString("proxy.http.uri", http_list_proxy_server.toString()); context->setConfig(config); @@ -62,18 +67,21 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPOnly) ASSERT_EQ(http_proxy_configuration.host, http_list_proxy_server.getHost()); ASSERT_EQ(http_proxy_configuration.port, http_list_proxy_server.getPort()); ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_list_proxy_server.getScheme())); + ASSERT_EQ(http_proxy_configuration.no_proxy_hosts, no_proxy_hosts); auto https_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config)->resolve(); // No https configuration since it's not set ASSERT_EQ(https_proxy_configuration.host, ""); ASSERT_EQ(https_proxy_configuration.port, 0); + ASSERT_EQ(https_proxy_configuration.no_proxy_hosts, ""); } TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPSOnly) { ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); + config->setString("proxy.no_proxy", no_proxy_hosts); config->setString("proxy", ""); config->setString("proxy.https", ""); config->setString("proxy.https.uri", https_list_proxy_server.toString()); @@ -91,12 +99,14 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPSOnly) // still HTTP because the proxy host is not HTTPS ASSERT_EQ(https_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_list_proxy_server.getScheme())); ASSERT_EQ(https_proxy_configuration.port, https_list_proxy_server.getPort()); + ASSERT_EQ(https_proxy_configuration.no_proxy_hosts, no_proxy_hosts); } TEST_F(ProxyConfigurationResolverProviderTests, ListBoth) { ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); + config->setString("proxy.no_proxy", no_proxy_hosts); config->setString("proxy", ""); config->setString("proxy.http", ""); config->setString("proxy.http.uri", http_list_proxy_server.toString()); @@ -112,6 +122,7 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListBoth) 
ASSERT_EQ(http_proxy_configuration.host, http_list_proxy_server.getHost()); ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_list_proxy_server.getScheme())); ASSERT_EQ(http_proxy_configuration.port, http_list_proxy_server.getPort()); + ASSERT_EQ(http_proxy_configuration.no_proxy_hosts, no_proxy_hosts); auto https_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config)->resolve(); @@ -120,6 +131,7 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListBoth) // still HTTP because the proxy host is not HTTPS ASSERT_EQ(https_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_list_proxy_server.getScheme())); ASSERT_EQ(https_proxy_configuration.port, https_list_proxy_server.getPort()); + ASSERT_EQ(https_proxy_configuration.no_proxy_hosts, no_proxy_hosts); } TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolConfigurationHTTP) @@ -157,6 +169,7 @@ TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolC ASSERT_EQ(http_proxy_configuration.host, http_env_proxy_server.getHost()); ASSERT_EQ(http_proxy_configuration.port, http_env_proxy_server.getPort()); ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_env_proxy_server.getScheme())); + ASSERT_EQ(http_proxy_configuration.no_proxy_hosts, no_proxy_hosts); } TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolConfigurationHTTPS) @@ -194,6 +207,7 @@ TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolC ASSERT_EQ(http_proxy_configuration.host, https_env_proxy_server.getHost()); ASSERT_EQ(http_proxy_configuration.port, https_env_proxy_server.getPort()); ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_env_proxy_server.getScheme())); + ASSERT_EQ(http_proxy_configuration.no_proxy_hosts, no_proxy_hosts); } // remote resolver is tricky to be tested in unit tests From 11dacf040ebebf5a4f268a99a34163a9a1fe7b5b Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Sat, 4 May 2024 11:03:40 -0300 Subject: [PATCH 013/215] fix tests --- ..._proxy_configuration_resolver_provider.cpp | 4 +- .../configs/config.d/proxy_list_no_proxy.xml | 9 +++ .../config.d/proxy_remote_no_proxy.xml | 18 ++++++ .../test.py | 34 +++++++++-- .../test.py | 56 +++++++++++++++---- 5 files changed, 103 insertions(+), 18 deletions(-) create mode 100644 tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_list_no_proxy.xml create mode 100644 tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_remote_no_proxy.xml diff --git a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp index 246d0121255..174e9564cee 100644 --- a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp +++ b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp @@ -149,6 +149,7 @@ TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolC ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); config->setString("proxy", ""); + config->setString("proxy.no_proxy", no_proxy_hosts); config->setString("proxy.https", ""); config->setString("proxy.https.resolver", ""); config->setString("proxy.https.resolver.endpoint", "http://resolver:8080/hostname"); @@ -169,7 +170,7 @@ TEST_F(ProxyConfigurationResolverProviderTests, 
RemoteResolverIsBasedOnProtocolC ASSERT_EQ(http_proxy_configuration.host, http_env_proxy_server.getHost()); ASSERT_EQ(http_proxy_configuration.port, http_env_proxy_server.getPort()); ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_env_proxy_server.getScheme())); - ASSERT_EQ(http_proxy_configuration.no_proxy_hosts, no_proxy_hosts); + ASSERT_EQ(http_proxy_configuration.no_proxy_hosts, ""); } TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolConfigurationHTTPS) @@ -244,4 +245,3 @@ TEST_F(ProxyConfigurationResolverProviderTests, TunnelingForHTTPSRequestsOverHTT test_tunneling(context); test_tunneling(context); } - diff --git a/tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_list_no_proxy.xml b/tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_list_no_proxy.xml new file mode 100644 index 00000000000..a1601153151 --- /dev/null +++ b/tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_list_no_proxy.xml @@ -0,0 +1,9 @@ + + + not_important_host,, minio1 , + + http://proxy1 + http://proxy2 + + + diff --git a/tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_remote_no_proxy.xml b/tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_remote_no_proxy.xml new file mode 100644 index 00000000000..6c16a65b154 --- /dev/null +++ b/tests/integration/test_s3_table_function_with_http_proxy/configs/config.d/proxy_remote_no_proxy.xml @@ -0,0 +1,18 @@ + + + not_important_host,, minio1 , + + + + http://resolver:8080/hostname + http + 80 + 10 + + + + diff --git a/tests/integration/test_s3_table_function_with_http_proxy/test.py b/tests/integration/test_s3_table_function_with_http_proxy/test.py index cc0d59e51ba..a935c5290f4 100644 --- a/tests/integration/test_s3_table_function_with_http_proxy/test.py +++ b/tests/integration/test_s3_table_function_with_http_proxy/test.py @@ -19,6 +19,15 @@ def cluster(): with_minio=True, ) + cluster.add_instance( + "remote_proxy_node_no_proxy", + main_configs=[ + "configs/config.d/proxy_remote_no_proxy.xml", + ], + instance_env_variables=True, + with_minio=True, + ) + cluster.add_instance( "proxy_list_node", main_configs=[ @@ -27,6 +36,15 @@ def cluster(): with_minio=True, ) + cluster.add_instance( + "proxy_list_node_no_proxy", + main_configs=[ + "configs/config.d/proxy_list_no_proxy.xml", + ], + instance_env_variables=True, + with_minio=True, + ) + cluster.add_instance( "env_node", with_minio=True, @@ -58,6 +76,18 @@ def cluster(): cluster.shutdown() +def test_s3_with_http_proxy_list_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1", "proxy2"], "http", "proxy_list_node_no_proxy") + + +def test_s3_with_http_remote_proxy_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "http", "remote_proxy_node_no_proxy") + + +def test_s3_with_http_env_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "http", "env_node_no_proxy") + + def test_s3_with_http_proxy_list(cluster): proxy_util.simple_test(cluster, ["proxy1", "proxy2"], "http", "proxy_list_node") @@ -68,7 +98,3 @@ def test_s3_with_http_remote_proxy(cluster): def test_s3_with_http_env_proxy(cluster): proxy_util.simple_test(cluster, ["proxy1"], "http", "env_node") - - -def test_s3_with_http_no_proxy(cluster): - proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "http", "env_node_no_proxy") diff --git 
a/tests/integration/test_s3_table_function_with_https_proxy/test.py b/tests/integration/test_s3_table_function_with_https_proxy/test.py index 87285415d85..b1adf2533c8 100644 --- a/tests/integration/test_s3_table_function_with_https_proxy/test.py +++ b/tests/integration/test_s3_table_function_with_https_proxy/test.py @@ -23,6 +23,18 @@ def cluster(): minio_certs_dir="minio_certs", ) + cluster.add_instance( + "remote_proxy_node_no_proxy", + main_configs=[ + "configs/config.d/proxy_remote.xml", + "configs/config.d/ssl.xml", + ], + env_variables={ + "no_proxy": "not_important_host,, minio1 ,", + }, + with_minio=True, + ) + cluster.add_instance( "proxy_list_node", main_configs=[ @@ -32,6 +44,18 @@ def cluster(): with_minio=True, ) + cluster.add_instance( + "proxy_list_node_no_proxy", + main_configs=[ + "configs/config.d/proxy_list.xml", + "configs/config.d/ssl.xml", + ], + env_variables={ + "no_proxy": "not_important_host,, minio1 ,", + }, + with_minio=True, + ) + cluster.add_instance( "env_node", main_configs=[ @@ -69,17 +93,25 @@ def cluster(): cluster.shutdown() -def test_s3_with_https_proxy_list(cluster): - proxy_util.simple_test(cluster, ["proxy1", "proxy2"], "https", "proxy_list_node") +def test_s3_with_https_proxy_list_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1", "proxy2"], "https", "proxy_list_node_no_proxy") + +# +# def test_s3_with_https_remote_proxy(cluster): +# proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "https", "remote_proxy_node_no_proxy") +# +# +# def test_s3_with_https_env_no_proxy(cluster): +# proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "https", "env_node_no_proxy") +# +# +# def test_s3_with_https_proxy_list(cluster): +# proxy_util.simple_test(cluster, ["proxy1", "proxy2"], "https", "proxy_list_node") +# +# +# def test_s3_with_https_remote_proxy(cluster): +# proxy_util.simple_test(cluster, ["proxy1"], "https", "remote_proxy_node") -def test_s3_with_https_remote_proxy(cluster): - proxy_util.simple_test(cluster, ["proxy1"], "https", "remote_proxy_node") - - -def test_s3_with_https_env_proxy(cluster): - proxy_util.simple_test(cluster, ["proxy1"], "https", "env_node") - - -def test_s3_with_https_no_proxy(cluster): - proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "https", "env_node_no_proxy") +# def test_s3_with_https_env_proxy(cluster): +# proxy_util.simple_test(cluster, ["proxy1"], "https", "env_node") From db21c2cf0a8e267b97675c4f2269f5e30ab163d6 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Sat, 4 May 2024 11:12:18 -0300 Subject: [PATCH 014/215] fix tests --- .../configs/config.d/proxy_list_no_proxy.xml | 13 ++++++ .../config.d/proxy_remote_no_proxy.xml | 18 ++++++++ .../test.py | 44 ++++++++----------- 3 files changed, 50 insertions(+), 25 deletions(-) create mode 100644 tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_list_no_proxy.xml create mode 100644 tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_remote_no_proxy.xml diff --git a/tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_list_no_proxy.xml b/tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_list_no_proxy.xml new file mode 100644 index 00000000000..0a03986f839 --- /dev/null +++ b/tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_list_no_proxy.xml @@ -0,0 +1,13 @@ + + + not_important_host,, minio1 , + + http://proxy1 + http://proxy2 + + + https://proxy1 + https://proxy2 + + + 
diff --git a/tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_remote_no_proxy.xml b/tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_remote_no_proxy.xml new file mode 100644 index 00000000000..943f2b36a34 --- /dev/null +++ b/tests/integration/test_s3_table_function_with_https_proxy/configs/config.d/proxy_remote_no_proxy.xml @@ -0,0 +1,18 @@ + + + not_important_host,, minio1 , + + + + http://resolver:8080/hostname + https + 443 + 10 + + + + diff --git a/tests/integration/test_s3_table_function_with_https_proxy/test.py b/tests/integration/test_s3_table_function_with_https_proxy/test.py index b1adf2533c8..2cce74565d7 100644 --- a/tests/integration/test_s3_table_function_with_https_proxy/test.py +++ b/tests/integration/test_s3_table_function_with_https_proxy/test.py @@ -26,12 +26,9 @@ def cluster(): cluster.add_instance( "remote_proxy_node_no_proxy", main_configs=[ - "configs/config.d/proxy_remote.xml", + "configs/config.d/proxy_remote_no_proxy.xml", "configs/config.d/ssl.xml", ], - env_variables={ - "no_proxy": "not_important_host,, minio1 ,", - }, with_minio=True, ) @@ -47,12 +44,9 @@ def cluster(): cluster.add_instance( "proxy_list_node_no_proxy", main_configs=[ - "configs/config.d/proxy_list.xml", + "configs/config.d/proxy_list_no_proxy.xml", "configs/config.d/ssl.xml", ], - env_variables={ - "no_proxy": "not_important_host,, minio1 ,", - }, with_minio=True, ) @@ -96,22 +90,22 @@ def cluster(): def test_s3_with_https_proxy_list_no_proxy(cluster): proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1", "proxy2"], "https", "proxy_list_node_no_proxy") -# -# def test_s3_with_https_remote_proxy(cluster): -# proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "https", "remote_proxy_node_no_proxy") -# -# -# def test_s3_with_https_env_no_proxy(cluster): -# proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "https", "env_node_no_proxy") -# -# -# def test_s3_with_https_proxy_list(cluster): -# proxy_util.simple_test(cluster, ["proxy1", "proxy2"], "https", "proxy_list_node") -# -# -# def test_s3_with_https_remote_proxy(cluster): -# proxy_util.simple_test(cluster, ["proxy1"], "https", "remote_proxy_node") + +def test_s3_with_https_env_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "https", "env_node_no_proxy") -# def test_s3_with_https_env_proxy(cluster): -# proxy_util.simple_test(cluster, ["proxy1"], "https", "env_node") +def test_s3_with_https_remote_no_proxy(cluster): + proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "https", "remote_proxy_node_no_proxy") + + +def test_s3_with_https_proxy_list(cluster): + proxy_util.simple_test(cluster, ["proxy1", "proxy2"], "https", "proxy_list_node") + + +def test_s3_with_https_remote_proxy(cluster): + proxy_util.simple_test(cluster, ["proxy1"], "https", "remote_proxy_node") + + +def test_s3_with_https_env_proxy(cluster): + proxy_util.simple_test(cluster, ["proxy1"], "https", "env_node") From 96b0d878434a3d6ddf0c98d7703b139b979aa0d4 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 6 May 2024 10:38:10 -0300 Subject: [PATCH 015/215] first impl for subdomains --- .../proxyConfigurationToPocoProxyConfig.cpp | 35 +++++++++++++++++-- ...oxy_configuration_to_poco_proxy_config.cpp | 2 +- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/src/Common/proxyConfigurationToPocoProxyConfig.cpp b/src/Common/proxyConfigurationToPocoProxyConfig.cpp index ec2a5ba19bd..3e2642d1600 100644 --- 
a/src/Common/proxyConfigurationToPocoProxyConfig.cpp +++ b/src/Common/proxyConfigurationToPocoProxyConfig.cpp @@ -17,6 +17,25 @@ namespace DB { +namespace +{ + +/* + * Copy `curl` behavior instead of `wget` as it seems to be more flexible. + * `curl` strips leading dot and accepts url gitlab.com as a match for no_proxy .gitlab.com, + * while `wget` does an exact match. + * */ +std::string buildPocoRegexpEntryWithoutLeadingDot(const std::string & host) +{ + std::string_view view_without_leading_dot = host; + if (host[0] == '.') + { + view_without_leading_dot = std::string_view {host.begin() + 1u, host.end()}; + } + + return RE2::QuoteMeta(view_without_leading_dot); +} + /* * Even though there is not an RFC that defines NO_PROXY, it is usually a comma-separated list of domains. * Different tools implement their own versions of `NO_PROXY` support. Some support CIDR blocks, some support wildcard etc. @@ -31,11 +50,15 @@ namespace DB * */ std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts) { + static constexpr auto OR_SEPARATOR = "|"; + static constexpr auto MATCH_ANYTHING = R"((.*?))"; + static constexpr auto MATCH_SUBDOMAINS_REGEX = R"((?:.*\.)?)"; + bool match_any_host = no_proxy_hosts.size() == 1 && no_proxy_hosts[0] == '*'; if (match_any_host) { - return "(.*?)"; + return MATCH_ANYTHING; } std::string host; @@ -52,10 +75,14 @@ std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts) { if (!first) { - result.append("|"); + result.append(OR_SEPARATOR); } - result.append(RE2::QuoteMeta(host)); + auto escaped_host_without_leading_dot = buildPocoRegexpEntryWithoutLeadingDot(host); + + result.append(MATCH_SUBDOMAINS_REGEX); + result.append(escaped_host_without_leading_dot); + first = false; } } @@ -63,6 +90,8 @@ std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts) return result; } +} + Poco::Net::HTTPClientSession::ProxyConfig proxyConfigurationToPocoProxyConfig(const DB::ProxyConfiguration & proxy_configuration) { Poco::Net::HTTPClientSession::ProxyConfig poco_proxy_config; diff --git a/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp b/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp index 4a8488d4055..db87f23fc65 100644 --- a/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp +++ b/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp @@ -10,7 +10,7 @@ TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuild) auto poco_proxy_configuration = DB::proxyConfigurationToPocoProxyConfig(proxy_configuration); - ASSERT_EQ(poco_proxy_configuration.nonProxyHosts, "localhost|127\\.0\\.0\\.1|some_other_domain\\:8080|sub\\-domain\\.domain\\.com"); + ASSERT_EQ(poco_proxy_configuration.nonProxyHosts, R"((?:.*\.)?localhost|(?:.*\.)?127\.0\.0\.1|(?:.*\.)?some_other_domain\:8080|(?:.*\.)?sub\-domain\.domain\.com)"); } TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuildMatchAnything) From c3a4338d3b427ef8638e5ba6033eee4571eeb281 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 6 May 2024 10:43:25 -0300 Subject: [PATCH 016/215] fix tests --- src/Common/tests/gtest_helper_functions.h | 1 + .../tests/gtest_proxy_configuration_resolver_provider.cpp | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Common/tests/gtest_helper_functions.h b/src/Common/tests/gtest_helper_functions.h index e3aeea407a1..1e5c2b21d99 100644 --- a/src/Common/tests/gtest_helper_functions.h +++ b/src/Common/tests/gtest_helper_functions.h @@ -101,5 +101,6 @@ 
struct EnvironmentProxySetter { unsetenv("http_proxy"); // NOLINT(concurrency-mt-unsafe) unsetenv("https_proxy"); // NOLINT(concurrency-mt-unsafe) + unsetenv("no_proxy"); // NOLINT(concurrency-mt-unsafe) } }; diff --git a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp index 174e9564cee..6e19520ca22 100644 --- a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp +++ b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp @@ -208,7 +208,7 @@ TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolC ASSERT_EQ(http_proxy_configuration.host, https_env_proxy_server.getHost()); ASSERT_EQ(http_proxy_configuration.port, https_env_proxy_server.getPort()); ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_env_proxy_server.getScheme())); - ASSERT_EQ(http_proxy_configuration.no_proxy_hosts, no_proxy_hosts); + ASSERT_EQ(http_proxy_configuration.no_proxy_hosts, ""); } // remote resolver is tricky to be tested in unit tests From b7096609692098886b050d36e84897792d2872df Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 6 May 2024 16:53:36 -0300 Subject: [PATCH 017/215] update doc and use early return --- .../proxyConfigurationToPocoProxyConfig.cpp | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/src/Common/proxyConfigurationToPocoProxyConfig.cpp b/src/Common/proxyConfigurationToPocoProxyConfig.cpp index 3e2642d1600..f14d586677a 100644 --- a/src/Common/proxyConfigurationToPocoProxyConfig.cpp +++ b/src/Common/proxyConfigurationToPocoProxyConfig.cpp @@ -41,9 +41,10 @@ std::string buildPocoRegexpEntryWithoutLeadingDot(const std::string & host) * Different tools implement their own versions of `NO_PROXY` support. Some support CIDR blocks, some support wildcard etc. 
* Opting for a simple implementation that covers most use cases: * * Support only single wildcard * (match anything) + * * Match subdomains + * * Strip leading dots * * No regex * * No CIDR blocks - * * No leading dot strip * * No fancy stuff about loopback IPs * https://about.gitlab.com/blog/2021/01/27/we-need-to-talk-no-proxy/ * Open for discussions @@ -71,20 +72,22 @@ std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts) { trim(host); - if (!host.empty()) + if (host.empty()) { - if (!first) - { - result.append(OR_SEPARATOR); - } - - auto escaped_host_without_leading_dot = buildPocoRegexpEntryWithoutLeadingDot(host); - - result.append(MATCH_SUBDOMAINS_REGEX); - result.append(escaped_host_without_leading_dot); - - first = false; + continue; } + + if (!first) + { + result.append(OR_SEPARATOR); + } + + auto escaped_host_without_leading_dot = buildPocoRegexpEntryWithoutLeadingDot(host); + + result.append(MATCH_SUBDOMAINS_REGEX); + result.append(escaped_host_without_leading_dot); + + first = false; } return result; From 9aad50f35dfa325a3968707ec3d7af94e9b5af10 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 6 May 2024 17:02:20 -0300 Subject: [PATCH 018/215] reorder args and adapt tests --- src/Common/ProxyConfigurationResolverProvider.cpp | 8 ++++---- src/Common/ProxyListConfigurationResolver.cpp | 4 ++-- src/Common/ProxyListConfigurationResolver.h | 4 ++-- src/Common/RemoteProxyConfigurationResolver.cpp | 4 ++-- src/Common/RemoteProxyConfigurationResolver.h | 4 ++-- ...gtest_proxy_configuration_resolver_provider.cpp | 2 +- .../gtest_proxy_list_configuration_resolver.cpp | 14 +++++++++++--- 7 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/Common/ProxyConfigurationResolverProvider.cpp b/src/Common/ProxyConfigurationResolverProvider.cpp index 360db80e360..a486304ce80 100644 --- a/src/Common/ProxyConfigurationResolverProvider.cpp +++ b/src/Common/ProxyConfigurationResolverProvider.cpp @@ -54,8 +54,8 @@ namespace return std::make_shared( server_configuration, request_protocol, - isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration), - getNoProxyHosts(configuration)); + getNoProxyHosts(configuration), + isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration)); } auto extractURIList(const String & config_prefix, const Poco::Util::AbstractConfiguration & configuration) @@ -96,8 +96,8 @@ namespace : std::make_shared( uris, request_protocol, - isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration), - getNoProxyHosts(configuration)); + getNoProxyHosts(configuration), + isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration)); } bool hasRemoteResolver(const String & config_prefix, const Poco::Util::AbstractConfiguration & configuration) diff --git a/src/Common/ProxyListConfigurationResolver.cpp b/src/Common/ProxyListConfigurationResolver.cpp index 2560b3b8d1d..985bdf001b1 100644 --- a/src/Common/ProxyListConfigurationResolver.cpp +++ b/src/Common/ProxyListConfigurationResolver.cpp @@ -10,8 +10,8 @@ namespace DB ProxyListConfigurationResolver::ProxyListConfigurationResolver( std::vector proxies_, Protocol request_protocol_, - bool disable_tunneling_for_https_requests_over_http_proxy_, - const std::string & no_proxy_hosts_) + const std::string & no_proxy_hosts_, + bool disable_tunneling_for_https_requests_over_http_proxy_) : ProxyConfigurationResolver(request_protocol_, disable_tunneling_for_https_requests_over_http_proxy_), proxies(std::move(proxies_)), no_proxy_hosts(no_proxy_hosts_) { diff --git 
a/src/Common/ProxyListConfigurationResolver.h b/src/Common/ProxyListConfigurationResolver.h index db69e7eb6c9..a87826792d4 100644 --- a/src/Common/ProxyListConfigurationResolver.h +++ b/src/Common/ProxyListConfigurationResolver.h @@ -18,8 +18,8 @@ public: ProxyListConfigurationResolver( std::vector proxies_, Protocol request_protocol_, - bool disable_tunneling_for_https_requests_over_http_proxy_ = false, - const std::string & no_proxy_hosts_ = ""); + const std::string & no_proxy_hosts_, + bool disable_tunneling_for_https_requests_over_http_proxy_ = false); ProxyConfiguration resolve() override; diff --git a/src/Common/RemoteProxyConfigurationResolver.cpp b/src/Common/RemoteProxyConfigurationResolver.cpp index 064840d2e04..06587ab7f53 100644 --- a/src/Common/RemoteProxyConfigurationResolver.cpp +++ b/src/Common/RemoteProxyConfigurationResolver.cpp @@ -19,8 +19,8 @@ namespace ErrorCodes RemoteProxyConfigurationResolver::RemoteProxyConfigurationResolver( const RemoteServerConfiguration & remote_server_configuration_, Protocol request_protocol_, - bool disable_tunneling_for_https_requests_over_http_proxy_, - const std::string & no_proxy_hosts_ + const std::string & no_proxy_hosts_, + bool disable_tunneling_for_https_requests_over_http_proxy_ ) : ProxyConfigurationResolver(request_protocol_, disable_tunneling_for_https_requests_over_http_proxy_), remote_server_configuration(remote_server_configuration_), no_proxy_hosts(no_proxy_hosts_) diff --git a/src/Common/RemoteProxyConfigurationResolver.h b/src/Common/RemoteProxyConfigurationResolver.h index 986fd233835..f9add6f0769 100644 --- a/src/Common/RemoteProxyConfigurationResolver.h +++ b/src/Common/RemoteProxyConfigurationResolver.h @@ -28,8 +28,8 @@ public: RemoteProxyConfigurationResolver( const RemoteServerConfiguration & remote_server_configuration_, Protocol request_protocol_, - bool disable_tunneling_for_https_requests_over_http_proxy_ = true, - const std::string & no_proxy_hosts_ = ""); + const std::string & no_proxy_hosts_, + bool disable_tunneling_for_https_requests_over_http_proxy_ = true); ProxyConfiguration resolve() override; diff --git a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp index 6e19520ca22..d248835699a 100644 --- a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp +++ b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp @@ -31,7 +31,7 @@ Poco::URI https_env_proxy_server = Poco::URI("http://https_environment_proxy:312 Poco::URI http_list_proxy_server = Poco::URI("http://http_list_proxy:3128"); Poco::URI https_list_proxy_server = Poco::URI("http://https_list_proxy:3128"); -static std::string no_proxy_hosts = "localhost,,127.0.0.1,some_other_domain,,,, last_domain,"; +static std::string no_proxy_hosts = "localhost,,127.0.0.1,some_other_domain,,,, sub-domain.domain.com,"; TEST_F(ProxyConfigurationResolverProviderTests, EnvironmentResolverShouldBeUsedIfNoSettings) { diff --git a/src/Common/tests/gtest_proxy_list_configuration_resolver.cpp b/src/Common/tests/gtest_proxy_list_configuration_resolver.cpp index 3234fe0ccd1..5d8268eb206 100644 --- a/src/Common/tests/gtest_proxy_list_configuration_resolver.cpp +++ b/src/Common/tests/gtest_proxy_list_configuration_resolver.cpp @@ -10,6 +10,8 @@ namespace { auto proxy_server1 = Poco::URI("http://proxy_server1:3128"); auto proxy_server2 = Poco::URI("http://proxy_server2:3128"); + + std::string no_proxy_hosts = "localhost,,127.0.0.1,some_other_domain,,,, 
sub-domain.domain.com,"; } TEST(ProxyListConfigurationResolver, SimpleTest) @@ -17,7 +19,8 @@ TEST(ProxyListConfigurationResolver, SimpleTest) ProxyListConfigurationResolver resolver( {proxy_server1, proxy_server2}, - ProxyConfiguration::Protocol::HTTP); + ProxyConfiguration::Protocol::HTTP, + no_proxy_hosts); auto configuration1 = resolver.resolve(); auto configuration2 = resolver.resolve(); @@ -25,10 +28,12 @@ TEST(ProxyListConfigurationResolver, SimpleTest) ASSERT_EQ(configuration1.host, proxy_server1.getHost()); ASSERT_EQ(configuration1.port, proxy_server1.getPort()); ASSERT_EQ(configuration1.protocol, ProxyConfiguration::protocolFromString(proxy_server1.getScheme())); + ASSERT_EQ(configuration1.no_proxy_hosts, no_proxy_hosts); ASSERT_EQ(configuration2.host, proxy_server2.getHost()); ASSERT_EQ(configuration2.port, proxy_server2.getPort()); ASSERT_EQ(configuration2.protocol, ProxyConfiguration::protocolFromString(proxy_server2.getScheme())); + ASSERT_EQ(configuration2.no_proxy_hosts, no_proxy_hosts); } TEST(ProxyListConfigurationResolver, HTTPSRequestsOverHTTPProxyDefault) @@ -36,7 +41,8 @@ TEST(ProxyListConfigurationResolver, HTTPSRequestsOverHTTPProxyDefault) ProxyListConfigurationResolver resolver( {proxy_server1, proxy_server2}, - ProxyConfiguration::Protocol::HTTPS); + ProxyConfiguration::Protocol::HTTPS, + ""); auto configuration1 = resolver.resolve(); auto configuration2 = resolver.resolve(); @@ -45,11 +51,12 @@ TEST(ProxyListConfigurationResolver, HTTPSRequestsOverHTTPProxyDefault) ASSERT_EQ(configuration1.port, proxy_server1.getPort()); ASSERT_EQ(configuration1.protocol, ProxyConfiguration::protocolFromString(proxy_server1.getScheme())); ASSERT_EQ(configuration1.tunneling, true); + ASSERT_EQ(configuration1.no_proxy_hosts, ""); ASSERT_EQ(configuration2.host, proxy_server2.getHost()); ASSERT_EQ(configuration2.port, proxy_server2.getPort()); ASSERT_EQ(configuration2.protocol, ProxyConfiguration::protocolFromString(proxy_server2.getScheme())); - ASSERT_EQ(configuration1.tunneling, true); + ASSERT_EQ(configuration2.no_proxy_hosts, ""); } TEST(ProxyListConfigurationResolver, SimpleTestTunnelingDisabled) @@ -58,6 +65,7 @@ TEST(ProxyListConfigurationResolver, SimpleTestTunnelingDisabled) ProxyListConfigurationResolver resolver( {proxy_server1, proxy_server2}, ProxyConfiguration::Protocol::HTTPS, + "", disable_tunneling_for_https_requests_over_http_proxy); auto configuration1 = resolver.resolve(); From eba4b1d558797153a3d41f3304ca8465712f8065 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 6 May 2024 17:15:51 -0300 Subject: [PATCH 019/215] docs for no_proxy --- .../settings.md | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 28831404a1f..4f8a82805af 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -2875,6 +2875,8 @@ Define proxy servers for HTTP and HTTPS requests, currently supported by S3 stor There are three ways to define proxy servers: environment variables, proxy lists, and remote proxy resolvers. +Bypassing proxy servers for specific hosts is also supported with the use of `no_proxy`. + ### Environment variables The `http_proxy` and `https_proxy` environment variables allow you to specify a @@ -2984,6 +2986,29 @@ This also allows a mix of resolver types can be used. 
By default, tunneling (i.e, `HTTP CONNECT`) is used to make `HTTPS` requests over `HTTP` proxy. This setting can be used to disable it. +### no_proxy +By default, all requests will go through the proxy. In order to disable it for specific hosts, the `no_proxy` variable must be set. +It can be set inside the `` clause for list and remote resolvers and as an environment variable for environment resolver. +It supports IP addresses, domains, subdomains and `'*'` wildcard for full bypass. Leading dots are stripped just like curl does. + +Example: + +The below configuration bypasses proxy requests to `clickhouse.cloud` and all of its subdomains (e.g, `auth.clickhouse.cloud`). +The same applies to gitlab, even though it has a leading dot. Both `gitlab.com` and `about.gitlab.com` would bypass the proxy. + +``` xml + + clickhouse.cloud,.gitlab.com + + http://proxy1 + http://proxy2:3128 + + + http://proxy1:3128 + + +``` + ## max_materialized_views_count_for_table {#max_materialized_views_count_for_table} A limit on the number of materialized views attached to a table. From 5eeadb6c0704f5b41407ed6f26deac9d297e720f Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 8 May 2024 10:55:26 +0200 Subject: [PATCH 020/215] Support running function for parameterized view value assignment --- src/Interpreters/Context.cpp | 4 ++-- src/Parsers/FunctionParameterValuesVisitor.cpp | 14 +++++++++++--- src/Parsers/FunctionParameterValuesVisitor.h | 3 ++- .../03146_parameterized_view_with_date.reference | 1 + .../03146_parameterized_view_with_date.sql | 12 ++++++++++++ 5 files changed, 28 insertions(+), 6 deletions(-) create mode 100644 tests/queries/0_stateless/03146_parameterized_view_with_date.reference create mode 100644 tests/queries/0_stateless/03146_parameterized_view_with_date.sql diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 44d36e94441..4d84da36f78 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -1840,7 +1840,7 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const if (table.get()->isView() && table->as() && table->as()->isParameterizedView()) { auto query = table->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); - NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression); + NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, getQueryContext()); StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); ASTCreateQuery create; @@ -2054,7 +2054,7 @@ StoragePtr Context::buildParametrizedViewStorage(const ASTPtr & table_expression return nullptr; auto query = original_view->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); - NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression); + NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_expression, getQueryContext()); StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); ASTCreateQuery create; diff --git a/src/Parsers/FunctionParameterValuesVisitor.cpp b/src/Parsers/FunctionParameterValuesVisitor.cpp index 3692a4c73e5..e791e07cdfb 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.cpp +++ b/src/Parsers/FunctionParameterValuesVisitor.cpp @@ -7,6 +7,7 @@ #include #include #include +#include namespace DB @@ -20,8 +21,9 @@ namespace ErrorCodes class FunctionParameterValuesVisitor { public: - explicit FunctionParameterValuesVisitor(NameToNameMap & 
parameter_values_) + explicit FunctionParameterValuesVisitor(NameToNameMap & parameter_values_, ContextPtr context_) : parameter_values(parameter_values_) + ,context(context_) { } @@ -35,6 +37,7 @@ public: private: NameToNameMap & parameter_values; + ContextPtr context; void visitFunction(const ASTFunction & parameter_function) { @@ -64,15 +67,20 @@ private: parameter_values[identifier->name()] = convertFieldToString(cast_literal->value); } } + else + { + ASTPtr res = evaluateConstantExpressionOrIdentifierAsLiteral(expression_list->children[1], context); + parameter_values[identifier->name()] = convertFieldToString(res->as()->value); + } } } } }; -NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast) +NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast, ContextPtr context) { NameToNameMap parameter_values; - FunctionParameterValuesVisitor(parameter_values).visit(ast); + FunctionParameterValuesVisitor(parameter_values, context).visit(ast); return parameter_values; } diff --git a/src/Parsers/FunctionParameterValuesVisitor.h b/src/Parsers/FunctionParameterValuesVisitor.h index e6ce0e42d06..8c2686dcc65 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.h +++ b/src/Parsers/FunctionParameterValuesVisitor.h @@ -2,12 +2,13 @@ #include #include +#include namespace DB { /// Find parameters in a query parameter values and collect them into map. -NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast); +NameToNameMap analyzeFunctionParamValues(const ASTPtr & ast, ContextPtr context); } diff --git a/tests/queries/0_stateless/03146_parameterized_view_with_date.reference b/tests/queries/0_stateless/03146_parameterized_view_with_date.reference new file mode 100644 index 00000000000..1d6227dbbcb --- /dev/null +++ b/tests/queries/0_stateless/03146_parameterized_view_with_date.reference @@ -0,0 +1 @@ +2 2024-04-01 01:00:00 diff --git a/tests/queries/0_stateless/03146_parameterized_view_with_date.sql b/tests/queries/0_stateless/03146_parameterized_view_with_date.sql new file mode 100644 index 00000000000..53022e969ab --- /dev/null +++ b/tests/queries/0_stateless/03146_parameterized_view_with_date.sql @@ -0,0 +1,12 @@ + +drop table if exists table_pv; +create table table_pv (id Int32, timestamp_field DateTime) engine = Memory(); + +insert into table_pv values(1, '2024-03-01 00:00:00'); +insert into table_pv values (2, '2024-04-01 01:00:00'); + +create view pv as select * from table_pv where timestamp_field > {timestamp_param:DateTime}; + +select * from pv (timestamp_param=toDateTime('2024-04-01 00:00:01')); + +drop table table_pv; From f7d66166ab13dee86bc6f3442707dba2a62bf61f Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 8 May 2024 16:13:04 +0200 Subject: [PATCH 021/215] Bump From 6a2aa299464d1cbf5ee2fc112813dc560c34c838 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 10 May 2024 17:20:36 -0300 Subject: [PATCH 022/215] don't use stringstream --- .../settings.md | 2 +- .../proxyConfigurationToPocoProxyConfig.cpp | 11 ++++++----- .../helpers/s3_url_proxy_tests_util.py | 19 ++++++++++--------- .../test.py | 12 +++++++++--- .../test.py | 12 +++++++++--- 5 files changed, 35 insertions(+), 21 deletions(-) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 4f8a82805af..5fee3c6db5f 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -2994,7 +2994,7 @@ It supports IP addresses, 
domains, subdomains and `'*'` wildcard for full bypass Example: The below configuration bypasses proxy requests to `clickhouse.cloud` and all of its subdomains (e.g, `auth.clickhouse.cloud`). -The same applies to gitlab, even though it has a leading dot. Both `gitlab.com` and `about.gitlab.com` would bypass the proxy. +The same applies to GitLab, even though it has a leading dot. Both `gitlab.com` and `about.gitlab.com` would bypass the proxy. ``` xml diff --git a/src/Common/proxyConfigurationToPocoProxyConfig.cpp b/src/Common/proxyConfigurationToPocoProxyConfig.cpp index f14d586677a..c221dd394ca 100644 --- a/src/Common/proxyConfigurationToPocoProxyConfig.cpp +++ b/src/Common/proxyConfigurationToPocoProxyConfig.cpp @@ -3,6 +3,7 @@ #include #include +#include #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" @@ -49,26 +50,26 @@ std::string buildPocoRegexpEntryWithoutLeadingDot(const std::string & host) * https://about.gitlab.com/blog/2021/01/27/we-need-to-talk-no-proxy/ * Open for discussions * */ -std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts) +std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts_string) { static constexpr auto OR_SEPARATOR = "|"; static constexpr auto MATCH_ANYTHING = R"((.*?))"; static constexpr auto MATCH_SUBDOMAINS_REGEX = R"((?:.*\.)?)"; - bool match_any_host = no_proxy_hosts.size() == 1 && no_proxy_hosts[0] == '*'; + bool match_any_host = no_proxy_hosts_string.size() == 1 && no_proxy_hosts_string[0] == '*'; if (match_any_host) { return MATCH_ANYTHING; } - std::string host; - std::istringstream no_proxy_stream(no_proxy_hosts); + std::vector no_proxy_hosts; + splitInto<','>(no_proxy_hosts, no_proxy_hosts_string); bool first = true; std::string result; - while (std::getline(no_proxy_stream, host, ',')) + for (auto & host : no_proxy_hosts) { trim(host); diff --git a/tests/integration/helpers/s3_url_proxy_tests_util.py b/tests/integration/helpers/s3_url_proxy_tests_util.py index 16df446b0f7..7d45dcdec33 100644 --- a/tests/integration/helpers/s3_url_proxy_tests_util.py +++ b/tests/integration/helpers/s3_url_proxy_tests_util.py @@ -3,7 +3,7 @@ import time def has_any_proxy_related_logs( -cluster, proxy_instance, protocol, bucket, http_methods={"POST", "PUT", "GET"} + cluster, proxy_instance, protocol, bucket, http_methods={"POST", "PUT", "GET"} ): for i in range(10): logs = cluster.get_container_logs(proxy_instance) @@ -80,7 +80,7 @@ def perform_simple_queries(node, minio_endpoint): def simple_test(cluster, proxies, protocol, bucket): minio_endpoint = build_s3_endpoint(protocol, bucket) - node = cluster.instances[f"{bucket}"] + node = cluster.instances[bucket] perform_simple_queries(node, minio_endpoint) @@ -90,11 +90,12 @@ def simple_test(cluster, proxies, protocol, bucket): def simple_test_assert_no_proxy(cluster, proxies, protocol, bucket): - minio_endpoint = build_s3_endpoint(protocol, bucket) - node = cluster.instances[f"{bucket}"] + minio_endpoint = build_s3_endpoint(protocol, bucket) + node = cluster.instances[bucket] + perform_simple_queries(node, minio_endpoint) - perform_simple_queries(node, minio_endpoint) - - for proxy in proxies: - no_proxy_logs = not has_any_proxy_related_logs(cluster, proxy, protocol, bucket) - assert no_proxy_logs, f"Found proxy logs in {proxy} and it should not have found it" + for proxy in proxies: + no_proxy_logs = not has_any_proxy_related_logs(cluster, proxy, protocol, bucket) + assert ( + no_proxy_logs + ), f"Found proxy logs in {proxy} and it 
should not have found it" diff --git a/tests/integration/test_s3_table_function_with_http_proxy/test.py b/tests/integration/test_s3_table_function_with_http_proxy/test.py index a935c5290f4..761057ca6c8 100644 --- a/tests/integration/test_s3_table_function_with_http_proxy/test.py +++ b/tests/integration/test_s3_table_function_with_http_proxy/test.py @@ -77,15 +77,21 @@ def cluster(): def test_s3_with_http_proxy_list_no_proxy(cluster): - proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1", "proxy2"], "http", "proxy_list_node_no_proxy") + proxy_util.simple_test_assert_no_proxy( + cluster, ["proxy1", "proxy2"], "http", "proxy_list_node_no_proxy" + ) def test_s3_with_http_remote_proxy_no_proxy(cluster): - proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "http", "remote_proxy_node_no_proxy") + proxy_util.simple_test_assert_no_proxy( + cluster, ["proxy1"], "http", "remote_proxy_node_no_proxy" + ) def test_s3_with_http_env_no_proxy(cluster): - proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "http", "env_node_no_proxy") + proxy_util.simple_test_assert_no_proxy( + cluster, ["proxy1"], "http", "env_node_no_proxy" + ) def test_s3_with_http_proxy_list(cluster): diff --git a/tests/integration/test_s3_table_function_with_https_proxy/test.py b/tests/integration/test_s3_table_function_with_https_proxy/test.py index 2cce74565d7..26656447d87 100644 --- a/tests/integration/test_s3_table_function_with_https_proxy/test.py +++ b/tests/integration/test_s3_table_function_with_https_proxy/test.py @@ -88,15 +88,21 @@ def cluster(): def test_s3_with_https_proxy_list_no_proxy(cluster): - proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1", "proxy2"], "https", "proxy_list_node_no_proxy") + proxy_util.simple_test_assert_no_proxy( + cluster, ["proxy1", "proxy2"], "https", "proxy_list_node_no_proxy" + ) def test_s3_with_https_env_no_proxy(cluster): - proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "https", "env_node_no_proxy") + proxy_util.simple_test_assert_no_proxy( + cluster, ["proxy1"], "https", "env_node_no_proxy" + ) def test_s3_with_https_remote_no_proxy(cluster): - proxy_util.simple_test_assert_no_proxy(cluster, ["proxy1"], "https", "remote_proxy_node_no_proxy") + proxy_util.simple_test_assert_no_proxy( + cluster, ["proxy1"], "https", "remote_proxy_node_no_proxy" + ) def test_s3_with_https_proxy_list(cluster): From 01ef03f8555e232ed4265824262f8c22b289c7a4 Mon Sep 17 00:00:00 2001 From: tomershafir Date: Mon, 13 May 2024 17:11:14 +0300 Subject: [PATCH 023/215] io_uring: improve resumbits visibility --- src/Common/ProfileEvents.cpp | 3 ++- src/Coordination/KeeperConstants.cpp | 3 ++- src/Disks/IO/IOUringReader.cpp | 12 ++++++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index ed0b29c7b44..34abce6b29e 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -631,7 +631,8 @@ The server successfully detected this situation and will download merged part fr \ M(ServerStartupMilliseconds, "Time elapsed from starting server to listening to sockets in milliseconds")\ M(IOUringSQEsSubmitted, "Total number of io_uring SQEs submitted") \ - M(IOUringSQEsResubmits, "Total number of io_uring SQE resubmits performed") \ + M(IOUringSQEsResubmitsAsync, "Total number of asychronous io_uring SQE resubmits performed") \ + M(IOUringSQEsResubmitsSync, "Total number of synchronous io_uring SQE resubmits performed") \ M(IOUringCQEsCompleted, "Total number of successfully completed 
io_uring CQEs") \ M(IOUringCQEsFailed, "Total number of completed io_uring CQEs with failures") \ \ diff --git a/src/Coordination/KeeperConstants.cpp b/src/Coordination/KeeperConstants.cpp index 8251dca3d1e..51bf037c1c9 100644 --- a/src/Coordination/KeeperConstants.cpp +++ b/src/Coordination/KeeperConstants.cpp @@ -258,7 +258,8 @@ M(KeeperExistsRequest) \ \ M(IOUringSQEsSubmitted) \ - M(IOUringSQEsResubmits) \ + M(IOUringSQEsResubmitsAsync) \ + M(IOUringSQEsResubmitsSync) \ M(IOUringCQEsCompleted) \ M(IOUringCQEsFailed) \ \ diff --git a/src/Disks/IO/IOUringReader.cpp b/src/Disks/IO/IOUringReader.cpp index 90a4d285ecb..ba8c5b94420 100644 --- a/src/Disks/IO/IOUringReader.cpp +++ b/src/Disks/IO/IOUringReader.cpp @@ -22,7 +22,8 @@ namespace ProfileEvents extern const Event AsynchronousReaderIgnoredBytes; extern const Event IOUringSQEsSubmitted; - extern const Event IOUringSQEsResubmits; + extern const Event IOUringSQEsResubmitsAsync; + extern const Event IOUringSQEsResubmitsSync; extern const Event IOUringCQEsCompleted; extern const Event IOUringCQEsFailed; } @@ -149,10 +150,12 @@ int IOUringReader::submitToRing(EnqueuedRequest & enqueued) io_uring_prep_read(sqe, fd, request.buf, static_cast(request.size - enqueued.bytes_read), request.offset + enqueued.bytes_read); int ret = 0; - do + ret = io_uring_submit(&ring); + while (ret == -EINTR || ret == -EAGAIN) { + ProfileEvents::increment(ProfileEvents::IOUringSQEsResubmitsSync); ret = io_uring_submit(&ring); - } while (ret == -EINTR || ret == -EAGAIN); + } if (ret > 0 && !enqueued.resubmitting) { @@ -266,7 +269,7 @@ void IOUringReader::monitorRing() if (cqe->res == -EAGAIN || cqe->res == -EINTR) { enqueued.resubmitting = true; - ProfileEvents::increment(ProfileEvents::IOUringSQEsResubmits); + ProfileEvents::increment(ProfileEvents::IOUringSQEsResubmitsAsync); ret = submitToRing(enqueued); if (ret <= 0) @@ -310,6 +313,7 @@ void IOUringReader::monitorRing() // potential short read, re-submit enqueued.resubmitting = true; enqueued.bytes_read += bytes_read; + ProfileEvents::increment(ProfileEvents::IOUringSQEsResubmitsAsync); ret = submitToRing(enqueued); if (ret <= 0) From 19854af77f8a35de9deea5ca0f6eb50316a213cf Mon Sep 17 00:00:00 2001 From: tomershafir Date: Mon, 13 May 2024 17:49:00 +0300 Subject: [PATCH 024/215] fix typo --- src/Common/ProfileEvents.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 34abce6b29e..87b7c2cce4d 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -631,7 +631,7 @@ The server successfully detected this situation and will download merged part fr \ M(ServerStartupMilliseconds, "Time elapsed from starting server to listening to sockets in milliseconds")\ M(IOUringSQEsSubmitted, "Total number of io_uring SQEs submitted") \ - M(IOUringSQEsResubmitsAsync, "Total number of asychronous io_uring SQE resubmits performed") \ + M(IOUringSQEsResubmitsAsync, "Total number of asynchronous io_uring SQE resubmits performed") \ M(IOUringSQEsResubmitsSync, "Total number of synchronous io_uring SQE resubmits performed") \ M(IOUringCQEsCompleted, "Total number of successfully completed io_uring CQEs") \ M(IOUringCQEsFailed, "Total number of completed io_uring CQEs with failures") \ From 927ba761a31bf1bbe87d99db7b11c52efee18a40 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 13 May 2024 16:33:38 +0000 Subject: [PATCH 025/215] Try to fix flaky s3 tests test_seekable_formats and test_seekable_formats_url --- 
tests/integration/helpers/test_tools.py | 2 +- tests/integration/test_storage_s3/test.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/integration/helpers/test_tools.py b/tests/integration/helpers/test_tools.py index 2afbae340be..d605d3ac2d3 100644 --- a/tests/integration/helpers/test_tools.py +++ b/tests/integration/helpers/test_tools.py @@ -139,7 +139,7 @@ def assert_logs_contain_with_retry(instance, substring, retry_count=20, sleep_ti def exec_query_with_retry( - instance, query, retry_count=40, sleep_time=0.5, silent=False, settings={} + instance, query, retry_count=40, sleep_time=0.5, silent=False, settings={}, timeout=30 ): exception = None for cnt in range(retry_count): diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index dc929b7db46..6ca11eaa17a 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1133,6 +1133,7 @@ def test_seekable_formats(started_cluster): exec_query_with_retry( instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(1000000) settings s3_truncate_on_insert=1", + timeout=100, ) result = instance.query(f"SELECT count() FROM {table_function}") @@ -1142,6 +1143,7 @@ def test_seekable_formats(started_cluster): exec_query_with_retry( instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(1500000) settings s3_truncate_on_insert=1", + timeout=100, ) result = instance.query( @@ -1169,6 +1171,7 @@ def test_seekable_formats_url(started_cluster): exec_query_with_retry( instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(1500000) settings s3_truncate_on_insert=1", + timeout=100, ) result = instance.query(f"SELECT count() FROM {table_function}") @@ -1178,6 +1181,7 @@ def test_seekable_formats_url(started_cluster): exec_query_with_retry( instance, f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(1500000) settings s3_truncate_on_insert=1", + timeout=100, ) table_function = f"url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_parquet', 'Parquet', 'a Int32, b String')" From a323dd1a787d5dbf6ad1a35e10cd17966f7005bd Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 13 May 2024 16:41:04 +0000 Subject: [PATCH 026/215] Propagate new timeout properly --- tests/integration/helpers/test_tools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/helpers/test_tools.py b/tests/integration/helpers/test_tools.py index d605d3ac2d3..efdf84cbba9 100644 --- a/tests/integration/helpers/test_tools.py +++ b/tests/integration/helpers/test_tools.py @@ -144,7 +144,7 @@ def exec_query_with_retry( exception = None for cnt in range(retry_count): try: - res = instance.query(query, timeout=30, settings=settings) + res = instance.query(query, timeout=timeout, settings=settings) if not silent: logging.debug(f"Result of {query} on {cnt} try is {res}") break From 61a6a27325bc5ded27d3c3fb05010adf5626cd83 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 15 May 2024 14:39:45 +0200 Subject: [PATCH 027/215] Fix check black --- tests/integration/helpers/test_tools.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/integration/helpers/test_tools.py b/tests/integration/helpers/test_tools.py index efdf84cbba9..1c8c5c33a13 100644 --- 
a/tests/integration/helpers/test_tools.py +++ b/tests/integration/helpers/test_tools.py @@ -139,7 +139,13 @@ def assert_logs_contain_with_retry(instance, substring, retry_count=20, sleep_ti def exec_query_with_retry( - instance, query, retry_count=40, sleep_time=0.5, silent=False, settings={}, timeout=30 + instance, + query, + retry_count=40, + sleep_time=0.5, + silent=False, + settings={}, + timeout=30, ): exception = None for cnt in range(retry_count): From 5c6c378fae33d8fe9294d0031352fd1a82c3bee2 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 24 May 2024 14:34:42 +0200 Subject: [PATCH 028/215] Try to fix GWPAsan --- src/CMakeLists.txt | 2 +- src/Common/Allocator.cpp | 88 +++++++++++++++++++++++++++++- src/Common/AsynchronousMetrics.cpp | 1 + src/Common/PODArray.h | 16 +++++- src/Common/ProfileEvents.cpp | 4 ++ src/Common/clickhouse_malloc.cpp | 5 +- src/Common/memory.cpp | 22 ++++++++ src/Common/memory.h | 45 ++++++++++++++- src/Common/new_delete.cpp | 22 -------- 9 files changed, 177 insertions(+), 28 deletions(-) create mode 100644 src/Common/memory.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4e8946facda..64a09699a54 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -291,7 +291,7 @@ if (TARGET ch_contrib::llvm) endif () if (TARGET ch_contrib::gwp_asan) - target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::gwp_asan) + target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::gwp_asan) target_link_libraries (clickhouse_new_delete PRIVATE ch_contrib::gwp_asan) endif() diff --git a/src/Common/Allocator.cpp b/src/Common/Allocator.cpp index e80c125c2a0..80f3b204c74 100644 --- a/src/Common/Allocator.cpp +++ b/src/Common/Allocator.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -10,6 +11,12 @@ #include #include /// MADV_POPULATE_WRITE +namespace ProfileEvents +{ + extern const Event GWPAsanAllocateSuccess; + extern const Event GWPAsanAllocateFailed; + extern const Event GWPAsanFree; +} namespace DB { @@ -59,13 +66,37 @@ void prefaultPages([[maybe_unused]] void * buf_, [[maybe_unused]] size_t len_) template void * allocNoTrack(size_t size, size_t alignment) { + void * buf; +#if USE_GWP_ASAN + if (unlikely(Memory::GuardedAlloc.shouldSample())) + { + if (void * ptr = Memory::GuardedAlloc.allocate(size, alignment)) + { + if constexpr (clear_memory) + memset(ptr, 0, size); + + if constexpr (populate) + prefaultPages(ptr, size); + + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); + + return ptr; + } + else + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateFailed); + } + } +#endif if (alignment <= MALLOC_MIN_ALIGNMENT) { if constexpr (clear_memory) buf = ::calloc(size, 1); else + { buf = ::malloc(size); + } if (nullptr == buf) throw DB::ErrnoException(DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Allocator: Cannot malloc {}.", ReadableSize(size)); @@ -91,6 +122,15 @@ void * allocNoTrack(size_t size, size_t alignment) void freeNoTrack(void * buf) { +#if USE_GWP_ASAN + if (unlikely(Memory::GuardedAlloc.pointerIsMine(buf))) + { + ProfileEvents::increment(ProfileEvents::GWPAsanFree); + Memory::GuardedAlloc.deallocate(buf); + return; + } +#endif + ::free(buf); } @@ -144,8 +184,54 @@ void * Allocator::realloc(void * buf, size_t old_size, { /// nothing to do. /// BTW, it's not possible to change alignment while doing realloc. 
+ return buf; } - else if (alignment <= MALLOC_MIN_ALIGNMENT) + +#if USE_GWP_ASAN + if (unlikely(Memory::GuardedAlloc.shouldSample())) + { + if (void * ptr = Memory::GuardedAlloc.allocate(new_size, alignment)) + { + auto trace_free = CurrentMemoryTracker::free(old_size); + auto trace_alloc = CurrentMemoryTracker::alloc(new_size); + trace_free.onFree(buf, old_size); + + memcpy(ptr, buf, std::min(old_size, new_size)); + free(buf, old_size); + trace_alloc.onAlloc(buf, new_size); + + if constexpr (clear_memory) + if (new_size > old_size) + memset(reinterpret_cast(ptr) + old_size, 0, new_size - old_size); + + if constexpr (populate) + prefaultPages(ptr, new_size); + + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); + return ptr; + } + else + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateFailed); + } + } + + if (unlikely(Memory::GuardedAlloc.pointerIsMine(buf))) + { + /// Big allocs that requires a copy. MemoryTracker is called inside 'alloc', 'free' methods. + void * new_buf = alloc(new_size, alignment); + memcpy(new_buf, buf, std::min(old_size, new_size)); + free(buf, old_size); + buf = new_buf; + + if constexpr (populate) + prefaultPages(buf, new_size); + + return buf; + } +#endif + + if (alignment <= MALLOC_MIN_ALIGNMENT) { /// Resize malloc'd memory region with no special alignment requirement. auto trace_free = CurrentMemoryTracker::free(old_size); diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index 4c71b9846c7..ccd65af07f5 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index b4069027ad1..ea3a68eb8bb 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -1,5 +1,7 @@ #pragma once +#include "config.h" + #include #include #include @@ -11,12 +13,16 @@ #include #include #include -#include #ifndef NDEBUG #include #endif +#if USE_GWP_ASAN +# include + +#endif + /** Whether we can use memcpy instead of a loop with assignment to T from U. * It is Ok if types are the same. And if types are integral and of the same size, * example: char, signed char, unsigned char. @@ -112,6 +118,10 @@ protected: template void alloc(size_t bytes, TAllocatorParams &&... allocator_params) { +#if USE_GWP_ASAN + gwp_asan::getThreadLocals()->NextSampleCounter = 1; +#endif + char * allocated = reinterpret_cast(TAllocator::alloc(bytes, std::forward(allocator_params)...)); c_start = allocated + pad_left; @@ -141,6 +151,10 @@ protected: return; } +#if USE_GWP_ASAN + gwp_asan::getThreadLocals()->NextSampleCounter = 1; +#endif + unprotect(); ptrdiff_t end_diff = c_end - c_start; diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 8c8e2163aad..0e7a7a9d514 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -744,6 +744,10 @@ The server successfully detected this situation and will download merged part fr \ M(ReadWriteBufferFromHTTPRequestsSent, "Number of HTTP requests sent by ReadWriteBufferFromHTTP") \ M(ReadWriteBufferFromHTTPBytes, "Total size of payload bytes received and sent by ReadWriteBufferFromHTTP. Doesn't include HTTP headers.") \ + \ + M(GWPAsanAllocateSuccess, "Number of successful allocations done by GWPAsan") \ + M(GWPAsanAllocateFailed, "Number of failed allocations done by GWPAsan (i.e. 
filled pool)") \ + M(GWPAsanFree, "Number of free operations done by GWPAsan") \ #ifdef APPLY_FOR_EXTERNAL_EVENTS diff --git a/src/Common/clickhouse_malloc.cpp b/src/Common/clickhouse_malloc.cpp index afdad3c6599..a9961eb4ee4 100644 --- a/src/Common/clickhouse_malloc.cpp +++ b/src/Common/clickhouse_malloc.cpp @@ -1,13 +1,14 @@ #include #include - /** These functions can be substituted instead of regular ones when memory tracking is needed. */ extern "C" void * clickhouse_malloc(size_t size) { - void * res = malloc(size); + void * res = nullptr; + res = malloc(size); + if (res) { AllocationTrace trace; diff --git a/src/Common/memory.cpp b/src/Common/memory.cpp new file mode 100644 index 00000000000..862af4470f1 --- /dev/null +++ b/src/Common/memory.cpp @@ -0,0 +1,22 @@ +#include +#include + +#if USE_GWP_ASAN +namespace Memory +{ +gwp_asan::GuardedPoolAllocator GuardedAlloc; +static bool guarded_alloc_initialized = [] +{ + gwp_asan::options::initOptions(); + gwp_asan::options::Options &opts = gwp_asan::options::getOptions(); + opts.MaxSimultaneousAllocations = 256; + GuardedAlloc.init(opts); + + ///std::cerr << "GwpAsan is initialized, the options are { Enabled: " << opts.Enabled + /// << ", MaxSimultaneousAllocations: " << opts.MaxSimultaneousAllocations + /// << ", SampleRate: " << opts.SampleRate << " }\n"; + + return true; +}(); +} +#endif diff --git a/src/Common/memory.h b/src/Common/memory.h index a828ba7a38e..427120edc75 100644 --- a/src/Common/memory.h +++ b/src/Common/memory.h @@ -5,6 +5,7 @@ #include #include +#include #include "config.h" #if USE_JEMALLOC @@ -17,13 +18,24 @@ #if USE_GWP_ASAN # include +# include -static gwp_asan::GuardedPoolAllocator GuardedAlloc; #endif +namespace ProfileEvents +{ + extern const Event GWPAsanAllocateSuccess; + extern const Event GWPAsanAllocateFailed; + extern const Event GWPAsanFree; +} + namespace Memory { +#if USE_GWP_ASAN +extern gwp_asan::GuardedPoolAllocator GuardedAlloc; +#endif + inline ALWAYS_INLINE size_t alignToSizeT(std::align_val_t align) noexcept { return static_cast(align); @@ -39,12 +51,26 @@ inline ALWAYS_INLINE void * newImpl(std::size_t size, TAlign... 
align) if constexpr (sizeof...(TAlign) == 1) { if (void * ptr = GuardedAlloc.allocate(size, alignToSizeT(align...))) + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); return ptr; + } + else + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateFailed); + } } else { if (void * ptr = GuardedAlloc.allocate(size)) + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); return ptr; + } + else + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateFailed); + } } } @@ -69,7 +95,14 @@ inline ALWAYS_INLINE void * newNoExept(std::size_t size) noexcept if (unlikely(GuardedAlloc.shouldSample())) { if (void * ptr = GuardedAlloc.allocate(size)) + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); return ptr; + } + else + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateFailed); + } } #endif return malloc(size); @@ -81,7 +114,14 @@ inline ALWAYS_INLINE void * newNoExept(std::size_t size, std::align_val_t align) if (unlikely(GuardedAlloc.shouldSample())) { if (void * ptr = GuardedAlloc.allocate(size, alignToSizeT(align))) + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); return ptr; + } + else + { + ProfileEvents::increment(ProfileEvents::GWPAsanAllocateFailed); + } } #endif return aligned_alloc(static_cast(align), size); @@ -92,6 +132,7 @@ inline ALWAYS_INLINE void deleteImpl(void * ptr) noexcept #if USE_GWP_ASAN if (unlikely(GuardedAlloc.pointerIsMine(ptr))) { + ProfileEvents::increment(ProfileEvents::GWPAsanFree); GuardedAlloc.deallocate(ptr); return; } @@ -111,6 +152,7 @@ inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size, TAlign... al #if USE_GWP_ASAN if (unlikely(GuardedAlloc.pointerIsMine(ptr))) { + ProfileEvents::increment(ProfileEvents::GWPAsanFree); GuardedAlloc.deallocate(ptr); return; } @@ -131,6 +173,7 @@ inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size [[maybe_unuse #if USE_GWP_ASAN if (unlikely(GuardedAlloc.pointerIsMine(ptr))) { + ProfileEvents::increment(ProfileEvents::GWPAsanFree); GuardedAlloc.deallocate(ptr); return; } diff --git a/src/Common/new_delete.cpp b/src/Common/new_delete.cpp index 9e93dca9787..e8151fbe201 100644 --- a/src/Common/new_delete.cpp +++ b/src/Common/new_delete.cpp @@ -1,5 +1,4 @@ #include -#include #include #include "config.h" #include @@ -42,27 +41,6 @@ static struct InitializeJemallocZoneAllocatorForOSX } initializeJemallocZoneAllocatorForOSX; #endif -#if USE_GWP_ASAN - -#include - -/// Both clickhouse_new_delete and clickhouse_common_io links gwp_asan, but It should only init once, otherwise it -/// will cause unexpected deadlock. -static struct InitGwpAsan -{ - InitGwpAsan() - { - gwp_asan::options::initOptions(); - gwp_asan::options::Options &opts = gwp_asan::options::getOptions(); - GuardedAlloc.init(opts); - - ///std::cerr << "GwpAsan is initialized, the options are { Enabled: " << opts.Enabled - /// << ", MaxSimultaneousAllocations: " << opts.MaxSimultaneousAllocations - /// << ", SampleRate: " << opts.SampleRate << " }\n"; - } -} init_gwp_asan; -#endif - /// Replace default new/delete with memory tracking versions. 
/// @sa https://en.cppreference.com/w/cpp/memory/new/operator_new /// https://en.cppreference.com/w/cpp/memory/new/operator_delete From 6753a0ad188ce386e7f3059eaae7e293cd4c3087 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 24 May 2024 16:02:12 +0200 Subject: [PATCH 029/215] Better --- src/CMakeLists.txt | 2 +- src/Common/Allocator.cpp | 2 -- src/Common/AsynchronousMetrics.cpp | 1 - src/Common/clickhouse_malloc.cpp | 5 ++--- src/Common/memory.cpp | 3 ++- src/Daemon/BaseDaemon.cpp | 14 ++++++++++++++ 6 files changed, 19 insertions(+), 8 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 64a09699a54..537cdc7887a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -181,7 +181,7 @@ add_library (clickhouse_new_delete STATIC Common/new_delete.cpp) target_link_libraries (clickhouse_new_delete PRIVATE clickhouse_common_io) if (TARGET ch_contrib::jemalloc) target_link_libraries (clickhouse_new_delete PRIVATE ch_contrib::jemalloc) - target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::jemalloc) + target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::jemalloc) target_link_libraries (clickhouse_storages_system PRIVATE ch_contrib::jemalloc) endif() diff --git a/src/Common/Allocator.cpp b/src/Common/Allocator.cpp index 80f3b204c74..8b68ef87298 100644 --- a/src/Common/Allocator.cpp +++ b/src/Common/Allocator.cpp @@ -94,9 +94,7 @@ void * allocNoTrack(size_t size, size_t alignment) if constexpr (clear_memory) buf = ::calloc(size, 1); else - { buf = ::malloc(size); - } if (nullptr == buf) throw DB::ErrnoException(DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Allocator: Cannot malloc {}.", ReadableSize(size)); diff --git a/src/Common/AsynchronousMetrics.cpp b/src/Common/AsynchronousMetrics.cpp index ccd65af07f5..4c71b9846c7 100644 --- a/src/Common/AsynchronousMetrics.cpp +++ b/src/Common/AsynchronousMetrics.cpp @@ -5,7 +5,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Common/clickhouse_malloc.cpp b/src/Common/clickhouse_malloc.cpp index a9961eb4ee4..afdad3c6599 100644 --- a/src/Common/clickhouse_malloc.cpp +++ b/src/Common/clickhouse_malloc.cpp @@ -1,14 +1,13 @@ #include #include + /** These functions can be substituted instead of regular ones when memory tracking is needed. 
*/ extern "C" void * clickhouse_malloc(size_t size) { - void * res = nullptr; - res = malloc(size); - + void * res = malloc(size); if (res) { AllocationTrace trace; diff --git a/src/Common/memory.cpp b/src/Common/memory.cpp index 862af4470f1..6c17dbe3ba1 100644 --- a/src/Common/memory.cpp +++ b/src/Common/memory.cpp @@ -1,7 +1,8 @@ -#include #include #if USE_GWP_ASAN +#include + namespace Memory { gwp_asan::GuardedPoolAllocator GuardedAlloc; diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index c6c82df2a72..74c37b6123b 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -156,6 +157,19 @@ static void signalHandler(int sig, siginfo_t * info, void * context) const ucontext_t * signal_context = reinterpret_cast(context); const StackTrace stack_trace(*signal_context); + const auto is_gwp_asan = [&] + { + auto state = ::Memory::GuardedAlloc.getAllocatorState(); + if (state->FailureType != gwp_asan::Error::UNKNOWN && state->FailureAddress != 0) + return true; + + auto addr = reinterpret_cast(info->si_addr); + return addr < state->GuardedPagePoolEnd && state->GuardedPagePool <= addr; + }; + + if (is_gwp_asan()) + std::cerr << "GWPAsan caught something!" << std::endl; + writeBinary(sig, out); writePODBinary(*info, out); writePODBinary(signal_context, out); From 8f6e4c05ea40b14d9d79646e9180ca5480e609dc Mon Sep 17 00:00:00 2001 From: Blargian Date: Fri, 24 May 2024 21:34:53 +0200 Subject: [PATCH 030/215] Fix broken link --- docs/en/interfaces/formats.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index a137eb2bdf2..1c7df32db7f 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -479,7 +479,7 @@ The CSV format supports the output of totals and extremes the same way as `TabSe - [input_format_csv_detect_header](/docs/en/operations/settings/settings-formats.md/#input_format_csv_detect_header) - automatically detect header with names and types in CSV format. Default value - `true`. - [input_format_csv_skip_trailing_empty_lines](/docs/en/operations/settings/settings-formats.md/#input_format_csv_skip_trailing_empty_lines) - skip trailing empty lines at the end of data. Default value - `false`. - [input_format_csv_trim_whitespaces](/docs/en/operations/settings/settings-formats.md/#input_format_csv_trim_whitespaces) - trim spaces and tabs in non-quoted CSV strings. Default value - `true`. -- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/# input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`. +- [input_format_csv_allow_whitespace_or_tab_as_delimiter](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_whitespace_or_tab_as_delimiter) - Allow to use whitespace or tab as field delimiter in CSV strings. Default value - `false`. - [input_format_csv_allow_variable_number_of_columns](/docs/en/operations/settings/settings-formats.md/#input_format_csv_allow_variable_number_of_columns) - allow variable number of columns in CSV format, ignore extra columns and use default values on missing columns. Default value - `false`. 
- [input_format_csv_use_default_on_bad_values](/docs/en/operations/settings/settings-formats.md/#input_format_csv_use_default_on_bad_values) - Allow to set default value to column when CSV field deserialization failed on bad value. Default value - `false`. - [input_format_csv_try_infer_numbers_from_strings](/docs/en/operations/settings/settings-formats.md/#input_format_csv_try_infer_numbers_from_strings) - Try to infer numbers from string fields while schema inference. Default value - `false`. From 44b787c1c906881850593cc5dcf8c9c0989525d1 Mon Sep 17 00:00:00 2001 From: serxa Date: Sun, 26 May 2024 08:31:01 +0000 Subject: [PATCH 031/215] Remove wrong comment --- src/Common/threadPoolCallbackRunner.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Common/threadPoolCallbackRunner.h b/src/Common/threadPoolCallbackRunner.h index 5beec660801..afbdcf2df19 100644 --- a/src/Common/threadPoolCallbackRunner.h +++ b/src/Common/threadPoolCallbackRunner.h @@ -54,7 +54,6 @@ ThreadPoolCallbackRunnerUnsafe threadPoolCallbackRunnerUnsafe( auto future = task->get_future(); - /// ThreadPool is using "bigger is higher priority" instead of "smaller is more priority". /// Note: calling method scheduleOrThrowOnError in intentional, because we don't want to throw exceptions /// in critical places where this callback runner is used (e.g. loading or deletion of parts) my_pool->scheduleOrThrowOnError([my_task = std::move(task)]{ (*my_task)(); }, priority); @@ -163,7 +162,6 @@ public: task->future = task_func->get_future(); - /// ThreadPool is using "bigger is higher priority" instead of "smaller is more priority". /// Note: calling method scheduleOrThrowOnError in intentional, because we don't want to throw exceptions /// in critical places where this callback runner is used (e.g. 
loading or deletion of parts) pool.scheduleOrThrowOnError([my_task = std::move(task_func)]{ (*my_task)(); }, priority); From 80f195d2b983c2db2feb2d5924d06588a7382d9c Mon Sep 17 00:00:00 2001 From: kssenii Date: Sat, 25 May 2024 18:46:07 +0200 Subject: [PATCH 032/215] Refactor s3 settings --- src/Backups/BackupIO_S3.cpp | 6 +- src/Backups/BackupIO_S3.h | 2 +- src/Coordination/KeeperSnapshotManagerS3.cpp | 7 +- src/Coordination/Standalone/Context.cpp | 2 +- src/Core/Settings.h | 23 +- .../ObjectStorages/ObjectStorageFactory.cpp | 23 +- .../ObjectStorages/S3/S3ObjectStorage.cpp | 23 +- src/Disks/ObjectStorages/S3/S3ObjectStorage.h | 6 +- src/Disks/ObjectStorages/S3/diskSettings.cpp | 109 +++-- src/Disks/ObjectStorages/S3/diskSettings.h | 17 +- src/IO/ReadBufferFromS3.cpp | 4 +- src/IO/ReadBufferFromS3.h | 6 +- src/IO/S3/Credentials.h | 8 +- src/IO/S3/copyS3File.cpp | 18 +- src/IO/S3/copyS3File.h | 6 +- src/IO/S3/getObjectInfo.cpp | 16 +- src/IO/S3/getObjectInfo.h | 8 +- src/IO/S3/tests/gtest_aws_s3_client.cpp | 6 +- src/IO/S3Common.cpp | 379 ++++++++++++++++-- src/IO/S3Common.h | 86 +++- src/IO/S3Defines.h | 32 ++ src/IO/S3Settings.cpp | 62 +++ src/IO/S3Settings.h | 45 +++ src/IO/WriteBufferFromS3.cpp | 10 +- src/IO/WriteBufferFromS3.h | 8 +- src/IO/tests/gtest_writebuffer_s3.cpp | 4 +- src/Interpreters/Context.cpp | 10 +- src/Interpreters/Context.h | 4 +- .../ExternalDataSourceConfiguration.h | 2 +- .../ObjectStorage/S3/Configuration.cpp | 8 +- src/Storages/ObjectStorage/S3/Configuration.h | 4 +- src/Storages/S3Queue/S3QueueFilesMetadata.cpp | 2 +- src/Storages/StorageS3Settings.cpp | 315 --------------- src/Storages/StorageS3Settings.h | 122 ------ 34 files changed, 717 insertions(+), 666 deletions(-) create mode 100644 src/IO/S3Defines.h create mode 100644 src/IO/S3Settings.cpp create mode 100644 src/IO/S3Settings.h delete mode 100644 src/Storages/StorageS3Settings.cpp delete mode 100644 src/Storages/StorageS3Settings.h diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 92e208ba464..cbf18e2bff9 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -134,7 +134,7 @@ BackupReaderS3::BackupReaderS3( , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName(), /*ignore_user=*/is_internal_backup).value_or(S3Settings{})) { auto & request_settings = s3_settings.request_settings; - request_settings.updateFromSettingsIfChanged(context_->getSettingsRef()); + request_settings.updateFromSettings(context_->getSettingsRef(), /* if_changed */true); request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint request_settings.allow_native_copy = allow_s3_native_copy; client = makeS3Client(s3_uri_, access_key_id_, secret_access_key_, s3_settings, context_); @@ -225,10 +225,10 @@ BackupWriterS3::BackupWriterS3( , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName(), /*ignore_user=*/is_internal_backup).value_or(S3Settings{})) { auto & request_settings = s3_settings.request_settings; - request_settings.updateFromSettingsIfChanged(context_->getSettingsRef()); + request_settings.updateFromSettings(context_->getSettingsRef(), /* if_changed */true); request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint request_settings.allow_native_copy = allow_s3_native_copy; - 
request_settings.setStorageClassName(storage_class_name); + request_settings.upload_settings.storage_class_name = storage_class_name; client = makeS3Client(s3_uri_, access_key_id_, secret_access_key_, s3_settings, context_); if (auto blob_storage_system_log = context_->getBlobStorageLog()) { diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index f81eb975df3..327f06363c5 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Coordination/KeeperSnapshotManagerS3.cpp b/src/Coordination/KeeperSnapshotManagerS3.cpp index b984b8ad18e..acf8faa9edd 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.cpp +++ b/src/Coordination/KeeperSnapshotManagerS3.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -64,7 +65,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo return; } - auto auth_settings = S3::AuthSettings::loadFromConfig(config_prefix, config); + auto auth_settings = S3::AuthSettings::loadFromConfig(config, config_prefix, Context::getGlobalContextInstance()->getSettingsRef()); String endpoint = macros->expand(config.getString(config_prefix + ".endpoint")); auto new_uri = S3::URI{endpoint}; @@ -154,7 +155,7 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const SnapshotFileInfo & snapsh if (s3_client == nullptr) return; - S3Settings::RequestSettings request_settings_1; + S3::RequestSettings request_settings_1; const auto create_writer = [&](const auto & key) { @@ -197,7 +198,7 @@ void KeeperSnapshotManagerS3::uploadSnapshotImpl(const SnapshotFileInfo & snapsh lock_writer.finalize(); // We read back the written UUID, if it's the same we can upload the file - S3Settings::RequestSettings request_settings_2; + S3::RequestSettings request_settings_2; request_settings_2.max_single_read_retries = 1; ReadBufferFromS3 lock_reader { diff --git a/src/Coordination/Standalone/Context.cpp b/src/Coordination/Standalone/Context.cpp index 4b14b038852..2802d51ae26 100644 --- a/src/Coordination/Standalone/Context.cpp +++ b/src/Coordination/Standalone/Context.cpp @@ -5,7 +5,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f0389e7e2d5..a7a19702282 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -81,17 +82,19 @@ class IColumn; M(UInt64, s3_strict_upload_part_size, 0, "The exact size of part to upload during multipart upload to S3 (some implementations does not supports variable size parts).", 0) \ M(UInt64, azure_strict_upload_part_size, 0, "The exact size of part to upload during multipart upload to Azure blob storage.", 0) \ M(UInt64, azure_max_blocks_in_multipart_upload, 50000, "Maximum number of blocks in multipart upload for Azure.", 0) \ - M(UInt64, s3_min_upload_part_size, 16*1024*1024, "The minimum size of part to upload during multipart upload to S3.", 0) \ - M(UInt64, s3_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to S3.", 0) \ + M(UInt64, s3_min_upload_part_size, S3::DEFAULT_MIN_UPLOAD_PART_SIZE, "The minimum size of part to upload during multipart upload to S3.", 0) \ + M(UInt64, s3_max_upload_part_size, S3::DEFAULT_MAX_UPLOAD_PART_SIZE, "The maximum size of part to upload during multipart upload to S3.", 0) \ M(UInt64, azure_min_upload_part_size, 16*1024*1024, "The minimum size of part to 
upload during multipart upload to Azure blob storage.", 0) \ M(UInt64, azure_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to Azure blob storage.", 0) \ - M(UInt64, s3_upload_part_size_multiply_factor, 2, "Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3.", 0) \ - M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to S3, s3_min_upload_part_size is multiplied by s3_upload_part_size_multiply_factor.", 0) \ + M(UInt64, s3_upload_part_size_multiply_factor, S3::DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_FACTOR, "Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3.", 0) \ + M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, S3::DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_PARTS_COUNT_THRESHOLD, "Each time this number of parts was uploaded to S3, s3_min_upload_part_size is multiplied by s3_upload_part_size_multiply_factor.", 0) \ + M(UInt64, s3_max_part_number, S3::DEFAULT_MAX_PART_NUMBER, "Maximum part number for s3 upload part.", 0) \ + M(UInt64, s3_max_single_operation_copy_size, S3::DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE, "Maximum size for a single copy operation in s3.", 0) \ M(UInt64, azure_upload_part_size_multiply_factor, 2, "Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage.", 0) \ M(UInt64, azure_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor.", 0) \ - M(UInt64, s3_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited.", 0) \ + M(UInt64, s3_max_inflight_parts_for_one_file, S3::DEFAULT_MAX_INFLIGHT_PARTS_FOR_ONE_FILE, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited.", 0) \ M(UInt64, azure_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request.
0 means unlimited.", 0) \ - M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ + M(UInt64, s3_max_single_part_upload_size, S3::DEFAULT_MAX_SINGLE_PART_UPLOAD_SIZE, "The maximum size of object to upload using singlepart upload to S3.", 0) \ M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \ M(UInt64, azure_max_single_part_copy_size, 256*1024*1024, "The maximum size of object to copy using single part copy to Azure blob storage.", 0) \ M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ @@ -99,13 +102,13 @@ class IColumn; M(UInt64, azure_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write", 0) \ M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ - M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ + M(UInt64, s3_max_connections, S3::DEFAULT_MAX_CONNECTIONS, "The maximum number of connections per server.", 0) \ M(UInt64, s3_max_get_rps, 0, "Limit on S3 GET request per second rate before throttling. Zero means unlimited.", 0) \ M(UInt64, s3_max_get_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_get_rps`", 0) \ M(UInt64, s3_max_put_rps, 0, "Limit on S3 PUT request per second rate before throttling. Zero means unlimited.", 0) \ M(UInt64, s3_max_put_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_put_rps`", 0) \ M(UInt64, s3_list_object_keys_size, 1000, "Maximum number of files that could be returned in batch by ListObject request", 0) \ - M(Bool, s3_use_adaptive_timeouts, true, "When adaptive timeouts are enabled first two attempts are made with low receive and send timeout", 0) \ + M(Bool, s3_use_adaptive_timeouts, S3::DEFAULT_USE_ADAPTIVE_TIMEOUTS, "When adaptive timeouts are enabled first two attempts are made with low receive and send timeout", 0) \ M(UInt64, azure_list_object_keys_size, 1000, "Maximum number of files that could be returned in batch by ListObject request", 0) \ M(Bool, s3_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables.", 0) \ M(Bool, azure_truncate_on_insert, false, "Enables or disables truncate before insert in azure engine tables.", 0) \ @@ -124,8 +127,8 @@ class IColumn; M(Bool, s3_validate_request_settings, true, "Validate S3 request settings", 0) \ M(Bool, s3_disable_checksum, false, "Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. While additional checksums on S3 give defense in depth.", 0) \ M(UInt64, s3_retry_attempts, 100, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \ - M(UInt64, s3_request_timeout_ms, 30000, "Idleness timeout for sending and receiving data to/from S3. 
Fail if a single TCP read or write call blocks for this long.", 0) \ - M(UInt64, s3_connect_timeout_ms, 1000, "Connection timeout for host from s3 disks.", 0) \ + M(UInt64, s3_request_timeout_ms, S3::DEFAULT_REQUEST_TIMEOUT_MS, "Idleness timeout for sending and receiving data to/from S3. Fail if a single TCP read or write call blocks for this long.", 0) \ + M(UInt64, s3_connect_timeout_ms, S3::DEFAULT_CONNECT_TIMEOUT_MS, "Connection timeout for host from s3 disks.", 0) \ M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \ M(String, s3queue_default_zookeeper_path, "/clickhouse/s3queue/", "Default zookeeper path prefix for S3Queue engine", 0) \ M(Bool, s3queue_enable_logging_to_s3queue_log, false, "Enable writing to system.s3queue_log. The value can be overwritten per table with table settings", 0) \ diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index d7884c2911b..81de22811fe 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -169,6 +169,14 @@ void checkS3Capabilities( } } +static std::string getEndpoint( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const ContextPtr & context) +{ + return context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); +} + void registerS3ObjectStorage(ObjectStorageFactory & factory) { static constexpr auto disk_type = "s3"; @@ -182,8 +190,9 @@ void registerS3ObjectStorage(ObjectStorageFactory & factory) { auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); - auto settings = getSettings(config, config_prefix, context); - auto client = getClient(config, config_prefix, context, *settings, true); + auto settings = getSettings(config, config_prefix, context, /* for_disk_s3 */true, /* validate_settings */true); + auto endpoint = getEndpoint(config, config_prefix, context); + auto client = getClient(endpoint, *settings, context, /* for_disk_s3 */true); auto key_generator = getKeyGenerator(uri, config, config_prefix); auto object_storage = createObjectStorage( @@ -218,8 +227,9 @@ void registerS3PlainObjectStorage(ObjectStorageFactory & factory) auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); - auto settings = getSettings(config, config_prefix, context); - auto client = getClient(config, config_prefix, context, *settings, true); + auto settings = getSettings(config, config_prefix, context, /* for_disk_s3 */true, /* validate_settings */true); + auto endpoint = getEndpoint(config, config_prefix, context); + auto client = getClient(endpoint, *settings, context, /* for_disk_s3 */true); auto key_generator = getKeyGenerator(uri, config, config_prefix); auto object_storage = std::make_shared>( @@ -252,8 +262,9 @@ void registerS3PlainRewritableObjectStorage(ObjectStorageFactory & factory) auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); - auto settings = getSettings(config, config_prefix, context); - auto client = getClient(config, config_prefix, context, *settings, true); + auto settings = getSettings(config, config_prefix, context, /* for_disk_s3 */true, /* validate_settings */true); + auto endpoint = getEndpoint(config, config_prefix, context); + auto client = getClient(endpoint, 
*settings, context, /* for_disk_s3 */true); auto key_generator = getKeyGenerator(uri, config, config_prefix); auto object_storage = std::make_shared>( diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index c07313b52db..416eebc7493 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -168,7 +168,7 @@ private: bool S3ObjectStorage::exists(const StoredObject & object) const { auto settings_ptr = s3_settings.get(); - return S3::objectExists(*client.get(), uri.bucket, object.remote_path, {}, settings_ptr->request_settings); + return S3::objectExists(*client.get(), uri.bucket, object.remote_path, {}); } std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT @@ -258,10 +258,10 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN if (mode != WriteMode::Rewrite) throw Exception(ErrorCodes::BAD_ARGUMENTS, "S3 doesn't support append to files"); - S3Settings::RequestSettings request_settings = s3_settings.get()->request_settings; + S3::RequestSettings request_settings = s3_settings.get()->request_settings; if (auto query_context = CurrentThread::getQueryContext()) { - request_settings.updateFromSettingsIfChanged(query_context->getSettingsRef()); + request_settings.updateFromSettings(query_context->getSettingsRef(), /* if_changed */true); } ThreadPoolCallbackRunnerUnsafe scheduler; @@ -440,8 +440,7 @@ std::optional S3ObjectStorage::tryGetObjectMetadata(const std::s { auto settings_ptr = s3_settings.get(); auto object_info = S3::getObjectInfo( - *client.get(), uri.bucket, path, {}, settings_ptr->request_settings, - /* with_metadata= */ true, /* throw_on_error= */ false); + *client.get(), uri.bucket, path, {}, /* with_metadata= */ true, /* throw_on_error= */ false); if (object_info.size == 0 && object_info.last_modification_time == 0 && object_info.metadata.empty()) return {}; @@ -460,7 +459,7 @@ ObjectMetadata S3ObjectStorage::getObjectMetadata(const std::string & path) cons S3::ObjectInfo object_info; try { - object_info = S3::getObjectInfo(*client.get(), uri.bucket, path, {}, settings_ptr->request_settings, /* with_metadata= */ true); + object_info = S3::getObjectInfo(*client.get(), uri.bucket, path, {}, /* with_metadata= */ true); } catch (DB::Exception & e) { @@ -489,7 +488,7 @@ void S3ObjectStorage::copyObjectToAnotherObjectStorage( // NOLINT { auto current_client = dest_s3->client.get(); auto settings_ptr = s3_settings.get(); - auto size = S3::getObjectSize(*current_client, uri.bucket, object_from.remote_path, {}, settings_ptr->request_settings); + auto size = S3::getObjectSize(*current_client, uri.bucket, object_from.remote_path, {}); auto scheduler = threadPoolCallbackRunnerUnsafe(getThreadPoolWriter(), "S3ObjStor_copy"); try @@ -532,7 +531,7 @@ void S3ObjectStorage::copyObject( // NOLINT { auto current_client = client.get(); auto settings_ptr = s3_settings.get(); - auto size = S3::getObjectSize(*current_client, uri.bucket, object_from.remote_path, {}, settings_ptr->request_settings); + auto size = S3::getObjectSize(*current_client, uri.bucket, object_from.remote_path, {}); auto scheduler = threadPoolCallbackRunnerUnsafe(getThreadPoolWriter(), "S3ObjStor_copy"); copyS3File(current_client, @@ -575,7 +574,7 @@ void S3ObjectStorage::applyNewSettings( ContextPtr context, const ApplyNewSettingsOptions & options) { - auto new_s3_settings = getSettings(config, config_prefix, context, context->getSettingsRef().s3_validate_request_settings); + auto new_s3_settings = 
getSettings(config, config_prefix, context, for_disk_s3, context->getSettingsRef().s3_validate_request_settings); if (!static_headers.empty()) { new_s3_settings->auth_settings.headers.insert( @@ -589,7 +588,7 @@ void S3ObjectStorage::applyNewSettings( auto current_s3_settings = s3_settings.get(); if (options.allow_client_change && (current_s3_settings->auth_settings.hasUpdates(new_s3_settings->auth_settings) || for_disk_s3)) { - auto new_client = getClient(config, config_prefix, context, *new_s3_settings, for_disk_s3, &uri); + auto new_client = getClient(uri, *new_s3_settings, context, for_disk_s3); client.set(std::move(new_client)); } s3_settings.set(std::move(new_s3_settings)); @@ -601,8 +600,8 @@ std::unique_ptr S3ObjectStorage::cloneObjectStorage( const std::string & config_prefix, ContextPtr context) { - auto new_s3_settings = getSettings(config, config_prefix, context); - auto new_client = getClient(config, config_prefix, context, *new_s3_settings, true); + auto new_s3_settings = getSettings(config, config_prefix, context, for_disk_s3, true); + auto new_client = getClient(uri, *new_s3_settings, context, for_disk_s3); auto new_uri{uri}; new_uri.bucket = new_namespace; diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 1fff6d67e23..b7b7ba6ee8d 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include @@ -20,7 +20,7 @@ struct S3ObjectStorageSettings S3ObjectStorageSettings() = default; S3ObjectStorageSettings( - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, const S3::AuthSettings & auth_settings_, uint64_t min_bytes_for_seek_, int32_t list_object_keys_size_, @@ -34,7 +34,7 @@ struct S3ObjectStorageSettings , read_only(read_only_) {} - S3Settings::RequestSettings request_settings; + S3::RequestSettings request_settings; S3::AuthSettings auth_settings; uint64_t min_bytes_for_seek; diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 139472a8b01..f66af556ce1 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -18,18 +19,12 @@ #include #include -#include +#include #include #include -#include namespace DB { -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - namespace ErrorCodes { extern const int NO_ELEMENTS_IN_CONFIG; @@ -39,11 +34,14 @@ std::unique_ptr getSettings( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, + bool for_disk_s3, bool validate_settings) { - const Settings & settings = context->getSettingsRef(); - auto request_settings = S3Settings::RequestSettings(config, config_prefix, settings, "s3_", validate_settings); - auto auth_settings = S3::AuthSettings::loadFromConfig(config_prefix, config); + const auto & settings = context->getSettingsRef(); + const std::string setting_name_prefix = for_disk_s3 ? 
"s3_" : ""; + + auto auth_settings = S3::AuthSettings::loadFromConfig(config, config_prefix, settings); + auto request_settings = S3::RequestSettings::loadFromConfig(config, config_prefix, settings, validate_settings, setting_name_prefix); return std::make_unique( request_settings, @@ -55,38 +53,33 @@ std::unique_ptr getSettings( } std::unique_ptr getClient( - const Poco::Util::AbstractConfiguration & config, - const String & config_prefix, - ContextPtr context, + const std::string & endpoint, const S3ObjectStorageSettings & settings, - bool for_disk_s3, - const S3::URI * url_) + ContextPtr context, + bool for_disk_s3) +{ + auto url = S3::URI(endpoint); + if (!url.key.ends_with('/')) + url.key.push_back('/'); + return getClient(url, settings, context, for_disk_s3); +} + +std::unique_ptr getClient( + const S3::URI & url, + const S3ObjectStorageSettings & settings, + ContextPtr context, + bool for_disk_s3) { const Settings & global_settings = context->getGlobalContext()->getSettingsRef(); - const Settings & local_settings = context->getSettingsRef(); - const auto & auth_settings = settings.auth_settings; const auto & request_settings = settings.request_settings; - S3::URI url; - if (for_disk_s3) - { - String endpoint = context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); - url = S3::URI(endpoint); - if (!url.key.ends_with('/')) - url.key.push_back('/'); - } - else - { - if (!url_) - throw Exception(ErrorCodes::LOGICAL_ERROR, "URL not passed"); - url = *url_; - } const bool is_s3_express_bucket = S3::isS3ExpressEndpoint(url.endpoint); - if (is_s3_express_bucket && !config.has(config_prefix + ".region")) + if (is_s3_express_bucket && auth_settings.region.empty()) { throw Exception( - ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Region should be explicitly specified for directory buckets ({})", config_prefix); + ErrorCodes::NO_ELEMENTS_IN_CONFIG, + "Region should be explicitly specified for directory buckets"); } S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( @@ -96,49 +89,47 @@ std::unique_ptr getClient( static_cast(global_settings.s3_retry_attempts), global_settings.enable_s3_requests_logging, for_disk_s3, - settings.request_settings.get_request_throttler, - settings.request_settings.put_request_throttler, + request_settings.get_request_throttler, + request_settings.put_request_throttler, url.uri.getScheme()); - client_configuration.connectTimeoutMs = config.getUInt64(config_prefix + ".connect_timeout_ms", local_settings.s3_connect_timeout_ms.value); - client_configuration.requestTimeoutMs = config.getUInt64(config_prefix + ".request_timeout_ms", local_settings.s3_request_timeout_ms.value); - client_configuration.maxConnections = config.getUInt(config_prefix + ".max_connections", static_cast(request_settings.max_connections)); - client_configuration.http_keep_alive_timeout = config.getUInt(config_prefix + ".http_keep_alive_timeout", S3::DEFAULT_KEEP_ALIVE_TIMEOUT); - client_configuration.http_keep_alive_max_requests = config.getUInt(config_prefix + ".http_keep_alive_max_requests", S3::DEFAULT_KEEP_ALIVE_MAX_REQUESTS); + client_configuration.connectTimeoutMs = auth_settings.connect_timeout_ms.value_or(S3::DEFAULT_CONNECT_TIMEOUT_MS); + client_configuration.requestTimeoutMs = auth_settings.request_timeout_ms.value_or(S3::DEFAULT_REQUEST_TIMEOUT_MS); + client_configuration.maxConnections = static_cast(auth_settings.max_connections.value_or(S3::DEFAULT_MAX_CONNECTIONS)); + client_configuration.http_keep_alive_timeout 
= auth_settings.http_keep_alive_timeout.value_or(S3::DEFAULT_KEEP_ALIVE_TIMEOUT); + client_configuration.http_keep_alive_max_requests = auth_settings.http_keep_alive_max_requests.value_or(S3::DEFAULT_KEEP_ALIVE_MAX_REQUESTS); client_configuration.endpointOverride = url.endpoint; - client_configuration.s3_use_adaptive_timeouts = config.getBool( - config_prefix + ".use_adaptive_timeouts", client_configuration.s3_use_adaptive_timeouts); + client_configuration.s3_use_adaptive_timeouts = auth_settings.use_adaptive_timeouts.value_or(S3::DEFAULT_USE_ADAPTIVE_TIMEOUTS); if (for_disk_s3) { + /// TODO: move to S3Common auth settings parsing /* * Override proxy configuration for backwards compatibility with old configuration format. * */ - if (auto proxy_config = DB::ProxyConfigurationResolverProvider::getFromOldSettingsFormat( - ProxyConfiguration::protocolFromString(url.uri.getScheme()), config_prefix, config)) - { - client_configuration.per_request_configuration - = [proxy_config]() { return proxy_config->resolve(); }; - client_configuration.error_report - = [proxy_config](const auto & request_config) { proxy_config->errorReport(request_config); }; - } + // if (auto proxy_config = DB::ProxyConfigurationResolverProvider::getFromOldSettingsFormat( + // ProxyConfiguration::protocolFromString(url.uri.getScheme()), config_prefix, config)) + // { + // client_configuration.per_request_configuration + // = [proxy_config]() { return proxy_config->resolve(); }; + // client_configuration.error_report + // = [proxy_config](const auto & request_config) { proxy_config->errorReport(request_config); }; + // } } - S3::ServerSideEncryptionKMSConfig sse_kms_config = S3::getSSEKMSConfig(config_prefix, config); S3::ClientSettings client_settings{ .use_virtual_addressing = url.is_virtual_hosted_style, - .disable_checksum = local_settings.s3_disable_checksum, - .gcs_issue_compose_request = config.getBool("s3.gcs_issue_compose_request", false), - .is_s3express_bucket = is_s3_express_bucket, + .disable_checksum = auth_settings.disable_checksum.value_or(S3::DEFAULT_DISABLE_CHECKSUM), + .gcs_issue_compose_request = auth_settings.gcs_issue_compose_request.value_or(false), }; auto credentials_configuration = S3::CredentialsConfiguration { - auth_settings.use_environment_credentials.value_or(context->getConfigRef().getBool("s3.use_environment_credentials", true)), - auth_settings.use_insecure_imds_request.value_or(context->getConfigRef().getBool("s3.use_insecure_imds_request", false)), - auth_settings.expiration_window_seconds.value_or(context->getConfigRef().getUInt64("s3.expiration_window_seconds", S3::DEFAULT_EXPIRATION_WINDOW_SECONDS)), - auth_settings.no_sign_request.value_or(context->getConfigRef().getBool("s3.no_sign_request", false)), + auth_settings.use_environment_credentials.value_or(S3::DEFAULT_USE_ENVIRONMENT_CREDENTIALS), + auth_settings.use_insecure_imds_request.value_or(false), + auth_settings.expiration_window_seconds.value_or(S3::DEFAULT_EXPIRATION_WINDOW_SECONDS), + auth_settings.no_sign_request.value_or(S3::DEFAULT_NO_SIGN_REQUEST), }; return S3::ClientFactory::instance().create( @@ -147,7 +138,7 @@ std::unique_ptr getClient( auth_settings.access_key_id, auth_settings.secret_access_key, auth_settings.server_side_encryption_customer_key_base64, - std::move(sse_kms_config), + auth_settings.server_side_encryption_kms_config, auth_settings.headers, credentials_configuration, auth_settings.session_token); diff --git a/src/Disks/ObjectStorages/S3/diskSettings.h b/src/Disks/ObjectStorages/S3/diskSettings.h index 
11ac64ce913..5e7a18152d1 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.h +++ b/src/Disks/ObjectStorages/S3/diskSettings.h @@ -18,15 +18,20 @@ std::unique_ptr getSettings( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, - bool validate_settings = true); + bool for_disk_s3, + bool validate_settings); std::unique_ptr getClient( - const Poco::Util::AbstractConfiguration & config, - const String & config_prefix, - ContextPtr context, + const std::string & endpoint, const S3ObjectStorageSettings & settings, - bool for_disk_s3, - const S3::URI * url_ = nullptr); + ContextPtr context, + bool for_disk_s3); + +std::unique_ptr getClient( + const S3::URI & url_, + const S3ObjectStorageSettings & settings, + ContextPtr context, + bool for_disk_s3); } diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 8823af55936..9e001232e65 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -51,7 +51,7 @@ ReadBufferFromS3::ReadBufferFromS3( const String & bucket_, const String & key_, const String & version_id_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, const ReadSettings & settings_, bool use_external_buffer_, size_t offset_, @@ -318,7 +318,7 @@ size_t ReadBufferFromS3::getFileSize() if (file_size) return *file_size; - auto object_size = S3::getObjectSize(*client_ptr, bucket, key, version_id, request_settings); + auto object_size = S3::getObjectSize(*client_ptr, bucket, key, version_id); file_size = object_size; return *file_size; diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 003c88df7d2..c6625c2d632 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include "config.h" #if USE_AWS_S3 @@ -28,7 +28,7 @@ private: String bucket; String key; String version_id; - const S3Settings::RequestSettings request_settings; + const S3::RequestSettings request_settings; /// These variables are atomic because they can be used for `logging only` /// (where it is not important to get consistent result) @@ -47,7 +47,7 @@ public: const String & bucket_, const String & key_, const String & version_id_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, const ReadSettings & settings_, bool use_external_buffer = false, size_t offset_ = 0, diff --git a/src/IO/S3/Credentials.h b/src/IO/S3/Credentials.h index 8d586223035..b8698d9b302 100644 --- a/src/IO/S3/Credentials.h +++ b/src/IO/S3/Credentials.h @@ -13,18 +13,12 @@ # include # include +# include namespace DB::S3 { -inline static constexpr uint64_t DEFAULT_EXPIRATION_WINDOW_SECONDS = 120; -inline static constexpr uint64_t DEFAULT_CONNECT_TIMEOUT_MS = 1000; -inline static constexpr uint64_t DEFAULT_REQUEST_TIMEOUT_MS = 30000; -inline static constexpr uint64_t DEFAULT_MAX_CONNECTIONS = 100; -inline static constexpr uint64_t DEFAULT_KEEP_ALIVE_TIMEOUT = 5; -inline static constexpr uint64_t DEFAULT_KEEP_ALIVE_MAX_REQUESTS = 100; - /// In GCP metadata service can be accessed via DNS regardless of IPv4 or IPv6. 
static inline constexpr char GCP_METADATA_SERVICE_ENDPOINT[] = "http://metadata.google.internal"; diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index cff6fa5ad21..471c4a687a6 100644 --- a/src/IO/S3/copyS3File.cpp +++ b/src/IO/S3/copyS3File.cpp @@ -56,7 +56,7 @@ namespace const std::shared_ptr & client_ptr_, const String & dest_bucket_, const String & dest_key_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, const std::optional> & object_metadata_, ThreadPoolCallbackRunnerUnsafe schedule_, bool for_disk_s3_, @@ -66,7 +66,7 @@ namespace , dest_bucket(dest_bucket_) , dest_key(dest_key_) , request_settings(request_settings_) - , upload_settings(request_settings.getUploadSettings()) + , upload_settings(request_settings.upload_settings) , object_metadata(object_metadata_) , schedule(schedule_) , for_disk_s3(for_disk_s3_) @@ -81,8 +81,8 @@ namespace std::shared_ptr client_ptr; const String & dest_bucket; const String & dest_key; - const S3Settings::RequestSettings & request_settings; - const S3Settings::RequestSettings::PartUploadSettings & upload_settings; + const S3::RequestSettings & request_settings; + const S3::RequestSettings::PartUploadSettings & upload_settings; const std::optional> & object_metadata; ThreadPoolCallbackRunnerUnsafe schedule; bool for_disk_s3; @@ -239,7 +239,7 @@ namespace void checkObjectAfterUpload() { LOG_TRACE(log, "Checking object {} exists after upload", dest_key); - S3::checkObjectExists(*client_ptr, dest_bucket, dest_key, {}, request_settings, "Immediately after upload"); + S3::checkObjectExists(*client_ptr, dest_bucket, dest_key, {}, "Immediately after upload"); LOG_TRACE(log, "Object {} exists after upload", dest_key); } @@ -465,7 +465,7 @@ namespace const std::shared_ptr & client_ptr_, const String & dest_bucket_, const String & dest_key_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, const std::optional> & object_metadata_, ThreadPoolCallbackRunnerUnsafe schedule_, bool for_disk_s3_, @@ -647,7 +647,7 @@ namespace size_t src_size_, const String & dest_bucket_, const String & dest_key_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, const ReadSettings & read_settings_, const std::optional> & object_metadata_, ThreadPoolCallbackRunnerUnsafe schedule_, @@ -853,7 +853,7 @@ void copyDataToS3File( const std::shared_ptr & dest_s3_client, const String & dest_bucket, const String & dest_key, - const S3Settings::RequestSettings & settings, + const S3::RequestSettings & settings, BlobStorageLogWriterPtr blob_storage_log, const std::optional> & object_metadata, ThreadPoolCallbackRunnerUnsafe schedule, @@ -872,7 +872,7 @@ void copyS3File( size_t src_size, const String & dest_bucket, const String & dest_key, - const S3Settings::RequestSettings & settings, + const S3::RequestSettings & settings, const ReadSettings & read_settings, BlobStorageLogWriterPtr blob_storage_log, const std::optional> & object_metadata, diff --git a/src/IO/S3/copyS3File.h b/src/IO/S3/copyS3File.h index d5da4d260b1..c2cd8735188 100644 --- a/src/IO/S3/copyS3File.h +++ b/src/IO/S3/copyS3File.h @@ -4,7 +4,7 @@ #if USE_AWS_S3 -#include +#include #include #include #include @@ -38,7 +38,7 @@ void copyS3File( size_t src_size, const String & dest_bucket, const String & dest_key, - const S3Settings::RequestSettings & settings, + const S3::RequestSettings & settings, const ReadSettings & read_settings, 
BlobStorageLogWriterPtr blob_storage_log, const std::optional> & object_metadata = std::nullopt, @@ -57,7 +57,7 @@ void copyDataToS3File( const std::shared_ptr & dest_s3_client, const String & dest_bucket, const String & dest_key, - const S3Settings::RequestSettings & settings, + const S3::RequestSettings & settings, BlobStorageLogWriterPtr blob_storage_log, const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunnerUnsafe schedule_ = {}, diff --git a/src/IO/S3/getObjectInfo.cpp b/src/IO/S3/getObjectInfo.cpp index 78efda4ae57..9271ad820e4 100644 --- a/src/IO/S3/getObjectInfo.cpp +++ b/src/IO/S3/getObjectInfo.cpp @@ -44,7 +44,7 @@ namespace /// Performs a request to get the size and last modification time of an object. std::pair, Aws::S3::S3Error> tryGetObjectInfo( const S3::Client & client, const String & bucket, const String & key, const String & version_id, - const S3Settings::RequestSettings & /*request_settings*/, bool with_metadata) + bool with_metadata) { auto outcome = headObject(client, bucket, key, version_id); if (!outcome.IsSuccess()) @@ -73,11 +73,10 @@ ObjectInfo getObjectInfo( const String & bucket, const String & key, const String & version_id, - const S3Settings::RequestSettings & request_settings, bool with_metadata, bool throw_on_error) { - auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, request_settings, with_metadata); + auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, with_metadata); if (object_info) { return *object_info; @@ -96,20 +95,18 @@ size_t getObjectSize( const String & bucket, const String & key, const String & version_id, - const S3Settings::RequestSettings & request_settings, bool throw_on_error) { - return getObjectInfo(client, bucket, key, version_id, request_settings, {}, throw_on_error).size; + return getObjectInfo(client, bucket, key, version_id, {}, throw_on_error).size; } bool objectExists( const S3::Client & client, const String & bucket, const String & key, - const String & version_id, - const S3Settings::RequestSettings & request_settings) + const String & version_id) { - auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, request_settings, {}); + auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, {}); if (object_info) return true; @@ -126,10 +123,9 @@ void checkObjectExists( const String & bucket, const String & key, const String & version_id, - const S3Settings::RequestSettings & request_settings, std::string_view description) { - auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, request_settings, {}); + auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, {}); if (object_info) return; throw S3Exception(error.GetErrorType(), "{}Object {} in bucket {} suddenly disappeared: {}", diff --git a/src/IO/S3/getObjectInfo.h b/src/IO/S3/getObjectInfo.h index ac8072a4338..32f34f74069 100644 --- a/src/IO/S3/getObjectInfo.h +++ b/src/IO/S3/getObjectInfo.h @@ -3,7 +3,7 @@ #include "config.h" #if USE_AWS_S3 -#include +#include #include #include @@ -24,7 +24,6 @@ ObjectInfo getObjectInfo( const String & bucket, const String & key, const String & version_id = {}, - const S3Settings::RequestSettings & request_settings = {}, bool with_metadata = false, bool throw_on_error = true); @@ -33,15 +32,13 @@ size_t getObjectSize( const String & bucket, const String & key, const String & version_id = {}, - const S3Settings::RequestSettings & request_settings = {}, bool throw_on_error = 
true); bool objectExists( const S3::Client & client, const String & bucket, const String & key, - const String & version_id = {}, - const S3Settings::RequestSettings & request_settings = {}); + const String & version_id = {}); /// Throws an exception if a specified object doesn't exist. `description` is used as a part of the error message. void checkObjectExists( @@ -49,7 +46,6 @@ void checkObjectExists( const String & bucket, const String & key, const String & version_id = {}, - const S3Settings::RequestSettings & request_settings = {}, std::string_view description = {}); bool isNotFoundError(Aws::S3::S3Errors error); diff --git a/src/IO/S3/tests/gtest_aws_s3_client.cpp b/src/IO/S3/tests/gtest_aws_s3_client.cpp index 0a28c578f69..5ee9648a44e 100644 --- a/src/IO/S3/tests/gtest_aws_s3_client.cpp +++ b/src/IO/S3/tests/gtest_aws_s3_client.cpp @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include "TestPocoHTTPServer.h" @@ -69,7 +69,7 @@ void doReadRequest(std::shared_ptr client, const DB::S3::U UInt64 max_single_read_retries = 1; DB::ReadSettings read_settings; - DB::S3Settings::RequestSettings request_settings; + DB::S3::RequestSettings request_settings; request_settings.max_single_read_retries = max_single_read_retries; DB::ReadBufferFromS3 read_buffer( client, @@ -88,7 +88,7 @@ void doWriteRequest(std::shared_ptr client, const DB::S3:: { UInt64 max_unexpected_write_error_retries = 1; - DB::S3Settings::RequestSettings request_settings; + DB::S3::RequestSettings request_settings; request_settings.max_unexpected_write_error_retries = max_unexpected_write_error_retries; DB::WriteBufferFromS3 write_buffer( client, diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 78c51fcb29c..8a01f6ca33a 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -8,11 +8,13 @@ #if USE_AWS_S3 -# include -# include -# include -# include -# include +#include +#include +#include +#include +#include +#include +#include namespace ProfileEvents @@ -48,6 +50,7 @@ bool S3Exception::isRetryableError() const namespace DB::ErrorCodes { extern const int S3_ERROR; + extern const int INVALID_SETTING_VALUE; } #endif @@ -98,61 +101,90 @@ ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, c return sse_kms_config; } -AuthSettings AuthSettings::loadFromConfig(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config) +AuthSettings AuthSettings::loadFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings, + const std::string & setting_name_prefix) { - auto access_key_id = config.getString(config_elem + ".access_key_id", ""); - auto secret_access_key = config.getString(config_elem + ".secret_access_key", ""); - auto session_token = config.getString(config_elem + ".session_token", ""); + auto auth_settings = AuthSettings::loadFromSettings(settings); - auto region = config.getString(config_elem + ".region", ""); - auto server_side_encryption_customer_key_base64 = config.getString(config_elem + ".server_side_encryption_customer_key_base64", ""); + const std::string prefix = config_prefix + "." 
+ setting_name_prefix; + auto has = [&](const std::string & key) -> bool { return config.has(prefix + key); }; + auto get_uint = [&](const std::string & key) -> size_t { return config.getUInt64(prefix + key); }; + auto get_bool = [&](const std::string & key) -> bool { return config.getBool(prefix + key); }; + auto get_string = [&](const std::string & key) -> std::string { return config.getString(prefix + key); }; - std::optional use_environment_credentials; - if (config.has(config_elem + ".use_environment_credentials")) - use_environment_credentials = config.getBool(config_elem + ".use_environment_credentials"); + if (has("access_key_id")) + auth_settings.access_key_id = get_string("access_key_id"); + if (has("secret_access_key")) + auth_settings.secret_access_key = get_string("secret_access_key"); + if (has("session_token")) + auth_settings.session_token = get_string("session_token"); - std::optional use_insecure_imds_request; - if (config.has(config_elem + ".use_insecure_imds_request")) - use_insecure_imds_request = config.getBool(config_elem + ".use_insecure_imds_request"); + if (has("region")) + auth_settings.region = get_string("region"); + if (has("server_side_encryption_customer_key_base64")) + auth_settings.server_side_encryption_customer_key_base64 = get_string("server_side_encryption_customer_key_base64"); - std::optional expiration_window_seconds; - if (config.has(config_elem + ".expiration_window_seconds")) - expiration_window_seconds = config.getUInt64(config_elem + ".expiration_window_seconds"); + if (has("connect_timeout_ms")) + auth_settings.connect_timeout_ms = get_uint("connect_timeout_ms"); + if (has("request_timeout_ms")) + auth_settings.request_timeout_ms = get_uint("request_timeout_ms"); + if (has("max_connections")) + auth_settings.max_connections = get_uint("max_connections"); - std::optional no_sign_request; - if (config.has(config_elem + ".no_sign_request")) - no_sign_request = config.getBool(config_elem + ".no_sign_request"); + if (has("http_keep_alive_timeout")) + auth_settings.http_keep_alive_timeout = get_uint("http_keep_alive_timeout"); + if (has("http_keep_alive_max_requests")) + auth_settings.http_keep_alive_max_requests = get_uint("http_keep_alive_max_requests"); - HTTPHeaderEntries headers = getHTTPHeaders(config_elem, config); - ServerSideEncryptionKMSConfig sse_kms_config = getSSEKMSConfig(config_elem, config); + if (has("use_environment_credentials")) + auth_settings.use_environment_credentials = get_bool("use_environment_credentials"); + if (has("use_adaptive_timeouts")) + auth_settings.use_adaptive_timeouts = get_bool("use_adaptive_timeouts"); + if (has("no_sign_request")) + auth_settings.no_sign_request = get_bool("no_sign_request"); + if (has("expiration_window_seconds")) + auth_settings.expiration_window_seconds = get_uint("expiration_window_seconds"); + if (has("gcs_issue_compose_request")) + auth_settings.gcs_issue_compose_request = get_bool("gcs_issue_compose_request"); + if (has("use_insecure_imds_request")) + auth_settings.use_insecure_imds_request = get_bool("use_insecure_imds_request"); + + auth_settings.headers = getHTTPHeaders(config_prefix, config); + auth_settings.server_side_encryption_kms_config = getSSEKMSConfig(config_prefix, config); - std::unordered_set users; Poco::Util::AbstractConfiguration::Keys keys; - config.keys(config_elem, keys); + config.keys(config_prefix, keys); for (const auto & key : keys) { if (startsWith(key, "user")) - users.insert(config.getString(config_elem + "." + key)); + auth_settings.users.insert(config.getString(config_prefix + "."
+ key)); } - return AuthSettings - { - std::move(access_key_id), std::move(secret_access_key), std::move(session_token), - std::move(region), - std::move(server_side_encryption_customer_key_base64), - std::move(sse_kms_config), - std::move(headers), - use_environment_credentials, - use_insecure_imds_request, - expiration_window_seconds, - no_sign_request, - std::move(users) - }; + return auth_settings; } -bool AuthSettings::canBeUsedByUser(const String & user) const +AuthSettings AuthSettings::loadFromSettings(const DB::Settings & settings) { - return users.empty() || users.contains(user); + AuthSettings auth_settings{}; + auth_settings.updateFromSettings(settings, /* if_changed */false); + return auth_settings; +} + +void AuthSettings::updateFromSettings(const DB::Settings & settings, bool if_changed) +{ + if (!if_changed || settings.s3_connect_timeout_ms.changed) + connect_timeout_ms = settings.s3_connect_timeout_ms; + if (!if_changed || settings.s3_request_timeout_ms.changed) + request_timeout_ms = settings.s3_request_timeout_ms; + if (!if_changed || settings.s3_max_connections.changed) + max_connections = settings.s3_max_connections; + if (!if_changed || settings.s3_use_adaptive_timeouts.changed) + use_adaptive_timeouts = settings.s3_use_adaptive_timeouts; + if (!if_changed || settings.s3_disable_checksum.changed) + disable_checksum = settings.s3_disable_checksum; } bool AuthSettings::hasUpdates(const AuthSettings & other) const @@ -183,7 +215,7 @@ void AuthSettings::updateFrom(const AuthSettings & from) server_side_encryption_kms_config = from.server_side_encryption_kms_config; if (from.use_environment_credentials.has_value()) - use_environment_credentials = from.use_environment_credentials; + use_environment_credentials = from.use_environment_credentials; if (from.use_insecure_imds_request.has_value()) use_insecure_imds_request = from.use_insecure_imds_request; @@ -197,5 +229,264 @@ void AuthSettings::updateFrom(const AuthSettings & from) users.insert(from.users.begin(), from.users.end()); } +RequestSettings RequestSettings::loadFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings, + bool validate_settings, + const std::string & setting_name_prefix) +{ + auto request_settings = RequestSettings::loadFromSettings(settings, validate_settings); + + String prefix = config_prefix + "." 
+ setting_name_prefix; + auto has = [&](const std::string & key) -> bool { return config.has(prefix + key); }; + auto get_uint = [&](const std::string & key) -> size_t { return config.getUInt64(prefix + key); }; + auto get_string = [&](const std::string & key) -> std::string { return config.getString(prefix + key); }; + auto get_bool = [&](const std::string & key) -> bool { return config.getBool(prefix + key); }; + + if (has("strict_upload_part_size")) + request_settings.upload_settings.strict_upload_part_size = get_uint("strict_upload_part_size"); + if (has("min_upload_part_size")) + request_settings.upload_settings.min_upload_part_size = get_uint("min_upload_part_size"); + if (has("max_upload_part_size")) + request_settings.upload_settings.max_upload_part_size = get_uint("max_upload_part_size"); + if (has("upload_part_size_multiply_factor")) + request_settings.upload_settings.upload_part_size_multiply_factor = get_uint("upload_part_size_multiply_factor"); + if (has("upload_part_size_multiply_parts_count_threshold")) + request_settings.upload_settings.upload_part_size_multiply_parts_count_threshold = get_uint("upload_part_size_multiply_parts_count_threshold"); + if (has("max_inflight_parts_for_one_file")) + request_settings.upload_settings.max_inflight_parts_for_one_file = get_uint("max_inflight_parts_for_one_file"); + if (has("max_part_number")) + request_settings.upload_settings.max_part_number = get_uint("max_part_number"); + if (has("max_single_part_upload_size")) + request_settings.upload_settings.max_single_part_upload_size = get_uint("max_single_part_upload_size"); + if (has("max_single_operation_copy_size")) + request_settings.upload_settings.max_single_operation_copy_size = get_uint("max_single_operation_copy_size"); + if (has("s3_storage_class")) + request_settings.upload_settings.storage_class_name = get_string("s3_storage_class"); + + request_settings.upload_settings.storage_class_name = Poco::toUpperInPlace(request_settings.upload_settings.storage_class_name); + if (validate_settings) + request_settings.upload_settings.validate(); + + if (has("max_single_read_retries")) + request_settings.max_single_read_retries = get_uint("max_single_read_retries"); + if (has("check_objects_after_upload")) + request_settings.check_objects_after_upload = get_bool("check_objects_after_upload"); + if (has("list_object_keys_size")) + request_settings.list_object_keys_size = get_uint("list_object_keys_size"); + if (has("allow_native_copy")) + request_settings.allow_native_copy = get_bool("allow_native_copy"); + if (has("throw_on_zero_files_match")) + request_settings.throw_on_zero_files_match = get_bool("throw_on_zero_files_match"); + if (has("request_timeout_ms")) + request_settings.request_timeout_ms = get_uint("request_timeout_ms"); + + /// NOTE: it would be better to reuse old throttlers + /// to avoid losing token bucket state on every config reload, + /// which could lead to exceeding limit for short time. + /// But it is good enough unless very high `burst` values are used. + if (UInt64 max_get_rps = has("max_get_rps") ? get_uint("max_get_rps") : settings.s3_max_get_rps) + { + size_t default_max_get_burst = settings.s3_max_get_burst + ? settings.s3_max_get_burst + : (Throttler::default_burst_seconds * max_get_rps); + size_t max_get_burst = has("max_get_burst") ? get_uint("max_get_burst") : default_max_get_burst; + request_settings.get_request_throttler = std::make_shared(max_get_rps, max_get_burst); + } + if (UInt64 max_put_rps = has("max_put_rps") ? 
get_uint("max_put_rps") : settings.s3_max_put_rps) + { + size_t default_max_put_burst = settings.s3_max_put_burst + ? settings.s3_max_put_burst + : (Throttler::default_burst_seconds * max_put_rps); + size_t max_put_burst = has("max_put_burst") ? get_uint("max_put_burst") : default_max_put_burst; + request_settings.put_request_throttler = std::make_shared(max_put_rps, max_put_burst); + } + return request_settings; } + +RequestSettings RequestSettings::loadFromNamedCollection(const NamedCollection & collection, bool validate_settings) +{ + RequestSettings settings{}; + + if (collection.has("strict_upload_part_size")) + settings.upload_settings.strict_upload_part_size = collection.get("strict_upload_part_size"); + if (collection.has("min_upload_part_size")) + settings.upload_settings.min_upload_part_size = collection.get("min_upload_part_size"); + if (collection.has("max_upload_part_size")) + settings.upload_settings.max_upload_part_size = collection.get("max_upload_part_size"); + if (collection.has("upload_part_size_multiply_factor")) + settings.upload_settings.upload_part_size_multiply_factor = collection.get("upload_part_size_multiply_factor"); + if (collection.has("upload_part_size_multiply_parts_count_threshold")) + settings.upload_settings.upload_part_size_multiply_parts_count_threshold = collection.get("upload_part_size_multiply_parts_count_threshold"); + if (collection.has("max_inflight_parts_for_one_file")) + settings.upload_settings.max_inflight_parts_for_one_file = collection.get("max_inflight_parts_for_one_file"); + if (collection.has("max_part_number")) + settings.upload_settings.max_part_number = collection.get("max_part_number"); + if (collection.has("max_single_part_upload_size")) + settings.upload_settings.max_single_part_upload_size = collection.get("max_single_part_upload_size"); + if (collection.has("max_single_operation_copy_size")) + settings.upload_settings.max_single_operation_copy_size = collection.get("max_single_operation_copy_size"); + if (collection.has("s3_storage_class")) + settings.upload_settings.storage_class_name = collection.get("s3_storage_class"); + + settings.upload_settings.storage_class_name = Poco::toUpperInPlace(settings.upload_settings.storage_class_name); + if (validate_settings) + settings.upload_settings.validate(); + + if (collection.has("max_single_read_retries")) + settings.max_single_read_retries = collection.get("max_single_read_retries"); + if (collection.has("list_object_keys_size")) + settings.list_object_keys_size = collection.get("list_object_keys_size"); + if (collection.has("allow_native_copy")) + settings.allow_native_copy = collection.get("allow_native_copy"); + if (collection.has("throw_on_zero_files_match")) + settings.throw_on_zero_files_match = collection.get("throw_on_zero_files_match"); + + return settings; +} + +RequestSettings RequestSettings::loadFromSettings(const DB::Settings & settings, bool validate_settings) +{ + RequestSettings request_settings{}; + request_settings.updateFromSettings(settings, /* if_changed */false, validate_settings); + return request_settings; +} + +void RequestSettings::updateFromSettings(const DB::Settings & settings, bool if_changed, bool validate_settings) +{ + if (!if_changed || settings.s3_strict_upload_part_size.changed) + upload_settings.strict_upload_part_size = settings.s3_strict_upload_part_size; + if (!if_changed || settings.s3_min_upload_part_size.changed) + upload_settings.min_upload_part_size = settings.s3_min_upload_part_size; + if (!if_changed ||
settings.s3_max_upload_part_size.changed) + upload_settings.max_upload_part_size = settings.s3_max_upload_part_size; + if (!if_changed || settings.s3_upload_part_size_multiply_factor.changed) + upload_settings.upload_part_size_multiply_factor = settings.s3_upload_part_size_multiply_factor; + if (!if_changed || settings.s3_upload_part_size_multiply_parts_count_threshold.changed) + upload_settings.upload_part_size_multiply_parts_count_threshold = settings.s3_upload_part_size_multiply_parts_count_threshold; + if (!if_changed || settings.s3_max_inflight_parts_for_one_file.changed) + upload_settings.max_inflight_parts_for_one_file = settings.s3_max_inflight_parts_for_one_file; + if (!if_changed || settings.s3_max_part_number.changed) + upload_settings.max_part_number = settings.s3_max_part_number; + if (!if_changed || settings.s3_max_single_part_upload_size.changed) + upload_settings.max_single_part_upload_size = settings.s3_max_single_part_upload_size; + if (!if_changed || settings.s3_max_single_operation_copy_size.changed) + upload_settings.max_single_operation_copy_size = settings.s3_max_single_operation_copy_size; + + if (validate_settings) + upload_settings.validate(); + + if (!if_changed || settings.s3_max_single_read_retries.changed) + max_single_read_retries = settings.s3_max_single_read_retries; + if (!if_changed || settings.s3_check_objects_after_upload.changed) + check_objects_after_upload = settings.s3_check_objects_after_upload; + if (!if_changed || settings.s3_max_unexpected_write_error_retries.changed) + max_unexpected_write_error_retries = settings.s3_max_unexpected_write_error_retries; + if (!if_changed || settings.s3_list_object_keys_size.changed) + list_object_keys_size = settings.s3_list_object_keys_size; + if (!if_changed || settings.s3_throw_on_zero_files_match.changed) + throw_on_zero_files_match = settings.s3_throw_on_zero_files_match; + if (!if_changed || settings.s3_request_timeout_ms.changed) + request_timeout_ms = settings.s3_request_timeout_ms; + + if ((!if_changed || settings.s3_max_get_rps.changed || settings.s3_max_get_burst.changed) && settings.s3_max_get_rps) + { + size_t max_get_burst = settings.s3_max_get_burst + ? settings.s3_max_get_burst + : Throttler::default_burst_seconds * settings.s3_max_get_rps; + get_request_throttler = std::make_shared(settings.s3_max_get_rps, max_get_burst); + } + if ((!if_changed || settings.s3_max_put_rps.changed || settings.s3_max_put_burst.changed) && settings.s3_max_put_rps) + { + size_t max_put_burst = settings.s3_max_put_burst + ?
settings.s3_max_put_burst + : Throttler::default_burst_seconds * settings.s3_max_put_rps; + put_request_throttler = std::make_shared(settings.s3_max_put_rps, max_put_burst); + } +} + +void RequestSettings::PartUploadSettings::validate() +{ + static constexpr size_t min_upload_part_size_limit = 5 * 1024 * 1024; + if (strict_upload_part_size && strict_upload_part_size < min_upload_part_size_limit) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting strict_upload_part_size has invalid value {} which is less than the s3 API limit {}", + ReadableSize(strict_upload_part_size), ReadableSize(min_upload_part_size_limit)); + + if (min_upload_part_size < min_upload_part_size_limit) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting min_upload_part_size has invalid value {} which is less than the s3 API limit {}", + ReadableSize(min_upload_part_size), ReadableSize(min_upload_part_size_limit)); + + static constexpr size_t max_upload_part_size_limit = 5ull * 1024 * 1024 * 1024; + if (max_upload_part_size > max_upload_part_size_limit) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting max_upload_part_size has invalid value {} which is greater than the s3 API limit {}", + ReadableSize(max_upload_part_size), ReadableSize(max_upload_part_size_limit)); + + if (max_single_part_upload_size > max_upload_part_size_limit) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting max_single_part_upload_size has invalid value {} which is greater than the s3 API limit {}", + ReadableSize(max_single_part_upload_size), ReadableSize(max_upload_part_size_limit)); + + if (max_single_operation_copy_size > max_upload_part_size_limit) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting max_single_operation_copy_size has invalid value {} which is greater than the s3 API limit {}", + ReadableSize(max_single_operation_copy_size), ReadableSize(max_upload_part_size_limit)); + + if (max_upload_part_size < min_upload_part_size) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting max_upload_part_size ({}) can't be less than setting min_upload_part_size {}", + ReadableSize(max_upload_part_size), ReadableSize(min_upload_part_size)); + + if (!upload_part_size_multiply_factor) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting upload_part_size_multiply_factor cannot be zero"); + + if (!upload_part_size_multiply_parts_count_threshold) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting upload_part_size_multiply_parts_count_threshold cannot be zero"); + + if (!max_part_number) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting max_part_number cannot be zero"); + + static constexpr size_t max_part_number_limit = 10000; + if (max_part_number > max_part_number_limit) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting max_part_number has invalid value {} which is greater than the s3 API limit {}", + ReadableSize(max_part_number), ReadableSize(max_part_number_limit)); + + size_t maybe_overflow; + if (common::mulOverflow(max_upload_part_size, upload_part_size_multiply_factor, maybe_overflow)) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting upload_part_size_multiply_factor is too big ({}).
" + "Multiplication to max_upload_part_size ({}) will cause integer overflow", + ReadableSize(max_part_number), ReadableSize(max_part_number_limit)); + + std::unordered_set storage_class_names {"STANDARD", "INTELLIGENT_TIERING"}; + if (!storage_class_name.empty() && !storage_class_names.contains(storage_class_name)) + throw Exception( + ErrorCodes::INVALID_SETTING_VALUE, + "Setting storage_class has invalid value {} which only supports STANDARD and INTELLIGENT_TIERING", + storage_class_name); + + /// TODO: it's possible to set too small limits. + /// We can check that max possible object size is not too small. +} + +} + } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index b3e01bd6132..01a7ca56779 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -4,10 +4,6 @@ #include #include -#include -#include -#include - #include "config.h" #if USE_AWS_S3 @@ -15,12 +11,15 @@ #include #include #include +#include +#include #include #include #include #include +#include namespace DB { @@ -31,6 +30,7 @@ namespace ErrorCodes } class RemoteHostFilter; +class NamedCollection; class S3Exception : public Exception { @@ -77,31 +77,93 @@ ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, c struct AuthSettings { - static AuthSettings loadFromConfig(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config); - std::string access_key_id; std::string secret_access_key; std::string session_token; std::string region; std::string server_side_encryption_customer_key_base64; - ServerSideEncryptionKMSConfig server_side_encryption_kms_config; HTTPHeaderEntries headers; + std::unordered_set users; + ServerSideEncryptionKMSConfig server_side_encryption_kms_config; + + std::optional connect_timeout_ms; + std::optional request_timeout_ms; + std::optional max_connections; + std::optional http_keep_alive_timeout; + std::optional http_keep_alive_max_requests; + std::optional expiration_window_seconds; std::optional use_environment_credentials; - std::optional use_insecure_imds_request; - std::optional expiration_window_seconds; std::optional no_sign_request; - - std::unordered_set users; + std::optional use_adaptive_timeouts; + std::optional use_insecure_imds_request; + std::optional is_virtual_hosted_style; + std::optional disable_checksum; + std::optional gcs_issue_compose_request; bool hasUpdates(const AuthSettings & other) const; void updateFrom(const AuthSettings & from); - bool canBeUsedByUser(const String & user) const; + bool canBeUsedByUser(const String & user) const { return users.empty() || users.contains(user); } + + static AuthSettings loadFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings, + const std::string & setting_name_prefix = ""); + + static AuthSettings loadFromSettings(const DB::Settings & settings); + + static AuthSettings loadFromNamedCollection(const NamedCollection & collection); + + void updateFromSettings(const DB::Settings & settings, bool if_changed); private: bool operator==(const AuthSettings & other) const = default; }; +struct RequestSettings +{ + size_t max_single_read_retries = 4; + size_t request_timeout_ms = 30000; + size_t max_unexpected_write_error_retries = 4; + size_t list_object_keys_size = 1000; + + bool allow_native_copy = true; + bool check_objects_after_upload = false; + bool throw_on_zero_files_match = false; + + struct PartUploadSettings + { + size_t strict_upload_part_size = 0; + size_t min_upload_part_size = 16 * 1024 * 1024; + size_t 
max_upload_part_size = 5ULL * 1024 * 1024 * 1024; + size_t upload_part_size_multiply_factor = 2; + size_t upload_part_size_multiply_parts_count_threshold = 500; + size_t max_inflight_parts_for_one_file = 20; + size_t max_part_number = 10000; + size_t max_single_part_upload_size = 32 * 1024 * 1024; + size_t max_single_operation_copy_size = 5ULL * 1024 * 1024 * 1024; + String storage_class_name; + + void validate(); + }; + + PartUploadSettings upload_settings; + ThrottlerPtr get_request_throttler; + ThrottlerPtr put_request_throttler; + + static RequestSettings loadFromSettings(const DB::Settings & settings, bool validate_settings = true); + static RequestSettings loadFromNamedCollection(const NamedCollection & collection, bool validate_settings = true); + static RequestSettings loadFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings, + bool validate_settings = true, + const std::string & setting_name_prefix = ""); + + void updateFromSettings(const DB::Settings & settings, bool if_changed, bool validate_settings = true); +}; + } diff --git a/src/IO/S3Defines.h b/src/IO/S3Defines.h new file mode 100644 index 00000000000..ad01920adda --- /dev/null +++ b/src/IO/S3Defines.h @@ -0,0 +1,32 @@ +#pragma once +#include + +namespace DB::S3 +{ + +/// Client settings. +inline static constexpr uint64_t DEFAULT_EXPIRATION_WINDOW_SECONDS = 120; +inline static constexpr uint64_t DEFAULT_CONNECT_TIMEOUT_MS = 1000; +inline static constexpr uint64_t DEFAULT_REQUEST_TIMEOUT_MS = 30000; +inline static constexpr uint64_t DEFAULT_MAX_CONNECTIONS = 1024; +inline static constexpr uint64_t DEFAULT_KEEP_ALIVE_TIMEOUT = 5; +inline static constexpr uint64_t DEFAULT_KEEP_ALIVE_MAX_REQUESTS = 100; + +inline static constexpr bool DEFAULT_USE_ENVIRONMENT_CREDENTIALS = true; +inline static constexpr bool DEFAULT_NO_SIGN_REQUEST = false; +inline static constexpr bool DEFAULT_DISABLE_CHECKSUM = false; +inline static constexpr bool DEFAULT_USE_ADAPTIVE_TIMEOUTS = true; + +/// Upload settings. +inline static constexpr uint64_t DEFAULT_MIN_UPLOAD_PART_SIZE = 16 * 1024 * 1024; +inline static constexpr uint64_t DEFAULT_MAX_UPLOAD_PART_SIZE = 5ull * 1024 * 1024 * 1024; +inline static constexpr uint64_t DEFAULT_MAX_SINGLE_PART_UPLOAD_SIZE = 32 * 1024 * 1024; +inline static constexpr uint64_t DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_FACTOR = 2; +inline static constexpr uint64_t DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_PARTS_COUNT_THRESHOLD = 500; +inline static constexpr uint64_t DEFAULT_MAX_PART_NUMBER = 10000; + +/// Other settings. +inline static constexpr uint64_t DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE = 32 * 1024 * 1024; +inline static constexpr uint64_t DEFAULT_MAX_INFLIGHT_PARTS_FOR_ONE_FILE = 20; + +} diff --git a/src/IO/S3Settings.cpp b/src/IO/S3Settings.cpp new file mode 100644 index 00000000000..6a7b2ea5627 --- /dev/null +++ b/src/IO/S3Settings.cpp @@ -0,0 +1,62 @@ +#include + +#include + +#include +#include +#include +#include +#include + + +namespace DB +{ + +void S3SettingsByEndpoint::loadFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings) +{ + std::lock_guard lock(mutex); + s3_settings.clear(); + if (!config.has(config_prefix)) + return; + + Poco::Util::AbstractConfiguration::Keys config_keys; + config.keys(config_prefix, config_keys); + + for (const String & key : config_keys) + { + const auto key_path = config_prefix + "." 
+ key; + const auto endpoint_path = key_path + ".endpoint"; + if (config.has(endpoint_path)) + { + auto endpoint = config.getString(endpoint_path); + auto auth_settings = S3::AuthSettings::loadFromConfig(config, key_path, settings); + auto request_settings = S3::RequestSettings::loadFromConfig(config, key_path, settings); + s3_settings.emplace(endpoint, S3Settings{std::move(auth_settings), std::move(request_settings)}); + } + } +} + +std::optional S3SettingsByEndpoint::getSettings( + const String & endpoint, + const String & user, + bool ignore_user) const +{ + std::lock_guard lock(mutex); + auto next_prefix_setting = s3_settings.upper_bound(endpoint); + + /// Linear time algorithm may be replaced with logarithmic with prefix tree map. + for (auto possible_prefix_setting = next_prefix_setting; possible_prefix_setting != s3_settings.begin();) + { + std::advance(possible_prefix_setting, -1); + const auto & [endpoint_prefix, settings] = *possible_prefix_setting; + if (endpoint.starts_with(endpoint_prefix) && (ignore_user || settings.auth_settings.canBeUsedByUser(user))) + return possible_prefix_setting->second; + } + + return {}; +} + +} diff --git a/src/IO/S3Settings.h b/src/IO/S3Settings.h new file mode 100644 index 00000000000..58e12e48002 --- /dev/null +++ b/src/IO/S3Settings.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace Poco::Util { class AbstractConfiguration; } + +namespace DB +{ + +struct Settings; + +struct S3Settings +{ + S3::AuthSettings auth_settings; + S3::RequestSettings request_settings; +}; + +class S3SettingsByEndpoint +{ +public: + void loadFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings); + + std::optional getSettings( + const std::string & endpoint, + const std::string & user, + bool ignore_user = false) const; + +private: + mutable std::mutex mutex; + std::map s3_settings; +}; + + +} diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index ff18a77f09f..b83ca174820 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -72,7 +72,7 @@ struct WriteBufferFromS3::PartData } }; -BufferAllocationPolicyPtr createBufferAllocationPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings) +BufferAllocationPolicyPtr createBufferAllocationPolicy(const S3::RequestSettings::PartUploadSettings & settings) { BufferAllocationPolicy::Settings allocation_settings; allocation_settings.strict_size = settings.strict_upload_part_size; @@ -91,7 +91,7 @@ WriteBufferFromS3::WriteBufferFromS3( const String & bucket_, const String & key_, size_t buf_size_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, BlobStorageLogWriterPtr blob_log_, std::optional> object_metadata_, ThreadPoolCallbackRunnerUnsafe schedule_, @@ -100,7 +100,7 @@ WriteBufferFromS3::WriteBufferFromS3( , bucket(bucket_) , key(key_) , request_settings(request_settings_) - , upload_settings(request_settings.getUploadSettings()) + , upload_settings(request_settings.upload_settings) , write_settings(write_settings_) , client_ptr(std::move(client_ptr_)) , object_metadata(std::move(object_metadata_)) @@ -214,9 +214,9 @@ void WriteBufferFromS3::finalizeImpl() if (request_settings.check_objects_after_upload) { - S3::checkObjectExists(*client_ptr, bucket, key, {}, request_settings, "Immediately after upload"); + S3::checkObjectExists(*client_ptr, bucket, key, 
{}, "Immediately after upload"); - size_t actual_size = S3::getObjectSize(*client_ptr, bucket, key, {}, request_settings); + size_t actual_size = S3::getObjectSize(*client_ptr, bucket, key, {}); if (actual_size != total_size) throw Exception( ErrorCodes::S3_ERROR, diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index fbfec3588fa..d4e25ea1733 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -38,7 +38,7 @@ public: const String & bucket_, const String & key_, size_t buf_size_, - const S3Settings::RequestSettings & request_settings_, + const S3::RequestSettings & request_settings_, BlobStorageLogWriterPtr blob_log_, std::optional> object_metadata_ = std::nullopt, ThreadPoolCallbackRunnerUnsafe schedule_ = {}, @@ -78,8 +78,8 @@ private: const String bucket; const String key; - const S3Settings::RequestSettings request_settings; - const S3Settings::RequestSettings::PartUploadSettings & upload_settings; + const S3::RequestSettings request_settings; + const S3::RequestSettings::PartUploadSettings & upload_settings; const WriteSettings write_settings; const std::shared_ptr client_ptr; const std::optional> object_metadata; diff --git a/src/IO/tests/gtest_writebuffer_s3.cpp b/src/IO/tests/gtest_writebuffer_s3.cpp index 4a4d7cc0fc2..1e1fdc02060 100644 --- a/src/IO/tests/gtest_writebuffer_s3.cpp +++ b/src/IO/tests/gtest_writebuffer_s3.cpp @@ -546,8 +546,8 @@ public: std::unique_ptr getWriteBuffer(String file_name = "file") { - S3Settings::RequestSettings request_settings; - request_settings.updateFromSettingsIfChanged(settings); + S3::RequestSettings request_settings; + request_settings.updateFromSettings(settings, /* if_changed */true); client->resetCounters(); diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index e1d82a8f604..738f4e9280e 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -31,7 +31,7 @@ #include #include #include -#include +#include #include #include #include @@ -370,7 +370,7 @@ struct ContextSharedPart : boost::noncopyable ActionLocksManagerPtr action_locks_manager; /// Set of storages' action lockers OnceFlag system_logs_initialized; std::unique_ptr system_logs TSA_GUARDED_BY(mutex); /// Used to log queries and operations on parts - std::optional storage_s3_settings TSA_GUARDED_BY(mutex); /// Settings of S3 storage + std::optional storage_s3_settings TSA_GUARDED_BY(mutex); /// Settings of S3 storage std::vector warnings TSA_GUARDED_BY(mutex); /// Store warning messages about server configuration. 
/// Background executors for *MergeTree tables @@ -4264,7 +4264,7 @@ void Context::updateStorageConfiguration(const Poco::Util::AbstractConfiguration { std::lock_guard lock(shared->mutex); if (shared->storage_s3_settings) - shared->storage_s3_settings->loadFromConfig("s3", config, getSettingsRef()); + shared->storage_s3_settings->loadFromConfig(config, /* config_prefix */"s3", getSettingsRef()); } } @@ -4316,14 +4316,14 @@ const DistributedSettings & Context::getDistributedSettings() const return *shared->distributed_settings; } -const StorageS3Settings & Context::getStorageS3Settings() const +const S3SettingsByEndpoint & Context::getStorageS3Settings() const { std::lock_guard lock(shared->mutex); if (!shared->storage_s3_settings) { const auto & config = shared->getConfigRefWithLock(lock); - shared->storage_s3_settings.emplace().loadFromConfig("s3", config, getSettingsRef()); + shared->storage_s3_settings.emplace().loadFromConfig(config, "s3", getSettingsRef()); } return *shared->storage_s3_settings; diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 814534f7035..0de24883e42 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -117,7 +117,7 @@ struct DistributedSettings; struct InitialAllRangesAnnouncement; struct ParallelReadRequest; struct ParallelReadResponse; -class StorageS3Settings; +class S3SettingsByEndpoint; class IDatabase; class DDLWorker; class ITableFunction; @@ -1107,7 +1107,7 @@ public: const MergeTreeSettings & getMergeTreeSettings() const; const MergeTreeSettings & getReplicatedMergeTreeSettings() const; const DistributedSettings & getDistributedSettings() const; - const StorageS3Settings & getStorageS3Settings() const; + const S3SettingsByEndpoint & getStorageS3Settings() const; /// Prevents DROP TABLE if its size is greater than max_size (50GB by default, max_size=0 turn off this check) void setMaxTableSizeToDrop(size_t max_size); diff --git a/src/Storages/ExternalDataSourceConfiguration.h b/src/Storages/ExternalDataSourceConfiguration.h index d4e737a7de1..c703c9ce999 100644 --- a/src/Storages/ExternalDataSourceConfiguration.h +++ b/src/Storages/ExternalDataSourceConfiguration.h @@ -2,7 +2,7 @@ #include #include -#include +#include #include diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 6b6cde0c431..45d54b62cbe 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -108,9 +108,9 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, const auto & settings = context->getSettingsRef(); const std::string config_prefix = "s3."; - auto s3_settings = getSettings(config, config_prefix, context, settings.s3_validate_request_settings); + auto s3_settings = getSettings(config, config_prefix, context, /* for_disk_s3 */false, settings.s3_validate_request_settings); - request_settings.updateFromSettingsIfChanged(settings); + request_settings.updateFromSettings(settings, /* if_changed */true); auth_settings.updateFrom(s3_settings->auth_settings); s3_settings->auth_settings = auth_settings; @@ -126,7 +126,7 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, if (auto endpoint_settings = context->getStorageS3Settings().getSettings(url.uri.toString(), context->getUserName())) s3_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); - auto client = getClient(config, config_prefix, context, *s3_settings, false, &url); + auto client = 
getClient(url, *s3_settings, context, /* for_disk_s3 */false); auto key_generator = createObjectStorageKeysGeneratorAsIsWithPrefix(url.key); auto s3_capabilities = S3Capabilities { @@ -159,7 +159,7 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); structure = collection.getOrDefault("structure", "auto"); - request_settings = S3Settings::RequestSettings(collection); + request_settings = S3::RequestSettings::loadFromNamedCollection(collection, /* validate_settings */true); static_configuration = !auth_settings.access_key_id.empty() || auth_settings.no_sign_request.has_value(); diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index 906d10a1a9a..5a952497851 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -3,7 +3,7 @@ #include "config.h" #if USE_AWS_S3 -#include +#include #include namespace DB @@ -58,7 +58,7 @@ private: std::vector keys; S3::AuthSettings auth_settings; - S3Settings::RequestSettings request_settings; + S3::RequestSettings request_settings; HTTPHeaderEntries headers_from_ast; /// Headers from ast is a part of static configuration. /// If s3 configuration was passed from ast, then it is static. /// If from config - it can be changed with config reload. diff --git a/src/Storages/S3Queue/S3QueueFilesMetadata.cpp b/src/Storages/S3Queue/S3QueueFilesMetadata.cpp index e1583b8329c..378fa941d09 100644 --- a/src/Storages/S3Queue/S3QueueFilesMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueFilesMetadata.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp deleted file mode 100644 index b767805f637..00000000000 --- a/src/Storages/StorageS3Settings.cpp +++ /dev/null @@ -1,315 +0,0 @@ -#include - -#include - -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int INVALID_SETTING_VALUE; -} - -S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings(const Settings & settings, bool validate_settings) -{ - updateFromSettings(settings, false); - if (validate_settings) - validate(); -} - -S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings( - const Poco::Util::AbstractConfiguration & config, - const String & config_prefix, - const Settings & settings, - String setting_name_prefix, - bool validate_settings) - : PartUploadSettings(settings, validate_settings) -{ - String key = config_prefix + "." 
+ setting_name_prefix; - strict_upload_part_size = config.getUInt64(key + "strict_upload_part_size", strict_upload_part_size); - min_upload_part_size = config.getUInt64(key + "min_upload_part_size", min_upload_part_size); - max_upload_part_size = config.getUInt64(key + "max_upload_part_size", max_upload_part_size); - upload_part_size_multiply_factor = config.getUInt64(key + "upload_part_size_multiply_factor", upload_part_size_multiply_factor); - upload_part_size_multiply_parts_count_threshold = config.getUInt64(key + "upload_part_size_multiply_parts_count_threshold", upload_part_size_multiply_parts_count_threshold); - max_inflight_parts_for_one_file = config.getUInt64(key + "max_inflight_parts_for_one_file", max_inflight_parts_for_one_file); - max_part_number = config.getUInt64(key + "max_part_number", max_part_number); - max_single_part_upload_size = config.getUInt64(key + "max_single_part_upload_size", max_single_part_upload_size); - max_single_operation_copy_size = config.getUInt64(key + "max_single_operation_copy_size", max_single_operation_copy_size); - - /// This configuration is only applicable to s3. Other types of object storage are not applicable or have different meanings. - storage_class_name = config.getString(config_prefix + ".s3_storage_class", storage_class_name); - storage_class_name = Poco::toUpperInPlace(storage_class_name); - - if (validate_settings) - validate(); -} - -S3Settings::RequestSettings::PartUploadSettings::PartUploadSettings(const NamedCollection & collection) -{ - strict_upload_part_size = collection.getOrDefault("strict_upload_part_size", strict_upload_part_size); - min_upload_part_size = collection.getOrDefault("min_upload_part_size", min_upload_part_size); - max_single_part_upload_size = collection.getOrDefault("max_single_part_upload_size", max_single_part_upload_size); - upload_part_size_multiply_factor = collection.getOrDefault("upload_part_size_multiply_factor", upload_part_size_multiply_factor); - upload_part_size_multiply_parts_count_threshold = collection.getOrDefault("upload_part_size_multiply_parts_count_threshold", upload_part_size_multiply_parts_count_threshold); - max_inflight_parts_for_one_file = collection.getOrDefault("max_inflight_parts_for_one_file", max_inflight_parts_for_one_file); - - /// This configuration is only applicable to s3. Other types of object storage are not applicable or have different meanings. 
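// Not part of the diff — a small sketch of what the upload-part-size settings above mean,
// following the documented behaviour of these settings: after every `threshold` uploaded
// parts the next part size is multiplied by `factor`, capped at `max_part`; a non-zero
// strict size disables the growth entirely. Parameter names are illustrative only.
#include <algorithm>
#include <cstddef>

size_t nextUploadPartSize(size_t parts_uploaded, size_t strict, size_t min_part, size_t max_part, size_t factor, size_t threshold)
{
    if (strict)
        return strict;
    size_t size = min_part;
    for (size_t i = threshold; i <= parts_uploaded; i += threshold)
        size = std::min(size * factor, max_part);
    return size;
}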
- storage_class_name = collection.getOrDefault("s3_storage_class", storage_class_name); - storage_class_name = Poco::toUpperInPlace(storage_class_name); - - validate(); -} - -void S3Settings::RequestSettings::PartUploadSettings::updateFromSettings(const Settings & settings, bool if_changed) -{ - if (!if_changed || settings.s3_strict_upload_part_size.changed) - strict_upload_part_size = settings.s3_strict_upload_part_size; - - if (!if_changed || settings.s3_min_upload_part_size.changed) - min_upload_part_size = settings.s3_min_upload_part_size; - - if (!if_changed || settings.s3_max_upload_part_size.changed) - max_upload_part_size = settings.s3_max_upload_part_size; - - if (!if_changed || settings.s3_upload_part_size_multiply_factor.changed) - upload_part_size_multiply_factor = settings.s3_upload_part_size_multiply_factor; - - if (!if_changed || settings.s3_upload_part_size_multiply_parts_count_threshold.changed) - upload_part_size_multiply_parts_count_threshold = settings.s3_upload_part_size_multiply_parts_count_threshold; - - if (!if_changed || settings.s3_max_inflight_parts_for_one_file.changed) - max_inflight_parts_for_one_file = settings.s3_max_inflight_parts_for_one_file; - - if (!if_changed || settings.s3_max_single_part_upload_size.changed) - max_single_part_upload_size = settings.s3_max_single_part_upload_size; -} - -void S3Settings::RequestSettings::PartUploadSettings::validate() -{ - static constexpr size_t min_upload_part_size_limit = 5 * 1024 * 1024; - if (strict_upload_part_size && strict_upload_part_size < min_upload_part_size_limit) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting strict_upload_part_size has invalid value {} which is less than the s3 API limit {}", - ReadableSize(strict_upload_part_size), ReadableSize(min_upload_part_size_limit)); - - if (min_upload_part_size < min_upload_part_size_limit) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting min_upload_part_size has invalid value {} which is less than the s3 API limit {}", - ReadableSize(min_upload_part_size), ReadableSize(min_upload_part_size_limit)); - - static constexpr size_t max_upload_part_size_limit = 5ull * 1024 * 1024 * 1024; - if (max_upload_part_size > max_upload_part_size_limit) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_upload_part_size has invalid value {} which is greater than the s3 API limit {}", - ReadableSize(max_upload_part_size), ReadableSize(max_upload_part_size_limit)); - - if (max_single_part_upload_size > max_upload_part_size_limit) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_single_part_upload_size has invalid value {} which is grater than the s3 API limit {}", - ReadableSize(max_single_part_upload_size), ReadableSize(max_upload_part_size_limit)); - - if (max_single_operation_copy_size > max_upload_part_size_limit) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_single_operation_copy_size has invalid value {} which is grater than the s3 API limit {}", - ReadableSize(max_single_operation_copy_size), ReadableSize(max_upload_part_size_limit)); - - if (max_upload_part_size < min_upload_part_size) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_upload_part_size ({}) can't be less than setting min_upload_part_size {}", - ReadableSize(max_upload_part_size), ReadableSize(min_upload_part_size)); - - if (!upload_part_size_multiply_factor) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting upload_part_size_multiply_factor cannot be zero"); - - if 
(!upload_part_size_multiply_parts_count_threshold) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting upload_part_size_multiply_parts_count_threshold cannot be zero"); - - if (!max_part_number) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_part_number cannot be zero"); - - static constexpr size_t max_part_number_limit = 10000; - if (max_part_number > max_part_number_limit) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting max_part_number has invalid value {} which is grater than the s3 API limit {}", - ReadableSize(max_part_number), ReadableSize(max_part_number_limit)); - - size_t maybe_overflow; - if (common::mulOverflow(max_upload_part_size, upload_part_size_multiply_factor, maybe_overflow)) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting upload_part_size_multiply_factor is too big ({}). " - "Multiplication to max_upload_part_size ({}) will cause integer overflow", - ReadableSize(max_part_number), ReadableSize(max_part_number_limit)); - - std::unordered_set storage_class_names {"STANDARD", "INTELLIGENT_TIERING"}; - if (!storage_class_name.empty() && !storage_class_names.contains(storage_class_name)) - throw Exception( - ErrorCodes::INVALID_SETTING_VALUE, - "Setting storage_class has invalid value {} which only supports STANDARD and INTELLIGENT_TIERING", - storage_class_name); - - /// TODO: it's possible to set too small limits. We can check that max possible object size is not too small. -} - - -S3Settings::RequestSettings::RequestSettings(const Settings & settings, bool validate_settings) - : upload_settings(settings, validate_settings) -{ - updateFromSettingsImpl(settings, false); -} - -S3Settings::RequestSettings::RequestSettings(const NamedCollection & collection) - : upload_settings(collection) -{ - max_single_read_retries = collection.getOrDefault("max_single_read_retries", max_single_read_retries); - max_connections = collection.getOrDefault("max_connections", max_connections); - list_object_keys_size = collection.getOrDefault("list_object_keys_size", list_object_keys_size); - allow_native_copy = collection.getOrDefault("allow_native_copy", allow_native_copy); - throw_on_zero_files_match = collection.getOrDefault("throw_on_zero_files_match", throw_on_zero_files_match); -} - -S3Settings::RequestSettings::RequestSettings( - const Poco::Util::AbstractConfiguration & config, - const String & config_prefix, - const Settings & settings, - String setting_name_prefix, - bool validate_settings) - : upload_settings(config, config_prefix, settings, setting_name_prefix, validate_settings) -{ - String key = config_prefix + "." 
+ setting_name_prefix; - max_single_read_retries = config.getUInt64(key + "max_single_read_retries", settings.s3_max_single_read_retries); - max_connections = config.getUInt64(key + "max_connections", settings.s3_max_connections); - check_objects_after_upload = config.getBool(key + "check_objects_after_upload", settings.s3_check_objects_after_upload); - list_object_keys_size = config.getUInt64(key + "list_object_keys_size", settings.s3_list_object_keys_size); - allow_native_copy = config.getBool(key + "allow_native_copy", allow_native_copy); - throw_on_zero_files_match = config.getBool(key + "throw_on_zero_files_match", settings.s3_throw_on_zero_files_match); - retry_attempts = config.getUInt64(key + "retry_attempts", settings.s3_retry_attempts); - request_timeout_ms = config.getUInt64(key + "request_timeout_ms", settings.s3_request_timeout_ms); - - /// NOTE: it would be better to reuse old throttlers to avoid losing token bucket state on every config reload, - /// which could lead to exceeding limit for short time. But it is good enough unless very high `burst` values are used. - if (UInt64 max_get_rps = config.getUInt64(key + "max_get_rps", settings.s3_max_get_rps)) - { - size_t default_max_get_burst = settings.s3_max_get_burst - ? settings.s3_max_get_burst - : (Throttler::default_burst_seconds * max_get_rps); - - size_t max_get_burst = config.getUInt64(key + "max_get_burst", default_max_get_burst); - - get_request_throttler = std::make_shared(max_get_rps, max_get_burst); - } - if (UInt64 max_put_rps = config.getUInt64(key + "max_put_rps", settings.s3_max_put_rps)) - { - size_t default_max_put_burst = settings.s3_max_put_burst - ? settings.s3_max_put_burst - : (Throttler::default_burst_seconds * max_put_rps); - - size_t max_put_burst = config.getUInt64(key + "max_put_burst", default_max_put_burst); - - put_request_throttler = std::make_shared(max_put_rps, max_put_burst); - } -} - -void S3Settings::RequestSettings::updateFromSettingsImpl(const Settings & settings, bool if_changed) -{ - if (!if_changed || settings.s3_max_single_read_retries.changed) - max_single_read_retries = settings.s3_max_single_read_retries; - - if (!if_changed || settings.s3_max_connections.changed) - max_connections = settings.s3_max_connections; - - if (!if_changed || settings.s3_check_objects_after_upload.changed) - check_objects_after_upload = settings.s3_check_objects_after_upload; - - if (!if_changed || settings.s3_max_unexpected_write_error_retries.changed) - max_unexpected_write_error_retries = settings.s3_max_unexpected_write_error_retries; - - if (!if_changed || settings.s3_list_object_keys_size.changed) - list_object_keys_size = settings.s3_list_object_keys_size; - - if ((!if_changed || settings.s3_max_get_rps.changed || settings.s3_max_get_burst.changed) && settings.s3_max_get_rps) - get_request_throttler = std::make_shared( - settings.s3_max_get_rps, settings.s3_max_get_burst ? settings.s3_max_get_burst : Throttler::default_burst_seconds * settings.s3_max_get_rps); - - if ((!if_changed || settings.s3_max_put_rps.changed || settings.s3_max_put_burst.changed) && settings.s3_max_put_rps) - put_request_throttler = std::make_shared( - settings.s3_max_put_rps, settings.s3_max_put_burst ? 
settings.s3_max_put_burst : Throttler::default_burst_seconds * settings.s3_max_put_rps); - - if (!if_changed || settings.s3_throw_on_zero_files_match.changed) - throw_on_zero_files_match = settings.s3_throw_on_zero_files_match; - - if (!if_changed || settings.s3_retry_attempts.changed) - retry_attempts = settings.s3_retry_attempts; - - if (!if_changed || settings.s3_request_timeout_ms.changed) - request_timeout_ms = settings.s3_request_timeout_ms; -} - -void S3Settings::RequestSettings::updateFromSettingsIfChanged(const Settings & settings) -{ - updateFromSettingsImpl(settings, true); - upload_settings.updateFromSettings(settings, true); -} - -void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config, const Settings & settings) -{ - std::lock_guard lock(mutex); - s3_settings.clear(); - if (!config.has(config_elem)) - return; - - Poco::Util::AbstractConfiguration::Keys config_keys; - config.keys(config_elem, config_keys); - - for (const String & key : config_keys) - { - if (config.has(config_elem + "." + key + ".endpoint")) - { - auto endpoint = config.getString(config_elem + "." + key + ".endpoint"); - auto auth_settings = S3::AuthSettings::loadFromConfig(config_elem + "." + key, config); - S3Settings::RequestSettings request_settings(config, config_elem + "." + key, settings); - - s3_settings.emplace(endpoint, S3Settings{std::move(auth_settings), std::move(request_settings)}); - } - } -} - -std::optional StorageS3Settings::getSettings(const String & endpoint, const String & user, bool ignore_user) const -{ - std::lock_guard lock(mutex); - auto next_prefix_setting = s3_settings.upper_bound(endpoint); - - /// Linear time algorithm may be replaced with logarithmic with prefix tree map. - for (auto possible_prefix_setting = next_prefix_setting; possible_prefix_setting != s3_settings.begin();) - { - std::advance(possible_prefix_setting, -1); - const auto & [endpoint_prefix, settings] = *possible_prefix_setting; - if (endpoint.starts_with(endpoint_prefix) && (ignore_user || settings.auth_settings.canBeUsedByUser(user))) - return possible_prefix_setting->second; - } - - return {}; -} - -} diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h deleted file mode 100644 index c3bc8aa6ed6..00000000000 --- a/src/Storages/StorageS3Settings.h +++ /dev/null @@ -1,122 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -namespace Poco::Util -{ -class AbstractConfiguration; -} - -namespace DB -{ - -struct Settings; -class NamedCollection; - -struct S3Settings -{ - struct RequestSettings - { - struct PartUploadSettings - { - size_t strict_upload_part_size = 0; - size_t min_upload_part_size = 16 * 1024 * 1024; - size_t max_upload_part_size = 5ULL * 1024 * 1024 * 1024; - size_t upload_part_size_multiply_factor = 2; - size_t upload_part_size_multiply_parts_count_threshold = 500; - size_t max_inflight_parts_for_one_file = 20; - size_t max_part_number = 10000; - size_t max_single_part_upload_size = 32 * 1024 * 1024; - size_t max_single_operation_copy_size = 5ULL * 1024 * 1024 * 1024; - String storage_class_name; - - void updateFromSettings(const Settings & settings, bool if_changed); - void validate(); - - private: - PartUploadSettings() = default; - explicit PartUploadSettings(const Settings & settings, bool validate_settings = true); - explicit PartUploadSettings(const NamedCollection & collection); - PartUploadSettings( - const Poco::Util::AbstractConfiguration 
& config, - const String & config_prefix, - const Settings & settings, - String setting_name_prefix = {}, - bool validate_settings = true); - - friend struct RequestSettings; - }; - - private: - PartUploadSettings upload_settings = {}; - - public: - size_t max_single_read_retries = 4; - size_t max_connections = 1024; - bool check_objects_after_upload = false; - size_t max_unexpected_write_error_retries = 4; - size_t list_object_keys_size = 1000; - ThrottlerPtr get_request_throttler; - ThrottlerPtr put_request_throttler; - size_t retry_attempts = 10; - size_t request_timeout_ms = 30000; - bool allow_native_copy = true; - - bool throw_on_zero_files_match = false; - - const PartUploadSettings & getUploadSettings() const { return upload_settings; } - PartUploadSettings & getUploadSettings() { return upload_settings; } - - void setStorageClassName(const String & storage_class_name) { upload_settings.storage_class_name = storage_class_name; } - - RequestSettings() = default; - explicit RequestSettings(const Settings & settings, bool validate_settings = true); - explicit RequestSettings(const NamedCollection & collection); - - /// What's the setting_name_prefix, and why do we need it? - /// There are (at least) two config sections where s3 settings can be specified: - /// * settings for s3 disk (clickhouse/storage_configuration/disks) - /// * settings for s3 storage (clickhouse/s3), which are also used for backups - /// Even though settings are the same, in case of s3 disk they are prefixed with "s3_" - /// ("s3_max_single_part_upload_size"), but in case of s3 storage they are not - /// ( "max_single_part_upload_size"). Why this happened is a complete mystery to me. - RequestSettings( - const Poco::Util::AbstractConfiguration & config, - const String & config_prefix, - const Settings & settings, - String setting_name_prefix = {}, - bool validate_settings = true); - - void updateFromSettingsIfChanged(const Settings & settings); - - private: - void updateFromSettingsImpl(const Settings & settings, bool if_changed); - }; - - S3::AuthSettings auth_settings; - RequestSettings request_settings; -}; - -/// Settings for the StorageS3. 
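// Not part of the diff — an illustration of the setting_name_prefix distinction the removed
// comment above describes. Only the loadFromConfig signature is taken from this patch
// (src/IO/S3Common.h); the config prefixes used below are hypothetical examples.
// <clickhouse><s3><my_endpoint> style sections use bare keys such as
// <max_single_part_upload_size>, so no prefix is needed:
auto storage_settings = DB::S3::RequestSettings::loadFromConfig(config, "s3.my_endpoint", settings);
// s3 disk sections under <storage_configuration> use the same keys with an "s3_" prefix
// (e.g. <s3_max_single_part_upload_size>), hence the extra argument:
auto disk_settings = DB::S3::RequestSettings::loadFromConfig(
    config, disk_config_prefix, settings, /* validate_settings */ true, /* setting_name_prefix */ "s3_");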
-class StorageS3Settings -{ -public: - void loadFromConfig(const String & config_elem, const Poco::Util::AbstractConfiguration & config, const Settings & settings); - - std::optional getSettings(const String & endpoint, const String & user, bool ignore_user = false) const; - -private: - mutable std::mutex mutex; - std::map s3_settings; -}; - -} From 804b1ad1e02825826f593f80d589ea9015b6a5d6 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 27 May 2024 10:34:16 +0000 Subject: [PATCH 033/215] Better --- src/CMakeLists.txt | 2 +- src/Common/Allocator.cpp | 1 - src/Common/GWPAsan.cpp | 196 ++++++++++++++++++++++++++++++++++++++ src/Common/GWPAsan.h | 20 ++++ src/Common/memory.cpp | 23 ----- src/Common/memory.h | 11 +-- src/Daemon/BaseDaemon.cpp | 19 ++-- 7 files changed, 224 insertions(+), 48 deletions(-) create mode 100644 src/Common/GWPAsan.cpp create mode 100644 src/Common/GWPAsan.h delete mode 100644 src/Common/memory.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 7ae20b21889..7b887391df0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -183,7 +183,7 @@ add_library (clickhouse_new_delete STATIC Common/new_delete.cpp) target_link_libraries (clickhouse_new_delete PRIVATE clickhouse_common_io) if (TARGET ch_contrib::jemalloc) target_link_libraries (clickhouse_new_delete PRIVATE ch_contrib::jemalloc) - target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::jemalloc) + target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::jemalloc) target_link_libraries (clickhouse_storages_system PRIVATE ch_contrib::jemalloc) endif() diff --git a/src/Common/Allocator.cpp b/src/Common/Allocator.cpp index 8b68ef87298..67ef98cf221 100644 --- a/src/Common/Allocator.cpp +++ b/src/Common/Allocator.cpp @@ -66,7 +66,6 @@ void prefaultPages([[maybe_unused]] void * buf_, [[maybe_unused]] size_t len_) template void * allocNoTrack(size_t size, size_t alignment) { - void * buf; #if USE_GWP_ASAN if (unlikely(Memory::GuardedAlloc.shouldSample())) diff --git a/src/Common/GWPAsan.cpp b/src/Common/GWPAsan.cpp new file mode 100644 index 00000000000..6f57af9e982 --- /dev/null +++ b/src/Common/GWPAsan.cpp @@ -0,0 +1,196 @@ +#include +#include + +#if USE_GWP_ASAN +# include +# include +# include +# include +# include +# include +# include + +namespace Memory +{ + +namespace +{ + size_t getBackTrace(uintptr_t * trace_buffer, size_t buffer_size) + { + StackTrace stacktrace; + auto trace_size = std::min(buffer_size, stacktrace.getSize()); + const auto & frame_pointers = stacktrace.getFramePointers(); + memcpy(trace_buffer, frame_pointers.data(), std::min(trace_size, buffer_size) * sizeof(uintptr_t)); + return trace_size; + } +} + +gwp_asan::GuardedPoolAllocator GuardedAlloc; +static bool guarded_alloc_initialized = [] +{ + gwp_asan::options::initOptions(); + gwp_asan::options::Options &opts = gwp_asan::options::getOptions(); + opts.MaxSimultaneousAllocations = 1024; + opts.Backtrace = getBackTrace; + GuardedAlloc.init(opts); + + ///std::cerr << "GwpAsan is initialized, the options are { Enabled: " << opts.Enabled + /// << ", MaxSimultaneousAllocations: " << opts.MaxSimultaneousAllocations + /// << ", SampleRate: " << opts.SampleRate << " }\n"; + + return true; +}(); + +bool isGWPAsanError(uintptr_t fault_address) +{ + const auto * state = GuardedAlloc.getAllocatorState(); + if (state->FailureType != gwp_asan::Error::UNKNOWN && state->FailureAddress != 0) + return true; + + return fault_address < state->GuardedPagePoolEnd && state->GuardedPagePool <= fault_address; +} + +namespace +{ 
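// Not part of the diff — a minimal sketch of how GuardedAlloc is consulted on the
// allocation fast path, following the Allocator.cpp hunk earlier in this patch.
// Only shouldSample()/allocate() from gwp_asan::GuardedPoolAllocator are used here;
// the fallback to malloc stands in for the real jemalloc-backed allocation path.
#include <Common/GWPAsan.h>
#include <cstdlib>

void * allocMaybeGuarded(size_t size)
{
    if (Memory::GuardedAlloc.shouldSample())
    {
        // A sampled allocation is served from the guarded pool, so an out-of-bounds
        // access or double free around it traps immediately and can be reported.
        if (void * ptr = Memory::GuardedAlloc.allocate(size))
            return ptr;
    }
    return malloc(size);
}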
+ +struct ScopedEndOfReportDecorator { + explicit ScopedEndOfReportDecorator(Poco::LoggerPtr log_) : log(std::move(log_)) {} + ~ScopedEndOfReportDecorator() { LOG_FATAL(log, "*** End GWP-ASan report ***"); } + Poco::LoggerPtr log; +}; + +constexpr std::string_view unknown_crash_text = + "GWP-ASan cannot provide any more information about this error. This may " + "occur due to a wild memory access into the GWP-ASan pool, or an " + "overflow/underflow that is > 512B in length.\n"; + + +// Prints the provided error and metadata information. +void printHeader(gwp_asan::Error error, uintptr_t fault_address, const gwp_asan::AllocationMetadata * allocation_meta, Poco::LoggerPtr log) +{ + bool access_was_in_bounds = false; + std::string description; + if (error != gwp_asan::Error::UNKNOWN && allocation_meta != nullptr) + { + uintptr_t address = __gwp_asan_get_allocation_address(allocation_meta); + size_t size = __gwp_asan_get_allocation_size(allocation_meta); + if (fault_address < address) + { + description = fmt::format( + "({} byte{} to the left of a {}-byte allocation at 0x{}) ", + address - fault_address, + (address - fault_address == 1) ? "" : "s", + size, + address); + } + else if (fault_address > address) + { + description = fmt::format( + "({} byte{} to the right of a {}-byte allocation at 0x{}) ", + fault_address - address, + (fault_address - address == 1) ? "" : "s", + size, + address); + } + else if (error == gwp_asan::Error::DOUBLE_FREE) + { + description = fmt::format("(a {}-byte allocation) ", size); + } + else + { + access_was_in_bounds = true; + description = fmt::format( + "({} byte{} into a {}-byte allocation at 0x{}) ", + fault_address - address, + (fault_address - address == 1) ? "" : "s", + size, + address); + } + } + + // Possible number of digits of a 64-bit number: ceil(log10(2^64)) == 20. Add + // a null terminator, and round to the nearest 8-byte boundary. + uint64_t thread_id = gwp_asan::getThreadID(); + std::string thread_id_string = thread_id == gwp_asan::kInvalidThreadID ? " trace; + + // Maybe print the deallocation trace. + if (__gwp_asan_is_deallocated(allocation_meta)) + { + uint64_t thread_id = __gwp_asan_get_deallocation_thread_id(allocation_meta); + if (thread_id == gwp_asan::kInvalidThreadID) + LOG_FATAL(logger, "0x{} was deallocated by thread here:", fault_address); + else + LOG_FATAL(logger, "0x{} was deallocated by thread {} here:", fault_address, thread_id); + const auto trace_length = __gwp_asan_get_deallocation_trace(allocation_meta, trace.data(), maximum_stack_frames); + StackTrace::toStringEveryLine( + reinterpret_cast(trace.data()), 0, trace_length, [&](const auto line) { LOG_FATAL(logger, fmt::runtime(line)); }); + } + + // Print the allocation trace. 
+ uint64_t thread_id = __gwp_asan_get_allocation_thread_id(allocation_meta); + if (thread_id == gwp_asan::kInvalidThreadID) + LOG_FATAL(logger, "0x{} was allocated by thread here:", fault_address); + else + LOG_FATAL(logger, "0x{} was allocated by thread {} here:", fault_address, thread_id); + const auto trace_length = __gwp_asan_get_allocation_trace(allocation_meta, trace.data(), maximum_stack_frames); + StackTrace::toStringEveryLine( + reinterpret_cast(trace.data()), 0, trace_length, [&](const auto line) { LOG_FATAL(logger, fmt::runtime(line)); }); +} + +} +#endif diff --git a/src/Common/GWPAsan.h b/src/Common/GWPAsan.h new file mode 100644 index 00000000000..164c6ee0221 --- /dev/null +++ b/src/Common/GWPAsan.h @@ -0,0 +1,20 @@ +#pragma once + +#include "config.h" + +#if USE_GWP_ASAN + +#include + +namespace Memory +{ + +extern gwp_asan::GuardedPoolAllocator GuardedAlloc; + +bool isGWPAsanError(uintptr_t fault_address); + +void printGWPAsanReport(uintptr_t fault_address); + +} + +#endif diff --git a/src/Common/memory.cpp b/src/Common/memory.cpp deleted file mode 100644 index 6c17dbe3ba1..00000000000 --- a/src/Common/memory.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#include - -#if USE_GWP_ASAN -#include - -namespace Memory -{ -gwp_asan::GuardedPoolAllocator GuardedAlloc; -static bool guarded_alloc_initialized = [] -{ - gwp_asan::options::initOptions(); - gwp_asan::options::Options &opts = gwp_asan::options::getOptions(); - opts.MaxSimultaneousAllocations = 256; - GuardedAlloc.init(opts); - - ///std::cerr << "GwpAsan is initialized, the options are { Enabled: " << opts.Enabled - /// << ", MaxSimultaneousAllocations: " << opts.MaxSimultaneousAllocations - /// << ", SampleRate: " << opts.SampleRate << " }\n"; - - return true; -}(); -} -#endif diff --git a/src/Common/memory.h b/src/Common/memory.h index 427120edc75..633994a83e2 100644 --- a/src/Common/memory.h +++ b/src/Common/memory.h @@ -6,6 +6,7 @@ #include #include #include +#include #include "config.h" #if USE_JEMALLOC @@ -16,12 +17,6 @@ # include #endif -#if USE_GWP_ASAN -# include -# include - -#endif - namespace ProfileEvents { extern const Event GWPAsanAllocateSuccess; @@ -32,10 +27,6 @@ namespace ProfileEvents namespace Memory { -#if USE_GWP_ASAN -extern gwp_asan::GuardedPoolAllocator GuardedAlloc; -#endif - inline ALWAYS_INLINE size_t alignToSizeT(std::align_val_t align) noexcept { return static_cast(align); diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index 74c37b6123b..3f7ad8d7126 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include @@ -157,18 +157,11 @@ static void signalHandler(int sig, siginfo_t * info, void * context) const ucontext_t * signal_context = reinterpret_cast(context); const StackTrace stack_trace(*signal_context); - const auto is_gwp_asan = [&] - { - auto state = ::Memory::GuardedAlloc.getAllocatorState(); - if (state->FailureType != gwp_asan::Error::UNKNOWN && state->FailureAddress != 0) - return true; - - auto addr = reinterpret_cast(info->si_addr); - return addr < state->GuardedPagePoolEnd && state->GuardedPagePool <= addr; - }; - - if (is_gwp_asan()) - std::cerr << "GWPAsan caught something!" 
<< std::endl; +#if USE_GWP_ASAN + if (const auto fault_address = reinterpret_cast(info->si_addr); + ::Memory::isGWPAsanError(fault_address)) + ::Memory::printGWPAsanReport(fault_address); +#endif writeBinary(sig, out); writePODBinary(*info, out); From 4f3e2cae11aa1fccd14eff9bc6978269611caf10 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Mon, 27 May 2024 11:04:52 +0000 Subject: [PATCH 034/215] start keeper global thread pool with sufficient amount of threads Signed-off-by: Duc Canh Le --- programs/keeper/Keeper.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index dba5c2b7d2a..0d3c1f10894 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -361,9 +361,10 @@ try } GlobalThreadPool::initialize( - config().getUInt("max_thread_pool_size", 100), - config().getUInt("max_thread_pool_free_size", 1000), - config().getUInt("thread_pool_queue_size", 10000) + /// We need to have sufficient amount of threads for connections + nuraft workers + keeper workers, 1000 is an estimation + std::min(1000U, config().getUInt("max_thread_pool_size", 1000)), + config().getUInt("max_thread_pool_free_size", 100), + config().getUInt("thread_pool_queue_size", 1000) ); /// Wait for all threads to avoid possible use-after-free (for example logging objects can be already destroyed). SCOPE_EXIT({ From 5abebeca63d02f12464fdd4d3067d988f0005104 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 27 May 2024 16:08:55 +0200 Subject: [PATCH 035/215] Use BaseSettings --- src/Backups/BackupIO_S3.cpp | 2 +- src/Core/BaseSettings.h | 44 +++- .../ObjectStorages/S3/S3ObjectStorage.cpp | 3 +- src/IO/S3/copyS3File.cpp | 30 ++- src/IO/S3Common.cpp | 234 +++++++----------- src/IO/S3Common.h | 61 +++-- src/IO/WriteBufferFromS3.cpp | 29 ++- src/IO/WriteBufferFromS3.h | 1 - 8 files changed, 197 insertions(+), 207 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index cbf18e2bff9..8c16aa0c291 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -228,7 +228,7 @@ BackupWriterS3::BackupWriterS3( request_settings.updateFromSettings(context_->getSettingsRef(), /* if_changed */true); request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint request_settings.allow_native_copy = allow_s3_native_copy; - request_settings.upload_settings.storage_class_name = storage_class_name; + request_settings.storage_class_name = storage_class_name; client = makeS3Client(s3_uri_, access_key_id_, secret_access_key_, s3_settings, context_); if (auto blob_storage_system_log = context_->getBlobStorageLog()) { diff --git a/src/Core/BaseSettings.h b/src/Core/BaseSettings.h index adf7a41193c..6242d78aee7 100644 --- a/src/Core/BaseSettings.h +++ b/src/Core/BaseSettings.h @@ -108,6 +108,7 @@ public: public: const String & getName() const; Field getValue() const; + void setValue(const Field & value); Field getDefaultValue() const; String getValueString() const; String getDefaultValueString() const; @@ -122,10 +123,10 @@ public: private: friend class BaseSettings; - const BaseSettings * settings; + BaseSettings * settings; const typename Traits::Accessor * accessor; size_t index; - std::conditional_t custom_setting; + std::conditional_t custom_setting; }; enum SkipFlags @@ -144,35 +145,50 @@ public: Iterator & operator++(); Iterator operator++(int); /// NOLINT const SettingFieldRef & operator *() const { return field_ref; } + 
SettingFieldRef & operator *() { return field_ref; } bool operator ==(const Iterator & other) const; bool operator !=(const Iterator & other) const { return !(*this == other); } private: friend class BaseSettings; - Iterator(const BaseSettings & settings_, const typename Traits::Accessor & accessor_, SkipFlags skip_flags_); + Iterator(BaseSettings & settings_, const typename Traits::Accessor & accessor_, SkipFlags skip_flags_); void doSkip(); void setPointerToCustomSetting(); SettingFieldRef field_ref; - std::conditional_t custom_settings_iterator; + std::conditional_t custom_settings_iterator; SkipFlags skip_flags; }; class Range { public: - Range(const BaseSettings & settings_, SkipFlags skip_flags_) : settings(settings_), accessor(Traits::Accessor::instance()), skip_flags(skip_flags_) {} + Range(BaseSettings & settings_, SkipFlags skip_flags_) : settings(settings_), accessor(Traits::Accessor::instance()), skip_flags(skip_flags_) {} Iterator begin() const { return Iterator(settings, accessor, skip_flags); } Iterator end() const { return Iterator(settings, accessor, SKIP_ALL); } private: - const BaseSettings & settings; + BaseSettings & settings; const typename Traits::Accessor & accessor; SkipFlags skip_flags; }; - Range all(SkipFlags skip_flags = SKIP_NONE) const { return Range{*this, skip_flags}; } + class MutableRange + { + public: + MutableRange(BaseSettings & settings_, SkipFlags skip_flags_) : settings(settings_), accessor(Traits::Accessor::instance()), skip_flags(skip_flags_) {} + Iterator begin() { return Iterator(settings, accessor, skip_flags); } + Iterator end() { return Iterator(settings, accessor, SKIP_ALL); } + + private: + BaseSettings & settings; + const typename Traits::Accessor & accessor; + SkipFlags skip_flags; + }; + + Range all(SkipFlags skip_flags = SKIP_NONE) const { return Range{const_cast &>(*this), skip_flags}; } + MutableRange allMutable(SkipFlags skip_flags = SKIP_NONE) { return MutableRange{*this, skip_flags}; } Range allChanged() const { return all(SKIP_UNCHANGED); } Range allUnchanged() const { return all(SKIP_CHANGED); } Range allBuiltin() const { return all(SKIP_CUSTOM); } @@ -608,7 +624,7 @@ const SettingFieldCustom * BaseSettings::tryGetCustomSetting(std::strin } template -BaseSettings::Iterator::Iterator(const BaseSettings & settings_, const typename Traits::Accessor & accessor_, SkipFlags skip_flags_) +BaseSettings::Iterator::Iterator(BaseSettings & settings_, const typename Traits::Accessor & accessor_, SkipFlags skip_flags_) : skip_flags(skip_flags_) { field_ref.settings = &settings_; @@ -741,6 +757,18 @@ Field BaseSettings::SettingFieldRef::getValue() const return accessor->getValue(*settings, index); } +template +void BaseSettings::SettingFieldRef::setValue(const Field & value) +{ + if constexpr (Traits::allow_custom_settings) + { + if (custom_setting) + custom_setting->second = value; + } + else + accessor->setValue(*settings, index, value); +} + template Field BaseSettings::SettingFieldRef::getDefaultValue() const { diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 013b44a3c7b..00a1216e52a 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -577,6 +577,7 @@ void S3ObjectStorage::applyNewSettings( auto settings_from_config = getSettings(config, config_prefix, context, for_disk_s3, context->getSettingsRef().s3_validate_request_settings); auto modified_settings = std::make_unique(*s3_settings.get()); 
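// Not part of the diff — the shape of the generic loop that the new
// SettingFieldRef::setValue and allMutable() above make possible; the concrete
// versions appear below in S3Common.cpp (loadFromConfig / loadFromNamedCollection).
// `request_settings`, `config` and `config_prefix` are assumed to be in scope.
for (auto & field : request_settings.allMutable())
{
    const auto path = config_prefix + "." + field.getName();
    if (!config.has(path))
        continue;
    // The real loader first dispatches on the field's current Field type
    // (UInt64 / String / Bool) and reads the configuration value accordingly.
    field.setValue(config.getString(path));
}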
modified_settings->auth_settings.updateFrom(settings_from_config->auth_settings); + modified_settings->request_settings.updateIfChanged(settings_from_config->request_settings); if (auto endpoint_settings = context->getStorageS3Settings().getSettings(uri.uri.toString(), context->getUserName())) modified_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); @@ -585,7 +586,7 @@ void S3ObjectStorage::applyNewSettings( if (options.allow_client_change && (current_settings->auth_settings.hasUpdates(modified_settings->auth_settings) || for_disk_s3)) { - auto new_client = getClient(uri, *new_s3_settings, context, for_disk_s3); + auto new_client = getClient(uri, *settings_from_config, context, for_disk_s3); client.set(std::move(new_client)); } s3_settings.set(std::move(modified_settings)); diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index 471c4a687a6..35b75a5cc90 100644 --- a/src/IO/S3/copyS3File.cpp +++ b/src/IO/S3/copyS3File.cpp @@ -66,7 +66,6 @@ namespace , dest_bucket(dest_bucket_) , dest_key(dest_key_) , request_settings(request_settings_) - , upload_settings(request_settings.upload_settings) , object_metadata(object_metadata_) , schedule(schedule_) , for_disk_s3(for_disk_s3_) @@ -82,7 +81,6 @@ namespace const String & dest_bucket; const String & dest_key; const S3::RequestSettings & request_settings; - const S3::RequestSettings::PartUploadSettings & upload_settings; const std::optional> & object_metadata; ThreadPoolCallbackRunnerUnsafe schedule; bool for_disk_s3; @@ -127,8 +125,8 @@ namespace if (object_metadata.has_value()) request.SetMetadata(object_metadata.value()); - const auto & storage_class_name = upload_settings.storage_class_name; - if (!storage_class_name.empty()) + const auto & storage_class_name = request_settings.storage_class_name; + if (!storage_class_name.value.empty()) request.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(storage_class_name)); client_ptr->setKMSHeaders(request); @@ -185,7 +183,7 @@ namespace request.SetMultipartUpload(multipart_upload); - size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t retries = 1;; ++retries) { ProfileEvents::increment(ProfileEvents::S3CompleteMultipartUpload); @@ -290,9 +288,9 @@ namespace if (!total_size) throw Exception(ErrorCodes::LOGICAL_ERROR, "Chosen multipart upload for an empty file. 
This must not happen"); - auto max_part_number = upload_settings.max_part_number; - auto min_upload_part_size = upload_settings.min_upload_part_size; - auto max_upload_part_size = upload_settings.max_upload_part_size; + auto max_part_number = request_settings.max_part_number; + auto min_upload_part_size = request_settings.min_upload_part_size; + auto max_upload_part_size = request_settings.max_upload_part_size; if (!max_part_number) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_part_number must not be 0"); @@ -479,7 +477,7 @@ namespace void performCopy() { - if (size <= upload_settings.max_single_part_upload_size) + if (size <= request_settings.max_single_part_upload_size) performSinglepartUpload(); else performMultipartUpload(); @@ -512,8 +510,8 @@ namespace if (object_metadata.has_value()) request.SetMetadata(object_metadata.value()); - const auto & storage_class_name = upload_settings.storage_class_name; - if (!storage_class_name.empty()) + const auto & storage_class_name = request_settings.storage_class_name; + if (!storage_class_name.value.empty()) request.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(storage_class_name)); /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840 @@ -524,7 +522,7 @@ namespace void processPutRequest(S3::PutObjectRequest & request) { - size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t retries = 1;; ++retries) { ProfileEvents::increment(ProfileEvents::S3PutObject); @@ -666,7 +664,7 @@ namespace void performCopy() { LOG_TEST(log, "Copy object {} to {} using native copy", src_key, dest_key); - if (!supports_multipart_copy || size <= upload_settings.max_single_operation_copy_size) + if (!supports_multipart_copy || size <= request_settings.max_single_operation_copy_size) performSingleOperationCopy(); else performMultipartUploadCopy(); @@ -710,8 +708,8 @@ namespace request.SetMetadataDirective(Aws::S3::Model::MetadataDirective::REPLACE); } - const auto & storage_class_name = upload_settings.storage_class_name; - if (!storage_class_name.empty()) + const auto & storage_class_name = request_settings.storage_class_name; + if (!storage_class_name.value.empty()) request.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(storage_class_name)); /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840 @@ -722,7 +720,7 @@ namespace void processCopyRequest(S3::CopyObjectRequest & request) { - size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t retries = 1;; ++retries) { ProfileEvents::increment(ProfileEvents::S3CopyObject); diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 8a01f6ca33a..b3cd037e91d 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -237,113 +237,68 @@ RequestSettings RequestSettings::loadFromConfig( const std::string & setting_name_prefix) { auto request_settings = RequestSettings::loadFromSettings(settings, validate_settings); - String prefix = config_prefix + "." 
+ setting_name_prefix; - auto has = [&](const std::string & key) -> bool { return config.has(prefix + key); }; - auto get_uint = [&](const std::string & key) -> size_t { return config.getUInt64(prefix + key); }; - auto get_string = [&](const std::string & key) -> std::string { return config.getString(prefix + key); }; - auto get_bool = [&](const std::string & key) -> bool { return config.getBool(prefix + key); }; - if (has("strict_upload_part_size")) - request_settings.upload_settings.strict_upload_part_size = get_uint("strict_upload_part_size"); - if (has("min_upload_part_size")) - request_settings.upload_settings.min_upload_part_size = get_uint("min_upload_part_size"); - if (has("max_upload_part_size")) - request_settings.upload_settings.max_upload_part_size = get_uint("max_upload_part_size"); - if (has("upload_part_size_multiply_factor")) - request_settings.upload_settings.upload_part_size_multiply_factor = get_uint("upload_part_size_multiply_factor"); - if (has("upload_part_size_multiply_parts_count_threshold")) - request_settings.upload_settings.upload_part_size_multiply_parts_count_threshold = get_uint("upload_part_size_multiply_parts_count_threshold"); - if (has("max_inflight_parts_for_one_file")) - request_settings.upload_settings.max_inflight_parts_for_one_file = get_uint("max_inflight_parts_for_one_file"); - if (has("max_part_number")) - request_settings.upload_settings.max_part_number = get_uint("max_part_number"); - if (has("max_single_part_upload_size")) - request_settings.upload_settings.max_single_part_upload_size = get_uint("max_single_part_upload_size"); - if (has("max_single_operation_copy_size")) - request_settings.upload_settings.max_single_operation_copy_size = get_uint("max_single_operation_copy_size"); - if (has("s3_storage_class")) - request_settings.upload_settings.storage_class_name = get_string("s3_storage_class"); + auto values = request_settings.allMutable(); + for (auto & field : values) + { + const auto path = prefix + field.getName(); + if (config.has(path)) + { + auto which = field.getValue().getType(); + if (isInt64OrUInt64FieldType(which)) + field.setValue(config.getUInt64(path)); + else if (which == Field::Types::String) + field.setValue(config.getString(path)); + else if (which == Field::Types::Bool) + field.setValue(config.getBool(path)); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); + } + } + + if (!request_settings.storage_class_name.value.empty()) + request_settings.storage_class_name = Poco::toUpperInPlace(request_settings.storage_class_name.value); - request_settings.upload_settings.storage_class_name = Poco::toUpperInPlace(request_settings.upload_settings.storage_class_name); if (validate_settings) - request_settings.upload_settings.validate(); + request_settings.validateUploadSettings(); - if (has("max_single_read_retries")) - request_settings.max_single_read_retries = get_uint("max_single_read_retries"); - if (has("check_objects_after_upload")) - request_settings.check_objects_after_upload = get_bool("check_objects_after_upload"); - if (has("list_object_keys_size")) - request_settings.list_object_keys_size = get_uint("list_object_keys_size"); - if (has("allow_native_copy")) - request_settings.allow_native_copy = get_bool("allow_native_copy"); - if (has("throw_on_zero_files_match")) - request_settings.throw_on_zero_files_match = get_bool("throw_on_zero_files_match"); - if (has("request_timeout_ms")) - request_settings.request_timeout_ms = get_uint("request_timeout_ms"); + 
request_settings.initializeThrottler(settings); - /// NOTE: it would be better to reuse old throttlers - /// to avoid losing token bucket state on every config reload, - /// which could lead to exceeding limit for short time. - /// But it is good enough unless very high `burst` values are used. - if (UInt64 max_get_rps = has("max_get_rps") ? get_uint("max_get_rps") : settings.s3_max_get_rps) - { - size_t default_max_get_burst = settings.s3_max_get_burst - ? settings.s3_max_get_burst - : (Throttler::default_burst_seconds * max_get_rps); - size_t max_get_burst = has("max_get_burst") ? get_uint("max_get_burst") : default_max_get_burst; - request_settings.get_request_throttler = std::make_shared(max_get_rps, max_get_burst); - } - if (UInt64 max_put_rps = has("max_put_rps") ? get_uint("max_put_rps") : settings.s3_max_put_rps) - { - size_t default_max_put_burst = settings.s3_max_put_burst - ? settings.s3_max_put_burst - : (Throttler::default_burst_seconds * max_put_rps); - size_t max_put_burst = has("max_put_burst") ? get_uint("max_put_burst") : default_max_put_burst; - request_settings.put_request_throttler = std::make_shared(max_put_rps, max_put_burst); - } return request_settings; } RequestSettings RequestSettings::loadFromNamedCollection(const NamedCollection & collection, bool validate_settings) { - RequestSettings settings{}; + RequestSettings request_settings{}; - if (collection.has("strict_upload_part_size")) - settings.upload_settings.strict_upload_part_size = collection.get("strict_upload_part_size"); - if (collection.has("min_upload_part_size")) - settings.upload_settings.min_upload_part_size = collection.get("min_upload_part_size"); - if (collection.has("max_upload_part_size")) - settings.upload_settings.min_upload_part_size = collection.get("max_upload_part_size"); - if (collection.has("upload_part_size_multiply_factor")) - settings.upload_settings.upload_part_size_multiply_factor = collection.get("upload_part_size_multiply_factor"); - if (collection.has("upload_part_size_multiply_parts_count_threshold")) - settings.upload_settings.upload_part_size_multiply_parts_count_threshold = collection.get("upload_part_size_multiply_parts_count_threshold"); - if (collection.has("max_inflight_parts_for_one_file")) - settings.upload_settings.max_inflight_parts_for_one_file = collection.get("max_inflight_parts_for_one_file"); - if (collection.has("max_part_number")) - settings.upload_settings.max_single_part_upload_size = collection.get("max_part_number"); - if (collection.has("max_single_part_upload_size")) - settings.upload_settings.max_single_part_upload_size = collection.get("max_single_part_upload_size"); - if (collection.has("max_single_operation_copy_size")) - settings.upload_settings.max_single_part_upload_size = collection.get("max_single_operation_copy_size"); - if (collection.has("s3_storage_class")) - settings.upload_settings.storage_class_name = collection.get("s3_storage_class"); + auto values = request_settings.allMutable(); + for (auto & field : values) + { + const auto path = field.getName(); + if (collection.has(path)) + { + auto which = field.getValue().getType(); + if (isInt64OrUInt64FieldType(which)) + field.setValue(collection.get(path)); + else if (which == Field::Types::String) + field.setValue(collection.get(path)); + else if (which == Field::Types::Bool) + field.setValue(collection.get(path)); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); + } + } + + if (!request_settings.storage_class_name.value.empty()) + 
request_settings.storage_class_name = Poco::toUpperInPlace(request_settings.storage_class_name.value); - settings.upload_settings.storage_class_name = Poco::toUpperInPlace(settings.upload_settings.storage_class_name); if (validate_settings) - settings.upload_settings.validate(); + request_settings.validateUploadSettings(); - if (collection.has("max_single_read_retries")) - settings.max_single_read_retries = collection.get("max_single_read_retries"); - if (collection.has("list_object_keys_size")) - settings.list_object_keys_size = collection.get("list_object_keys_size"); - if (collection.has("allow_native_copy")) - settings.allow_native_copy = collection.get("allow_native_copy"); - if (collection.has("throw_on_zero_files_match")) - settings.throw_on_zero_files_match = collection.get("throw_on_zero_files_match"); + // request_settings.initializeThrottler(settings); - return settings; + return request_settings; } RequestSettings RequestSettings::loadFromSettings(const DB::Settings & settings, bool validate_settings) @@ -355,58 +310,57 @@ RequestSettings RequestSettings::loadFromSettings(const DB::Settings & settings, void RequestSettings::updateFromSettings(const DB::Settings & settings, bool if_changed, bool validate_settings) { - if (!if_changed || settings.s3_strict_upload_part_size.changed) - upload_settings.strict_upload_part_size = settings.s3_strict_upload_part_size; - if (!if_changed || settings.s3_min_upload_part_size.changed) - upload_settings.min_upload_part_size = settings.s3_min_upload_part_size; - if (!if_changed || settings.s3_max_upload_part_size.changed) - upload_settings.max_upload_part_size = settings.s3_max_upload_part_size; - if (!if_changed || settings.s3_upload_part_size_multiply_factor.changed) - upload_settings.upload_part_size_multiply_factor = settings.s3_upload_part_size_multiply_factor; - if (!if_changed || settings.s3_upload_part_size_multiply_parts_count_threshold.changed) - upload_settings.upload_part_size_multiply_parts_count_threshold = settings.s3_upload_part_size_multiply_parts_count_threshold; - if (!if_changed || settings.s3_max_inflight_parts_for_one_file.changed) - upload_settings.max_inflight_parts_for_one_file = settings.s3_max_inflight_parts_for_one_file; - if (!if_changed || settings.s3_max_part_number.changed) - upload_settings.max_part_number = settings.s3_max_part_number; - if (!if_changed || settings.s3_max_single_part_upload_size.changed) - upload_settings.max_single_part_upload_size = settings.s3_max_single_part_upload_size; - if (!if_changed || settings.s3_max_single_operation_copy_size.changed) - upload_settings.max_part_number = settings.s3_max_single_operation_copy_size; + for (auto & field : allMutable()) + { + const auto setting_name = "s3_" + field.getName(); + if (settings.has(setting_name) && (!if_changed || settings.isChanged(setting_name))) + { + set(field.getName(), settings.get(setting_name)); + } + } + + if (!storage_class_name.value.empty()) + storage_class_name = Poco::toUpperInPlace(storage_class_name.value); if (validate_settings) - upload_settings.validate(); + validateUploadSettings(); +} - if (!if_changed || settings.s3_max_single_read_retries.changed) - max_single_read_retries = settings.s3_max_single_read_retries; - if (!if_changed || settings.s3_check_objects_after_upload.changed) - check_objects_after_upload = settings.s3_check_objects_after_upload; - if (!if_changed || settings.s3_max_unexpected_write_error_retries.changed) - max_unexpected_write_error_retries = settings.s3_max_unexpected_write_error_retries; - if 
(!if_changed || settings.s3_list_object_keys_size.changed)
-        list_object_keys_size = settings.s3_list_object_keys_size;
-    if (!if_changed || settings.s3_throw_on_zero_files_match.changed)
-        throw_on_zero_files_match = settings.s3_throw_on_zero_files_match;
-    if (!if_changed || settings.s3_request_timeout_ms.changed)
-        request_timeout_ms = settings.s3_request_timeout_ms;
-
-    if ((!if_changed || settings.s3_max_get_rps.changed || settings.s3_max_get_burst.changed) && settings.s3_max_get_rps)
+void RequestSettings::updateIfChanged(const RequestSettings & settings)
+{
+    for (auto & setting : settings.all())
     {
-        size_t max_get_burst = settings.s3_max_get_burst
-            ? settings.s3_max_get_burst
-            : Throttler::default_burst_seconds * settings.s3_max_get_rps;
-        get_request_throttler = std::make_shared<Throttler>(settings.s3_max_get_rps, max_get_burst);
-    }
-    if ((!if_changed || settings.s3_max_put_rps.changed || settings.s3_max_put_burst.changed) && settings.s3_max_put_rps)
-    {
-        size_t max_put_burst = settings.s3_max_put_burst
-            ? settings.s3_max_put_burst
-            : Throttler::default_burst_seconds * settings.s3_max_put_rps;
-        put_request_throttler = std::make_shared<Throttler>(settings.s3_max_put_rps, max_put_burst);
+        if (setting.isValueChanged())
+            set(setting.getName(), setting.getValue());
     }
 }
 
-void RequestSettings::PartUploadSettings::validate()
+void RequestSettings::initializeThrottler(const DB::Settings & settings)
+{
+    /// NOTE: it would be better to reuse old throttlers
+    /// to avoid losing token bucket state on every config reload,
+    /// which could lead to exceeding limit for short time.
+    /// But it is good enough unless very high `burst` values are used.
+    if (UInt64 max_get_rps = isChanged("max_get_rps") ? get("max_get_rps").get<UInt64>() : settings.s3_max_get_rps)
+    {
+        size_t default_max_get_burst = settings.s3_max_get_burst
+            ? settings.s3_max_get_burst
+            : (Throttler::default_burst_seconds * max_get_rps);
+
+        size_t max_get_burst = isChanged("max_get_burst") ? get("max_get_burst").get<UInt64>() : default_max_get_burst;
+        get_request_throttler = std::make_shared<Throttler>(max_get_rps, max_get_burst);
+    }
+    if (UInt64 max_put_rps = isChanged("max_put_rps") ? get("max_put_rps").get<UInt64>() : settings.s3_max_put_rps)
+    {
+        size_t default_max_put_burst = settings.s3_max_put_burst
+            ? settings.s3_max_put_burst
+            : (Throttler::default_burst_seconds * max_put_rps);
+        size_t max_put_burst = isChanged("max_put_burst") ? get("max_put_burst").get<UInt64>() : default_max_put_burst;
+        put_request_throttler = std::make_shared<Throttler>(max_put_rps, max_put_burst);
+    }
+}
+
+void RequestSettings::validateUploadSettings()
 {
     static constexpr size_t min_upload_part_size_limit = 5 * 1024 * 1024;
     if (strict_upload_part_size && strict_upload_part_size < min_upload_part_size_limit)
@@ -469,7 +423,7 @@ void RequestSettings::PartUploadSettings::validate()
             ReadableSize(max_part_number), ReadableSize(max_part_number_limit));
 
     size_t maybe_overflow;
-    if (common::mulOverflow(max_upload_part_size, upload_part_size_multiply_factor, maybe_overflow))
+    if (common::mulOverflow(max_upload_part_size.value, upload_part_size_multiply_factor.value, maybe_overflow))
         throw Exception(
             ErrorCodes::INVALID_SETTING_VALUE,
             "Setting upload_part_size_multiply_factor is too big ({}). 
" @@ -477,11 +431,11 @@ void RequestSettings::PartUploadSettings::validate() ReadableSize(max_part_number), ReadableSize(max_part_number_limit)); std::unordered_set storage_class_names {"STANDARD", "INTELLIGENT_TIERING"}; - if (!storage_class_name.empty() && !storage_class_names.contains(storage_class_name)) + if (!storage_class_name.value.empty() && !storage_class_names.contains(storage_class_name)) throw Exception( ErrorCodes::INVALID_SETTING_VALUE, "Setting storage_class has invalid value {} which only supports STANDARD and INTELLIGENT_TIERING", - storage_class_name); + storage_class_name.value); /// TODO: it's possible to set too small limits. /// We can check that max possible object size is not too small. @@ -489,4 +443,6 @@ void RequestSettings::PartUploadSettings::validate() } +IMPLEMENT_SETTINGS_TRAITS(S3::RequestSettingsTraits, REQUEST_SETTINGS_LIST) + } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 01a7ca56779..b47e3e79409 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -123,34 +124,38 @@ private: bool operator==(const AuthSettings & other) const = default; }; -struct RequestSettings +#define REQUEST_SETTINGS(M, ALIAS) \ + M(UInt64, max_single_read_retries, 4, "", 0) \ + M(UInt64, request_timeout_ms, DEFAULT_REQUEST_TIMEOUT_MS, "", 0) \ + M(UInt64, list_object_keys_size, 1000, "", 0) \ + M(Bool, allow_native_copy, true, "", 0) \ + M(Bool, check_objects_after_upload, false, "", 0) \ + M(Bool, throw_on_zero_files_match, false, "", 0) \ + M(UInt64, max_single_operation_copy_size, DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE, "", 0) \ + M(String, storage_class_name, "", "", 0) \ + +#define PART_UPLOAD_SETTINGS(M, ALIAS) \ + M(UInt64, strict_upload_part_size, 0, "", 0) \ + M(UInt64, min_upload_part_size, DEFAULT_MIN_UPLOAD_PART_SIZE, "", 0) \ + M(UInt64, max_upload_part_size, DEFAULT_MAX_UPLOAD_PART_SIZE, "", 0) \ + M(UInt64, upload_part_size_multiply_factor, DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_FACTOR, "", 0) \ + M(UInt64, upload_part_size_multiply_parts_count_threshold, DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_PARTS_COUNT_THRESHOLD, "", 0) \ + M(UInt64, max_inflight_parts_for_one_file, DEFAULT_MAX_INFLIGHT_PARTS_FOR_ONE_FILE, "", 0) \ + M(UInt64, max_part_number, DEFAULT_MAX_PART_NUMBER, "", 0) \ + M(UInt64, max_single_part_upload_size, DEFAULT_MAX_SINGLE_PART_UPLOAD_SIZE, "", 0) \ + M(UInt64, max_unexpected_write_error_retries, 4, "", 0) \ + + +#define REQUEST_SETTINGS_LIST(M, ALIAS) \ + REQUEST_SETTINGS(M, ALIAS) \ + PART_UPLOAD_SETTINGS(M, ALIAS) + +DECLARE_SETTINGS_TRAITS(RequestSettingsTraits, REQUEST_SETTINGS_LIST) + +struct RequestSettings : public BaseSettings { - size_t max_single_read_retries = 4; - size_t request_timeout_ms = 30000; - size_t max_unexpected_write_error_retries = 4; - size_t list_object_keys_size = 1000; + void validateUploadSettings(); - bool allow_native_copy = true; - bool check_objects_after_upload = false; - bool throw_on_zero_files_match = false; - - struct PartUploadSettings - { - size_t strict_upload_part_size = 0; - size_t min_upload_part_size = 16 * 1024 * 1024; - size_t max_upload_part_size = 5ULL * 1024 * 1024 * 1024; - size_t upload_part_size_multiply_factor = 2; - size_t upload_part_size_multiply_parts_count_threshold = 500; - size_t max_inflight_parts_for_one_file = 20; - size_t max_part_number = 10000; - size_t max_single_part_upload_size = 32 * 1024 * 1024; - size_t max_single_operation_copy_size = 5ULL * 1024 * 1024 * 1024; - String storage_class_name; - - void 
validate(); - }; - - PartUploadSettings upload_settings; ThrottlerPtr get_request_throttler; ThrottlerPtr put_request_throttler; @@ -164,6 +169,10 @@ struct RequestSettings const std::string & setting_name_prefix = ""); void updateFromSettings(const DB::Settings & settings, bool if_changed, bool validate_settings = true); + void updateIfChanged(const RequestSettings & settings); + +private: + void initializeThrottler(const DB::Settings & settings); }; } diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index b83ca174820..982d1485efd 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -72,7 +72,7 @@ struct WriteBufferFromS3::PartData } }; -BufferAllocationPolicyPtr createBufferAllocationPolicy(const S3::RequestSettings::PartUploadSettings & settings) +BufferAllocationPolicyPtr createBufferAllocationPolicy(const S3::RequestSettings & settings) { BufferAllocationPolicy::Settings allocation_settings; allocation_settings.strict_size = settings.strict_upload_part_size; @@ -100,15 +100,14 @@ WriteBufferFromS3::WriteBufferFromS3( , bucket(bucket_) , key(key_) , request_settings(request_settings_) - , upload_settings(request_settings.upload_settings) , write_settings(write_settings_) , client_ptr(std::move(client_ptr_)) , object_metadata(std::move(object_metadata_)) - , buffer_allocation_policy(createBufferAllocationPolicy(upload_settings)) + , buffer_allocation_policy(createBufferAllocationPolicy(request_settings)) , task_tracker( std::make_unique( std::move(schedule_), - upload_settings.max_inflight_parts_for_one_file, + request_settings.max_inflight_parts_for_one_file, limitedLog)) , blob_log(std::move(blob_log_)) { @@ -165,7 +164,7 @@ void WriteBufferFromS3::preFinalize() if (multipart_upload_id.empty() && detached_part_data.size() <= 1) { - if (detached_part_data.empty() || detached_part_data.front().data_size <= upload_settings.max_single_part_upload_size) + if (detached_part_data.empty() || detached_part_data.front().data_size <= request_settings.max_single_part_upload_size) do_single_part_upload = true; } @@ -499,18 +498,18 @@ void WriteBufferFromS3::writePart(WriteBufferFromS3::PartData && data) "Unable to write a part without multipart_upload_id, details: WriteBufferFromS3 created for bucket {}, key {}", bucket, key); - if (part_number > upload_settings.max_part_number) + if (part_number > request_settings.max_part_number) { throw Exception( ErrorCodes::INVALID_CONFIG_PARAMETER, "Part number exceeded {} while writing {} bytes to S3. 
Check min_upload_part_size = {}, max_upload_part_size = {}, " "upload_part_size_multiply_factor = {}, upload_part_size_multiply_parts_count_threshold = {}, max_single_part_upload_size = {}", - upload_settings.max_part_number, count(), upload_settings.min_upload_part_size, upload_settings.max_upload_part_size, - upload_settings.upload_part_size_multiply_factor, upload_settings.upload_part_size_multiply_parts_count_threshold, - upload_settings.max_single_part_upload_size); + request_settings.max_part_number, count(), request_settings.min_upload_part_size, request_settings.max_upload_part_size, + request_settings.upload_part_size_multiply_factor, request_settings.upload_part_size_multiply_parts_count_threshold, + request_settings.max_single_part_upload_size); } - if (data.data_size > upload_settings.max_upload_part_size) + if (data.data_size > request_settings.max_upload_part_size) { throw Exception( ErrorCodes::LOGICAL_ERROR, @@ -518,7 +517,7 @@ void WriteBufferFromS3::writePart(WriteBufferFromS3::PartData && data) getShortLogDetails(), part_number, data.data_size, - upload_settings.max_upload_part_size + request_settings.max_upload_part_size ); } @@ -605,7 +604,7 @@ void WriteBufferFromS3::completeMultipartUpload() req.SetMultipartUpload(multipart_upload); - size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t i = 0; i < max_retry; ++i) { ProfileEvents::increment(ProfileEvents::S3CompleteMultipartUpload); @@ -663,8 +662,8 @@ S3::PutObjectRequest WriteBufferFromS3::getPutRequest(PartData & data) req.SetBody(data.createAwsBuffer()); if (object_metadata.has_value()) req.SetMetadata(object_metadata.value()); - if (!upload_settings.storage_class_name.empty()) - req.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(upload_settings.storage_class_name)); + if (!request_settings.storage_class_name.value.empty()) + req.SetStorageClass(Aws::S3::Model::StorageClassMapper::GetStorageClassForName(request_settings.storage_class_name)); /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840 req.SetContentType("binary/octet-stream"); @@ -688,7 +687,7 @@ void WriteBufferFromS3::makeSinglepartUpload(WriteBufferFromS3::PartData && data auto & request = std::get<0>(*worker_data); size_t content_length = request.GetContentLength(); - size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries, 1UL); + size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t i = 0; i < max_retry; ++i) { ProfileEvents::increment(ProfileEvents::S3PutObject); diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index d4e25ea1733..973ca4c7526 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -79,7 +79,6 @@ private: const String bucket; const String key; const S3::RequestSettings request_settings; - const S3::RequestSettings::PartUploadSettings & upload_settings; const WriteSettings write_settings; const std::shared_ptr client_ptr; const std::optional> object_metadata; From 56c7301d468d5bda2fc3a0fd512eff70ce6b1b3b Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 27 May 2024 17:14:19 +0200 Subject: [PATCH 036/215] Use BaseSettings for auth settings --- src/Backups/BackupIO_S3.cpp | 12 +- src/Coordination/KeeperSnapshotManagerS3.cpp | 10 +- .../ObjectStorages/ObjectStorageFactory.cpp | 6 +- 
.../ObjectStorages/S3/S3ObjectStorage.cpp | 8 +- src/Disks/ObjectStorages/S3/diskSettings.cpp | 47 ++- src/Disks/ObjectStorages/S3/diskSettings.h | 1 + src/IO/S3Common.cpp | 269 +++++++----------- src/IO/S3Common.h | 133 +++++---- src/IO/S3Defines.h | 3 + src/IO/S3Settings.cpp | 4 +- .../ObjectStorage/Azure/Configuration.cpp | 2 +- .../ObjectStorage/Azure/Configuration.h | 2 +- .../ObjectStorage/HDFS/Configuration.cpp | 2 +- .../ObjectStorage/HDFS/Configuration.h | 2 +- .../ObjectStorage/S3/Configuration.cpp | 17 +- src/Storages/ObjectStorage/S3/Configuration.h | 2 +- .../ObjectStorage/StorageObjectStorage.cpp | 2 +- .../ObjectStorage/StorageObjectStorage.h | 2 +- 18 files changed, 235 insertions(+), 289 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 8c16aa0c291..b8ade4be027 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -88,14 +88,10 @@ namespace std::move(headers), S3::CredentialsConfiguration { - settings.auth_settings.use_environment_credentials.value_or( - context->getConfigRef().getBool("s3.use_environment_credentials", true)), - settings.auth_settings.use_insecure_imds_request.value_or( - context->getConfigRef().getBool("s3.use_insecure_imds_request", false)), - settings.auth_settings.expiration_window_seconds.value_or( - context->getConfigRef().getUInt64("s3.expiration_window_seconds", S3::DEFAULT_EXPIRATION_WINDOW_SECONDS)), - settings.auth_settings.no_sign_request.value_or( - context->getConfigRef().getBool("s3.no_sign_request", false)), + settings.auth_settings.use_environment_credentials, + settings.auth_settings.use_insecure_imds_request, + settings.auth_settings.expiration_window_seconds, + settings.auth_settings.no_sign_request }); } diff --git a/src/Coordination/KeeperSnapshotManagerS3.cpp b/src/Coordination/KeeperSnapshotManagerS3.cpp index acf8faa9edd..b8c8d10d497 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.cpp +++ b/src/Coordination/KeeperSnapshotManagerS3.cpp @@ -65,7 +65,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo return; } - auto auth_settings = S3::AuthSettings::loadFromConfig(config, config_prefix, Context::getGlobalContextInstance()->getSettingsRef()); + auto auth_settings = S3::AuthSettings(config, config_prefix, Context::getGlobalContextInstance()->getSettingsRef()); String endpoint = macros->expand(config.getString(config_prefix + ".endpoint")); auto new_uri = S3::URI{endpoint}; @@ -119,10 +119,10 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo std::move(headers), S3::CredentialsConfiguration { - auth_settings.use_environment_credentials.value_or(true), - auth_settings.use_insecure_imds_request.value_or(false), - auth_settings.expiration_window_seconds.value_or(S3::DEFAULT_EXPIRATION_WINDOW_SECONDS), - auth_settings.no_sign_request.value_or(false), + auth_settings.use_environment_credentials, + auth_settings.use_insecure_imds_request, + auth_settings.expiration_window_seconds, + auth_settings.no_sign_request, }, credentials.GetSessionToken()); diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index 81de22811fe..14c0d656cbf 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -190,8 +190,8 @@ void registerS3ObjectStorage(ObjectStorageFactory & factory) { auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, 
config_prefix); - auto settings = getSettings(config, config_prefix, context, /* for_disk_s3 */true, /* validate_settings */true); auto endpoint = getEndpoint(config, config_prefix, context); + auto settings = getSettings(config, config_prefix, context, endpoint, /* for_disk_s3 */true, /* validate_settings */true); auto client = getClient(endpoint, *settings, context, /* for_disk_s3 */true); auto key_generator = getKeyGenerator(uri, config, config_prefix); @@ -227,8 +227,8 @@ void registerS3PlainObjectStorage(ObjectStorageFactory & factory) auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); - auto settings = getSettings(config, config_prefix, context, /* for_disk_s3 */true, /* validate_settings */true); auto endpoint = getEndpoint(config, config_prefix, context); + auto settings = getSettings(config, config_prefix, context, endpoint, /* for_disk_s3 */true, /* validate_settings */true); auto client = getClient(endpoint, *settings, context, /* for_disk_s3 */true); auto key_generator = getKeyGenerator(uri, config, config_prefix); @@ -262,8 +262,8 @@ void registerS3PlainRewritableObjectStorage(ObjectStorageFactory & factory) auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); - auto settings = getSettings(config, config_prefix, context, /* for_disk_s3 */true, /* validate_settings */true); auto endpoint = getEndpoint(config, config_prefix, context); + auto settings = getSettings(config, config_prefix, context, endpoint, /* for_disk_s3 */true, /* validate_settings */true); auto client = getClient(endpoint, *settings, context, /* for_disk_s3 */true); auto key_generator = getKeyGenerator(uri, config, config_prefix); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 00a1216e52a..84af340e5d0 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -574,13 +574,13 @@ void S3ObjectStorage::applyNewSettings( ContextPtr context, const ApplyNewSettingsOptions & options) { - auto settings_from_config = getSettings(config, config_prefix, context, for_disk_s3, context->getSettingsRef().s3_validate_request_settings); + auto settings_from_config = getSettings(config, config_prefix, context, uri.endpoint, for_disk_s3, context->getSettingsRef().s3_validate_request_settings); auto modified_settings = std::make_unique(*s3_settings.get()); - modified_settings->auth_settings.updateFrom(settings_from_config->auth_settings); + modified_settings->auth_settings.updateIfChanged(settings_from_config->auth_settings); modified_settings->request_settings.updateIfChanged(settings_from_config->request_settings); if (auto endpoint_settings = context->getStorageS3Settings().getSettings(uri.uri.toString(), context->getUserName())) - modified_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); + modified_settings->auth_settings.updateIfChanged(endpoint_settings->auth_settings); auto current_settings = s3_settings.get(); if (options.allow_client_change @@ -598,7 +598,7 @@ std::unique_ptr S3ObjectStorage::cloneObjectStorage( const std::string & config_prefix, ContextPtr context) { - auto new_s3_settings = getSettings(config, config_prefix, context, for_disk_s3, true); + auto new_s3_settings = getSettings(config, config_prefix, context, uri.endpoint, for_disk_s3, true); auto new_client = getClient(uri, *new_s3_settings, context, for_disk_s3); 
auto new_uri{uri}; diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index f66af556ce1..591b1e6623d 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -34,14 +34,18 @@ std::unique_ptr getSettings( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, + const std::string & endpoint, bool for_disk_s3, bool validate_settings) { const auto & settings = context->getSettingsRef(); const std::string setting_name_prefix = for_disk_s3 ? "s3_" : ""; - auto auth_settings = S3::AuthSettings::loadFromConfig(config, config_prefix, settings); - auto request_settings = S3::RequestSettings::loadFromConfig(config, config_prefix, settings, validate_settings, setting_name_prefix); + auto auth_settings = S3::AuthSettings(config, config_prefix, settings); + auto request_settings = S3::RequestSettings(config, config_prefix, settings, validate_settings, setting_name_prefix); + + request_settings.proxy_resolver = DB::ProxyConfigurationResolverProvider::getFromOldSettingsFormat( + ProxyConfiguration::protocolFromString(S3::URI(endpoint).uri.getScheme()), config_prefix, config); return std::make_unique( request_settings, @@ -75,7 +79,7 @@ std::unique_ptr getClient( const auto & request_settings = settings.request_settings; const bool is_s3_express_bucket = S3::isS3ExpressEndpoint(url.endpoint); - if (is_s3_express_bucket && auth_settings.region.empty()) + if (is_s3_express_bucket && auth_settings.region.value.empty()) { throw Exception( ErrorCodes::NO_ELEMENTS_IN_CONFIG, @@ -93,43 +97,36 @@ std::unique_ptr getClient( request_settings.put_request_throttler, url.uri.getScheme()); - client_configuration.connectTimeoutMs = auth_settings.connect_timeout_ms.value_or(S3::DEFAULT_CONNECT_TIMEOUT_MS); - client_configuration.requestTimeoutMs = auth_settings.request_timeout_ms.value_or(S3::DEFAULT_REQUEST_TIMEOUT_MS); - client_configuration.maxConnections = static_cast(auth_settings.max_connections.value_or(S3::DEFAULT_MAX_CONNECTIONS)); - client_configuration.http_keep_alive_timeout = auth_settings.http_keep_alive_timeout.value_or(S3::DEFAULT_KEEP_ALIVE_TIMEOUT); - client_configuration.http_keep_alive_max_requests = auth_settings.http_keep_alive_max_requests.value_or(S3::DEFAULT_KEEP_ALIVE_MAX_REQUESTS); + client_configuration.connectTimeoutMs = auth_settings.connect_timeout_ms; + client_configuration.requestTimeoutMs = auth_settings.request_timeout_ms; + client_configuration.maxConnections = static_cast(auth_settings.max_connections); + client_configuration.http_keep_alive_timeout = auth_settings.http_keep_alive_timeout; + client_configuration.http_keep_alive_max_requests = auth_settings.http_keep_alive_max_requests; client_configuration.endpointOverride = url.endpoint; - client_configuration.s3_use_adaptive_timeouts = auth_settings.use_adaptive_timeouts.value_or(S3::DEFAULT_USE_ADAPTIVE_TIMEOUTS); + client_configuration.s3_use_adaptive_timeouts = auth_settings.use_adaptive_timeouts; - if (for_disk_s3) + if (request_settings.proxy_resolver) { - /// TODO: move to S3Common auth settings parsing /* * Override proxy configuration for backwards compatibility with old configuration format. 
* */ - // if (auto proxy_config = DB::ProxyConfigurationResolverProvider::getFromOldSettingsFormat( - // ProxyConfiguration::protocolFromString(url.uri.getScheme()), config_prefix, config)) - // { - // client_configuration.per_request_configuration - // = [proxy_config]() { return proxy_config->resolve(); }; - // client_configuration.error_report - // = [proxy_config](const auto & request_config) { proxy_config->errorReport(request_config); }; - // } + client_configuration.per_request_configuration = [=]() { return request_settings.proxy_resolver->resolve(); }; + client_configuration.error_report = [=](const auto & request_config) { request_settings.proxy_resolver->errorReport(request_config); }; } S3::ClientSettings client_settings{ .use_virtual_addressing = url.is_virtual_hosted_style, - .disable_checksum = auth_settings.disable_checksum.value_or(S3::DEFAULT_DISABLE_CHECKSUM), - .gcs_issue_compose_request = auth_settings.gcs_issue_compose_request.value_or(false), + .disable_checksum = auth_settings.disable_checksum, + .gcs_issue_compose_request = auth_settings.gcs_issue_compose_request, }; auto credentials_configuration = S3::CredentialsConfiguration { - auth_settings.use_environment_credentials.value_or(S3::DEFAULT_USE_ENVIRONMENT_CREDENTIALS), - auth_settings.use_insecure_imds_request.value_or(false), - auth_settings.expiration_window_seconds.value_or(S3::DEFAULT_EXPIRATION_WINDOW_SECONDS), - auth_settings.no_sign_request.value_or(S3::DEFAULT_NO_SIGN_REQUEST), + auth_settings.use_environment_credentials, + auth_settings.use_insecure_imds_request, + auth_settings.expiration_window_seconds, + auth_settings.no_sign_request, }; return S3::ClientFactory::instance().create( diff --git a/src/Disks/ObjectStorages/S3/diskSettings.h b/src/Disks/ObjectStorages/S3/diskSettings.h index 5e7a18152d1..41aa85991a7 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.h +++ b/src/Disks/ObjectStorages/S3/diskSettings.h @@ -18,6 +18,7 @@ std::unique_ptr getSettings( const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context, + const std::string & endpoint, bool for_disk_s3, bool validate_settings); diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index b3cd037e91d..ef42b4b2642 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -61,6 +61,7 @@ namespace DB namespace ErrorCodes { extern const int INVALID_CONFIG_PARAMETER; + extern const int BAD_ARGUMENTS; } namespace S3 @@ -101,146 +102,14 @@ ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, c return sse_kms_config; } -AuthSettings AuthSettings::loadFromConfig( +AuthSettings::AuthSettings( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - const DB::Settings & settings, + const DB::Settings &, /// TODO: use settings const std::string & setting_name_prefix) { - auto auth_settings = AuthSettings::loadFromSettings(settings); - const std::string prefix = config_prefix + "." 
+ setting_name_prefix; - auto has = [&](const std::string & key) -> bool { return config.has(prefix + key); }; - auto get_uint = [&](const std::string & key) -> size_t { return config.getUInt64(prefix + key); }; - auto get_bool = [&](const std::string & key) -> bool { return config.getBool(prefix + key); }; - auto get_string = [&](const std::string & key) -> std::string { return config.getString(prefix + key); }; - - if (has("access_key_id")) - auth_settings.access_key_id = get_string("access_key_id"); - if (has("secret_access_key")) - auth_settings.secret_access_key = get_string("secret_access_key"); - if (has("session_token")) - auth_settings.secret_access_key = get_string("session_token"); - - if (has("region")) - auth_settings.region = get_string("region"); - if (has("server_side_encryption_customer_key_base64")) - auth_settings.region = get_string("server_side_encryption_customer_key_base64"); - - if (has("connect_timeout_ms")) - auth_settings.connect_timeout_ms = get_uint("connect_timeout_ms"); - if (has("request_timeout_ms")) - auth_settings.request_timeout_ms = get_uint("request_timeout_ms"); - if (has("max_connections")) - auth_settings.max_connections = get_uint("max_connections"); - - if (has("http_keep_alive_timeout")) - auth_settings.http_keep_alive_timeout = get_uint("http_keep_alive_timeout"); - if (has("http_keep_alive_max_requests")) - auth_settings.http_keep_alive_max_requests = get_uint("http_keep_alive_max_requests"); - - if (has("use_environment_credentials")) - auth_settings.use_environment_credentials = get_bool("use_environment_credentials"); - if (has("use_adaptive_timeouts")) - auth_settings.use_adaptive_timeouts = get_bool("use_adaptive_timeouts"); - if (has("no_sing_request")) - auth_settings.no_sign_request = get_bool("no_sing_request"); - if (has("expiration_window_seconds")) - auth_settings.expiration_window_seconds = get_uint("expiration_window_seconds"); - if (has("gcs_issue_compose_request")) - auth_settings.gcs_issue_compose_request = get_bool("gcs_issue_compose_request"); - if (has("use_insecure_imds_request")) - auth_settings.use_insecure_imds_request = get_bool("use_insecure_imds_request"); - - auth_settings.headers = getHTTPHeaders(config_prefix, config); - auth_settings.server_side_encryption_kms_config = getSSEKMSConfig(config_prefix, config); - - Poco::Util::AbstractConfiguration::Keys keys; - config.keys(config_prefix, keys); - for (const auto & key : keys) - { - if (startsWith(key, "user")) - auth_settings.users.insert(config.getString(config_prefix + "." 
+ key)); - } - - return auth_settings; -} - -AuthSettings AuthSettings::loadFromSettings(const DB::Settings & settings) -{ - AuthSettings auth_settings{}; - auth_settings.updateFromSettings(settings, /* if_changed */false); - return auth_settings; -} - -void AuthSettings::updateFromSettings(const DB::Settings & settings, bool if_changed) -{ - if (!if_changed || settings.s3_connect_timeout_ms.changed) - connect_timeout_ms = settings.s3_connect_timeout_ms; - if (!if_changed || settings.s3_request_timeout_ms.changed) - request_timeout_ms = settings.s3_request_timeout_ms; - if (!if_changed || settings.s3_max_connections.changed) - max_connections = settings.s3_max_connections; - if (!if_changed || settings.s3_use_adaptive_timeouts.changed) - use_adaptive_timeouts = settings.s3_use_adaptive_timeouts; - if (!if_changed || settings.s3_disable_checksum.changed) - disable_checksum = settings.s3_disable_checksum; -} - -bool AuthSettings::hasUpdates(const AuthSettings & other) const -{ - AuthSettings copy = *this; - copy.updateFrom(other); - return *this != copy; -} - -void AuthSettings::updateFrom(const AuthSettings & from) -{ - /// Update with check for emptyness only parameters which - /// can be passed not only from config, but via ast. - - if (!from.access_key_id.empty()) - access_key_id = from.access_key_id; - if (!from.secret_access_key.empty()) - secret_access_key = from.secret_access_key; - if (!from.session_token.empty()) - session_token = from.session_token; - - if (!from.headers.empty()) - headers = from.headers; - if (!from.region.empty()) - region = from.region; - - server_side_encryption_customer_key_base64 = from.server_side_encryption_customer_key_base64; - server_side_encryption_kms_config = from.server_side_encryption_kms_config; - - if (from.use_environment_credentials.has_value()) - use_environment_credentials = from.use_environment_credentials; - - if (from.use_insecure_imds_request.has_value()) - use_insecure_imds_request = from.use_insecure_imds_request; - - if (from.expiration_window_seconds.has_value()) - expiration_window_seconds = from.expiration_window_seconds; - - if (from.no_sign_request.has_value()) - no_sign_request = from.no_sign_request; - - users.insert(from.users.begin(), from.users.end()); -} - -RequestSettings RequestSettings::loadFromConfig( - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - const DB::Settings & settings, - bool validate_settings, - const std::string & setting_name_prefix) -{ - auto request_settings = RequestSettings::loadFromSettings(settings, validate_settings); - String prefix = config_prefix + "." 
+ setting_name_prefix; - - auto values = request_settings.allMutable(); - for (auto & field : values) + for (auto & field : allMutable()) { const auto path = prefix + field.getName(); if (config.has(path)) @@ -257,22 +126,92 @@ RequestSettings RequestSettings::loadFromConfig( } } - if (!request_settings.storage_class_name.value.empty()) - request_settings.storage_class_name = Poco::toUpperInPlace(request_settings.storage_class_name.value); + headers = getHTTPHeaders(config_prefix, config); + server_side_encryption_kms_config = getSSEKMSConfig(config_prefix, config); - if (validate_settings) - request_settings.validateUploadSettings(); - - request_settings.initializeThrottler(settings); - - return request_settings; + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(config_prefix, keys); + for (const auto & key : keys) + { + if (startsWith(key, "user")) + users.insert(config.getString(config_prefix + "." + key)); + } } -RequestSettings RequestSettings::loadFromNamedCollection(const NamedCollection & collection, bool validate_settings) +AuthSettings::AuthSettings(const DB::Settings & settings) { - RequestSettings request_settings{}; + updateFromSettings(settings, /* if_changed */false); +} - auto values = request_settings.allMutable(); +void AuthSettings::updateFromSettings(const DB::Settings & settings, bool if_changed) +{ + for (auto & field : allMutable()) + { + const auto setting_name = "s3_" + field.getName(); + if (settings.has(setting_name) && (!if_changed || settings.isChanged(setting_name))) + { + set(field.getName(), settings.get(setting_name)); + } + } +} + +bool AuthSettings::hasUpdates(const AuthSettings & other) const +{ + AuthSettings copy = *this; + copy.updateIfChanged(other); + return *this != copy; +} + +void AuthSettings::updateIfChanged(const AuthSettings & settings) +{ + /// Update with check for emptyness only parameters which + /// can be passed not only from config, but via ast. + + for (auto & setting : settings.all()) + { + if (setting.isValueChanged()) + set(setting.getName(), setting.getValue()); + } + + if (!settings.headers.empty()) + headers = settings.headers; + server_side_encryption_kms_config = settings.server_side_encryption_kms_config; + users.insert(settings.users.begin(), settings.users.end()); +} + +RequestSettings::RequestSettings( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings, + bool validate_settings, + const std::string & setting_name_prefix) +{ + String prefix = config_prefix + "." 
+ setting_name_prefix; + for (auto & field : allMutable()) + { + const auto path = prefix + field.getName(); + if (config.has(path)) + { + auto which = field.getValue().getType(); + if (isInt64OrUInt64FieldType(which)) + field.setValue(config.getUInt64(path)); + else if (which == Field::Types::String) + field.setValue(config.getString(path)); + else if (which == Field::Types::Bool) + field.setValue(config.getBool(path)); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); + } + } + finishInit(settings, validate_settings); +} + +RequestSettings::RequestSettings( + const NamedCollection & collection, + const DB::Settings & settings, + bool validate_settings) +{ + auto values = allMutable(); for (auto & field : values) { const auto path = field.getName(); @@ -289,26 +228,17 @@ RequestSettings RequestSettings::loadFromNamedCollection(const NamedCollection & throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); } } - - if (!request_settings.storage_class_name.value.empty()) - request_settings.storage_class_name = Poco::toUpperInPlace(request_settings.storage_class_name.value); - - if (validate_settings) - request_settings.validateUploadSettings(); - - // request_settings.initializeThrottler(settings); - - return request_settings; + finishInit(settings, validate_settings); } -RequestSettings RequestSettings::loadFromSettings(const DB::Settings & settings, bool validate_settings) +RequestSettings::RequestSettings(const DB::Settings & settings, bool validate_settings) { - RequestSettings request_settings{}; - request_settings.updateFromSettings(settings, /* if_changed */false, validate_settings); - return request_settings; + updateFromSettings(settings, /* if_changed */false, validate_settings); + finishInit(settings, validate_settings); } -void RequestSettings::updateFromSettings(const DB::Settings & settings, bool if_changed, bool validate_settings) +void RequestSettings::updateFromSettings( + const DB::Settings & settings, bool if_changed, bool /* validate_settings */) /// TODO: process validate_settings { for (auto & field : allMutable()) { @@ -318,12 +248,6 @@ void RequestSettings::updateFromSettings(const DB::Settings & settings, bool if_ set(field.getName(), settings.get(setting_name)); } } - - if (!storage_class_name.value.empty()) - storage_class_name = Poco::toUpperInPlace(storage_class_name.value); - - if (validate_settings) - validateUploadSettings(); } void RequestSettings::updateIfChanged(const RequestSettings & settings) @@ -335,8 +259,14 @@ void RequestSettings::updateIfChanged(const RequestSettings & settings) } } -void RequestSettings::initializeThrottler(const DB::Settings & settings) +void RequestSettings::finishInit(const DB::Settings & settings, bool validate_settings) { + if (!storage_class_name.value.empty() && storage_class_name.changed) + storage_class_name = Poco::toUpperInPlace(storage_class_name.value); + + if (validate_settings) + validateUploadSettings(); + /// NOTE: it would be better to reuse old throttlers /// to avoid losing token bucket state on every config reload, /// which could lead to exceeding limit for short time. @@ -443,6 +373,9 @@ void RequestSettings::validateUploadSettings() } +/// TODO: sometimes disk settings have fallback to "s3" section settings from config, support this. 
+ +IMPLEMENT_SETTINGS_TRAITS(S3::AuthSettingsTraits, CLIENT_SETTINGS_LIST) IMPLEMENT_SETTINGS_TRAITS(S3::RequestSettingsTraits, REQUEST_SETTINGS_LIST) } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index b47e3e79409..b27b9ec1136 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -32,6 +32,7 @@ namespace ErrorCodes class RemoteHostFilter; class NamedCollection; +struct ProxyConfigurationResolver; class S3Exception : public Exception { @@ -72,64 +73,34 @@ namespace Poco::Util namespace DB::S3 { -HTTPHeaderEntries getHTTPHeaders(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config); +#define AUTH_SETTINGS(M, ALIAS) \ + M(String, access_key_id, "", "", 0) \ + M(String, secret_access_key, "", "", 0) \ + M(String, session_token, "", "", 0) \ + M(String, region, "", "", 0) \ + M(String, server_side_encryption_customer_key_base64, "", "", 0) \ -ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config); - -struct AuthSettings -{ - std::string access_key_id; - std::string secret_access_key; - std::string session_token; - std::string region; - std::string server_side_encryption_customer_key_base64; - - HTTPHeaderEntries headers; - std::unordered_set users; - ServerSideEncryptionKMSConfig server_side_encryption_kms_config; - - std::optional connect_timeout_ms; - std::optional request_timeout_ms; - std::optional max_connections; - std::optional http_keep_alive_timeout; - std::optional http_keep_alive_max_requests; - std::optional expiration_window_seconds; - - std::optional use_environment_credentials; - std::optional no_sign_request; - std::optional use_adaptive_timeouts; - std::optional use_insecure_imds_request; - std::optional is_virtual_hosted_style; - std::optional disable_checksum; - std::optional gcs_issue_compose_request; - - bool hasUpdates(const AuthSettings & other) const; - void updateFrom(const AuthSettings & from); - - bool canBeUsedByUser(const String & user) const { return users.empty() || users.contains(user); } - - static AuthSettings loadFromConfig( - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - const DB::Settings & settings, - const std::string & setting_name_prefix = ""); - - static AuthSettings loadFromSettings(const DB::Settings & settings); - - static AuthSettings loadFromNamedCollection(const NamedCollection & collection); - - void updateFromSettings(const DB::Settings & settings, bool if_changed); - -private: - bool operator==(const AuthSettings & other) const = default; -}; +#define CLIENT_SETTINGS(M, ALIAS) \ + M(UInt64, connect_timeout_ms, DEFAULT_CONNECT_TIMEOUT_MS, "", 0) \ + M(UInt64, request_timeout_ms, DEFAULT_REQUEST_TIMEOUT_MS, "", 0) \ + M(UInt64, max_connections, DEFAULT_MAX_CONNECTIONS, "", 0) \ + M(UInt64, http_keep_alive_timeout, DEFAULT_KEEP_ALIVE_TIMEOUT, "", 0) \ + M(UInt64, http_keep_alive_max_requests, DEFAULT_KEEP_ALIVE_MAX_REQUESTS, "", 0) \ + M(UInt64, expiration_window_seconds, DEFAULT_EXPIRATION_WINDOW_SECONDS, "", 0) \ + M(Bool, use_environment_credentials, DEFAULT_USE_ENVIRONMENT_CREDENTIALS, "", 0) \ + M(Bool, no_sign_request, DEFAULT_NO_SIGN_REQUEST, "", 0) \ + M(Bool, use_insecure_imds_request, false, "", 0) \ + M(Bool, use_adaptive_timeouts, DEFAULT_USE_ADAPTIVE_TIMEOUTS, "", 0) \ + M(Bool, is_virtual_hosted_style, false, "", 0) \ + M(Bool, disable_checksum, DEFAULT_DISABLE_CHECKSUM, "", 0) \ + M(Bool, gcs_issue_compose_request, false, "", 0) \ #define REQUEST_SETTINGS(M, ALIAS) \ M(UInt64, 
max_single_read_retries, 4, "", 0) \ M(UInt64, request_timeout_ms, DEFAULT_REQUEST_TIMEOUT_MS, "", 0) \ - M(UInt64, list_object_keys_size, 1000, "", 0) \ - M(Bool, allow_native_copy, true, "", 0) \ - M(Bool, check_objects_after_upload, false, "", 0) \ + M(UInt64, list_object_keys_size, DEFAULT_LIST_OBJECT_KEYS_SIZE, "", 0) \ + M(Bool, allow_native_copy, DEFAULT_ALLOW_NATIVE_COPY, "", 0) \ + M(Bool, check_objects_after_upload, DEFAULT_CHECK_OBJECTS_AFTER_UPLOAD, "", 0) \ M(Bool, throw_on_zero_files_match, false, "", 0) \ M(UInt64, max_single_operation_copy_size, DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE, "", 0) \ M(String, storage_class_name, "", "", 0) \ @@ -145,23 +116,56 @@ private: M(UInt64, max_single_part_upload_size, DEFAULT_MAX_SINGLE_PART_UPLOAD_SIZE, "", 0) \ M(UInt64, max_unexpected_write_error_retries, 4, "", 0) \ +#define CLIENT_SETTINGS_LIST(M, ALIAS) \ + CLIENT_SETTINGS(M, ALIAS) \ + AUTH_SETTINGS(M, ALIAS) #define REQUEST_SETTINGS_LIST(M, ALIAS) \ REQUEST_SETTINGS(M, ALIAS) \ PART_UPLOAD_SETTINGS(M, ALIAS) +DECLARE_SETTINGS_TRAITS(AuthSettingsTraits, CLIENT_SETTINGS_LIST) DECLARE_SETTINGS_TRAITS(RequestSettingsTraits, REQUEST_SETTINGS_LIST) +struct AuthSettings : public BaseSettings +{ + AuthSettings() = default; + + AuthSettings( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings, + const std::string & setting_name_prefix = ""); + + AuthSettings(const DB::Settings & settings); + + AuthSettings(const NamedCollection & collection); + + void updateFromSettings(const DB::Settings & settings, bool if_changed); + bool hasUpdates(const AuthSettings & other) const; + void updateIfChanged(const AuthSettings & settings); + bool canBeUsedByUser(const String & user) const { return users.empty() || users.contains(user); } + + HTTPHeaderEntries headers; + std::unordered_set users; + ServerSideEncryptionKMSConfig server_side_encryption_kms_config; +}; + struct RequestSettings : public BaseSettings { - void validateUploadSettings(); + RequestSettings() = default; - ThrottlerPtr get_request_throttler; - ThrottlerPtr put_request_throttler; + /// Create request settings from DB::Settings. + explicit RequestSettings(const DB::Settings & settings, bool validate_settings = true); - static RequestSettings loadFromSettings(const DB::Settings & settings, bool validate_settings = true); - static RequestSettings loadFromNamedCollection(const NamedCollection & collection, bool validate_settings = true); - static RequestSettings loadFromConfig( + /// Create request settings from NamedCollection. + RequestSettings( + const NamedCollection & collection, + const DB::Settings & settings, + bool validate_settings = true); + + /// Create request settings from Config. 
+ RequestSettings( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, const DB::Settings & settings, @@ -170,9 +174,18 @@ struct RequestSettings : public BaseSettings void updateFromSettings(const DB::Settings & settings, bool if_changed, bool validate_settings = true); void updateIfChanged(const RequestSettings & settings); + void validateUploadSettings(); + + ThrottlerPtr get_request_throttler; + ThrottlerPtr put_request_throttler; + std::shared_ptr proxy_resolver; private: - void initializeThrottler(const DB::Settings & settings); + void finishInit(const DB::Settings & settings, bool validate_settings); }; +HTTPHeaderEntries getHTTPHeaders(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config); + +ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config); + } diff --git a/src/IO/S3Defines.h b/src/IO/S3Defines.h index ad01920adda..eedd0df81a6 100644 --- a/src/IO/S3Defines.h +++ b/src/IO/S3Defines.h @@ -28,5 +28,8 @@ inline static constexpr uint64_t DEFAULT_MAX_PART_NUMBER = 10000; /// Other settings. inline static constexpr uint64_t DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE = 32 * 1024 * 1024; inline static constexpr uint64_t DEFAULT_MAX_INFLIGHT_PARTS_FOR_ONE_FILE = 20; +inline static constexpr uint64_t DEFAULT_LIST_OBJECT_KEYS_SIZE = 1000; +inline static constexpr uint64_t DEFAULT_ALLOW_NATIVE_COPY = true; +inline static constexpr uint64_t DEFAULT_CHECK_OBJECTS_AFTER_UPLOAD = false; } diff --git a/src/IO/S3Settings.cpp b/src/IO/S3Settings.cpp index 6a7b2ea5627..4197014c454 100644 --- a/src/IO/S3Settings.cpp +++ b/src/IO/S3Settings.cpp @@ -32,8 +32,8 @@ void S3SettingsByEndpoint::loadFromConfig( if (config.has(endpoint_path)) { auto endpoint = config.getString(endpoint_path); - auto auth_settings = S3::AuthSettings::loadFromConfig(config, key_path, settings); - auto request_settings = S3::RequestSettings::loadFromConfig(config, key_path, settings); + auto auth_settings = S3::AuthSettings(config, key_path, settings); + auto request_settings = S3::RequestSettings(config, key_path, settings); s3_settings.emplace(endpoint, S3Settings{std::move(auth_settings), std::move(request_settings)}); } } diff --git a/src/Storages/ObjectStorage/Azure/Configuration.cpp b/src/Storages/ObjectStorage/Azure/Configuration.cpp index ada3e2e9323..163f08be420 100644 --- a/src/Storages/ObjectStorage/Azure/Configuration.cpp +++ b/src/Storages/ObjectStorage/Azure/Configuration.cpp @@ -249,7 +249,7 @@ AzureClientPtr StorageAzureConfiguration::createClient(bool is_read_only, bool a return result; } -void StorageAzureConfiguration::fromNamedCollection(const NamedCollection & collection) + void StorageAzureConfiguration::fromNamedCollection(const NamedCollection & collection, ContextPtr) { validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys); diff --git a/src/Storages/ObjectStorage/Azure/Configuration.h b/src/Storages/ObjectStorage/Azure/Configuration.h index 35b19079ca9..bbaa82c51ba 100644 --- a/src/Storages/ObjectStorage/Azure/Configuration.h +++ b/src/Storages/ObjectStorage/Azure/Configuration.h @@ -51,7 +51,7 @@ public: ContextPtr context) override; protected: - void fromNamedCollection(const NamedCollection & collection) override; + void fromNamedCollection(const NamedCollection & collection, ContextPtr context) override; void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; using AzureClient = 
Azure::Storage::Blobs::BlobContainerClient; diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.cpp b/src/Storages/ObjectStorage/HDFS/Configuration.cpp index a8a9ab5b557..155f51adf61 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.cpp +++ b/src/Storages/ObjectStorage/HDFS/Configuration.cpp @@ -119,7 +119,7 @@ void StorageHDFSConfiguration::fromAST(ASTs & args, ContextPtr context, bool wit setURL(url_str); } -void StorageHDFSConfiguration::fromNamedCollection(const NamedCollection & collection) +void StorageHDFSConfiguration::fromNamedCollection(const NamedCollection & collection, ContextPtr) { std::string url_str; diff --git a/src/Storages/ObjectStorage/HDFS/Configuration.h b/src/Storages/ObjectStorage/HDFS/Configuration.h index 01a8b9c5e3b..04884542908 100644 --- a/src/Storages/ObjectStorage/HDFS/Configuration.h +++ b/src/Storages/ObjectStorage/HDFS/Configuration.h @@ -46,7 +46,7 @@ public: ContextPtr context) override; private: - void fromNamedCollection(const NamedCollection &) override; + void fromNamedCollection(const NamedCollection &, ContextPtr context) override; void fromAST(ASTs & args, ContextPtr, bool /* with_structure */) override; void setURL(const std::string & url_); diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 537af4421f2..327efba2169 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -108,10 +108,12 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, const auto & settings = context->getSettingsRef(); const std::string config_prefix = "s3."; - auto s3_settings = getSettings(config, config_prefix, context, /* for_disk_s3 */false, settings.s3_validate_request_settings); + auto s3_settings = getSettings( + config, config_prefix, context, url.endpoint, /* for_disk_s3 */false, + settings.s3_validate_request_settings); request_settings.updateFromSettings(settings, /* if_changed */true); - auth_settings.updateFrom(s3_settings->auth_settings); + auth_settings.updateIfChanged(s3_settings->auth_settings); s3_settings->auth_settings = auth_settings; s3_settings->request_settings = request_settings; @@ -124,7 +126,7 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, } if (auto endpoint_settings = context->getStorageS3Settings().getSettings(url.uri.toString(), context->getUserName())) - s3_settings->auth_settings.updateFrom(endpoint_settings->auth_settings); + s3_settings->auth_settings.updateIfChanged(endpoint_settings->auth_settings); auto client = getClient(url, *s3_settings, context, /* for_disk_s3 */false); auto key_generator = createObjectStorageKeysGeneratorAsIsWithPrefix(url.key); @@ -139,8 +141,9 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, key_generator, "StorageS3", false); } -void StorageS3Configuration::fromNamedCollection(const NamedCollection & collection) +void StorageS3Configuration::fromNamedCollection(const NamedCollection & collection, ContextPtr context) { + const auto settings = context->getSettingsRef(); validateNamedCollection(collection, required_configuration_keys, optional_configuration_keys); auto filename = collection.getOrDefault("filename", ""); @@ -159,9 +162,9 @@ void StorageS3Configuration::fromNamedCollection(const NamedCollection & collect compression_method = collection.getOrDefault("compression_method", collection.getOrDefault("compression", "auto")); structure = 
collection.getOrDefault("structure", "auto"); - request_settings = S3::RequestSettings::loadFromNamedCollection(collection, /* validate_settings */true); + request_settings = S3::RequestSettings(collection, settings, /* validate_settings */true); - static_configuration = !auth_settings.access_key_id.empty() || auth_settings.no_sign_request.has_value(); + static_configuration = !auth_settings.access_key_id.value.empty() || auth_settings.no_sign_request.changed; keys = {url.key}; } @@ -357,7 +360,7 @@ void StorageS3Configuration::fromAST(ASTs & args, ContextPtr context, bool with_ if (no_sign_request) auth_settings.no_sign_request = no_sign_request; - static_configuration = !auth_settings.access_key_id.empty() || auth_settings.no_sign_request.has_value(); + static_configuration = !auth_settings.access_key_id.value.empty() || auth_settings.no_sign_request.changed; auth_settings.no_sign_request = no_sign_request; keys = {url.key}; diff --git a/src/Storages/ObjectStorage/S3/Configuration.h b/src/Storages/ObjectStorage/S3/Configuration.h index 5a952497851..39a646c7df2 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.h +++ b/src/Storages/ObjectStorage/S3/Configuration.h @@ -51,7 +51,7 @@ public: ContextPtr context) override; private: - void fromNamedCollection(const NamedCollection & collection) override; + void fromNamedCollection(const NamedCollection & collection, ContextPtr context) override; void fromAST(ASTs & args, ContextPtr context, bool with_structure) override; S3::URI url; diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.cpp b/src/Storages/ObjectStorage/StorageObjectStorage.cpp index 2c8e60b49d0..90a97a9ea62 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorage.cpp @@ -424,7 +424,7 @@ void StorageObjectStorage::Configuration::initialize( bool with_table_structure) { if (auto named_collection = tryGetNamedCollectionWithOverrides(engine_args, local_context)) - configuration.fromNamedCollection(*named_collection); + configuration.fromNamedCollection(*named_collection, local_context); else configuration.fromAST(engine_args, local_context, with_table_structure); diff --git a/src/Storages/ObjectStorage/StorageObjectStorage.h b/src/Storages/ObjectStorage/StorageObjectStorage.h index f45d8c1f01a..cf8ec113653 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorage.h +++ b/src/Storages/ObjectStorage/StorageObjectStorage.h @@ -193,7 +193,7 @@ public: String structure = "auto"; protected: - virtual void fromNamedCollection(const NamedCollection & collection) = 0; + virtual void fromNamedCollection(const NamedCollection & collection, ContextPtr context) = 0; virtual void fromAST(ASTs & args, ContextPtr context, bool with_structure) = 0; void assertInitialized() const; From 673f6c981809a6287be939bd50930ca828cece1a Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 27 May 2024 11:19:05 +0000 Subject: [PATCH 037/215] Add env variable for GWPAsan --- src/Common/Allocator.cpp | 8 +++--- src/Common/GWPAsan.cpp | 62 +++++++++++++++++++++++++++++----------- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/src/Common/Allocator.cpp b/src/Common/Allocator.cpp index 67ef98cf221..87075a8c709 100644 --- a/src/Common/Allocator.cpp +++ b/src/Common/Allocator.cpp @@ -1,9 +1,9 @@ #include -#include -#include -#include -#include #include +#include +#include +#include +#include #include #include diff --git a/src/Common/GWPAsan.cpp b/src/Common/GWPAsan.cpp index 6f57af9e982..a46b7d640e4 100644 --- 
a/src/Common/GWPAsan.cpp +++ b/src/Common/GWPAsan.cpp @@ -1,36 +1,67 @@ -#include #include #if USE_GWP_ASAN +# include # include # include # include # include +# include +# include # include # include # include +# include +# include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +} + namespace Memory { namespace { - size_t getBackTrace(uintptr_t * trace_buffer, size_t buffer_size) - { - StackTrace stacktrace; - auto trace_size = std::min(buffer_size, stacktrace.getSize()); - const auto & frame_pointers = stacktrace.getFramePointers(); - memcpy(trace_buffer, frame_pointers.data(), std::min(trace_size, buffer_size) * sizeof(uintptr_t)); - return trace_size; - } +size_t getBackTrace(uintptr_t * trace_buffer, size_t buffer_size) +{ + StackTrace stacktrace; + auto trace_size = std::min(buffer_size, stacktrace.getSize()); + const auto & frame_pointers = stacktrace.getFramePointers(); + memcpy(trace_buffer, frame_pointers.data(), trace_size * sizeof(uintptr_t)); + return trace_size; +} + +__attribute__((__format__ (__printf__, 1, 0))) +void printString(const char * format, ...) // NOLINT(cert-dcl50-cpp) +{ + std::array formatted; + va_list args; + va_start(args, format); + + if (vsnprintf(formatted.data(), formatted.size(), format, args) > 0) + std::cerr << formatted.data() << std::endl; + + va_end(args); +} + } gwp_asan::GuardedPoolAllocator GuardedAlloc; + static bool guarded_alloc_initialized = [] { - gwp_asan::options::initOptions(); - gwp_asan::options::Options &opts = gwp_asan::options::getOptions(); - opts.MaxSimultaneousAllocations = 1024; + const char * env_options_raw = std::getenv("GWP_ASAN_OPTIONS"); // NOLINT(concurrency-mt-unsafe) + if (env_options_raw) + gwp_asan::options::initOptions(env_options_raw, printString); + + auto & opts = gwp_asan::options::getOptions(); opts.Backtrace = getBackTrace; GuardedAlloc.init(opts); @@ -53,8 +84,9 @@ bool isGWPAsanError(uintptr_t fault_address) namespace { -struct ScopedEndOfReportDecorator { - explicit ScopedEndOfReportDecorator(Poco::LoggerPtr log_) : log(std::move(log_)) {} +struct ScopedEndOfReportDecorator +{ + explicit ScopedEndOfReportDecorator(Poco::LoggerPtr log_) : log(std::move(log_)) { } ~ScopedEndOfReportDecorator() { LOG_FATAL(log, "*** End GWP-ASan report ***"); } Poco::LoggerPtr log; }; @@ -108,8 +140,6 @@ void printHeader(gwp_asan::Error error, uintptr_t fault_address, const gwp_asan: } } - // Possible number of digits of a 64-bit number: ceil(log10(2^64)) == 20. Add - // a null terminator, and round to the nearest 8-byte boundary. uint64_t thread_id = gwp_asan::getThreadID(); std::string thread_id_string = thread_id == gwp_asan::kInvalidThreadID ? 
" Date: Tue, 28 May 2024 08:42:17 +0000 Subject: [PATCH 038/215] Fix style --- src/Common/GWPAsan.cpp | 10 ---------- utils/check-style/check-style | 1 + 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/src/Common/GWPAsan.cpp b/src/Common/GWPAsan.cpp index a46b7d640e4..ecff097e365 100644 --- a/src/Common/GWPAsan.cpp +++ b/src/Common/GWPAsan.cpp @@ -13,16 +13,6 @@ # include # include -# include - -namespace DB -{ -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - -} namespace Memory { diff --git a/utils/check-style/check-style b/utils/check-style/check-style index 23e8b6b2bc4..1786418f9a5 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -311,6 +311,7 @@ std_cerr_cout_excludes=( src/Bridge/IBridge.cpp src/Daemon/BaseDaemon.cpp src/Loggers/Loggers.cpp + src/Common/GWPAsan.cpp ) sources_with_std_cerr_cout=( $( find $ROOT_PATH/{src,base} -name '*.h' -or -name '*.cpp' | \ From 73e9719768944871a44fc8bebfafa50fb0577aa5 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 27 May 2024 19:18:47 +0200 Subject: [PATCH 039/215] Fix build --- src/IO/S3Common.cpp | 10 +++++----- src/IO/S3Common.h | 30 ++++++++++++++++-------------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index ef42b4b2642..1a1df2c9e9d 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -2,6 +2,10 @@ #include #include +#include +#include +#include +#include #include #include "config.h" @@ -11,10 +15,6 @@ #include #include #include -#include -#include -#include -#include namespace ProfileEvents @@ -50,7 +50,6 @@ bool S3Exception::isRetryableError() const namespace DB::ErrorCodes { extern const int S3_ERROR; - extern const int INVALID_SETTING_VALUE; } #endif @@ -62,6 +61,7 @@ namespace ErrorCodes { extern const int INVALID_CONFIG_PARAMETER; extern const int BAD_ARGUMENTS; + extern const int INVALID_SETTING_VALUE; } namespace S3 diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index b27b9ec1136..1572b93d3f9 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -3,24 +3,24 @@ #include #include #include - -#include "config.h" - -#if USE_AWS_S3 - +#include #include #include #include #include #include +#include +#include +#include + +#include "config.h" + +#if USE_AWS_S3 #include #include -#include - #include #include -#include namespace DB { @@ -30,10 +30,6 @@ namespace ErrorCodes extern const int S3_ERROR; } -class RemoteHostFilter; -class NamedCollection; -struct ProxyConfigurationResolver; - class S3Exception : public Exception { public: @@ -70,7 +66,12 @@ namespace Poco::Util class AbstractConfiguration; }; -namespace DB::S3 +namespace DB +{ +class NamedCollection; +struct ProxyConfigurationResolver; + +namespace S3 { #define AUTH_SETTINGS(M, ALIAS) \ @@ -139,7 +140,7 @@ struct AuthSettings : public BaseSettings AuthSettings(const DB::Settings & settings); - AuthSettings(const NamedCollection & collection); + AuthSettings(const DB::NamedCollection & collection); void updateFromSettings(const DB::Settings & settings, bool if_changed); bool hasUpdates(const AuthSettings & other) const; @@ -189,3 +190,4 @@ HTTPHeaderEntries getHTTPHeaders(const std::string & config_elem, const Poco::Ut ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config); } +} From 9d961d1936f791183812892e028e346643ec6efc Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 28 May 2024 13:54:34 +0200 Subject: [PATCH 040/215] Better --- src/Core/Settings.h | 17 +-- 
src/Disks/ObjectStorages/S3/diskSettings.cpp | 6 +- src/IO/S3Common.cpp | 136 +++++++++++++----- src/IO/S3Common.h | 29 ++-- src/IO/S3Defines.h | 10 +- src/IO/S3Settings.cpp | 8 +- .../ObjectStorage/S3/Configuration.cpp | 7 +- 7 files changed, 140 insertions(+), 73 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index a7a19702282..ee2dc38b0d7 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -79,7 +79,7 @@ class IColumn; M(UInt64, idle_connection_timeout, 3600, "Close idle TCP connections after specified number of seconds.", 0) \ M(UInt64, distributed_connections_pool_size, 1024, "Maximum number of connections with one remote server in the pool.", 0) \ M(UInt64, connections_with_failover_max_tries, 3, "The maximum number of attempts to connect to replicas.", 0) \ - M(UInt64, s3_strict_upload_part_size, 0, "The exact size of part to upload during multipart upload to S3 (some implementations does not supports variable size parts).", 0) \ + M(UInt64, s3_strict_upload_part_size, S3::DEFAULT_STRICT_UPLOAD_PART_SIZE, "The exact size of part to upload during multipart upload to S3 (some implementations does not supports variable size parts).", 0) \ M(UInt64, azure_strict_upload_part_size, 0, "The exact size of part to upload during multipart upload to Azure blob storage.", 0) \ M(UInt64, azure_max_blocks_in_multipart_upload, 50000, "Maximum number of blocks in multipart upload for Azure.", 0) \ M(UInt64, s3_min_upload_part_size, S3::DEFAULT_MIN_UPLOAD_PART_SIZE, "The minimum size of part to upload during multipart upload to S3.", 0) \ @@ -97,17 +97,17 @@ class IColumn; M(UInt64, s3_max_single_part_upload_size, S3::DEFAULT_MAX_SINGLE_PART_UPLOAD_SIZE, "The maximum size of object to upload using singlepart upload to S3.", 0) \ M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \ M(UInt64, azure_max_single_part_copy_size, 256*1024*1024, "The maximum size of object to copy using single part copy to Azure blob storage.", 0) \ - M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ + M(UInt64, s3_max_single_read_retries, S3::DEFAULT_MAX_SINGLE_READ_TRIES, "The maximum number of retries during single S3 read.", 0) \ M(UInt64, azure_max_single_read_retries, 4, "The maximum number of retries during single Azure blob storage read.", 0) \ M(UInt64, azure_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write", 0) \ - M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ - M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ + M(UInt64, s3_max_unexpected_write_error_retries, S3::DEFAULT_MAX_UNEXPECTED_WRITE_ERROR_RETRIES, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ + M(UInt64, s3_max_redirects, S3::DEFAULT_MAX_REDIRECTS, "Max number of S3 redirects hops allowed.", 0) \ M(UInt64, s3_max_connections, S3::DEFAULT_MAX_CONNECTIONS, "The maximum number of connections per server.", 0) \ M(UInt64, s3_max_get_rps, 0, "Limit on S3 GET request per second rate before throttling. Zero means unlimited.", 0) \ M(UInt64, s3_max_get_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. 
By default (0) equals to `s3_max_get_rps`", 0) \ M(UInt64, s3_max_put_rps, 0, "Limit on S3 PUT request per second rate before throttling. Zero means unlimited.", 0) \ M(UInt64, s3_max_put_burst, 0, "Max number of requests that can be issued simultaneously before hitting request per second limit. By default (0) equals to `s3_max_put_rps`", 0) \ - M(UInt64, s3_list_object_keys_size, 1000, "Maximum number of files that could be returned in batch by ListObject request", 0) \ + M(UInt64, s3_list_object_keys_size, S3::DEFAULT_LIST_OBJECT_KEYS_SIZE, "Maximum number of files that could be returned in batch by ListObject request", 0) \ M(Bool, s3_use_adaptive_timeouts, S3::DEFAULT_USE_ADAPTIVE_TIMEOUTS, "When adaptive timeouts are enabled first two attempts are made with low receive and send timeout", 0) \ M(UInt64, azure_list_object_keys_size, 1000, "Maximum number of files that could be returned in batch by ListObject request", 0) \ M(Bool, s3_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables.", 0) \ @@ -125,14 +125,15 @@ class IColumn; M(Bool, hdfs_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in HDFS table engine", 0) \ M(Bool, azure_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in AzureBlobStorage table engine", 0) \ M(Bool, s3_validate_request_settings, true, "Validate S3 request settings", 0) \ - M(Bool, s3_disable_checksum, false, "Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. While additional checksums on S3 give defense in depth.", 0) \ - M(UInt64, s3_retry_attempts, 100, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \ + M(Bool, s3_disable_checksum, S3::DEFAULT_DISABLE_CHECKSUM, "Do not calculate a checksum when sending a file to S3. This speeds up writes by avoiding excessive processing passes on a file. It is mostly safe as the data of MergeTree tables is checksummed by ClickHouse anyway, and when S3 is accessed with HTTPS, the TLS layer already provides integrity while transferring through the network. While additional checksums on S3 give defense in depth.", 0) \ + M(UInt64, s3_retry_attempts, S3::DEFAULT_RETRY_ATTEMPTS, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \ M(UInt64, s3_request_timeout_ms, S3::DEFAULT_REQUEST_TIMEOUT_MS, "Idleness timeout for sending and receiving data to/from S3. Fail if a single TCP read or write call blocks for this long.", 0) \ M(UInt64, s3_connect_timeout_ms, S3::DEFAULT_CONNECT_TIMEOUT_MS, "Connection timeout for host from s3 disks.", 0) \ M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \ M(String, s3queue_default_zookeeper_path, "/clickhouse/s3queue/", "Default zookeeper path prefix for S3Queue engine", 0) \ M(Bool, s3queue_enable_logging_to_s3queue_log, false, "Enable writing to system.s3queue_log. The value can be overwritten per table with table settings", 0) \ - M(Bool, s3queue_allow_experimental_sharded_mode, false, "Enable experimental sharded mode of S3Queue table engine. 
It is experimental because it will be rewritten", 0) \ + M(Bool, s3queue_allow_experimental_sharded_mode, false \ + , "Enable experimental sharded mode of S3Queue table engine. It is experimental because it will be rewritten", 0) \ M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \ M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \ M(Bool, hdfs_create_new_file_on_insert, false, "Enables or disables creating a new file on each insert in hdfs engine tables", 0) \ diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 591b1e6623d..14bb5f05071 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -39,10 +39,8 @@ std::unique_ptr getSettings( bool validate_settings) { const auto & settings = context->getSettingsRef(); - const std::string setting_name_prefix = for_disk_s3 ? "s3_" : ""; - - auto auth_settings = S3::AuthSettings(config, config_prefix, settings); - auto request_settings = S3::RequestSettings(config, config_prefix, settings, validate_settings, setting_name_prefix); + auto auth_settings = S3::AuthSettings(config, settings, for_disk_s3, for_disk_s3 ? config_prefix : ""); + auto request_settings = S3::RequestSettings(config, settings, for_disk_s3, validate_settings, for_disk_s3 ? config_prefix : ""); request_settings.proxy_resolver = DB::ProxyConfigurationResolverProvider::getFromOldSettingsFormat( ProxyConfiguration::protocolFromString(S3::URI(endpoint).uri.getScheme()), config_prefix, config); diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 1a1df2c9e9d..ef1e630582d 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -104,28 +104,55 @@ ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, c AuthSettings::AuthSettings( const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - const DB::Settings &, /// TODO: use settings - const std::string & setting_name_prefix) + const DB::Settings & settings, + bool for_disk_s3, + const std::string & disk_config_prefix) { - const std::string prefix = config_prefix + "." 
+ setting_name_prefix; + if (for_disk_s3 && disk_config_prefix.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Disk config path cannot be empty"); + + auto update_value_if_exists = [&](const std::string & path, SettingFieldRef & field) -> bool + { + if (!config.has(path)) + return false; + + auto which = field.getValue().getType(); + if (isInt64OrUInt64FieldType(which)) + field.setValue(config.getUInt64(path)); + else if (which == Field::Types::String) + field.setValue(config.getString(path)); + else if (which == Field::Types::Bool) + field.setValue(config.getBool(path)); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); + return true; + }; + for (auto & field : allMutable()) { - const auto path = prefix + field.getName(); - if (config.has(path)) + std::string path, fallback_path; + if (for_disk_s3) { - auto which = field.getValue().getType(); - if (isInt64OrUInt64FieldType(which)) - field.setValue(config.getUInt64(path)); - else if (which == Field::Types::String) - field.setValue(config.getString(path)); - else if (which == Field::Types::Bool) - field.setValue(config.getBool(path)); - else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); + path = fmt::format("{}.s3_{}", disk_config_prefix, field.getName()); + fallback_path = fmt::format("s3.{}", field.getName()); + } + else + path = fmt::format("s3.{}", field.getName()); + + bool updated = update_value_if_exists(path, field); + + if (!updated && !fallback_path.empty()) + updated = update_value_if_exists(fallback_path, field); + + if (!updated) + { + auto setting_name = "s3_" + field.getName(); + if (settings.has(setting_name) && settings.isChanged(setting_name)) + field.setValue(settings.get(setting_name)); } } + const auto config_prefix = for_disk_s3 ? disk_config_prefix : "s3"; headers = getHTTPHeaders(config_prefix, config); server_side_encryption_kms_config = getSSEKMSConfig(config_prefix, config); @@ -150,7 +177,7 @@ void AuthSettings::updateFromSettings(const DB::Settings & settings, bool if_cha const auto setting_name = "s3_" + field.getName(); if (settings.has(setting_name) && (!if_changed || settings.isChanged(setting_name))) { - set(field.getName(), settings.get(setting_name)); + field.setValue(settings.get(setting_name)); } } } @@ -164,9 +191,6 @@ bool AuthSettings::hasUpdates(const AuthSettings & other) const void AuthSettings::updateIfChanged(const AuthSettings & settings) { - /// Update with check for emptyness only parameters which - /// can be passed not only from config, but via ast. - for (auto & setting : settings.all()) { if (setting.isValueChanged()) @@ -175,34 +199,64 @@ void AuthSettings::updateIfChanged(const AuthSettings & settings) if (!settings.headers.empty()) headers = settings.headers; + + if (!settings.users.empty()) + users.insert(settings.users.begin(), settings.users.end()); + server_side_encryption_kms_config = settings.server_side_encryption_kms_config; - users.insert(settings.users.begin(), settings.users.end()); } RequestSettings::RequestSettings( const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, const DB::Settings & settings, + bool for_disk_s3, bool validate_settings, - const std::string & setting_name_prefix) + const std::string & disk_config_path) { - String prefix = config_prefix + "." 
+ setting_name_prefix; + if (for_disk_s3 && disk_config_path.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Disk config path cannot be empty"); + + auto update_value_if_exists = [&](const std::string & path, SettingFieldRef & field) -> bool + { + if (!config.has(path)) + return false; + + auto which = field.getValue().getType(); + if (isInt64OrUInt64FieldType(which)) + field.setValue(config.getUInt64(path)); + else if (which == Field::Types::String) + field.setValue(config.getString(path)); + else if (which == Field::Types::Bool) + field.setValue(config.getBool(path)); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); + return true; + }; + for (auto & field : allMutable()) { - const auto path = prefix + field.getName(); - if (config.has(path)) + std::string path, fallback_path; + if (for_disk_s3) { - auto which = field.getValue().getType(); - if (isInt64OrUInt64FieldType(which)) - field.setValue(config.getUInt64(path)); - else if (which == Field::Types::String) - field.setValue(config.getString(path)); - else if (which == Field::Types::Bool) - field.setValue(config.getBool(path)); - else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); + path = fmt::format("{}.s3_{}", disk_config_path, field.getName()); + fallback_path = fmt::format("s3.{}", field.getName()); + } + else + path = fmt::format("s3.{}", field.getName()); + + bool updated = update_value_if_exists(path, field); + + if (!updated && !fallback_path.empty()) + updated = update_value_if_exists(fallback_path, field); + + if (!updated) + { + auto setting_name = "s3_" + field.getName(); + if (settings.has(setting_name) && settings.isChanged(setting_name)) + field.setValue(settings.get(setting_name)); } } + finishInit(settings, validate_settings); } @@ -238,7 +292,7 @@ RequestSettings::RequestSettings(const DB::Settings & settings, bool validate_se } void RequestSettings::updateFromSettings( - const DB::Settings & settings, bool if_changed, bool /* validate_settings */) /// TODO: process validate_settings + const DB::Settings & settings, bool if_changed, bool validate_settings) { for (auto & field : allMutable()) { @@ -248,6 +302,10 @@ void RequestSettings::updateFromSettings( set(field.getName(), settings.get(setting_name)); } } + + normalizeSettings(); + if (validate_settings) + validateUploadSettings(); } void RequestSettings::updateIfChanged(const RequestSettings & settings) @@ -259,11 +317,15 @@ void RequestSettings::updateIfChanged(const RequestSettings & settings) } } -void RequestSettings::finishInit(const DB::Settings & settings, bool validate_settings) +void RequestSettings::normalizeSettings() { if (!storage_class_name.value.empty() && storage_class_name.changed) storage_class_name = Poco::toUpperInPlace(storage_class_name.value); +} +void RequestSettings::finishInit(const DB::Settings & settings, bool validate_settings) +{ + normalizeSettings(); if (validate_settings) validateUploadSettings(); @@ -373,8 +435,6 @@ void RequestSettings::validateUploadSettings() } -/// TODO: sometimes disk settings have fallback to "s3" section settings from config, support this. 
- IMPLEMENT_SETTINGS_TRAITS(S3::AuthSettingsTraits, CLIENT_SETTINGS_LIST) IMPLEMENT_SETTINGS_TRAITS(S3::RequestSettingsTraits, REQUEST_SETTINGS_LIST) diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 1572b93d3f9..c5b31c4b564 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -73,6 +73,14 @@ struct ProxyConfigurationResolver; namespace S3 { +/// We use s3 settings for DiskS3, StorageS3 (StorageS3Cluster, S3Queue, etc), BackupIO_S3, etc. +/// 1. For DiskS3 we usually have configuration in disk section in configuration file. +/// All s3 related settings start with "s3_" prefix there. +/// If some setting is absent from disk configuration, we look up for it in the "s3." server config section, +/// where s3 settings no longer have "s3_" prefix like in disk configuration section. +/// If the settings is absent there as well, we look up for it in Users config (where query/session settings are also updated). +/// 2. For StorageS3 and similar - we look up to "s3." config section (again - settings there do not have "s3_" prefix). +/// If some setting is absent from there, we lool up for it in Users config. #define AUTH_SETTINGS(M, ALIAS) \ M(String, access_key_id, "", "", 0) \ @@ -134,9 +142,9 @@ struct AuthSettings : public BaseSettings AuthSettings( const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, const DB::Settings & settings, - const std::string & setting_name_prefix = ""); + bool for_disk_s3, + const std::string & disk_config_prefix = ""); AuthSettings(const DB::Settings & settings); @@ -156,6 +164,14 @@ struct RequestSettings : public BaseSettings { RequestSettings() = default; + /// Create request settings from Config. + RequestSettings( + const Poco::Util::AbstractConfiguration & config, + const DB::Settings & settings, + bool for_disk_s3, + bool validate_settings = true, + const std::string & disk_config_path = ""); + /// Create request settings from DB::Settings. explicit RequestSettings(const DB::Settings & settings, bool validate_settings = true); @@ -165,14 +181,6 @@ struct RequestSettings : public BaseSettings const DB::Settings & settings, bool validate_settings = true); - /// Create request settings from Config. 
- RequestSettings( - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - const DB::Settings & settings, - bool validate_settings = true, - const std::string & setting_name_prefix = ""); - void updateFromSettings(const DB::Settings & settings, bool if_changed, bool validate_settings = true); void updateIfChanged(const RequestSettings & settings); void validateUploadSettings(); @@ -183,6 +191,7 @@ struct RequestSettings : public BaseSettings private: void finishInit(const DB::Settings & settings, bool validate_settings); + void normalizeSettings(); }; HTTPHeaderEntries getHTTPHeaders(const std::string & config_elem, const Poco::Util::AbstractConfiguration & config); diff --git a/src/IO/S3Defines.h b/src/IO/S3Defines.h index eedd0df81a6..332ebcfea92 100644 --- a/src/IO/S3Defines.h +++ b/src/IO/S3Defines.h @@ -21,6 +21,7 @@ inline static constexpr bool DEFAULT_USE_ADAPTIVE_TIMEOUTS = true; inline static constexpr uint64_t DEFAULT_MIN_UPLOAD_PART_SIZE = 16 * 1024 * 1024; inline static constexpr uint64_t DEFAULT_MAX_UPLOAD_PART_SIZE = 5ull * 1024 * 1024 * 1024; inline static constexpr uint64_t DEFAULT_MAX_SINGLE_PART_UPLOAD_SIZE = 32 * 1024 * 1024; +inline static constexpr uint64_t DEFAULT_STRICT_UPLOAD_PART_SIZE = 0; inline static constexpr uint64_t DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_FACTOR = 2; inline static constexpr uint64_t DEFAULT_UPLOAD_PART_SIZE_MULTIPLY_PARTS_COUNT_THRESHOLD = 500; inline static constexpr uint64_t DEFAULT_MAX_PART_NUMBER = 10000; @@ -29,7 +30,12 @@ inline static constexpr uint64_t DEFAULT_MAX_PART_NUMBER = 10000; inline static constexpr uint64_t DEFAULT_MAX_SINGLE_OPERATION_COPY_SIZE = 32 * 1024 * 1024; inline static constexpr uint64_t DEFAULT_MAX_INFLIGHT_PARTS_FOR_ONE_FILE = 20; inline static constexpr uint64_t DEFAULT_LIST_OBJECT_KEYS_SIZE = 1000; -inline static constexpr uint64_t DEFAULT_ALLOW_NATIVE_COPY = true; -inline static constexpr uint64_t DEFAULT_CHECK_OBJECTS_AFTER_UPLOAD = false; +inline static constexpr uint64_t DEFAULT_MAX_SINGLE_READ_TRIES = 4; +inline static constexpr uint64_t DEFAULT_MAX_UNEXPECTED_WRITE_ERROR_RETRIES = 4; +inline static constexpr uint64_t DEFAULT_MAX_REDIRECTS = 10; +inline static constexpr uint64_t DEFAULT_RETRY_ATTEMPTS = 100; + +inline static constexpr bool DEFAULT_ALLOW_NATIVE_COPY = true; +inline static constexpr bool DEFAULT_CHECK_OBJECTS_AFTER_UPLOAD = false; } diff --git a/src/IO/S3Settings.cpp b/src/IO/S3Settings.cpp index 4197014c454..85f30e7e316 100644 --- a/src/IO/S3Settings.cpp +++ b/src/IO/S3Settings.cpp @@ -1,11 +1,7 @@ #include #include - #include -#include -#include -#include #include @@ -32,8 +28,8 @@ void S3SettingsByEndpoint::loadFromConfig( if (config.has(endpoint_path)) { auto endpoint = config.getString(endpoint_path); - auto auth_settings = S3::AuthSettings(config, key_path, settings); - auto request_settings = S3::RequestSettings(config, key_path, settings); + auto auth_settings = S3::AuthSettings(config, settings, /* for_disk_s3 */false, config_prefix); + auto request_settings = S3::RequestSettings(config, settings, /* for_disk_s3 */false, settings.s3_validate_request_settings, config_prefix); s3_settings.emplace(endpoint, S3Settings{std::move(auth_settings), std::move(request_settings)}); } } diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 327efba2169..d59b3e8ea06 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -112,11 +112,8 @@ 
ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, config, config_prefix, context, url.endpoint, /* for_disk_s3 */false, settings.s3_validate_request_settings); - request_settings.updateFromSettings(settings, /* if_changed */true); - auth_settings.updateIfChanged(s3_settings->auth_settings); - - s3_settings->auth_settings = auth_settings; - s3_settings->request_settings = request_settings; + s3_settings->auth_settings.updateIfChanged(auth_settings); + s3_settings->request_settings.updateIfChanged(request_settings); if (!headers_from_ast.empty()) { From 9e1cd7e8986cb433ded9988268008dab878a9eff Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 28 May 2024 14:03:51 +0200 Subject: [PATCH 041/215] Unify part of the code --- src/IO/S3Common.cpp | 62 +++++++++++---------------------------------- 1 file changed, 15 insertions(+), 47 deletions(-) diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index ef1e630582d..ef5108612ec 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -102,7 +102,9 @@ ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, c return sse_kms_config; } -AuthSettings::AuthSettings( +template +static void updateS3SettingsFromConfig( + Settings & s3_settings, const Poco::Util::AbstractConfiguration & config, const DB::Settings & settings, bool for_disk_s3, @@ -111,7 +113,7 @@ AuthSettings::AuthSettings( if (for_disk_s3 && disk_config_prefix.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Disk config path cannot be empty"); - auto update_value_if_exists = [&](const std::string & path, SettingFieldRef & field) -> bool + auto update_value_if_exists = [&](const std::string & path, Settings::SettingFieldRef & field) -> bool { if (!config.has(path)) return false; @@ -128,7 +130,7 @@ AuthSettings::AuthSettings( return true; }; - for (auto & field : allMutable()) + for (auto & field : s3_settings.allMutable()) { std::string path, fallback_path; if (for_disk_s3) @@ -151,6 +153,15 @@ AuthSettings::AuthSettings( field.setValue(settings.get(setting_name)); } } +} + +AuthSettings::AuthSettings( + const Poco::Util::AbstractConfiguration & config, + const DB::Settings & settings, + bool for_disk_s3, + const std::string & disk_config_prefix) +{ + updateS3SettingsFromConfig(*this, config, settings, for_disk_s3, disk_config_prefix); const auto config_prefix = for_disk_s3 ? 
disk_config_prefix : "s3"; headers = getHTTPHeaders(config_prefix, config); @@ -213,50 +224,7 @@ RequestSettings::RequestSettings( bool validate_settings, const std::string & disk_config_path) { - if (for_disk_s3 && disk_config_path.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Disk config path cannot be empty"); - - auto update_value_if_exists = [&](const std::string & path, SettingFieldRef & field) -> bool - { - if (!config.has(path)) - return false; - - auto which = field.getValue().getType(); - if (isInt64OrUInt64FieldType(which)) - field.setValue(config.getUInt64(path)); - else if (which == Field::Types::String) - field.setValue(config.getString(path)); - else if (which == Field::Types::Bool) - field.setValue(config.getBool(path)); - else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); - return true; - }; - - for (auto & field : allMutable()) - { - std::string path, fallback_path; - if (for_disk_s3) - { - path = fmt::format("{}.s3_{}", disk_config_path, field.getName()); - fallback_path = fmt::format("s3.{}", field.getName()); - } - else - path = fmt::format("s3.{}", field.getName()); - - bool updated = update_value_if_exists(path, field); - - if (!updated && !fallback_path.empty()) - updated = update_value_if_exists(fallback_path, field); - - if (!updated) - { - auto setting_name = "s3_" + field.getName(); - if (settings.has(setting_name) && settings.isChanged(setting_name)) - field.setValue(settings.get(setting_name)); - } - } - + updateS3SettingsFromConfig(*this, config, settings, for_disk_s3, disk_config_path); finishInit(settings, validate_settings); } From dc1c1bcdd04892cb2920f0166ce0297a78d6abf0 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 28 May 2024 17:02:06 +0200 Subject: [PATCH 042/215] Update settings changes history --- src/Core/SettingsChangesHistory.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 16f28d94640..29c48dae422 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -92,6 +92,8 @@ static std::map sett {"hdfs_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in HDFS table engine"}, {"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"}, {"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in S3 table engine"}, + {"s3_max_part_number", 10000, 10000, "Maximum part number number for s3 upload part"}, + {"s3_max_single_operation_copy_size", 32 * 1024 * 1024, 32 * 1024 * 1024, "Maximum size for a single copy operation in s3"}, }}, {"24.5", {{"allow_deprecated_functions", true, false, "Allow usage of deprecated functions"}, {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. 
t1.y < t2.y."}, From cec2e40398d668c841fb1645cb8782b45cb7ff8b Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 28 May 2024 18:52:47 +0200 Subject: [PATCH 043/215] Fix build --- src/Coordination/KeeperSnapshotManagerS3.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Coordination/KeeperSnapshotManagerS3.cpp b/src/Coordination/KeeperSnapshotManagerS3.cpp index b8c8d10d497..db1f84a047f 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.cpp +++ b/src/Coordination/KeeperSnapshotManagerS3.cpp @@ -65,7 +65,8 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo return; } - auto auth_settings = S3::AuthSettings(config, config_prefix, Context::getGlobalContextInstance()->getSettingsRef()); + const auto & settings = Context::getGlobalContextInstance()->getSettingsRef(); + auto auth_settings = S3::AuthSettings(config, settings, true, config_prefix); String endpoint = macros->expand(config.getString(config_prefix + ".endpoint")); auto new_uri = S3::URI{endpoint}; From dfd19576cde5396850d31eb77d43a091da9f5cfe Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 28 May 2024 19:42:57 +0200 Subject: [PATCH 044/215] Fix keeper build --- programs/keeper/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/programs/keeper/CMakeLists.txt b/programs/keeper/CMakeLists.txt index af360e44ff4..517c7d7ab4b 100644 --- a/programs/keeper/CMakeLists.txt +++ b/programs/keeper/CMakeLists.txt @@ -156,8 +156,6 @@ if (BUILD_STANDALONE_KEEPER) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/ThreadPoolRemoteFSReader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Disks/IO/ThreadPoolReader.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Storages/StorageS3Settings.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Daemon/BaseDaemon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Daemon/SentryWriter.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Daemon/GraphiteWriter.cpp From f85030f48150a294b0b5751307a81d366eea6991 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 29 May 2024 12:29:17 +0200 Subject: [PATCH 045/215] Better --- src/Coordination/KeeperSnapshotManagerS3.cpp | 2 +- src/Coordination/Standalone/Context.cpp | 6 +- src/Coordination/Standalone/Context.h | 4 +- .../ObjectStorages/ObjectStorageFactory.cpp | 6 +- .../ObjectStorages/S3/S3ObjectStorage.cpp | 5 +- src/Disks/ObjectStorages/S3/diskSettings.cpp | 5 +- src/Disks/ObjectStorages/S3/diskSettings.h | 1 - src/IO/S3Common.cpp | 120 +++++++++++------- src/IO/S3Common.h | 20 ++- src/IO/S3Settings.cpp | 4 +- .../ObjectStorage/S3/Configuration.cpp | 4 +- 11 files changed, 103 insertions(+), 74 deletions(-) diff --git a/src/Coordination/KeeperSnapshotManagerS3.cpp b/src/Coordination/KeeperSnapshotManagerS3.cpp index db1f84a047f..a24ad81fd64 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.cpp +++ b/src/Coordination/KeeperSnapshotManagerS3.cpp @@ -66,7 +66,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo } const auto & settings = Context::getGlobalContextInstance()->getSettingsRef(); - auto auth_settings = S3::AuthSettings(config, settings, true, config_prefix); + auto auth_settings = S3::AuthSettings(config, settings, config_prefix); String endpoint = macros->expand(config.getString(config_prefix + ".endpoint")); auto new_uri = S3::URI{endpoint}; diff --git a/src/Coordination/Standalone/Context.cpp b/src/Coordination/Standalone/Context.cpp index 2802d51ae26..2128a78387f 100644 --- a/src/Coordination/Standalone/Context.cpp +++ b/src/Coordination/Standalone/Context.cpp @@ -146,7 +146,7 @@ 
struct ContextSharedPart : boost::noncopyable mutable ThrottlerPtr local_read_throttler; /// A server-wide throttler for local IO reads mutable ThrottlerPtr local_write_throttler; /// A server-wide throttler for local IO writes - std::optional storage_s3_settings TSA_GUARDED_BY(mutex); /// Settings of S3 storage + std::optional storage_s3_settings TSA_GUARDED_BY(mutex); /// Settings of S3 storage mutable std::mutex keeper_dispatcher_mutex; mutable std::shared_ptr keeper_dispatcher TSA_GUARDED_BY(keeper_dispatcher_mutex); @@ -455,14 +455,14 @@ std::shared_ptr Context::getZooKeeper() const throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "Cannot connect to ZooKeeper from Keeper"); } -const StorageS3Settings & Context::getStorageS3Settings() const +const S3SettingsByEndpoint & Context::getStorageS3Settings() const { std::lock_guard lock(shared->mutex); if (!shared->storage_s3_settings) { const auto & config = shared->config ? *shared->config : Poco::Util::Application::instance().config(); - shared->storage_s3_settings.emplace().loadFromConfig("s3", config, getSettingsRef()); + shared->storage_s3_settings.emplace().loadFromConfig(config, "s3", getSettingsRef()); } return *shared->storage_s3_settings; diff --git a/src/Coordination/Standalone/Context.h b/src/Coordination/Standalone/Context.h index 7e4d1794f7d..bba52f8d493 100644 --- a/src/Coordination/Standalone/Context.h +++ b/src/Coordination/Standalone/Context.h @@ -37,7 +37,7 @@ class FilesystemCacheLog; class FilesystemReadPrefetchesLog; class BlobStorageLog; class IOUringReader; -class StorageS3Settings; +class S3SettingsByEndpoint; /// A small class which owns ContextShared. /// We don't use something like unique_ptr directly to allow ContextShared type to be incomplete. @@ -163,7 +163,7 @@ public: zkutil::ZooKeeperPtr getZooKeeper() const; - const StorageS3Settings & getStorageS3Settings() const; + const S3SettingsByEndpoint & getStorageS3Settings() const; const String & getUserName() const { static std::string user; return user; } diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index 14c0d656cbf..a2fff253465 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -191,7 +191,7 @@ void registerS3ObjectStorage(ObjectStorageFactory & factory) auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); auto endpoint = getEndpoint(config, config_prefix, context); - auto settings = getSettings(config, config_prefix, context, endpoint, /* for_disk_s3 */true, /* validate_settings */true); + auto settings = getSettings(config, config_prefix, context, endpoint, /* validate_settings */true); auto client = getClient(endpoint, *settings, context, /* for_disk_s3 */true); auto key_generator = getKeyGenerator(uri, config, config_prefix); @@ -228,7 +228,7 @@ void registerS3PlainObjectStorage(ObjectStorageFactory & factory) auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); auto endpoint = getEndpoint(config, config_prefix, context); - auto settings = getSettings(config, config_prefix, context, endpoint, /* for_disk_s3 */true, /* validate_settings */true); + auto settings = getSettings(config, config_prefix, context, endpoint, /* validate_settings */true); auto client = getClient(endpoint, *settings, context, /* for_disk_s3 */true); auto key_generator = getKeyGenerator(uri, config, 
config_prefix); @@ -263,7 +263,7 @@ void registerS3PlainRewritableObjectStorage(ObjectStorageFactory & factory) auto uri = getS3URI(config, config_prefix, context); auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); auto endpoint = getEndpoint(config, config_prefix, context); - auto settings = getSettings(config, config_prefix, context, endpoint, /* for_disk_s3 */true, /* validate_settings */true); + auto settings = getSettings(config, config_prefix, context, endpoint, /* validate_settings */true); auto client = getClient(endpoint, *settings, context, /* for_disk_s3 */true); auto key_generator = getKeyGenerator(uri, config, config_prefix); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 84af340e5d0..60bd9b1673c 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -574,7 +574,7 @@ void S3ObjectStorage::applyNewSettings( ContextPtr context, const ApplyNewSettingsOptions & options) { - auto settings_from_config = getSettings(config, config_prefix, context, uri.endpoint, for_disk_s3, context->getSettingsRef().s3_validate_request_settings); + auto settings_from_config = getSettings(config, config_prefix, context, uri.uri_str, context->getSettingsRef().s3_validate_request_settings); auto modified_settings = std::make_unique(*s3_settings.get()); modified_settings->auth_settings.updateIfChanged(settings_from_config->auth_settings); modified_settings->request_settings.updateIfChanged(settings_from_config->request_settings); @@ -598,7 +598,8 @@ std::unique_ptr S3ObjectStorage::cloneObjectStorage( const std::string & config_prefix, ContextPtr context) { - auto new_s3_settings = getSettings(config, config_prefix, context, uri.endpoint, for_disk_s3, true); + const auto & settings = context->getSettingsRef(); + auto new_s3_settings = getSettings(config, config_prefix, context, uri.uri_str, settings.s3_validate_request_settings); auto new_client = getClient(uri, *new_s3_settings, context, for_disk_s3); auto new_uri{uri}; diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 14bb5f05071..bdaacbf62cd 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -35,12 +35,11 @@ std::unique_ptr getSettings( const String & config_prefix, ContextPtr context, const std::string & endpoint, - bool for_disk_s3, bool validate_settings) { const auto & settings = context->getSettingsRef(); - auto auth_settings = S3::AuthSettings(config, settings, for_disk_s3, for_disk_s3 ? config_prefix : ""); - auto request_settings = S3::RequestSettings(config, settings, for_disk_s3, validate_settings, for_disk_s3 ? 
config_prefix : ""); + auto auth_settings = S3::AuthSettings(config, settings, config_prefix); + auto request_settings = S3::RequestSettings(config, settings, config_prefix, validate_settings); request_settings.proxy_resolver = DB::ProxyConfigurationResolverProvider::getFromOldSettingsFormat( ProxyConfiguration::protocolFromString(S3::URI(endpoint).uri.getScheme()), config_prefix, config); diff --git a/src/Disks/ObjectStorages/S3/diskSettings.h b/src/Disks/ObjectStorages/S3/diskSettings.h index 41aa85991a7..aa427bee41a 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.h +++ b/src/Disks/ObjectStorages/S3/diskSettings.h @@ -19,7 +19,6 @@ std::unique_ptr getSettings( const String & config_prefix, ContextPtr context, const std::string & endpoint, - bool for_disk_s3, bool validate_settings); std::unique_ptr getClient( diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index ef5108612ec..a545d12aade 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -103,48 +103,45 @@ ServerSideEncryptionKMSConfig getSSEKMSConfig(const std::string & config_elem, c } template -static void updateS3SettingsFromConfig( - Settings & s3_settings, +static bool setValueFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & path, + typename Settings::SettingFieldRef & field) +{ + if (!config.has(path)) + return false; + + auto which = field.getValue().getType(); + if (isInt64OrUInt64FieldType(which)) + field.setValue(config.getUInt64(path)); + else if (which == Field::Types::String) + field.setValue(config.getString(path)); + else if (which == Field::Types::Bool) + field.setValue(config.getBool(path)); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); + + return true; +} + +AuthSettings::AuthSettings( const Poco::Util::AbstractConfiguration & config, const DB::Settings & settings, - bool for_disk_s3, - const std::string & disk_config_prefix) + const std::string & config_prefix, + const std::string & fallback_config_prefix) { - if (for_disk_s3 && disk_config_prefix.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Disk config path cannot be empty"); + if (config_prefix.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Config path cannot be empty"); - auto update_value_if_exists = [&](const std::string & path, Settings::SettingFieldRef & field) -> bool + for (auto & field : allMutable()) { - if (!config.has(path)) - return false; + auto path = fmt::format("{}.{}", config_prefix, field.getName()); + auto fallback_path = fallback_config_prefix.empty() ? 
"" : fmt::format("{}.{}", fallback_config_prefix, field.getName()); - auto which = field.getValue().getType(); - if (isInt64OrUInt64FieldType(which)) - field.setValue(config.getUInt64(path)); - else if (which == Field::Types::String) - field.setValue(config.getString(path)); - else if (which == Field::Types::Bool) - field.setValue(config.getBool(path)); - else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type: {}", field.getTypeName()); - return true; - }; - - for (auto & field : s3_settings.allMutable()) - { - std::string path, fallback_path; - if (for_disk_s3) - { - path = fmt::format("{}.s3_{}", disk_config_prefix, field.getName()); - fallback_path = fmt::format("s3.{}", field.getName()); - } - else - path = fmt::format("s3.{}", field.getName()); - - bool updated = update_value_if_exists(path, field); + bool updated = setValueFromConfig(config, path, field); if (!updated && !fallback_path.empty()) - updated = update_value_if_exists(fallback_path, field); + updated = setValueFromConfig(config, fallback_path, field); if (!updated) { @@ -153,17 +150,7 @@ static void updateS3SettingsFromConfig( field.setValue(settings.get(setting_name)); } } -} -AuthSettings::AuthSettings( - const Poco::Util::AbstractConfiguration & config, - const DB::Settings & settings, - bool for_disk_s3, - const std::string & disk_config_prefix) -{ - updateS3SettingsFromConfig(*this, config, settings, for_disk_s3, disk_config_prefix); - - const auto config_prefix = for_disk_s3 ? disk_config_prefix : "s3"; headers = getHTTPHeaders(config_prefix, config); server_side_encryption_kms_config = getSSEKMSConfig(config_prefix, config); @@ -220,11 +207,48 @@ void AuthSettings::updateIfChanged(const AuthSettings & settings) RequestSettings::RequestSettings( const Poco::Util::AbstractConfiguration & config, const DB::Settings & settings, - bool for_disk_s3, - bool validate_settings, - const std::string & disk_config_path) + const std::string & config_prefix, + bool validate_settings) + : RequestSettings( + config, + settings, + config_prefix, + validate_settings, + config_prefix == "s3" ? "" : "s3_", /* setting_name_prefix */ + config_prefix == "s3" ? "" : "s3", /* fallback_config_prefix */ + "") /* fallback_setting_name_prefix */ { - updateS3SettingsFromConfig(*this, config, settings, for_disk_s3, disk_config_path); +} + +RequestSettings::RequestSettings( + const Poco::Util::AbstractConfiguration & config, + const DB::Settings & settings, + const std::string & config_prefix, + bool validate_settings, + const std::string & setting_name_prefix, + const std::string & fallback_config_prefix, + const std::string & fallback_setting_name_prefix) +{ + if (config_prefix.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Config path cannot be empty"); + + for (auto & field : allMutable()) + { + auto path = fmt::format("{}.{}{}", config_prefix, setting_name_prefix, field.getName()); + auto fallback_path = fallback_config_prefix.empty() ? 
"" : fmt::format("{}.{}{}", fallback_config_prefix, fallback_setting_name_prefix, field.getName()); + + bool updated = setValueFromConfig(config, path, field); + + if (!updated && !fallback_path.empty()) + updated = setValueFromConfig(config, fallback_path, field); + + if (!updated) + { + auto setting_name = "s3_" + field.getName(); + if (settings.has(setting_name) && settings.isChanged(setting_name)) + field.setValue(settings.get(setting_name)); + } + } finishInit(settings, validate_settings); } diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index c5b31c4b564..6391075a5ab 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -75,7 +75,8 @@ namespace S3 { /// We use s3 settings for DiskS3, StorageS3 (StorageS3Cluster, S3Queue, etc), BackupIO_S3, etc. /// 1. For DiskS3 we usually have configuration in disk section in configuration file. -/// All s3 related settings start with "s3_" prefix there. +/// REQUEST_SETTINGS, PART_UPLOAD_SETTINGS start with "s3_" prefix there, while AUTH_SETTINGS and CLIENT_SETTINGS do not +/// (does not make sense, but it happened this way). /// If some setting is absent from disk configuration, we look up for it in the "s3." server config section, /// where s3 settings no longer have "s3_" prefix like in disk configuration section. /// If the settings is absent there as well, we look up for it in Users config (where query/session settings are also updated). @@ -143,8 +144,8 @@ struct AuthSettings : public BaseSettings AuthSettings( const Poco::Util::AbstractConfiguration & config, const DB::Settings & settings, - bool for_disk_s3, - const std::string & disk_config_prefix = ""); + const std::string & config_prefix, + const std::string & fallback_config_prefix = ""); AuthSettings(const DB::Settings & settings); @@ -168,9 +169,8 @@ struct RequestSettings : public BaseSettings RequestSettings( const Poco::Util::AbstractConfiguration & config, const DB::Settings & settings, - bool for_disk_s3, - bool validate_settings = true, - const std::string & disk_config_path = ""); + const std::string & config_prefix, + bool validate_settings = true); /// Create request settings from DB::Settings. 
explicit RequestSettings(const DB::Settings & settings, bool validate_settings = true); @@ -190,6 +190,14 @@ struct RequestSettings : public BaseSettings std::shared_ptr proxy_resolver; private: + RequestSettings( + const Poco::Util::AbstractConfiguration & config, + const DB::Settings & settings, + const std::string & config_prefix, + bool validate_settings, + const std::string & setting_name_prefix, + const std::string & fallback_config_prefix, + const std::string & fallback_setting_name_prefix); void finishInit(const DB::Settings & settings, bool validate_settings); void normalizeSettings(); }; diff --git a/src/IO/S3Settings.cpp b/src/IO/S3Settings.cpp index 85f30e7e316..0323da53117 100644 --- a/src/IO/S3Settings.cpp +++ b/src/IO/S3Settings.cpp @@ -28,8 +28,8 @@ void S3SettingsByEndpoint::loadFromConfig( if (config.has(endpoint_path)) { auto endpoint = config.getString(endpoint_path); - auto auth_settings = S3::AuthSettings(config, settings, /* for_disk_s3 */false, config_prefix); - auto request_settings = S3::RequestSettings(config, settings, /* for_disk_s3 */false, settings.s3_validate_request_settings, config_prefix); + auto auth_settings = S3::AuthSettings(config, settings, config_prefix); + auto request_settings = S3::RequestSettings(config, settings, config_prefix, settings.s3_validate_request_settings); s3_settings.emplace(endpoint, S3Settings{std::move(auth_settings), std::move(request_settings)}); } } diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index d59b3e8ea06..1252c010769 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -106,11 +106,9 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, const auto & config = context->getConfigRef(); const auto & settings = context->getSettingsRef(); - const std::string config_prefix = "s3."; auto s3_settings = getSettings( - config, config_prefix, context, url.endpoint, /* for_disk_s3 */false, - settings.s3_validate_request_settings); + config, "s3"/* config_prefix */, context, url.endpoint, settings.s3_validate_request_settings); s3_settings->auth_settings.updateIfChanged(auth_settings); s3_settings->request_settings.updateIfChanged(request_settings); From dd34d316c51789048bdbf744747d45d11f806bc9 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 30 May 2024 16:54:11 +0200 Subject: [PATCH 046/215] Fix tests --- src/IO/tests/gtest_writebuffer_s3.cpp | 2 +- src/Storages/ObjectStorage/S3/Configuration.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/IO/tests/gtest_writebuffer_s3.cpp b/src/IO/tests/gtest_writebuffer_s3.cpp index 1e1fdc02060..3c1af6538ad 100644 --- a/src/IO/tests/gtest_writebuffer_s3.cpp +++ b/src/IO/tests/gtest_writebuffer_s3.cpp @@ -547,7 +547,7 @@ public: std::unique_ptr getWriteBuffer(String file_name = "file") { S3::RequestSettings request_settings; - request_settings.updateFromSettings(settings, /* if_changed */true); + request_settings.updateFromSettings(settings, /* if_changed */true, /* validate_settings */false); client->resetCounters(); diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 1252c010769..244d233b302 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -108,7 +108,7 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, const auto & settings = 
context->getSettingsRef(); auto s3_settings = getSettings( - config, "s3"/* config_prefix */, context, url.endpoint, settings.s3_validate_request_settings); + config, "s3"/* config_prefix */, context, url.uri_str, settings.s3_validate_request_settings); s3_settings->auth_settings.updateIfChanged(auth_settings); s3_settings->request_settings.updateIfChanged(request_settings); From 779eaa01ec54ad06a8ceff4b9efc125439a9671c Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 31 May 2024 17:51:00 +0200 Subject: [PATCH 047/215] Fix tests --- src/Backups/BackupIO_S3.cpp | 34 ++++++++++------ .../ObjectStorages/S3/S3ObjectStorage.cpp | 9 ++++- src/Disks/ObjectStorages/S3/diskSettings.cpp | 3 +- src/Disks/getOrCreateDiskFromAST.cpp | 2 +- src/IO/S3Common.cpp | 39 +------------------ src/IO/S3Common.h | 12 +----- src/IO/S3Settings.cpp | 10 +++-- .../ObjectStorage/S3/Configuration.cpp | 9 +++-- 8 files changed, 49 insertions(+), 69 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 2d8ae7f1b00..273d4b4ebe8 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -127,12 +127,18 @@ BackupReaderS3::BackupReaderS3( : BackupReaderDefault(read_settings_, write_settings_, getLogger("BackupReaderS3")) , s3_uri(s3_uri_) , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} - , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName(), /*ignore_user=*/is_internal_backup).value_or(S3Settings{})) { - auto & request_settings = s3_settings.request_settings; - request_settings.updateFromSettings(context_->getSettingsRef(), /* if_changed */true); - request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint - request_settings.allow_native_copy = allow_s3_native_copy; + auto endpoint_settings = context_->getStorageS3Settings().getSettings( + s3_uri.uri.toString(), + context_->getUserName(), + /*ignore_user=*/is_internal_backup); + + if (endpoint_settings.has_value()) + s3_settings = endpoint_settings.value(); + + s3_settings.request_settings.updateFromSettings(context_->getSettingsRef(), /* if_changed */true); + s3_settings.request_settings.allow_native_copy = allow_s3_native_copy; + client = makeS3Client(s3_uri_, access_key_id_, secret_access_key_, s3_settings, context_); if (auto blob_storage_system_log = context_->getBlobStorageLog()) @@ -219,13 +225,19 @@ BackupWriterS3::BackupWriterS3( : BackupWriterDefault(read_settings_, write_settings_, getLogger("BackupWriterS3")) , s3_uri(s3_uri_) , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} - , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName(), /*ignore_user=*/is_internal_backup).value_or(S3Settings{})) { - auto & request_settings = s3_settings.request_settings; - request_settings.updateFromSettings(context_->getSettingsRef(), /* if_changed */true); - request_settings.max_single_read_retries = context_->getSettingsRef().s3_max_single_read_retries; // FIXME: Avoid taking value for endpoint - request_settings.allow_native_copy = allow_s3_native_copy; - request_settings.storage_class_name = storage_class_name; + auto endpoint_settings = context_->getStorageS3Settings().getSettings( + s3_uri.uri.toString(), + context_->getUserName(), + 
/*ignore_user=*/is_internal_backup); + + if (endpoint_settings.has_value()) + s3_settings = endpoint_settings.value(); + + s3_settings.request_settings.updateFromSettings(context_->getSettingsRef(), /* if_changed */true); + s3_settings.request_settings.allow_native_copy = allow_s3_native_copy; + s3_settings.request_settings.storage_class_name = storage_class_name; + client = makeS3Client(s3_uri_, access_key_id_, secret_access_key_, s3_settings, context_); if (auto blob_storage_system_log = context_->getBlobStorageLog()) { diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 9b862d2c0b3..56cf7a1b9e6 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -262,9 +262,11 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN /// NOTE: For background operations settings are not propagated from session or query. They are taken from /// default user's .xml config. It's obscure and unclear behavior. For them it's always better /// to rely on settings from disk. - if (auto query_context = CurrentThread::getQueryContext(); query_context && !query_context->isBackgroundOperationContext()) + if (auto query_context = CurrentThread::getQueryContext(); + query_context && !query_context->isBackgroundOperationContext()) { - request_settings.updateFromSettings(query_context->getSettingsRef(), /* if_changed */true); + const auto & settings = query_context->getSettingsRef(); + request_settings.updateFromSettings(settings, /* if_changed */true, settings.s3_validate_request_settings); } ThreadPoolCallbackRunnerUnsafe scheduler; @@ -586,7 +588,10 @@ void S3ObjectStorage::applyNewSettings( modified_settings->request_settings.updateIfChanged(settings_from_config->request_settings); if (auto endpoint_settings = context->getStorageS3Settings().getSettings(uri.uri.toString(), context->getUserName())) + { modified_settings->auth_settings.updateIfChanged(endpoint_settings->auth_settings); + modified_settings->request_settings.updateIfChanged(endpoint_settings->request_settings); + } auto current_settings = s3_settings.get(); if (options.allow_client_change diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index bdaacbf62cd..62df98f51e6 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -38,8 +38,9 @@ std::unique_ptr getSettings( bool validate_settings) { const auto & settings = context->getSettingsRef(); + auto auth_settings = S3::AuthSettings(config, settings, config_prefix); - auto request_settings = S3::RequestSettings(config, settings, config_prefix, validate_settings); + auto request_settings = S3::RequestSettings(config, settings, config_prefix, "s3_", validate_settings); request_settings.proxy_resolver = DB::ProxyConfigurationResolverProvider::getFromOldSettingsFormat( ProxyConfiguration::protocolFromString(S3::URI(endpoint).uri.getScheme()), config_prefix, config); diff --git a/src/Disks/getOrCreateDiskFromAST.cpp b/src/Disks/getOrCreateDiskFromAST.cpp index 7b2762613b6..fd43f31a009 100644 --- a/src/Disks/getOrCreateDiskFromAST.cpp +++ b/src/Disks/getOrCreateDiskFromAST.cpp @@ -47,7 +47,7 @@ namespace auto result_disk = context->getOrCreateDisk(disk_name, [&](const DisksMap & disks_map) -> DiskPtr { auto disk = DiskFactory::instance().create( - disk_name, *config, "", context, disks_map, /* attach */attach, /* custom_disk */true); + disk_name, *config, /* config_path 
*/"", context, disks_map, /* attach */attach, /* custom_disk */true); /// Mark that disk can be used without storage policy. disk->markDiskAsCustom(); return disk; diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index a545d12aade..54881227d13 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -127,22 +127,13 @@ static bool setValueFromConfig( AuthSettings::AuthSettings( const Poco::Util::AbstractConfiguration & config, const DB::Settings & settings, - const std::string & config_prefix, - const std::string & fallback_config_prefix) + const std::string & config_prefix) { - if (config_prefix.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Config path cannot be empty"); - for (auto & field : allMutable()) { auto path = fmt::format("{}.{}", config_prefix, field.getName()); - auto fallback_path = fallback_config_prefix.empty() ? "" : fmt::format("{}.{}", fallback_config_prefix, field.getName()); bool updated = setValueFromConfig(config, path, field); - - if (!updated && !fallback_path.empty()) - updated = setValueFromConfig(config, fallback_path, field); - if (!updated) { auto setting_name = "s3_" + field.getName(); @@ -208,40 +199,14 @@ RequestSettings::RequestSettings( const Poco::Util::AbstractConfiguration & config, const DB::Settings & settings, const std::string & config_prefix, - bool validate_settings) - : RequestSettings( - config, - settings, - config_prefix, - validate_settings, - config_prefix == "s3" ? "" : "s3_", /* setting_name_prefix */ - config_prefix == "s3" ? "" : "s3", /* fallback_config_prefix */ - "") /* fallback_setting_name_prefix */ -{ -} - -RequestSettings::RequestSettings( - const Poco::Util::AbstractConfiguration & config, - const DB::Settings & settings, - const std::string & config_prefix, - bool validate_settings, const std::string & setting_name_prefix, - const std::string & fallback_config_prefix, - const std::string & fallback_setting_name_prefix) + bool validate_settings) { - if (config_prefix.empty()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Config path cannot be empty"); - for (auto & field : allMutable()) { auto path = fmt::format("{}.{}{}", config_prefix, setting_name_prefix, field.getName()); - auto fallback_path = fallback_config_prefix.empty() ? "" : fmt::format("{}.{}{}", fallback_config_prefix, fallback_setting_name_prefix, field.getName()); bool updated = setValueFromConfig(config, path, field); - - if (!updated && !fallback_path.empty()) - updated = setValueFromConfig(config, fallback_path, field); - if (!updated) { auto setting_name = "s3_" + field.getName(); diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 6391075a5ab..68d44b0ed01 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -144,8 +144,7 @@ struct AuthSettings : public BaseSettings AuthSettings( const Poco::Util::AbstractConfiguration & config, const DB::Settings & settings, - const std::string & config_prefix, - const std::string & fallback_config_prefix = ""); + const std::string & config_prefix); AuthSettings(const DB::Settings & settings); @@ -170,6 +169,7 @@ struct RequestSettings : public BaseSettings const Poco::Util::AbstractConfiguration & config, const DB::Settings & settings, const std::string & config_prefix, + const std::string & setting_name_prefix = "", bool validate_settings = true); /// Create request settings from DB::Settings. 
@@ -190,14 +190,6 @@ struct RequestSettings : public BaseSettings std::shared_ptr proxy_resolver; private: - RequestSettings( - const Poco::Util::AbstractConfiguration & config, - const DB::Settings & settings, - const std::string & config_prefix, - bool validate_settings, - const std::string & setting_name_prefix, - const std::string & fallback_config_prefix, - const std::string & fallback_setting_name_prefix); void finishInit(const DB::Settings & settings, bool validate_settings); void normalizeSettings(); }; diff --git a/src/IO/S3Settings.cpp b/src/IO/S3Settings.cpp index 0323da53117..396eb8f5ec7 100644 --- a/src/IO/S3Settings.cpp +++ b/src/IO/S3Settings.cpp @@ -27,10 +27,12 @@ void S3SettingsByEndpoint::loadFromConfig( const auto endpoint_path = key_path + ".endpoint"; if (config.has(endpoint_path)) { - auto endpoint = config.getString(endpoint_path); - auto auth_settings = S3::AuthSettings(config, settings, config_prefix); - auto request_settings = S3::RequestSettings(config, settings, config_prefix, settings.s3_validate_request_settings); - s3_settings.emplace(endpoint, S3Settings{std::move(auth_settings), std::move(request_settings)}); + auto auth_settings = S3::AuthSettings(config, settings, key_path); + auto request_settings = S3::RequestSettings(config, settings, key_path, "", settings.s3_validate_request_settings); + + s3_settings.emplace( + config.getString(endpoint_path), + S3Settings{std::move(auth_settings), std::move(request_settings)}); } } } diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 244d233b302..8720e6440da 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -110,9 +110,6 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, auto s3_settings = getSettings( config, "s3"/* config_prefix */, context, url.uri_str, settings.s3_validate_request_settings); - s3_settings->auth_settings.updateIfChanged(auth_settings); - s3_settings->request_settings.updateIfChanged(request_settings); - if (!headers_from_ast.empty()) { s3_settings->auth_settings.headers.insert( @@ -121,7 +118,13 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, } if (auto endpoint_settings = context->getStorageS3Settings().getSettings(url.uri.toString(), context->getUserName())) + { s3_settings->auth_settings.updateIfChanged(endpoint_settings->auth_settings); + s3_settings->request_settings.updateIfChanged(endpoint_settings->request_settings); + } + + s3_settings->auth_settings.updateIfChanged(auth_settings); + s3_settings->request_settings.updateIfChanged(request_settings); auto client = getClient(url, *s3_settings, context, /* for_disk_s3 */false); auto key_generator = createObjectStorageKeysGeneratorAsIsWithPrefix(url.key); From 5246c56a2aae742d498028a7d5d2b7f9aa124baf Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 30 May 2024 16:26:49 +0200 Subject: [PATCH 048/215] Fix type inference for float (in case of small buffer) In case of small buffer (i.e. --max_read_buffer_size 1) the pos() will be always point to this one byte, so, comparing pos() will be always evaluated to true. And we cannot use count() as well, since in case of big buffer it will be the same, plus, in case of reading extra byte for checking for '.' 
the count() will be different, but it does not mean that the byte had been interpreted (and allowing 1 byte of difference will not work almost always, since it will read max_read_buffer_size bytes). So instead, expose the has_fractional flag from the read helpers for float, via two new methods: - tryReadFloatTextExt - tryReadFloatTextExtNoExponent Where "ext" stands for "extended", which means expose extra information. v2: consider number as float if it has '.' or 'e' (previously only if it has some signs after those two it had been considered as float) Signed-off-by: Azat Khuzhin --- src/Formats/SchemaInferenceUtils.cpp | 30 +++++++------- src/IO/readFloatText.h | 39 +++++++++++++++++-- ...oat_schema_inference_small_block.reference | 15 +++++++ ...3170_float_schema_inference_small_block.sh | 32 +++++++++++++++ 4 files changed, 95 insertions(+), 21 deletions(-) create mode 100644 tests/queries/0_stateless/03170_float_schema_inference_small_block.reference create mode 100755 tests/queries/0_stateless/03170_float_schema_inference_small_block.sh diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index 02c0aa6dd77..80a467a1145 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -879,11 +879,11 @@ namespace } template - bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings) + bool tryReadFloat(Float64 & value, ReadBuffer & buf, const FormatSettings & settings, bool & has_fractional) { if (is_json || settings.try_infer_exponent_floats) - return tryReadFloatText(value, buf); - return tryReadFloatTextNoExponent(value, buf); + return tryReadFloatTextExt(value, buf, has_fractional); + return tryReadFloatTextExtNoExponent(value, buf, has_fractional); } template @@ -893,6 +893,7 @@ namespace return nullptr; Float64 tmp_float; + bool has_fractional; if (settings.try_infer_integers) { /// If we read from String, we can do it in a more efficient way. @@ -906,12 +907,10 @@ namespace if (buf.eof()) return read_int ? std::make_shared() : nullptr; - char * int_end = buf.position(); /// We can safely get back to the start of the number, because we read from a string and we didn't reach eof. buf.position() = number_start; bool read_uint = false; - char * uint_end = nullptr; /// In case of Int64 overflow we can try to infer UInt64. if (!read_int) { @@ -921,15 +920,14 @@ namespace if (buf.eof()) return read_uint ? std::make_shared() : nullptr; - uint_end = buf.position(); buf.position() = number_start; } - if (tryReadFloat(tmp_float, buf, settings)) + if (tryReadFloat(tmp_float, buf, settings, has_fractional)) { - if (read_int && buf.position() == int_end) + if (read_int && !has_fractional) return std::make_shared(); - if (read_uint && buf.position() == uint_end) + if (read_uint && !has_fractional) return std::make_shared(); return std::make_shared(); } @@ -944,34 +942,31 @@ namespace PeekableReadBufferCheckpoint checkpoint(peekable_buf); Int64 tmp_int; bool read_int = tryReadIntText(tmp_int, peekable_buf); - auto * int_end = peekable_buf.position(); peekable_buf.rollbackToCheckpoint(true); bool read_uint = false; - char * uint_end = nullptr; /// In case of Int64 overflow we can try to infer UInt64. 
if (!read_int) { PeekableReadBufferCheckpoint new_checkpoint(peekable_buf); UInt64 tmp_uint; read_uint = tryReadIntText(tmp_uint, peekable_buf); - uint_end = peekable_buf.position(); peekable_buf.rollbackToCheckpoint(true); } - if (tryReadFloat(tmp_float, peekable_buf, settings)) + if (tryReadFloat(tmp_float, peekable_buf, settings, has_fractional)) { /// Float parsing reads no fewer bytes than integer parsing, /// so position of the buffer is either the same, or further. /// If it's the same, then it's integer. - if (read_int && peekable_buf.position() == int_end) + if (read_int && !has_fractional) return std::make_shared(); - if (read_uint && peekable_buf.position() == uint_end) + if (read_uint && !has_fractional) return std::make_shared(); return std::make_shared(); } } - else if (tryReadFloat(tmp_float, buf, settings)) + else if (tryReadFloat(tmp_float, buf, settings, has_fractional)) { return std::make_shared(); } @@ -1004,7 +999,8 @@ namespace buf.position() = buf.buffer().begin(); Float64 tmp; - if (tryReadFloat(tmp, buf, settings) && buf.eof()) + bool has_fractional; + if (tryReadFloat(tmp, buf, settings, has_fractional) && buf.eof()) return std::make_shared(); return nullptr; diff --git a/src/IO/readFloatText.h b/src/IO/readFloatText.h index 3a21d7201a9..215bb1a3270 100644 --- a/src/IO/readFloatText.h +++ b/src/IO/readFloatText.h @@ -320,11 +320,13 @@ static inline void readUIntTextUpToNSignificantDigits(T & x, ReadBuffer & buf) template -ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) +ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in, bool & has_fractional) { static_assert(std::is_same_v || std::is_same_v, "Argument for readFloatTextImpl must be float or double"); static_assert('a' > '.' && 'A' > '.' && '\n' < '.' && '\t' < '.' && '\'' < '.' 
&& '"' < '.', "Layout of char is not like ASCII"); + has_fractional = false; + static constexpr bool throw_exception = std::is_same_v; bool negative = false; @@ -377,6 +379,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) if (checkChar('.', in)) { + has_fractional = true; auto after_point_count = in.count(); while (!in.eof() && *in.position() == '0') @@ -394,6 +397,7 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) { if (checkChar('e', in) || checkChar('E', in)) { + has_fractional = true; if (in.eof()) { if constexpr (throw_exception) @@ -420,10 +424,14 @@ ReturnType readFloatTextFastImpl(T & x, ReadBuffer & in) } if (after_point) + { x += static_cast(shift10(after_point, after_point_exponent)); + } if (exponent) + { x = static_cast(shift10(x, exponent)); + } if (negative) x = -x; @@ -590,8 +598,16 @@ ReturnType readFloatTextSimpleImpl(T & x, ReadBuffer & buf) template void readFloatTextPrecise(T & x, ReadBuffer & in) { readFloatTextPreciseImpl(x, in); } template bool tryReadFloatTextPrecise(T & x, ReadBuffer & in) { return readFloatTextPreciseImpl(x, in); } -template void readFloatTextFast(T & x, ReadBuffer & in) { readFloatTextFastImpl(x, in); } -template bool tryReadFloatTextFast(T & x, ReadBuffer & in) { return readFloatTextFastImpl(x, in); } +template void readFloatTextFast(T & x, ReadBuffer & in) +{ + bool has_fractional; + readFloatTextFastImpl(x, in, has_fractional); +} +template bool tryReadFloatTextFast(T & x, ReadBuffer & in) +{ + bool has_fractional; + return readFloatTextFastImpl(x, in, has_fractional); +} template void readFloatTextSimple(T & x, ReadBuffer & in) { readFloatTextSimpleImpl(x, in); } template bool tryReadFloatTextSimple(T & x, ReadBuffer & in) { return readFloatTextSimpleImpl(x, in); } @@ -603,6 +619,21 @@ template void readFloatText(T & x, ReadBuffer & in) { readFloatText template bool tryReadFloatText(T & x, ReadBuffer & in) { return tryReadFloatTextFast(x, in); } /// Don't read exponent part of the number. 
-template bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in) { return readFloatTextFastImpl(x, in); } +template bool tryReadFloatTextNoExponent(T & x, ReadBuffer & in) +{ + bool has_fractional; + return readFloatTextFastImpl(x, in, has_fractional); +} + +/// With a @has_fractional flag +/// Used for input_format_try_infer_integers +template bool tryReadFloatTextExt(T & x, ReadBuffer & in, bool & has_fractional) +{ + return readFloatTextFastImpl(x, in, has_fractional); +} +template bool tryReadFloatTextExtNoExponent(T & x, ReadBuffer & in, bool & has_fractional) +{ + return readFloatTextFastImpl(x, in, has_fractional); +} } diff --git a/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference b/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference new file mode 100644 index 00000000000..9ee16da8728 --- /dev/null +++ b/tests/queries/0_stateless/03170_float_schema_inference_small_block.reference @@ -0,0 +1,15 @@ +Int64 +x Nullable(Int64) +x Nullable(Int64) +x Nullable(Int64) +Float64 +x Nullable(Float64) +x Nullable(Float64) +x Nullable(Float64) +x Nullable(Float64) +Float64.explicit File +x Nullable(Float64) +Float64.pipe +x Nullable(Float64) +Float64.default max_read_buffer_size +x Nullable(Float64) diff --git a/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh b/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh new file mode 100755 index 00000000000..88f9bfad7ed --- /dev/null +++ b/tests/queries/0_stateless/03170_float_schema_inference_small_block.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +# do not fallback to float always +echo "Int64" +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : -1}' + +echo "Float64" +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1.1}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1.1}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : 1.111}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' <<<'{"x" : +1.111}' + +# this is requried due to previously clickhouse-local does not interprets +# --max_read_buffer_size for fds [1] +# +# [1]: https://github.com/ClickHouse/ClickHouse/pull/64532 +echo "Float64.explicit File" +tmp_path=$(mktemp "$CUR_DIR/03170_float_schema_inference_small_block.json.XXXXXX") +trap 'rm -f $tmp_path' EXIT +cat > "$tmp_path" <<<'{"x" : 1.111}' +$CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' --file "$tmp_path" + +echo "Float64.pipe" +echo '{"x" : 1.1}' | $CLICKHOUSE_LOCAL --storage_file_read_method read --max_read_buffer_size 1 --input-format JSONEachRow 'desc "table"' +echo "Float64.default max_read_buffer_size" +echo '{"x" : 1.1}' | $CLICKHOUSE_LOCAL --storage_file_read_method read --input-format JSONEachRow 'desc 
"table"' From 5df91804f6559095e115ec417aeeeab368b2975f Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 3 Jun 2024 11:51:35 +0200 Subject: [PATCH 049/215] Add probability setting for GWPAsan --- programs/server/Server.cpp | 8 ++++++++ src/Common/Allocator.cpp | 14 +++++++------- src/Common/GWPAsan.cpp | 14 ++++++++++++-- src/Common/GWPAsan.h | 18 ++++++++++++++++-- src/Common/PODArray.h | 22 +++++++++++++--------- src/Common/memory.h | 26 +++++++++++++------------- src/Core/ServerSettings.h | 1 + src/Daemon/BaseDaemon.cpp | 4 ++-- 8 files changed, 72 insertions(+), 35 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 8fcb9d87a93..1822c016f4d 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1632,6 +1632,10 @@ try if (global_context->isServerCompletelyStarted()) CannotAllocateThreadFaultInjector::setFaultProbability(new_server_settings.cannot_allocate_thread_fault_injection_probability); +#if USE_GWP_ASAN + GWPAsan::setForceSampleProbability(new_server_settings.gwp_asan_force_sample_probability); +#endif + ProfileEvents::increment(ProfileEvents::MainConfigLoads); /// Must be the last. @@ -2124,6 +2128,10 @@ try CannotAllocateThreadFaultInjector::setFaultProbability(server_settings.cannot_allocate_thread_fault_injection_probability); +#if USE_GWP_ASAN + GWPAsan::setForceSampleProbability(server_settings.gwp_asan_force_sample_probability); +#endif + try { global_context->startClusterDiscovery(); diff --git a/src/Common/Allocator.cpp b/src/Common/Allocator.cpp index 87075a8c709..bfc85559fe8 100644 --- a/src/Common/Allocator.cpp +++ b/src/Common/Allocator.cpp @@ -68,9 +68,9 @@ void * allocNoTrack(size_t size, size_t alignment) { void * buf; #if USE_GWP_ASAN - if (unlikely(Memory::GuardedAlloc.shouldSample())) + if (unlikely(GWPAsan::GuardedAlloc.shouldSample())) { - if (void * ptr = Memory::GuardedAlloc.allocate(size, alignment)) + if (void * ptr = GWPAsan::GuardedAlloc.allocate(size, alignment)) { if constexpr (clear_memory) memset(ptr, 0, size); @@ -120,10 +120,10 @@ void * allocNoTrack(size_t size, size_t alignment) void freeNoTrack(void * buf) { #if USE_GWP_ASAN - if (unlikely(Memory::GuardedAlloc.pointerIsMine(buf))) + if (unlikely(GWPAsan::GuardedAlloc.pointerIsMine(buf))) { ProfileEvents::increment(ProfileEvents::GWPAsanFree); - Memory::GuardedAlloc.deallocate(buf); + GWPAsan::GuardedAlloc.deallocate(buf); return; } #endif @@ -185,9 +185,9 @@ void * Allocator::realloc(void * buf, size_t old_size, } #if USE_GWP_ASAN - if (unlikely(Memory::GuardedAlloc.shouldSample())) + if (unlikely(GWPAsan::GuardedAlloc.shouldSample())) { - if (void * ptr = Memory::GuardedAlloc.allocate(new_size, alignment)) + if (void * ptr = GWPAsan::GuardedAlloc.allocate(new_size, alignment)) { auto trace_free = CurrentMemoryTracker::free(old_size); auto trace_alloc = CurrentMemoryTracker::alloc(new_size); @@ -213,7 +213,7 @@ void * Allocator::realloc(void * buf, size_t old_size, } } - if (unlikely(Memory::GuardedAlloc.pointerIsMine(buf))) + if (unlikely(GWPAsan::GuardedAlloc.pointerIsMine(buf))) { /// Big allocs that requires a copy. MemoryTracker is called inside 'alloc', 'free' methods. 
void * new_buf = alloc(new_size, alignment); diff --git a/src/Common/GWPAsan.cpp b/src/Common/GWPAsan.cpp index ecff097e365..4bda2f7e913 100644 --- a/src/Common/GWPAsan.cpp +++ b/src/Common/GWPAsan.cpp @@ -12,9 +12,10 @@ # include # include +# include # include -namespace Memory +namespace GWPAsan { namespace @@ -53,6 +54,7 @@ static bool guarded_alloc_initialized = [] auto & opts = gwp_asan::options::getOptions(); opts.Backtrace = getBackTrace; + opts.MaxSimultaneousAllocations = 256; /// for testing GuardedAlloc.init(opts); ///std::cerr << "GwpAsan is initialized, the options are { Enabled: " << opts.Enabled @@ -155,7 +157,7 @@ void printHeader(gwp_asan::Error error, uintptr_t fault_address, const gwp_asan: } -void printGWPAsanReport([[maybe_unused]] uintptr_t fault_address) +void printReport([[maybe_unused]] uintptr_t fault_address) { const auto logger = getLogger("GWPAsan"); const auto * state = GuardedAlloc.getAllocatorState(); @@ -212,5 +214,13 @@ void printGWPAsanReport([[maybe_unused]] uintptr_t fault_address) reinterpret_cast(trace.data()), 0, trace_length, [&](const auto line) { LOG_FATAL(logger, fmt::runtime(line)); }); } +std::atomic force_sample_probability = 0.0; + +void setForceSampleProbability(double value) +{ + force_sample_probability.store(value, std::memory_order_relaxed); } + +} + #endif diff --git a/src/Common/GWPAsan.h b/src/Common/GWPAsan.h index 164c6ee0221..b3215c6157e 100644 --- a/src/Common/GWPAsan.h +++ b/src/Common/GWPAsan.h @@ -5,15 +5,29 @@ #if USE_GWP_ASAN #include +#include -namespace Memory +#include +#include + +namespace GWPAsan { extern gwp_asan::GuardedPoolAllocator GuardedAlloc; bool isGWPAsanError(uintptr_t fault_address); -void printGWPAsanReport(uintptr_t fault_address); +void printReport(uintptr_t fault_address); + +extern std::atomic force_sample_probability; + +void setForceSampleProbability(double value); + +inline bool shouldForceSample() +{ + std::bernoulli_distribution dist(force_sample_probability.load(std::memory_order_relaxed)); + return dist(thread_local_rng); +} } diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index b9246bcdca2..d004e703ac6 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -2,17 +2,19 @@ #include "config.h" -#include -#include -#include -#include #include #include +#include +#include +#include +#include +#include + +#include +#include +#include #include #include -#include -#include -#include #ifndef NDEBUG #include @@ -119,7 +121,8 @@ protected: void alloc(size_t bytes, TAllocatorParams &&... allocator_params) { #if USE_GWP_ASAN - gwp_asan::getThreadLocals()->NextSampleCounter = 1; + if (unlikely(GWPAsan::shouldForceSample())) + gwp_asan::getThreadLocals()->NextSampleCounter = 1; #endif char * allocated = reinterpret_cast(TAllocator::alloc(bytes, std::forward(allocator_params)...)); @@ -152,7 +155,8 @@ protected: } #if USE_GWP_ASAN - gwp_asan::getThreadLocals()->NextSampleCounter = 1; + if (unlikely(GWPAsan::shouldForceSample())) + gwp_asan::getThreadLocals()->NextSampleCounter = 1; #endif unprotect(); diff --git a/src/Common/memory.h b/src/Common/memory.h index 633994a83e2..73b86c5d3ca 100644 --- a/src/Common/memory.h +++ b/src/Common/memory.h @@ -37,11 +37,11 @@ requires DB::OptionalArgument inline ALWAYS_INLINE void * newImpl(std::size_t size, TAlign... 
align) { #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.shouldSample())) + if (unlikely(GWPAsan::GuardedAlloc.shouldSample())) { if constexpr (sizeof...(TAlign) == 1) { - if (void * ptr = GuardedAlloc.allocate(size, alignToSizeT(align...))) + if (void * ptr = GWPAsan::GuardedAlloc.allocate(size, alignToSizeT(align...))) { ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); return ptr; @@ -53,7 +53,7 @@ inline ALWAYS_INLINE void * newImpl(std::size_t size, TAlign... align) } else { - if (void * ptr = GuardedAlloc.allocate(size)) + if (void * ptr = GWPAsan::GuardedAlloc.allocate(size)) { ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); return ptr; @@ -83,9 +83,9 @@ inline ALWAYS_INLINE void * newImpl(std::size_t size, TAlign... align) inline ALWAYS_INLINE void * newNoExept(std::size_t size) noexcept { #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.shouldSample())) + if (unlikely(GWPAsan::GuardedAlloc.shouldSample())) { - if (void * ptr = GuardedAlloc.allocate(size)) + if (void * ptr = GWPAsan::GuardedAlloc.allocate(size)) { ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); return ptr; @@ -102,9 +102,9 @@ inline ALWAYS_INLINE void * newNoExept(std::size_t size) noexcept inline ALWAYS_INLINE void * newNoExept(std::size_t size, std::align_val_t align) noexcept { #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.shouldSample())) + if (unlikely(GWPAsan::GuardedAlloc.shouldSample())) { - if (void * ptr = GuardedAlloc.allocate(size, alignToSizeT(align))) + if (void * ptr = GWPAsan::GuardedAlloc.allocate(size, alignToSizeT(align))) { ProfileEvents::increment(ProfileEvents::GWPAsanAllocateSuccess); return ptr; @@ -121,10 +121,10 @@ inline ALWAYS_INLINE void * newNoExept(std::size_t size, std::align_val_t align) inline ALWAYS_INLINE void deleteImpl(void * ptr) noexcept { #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.pointerIsMine(ptr))) + if (unlikely(GWPAsan::GuardedAlloc.pointerIsMine(ptr))) { ProfileEvents::increment(ProfileEvents::GWPAsanFree); - GuardedAlloc.deallocate(ptr); + GWPAsan::GuardedAlloc.deallocate(ptr); return; } #endif @@ -141,10 +141,10 @@ inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size, TAlign... al return; #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.pointerIsMine(ptr))) + if (unlikely(GWPAsan::GuardedAlloc.pointerIsMine(ptr))) { ProfileEvents::increment(ProfileEvents::GWPAsanFree); - GuardedAlloc.deallocate(ptr); + GWPAsan::GuardedAlloc.deallocate(ptr); return; } #endif @@ -217,10 +217,10 @@ inline ALWAYS_INLINE size_t untrackMemory(void * ptr [[maybe_unused]], Allocatio std::size_t actual_size = 0; #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.pointerIsMine(ptr))) + if (unlikely(GWPAsan::GuardedAlloc.pointerIsMine(ptr))) { if (!size) - size = GuardedAlloc.getSize(ptr); + size = GWPAsan::GuardedAlloc.getSize(ptr); trace = CurrentMemoryTracker::free(size); return size; } diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index 45f235116ab..55d8a8f0ec7 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -146,6 +146,7 @@ namespace DB M(UInt64, global_profiler_real_time_period_ns, 0, "Period for real clock timer of global profiler (in nanoseconds). Set 0 value to turn off the real clock global profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(UInt64, global_profiler_cpu_time_period_ns, 0, "Period for CPU clock timer of global profiler (in nanoseconds). 
Set 0 value to turn off the CPU clock global profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(Bool, enable_azure_sdk_logging, false, "Enables logging from Azure sdk", 0) \ + M(Double, gwp_asan_force_sample_probability, 0.1, "Probability that an allocation from specific places will be sampled by GWP Asan (i.e. PODArray allocations)", 0) \ /// If you add a setting which can be updated at runtime, please update 'changeable_settings' map in StorageSystemServerSettings.cpp diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index 3f7ad8d7126..dc8cdf7deff 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -159,8 +159,8 @@ static void signalHandler(int sig, siginfo_t * info, void * context) #if USE_GWP_ASAN if (const auto fault_address = reinterpret_cast(info->si_addr); - ::Memory::isGWPAsanError(fault_address)) - ::Memory::printGWPAsanReport(fault_address); + GWPAsan::isGWPAsanError(fault_address)) + GWPAsan::printReport(fault_address); #endif writeBinary(sig, out); From a035e4ed698230117303114a12dd67de6929470c Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 3 Jun 2024 11:52:37 +0200 Subject: [PATCH 050/215] Fix tests --- .../ObjectStorages/S3/S3ObjectStorage.cpp | 2 +- src/IO/S3Common.cpp | 25 ++++++++++++++++++- src/IO/S3Common.h | 7 ++++-- .../ObjectStorage/S3/Configuration.cpp | 14 +++++------ 4 files changed, 37 insertions(+), 11 deletions(-) diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 56cf7a1b9e6..fab3fa357cc 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -597,7 +597,7 @@ void S3ObjectStorage::applyNewSettings( if (options.allow_client_change && (current_settings->auth_settings.hasUpdates(modified_settings->auth_settings) || for_disk_s3)) { - auto new_client = getClient(uri, *settings_from_config, context, for_disk_s3); + auto new_client = getClient(uri, *modified_settings, context, for_disk_s3); client.set(std::move(new_client)); } s3_settings.set(std::move(modified_settings)); diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index 54881227d13..490bf8c2d0c 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -192,7 +192,10 @@ void AuthSettings::updateIfChanged(const AuthSettings & settings) if (!settings.users.empty()) users.insert(settings.users.begin(), settings.users.end()); - server_side_encryption_kms_config = settings.server_side_encryption_kms_config; + if (settings.server_side_encryption_kms_config.key_id.has_value() + || settings.server_side_encryption_kms_config.encryption_context.has_value() + || settings.server_side_encryption_kms_config.key_id.has_value()) + server_side_encryption_kms_config = settings.server_side_encryption_kms_config; } RequestSettings::RequestSettings( @@ -390,6 +393,26 @@ void RequestSettings::validateUploadSettings() /// We can check that max possible object size is not too small. 
} +bool operator==(const AuthSettings & left, const AuthSettings & right) +{ + if (left.headers != right.headers) + return false; + + if (left.users != right.users) + return false; + + if (left.server_side_encryption_kms_config != right.server_side_encryption_kms_config) + return false; + + auto l = left.begin(); + for (const auto & r : right) + { + if ((l == left.end()) || (*l != r)) + return false; + ++l; + } + return l == left.end(); +} } IMPLEMENT_SETTINGS_TRAITS(S3::AuthSettingsTraits, CLIENT_SETTINGS_LIST) diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 68d44b0ed01..2dca08871d3 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -146,9 +146,9 @@ struct AuthSettings : public BaseSettings const DB::Settings & settings, const std::string & config_prefix); - AuthSettings(const DB::Settings & settings); + explicit AuthSettings(const DB::Settings & settings); - AuthSettings(const DB::NamedCollection & collection); + explicit AuthSettings(const DB::NamedCollection & collection); void updateFromSettings(const DB::Settings & settings, bool if_changed); bool hasUpdates(const AuthSettings & other) const; @@ -158,8 +158,11 @@ struct AuthSettings : public BaseSettings HTTPHeaderEntries headers; std::unordered_set users; ServerSideEncryptionKMSConfig server_side_encryption_kms_config; + /// Note: if you add any field, do not forget to update operator ==. }; +bool operator==(const AuthSettings & left, const AuthSettings & right); + struct RequestSettings : public BaseSettings { RequestSettings() = default; diff --git a/src/Storages/ObjectStorage/S3/Configuration.cpp b/src/Storages/ObjectStorage/S3/Configuration.cpp index 8720e6440da..b33d55105e9 100644 --- a/src/Storages/ObjectStorage/S3/Configuration.cpp +++ b/src/Storages/ObjectStorage/S3/Configuration.cpp @@ -110,13 +110,6 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, auto s3_settings = getSettings( config, "s3"/* config_prefix */, context, url.uri_str, settings.s3_validate_request_settings); - if (!headers_from_ast.empty()) - { - s3_settings->auth_settings.headers.insert( - s3_settings->auth_settings.headers.end(), - headers_from_ast.begin(), headers_from_ast.end()); - } - if (auto endpoint_settings = context->getStorageS3Settings().getSettings(url.uri.toString(), context->getUserName())) { s3_settings->auth_settings.updateIfChanged(endpoint_settings->auth_settings); @@ -126,6 +119,13 @@ ObjectStoragePtr StorageS3Configuration::createObjectStorage(ContextPtr context, s3_settings->auth_settings.updateIfChanged(auth_settings); s3_settings->request_settings.updateIfChanged(request_settings); + if (!headers_from_ast.empty()) + { + s3_settings->auth_settings.headers.insert( + s3_settings->auth_settings.headers.end(), + headers_from_ast.begin(), headers_from_ast.end()); + } + auto client = getClient(url, *s3_settings, context, /* for_disk_s3 */false); auto key_generator = createObjectStorageKeysGeneratorAsIsWithPrefix(url.key); auto s3_capabilities = S3Capabilities From 918d3849e18e4eb8933fa869e2cf6c534424e6d5 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 31 May 2024 15:49:03 +0200 Subject: [PATCH 051/215] Simplify logic for input_format_try_infer_integers Now, when we can be sure that it is a float, parse it as a float, and fallback to int/uint after. But note, that this would break something if tryReadFloat() != tryReadIntText() + parsing of '.'/'e', but for now, it is true. 
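For illustration only, a standalone sketch of the resulting inference order (it uses the standard library rather than the ClickHouse read helpers, and it simplifies sign and error handling): a token is treated as Float64 as soon as it contains '.' or 'e'/'E'; otherwise Int64 is tried first, and UInt64 is tried only to cover Int64 overflow.

    #include <charconv>
    #include <cstdint>
    #include <iostream>
    #include <string_view>

    // Sketch of the simplified order: decide "float" purely from the presence
    // of '.'/'e'/'E', otherwise try Int64 first and UInt64 second (UInt64 only
    // covers the Int64 overflow case). Sign handling here is stricter than the
    // real tryReadIntText()/tryReadFloat(); this only shows the ordering.
    static std::string_view inferNumberType(std::string_view s)
    {
        const bool has_fractional = s.find_first_of(".eE") != std::string_view::npos;
        if (has_fractional)
            return "Float64";

        const char * begin = s.data();
        const char * end = s.data() + s.size();

        int64_t i64;
        if (auto [ptr, ec] = std::from_chars(begin, end, i64); ec == std::errc() && ptr == end)
            return "Int64";

        uint64_t u64;
        if (auto [ptr, ec] = std::from_chars(begin, end, u64); ec == std::errc() && ptr == end)
            return "UInt64";

        return "none"; // not a number this sketch understands
    }

    int main()
    {
        std::cout << inferNumberType("1") << '\n';                    // Int64
        std::cout << inferNumberType("1.1") << '\n';                  // Float64
        std::cout << inferNumberType("1e10") << '\n';                 // Float64
        std::cout << inferNumberType("18446744073709551615") << '\n'; // UInt64
    }

The real implementation applies the same ordering on top of tryReadFloat()/tryReadIntText() with buffer checkpoints, as can be seen in the diff below.
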
Signed-off-by: Azat Khuzhin --- src/Formats/SchemaInferenceUtils.cpp | 78 ++++++++++------------------ 1 file changed, 27 insertions(+), 51 deletions(-) diff --git a/src/Formats/SchemaInferenceUtils.cpp b/src/Formats/SchemaInferenceUtils.cpp index 80a467a1145..1dc612728f5 100644 --- a/src/Formats/SchemaInferenceUtils.cpp +++ b/src/Formats/SchemaInferenceUtils.cpp @@ -900,37 +900,24 @@ namespace if (auto * string_buf = dynamic_cast(&buf)) { /// Remember the pointer to the start of the number to rollback to it. - char * number_start = buf.position(); - Int64 tmp_int; - bool read_int = tryReadIntText(tmp_int, buf); - /// If we reached eof, it cannot be float (it requires no less data than integer) - if (buf.eof()) - return read_int ? std::make_shared() : nullptr; - /// We can safely get back to the start of the number, because we read from a string and we didn't reach eof. - buf.position() = number_start; + char * number_start = buf.position(); - bool read_uint = false; - /// In case of Int64 overflow we can try to infer UInt64. - if (!read_int) - { - UInt64 tmp_uint; - read_uint = tryReadIntText(tmp_uint, buf); - /// If we reached eof, it cannot be float (it requires no less data than integer) - if (buf.eof()) - return read_uint ? std::make_shared() : nullptr; - - buf.position() = number_start; - } - - if (tryReadFloat(tmp_float, buf, settings, has_fractional)) - { - if (read_int && !has_fractional) - return std::make_shared(); - if (read_uint && !has_fractional) - return std::make_shared(); + /// NOTE: it may break parsing of tryReadFloat() != tryReadIntText() + parsing of '.'/'e' + /// But, for now it is true + if (tryReadFloat(tmp_float, buf, settings, has_fractional) && has_fractional) return std::make_shared(); - } + + Int64 tmp_int; + buf.position() = number_start; + if (tryReadIntText(tmp_int, buf)) + return std::make_shared(); + + /// In case of Int64 overflow we can try to infer UInt64. + UInt64 tmp_uint; + buf.position() = number_start; + if (tryReadIntText(tmp_uint, buf)) + return std::make_shared(); return nullptr; } @@ -940,31 +927,20 @@ namespace /// and then as float. PeekableReadBuffer peekable_buf(buf); PeekableReadBufferCheckpoint checkpoint(peekable_buf); - Int64 tmp_int; - bool read_int = tryReadIntText(tmp_int, peekable_buf); - peekable_buf.rollbackToCheckpoint(true); - bool read_uint = false; - /// In case of Int64 overflow we can try to infer UInt64. - if (!read_int) - { - PeekableReadBufferCheckpoint new_checkpoint(peekable_buf); - UInt64 tmp_uint; - read_uint = tryReadIntText(tmp_uint, peekable_buf); - peekable_buf.rollbackToCheckpoint(true); - } - - if (tryReadFloat(tmp_float, peekable_buf, settings, has_fractional)) - { - /// Float parsing reads no fewer bytes than integer parsing, - /// so position of the buffer is either the same, or further. - /// If it's the same, then it's integer. - if (read_int && !has_fractional) - return std::make_shared(); - if (read_uint && !has_fractional) - return std::make_shared(); + if (tryReadFloat(tmp_float, peekable_buf, settings, has_fractional) && has_fractional) return std::make_shared(); - } + peekable_buf.rollbackToCheckpoint(/* drop= */ false); + + Int64 tmp_int; + if (tryReadIntText(tmp_int, peekable_buf)) + return std::make_shared(); + peekable_buf.rollbackToCheckpoint(/* drop= */ true); + + /// In case of Int64 overflow we can try to infer UInt64. 
+ UInt64 tmp_uint; + if (tryReadIntText(tmp_uint, peekable_buf)) + return std::make_shared(); } else if (tryReadFloat(tmp_float, buf, settings, has_fractional)) { From e1bf007bb97f12f379512d0d9a24235a0114f0b3 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 3 Jun 2024 14:18:15 +0200 Subject: [PATCH 052/215] Small fix --- src/Common/PODArray.h | 5 ----- src/Common/memory.h | 4 ++-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index d004e703ac6..92ef0597c7e 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -20,11 +20,6 @@ #include #endif -#if USE_GWP_ASAN -# include - -#endif - /** Whether we can use memcpy instead of a loop with assignment to T from U. * It is Ok if types are the same. And if types are integral and of the same size, * example: char, signed char, unsigned char. diff --git a/src/Common/memory.h b/src/Common/memory.h index 73b86c5d3ca..caa0418fa56 100644 --- a/src/Common/memory.h +++ b/src/Common/memory.h @@ -162,10 +162,10 @@ requires DB::OptionalArgument inline ALWAYS_INLINE void deleteSized(void * ptr, std::size_t size [[maybe_unused]], TAlign... /* align */) noexcept { #if USE_GWP_ASAN - if (unlikely(GuardedAlloc.pointerIsMine(ptr))) + if (unlikely(GWPAsan::GuardedAlloc.pointerIsMine(ptr))) { ProfileEvents::increment(ProfileEvents::GWPAsanFree); - GuardedAlloc.deallocate(ptr); + GWPAsan::GuardedAlloc.deallocate(ptr); return; } #endif From 642e2fdd7211b8eaa2eb8012050cb2d84b73755b Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 3 Jun 2024 18:17:13 +0200 Subject: [PATCH 053/215] Fix clang-tidy --- src/IO/S3/copyS3File.cpp | 2 +- src/IO/WriteBufferFromS3.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index 22572b5fc66..c48d7965ac2 100644 --- a/src/IO/S3/copyS3File.cpp +++ b/src/IO/S3/copyS3File.cpp @@ -185,7 +185,7 @@ namespace request.SetMultipartUpload(multipart_upload); - size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); + size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t retries = 1;; ++retries) { ProfileEvents::increment(ProfileEvents::S3CompleteMultipartUpload); diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 7e3ba48c165..ac63281d328 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -610,7 +610,7 @@ void WriteBufferFromS3::completeMultipartUpload() req.SetMultipartUpload(multipart_upload); - size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); + size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t i = 0; i < max_retry; ++i) { ProfileEvents::increment(ProfileEvents::S3CompleteMultipartUpload); From 2d72e194fbad9f1b16b5bde929fa9d006aa3ddf1 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Tue, 4 Jun 2024 07:59:15 +0000 Subject: [PATCH 054/215] remove bad optimization and re-enable vertical final Signed-off-by: Duc Canh Le --- src/Columns/FilterDescription.h | 7 - src/Core/Settings.h | 2 +- src/Interpreters/InterpreterSelectQuery.cpp | 4 - .../QueryPlan/ReadFromMergeTree.cpp | 8 +- src/Processors/Transforms/FilterTransform.cpp | 154 +----------------- src/Storages/SelectQueryInfo.h | 6 - ...ce => 02893_vertical_final_bugs.reference} | 1 + ...join.sql => 02893_vertical_final_bugs.sql} | 12 ++ 8 files changed, 19 insertions(+), 175 deletions(-) rename 
tests/queries/0_stateless/{02893_vertical_final_array_join.reference => 02893_vertical_final_bugs.reference} (99%) rename tests/queries/0_stateless/{02893_vertical_final_array_join.sql => 02893_vertical_final_bugs.sql} (65%) diff --git a/src/Columns/FilterDescription.h b/src/Columns/FilterDescription.h index 63457b8b544..b4335a49787 100644 --- a/src/Columns/FilterDescription.h +++ b/src/Columns/FilterDescription.h @@ -23,15 +23,10 @@ struct ConstantFilterDescription struct IFilterDescription { - /// has_one can be pre-compute during creating the filter description in some cases - Int64 has_one = -1; virtual ColumnPtr filter(const IColumn & column, ssize_t result_size_hint) const = 0; virtual size_t countBytesInFilter() const = 0; virtual ~IFilterDescription() = default; - bool hasOne() { return has_one >= 0 ? has_one : hasOneImpl();} protected: - /// Calculate if filter has a non-zero from the filter values, may update has_one - virtual bool hasOneImpl() = 0; }; /// Obtain a filter from non constant Column, that may have type: UInt8, Nullable(UInt8). @@ -45,7 +40,6 @@ struct FilterDescription final : public IFilterDescription ColumnPtr filter(const IColumn & column, ssize_t result_size_hint) const override { return column.filter(*data, result_size_hint); } size_t countBytesInFilter() const override { return DB::countBytesInFilter(*data); } protected: - bool hasOneImpl() override { return data ? (has_one = !memoryIsZero(data->data(), 0, data->size())) : false; } }; struct SparseFilterDescription final : public IFilterDescription @@ -56,7 +50,6 @@ struct SparseFilterDescription final : public IFilterDescription ColumnPtr filter(const IColumn & column, ssize_t) const override { return column.index(*filter_indices, 0); } size_t countBytesInFilter() const override { return filter_indices->size(); } protected: - bool hasOneImpl() override { return filter_indices && !filter_indices->empty(); } }; struct ColumnWithTypeAndName; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index b8f5a8b5a75..ec5ee7b8fb4 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -396,7 +396,7 @@ class IColumn; M(Bool, allow_experimental_analyzer, true, "Allow experimental analyzer.", 0) \ M(Bool, analyzer_compatibility_join_using_top_level_identifier, false, "Force to resolve identifier in JOIN USING from projection (for example, in `SELECT a + 1 AS b FROM t1 JOIN t2 USING (b)` join will be performed by `t1.a + 1 = t2.b`, rather then `t1.b = t2.b`).", 0) \ M(Bool, prefer_global_in_and_join, false, "If enabled, all IN/JOIN operators will be rewritten as GLOBAL IN/JOIN. It's useful when the to-be-joined tables are only available on the initiator and we need to always scatter their data on-the-fly during distributed processing with the GLOBAL keyword. It's also useful to reduce the need to access the external sources joining external tables.", 0) \ - M(Bool, enable_vertical_final, false, "Not recommended. If enable, remove duplicated rows during FINAL by marking rows as deleted and filtering them later instead of merging rows", 0) \ + M(Bool, enable_vertical_final, true, "If enable, remove duplicated rows during FINAL by marking rows as deleted and filtering them later instead of merging rows", 0) \ \ \ /** Limits during query execution are part of the settings. 
\ diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index ffe45d55643..4afd3f26b31 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2599,10 +2599,6 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc query_info.storage_limits = std::make_shared(storage_limits); query_info.settings_limit_offset_done = options.settings_limit_offset_done; - /// Possible filters: row-security, additional filter, replica filter (before array join), where (after array join) - query_info.has_filters_and_no_array_join_before_filter = row_policy_filter || additional_filter_info - || parallel_replicas_custom_filter_info - || (analysis_result.hasWhere() && !analysis_result.before_where->hasArrayJoin() && !analysis_result.array_join); storage->read(query_plan, required_columns, storage_snapshot, query_info, context, processing_stage, max_block_size, max_streams); if (context->hasQueryContext() && !options.is_internal) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 3988ba33d90..1a3f97f2dfb 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -1092,8 +1092,7 @@ static void addMergingFinal( MergeTreeData::MergingParams merging_params, Names partition_key_columns, size_t max_block_size_rows, - bool enable_vertical_final, - bool can_merge_final_indices_to_next_step_filter) + bool enable_vertical_final) { const auto & header = pipe.getHeader(); size_t num_outputs = pipe.numOutputPorts(); @@ -1135,7 +1134,7 @@ static void addMergingFinal( }; pipe.addTransform(get_merging_processor()); - if (enable_vertical_final && !can_merge_final_indices_to_next_step_filter) + if (enable_vertical_final) pipe.addSimpleTransform([](const Block & header_) { return std::make_shared(header_); }); } @@ -1323,8 +1322,7 @@ Pipe ReadFromMergeTree::spreadMarkRangesAmongStreamsFinal( data.merging_params, partition_key_columns, block_size.max_block_size_rows, - enable_vertical_final, - query_info.has_filters_and_no_array_join_before_filter); + enable_vertical_final); merging_pipes.emplace_back(Pipe::unitePipes(std::move(pipes))); } diff --git a/src/Processors/Transforms/FilterTransform.cpp b/src/Processors/Transforms/FilterTransform.cpp index e8e7f99ce53..f3b3d8127d4 100644 --- a/src/Processors/Transforms/FilterTransform.cpp +++ b/src/Processors/Transforms/FilterTransform.cpp @@ -37,147 +37,6 @@ static void replaceFilterToConstant(Block & block, const String & filter_column_ } } -static std::shared_ptr getSelectByFinalIndices(Chunk & chunk) -{ - if (auto select_final_indices_info = std::dynamic_pointer_cast(chunk.getChunkInfo())) - { - const auto & index_column = select_final_indices_info->select_final_indices; - chunk.setChunkInfo(nullptr); - if (index_column && index_column->size() != chunk.getNumRows()) - return select_final_indices_info; - } - return nullptr; -} - -static void -executeSelectByIndices(Columns & columns, std::shared_ptr & select_final_indices_info, size_t & num_rows) -{ - if (select_final_indices_info) - { - const auto & index_column = select_final_indices_info->select_final_indices; - - for (auto & column : columns) - column = column->index(*index_column, 0); - - num_rows = index_column->size(); - } -} - -static std::unique_ptr combineFilterAndIndices( - std::unique_ptr description, - std::shared_ptr & select_final_indices_info, - size_t num_rows) -{ - if 
(select_final_indices_info) - { - const auto * index_column = select_final_indices_info->select_final_indices; - - if (description->hasOne()) - { - const auto & selected_by_indices = index_column->getData(); - const auto * selected_by_filter = description->data->data(); - /// We will recompute new has_one - description->has_one = 0; - /// At this point we know that the filter is not constant, just create a new filter - auto mutable_holder = ColumnUInt8::create(num_rows, 0); - auto & data = mutable_holder->getData(); - for (auto idx : selected_by_indices) - { - if (idx >= num_rows) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Index {} out of range {}", idx, num_rows); - data[idx] = 1; - } - - /// AND two filters - auto * begin = data.data(); - const auto * end = begin + num_rows; -#if defined(__AVX2__) - while (end - begin >= 32) - { - _mm256_storeu_si256( - reinterpret_cast<__m256i *>(begin), - _mm256_and_si256( - _mm256_loadu_si256(reinterpret_cast(begin)), - _mm256_loadu_si256(reinterpret_cast(selected_by_filter)))); - description->has_one |= !memoryIsZero(begin, 0, 32); - begin += 32; - selected_by_filter += 32; - } -#elif defined(__SSE2__) - while (end - begin >= 16) - { - _mm_storeu_si128( - reinterpret_cast<__m128i *>(begin), - _mm_and_si128( - _mm_loadu_si128(reinterpret_cast(begin)), - _mm_loadu_si128(reinterpret_cast(selected_by_filter)))); - description->has_one |= !memoryIsZero(begin, 0, 16); - begin += 16; - selected_by_filter += 16; - } -#endif - - while (end - begin >= 8) - { - *reinterpret_cast(begin) &= *reinterpret_cast(selected_by_filter); - description->has_one |= *reinterpret_cast(begin); - begin += 8; - selected_by_filter += 8; - } - - while (end - begin > 0) - { - *begin &= *selected_by_filter; - description->has_one |= *begin; - begin++; - selected_by_filter++; - } - - description->data_holder = std::move(mutable_holder); - description->data = &data; - } - } - return std::move(description); -} - -static std::unique_ptr combineFilterAndIndices( - std::unique_ptr description, - std::shared_ptr & select_final_indices_info, - size_t num_rows) -{ - /// Iterator interface to decorate data from output of std::set_intersection - struct Iterator - { - UInt8 * data; - Int64 & pop_cnt; - explicit Iterator(UInt8 * data_, Int64 & pop_cnt_) : data(data_), pop_cnt(pop_cnt_) {} - Iterator & operator = (UInt64 index) { data[index] = 1; ++pop_cnt; return *this; } - Iterator & operator ++ () { return *this; } - Iterator & operator * () { return *this; } - }; - - if (select_final_indices_info) - { - const auto * index_column = select_final_indices_info->select_final_indices; - - if (description->hasOne()) - { - std::unique_ptr res; - res->has_one = 0; - const auto & selected_by_indices = index_column->getData(); - const auto & selected_by_filter = description->filter_indices->getData(); - auto mutable_holder = ColumnUInt8::create(num_rows, 0); - auto & data = mutable_holder->getData(); - Iterator decorator(data.data(), res->has_one); - std::set_intersection(selected_by_indices.begin(), selected_by_indices.end(), selected_by_filter.begin(), selected_by_filter.end(), decorator); - res->data_holder = std::move(mutable_holder); - res->data = &data; - return res; - } - } - return std::move(description); -} - Block FilterTransform::transformHeader( const Block & header, const ActionsDAG * expression, const String & filter_column_name, bool remove_filter_column) { @@ -267,7 +126,6 @@ void FilterTransform::doTransform(Chunk & chunk) size_t num_rows_before_filtration = chunk.getNumRows(); auto 
columns = chunk.detachColumns(); DataTypes types; - auto select_final_indices_info = getSelectByFinalIndices(chunk); { Block block = getInputPort().getHeader().cloneWithColumns(columns); @@ -282,7 +140,6 @@ void FilterTransform::doTransform(Chunk & chunk) if (constant_filter_description.always_true || on_totals) { - executeSelectByIndices(columns, select_final_indices_info, num_rows_before_filtration); chunk.setColumns(std::move(columns), num_rows_before_filtration); removeFilterIfNeed(chunk); return; @@ -303,7 +160,6 @@ void FilterTransform::doTransform(Chunk & chunk) if (constant_filter_description.always_true) { - executeSelectByIndices(columns, select_final_indices_info, num_rows_before_filtration); chunk.setColumns(std::move(columns), num_rows_before_filtration); removeFilterIfNeed(chunk); return; @@ -311,15 +167,9 @@ void FilterTransform::doTransform(Chunk & chunk) std::unique_ptr filter_description; if (filter_column->isSparse()) - filter_description = combineFilterAndIndices( - std::make_unique(*filter_column), select_final_indices_info, num_rows_before_filtration); + filter_description = std::make_unique(*filter_column); else - filter_description = combineFilterAndIndices( - std::make_unique(*filter_column), select_final_indices_info, num_rows_before_filtration); - - - if (!filter_description->has_one) - return; + filter_description = std::make_unique(*filter_column); /** Let's find out how many rows will be in result. * To do this, we filter out the first non-constant column diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 11e2a2fc5e7..c3db0f510f9 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -215,12 +215,6 @@ struct SelectQueryInfo /// If query has aggregate functions bool has_aggregates = false; - /// If query has any filter and no arrayJoin before filter. Used by skipping FINAL - /// Skipping FINAL algorithm will output the original chunk and a column indices of - /// selected rows. If query has filter and doesn't have array join before any filter, - /// we can merge the indices with the first filter in FilterTransform later. - bool has_filters_and_no_array_join_before_filter = false; - ClusterPtr getCluster() const { return !optimized_cluster ? 
cluster : optimized_cluster; } bool settings_limit_offset_done = false; diff --git a/tests/queries/0_stateless/02893_vertical_final_array_join.reference b/tests/queries/0_stateless/02893_vertical_final_bugs.reference similarity index 99% rename from tests/queries/0_stateless/02893_vertical_final_array_join.reference rename to tests/queries/0_stateless/02893_vertical_final_bugs.reference index 27b54a2e42e..ab23116aa5f 100644 --- a/tests/queries/0_stateless/02893_vertical_final_array_join.reference +++ b/tests/queries/0_stateless/02893_vertical_final_bugs.reference @@ -1,3 +1,4 @@ +1 2 b 1 -- { echo ON } SELECT arrayJoin([(k1, v), (k2, v)]) AS row, row.1 as k FROM t FINAL WHERE k1 != 3 AND k = 1 ORDER BY row SETTINGS enable_vertical_final = 0; (1,4) 1 diff --git a/tests/queries/0_stateless/02893_vertical_final_array_join.sql b/tests/queries/0_stateless/02893_vertical_final_bugs.sql similarity index 65% rename from tests/queries/0_stateless/02893_vertical_final_array_join.sql rename to tests/queries/0_stateless/02893_vertical_final_bugs.sql index cc2e37fdc6e..e82ab674c58 100644 --- a/tests/queries/0_stateless/02893_vertical_final_array_join.sql +++ b/tests/queries/0_stateless/02893_vertical_final_bugs.sql @@ -1,3 +1,15 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/64543 +DROP TABLE IF EXISTS foo; +DROP TABLE IF EXISTS bar; +CREATE TABLE foo (id UInt64, seq UInt64) ENGINE = Memory; +CREATE TABLE bar (id UInt64, seq UInt64, name String) ENGINE = ReplacingMergeTree ORDER BY id; +INSERT INTO foo VALUES (1, 1); +INSERT INTO bar VALUES (1, 1, 'a') (2, 2, 'b'); +INSERT INTO bar VALUES (1, 2, 'b') (2, 3, 'c'); +SELECT * FROM bar INNER JOIN foo USING id WHERE bar.seq > foo.seq SETTINGS final = 1; + +-- Same problem possible can happen with array join +DROP TABLE IF EXISTS t; CREATE TABLE t (k1 UInt64, k2 UInt64, v UInt64) ENGINE = ReplacingMergeTree() ORDER BY (k1, k2); SET optimize_on_insert = 0; INSERT INTO t VALUES (1, 2, 3) (1, 2, 4) (2, 3, 4), (2, 3, 5); From 00c352a31c259802866d0992e917d1b64c947583 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 4 Jun 2024 12:28:07 +0200 Subject: [PATCH 055/215] Fix clang-tidy --- src/IO/S3/copyS3File.cpp | 4 ++-- src/IO/WriteBufferFromS3.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index c48d7965ac2..bb654c3f5c9 100644 --- a/src/IO/S3/copyS3File.cpp +++ b/src/IO/S3/copyS3File.cpp @@ -524,7 +524,7 @@ namespace void processPutRequest(S3::PutObjectRequest & request) { - size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); + size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t retries = 1;; ++retries) { ProfileEvents::increment(ProfileEvents::S3PutObject); @@ -726,7 +726,7 @@ namespace void processCopyRequest(S3::CopyObjectRequest & request) { - size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); + size_t max_retries = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t retries = 1;; ++retries) { ProfileEvents::increment(ProfileEvents::S3CopyObject); diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index ac63281d328..cd9949862ca 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -693,7 +693,7 @@ void WriteBufferFromS3::makeSinglepartUpload(WriteBufferFromS3::PartData && data auto & request = std::get<0>(*worker_data); size_t content_length = 
request.GetContentLength(); - size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); + size_t max_retry = std::max(request_settings.max_unexpected_write_error_retries.value, 1UL); for (size_t i = 0; i < max_retry; ++i) { ProfileEvents::increment(ProfileEvents::S3PutObject); From c769b3bda056b50a1822ed57ab0299f46ffad58f Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 4 Jun 2024 15:04:14 -0300 Subject: [PATCH 056/215] fix conflicts and adapt --- src/Common/proxyConfigurationToPocoProxyConfig.cpp | 3 +-- tests/integration/helpers/s3_url_proxy_tests_util.py | 9 +++++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/Common/proxyConfigurationToPocoProxyConfig.cpp b/src/Common/proxyConfigurationToPocoProxyConfig.cpp index c221dd394ca..fbd9a229da3 100644 --- a/src/Common/proxyConfigurationToPocoProxyConfig.cpp +++ b/src/Common/proxyConfigurationToPocoProxyConfig.cpp @@ -1,8 +1,7 @@ #include -#include -#include +#include #include #pragma clang diagnostic push diff --git a/tests/integration/helpers/s3_url_proxy_tests_util.py b/tests/integration/helpers/s3_url_proxy_tests_util.py index 7a57c006689..02603bbf879 100644 --- a/tests/integration/helpers/s3_url_proxy_tests_util.py +++ b/tests/integration/helpers/s3_url_proxy_tests_util.py @@ -124,3 +124,12 @@ def simple_storage_test(cluster, node, proxies, policy): # not checking for POST because it is in a different format check_proxy_logs(cluster, proxies, "http", policy, ["PUT", "GET"]) + +def simple_test_assert_no_proxy(cluster, proxies, protocol, bucket): + minio_endpoint = build_s3_endpoint(protocol, bucket) + node = cluster.instances[bucket] + perform_simple_queries(node, minio_endpoint) + + # No HTTP method should be found in proxy logs if no proxy is active + empty_method_list = [] + check_proxy_logs(cluster, proxies, protocol, bucket, empty_method_list) From 53b9ea483fc7b9f888a4478db24a4f108da838b3 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 4 Jun 2024 15:23:52 -0300 Subject: [PATCH 057/215] black fix --- tests/integration/helpers/s3_url_proxy_tests_util.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/helpers/s3_url_proxy_tests_util.py b/tests/integration/helpers/s3_url_proxy_tests_util.py index 02603bbf879..9a45855acb8 100644 --- a/tests/integration/helpers/s3_url_proxy_tests_util.py +++ b/tests/integration/helpers/s3_url_proxy_tests_util.py @@ -125,6 +125,7 @@ def simple_storage_test(cluster, node, proxies, policy): # not checking for POST because it is in a different format check_proxy_logs(cluster, proxies, "http", policy, ["PUT", "GET"]) + def simple_test_assert_no_proxy(cluster, proxies, protocol, bucket): minio_endpoint = build_s3_endpoint(protocol, bucket) node = cluster.instances[bucket] From 236a565e91fcd9b7a2224f145799852a71bae593 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 4 Jun 2024 15:44:22 -0300 Subject: [PATCH 058/215] add missing no_proxy_hosts assignment --- src/Common/RemoteProxyConfigurationResolver.cpp | 1 + .../integration/test_s3_table_function_with_http_proxy/test.py | 2 -- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Common/RemoteProxyConfigurationResolver.cpp b/src/Common/RemoteProxyConfigurationResolver.cpp index b95eb958049..33dc0e957db 100644 --- a/src/Common/RemoteProxyConfigurationResolver.cpp +++ b/src/Common/RemoteProxyConfigurationResolver.cpp @@ -95,6 +95,7 @@ ProxyConfiguration RemoteProxyConfigurationResolver::resolve() cached_config.port = proxy_port; cached_config.tunneling = 
use_tunneling_for_https_requests_over_http_proxy; cached_config.original_request_protocol = request_protocol; + cached_config.no_proxy_hosts = no_proxy_hosts; cache_timestamp = std::chrono::system_clock::now(); cache_valid = true; diff --git a/tests/integration/test_s3_table_function_with_http_proxy/test.py b/tests/integration/test_s3_table_function_with_http_proxy/test.py index 49cf78ebf2b..2ec73ecbef6 100644 --- a/tests/integration/test_s3_table_function_with_http_proxy/test.py +++ b/tests/integration/test_s3_table_function_with_http_proxy/test.py @@ -24,7 +24,6 @@ def cluster(): main_configs=[ "configs/config.d/proxy_remote_no_proxy.xml", ], - instance_env_variables=True, with_minio=True, ) @@ -41,7 +40,6 @@ def cluster(): main_configs=[ "configs/config.d/proxy_list_no_proxy.xml", ], - instance_env_variables=True, with_minio=True, ) From 2fc57af08a51ac6c6127fd162ef059efefacaa14 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 4 Jun 2024 17:32:53 -0300 Subject: [PATCH 059/215] fix ut --- .../tests/gtest_proxy_remote_configuration_resolver.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Common/tests/gtest_proxy_remote_configuration_resolver.cpp b/src/Common/tests/gtest_proxy_remote_configuration_resolver.cpp index 7068e0f2061..5489a931f24 100644 --- a/src/Common/tests/gtest_proxy_remote_configuration_resolver.cpp +++ b/src/Common/tests/gtest_proxy_remote_configuration_resolver.cpp @@ -42,6 +42,7 @@ TEST(RemoteProxyConfigurationResolver, HTTPOverHTTP) RemoteProxyConfigurationResolver resolver( remote_server_configuration, ProxyConfiguration::Protocol::HTTP, + "", std::make_shared(proxy_server_mock) ); @@ -68,6 +69,7 @@ TEST(RemoteProxyConfigurationResolver, HTTPSOverHTTPS) RemoteProxyConfigurationResolver resolver( remote_server_configuration, ProxyConfiguration::Protocol::HTTPS, + "", std::make_shared(proxy_server_mock) ); @@ -95,6 +97,7 @@ TEST(RemoteProxyConfigurationResolver, HTTPSOverHTTP) RemoteProxyConfigurationResolver resolver( remote_server_configuration, ProxyConfiguration::Protocol::HTTPS, + "", std::make_shared(proxy_server_mock) ); @@ -122,6 +125,7 @@ TEST(RemoteProxyConfigurationResolver, HTTPSOverHTTPNoTunneling) RemoteProxyConfigurationResolver resolver( remote_server_configuration, ProxyConfiguration::Protocol::HTTPS, + "", std::make_shared(proxy_server_mock), true /* disable_tunneling_for_https_requests_over_http_proxy_ */ ); @@ -153,6 +157,7 @@ TEST(RemoteProxyConfigurationResolver, SimpleCacheTest) RemoteProxyConfigurationResolver resolver( remote_server_configuration, ProxyConfiguration::Protocol::HTTP, + "", fetcher_mock ); From 3ae8176a6d094be53ebd82d2fc7b5e1c75ceccd2 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 4 Jun 2024 20:12:28 -0300 Subject: [PATCH 060/215] read from env only once --- .../EnvironmentProxyConfigurationResolver.cpp | 22 ++++++++++++++----- .../EnvironmentProxyConfigurationResolver.h | 3 +++ 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/Common/EnvironmentProxyConfigurationResolver.cpp b/src/Common/EnvironmentProxyConfigurationResolver.cpp index 387674feaae..1b7431ee827 100644 --- a/src/Common/EnvironmentProxyConfigurationResolver.cpp +++ b/src/Common/EnvironmentProxyConfigurationResolver.cpp @@ -49,10 +49,8 @@ namespace } } -ProxyConfiguration EnvironmentProxyConfigurationResolver::resolve() +ProxyConfiguration EnvironmentProxyConfigurationResolver::buildProxyConfiguration(Protocol protocol, const char * proxy_host, const std::string & no_proxy_hosts_string) { - const auto * proxy_host = 
getProxyHost(request_protocol); - if (!proxy_host) { return {}; @@ -69,10 +67,22 @@ ProxyConfiguration EnvironmentProxyConfigurationResolver::resolve() host, ProxyConfiguration::protocolFromString(scheme), port, - useTunneling(request_protocol, ProxyConfiguration::protocolFromString(scheme), disable_tunneling_for_https_requests_over_http_proxy), - request_protocol, - getNoProxyHostsString() + useTunneling(protocol, ProxyConfiguration::protocolFromString(scheme), disable_tunneling_for_https_requests_over_http_proxy), + protocol, + no_proxy_hosts_string }; } +ProxyConfiguration EnvironmentProxyConfigurationResolver::resolve() +{ + static const auto * http_proxy_host = getProxyHost(Protocol::HTTP); + static const auto * https_proxy_host = getProxyHost(Protocol::HTTPS); + static const auto no_proxy_hosts_string = getNoProxyHostsString(); + + static const auto http_proxy_configuration = buildProxyConfiguration(Protocol::HTTP, http_proxy_host, no_proxy_hosts_string); + static const auto https_proxy_configuration = buildProxyConfiguration(Protocol::HTTPS, https_proxy_host, no_proxy_hosts_string); + + return request_protocol == Protocol::HTTP ? http_proxy_configuration : https_proxy_configuration; +} + } diff --git a/src/Common/EnvironmentProxyConfigurationResolver.h b/src/Common/EnvironmentProxyConfigurationResolver.h index 6bc9d8a368c..08f91790162 100644 --- a/src/Common/EnvironmentProxyConfigurationResolver.h +++ b/src/Common/EnvironmentProxyConfigurationResolver.h @@ -15,6 +15,9 @@ public: ProxyConfiguration resolve() override; void errorReport(const ProxyConfiguration &) override {} + +private: + ProxyConfiguration buildProxyConfiguration(Protocol protocol, const char * proxy_host, const std::string & no_proxy_hosts); }; } From a0acea74c08b293e093bcd894fa123f437c96159 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 5 Jun 2024 10:36:48 +0200 Subject: [PATCH 061/215] Cleanup --- src/Common/GWPAsan.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/Common/GWPAsan.cpp b/src/Common/GWPAsan.cpp index 4bda2f7e913..2bdf418e152 100644 --- a/src/Common/GWPAsan.cpp +++ b/src/Common/GWPAsan.cpp @@ -54,13 +54,8 @@ static bool guarded_alloc_initialized = [] auto & opts = gwp_asan::options::getOptions(); opts.Backtrace = getBackTrace; - opts.MaxSimultaneousAllocations = 256; /// for testing GuardedAlloc.init(opts); - ///std::cerr << "GwpAsan is initialized, the options are { Enabled: " << opts.Enabled - /// << ", MaxSimultaneousAllocations: " << opts.MaxSimultaneousAllocations - /// << ", SampleRate: " << opts.SampleRate << " }\n"; - return true; }(); From a51577ce8a3ae839e38243247c37ab6a7b72e7e4 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 5 Jun 2024 13:42:39 +0200 Subject: [PATCH 062/215] Disable by default --- src/Core/ServerSettings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index 55d8a8f0ec7..fff7b7f7e21 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -146,7 +146,7 @@ namespace DB M(UInt64, global_profiler_real_time_period_ns, 0, "Period for real clock timer of global profiler (in nanoseconds). Set 0 value to turn off the real clock global profiler. Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(UInt64, global_profiler_cpu_time_period_ns, 0, "Period for CPU clock timer of global profiler (in nanoseconds). Set 0 value to turn off the CPU clock global profiler. 
Recommended value is at least 10000000 (100 times a second) for single queries or 1000000000 (once a second) for cluster-wide profiling.", 0) \ M(Bool, enable_azure_sdk_logging, false, "Enables logging from Azure sdk", 0) \ - M(Double, gwp_asan_force_sample_probability, 0.1, "Probability that an allocation from specific places will be sampled by GWP Asan (i.e. PODArray allocations)", 0) \ + M(Double, gwp_asan_force_sample_probability, 0, "Probability that an allocation from specific places will be sampled by GWP Asan (i.e. PODArray allocations)", 0) \ /// If you add a setting which can be updated at runtime, please update 'changeable_settings' map in StorageSystemServerSettings.cpp From 70b5457d54fdbbf5a4a8c3bb169d63060548f222 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 5 Jun 2024 09:52:42 -0300 Subject: [PATCH 063/215] build poco no proxy only once --- src/Common/EnvironmentProxyConfigurationResolver.cpp | 3 ++- src/Common/proxyConfigurationToPocoProxyConfig.cpp | 6 +++--- src/Common/proxyConfigurationToPocoProxyConfig.h | 2 ++ 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/Common/EnvironmentProxyConfigurationResolver.cpp b/src/Common/EnvironmentProxyConfigurationResolver.cpp index 1b7431ee827..32ad321fdee 100644 --- a/src/Common/EnvironmentProxyConfigurationResolver.cpp +++ b/src/Common/EnvironmentProxyConfigurationResolver.cpp @@ -1,6 +1,7 @@ #include "EnvironmentProxyConfigurationResolver.h" #include +#include #include namespace DB @@ -77,7 +78,7 @@ ProxyConfiguration EnvironmentProxyConfigurationResolver::resolve() { static const auto * http_proxy_host = getProxyHost(Protocol::HTTP); static const auto * https_proxy_host = getProxyHost(Protocol::HTTPS); - static const auto no_proxy_hosts_string = getNoProxyHostsString(); + static const auto no_proxy_hosts_string = buildPocoNonProxyHosts(getNoProxyHostsString()); static const auto http_proxy_configuration = buildProxyConfiguration(Protocol::HTTP, http_proxy_host, no_proxy_hosts_string); static const auto https_proxy_configuration = buildProxyConfiguration(Protocol::HTTPS, https_proxy_host, no_proxy_hosts_string); diff --git a/src/Common/proxyConfigurationToPocoProxyConfig.cpp b/src/Common/proxyConfigurationToPocoProxyConfig.cpp index fbd9a229da3..c1a439d2f36 100644 --- a/src/Common/proxyConfigurationToPocoProxyConfig.cpp +++ b/src/Common/proxyConfigurationToPocoProxyConfig.cpp @@ -36,6 +36,8 @@ std::string buildPocoRegexpEntryWithoutLeadingDot(const std::string & host) return RE2::QuoteMeta(view_without_leading_dot); } +} + /* * Even though there is not an RFC that defines NO_PROXY, it is usually a comma-separated list of domains. * Different tools implement their own versions of `NO_PROXY` support. Some support CIDR blocks, some support wildcard etc. 
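For illustration only, here is a minimal self-contained sketch of the translation this comment describes: turning a comma-separated no_proxy list into a single Poco-style non-proxy-hosts regex. It mirrors the behaviour pinned down by the gtest_poco_no_proxy_regex tests added later in this series (each entry is quoted and allowed an optional "(?:.*\.)?" sub-domain prefix, and a lone "*" maps to the match-anything pattern "(.*?)"); the helper name and the omitted whitespace trimming are simplified stand-ins, not the actual implementation.

    #include <re2/re2.h>

    #include <sstream>
    #include <string>
    #include <string_view>

    // Simplified stand-in for buildPocoNonProxyHosts: split NO_PROXY on commas,
    // escape each entry with RE2::QuoteMeta, allow sub-domains via an optional
    // "(?:.*\.)?" prefix and join everything with "|". A single "*" means
    // "bypass the proxy for every host" and maps to the match-anything regex.
    std::string sketchNonProxyHostsRegex(const std::string & no_proxy)
    {
        static constexpr auto match_anything = R"((.*?))";
        static constexpr auto optional_subdomain_prefix = R"((?:.*\.)?)";

        std::string result;
        std::istringstream stream(no_proxy);
        std::string entry;
        while (std::getline(stream, entry, ','))
        {
            if (entry.empty())
                continue;                  // whitespace trimming omitted for brevity
            if (entry == "*")
                return match_anything;     // wildcard bypasses the proxy for all hosts

            std::string_view host = entry;
            if (host.front() == '.')
                host.remove_prefix(1);     // ".domain.com" and "domain.com" behave the same

            if (!result.empty())
                result += '|';
            result += optional_subdomain_prefix;
            result += RE2::QuoteMeta(std::string(host));
        }
        return result;
    }

With this shape, "localhost,127.0.0.1" would yield "(?:.*\.)?localhost|(?:.*\.)?127\.0\.0\.1", matching the expectations of the unit tests introduced further down in the series.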
@@ -93,8 +95,6 @@ std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts_string) return result; } -} - Poco::Net::HTTPClientSession::ProxyConfig proxyConfigurationToPocoProxyConfig(const DB::ProxyConfiguration & proxy_configuration) { Poco::Net::HTTPClientSession::ProxyConfig poco_proxy_config; @@ -104,7 +104,7 @@ Poco::Net::HTTPClientSession::ProxyConfig proxyConfigurationToPocoProxyConfig(co poco_proxy_config.protocol = DB::ProxyConfiguration::protocolToString(proxy_configuration.protocol); poco_proxy_config.tunnel = proxy_configuration.tunneling; poco_proxy_config.originalRequestProtocol = DB::ProxyConfiguration::protocolToString(proxy_configuration.original_request_protocol); - poco_proxy_config.nonProxyHosts = buildPocoNonProxyHosts(proxy_configuration.no_proxy_hosts); + poco_proxy_config.nonProxyHosts = proxy_configuration.no_proxy_hosts; return poco_proxy_config; } diff --git a/src/Common/proxyConfigurationToPocoProxyConfig.h b/src/Common/proxyConfigurationToPocoProxyConfig.h index d093b0f3521..c118bd059f9 100644 --- a/src/Common/proxyConfigurationToPocoProxyConfig.h +++ b/src/Common/proxyConfigurationToPocoProxyConfig.h @@ -8,4 +8,6 @@ namespace DB Poco::Net::HTTPClientSession::ProxyConfig proxyConfigurationToPocoProxyConfig(const DB::ProxyConfiguration & proxy_configuration); +std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts_string); + } From 877717840bd889d0952531b2159321737d1d6200 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 5 Jun 2024 10:18:04 -0300 Subject: [PATCH 064/215] fix and simplify proxy provider ut --- ..._proxy_configuration_resolver_provider.cpp | 158 ++++++------------ 1 file changed, 48 insertions(+), 110 deletions(-) diff --git a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp index d248835699a..368ce12cd7b 100644 --- a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp +++ b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp @@ -1,6 +1,9 @@ #include #include +#include +#include +#include #include #include @@ -38,18 +41,11 @@ TEST_F(ProxyConfigurationResolverProviderTests, EnvironmentResolverShouldBeUsedI EnvironmentProxySetter setter(http_env_proxy_server, https_env_proxy_server, no_proxy_hosts); const auto & config = getContext().context->getConfigRef(); - auto http_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, config)->resolve(); - auto https_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, config)->resolve(); + auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, config); + auto https_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, config); - ASSERT_EQ(http_configuration.host, http_env_proxy_server.getHost()); - ASSERT_EQ(http_configuration.port, http_env_proxy_server.getPort()); - ASSERT_EQ(http_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_env_proxy_server.getScheme())); - ASSERT_EQ(http_configuration.no_proxy_hosts, no_proxy_hosts); - - ASSERT_EQ(https_configuration.host, https_env_proxy_server.getHost()); - ASSERT_EQ(https_configuration.port, https_env_proxy_server.getPort()); - ASSERT_EQ(https_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_env_proxy_server.getScheme())); - ASSERT_EQ(https_configuration.no_proxy_hosts, no_proxy_hosts); + 
ASSERT_TRUE(std::dynamic_pointer_cast(http_resolver)); + ASSERT_TRUE(std::dynamic_pointer_cast(https_resolver)); } TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPOnly) @@ -62,19 +58,11 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPOnly) config->setString("proxy.http.uri", http_list_proxy_server.toString()); context->setConfig(config); - auto http_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config)->resolve(); + auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config); + auto https_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config); - ASSERT_EQ(http_proxy_configuration.host, http_list_proxy_server.getHost()); - ASSERT_EQ(http_proxy_configuration.port, http_list_proxy_server.getPort()); - ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_list_proxy_server.getScheme())); - ASSERT_EQ(http_proxy_configuration.no_proxy_hosts, no_proxy_hosts); - - auto https_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config)->resolve(); - - // No https configuration since it's not set - ASSERT_EQ(https_proxy_configuration.host, ""); - ASSERT_EQ(https_proxy_configuration.port, 0); - ASSERT_EQ(https_proxy_configuration.no_proxy_hosts, ""); + ASSERT_TRUE(std::dynamic_pointer_cast(http_resolver)); + ASSERT_TRUE(std::dynamic_pointer_cast(https_resolver)); } TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPSOnly) @@ -87,19 +75,11 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPSOnly) config->setString("proxy.https.uri", https_list_proxy_server.toString()); context->setConfig(config); - auto http_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config)->resolve(); + auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config); + auto https_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config); - ASSERT_EQ(http_proxy_configuration.host, ""); - ASSERT_EQ(http_proxy_configuration.port, 0); - - auto https_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config)->resolve(); - - ASSERT_EQ(https_proxy_configuration.host, https_list_proxy_server.getHost()); - - // still HTTP because the proxy host is not HTTPS - ASSERT_EQ(https_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_list_proxy_server.getScheme())); - ASSERT_EQ(https_proxy_configuration.port, https_list_proxy_server.getPort()); - ASSERT_EQ(https_proxy_configuration.no_proxy_hosts, no_proxy_hosts); + ASSERT_TRUE(std::dynamic_pointer_cast(http_resolver)); + ASSERT_TRUE(std::dynamic_pointer_cast(https_resolver)); } TEST_F(ProxyConfigurationResolverProviderTests, ListBoth) @@ -117,35 +97,38 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListBoth) context->setConfig(config); - auto http_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config)->resolve(); + auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config); + auto https_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config); - ASSERT_EQ(http_proxy_configuration.host, 
http_list_proxy_server.getHost()); - ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_list_proxy_server.getScheme())); - ASSERT_EQ(http_proxy_configuration.port, http_list_proxy_server.getPort()); - ASSERT_EQ(http_proxy_configuration.no_proxy_hosts, no_proxy_hosts); - - auto https_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config)->resolve(); - - ASSERT_EQ(https_proxy_configuration.host, https_list_proxy_server.getHost()); - - // still HTTP because the proxy host is not HTTPS - ASSERT_EQ(https_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_list_proxy_server.getScheme())); - ASSERT_EQ(https_proxy_configuration.port, https_list_proxy_server.getPort()); - ASSERT_EQ(https_proxy_configuration.no_proxy_hosts, no_proxy_hosts); + ASSERT_TRUE(std::dynamic_pointer_cast(http_resolver)); + ASSERT_TRUE(std::dynamic_pointer_cast(https_resolver)); } -TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolConfigurationHTTP) +TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolConfigurationHTTPS) { - /* - * Since there is no way to call `ProxyConfigurationResolver::resolve` on remote resolver, - * it is hard to verify the remote resolver was actually picked. One hackish way to assert - * the remote resolver was OR was not picked based on the configuration, is to use the - * environment resolver. Since the environment resolver is always returned as a fallback, - * we can assert the remote resolver was not picked if `ProxyConfigurationResolver::resolve` - * succeeds and returns an environment proxy configuration. - * */ - EnvironmentProxySetter setter(http_env_proxy_server, https_env_proxy_server); + ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); + config->setString("proxy", ""); + config->setString("proxy.http", ""); + config->setString("proxy.http.resolver", ""); + config->setString("proxy.http.resolver.endpoint", "http://resolver:8080/hostname"); + + // even tho proxy protocol / scheme is https, it should not be picked (prior to this PR, it would be picked) + config->setString("proxy.http.resolver.proxy_scheme", "https"); + config->setString("proxy.http.resolver.proxy_port", "80"); + config->setString("proxy.http.resolver.proxy_cache_time", "10"); + + context->setConfig(config); + + auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config); + auto https_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config); + + ASSERT_TRUE(std::dynamic_pointer_cast(http_resolver)); + ASSERT_TRUE(std::dynamic_pointer_cast(https_resolver)); +} + +TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverHTTPSOnly) +{ ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); config->setString("proxy", ""); @@ -161,66 +144,21 @@ TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolC context->setConfig(config); - auto http_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config)->resolve(); + auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, *config); + auto https_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config); - /* - * Asserts env proxy is used and not the remote resolver. 
If the remote resolver is picked, it is an error because - * there is no `http` specification for remote resolver - * */ - ASSERT_EQ(http_proxy_configuration.host, http_env_proxy_server.getHost()); - ASSERT_EQ(http_proxy_configuration.port, http_env_proxy_server.getPort()); - ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(http_env_proxy_server.getScheme())); - ASSERT_EQ(http_proxy_configuration.no_proxy_hosts, ""); + ASSERT_TRUE(std::dynamic_pointer_cast(http_resolver)); + ASSERT_TRUE(std::dynamic_pointer_cast(https_resolver)); } -TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverIsBasedOnProtocolConfigurationHTTPS) -{ - /* - * Since there is no way to call `ProxyConfigurationResolver::resolve` on remote resolver, - * it is hard to verify the remote resolver was actually picked. One hackish way to assert - * the remote resolver was OR was not picked based on the configuration, is to use the - * environment resolver. Since the environment resolver is always returned as a fallback, - * we can assert the remote resolver was not picked if `ProxyConfigurationResolver::resolve` - * succeeds and returns an environment proxy configuration. - * */ - EnvironmentProxySetter setter(http_env_proxy_server, https_env_proxy_server); - - ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); - - config->setString("proxy", ""); - config->setString("proxy.http", ""); - config->setString("proxy.http.resolver", ""); - config->setString("proxy.http.resolver.endpoint", "http://resolver:8080/hostname"); - - // even tho proxy protocol / scheme is https, it should not be picked (prior to this PR, it would be picked) - config->setString("proxy.http.resolver.proxy_scheme", "https"); - config->setString("proxy.http.resolver.proxy_port", "80"); - config->setString("proxy.http.resolver.proxy_cache_time", "10"); - - context->setConfig(config); - - auto http_proxy_configuration = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTPS, *config)->resolve(); - - /* - * Asserts env proxy is used and not the remote resolver. 
If the remote resolver is picked, it is an error because - * there is no `http` specification for remote resolver - * */ - ASSERT_EQ(http_proxy_configuration.host, https_env_proxy_server.getHost()); - ASSERT_EQ(http_proxy_configuration.port, https_env_proxy_server.getPort()); - ASSERT_EQ(http_proxy_configuration.protocol, DB::ProxyConfiguration::protocolFromString(https_env_proxy_server.getScheme())); - ASSERT_EQ(http_proxy_configuration.no_proxy_hosts, ""); -} - -// remote resolver is tricky to be tested in unit tests - template void test_tunneling(DB::ContextMutablePtr context) { - EnvironmentProxySetter setter(http_env_proxy_server, https_env_proxy_server); - ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); config->setString("proxy", ""); + config->setString("proxy.https", ""); + config->setString("proxy.https.uri", http_list_proxy_server.toString()); if constexpr (STRING) { From 90332b3c8916cbdc22cb1291a943598193f66e24 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 5 Jun 2024 13:46:45 -0300 Subject: [PATCH 065/215] fix tunneling --- .../EnvironmentProxyConfigurationResolver.cpp | 56 ++++++++------ .../EnvironmentProxyConfigurationResolver.h | 3 - src/Common/ProxyConfiguration.h | 6 ++ src/Common/ProxyConfigurationResolver.h | 7 -- src/Common/ProxyListConfigurationResolver.cpp | 7 +- .../RemoteProxyConfigurationResolver.cpp | 2 +- .../gtest_proxy_environment_configuration.cpp | 75 ++++--------------- 7 files changed, 61 insertions(+), 95 deletions(-) diff --git a/src/Common/EnvironmentProxyConfigurationResolver.cpp b/src/Common/EnvironmentProxyConfigurationResolver.cpp index 32ad321fdee..bf062b8d9ba 100644 --- a/src/Common/EnvironmentProxyConfigurationResolver.cpp +++ b/src/Common/EnvironmentProxyConfigurationResolver.cpp @@ -48,30 +48,33 @@ namespace return no_proxy; } -} -ProxyConfiguration EnvironmentProxyConfigurationResolver::buildProxyConfiguration(Protocol protocol, const char * proxy_host, const std::string & no_proxy_hosts_string) -{ - if (!proxy_host) + ProxyConfiguration buildProxyConfiguration( + ProxyConfiguration::Protocol request_protocol, + const Poco::URI & uri, + const std::string & no_proxy_hosts_string, + bool disable_tunneling_for_https_requests_over_http_proxy) { - return {}; + const auto & host = uri.getHost(); + const auto & scheme = uri.getScheme(); + const auto port = uri.getPort(); + + const bool use_tunneling_for_https_requests_over_http_proxy = ProxyConfiguration::useTunneling( + request_protocol, + ProxyConfiguration::protocolFromString(scheme), + disable_tunneling_for_https_requests_over_http_proxy); + + LOG_TRACE(getLogger("EnvironmentProxyConfigurationResolver"), "Use proxy from environment: {}://{}:{}", scheme, host, port); + + return ProxyConfiguration { + host, + ProxyConfiguration::protocolFromString(scheme), + port, + use_tunneling_for_https_requests_over_http_proxy, + request_protocol, + no_proxy_hosts_string + }; } - - auto uri = Poco::URI(proxy_host); - auto host = uri.getHost(); - auto scheme = uri.getScheme(); - auto port = uri.getPort(); - - LOG_TRACE(getLogger("EnvironmentProxyConfigurationResolver"), "Use proxy from environment: {}://{}:{}", scheme, host, port); - - return ProxyConfiguration { - host, - ProxyConfiguration::protocolFromString(scheme), - port, - useTunneling(protocol, ProxyConfiguration::protocolFromString(scheme), disable_tunneling_for_https_requests_over_http_proxy), - protocol, - no_proxy_hosts_string - }; } ProxyConfiguration EnvironmentProxyConfigurationResolver::resolve() @@ -80,10 
+83,15 @@ ProxyConfiguration EnvironmentProxyConfigurationResolver::resolve() static const auto * https_proxy_host = getProxyHost(Protocol::HTTPS); static const auto no_proxy_hosts_string = buildPocoNonProxyHosts(getNoProxyHostsString()); - static const auto http_proxy_configuration = buildProxyConfiguration(Protocol::HTTP, http_proxy_host, no_proxy_hosts_string); - static const auto https_proxy_configuration = buildProxyConfiguration(Protocol::HTTPS, https_proxy_host, no_proxy_hosts_string); + static const Poco::URI http_proxy_uri(http_proxy_host ? http_proxy_host : ""); + static const Poco::URI https_proxy_uri(https_proxy_host ? https_proxy_host : ""); + + return buildProxyConfiguration( + request_protocol, + request_protocol == Protocol::HTTP ? http_proxy_uri : https_proxy_uri, + no_proxy_hosts_string, + disable_tunneling_for_https_requests_over_http_proxy); - return request_protocol == Protocol::HTTP ? http_proxy_configuration : https_proxy_configuration; } } diff --git a/src/Common/EnvironmentProxyConfigurationResolver.h b/src/Common/EnvironmentProxyConfigurationResolver.h index 08f91790162..6bc9d8a368c 100644 --- a/src/Common/EnvironmentProxyConfigurationResolver.h +++ b/src/Common/EnvironmentProxyConfigurationResolver.h @@ -15,9 +15,6 @@ public: ProxyConfiguration resolve() override; void errorReport(const ProxyConfiguration &) override {} - -private: - ProxyConfiguration buildProxyConfiguration(Protocol protocol, const char * proxy_host, const std::string & no_proxy_hosts); }; } diff --git a/src/Common/ProxyConfiguration.h b/src/Common/ProxyConfiguration.h index d2d195b2d2e..3398afce37c 100644 --- a/src/Common/ProxyConfiguration.h +++ b/src/Common/ProxyConfiguration.h @@ -44,6 +44,12 @@ struct ProxyConfiguration } } + static bool useTunneling(Protocol request_protocol, Protocol proxy_protocol, bool disable_tunneling_for_https_requests_over_http_proxy) + { + bool is_https_request_over_http_proxy = request_protocol == Protocol::HTTPS && proxy_protocol == Protocol::HTTP; + return is_https_request_over_http_proxy && !disable_tunneling_for_https_requests_over_http_proxy; + } + std::string host = std::string{}; Protocol protocol = Protocol::HTTP; uint16_t port = 0; diff --git a/src/Common/ProxyConfigurationResolver.h b/src/Common/ProxyConfigurationResolver.h index b82936502bb..1e9f4ad77f7 100644 --- a/src/Common/ProxyConfigurationResolver.h +++ b/src/Common/ProxyConfigurationResolver.h @@ -19,13 +19,6 @@ struct ProxyConfigurationResolver virtual void errorReport(const ProxyConfiguration & config) = 0; protected: - - static bool useTunneling(Protocol request_protocol, Protocol proxy_protocol, bool disable_tunneling_for_https_requests_over_http_proxy) - { - bool is_https_request_over_http_proxy = request_protocol == Protocol::HTTPS && proxy_protocol == Protocol::HTTP; - return is_https_request_over_http_proxy && !disable_tunneling_for_https_requests_over_http_proxy; - } - Protocol request_protocol; bool disable_tunneling_for_https_requests_over_http_proxy = false; }; diff --git a/src/Common/ProxyListConfigurationResolver.cpp b/src/Common/ProxyListConfigurationResolver.cpp index e0452e98544..a23c3a61951 100644 --- a/src/Common/ProxyListConfigurationResolver.cpp +++ b/src/Common/ProxyListConfigurationResolver.cpp @@ -29,11 +29,16 @@ ProxyConfiguration ProxyListConfigurationResolver::resolve() auto & proxy = proxies[index]; + bool use_tunneling_for_https_requests_over_http_proxy = ProxyConfiguration::useTunneling( + request_protocol, + 
ProxyConfiguration::protocolFromString(proxy.getScheme()), + disable_tunneling_for_https_requests_over_http_proxy); + return ProxyConfiguration { proxy.getHost(), ProxyConfiguration::protocolFromString(proxy.getScheme()), proxy.getPort(), - useTunneling(request_protocol, ProxyConfiguration::protocolFromString(proxy.getScheme()), disable_tunneling_for_https_requests_over_http_proxy), + use_tunneling_for_https_requests_over_http_proxy, request_protocol, no_proxy_hosts }; diff --git a/src/Common/RemoteProxyConfigurationResolver.cpp b/src/Common/RemoteProxyConfigurationResolver.cpp index 33dc0e957db..8fd9d381ece 100644 --- a/src/Common/RemoteProxyConfigurationResolver.cpp +++ b/src/Common/RemoteProxyConfigurationResolver.cpp @@ -85,7 +85,7 @@ ProxyConfiguration RemoteProxyConfigurationResolver::resolve() auto proxy_protocol = ProxyConfiguration::protocolFromString(proxy_protocol_string); - bool use_tunneling_for_https_requests_over_http_proxy = useTunneling( + bool use_tunneling_for_https_requests_over_http_proxy = ProxyConfiguration::useTunneling( request_protocol, proxy_protocol, disable_tunneling_for_https_requests_over_http_proxy); diff --git a/src/Common/tests/gtest_proxy_environment_configuration.cpp b/src/Common/tests/gtest_proxy_environment_configuration.cpp index 81388fd877f..095e44dfb86 100644 --- a/src/Common/tests/gtest_proxy_environment_configuration.cpp +++ b/src/Common/tests/gtest_proxy_environment_configuration.cpp @@ -2,6 +2,7 @@ #include #include +#include #include namespace DB @@ -13,73 +14,29 @@ namespace auto https_proxy_server = Poco::URI("https://proxy_server:3128"); } -TEST(EnvironmentProxyConfigurationResolver, TestHTTP) +TEST(EnvironmentProxyConfigurationResolver, TestHTTPandHTTPS) { std::string no_proxy_string = "localhost,,127.0.0.1,some_other_domain,,,, last_domain,"; - EnvironmentProxySetter setter(http_proxy_server, {}, no_proxy_string); + std::string poco_no_proxy_regex = buildPocoNonProxyHosts(no_proxy_string); + EnvironmentProxySetter setter(http_proxy_server, https_proxy_server, no_proxy_string); - EnvironmentProxyConfigurationResolver resolver(ProxyConfiguration::Protocol::HTTP); + EnvironmentProxyConfigurationResolver http_resolver(ProxyConfiguration::Protocol::HTTP); - auto configuration = resolver.resolve(); + auto http_configuration = http_resolver.resolve(); - ASSERT_EQ(configuration.host, http_proxy_server.getHost()); - ASSERT_EQ(configuration.port, http_proxy_server.getPort()); - ASSERT_EQ(configuration.protocol, ProxyConfiguration::protocolFromString(http_proxy_server.getScheme())); - ASSERT_EQ(configuration.no_proxy_hosts, no_proxy_string); -} + ASSERT_EQ(http_configuration.host, http_proxy_server.getHost()); + ASSERT_EQ(http_configuration.port, http_proxy_server.getPort()); + ASSERT_EQ(http_configuration.protocol, ProxyConfiguration::protocolFromString(http_proxy_server.getScheme())); + ASSERT_EQ(http_configuration.no_proxy_hosts, poco_no_proxy_regex); -TEST(EnvironmentProxyConfigurationResolver, TestHTTPNoEnv) -{ - EnvironmentProxyConfigurationResolver resolver(ProxyConfiguration::Protocol::HTTP); + EnvironmentProxyConfigurationResolver https_resolver(ProxyConfiguration::Protocol::HTTPS); - auto configuration = resolver.resolve(); + auto https_configuration = https_resolver.resolve(); - ASSERT_EQ(configuration.host, ""); - ASSERT_EQ(configuration.protocol, ProxyConfiguration::Protocol::HTTP); - ASSERT_EQ(configuration.port, 0u); - ASSERT_TRUE(configuration.no_proxy_hosts.empty()); -} - -TEST(EnvironmentProxyConfigurationResolver, TestHTTPs) -{ - 
EnvironmentProxySetter setter({}, https_proxy_server); - - EnvironmentProxyConfigurationResolver resolver(ProxyConfiguration::Protocol::HTTPS); - - auto configuration = resolver.resolve(); - - ASSERT_EQ(configuration.host, https_proxy_server.getHost()); - ASSERT_EQ(configuration.port, https_proxy_server.getPort()); - ASSERT_EQ(configuration.protocol, ProxyConfiguration::protocolFromString(https_proxy_server.getScheme())); -} - -TEST(EnvironmentProxyConfigurationResolver, TestHTTPsNoEnv) -{ - EnvironmentProxyConfigurationResolver resolver(ProxyConfiguration::Protocol::HTTPS); - - auto configuration = resolver.resolve(); - - ASSERT_EQ(configuration.host, ""); - ASSERT_EQ(configuration.protocol, ProxyConfiguration::Protocol::HTTP); - ASSERT_EQ(configuration.port, 0u); -} - -TEST(EnvironmentProxyConfigurationResolver, TestHTTPsOverHTTPTunnelingDisabled) -{ - // use http proxy for https, this would use connect protocol by default - EnvironmentProxySetter setter({}, http_proxy_server); - - bool disable_tunneling_for_https_requests_over_http_proxy = true; - - EnvironmentProxyConfigurationResolver resolver( - ProxyConfiguration::Protocol::HTTPS, disable_tunneling_for_https_requests_over_http_proxy); - - auto configuration = resolver.resolve(); - - ASSERT_EQ(configuration.host, http_proxy_server.getHost()); - ASSERT_EQ(configuration.port, http_proxy_server.getPort()); - ASSERT_EQ(configuration.protocol, ProxyConfiguration::protocolFromString(http_proxy_server.getScheme())); - ASSERT_EQ(configuration.tunneling, false); + ASSERT_EQ(https_configuration.host, https_proxy_server.getHost()); + ASSERT_EQ(https_configuration.port, https_proxy_server.getPort()); + ASSERT_EQ(https_configuration.protocol, ProxyConfiguration::protocolFromString(https_proxy_server.getScheme())); + ASSERT_EQ(https_configuration.no_proxy_hosts, poco_no_proxy_regex); } } From 946a5913b10821496658a9083a82866855a82123 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Wed, 5 Jun 2024 15:46:11 -0300 Subject: [PATCH 066/215] return empty if uri is empty --- src/Common/EnvironmentProxyConfigurationResolver.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Common/EnvironmentProxyConfigurationResolver.cpp b/src/Common/EnvironmentProxyConfigurationResolver.cpp index bf062b8d9ba..fff2d354e3a 100644 --- a/src/Common/EnvironmentProxyConfigurationResolver.cpp +++ b/src/Common/EnvironmentProxyConfigurationResolver.cpp @@ -55,6 +55,11 @@ namespace const std::string & no_proxy_hosts_string, bool disable_tunneling_for_https_requests_over_http_proxy) { + if (uri.empty()) + { + return {}; + } + const auto & host = uri.getHost(); const auto & scheme = uri.getScheme(); const auto port = uri.getPort(); From f264c8ed590a9a790fb3b4abb85f6949b10fdc2d Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Thu, 6 Jun 2024 10:00:32 +0200 Subject: [PATCH 067/215] Set default for MaxSimultaneousAllocations --- src/Common/GWPAsan.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Common/GWPAsan.cpp b/src/Common/GWPAsan.cpp index 2bdf418e152..4c6c8c7e9cc 100644 --- a/src/Common/GWPAsan.cpp +++ b/src/Common/GWPAsan.cpp @@ -53,6 +53,9 @@ static bool guarded_alloc_initialized = [] gwp_asan::options::initOptions(env_options_raw, printString); auto & opts = gwp_asan::options::getOptions(); + if (!env_options_raw || !std::string_view{env_options_raw}.contains("MaxSimultaneousAllocations")) + opts.MaxSimultaneousAllocations = 1024; + opts.Backtrace = getBackTrace; GuardedAlloc.init(opts); From 001ac9f8476dbcc2f04647b74fc8653dda9717fc Mon Sep 17 
00:00:00 2001 From: Arthur Passos Date: Thu, 6 Jun 2024 09:48:48 -0300 Subject: [PATCH 068/215] fix no proxy hosts for remote and list resolvers --- src/Common/ProxyConfiguration.h | 2 +- src/Common/ProxyConfigurationResolverProvider.cpp | 5 +++-- src/Common/ProxyListConfigurationResolver.cpp | 1 - 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Common/ProxyConfiguration.h b/src/Common/ProxyConfiguration.h index 3398afce37c..a9921f1474d 100644 --- a/src/Common/ProxyConfiguration.h +++ b/src/Common/ProxyConfiguration.h @@ -55,7 +55,7 @@ struct ProxyConfiguration uint16_t port = 0; bool tunneling = false; Protocol original_request_protocol = Protocol::HTTP; - std::string no_proxy_hosts = ""; + std::string no_proxy_hosts = std::string{}; bool isEmpty() const { return host.empty(); } }; diff --git a/src/Common/ProxyConfigurationResolverProvider.cpp b/src/Common/ProxyConfigurationResolverProvider.cpp index f9eea1805fe..b06073121e7 100644 --- a/src/Common/ProxyConfigurationResolverProvider.cpp +++ b/src/Common/ProxyConfigurationResolverProvider.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -54,7 +55,7 @@ namespace return std::make_shared( server_configuration, request_protocol, - getNoProxyHosts(configuration), + buildPocoNonProxyHosts(getNoProxyHosts(configuration)), std::make_shared(), isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration)); } @@ -97,7 +98,7 @@ namespace : std::make_shared( uris, request_protocol, - getNoProxyHosts(configuration), + buildPocoNonProxyHosts(getNoProxyHosts(configuration)), isTunnelingDisabledForHTTPSRequestsOverHTTPProxy(configuration)); } diff --git a/src/Common/ProxyListConfigurationResolver.cpp b/src/Common/ProxyListConfigurationResolver.cpp index a23c3a61951..2d5b5e97364 100644 --- a/src/Common/ProxyListConfigurationResolver.cpp +++ b/src/Common/ProxyListConfigurationResolver.cpp @@ -1,7 +1,6 @@ #include #include -#include #include namespace DB From 15317b5b507aae95194f0cdb7d9678ce43a3784f Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 6 Jun 2024 07:29:06 +0200 Subject: [PATCH 069/215] Fix possible loss of "Query was cancelled" message in client Previously "Query was cancelled" had been printed only on new blocks, so if no new blocks will be arrived, then it will not print this message, and this is indeed why 03023_zeros_generate_random_with_limit_progress_bar was flaky [1]: 2024-06-05 16:21:22 expect: does "\u258e\u001b[0m \u001b[2m(0.9 CPU)\u001b[0m 4%\u001b[K^C\r\u001b[1;31m\u2198\u001b[0m Progress: 404.74 thousand rows, 404.74 KB (806.77 thousand rows/s., 806.77 KB/s.) \u001b[0;32m\u2588\u258e\u001b[0m \u001b[2m(2.0 CPU)\u001b[0m 4%\u001b[K\r\u001b[KCancelling query.\r\n \r\u001b[1;32m\u2193\u001b[0m Progress: 404.74 thousand rows, 404.74 KB (806.15 thousand rows/s., 806.15 KB/s.) \u001b[0;32m\u2588\u258e\u001b[0m \u001b[2m(2.0 CPU)\u001b[0m 4%\u001b[K" (spawn_id exp4) match glob pattern "Query was cancelled."? no 2024-06-05 16:21:22 2024-06-05 16:21:22 [KOk. 2024-06-05 16:21:22 , result: As you can see it printed "Cancelling query" and "Ok" but not "Query was cancelled" due to this. [1]: https://s3.amazonaws.com/clickhouse-test-reports/61973/6cfd5b2165970a65a551117fe58e4b9d22237b8c/stateless_tests__tsan__s3_storage__[1_5].html A bit hackish with all this bool flags, but I guess it is OK. 
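In outline, the intended behaviour is the one below; this is only a condensed, self-contained sketch, where the struct, the main() driver and the afterReceiveLoop name are stand-ins — only the four bool members and the two message sites correspond to the actual ClientBase change in the diff that follows.

    #include <iostream>

    // Minimal sketch of the message logic: `cancelled` is set when the user
    // interrupts the query, `cancelled_printed` remembers whether the message
    // has already been shown, so it is printed exactly once even when no new
    // block arrives after the cancellation.
    struct ClientSketch
    {
        bool is_interactive = true;
        bool cancelled = false;
        bool cancelled_printed = false;
        bool written_first_block = false;

        // Corresponds to the end of receiveResult(): print on cancellation and
        // remember that the message was shown.
        void afterReceiveLoop()
        {
            if (cancelled && is_interactive)
            {
                std::cout << "Query was cancelled." << std::endl;
                cancelled_printed = true;
            }
        }

        // Corresponds to onEndOfStream(): covers the case where the query was
        // cancelled but no further block ever arrived.
        void onEndOfStream()
        {
            if (!is_interactive)
                return;
            if (cancelled && !cancelled_printed)
                std::cout << "Query was cancelled." << std::endl;
            else if (!written_first_block)
                std::cout << "Ok." << std::endl;
        }
    };

    int main()
    {
        ClientSketch client;
        client.cancelled = true;   // simulate Ctrl+C with no block following it
        client.onEndOfStream();    // prints "Query was cancelled." exactly once
    }
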
Signed-off-by: Azat Khuzhin --- src/Client/ClientBase.cpp | 13 +++++++++++-- src/Client/ClientBase.h | 1 + 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index f8391c64d5a..854cc3fef8b 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1188,7 +1188,10 @@ void ClientBase::receiveResult(ASTPtr parsed_query, Int32 signals_before_stop, b std::rethrow_exception(local_format_error); if (cancelled && is_interactive) + { std::cout << "Query was cancelled." << std::endl; + cancelled_printed = true; + } } @@ -1302,8 +1305,13 @@ void ClientBase::onEndOfStream() resetOutput(); - if (is_interactive && !written_first_block) - std::cout << "Ok." << std::endl; + if (is_interactive) + { + if (cancelled && !cancelled_printed) + std::cout << "Query was cancelled." << std::endl; + else if (!written_first_block) + std::cout << "Ok." << std::endl; + } } @@ -1866,6 +1874,7 @@ void ClientBase::processParsedSingleQuery(const String & full_query, const Strin resetOutput(); have_error = false; cancelled = false; + cancelled_printed = false; client_exception.reset(); server_exception.reset(); diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 7a0489641c8..220fcddc038 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -329,6 +329,7 @@ protected: bool allow_merge_tree_settings = false; bool cancelled = false; + bool cancelled_printed = false; /// Does log_comment has specified by user? bool has_log_comment = false; From bd0ef01ffdcdf1828dd56dde25f8b0d0e65dd1b4 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 6 Jun 2024 16:45:34 -0300 Subject: [PATCH 070/215] try to fix ut --- .../tests/gtest_poco_no_proxy_regex.cpp | 24 +++++++++++++ ...oxy_configuration_to_poco_proxy_config.cpp | 36 ------------------- .../gtest_proxy_environment_configuration.cpp | 7 +++- 3 files changed, 30 insertions(+), 37 deletions(-) create mode 100644 src/Common/tests/gtest_poco_no_proxy_regex.cpp delete mode 100644 src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp diff --git a/src/Common/tests/gtest_poco_no_proxy_regex.cpp b/src/Common/tests/gtest_poco_no_proxy_regex.cpp new file mode 100644 index 00000000000..1f70a483ab4 --- /dev/null +++ b/src/Common/tests/gtest_poco_no_proxy_regex.cpp @@ -0,0 +1,24 @@ +#include + +#include + +TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuild) +{ + ASSERT_EQ( + DB::buildPocoNonProxyHosts("localhost,127.0.0.1,some_other_domain:8080,sub-domain.domain.com"), + R"((?:.*\.)?localhost|(?:.*\.)?127\.0\.0\.1|(?:.*\.)?some_other_domain\:8080|(?:.*\.)?sub\-domain\.domain\.com)"); +} + +TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuildMatchAnything) +{ + ASSERT_EQ( + DB::buildPocoNonProxyHosts("*"), + "(.*?)"); +} + +TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuildEmpty) +{ + ASSERT_EQ( + DB::buildPocoNonProxyHosts(""), + ""); +} diff --git a/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp b/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp deleted file mode 100644 index db87f23fc65..00000000000 --- a/src/Common/tests/gtest_proxy_configuration_to_poco_proxy_config.cpp +++ /dev/null @@ -1,36 +0,0 @@ -#include - -#include - -TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuild) -{ - DB::ProxyConfiguration proxy_configuration; - - proxy_configuration.no_proxy_hosts = "localhost,127.0.0.1,some_other_domain:8080,sub-domain.domain.com"; - 
- auto poco_proxy_configuration = DB::proxyConfigurationToPocoProxyConfig(proxy_configuration); - - ASSERT_EQ(poco_proxy_configuration.nonProxyHosts, R"((?:.*\.)?localhost|(?:.*\.)?127\.0\.0\.1|(?:.*\.)?some_other_domain\:8080|(?:.*\.)?sub\-domain\.domain\.com)"); -} - -TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuildMatchAnything) -{ - DB::ProxyConfiguration proxy_configuration; - - proxy_configuration.no_proxy_hosts = "*"; - - auto poco_proxy_configuration = DB::proxyConfigurationToPocoProxyConfig(proxy_configuration); - - ASSERT_EQ(poco_proxy_configuration.nonProxyHosts, "(.*?)"); -} - -TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuildEmpty) -{ - DB::ProxyConfiguration proxy_configuration; - - proxy_configuration.no_proxy_hosts = ""; - - auto poco_proxy_configuration = DB::proxyConfigurationToPocoProxyConfig(proxy_configuration); - - ASSERT_EQ(poco_proxy_configuration.nonProxyHosts, ""); -} diff --git a/src/Common/tests/gtest_proxy_environment_configuration.cpp b/src/Common/tests/gtest_proxy_environment_configuration.cpp index 095e44dfb86..ac31e33e129 100644 --- a/src/Common/tests/gtest_proxy_environment_configuration.cpp +++ b/src/Common/tests/gtest_proxy_environment_configuration.cpp @@ -16,7 +16,12 @@ namespace TEST(EnvironmentProxyConfigurationResolver, TestHTTPandHTTPS) { - std::string no_proxy_string = "localhost,,127.0.0.1,some_other_domain,,,, last_domain,"; + // Some other tests rely on HTTP clients (e.g, gtest_aws_s3_client), which depend on proxy configuration + // since in https://github.com/ClickHouse/ClickHouse/pull/63314 the environment proxy resolver reads only once + // from the environment, the proxy configuration will always be there. + // The problem is that the proxy server does not exist, causing the test to fail. + // To work around this issue, `no_proxy` is set to bypass all domains. 
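+    // As pinned down by TestNoProxyHostRegexBuildMatchAnything above, buildPocoNonProxyHosts turns "*" into the match-anything regex "(.*?)".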
+ std::string no_proxy_string = "*"; std::string poco_no_proxy_regex = buildPocoNonProxyHosts(no_proxy_string); EnvironmentProxySetter setter(http_proxy_server, https_proxy_server, no_proxy_string); From de5258128e48061514a82bdfdc5368852d6a5062 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Fri, 7 Jun 2024 06:44:36 +0000 Subject: [PATCH 071/215] update fmtlib version to 9.1.0 Signed-off-by: Duc Canh Le --- base/base/EnumReflection.h | 2 +- base/base/wide_integer_to_string.h | 2 +- contrib/fmtlib | 2 +- contrib/fmtlib-cmake/CMakeLists.txt | 1 - src/Analyzer/Identifier.h | 4 ++-- src/Client/TestHint.h | 2 +- src/Common/TransactionID.h | 2 +- src/Common/ZooKeeper/IKeeper.h | 2 +- src/Common/filesystemHelpers.cpp | 13 ++++++++----- src/Common/formatReadable.h | 2 +- src/Coordination/KeeperSnapshotManager.cpp | 2 +- src/Coordination/RaftServerConfig.h | 4 ++-- src/Core/Field.h | 2 +- src/Core/QualifiedTableName.h | 2 +- src/DataTypes/IDataType.h | 2 +- src/Functions/FunctionsStringDistance.cpp | 4 ++-- src/IO/WriteHelpers.h | 2 +- src/Interpreters/StorageID.h | 2 +- src/Parsers/formatAST.h | 2 +- .../Formats/Impl/BSONEachRowRowInputFormat.cpp | 4 ++-- src/Processors/Transforms/WindowTransform.cpp | 2 +- src/Storages/Kafka/KafkaConsumer.cpp | 5 +---- src/Storages/Kafka/KafkaConsumer.h | 5 +++++ src/Storages/MergeTree/MarkRange.h | 2 +- .../ParallelReplicasReadingCoordinator.cpp | 2 +- src/Storages/MergeTree/RangesInDataPart.cpp | 2 +- src/Storages/NamedCollectionsHelpers.h | 2 +- 27 files changed, 41 insertions(+), 37 deletions(-) diff --git a/base/base/EnumReflection.h b/base/base/EnumReflection.h index 4a9de4d17a3..e4e0ef672fd 100644 --- a/base/base/EnumReflection.h +++ b/base/base/EnumReflection.h @@ -32,7 +32,7 @@ constexpr void static_for(F && f) template struct fmt::formatter : fmt::formatter { - constexpr auto format(T value, auto& format_context) + constexpr auto format(T value, auto& format_context) const { return formatter::format(magic_enum::enum_name(value), format_context); } diff --git a/base/base/wide_integer_to_string.h b/base/base/wide_integer_to_string.h index c2cbe8d82e3..f703a722afa 100644 --- a/base/base/wide_integer_to_string.h +++ b/base/base/wide_integer_to_string.h @@ -62,7 +62,7 @@ struct fmt::formatter> } template - auto format(const wide::integer & value, FormatContext & ctx) + auto format(const wide::integer & value, FormatContext & ctx) const { return fmt::format_to(ctx.out(), "{}", to_string(value)); } diff --git a/contrib/fmtlib b/contrib/fmtlib index b6f4ceaed0a..a33701196ad 160000 --- a/contrib/fmtlib +++ b/contrib/fmtlib @@ -1 +1 @@ -Subproject commit b6f4ceaed0a0a24ccf575fab6c56dd50ccf6f1a9 +Subproject commit a33701196adfad74917046096bf5a2aa0ab0bb50 diff --git a/contrib/fmtlib-cmake/CMakeLists.txt b/contrib/fmtlib-cmake/CMakeLists.txt index fe399ddc6e1..6625e411295 100644 --- a/contrib/fmtlib-cmake/CMakeLists.txt +++ b/contrib/fmtlib-cmake/CMakeLists.txt @@ -13,7 +13,6 @@ set (SRCS ${FMT_SOURCE_DIR}/include/fmt/core.h ${FMT_SOURCE_DIR}/include/fmt/format.h ${FMT_SOURCE_DIR}/include/fmt/format-inl.h - ${FMT_SOURCE_DIR}/include/fmt/locale.h ${FMT_SOURCE_DIR}/include/fmt/os.h ${FMT_SOURCE_DIR}/include/fmt/ostream.h ${FMT_SOURCE_DIR}/include/fmt/printf.h diff --git a/src/Analyzer/Identifier.h b/src/Analyzer/Identifier.h index cbd8f5e7694..91190dc7cdb 100644 --- a/src/Analyzer/Identifier.h +++ b/src/Analyzer/Identifier.h @@ -406,7 +406,7 @@ struct fmt::formatter } template - auto format(const DB::Identifier & identifier, FormatContext & ctx) + auto format(const 
DB::Identifier & identifier, FormatContext & ctx) const { return fmt::format_to(ctx.out(), "{}", identifier.getFullName()); } @@ -428,7 +428,7 @@ struct fmt::formatter } template - auto format(const DB::IdentifierView & identifier_view, FormatContext & ctx) + auto format(const DB::IdentifierView & identifier_view, FormatContext & ctx) const { return fmt::format_to(ctx.out(), "{}", identifier_view.getFullName()); } diff --git a/src/Client/TestHint.h b/src/Client/TestHint.h index eaf854be5df..b76c4245df4 100644 --- a/src/Client/TestHint.h +++ b/src/Client/TestHint.h @@ -112,7 +112,7 @@ struct fmt::formatter } template - auto format(const DB::TestHint::ErrorVector & ErrorVector, FormatContext & ctx) + auto format(const DB::TestHint::ErrorVector & ErrorVector, FormatContext & ctx) const { if (ErrorVector.empty()) return fmt::format_to(ctx.out(), "{}", 0); diff --git a/src/Common/TransactionID.h b/src/Common/TransactionID.h index 97d0072bc14..466f3f5343b 100644 --- a/src/Common/TransactionID.h +++ b/src/Common/TransactionID.h @@ -108,7 +108,7 @@ struct fmt::formatter } template - auto format(const DB::TransactionID & tid, FormatContext & context) + auto format(const DB::TransactionID & tid, FormatContext & context) const { return fmt::format_to(context.out(), "({}, {}, {})", tid.start_csn, tid.local_tid, tid.host_id); } diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index ddd30c4eef2..7d574247aa5 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -647,7 +647,7 @@ public: template <> struct fmt::formatter : fmt::formatter { - constexpr auto format(Coordination::Error code, auto & ctx) + constexpr auto format(Coordination::Error code, auto & ctx) const { return formatter::format(Coordination::errorMessage(code), ctx); } diff --git a/src/Common/filesystemHelpers.cpp b/src/Common/filesystemHelpers.cpp index 2d053c615d9..f8d209bc11f 100644 --- a/src/Common/filesystemHelpers.cpp +++ b/src/Common/filesystemHelpers.cpp @@ -1,4 +1,6 @@ #include "filesystemHelpers.h" +#include +#include #if defined(OS_LINUX) # include @@ -11,7 +13,7 @@ #include #include #include -#include +#include #include #include #include @@ -369,10 +371,11 @@ Poco::Timestamp getModificationTimestamp(const std::string & path) void setModificationTime(const std::string & path, time_t time) { - struct utimbuf tb; - tb.actime = time; - tb.modtime = time; - if (utime(path.c_str(), &tb) != 0) + struct timeval times[2]; + times[0].tv_usec = times[1].tv_usec = 0; + times[0].tv_sec = ::time(nullptr); + times[1].tv_sec = time; + if (utimes(path.c_str(), times) != 0) DB::ErrnoException::throwFromPath(DB::ErrorCodes::PATH_ACCESS_DENIED, path, "Cannot set modification time to file: {}", path); } diff --git a/src/Common/formatReadable.h b/src/Common/formatReadable.h index a05a2a7f9e2..0d7a437219a 100644 --- a/src/Common/formatReadable.h +++ b/src/Common/formatReadable.h @@ -49,7 +49,7 @@ struct fmt::formatter } template - auto format(const ReadableSize & size, FormatContext & ctx) + auto format(const ReadableSize & size, FormatContext & ctx) const { return fmt::format_to(ctx.out(), "{}", formatReadableSizeWithBinarySuffix(size.value)); } diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index f25ccab86b1..23ff714a929 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -466,7 +466,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial #ifdef NDEBUG 
/// TODO (alesapin) remove this, it should be always CORRUPTED_DATA. LOG_ERROR(getLogger("KeeperSnapshotManager"), "Children counter in stat.numChildren {}" - " is different from actual children size {} for node {}", itr.value.numChildren(), itr.value.getChildren().size(), itr.key); + " is different from actual children size {} for node {}", itr.value.numChildren(), itr.value.getChildren().size(), itr.key.toView()); #else throw Exception(ErrorCodes::LOGICAL_ERROR, "Children counter in stat.numChildren {}" " is different from actual children size {} for node {}", diff --git a/src/Coordination/RaftServerConfig.h b/src/Coordination/RaftServerConfig.h index 0ecbd6464c1..37b6a92ba70 100644 --- a/src/Coordination/RaftServerConfig.h +++ b/src/Coordination/RaftServerConfig.h @@ -57,7 +57,7 @@ using ClusterUpdateActions = std::vector; template <> struct fmt::formatter : fmt::formatter { - constexpr auto format(const DB::RaftServerConfig & server, format_context & ctx) + constexpr auto format(const DB::RaftServerConfig & server, format_context & ctx) const { return fmt::format_to( ctx.out(), "server.{}={};{};{}", server.id, server.endpoint, server.learner ? "learner" : "participant", server.priority); @@ -67,7 +67,7 @@ struct fmt::formatter : fmt::formatter template <> struct fmt::formatter : fmt::formatter { - constexpr auto format(const DB::ClusterUpdateAction & action, format_context & ctx) + constexpr auto format(const DB::ClusterUpdateAction & action, format_context & ctx) const { if (const auto * add = std::get_if(&action)) return fmt::format_to(ctx.out(), "(Add server {})", add->id); diff --git a/src/Core/Field.h b/src/Core/Field.h index a78b589c883..f1bb4a72b0d 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -1038,7 +1038,7 @@ struct fmt::formatter } template - auto format(const DB::Field & x, FormatContext & ctx) + auto format(const DB::Field & x, FormatContext & ctx) const { return fmt::format_to(ctx.out(), "{}", toString(x)); } diff --git a/src/Core/QualifiedTableName.h b/src/Core/QualifiedTableName.h index bf05bd59caf..0fd72c32a54 100644 --- a/src/Core/QualifiedTableName.h +++ b/src/Core/QualifiedTableName.h @@ -125,7 +125,7 @@ namespace fmt } template - auto format(const DB::QualifiedTableName & name, FormatContext & ctx) + auto format(const DB::QualifiedTableName & name, FormatContext & ctx) const { return fmt::format_to(ctx.out(), "{}.{}", DB::backQuoteIfNeed(name.database), DB::backQuoteIfNeed(name.table)); } diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 46c30240ef8..bd6065ca270 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -623,7 +623,7 @@ struct fmt::formatter } template - auto format(const DB::DataTypePtr & type, FormatContext & ctx) + auto format(const DB::DataTypePtr & type, FormatContext & ctx) const { return fmt::format_to(ctx.out(), "{}", type->getName()); } diff --git a/src/Functions/FunctionsStringDistance.cpp b/src/Functions/FunctionsStringDistance.cpp index 6cb23bbea9f..d0d8ebc946d 100644 --- a/src/Functions/FunctionsStringDistance.cpp +++ b/src/Functions/FunctionsStringDistance.cpp @@ -159,7 +159,7 @@ struct ByteJaccardIndexImpl } else { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", StringRef(haystack, haystack_end - haystack)); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", std::string_view(haystack, haystack_end - haystack)); } } } @@ -186,7 +186,7 @@ struct ByteJaccardIndexImpl } else { - throw 
Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", StringRef(needle, needle_end - needle)); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", std::string_view(needle, needle_end - needle)); } } } diff --git a/src/IO/WriteHelpers.h b/src/IO/WriteHelpers.h index d4b2d8ea0dc..6b0de441e94 100644 --- a/src/IO/WriteHelpers.h +++ b/src/IO/WriteHelpers.h @@ -1420,7 +1420,7 @@ struct fmt::formatter } template - auto format(const DB::UUID & uuid, FormatContext & context) + auto format(const DB::UUID & uuid, FormatContext & context) const { return fmt::format_to(context.out(), "{}", toString(uuid)); } diff --git a/src/Interpreters/StorageID.h b/src/Interpreters/StorageID.h index 96e3cefe00c..69dac8ea32d 100644 --- a/src/Interpreters/StorageID.h +++ b/src/Interpreters/StorageID.h @@ -136,7 +136,7 @@ namespace fmt } template - auto format(const DB::StorageID & storage_id, FormatContext & ctx) + auto format(const DB::StorageID & storage_id, FormatContext & ctx) const { return fmt::format_to(ctx.out(), "{}", storage_id.getNameForLogs()); } diff --git a/src/Parsers/formatAST.h b/src/Parsers/formatAST.h index dd72a59b4a2..e34902663dd 100644 --- a/src/Parsers/formatAST.h +++ b/src/Parsers/formatAST.h @@ -40,7 +40,7 @@ struct fmt::formatter } template - auto format(const DB::ASTPtr & ast, FormatContext & context) + auto format(const DB::ASTPtr & ast, FormatContext & context) const { return fmt::format_to(context.out(), "{}", DB::serializeAST(*ast)); } diff --git a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp index 6a3475a1830..56c774782c2 100644 --- a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp @@ -421,7 +421,7 @@ void BSONEachRowRowInputFormat::readTuple(IColumn & column, const DataTypePtr & "Cannot parse tuple column with type {} from BSON array/embedded document field: " "tuple doesn't have element with name \"{}\"", data_type->getName(), - name); + name.toView()); index = *try_get_index; } @@ -806,7 +806,7 @@ bool BSONEachRowRowInputFormat::readRow(MutableColumns & columns, RowReadExtensi else { if (seen_columns[index]) - throw Exception(ErrorCodes::INCORRECT_DATA, "Duplicate field found while parsing BSONEachRow format: {}", name); + throw Exception(ErrorCodes::INCORRECT_DATA, "Duplicate field found while parsing BSONEachRow format: {}", name.toView()); seen_columns[index] = true; read_columns[index] = readField(*columns[index], types[index], BSONType(type)); diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index af340c4aab8..b9f61d30182 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -37,7 +37,7 @@ struct fmt::formatter } template - auto format(const DB::RowNumber & x, FormatContext & ctx) + auto format(const DB::RowNumber & x, FormatContext & ctx) const { return fmt::format_to(ctx.out(), "{}:{}", x.block, x.row); } diff --git a/src/Storages/Kafka/KafkaConsumer.cpp b/src/Storages/Kafka/KafkaConsumer.cpp index 7075dcb71ca..9ba42b9875e 100644 --- a/src/Storages/Kafka/KafkaConsumer.cpp +++ b/src/Storages/Kafka/KafkaConsumer.cpp @@ -1,7 +1,4 @@ -// Needs to go first because its partial specialization of fmt::formatter -// should be defined before any instantiation -#include - +#include #include #include diff --git a/src/Storages/Kafka/KafkaConsumer.h 
b/src/Storages/Kafka/KafkaConsumer.h index a3bc97779b3..a2d047933be 100644 --- a/src/Storages/Kafka/KafkaConsumer.h +++ b/src/Storages/Kafka/KafkaConsumer.h @@ -1,5 +1,7 @@ #pragma once +#include +#include #include #include @@ -197,3 +199,6 @@ private: }; } + +template <> struct fmt::formatter : fmt::ostream_formatter {}; +template <> struct fmt::formatter : fmt::ostream_formatter {}; diff --git a/src/Storages/MergeTree/MarkRange.h b/src/Storages/MergeTree/MarkRange.h index 626d4e9e689..6b111f348bb 100644 --- a/src/Storages/MergeTree/MarkRange.h +++ b/src/Storages/MergeTree/MarkRange.h @@ -69,7 +69,7 @@ struct fmt::formatter } template - auto format(const DB::MarkRange & range, FormatContext & ctx) + auto format(const DB::MarkRange & range, FormatContext & ctx) const { return fmt::format_to(ctx.out(), "{}", fmt::format("({}, {})", range.begin, range.end)); } diff --git a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp index f3318a48883..79c0e6ad262 100644 --- a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp +++ b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp @@ -112,7 +112,7 @@ struct fmt::formatter static constexpr auto parse(format_parse_context & ctx) { return ctx.begin(); } template - auto format(const DB::Part & part, FormatContext & ctx) + auto format(const DB::Part & part, FormatContext & ctx) const { return fmt::format_to(ctx.out(), "{} in replicas [{}]", part.description.describe(), fmt::join(part.replicas, ", ")); } diff --git a/src/Storages/MergeTree/RangesInDataPart.cpp b/src/Storages/MergeTree/RangesInDataPart.cpp index c46385e84ef..50e0781b4e6 100644 --- a/src/Storages/MergeTree/RangesInDataPart.cpp +++ b/src/Storages/MergeTree/RangesInDataPart.cpp @@ -13,7 +13,7 @@ struct fmt::formatter static constexpr auto parse(format_parse_context & ctx) { return ctx.begin(); } template - auto format(const DB::RangesInDataPartDescription & range, FormatContext & ctx) + auto format(const DB::RangesInDataPartDescription & range, FormatContext & ctx) const { return fmt::format_to(ctx.out(), "{}", range.describe()); } diff --git a/src/Storages/NamedCollectionsHelpers.h b/src/Storages/NamedCollectionsHelpers.h index a1909f514ea..b4aea096c59 100644 --- a/src/Storages/NamedCollectionsHelpers.h +++ b/src/Storages/NamedCollectionsHelpers.h @@ -158,7 +158,7 @@ struct fmt::formatter> } template - auto format(const DB::NamedCollectionValidateKey & elem, FormatContext & context) + auto format(const DB::NamedCollectionValidateKey & elem, FormatContext & context) const { return fmt::format_to(context.out(), "{}", elem.value); } From 441279a979d3fd4f4e6257c21a82f4d162de3505 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Fri, 7 Jun 2024 14:47:22 +0200 Subject: [PATCH 072/215] Disable GWP for sanitizer builds --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 455adc24182..c4f093b1c99 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -399,7 +399,7 @@ option (ENABLE_GWP_ASAN "Enable Gwp-Asan" ON) # but GWP-ASan also wants to use mmap frequently, # and due to a large number of memory mappings, # it does not work together well. 
-if ((NOT OS_LINUX AND NOT OS_ANDROID) OR (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")) +if ((NOT OS_LINUX AND NOT OS_ANDROID) OR (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") OR SANITIZE) set(ENABLE_GWP_ASAN OFF) endif () From 2b200b566107f8023a4b6cc323ee1a3e4d1f06d3 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 7 Jun 2024 10:03:07 -0300 Subject: [PATCH 073/215] modify match everything regex --- src/Common/proxyConfigurationToPocoProxyConfig.cpp | 2 +- src/Common/tests/gtest_poco_no_proxy_regex.cpp | 2 +- .../tests/gtest_proxy_configuration_resolver_provider.cpp | 7 ++++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/Common/proxyConfigurationToPocoProxyConfig.cpp b/src/Common/proxyConfigurationToPocoProxyConfig.cpp index c1a439d2f36..f64dbc3bc02 100644 --- a/src/Common/proxyConfigurationToPocoProxyConfig.cpp +++ b/src/Common/proxyConfigurationToPocoProxyConfig.cpp @@ -54,7 +54,7 @@ std::string buildPocoRegexpEntryWithoutLeadingDot(const std::string & host) std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts_string) { static constexpr auto OR_SEPARATOR = "|"; - static constexpr auto MATCH_ANYTHING = R"((.*?))"; + static constexpr auto MATCH_ANYTHING = R"(.*)"; static constexpr auto MATCH_SUBDOMAINS_REGEX = R"((?:.*\.)?)"; bool match_any_host = no_proxy_hosts_string.size() == 1 && no_proxy_hosts_string[0] == '*'; diff --git a/src/Common/tests/gtest_poco_no_proxy_regex.cpp b/src/Common/tests/gtest_poco_no_proxy_regex.cpp index 1f70a483ab4..c3c1b512c08 100644 --- a/src/Common/tests/gtest_poco_no_proxy_regex.cpp +++ b/src/Common/tests/gtest_poco_no_proxy_regex.cpp @@ -13,7 +13,7 @@ TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuildMatchA { ASSERT_EQ( DB::buildPocoNonProxyHosts("*"), - "(.*?)"); + ".*"); } TEST(ProxyConfigurationToPocoProxyConfiguration, TestNoProxyHostRegexBuildEmpty) diff --git a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp index 368ce12cd7b..8bb2100561f 100644 --- a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp +++ b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp @@ -34,7 +34,12 @@ Poco::URI https_env_proxy_server = Poco::URI("http://https_environment_proxy:312 Poco::URI http_list_proxy_server = Poco::URI("http://http_list_proxy:3128"); Poco::URI https_list_proxy_server = Poco::URI("http://https_list_proxy:3128"); -static std::string no_proxy_hosts = "localhost,,127.0.0.1,some_other_domain,,,, sub-domain.domain.com,"; +// Some other tests rely on HTTP clients (e.g, gtest_aws_s3_client), which depend on proxy configuration +// since in https://github.com/ClickHouse/ClickHouse/pull/63314 the environment proxy resolver reads only once +// from the environment, the proxy configuration will always be there. +// The problem is that the proxy server does not exist, causing the test to fail. +// To work around this issue, `no_proxy` is set to bypass all domains. 
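The regex change above is the whole story for the wildcard case: when no_proxy is "*", buildPocoNonProxyHosts only needs to return a pattern that matches any host, and the plain ".*" expresses that directly, without the capture group and lazy quantifier of the previous "(.*?)". For the non-wildcard case the idea is to split the comma separated list and join the escaped entries with "|". A rough, self-contained sketch of that translation, using a hypothetical noProxyListToRegex() helper (the real function additionally prefixes entries with "(?:.*\.)?" so subdomains match), is:

    #include <sstream>
    #include <string>

    // Illustrative only: "*" short-circuits to a match-anything pattern,
    // otherwise entries are split on ',', dots are escaped and the pieces
    // are joined with '|'.
    std::string noProxyListToRegex(const std::string & no_proxy)
    {
        if (no_proxy == "*")
            return ".*";

        std::string result;
        std::istringstream stream(no_proxy);
        std::string entry;
        while (std::getline(stream, entry, ','))
        {
            if (entry.empty())
                continue;

            std::string escaped;
            for (char c : entry)
                escaped += (c == '.') ? std::string("\\.") : std::string(1, c);

            if (!result.empty())
                result += '|';
            result += escaped;
        }
        return result;
    }
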
+static std::string no_proxy_hosts = "*"; TEST_F(ProxyConfigurationResolverProviderTests, EnvironmentResolverShouldBeUsedIfNoSettings) { From 48de600770aa5ad08720c365d8a705b48375a647 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Fri, 7 Jun 2024 10:47:54 -0300 Subject: [PATCH 074/215] unify environment proxy set] --- src/Common/tests/gtest_helper_functions.h | 29 +++++++++---------- ..._proxy_configuration_resolver_provider.cpp | 16 +--------- .../gtest_proxy_environment_configuration.cpp | 20 ++++--------- 3 files changed, 20 insertions(+), 45 deletions(-) diff --git a/src/Common/tests/gtest_helper_functions.h b/src/Common/tests/gtest_helper_functions.h index 1e5c2b21d99..90c5d4d2088 100644 --- a/src/Common/tests/gtest_helper_functions.h +++ b/src/Common/tests/gtest_helper_functions.h @@ -76,25 +76,22 @@ inline std::string xmlNodeAsString(Poco::XML::Node *pNode) struct EnvironmentProxySetter { - EnvironmentProxySetter( - const Poco::URI & http_proxy, - const Poco::URI & https_proxy, - const std::string & no_proxy = {}) + static constexpr auto * NO_PROXY = "*"; + static constexpr auto * HTTP_PROXY = "http://proxy_server:3128"; + static constexpr auto * HTTPS_PROXY = "https://proxy_server:3128"; + + EnvironmentProxySetter() { - if (!http_proxy.empty()) - { - setenv("http_proxy", http_proxy.toString().c_str(), 1); // NOLINT(concurrency-mt-unsafe) - } + setenv("http_proxy", HTTP_PROXY, 1); // NOLINT(concurrency-mt-unsafe) - if (!https_proxy.empty()) - { - setenv("https_proxy", https_proxy.toString().c_str(), 1); // NOLINT(concurrency-mt-unsafe) - } + setenv("https_proxy", HTTPS_PROXY, 1); // NOLINT(concurrency-mt-unsafe) - if (!no_proxy.empty()) - { - setenv("no_proxy", no_proxy.c_str(), 1); // NOLINT(concurrency-mt-unsafe) - } + // Some other tests rely on HTTP clients (e.g, gtest_aws_s3_client), which depend on proxy configuration + // since in https://github.com/ClickHouse/ClickHouse/pull/63314 the environment proxy resolver reads only once + // from the environment, the proxy configuration will always be there. + // The problem is that the proxy server does not exist, causing the test to fail. + // To work around this issue, `no_proxy` is set to bypass all domains. + setenv("no_proxy", NO_PROXY, 1); // NOLINT(concurrency-mt-unsafe) } ~EnvironmentProxySetter() diff --git a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp index 8bb2100561f..7bc48203998 100644 --- a/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp +++ b/src/Common/tests/gtest_proxy_configuration_resolver_provider.cpp @@ -28,22 +28,12 @@ protected: DB::ContextMutablePtr ProxyConfigurationResolverProviderTests::context; -Poco::URI http_env_proxy_server = Poco::URI("http://http_environment_proxy:3128"); -Poco::URI https_env_proxy_server = Poco::URI("http://https_environment_proxy:3128"); - Poco::URI http_list_proxy_server = Poco::URI("http://http_list_proxy:3128"); Poco::URI https_list_proxy_server = Poco::URI("http://https_list_proxy:3128"); -// Some other tests rely on HTTP clients (e.g, gtest_aws_s3_client), which depend on proxy configuration -// since in https://github.com/ClickHouse/ClickHouse/pull/63314 the environment proxy resolver reads only once -// from the environment, the proxy configuration will always be there. -// The problem is that the proxy server does not exist, causing the test to fail. -// To work around this issue, `no_proxy` is set to bypass all domains. 
-static std::string no_proxy_hosts = "*"; - TEST_F(ProxyConfigurationResolverProviderTests, EnvironmentResolverShouldBeUsedIfNoSettings) { - EnvironmentProxySetter setter(http_env_proxy_server, https_env_proxy_server, no_proxy_hosts); + EnvironmentProxySetter setter; const auto & config = getContext().context->getConfigRef(); auto http_resolver = DB::ProxyConfigurationResolverProvider::get(DB::ProxyConfiguration::Protocol::HTTP, config); @@ -58,7 +48,6 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPOnly) ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); config->setString("proxy", ""); - config->setString("proxy.no_proxy", no_proxy_hosts); config->setString("proxy.http", ""); config->setString("proxy.http.uri", http_list_proxy_server.toString()); context->setConfig(config); @@ -74,7 +63,6 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListHTTPSOnly) { ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); - config->setString("proxy.no_proxy", no_proxy_hosts); config->setString("proxy", ""); config->setString("proxy.https", ""); config->setString("proxy.https.uri", https_list_proxy_server.toString()); @@ -91,7 +79,6 @@ TEST_F(ProxyConfigurationResolverProviderTests, ListBoth) { ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); - config->setString("proxy.no_proxy", no_proxy_hosts); config->setString("proxy", ""); config->setString("proxy.http", ""); config->setString("proxy.http.uri", http_list_proxy_server.toString()); @@ -137,7 +124,6 @@ TEST_F(ProxyConfigurationResolverProviderTests, RemoteResolverHTTPSOnly) ConfigurationPtr config = Poco::AutoPtr(new Poco::Util::MapConfiguration()); config->setString("proxy", ""); - config->setString("proxy.no_proxy", no_proxy_hosts); config->setString("proxy.https", ""); config->setString("proxy.https.resolver", ""); config->setString("proxy.https.resolver.endpoint", "http://resolver:8080/hostname"); diff --git a/src/Common/tests/gtest_proxy_environment_configuration.cpp b/src/Common/tests/gtest_proxy_environment_configuration.cpp index ac31e33e129..708c7194785 100644 --- a/src/Common/tests/gtest_proxy_environment_configuration.cpp +++ b/src/Common/tests/gtest_proxy_environment_configuration.cpp @@ -8,22 +8,14 @@ namespace DB { -namespace -{ - auto http_proxy_server = Poco::URI("http://proxy_server:3128"); - auto https_proxy_server = Poco::URI("https://proxy_server:3128"); -} - TEST(EnvironmentProxyConfigurationResolver, TestHTTPandHTTPS) { - // Some other tests rely on HTTP clients (e.g, gtest_aws_s3_client), which depend on proxy configuration - // since in https://github.com/ClickHouse/ClickHouse/pull/63314 the environment proxy resolver reads only once - // from the environment, the proxy configuration will always be there. - // The problem is that the proxy server does not exist, causing the test to fail. - // To work around this issue, `no_proxy` is set to bypass all domains. 
- std::string no_proxy_string = "*"; - std::string poco_no_proxy_regex = buildPocoNonProxyHosts(no_proxy_string); - EnvironmentProxySetter setter(http_proxy_server, https_proxy_server, no_proxy_string); + const auto http_proxy_server = Poco::URI(EnvironmentProxySetter::HTTP_PROXY); + const auto https_proxy_server = Poco::URI(EnvironmentProxySetter::HTTPS_PROXY); + + std::string poco_no_proxy_regex = buildPocoNonProxyHosts(EnvironmentProxySetter::NO_PROXY); + + EnvironmentProxySetter setter; EnvironmentProxyConfigurationResolver http_resolver(ProxyConfiguration::Protocol::HTTP); From 01ece1403520af82e5ba02892a938a44d0b5b45f Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 7 Jun 2024 16:23:40 +0200 Subject: [PATCH 075/215] fix initialization order (ServerUUID/ZooKeeper) --- programs/server/Server.cpp | 28 ++++++++++++++-------------- src/Core/ServerUUID.cpp | 8 ++++++++ src/Core/ServerUUID.h | 2 +- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 8fcb9d87a93..a35a8ab6de6 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -944,6 +944,18 @@ try } } + std::string path_str = getCanonicalPath(config().getString("path", DBMS_DEFAULT_PATH)); + fs::path path = path_str; + + /// Check that the process user id matches the owner of the data. + assertProcessUserMatchesDataOwner(path_str, [&](const std::string & message){ global_context->addWarningMessage(message); }); + + global_context->setPath(path_str); + + StatusFile status{path / "status", StatusFile::write_full_info}; + + ServerUUID::load(path / "uuid", log); + zkutil::validateZooKeeperConfig(config()); bool has_zookeeper = zkutil::hasZooKeeperConfig(config()); @@ -955,7 +967,7 @@ try ConfigProcessor config_processor(config_path); loaded_config = config_processor.loadConfigWithZooKeeperIncludes( main_config_zk_node_cache, main_config_zk_changed_event, /* fallback_to_preprocessed = */ true); - config_processor.savePreprocessedConfig(loaded_config, config().getString("path", DBMS_DEFAULT_PATH)); + config_processor.savePreprocessedConfig(loaded_config, path_str); config().removeConfiguration(old_configuration.get()); config().add(loaded_config.configuration.duplicate(), PRIO_DEFAULT, false); global_context->setConfig(loaded_config.configuration); @@ -1089,19 +1101,6 @@ try global_context->setRemoteHostFilter(config()); global_context->setHTTPHeaderFilter(config()); - std::string path_str = getCanonicalPath(config().getString("path", DBMS_DEFAULT_PATH)); - fs::path path = path_str; - std::string default_database = server_settings.default_database.toString(); - - /// Check that the process user id matches the owner of the data. - assertProcessUserMatchesDataOwner(path_str, [&](const std::string & message){ global_context->addWarningMessage(message); }); - - global_context->setPath(path_str); - - StatusFile status{path / "status", StatusFile::write_full_info}; - - ServerUUID::load(path / "uuid", log); - /// Try to increase limit on number of open files. { rlimit rlim; @@ -1889,6 +1888,7 @@ try /// Set current database name before loading tables and databases because /// system logs may copy global context. 
+ std::string default_database = server_settings.default_database.toString(); global_context->setCurrentDatabaseNameInGlobalContext(default_database); LOG_INFO(log, "Loading metadata from {}", path_str); diff --git a/src/Core/ServerUUID.cpp b/src/Core/ServerUUID.cpp index c2de6be7794..159aa8faadf 100644 --- a/src/Core/ServerUUID.cpp +++ b/src/Core/ServerUUID.cpp @@ -11,6 +11,14 @@ namespace DB namespace ErrorCodes { extern const int CANNOT_CREATE_FILE; + extern const int LOGICAL_ERROR; +} + +UUID ServerUUID::get() +{ + if (server_uuid == UUIDHelpers::Nil) + throw Exception(ErrorCodes::LOGICAL_ERROR, "ServerUUID is not initialized yet"); + return server_uuid; } void ServerUUID::load(const fs::path & server_uuid_file, Poco::Logger * log) diff --git a/src/Core/ServerUUID.h b/src/Core/ServerUUID.h index 71ae9edc00e..9b9963ceeeb 100644 --- a/src/Core/ServerUUID.h +++ b/src/Core/ServerUUID.h @@ -15,7 +15,7 @@ class ServerUUID public: /// Returns persistent UUID of current clickhouse-server or clickhouse-keeper instance. - static UUID get() { return server_uuid; } + static UUID get(); /// Loads server UUID from file or creates new one. Should be called on daemon startup. static void load(const fs::path & server_uuid_file, Poco::Logger * log); From 98235ba2b1df3ce121959a0c09db2a40993302e6 Mon Sep 17 00:00:00 2001 From: Mikhail Gorshkov Date: Fri, 7 Jun 2024 15:47:12 +0000 Subject: [PATCH 076/215] Dedicated support for Base64URL encoding --- .../functions/string-functions.md | 8 ++ .../functions/string-functions.md | 8 ++ src/Functions/FunctionBase64Conversion.h | 79 +++++++++++++++++++ src/Functions/base64UrlDecode.cpp | 14 ++++ src/Functions/base64UrlEncode.cpp | 14 ++++ .../03167_base64_url_functions.reference | 11 +++ .../03167_base64_url_functions.sql | 28 +++++++ 7 files changed, 162 insertions(+) create mode 100644 src/Functions/base64UrlDecode.cpp create mode 100644 src/Functions/base64UrlEncode.cpp create mode 100644 tests/queries/0_stateless/03167_base64_url_functions.reference create mode 100644 tests/queries/0_stateless/03167_base64_url_functions.sql diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 342ca2b9f03..7365e0f4d27 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1140,12 +1140,20 @@ Encodes a String or FixedString as base64. Alias: `TO_BASE64`. +## base64UrlEncode + +Encodes an URL (String or FixedString) as base64 according to [RFC 4648](https://tools.ietf.org/html/rfc4648). + ## base64Decode Decodes a base64-encoded String or FixedString. Throws an exception in case of error. Alias: `FROM_BASE64`. +## base64UrlDecode + +Decodes a base64-encoded URL (String or FixedString) according to [RFC 4648](https://tools.ietf.org/html/rfc4648). Throws an exception in case of error. + ## tryBase64Decode Like `base64Decode` but returns an empty string in case of error. diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index fc258f7b4cf..c44cf94876f 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -538,12 +538,20 @@ SELECT base58Decode('3dc8KtHrwM'); Синоним: `TO_BASE64`. +## base64UrlEncode(s) + +Производит кодирование URL (String или FixedString) в base64-представление в соответствии с [RFC 4648](https://tools.ietf.org/html/rfc4648). 
+ ## base64Decode(s) {#base64decode} Декодирует base64-представление s в исходную строку. При невозможности декодирования выбрасывает исключение Синоним: `FROM_BASE64`. +## base64UrlDecode(s) + +Декодирует base64-представление URL в исходную строку в соответствии с [RFC 4648](https://tools.ietf.org/html/rfc4648). При невозможности декодирования выбрасывает исключение + ## tryBase64Decode(s) {#trybase64decode} Функционал аналогичен base64Decode, но при невозможности декодирования возвращает пустую строку. diff --git a/src/Functions/FunctionBase64Conversion.h b/src/Functions/FunctionBase64Conversion.h index 3906563a254..008ce0b7338 100644 --- a/src/Functions/FunctionBase64Conversion.h +++ b/src/Functions/FunctionBase64Conversion.h @@ -85,6 +85,85 @@ struct TryBase64Decode } }; +struct Base64UrlEncode : Base64Encode +{ + static constexpr auto name = "base64UrlEncode"; + + static size_t perform(const std::span src, UInt8 * dst) + { + auto out_len = Base64Encode::perform(src, dst); + + // Do postprocessing as described in https://datatracker.ietf.org/doc/html/rfc4648#page-7 + for (size_t i = 0; i < out_len; ++i) + { + switch (dst[i]) + { + case '/': + dst[i] = '_'; + break; + case '+': + dst[i] = '-'; + break; + case '=': // stop when padding is detected + return i; + default: + break; + } + } + return out_len; + } +}; + +struct Base64UrlDecode : Base64Decode +{ + static constexpr auto name = "base64UrlDecode"; + + static size_t perform(const std::span src, UInt8 * dst) + { + std::vector tmp{}; + // insert padding to please alcomp library + auto size = src.size(); + auto remainder = size % 4; + switch (remainder) + { + case 0: + break; // no padding needed + case 1: + break; // invalid input, let it be detected by alcomp library + case 2: + size += 2; // two bytes padding + break; + default: // remainder == 3 + ++size; // one byte padding + } + tmp.resize(size); + + size_t i = 0; + for (; i < src.size(); ++i) + { + switch (src[i]) + { + case '_': + tmp[i] = '/'; + break; + case '-': + tmp[i] = '+'; + break; + default: + tmp[i] = src[i]; + break; + } + } + if (remainder == 2 || remainder == 3) + tmp[i++] = '='; + if (remainder == 2) + tmp[i++] = '='; + + return Base64Decode::perform(tmp, dst); + } +}; + + template class FunctionBase64Conversion : public IFunction { diff --git a/src/Functions/base64UrlDecode.cpp b/src/Functions/base64UrlDecode.cpp new file mode 100644 index 00000000000..fa2adac3c5f --- /dev/null +++ b/src/Functions/base64UrlDecode.cpp @@ -0,0 +1,14 @@ +#include + +#if USE_BASE64 +#include + +namespace DB +{ +REGISTER_FUNCTION(Base64UrlDecode) +{ + factory.registerFunction>(); +} +} + +#endif diff --git a/src/Functions/base64UrlEncode.cpp b/src/Functions/base64UrlEncode.cpp new file mode 100644 index 00000000000..a3775324145 --- /dev/null +++ b/src/Functions/base64UrlEncode.cpp @@ -0,0 +1,14 @@ +#include + +#if USE_BASE64 +#include + +namespace DB +{ +REGISTER_FUNCTION(Base64UrlEncode) +{ + factory.registerFunction>(); +} +} + +#endif diff --git a/tests/queries/0_stateless/03167_base64_url_functions.reference b/tests/queries/0_stateless/03167_base64_url_functions.reference new file mode 100644 index 00000000000..075d1729cef --- /dev/null +++ b/tests/queries/0_stateless/03167_base64_url_functions.reference @@ -0,0 +1,11 @@ +aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ +https://clickhouse.com +MTI_ +12? 
+aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS9zZWFyY2g_cT1jbGlja2hvdXNlK2Jhc2U2NCtkZWNvZGUmc2NhX2Vzdj03MzlmOGJiMzgwZTRjN2VkJmVpPVRmUmlacUNESXJtbndQQVAyS0xSa0E4JnZlZD0wYWhVS0V3amczWkhpdHNtR0F4VzVFeEFJSFZoUkZQSVE0ZFVEQ0JBJnVhY3Q9NSZvcT1jbGlja2hvdXNlK2Jhc2U2NCtkZWNvZGUmZ3NfbHA9RWd4bmQzTXRkMmw2TFhObGNuQWlHR05zYVdOcmFHOTFjMlVnWW1GelpUWTBJR1JsWTI5a1pUSUtFQUFZc0FNWTFnUVlSeklLRUFBWXNBTVkxZ1FZUnpJS0VBQVlzQU1ZMWdRWVJ6SUtFQUFZc0FNWTFnUVlSeklLRUFBWXNBTVkxZ1FZUnpJS0VBQVlzQU1ZMWdRWVJ6SUtFQUFZc0FNWTFnUVlSeklLRUFBWXNBTVkxZ1FZUjBqWEJGQUFXQUJ3QVhnQmtBRUFtQUVBb0FFQXFnRUF1QUVEeUFFQW1BSUJvQUlIbUFNQWlBWUJrQVlJa2djQk1hQUhBQSZzY2xpZW50PWd3cy13aXotc2VycA +https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode&gs_lp=Egxnd3Mtd2l6LXNlcnAiGGNsaWNraG91c2UgYmFzZTY0IGRlY29kZTIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYR0jXBFAAWABwAXgBkAEAmAEAoAEAqgEAuAEDyAEAmAIBoAIHmAMAiAYBkAYIkgcBMaAHAA&sclient=gws-wiz-serp +https://clic +https://clickh +https://click +aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ +https://clickhouse.com diff --git a/tests/queries/0_stateless/03167_base64_url_functions.sql b/tests/queries/0_stateless/03167_base64_url_functions.sql new file mode 100644 index 00000000000..908ca890be2 --- /dev/null +++ b/tests/queries/0_stateless/03167_base64_url_functions.sql @@ -0,0 +1,28 @@ +SELECT base64UrlEncode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT base64UrlDecode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT base64UrlEncode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT base64UrlDecode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } + +-- test with valid inputs +SELECT base64UrlEncode('https://clickhouse.com'); +SELECT base64UrlDecode('aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ'); +-- encoding differs from base64Encode +SELECT base64UrlEncode('12?'); +SELECT base64UrlDecode('MTI_'); +-- long string +SELECT base64UrlEncode('https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode&gs_lp=Egxnd3Mtd2l6LXNlcnAiGGNsaWNraG91c2UgYmFzZTY0IGRlY29kZTIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYR0jXBFAAWABwAXgBkAEAmAEAoAEAqgEAuAEDyAEAmAIBoAIHmAMAiAYBkAYIkgcBMaAHAA&sclient=gws-wiz-serp'); +SELECT base64UrlDecode('aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS9zZWFyY2g_cT1jbGlja2hvdXNlK2Jhc2U2NCtkZWNvZGUmc2NhX2Vzdj03MzlmOGJiMzgwZTRjN2VkJmVpPVRmUmlacUNESXJtbndQQVAyS0xSa0E4JnZlZD0wYWhVS0V3amczWkhpdHNtR0F4VzVFeEFJSFZoUkZQSVE0ZFVEQ0JBJnVhY3Q9NSZvcT1jbGlja2hvdXNlK2Jhc2U2NCtkZWNvZGUmZ3NfbHA9RWd4bmQzTXRkMmw2TFhObGNuQWlHR05zYVdOcmFHOTFjMlVnWW1GelpUWTBJR1JsWTI5a1pUSUtFQUFZc0FNWTFnUVlSeklLRUFBWXNBTVkxZ1FZUnpJS0VBQVlzQU1ZMWdRWVJ6SUtFQUFZc0FNWTFnUVlSeklLRUFBWXNBTVkxZ1FZUnpJS0VBQVlzQU1ZMWdRWVJ6SUtFQUFZc0FNWTFnUVlSeklLRUFBWXNBTVkxZ1FZUjBqWEJGQUFXQUJ3QVhnQmtBRUFtQUVBb0FFQXFnRUF1QUVEeUFFQW1BSUJvQUlIbUFNQWlBWUJrQVlJa2djQk1hQUhBQSZzY2xpZW50PWd3cy13aXotc2VycA'); +-- no padding +SELECT base64UrlDecode('aHR0cHM6Ly9jbGlj'); +-- one-byte padding +SELECT base64UrlDecode('aHR0cHM6Ly9jbGlja2g'); +-- two-bytes padding +SELECT base64UrlDecode('aHR0cHM6Ly9jbGljaw'); + +-- invalid inputs +SELECT base64UrlDecode('https://clickhouse.com'); -- { serverError INCORRECT_DATA } +SELECT 
base64UrlDecode('12?'); -- { serverError INCORRECT_DATA } + +-- test FixedString argument +SELECT base64UrlEncode(toFixedString('https://clickhouse.com', 22)); +SELECT base64UrlDecode(toFixedString('aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ', 30)); From 54630c932e18b182c11b9366bd97bf564323550b Mon Sep 17 00:00:00 2001 From: Mikhail Gorshkov Date: Fri, 7 Jun 2024 18:28:30 +0000 Subject: [PATCH 077/215] Style fix --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index c35e860a5d7..c48b931dc54 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1008,6 +1008,8 @@ Updatable Uppercased Uptime Uptrace +UrlDecode +UrlEncode UserID Util VARCHAR From e6f88126964f3018f07d4137e89541f8b45806a3 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Mon, 10 Jun 2024 06:53:25 +0000 Subject: [PATCH 078/215] fix mistake Signed-off-by: Duc Canh Le --- src/Common/filesystemHelpers.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/Common/filesystemHelpers.cpp b/src/Common/filesystemHelpers.cpp index f8d209bc11f..09c4508b7b2 100644 --- a/src/Common/filesystemHelpers.cpp +++ b/src/Common/filesystemHelpers.cpp @@ -1,5 +1,5 @@ #include "filesystemHelpers.h" -#include +#include #include #if defined(OS_LINUX) @@ -371,11 +371,10 @@ Poco::Timestamp getModificationTimestamp(const std::string & path) void setModificationTime(const std::string & path, time_t time) { - struct timeval times[2]; - times[0].tv_usec = times[1].tv_usec = 0; - times[0].tv_sec = ::time(nullptr); - times[1].tv_sec = time; - if (utimes(path.c_str(), times) != 0) + struct utimbuf tb; + tb.actime = time; + tb.modtime = time; + if (utime(path.c_str(), &tb) != 0) DB::ErrnoException::throwFromPath(DB::ErrorCodes::PATH_ACCESS_DENIED, path, "Cannot set modification time to file: {}", path); } From 0f6f86314faec7aea15b75210cf632032ccd2884 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 10 Jun 2024 09:37:52 +0200 Subject: [PATCH 079/215] Better code --- src/Common/GWPAsan.cpp | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/src/Common/GWPAsan.cpp b/src/Common/GWPAsan.cpp index 4c6c8c7e9cc..8e9cbf8e842 100644 --- a/src/Common/GWPAsan.cpp +++ b/src/Common/GWPAsan.cpp @@ -81,12 +81,6 @@ struct ScopedEndOfReportDecorator Poco::LoggerPtr log; }; -constexpr std::string_view unknown_crash_text = - "GWP-ASan cannot provide any more information about this error. This may " - "occur due to a wild memory access into the GWP-ASan pool, or an " - "overflow/underflow that is > 512B in length.\n"; - - // Prints the provided error and metadata information. void printHeader(gwp_asan::Error error, uintptr_t fault_address, const gwp_asan::AllocationMetadata * allocation_meta, Poco::LoggerPtr log) { @@ -164,6 +158,11 @@ void printReport([[maybe_unused]] uintptr_t fault_address) const gwp_asan::AllocationMetadata * allocation_meta = __gwp_asan_get_metadata(state, GuardedAlloc.getMetadataRegion(), fault_address); + static constexpr std::string_view unknown_crash_text = + "GWP-ASan cannot provide any more information about this error. 
This may " + "occur due to a wild memory access into the GWP-ASan pool, or an " + "overflow/underflow that is > 512B in length.\n"; + if (allocation_meta == nullptr) { LOG_FATAL(logger, "*** GWP-ASan detected a memory error ***"); From c6e43f7a7b74a8928c1a9bf0a572aadcb56e8c54 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 08:17:01 +0000 Subject: [PATCH 080/215] Bump absl to 2023-11-28 --- contrib/abseil-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index 3bd86026c93..0c09fd0ff0d 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit 3bd86026c93da5a40006fd53403dff9d5f5e30e3 +Subproject commit 0c09fd0ff0d502c30831ff2ccf59894e36d2b60a From 4d3d18cee71ecadf520868623130538c1d3179e3 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 08:18:22 +0000 Subject: [PATCH 081/215] Bump absl to 2023-12-06 --- contrib/abseil-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index 0c09fd0ff0d..8588e7d14dc 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit 0c09fd0ff0d502c30831ff2ccf59894e36d2b60a +Subproject commit 8588e7d14dca32eb2c695a9cd49d272aa23cc483 From 70c0589675d3c3c7f9a17d805818601fc0bd698e Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 08:19:20 +0000 Subject: [PATCH 082/215] Bump absl to 2023-12-12 --- contrib/abseil-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index 8588e7d14dc..ad0a6d2faf8 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit 8588e7d14dca32eb2c695a9cd49d272aa23cc483 +Subproject commit ad0a6d2faf803645c8126f0b67eee2eaad98bc3f From 1bca6b900bb55aaadecdb24f07a60b18b6677eb8 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 08:20:25 +0000 Subject: [PATCH 083/215] Bump absl to 2023-12-20 --- contrib/abseil-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index ad0a6d2faf8..794352a92f0 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit ad0a6d2faf803645c8126f0b67eee2eaad98bc3f +Subproject commit 794352a92f09425714b9116974b29e58ce8f9ba9 From 8fe272f210c7d214cedfaffa8eb72f73cb7756be Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 08:22:27 +0000 Subject: [PATCH 084/215] Bump absl to 2024-01-02 --- contrib/abseil-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index 794352a92f0..925a5e681ea 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit 794352a92f09425714b9116974b29e58ce8f9ba9 +Subproject commit 925a5e681ea1958171ba580c4402e5ce76473cb5 From a0d8d5a37ca944f6cb135444112f4906deb03371 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 08:31:33 +0000 Subject: [PATCH 085/215] Bump absl to 2024-01-02 --- contrib/abseil-cpp | 2 +- contrib/abseil-cpp-cmake/CMakeLists.txt | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index 925a5e681ea..4038192a57c 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit 925a5e681ea1958171ba580c4402e5ce76473cb5 +Subproject commit 4038192a57cb75f7ee671f81a3378ff4c74c4f8e diff --git a/contrib/abseil-cpp-cmake/CMakeLists.txt b/contrib/abseil-cpp-cmake/CMakeLists.txt index 
7372195bb0d..be42d98345e 100644 --- a/contrib/abseil-cpp-cmake/CMakeLists.txt +++ b/contrib/abseil-cpp-cmake/CMakeLists.txt @@ -1283,12 +1283,9 @@ absl_cc_library( absl_cc_library( NAME flags - SRCS - "${DIR}/flag.cc" HDRS "${DIR}/declare.h" "${DIR}/flag.h" - "${DIR}/internal/flag_msvc.inc" COPTS ${ABSL_DEFAULT_COPTS} LINKOPTS From d4a453aad5eaaceb993570bb369e9321bc17bcf8 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 08:32:22 +0000 Subject: [PATCH 086/215] Bump absl to 2024-03-06 --- contrib/abseil-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index 4038192a57c..6f0bb2747d0 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit 4038192a57cb75f7ee671f81a3378ff4c74c4f8e +Subproject commit 6f0bb2747d0a910de4a958eeeab2b9d615156382 From ae7d8821a78b5ff7a2228ca463323ca90c385a96 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 08:46:40 +0000 Subject: [PATCH 087/215] Bump absl to 2024-04-04 --- contrib/abseil-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index 6f0bb2747d0..1ec4a27e399 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit 6f0bb2747d0a910de4a958eeeab2b9d615156382 +Subproject commit 1ec4a27e39944462a574abbfa040498ed2831cc8 From 0da0d8dfb1fc73fb70926338f2e946ad5737d880 Mon Sep 17 00:00:00 2001 From: Mikhail Gorshkov Date: Mon, 10 Jun 2024 10:14:45 +0000 Subject: [PATCH 088/215] PR post-review fixes --- .../functions/string-functions.md | 4 + .../functions/string-functions.md | 4 + src/Functions/FunctionBase64Conversion.cpp | 93 +++++++++++++ src/Functions/FunctionBase64Conversion.h | 127 ++++++------------ src/Functions/base64Decode.cpp | 2 +- src/Functions/base64Encode.cpp | 2 +- src/Functions/base64UrlDecode.cpp | 2 +- src/Functions/base64UrlEncode.cpp | 2 +- src/Functions/tryBase64Decode.cpp | 2 +- src/Functions/tryBase64UrlDecode.cpp | 14 ++ .../03167_base64_url_functions.reference | 21 ++- .../03167_base64_url_functions.sql | 28 ++-- 12 files changed, 188 insertions(+), 113 deletions(-) create mode 100644 src/Functions/FunctionBase64Conversion.cpp create mode 100644 src/Functions/tryBase64UrlDecode.cpp diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 7365e0f4d27..6c8f09e74ce 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1158,6 +1158,10 @@ Decodes a base64-encoded URL (String or FixedString) according to [RFC 4648](htt Like `base64Decode` but returns an empty string in case of error. +## tryBase64UrlDecode + +Like `base64UrlDecode` but returns an empty string in case of error. + **Syntax** ```sql diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index c44cf94876f..fa76e84f130 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -556,6 +556,10 @@ SELECT base58Decode('3dc8KtHrwM'); Функционал аналогичен base64Decode, но при невозможности декодирования возвращает пустую строку. +## tryBase64UrlDecode(s) + +Функционал аналогичен base64UrlDecode, но при невозможности декодирования возвращает пустую строку. + ## endsWith(s, suffix) {#endswith} Возвращает 1, если строка завершается указанным суффиксом, и 0 в противном случае. 
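The two new functions only differ from base64Encode/base64Decode in the alphabet and padding rules of RFC 4648 section 5: '+' becomes '-', '/' becomes '_', and the trailing '=' padding is dropped, which is why the test expectations show 'https://clickhouse.com' encoding to 'aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ' with no padding characters. A minimal sketch of turning an already base64 encoded string into the URL-safe form, offered only to illustrate the character mapping and not as the ClickHouse implementation (which does this in postprocessBase64Url in the following diff), is:

    #include <string>

    // Illustrative only: standard base64 -> URL-safe base64 (RFC 4648, section 5).
    std::string toBase64Url(std::string encoded)
    {
        for (char & c : encoded)
        {
            if (c == '+')
                c = '-';
            else if (c == '/')
                c = '_';
        }
        while (!encoded.empty() && encoded.back() == '=')
            encoded.pop_back();
        return encoded;
    }

Decoding reverses the substitution and re-appends '=' until the length is a multiple of four, which is what the preprocessBase64Url step in the following diff does before handing the buffer to the aklomp base64 library.
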
diff --git a/src/Functions/FunctionBase64Conversion.cpp b/src/Functions/FunctionBase64Conversion.cpp new file mode 100644 index 00000000000..a87ce31f478 --- /dev/null +++ b/src/Functions/FunctionBase64Conversion.cpp @@ -0,0 +1,93 @@ +#include "config.h" +#if USE_BASE64 +# include +# include +# include + +namespace DB +{ + +std::vector preprocessBase64Url(const std::span src) +{ + std::vector padded_src{}; + // insert padding to please aklomp library + size_t padded_size = src.size(); + size_t remainder = padded_size % 4; + switch (remainder) + { + case 0: + break; // no padding needed + case 1: + padded_size += 3; // this case is impossible to occur, however, we'll insert padding anyway + break; + case 2: + padded_size += 2; // two bytes padding + break; + default: // remainder == 3 + padded_size += 1; // one byte padding + break; + } + padded_src.resize(padded_size); + + // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#page-7 + size_t i = 0; + for (; i < src.size(); ++i) + { + switch (src[i]) + { + case '_': + padded_src[i] = '/'; + break; + case '-': + padded_src[i] = '+'; + break; + default: + padded_src[i] = src[i]; + break; + } + } + if (remainder == 1) + { + padded_src[i] = '='; + ++i; + padded_src[i] = '='; + ++i; + padded_src[i] = '='; + } + else if (remainder == 2) + { + padded_src[i] = '='; + ++i; + padded_src[i] = '='; + } + else if (remainder == 3) + padded_src[i] = '='; + + return padded_src; +} + +size_t postprocessBase64Url(UInt8 * dst, size_t out_len) +{ + // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#page-7 + for (size_t i = 0; i < out_len; ++i) + { + switch (dst[i]) + { + case '/': + dst[i] = '_'; + break; + case '+': + dst[i] = '-'; + break; + case '=': // stop when padding is detected + return i; + default: + break; + } + } + return out_len; +} + +} + +#endif diff --git a/src/Functions/FunctionBase64Conversion.h b/src/Functions/FunctionBase64Conversion.h index 008ce0b7338..51ca28aa670 100644 --- a/src/Functions/FunctionBase64Conversion.h +++ b/src/Functions/FunctionBase64Conversion.h @@ -22,9 +22,19 @@ namespace ErrorCodes extern const int INCORRECT_DATA; } +enum class Base64Variant : uint8_t +{ + Normal, + Url +}; + +extern std::vector preprocessBase64Url(const std::span src); +extern size_t postprocessBase64Url(UInt8 * dst, size_t out_len); + +template struct Base64Encode { - static constexpr auto name = "base64Encode"; + static constexpr auto name = (variant == Base64Variant::Normal) ? "base64Encode" : "base64UrlEncode"; static size_t getBufferSize(size_t string_length, size_t string_count) { @@ -35,13 +45,18 @@ struct Base64Encode { size_t outlen = 0; base64_encode(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst), &outlen, 0); - return outlen; + + if constexpr (variant == Base64Variant::Url) + return postprocessBase64Url(dst, outlen); + else + return outlen; } }; +template struct Base64Decode { - static constexpr auto name = "base64Decode"; + static constexpr auto name = (variant == Base64Variant::Normal) ? 
"base64Decode" : "base64UrlDecode"; static size_t getBufferSize(size_t string_length, size_t string_count) { @@ -50,8 +65,17 @@ struct Base64Decode static size_t perform(const std::span src, UInt8 * dst) { + int rc; size_t outlen = 0; - int rc = base64_decode(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst), &outlen, 0); + if constexpr (variant == Base64Variant::Url) + { + auto src_padded = preprocessBase64Url(src); + rc = base64_decode(reinterpret_cast(src_padded.data()), src_padded.size(), reinterpret_cast(dst), &outlen, 0); + } + else + { + rc = base64_decode(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst), &outlen, 0); + } if (rc != 1) throw Exception( @@ -64,19 +88,29 @@ struct Base64Decode } }; +template struct TryBase64Decode { - static constexpr auto name = "tryBase64Decode"; + static constexpr auto name = (variant == Base64Variant::Normal) ? "tryBase64Decode" : "tryBase64UrlDecode"; static size_t getBufferSize(size_t string_length, size_t string_count) { - return Base64Decode::getBufferSize(string_length, string_count); + return Base64Decode::getBufferSize(string_length, string_count); } static size_t perform(const std::span src, UInt8 * dst) { + int rc; size_t outlen = 0; - int rc = base64_decode(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst), &outlen, 0); + if constexpr (variant == Base64Variant::Url) + { + auto src_padded = preprocessBase64Url(src); + rc = base64_decode(reinterpret_cast(src_padded.data()), src_padded.size(), reinterpret_cast(dst), &outlen, 0); + } + else + { + rc = base64_decode(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst), &outlen, 0); + } if (rc != 1) outlen = 0; @@ -85,85 +119,6 @@ struct TryBase64Decode } }; -struct Base64UrlEncode : Base64Encode -{ - static constexpr auto name = "base64UrlEncode"; - - static size_t perform(const std::span src, UInt8 * dst) - { - auto out_len = Base64Encode::perform(src, dst); - - // Do postprocessing as described in https://datatracker.ietf.org/doc/html/rfc4648#page-7 - for (size_t i = 0; i < out_len; ++i) - { - switch (dst[i]) - { - case '/': - dst[i] = '_'; - break; - case '+': - dst[i] = '-'; - break; - case '=': // stop when padding is detected - return i; - default: - break; - } - } - return out_len; - } -}; - -struct Base64UrlDecode : Base64Decode -{ - static constexpr auto name = "base64UrlDecode"; - - static size_t perform(const std::span src, UInt8 * dst) - { - std::vector tmp{}; - // insert padding to please alcomp library - auto size = src.size(); - auto remainder = size % 4; - switch (remainder) - { - case 0: - break; // no padding needed - case 1: - break; // invalid input, let it be detected by alcomp library - case 2: - size += 2; // two bytes padding - break; - default: // remainder == 3 - ++size; // one byte padding - } - tmp.resize(size); - - size_t i = 0; - for (; i < src.size(); ++i) - { - switch (src[i]) - { - case '_': - tmp[i] = '/'; - break; - case '-': - tmp[i] = '+'; - break; - default: - tmp[i] = src[i]; - break; - } - } - if (remainder == 2 || remainder == 3) - tmp[i++] = '='; - if (remainder == 2) - tmp[i++] = '='; - - return Base64Decode::perform(tmp, dst); - } -}; - - template class FunctionBase64Conversion : public IFunction { diff --git a/src/Functions/base64Decode.cpp b/src/Functions/base64Decode.cpp index 5f7a3406c62..a7a243b6d7d 100644 --- a/src/Functions/base64Decode.cpp +++ b/src/Functions/base64Decode.cpp @@ -7,7 +7,7 @@ namespace DB { REGISTER_FUNCTION(Base64Decode) { - factory.registerFunction>(); + 
factory.registerFunction>>(); /// MySQL compatibility alias. factory.registerAlias("FROM_BASE64", "base64Decode", FunctionFactory::CaseInsensitive); diff --git a/src/Functions/base64Encode.cpp b/src/Functions/base64Encode.cpp index 69268f5a25d..1599505a413 100644 --- a/src/Functions/base64Encode.cpp +++ b/src/Functions/base64Encode.cpp @@ -7,7 +7,7 @@ namespace DB { REGISTER_FUNCTION(Base64Encode) { - factory.registerFunction>(); + factory.registerFunction>>(); /// MySQL compatibility alias. factory.registerAlias("TO_BASE64", "base64Encode", FunctionFactory::CaseInsensitive); diff --git a/src/Functions/base64UrlDecode.cpp b/src/Functions/base64UrlDecode.cpp index fa2adac3c5f..1ed836768b8 100644 --- a/src/Functions/base64UrlDecode.cpp +++ b/src/Functions/base64UrlDecode.cpp @@ -7,7 +7,7 @@ namespace DB { REGISTER_FUNCTION(Base64UrlDecode) { - factory.registerFunction>(); + factory.registerFunction>>(); } } diff --git a/src/Functions/base64UrlEncode.cpp b/src/Functions/base64UrlEncode.cpp index a3775324145..9d959c6bbc6 100644 --- a/src/Functions/base64UrlEncode.cpp +++ b/src/Functions/base64UrlEncode.cpp @@ -7,7 +7,7 @@ namespace DB { REGISTER_FUNCTION(Base64UrlEncode) { - factory.registerFunction>(); + factory.registerFunction>>(); } } diff --git a/src/Functions/tryBase64Decode.cpp b/src/Functions/tryBase64Decode.cpp index bd452b8357b..da1a24fd776 100644 --- a/src/Functions/tryBase64Decode.cpp +++ b/src/Functions/tryBase64Decode.cpp @@ -7,7 +7,7 @@ namespace DB { REGISTER_FUNCTION(TryBase64Decode) { - factory.registerFunction>(); + factory.registerFunction>>(); } } diff --git a/src/Functions/tryBase64UrlDecode.cpp b/src/Functions/tryBase64UrlDecode.cpp new file mode 100644 index 00000000000..528018b26f9 --- /dev/null +++ b/src/Functions/tryBase64UrlDecode.cpp @@ -0,0 +1,14 @@ +#include + +#if USE_BASE64 +#include + +namespace DB +{ +REGISTER_FUNCTION(TryBase64UrlDecode) +{ + factory.registerFunction>>(); +} +} + +#endif diff --git a/tests/queries/0_stateless/03167_base64_url_functions.reference b/tests/queries/0_stateless/03167_base64_url_functions.reference index 075d1729cef..2a0d0013609 100644 --- a/tests/queries/0_stateless/03167_base64_url_functions.reference +++ b/tests/queries/0_stateless/03167_base64_url_functions.reference @@ -1,11 +1,10 @@ -aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ -https://clickhouse.com -MTI_ -12? 
-aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS9zZWFyY2g_cT1jbGlja2hvdXNlK2Jhc2U2NCtkZWNvZGUmc2NhX2Vzdj03MzlmOGJiMzgwZTRjN2VkJmVpPVRmUmlacUNESXJtbndQQVAyS0xSa0E4JnZlZD0wYWhVS0V3amczWkhpdHNtR0F4VzVFeEFJSFZoUkZQSVE0ZFVEQ0JBJnVhY3Q9NSZvcT1jbGlja2hvdXNlK2Jhc2U2NCtkZWNvZGUmZ3NfbHA9RWd4bmQzTXRkMmw2TFhObGNuQWlHR05zYVdOcmFHOTFjMlVnWW1GelpUWTBJR1JsWTI5a1pUSUtFQUFZc0FNWTFnUVlSeklLRUFBWXNBTVkxZ1FZUnpJS0VBQVlzQU1ZMWdRWVJ6SUtFQUFZc0FNWTFnUVlSeklLRUFBWXNBTVkxZ1FZUnpJS0VBQVlzQU1ZMWdRWVJ6SUtFQUFZc0FNWTFnUVlSeklLRUFBWXNBTVkxZ1FZUjBqWEJGQUFXQUJ3QVhnQmtBRUFtQUVBb0FFQXFnRUF1QUVEeUFFQW1BSUJvQUlIbUFNQWlBWUJrQVlJa2djQk1hQUhBQSZzY2xpZW50PWd3cy13aXotc2VycA -https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode&gs_lp=Egxnd3Mtd2l6LXNlcnAiGGNsaWNraG91c2UgYmFzZTY0IGRlY29kZTIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYR0jXBFAAWABwAXgBkAEAmAEAoAEAqgEAuAEDyAEAmAIBoAIHmAMAiAYBkAYIkgcBMaAHAA&sclient=gws-wiz-serp -https://clic -https://clickh -https://click -aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ -https://clickhouse.com +https://clickhouse.com aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ https://clickhouse.com https://clickhouse.com +12? MTI_ 12? 12? +https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS9zZWFyY2g_cT1jbGlja2hvdXNlK2Jhc2U2NCtkZWNvZGUmc2NhX2Vzdj03MzlmOGJiMzgwZTRjN2VkJmVpPVRmUmlacUNESXJtbndQQVAyS0xSa0E4JnZlZD0wYWhVS0V3amczWkhpdHNtR0F4VzVFeEFJSFZoUkZQSVE0ZFVEQ0JBJnVhY3Q9NSZvcT1jbGlja2hvdXNlK2Jhc2U2NCtkZWNvZGU https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode +aHR0cHM6Ly9jbGlj https://clic https://clic +aHR0cHM6Ly9jbGlja2g https://clickh https://clickh +aHR0cHM6Ly9jbGljaw https://click https://click + + + +https://clickhouse.com aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ https://clickhouse.com https://clickhouse.com diff --git a/tests/queries/0_stateless/03167_base64_url_functions.sql b/tests/queries/0_stateless/03167_base64_url_functions.sql index 908ca890be2..60bb1746e90 100644 --- a/tests/queries/0_stateless/03167_base64_url_functions.sql +++ b/tests/queries/0_stateless/03167_base64_url_functions.sql @@ -1,28 +1,34 @@ +-- incorrect number of arguments SELECT base64UrlEncode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } SELECT base64UrlDecode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT tryBase64UrlDecode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } SELECT base64UrlEncode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } SELECT base64UrlDecode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } +SELECT tryBase64UrlDecode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -- test with valid inputs -SELECT base64UrlEncode('https://clickhouse.com'); -SELECT base64UrlDecode('aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ'); +SELECT 'https://clickhouse.com' as original, base64UrlEncode(original) as encoded, base64UrlDecode(encoded) as decoded, 
tryBase64UrlDecode(encoded) as try_decoded; + -- encoding differs from base64Encode -SELECT base64UrlEncode('12?'); -SELECT base64UrlDecode('MTI_'); +SELECT '12?' as original, base64UrlEncode(original) as encoded, base64UrlDecode(encoded) as decoded, tryBase64UrlDecode(encoded) as try_decoded; + -- long string -SELECT base64UrlEncode('https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode&gs_lp=Egxnd3Mtd2l6LXNlcnAiGGNsaWNraG91c2UgYmFzZTY0IGRlY29kZTIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYRzIKEAAYsAMY1gQYR0jXBFAAWABwAXgBkAEAmAEAoAEAqgEAuAEDyAEAmAIBoAIHmAMAiAYBkAYIkgcBMaAHAA&sclient=gws-wiz-serp'); -SELECT base64UrlDecode('aHR0cHM6Ly93d3cuZ29vZ2xlLmNvbS9zZWFyY2g_cT1jbGlja2hvdXNlK2Jhc2U2NCtkZWNvZGUmc2NhX2Vzdj03MzlmOGJiMzgwZTRjN2VkJmVpPVRmUmlacUNESXJtbndQQVAyS0xSa0E4JnZlZD0wYWhVS0V3amczWkhpdHNtR0F4VzVFeEFJSFZoUkZQSVE0ZFVEQ0JBJnVhY3Q9NSZvcT1jbGlja2hvdXNlK2Jhc2U2NCtkZWNvZGUmZ3NfbHA9RWd4bmQzTXRkMmw2TFhObGNuQWlHR05zYVdOcmFHOTFjMlVnWW1GelpUWTBJR1JsWTI5a1pUSUtFQUFZc0FNWTFnUVlSeklLRUFBWXNBTVkxZ1FZUnpJS0VBQVlzQU1ZMWdRWVJ6SUtFQUFZc0FNWTFnUVlSeklLRUFBWXNBTVkxZ1FZUnpJS0VBQVlzQU1ZMWdRWVJ6SUtFQUFZc0FNWTFnUVlSeklLRUFBWXNBTVkxZ1FZUjBqWEJGQUFXQUJ3QVhnQmtBRUFtQUVBb0FFQXFnRUF1QUVEeUFFQW1BSUJvQUlIbUFNQWlBWUJrQVlJa2djQk1hQUhBQSZzY2xpZW50PWd3cy13aXotc2VycA'); +SELECT 'https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode' as original, base64UrlEncode(original) as encoded, base64UrlDecode(encoded) as decoded, tryBase64UrlDecode(encoded) as try_decoded; + -- no padding -SELECT base64UrlDecode('aHR0cHM6Ly9jbGlj'); +SELECT 'aHR0cHM6Ly9jbGlj' as encoded, base64UrlDecode(encoded) as decoded, tryBase64UrlDecode(encoded) as try_decoded; -- one-byte padding -SELECT base64UrlDecode('aHR0cHM6Ly9jbGlja2g'); +SELECT 'aHR0cHM6Ly9jbGlja2g' as encoded, base64UrlDecode(encoded) as decoded, tryBase64UrlDecode(encoded) as try_decoded; -- two-bytes padding -SELECT base64UrlDecode('aHR0cHM6Ly9jbGljaw'); +SELECT 'aHR0cHM6Ly9jbGljaw' as encoded, base64UrlDecode(encoded) as decoded, tryBase64UrlDecode(encoded) as try_decoded; -- invalid inputs SELECT base64UrlDecode('https://clickhouse.com'); -- { serverError INCORRECT_DATA } +SELECT tryBase64UrlDecode('https://clickhouse.com'); SELECT base64UrlDecode('12?'); -- { serverError INCORRECT_DATA } +SELECT tryBase64UrlDecode('12?'); +SELECT base64UrlDecode('aHR0cHM6Ly9jbGlja'); -- { serverError INCORRECT_DATA } +SELECT tryBase64UrlDecode('aHR0cHM6Ly9jbGlja'); -- test FixedString argument -SELECT base64UrlEncode(toFixedString('https://clickhouse.com', 22)); -SELECT base64UrlDecode(toFixedString('aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ', 30)); +SELECT toFixedString('https://clickhouse.com', 22) as original, base64UrlEncode(original) as encoded, base64UrlDecode(encoded) as decoded, tryBase64UrlDecode(encoded) as try_decoded; From 61b464321759e8edf0fad69aa80c8b0daef1818c Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 10:39:35 +0000 Subject: [PATCH 089/215] Bump absl to 2024-04-24 --- contrib/abseil-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index 1ec4a27e399..08b21bd0379 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit 
1ec4a27e39944462a574abbfa040498ed2831cc8 +Subproject commit 08b21bd037990c18d44fda1691211e73835bf214 From 643444eb1134c9e3767efeb1698e1553b1c686af Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 10:41:47 +0000 Subject: [PATCH 090/215] Bump absl to 2024-05-03 --- contrib/abseil-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index 08b21bd0379..c1e1b47d989 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit 08b21bd037990c18d44fda1691211e73835bf214 +Subproject commit c1e1b47d989978cde8c5a2a219df425b785a0c47 From af08c2bc13c06f369dcdb1a19e36663fcd9e5531 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 10 Jun 2024 12:50:02 +0200 Subject: [PATCH 091/215] Increase default sample rate --- src/Common/GWPAsan.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Common/GWPAsan.cpp b/src/Common/GWPAsan.cpp index 8e9cbf8e842..088f34fa6ae 100644 --- a/src/Common/GWPAsan.cpp +++ b/src/Common/GWPAsan.cpp @@ -56,6 +56,9 @@ static bool guarded_alloc_initialized = [] if (!env_options_raw || !std::string_view{env_options_raw}.contains("MaxSimultaneousAllocations")) opts.MaxSimultaneousAllocations = 1024; + if (!env_options_raw || !std::string_view{env_options_raw}.contains("SampleRate")) + opts.SampleRate = 10000; + opts.Backtrace = getBackTrace; GuardedAlloc.init(opts); From c95ed40d3eb7db5fcef1a5a51c3964e11cb77f56 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 11:32:29 +0000 Subject: [PATCH 092/215] Bump absl to 2024-05-06 --- contrib/abseil-cpp | 2 +- contrib/abseil-cpp-cmake/CMakeLists.txt | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index c1e1b47d989..a28ee5b51c9 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit c1e1b47d989978cde8c5a2a219df425b785a0c47 +Subproject commit a28ee5b51c9ea41707d9a5d2d358ad77850264c4 diff --git a/contrib/abseil-cpp-cmake/CMakeLists.txt b/contrib/abseil-cpp-cmake/CMakeLists.txt index be42d98345e..d026a7c78bc 100644 --- a/contrib/abseil-cpp-cmake/CMakeLists.txt +++ b/contrib/abseil-cpp-cmake/CMakeLists.txt @@ -1058,8 +1058,10 @@ absl_cc_library( demangle_internal HDRS "${DIR}/internal/demangle.h" + "${DIR}/internal/demangle_rust.h" SRCS "${DIR}/internal/demangle.cc" + "${DIR}/internal/demangle_rust.cc" COPTS ${ABSL_DEFAULT_COPTS} DEPS From da91dd64283de30172fca6cd30df4c711d291b44 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 11:35:07 +0000 Subject: [PATCH 093/215] Bump absl to 2024-06-07 --- contrib/abseil-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index a28ee5b51c9..696b32788ca 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit a28ee5b51c9ea41707d9a5d2d358ad77850264c4 +Subproject commit 696b32788ca887881547380530926314c521ea7d From 2056ed8ee83ce8bca37492d521d5a3d2f7d19242 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Mon, 10 Jun 2024 13:54:46 +0200 Subject: [PATCH 094/215] Capture weak_ptr for safety --- src/Access/ContextAccess.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index 2a658d7aaa2..28a825de6cf 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -360,10 +360,13 @@ void ContextAccess::setUser(const UserPtr & user_) const subscription_for_roles_changes.reset(); 
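The subscription change in this hunk replaces a [this] capture with a weak_from_this() capture, so a roles-change callback that fires after the owning object is gone becomes a no-op instead of touching freed memory. Below is a self-contained sketch of that capture pattern; the Subscriber and CallbackRegistry names are hypothetical and only the discipline of locking the weak_ptr before use mirrors the patch.

// Self-contained sketch of the weak_from_this() capture pattern used in this hunk.
// Names (Subscriber, CallbackRegistry) are hypothetical; only the capture
// discipline mirrors the patch: the callback locks a weak_ptr and bails out
// if the owner is already gone, instead of dereferencing a dangling `this`.
#include <functional>
#include <iostream>
#include <memory>
#include <mutex>
#include <vector>

using Callback = std::function<void(int)>;

struct CallbackRegistry
{
    std::vector<Callback> callbacks;
    void notify(int value) { for (auto & cb : callbacks) cb(value); }
};

class Subscriber : public std::enable_shared_from_this<Subscriber>
{
public:
    void subscribe(CallbackRegistry & registry)
    {
        // Capturing `this` here would be unsafe: the registry may fire after
        // the Subscriber has been destroyed. A weak_ptr makes that case a no-op.
        registry.callbacks.push_back([weak = weak_from_this()](int value)
        {
            auto self = weak.lock();
            if (!self)
                return;                     // owner already destroyed
            std::lock_guard lock{self->mutex};
            self->last_value = value;
            std::cout << "got " << value << '\n';
        });
    }

private:
    std::mutex mutex;
    int last_value = 0;
};

int main()
{
    CallbackRegistry registry;
    auto subscriber = std::make_shared<Subscriber>();
    subscriber->subscribe(registry);
    registry.notify(1);     // prints "got 1"
    subscriber.reset();     // destroy the owner
    registry.notify(2);     // safely ignored
}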
enabled_roles = access_control->getEnabledRoles(current_roles, current_roles_with_admin_option); - subscription_for_roles_changes = enabled_roles->subscribeForChanges([this](const std::shared_ptr & roles_info_) + subscription_for_roles_changes = enabled_roles->subscribeForChanges([weak_ptr = weak_from_this()](const std::shared_ptr & roles_info_) { - std::lock_guard lock{mutex}; - setRolesInfo(roles_info_); + auto ptr = weak_ptr.lock(); + if (!ptr) + return; + std::lock_guard lock{ptr->mutex}; + ptr->setRolesInfo(roles_info_); }); setRolesInfo(enabled_roles->getRolesInfo()); From 9e538b70bb452022d5a3dff0509a901032432c49 Mon Sep 17 00:00:00 2001 From: Mikhail Gorshkov Date: Mon, 10 Jun 2024 12:16:03 +0000 Subject: [PATCH 095/215] Try to fix base64Url functions registration --- src/Functions/base64Decode.cpp | 1 + src/Functions/base64Encode.cpp | 1 + src/Functions/base64UrlDecode.cpp | 14 -------------- src/Functions/base64UrlEncode.cpp | 14 -------------- src/Functions/tryBase64Decode.cpp | 1 + src/Functions/tryBase64UrlDecode.cpp | 14 -------------- 6 files changed, 3 insertions(+), 42 deletions(-) delete mode 100644 src/Functions/base64UrlDecode.cpp delete mode 100644 src/Functions/base64UrlEncode.cpp delete mode 100644 src/Functions/tryBase64UrlDecode.cpp diff --git a/src/Functions/base64Decode.cpp b/src/Functions/base64Decode.cpp index a7a243b6d7d..2c0cf27c592 100644 --- a/src/Functions/base64Decode.cpp +++ b/src/Functions/base64Decode.cpp @@ -8,6 +8,7 @@ namespace DB REGISTER_FUNCTION(Base64Decode) { factory.registerFunction>>(); + factory.registerFunction>>(); /// MySQL compatibility alias. factory.registerAlias("FROM_BASE64", "base64Decode", FunctionFactory::CaseInsensitive); diff --git a/src/Functions/base64Encode.cpp b/src/Functions/base64Encode.cpp index 1599505a413..07ca28d6a87 100644 --- a/src/Functions/base64Encode.cpp +++ b/src/Functions/base64Encode.cpp @@ -8,6 +8,7 @@ namespace DB REGISTER_FUNCTION(Base64Encode) { factory.registerFunction>>(); + factory.registerFunction>>(); /// MySQL compatibility alias. 
factory.registerAlias("TO_BASE64", "base64Encode", FunctionFactory::CaseInsensitive); diff --git a/src/Functions/base64UrlDecode.cpp b/src/Functions/base64UrlDecode.cpp deleted file mode 100644 index 1ed836768b8..00000000000 --- a/src/Functions/base64UrlDecode.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include - -#if USE_BASE64 -#include - -namespace DB -{ -REGISTER_FUNCTION(Base64UrlDecode) -{ - factory.registerFunction>>(); -} -} - -#endif diff --git a/src/Functions/base64UrlEncode.cpp b/src/Functions/base64UrlEncode.cpp deleted file mode 100644 index 9d959c6bbc6..00000000000 --- a/src/Functions/base64UrlEncode.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include - -#if USE_BASE64 -#include - -namespace DB -{ -REGISTER_FUNCTION(Base64UrlEncode) -{ - factory.registerFunction>>(); -} -} - -#endif diff --git a/src/Functions/tryBase64Decode.cpp b/src/Functions/tryBase64Decode.cpp index da1a24fd776..25da111492d 100644 --- a/src/Functions/tryBase64Decode.cpp +++ b/src/Functions/tryBase64Decode.cpp @@ -8,6 +8,7 @@ namespace DB REGISTER_FUNCTION(TryBase64Decode) { factory.registerFunction>>(); + factory.registerFunction>>(); } } diff --git a/src/Functions/tryBase64UrlDecode.cpp b/src/Functions/tryBase64UrlDecode.cpp deleted file mode 100644 index 528018b26f9..00000000000 --- a/src/Functions/tryBase64UrlDecode.cpp +++ /dev/null @@ -1,14 +0,0 @@ -#include - -#if USE_BASE64 -#include - -namespace DB -{ -REGISTER_FUNCTION(TryBase64UrlDecode) -{ - factory.registerFunction>>(); -} -} - -#endif From 8af077f3d3cd891768b310b59b42696691578245 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 12:53:06 +0000 Subject: [PATCH 096/215] Update build descriptions --- contrib/abseil-cpp-cmake/CMakeLists.txt | 258 ++++++++++++++++++++---- 1 file changed, 218 insertions(+), 40 deletions(-) diff --git a/contrib/abseil-cpp-cmake/CMakeLists.txt b/contrib/abseil-cpp-cmake/CMakeLists.txt index d026a7c78bc..a9e79be4f09 100644 --- a/contrib/abseil-cpp-cmake/CMakeLists.txt +++ b/contrib/abseil-cpp-cmake/CMakeLists.txt @@ -1,6 +1,8 @@ set(ABSL_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/abseil-cpp") set(ABSL_COMMON_INCLUDE_DIRS "${ABSL_ROOT_DIR}") +# This is a minimized version of the function definition in CMake/AbseilHelpers.cmake + # # Copyright 2017 The Abseil Authors. # @@ -16,7 +18,6 @@ set(ABSL_COMMON_INCLUDE_DIRS "${ABSL_ROOT_DIR}") # See the License for the specific language governing permissions and # limitations under the License. # - function(absl_cc_library) cmake_parse_arguments(ABSL_CC_LIB "DISABLE_INSTALL;PUBLIC;TESTONLY" @@ -76,6 +77,12 @@ function(absl_cc_library) add_library(absl::${ABSL_CC_LIB_NAME} ALIAS ${_NAME}) endfunction() +# The following definitions are an amalgamation of the CMakeLists.txt files in absl/*/ +# To refresh them when upgrading to a new version: +# - copy them over from upstream +# - remove calls of 'absl_cc_test' +# - remove calls of `absl_cc_library` that contain `TESTONLY` +# - append '${DIR}' to the file definitions set(DIR ${ABSL_ROOT_DIR}/absl/algorithm) @@ -102,12 +109,12 @@ absl_cc_library( absl::algorithm absl::core_headers absl::meta + absl::nullability PUBLIC ) set(DIR ${ABSL_ROOT_DIR}/absl/base) -# Internal-only target, do not depend on directly. 
absl_cc_library( NAME atomic_hook @@ -146,6 +153,18 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} ) +absl_cc_library( + NAME + no_destructor + HDRS + "${DIR}/no_destructor.h" + DEPS + absl::config + absl::nullability + COPTS + ${ABSL_DEFAULT_COPTS} +) + absl_cc_library( NAME nullability @@ -305,6 +324,8 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} LINKOPTS ${ABSL_DEFAULT_LINKOPTS} + $<$:-lrt> + $<$:-ladvapi32> DEPS absl::atomic_hook absl::base_internal @@ -312,6 +333,7 @@ absl_cc_library( absl::core_headers absl::dynamic_annotations absl::log_severity + absl::nullability absl::raw_logging_internal absl::spinlock_wait absl::type_traits @@ -357,6 +379,7 @@ absl_cc_library( absl::base absl::config absl::core_headers + absl::nullability PUBLIC ) @@ -467,10 +490,11 @@ absl_cc_library( LINKOPTS ${ABSL_DEFAULT_LINKOPTS} DEPS - absl::container_common absl::common_policy_traits absl::compare absl::compressed_tuple + absl::config + absl::container_common absl::container_memory absl::cord absl::core_headers @@ -480,7 +504,6 @@ absl_cc_library( absl::strings absl::throw_delegate absl::type_traits - absl::utility ) # Internal-only target, do not depend on directly. @@ -523,7 +546,9 @@ absl_cc_library( COPTS ${ABSL_DEFAULT_COPTS} DEPS + absl::base_internal absl::compressed_tuple + absl::config absl::core_headers absl::memory absl::span @@ -548,18 +573,6 @@ absl_cc_library( PUBLIC ) -# Internal-only target, do not depend on directly. -absl_cc_library( - NAME - counting_allocator - HDRS - "${DIR}/internal/counting_allocator.h" - COPTS - ${ABSL_DEFAULT_COPTS} - DEPS - absl::config -) - absl_cc_library( NAME flat_hash_map @@ -570,7 +583,7 @@ absl_cc_library( DEPS absl::container_memory absl::core_headers - absl::hash_function_defaults + absl::hash_container_defaults absl::raw_hash_map absl::algorithm_container absl::memory @@ -586,7 +599,7 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} DEPS absl::container_memory - absl::hash_function_defaults + absl::hash_container_defaults absl::raw_hash_set absl::algorithm_container absl::core_headers @@ -604,7 +617,7 @@ absl_cc_library( DEPS absl::container_memory absl::core_headers - absl::hash_function_defaults + absl::hash_container_defaults absl::node_slot_policy absl::raw_hash_map absl::algorithm_container @@ -620,8 +633,9 @@ absl_cc_library( COPTS ${ABSL_DEFAULT_COPTS} DEPS + absl::container_memory absl::core_headers - absl::hash_function_defaults + absl::hash_container_defaults absl::node_slot_policy absl::raw_hash_set absl::algorithm_container @@ -629,6 +643,19 @@ absl_cc_library( PUBLIC ) +absl_cc_library( + NAME + hash_container_defaults + HDRS + "${DIR}/hash_container_defaults.h" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config + absl::hash_function_defaults + PUBLIC +) + # Internal-only target, do not depend on directly. 
absl_cc_library( NAME @@ -655,9 +682,11 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} DEPS absl::config + absl::container_common absl::cord absl::hash absl::strings + absl::type_traits PUBLIC ) @@ -703,6 +732,7 @@ absl_cc_library( absl::base absl::config absl::exponential_biased + absl::no_destructor absl::raw_logging_internal absl::sample_recorder absl::synchronization @@ -756,7 +786,9 @@ absl_cc_library( COPTS ${ABSL_DEFAULT_COPTS} DEPS + absl::config absl::container_memory + absl::core_headers absl::raw_hash_set absl::throw_delegate PUBLIC @@ -817,6 +849,7 @@ absl_cc_library( DEPS absl::config absl::core_headers + absl::debugging_internal absl::meta absl::strings absl::span @@ -931,6 +964,7 @@ absl_cc_library( absl::crc32c absl::config absl::strings + absl::no_destructor ) set(DIR ${ABSL_ROOT_DIR}/absl/debugging) @@ -954,6 +988,8 @@ absl_cc_library( "${DIR}/stacktrace.cc" COPTS ${ABSL_DEFAULT_COPTS} + LINKOPTS + $<$:${EXECINFO_LIBRARY}> DEPS absl::debugging_internal absl::config @@ -980,6 +1016,7 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} LINKOPTS ${ABSL_DEFAULT_LINKOPTS} + $<$:-ldbghelp> DEPS absl::debugging_internal absl::demangle_internal @@ -1254,6 +1291,7 @@ absl_cc_library( absl::strings absl::synchronization absl::flat_hash_map + absl::no_destructor ) # Internal-only target, do not depend on directly. @@ -1298,7 +1336,6 @@ absl_cc_library( absl::flags_config absl::flags_internal absl::flags_reflection - absl::base absl::core_headers absl::strings ) @@ -1378,6 +1415,9 @@ absl_cc_library( absl::synchronization ) +############################################################################ +# Unit tests in alphabetical order. + set(DIR ${ABSL_ROOT_DIR}/absl/functional) absl_cc_library( @@ -1430,6 +1470,18 @@ absl_cc_library( PUBLIC ) +absl_cc_library( + NAME + overload + HDRS + "${DIR}/overload.h" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::meta + PUBLIC +) + set(DIR ${ABSL_ROOT_DIR}/absl/hash) absl_cc_library( @@ -1639,6 +1691,7 @@ absl_cc_library( absl::log_internal_conditions absl::log_internal_message absl::log_internal_strip + absl::absl_vlog_is_on ) absl_cc_library( @@ -1720,6 +1773,7 @@ absl_cc_library( absl::log_entry absl::log_severity absl::log_sink + absl::no_destructor absl::raw_logging_internal absl::synchronization absl::span @@ -1770,6 +1824,7 @@ absl_cc_library( LINKOPTS ${ABSL_DEFAULT_LINKOPTS} DEPS + absl::core_headers absl::log_internal_message absl::log_internal_nullstream absl::log_severity @@ -1875,6 +1930,11 @@ absl_cc_library( PUBLIC ) +# Warning: Many linkers will strip the contents of this library because its +# symbols are only used in a global constructor. A workaround is for clients +# to link this using $ instead of +# the plain absl::log_flags. +# TODO(b/320467376): Implement the equivalent of Bazel's alwayslink=True. 
absl_cc_library( NAME log_flags @@ -1896,6 +1956,7 @@ absl_cc_library( absl::flags absl::flags_marshalling absl::strings + absl::vlog_config_internal PUBLIC ) @@ -1918,6 +1979,7 @@ absl_cc_library( absl::log_severity absl::raw_logging_internal absl::strings + absl::vlog_config_internal ) absl_cc_library( @@ -1951,6 +2013,7 @@ absl_cc_library( ${ABSL_DEFAULT_LINKOPTS} DEPS absl::log_internal_log_impl + absl::vlog_is_on PUBLIC ) @@ -2063,21 +2126,75 @@ absl_cc_library( ) absl_cc_library( - NAME - log_internal_fnmatch - SRCS - "${DIR}/internal/fnmatch.cc" - HDRS - "${DIR}/internal/fnmatch.h" - COPTS - ${ABSL_DEFAULT_COPTS} - LINKOPTS - ${ABSL_DEFAULT_LINKOPTS} - DEPS - absl::config - absl::strings + NAME + vlog_config_internal + SRCS + "${DIR}/internal/vlog_config.cc" + HDRS + "${DIR}/internal/vlog_config.h" + COPTS + ${ABSL_DEFAULT_COPTS} + LINKOPTS + ${ABSL_DEFAULT_LINKOPTS} + DEPS + absl::base + absl::config + absl::core_headers + absl::log_internal_fnmatch + absl::memory + absl::no_destructor + absl::strings + absl::synchronization + absl::optional ) +absl_cc_library( + NAME + absl_vlog_is_on + COPTS + ${ABSL_DEFAULT_COPTS} + LINKOPTS + ${ABSL_DEFAULT_LINKOPTS} + HDRS + "${DIR}/absl_vlog_is_on.h" + DEPS + absl::vlog_config_internal + absl::config + absl::core_headers + absl::strings +) + +absl_cc_library( + NAME + vlog_is_on + COPTS + ${ABSL_DEFAULT_COPTS} + LINKOPTS + ${ABSL_DEFAULT_LINKOPTS} + HDRS + "${DIR}/vlog_is_on.h" + DEPS + absl::absl_vlog_is_on +) + +absl_cc_library( + NAME + log_internal_fnmatch + SRCS + "${DIR}/internal/fnmatch.cc" + HDRS + "${DIR}/internal/fnmatch.h" + COPTS + ${ABSL_DEFAULT_COPTS} + LINKOPTS + ${ABSL_DEFAULT_LINKOPTS} + DEPS + absl::config + absl::strings +) + +# Test targets + set(DIR ${ABSL_ROOT_DIR}/absl/memory) absl_cc_library( @@ -2146,6 +2263,7 @@ absl_cc_library( COPTS ${ABSL_DEFAULT_COPTS} DEPS + absl::compare absl::config absl::core_headers absl::bits @@ -2175,6 +2293,8 @@ absl_cc_library( PUBLIC ) +set(DIR ${ABSL_ROOT_DIR}/absl/profiling) + absl_cc_library( NAME sample_recorder @@ -2187,8 +2307,6 @@ absl_cc_library( absl::synchronization ) -set(DIR ${ABSL_ROOT_DIR}/absl/profiling) - absl_cc_library( NAME exponential_biased @@ -2264,6 +2382,7 @@ absl_cc_library( LINKOPTS ${ABSL_DEFAULT_LINKOPTS} DEPS + absl::config absl::fast_type_id absl::optional ) @@ -2335,11 +2454,13 @@ absl_cc_library( DEPS absl::config absl::inlined_vector + absl::nullability absl::random_internal_pool_urbg absl::random_internal_salted_seed_seq absl::random_internal_seed_material absl::random_seed_gen_exception absl::span + absl::string_view ) # Internal-only target, do not depend on directly. @@ -2398,6 +2519,7 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} LINKOPTS ${ABSL_DEFAULT_LINKOPTS} + $<$:-lbcrypt> DEPS absl::core_headers absl::optional @@ -2657,6 +2779,29 @@ absl_cc_library( absl::config ) +# Internal-only target, do not depend on directly. +absl_cc_library( + NAME + random_internal_distribution_test_util + SRCS + "${DIR}/internal/chi_square.cc" + "${DIR}/internal/distribution_test_util.cc" + HDRS + "${DIR}/internal/chi_square.h" + "${DIR}/internal/distribution_test_util.h" + COPTS + ${ABSL_DEFAULT_COPTS} + LINKOPTS + ${ABSL_DEFAULT_LINKOPTS} + DEPS + absl::config + absl::core_headers + absl::raw_logging_internal + absl::strings + absl::str_format + absl::span +) + # Internal-only target, do not depend on directly. 
absl_cc_library( NAME @@ -2698,6 +2843,8 @@ absl_cc_library( absl::function_ref absl::inlined_vector absl::memory + absl::no_destructor + absl::nullability absl::optional absl::raw_logging_internal absl::span @@ -2723,8 +2870,11 @@ absl_cc_library( absl::base absl::config absl::core_headers + absl::has_ostream_operator + absl::nullability absl::raw_logging_internal absl::status + absl::str_format absl::strings absl::type_traits absl::utility @@ -2747,6 +2897,7 @@ absl_cc_library( absl::base absl::config absl::core_headers + absl::nullability absl::throw_delegate PUBLIC ) @@ -2761,6 +2912,7 @@ absl_cc_library( "${DIR}/has_absl_stringify.h" "${DIR}/internal/damerau_levenshtein_distance.h" "${DIR}/internal/string_constant.h" + "${DIR}/internal/has_absl_stringify.h" "${DIR}/match.h" "${DIR}/numbers.h" "${DIR}/str_cat.h" @@ -2804,6 +2956,7 @@ absl_cc_library( absl::endian absl::int128 absl::memory + absl::nullability absl::raw_logging_internal absl::throw_delegate absl::type_traits @@ -2823,6 +2976,18 @@ absl_cc_library( PUBLIC ) +absl_cc_library( + NAME + has_ostream_operator + HDRS + "${DIR}/has_ostream_operator.h" + COPTS + ${ABSL_DEFAULT_COPTS} + DEPS + absl::config + PUBLIC +) + # Internal-only target, do not depend on directly. absl_cc_library( NAME @@ -2850,11 +3015,16 @@ absl_cc_library( NAME str_format HDRS - "${DIR}/str_format.h" + "str_format.h" COPTS ${ABSL_DEFAULT_COPTS} DEPS + absl::config + absl::core_headers + absl::nullability + absl::span absl::str_format_internal + absl::string_view PUBLIC ) @@ -2885,6 +3055,7 @@ absl_cc_library( absl::strings absl::config absl::core_headers + absl::fixed_array absl::inlined_vector absl::numeric_representation absl::type_traits @@ -2988,6 +3159,7 @@ absl_cc_library( DEPS absl::base absl::config + absl::no_destructor absl::raw_logging_internal absl::synchronization ) @@ -3078,6 +3250,7 @@ absl_cc_library( absl::endian absl::function_ref absl::inlined_vector + absl::nullability absl::optional absl::raw_logging_internal absl::span @@ -3245,6 +3418,8 @@ absl_cc_library( ${ABSL_DEFAULT_COPTS} DEPS Threads::Threads + # TODO(#1495): Use $ once our + # minimum CMake version >= 3.24 $<$:-Wl,-framework,CoreFoundation> ) @@ -3254,7 +3429,7 @@ absl_cc_library( NAME any HDRS - "${DIR}/any.h" + "any.h" COPTS ${ABSL_DEFAULT_COPTS} DEPS @@ -3285,8 +3460,8 @@ absl_cc_library( NAME bad_any_cast_impl SRCS - "${DIR}/bad_any_cast.h" - "${DIR}/bad_any_cast.cc" + "${DIR}/bad_any_cast.h" + "${DIR}/bad_any_cast.cc" COPTS ${ABSL_DEFAULT_COPTS} DEPS @@ -3306,6 +3481,7 @@ absl_cc_library( DEPS absl::algorithm absl::core_headers + absl::nullability absl::throw_delegate absl::type_traits PUBLIC @@ -3326,6 +3502,7 @@ absl_cc_library( absl::config absl::core_headers absl::memory + absl::nullability absl::type_traits absl::utility PUBLIC @@ -3388,6 +3565,7 @@ absl_cc_library( COPTS ${ABSL_DEFAULT_COPTS} DEPS + absl::config absl::core_headers absl::type_traits PUBLIC From 598219c57dac54d9000d2fe338b523007e13be21 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 12:56:21 +0000 Subject: [PATCH 097/215] Minor update --- contrib/abseil-cpp-cmake/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/abseil-cpp-cmake/CMakeLists.txt b/contrib/abseil-cpp-cmake/CMakeLists.txt index a9e79be4f09..4137547b736 100644 --- a/contrib/abseil-cpp-cmake/CMakeLists.txt +++ b/contrib/abseil-cpp-cmake/CMakeLists.txt @@ -3015,7 +3015,7 @@ absl_cc_library( NAME str_format HDRS - "str_format.h" + "${DIR}/str_format.h" COPTS 
${ABSL_DEFAULT_COPTS} DEPS @@ -3429,7 +3429,7 @@ absl_cc_library( NAME any HDRS - "any.h" + "${DIR}/any.h" COPTS ${ABSL_DEFAULT_COPTS} DEPS From 1f17ddc6fe35be95736b448ebb3b73123c034196 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 13:06:44 +0000 Subject: [PATCH 098/215] Update .clang-tidy --- .clang-tidy | 1 - 1 file changed, 1 deletion(-) diff --git a/.clang-tidy b/.clang-tidy index 896052915f7..de19059d09e 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -37,7 +37,6 @@ Checks: [ '-cert-oop54-cpp', '-cert-oop57-cpp', - '-clang-analyzer-optin.core.EnumCastOutOfRange', # https://github.com/abseil/abseil-cpp/issues/1667 '-clang-analyzer-optin.performance.Padding', '-clang-analyzer-unix.Malloc', From e849b21cbaafa5171daad76c6b05d606d81fb0d4 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 10 Jun 2024 15:15:44 +0200 Subject: [PATCH 099/215] Increase even more --- src/Common/GWPAsan.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/GWPAsan.cpp b/src/Common/GWPAsan.cpp index 088f34fa6ae..488f8e2c5dc 100644 --- a/src/Common/GWPAsan.cpp +++ b/src/Common/GWPAsan.cpp @@ -57,7 +57,7 @@ static bool guarded_alloc_initialized = [] opts.MaxSimultaneousAllocations = 1024; if (!env_options_raw || !std::string_view{env_options_raw}.contains("SampleRate")) - opts.SampleRate = 10000; + opts.SampleRate = 50000; opts.Backtrace = getBackTrace; GuardedAlloc.init(opts); From 1fb5b35dd9a926de4eca1d83b7b1bfc29347d253 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Mon, 10 Jun 2024 13:53:25 +0000 Subject: [PATCH 100/215] explicitly define formatter for StringRef Signed-off-by: Duc Canh Le --- base/base/StringRef.h | 4 ++++ src/Coordination/KeeperSnapshotManager.cpp | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/base/base/StringRef.h b/base/base/StringRef.h index 24af84626de..fc0674b8440 100644 --- a/base/base/StringRef.h +++ b/base/base/StringRef.h @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include @@ -376,3 +378,5 @@ namespace PackedZeroTraits std::ostream & operator<<(std::ostream & os, const StringRef & str); + +template<> struct fmt::formatter : fmt::ostream_formatter {}; diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index 23ff714a929..f25ccab86b1 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -466,7 +466,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial #ifdef NDEBUG /// TODO (alesapin) remove this, it should be always CORRUPTED_DATA. 
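The StringRef.h hunk above pins the formatter down explicitly: with fmt 9 and later, a type that already provides operator<< no longer gets implicit ostream-based formatting, so it opts into fmt::format by specializing fmt::formatter and inheriting from fmt::ostream_formatter. A minimal standalone illustration of that bridge follows, using a hypothetical Point type rather than StringRef.

// Minimal illustration of the fmt::ostream_formatter bridge used for StringRef
// above: a type with a stream operator opts into fmt::format by inheriting its
// formatter specialization from fmt::ostream_formatter (available in fmt >= 9).
// The Point type is hypothetical; only the mechanism mirrors the patch.
#include <iostream>
#include <fmt/format.h>
#include <fmt/ostream.h>

struct Point
{
    int x = 0;
    int y = 0;
};

std::ostream & operator<<(std::ostream & os, const Point & p)
{
    return os << '(' << p.x << ", " << p.y << ')';
}

template <>
struct fmt::formatter<Point> : fmt::ostream_formatter {};

int main()
{
    Point p{1, 2};
    std::cout << fmt::format("point = {}", p) << '\n';   // point = (1, 2)
}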
LOG_ERROR(getLogger("KeeperSnapshotManager"), "Children counter in stat.numChildren {}" - " is different from actual children size {} for node {}", itr.value.numChildren(), itr.value.getChildren().size(), itr.key.toView()); + " is different from actual children size {} for node {}", itr.value.numChildren(), itr.value.getChildren().size(), itr.key); #else throw Exception(ErrorCodes::LOGICAL_ERROR, "Children counter in stat.numChildren {}" " is different from actual children size {} for node {}", From b3ca9cbaf23b4e5686eff8e39d15073eb4466cd5 Mon Sep 17 00:00:00 2001 From: Mikhail Gorshkov Date: Mon, 10 Jun 2024 14:00:36 +0000 Subject: [PATCH 101/215] no-fasttest tag for base64 functions --- tests/queries/0_stateless/03167_base64_url_functions.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/03167_base64_url_functions.sql b/tests/queries/0_stateless/03167_base64_url_functions.sql index 60bb1746e90..2152002e412 100644 --- a/tests/queries/0_stateless/03167_base64_url_functions.sql +++ b/tests/queries/0_stateless/03167_base64_url_functions.sql @@ -1,3 +1,5 @@ +-- Tags: no-fasttest + -- incorrect number of arguments SELECT base64UrlEncode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } SELECT base64UrlDecode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } From f026cc43cd2572894f1e86fcc75650b97c1bbcd9 Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Mon, 10 Jun 2024 11:17:10 -0300 Subject: [PATCH 102/215] safe guard around no proxy strings --- .../EnvironmentProxyConfigurationResolver.cpp | 17 +++++------------ .../proxyConfigurationToPocoProxyConfig.cpp | 5 +++++ 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/Common/EnvironmentProxyConfigurationResolver.cpp b/src/Common/EnvironmentProxyConfigurationResolver.cpp index fff2d354e3a..b7b1f1ecfde 100644 --- a/src/Common/EnvironmentProxyConfigurationResolver.cpp +++ b/src/Common/EnvironmentProxyConfigurationResolver.cpp @@ -37,16 +37,9 @@ namespace } } - std::string getNoProxyHostsString() + const char * getNoProxyHosts() { - const char * no_proxy = std::getenv(NO_PROXY_ENVIRONMENT_VARIABLE); // NOLINT(concurrency-mt-unsafe) - - if (!no_proxy) - { - return ""; - } - - return no_proxy; + return std::getenv(NO_PROXY_ENVIRONMENT_VARIABLE); // NOLINT(concurrency-mt-unsafe) } ProxyConfiguration buildProxyConfiguration( @@ -86,7 +79,8 @@ ProxyConfiguration EnvironmentProxyConfigurationResolver::resolve() { static const auto * http_proxy_host = getProxyHost(Protocol::HTTP); static const auto * https_proxy_host = getProxyHost(Protocol::HTTPS); - static const auto no_proxy_hosts_string = buildPocoNonProxyHosts(getNoProxyHostsString()); + static const auto * no_proxy = getNoProxyHosts(); + static const auto poco_no_proxy_hosts = no_proxy ? buildPocoNonProxyHosts(no_proxy) : ""; static const Poco::URI http_proxy_uri(http_proxy_host ? http_proxy_host : ""); static const Poco::URI https_proxy_uri(https_proxy_host ? https_proxy_host : ""); @@ -94,9 +88,8 @@ ProxyConfiguration EnvironmentProxyConfigurationResolver::resolve() return buildProxyConfiguration( request_protocol, request_protocol == Protocol::HTTP ? 
http_proxy_uri : https_proxy_uri, - no_proxy_hosts_string, + poco_no_proxy_hosts, disable_tunneling_for_https_requests_over_http_proxy); - } } diff --git a/src/Common/proxyConfigurationToPocoProxyConfig.cpp b/src/Common/proxyConfigurationToPocoProxyConfig.cpp index f64dbc3bc02..c06014ac2dc 100644 --- a/src/Common/proxyConfigurationToPocoProxyConfig.cpp +++ b/src/Common/proxyConfigurationToPocoProxyConfig.cpp @@ -53,6 +53,11 @@ std::string buildPocoRegexpEntryWithoutLeadingDot(const std::string & host) * */ std::string buildPocoNonProxyHosts(const std::string & no_proxy_hosts_string) { + if (no_proxy_hosts_string.empty()) + { + return ""; + } + static constexpr auto OR_SEPARATOR = "|"; static constexpr auto MATCH_ANYTHING = R"(.*)"; static constexpr auto MATCH_SUBDOMAINS_REGEX = R"((?:.*\.)?)"; From 6b459ee2d2bddb1a11b499880ff1b296de4f7a7d Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Mon, 10 Jun 2024 14:46:12 +0000 Subject: [PATCH 103/215] remove unused ErrorCodes Signed-off-by: Duc Canh Le --- src/Processors/Transforms/FilterTransform.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Processors/Transforms/FilterTransform.cpp b/src/Processors/Transforms/FilterTransform.cpp index f3b3d8127d4..cd87019a8e0 100644 --- a/src/Processors/Transforms/FilterTransform.cpp +++ b/src/Processors/Transforms/FilterTransform.cpp @@ -14,7 +14,6 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_COLUMN_FOR_FILTER; - extern const int LOGICAL_ERROR; } static void replaceFilterToConstant(Block & block, const String & filter_column_name) From 33766797c1de8523dc3e66418777733debe7396d Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 10 Jun 2024 15:37:40 +0000 Subject: [PATCH 104/215] Refactor query plan prewhere optimization for Merge --- .../Optimizations/filterPushDown.cpp | 9 + .../Optimizations/optimizePrewhere.cpp | 7 +- .../QueryPlan/SourceStepWithFilter.h | 5 - src/Storages/StorageMerge.cpp | 313 +++++++++--------- src/Storages/StorageMerge.h | 26 +- .../02156_storage_merge_prewhere.reference | 18 +- 6 files changed, 205 insertions(+), 173 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 8ca240b3e8b..263598bdca7 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -608,6 +609,14 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes return 3; } + if (auto * read_from_merge = typeid_cast(child.get())) + { + FilterDAGInfo info{filter->getExpression(), filter->getFilterColumnName(), filter->removesFilterColumn()}; + read_from_merge->addFilter(std::move(info)); + std::swap(*parent_node, *child_node); + return 1; + } + return 0; } diff --git a/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp b/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp index fbd9b451ddc..1ce9d1482c9 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp @@ -4,10 +4,10 @@ #include #include #include +#include #include #include #include - namespace DB { @@ -30,7 +30,7 @@ static void removeFromOutput(ActionsDAG & dag, const std::string name) void optimizePrewhere(Stack & stack, QueryPlan::Nodes &) { - if (stack.size() < 3) + if (stack.size() < 2) return; auto & frame = stack.back(); @@ -45,6 +45,9 @@ void 
optimizePrewhere(Stack & stack, QueryPlan::Nodes &) if (!source_step_with_filter) return; + if (typeid_cast(frame.node->step.get())) + return; + const auto & storage_snapshot = source_step_with_filter->getStorageSnapshot(); const auto & storage = storage_snapshot->storage; if (!storage.canMoveConditionsToPrewhere()) diff --git a/src/Processors/QueryPlan/SourceStepWithFilter.h b/src/Processors/QueryPlan/SourceStepWithFilter.h index 0971b99d828..126d4824fff 100644 --- a/src/Processors/QueryPlan/SourceStepWithFilter.h +++ b/src/Processors/QueryPlan/SourceStepWithFilter.h @@ -49,11 +49,6 @@ public: filter_dags.push_back(std::move(filter_dag)); } - void addFilterFromParentStep(const ActionsDAG::Node * filter_node) - { - filter_nodes.nodes.push_back(filter_node); - } - /// Apply filters that can optimize reading from storage. void applyFilters() { diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 4c678a1228b..b42a8ed90ed 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -37,6 +37,8 @@ #include #include #include +#include +#include #include #include #include @@ -402,10 +404,14 @@ ReadFromMerge::ReadFromMerge( { } -void ReadFromMerge::updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info_value) +void ReadFromMerge::addFilter(FilterDAGInfo filter) { - SourceStepWithFilter::updatePrewhereInfo(prewhere_info_value); - common_header = applyPrewhereActions(common_header, prewhere_info); + output_stream->header = FilterTransform::transformHeader( + output_stream->header, + filter.actions.get(), + filter.column_name, + filter.do_remove_column); + pushed_down_filters.push_back(std::move(filter)); } void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &) @@ -441,14 +447,11 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu auto modified_query_info = getModifiedQueryInfo(modified_context, table, nested_storage_snaphsot, real_column_names, column_names_as_aliases, aliases); - auto source_pipeline = createSources( + auto source_pipeline = buildPipeline( child_plan.plan, nested_storage_snaphsot, modified_query_info, common_processed_stage, - common_header, - child_plan.table_aliases, - child_plan.row_policy_data_opt, table); if (source_pipeline && source_pipeline->initialized()) @@ -651,6 +654,7 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ res.back().plan = createPlanForTable( nested_storage_snaphsot, + aliases, modified_query_info, common_processed_stage, required_max_block_size, @@ -660,8 +664,17 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ modified_context, current_streams); res.back().plan.addInterpreterContext(modified_context); - } + // createSources1( + // res.back().plan, + // nested_storage_snaphsot, + // modified_query_info, + // common_processed_stage, + // common_header, + // res.back().table_aliases, + // row_policy_data_opt, + // table); + } return res; } @@ -1019,7 +1032,7 @@ bool recursivelyApplyToReadingSteps(QueryPlan::Node * node, const std::function< return ok; } -QueryPipelineBuilderPtr ReadFromMerge::createSources( +void ReadFromMerge::updatePlan( QueryPlan & plan, const StorageSnapshotPtr & storage_snapshot_, SelectQueryInfo & modified_query_info, @@ -1027,22 +1040,124 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( const Block & header, const Aliases & aliases, const RowPolicyDataOpt & row_policy_data_opt, - const StorageWithLockAndName & storage_with_lock, - bool concat_streams) const + const 
StorageWithLockAndName & storage_with_lock) const { if (!plan.isInitialized()) - return std::make_unique(); - - QueryPipelineBuilderPtr builder; + return; const auto & [database_name, storage, _, table_name] = storage_with_lock; bool allow_experimental_analyzer = context->getSettingsRef().allow_experimental_analyzer; auto storage_stage = storage->getQueryProcessingStage(context, processed_stage, storage_snapshot_, modified_query_info); - builder = plan.buildQueryPipeline( - QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); + /// Add virtual columns if we don't already have them. + Block plan_header = plan.getCurrentDataStream().header; + + if (allow_experimental_analyzer) + { + String table_alias = modified_query_info.query_tree->as()->getJoinTree()->as()->getAlias(); + + String database_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? "_database" : table_alias + "._database"; + String table_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? "_table" : table_alias + "._table"; + + if (has_database_virtual_column && common_header.has(database_column) + && storage_stage == QueryProcessingStage::FetchColumns && !plan_header.has(database_column)) + { + ColumnWithTypeAndName column; + column.name = database_column; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(database_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto expression_step = std::make_unique(plan.getCurrentDataStream(), adding_column_dag); + plan.addStep(std::move(expression_step)); + plan_header = plan.getCurrentDataStream().header; + } + + if (has_table_virtual_column && common_header.has(table_column) + && storage_stage == QueryProcessingStage::FetchColumns && !plan_header.has(table_column)) + { + ColumnWithTypeAndName column; + column.name = table_column; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(table_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto expression_step = std::make_unique(plan.getCurrentDataStream(), adding_column_dag); + plan.addStep(std::move(expression_step)); + plan_header = plan.getCurrentDataStream().header; + } + } + else + { + if (has_database_virtual_column && common_header.has("_database") && !plan_header.has("_database")) + { + ColumnWithTypeAndName column; + column.name = "_database"; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(database_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto expression_step = std::make_unique(plan.getCurrentDataStream(), adding_column_dag); + plan.addStep(std::move(expression_step)); + plan_header = plan.getCurrentDataStream().header; + } + + if (has_table_virtual_column && common_header.has("_table") && !plan_header.has("_table")) + { + ColumnWithTypeAndName column; + column.name = "_table"; + column.type = std::make_shared(std::make_shared()); + column.column = column.type->createColumnConst(0, Field(table_name)); + + auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); + auto expression_step = std::make_unique(plan.getCurrentDataStream(), adding_column_dag); + plan.addStep(std::move(expression_step)); + plan_header = plan.getCurrentDataStream().header; + } + } + + /// 
Subordinary tables could have different but convertible types, like numeric types of different width. + /// We must return streams with structure equals to structure of Merge table. + convertAndFilterSourceStream( + header, modified_query_info, storage_snapshot_, aliases, row_policy_data_opt, context, plan, storage_stage); + + for (const auto & filter_info : pushed_down_filters) + { + auto filter_step = std::make_unique( + plan.getCurrentDataStream(), + filter_info.actions->clone(), + filter_info.column_name, + filter_info.do_remove_column); + + plan.addStep(std::move(filter_step)); + } +} + +QueryPipelineBuilderPtr ReadFromMerge::buildPipeline( + QueryPlan & plan, + const StorageSnapshotPtr & storage_snapshot_, + SelectQueryInfo & modified_query_info, + QueryProcessingStage::Enum processed_stage, + const StorageWithLockAndName & storage_with_lock) const +{ + if (!plan.isInitialized()) + return nullptr; + + const auto & [database_name, storage, _, table_name] = storage_with_lock; + auto storage_stage + = storage->getQueryProcessingStage(context, processed_stage, storage_snapshot_, modified_query_info); + + auto optimisation_settings = QueryPlanOptimizationSettings::fromContext(context); + /// All optimisations will be done at plans creation + optimisation_settings.optimize_plan = false; + auto builder = plan.buildQueryPipeline(optimisation_settings, BuildQueryPipelineSettings::fromContext(context)); + + if (!builder->initialized()) + return builder; + + bool allow_experimental_analyzer = context->getSettingsRef().allow_experimental_analyzer; if (processed_stage > storage_stage || (allow_experimental_analyzer && processed_stage != QueryProcessingStage::FetchColumns)) { /** Materialization is needed, since from distributed storage the constants come materialized. @@ -1052,93 +1167,11 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( builder->addSimpleTransform([](const Block & stream_header) { return std::make_shared(stream_header); }); } - if (builder->initialized()) + if (builder->getNumStreams() > 1) { - if (concat_streams && builder->getNumStreams() > 1) - { - // It's possible to have many tables read from merge, resize(1) might open too many files at the same time. - // Using concat instead. - builder->addTransform(std::make_shared(builder->getHeader(), builder->getNumStreams())); - } - - /// Add virtual columns if we don't already have them. - - Block pipe_header = builder->getHeader(); - - if (allow_experimental_analyzer) - { - String table_alias = modified_query_info.query_tree->as()->getJoinTree()->as()->getAlias(); - - String database_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? "_database" : table_alias + "._database"; - String table_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? 
"_table" : table_alias + "._table"; - - if (has_database_virtual_column && common_header.has(database_column) - && storage_stage == QueryProcessingStage::FetchColumns && !pipe_header.has(database_column)) - { - ColumnWithTypeAndName column; - column.name = database_column; - column.type = std::make_shared(std::make_shared()); - column.column = column.type->createColumnConst(0, Field(database_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), ExpressionActionsSettings::fromContext(context, CompileExpressions::yes)); - - builder->addSimpleTransform([&](const Block & stream_header) - { return std::make_shared(stream_header, adding_column_actions); }); - } - - if (has_table_virtual_column && common_header.has(table_column) - && storage_stage == QueryProcessingStage::FetchColumns && !pipe_header.has(table_column)) - { - ColumnWithTypeAndName column; - column.name = table_column; - column.type = std::make_shared(std::make_shared()); - column.column = column.type->createColumnConst(0, Field(table_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), ExpressionActionsSettings::fromContext(context, CompileExpressions::yes)); - - builder->addSimpleTransform([&](const Block & stream_header) - { return std::make_shared(stream_header, adding_column_actions); }); - } - } - else - { - if (has_database_virtual_column && common_header.has("_database") && !pipe_header.has("_database")) - { - ColumnWithTypeAndName column; - column.name = "_database"; - column.type = std::make_shared(std::make_shared()); - column.column = column.type->createColumnConst(0, Field(database_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), ExpressionActionsSettings::fromContext(context, CompileExpressions::yes)); - builder->addSimpleTransform([&](const Block & stream_header) - { return std::make_shared(stream_header, adding_column_actions); }); - } - - if (has_table_virtual_column && common_header.has("_table") && !pipe_header.has("_table")) - { - ColumnWithTypeAndName column; - column.name = "_table"; - column.type = std::make_shared(std::make_shared()); - column.column = column.type->createColumnConst(0, Field(table_name)); - - auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto adding_column_actions = std::make_shared( - std::move(adding_column_dag), ExpressionActionsSettings::fromContext(context, CompileExpressions::yes)); - builder->addSimpleTransform([&](const Block & stream_header) - { return std::make_shared(stream_header, adding_column_actions); }); - } - } - - /// Subordinary tables could have different but convertible types, like numeric types of different width. - /// We must return streams with structure equals to structure of Merge table. - convertAndFilterSourceStream( - header, modified_query_info, storage_snapshot_, aliases, row_policy_data_opt, context, *builder, storage_stage); + // It's possible to have many tables read from merge, resize(1) might open too many files at the same time. + // Using concat instead. 
+ builder->addTransform(std::make_shared(builder->getHeader(), builder->getNumStreams())); } return builder; @@ -1146,6 +1179,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( QueryPlan ReadFromMerge::createPlanForTable( const StorageSnapshotPtr & storage_snapshot_, + const Aliases & table_aliases, SelectQueryInfo & modified_query_info, QueryProcessingStage::Enum processed_stage, UInt64 max_block_size, @@ -1248,6 +1282,19 @@ QueryPlan ReadFromMerge::createPlanForTable( } } + updatePlan( + plan, + storage_snapshot_, + modified_query_info, + common_processed_stage, + common_header, + table_aliases, + row_policy_data_opt, + storage_with_lock); + + if (plan.isInitialized()) + plan.optimize(QueryPlanOptimizationSettings::fromContext(modified_context)); + return plan; } @@ -1306,12 +1353,10 @@ void ReadFromMerge::RowPolicyData::addStorageFilter(SourceStepWithFilter * step) step->addFilter(actions_dag, filter_column_name); } -void ReadFromMerge::RowPolicyData::addFilterTransform(QueryPipelineBuilder & builder) const +void ReadFromMerge::RowPolicyData::addFilterTransform(QueryPlan & plan) const { - builder.addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, filter_actions, filter_column_name, true /* remove filter column */); - }); + auto filter_step = std::make_unique(plan.getCurrentDataStream(), actions_dag, filter_column_name, true /* remove filter column */); + plan.addStep(std::move(filter_step)); } StorageMerge::StorageListWithLocks ReadFromMerge::getSelectedTables( @@ -1490,13 +1535,13 @@ void ReadFromMerge::convertAndFilterSourceStream( const Aliases & aliases, const RowPolicyDataOpt & row_policy_data_opt, ContextPtr local_context, - QueryPipelineBuilder & builder, + QueryPlan & plan, QueryProcessingStage::Enum processed_stage) { - Block before_block_header = builder.getHeader(); + Block before_block_header = plan.getCurrentDataStream().header; auto storage_sample_block = snapshot->metadata->getSampleBlock(); - auto pipe_columns = builder.getHeader().getNamesAndTypesList(); + auto pipe_columns = before_block_header.getNamesAndTypesList(); if (local_context->getSettingsRef().allow_experimental_analyzer) { @@ -1519,13 +1564,8 @@ void ReadFromMerge::convertAndFilterSourceStream( throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected to have 1 output but got {}", nodes.size()); actions_dag->addOrReplaceInOutputs(actions_dag->addAlias(*nodes.front(), alias.name)); - - auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - - builder.addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, actions); - }); + auto expression_step = std::make_unique(plan.getCurrentDataStream(), actions_dag); + plan.addStep(std::move(expression_step)); } } else @@ -1539,12 +1579,8 @@ void ReadFromMerge::convertAndFilterSourceStream( auto dag = std::make_shared(pipe_columns); auto actions_dag = expression_analyzer.getActionsDAG(true, false); - auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - - builder.addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, actions); - }); + auto expression_step = std::make_unique(plan.getCurrentDataStream(), actions_dag); + plan.addStep(std::move(expression_step)); } } @@ -1556,20 +1592,15 @@ void ReadFromMerge::convertAndFilterSourceStream( if (row_policy_data_opt) { - 
row_policy_data_opt->addFilterTransform(builder); + row_policy_data_opt->addFilterTransform(plan); } - auto convert_actions_dag = ActionsDAG::makeConvertingActions(builder.getHeader().getColumnsWithTypeAndName(), + auto convert_actions_dag = ActionsDAG::makeConvertingActions(plan.getCurrentDataStream().header.getColumnsWithTypeAndName(), header.getColumnsWithTypeAndName(), convert_actions_match_columns_mode); - auto actions = std::make_shared( - std::move(convert_actions_dag), - ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - builder.addSimpleTransform([&](const Block & stream_header) - { - return std::make_shared(stream_header, actions); - }); + auto expression_step = std::make_unique(plan.getCurrentDataStream(), convert_actions_dag); + plan.addStep(std::move(expression_step)); } const ReadFromMerge::StorageListWithLocks & ReadFromMerge::getSelectedTables() @@ -1606,29 +1637,11 @@ bool ReadFromMerge::requestReadingInOrder(InputOrderInfoPtr order_info_) return true; } -void ReadFromMerge::applyFilters(const QueryPlan & plan, const ActionDAGNodes & added_filter_nodes) const -{ - auto apply_filters = [&added_filter_nodes](ReadFromMergeTree & read_from_merge_tree) - { - for (const auto & node : added_filter_nodes.nodes) - read_from_merge_tree.addFilterFromParentStep(node); - - read_from_merge_tree.SourceStepWithFilter::applyFilters(); - return true; - }; - - recursivelyApplyToReadingSteps(plan.getRootNode(), apply_filters); -} - void ReadFromMerge::applyFilters(ActionDAGNodes added_filter_nodes) { SourceStepWithFilter::applyFilters(added_filter_nodes); filterTablesAndCreateChildrenPlans(); - - for (const auto & child_plan : *child_plans) - if (child_plan.plan.isInitialized()) - applyFilters(child_plan.plan, added_filter_nodes); } QueryPlanRawPtrs ReadFromMerge::getChildPlans() diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 735c8711a63..42544676bd8 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -165,7 +165,7 @@ public: QueryPlanRawPtrs getChildPlans() override; - void updatePrewhereInfo(const PrewhereInfoPtr & prewhere_info_value) override; + void addFilter(FilterDAGInfo filter); private: const size_t required_max_block_size; @@ -221,7 +221,7 @@ private: /// Create explicit filter transform to exclude /// rows that are not conform to row level policy - void addFilterTransform(QueryPipelineBuilder &) const; + void addFilterTransform(QueryPlan &) const; private: std::string filter_column_name; // complex filter, may contain logic operations @@ -243,14 +243,16 @@ private: /// It's needed to guarantee lifetime for child steps to be the same as for this step (mainly for EXPLAIN PIPELINE). std::optional> child_plans; + /// Store filters pushed down from query plan optimization. Filters are added on top of child plans. 
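The shape this refactoring gives ReadFromMerge: filters pushed down from the parent plan are recorded on the step (addFilter / pushed_down_filters) and re-applied as a FilterStep on top of every child table's plan, so each child plan can keep optimizing them, e.g. into PREWHERE, instead of filtering the already-concatenated result. The toy model below shows only that shape; it is a standalone sketch, not ClickHouse's QueryPlan API.

// Toy model of the pushdown implemented in this commit: the Merge step remembers
// predicates pushed down from the parent plan and applies them on top of each
// child before concatenation. Standalone illustration with invented names.
#include <functional>
#include <iostream>
#include <vector>

using Row = int;
using Predicate = std::function<bool(Row)>;

struct MergeSource
{
    std::vector<std::vector<Row>> children;     // one row set per child table
    std::vector<Predicate> pushed_down_filters; // filters accepted from the parent

    void addFilter(Predicate p) { pushed_down_filters.push_back(std::move(p)); }

    std::vector<Row> read() const
    {
        std::vector<Row> result;
        for (const auto & child : children)         // one "child plan" per table
            for (Row row : child)
            {
                bool keep = true;
                for (const auto & filter : pushed_down_filters)
                    keep = keep && filter(row);     // filter applied per child
                if (keep)
                    result.push_back(row);          // concat of already-filtered children
            }
        return result;
    }
};

int main()
{
    MergeSource merge{{{1, 2, 3}, {4, 5, 6}}, {}};
    merge.addFilter([](Row r) { return r % 2 == 0; });
    for (Row r : merge.read())
        std::cout << r << ' ';                      // 2 4 6
    std::cout << '\n';
}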
+ std::vector pushed_down_filters; + std::vector createChildrenPlans(SelectQueryInfo & query_info_) const; void filterTablesAndCreateChildrenPlans(); - void applyFilters(const QueryPlan & plan, const ActionDAGNodes & added_filter_nodes) const; - QueryPlan createPlanForTable( const StorageSnapshotPtr & storage_snapshot, + const Aliases & table_aliases, SelectQueryInfo & query_info, QueryProcessingStage::Enum processed_stage, UInt64 max_block_size, @@ -260,7 +262,7 @@ private: ContextMutablePtr modified_context, size_t streams_num) const; - QueryPipelineBuilderPtr createSources( + void updatePlan( QueryPlan & plan, const StorageSnapshotPtr & storage_snapshot, SelectQueryInfo & modified_query_info, @@ -268,8 +270,14 @@ private: const Block & header, const Aliases & aliases, const RowPolicyDataOpt & row_policy_data_opt, - const StorageWithLockAndName & storage_with_lock, - bool concat_streams = false) const; + const StorageWithLockAndName & storage_with_lock) const; + + QueryPipelineBuilderPtr buildPipeline( + QueryPlan & plan, + const StorageSnapshotPtr & storage_snapshot, + SelectQueryInfo & modified_query_info, + QueryProcessingStage::Enum processed_stage, + const StorageWithLockAndName & storage_with_lock) const; static void convertAndFilterSourceStream( const Block & header, @@ -278,15 +286,13 @@ private: const Aliases & aliases, const RowPolicyDataOpt & row_policy_data_opt, ContextPtr context, - QueryPipelineBuilder & builder, + QueryPlan & plan, QueryProcessingStage::Enum processed_stage); StorageMerge::StorageListWithLocks getSelectedTables( ContextPtr query_context, bool filter_by_database_virtual_column, bool filter_by_table_virtual_column) const; - - // static VirtualColumnsDescription createVirtuals(StoragePtr first_table); }; } diff --git a/tests/queries/0_stateless/02156_storage_merge_prewhere.reference b/tests/queries/0_stateless/02156_storage_merge_prewhere.reference index 86a36a9392c..5632b333c5e 100644 --- a/tests/queries/0_stateless/02156_storage_merge_prewhere.reference +++ b/tests/queries/0_stateless/02156_storage_merge_prewhere.reference @@ -1,15 +1,21 @@ - Prewhere info - Prewhere filter - Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) + Filter column: and(equals(k, 3), notEmpty(v)) (removed) Prewhere info Prewhere filter Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) + Filter column: and(equals(k, 3), notEmpty(v)) (removed) Prewhere info Prewhere filter Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) 2 - Filter column: and(equals(k, 3), notEmpty(v)) (removed) + Filter column: and(equals(k, 3), notEmpty(v)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) 2 - Filter column: and(equals(k, 3), notEmpty(v)) (removed) - Filter column: and(equals(k, 3), notEmpty(v)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) 2 From fdb32e825c6053c1bf4fecbb5293ddb8f6e747b9 Mon Sep 17 00:00:00 2001 From: Mikhail Gorshkov Date: Mon, 10 Jun 2024 15:39:38 +0000 Subject: [PATCH 105/215] PR post-review fixes --- src/Functions/FunctionBase64Conversion.cpp | 93 ---------------------- src/Functions/FunctionBase64Conversion.h | 83 ++++++++++++++++--- 2 files changed, 71 insertions(+), 105 deletions(-) delete mode 100644 src/Functions/FunctionBase64Conversion.cpp diff --git a/src/Functions/FunctionBase64Conversion.cpp 
b/src/Functions/FunctionBase64Conversion.cpp deleted file mode 100644 index a87ce31f478..00000000000 --- a/src/Functions/FunctionBase64Conversion.cpp +++ /dev/null @@ -1,93 +0,0 @@ -#include "config.h" -#if USE_BASE64 -# include -# include -# include - -namespace DB -{ - -std::vector preprocessBase64Url(const std::span src) -{ - std::vector padded_src{}; - // insert padding to please aklomp library - size_t padded_size = src.size(); - size_t remainder = padded_size % 4; - switch (remainder) - { - case 0: - break; // no padding needed - case 1: - padded_size += 3; // this case is impossible to occur, however, we'll insert padding anyway - break; - case 2: - padded_size += 2; // two bytes padding - break; - default: // remainder == 3 - padded_size += 1; // one byte padding - break; - } - padded_src.resize(padded_size); - - // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#page-7 - size_t i = 0; - for (; i < src.size(); ++i) - { - switch (src[i]) - { - case '_': - padded_src[i] = '/'; - break; - case '-': - padded_src[i] = '+'; - break; - default: - padded_src[i] = src[i]; - break; - } - } - if (remainder == 1) - { - padded_src[i] = '='; - ++i; - padded_src[i] = '='; - ++i; - padded_src[i] = '='; - } - else if (remainder == 2) - { - padded_src[i] = '='; - ++i; - padded_src[i] = '='; - } - else if (remainder == 3) - padded_src[i] = '='; - - return padded_src; -} - -size_t postprocessBase64Url(UInt8 * dst, size_t out_len) -{ - // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#page-7 - for (size_t i = 0; i < out_len; ++i) - { - switch (dst[i]) - { - case '/': - dst[i] = '_'; - break; - case '+': - dst[i] = '-'; - break; - case '=': // stop when padding is detected - return i; - default: - break; - } - } - return out_len; -} - -} - -#endif diff --git a/src/Functions/FunctionBase64Conversion.h b/src/Functions/FunctionBase64Conversion.h index 51ca28aa670..8d73d0c70df 100644 --- a/src/Functions/FunctionBase64Conversion.h +++ b/src/Functions/FunctionBase64Conversion.h @@ -28,8 +28,67 @@ enum class Base64Variant : uint8_t Url }; -extern std::vector preprocessBase64Url(const std::span src); -extern size_t postprocessBase64Url(UInt8 * dst, size_t out_len); +inline std::string preprocessBase64Url(std::span src) +{ + std::string padded_src; + // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#page-7 + for (size_t i = 0; i < src.size(); ++i) + { + switch (src[i]) + { + case '_': + padded_src += '/'; + break; + case '-': + padded_src += '+'; + break; + default: + padded_src += src[i]; + break; + } + } + + // insert padding to please aklomp library + size_t remainder = src.size() % 4; + switch (remainder) + { + case 0: + break; // no padding needed + case 1: + padded_src.append("==="); // this case is impossible to occur, however, we'll insert padding anyway + break; + case 2: + padded_src.append("=="); // two bytes padding + break; + default: // remainder == 3 + padded_src.append("="); // one byte padding + break; + } + + return padded_src; +} + +inline size_t postprocessBase64Url(UInt8 * dst, size_t out_len) +{ + // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#page-7 + for (size_t i = 0; i < out_len; ++i) + { + switch (dst[i]) + { + case '/': + dst[i] = '_'; + break; + case '+': + dst[i] = '-'; + break; + case '=': // stop when padding is detected + return i; + default: + break; + } + } + return out_len; +} template struct Base64Encode @@ -41,15 +100,15 @@ 
struct Base64Encode return ((string_length - string_count) / 3 + string_count) * 4 + string_count; } - static size_t perform(const std::span src, UInt8 * dst) + static size_t perform(std::span src, UInt8 * dst) { size_t outlen = 0; base64_encode(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst), &outlen, 0); if constexpr (variant == Base64Variant::Url) - return postprocessBase64Url(dst, outlen); - else - return outlen; + outlen = postprocessBase64Url(dst, outlen); + + return outlen; } }; @@ -63,14 +122,14 @@ struct Base64Decode return ((string_length - string_count) / 4 + string_count) * 3 + string_count; } - static size_t perform(const std::span src, UInt8 * dst) + static size_t perform(std::span src, UInt8 * dst) { int rc; size_t outlen = 0; if constexpr (variant == Base64Variant::Url) { - auto src_padded = preprocessBase64Url(src); - rc = base64_decode(reinterpret_cast(src_padded.data()), src_padded.size(), reinterpret_cast(dst), &outlen, 0); + std::string src_padded = preprocessBase64Url(src); + rc = base64_decode(src_padded.data(), src_padded.size(), reinterpret_cast(dst), &outlen, 0); } else { @@ -98,14 +157,14 @@ struct TryBase64Decode return Base64Decode::getBufferSize(string_length, string_count); } - static size_t perform(const std::span src, UInt8 * dst) + static size_t perform(std::span src, UInt8 * dst) { int rc; size_t outlen = 0; if constexpr (variant == Base64Variant::Url) { - auto src_padded = preprocessBase64Url(src); - rc = base64_decode(reinterpret_cast(src_padded.data()), src_padded.size(), reinterpret_cast(dst), &outlen, 0); + std::string src_padded = preprocessBase64Url(src); + rc = base64_decode(src_padded.data(), src_padded.size(), reinterpret_cast(dst), &outlen, 0); } else { From a8b1b11ee640d458804e4088c4d75c767d83c204 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 10 Jun 2024 15:55:00 +0000 Subject: [PATCH 106/215] Fixing test. --- src/Interpreters/InterpreterSelectQuery.cpp | 6 +++--- src/Planner/Planner.cpp | 2 +- .../0_stateless/02156_storage_merge_prewhere.reference | 2 -- ...2156_storage_merge_prewhere_not_ready_set_bug.reference | 1 + .../02156_storage_merge_prewhere_not_ready_set_bug.sql | 7 +++++++ 5 files changed, 12 insertions(+), 6 deletions(-) create mode 100644 tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.reference create mode 100644 tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.sql diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index e72cf670f69..f8f3867dfd4 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2544,9 +2544,9 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc } else if (storage) { - if (shouldMoveToPrewhere() && settings.query_plan_optimize_prewhere && settings.query_plan_enable_optimizations - && typeid_cast(storage.get())) - collectFiltersForAnalysis(query_ptr, context, storage_snapshot, options, query_info); + // if (shouldMoveToPrewhere() && settings.query_plan_optimize_prewhere && settings.query_plan_enable_optimizations + // && typeid_cast(storage.get())) + // collectFiltersForAnalysis(query_ptr, context, storage_snapshot, options, query_info); /// Table. 
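Stepping back to the base64Url helpers inlined above: both boil down to mapping between the URL-safe and standard alphabets plus restoring or trimming the '=' padding that RFC 4648 allows base64url to omit. A self-contained sketch of the padding direction, using only the standard library and the same encoded strings the new SQL test exercises (the function name is illustrative, not the real helper):

```cpp
#include <cassert>
#include <string>
#include <string_view>

// Map the base64url alphabet back to standard base64 and restore '=' padding
// so that a strict decoder (such as aklomp/base64) accepts the input.
std::string padBase64Url(std::string_view src)
{
    std::string out;
    out.reserve(src.size() + 3);

    for (char c : src)
        out += (c == '_') ? '/' : (c == '-') ? '+' : c;

    switch (src.size() % 4)
    {
        case 2: out += "=="; break;  // two bytes of padding
        case 3: out += '=';  break;  // one byte of padding
        default: break;              // 0: already aligned; 1: cannot occur in valid base64url
    }
    return out;
}

int main()
{
    assert(padBase64Url("aHR0cHM6Ly9jbGlj") == "aHR0cHM6Ly9jbGlj");        // 16 chars, no padding needed
    assert(padBase64Url("aHR0cHM6Ly9jbGlja2g") == "aHR0cHM6Ly9jbGlja2g=");  // 19 chars -> one '='
    assert(padBase64Url("aHR0cHM6Ly9jbGljaw") == "aHR0cHM6Ly9jbGljaw==");   // 18 chars -> two '='
}
```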
if (max_streams == 0) diff --git a/src/Planner/Planner.cpp b/src/Planner/Planner.cpp index b40e23a9553..15b92ed12da 100644 --- a/src/Planner/Planner.cpp +++ b/src/Planner/Planner.cpp @@ -166,7 +166,7 @@ FiltersForTableExpressionMap collectFiltersForAnalysis(const QueryTreeNodePtr & continue; const auto & storage = table_node ? table_node->getStorage() : table_function_node->getStorage(); - if (typeid_cast(storage.get()) || typeid_cast(storage.get()) + if (typeid_cast(storage.get()) || (parallel_replicas_estimation_enabled && std::dynamic_pointer_cast(storage))) { collect_filters = true; diff --git a/tests/queries/0_stateless/02156_storage_merge_prewhere.reference b/tests/queries/0_stateless/02156_storage_merge_prewhere.reference index 5632b333c5e..8a18c609ede 100644 --- a/tests/queries/0_stateless/02156_storage_merge_prewhere.reference +++ b/tests/queries/0_stateless/02156_storage_merge_prewhere.reference @@ -1,8 +1,6 @@ - Filter column: and(equals(k, 3), notEmpty(v)) (removed) Prewhere info Prewhere filter Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) - Filter column: and(equals(k, 3), notEmpty(v)) (removed) Prewhere info Prewhere filter Prewhere filter column: and(notEmpty(v), equals(k, 3)) (removed) diff --git a/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.reference b/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.reference new file mode 100644 index 00000000000..20c58c33770 --- /dev/null +++ b/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.reference @@ -0,0 +1 @@ +59900 1000 1396 diff --git a/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.sql b/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.sql new file mode 100644 index 00000000000..fc18c97cb6e --- /dev/null +++ b/tests/queries/0_stateless/02156_storage_merge_prewhere_not_ready_set_bug.sql @@ -0,0 +1,7 @@ +create table merge_kek_1 (x UInt32, y UInt32) engine = MergeTree order by x; +create table merge_kek_2 (x UInt32, y UInt32) engine = MergeTree order by x; + +insert into merge_kek_1 select number, number from numbers(100); +insert into merge_kek_2 select number + 500, number + 500 from numbers(1e6); + +select sum(x), min(x + x), max(x + x) from merge(currentDatabase(), '^merge_kek_.$') where x > 200 and y in (select 500 + number * 2 from numbers(100)) settings max_threads=2; From 8904880480a314e456c05e9721a19c202e606382 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 10 Jun 2024 18:03:26 +0200 Subject: [PATCH 107/215] Try fix test --- src/IO/S3Settings.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/IO/S3Settings.cpp b/src/IO/S3Settings.cpp index 396eb8f5ec7..e88d8133c32 100644 --- a/src/IO/S3Settings.cpp +++ b/src/IO/S3Settings.cpp @@ -20,6 +20,8 @@ void S3SettingsByEndpoint::loadFromConfig( Poco::Util::AbstractConfiguration::Keys config_keys; config.keys(config_prefix, config_keys); + auto default_auth_settings = S3::AuthSettings(config, settings, config_prefix); + auto default_request_settings = S3::RequestSettings(config, settings, config_prefix); for (const String & key : config_keys) { @@ -27,8 +29,11 @@ void S3SettingsByEndpoint::loadFromConfig( const auto endpoint_path = key_path + ".endpoint"; if (config.has(endpoint_path)) { - auto auth_settings = S3::AuthSettings(config, settings, key_path); - auto request_settings = S3::RequestSettings(config, settings, key_path, "", settings.s3_validate_request_settings); + auto 
auth_settings{default_auth_settings}; + auth_settings.updateIfChanged(S3::AuthSettings(config, settings, key_path)); + + auto request_settings{default_request_settings}; + request_settings.updateIfChanged(S3::RequestSettings(config, settings, key_path, "", settings.s3_validate_request_settings)); s3_settings.emplace( config.getString(endpoint_path), From 374854a972616bb42c0d2c960511e377fb271b8b Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Mon, 10 Jun 2024 18:20:55 +0200 Subject: [PATCH 108/215] Increase timeout in wait_for_all_mutations --- tests/queries/0_stateless/mergetree_mutations.lib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/mergetree_mutations.lib b/tests/queries/0_stateless/mergetree_mutations.lib index b11b2e6b852..9eeea87b52d 100644 --- a/tests/queries/0_stateless/mergetree_mutations.lib +++ b/tests/queries/0_stateless/mergetree_mutations.lib @@ -37,7 +37,7 @@ function wait_for_all_mutations() echo "Timed out while waiting for mutation to execute!" fi - sleep 0.1 + sleep 0.3 done } From 20d673a20645740d7b805e03521c300c829557a2 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 10 Jun 2024 17:02:26 +0000 Subject: [PATCH 109/215] Fix more tests. --- src/Storages/StorageMerge.cpp | 37 ++++--------------- ..._optimize_count_for_merge_tables.reference | 9 +++-- 2 files changed, 14 insertions(+), 32 deletions(-) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index b42a8ed90ed..f05618f8488 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -1215,35 +1215,14 @@ QueryPlan ReadFromMerge::createPlanForTable( if (real_column_names.empty()) real_column_names.push_back(ExpressionActions::getSmallestColumn(storage_snapshot_->metadata->getColumns().getAllPhysical()).name); - StorageView * view = dynamic_cast(storage.get()); - if (!view || allow_experimental_analyzer) - { - storage->read(plan, - real_column_names, - storage_snapshot_, - modified_query_info, - modified_context, - processed_stage, - max_block_size, - UInt32(streams_num)); - } - else - { - /// For view storage, we need to rewrite the `modified_query_info.view_query` to optimize read. - /// The most intuitive way is to use InterpreterSelectQuery. 
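The per-endpoint S3 settings change a few hunks above follows a copy-defaults-then-overlay pattern: settings parsed from the common config prefix are copied for each endpoint and only the fields the endpoint explicitly configures are overridden. A minimal standalone sketch of that pattern with a hypothetical settings struct — the real `S3::AuthSettings` / `S3::RequestSettings` interfaces differ:

```cpp
#include <cassert>
#include <map>
#include <string>

struct EndpointSettings
{
    std::string region = "us-east-1";
    unsigned max_connections = 1024;

    // Overlay only the fields an endpoint-specific config actually changed.
    void updateIfChanged(const std::map<std::string, std::string> & overrides)
    {
        if (auto it = overrides.find("region"); it != overrides.end())
            region = it->second;
        if (auto it = overrides.find("max_connections"); it != overrides.end())
            max_connections = static_cast<unsigned>(std::stoul(it->second));
    }
};

int main()
{
    EndpointSettings defaults;                 // parsed once from the common config prefix
    defaults.max_connections = 2048;

    EndpointSettings endpoint = defaults;      // copy the defaults for this endpoint
    endpoint.updateIfChanged({{"region", "eu-west-1"}});  // endpoint overrides region only

    assert(endpoint.region == "eu-west-1");
    assert(endpoint.max_connections == 2048);  // inherited from the defaults
}
```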
- - /// Intercept the settings - modified_context->setSetting("max_threads", streams_num); - modified_context->setSetting("max_streams_to_max_threads_ratio", 1); - modified_context->setSetting("max_block_size", max_block_size); - - InterpreterSelectQuery interpreter(modified_query_info.query, - modified_context, - storage, - view->getInMemoryMetadataPtr(), - SelectQueryOptions(processed_stage)); - interpreter.buildQueryPlan(plan); - } + storage->read(plan, + real_column_names, + storage_snapshot_, + modified_query_info, + modified_context, + processed_stage, + max_block_size, + UInt32(streams_num)); if (!plan.isInitialized()) return {}; diff --git a/tests/queries/0_stateless/02918_optimize_count_for_merge_tables.reference b/tests/queries/0_stateless/02918_optimize_count_for_merge_tables.reference index 786a6b3bf25..7278018f1d6 100644 --- a/tests/queries/0_stateless/02918_optimize_count_for_merge_tables.reference +++ b/tests/queries/0_stateless/02918_optimize_count_for_merge_tables.reference @@ -7,6 +7,9 @@ Expression ((Projection + Before ORDER BY)) Aggregating Expression (Before GROUP BY) ReadFromMerge - ReadFromMergeTree (default.mt1) - ReadFromMergeTree (default.mt2) - ReadFromStorage (TinyLog) + Expression + ReadFromMergeTree (default.mt1) + Expression + ReadFromMergeTree (default.mt2) + Expression + ReadFromStorage (TinyLog) From 3bcb32c7193ce9bb1209dbe4d6761190a916b9ac Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 10 Jun 2024 17:52:46 +0000 Subject: [PATCH 110/215] Fixing another test. --- src/Storages/StorageMerge.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index f05618f8488..6d12876c776 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -1618,6 +1618,9 @@ bool ReadFromMerge::requestReadingInOrder(InputOrderInfoPtr order_info_) void ReadFromMerge::applyFilters(ActionDAGNodes added_filter_nodes) { + for (const auto & filter_info : pushed_down_filters) + added_filter_nodes.nodes.push_back(&filter_info.actions->findInOutputs(filter_info.column_name)); + SourceStepWithFilter::applyFilters(added_filter_nodes); filterTablesAndCreateChildrenPlans(); From 71a42d427a0be4c5803fdfcacd088cced97377af Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 10 Jun 2024 16:42:13 +0000 Subject: [PATCH 111/215] Minor fixups --- src/Functions/FunctionBase64Conversion.h | 30 ++++++++++--------- .../0_stateless/00732_base64_functions.sql | 3 +- .../03167_base64_url_functions.sql | 26 ++++++++-------- 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/src/Functions/FunctionBase64Conversion.h b/src/Functions/FunctionBase64Conversion.h index 8d73d0c70df..e569a351591 100644 --- a/src/Functions/FunctionBase64Conversion.h +++ b/src/Functions/FunctionBase64Conversion.h @@ -12,7 +12,7 @@ # include # include -# include +# include namespace DB { @@ -28,13 +28,15 @@ enum class Base64Variant : uint8_t Url }; -inline std::string preprocessBase64Url(std::span src) +inline std::string preprocessBase64Url(std::string_view src) { std::string padded_src; + padded_src.reserve(src.size() + 3); + // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#page-7 - for (size_t i = 0; i < src.size(); ++i) + for (auto s : src) { - switch (src[i]) + switch (s) { case '_': padded_src += '/'; @@ -43,12 +45,12 @@ inline std::string preprocessBase64Url(std::span src) padded_src += '+'; break; default: - padded_src += src[i]; + padded_src += s; break; } } - // insert padding 
to please aklomp library + /// Insert padding to please aklomp library size_t remainder = src.size() % 4; switch (remainder) { @@ -100,10 +102,10 @@ struct Base64Encode return ((string_length - string_count) / 3 + string_count) * 4 + string_count; } - static size_t perform(std::span src, UInt8 * dst) + static size_t perform(std::string_view src, UInt8 * dst) { size_t outlen = 0; - base64_encode(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst), &outlen, 0); + base64_encode(src.data(), src.size(), reinterpret_cast(dst), &outlen, 0); if constexpr (variant == Base64Variant::Url) outlen = postprocessBase64Url(dst, outlen); @@ -122,7 +124,7 @@ struct Base64Decode return ((string_length - string_count) / 4 + string_count) * 3 + string_count; } - static size_t perform(std::span src, UInt8 * dst) + static size_t perform(std::string_view src, UInt8 * dst) { int rc; size_t outlen = 0; @@ -133,7 +135,7 @@ struct Base64Decode } else { - rc = base64_decode(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst), &outlen, 0); + rc = base64_decode(src.data(), src.size(), reinterpret_cast(dst), &outlen, 0); } if (rc != 1) @@ -157,7 +159,7 @@ struct TryBase64Decode return Base64Decode::getBufferSize(string_length, string_count); } - static size_t perform(std::span src, UInt8 * dst) + static size_t perform(std::string_view src, UInt8 * dst) { int rc; size_t outlen = 0; @@ -168,7 +170,7 @@ struct TryBase64Decode } else { - rc = base64_decode(reinterpret_cast(src.data()), src.size(), reinterpret_cast(dst), &outlen, 0); + rc = base64_decode(src.data(), src.size(), reinterpret_cast(dst), &outlen, 0); } if (rc != 1) @@ -232,7 +234,7 @@ private: auto * dst = dst_chars.data(); auto * dst_pos = dst; - const auto * src = src_chars.data(); + const auto * src = reinterpret_cast(src_chars.data()); size_t src_offset_prev = 0; for (size_t row = 0; row < src_row_count; ++row) @@ -272,7 +274,7 @@ private: auto * dst = dst_chars.data(); auto * dst_pos = dst; - const auto * src = src_chars.data(); + const auto * src = reinterpret_cast(src_chars.data()); for (size_t row = 0; row < src_row_count; ++row) { diff --git a/tests/queries/0_stateless/00732_base64_functions.sql b/tests/queries/0_stateless/00732_base64_functions.sql index 3c60bf939fe..b4be8db4ede 100644 --- a/tests/queries/0_stateless/00732_base64_functions.sql +++ b/tests/queries/0_stateless/00732_base64_functions.sql @@ -1,6 +1,5 @@ -- Tags: no-fasttest - -SET send_logs_level = 'fatal'; +-- no-fasttest because aklomp-base64 library is required SELECT base64Encode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } SELECT base64Decode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } diff --git a/tests/queries/0_stateless/03167_base64_url_functions.sql b/tests/queries/0_stateless/03167_base64_url_functions.sql index 2152002e412..674f1ae498b 100644 --- a/tests/queries/0_stateless/03167_base64_url_functions.sql +++ b/tests/queries/0_stateless/03167_base64_url_functions.sql @@ -1,4 +1,5 @@ -- Tags: no-fasttest +-- no-fasttest because aklomp-base64 library is required -- incorrect number of arguments SELECT base64UrlEncode(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } @@ -9,22 +10,20 @@ SELECT base64UrlDecode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARG SELECT tryBase64UrlDecode('foo', 'excess argument'); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } -- test with valid inputs -SELECT 'https://clickhouse.com' as original, base64UrlEncode(original) as encoded, base64UrlDecode(encoded) as decoded, 
tryBase64UrlDecode(encoded) as try_decoded; --- encoding differs from base64Encode -SELECT '12?' as original, base64UrlEncode(original) as encoded, base64UrlDecode(encoded) as decoded, tryBase64UrlDecode(encoded) as try_decoded; +SELECT 'https://clickhouse.com' AS original, base64UrlEncode(original) AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); +SELECT '12?' AS original, base64UrlEncode(original) AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); +SELECT 'https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode' AS original, base64UrlEncode(original) AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); --- long string -SELECT 'https://www.google.com/search?q=clickhouse+base64+decode&sca_esv=739f8bb380e4c7ed&ei=TfRiZqCDIrmnwPAP2KLRkA8&ved=0ahUKEwjg3ZHitsmGAxW5ExAIHVhRFPIQ4dUDCBA&uact=5&oq=clickhouse+base64+decode' as original, base64UrlEncode(original) as encoded, base64UrlDecode(encoded) as decoded, tryBase64UrlDecode(encoded) as try_decoded; +-- encoded value has no padding +SELECT 'aHR0cHM6Ly9jbGlj' AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); +-- encoded value has one-byte padding +SELECT 'aHR0cHM6Ly9jbGlja2g' AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); +-- encoded value has two-bytes padding +SELECT 'aHR0cHM6Ly9jbGljaw' AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); --- no padding -SELECT 'aHR0cHM6Ly9jbGlj' as encoded, base64UrlDecode(encoded) as decoded, tryBase64UrlDecode(encoded) as try_decoded; --- one-byte padding -SELECT 'aHR0cHM6Ly9jbGlja2g' as encoded, base64UrlDecode(encoded) as decoded, tryBase64UrlDecode(encoded) as try_decoded; --- two-bytes padding -SELECT 'aHR0cHM6Ly9jbGljaw' as encoded, base64UrlDecode(encoded) as decoded, tryBase64UrlDecode(encoded) as try_decoded; +-- test with invalid inputs --- invalid inputs SELECT base64UrlDecode('https://clickhouse.com'); -- { serverError INCORRECT_DATA } SELECT tryBase64UrlDecode('https://clickhouse.com'); SELECT base64UrlDecode('12?'); -- { serverError INCORRECT_DATA } @@ -33,4 +32,5 @@ SELECT base64UrlDecode('aHR0cHM6Ly9jbGlja'); -- { serverError INCORRECT_DATA } SELECT tryBase64UrlDecode('aHR0cHM6Ly9jbGlja'); -- test FixedString argument -SELECT toFixedString('https://clickhouse.com', 22) as original, base64UrlEncode(original) as encoded, base64UrlDecode(encoded) as decoded, tryBase64UrlDecode(encoded) as try_decoded; + +SELECT toFixedString('https://clickhouse.com', 22) AS original, base64UrlEncode(original) AS encoded, base64UrlDecode(encoded), tryBase64UrlDecode(encoded); From e4a0f42ab8ffc5e9af469729a6bdceee71fac2c9 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Mon, 10 Jun 2024 21:15:22 +0000 Subject: [PATCH 112/215] time_virtual_col_tests: rollback tests --- .../test_storage_azure_blob_storage/test.py | 4 ++-- tests/integration/test_storage_hdfs/test.py | 4 ++-- tests/integration/test_storage_s3/test.py | 8 ++------ ...mn_use_structure_from_insertion_table.reference | 1 + ...al_column_use_structure_from_insertion_table.sh | 13 +++++++++++++ ...mn_use_structure_from_insertion_table.reference | 1 - ...al_column_use_structure_from_insertion_table.sh | 14 -------------- 7 files changed, 20 insertions(+), 25 deletions(-) create mode 100644 tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.reference create mode 100755 
tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.sh delete mode 100644 tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.reference delete mode 100755 tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.sh diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 9f5aef1489c..f836c58ce30 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -758,12 +758,12 @@ def test_read_subcolumns(cluster): ) res = node.query( - f"select a.b.d, _path, a.b, _file, dateDiff('minute', _time, now()), a.e from azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumns.tsv'," + f"select a.b.d, _path, a.b, _file, a.e from azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumns.tsv'," f" 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto'," f" 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" ) - assert res == "2\tcont/test_subcolumns.tsv\t(1,2)\ttest_subcolumns.tsv\t0\t3\n" + assert res == "2\tcont/test_subcolumns.tsv\t(1,2)\ttest_subcolumns.tsv\t3\n" res = node.query( f"select a.b.d, _path, a.b, _file, a.e from azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumns.jsonl'," diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index cda2b8694c6..44c0223e677 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -987,10 +987,10 @@ def test_read_subcolumns(started_cluster): assert res == "2\ttest_subcolumns.jsonl\t(1,2)\ttest_subcolumns.jsonl\t3\n" res = node.query( - f"select x.b.d, _path, x.b, _file, dateDiff('minute', _time, now()), x.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" + f"select x.b.d, _path, x.b, _file, x.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" ) - assert res == "0\ttest_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\t0\n" + assert res == "0\ttest_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\n" res = node.query( f"select x.b.d, _path, x.b, _file, x.e from hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32) default ((42, 42), 42)')" diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 61c6d95f123..09b27fff1e8 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -2117,12 +2117,10 @@ def test_read_subcolumns(started_cluster): assert res == "0\troot/test_subcolumns.jsonl\t(0,0)\ttest_subcolumns.jsonl\t0\n" res = instance.query( - f"select x.b.d, _path, x.b, _file, dateDiff('minute', _time, now()), x.e from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32) default ((42, 42), 42)')" + f"select x.b.d, _path, x.b, _file, x.e from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'x Tuple(b Tuple(c UInt32, d UInt32), e UInt32) default ((42, 42), 42)')" ) - assert ( - res == "42\troot/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t0\t42\n" - ) + assert res == 
"42\troot/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" res = instance.query( f"select a.b.d, _path, a.b, _file, a.e from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" @@ -2150,8 +2148,6 @@ def test_read_subcolumns(started_cluster): res == "42\t/root/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" ) - logging.info("Some custom logging") - def test_filtering_by_file_or_path(started_cluster): bucket = started_cluster.minio_bucket diff --git a/tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.reference b/tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.reference new file mode 100644 index 00000000000..35ef86f5339 --- /dev/null +++ b/tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.reference @@ -0,0 +1 @@ +1 2 4 diff --git a/tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.sh b/tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.sh new file mode 100755 index 00000000000..d9e4a2c8f8b --- /dev/null +++ b/tests/queries/0_stateless/02931_size_virtual_column_use_structure_from_insertion_table.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +echo "1,2" > $CLICKHOUSE_TEST_UNIQUE_NAME.csv +$CLICKHOUSE_LOCAL -nm -q " +create table test (x UInt64, y UInt32, size UInt64) engine=Memory; +insert into test select c1, c2, _size from file('$CLICKHOUSE_TEST_UNIQUE_NAME.csv') settings use_structure_from_insertion_table_in_table_functions=1; +select * from test; +" +rm $CLICKHOUSE_TEST_UNIQUE_NAME.csv diff --git a/tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.reference b/tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.reference deleted file mode 100644 index 93acdc34842..00000000000 --- a/tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.reference +++ /dev/null @@ -1 +0,0 @@ -1 2 4 1 1 diff --git a/tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.sh b/tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.sh deleted file mode 100755 index ebdda0cc1d3..00000000000 --- a/tests/queries/0_stateless/02931_virtual_column_use_structure_from_insertion_table.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/usr/bin/env bash - -CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -# shellcheck source=../shell_config.sh -. 
"$CURDIR"/../shell_config.sh - -echo "1,2" > $CLICKHOUSE_TEST_UNIQUE_NAME.csv -sleep 1 -$CLICKHOUSE_LOCAL -nm -q " -create table test (x UInt64, y UInt32, size UInt64, d32 DateTime32, d64 DateTime64) engine=Memory; -insert into test select c1, c2, _size, _time, _time from file('$CLICKHOUSE_TEST_UNIQUE_NAME.csv') settings use_structure_from_insertion_table_in_table_functions=1; -select x, y, size, (dateDiff('millisecond', d32, now()) < 4000 AND dateDiff('millisecond', d32, now()) > 0), (dateDiff('second', d64, now()) < 4 AND dateDiff('second', d64, now()) > 0) from test; -" -rm $CLICKHOUSE_TEST_UNIQUE_NAME.csv From 26e82457b9b5f206b7d11ca982f7c8f60ef82d26 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Tue, 11 Jun 2024 03:36:26 +0000 Subject: [PATCH 113/215] add setting changes Signed-off-by: Duc Canh Le --- src/Core/SettingsChangesHistory.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 6734c0dc525..313b5547f4d 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -101,6 +101,7 @@ static const std::map Date: Tue, 11 Jun 2024 05:39:24 +0000 Subject: [PATCH 114/215] Test fix --- .../0_stateless/02415_all_new_functions_must_be_documented.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.sql b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.sql index cabcd230eb6..544f29e8a7d 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.sql +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.sql @@ -3,7 +3,7 @@ SELECT name FROM system.functions WHERE NOT is_aggregate AND origin = 'System' AND alias_to = '' AND length(description) < 10 AND name NOT IN ( 'aes_decrypt_mysql', 'aes_encrypt_mysql', 'decrypt', 'encrypt', - 'base64Decode', 'base64Encode', 'tryBase64Decode', + 'base64Decode', 'base64Encode', 'tryBase64Decode', 'base64UrlDecode', 'base64UrlEncode', 'tryBase64UrlDecode', 'convertCharset', 'detectLanguage', 'detectLanguageMixed', 'geoToH3', From a6fa7993961221381b14ba061bb0697da2203ec2 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 11 Jun 2024 08:22:11 +0000 Subject: [PATCH 115/215] Fix other tests. --- src/Storages/StorageMerge.cpp | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 6d12876c776..f661fc237b6 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include @@ -1167,13 +1166,6 @@ QueryPipelineBuilderPtr ReadFromMerge::buildPipeline( builder->addSimpleTransform([](const Block & stream_header) { return std::make_shared(stream_header); }); } - if (builder->getNumStreams() > 1) - { - // It's possible to have many tables read from merge, resize(1) might open too many files at the same time. - // Using concat instead. 
- builder->addTransform(std::make_shared(builder->getHeader(), builder->getNumStreams())); - } - return builder; } From f27e92c97b0d363f906deea41981176ac61c7bdf Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 11 Jun 2024 10:34:19 +0200 Subject: [PATCH 116/215] Update s3queue.md --- docs/en/engines/table-engines/integrations/s3queue.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/integrations/s3queue.md b/docs/en/engines/table-engines/integrations/s3queue.md index f930fab1805..c29b90f3cd7 100644 --- a/docs/en/engines/table-engines/integrations/s3queue.md +++ b/docs/en/engines/table-engines/integrations/s3queue.md @@ -75,7 +75,7 @@ Possible values: - unordered — With unordered mode, the set of all already processed files is tracked with persistent nodes in ZooKeeper. - ordered — With ordered mode, only the max name of the successfully consumed file, and the names of files that will be retried after unsuccessful loading attempt are being stored in ZooKeeper. -Default value: `unordered`. +Default value: `ordered` unti 24.5. Starting with 24.6 there is no default value, the setting becomes required to be specified manually. For tables created on earlier versions the default value will remain `Ordered` for compatibility. ### after_processing {#after_processing} @@ -181,6 +181,10 @@ For 'Ordered' mode. Defines a maximum boundary for reschedule interval for a bac Default value: `30000`. +### s3queue_buckets {#buckets} + +For 'Ordered' mode. If there are several replicas of S3Queue table, each working with the same metadata directory in keeper, the value of `s3queue_buckets` needs to be equal to at least the number of replicas. If `s3queue_processing_threads` setting is used as well, it makes sense to increase the value of `s3queue_buckets` setting even futher, as it defines the actual parallelism of `S3Queue` processing. + ## S3-related Settings {#s3-settings} Engine supports all s3 related settings. For more information about S3 settings see [here](../../../engines/table-engines/integrations/s3.md). From 00e58f522ff1a6420aa95e8c2ec360dd9b3d0017 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Tue, 11 Jun 2024 10:35:55 +0200 Subject: [PATCH 117/215] Update s3queue.md --- docs/en/engines/table-engines/integrations/s3queue.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/s3queue.md b/docs/en/engines/table-engines/integrations/s3queue.md index c29b90f3cd7..f72bc79c1e5 100644 --- a/docs/en/engines/table-engines/integrations/s3queue.md +++ b/docs/en/engines/table-engines/integrations/s3queue.md @@ -75,7 +75,7 @@ Possible values: - unordered — With unordered mode, the set of all already processed files is tracked with persistent nodes in ZooKeeper. - ordered — With ordered mode, only the max name of the successfully consumed file, and the names of files that will be retried after unsuccessful loading attempt are being stored in ZooKeeper. -Default value: `ordered` unti 24.5. Starting with 24.6 there is no default value, the setting becomes required to be specified manually. For tables created on earlier versions the default value will remain `Ordered` for compatibility. +Default value: `ordered` in versions before 24.6. Starting with 24.6 there is no default value, the setting becomes required to be specified manually. 
For tables created on earlier versions the default value will remain `Ordered` for compatibility. ### after_processing {#after_processing} @@ -183,7 +183,7 @@ Default value: `30000`. ### s3queue_buckets {#buckets} -For 'Ordered' mode. If there are several replicas of S3Queue table, each working with the same metadata directory in keeper, the value of `s3queue_buckets` needs to be equal to at least the number of replicas. If `s3queue_processing_threads` setting is used as well, it makes sense to increase the value of `s3queue_buckets` setting even futher, as it defines the actual parallelism of `S3Queue` processing. +For 'Ordered' mode. Available since `24.6`. If there are several replicas of S3Queue table, each working with the same metadata directory in keeper, the value of `s3queue_buckets` needs to be equal to at least the number of replicas. If `s3queue_processing_threads` setting is used as well, it makes sense to increase the value of `s3queue_buckets` setting even futher, as it defines the actual parallelism of `S3Queue` processing. ## S3-related Settings {#s3-settings} From 4baabb16f9e66d70fc960292d533e1ee1cfb3a9a Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Mon, 10 Jun 2024 22:43:32 +0000 Subject: [PATCH 118/215] time_virtual_col_tests: tests reintroduced --- .../test_storage_azure_blob_storage/test.py | 21 +++++++++++++++++++ tests/integration/test_storage_hdfs/test.py | 14 +++++++++++++ tests/integration/test_storage_s3/test.py | 17 +++++++++++++++ .../03169_time_virtual_column.reference | 1 + .../0_stateless/03169_time_virtual_column.sh | 12 +++++++++++ 5 files changed, 65 insertions(+) create mode 100644 tests/queries/0_stateless/03169_time_virtual_column.reference create mode 100755 tests/queries/0_stateless/03169_time_virtual_column.sh diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index f836c58ce30..e7aa7d052a5 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -790,6 +790,27 @@ def test_read_subcolumns(cluster): assert res == "42\tcont/test_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" +def test_read_subcolumn_time(cluster): + node = cluster.instances["node"] + storage_account_url = cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumn_time.tsv', " + f"'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto'," + f" 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)", + ) + + res = node.query( + f"select a.b.d, _path, a.b, _file, dateDiff('minute', _time, now()) < 59, a.e from azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumn_time.tsv'," + f" 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto'," + f" 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" + ) + + assert ( + res == "2\tcont/test_subcolumn_time.tsv\t(1,2)\ttest_subcolumn_time.tsv\t1\t3\n" + ) + + def test_read_from_not_existing_container(cluster): node = cluster.instances["node"] query = ( diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 44c0223e677..15d9ee0bb26 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -999,6 
+999,20 @@ def test_read_subcolumns(started_cluster): assert res == "42\ttest_subcolumns.jsonl\t(42,42)\ttest_subcolumns.jsonl\t42\n" +def test_read_subcolumn_time(started_cluster): + node = started_cluster.instances["node1"] + + node.query( + f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumn_time.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)" + ) + + res = node.query( + f"select a.b.d, _path, a.b, _file, dateDiff('minute', _time, now()) < 59, a.e from hdfs('hdfs://hdfs1:9000/test_subcolumn_time.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" + ) + + assert res == "2\ttest_subcolumn_time.tsv\t(1,2)\ttest_subcolumn_time.tsv\t1\t3\n" + + def test_union_schema_inference_mode(started_cluster): node = started_cluster.instances["node1"] diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 09b27fff1e8..eff066739c0 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -2149,6 +2149,23 @@ def test_read_subcolumns(started_cluster): ) +def test_read_subcolumn_time(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + instance.query( + f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)" + ) + + res = instance.query( + f"select a.b.d, _path, a.b, _file, dateDiff('minute', _time, now()) < 59, a.e from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" + ) + + assert ( + res == "2\troot/test_subcolumn_time.tsv\t(1,2)\ttest_subcolumn_time.tsv\t1\t3\n" + ) + + def test_filtering_by_file_or_path(started_cluster): bucket = started_cluster.minio_bucket instance = started_cluster.instances["dummy"] diff --git a/tests/queries/0_stateless/03169_time_virtual_column.reference b/tests/queries/0_stateless/03169_time_virtual_column.reference new file mode 100644 index 00000000000..4482956b706 --- /dev/null +++ b/tests/queries/0_stateless/03169_time_virtual_column.reference @@ -0,0 +1 @@ +4 1 diff --git a/tests/queries/0_stateless/03169_time_virtual_column.sh b/tests/queries/0_stateless/03169_time_virtual_column.sh new file mode 100755 index 00000000000..fef1de8c6f2 --- /dev/null +++ b/tests/queries/0_stateless/03169_time_virtual_column.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +echo "1,2" > $CLICKHOUSE_TEST_UNIQUE_NAME.csv +sleep 1 +$CLICKHOUSE_LOCAL -nm -q " +select _size, (dateDiff('millisecond', _time, now()) < 600000 AND dateDiff('millisecond', _time, now()) > 0) from file('$CLICKHOUSE_TEST_UNIQUE_NAME.csv'); +" +rm $CLICKHOUSE_TEST_UNIQUE_NAME.csv From 1519efe7e21384b55780e214a5434dafdd1d1f63 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 11 Jun 2024 11:53:33 +0200 Subject: [PATCH 119/215] Addressed review comments --- src/Parsers/FunctionParameterValuesVisitor.cpp | 2 +- .../queries/0_stateless/03146_parameterized_view_with_date.sql | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Parsers/FunctionParameterValuesVisitor.cpp b/src/Parsers/FunctionParameterValuesVisitor.cpp index e791e07cdfb..eaf28bbbc41 100644 --- a/src/Parsers/FunctionParameterValuesVisitor.cpp +++ b/src/Parsers/FunctionParameterValuesVisitor.cpp @@ -23,7 +23,7 @@ class FunctionParameterValuesVisitor public: explicit FunctionParameterValuesVisitor(NameToNameMap & parameter_values_, ContextPtr context_) : parameter_values(parameter_values_) - ,context(context_) + , context(context_) { } diff --git a/tests/queries/0_stateless/03146_parameterized_view_with_date.sql b/tests/queries/0_stateless/03146_parameterized_view_with_date.sql index 53022e969ab..2cfadb70b24 100644 --- a/tests/queries/0_stateless/03146_parameterized_view_with_date.sql +++ b/tests/queries/0_stateless/03146_parameterized_view_with_date.sql @@ -9,4 +9,6 @@ create view pv as select * from table_pv where timestamp_field > {timestamp_para select * from pv (timestamp_param=toDateTime('2024-04-01 00:00:01')); +select * from pv (timestamp_param=toDateTime('2024-040')); -- { serverError CANNOT_PARSE_DATETIME } + drop table table_pv; From 88fffcd171dfc7a7819ad819fd53050f5690470e Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 11 Jun 2024 10:16:16 +0000 Subject: [PATCH 120/215] Fix INTERPOLATE (alias) for remote queries with analyzer. 
--- src/Analyzer/InterpolateNode.cpp | 16 +++++++++++++--- src/Analyzer/InterpolateNode.h | 7 +++++-- .../03155_analyzer_interpolate.reference | 5 +++++ .../0_stateless/03155_analyzer_interpolate.sql | 3 +++ 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/src/Analyzer/InterpolateNode.cpp b/src/Analyzer/InterpolateNode.cpp index e4f7e22b803..0de11998d1f 100644 --- a/src/Analyzer/InterpolateNode.cpp +++ b/src/Analyzer/InterpolateNode.cpp @@ -10,9 +10,12 @@ namespace DB { -InterpolateNode::InterpolateNode(QueryTreeNodePtr expression_, QueryTreeNodePtr interpolate_expression_) +InterpolateNode::InterpolateNode(std::shared_ptr expression_, QueryTreeNodePtr interpolate_expression_) : IQueryTreeNode(children_size) { + if(expression_) + expression_name = expression_->getIdentifier().getFullName(); + children[expression_child_index] = std::move(expression_); children[interpolate_expression_child_index] = std::move(interpolate_expression_); } @@ -41,13 +44,20 @@ void InterpolateNode::updateTreeHashImpl(HashState &, CompareOptions) const QueryTreeNodePtr InterpolateNode::cloneImpl() const { - return std::make_shared(nullptr /*expression*/, nullptr /*interpolate_expression*/); + auto cloned = std::make_shared(nullptr /*expression*/, nullptr /*interpolate_expression*/); + cloned->expression_name = expression_name; + return cloned; } ASTPtr InterpolateNode::toASTImpl(const ConvertToASTOptions & options) const { auto result = std::make_shared(); - result->column = getExpression()->toAST(options)->getColumnName(); + + if (const auto * identifier = getExpression()->as()) + result->column = identifier->toAST(options)->getColumnName(); + else + result->column = expression_name; + result->children.push_back(getInterpolateExpression()->toAST(options)); result->expr = result->children.back(); diff --git a/src/Analyzer/InterpolateNode.h b/src/Analyzer/InterpolateNode.h index 9269d3924f5..ec493ed8bdd 100644 --- a/src/Analyzer/InterpolateNode.h +++ b/src/Analyzer/InterpolateNode.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace DB @@ -19,7 +19,7 @@ class InterpolateNode final : public IQueryTreeNode { public: /// Initialize interpolate node with expression and interpolate expression - explicit InterpolateNode(QueryTreeNodePtr expression_, QueryTreeNodePtr interpolate_expression_); + explicit InterpolateNode(std::shared_ptr expression_, QueryTreeNodePtr interpolate_expression_); /// Get expression to interpolate const QueryTreeNodePtr & getExpression() const @@ -61,6 +61,9 @@ protected: ASTPtr toASTImpl(const ConvertToASTOptions & options) const override; + /// Initial name from column identifier. 
+ std::string expression_name; + private: static constexpr size_t expression_child_index = 0; static constexpr size_t interpolate_expression_child_index = 1; diff --git a/tests/queries/0_stateless/03155_analyzer_interpolate.reference b/tests/queries/0_stateless/03155_analyzer_interpolate.reference index 791aaa5b2a2..eade3b45d26 100644 --- a/tests/queries/0_stateless/03155_analyzer_interpolate.reference +++ b/tests/queries/0_stateless/03155_analyzer_interpolate.reference @@ -11,3 +11,8 @@ 5 [5] 5.5 [5] 7 [7] +2 +100500 +18 +26 +34 diff --git a/tests/queries/0_stateless/03155_analyzer_interpolate.sql b/tests/queries/0_stateless/03155_analyzer_interpolate.sql index b3c1d233f47..30423cb86ff 100644 --- a/tests/queries/0_stateless/03155_analyzer_interpolate.sql +++ b/tests/queries/0_stateless/03155_analyzer_interpolate.sql @@ -10,3 +10,6 @@ SELECT n, number+5 AS inter FROM ( -- { serverError NOT_AN_AGGREGATE } SELECT toFloat32(number % 10) AS n, number, number*2 AS mn FROM numbers(10) WHERE number % 3 = 1 ) GROUP BY n, inter ORDER BY n WITH FILL FROM 0 TO 5.51 STEP 0.5 INTERPOLATE (inter AS mn * 2); + +-- https://github.com/ClickHouse/ClickHouse/issues/64636 +select sum(number) as s from remote('127.0.0.{1,2}', numbers(10)) where (intDiv(number, 2) as key) != 1 group by key order by key with fill interpolate (s as 100500); From 5f6904fa38fc976a79cee8cff042d8e25845b619 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 11 Jun 2024 10:23:56 +0000 Subject: [PATCH 121/215] Add a comment. --- src/Analyzer/InterpolateNode.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Analyzer/InterpolateNode.cpp b/src/Analyzer/InterpolateNode.cpp index 0de11998d1f..4b8797282a2 100644 --- a/src/Analyzer/InterpolateNode.cpp +++ b/src/Analyzer/InterpolateNode.cpp @@ -53,6 +53,9 @@ ASTPtr InterpolateNode::toASTImpl(const ConvertToASTOptions & options) const { auto result = std::make_shared(); + /// Interpolate parser supports only identifier node. + /// In case of alias, identifier is replaced to expression, which can't be parsed. + /// In this case, keep original alias name. if (const auto * identifier = getExpression()->as()) result->column = identifier->toAST(options)->getColumnName(); else From be154c37b726362b3cde7d0260a82275e2a1faec Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 11 Jun 2024 11:05:04 +0000 Subject: [PATCH 122/215] Refactor and cleanup. 
--- src/Storages/StorageMerge.cpp | 177 ++++++++++++---------------------- src/Storages/StorageMerge.h | 24 ++--- 2 files changed, 71 insertions(+), 130 deletions(-) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index f661fc237b6..92350d4c5bc 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -440,18 +440,7 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu Names column_names_as_aliases; Aliases aliases; - Names real_column_names = column_names; - if (child_plan.row_policy_data_opt) - child_plan.row_policy_data_opt->extendNames(real_column_names); - - auto modified_query_info = getModifiedQueryInfo(modified_context, table, nested_storage_snaphsot, real_column_names, column_names_as_aliases, aliases); - - auto source_pipeline = buildPipeline( - child_plan.plan, - nested_storage_snaphsot, - modified_query_info, - common_processed_stage, - table); + auto source_pipeline = buildPipeline(child_plan, common_processed_stage); if (source_pipeline && source_pipeline->initialized()) { @@ -569,10 +558,8 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ if (sampling_requested && !storage->supportsSampling()) throw Exception(ErrorCodes::SAMPLING_NOT_SUPPORTED, "Illegal SAMPLE: table {} doesn't support sampling", storage->getStorageID().getNameForLogs()); - res.emplace_back(); - - auto & aliases = res.back().table_aliases; - auto & row_policy_data_opt = res.back().row_policy_data_opt; + Aliases aliases; + RowPolicyDataOpt row_policy_data_opt; auto storage_metadata_snapshot = storage->getInMemoryMetadataPtr(); auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot, modified_context); @@ -651,9 +638,8 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ } - res.back().plan = createPlanForTable( + auto child = createPlanForTable( nested_storage_snaphsot, - aliases, modified_query_info, common_processed_stage, required_max_block_size, @@ -662,17 +648,31 @@ std::vector ReadFromMerge::createChildrenPlans(SelectQ row_policy_data_opt, modified_context, current_streams); - res.back().plan.addInterpreterContext(modified_context); + child.plan.addInterpreterContext(modified_context); - // createSources1( - // res.back().plan, - // nested_storage_snaphsot, - // modified_query_info, - // common_processed_stage, - // common_header, - // res.back().table_aliases, - // row_policy_data_opt, - // table); + if (child.plan.isInitialized()) + { + addVirtualColumns(child, modified_query_info, common_processed_stage, table); + + /// Subordinary tables could have different but convertible types, like numeric types of different width. + /// We must return streams with structure equals to structure of Merge table. 
+ convertAndFilterSourceStream(common_header, modified_query_info, nested_storage_snaphsot, aliases, row_policy_data_opt, context, child); + + for (const auto & filter_info : pushed_down_filters) + { + auto filter_step = std::make_unique( + child.plan.getCurrentDataStream(), + filter_info.actions->clone(), + filter_info.column_name, + filter_info.do_remove_column); + + child.plan.addStep(std::move(filter_step)); + } + + child.plan.optimize(QueryPlanOptimizationSettings::fromContext(modified_context)); + } + + res.emplace_back(std::move(child)); } return res; @@ -1031,27 +1031,18 @@ bool recursivelyApplyToReadingSteps(QueryPlan::Node * node, const std::function< return ok; } -void ReadFromMerge::updatePlan( - QueryPlan & plan, - const StorageSnapshotPtr & storage_snapshot_, +void ReadFromMerge::addVirtualColumns( + ChildPlan & child, SelectQueryInfo & modified_query_info, QueryProcessingStage::Enum processed_stage, - const Block & header, - const Aliases & aliases, - const RowPolicyDataOpt & row_policy_data_opt, const StorageWithLockAndName & storage_with_lock) const { - if (!plan.isInitialized()) - return; - - const auto & [database_name, storage, _, table_name] = storage_with_lock; + const auto & [database_name, _, storage, table_name] = storage_with_lock; bool allow_experimental_analyzer = context->getSettingsRef().allow_experimental_analyzer; - auto storage_stage - = storage->getQueryProcessingStage(context, processed_stage, storage_snapshot_, modified_query_info); /// Add virtual columns if we don't already have them. - Block plan_header = plan.getCurrentDataStream().header; + Block plan_header = child.plan.getCurrentDataStream().header; if (allow_experimental_analyzer) { @@ -1061,7 +1052,7 @@ void ReadFromMerge::updatePlan( String table_column = table_alias.empty() || processed_stage == QueryProcessingStage::FetchColumns ? 
"_table" : table_alias + "._table"; if (has_database_virtual_column && common_header.has(database_column) - && storage_stage == QueryProcessingStage::FetchColumns && !plan_header.has(database_column)) + && child.stage == QueryProcessingStage::FetchColumns && !plan_header.has(database_column)) { ColumnWithTypeAndName column; column.name = database_column; @@ -1069,13 +1060,13 @@ void ReadFromMerge::updatePlan( column.column = column.type->createColumnConst(0, Field(database_name)); auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto expression_step = std::make_unique(plan.getCurrentDataStream(), adding_column_dag); - plan.addStep(std::move(expression_step)); - plan_header = plan.getCurrentDataStream().header; + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), adding_column_dag); + child.plan.addStep(std::move(expression_step)); + plan_header = child.plan.getCurrentDataStream().header; } if (has_table_virtual_column && common_header.has(table_column) - && storage_stage == QueryProcessingStage::FetchColumns && !plan_header.has(table_column)) + && child.stage == QueryProcessingStage::FetchColumns && !plan_header.has(table_column)) { ColumnWithTypeAndName column; column.name = table_column; @@ -1083,9 +1074,9 @@ void ReadFromMerge::updatePlan( column.column = column.type->createColumnConst(0, Field(table_name)); auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto expression_step = std::make_unique(plan.getCurrentDataStream(), adding_column_dag); - plan.addStep(std::move(expression_step)); - plan_header = plan.getCurrentDataStream().header; + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), adding_column_dag); + child.plan.addStep(std::move(expression_step)); + plan_header = child.plan.getCurrentDataStream().header; } } else @@ -1098,9 +1089,9 @@ void ReadFromMerge::updatePlan( column.column = column.type->createColumnConst(0, Field(database_name)); auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto expression_step = std::make_unique(plan.getCurrentDataStream(), adding_column_dag); - plan.addStep(std::move(expression_step)); - plan_header = plan.getCurrentDataStream().header; + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), adding_column_dag); + child.plan.addStep(std::move(expression_step)); + plan_header = child.plan.getCurrentDataStream().header; } if (has_table_virtual_column && common_header.has("_table") && !plan_header.has("_table")) @@ -1111,53 +1102,30 @@ void ReadFromMerge::updatePlan( column.column = column.type->createColumnConst(0, Field(table_name)); auto adding_column_dag = ActionsDAG::makeAddingColumnActions(std::move(column)); - auto expression_step = std::make_unique(plan.getCurrentDataStream(), adding_column_dag); - plan.addStep(std::move(expression_step)); - plan_header = plan.getCurrentDataStream().header; + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), adding_column_dag); + child.plan.addStep(std::move(expression_step)); + plan_header = child.plan.getCurrentDataStream().header; } } - - /// Subordinary tables could have different but convertible types, like numeric types of different width. - /// We must return streams with structure equals to structure of Merge table. 
- convertAndFilterSourceStream( - header, modified_query_info, storage_snapshot_, aliases, row_policy_data_opt, context, plan, storage_stage); - - for (const auto & filter_info : pushed_down_filters) - { - auto filter_step = std::make_unique( - plan.getCurrentDataStream(), - filter_info.actions->clone(), - filter_info.column_name, - filter_info.do_remove_column); - - plan.addStep(std::move(filter_step)); - } } QueryPipelineBuilderPtr ReadFromMerge::buildPipeline( - QueryPlan & plan, - const StorageSnapshotPtr & storage_snapshot_, - SelectQueryInfo & modified_query_info, - QueryProcessingStage::Enum processed_stage, - const StorageWithLockAndName & storage_with_lock) const + ChildPlan & child, + QueryProcessingStage::Enum processed_stage) const { - if (!plan.isInitialized()) + if (!child.plan.isInitialized()) return nullptr; - const auto & [database_name, storage, _, table_name] = storage_with_lock; - auto storage_stage - = storage->getQueryProcessingStage(context, processed_stage, storage_snapshot_, modified_query_info); - auto optimisation_settings = QueryPlanOptimizationSettings::fromContext(context); /// All optimisations will be done at plans creation optimisation_settings.optimize_plan = false; - auto builder = plan.buildQueryPipeline(optimisation_settings, BuildQueryPipelineSettings::fromContext(context)); + auto builder = child.plan.buildQueryPipeline(optimisation_settings, BuildQueryPipelineSettings::fromContext(context)); if (!builder->initialized()) return builder; bool allow_experimental_analyzer = context->getSettingsRef().allow_experimental_analyzer; - if (processed_stage > storage_stage || (allow_experimental_analyzer && processed_stage != QueryProcessingStage::FetchColumns)) + if (processed_stage > child.stage || (allow_experimental_analyzer && processed_stage != QueryProcessingStage::FetchColumns)) { /** Materialization is needed, since from distributed storage the constants come materialized. 
* If you do not do this, different types (Const and non-Const) columns will be produced in different threads, @@ -1169,9 +1137,8 @@ QueryPipelineBuilderPtr ReadFromMerge::buildPipeline( return builder; } -QueryPlan ReadFromMerge::createPlanForTable( +ReadFromMerge::ChildPlan ReadFromMerge::createPlanForTable( const StorageSnapshotPtr & storage_snapshot_, - const Aliases & table_aliases, SelectQueryInfo & modified_query_info, QueryProcessingStage::Enum processed_stage, UInt64 max_block_size, @@ -1253,20 +1220,7 @@ QueryPlan ReadFromMerge::createPlanForTable( } } - updatePlan( - plan, - storage_snapshot_, - modified_query_info, - common_processed_stage, - common_header, - table_aliases, - row_policy_data_opt, - storage_with_lock); - - if (plan.isInitialized()) - plan.optimize(QueryPlanOptimizationSettings::fromContext(modified_context)); - - return plan; + return ChildPlan{std::move(plan), storage_stage}; } ReadFromMerge::RowPolicyData::RowPolicyData(RowPolicyFilterPtr row_policy_filter_ptr, @@ -1506,10 +1460,9 @@ void ReadFromMerge::convertAndFilterSourceStream( const Aliases & aliases, const RowPolicyDataOpt & row_policy_data_opt, ContextPtr local_context, - QueryPlan & plan, - QueryProcessingStage::Enum processed_stage) + ChildPlan & child) { - Block before_block_header = plan.getCurrentDataStream().header; + Block before_block_header = child.plan.getCurrentDataStream().header; auto storage_sample_block = snapshot->metadata->getSampleBlock(); auto pipe_columns = before_block_header.getNamesAndTypesList(); @@ -1535,8 +1488,8 @@ void ReadFromMerge::convertAndFilterSourceStream( throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected to have 1 output but got {}", nodes.size()); actions_dag->addOrReplaceInOutputs(actions_dag->addAlias(*nodes.front(), alias.name)); - auto expression_step = std::make_unique(plan.getCurrentDataStream(), actions_dag); - plan.addStep(std::move(expression_step)); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), actions_dag); + child.plan.addStep(std::move(expression_step)); } } else @@ -1550,28 +1503,26 @@ void ReadFromMerge::convertAndFilterSourceStream( auto dag = std::make_shared(pipe_columns); auto actions_dag = expression_analyzer.getActionsDAG(true, false); - auto expression_step = std::make_unique(plan.getCurrentDataStream(), actions_dag); - plan.addStep(std::move(expression_step)); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), actions_dag); + child.plan.addStep(std::move(expression_step)); } } ActionsDAG::MatchColumnsMode convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Name; if (local_context->getSettingsRef().allow_experimental_analyzer - && (processed_stage != QueryProcessingStage::FetchColumns || dynamic_cast(&snapshot->storage) != nullptr)) + && (child.stage != QueryProcessingStage::FetchColumns || dynamic_cast(&snapshot->storage) != nullptr)) convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Position; if (row_policy_data_opt) - { - row_policy_data_opt->addFilterTransform(plan); - } + row_policy_data_opt->addFilterTransform(child.plan); - auto convert_actions_dag = ActionsDAG::makeConvertingActions(plan.getCurrentDataStream().header.getColumnsWithTypeAndName(), + auto convert_actions_dag = ActionsDAG::makeConvertingActions(child.plan.getCurrentDataStream().header.getColumnsWithTypeAndName(), header.getColumnsWithTypeAndName(), convert_actions_match_columns_mode); - auto expression_step = std::make_unique(plan.getCurrentDataStream(), convert_actions_dag); - 
plan.addStep(std::move(expression_step)); + auto expression_step = std::make_unique(child.plan.getCurrentDataStream(), convert_actions_dag); + child.plan.addStep(std::move(expression_step)); } const ReadFromMerge::StorageListWithLocks & ReadFromMerge::getSelectedTables() diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 42544676bd8..94b34256d02 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -235,8 +235,7 @@ private: struct ChildPlan { QueryPlan plan; - Aliases table_aliases; - RowPolicyDataOpt row_policy_data_opt; + QueryProcessingStage::Enum stage; }; /// Store read plan for each child table. @@ -250,9 +249,8 @@ private: void filterTablesAndCreateChildrenPlans(); - QueryPlan createPlanForTable( + ChildPlan createPlanForTable( const StorageSnapshotPtr & storage_snapshot, - const Aliases & table_aliases, SelectQueryInfo & query_info, QueryProcessingStage::Enum processed_stage, UInt64 max_block_size, @@ -262,22 +260,15 @@ private: ContextMutablePtr modified_context, size_t streams_num) const; - void updatePlan( - QueryPlan & plan, - const StorageSnapshotPtr & storage_snapshot, + void addVirtualColumns( + ChildPlan & child, SelectQueryInfo & modified_query_info, QueryProcessingStage::Enum processed_stage, - const Block & header, - const Aliases & aliases, - const RowPolicyDataOpt & row_policy_data_opt, const StorageWithLockAndName & storage_with_lock) const; QueryPipelineBuilderPtr buildPipeline( - QueryPlan & plan, - const StorageSnapshotPtr & storage_snapshot, - SelectQueryInfo & modified_query_info, - QueryProcessingStage::Enum processed_stage, - const StorageWithLockAndName & storage_with_lock) const; + ChildPlan & child, + QueryProcessingStage::Enum processed_stage) const; static void convertAndFilterSourceStream( const Block & header, @@ -286,8 +277,7 @@ private: const Aliases & aliases, const RowPolicyDataOpt & row_policy_data_opt, ContextPtr context, - QueryPlan & plan, - QueryProcessingStage::Enum processed_stage); + ChildPlan & child); StorageMerge::StorageListWithLocks getSelectedTables( ContextPtr query_context, From 2d2ebc918a172dfe76968210afa3b5cfb2cbfe96 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Tue, 11 Jun 2024 14:42:48 +0200 Subject: [PATCH 123/215] Update odbc-bridge.md --- docs/en/operations/utilities/odbc-bridge.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/utilities/odbc-bridge.md b/docs/en/operations/utilities/odbc-bridge.md index abb8860880e..eb849c6b6ae 100644 --- a/docs/en/operations/utilities/odbc-bridge.md +++ b/docs/en/operations/utilities/odbc-bridge.md @@ -18,7 +18,7 @@ This tool works via HTTP, not via pipes, shared memory, or TCP because: However it can be used as standalone tool from command line with the following parameters in POST-request URL: - `connection_string` -- ODBC connection string. -- `columns` -- columns in ClickHouse NamesAndTypesList format, name in backticks, +- `sample_block` -- columns description in ClickHouse NamesAndTypesList format, name in backticks, type as string. Name and type are space separated, rows separated with newline. - `max_block_size` -- optional parameter, sets maximum size of single block. 
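
As an illustration, a standalone run with those parameters could look roughly like the sketch below; the HTTP port, endpoint path, DSN, and the `query` parameter name are assumptions here — only `connection_string`, `sample_block` and `max_block_size` come from the description above.

```bash
# Start the bridge as a standalone HTTP server (port is an assumption; check --help for the default).
clickhouse-odbc-bridge --http-port 9018 --daemon

# POST a query. sample_block lists the expected columns: name in backticks,
# type after a space, one column per line.
curl \
    --data-urlencode "connection_string=DSN=ClickHouse;DATABASE=stat" \
    --data-urlencode "sample_block=\`PageID\` String
\`ImpID\` String
\`AdType\` String" \
    --data-urlencode "max_block_size=8192" \
    --data-urlencode "query=SELECT PageID, ImpID, AdType FROM Keys" \
    "http://localhost:9018/" > result.txt
```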
From fdc2f156b1e472a81acfc8f34124fe5c396f8144 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Tue, 11 Jun 2024 12:50:16 +0000 Subject: [PATCH 124/215] time_virtual_col_tests: just one column in test_read_subcolumn_time --- .../test_storage_azure_blob_storage/test.py | 10 +++----- tests/integration/test_storage_hdfs/test.py | 6 ++--- tests/integration/test_storage_s3/test.py | 25 ++++++++----------- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index e7aa7d052a5..d986c1f9746 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -797,18 +797,16 @@ def test_read_subcolumn_time(cluster): node, f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumn_time.tsv', " f"'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto'," - f" 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)", + f" 'a UInt32') select (42)", ) res = node.query( - f"select a.b.d, _path, a.b, _file, dateDiff('minute', _time, now()) < 59, a.e from azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumn_time.tsv'," + f"select a, dateDiff('minute', _time, now()) < 59 from azureBlobStorage('{storage_account_url}', 'cont', 'test_subcolumn_time.tsv'," f" 'devstoreaccount1', 'Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==', 'auto', 'auto'," - f" 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" + f" 'a UInt32')" ) - assert ( - res == "2\tcont/test_subcolumn_time.tsv\t(1,2)\ttest_subcolumn_time.tsv\t1\t3\n" - ) + assert res == "42\t1\n" def test_read_from_not_existing_container(cluster): diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 15d9ee0bb26..47d8f44c0b7 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -1003,14 +1003,14 @@ def test_read_subcolumn_time(started_cluster): node = started_cluster.instances["node1"] node.query( - f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumn_time.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)" + f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumn_time.tsv', auto, 'a UInt32') select (42)" ) res = node.query( - f"select a.b.d, _path, a.b, _file, dateDiff('minute', _time, now()) < 59, a.e from hdfs('hdfs://hdfs1:9000/test_subcolumn_time.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" + f"select a, dateDiff('minute', _time, now()) < 59 from hdfs('hdfs://hdfs1:9000/test_subcolumn_time.tsv', auto, 'a UInt32')" ) - assert res == "2\ttest_subcolumn_time.tsv\t(1,2)\ttest_subcolumn_time.tsv\t1\t3\n" + assert res == "42\t1\n" def test_union_schema_inference_mode(started_cluster): diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index eff066739c0..8a5bac8b392 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -553,16 +553,13 @@ def test_multipart(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) # select uploaded data from many threads - select_query = ( - "select sum(column1), sum(column2), sum(column3) " - "from s3('http://{host}:{port}/{bucket}/{filename}', 
{auth}'CSV', '{table_format}')".format( - host=started_cluster.minio_redirect_host, - port=started_cluster.minio_redirect_port, - bucket=bucket, - filename=filename, - auth=maybe_auth, - table_format=table_format, - ) + select_query = "select sum(column1), sum(column2), sum(column3) " "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( + host=started_cluster.minio_redirect_host, + port=started_cluster.minio_redirect_port, + bucket=bucket, + filename=filename, + auth=maybe_auth, + table_format=table_format, ) try: select_result = run_query( @@ -2154,16 +2151,14 @@ def test_read_subcolumn_time(started_cluster): instance = started_cluster.instances["dummy"] instance.query( - f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)" + f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a UInt32') select (42)" ) res = instance.query( - f"select a.b.d, _path, a.b, _file, dateDiff('minute', _time, now()) < 59, a.e from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)')" + f"select a, dateDiff('minute', _time, now()) < 59 from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a UInt32')" ) - assert ( - res == "2\troot/test_subcolumn_time.tsv\t(1,2)\ttest_subcolumn_time.tsv\t1\t3\n" - ) + assert res == "42\t1\n" def test_filtering_by_file_or_path(started_cluster): From e47bbfb7f29a591d9b8e2432c4626054a6011dc2 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 11 Jun 2024 14:08:16 +0000 Subject: [PATCH 125/215] Remove upstream abseil repository --- .gitmodules | 3 --- contrib/abseil-cpp | 1 - 2 files changed, 4 deletions(-) delete mode 160000 contrib/abseil-cpp diff --git a/.gitmodules b/.gitmodules index 28696428e8c..a6ad00e434b 100644 --- a/.gitmodules +++ b/.gitmodules @@ -161,9 +161,6 @@ [submodule "contrib/xz"] path = contrib/xz url = https://github.com/xz-mirror/xz -[submodule "contrib/abseil-cpp"] - path = contrib/abseil-cpp - url = https://github.com/abseil/abseil-cpp [submodule "contrib/dragonbox"] path = contrib/dragonbox url = https://github.com/ClickHouse/dragonbox diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp deleted file mode 160000 index 696b32788ca..00000000000 --- a/contrib/abseil-cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 696b32788ca887881547380530926314c521ea7d From e5dcf75968b44a10e521bd9c1c106621a56ec7cb Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 11 Jun 2024 14:09:56 +0000 Subject: [PATCH 126/215] Add forked abseil submodule back --- .gitmodules | 3 +++ contrib/abseil-cpp | 1 + 2 files changed, 4 insertions(+) create mode 160000 contrib/abseil-cpp diff --git a/.gitmodules b/.gitmodules index a6ad00e434b..6d64c52ce00 100644 --- a/.gitmodules +++ b/.gitmodules @@ -161,6 +161,9 @@ [submodule "contrib/xz"] path = contrib/xz url = https://github.com/xz-mirror/xz +[submodule "abseil"] + path = contrib/abseil-cpp + url = https://github.com/ClickHouse/abseil-cpp.git [submodule "contrib/dragonbox"] path = contrib/dragonbox url = https://github.com/ClickHouse/dragonbox diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp new file mode 160000 index 00000000000..3916ba76a98 --- /dev/null +++ b/contrib/abseil-cpp @@ -0,0 +1 @@ 
+Subproject commit 3916ba76a98d3082414a10977e10bdebfdf3b177 From af83bc92ced8c08325924b5a77bfad210eb48149 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 11 Jun 2024 14:16:38 +0000 Subject: [PATCH 127/215] Switch to same HEAD as before but with s390x-breaking commit reverted --- contrib/abseil-cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/abseil-cpp b/contrib/abseil-cpp index 3916ba76a98..a3c4dd3e77f 160000 --- a/contrib/abseil-cpp +++ b/contrib/abseil-cpp @@ -1 +1 @@ -Subproject commit 3916ba76a98d3082414a10977e10bdebfdf3b177 +Subproject commit a3c4dd3e77f28b526efbb0eb394b72e29c633936 From 3680ff07ead49abf2af18367f450aa1cedfbca13 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 11 Jun 2024 16:34:53 +0200 Subject: [PATCH 128/215] Analyzer: Fix AggregateFunctionsArithmericOperationsPass --- .../Passes/AggregateFunctionsArithmericOperationsPass.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp index 9153bc4eca2..e6798a792dd 100644 --- a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp +++ b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp @@ -51,7 +51,7 @@ public: using Base = InDepthQueryTreeVisitorWithContext; using Base::Base; - void leaveImpl(QueryTreeNodePtr & node) + void enterImpl(QueryTreeNodePtr & node) { if (!getSettings().optimize_arithmetic_operations_in_aggregate_functions) return; From 1c5e935dfa380fbf7a474810fc8594f3227fdbc3 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 11 Jun 2024 16:38:19 +0200 Subject: [PATCH 129/215] Small fix for 02340_parts_refcnt_mergetree https://s3.amazonaws.com/clickhouse-test-reports/61112/f8e3e95b97920c4bd9a21101a2d664e9b3ed60e8/stateless_tests__debug__[1_5].html --- tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh index b100f96befa..e7d95d8db72 100755 --- a/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh +++ b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh @@ -58,7 +58,7 @@ function check_refcnt_for_table() $CLICKHOUSE_CLIENT -q "select table, name, refcount>=6 from system.parts where database = '$CLICKHOUSE_DATABASE' and table = '$table' and refcount >= 3" # Kill the query gracefully. - kill -INT $PID + kill -INT $PID ||: wait $PID grep -F Exception "$log_file" | grep -v -F QUERY_WAS_CANCELLED rm -f "${log_file:?}" From 62b732a1fa97144abb5148dd885cc3490a5f2686 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 11 Jun 2024 14:41:23 +0000 Subject: [PATCH 130/215] Remove unused code. --- src/Interpreters/InterpreterSelectQuery.cpp | 47 --------------------- src/Planner/PlannerJoinTree.cpp | 1 - src/Storages/SelectQueryInfo.h | 4 -- src/Storages/StorageMerge.cpp | 2 - 4 files changed, 54 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index f8f3867dfd4..6046c5ca34d 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -2372,49 +2372,6 @@ UInt64 InterpreterSelectQuery::maxBlockSizeByLimit() const return 0; } -/** Storages can rely that filters that for storage will be available for analysis before - * plan is fully constructed and optimized. 
- * - * StorageMerge common header calculation and prewhere push-down relies on this. - * - * This is similar to Planner::collectFiltersForAnalysis - */ -void collectFiltersForAnalysis( - const ASTPtr & query_ptr, - const ContextPtr & query_context, - const StorageSnapshotPtr & storage_snapshot, - const SelectQueryOptions & options, - SelectQueryInfo & query_info) -{ - auto get_column_options = GetColumnsOptions(GetColumnsOptions::All).withExtendedObjects().withVirtuals(); - - auto dummy = std::make_shared( - storage_snapshot->storage.getStorageID(), ColumnsDescription(storage_snapshot->getColumns(get_column_options)), storage_snapshot); - - QueryPlan query_plan; - InterpreterSelectQuery(query_ptr, query_context, dummy, dummy->getInMemoryMetadataPtr(), options).buildQueryPlan(query_plan); - - auto optimization_settings = QueryPlanOptimizationSettings::fromContext(query_context); - query_plan.optimize(optimization_settings); - - std::vector nodes_to_process; - nodes_to_process.push_back(query_plan.getRootNode()); - - while (!nodes_to_process.empty()) - { - const auto * node_to_process = nodes_to_process.back(); - nodes_to_process.pop_back(); - nodes_to_process.insert(nodes_to_process.end(), node_to_process->children.begin(), node_to_process->children.end()); - - auto * read_from_dummy = typeid_cast(node_to_process->step.get()); - if (!read_from_dummy) - continue; - - query_info.filter_actions_dag = read_from_dummy->getFilterActionsDAG(); - query_info.optimized_prewhere_info = read_from_dummy->getPrewhereInfo(); - } -} - void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan) { auto & query = getSelectQuery(); @@ -2544,10 +2501,6 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc } else if (storage) { - // if (shouldMoveToPrewhere() && settings.query_plan_optimize_prewhere && settings.query_plan_enable_optimizations - // && typeid_cast(storage.get())) - // collectFiltersForAnalysis(query_ptr, context, storage_snapshot, options, query_info); - /// Table. if (max_streams == 0) max_streams = 1; diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 1b2a55a50b0..851603e805a 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -643,7 +643,6 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres auto table_expression_query_info = select_query_info; table_expression_query_info.table_expression = table_expression; table_expression_query_info.filter_actions_dag = table_expression_data.getFilterActions(); - table_expression_query_info.optimized_prewhere_info = table_expression_data.getPrewhereInfo(); table_expression_query_info.analyzer_can_use_parallel_replicas_on_follower = table_node == planner_context->getGlobalPlannerContext()->parallel_replicas_table; size_t max_streams = settings.max_threads; diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 11e2a2fc5e7..6901b6cb5ff 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -208,10 +208,6 @@ struct SelectQueryInfo bool need_aggregate = false; PrewhereInfoPtr prewhere_info; - /// Generated by pre-run optimization with StorageDummy. - /// Currently it's used to support StorageMerge PREWHERE optimization. 
- PrewhereInfoPtr optimized_prewhere_info; - /// If query has aggregate functions bool has_aggregates = false; diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 92350d4c5bc..ed3f43367dd 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -888,8 +888,6 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextMutablePtr & mo const StorageID current_storage_id = storage->getStorageID(); SelectQueryInfo modified_query_info = query_info; - if (modified_query_info.optimized_prewhere_info && !modified_query_info.prewhere_info) - modified_query_info.prewhere_info = modified_query_info.optimized_prewhere_info; if (modified_query_info.planner_context) modified_query_info.planner_context = std::make_shared(modified_context, modified_query_info.planner_context); From 66ac187b7cf34006f0067a88d13be71b43b271ab Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 11 Jun 2024 16:42:39 +0200 Subject: [PATCH 131/215] fix test --- tests/queries/0_stateless/02922_server_exit_code.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02922_server_exit_code.sh b/tests/queries/0_stateless/02922_server_exit_code.sh index 60049902410..ded0dc4763f 100755 --- a/tests/queries/0_stateless/02922_server_exit_code.sh +++ b/tests/queries/0_stateless/02922_server_exit_code.sh @@ -7,6 +7,6 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # We will check that the server's exit code corresponds to the exception code if it was terminated after exception. # In this example, we provide an invalid path to the server's config, ignore its logs and check the exit code. -# The exception code is 400 = CANNOT_STAT, so the exit code will be 400 % 256. +# The exception code is 76 = CANNOT_OPEN_FILE, so the exit code will be 76 % 256. -${CLICKHOUSE_SERVER_BINARY} -- --path /dev/null 2>/dev/null; [[ "$?" == "$((400 % 256))" ]] && echo 'Ok' || echo 'Fail' +${CLICKHOUSE_SERVER_BINARY} -- --path /dev/null 2>/dev/null; [[ "$?" 
== "$((76 % 256))" ]] && echo 'Ok' || echo 'Fail' From c795158ea9b1e32ccccba88a13bdc32622fa7a52 Mon Sep 17 00:00:00 2001 From: Ilya Golshtein Date: Tue, 11 Jun 2024 14:45:00 +0000 Subject: [PATCH 132/215] time_virtual_col_tests: make black happy --- tests/integration/test_storage_s3/test.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 8a5bac8b392..781b68036e6 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -553,13 +553,16 @@ def test_multipart(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) # select uploaded data from many threads - select_query = "select sum(column1), sum(column2), sum(column3) " "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( - host=started_cluster.minio_redirect_host, - port=started_cluster.minio_redirect_port, - bucket=bucket, - filename=filename, - auth=maybe_auth, - table_format=table_format, + select_query = ( + "select sum(column1), sum(column2), sum(column3) " + "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( + host=started_cluster.minio_redirect_host, + port=started_cluster.minio_redirect_port, + bucket=bucket, + filename=filename, + auth=maybe_auth, + table_format=table_format, + ) ) try: select_result = run_query( From 9f54007eb80e237b99001337767454a98fd8d6a3 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 11 Jun 2024 14:47:35 +0000 Subject: [PATCH 133/215] Fix style. --- src/Analyzer/InterpolateNode.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Analyzer/InterpolateNode.cpp b/src/Analyzer/InterpolateNode.cpp index 4b8797282a2..97dc79f565b 100644 --- a/src/Analyzer/InterpolateNode.cpp +++ b/src/Analyzer/InterpolateNode.cpp @@ -13,7 +13,7 @@ namespace DB InterpolateNode::InterpolateNode(std::shared_ptr expression_, QueryTreeNodePtr interpolate_expression_) : IQueryTreeNode(children_size) { - if(expression_) + if (expression_) expression_name = expression_->getIdentifier().getFullName(); children[expression_child_index] = std::move(expression_); From 39914d848da11ba0d14cf345fb670667a1d4cde9 Mon Sep 17 00:00:00 2001 From: Mikhail Gorshkov Date: Tue, 11 Jun 2024 14:54:50 +0000 Subject: [PATCH 134/215] Docs updated, __msan_unpoison usage fixed --- .../functions/string-functions.md | 170 ++++++++++++++++-- src/Functions/FunctionBase64Conversion.h | 18 +- src/Functions/base64Decode.cpp | 10 +- src/Functions/base64Encode.cpp | 10 +- src/Functions/base64UrlDecode.cpp | 21 +++ src/Functions/base64UrlEncode.cpp | 21 +++ src/Functions/tryBase64Decode.cpp | 10 +- src/Functions/tryBase64UrlDecode.cpp | 21 +++ ...5_all_new_functions_must_be_documented.sql | 1 - 9 files changed, 251 insertions(+), 31 deletions(-) create mode 100644 src/Functions/base64UrlDecode.cpp create mode 100644 src/Functions/base64UrlEncode.cpp create mode 100644 src/Functions/tryBase64UrlDecode.cpp diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 6c8f09e74ce..7a4b3d01b97 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1136,41 +1136,153 @@ SELECT tryBase58Decode('3dc8KtHrwM') as res, tryBase58Decode('invalid') as res_i ## base64Encode -Encodes a String or FixedString as 
base64. +Encodes a String or FixedString as base64, according to [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-4). Alias: `TO_BASE64`. +**Syntax** + +```sql +base64Encode(plaintext) +``` + +**Arguments** + +- `plaintext` — [String](../data-types/string.md) column or constant. + +**Returned value** + +- A string containing the encoded value of the argument. + +**Example** + +``` sql +SELECT base64Encode('clickhouse'); +``` + +Result: + +```result +┌─base64Encode('clickhouse')─┐ +│ Y2xpY2tob3VzZQ== │ +└────────────────────────────┘ +``` + ## base64UrlEncode -Encodes an URL (String or FixedString) as base64 according to [RFC 4648](https://tools.ietf.org/html/rfc4648). +Encodes an URL (String or FixedString) as base64 with URL-specific modifications, according to [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-5). + +**Syntax** + +```sql +base64UrlEncode(url) +``` + +**Arguments** + +- `url` — [String](../data-types/string.md) column or constant. + +**Returned value** + +- A string containing the encoded value of the argument. + +**Example** + +``` sql +SELECT base64UrlEncode('https://clickhouse.com'); +``` + +Result: + +```result +┌─base64UrlEncode('https://clickhouse.com')─┐ +│ aHR0cDovL2NsaWNraG91c2UuY29t │ +└───────────────────────────────────────────┘ +``` ## base64Decode -Decodes a base64-encoded String or FixedString. Throws an exception in case of error. +Accepts a String and decodes it from base64, according to [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-4). Throws an exception in case of an error. Alias: `FROM_BASE64`. +**Syntax** + +```sql +base64Decode(encoded) +``` + +**Arguments** + +- `encoded` — [String](../data-types/string.md) column or constant. If the string is not a valid Base64-encoded value, an exception is thrown. + +**Returned value** + +- A string containing the decoded value of the argument. + +**Example** + +``` sql +SELECT base64Decode('Y2xpY2tob3VzZQ=='); +``` + +Result: + +```result +┌─base64Decode('Y2xpY2tob3VzZQ==')─┐ +│ clickhouse │ +└──────────────────────────────────┘ +``` + ## base64UrlDecode -Decodes a base64-encoded URL (String or FixedString) according to [RFC 4648](https://tools.ietf.org/html/rfc4648). Throws an exception in case of error. +Accepts a base64-encoded URL and decodes it from base64 with URL-specific modifications, according to [RFC 4648](https://datatracker.ietf.org/doc/html/rfc4648#section-5). Throws an exception in case of an error. + +**Syntax** + +```sql +base64UrlDecode(encodedUrl) +``` + +**Arguments** + +- `encodedUrl` — [String](../data-types/string.md) column or constant. If the string is not a valid Base64-encoded value with URL-specific modifications, an exception is thrown. + +**Returned value** + +- A string containing the decoded value of the argument. + +**Example** + +``` sql +SELECT base64UrlDecode('aHR0cDovL2NsaWNraG91c2UuY29t'); +``` + +Result: + +```result +┌─base64UrlDecode('aHR0cDovL2NsaWNraG91c2UuY29t')─┐ +│ https://clickhouse.com │ +└─────────────────────────────────────────────────┘ +``` ## tryBase64Decode Like `base64Decode` but returns an empty string in case of error. -## tryBase64UrlDecode - -Like `base64UrlDecode` but returns an empty string in case of error. - **Syntax** ```sql tryBase64Decode(encoded) ``` -**Parameters** +**Arguments** -- `encoded`: [String](../data-types/string.md) column or constant. If the string is not a valid Base58-encoded value, returns an empty string in case of error. 
+- `encoded`: [String](../data-types/string.md) column or constant. If the string is not a valid Base64-encoded value, returns an empty string. + +**Returned value** + +- A string containing the decoded value of the argument. **Examples** @@ -1181,9 +1293,41 @@ SELECT tryBase64Decode('RW5jb2RlZA==') as res, tryBase64Decode('invalid') as res ``` ```response -┌─res─────┬─res_invalid─┐ -│ Encoded │ │ -└─────────┴─────────────┘ +┌─res────────┬─res_invalid─┐ +│ clickhouse │ │ +└────────────┴─────────────┘ +``` + +## tryBase64UrlDecode + +Like `base64UrlDecode` but returns an empty string in case of error. + +**Syntax** + +```sql +tryBase64UrlDecode(encodedUrl) +``` + +**Parameters** + +- `encodedUrl`: [String](../data-types/string.md) column or constant. If the string is not a valid Base64-encoded value with URL-specific modifications, returns an empty string. + +**Returned value** + +- A string containing the decoded value of the argument. + +**Examples** + +Query: + +```sql +SELECT tryBase64UrlDecode('aHR0cDovL2NsaWNraG91c2UuY29t') as res, tryBase64Decode('aHR0cHM6Ly9jbGlja') as res_invalid; +``` + +```response +┌─res────────────────────┬─res_invalid─┐ +│ https://clickhouse.com │ │ +└────────────────────────┴─────────────┘ ``` ## endsWith {#endswith} diff --git a/src/Functions/FunctionBase64Conversion.h b/src/Functions/FunctionBase64Conversion.h index e569a351591..05914be3837 100644 --- a/src/Functions/FunctionBase64Conversion.h +++ b/src/Functions/FunctionBase64Conversion.h @@ -33,7 +33,7 @@ inline std::string preprocessBase64Url(std::string_view src) std::string padded_src; padded_src.reserve(src.size() + 3); - // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#page-7 + // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#section-5 for (auto s : src) { switch (s) @@ -57,7 +57,7 @@ inline std::string preprocessBase64Url(std::string_view src) case 0: break; // no padding needed case 1: - padded_src.append("==="); // this case is impossible to occur, however, we'll insert padding anyway + padded_src.append("==="); // this case is impossible to occur with valid base64-URL encoded input, however, we'll insert padding anyway break; case 2: padded_src.append("=="); // two bytes padding @@ -72,7 +72,7 @@ inline std::string preprocessBase64Url(std::string_view src) inline size_t postprocessBase64Url(UInt8 * dst, size_t out_len) { - // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#page-7 + // Do symbol substitution as described in https://datatracker.ietf.org/doc/html/rfc4648#section-5 for (size_t i = 0; i < out_len; ++i) { switch (dst[i]) @@ -107,6 +107,10 @@ struct Base64Encode size_t outlen = 0; base64_encode(src.data(), src.size(), reinterpret_cast(dst), &outlen, 0); + /// Base64 library is using AVX-512 with some shuffle operations. + /// Memory sanitizer doesn't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle. + __msan_unpoison(dst, outlen); + if constexpr (variant == Base64Variant::Url) outlen = postprocessBase64Url(dst, outlen); @@ -242,10 +246,6 @@ private: const size_t src_length = src_offsets[row] - src_offset_prev - 1; const size_t outlen = Func::perform({src, src_length}, dst_pos); - /// Base64 library is using AVX-512 with some shuffle operations. - /// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle. 
- __msan_unpoison(dst_pos, outlen); - src += src_length + 1; dst_pos += outlen; *dst_pos = '\0'; @@ -280,10 +280,6 @@ private: { const auto outlen = Func::perform({src, src_n}, dst_pos); - /// Base64 library is using AVX-512 with some shuffle operations. - /// Memory sanitizer don't understand if there was uninitialized memory in SIMD register but it was not used in the result of shuffle. - __msan_unpoison(dst_pos, outlen); - src += src_n; dst_pos += outlen; *dst_pos = '\0'; diff --git a/src/Functions/base64Decode.cpp b/src/Functions/base64Decode.cpp index 2c0cf27c592..50278c4b0b2 100644 --- a/src/Functions/base64Decode.cpp +++ b/src/Functions/base64Decode.cpp @@ -7,8 +7,14 @@ namespace DB { REGISTER_FUNCTION(Base64Decode) { - factory.registerFunction>>(); - factory.registerFunction>>(); + FunctionDocumentation::Description description = R"(Accepts a String and decodes it from base64, according to RFC 4648 (https://datatracker.ietf.org/doc/html/rfc4648#section-4). Throws an exception in case of an error. Alias: FROM_BASE64.)"; + FunctionDocumentation::Syntax syntax = "base64Decode(encoded)"; + FunctionDocumentation::Arguments arguments = {{"encoded", "String column or constant. If the string is not a valid Base64-encoded value, an exception is thrown."}}; + FunctionDocumentation::ReturnedValue returned_value = "A string containing the decoded value of the argument."; + FunctionDocumentation::Examples examples = {{"Example", "SELECT base64Decode('Y2xpY2tob3VzZQ==')", "clickhouse"}}; + FunctionDocumentation::Categories categories = {"String encoding"}; + + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); /// MySQL compatibility alias. factory.registerAlias("FROM_BASE64", "base64Decode", FunctionFactory::CaseInsensitive); diff --git a/src/Functions/base64Encode.cpp b/src/Functions/base64Encode.cpp index 07ca28d6a87..d6e63c38a4c 100644 --- a/src/Functions/base64Encode.cpp +++ b/src/Functions/base64Encode.cpp @@ -7,8 +7,14 @@ namespace DB { REGISTER_FUNCTION(Base64Encode) { - factory.registerFunction>>(); - factory.registerFunction>>(); + FunctionDocumentation::Description description = R"(Encodes a String as base64, according to RFC 4648 (https://datatracker.ietf.org/doc/html/rfc4648#section-4). Alias: TO_BASE64.)"; + FunctionDocumentation::Syntax syntax = "base64Encode(plaintext)"; + FunctionDocumentation::Arguments arguments = {{"plaintext", "String column or constant."}}; + FunctionDocumentation::ReturnedValue returned_value = "A string containing the encoded value of the argument."; + FunctionDocumentation::Examples examples = {{"Example", "SELECT base64Encode('clickhouse')", "Y2xpY2tob3VzZQ=="}}; + FunctionDocumentation::Categories categories = {"String encoding"}; + + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); /// MySQL compatibility alias. 
factory.registerAlias("TO_BASE64", "base64Encode", FunctionFactory::CaseInsensitive); diff --git a/src/Functions/base64UrlDecode.cpp b/src/Functions/base64UrlDecode.cpp new file mode 100644 index 00000000000..59975d8f9d1 --- /dev/null +++ b/src/Functions/base64UrlDecode.cpp @@ -0,0 +1,21 @@ +#include + +#if USE_BASE64 +#include + +namespace DB +{ +REGISTER_FUNCTION(Base64UrlDecode) +{ + FunctionDocumentation::Description description = R"(Accepts a base64-encoded URL and decodes it from base64 with URL-specific modifications, according to RFC 4648 (https://datatracker.ietf.org/doc/html/rfc4648#section-5).)"; + FunctionDocumentation::Syntax syntax = "base64UrlDecode(encodedUrl)"; + FunctionDocumentation::Arguments arguments = {{"encodedUrl", "String column or constant. If the string is not a valid Base64-encoded value, an exception is thrown."}}; + FunctionDocumentation::ReturnedValue returned_value = "A string containing the decoded value of the argument."; + FunctionDocumentation::Examples examples = {{"Example", "SELECT base64UrlDecode('aHR0cDovL2NsaWNraG91c2UuY29t')", "https://clickhouse.com"}}; + FunctionDocumentation::Categories categories = {"String encoding"}; + + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); +} +} + +#endif diff --git a/src/Functions/base64UrlEncode.cpp b/src/Functions/base64UrlEncode.cpp new file mode 100644 index 00000000000..05d50170c14 --- /dev/null +++ b/src/Functions/base64UrlEncode.cpp @@ -0,0 +1,21 @@ +#include + +#if USE_BASE64 +#include + +namespace DB +{ +REGISTER_FUNCTION(Base64UrlEncode) +{ + FunctionDocumentation::Description description = R"(Encodes an URL (String or FixedString) as base64 with URL-specific modifications, according to RFC 4648 (https://datatracker.ietf.org/doc/html/rfc4648#section-5).)"; + FunctionDocumentation::Syntax syntax = "base64UrlEncode(url)"; + FunctionDocumentation::Arguments arguments = {{"url", "String column or constant."}}; + FunctionDocumentation::ReturnedValue returned_value = "A string containing the encoded value of the argument."; + FunctionDocumentation::Examples examples = {{"Example", "SELECT base64UrlEncode('https://clickhouse.com')", "aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ"}}; + FunctionDocumentation::Categories categories = {"String encoding"}; + + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); +} +} + +#endif diff --git a/src/Functions/tryBase64Decode.cpp b/src/Functions/tryBase64Decode.cpp index 25da111492d..08eabe93200 100644 --- a/src/Functions/tryBase64Decode.cpp +++ b/src/Functions/tryBase64Decode.cpp @@ -7,8 +7,14 @@ namespace DB { REGISTER_FUNCTION(TryBase64Decode) { - factory.registerFunction>>(); - factory.registerFunction>>(); + FunctionDocumentation::Description description = R"(Decodes a String or FixedString from base64, like base64Decode but returns an empty string in case of an error.)"; + FunctionDocumentation::Syntax syntax = "tryBase64Decode(encoded)"; + FunctionDocumentation::Arguments arguments = {{"encoded", "String column or constant. 
If the string is not a valid Base64-encoded value, returns an empty string."}}; + FunctionDocumentation::ReturnedValue returned_value = "A string containing the decoded value of the argument."; + FunctionDocumentation::Examples examples = {{"valid", "SELECT tryBase64Decode('Y2xpY2tob3VzZQ==')", "clickhouse"}, {"invalid", "SELECT tryBase64Decode('invalid')", ""}}; + FunctionDocumentation::Categories categories = {"String encoding"}; + + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); } } diff --git a/src/Functions/tryBase64UrlDecode.cpp b/src/Functions/tryBase64UrlDecode.cpp new file mode 100644 index 00000000000..b9aaf4f9273 --- /dev/null +++ b/src/Functions/tryBase64UrlDecode.cpp @@ -0,0 +1,21 @@ +#include + +#if USE_BASE64 +#include + +namespace DB +{ +REGISTER_FUNCTION(TryBase64UrlDecode) +{ + FunctionDocumentation::Description description = R"(Decodes an URL from base64, like base64UrlDecode but returns an empty string in case of an error.)"; + FunctionDocumentation::Syntax syntax = "tryBase64UrlDecode(encodedUrl)"; + FunctionDocumentation::Arguments arguments = {{"encodedUrl", "String column or constant. If the string is not a valid Base64-encoded value with URL-specific modifications, returns an empty string."}}; + FunctionDocumentation::ReturnedValue returned_value = "A string containing the decoded value of the argument."; + FunctionDocumentation::Examples examples = {{"valid", "SELECT tryBase64UrlDecode('aHR0cHM6Ly9jbGlja2hvdXNlLmNvbQ')", "https://clickhouse.com"}, {"invalid", "SELECT tryBase64UrlDecode('aHR0cHM6Ly9jbGlja')", ""}}; + FunctionDocumentation::Categories categories = {"String encoding"}; + + factory.registerFunction>>({description, syntax, arguments, returned_value, examples, categories}); +} +} + +#endif diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.sql b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.sql index 544f29e8a7d..e9deb778075 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.sql +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.sql @@ -3,7 +3,6 @@ SELECT name FROM system.functions WHERE NOT is_aggregate AND origin = 'System' AND alias_to = '' AND length(description) < 10 AND name NOT IN ( 'aes_decrypt_mysql', 'aes_encrypt_mysql', 'decrypt', 'encrypt', - 'base64Decode', 'base64Encode', 'tryBase64Decode', 'base64UrlDecode', 'base64UrlEncode', 'tryBase64UrlDecode', 'convertCharset', 'detectLanguage', 'detectLanguageMixed', 'geoToH3', From 5bfca13a4cf5743ea14d0c248dbc69a414c93f75 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 11 Jun 2024 18:04:38 +0200 Subject: [PATCH 135/215] Update test.py --- tests/integration/test_replicated_database/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index ea569939c1c..73b7ae265e4 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -404,6 +404,7 @@ def test_alter_detach_part(started_cluster, engine): main_node.query(f"INSERT INTO {database}.alter_detach VALUES (123)") if engine == "MergeTree": dummy_node.query(f"INSERT INTO {database}.alter_detach VALUES (456)") + main_node.query(f"SYSTEM SYNC REPLICA {database}.alter_detach PULL") main_node.query(f"ALTER TABLE {database}.alter_detach DETACH PART '{part_name}'") detached_parts_query = f"SELECT name FROM 
system.detached_parts WHERE database='{database}' AND table='alter_detach'" assert main_node.query(detached_parts_query) == f"{part_name}\n" From e0279e856f874c652d292d8c903975f51f2de3ba Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 11 Jun 2024 19:41:22 +0000 Subject: [PATCH 136/215] Fix #63792 (hopefully) --- contrib/openssl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/openssl b/contrib/openssl index f7b8721dfc6..5c4b034785b 160000 --- a/contrib/openssl +++ b/contrib/openssl @@ -1 +1 @@ -Subproject commit f7b8721dfc66abb147f24ca07b9c9d1d64f40f71 +Subproject commit 5c4b034785bf04f80380138cf49bf9743400f144 From 848054e85f743cee8797a7419163840764d76935 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Tue, 11 Jun 2024 21:06:45 +0000 Subject: [PATCH 137/215] Fix another false positive leak --- contrib/openssl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/openssl b/contrib/openssl index 5c4b034785b..67c0b63e578 160000 --- a/contrib/openssl +++ b/contrib/openssl @@ -1 +1 @@ -Subproject commit 5c4b034785bf04f80380138cf49bf9743400f144 +Subproject commit 67c0b63e578e4c751ac9edf490f5a96124fff8dc From d8366119c403429c2748f4ea372d0542701147f6 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 11 Jun 2024 22:04:37 +0100 Subject: [PATCH 138/215] impl --- src/Processors/QueryPlan/ReadFromLoopStep.cpp | 26 ++++++++++++------- ...op_engine_with_parallel_replicas.reference | 0 ...168_loop_engine_with_parallel_replicas.sql | 9 +++++++ 3 files changed, 26 insertions(+), 9 deletions(-) create mode 100644 tests/queries/0_stateless/03168_loop_engine_with_parallel_replicas.reference create mode 100644 tests/queries/0_stateless/03168_loop_engine_with_parallel_replicas.sql diff --git a/src/Processors/QueryPlan/ReadFromLoopStep.cpp b/src/Processors/QueryPlan/ReadFromLoopStep.cpp index 10436490a2a..2e5fa3ec9f7 100644 --- a/src/Processors/QueryPlan/ReadFromLoopStep.cpp +++ b/src/Processors/QueryPlan/ReadFromLoopStep.cpp @@ -1,14 +1,15 @@ -#include -#include -#include -#include -#include +#include +#include +#include #include #include -#include -#include +#include +#include #include -#include +#include +#include +#include +#include namespace DB { @@ -111,6 +112,13 @@ namespace DB std::unique_ptr executor; }; + static ContextPtr disableParallelReplicas(ContextPtr context) + { + auto modified_context = Context::createCopy(context); + modified_context->setSetting("allow_experimental_parallel_reading_from_replicas", Field(0)); + return modified_context; + } + ReadFromLoopStep::ReadFromLoopStep( const Names & column_names_, const SelectQueryInfo & query_info_, @@ -125,7 +133,7 @@ namespace DB column_names_, query_info_, storage_snapshot_, - context_) + disableParallelReplicas(context_)) , column_names(column_names_) , processed_stage(processed_stage_) , inner_storage(std::move(inner_storage_)) diff --git a/tests/queries/0_stateless/03168_loop_engine_with_parallel_replicas.reference b/tests/queries/0_stateless/03168_loop_engine_with_parallel_replicas.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03168_loop_engine_with_parallel_replicas.sql b/tests/queries/0_stateless/03168_loop_engine_with_parallel_replicas.sql new file mode 100644 index 00000000000..dfcb5de9f2a --- /dev/null +++ b/tests/queries/0_stateless/03168_loop_engine_with_parallel_replicas.sql @@ -0,0 +1,9 @@ +DROP DATABASE IF EXISTS 03147_db; +CREATE DATABASE IF NOT EXISTS 03147_db; +CREATE TABLE 03147_db.t (n Int8) ENGINE=MergeTree 
ORDER BY n; +INSERT INTO 03147_db.t SELECT * FROM numbers(10); +USE 03147_db; + +SET allow_experimental_parallel_reading_from_replicas = 2, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'parallel_replicas', max_parallel_replicas = 100; + +SELECT * FROM loop(03147_db.t) LIMIT 15 FORMAT Null; From 1ad648517556ca5a144ba2497867da6bfe463607 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 11 Jun 2024 22:50:32 +0000 Subject: [PATCH 139/215] Fix 'Tasks in BackgroundSchedulePool cannot throw' caused by MergeTreeData::loadUnexpectedDataParts() --- src/Storages/MergeTree/MergeTreeData.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 14a310364dc..89f39c65517 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1981,6 +1981,7 @@ void MergeTreeData::loadDataParts(bool skip_sanity_checks, std::optional runner(getUnexpectedPartsLoadingThreadPool().get(), "UnexpectedParts"); for (auto & load_state : unexpected_data_parts) @@ -2027,6 +2031,13 @@ void MergeTreeData::loadUnexpectedDataParts() unexpected_data_parts_cv.notify_all(); } } +catch (...) +{ + LOG_ERROR(log, "Loading of unexpected parts failed. " + "Will terminate to avoid undefined behaviour due to inconsistent set of parts. " + "Exception: {}", getCurrentExceptionMessage(true)); + std::terminate(); +} void MergeTreeData::loadOutdatedDataParts(bool is_async) try From 08504d7b1b0e66292fa4ebdbe522d21046b1ec2a Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Wed, 14 Jun 2023 20:42:46 +0000 Subject: [PATCH 140/215] Change default s3_throw_on_zero_files_match to true, document that pre-signed S3 URLs are not supported --- docs/en/sql-reference/table-functions/s3.md | 11 ++++++++++- src/Core/Settings.h | 6 +++--- src/Core/SettingsChangesHistory.h | 5 +++-- src/Storages/ObjectStorage/Azure/Configuration.cpp | 1 + src/Storages/ObjectStorage/HDFS/Configuration.cpp | 1 + src/Storages/ObjectStorage/S3/Configuration.cpp | 1 + src/Storages/ObjectStorage/StorageObjectStorage.h | 1 + .../ObjectStorage/StorageObjectStorageSource.cpp | 12 +++++++++--- .../ObjectStorage/StorageObjectStorageSource.h | 2 ++ src/Storages/S3Queue/StorageS3Queue.cpp | 2 +- 10 files changed, 32 insertions(+), 10 deletions(-) diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 1a7e2b8d66a..7538d66996f 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -248,7 +248,6 @@ FROM s3( LIMIT 5; ``` - ## Working with archives Suppose that we have several archive files with following URIs on S3: @@ -266,6 +265,16 @@ FROM s3( ); ``` +## Presigned URL + +Presigned URLs are currently not supported. Use `url()` table function instead: +```sql +SELECT * +FROM url( + 'https://example.amazonaws.com/f.csv?X-Amz-Security-Token=[...]' +) +``` + ## Virtual Columns {#virtual-columns} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 7f99243e285..bb8a7587a84 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -115,9 +115,9 @@ class IColumn; M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \ M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. 
It may lead to slightly higher memory usage", 0) \ M(Bool, azure_allow_parallel_part_upload, true, "Use multiple threads for azure multipart upload.", 0) \ - M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ - M(Bool, hdfs_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ - M(Bool, azure_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ + M(Bool, s3_throw_on_zero_files_match, true, "Throw an error, when ListObjects request cannot match any files", 0) \ + M(Bool, hdfs_throw_on_zero_files_match, true, "Throw an error, when ListObjects request cannot match any files", 0) \ + M(Bool, azure_throw_on_zero_files_match, true, "Throw an error, when ListObjects request cannot match any files", 0) \ M(Bool, s3_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in S3 table engine", 0) \ M(Bool, hdfs_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in HDFS table engine", 0) \ M(Bool, azure_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in AzureBlobStorage table engine", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index b447421671e..b4bb4716a8a 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -88,8 +88,9 @@ static const std::map StorageObjectStorageSourc iterator = std::make_unique( object_storage, configuration, predicate, virtual_columns, local_context, is_archive ? nullptr : read_keys, settings.list_object_keys_size, - settings.throw_on_zero_files_match, file_progress_callback); + settings.throw_on_zero_files_match, settings.throw_on_zero_files_match_setting_name, + file_progress_callback); } else { @@ -425,6 +426,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( ObjectInfos * read_keys_, size_t list_object_keys_size, bool throw_on_zero_files_match_, + const char * throw_on_zero_files_match_setting_name_, std::function file_progress_callback_) : IIterator("GlobIterator") , WithContext(context_) @@ -432,6 +434,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( , configuration(configuration_) , virtual_columns(virtual_columns_) , throw_on_zero_files_match(throw_on_zero_files_match_) + , throw_on_zero_files_match_setting_name(throw_on_zero_files_match_setting_name_) , read_keys(read_keys_) , file_progress_callback(file_progress_callback_) { @@ -484,8 +487,11 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne if (first_iteration && !object_info && throw_on_zero_files_match) { throw Exception(ErrorCodes::FILE_DOESNT_EXIST, - "Can not match any files with path {}", - configuration->getPath()); + "Can not match any files with path {}{}", + configuration->getPath(), + throw_on_zero_files_match_setting_name + ? 
fmt::format(" (this error can be suppressed by setting {} = false)", throw_on_zero_files_match_setting_name) + : ""); } first_iteration = false; return object_info; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index fd7c7aa7102..5e76d8e979f 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -168,6 +168,7 @@ public: ObjectInfos * read_keys_, size_t list_object_keys_size, bool throw_on_zero_files_match_, + const char * throw_on_zero_files_match_setting_name_, std::function file_progress_callback_ = {}); ~GlobIterator() override = default; @@ -184,6 +185,7 @@ private: const ConfigurationPtr configuration; const NamesAndTypesList virtual_columns; const bool throw_on_zero_files_match; + const char * throw_on_zero_files_match_setting_name; size_t index = 0; diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index afb75a21b21..7e26335c691 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -486,7 +486,7 @@ std::shared_ptr StorageS3Queue::createFileIterator { auto settings = configuration->getQuerySettings(local_context); auto glob_iterator = std::make_unique( - object_storage, configuration, predicate, getVirtualsList(), local_context, nullptr, settings.list_object_keys_size, settings.throw_on_zero_files_match); + object_storage, configuration, predicate, getVirtualsList(), local_context, nullptr, settings.list_object_keys_size, settings.throw_on_zero_files_match, settings.throw_on_zero_files_match_setting_name); return std::make_shared(files_metadata, std::move(glob_iterator), shutdown_called, log); } From 26e5d9a8675488a5502a356169990f88e53e64a1 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 11 Jun 2024 01:28:44 +0000 Subject: [PATCH 141/215] aspell --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 84682689934..c4b70de1f65 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1,4 +1,4 @@ -personal_ws-1.1 en 2758 +personal_ws-1.1 en 2912 AArch ACLs ALTERs @@ -722,6 +722,7 @@ Postgres PostgresSQL Precompiled Preprocess +Presigned PrettyCompact PrettyCompactMonoBlock PrettyCompactNoEscapes @@ -1936,9 +1937,9 @@ loghouse london lookups loongarch -lowcardinality lowCardinalityIndices lowCardinalityKeys +lowcardinality lowerUTF lowercased lttb From 13dd79f5b2a7d57e4e87e6e63849ceb8479dd495 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 11 Jun 2024 06:19:12 +0000 Subject: [PATCH 142/215] Fix tests --- tests/integration/test_storage_hdfs/test.py | 13 +++++++------ .../02481_s3_throw_if_mismatch_files.reference | 4 ++-- .../02481_s3_throw_if_mismatch_files.sql | 4 ++-- ...resigned_url_and_url_with_special_characters.sql | 4 ++-- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index cda2b8694c6..9d17686b2cc 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -111,7 +111,7 @@ def test_storage_with_multidirectory_glob(started_cluster): try: node1.query( - "SELECT * FROM 
hdfs('hdfs://hdfs1:9000/multiglob/{p4/path1,p2/path3}/postfix/data{1,2}.nonexist', TSV)" + "SELECT * FROM hdfs('hdfs://hdfs1:9000/multiglob/{p4/path1,p2/path3}/postfix/data{1,2}.nonexist', TSV) SETTINGS hdfs_throw_on_zero_files_match=0" ) assert False, "Exception have to be thrown" except Exception as ex: @@ -220,14 +220,14 @@ def test_globs_in_read_table(started_cluster): ) print("inside_table_func ", inside_table_func) assert ( - node1.query("select * from hdfs(" + inside_table_func + ")") + node1.query("select * from hdfs(" + inside_table_func + ") settings hdfs_throw_on_zero_files_match=0") == paths_amount * some_data ) assert node1.query( - "select count(distinct _path) from hdfs(" + inside_table_func + ")" + "select count(distinct _path) from hdfs(" + inside_table_func + ") settings hdfs_throw_on_zero_files_match=0" ).rstrip() == str(paths_amount) assert node1.query( - "select count(distinct _file) from hdfs(" + inside_table_func + ")" + "select count(distinct _file) from hdfs(" + inside_table_func + ") settings hdfs_throw_on_zero_files_match=0" ).rstrip() == str(files_amount) @@ -635,6 +635,7 @@ def test_cluster_join(started_cluster): SELECT l.id,r.id FROM hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') as l JOIN hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') as r ON l.id = r.id + SETTINGS hdfs_throw_on_zero_files_match=0 """ ) assert "AMBIGUOUS_COLUMN_NAME" not in result @@ -643,13 +644,13 @@ def test_cluster_join(started_cluster): def test_cluster_macro(started_cluster): with_macro = node1.query( """ - SELECT id FROM hdfsCluster('{default_cluster_macro}', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') + SELECT id FROM hdfsCluster('{default_cluster_macro}', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') SETTINGS hdfs_throw_on_zero_files_match=0 """ ) no_macro = node1.query( """ - SELECT id FROM hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') + SELECT id FROM hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') SETTINGS hdfs_throw_on_zero_files_match=0 """ ) diff --git a/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.reference b/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.reference index a7096a686f5..752b12ff3bd 100644 --- a/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.reference +++ b/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.reference @@ -3,5 +3,5 @@ drop table if exists test_02481_mismatch_files; create table test_02481_mismatch_files (a UInt64, b String) engine = S3(s3_conn, filename='test_02481_mismatch_files_{_partition_id}', format=Parquet) partition by a; set s3_truncate_on_insert=1; insert into test_02481_mismatch_files values (1, 'a'), (22, 'b'), (333, 'c'); -select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } -select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet) settings s3_throw_on_zero_files_match=1; -- { serverError FILE_DOESNT_EXIST } +select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet) settings s3_throw_on_zero_files_match=0; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } +select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet); -- { serverError FILE_DOESNT_EXIST } diff --git 
a/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.sql b/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.sql index 7ec1d3ebd5f..cd500b58946 100644 --- a/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.sql +++ b/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.sql @@ -7,6 +7,6 @@ create table test_02481_mismatch_files (a UInt64, b String) engine = S3(s3_conn, set s3_truncate_on_insert=1; insert into test_02481_mismatch_files values (1, 'a'), (22, 'b'), (333, 'c'); -select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } +select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet) settings s3_throw_on_zero_files_match=0; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } -select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet) settings s3_throw_on_zero_files_match=1; -- { serverError FILE_DOESNT_EXIST } +select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet); -- { serverError FILE_DOESNT_EXIST } diff --git a/tests/queries/0_stateless/02873_s3_presigned_url_and_url_with_special_characters.sql b/tests/queries/0_stateless/02873_s3_presigned_url_and_url_with_special_characters.sql index 1e99eb8b83d..078a5701aca 100644 --- a/tests/queries/0_stateless/02873_s3_presigned_url_and_url_with_special_characters.sql +++ b/tests/queries/0_stateless/02873_s3_presigned_url_and_url_with_special_characters.sql @@ -1,5 +1,5 @@ -- Tags: no-fasttest -select * from s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/MyPrefix/BU%20-%20UNIT%20-%201/*.parquet'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } +select * from s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/MyPrefix/BU%20-%20UNIT%20-%201/*.parquet', NOSIGN) settings s3_throw_on_zero_files_match=0; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } -select * from s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/MyPrefix/*.parquet?some_tocken=ABCD'); -- { serverError CANNOT_DETECT_FORMAT } +select * from s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/MyPrefix/*.parquet?some_tocken=ABCD', NOSIGN) settings s3_throw_on_zero_files_match=0; -- { serverError CANNOT_DETECT_FORMAT } From af9f8ddbfa1ce14f199b09003430dd6ed4d9fa3a Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 11 Jun 2024 23:05:47 +0000 Subject: [PATCH 143/215] Move an unrelated setting to the correct section of SettingsChangesHistory.h --- src/Core/SettingsChangesHistory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index b4bb4716a8a..31da77fddaf 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -106,6 +106,7 @@ static const std::map Date: Tue, 11 Jun 2024 23:13:49 +0000 Subject: [PATCH 144/215] Automatic style fix --- tests/integration/test_storage_hdfs/test.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 9d17686b2cc..3e9342c0499 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -220,14 +220,22 @@ def test_globs_in_read_table(started_cluster): ) print("inside_table_func ", inside_table_func) assert ( - node1.query("select * from hdfs(" + inside_table_func + ") settings hdfs_throw_on_zero_files_match=0") + 
node1.query( + "select * from hdfs(" + + inside_table_func + + ") settings hdfs_throw_on_zero_files_match=0" + ) == paths_amount * some_data ) assert node1.query( - "select count(distinct _path) from hdfs(" + inside_table_func + ") settings hdfs_throw_on_zero_files_match=0" + "select count(distinct _path) from hdfs(" + + inside_table_func + + ") settings hdfs_throw_on_zero_files_match=0" ).rstrip() == str(paths_amount) assert node1.query( - "select count(distinct _file) from hdfs(" + inside_table_func + ") settings hdfs_throw_on_zero_files_match=0" + "select count(distinct _file) from hdfs(" + + inside_table_func + + ") settings hdfs_throw_on_zero_files_match=0" ).rstrip() == str(files_amount) From f632424f92dbb242369d5a17a6788ea57d9d0103 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Wed, 12 Jun 2024 04:32:34 +0000 Subject: [PATCH 145/215] remove unnecessary change Signed-off-by: Duc Canh Le --- src/Common/filesystemHelpers.cpp | 4 +--- src/Functions/FunctionsStringDistance.cpp | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/Common/filesystemHelpers.cpp b/src/Common/filesystemHelpers.cpp index 09c4508b7b2..2d053c615d9 100644 --- a/src/Common/filesystemHelpers.cpp +++ b/src/Common/filesystemHelpers.cpp @@ -1,6 +1,4 @@ #include "filesystemHelpers.h" -#include -#include #if defined(OS_LINUX) # include @@ -13,7 +11,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Functions/FunctionsStringDistance.cpp b/src/Functions/FunctionsStringDistance.cpp index d0d8ebc946d..6cb23bbea9f 100644 --- a/src/Functions/FunctionsStringDistance.cpp +++ b/src/Functions/FunctionsStringDistance.cpp @@ -159,7 +159,7 @@ struct ByteJaccardIndexImpl } else { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", std::string_view(haystack, haystack_end - haystack)); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", StringRef(haystack, haystack_end - haystack)); } } } @@ -186,7 +186,7 @@ struct ByteJaccardIndexImpl } else { - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", std::string_view(needle, needle_end - needle)); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal UTF-8 sequence, while processing '{}'", StringRef(needle, needle_end - needle)); } } } From 7683f06188d8dc901bd912c4ace935a4b3f498e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 12 Jun 2024 11:26:21 +0200 Subject: [PATCH 146/215] Revert "S3: reduce retires time for queries, increase retries count for backups" --- src/Backups/BackupIO_S3.cpp | 6 +++--- src/Core/Settings.h | 1 - src/Core/SettingsChangesHistory.h | 1 - src/IO/S3/Client.h | 2 +- .../integration/test_mask_sensitive_info/configs/users.xml | 1 - 5 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 1ea59c1d38b..92f086295a0 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -54,9 +54,9 @@ namespace S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( settings.auth_settings.region, context->getRemoteHostFilter(), - static_cast(local_settings.s3_max_redirects), - static_cast(local_settings.backup_restore_s3_retry_attempts), - local_settings.enable_s3_requests_logging, + static_cast(global_settings.s3_max_redirects), + static_cast(global_settings.s3_retry_attempts), + global_settings.enable_s3_requests_logging, 
/* for_disk_s3 = */ false, request_settings.get_request_throttler, request_settings.put_request_throttler, diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 7f99243e285..b3e83092a77 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -517,7 +517,6 @@ class IColumn; M(UInt64, backup_restore_keeper_value_max_size, 1048576, "Maximum size of data of a [Zoo]Keeper's node during backup", 0) \ M(UInt64, backup_restore_batch_size_for_keeper_multiread, 10000, "Maximum size of batch for multiread request to [Zoo]Keeper during backup or restore", 0) \ M(UInt64, backup_restore_batch_size_for_keeper_multi, 1000, "Maximum size of batch for multi request to [Zoo]Keeper during backup or restore", 0) \ - M(UInt64, backup_restore_s3_retry_attempts, 1000, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries. It takes place only for backup/restore.", 0) \ M(UInt64, max_backup_bandwidth, 0, "The maximum read speed in bytes per second for particular backup on server. Zero means unlimited.", 0) \ \ M(Bool, log_profile_events, true, "Log query performance statistics into the query_log, query_thread_log and query_views_log.", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index b447421671e..69bc8c5d207 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -115,7 +115,6 @@ static const std::map& error, long attemptedRetries) const override; diff --git a/tests/integration/test_mask_sensitive_info/configs/users.xml b/tests/integration/test_mask_sensitive_info/configs/users.xml index f767216e907..f129a5bb3e3 100644 --- a/tests/integration/test_mask_sensitive_info/configs/users.xml +++ b/tests/integration/test_mask_sensitive_info/configs/users.xml @@ -2,7 +2,6 @@ 5 - 5 From 71d76aa4ac308e52e2663b409cf2e78c7d7b672f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 12 Jun 2024 11:27:56 +0200 Subject: [PATCH 147/215] Revert "Small fix for 02340_parts_refcnt_mergetree" --- tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh index e7d95d8db72..b100f96befa 100755 --- a/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh +++ b/tests/queries/0_stateless/02340_parts_refcnt_mergetree.sh @@ -58,7 +58,7 @@ function check_refcnt_for_table() $CLICKHOUSE_CLIENT -q "select table, name, refcount>=6 from system.parts where database = '$CLICKHOUSE_DATABASE' and table = '$table' and refcount >= 3" # Kill the query gracefully. 
- kill -INT $PID ||: + kill -INT $PID wait $PID grep -F Exception "$log_file" | grep -v -F QUERY_WAS_CANCELLED rm -f "${log_file:?}" From 4b252dea727816380c377ddc2e425d394c1b4d47 Mon Sep 17 00:00:00 2001 From: Sema Checherinda Date: Wed, 12 Jun 2024 13:04:42 +0200 Subject: [PATCH 148/215] rework tests with sleep: use sleep_until instead sleep_for --- src/Common/tests/gtest_resolve_pool.cpp | 85 ++++++++++++++++++------- 1 file changed, 63 insertions(+), 22 deletions(-) diff --git a/src/Common/tests/gtest_resolve_pool.cpp b/src/Common/tests/gtest_resolve_pool.cpp index 2391fc8bacf..7cfe158d90f 100644 --- a/src/Common/tests/gtest_resolve_pool.cpp +++ b/src/Common/tests/gtest_resolve_pool.cpp @@ -1,12 +1,39 @@ #include -#include #include #include -#include "base/defines.h" +#include + +#include #include +#include #include -#include + + +using namespace std::literals::chrono_literals; + + +auto now() +{ + return std::chrono::steady_clock::now(); +} + +void sleep_until(auto time_point) +{ + std::this_thread::sleep_until(time_point); +} + +void sleep_for(auto duration) +{ + std::this_thread::sleep_for(duration); +} + +size_t toMilliseconds(auto duration) +{ + return std::chrono::duration_cast(duration).count(); +} + +const auto epsilon = 500us; class ResolvePoolMock : public DB::HostResolver { @@ -267,13 +294,14 @@ TEST_F(ResolvePoolTest, CanFailAndHeal) TEST_F(ResolvePoolTest, CanExpire) { - auto resolver = make_resolver(); + auto history = 5ms; + auto resolver = make_resolver(toMilliseconds(history)); auto expired_addr = resolver->resolve(); ASSERT_TRUE(addresses.contains(*expired_addr)); - addresses.erase(*expired_addr); - sleepForSeconds(1); + + sleep_for(history + epsilon); for (size_t i = 0; i < 1000; ++i) { @@ -310,12 +338,16 @@ TEST_F(ResolvePoolTest, DuplicatesInAddresses) ASSERT_EQ(3, DB::CurrentThread::getProfileEvents()[metrics.discovered]); } -void check_no_failed_address(size_t iteration, auto & resolver, auto & addresses, auto & failed_addr, auto & metrics) +void check_no_failed_address(size_t iteration, auto & resolver, auto & addresses, auto & failed_addr, auto & metrics, auto deadline) { ASSERT_EQ(iteration, DB::CurrentThread::getProfileEvents()[metrics.failed]); for (size_t i = 0; i < 100; ++i) { auto next_addr = resolver->resolve(); + + if (now() > deadline) + break; + ASSERT_TRUE(addresses.contains(*next_addr)); ASSERT_NE(*next_addr, *failed_addr); } @@ -323,52 +355,60 @@ void check_no_failed_address(size_t iteration, auto & resolver, auto & addresses TEST_F(ResolvePoolTest, BannedForConsiquenceFail) { - size_t history_ms = 5; - auto resolver = make_resolver(history_ms); - + auto history = 5ms; + auto resolver = make_resolver(toMilliseconds(history)); auto failed_addr = resolver->resolve(); ASSERT_TRUE(addresses.contains(*failed_addr)); + auto start_at = now(); + failed_addr.setFail(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count)); - check_no_failed_address(1, resolver, addresses, failed_addr, metrics); + check_no_failed_address(1, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); + + sleep_until(start_at + history + epsilon); + start_at = now(); - sleepForMilliseconds(history_ms + 1); resolver->update(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(0, CurrentMetrics::get(metrics.banned_count)); failed_addr.setFail(); - check_no_failed_address(2, resolver, addresses, failed_addr, metrics); + check_no_failed_address(2, resolver, addresses, failed_addr, metrics, 
start_at + history - epsilon); + + sleep_until(start_at + history + epsilon); + start_at = now(); - sleepForMilliseconds(history_ms + 1); resolver->update(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count)); // ip still banned adter history_ms + update, because it was his second consiquent fail - check_no_failed_address(2, resolver, addresses, failed_addr, metrics); + check_no_failed_address(2, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); } TEST_F(ResolvePoolTest, NoAditionalBannForConcurrentFail) { - size_t history_ms = 5; - auto resolver = make_resolver(history_ms); + auto history = 5ms; + auto resolver = make_resolver(toMilliseconds(history)); auto failed_addr = resolver->resolve(); ASSERT_TRUE(addresses.contains(*failed_addr)); + auto start_at = now(); + failed_addr.setFail(); failed_addr.setFail(); failed_addr.setFail(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count)); - check_no_failed_address(3, resolver, addresses, failed_addr, metrics); + check_no_failed_address(3, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); + + sleep_until(start_at + history + epsilon); - sleepForMilliseconds(history_ms + 1); resolver->update(); // ip is cleared after just 1 history_ms interval. ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); @@ -377,8 +417,8 @@ TEST_F(ResolvePoolTest, NoAditionalBannForConcurrentFail) TEST_F(ResolvePoolTest, StillBannedAfterSuccess) { - size_t history_ms = 5; - auto resolver = make_resolver(history_ms); + auto history = 5ms; + auto resolver = make_resolver(toMilliseconds(history)); auto failed_addr = resolver->resolve(); ASSERT_TRUE(addresses.contains(*failed_addr)); @@ -395,11 +435,12 @@ TEST_F(ResolvePoolTest, StillBannedAfterSuccess) } chassert(again_addr); + auto start_at = now(); failed_addr.setFail(); ASSERT_EQ(3, CurrentMetrics::get(metrics.active_count)); ASSERT_EQ(1, CurrentMetrics::get(metrics.banned_count)); - check_no_failed_address(1, resolver, addresses, failed_addr, metrics); + check_no_failed_address(1, resolver, addresses, failed_addr, metrics, start_at + history - epsilon); again_addr = std::nullopt; // success; From a7f3c9fde8f7d483904b5befc078c120d097b467 Mon Sep 17 00:00:00 2001 From: Sema Checherinda <104093494+CheSema@users.noreply.github.com> Date: Wed, 12 Jun 2024 13:52:19 +0200 Subject: [PATCH 149/215] Update src/Common/tests/gtest_resolve_pool.cpp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: János Benjamin Antal --- src/Common/tests/gtest_resolve_pool.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Common/tests/gtest_resolve_pool.cpp b/src/Common/tests/gtest_resolve_pool.cpp index 7cfe158d90f..b760b9b1524 100644 --- a/src/Common/tests/gtest_resolve_pool.cpp +++ b/src/Common/tests/gtest_resolve_pool.cpp @@ -346,7 +346,10 @@ void check_no_failed_address(size_t iteration, auto & resolver, auto & addresses auto next_addr = resolver->resolve(); if (now() > deadline) + { + ASSERT_NE(i, 0); break; + } ASSERT_TRUE(addresses.contains(*next_addr)); ASSERT_NE(*next_addr, *failed_addr); From d851fa871fe1c732fa3f976654fcab74b6d788fd Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 12 Jun 2024 14:09:37 +0200 Subject: [PATCH 150/215] Fix broken links in docs --- .../aggregate-functions/combinators.md | 8 +-- .../parametric-functions.md | 4 +- .../reference/stochasticlinearregression.md | 4 +- 
.../aggregate-functions/reference/varpop.md | 8 +-- .../aggregate-functions/reference/varsamp.md | 14 +++--- docs/en/sql-reference/data-types/geo.md | 6 +-- docs/en/sql-reference/dictionaries/index.md | 50 +++++++++---------- .../functions/array-functions.md | 17 +++++-- .../functions/bitmap-functions.md | 4 +- .../functions/date-time-functions.md | 35 +++++++------ .../functions/ext-dict-functions.md | 2 +- .../en/sql-reference/functions/geo/geohash.md | 8 +-- docs/en/sql-reference/functions/geo/h3.md | 2 + docs/en/sql-reference/functions/geo/s2.md | 10 ++-- .../sql-reference/functions/hash-functions.md | 4 +- .../functions/ip-address-functions.md | 2 +- .../sql-reference/functions/json-functions.md | 4 +- .../functions/other-functions.md | 24 ++++----- .../functions/rounding-functions.md | 2 +- .../functions/string-functions.md | 2 +- .../functions/string-search-functions.md | 26 +++++----- .../functions/type-conversion-functions.md | 30 +++++------ .../sql-reference/functions/uuid-functions.md | 4 +- docs/en/sql-reference/operators/in.md | 2 +- .../sql-reference/statements/alter/column.md | 4 +- .../statements/alter/partition.md | 6 +-- .../sql-reference/statements/create/view.md | 2 +- docs/en/sql-reference/statements/grant.md | 6 +-- .../sql-reference/statements/select/sample.md | 16 +++--- docs/en/sql-reference/statements/system.md | 2 +- docs/en/sql-reference/syntax.md | 4 +- docs/en/sql-reference/table-functions/file.md | 4 +- .../table-functions/fileCluster.md | 2 +- 33 files changed, 167 insertions(+), 151 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/combinators.md b/docs/en/sql-reference/aggregate-functions/combinators.md index 8ccc5e292b5..5351531afdb 100644 --- a/docs/en/sql-reference/aggregate-functions/combinators.md +++ b/docs/en/sql-reference/aggregate-functions/combinators.md @@ -106,14 +106,14 @@ To work with these states, use: - [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) table engine. - [finalizeAggregation](../../sql-reference/functions/other-functions.md#function-finalizeaggregation) function. - [runningAccumulate](../../sql-reference/functions/other-functions.md#runningaccumulate) function. -- [-Merge](#aggregate_functions_combinators-merge) combinator. -- [-MergeState](#aggregate_functions_combinators-mergestate) combinator. +- [-Merge](#aggregate_functions_combinators_merge) combinator. +- [-MergeState](#aggregate_functions_combinators_mergestate) combinator. -## -Merge +## -Merge {#aggregate_functions_combinators_merge} If you apply this combinator, the aggregate function takes the intermediate aggregation state as an argument, combines the states to finish aggregation, and returns the resulting value. -## -MergeState +## -MergeState {#aggregate_functions_combinators_mergestate} Merges the intermediate aggregation states in the same way as the -Merge combinator. However, it does not return the resulting value, but an intermediate aggregation state, similar to the -State combinator. diff --git a/docs/en/sql-reference/aggregate-functions/parametric-functions.md b/docs/en/sql-reference/aggregate-functions/parametric-functions.md index 1dc89b8dcf9..43ded9df60a 100644 --- a/docs/en/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/en/sql-reference/aggregate-functions/parametric-functions.md @@ -82,7 +82,7 @@ FROM In this case, you should remember that you do not know the histogram bin borders. -## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) 
+## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) {#function-sequencematch} Checks whether the sequence contains an event chain that matches the pattern. @@ -172,7 +172,7 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM - [sequenceCount](#function-sequencecount) -## sequenceCount(pattern)(time, cond1, cond2, ...) +## sequenceCount(pattern)(time, cond1, cond2, ...) {#function-sequencecount} Counts the number of event chains that matched the pattern. The function searches event chains that do not overlap. It starts to search for the next chain after the current chain is matched. diff --git a/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md b/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md index ddac82a0977..15533ba9fd7 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md @@ -3,7 +3,7 @@ slug: /en/sql-reference/aggregate-functions/reference/stochasticlinearregression sidebar_position: 221 --- -# stochasticLinearRegression +# stochasticLinearRegression {#agg_functions_stochasticlinearregression_parameters} This function implements stochastic linear regression. It supports custom parameters for learning rate, L2 regularization coefficient, mini-batch size, and has a few methods for updating weights ([Adam](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Adam) (used by default), [simple SGD](https://en.wikipedia.org/wiki/Stochastic_gradient_descent), [Momentum](https://en.wikipedia.org/wiki/Stochastic_gradient_descent#Momentum), and [Nesterov](https://mipt.ru/upload/medialibrary/d7e/41-91.pdf)). @@ -72,5 +72,5 @@ The query will return a column of predicted values. Note that first argument of **See Also** -- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#agg_functions-stochasticlogisticregression) +- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#agg_functions_stochasticlinearregression_parameters) - [Difference between linear and logistic regressions](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) diff --git a/docs/en/sql-reference/aggregate-functions/reference/varpop.md b/docs/en/sql-reference/aggregate-functions/reference/varpop.md index fcabeb4c6a8..d2b19fe2a3e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/varpop.md +++ b/docs/en/sql-reference/aggregate-functions/reference/varpop.md @@ -6,7 +6,7 @@ sidebar_position: 32 This page covers the `varPop` and `varPopStable` functions available in ClickHouse. -## varPop +## varPop {#varPop} Calculates the population covariance between two data columns. The population covariance measures the degree to which two variables vary together. Calculates the amount `Σ((x - x̅)^2) / n`, where `n` is the sample size and `x̅`is the average value of `x`. @@ -27,7 +27,7 @@ Returns an integer of type `Float64`. **Implementation details** -This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varPopStable` function](#varPopStable). +This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varPopStable`](#varPopStable) function. 
**Example** @@ -55,7 +55,7 @@ Result: 3 ``` -## varPopStable +## varPopStable {#varPopStable} Calculates population covariance between two data columns using a stable, numerically accurate method to calculate the variance. This function is designed to provide reliable results even with large datasets or values that might cause numerical instability in other implementations. @@ -76,7 +76,7 @@ Returns an integer of type `Float64`. **Implementation details** -Unlike [`varPop()`](#varPop), this function uses a stable, numerically accurate algorithm to calculate the population variance to avoid issues like catastrophic cancellation or loss of precision. This function also handles `NaN` and `Inf` values correctly, excluding them from calculations. +Unlike [`varPop`](#varPop), this function uses a stable, numerically accurate algorithm to calculate the population variance to avoid issues like catastrophic cancellation or loss of precision. This function also handles `NaN` and `Inf` values correctly, excluding them from calculations. **Example** diff --git a/docs/en/sql-reference/aggregate-functions/reference/varsamp.md b/docs/en/sql-reference/aggregate-functions/reference/varsamp.md index be669a16ae8..e9ec9ba2bc1 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/varsamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/varsamp.md @@ -40,7 +40,7 @@ Where: The function assumes that the input data set represents a sample from a larger population. If you want to calculate the variance of the entire population (when you have the complete data set), you should use the [`varPop()` function](./varpop#varpop) instead. -This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varSampStable` function](#varSampStable). +This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varSampStable`](#varSampStable) function. **Example** @@ -66,7 +66,7 @@ Response: 0.8650000000000091 ``` -## varSampStable +## varSampStable {#varSampStable} Calculate the sample variance of a data set using a numerically stable algorithm. @@ -82,11 +82,11 @@ varSampStable(expr) **Returned value** -The `varSampStable()` function returns a Float64 value representing the sample variance of the input data set. +The `varSampStable` function returns a Float64 value representing the sample variance of the input data set. **Implementation details** -The `varSampStable()` function calculates the sample variance using the same formula as the [`varSamp()`](#varSamp function): +The `varSampStable` function calculates the sample variance using the same formula as the [`varSamp`](#varSamp) function: ```plaintext ∑(x - mean(x))^2 / (n - 1) @@ -97,9 +97,9 @@ Where: - `mean(x)` is the arithmetic mean of the data set. - `n` is the number of data points in the data set. -The difference between `varSampStable()` and `varSamp()` is that `varSampStable()` is designed to provide a more deterministic and stable result when dealing with floating-point arithmetic. It uses an algorithm that minimizes the accumulation of rounding errors, which can be particularly important when dealing with large data sets or data with a wide range of values. +The difference between `varSampStable` and `varSamp` is that `varSampStable` is designed to provide a more deterministic and stable result when dealing with floating-point arithmetic. 
It uses an algorithm that minimizes the accumulation of rounding errors, which can be particularly important when dealing with large data sets or data with a wide range of values. -Like `varSamp()`, the `varSampStable()` function assumes that the input data set represents a sample from a larger population. If you want to calculate the variance of the entire population (when you have the complete data set), you should use the [`varPopStable()` function](./varpop#varpopstable) instead. +Like `varSamp`, the `varSampStable` function assumes that the input data set represents a sample from a larger population. If you want to calculate the variance of the entire population (when you have the complete data set), you should use the [`varPopStable`](./varpop#varpopstable) function instead. **Example** @@ -125,4 +125,4 @@ Response: 0.865 ``` -This query calculates the sample variance of the `value` column in the `example_table` using the `varSampStable()` function. The result shows that the sample variance of the values `[10.5, 12.3, 9.8, 11.2, 10.7]` is approximately 0.865, which may differ slightly from the result of `varSamp()` due to the more precise handling of floating-point arithmetic. +This query calculates the sample variance of the `value` column in the `example_table` using the `varSampStable()` function. The result shows that the sample variance of the values `[10.5, 12.3, 9.8, 11.2, 10.7]` is approximately 0.865, which may differ slightly from the result of `varSamp` due to the more precise handling of floating-point arithmetic. diff --git a/docs/en/sql-reference/data-types/geo.md b/docs/en/sql-reference/data-types/geo.md index 7e3c32b3451..7ffc7447d96 100644 --- a/docs/en/sql-reference/data-types/geo.md +++ b/docs/en/sql-reference/data-types/geo.md @@ -33,7 +33,7 @@ Result: ## Ring -`Ring` is a simple polygon without holes stored as an array of points: [Array](array.md)([Point](#point-data-type)). +`Ring` is a simple polygon without holes stored as an array of points: [Array](array.md)([Point](#point)). **Example** @@ -54,7 +54,7 @@ Result: ## Polygon -`Polygon` is a polygon with holes stored as an array of rings: [Array](array.md)([Ring](#ring-data-type)). First element of outer array is the outer shape of polygon and all the following elements are holes. +`Polygon` is a polygon with holes stored as an array of rings: [Array](array.md)([Ring](#ring)). First element of outer array is the outer shape of polygon and all the following elements are holes. **Example** @@ -76,7 +76,7 @@ Result: ## MultiPolygon -`MultiPolygon` consists of multiple polygons and is stored as an array of polygons: [Array](array.md)([Polygon](#polygon-data-type)). +`MultiPolygon` consists of multiple polygons and is stored as an array of polygons: [Array](array.md)([Polygon](#polygon)). **Example** diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md index 080de94f8b7..437b836ec0e 100644 --- a/docs/en/sql-reference/dictionaries/index.md +++ b/docs/en/sql-reference/dictionaries/index.md @@ -16,14 +16,14 @@ ClickHouse supports special functions for working with dictionaries that can be ClickHouse supports: - Dictionaries with a [set of functions](../../sql-reference/functions/ext-dict-functions.md). -- [Embedded dictionaries](#embedded_dictionaries) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md). 
+- [Embedded dictionaries](#embedded-dictionaries) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md). :::tip Tutorial If you are getting started with Dictionaries in ClickHouse we have a tutorial that covers that topic. Take a look [here](/docs/en/tutorial.md). ::: -You can add your own dictionaries from various data sources. The source for a dictionary can be a ClickHouse table, a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Dictionary Sources](#dictionary-sources)”. +You can add your own dictionaries from various data sources. The source for a dictionary can be a ClickHouse table, a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Dictionary Sources](#dictionary_sources)”. ClickHouse: @@ -75,14 +75,14 @@ The dictionary configuration file has the following format: ``` -You can [configure](#configuring-a-dictionary) any number of dictionaries in the same file. +You can [configure](#configuring_a_dictionary) any number of dictionaries in the same file. :::note You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../../sql-reference/functions/other-functions.md) function). This functionality is not related to dictionaries. ::: -## Configuring a Dictionary {#configuring-a-dictionary} +## Configuring a Dictionary {#configuring_a_dictionary} @@ -679,7 +679,7 @@ When searching for a dictionary, the cache is searched first. For each block of If keys are not found in dictionary, then update cache task is created and added into update queue. Update queue properties can be controlled with settings `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates`. -For cache dictionaries, the expiration [lifetime](#dictionary-updates) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired. The key is re-requested the next time it needs to be used. This behaviour can be configured with setting `allow_read_expired_keys`. +For cache dictionaries, the expiration [lifetime](#lifetime) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired. The key is re-requested the next time it needs to be used. This behaviour can be configured with setting `allow_read_expired_keys`. This is the least effective of all the ways to store dictionaries. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary performs well only when the hit rates are high enough (recommended 99% and higher). You can view the average hit rate in the [system.dictionaries](../../operations/system-tables/dictionaries.md) table. @@ -771,7 +771,7 @@ The dictionary is not stored in memory and directly goes to the source during th The dictionary key has the [UInt64](../../sql-reference/data-types/int-uint.md) type. -All types of [sources](#dictionary-sources), except local files, are supported. +All types of [sources](#dictionary_sources), except local files, are supported. Configuration example: @@ -952,7 +952,7 @@ LIFETIME(MIN 300 MAX 360) If `0` and `0`, ClickHouse does not reload the dictionary by timeout. 
In this case, ClickHouse can reload the dictionary earlier if the dictionary configuration file was changed or the `SYSTEM RELOAD DICTIONARY` command was executed. -When updating the dictionaries, the ClickHouse server applies different logic depending on the type of [source](#dictionary-sources): +When updating the dictionaries, the ClickHouse server applies different logic depending on the type of [source](#dictionary_sources): - For a text file, it checks the time of modification. If the time differs from the previously recorded time, the dictionary is updated. - For MySQL source, the time of modification is checked using a `SHOW TABLE STATUS` query (in case of MySQL 8 you need to disable meta-information caching in MySQL by `set global information_schema_stats_expiry=0`). @@ -961,7 +961,7 @@ When updating the dictionaries, the ClickHouse server applies different logic de For other sources (ODBC, PostgreSQL, ClickHouse, etc), you can set up a query that will update the dictionaries only if they really changed, rather than each time. To do this, follow these steps: - The dictionary table must have a field that always changes when the source data is updated. -- The settings of the source must specify a query that retrieves the changing field. The ClickHouse server interprets the query result as a row, and if this row has changed relative to its previous state, the dictionary is updated. Specify the query in the `` field in the settings for the [source](#dictionary-sources). +- The settings of the source must specify a query that retrieves the changing field. The ClickHouse server interprets the query result as a row, and if this row has changed relative to its previous state, the dictionary is updated. Specify the query in the `` field in the settings for the [source](#dictionary_sources). Example of settings: @@ -1031,7 +1031,7 @@ SOURCE(CLICKHOUSE(... update_field 'added_time' update_lag 15)) ... ``` -## Dictionary Sources {#dictionary-sources} +## Dictionary Sources {#dictionary_sources} @@ -1092,7 +1092,7 @@ Types of sources (`source_type`): - [Local file](#local_file) - [Executable File](#executable) - [Executable Pool](#executable_pool) -- [HTTP(S)](#http) +- [HTTP(S)](#https) - DBMS - [ODBC](#odbc) - [MySQL](#mysql) @@ -1134,7 +1134,7 @@ When a dictionary with source `FILE` is created via DDL command (`CREATE DICTION ### Executable File {#executable} -Working with executable files depends on [how the dictionary is stored in memory](#storig-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts the executable file and treats its output as dictionary data. +Working with executable files depends on [how the dictionary is stored in memory](#storing-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts the executable file and treats its output as dictionary data. Example of settings: @@ -1285,7 +1285,7 @@ Setting fields: - `db` – Name of the database. Omit it if the database name is set in the `` parameters. - `table` – Name of the table and schema if exists. - `connection_string` – Connection string. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). 
+- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#lifetime). - `query` – The custom query. Optional parameter. :::note @@ -1575,7 +1575,7 @@ Setting fields: - `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in MySQL, for example, `id > 10 AND id < 20`. Optional parameter. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#lifetime). - `fail_on_connection_loss` – The configuration parameter that controls behavior of the server on connection loss. If `true`, an exception is thrown immediately if the connection between client and server was lost. If `false`, the ClickHouse server retries to execute the query three times before throwing an exception. Note that retrying leads to increased response times. Default value: `false`. @@ -1672,7 +1672,7 @@ Setting fields: - `db` – Name of the database. - `table` – Name of the table. - `where` – The selection criteria. May be omitted. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#lifetime). - `secure` - Use ssl for connection. - `query` – The custom query. Optional parameter. @@ -1849,7 +1849,7 @@ Setting fields: - `db` – Name of the database. - `table` – Name of the table. - `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in PostgreSQL. For example, `id > 10 AND id < 20`. Optional parameter. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Updating dictionaries](#dictionary-updates). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#lifetime). - `query` – The custom query. Optional parameter. :::note @@ -2030,17 +2030,17 @@ CREATE DICTIONARY somename ( Configuration fields: -| Tag | Description | Required | -|------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| -| `name` | Column name. 
| Yes | -| `type` | ClickHouse data type: [UInt8](../../sql-reference/data-types/int-uint.md), [UInt16](../../sql-reference/data-types/int-uint.md), [UInt32](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md), [Int8](../../sql-reference/data-types/int-uint.md), [Int16](../../sql-reference/data-types/int-uint.md), [Int32](../../sql-reference/data-types/int-uint.md), [Int64](../../sql-reference/data-types/int-uint.md), [Float32](../../sql-reference/data-types/float.md), [Float64](../../sql-reference/data-types/float.md), [UUID](../../sql-reference/data-types/uuid.md), [Decimal32](../../sql-reference/data-types/decimal.md), [Decimal64](../../sql-reference/data-types/decimal.md), [Decimal128](../../sql-reference/data-types/decimal.md), [Decimal256](../../sql-reference/data-types/decimal.md),[Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md), [String](../../sql-reference/data-types/string.md), [Array](../../sql-reference/data-types/array.md).
ClickHouse tries to cast the value from the dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.
[Nullable](../../sql-reference/data-types/nullable.md) is currently supported for [Flat](#flat), [Hashed](#hashed), [ComplexKeyHashed](#complex_key_hashed), [Direct](#direct), [ComplexKeyDirect](#complex_key_direct), [RangeHashed](#range_hashed), Polygon, [Cache](#cache), [ComplexKeyCache](#complex_key_cache), [SSDCache](#ssd_cache), [SSDComplexKeyCache](#complex_key_ssd_cache) dictionaries. In [IPTrie](#ip_trie) dictionaries `Nullable` types are not supported. | Yes | -| `null_value` | Default value for a non-existing element.
In the example, it is an empty string. [NULL](../syntax.md#null) value can be used only for the `Nullable` types (see the previous line with types description). | Yes | -| `expression` | [Expression](../../sql-reference/syntax.md#expressions) that ClickHouse executes on the value.
The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.

Default value: no expression. | No | -| `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](#hierarchical-dictionaries).

Default value: `false`. | No | -| `injective` | Flag that shows whether the `id -> attribute` image is [injective](https://en.wikipedia.org/wiki/Injective_function).
If `true`, ClickHouse can automatically place the requests to dictionaries with injection after the `GROUP BY` clause. Usually this significantly reduces the number of such requests.

Default value: `false`. | No | -| `is_object_id` | Flag that shows whether the query is executed for a MongoDB document by `ObjectID`.

Default value: `false`. +| Tag | Description | Required | +|------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------| +| `name` | Column name. | Yes | +| `type` | ClickHouse data type: [UInt8](../../sql-reference/data-types/int-uint.md), [UInt16](../../sql-reference/data-types/int-uint.md), [UInt32](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md), [Int8](../../sql-reference/data-types/int-uint.md), [Int16](../../sql-reference/data-types/int-uint.md), [Int32](../../sql-reference/data-types/int-uint.md), [Int64](../../sql-reference/data-types/int-uint.md), [Float32](../../sql-reference/data-types/float.md), [Float64](../../sql-reference/data-types/float.md), [UUID](../../sql-reference/data-types/uuid.md), [Decimal32](../../sql-reference/data-types/decimal.md), [Decimal64](../../sql-reference/data-types/decimal.md), [Decimal128](../../sql-reference/data-types/decimal.md), [Decimal256](../../sql-reference/data-types/decimal.md),[Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md), [String](../../sql-reference/data-types/string.md), [Array](../../sql-reference/data-types/array.md).
ClickHouse tries to cast the value from the dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.
[Nullable](../../sql-reference/data-types/nullable.md) is currently supported for [Flat](#flat), [Hashed](#hashed), [ComplexKeyHashed](#complex_key_hashed), [Direct](#direct), [ComplexKeyDirect](#complex_key_direct), [RangeHashed](#range_hashed), Polygon, [Cache](#cache), [ComplexKeyCache](#complex_key_cache), [SSDCache](#ssd_cache), [SSDComplexKeyCache](#complex_key_ssd_cache) dictionaries. In [IPTrie](#ip_trie) dictionaries `Nullable` types are not supported. | Yes | +| `null_value` | Default value for a non-existing element.
In the example, it is an empty string. [NULL](../syntax.md#null) value can be used only for the `Nullable` types (see the previous line with types description). | Yes | +| `expression` | [Expression](../../sql-reference/syntax.md#expressions) that ClickHouse executes on the value.
The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.

Default value: no expression. | No | +| `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](#hierarchical_dictionaries).

Default value: `false`. | No | +| `injective` | Flag that shows whether the `id -> attribute` image is [injective](https://en.wikipedia.org/wiki/Injective_function).
If `true`, ClickHouse can automatically place the requests to dictionaries with injection after the `GROUP BY` clause. Usually this significantly reduces the number of such requests.

Default value: `false`. | No | +| `is_object_id` | Flag that shows whether the query is executed for a MongoDB document by `ObjectID`.

Default value: `false`. -## Hierarchical Dictionaries {#hierarchical-dictionaries} +## Hierarchical Dictionaries {#hierarchical_dictionaries} ClickHouse supports hierarchical dictionaries with a [numeric key](#numeric-key). diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 7b52fbff714..d87ca4a0fe7 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1261,7 +1261,7 @@ SELECT arraySort((x) -> -x, [1, 2, 3]) as res; └─────────┘ ``` -For each element of the source array, the lambda function returns the sorting key, that is, \[1 –\> -1, 2 –\> -2, 3 –\> -3\]. Since the `arraySort` function sorts the keys in ascending order, the result is \[3, 2, 1\]. Thus, the `(x) –> -x` lambda function sets the [descending order](#reverse-sort) in a sorting. +For each element of the source array, the lambda function returns the sorting key, that is, \[1 –\> -1, 2 –\> -2, 3 –\> -3\]. Since the `arraySort` function sorts the keys in ascending order, the result is \[3, 2, 1\]. Thus, the `(x) –> -x` lambda function sets the [descending order](#arrayreversesort) in a sorting. The lambda function can accept multiple arguments. In this case, you need to pass the `arraySort` function several arrays of identical length that the arguments of lambda function will correspond to. The resulting array will consist of elements from the first input array; elements from the next input array(s) specify the sorting keys. For example: @@ -1307,10 +1307,15 @@ To improve sorting efficiency, the [Schwartzian transform](https://en.wikipedia. Same as `arraySort` with additional `limit` argument allowing partial sorting. Returns an array of the same size as the original array where elements in range `[1..limit]` are sorted in ascending order. Remaining elements `(limit..N]` shall contain elements in unspecified order. -## arrayReverseSort(\[func,\] arr, ...) {#reverse-sort} +## arrayReverseSort Sorts the elements of the `arr` array in descending order. If the `func` function is specified, `arr` is sorted according to the result of the `func` function applied to the elements of the array, and then the sorted array is reversed. If `func` accepts multiple arguments, the `arrayReverseSort` function is passed several arrays that the arguments of `func` will correspond to. Detailed examples are shown at the end of `arrayReverseSort` description. +**Syntax** + +```sql +arrayReverseSort([func,] arr, ...) +``` Example of integer values sorting: ``` sql @@ -1907,10 +1912,16 @@ FROM numbers(1,10); - [arrayReduce](#arrayreduce) -## arrayReverse(arr) +## arrayReverse Returns an array of the same size as the original array containing the elements in reverse order. +**Syntax** + +```sql +arrayReverse(arr) +``` + Example: ``` sql diff --git a/docs/en/sql-reference/functions/bitmap-functions.md b/docs/en/sql-reference/functions/bitmap-functions.md index a5c8a663b71..d98d7d19d7c 100644 --- a/docs/en/sql-reference/functions/bitmap-functions.md +++ b/docs/en/sql-reference/functions/bitmap-functions.md @@ -74,7 +74,7 @@ bitmapSubsetInRange(bitmap, range_start, range_end) **Arguments** -- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – [Bitmap object](#bitmapbuild). - `range_start` – Start of the range (inclusive). [UInt32](../data-types/int-uint.md). - `range_end` – End of the range (exclusive). [UInt32](../data-types/int-uint.md). 
@@ -188,7 +188,7 @@ Result: Checks whether two bitmaps intersect. -If `bitmap2` contains exactly one element, consider using [bitmapContains](#bitmap_functions-bitmapcontains) instead as it works more efficiently. +If `bitmap2` contains exactly one element, consider using [bitmapContains](#bitmapcontains) instead as it works more efficiently. **Syntax** diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 4092c83954a..e8661b5f5c3 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -83,7 +83,7 @@ Result: ``` ## makeDate32 -Like [makeDate](#makeDate) but produces a [Date32](../data-types/date32.md). +Like [makeDate](#makedate) but produces a [Date32](../data-types/date32.md). ## makeDateTime @@ -214,7 +214,7 @@ Result: **See also** -- [serverTimeZone](#serverTimeZone) +- [serverTimeZone](#servertimezone) ## serverTimeZone @@ -249,7 +249,7 @@ Result: **See also** -- [timeZone](#timeZone) +- [timeZone](#timezone) ## toTimeZone @@ -305,7 +305,7 @@ int32samoa: 1546300800 **See Also** -- [formatDateTime](#formatDateTime) - supports non-constant timezone. +- [formatDateTime](#formatdatetime) - supports non-constant timezone. - [toString](type-conversion-functions.md#tostring) - supports non-constant timezone. ## timeZoneOf @@ -1006,7 +1006,7 @@ toStartOfWeek(t[, mode[, timezone]]) **Arguments** - `t` - a [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md) -- `mode` - determines the first day of the week as described in the [toWeek()](date-time-functions#toweek) function +- `mode` - determines the first day of the week as described in the [toWeek()](#toweek) function - `timezone` - Optional parameter, it behaves like any other conversion function **Returned value** @@ -1719,7 +1719,7 @@ Result: **See Also** -- [fromDaysSinceYearZero](#fromDaysSinceYearZero) +- [fromDaysSinceYearZero](#fromdayssinceyearzero) ## fromDaysSinceYearZero @@ -1759,7 +1759,7 @@ Result: **See Also** -- [toDaysSinceYearZero](#toDaysSinceYearZero) +- [toDaysSinceYearZero](#todayssinceyearzero) ## fromDaysSinceYearZero32 @@ -1982,7 +1982,7 @@ Result: **See Also** -- [toStartOfInterval](#tostartofintervaldate_or_date_with_time-interval-x-unit--time_zone) +- [toStartOfInterval](#tostartofinterval) ## date\_add @@ -2055,7 +2055,7 @@ Result: **See Also** -- [addDate](#addDate) +- [addDate](#adddate) ## date\_sub @@ -2129,7 +2129,7 @@ Result: **See Also** -- [subDate](#subDate) +- [subDate](#subdate) ## timestamp\_add @@ -2310,7 +2310,7 @@ Alias: `SUBDATE` - [date_sub](#date_sub) -## now {#now} +## now Returns the current date and time at the moment of query analysis. The function is a constant expression. @@ -3609,7 +3609,7 @@ SELECT timeSlots(toDateTime64('1980-12-12 21:01:02.1234', 4, 'UTC'), toDecimal64 └───────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` -## formatDateTime {#formatDateTime} +## formatDateTime Formats a Time according to the given Format string. Format is a constant expression, so you cannot have multiple formats for a single result column. 
@@ -3734,10 +3734,9 @@ LIMIT 10 **See Also** -- [formatDateTimeInJodaSyntax](##formatDateTimeInJodaSyntax) +- [formatDateTimeInJodaSyntax](#formatdatetimeinjodasyntax) - -## formatDateTimeInJodaSyntax {#formatDateTimeInJodaSyntax} +## formatDateTimeInJodaSyntax Similar to formatDateTime, except that it formats datetime in Joda style instead of MySQL style. Refer to https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html. @@ -3902,11 +3901,11 @@ Result: **See Also** -- [fromUnixTimestampInJodaSyntax](##fromUnixTimestampInJodaSyntax) +- [fromUnixTimestampInJodaSyntax](#fromunixtimestampinjodasyntax) ## fromUnixTimestampInJodaSyntax -Same as [fromUnixTimestamp](#fromUnixTimestamp) but when called in the second way (two or three arguments), the formatting is performed using [Joda style](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) instead of MySQL style. +Same as [fromUnixTimestamp](#fromunixtimestamp) but when called in the second way (two or three arguments), the formatting is performed using [Joda style](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) instead of MySQL style. **Example:** @@ -4121,7 +4120,7 @@ Result: Returns the current date and time at the moment of query analysis. The function is a constant expression. :::note -This function gives the same result that `now('UTC')` would. It was added only for MySQL support and [`now`](#now-now) is the preferred usage. +This function gives the same result that `now('UTC')` would. It was added only for MySQL support and [`now`](#now) is the preferred usage. ::: **Syntax** diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index 82c21ce40c8..093ee690d47 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -12,7 +12,7 @@ For dictionaries created with [DDL queries](../../sql-reference/statements/creat For information on connecting and configuring dictionaries, see [Dictionaries](../../sql-reference/dictionaries/index.md). -## dictGet, dictGetOrDefault, dictGetOrNull {#dictGet} +## dictGet, dictGetOrDefault, dictGetOrNull Retrieves values from a dictionary. diff --git a/docs/en/sql-reference/functions/geo/geohash.md b/docs/en/sql-reference/functions/geo/geohash.md index 8abc8006e5d..9a3d52824f6 100644 --- a/docs/en/sql-reference/functions/geo/geohash.md +++ b/docs/en/sql-reference/functions/geo/geohash.md @@ -4,13 +4,15 @@ sidebar_label: Geohash title: "Functions for Working with Geohash" --- +## Geohash {#geohash_description} + [Geohash](https://en.wikipedia.org/wiki/Geohash) is the geocode system, which subdivides Earth’s surface into buckets of grid shape and encodes each cell into a short string of letters and digits. It is a hierarchical data structure, so the longer is the geohash string, the more precise is the geographic location. If you need to manually convert geographic coordinates to geohash strings, you can use [geohash.org](http://geohash.org/). ## geohashEncode -Encodes latitude and longitude as a [geohash](#geohash)-string. +Encodes latitude and longitude as a [geohash](#geohash_description)-string. ``` sql geohashEncode(longitude, latitude, [precision]) @@ -40,7 +42,7 @@ SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res; ## geohashDecode -Decodes any [geohash](#geohash)-encoded string into longitude and latitude. 
+Decodes any [geohash](#geohash_description)-encoded string into longitude and latitude. **Input values** @@ -64,7 +66,7 @@ SELECT geohashDecode('ezs42') AS res; ## geohashesInBox -Returns an array of [geohash](#geohash)-encoded strings of given precision that fall inside and intersect boundaries of given box, basically a 2D grid flattened into array. +Returns an array of [geohash](#geohash_description)-encoded strings of given precision that fall inside and intersect boundaries of given box, basically a 2D grid flattened into array. **Syntax** diff --git a/docs/en/sql-reference/functions/geo/h3.md b/docs/en/sql-reference/functions/geo/h3.md index bcdd457964a..6fce91f4d8e 100644 --- a/docs/en/sql-reference/functions/geo/h3.md +++ b/docs/en/sql-reference/functions/geo/h3.md @@ -4,6 +4,8 @@ sidebar_label: H3 Indexes title: "Functions for Working with H3 Indexes" --- +## H3 Index {#h3index} + [H3](https://eng.uber.com/h3/) is a geographical indexing system where Earth’s surface divided into a grid of even hexagonal cells. This system is hierarchical, i. e. each hexagon on the top level ("parent") can be split into seven even but smaller ones ("children"), and so on. The level of the hierarchy is called `resolution` and can receive a value from `0` till `15`, where `0` is the `base` level with the largest and coarsest cells. diff --git a/docs/en/sql-reference/functions/geo/s2.md b/docs/en/sql-reference/functions/geo/s2.md index 3165b21318b..bcb6b2833c9 100644 --- a/docs/en/sql-reference/functions/geo/s2.md +++ b/docs/en/sql-reference/functions/geo/s2.md @@ -5,13 +5,15 @@ sidebar_label: S2 Geometry # Functions for Working with S2 Index +## S2Index {#s2_index} + [S2](https://s2geometry.io/) is a geographical indexing system where all geographical data is represented on a three-dimensional sphere (similar to a globe). In the S2 library points are represented as the S2 Index - a specific number which encodes internally a point on the surface of a unit sphere, unlike traditional (latitude, longitude) pairs. To get the S2 point index for a given point specified in the format (latitude, longitude) use the [geoToS2](#geotos2) function. Also, you can use the [s2ToGeo](#s2togeo) function for getting geographical coordinates corresponding to the specified S2 point index. ## geoToS2 -Returns [S2](#s2index) point index corresponding to the provided coordinates `(longitude, latitude)`. +Returns [S2](#s2_index) point index corresponding to the provided coordinates `(longitude, latitude)`. **Syntax** @@ -46,7 +48,7 @@ Result: ## s2ToGeo -Returns geo coordinates `(longitude, latitude)` corresponding to the provided [S2](#s2index) point index. +Returns geo coordinates `(longitude, latitude)` corresponding to the provided [S2](#s2_index) point index. **Syntax** @@ -82,7 +84,7 @@ Result: ## s2GetNeighbors -Returns S2 neighbor indexes corresponding to the provided [S2](#s2index). Each cell in the S2 system is a quadrilateral bounded by four geodesics. So, each cell has 4 neighbors. +Returns S2 neighbor indexes corresponding to the provided [S2](#s2_index). Each cell in the S2 system is a quadrilateral bounded by four geodesics. So, each cell has 4 neighbors. **Syntax** @@ -116,7 +118,7 @@ Result: ## s2CellsIntersect -Determines if the two provided [S2](#s2index) cells intersect or not. +Determines if the two provided [S2](#s2_index) cells intersect or not. 
**Syntax** diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 506114038f7..e431ed75465 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -45,13 +45,13 @@ SELECT halfMD5(array('e','x','a'), 'mple', 10, toDateTime('2019-06-15 23:00:00') Calculates the MD4 from a string and returns the resulting set of bytes as FixedString(16). -## MD5 {#md5} +## MD5 Calculates the MD5 from a string and returns the resulting set of bytes as FixedString(16). If you do not need MD5 in particular, but you need a decent cryptographic 128-bit hash, use the ‘sipHash128’ function instead. If you want to get the same result as output by the md5sum utility, use lower(hex(MD5(s))). -## sipHash64 {#siphash64} +## sipHash64 Produces a 64-bit [SipHash](https://en.wikipedia.org/wiki/SipHash) hash value. diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index 5b6a3aef2c8..11a7749b33d 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -295,7 +295,7 @@ Same as `toIPv6`, but if the IPv6 address has an invalid format, it returns null ## toIPv6 Converts a string form of IPv6 address to [IPv6](../data-types/ipv6.md) type. If the IPv6 address has an invalid format, returns an empty value. -Similar to [IPv6StringToNum](#ipv6stringtonums) function, which converts IPv6 address to binary format. +Similar to [IPv6StringToNum](#ipv6stringtonum) function, which converts IPv6 address to binary format. If the input string contains a valid IPv4 address, then the IPv6 equivalent of the IPv4 address is returned. diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index 5d73c9a83b3..7bff6a6cba5 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -5,10 +5,10 @@ sidebar_label: JSON --- There are two sets of functions to parse JSON: - - [`simpleJSON*` (`visitParam*`)](#simplejson--visitparam-functions) which is made for parsing a limited subset of JSON extremely fast. + - [`simpleJSON*` (`visitParam*`)](#simplejson-visitparam-functions) which is made for parsing a limited subset of JSON extremely fast. - [`JSONExtract*`](#jsonextract-functions) which is made for parsing ordinary JSON. -## simpleJSON / visitParam functions +## simpleJSON (visitParam) functions ClickHouse has special functions for working with simplified JSON. All these JSON functions are based on strong assumptions about what the JSON can be. They try to do as little as possible to get the job done as quickly as possible. diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 5e63d9824b4..5eae8b7905e 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -762,7 +762,7 @@ LIMIT 10 Given a size (number of bytes), this function returns a readable, rounded size with suffix (KB, MB, etc.) as string. -The opposite operations of this function are [parseReadableSize](#parseReadableSize), [parseReadableSizeOrZero](#parseReadableSizeOrZero), and [parseReadableSizeOrNull](#parseReadableSizeOrNull). 
+The opposite operations of this function are [parseReadableSize](#parseReadableSize), [parseReadableSizeOrZero](#parsereadablesizeorzero), and [parseReadableSizeOrNull](#parsereadablesizeornull). **Syntax** @@ -795,7 +795,7 @@ Result: Given a size (number of bytes), this function returns a readable, rounded size with suffix (KiB, MiB, etc.) as string. -The opposite operations of this function are [parseReadableSize](#parseReadableSize), [parseReadableSizeOrZero](#parseReadableSizeOrZero), and [parseReadableSizeOrNull](#parseReadableSizeOrNull). +The opposite operations of this function are [parseReadableSize](#parseReadableSize), [parseReadableSizeOrZero](#parsereadablesizeorzero), and [parseReadableSizeOrNull](#parsereadablesizeornull). **Syntax** @@ -921,12 +921,12 @@ SELECT └────────────────────┴────────────────────────────────────────────────┘ ``` -## parseReadableSize +## parseReadableSize {#parseReadableSize} Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. If the function is unable to parse the input value, it throws an exception. -The inverse operations of this function are [formatReadableSize](#formatReadableSize) and [formatReadableDecimalSize](#formatReadableDecimalSize). +The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatreadabledecimalsize). **Syntax** @@ -964,7 +964,7 @@ SELECT Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. If the function is unable to parse the input value, it returns `NULL`. -The inverse operations of this function are [formatReadableSize](#formatReadableSize) and [formatReadableDecimalSize](#formatReadableDecimalSize). +The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatReadableDecimalSize). **Syntax** @@ -1002,7 +1002,7 @@ SELECT Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. If the function is unable to parse the input value, it returns `0`. -The inverse operations of this function are [formatReadableSize](#formatReadableSize) and [formatReadableDecimalSize](#formatReadableDecimalSize). +The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatReadableDecimalSize). **Syntax** @@ -2711,7 +2711,7 @@ countDigits(x) - Number of digits. [UInt8](../data-types/int-uint.md#uint-ranges). :::note -For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow). +For `Decimal` values takes into account their scales: calculates result over underlying integer type which is `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. 
you may check decimal overflow for `Decimal64` with `countDecimal(x) > 18`. It's a slow variant of [isDecimalOverflow](#isdecimaloverflow). ::: **Example** @@ -2803,7 +2803,7 @@ currentProfiles() ## enabledProfiles -Returns settings profiles, assigned to the current user both explicitly and implicitly. Explicitly assigned profiles are the same as returned by the [currentProfiles](#current-profiles) function. Implicitly assigned profiles include parent profiles of other assigned profiles, profiles assigned via granted roles, profiles assigned via their own settings, and the main default profile (see the `default_profile` section in the main server configuration file). +Returns settings profiles, assigned to the current user both explicitly and implicitly. Explicitly assigned profiles are the same as returned by the [currentProfiles](#currentprofiles) function. Implicitly assigned profiles include parent profiles of other assigned profiles, profiles assigned via granted roles, profiles assigned via their own settings, and the main default profile (see the `default_profile` section in the main server configuration file). **Syntax** @@ -2916,11 +2916,11 @@ Result: └───────────────────────────┘ ``` -## queryID {#queryID} +## queryID Returns the ID of the current query. Other parameters of a query can be extracted from the [system.query_log](../../operations/system-tables/query_log.md) table via `query_id`. -In contrast to [initialQueryID](#initial-query-id) function, `queryID` can return different results on different shards (see the example). +In contrast to [initialQueryID](#initialqueryid) function, `queryID` can return different results on different shards (see the example). **Syntax** @@ -2954,7 +2954,7 @@ Result: Returns the ID of the initial current query. Other parameters of a query can be extracted from the [system.query_log](../../operations/system-tables/query_log.md) table via `initial_query_id`. -In contrast to [queryID](#query-id) function, `initialQueryID` returns the same results on different shards (see example). +In contrast to [queryID](#queryid) function, `initialQueryID` returns the same results on different shards (see example). **Syntax** @@ -3041,7 +3041,7 @@ shardCount() **See Also** -- [shardNum()](#shard-num) function example also contains `shardCount()` function call. +- [shardNum()](#shardnum) function example also contains `shardCount()` function call. ## getOSKernelVersion diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index 6495a43fc85..c3a915ca195 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -200,7 +200,7 @@ Banker's rounding is a method of rounding fractional numbers When the rounding number is halfway between two numbers, it's rounded to the nearest even digit at the specified decimal position. For example: 3.5 rounds up to 4, 2.5 rounds down to 2. It's the default rounding method for floating point numbers defined in [IEEE 754](https://en.wikipedia.org/wiki/IEEE_754#Roundings_to_nearest). -The [round](#rounding_functions-round) function performs the same rounding for floating point numbers. +The [round](#round) function performs the same rounding for floating point numbers. The `roundBankers` function also rounds integers the same way, for example, `roundBankers(45, -1) = 40`. In other cases, the function rounds numbers to the nearest integer. 
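The tie-breaking rules above can be condensed into one query; each result follows directly from the examples in the text (halves go to the nearest even digit, and `round` applies the same rule to floating-point values):

```sql
SELECT
    roundBankers(2.5)    AS half_to_even_down, -- 2
    roundBankers(3.5)    AS half_to_even_up,   -- 4
    roundBankers(45, -1) AS integer_tie,       -- 40
    round(2.5)           AS float_round;       -- 2: round() banker's-rounds floats
```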
diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 342ca2b9f03..c2d19f58422 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -1994,7 +1994,7 @@ Result: ## stringJaccardIndexUTF8 -Like [stringJaccardIndex](#stringJaccardIndex) but for UTF8-encoded strings. +Like [stringJaccardIndex](#stringjaccardindex) but for UTF8-encoded strings. ## editDistance diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index d261cff3580..5353bbf9b27 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -262,7 +262,7 @@ Result: ## multiSearchAllPositionsUTF8 -Like [multiSearchAllPositions](#multiSearchAllPositions) but assumes `haystack` and the `needle` substrings are UTF-8 encoded strings. +Like [multiSearchAllPositions](#multisearchallpositions) but assumes `haystack` and the `needle` substrings are UTF-8 encoded strings. **Syntax** @@ -336,7 +336,7 @@ Result: Like [`position`](#position) but returns the leftmost offset in a `haystack` string which matches any of multiple `needle` strings. -Functions [`multiSearchFirstPositionCaseInsensitive`](#multiSearchFirstPositionCaseInsensitive), [`multiSearchFirstPositionUTF8`](#multiSearchFirstPositionUTF8) and [`multiSearchFirstPositionCaseInsensitiveUTF8`](#multiSearchFirstPositionCaseInsensitiveUTF8) provide case-insensitive and/or UTF-8 variants of this function. +Functions [`multiSearchFirstPositionCaseInsensitive`](#multisearchfirstpositioncaseinsensitive), [`multiSearchFirstPositionUTF8`](#multisearchfirstpositionutf8) and [`multiSearchFirstPositionCaseInsensitiveUTF8`](#multisearchfirstpositioncaseinsensitiveutf8) provide case-insensitive and/or UTF-8 variants of this function. **Syntax** @@ -370,7 +370,7 @@ Result: ## multiSearchFirstPositionCaseInsensitive -Like [`multiSearchFirstPosition`](#multiSearchFirstPosition) but ignores case. +Like [`multiSearchFirstPosition`](#multisearchfirstposition) but ignores case. **Syntax** @@ -404,7 +404,7 @@ Result: ## multiSearchFirstPositionUTF8 -Like [`multiSearchFirstPosition`](#multiSearchFirstPosition) but assumes `haystack` and `needle` to be UTF-8 strings. +Like [`multiSearchFirstPosition`](#multisearchfirstposition) but assumes `haystack` and `needle` to be UTF-8 strings. **Syntax** @@ -440,7 +440,7 @@ Result: ## multiSearchFirstPositionCaseInsensitiveUTF8 -Like [`multiSearchFirstPosition`](#multiSearchFirstPosition) but assumes `haystack` and `needle` to be UTF-8 strings and ignores case. +Like [`multiSearchFirstPosition`](#multisearchfirstposition) but assumes `haystack` and `needle` to be UTF-8 strings and ignores case. **Syntax** @@ -478,7 +478,7 @@ Result: Returns the index `i` (starting from 1) of the leftmost found needlei in the string `haystack` and 0 otherwise. -Functions [`multiSearchFirstIndexCaseInsensitive`](#multiSearchFirstIndexCaseInsensitive), [`multiSearchFirstIndexUTF8`](#multiSearchFirstIndexUTF8) and [`multiSearchFirstIndexCaseInsensitiveUTF8`](#multiSearchFirstIndexCaseInsensitiveUTF8) provide case-insensitive and/or UTF-8 variants of this function. 
+Functions [`multiSearchFirstIndexCaseInsensitive`](#multisearchfirstindexcaseinsensitive), [`multiSearchFirstIndexUTF8`](#multisearchfirstindexutf8) and [`multiSearchFirstIndexCaseInsensitiveUTF8`](#multisearchfirstindexcaseinsensitiveutf8) provide case-insensitive and/or UTF-8 variants of this function. **Syntax** @@ -615,7 +615,7 @@ Result: Returns 1, if at least one string needlei matches the string `haystack` and 0 otherwise. -Functions [`multiSearchAnyCaseInsensitive`](#multiSearchAnyCaseInsensitive), [`multiSearchAnyUTF8`](#multiSearchAnyUTF8) and []`multiSearchAnyCaseInsensitiveUTF8`](#multiSearchAnyCaseInsensitiveUTF8) provide case-insensitive and/or UTF-8 variants of this function. +Functions [`multiSearchAnyCaseInsensitive`](#multisearchanycaseinsensitive), [`multiSearchAnyUTF8`](#multisearchanyutf8) and [`multiSearchAnyCaseInsensitiveUTF8`](#multisearchanycaseinsensitiveutf8) provide case-insensitive and/or UTF-8 variants of this function. **Syntax** @@ -719,7 +719,7 @@ Result: ## multiSearchAnyCaseInsensitiveUTF8 -Like [multiSearchAnyUTF8](#multiSearchAnyUTF8) but ignores case. +Like [multiSearchAnyUTF8](#multisearchanyutf8) but ignores case. *Syntax** @@ -880,7 +880,7 @@ extractAll(haystack, pattern) Matches all groups of the `haystack` string using the `pattern` regular expression. Returns an array of arrays, where the first array includes all fragments matching the first group, the second array - matching the second group, etc. -This function is slower than [extractAllGroupsVertical](#extractallgroups-vertical). +This function is slower than [extractAllGroupsVertical](#extractallgroupsvertical). **Syntax** @@ -952,7 +952,7 @@ Result: └────────────────────────────────────────────────────────────────────────────────────────┘ ``` -## like {#like} +## like Returns whether string `haystack` matches the LIKE expression `pattern`. @@ -1215,7 +1215,7 @@ Result: ## ngramSearchCaseInsensitive -Provides a case-insensitive variant of [ngramSearch](#ngramSearch). +Provides a case-insensitive variant of [ngramSearch](#ngramsearch). **Syntax** @@ -1630,7 +1630,7 @@ Result: ## hasSubsequenceCaseInsensitive -Like [hasSubsequence](#hasSubsequence) but searches case-insensitively. +Like [hasSubsequence](#hassubsequence) but searches case-insensitively. **Syntax** @@ -1700,7 +1700,7 @@ Result: ## hasSubsequenceCaseInsensitiveUTF8 -Like [hasSubsequenceUTF8](#hasSubsequenceUTF8) but searches case-insensitively. +Like [hasSubsequenceUTF8](#hassubsequenceutf8) but searches case-insensitively. **Syntax** diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 2ec51d43c59..86739ac0b8d 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -70,7 +70,7 @@ Integer value in the `Int8`, `Int16`, `Int32`, `Int64`, `Int128` or `Int256` dat Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers. -The behavior of functions for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions. +The behavior of functions for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. Remember about [numeric conversions issues](#common-issues-with-data-conversion), when using the functions. 
**Example** @@ -169,7 +169,7 @@ Converts an input value to the [UInt](../data-types/int-uint.md) data type. This Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers. -The behavior of functions for negative arguments and for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric conversions issues](#numeric-conversion-issues), when using the functions. +The behavior of functions for negative arguments and for the [NaN and Inf](../data-types/float.md/#data_type-float-nan-inf) arguments is undefined. If you pass a string with a negative number, for example `'-32'`, ClickHouse raises an exception. Remember about [numeric conversions issues](#common-issues-with-data-conversion), when using the functions. **Example** @@ -1730,7 +1730,7 @@ Result: └─────────────────────┘ ``` -## reinterpret(x, T) +## reinterpret Uses the same source in-memory bytes sequence for `x` value and reinterprets it to destination type. @@ -1766,9 +1766,9 @@ Result: └─────────────┴──────────────┴───────────────┘ ``` -## CAST(x, T) +## CAST -Converts an input value to the specified data type. Unlike the [reinterpret](#type_conversion_function-reinterpret) function, `CAST` tries to present the same value using the new data type. If the conversion can not be done then an exception is raised. +Converts an input value to the specified data type. Unlike the [reinterpret](#reinterpret) function, `CAST` tries to present the same value using the new data type. If the conversion can not be done then an exception is raised. Several syntax variants are supported. **Syntax** @@ -1875,7 +1875,7 @@ Result: Converts `x` to the `T` data type. -The difference from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` does not allow overflow of numeric types during cast if type value `x` does not fit the bounds of type `T`. For example, `accurateCast(-1, 'UInt8')` throws an exception. +The difference from [cast](#cast) is that `accurateCast` does not allow overflow of numeric types during cast if type value `x` does not fit the bounds of type `T`. For example, `accurateCast(-1, 'UInt8')` throws an exception. **Example** @@ -2061,7 +2061,7 @@ Result: └───────────────────────────┴──────────────────────────────┘ ``` -## parseDateTime {#type_conversion_functions-parseDateTime} +## parseDateTime Converts a [String](../data-types/string.md) to [DateTime](../data-types/datetime.md) according to a [MySQL format string](https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format). @@ -2102,15 +2102,15 @@ Alias: `TO_TIMESTAMP`. ## parseDateTimeOrZero -Same as for [parseDateTime](#type_conversion_functions-parseDateTime) except that it returns zero date when it encounters a date format that cannot be processed. +Same as for [parseDateTime](#parsedatetime) except that it returns zero date when it encounters a date format that cannot be processed. ## parseDateTimeOrNull -Same as for [parseDateTime](#type_conversion_functions-parseDateTime) except that it returns `NULL` when it encounters a date format that cannot be processed. +Same as for [parseDateTime](#parsedatetime) except that it returns `NULL` when it encounters a date format that cannot be processed. Alias: `str_to_date`. 
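A small sketch contrasting the two fallback variants above; the format string follows the MySQL-style syntax of `parseDateTime`, and the exact zero value shown is an assumption based on the usual `DateTime` epoch:

```sql
SELECT
    parseDateTimeOrNull('not-a-date', '%Y-%m-%d %H:%i:%s') AS as_null, -- NULL
    parseDateTimeOrZero('not-a-date', '%Y-%m-%d %H:%i:%s') AS as_zero; -- 1970-01-01 00:00:00 (zero date, assumed)
```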
-## parseDateTimeInJodaSyntax {#type_conversion_functions-parseDateTimeInJodaSyntax} +## parseDateTimeInJodaSyntax Similar to [parseDateTime](#parsedatetime), except that the format string is in [Joda](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html) instead of MySQL syntax. @@ -2151,11 +2151,11 @@ SELECT parseDateTimeInJodaSyntax('2023-02-24 14:53:31', 'yyyy-MM-dd HH:mm:ss', ' ## parseDateTimeInJodaSyntaxOrZero -Same as for [parseDateTimeInJodaSyntax](#type_conversion_functions-parseDateTimeInJodaSyntax) except that it returns zero date when it encounters a date format that cannot be processed. +Same as for [parseDateTimeInJodaSyntax](#parsedatetimeinjodasyntax) except that it returns zero date when it encounters a date format that cannot be processed. ## parseDateTimeInJodaSyntaxOrNull -Same as for [parseDateTimeInJodaSyntax](#type_conversion_functions-parseDateTimeInJodaSyntax) except that it returns `NULL` when it encounters a date format that cannot be processed. +Same as for [parseDateTimeInJodaSyntax](#parsedatetimeinjodasyntax) except that it returns `NULL` when it encounters a date format that cannot be processed. ## parseDateTimeBestEffort ## parseDateTime32BestEffort @@ -2313,11 +2313,11 @@ Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it r ## parseDateTimeBestEffortUSOrNull -Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns `NULL` when it encounters a date format that cannot be processed. +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortus) function except that it returns `NULL` when it encounters a date format that cannot be processed. ## parseDateTimeBestEffortUSOrZero -Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns zero date (`1970-01-01`) or zero date with time (`1970-01-01 00:00:00`) when it encounters a date format that cannot be processed. +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortus) function except that it returns zero date (`1970-01-01`) or zero date with time (`1970-01-01 00:00:00`) when it encounters a date format that cannot be processed. ## parseDateTime64BestEffort @@ -2389,7 +2389,7 @@ Same as for [parseDateTime64BestEffort](#parsedatetime64besteffort), except that Converts input parameter to the [LowCardinality](../data-types/lowcardinality.md) version of same data type. -To convert data from the `LowCardinality` data type use the [CAST](#type_conversion_function-cast) function. For example, `CAST(x as String)`. +To convert data from the `LowCardinality` data type use the [CAST](#cast) function. For example, `CAST(x as String)`. **Syntax** diff --git a/docs/en/sql-reference/functions/uuid-functions.md b/docs/en/sql-reference/functions/uuid-functions.md index 0323ae728a9..5f15907d029 100644 --- a/docs/en/sql-reference/functions/uuid-functions.md +++ b/docs/en/sql-reference/functions/uuid-functions.md @@ -150,7 +150,7 @@ The function also works for [Arrays](array-functions.md#function-empty) and [Str **Example** -To generate the UUID value, ClickHouse provides the [generateUUIDv4](#uuid-function-generate) function. +To generate the UUID value, ClickHouse provides the [generateUUIDv4](#generateuuidv4) function. Query: @@ -190,7 +190,7 @@ The function also works for [Arrays](array-functions.md#function-notempty) or [S **Example** -To generate the UUID value, ClickHouse provides the [generateUUIDv4](#uuid-function-generate) function. 
+To generate the UUID value, ClickHouse provides the [generateUUIDv4](#generateuuidv4) function. Query: diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index 0257d21b30f..5c83b2363e0 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -255,7 +255,7 @@ where `M` is between `1` and `3` depending on which replica the local query is e These settings affect every MergeTree-family table in the query and have the same effect as applying `SAMPLE 1/3 OFFSET (M-1)/3` on each table. -Therefore adding the [max_parallel_replicas](#settings-max_parallel_replicas) setting will only produce correct results if both tables have the same replication scheme and are sampled by UserID or a subkey of it. In particular, if `local_table_2` does not have a sampling key, incorrect results will be produced. The same rule applies to `JOIN`. +Therefore adding the [max_parallel_replicas](#distributed-subqueries-and-max_parallel_replicas) setting will only produce correct results if both tables have the same replication scheme and are sampled by UserID or a subkey of it. In particular, if `local_table_2` does not have a sampling key, incorrect results will be produced. The same rule applies to `JOIN`. One workaround if `local_table_2` does not meet the requirements, is to use `GLOBAL IN` or `GLOBAL JOIN`. diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index a23710b12bd..aa6f132e08e 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -108,7 +108,7 @@ ALTER TABLE visits RENAME COLUMN webBrowser TO browser CLEAR COLUMN [IF EXISTS] name IN PARTITION partition_name ``` -Resets all data in a column for a specified partition. Read more about setting the partition name in the section [How to set the partition expression](partition.md/#how-to-set-partition-expression). +Resets all data in a column for a specified partition. Read more about setting the partition name in the section [How to set the partition expression](../alter/partition.md/#how-to-set-partition-expression). If the `IF EXISTS` clause is specified, the query won’t return an error if the column does not exist. @@ -173,7 +173,7 @@ ALTER TABLE visits MODIFY COLUMN browser Array(String) Changing the column type is the only complex action – it changes the contents of files with data. For large tables, this may take a long time. -The query also can change the order of the columns using `FIRST | AFTER` clause, see [ADD COLUMN](#alter_add-column) description, but column type is mandatory in this case. +The query also can change the order of the columns using `FIRST | AFTER` clause, see [ADD COLUMN](#add-column) description, but column type is mandatory in this case. Example: diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index 0ed1e523669..778816f8934 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -31,7 +31,7 @@ The following operations with [partitions](/docs/en/engines/table-engines/merget ALTER TABLE table_name [ON CLUSTER cluster] DETACH PARTITION|PART partition_expr ``` -Moves all data for the specified partition to the `detached` directory. The server forgets about the detached data partition as if it does not exist. 
The server will not know about this data until you make the [ATTACH](#alter_attach-partition) query. +Moves all data for the specified partition to the `detached` directory. The server forgets about the detached data partition as if it does not exist. The server will not know about this data until you make the [ATTACH](#attach-partitionpart) query. Example: @@ -252,7 +252,7 @@ Downloads a partition from another server. This query only works for the replica The query does the following: 1. Downloads the partition|part from the specified shard. In ‘path-in-zookeeper’ you must specify a path to the shard in ZooKeeper. -2. Then the query puts the downloaded data to the `detached` directory of the `table_name` table. Use the [ATTACH PARTITION\|PART](#alter_attach-partition) query to add the data to the table. +2. Then the query puts the downloaded data to the `detached` directory of the `table_name` table. Use the [ATTACH PARTITION\|PART](#attach-partitionpart) query to add the data to the table. For example: @@ -353,7 +353,7 @@ You can specify the partition expression in `ALTER ... PARTITION` queries in dif - Using the keyword `ALL`. It can be used only with DROP/DETACH/ATTACH. For example, `ALTER TABLE visits ATTACH PARTITION ALL`. - As a tuple of expressions or constants that matches (in types) the table partitioning keys tuple. In the case of a single element partitioning key, the expression should be wrapped in the `tuple (...)` function. For example, `ALTER TABLE visits DETACH PARTITION tuple(toYYYYMM(toDate('2019-01-25')))`. - Using the partition ID. Partition ID is a string identifier of the partition (human-readable, if possible) that is used as the names of partitions in the file system and in ZooKeeper. The partition ID must be specified in the `PARTITION ID` clause, in a single quotes. For example, `ALTER TABLE visits DETACH PARTITION ID '201901'`. -- In the [ALTER ATTACH PART](#alter_attach-partition) and [DROP DETACHED PART](#alter_drop-detached) query, to specify the name of a part, use string literal with a value from the `name` column of the [system.detached_parts](/docs/en/operations/system-tables/detached_parts.md/#system_tables-detached_parts) table. For example, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`. +- In the [ALTER ATTACH PART](#attach-partitionpart) and [DROP DETACHED PART](#drop-detached-partitionpart) query, to specify the name of a part, use string literal with a value from the `name` column of the [system.detached_parts](/docs/en/operations/system-tables/detached_parts.md/#system_tables-detached_parts) table. For example, `ALTER TABLE visits ATTACH PART '201901_1_1_0'`. Usage of quotes when specifying the partition depends on the type of partition expression. For example, for the `String` type, you have to specify its name in quotes (`'`). For the `Date` and `Int*` types no quotes are needed. diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 1bdf22b35b0..1fabb6d8cc7 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -6,7 +6,7 @@ sidebar_label: VIEW # CREATE VIEW -Creates a new view. Views can be [normal](#normal-view), [materialized](#materialized-view), [live](#live-view-experimental), and [window](#window-view-experimental) (live view and window view are experimental features). +Creates a new view. 
Views can be [normal](#normal-view), [materialized](#materialized-view), [live](#live-view-deprecated), and [window](#window-view-experimental) (live view and window view are experimental features). ## Normal View diff --git a/docs/en/sql-reference/statements/grant.md b/docs/en/sql-reference/statements/grant.md index 2850ce71781..43fa344a16d 100644 --- a/docs/en/sql-reference/statements/grant.md +++ b/docs/en/sql-reference/statements/grant.md @@ -33,7 +33,7 @@ GRANT [ON CLUSTER cluster_name] role [,...] TO {user | another_role | CURRENT_US - `role` — ClickHouse user role. - `user` — ClickHouse user account. -The `WITH ADMIN OPTION` clause grants [ADMIN OPTION](#admin-option-privilege) privilege to `user` or `role`. +The `WITH ADMIN OPTION` clause grants [ADMIN OPTION](#admin-option) privilege to `user` or `role`. The `WITH REPLACE OPTION` clause replace old roles by new role for the `user` or `role`, if is not specified it appends roles. ## Grant Current Grants Syntax @@ -201,7 +201,7 @@ Hierarchy of privileges: - `HDFS` - `S3` - [dictGet](#dictget) -- [displaySecretsInShowAndSelect](#display-secrets) +- [displaySecretsInShowAndSelect](#displaysecretsinshowandselect) - [NAMED COLLECTION ADMIN](#named-collection-admin) - `CREATE NAMED COLLECTION` - `DROP NAMED COLLECTION` @@ -498,7 +498,7 @@ Privilege level: `DICTIONARY`. - `GRANT dictGet ON mydictionary TO john` -### displaySecretsInShowAndSelect {#display-secrets} +### displaySecretsInShowAndSelect Allows a user to view secrets in `SHOW` and `SELECT` queries if both [`display_secrets_in_show_and_select` server setting](../../operations/server-configuration-parameters/settings#display_secrets_in_show_and_select) diff --git a/docs/en/sql-reference/statements/select/sample.md b/docs/en/sql-reference/statements/select/sample.md index 137f86cc8b9..78e05b19bd1 100644 --- a/docs/en/sql-reference/statements/select/sample.md +++ b/docs/en/sql-reference/statements/select/sample.md @@ -27,14 +27,14 @@ The features of data sampling are listed below: For the `SAMPLE` clause the following syntax is supported: -| SAMPLE Clause Syntax | Description | -|----------------------|------------------------------| -| `SAMPLE k` | Here `k` is the number from 0 to 1. The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#select-sample-k) | -| `SAMPLE n` | Here `n` is a sufficiently large integer. The query is executed on a sample of at least `n` rows (but not significantly more than this). For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#select-sample-n) | -| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1. The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) | +| SAMPLE Clause Syntax | Description | +|----------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `SAMPLE k` | Here `k` is the number from 0 to 1. The query is executed on `k` fraction of data. For example, `SAMPLE 0.1` runs the query on 10% of data. [Read more](#sample-k) | +| `SAMPLE n` | Here `n` is a sufficiently large integer. The query is executed on a sample of at least `n` rows (but not significantly more than this). 
For example, `SAMPLE 10000000` runs the query on a minimum of 10,000,000 rows. [Read more](#sample-n) | +| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1. The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#sample-k-offset-m) | -## SAMPLE K {#select-sample-k} +## SAMPLE K Here `k` is the number from 0 to 1 (both fractional and decimal notations are supported). For example, `SAMPLE 1/2` or `SAMPLE 0.5`. @@ -54,7 +54,7 @@ ORDER BY PageViews DESC LIMIT 1000 In this example, the query is executed on a sample from 0.1 (10%) of data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value `count()` is manually multiplied by 10. -## SAMPLE N {#select-sample-n} +## SAMPLE N Here `n` is a sufficiently large integer. For example, `SAMPLE 10000000`. @@ -90,7 +90,7 @@ FROM visits SAMPLE 10000000 ``` -## SAMPLE K OFFSET M {#select-sample-offset} +## SAMPLE K OFFSET M Here `k` and `m` are numbers from 0 to 1. Examples are shown below. diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 7efbff1b42b..e6d3439d2b9 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -174,7 +174,7 @@ Aborts ClickHouse process (like `kill -9 {$ pid_clickhouse-server}`) ## Managing Distributed Tables -ClickHouse can manage [distributed](../../engines/table-engines/special/distributed.md) tables. When a user inserts data into these tables, ClickHouse first creates a queue of the data that should be sent to cluster nodes, then asynchronously sends it. You can manage queue processing with the [STOP DISTRIBUTED SENDS](#query_language-system-stop-distributed-sends), [FLUSH DISTRIBUTED](#query_language-system-flush-distributed), and [START DISTRIBUTED SENDS](#query_language-system-start-distributed-sends) queries. You can also synchronously insert distributed data with the [distributed_foreground_insert](../../operations/settings/settings.md#distributed_foreground_insert) setting. +ClickHouse can manage [distributed](../../engines/table-engines/special/distributed.md) tables. When a user inserts data into these tables, ClickHouse first creates a queue of the data that should be sent to cluster nodes, then asynchronously sends it. You can manage queue processing with the [STOP DISTRIBUTED SENDS](#stop-distributed-sends), [FLUSH DISTRIBUTED](#flush-distributed), and [START DISTRIBUTED SENDS](#start-distributed-sends) queries. You can also synchronously insert distributed data with the [distributed_foreground_insert](../../operations/settings/settings.md#distributed_foreground_insert) setting. ### STOP DISTRIBUTED SENDS diff --git a/docs/en/sql-reference/syntax.md b/docs/en/sql-reference/syntax.md index fc0286e76ad..6a4afb63db8 100644 --- a/docs/en/sql-reference/syntax.md +++ b/docs/en/sql-reference/syntax.md @@ -54,11 +54,11 @@ Identifiers are: - Cluster, database, table, partition, and column names. - Functions. - Data types. -- [Expression aliases](#expression_aliases). +- [Expression aliases](#expression-aliases). Identifiers can be quoted or non-quoted. The latter is preferred. -Non-quoted identifiers must match the regex `^[a-zA-Z_][0-9a-zA-Z_]*$` and can not be equal to [keywords](#syntax-keywords). Examples: `x`, `_1`, `X_y__Z123_`. +Non-quoted identifiers must match the regex `^[a-zA-Z_][0-9a-zA-Z_]*$` and can not be equal to [keywords](#keywords). 
Examples: `x`, `_1`, `X_y__Z123_`. If you want to use identifiers the same as keywords or you want to use other symbols in identifiers, quote it using double quotes or backticks, for example, `"id"`, `` `id` ``. diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index 4fec772c373..3a3162dad9a 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -18,7 +18,7 @@ file([path_to_archive ::] path [,format] [,structure] [,compression]) **Parameters** -- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Supports in read-only mode the following [globs](#globs_in_path): `*`, `?`, `{abc,def}` (with `'abc'` and `'def'` being strings) and `{N..M}` (with `N` and `M` being numbers). +- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Supports in read-only mode the following [globs](#globs-in-path): `*`, `?`, `{abc,def}` (with `'abc'` and `'def'` being strings) and `{N..M}` (with `N` and `M` being numbers). - `path_to_archive` - The relative path to a zip/tar/7z archive. Supports the same globs as `path`. - `format` — The [format](/docs/en/interfaces/formats.md#formats) of the file. - `structure` — Structure of the table. Format: `'column1_name column1_type, column2_name column2_type, ...'`. @@ -128,7 +128,7 @@ Reading data from `table.csv`, located in `archive1.zip` or/and `archive2.zip`: SELECT * FROM file('user_files/archives/archive{1..2}.zip :: table.csv'); ``` -## Globs in path {#globs_in_path} +## Globs in path Paths may use globbing. Files must match the whole path pattern, not only the suffix or prefix. diff --git a/docs/en/sql-reference/table-functions/fileCluster.md b/docs/en/sql-reference/table-functions/fileCluster.md index 4677d2883a7..3060e6c151d 100644 --- a/docs/en/sql-reference/table-functions/fileCluster.md +++ b/docs/en/sql-reference/table-functions/fileCluster.md @@ -74,7 +74,7 @@ SELECT * FROM fileCluster('my_cluster', 'file{1,2}.csv', 'CSV', 'i UInt32, s Str ``` -## Globs in Path {#globs_in_path} +## Globs in Path All patterns supported by [File](../../sql-reference/table-functions/file.md#globs-in-path) table function are supported by FileCluster. From dcd3e9d1511572b71af3b9149e3ecf713fb35a8a Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Tue, 4 Jun 2024 14:06:26 +0200 Subject: [PATCH 151/215] Move changelog script to tests/ci --- tests/ci/changelog.py | 427 ++++++++++++++++++++++++++++++ utils/changelog/changelog.py | 428 +------------------------------ utils/changelog/git_helper.py | 1 - utils/changelog/github_helper.py | 1 - 4 files changed, 435 insertions(+), 422 deletions(-) create mode 100755 tests/ci/changelog.py delete mode 120000 utils/changelog/git_helper.py delete mode 120000 utils/changelog/github_helper.py diff --git a/tests/ci/changelog.py b/tests/ci/changelog.py new file mode 100755 index 00000000000..b1a43b1520f --- /dev/null +++ b/tests/ci/changelog.py @@ -0,0 +1,427 @@ +#!/usr/bin/env python3 +# In our CI this script runs in style-test containers + +import argparse +import logging +import os +import os.path as p +import re +from datetime import date, timedelta +from subprocess import DEVNULL +from typing import Dict, List, Optional, TextIO + +from github.GithubException import RateLimitExceededException, UnknownObjectException +from github.NamedUser import NamedUser +from thefuzz.fuzz import ratio # type: ignore + +from git_helper import git_runner as runner +from git_helper import is_shallow +from github_helper import GitHub, PullRequest, PullRequests, Repository + +# This array gives the preferred category order, and is also used to +# normalize category names. +# Categories are used in .github/PULL_REQUEST_TEMPLATE.md, keep comments there +# updated accordingly +categories_preferred_order = ( + "Backward Incompatible Change", + "New Feature", + "Performance Improvement", + "Improvement", + "Critical Bug Fix", + "Bug Fix", + "Build/Testing/Packaging Improvement", + "Other", +) + +FROM_REF = "" +TO_REF = "" +SHA_IN_CHANGELOG = [] # type: List[str] +gh = GitHub(create_cache_dir=False) +CACHE_PATH = p.join(p.dirname(p.realpath(__file__)), "gh_cache") + + +class Description: + def __init__( + self, number: int, user: NamedUser, html_url: str, entry: str, category: str + ): + self.number = number + self.html_url = html_url + self.user = gh.get_user_cached(user._rawData["login"]) # type: ignore + self.entry = entry + self.category = category + + @property + def formatted_entry(self) -> str: + # Substitute issue links. + # 1) issue number w/o markdown link + entry = re.sub( + r"([^[])#([0-9]{4,})", + r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)", + self.entry, + ) + # 2) issue URL w/o markdown link + # including #issuecomment-1 or #event-12 + entry = re.sub( + r"([^(])(https://github.com/ClickHouse/ClickHouse/issues/([0-9]{4,})[-#a-z0-9]*)", + r"\1[#\3](\2)", + entry, + ) + # It's possible that we face a secondary rate limit. + # In this case we should sleep until we get it + while True: + try: + user_name = self.user.name if self.user.name else self.user.login + break + except UnknownObjectException: + user_name = self.user.login + break + except RateLimitExceededException: + gh.sleep_on_rate_limit() + return ( + f"* {entry} [#{self.number}]({self.html_url}) " + f"([{user_name}]({self.user.html_url}))." 
+ ) + + # Sort PR descriptions by numbers + def __eq__(self, other) -> bool: + if not isinstance(self, type(other)): + return NotImplemented + return self.number == other.number + + def __lt__(self, other: "Description") -> bool: + return self.number < other.number + + +def get_descriptions(prs: PullRequests) -> Dict[str, List[Description]]: + descriptions = {} # type: Dict[str, List[Description]] + repos = {} # type: Dict[str, Repository] + for pr in prs: + # See https://github.com/PyGithub/PyGithub/issues/2202, + # obj._rawData doesn't spend additional API requests + # We'll save some requests + # pylint: disable=protected-access + repo_name = pr._rawData["base"]["repo"]["full_name"] + # pylint: enable=protected-access + if repo_name not in repos: + repos[repo_name] = pr.base.repo + in_changelog = False + merge_commit = pr.merge_commit_sha + if merge_commit is None: + logging.warning("PR %s does not have merge-commit, skipping", pr.number) + continue + + in_changelog = merge_commit in SHA_IN_CHANGELOG + if in_changelog: + desc = generate_description(pr, repos[repo_name]) + if desc: + if desc.category not in descriptions: + descriptions[desc.category] = [] + descriptions[desc.category].append(desc) + + for descs in descriptions.values(): + descs.sort() + + return descriptions + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Generate a changelog in Markdown format between given tags. " + "It fetches all tags and unshallow the git repository automatically", + ) + parser.add_argument( + "-v", + "--verbose", + action="count", + default=0, + help="set the script verbosity, could be used multiple", + ) + parser.add_argument( + "--debug-helpers", + action="store_true", + help="add debug logging for git_helper and github_helper", + ) + parser.add_argument( + "--output", + type=argparse.FileType("w"), + default="-", + help="output file for changelog", + ) + parser.add_argument( + "--repo", + default="ClickHouse/ClickHouse", + help="a repository to query for pull-requests from GitHub", + ) + parser.add_argument( + "--jobs", + type=int, + default=10, + help="number of jobs to get pull-requests info from GitHub API", + ) + parser.add_argument( + "--gh-user-or-token", + help="user name or GH token to authenticate", + ) + parser.add_argument( + "--gh-password", + help="a password that should be used when user is given", + ) + parser.add_argument( + "--with-testing-tags", + action="store_true", + help="by default '*-testing' tags are ignored, this argument enables them too", + ) + parser.add_argument( + "--from", + dest="from_ref", + help="git ref for a starting point of changelog, by default is calculated " + "automatically to match a previous tag in history", + ) + parser.add_argument( + "to_ref", + metavar="TO_REF", + help="git ref for the changelog end", + ) + args = parser.parse_args() + return args + + +# This function mirrors the PR description checks in ClickhousePullRequestTrigger. +# Returns None if the PR should not be mentioned in changelog. 
+def generate_description(item: PullRequest, repo: Repository) -> Optional[Description]: + backport_number = item.number + if item.head.ref.startswith("backport/"): + branch_parts = item.head.ref.split("/") + if len(branch_parts) == 3: + try: + item = gh.get_pull_cached(repo, int(branch_parts[-1])) + except Exception as e: + logging.warning("unable to get backpoted PR, exception: %s", e) + else: + logging.warning( + "The branch %s doesn't match backport template, using PR %s as is", + item.head.ref, + item.number, + ) + description = item.body + # Don't skip empty lines because they delimit parts of description + lines = [x.strip() for x in (description.split("\n") if description else [])] + lines = [re.sub(r"\s+", " ", ln) for ln in lines] + + category = "" + entry = "" + + if lines: + i = 0 + while i < len(lines): + if re.match(r"(?i)^[#>*_ ]*change\s*log\s*category", lines[i]): + i += 1 + if i >= len(lines): + break + # Can have one empty line between header and the category itself. + # Filter it out. + if not lines[i]: + i += 1 + if i >= len(lines): + break + category = re.sub(r"^[-*\s]*", "", lines[i]) + i += 1 + elif re.match( + r"(?i)^[#>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] + ): + i += 1 + # Can have one empty line between header and the entry itself. + # Filter it out. + if i < len(lines) and not lines[i]: + i += 1 + # All following lines until empty one are the changelog entry. + entry_lines = [] + while i < len(lines) and lines[i]: + entry_lines.append(lines[i]) + i += 1 + entry = " ".join(entry_lines) + else: + i += 1 + + # Remove excessive bullets from the entry. + if re.match(r"^[\-\*] ", entry): + entry = entry[2:] + + # Better style. + if re.match(r"^[a-z]", entry): + entry = entry.capitalize() + + if not category: + # Shouldn't happen, because description check in CI should catch such PRs. + # Fall through, so that it shows up in output and the user can fix it. + category = "NO CL CATEGORY" + + # Filter out the PR categories that are not for changelog. + if re.match( + r"(?i)((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)", + category, + ): + category = "NOT FOR CHANGELOG / INSIGNIFICANT" + return Description(item.number, item.user, item.html_url, item.title, category) + + # Normalize bug fixes + if re.match( + r"(?i)bug\Wfix", + category, + ): + category = "Bug Fix (user-visible misbehavior in an official stable release)" + + # Filter out documentations changelog + if re.match( + r"(?i)doc", + category, + ): + return None + + if backport_number != item.number: + entry = f"Backported in #{backport_number}: {entry}" + + if not entry: + # Shouldn't happen, because description check in CI should catch such PRs. + category = "NO CL ENTRY" + entry = "NO CL ENTRY: '" + item.title + "'" + + entry = entry.strip() + if entry[-1] != ".": + entry += "." 
+ + for c in categories_preferred_order: + if ratio(category.lower(), c.lower()) >= 90: + category = c + break + + return Description(item.number, item.user, item.html_url, entry, category) + + +def write_changelog( + fd: TextIO, descriptions: Dict[str, List[Description]], year: int +) -> None: + to_commit = runner(f"git rev-parse {TO_REF}^{{}}")[:11] + from_commit = runner(f"git rev-parse {FROM_REF}^{{}}")[:11] + fd.write( + f"---\nsidebar_position: 1\nsidebar_label: {year}\n---\n\n" + f"# {year} Changelog\n\n" + f"### ClickHouse release {TO_REF} ({to_commit}) FIXME " + f"as compared to {FROM_REF} ({from_commit})\n\n" + ) + + seen_categories = [] # type: List[str] + for category in categories_preferred_order: + if category in descriptions: + seen_categories.append(category) + fd.write(f"#### {category}\n") + for desc in descriptions[category]: + fd.write(f"{desc.formatted_entry}\n") + + fd.write("\n") + + for category in sorted(descriptions): + if category not in seen_categories: + fd.write(f"#### {category}\n\n") + for desc in descriptions[category]: + fd.write(f"{desc.formatted_entry}\n") + + fd.write("\n") + + +def check_refs(from_ref: Optional[str], to_ref: str, with_testing_tags: bool): + global FROM_REF, TO_REF + TO_REF = to_ref + + # Check TO_REF + runner.run(f"git rev-parse {TO_REF}") + + # Check from_ref + if from_ref is None: + # Get all tags pointing to TO_REF + tags = runner.run(f"git tag --points-at '{TO_REF}^{{}}'").split("\n") + logging.info("All tags pointing to %s:\n%s", TO_REF, tags) + if not with_testing_tags: + tags.append("*-testing") + exclude = " ".join([f"--exclude='{tag}'" for tag in tags]) + cmd = f"git describe --abbrev=0 --tags {exclude} '{TO_REF}'" + FROM_REF = runner.run(cmd) + else: + runner.run(f"git rev-parse {FROM_REF}") + FROM_REF = from_ref + + +def set_sha_in_changelog(): + global SHA_IN_CHANGELOG + SHA_IN_CHANGELOG = runner.run( + f"git log --format=format:%H {FROM_REF}..{TO_REF}" + ).split("\n") + + +def get_year(prs: PullRequests) -> int: + if not prs: + return date.today().year + return max(pr.created_at.year for pr in prs) + + +def main(): + log_levels = [logging.WARN, logging.INFO, logging.DEBUG] + args = parse_args() + logging.basicConfig( + format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d]:\n%(message)s", + level=log_levels[min(args.verbose, 2)], + ) + if args.debug_helpers: + logging.getLogger("github_helper").setLevel(logging.DEBUG) + logging.getLogger("git_helper").setLevel(logging.DEBUG) + # Create a cache directory + if not p.isdir(CACHE_PATH): + os.mkdir(CACHE_PATH, 0o700) + + # Get the full repo + if is_shallow(): + logging.info("Unshallow repository") + runner.run("git fetch --unshallow", stderr=DEVNULL) + logging.info("Fetching all tags") + runner.run("git fetch --tags", stderr=DEVNULL) + + check_refs(args.from_ref, args.to_ref, args.with_testing_tags) + set_sha_in_changelog() + + logging.info("Using %s..%s as changelog interval", FROM_REF, TO_REF) + + # use merge-base commit as a starting point, if used ref in another branch + base_commit = runner.run(f"git merge-base '{FROM_REF}^{{}}' '{TO_REF}^{{}}'") + # Get starting and ending dates for gathering PRs + # Add one day after and before to mitigate TZ possible issues + # `tag^{}` format gives commit ref when we have annotated tags + # format %cs gives a committer date, works better for cherry-picked commits + from_date = runner.run(f"git log -1 --format=format:%cs '{base_commit}'") + to_date = runner.run(f"git log -1 --format=format:%cs '{TO_REF}^{{}}'") + merged = ( + 
date.fromisoformat(from_date) - timedelta(1),
+        date.fromisoformat(to_date) + timedelta(1),
+    )
+
+    # Get all PRs for the given time frame
+    global gh
+    gh = GitHub(
+        args.gh_user_or_token,
+        args.gh_password,
+        create_cache_dir=False,
+        per_page=100,
+        pool_size=args.jobs,
+    )
+    gh.cache_path = CACHE_PATH
+    query = f"type:pr repo:{args.repo} is:merged"
+    prs = gh.get_pulls_from_search(query=query, merged=merged, sort="created")
+
+    descriptions = get_descriptions(prs)
+    changelog_year = get_year(prs)
+
+    write_changelog(args.output, descriptions, changelog_year)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/utils/changelog/changelog.py b/utils/changelog/changelog.py
index 314461a6b3a..b79e4139bcc 100755
--- a/utils/changelog/changelog.py
+++ b/utils/changelog/changelog.py
@@ -1,427 +1,15 @@
 #!/usr/bin/env python3
 # In our CI this script runs in style-test containers
-import argparse
-import logging
-import os
-import os.path as p
-import re
-from datetime import date, timedelta
-from subprocess import DEVNULL, CalledProcessError
-from typing import Dict, List, Optional, TextIO
+# The main script has been moved to tests/ci/changelog.py
+# It depends too heavily on the CI scripts to keep it here
+# Here's only a wrapper around it for the people who are used to it

-from github.GithubException import RateLimitExceededException, UnknownObjectException
-from github.NamedUser import NamedUser
-from thefuzz.fuzz import ratio  # type: ignore
-
-from git_helper import git_runner as runner
-from git_helper import is_shallow
-from github_helper import GitHub, PullRequest, PullRequests, Repository
-
-# This array gives the preferred category order, and is also used to
-# normalize category names.
-# Categories are used in .github/PULL_REQUEST_TEMPLATE.md, keep comments there
-# updated accordingly
-categories_preferred_order = (
-    "Backward Incompatible Change",
-    "New Feature",
-    "Performance Improvement",
-    "Improvement",
-    "Critical Bug Fix",
-    "Bug Fix",
-    "Build/Testing/Packaging Improvement",
-    "Other",
-)
-
-FROM_REF = ""
-TO_REF = ""
-SHA_IN_CHANGELOG = []  # type: List[str]
-gh = GitHub(create_cache_dir=False)
-CACHE_PATH = p.join(p.dirname(p.realpath(__file__)), "gh_cache")
-
-
-class Description:
-    def __init__(
-        self, number: int, user: NamedUser, html_url: str, entry: str, category: str
-    ):
-        self.number = number
-        self.html_url = html_url
-        self.user = gh.get_user_cached(user._rawData["login"])  # type: ignore
-        self.entry = entry
-        self.category = category
-
-    @property
-    def formatted_entry(self) -> str:
-        # Substitute issue links.
-        # 1) issue number w/o markdown link
-        entry = re.sub(
-            r"([^[])#([0-9]{4,})",
-            r"\1[#\2](https://github.com/ClickHouse/ClickHouse/issues/\2)",
-            self.entry,
-        )
-        # 2) issue URL w/o markdown link
-        # including #issuecomment-1 or #event-12
-        entry = re.sub(
-            r"([^(])(https://github.com/ClickHouse/ClickHouse/issues/([0-9]{4,})[-#a-z0-9]*)",
-            r"\1[#\3](\2)",
-            entry,
-        )
-        # It's possible that we face a secondary rate limit.
-        # In this case we should sleep until we get it
-        while True:
-            try:
-                user_name = self.user.name if self.user.name else self.user.login
-                break
-            except UnknownObjectException:
-                user_name = self.user.login
-                break
-            except RateLimitExceededException:
-                gh.sleep_on_rate_limit()
-        return (
-            f"* {entry} [#{self.number}]({self.html_url}) "
-            f"([{user_name}]({self.user.html_url}))."
- ) - - # Sort PR descriptions by numbers - def __eq__(self, other) -> bool: - if not isinstance(self, type(other)): - return NotImplemented - return self.number == other.number - - def __lt__(self, other: "Description") -> bool: - return self.number < other.number - - -def get_descriptions(prs: PullRequests) -> Dict[str, List[Description]]: - descriptions = {} # type: Dict[str, List[Description]] - repos = {} # type: Dict[str, Repository] - for pr in prs: - # See https://github.com/PyGithub/PyGithub/issues/2202, - # obj._rawData doesn't spend additional API requests - # We'll save some requests - # pylint: disable=protected-access - repo_name = pr._rawData["base"]["repo"]["full_name"] - # pylint: enable=protected-access - if repo_name not in repos: - repos[repo_name] = pr.base.repo - in_changelog = False - merge_commit = pr.merge_commit_sha - if merge_commit is None: - logging.warning("PR %s does not have merge-commit, skipping", pr.number) - continue - - in_changelog = merge_commit in SHA_IN_CHANGELOG - if in_changelog: - desc = generate_description(pr, repos[repo_name]) - if desc: - if desc.category not in descriptions: - descriptions[desc.category] = [] - descriptions[desc.category].append(desc) - - for descs in descriptions.values(): - descs.sort() - - return descriptions - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - description="Generate a changelog in Markdown format between given tags. " - "It fetches all tags and unshallow the git repository automatically", - ) - parser.add_argument( - "-v", - "--verbose", - action="count", - default=0, - help="set the script verbosity, could be used multiple", - ) - parser.add_argument( - "--debug-helpers", - action="store_true", - help="add debug logging for git_helper and github_helper", - ) - parser.add_argument( - "--output", - type=argparse.FileType("w"), - default="-", - help="output file for changelog", - ) - parser.add_argument( - "--repo", - default="ClickHouse/ClickHouse", - help="a repository to query for pull-requests from GitHub", - ) - parser.add_argument( - "--jobs", - type=int, - default=10, - help="number of jobs to get pull-requests info from GitHub API", - ) - parser.add_argument( - "--gh-user-or-token", - help="user name or GH token to authenticate", - ) - parser.add_argument( - "--gh-password", - help="a password that should be used when user is given", - ) - parser.add_argument( - "--with-testing-tags", - action="store_true", - help="by default '*-testing' tags are ignored, this argument enables them too", - ) - parser.add_argument( - "--from", - dest="from_ref", - help="git ref for a starting point of changelog, by default is calculated " - "automatically to match a previous tag in history", - ) - parser.add_argument( - "to_ref", - metavar="TO_REF", - help="git ref for the changelog end", - ) - args = parser.parse_args() - return args - - -# This function mirrors the PR description checks in ClickhousePullRequestTrigger. -# Returns None if the PR should not be mentioned in changelog. 
-def generate_description(item: PullRequest, repo: Repository) -> Optional[Description]: - backport_number = item.number - if item.head.ref.startswith("backport/"): - branch_parts = item.head.ref.split("/") - if len(branch_parts) == 3: - try: - item = gh.get_pull_cached(repo, int(branch_parts[-1])) - except Exception as e: - logging.warning("unable to get backpoted PR, exception: %s", e) - else: - logging.warning( - "The branch %s doesn't match backport template, using PR %s as is", - item.head.ref, - item.number, - ) - description = item.body - # Don't skip empty lines because they delimit parts of description - lines = [x.strip() for x in (description.split("\n") if description else [])] - lines = [re.sub(r"\s+", " ", ln) for ln in lines] - - category = "" - entry = "" - - if lines: - i = 0 - while i < len(lines): - if re.match(r"(?i)^[#>*_ ]*change\s*log\s*category", lines[i]): - i += 1 - if i >= len(lines): - break - # Can have one empty line between header and the category itself. - # Filter it out. - if not lines[i]: - i += 1 - if i >= len(lines): - break - category = re.sub(r"^[-*\s]*", "", lines[i]) - i += 1 - elif re.match( - r"(?i)^[#>*_ ]*(short\s*description|change\s*log\s*entry)", lines[i] - ): - i += 1 - # Can have one empty line between header and the entry itself. - # Filter it out. - if i < len(lines) and not lines[i]: - i += 1 - # All following lines until empty one are the changelog entry. - entry_lines = [] - while i < len(lines) and lines[i]: - entry_lines.append(lines[i]) - i += 1 - entry = " ".join(entry_lines) - else: - i += 1 - - # Remove excessive bullets from the entry. - if re.match(r"^[\-\*] ", entry): - entry = entry[2:] - - # Better style. - if re.match(r"^[a-z]", entry): - entry = entry.capitalize() - - if not category: - # Shouldn't happen, because description check in CI should catch such PRs. - # Fall through, so that it shows up in output and the user can fix it. - category = "NO CL CATEGORY" - - # Filter out the PR categories that are not for changelog. - if re.match( - r"(?i)((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)", - category, - ): - category = "NOT FOR CHANGELOG / INSIGNIFICANT" - return Description(item.number, item.user, item.html_url, item.title, category) - - # Normalize bug fixes - if re.match( - r"(?i)bug\Wfix", - category, - ): - category = "Bug Fix (user-visible misbehavior in an official stable release)" - - # Filter out documentations changelog - if re.match( - r"(?i)doc", - category, - ): - return None - - if backport_number != item.number: - entry = f"Backported in #{backport_number}: {entry}" - - if not entry: - # Shouldn't happen, because description check in CI should catch such PRs. - category = "NO CL ENTRY" - entry = "NO CL ENTRY: '" + item.title + "'" - - entry = entry.strip() - if entry[-1] != ".": - entry += "." 
- - for c in categories_preferred_order: - if ratio(category.lower(), c.lower()) >= 90: - category = c - break - - return Description(item.number, item.user, item.html_url, entry, category) - - -def write_changelog( - fd: TextIO, descriptions: Dict[str, List[Description]], year: int -) -> None: - to_commit = runner(f"git rev-parse {TO_REF}^{{}}")[:11] - from_commit = runner(f"git rev-parse {FROM_REF}^{{}}")[:11] - fd.write( - f"---\nsidebar_position: 1\nsidebar_label: {year}\n---\n\n" - f"# {year} Changelog\n\n" - f"### ClickHouse release {TO_REF} ({to_commit}) FIXME " - f"as compared to {FROM_REF} ({from_commit})\n\n" - ) - - seen_categories = [] # type: List[str] - for category in categories_preferred_order: - if category in descriptions: - seen_categories.append(category) - fd.write(f"#### {category}\n") - for desc in descriptions[category]: - fd.write(f"{desc.formatted_entry}\n") - - fd.write("\n") - - for category in sorted(descriptions): - if category not in seen_categories: - fd.write(f"#### {category}\n\n") - for desc in descriptions[category]: - fd.write(f"{desc.formatted_entry}\n") - - fd.write("\n") - - -def check_refs(from_ref: Optional[str], to_ref: str, with_testing_tags: bool): - global FROM_REF, TO_REF - TO_REF = to_ref - - # Check TO_REF - runner.run(f"git rev-parse {TO_REF}") - - # Check from_ref - if from_ref is None: - # Get all tags pointing to TO_REF - tags = runner.run(f"git tag --points-at '{TO_REF}^{{}}'").split("\n") - logging.info("All tags pointing to %s:\n%s", TO_REF, tags) - if not with_testing_tags: - tags.append("*-testing") - exclude = " ".join([f"--exclude='{tag}'" for tag in tags]) - cmd = f"git describe --abbrev=0 --tags {exclude} '{TO_REF}'" - FROM_REF = runner.run(cmd) - else: - runner.run(f"git rev-parse {FROM_REF}") - FROM_REF = from_ref - - -def set_sha_in_changelog(): - global SHA_IN_CHANGELOG - SHA_IN_CHANGELOG = runner.run( - f"git log --format=format:%H {FROM_REF}..{TO_REF}" - ).split("\n") - - -def get_year(prs: PullRequests) -> int: - if not prs: - return date.today().year - return max(pr.created_at.year for pr in prs) - - -def main(): - log_levels = [logging.WARN, logging.INFO, logging.DEBUG] - args = parse_args() - logging.basicConfig( - format="%(asctime)s %(levelname)-8s [%(filename)s:%(lineno)d]:\n%(message)s", - level=log_levels[min(args.verbose, 2)], - ) - if args.debug_helpers: - logging.getLogger("github_helper").setLevel(logging.DEBUG) - logging.getLogger("git_helper").setLevel(logging.DEBUG) - # Create a cache directory - if not p.isdir(CACHE_PATH): - os.mkdir(CACHE_PATH, 0o700) - - # Get the full repo - if is_shallow(): - logging.info("Unshallow repository") - runner.run("git fetch --unshallow", stderr=DEVNULL) - logging.info("Fetching all tags") - runner.run("git fetch --tags", stderr=DEVNULL) - - check_refs(args.from_ref, args.to_ref, args.with_testing_tags) - set_sha_in_changelog() - - logging.info("Using %s..%s as changelog interval", FROM_REF, TO_REF) - - # use merge-base commit as a starting point, if used ref in another branch - base_commit = runner.run(f"git merge-base '{FROM_REF}^{{}}' '{TO_REF}^{{}}'") - # Get starting and ending dates for gathering PRs - # Add one day after and before to mitigate TZ possible issues - # `tag^{}` format gives commit ref when we have annotated tags - # format %cs gives a committer date, works better for cherry-picked commits - from_date = runner.run(f"git log -1 --format=format:%cs '{base_commit}'") - to_date = runner.run(f"git log -1 --format=format:%cs '{TO_REF}^{{}}'") - merged = ( - 
date.fromisoformat(from_date) - timedelta(1), - date.fromisoformat(to_date) + timedelta(1), - ) - - # Get all PRs for the given time frame - global gh - gh = GitHub( - args.gh_user_or_token, - args.gh_password, - create_cache_dir=False, - per_page=100, - pool_size=args.jobs, - ) - gh.cache_path = CACHE_PATH - query = f"type:pr repo:{args.repo} is:merged" - prs = gh.get_pulls_from_search(query=query, merged=merged, sort="created") - - descriptions = get_descriptions(prs) - changelog_year = get_year(prs) - - write_changelog(args.output, descriptions, changelog_year) +import subprocess +import sys +from pathlib import Path +SCRIPT_PATH = (Path(__file__).parents[2] / "tests/ci/changelog.py").absolute() if __name__ == "__main__": - main() + subprocess.check_call(["python3", SCRIPT_PATH, *sys.argv[1:]]) diff --git a/utils/changelog/git_helper.py b/utils/changelog/git_helper.py deleted file mode 120000 index 03b05a7eddd..00000000000 --- a/utils/changelog/git_helper.py +++ /dev/null @@ -1 +0,0 @@ -../../tests/ci/git_helper.py \ No newline at end of file diff --git a/utils/changelog/github_helper.py b/utils/changelog/github_helper.py deleted file mode 120000 index 2d44dfe8000..00000000000 --- a/utils/changelog/github_helper.py +++ /dev/null @@ -1 +0,0 @@ -../../tests/ci/github_helper.py \ No newline at end of file From 0803e87a9354c0c03e1d11f4844cab19df0248b1 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 4 Jun 2024 14:20:51 +0200 Subject: [PATCH 152/215] Use GitHubCache in changelog.py --- tests/ci/changelog.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/tests/ci/changelog.py b/tests/ci/changelog.py index b1a43b1520f..a08866eb1aa 100755 --- a/tests/ci/changelog.py +++ b/tests/ci/changelog.py @@ -3,10 +3,9 @@ import argparse import logging -import os -import os.path as p import re from datetime import date, timedelta +from pathlib import Path from subprocess import DEVNULL from typing import Dict, List, Optional, TextIO @@ -14,9 +13,12 @@ from github.GithubException import RateLimitExceededException, UnknownObjectExce from github.NamedUser import NamedUser from thefuzz.fuzz import ratio # type: ignore +from cache_utils import GitHubCache +from env_helper import TEMP_PATH from git_helper import git_runner as runner from git_helper import is_shallow from github_helper import GitHub, PullRequest, PullRequests, Repository +from s3_helper import S3Helper # This array gives the preferred category order, and is also used to # normalize category names. 
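For readers who do not work in the CI scripts day to day, the helpers imported above fit together roughly as follows. This is a minimal sketch distilled from the `main()` hunks later in this patch (the exact call sites are shown below), assuming that `GitHubCache.download()` and `upload()` synchronize the local GitHub API cache directory with S3:

```python
from pathlib import Path

from cache_utils import GitHubCache
from env_helper import TEMP_PATH
from github_helper import GitHub
from s3_helper import S3Helper

gh = GitHub(create_cache_dir=False)
# Warm the local cache from S3 before issuing any GitHub API requests ...
gh_cache = GitHubCache(gh.cache_path, Path(TEMP_PATH), S3Helper())
gh_cache.download()
# ... run the searches that build the changelog ...
# ... then push the refreshed cache back so later CI runs can reuse it.
gh_cache.upload()
```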
@@ -37,7 +39,6 @@ FROM_REF = "" TO_REF = "" SHA_IN_CHANGELOG = [] # type: List[str] gh = GitHub(create_cache_dir=False) -CACHE_PATH = p.join(p.dirname(p.realpath(__file__)), "gh_cache") class Description: @@ -375,9 +376,6 @@ def main(): if args.debug_helpers: logging.getLogger("github_helper").setLevel(logging.DEBUG) logging.getLogger("git_helper").setLevel(logging.DEBUG) - # Create a cache directory - if not p.isdir(CACHE_PATH): - os.mkdir(CACHE_PATH, 0o700) # Get the full repo if is_shallow(): @@ -413,7 +411,9 @@ def main(): per_page=100, pool_size=args.jobs, ) - gh.cache_path = CACHE_PATH + temp_path = Path(TEMP_PATH) + gh_cache = GitHubCache(gh.cache_path, temp_path, S3Helper()) + gh_cache.download() query = f"type:pr repo:{args.repo} is:merged" prs = gh.get_pulls_from_search(query=query, merged=merged, sort="created") @@ -421,6 +421,7 @@ def main(): changelog_year = get_year(prs) write_changelog(args.output, descriptions, changelog_year) + gh_cache.upload() if __name__ == "__main__": From 55350a33381ad04320fda49d3fcf6d863c2bea74 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 4 Jun 2024 18:40:41 +0200 Subject: [PATCH 153/215] Fix style issues --- tests/ci/changelog.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/ci/changelog.py b/tests/ci/changelog.py index a08866eb1aa..dcdc5d515b8 100755 --- a/tests/ci/changelog.py +++ b/tests/ci/changelog.py @@ -7,7 +7,7 @@ import re from datetime import date, timedelta from pathlib import Path from subprocess import DEVNULL -from typing import Dict, List, Optional, TextIO +from typing import Any, Dict, List, Optional, TextIO from github.GithubException import RateLimitExceededException, UnknownObjectException from github.NamedUser import NamedUser @@ -84,10 +84,10 @@ class Description: ) # Sort PR descriptions by numbers - def __eq__(self, other) -> bool: + def __eq__(self, other: Any) -> bool: if not isinstance(self, type(other)): - return NotImplemented - return self.number == other.number + raise NotImplementedError + return bool(self.number == other.number) def __lt__(self, other: "Description") -> bool: return self.number < other.number @@ -331,7 +331,7 @@ def write_changelog( fd.write("\n") -def check_refs(from_ref: Optional[str], to_ref: str, with_testing_tags: bool): +def check_refs(from_ref: Optional[str], to_ref: str, with_testing_tags: bool) -> None: global FROM_REF, TO_REF TO_REF = to_ref From 3cc099a88ed37e4fae9a62b207b31b6bc471fadc Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 4 Jun 2024 20:05:18 +0200 Subject: [PATCH 154/215] Adjust changelog.py to a new release model with v24.6.1.1-new tags --- tests/ci/changelog.py | 46 ++++++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 12 deletions(-) diff --git a/tests/ci/changelog.py b/tests/ci/changelog.py index dcdc5d515b8..95b9ee9be27 100755 --- a/tests/ci/changelog.py +++ b/tests/ci/changelog.py @@ -15,10 +15,15 @@ from thefuzz.fuzz import ratio # type: ignore from cache_utils import GitHubCache from env_helper import TEMP_PATH -from git_helper import git_runner as runner -from git_helper import is_shallow +from git_helper import git_runner, is_shallow from github_helper import GitHub, PullRequest, PullRequests, Repository from s3_helper import S3Helper +from version_helper import ( + FILE_WITH_VERSION_PATH, + get_abs_path, + get_version_from_repo, + get_version_from_tag, +) # This array gives the preferred category order, and is also used to # normalize category names. 
@@ -39,6 +44,7 @@ FROM_REF = ""
 TO_REF = ""
 SHA_IN_CHANGELOG = []  # type: List[str]
 gh = GitHub(create_cache_dir=False)
+runner = git_runner


 class Description:

@@ -339,18 +345,34 @@ def check_refs(from_ref: Optional[str], to_ref: str, with_testing_tags: bool) ->
     runner.run(f"git rev-parse {TO_REF}")

     # Check from_ref
-    if from_ref is None:
-        # Get all tags pointing to TO_REF
-        tags = runner.run(f"git tag --points-at '{TO_REF}^{{}}'").split("\n")
-        logging.info("All tags pointing to %s:\n%s", TO_REF, tags)
-        if not with_testing_tags:
-            tags.append("*-testing")
-        exclude = " ".join([f"--exclude='{tag}'" for tag in tags])
-        cmd = f"git describe --abbrev=0 --tags {exclude} '{TO_REF}'"
-        FROM_REF = runner.run(cmd)
-    else:
+    if from_ref is not None:
         runner.run(f"git rev-parse {FROM_REF}")
         FROM_REF = from_ref
+        return
+
+    # Get the cmake/autogenerated_versions.txt from TO_REF to read its version
+    # If a candidate previous tag has a version greater than the one at TO_REF,
+    # then we need to add that tag to excluded_tags
+    temp_cmake = "tests/ci/tmp/autogenerated_versions.txt"
+    cmake_version = get_abs_path(temp_cmake)
+    cmake_version.write_text(runner(f"git show {TO_REF}:{FILE_WITH_VERSION_PATH}"))
+    to_ref_version = get_version_from_repo(cmake_version)
+    # Get all tags pointing to TO_REF
+    excluded_tags = runner.run(f"git tag --points-at '{TO_REF}^{{}}'").split("\n")
+    logging.info("All tags pointing to %s:\n%s", TO_REF, excluded_tags)
+    if not with_testing_tags:
+        excluded_tags.append("*-testing")
+    while not from_ref:
+        exclude = " ".join([f"--exclude='{tag}'" for tag in excluded_tags])
+        from_ref_tag = runner(f"git describe --abbrev=0 --tags {exclude} '{TO_REF}'")
+        from_ref_version = get_version_from_tag(from_ref_tag)
+        if from_ref_version <= to_ref_version:
+            from_ref = from_ref_tag
+            break
+        excluded_tags.append(from_ref_tag)
+
+    cmake_version.unlink()
+    FROM_REF = from_ref


 def set_sha_in_changelog():

From 335ec8f3e181ae9df21a6295cfac10e51a99030a Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev"
Date: Mon, 10 Jun 2024 13:43:22 +0200
Subject: [PATCH 155/215] Parse `changelog entry is not required` properly

---
 tests/ci/changelog.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/ci/changelog.py b/tests/ci/changelog.py
index 95b9ee9be27..0cf21589669 100755
--- a/tests/ci/changelog.py
+++ b/tests/ci/changelog.py
@@ -266,7 +266,9 @@ def generate_description(item: PullRequest, repo: Repository) -> Optional[Descri
     # Filter out the PR categories that are not for changelog.
     if re.match(
-        r"(?i)((non|in|not|un)[-\s]*significant)|(not[ ]*for[ ]*changelog)",
+        r"(?i)((non|in|not|un)[-\s]*significant)|"
+        r"(not[ ]*for[ ]*changelog)|"
+        r"(changelog[ ]*entry[ ]*is[ ]*not[ ]*required)",
         category,
     ):
         category = "NOT FOR CHANGELOG / INSIGNIFICANT"

From f4630e9daed0786b163572bb9f6d48bf509d36af Mon Sep 17 00:00:00 2001
From: "Mikhail f. Shiryaev"
Date: Mon, 10 Jun 2024 14:18:13 +0200
Subject: [PATCH 156/215] Fix `Backported in` for not-for-changelog PRs

---
 tests/ci/changelog.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/tests/ci/changelog.py b/tests/ci/changelog.py
index 0cf21589669..bc52a47fb38 100755
--- a/tests/ci/changelog.py
+++ b/tests/ci/changelog.py
@@ -264,15 +264,22 @@ def generate_description(item: PullRequest, repo: Repository) -> Optional[Descri
         # Fall through, so that it shows up in output and the user can fix it.
         category = "NO CL CATEGORY"

-    # Filter out the PR categories that are not for changelog.
+ # Filter out documentations changelog before not-for-changelog if re.match( + r"(?i)doc", + category, + ): + return None + + # Filter out the PR categories that are not for changelog. + if re.search( r"(?i)((non|in|not|un)[-\s]*significant)|" r"(not[ ]*for[ ]*changelog)|" r"(changelog[ ]*entry[ ]*is[ ]*not[ ]*required)", category, ): category = "NOT FOR CHANGELOG / INSIGNIFICANT" - return Description(item.number, item.user, item.html_url, item.title, category) + entry = item.title # Normalize bug fixes if re.match( @@ -281,13 +288,6 @@ def generate_description(item: PullRequest, repo: Repository) -> Optional[Descri ): category = "Bug Fix (user-visible misbehavior in an official stable release)" - # Filter out documentations changelog - if re.match( - r"(?i)doc", - category, - ): - return None - if backport_number != item.number: entry = f"Backported in #{backport_number}: {entry}" From 3dba47297cf9a071f148c6753d76b0fbd8915100 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Mon, 10 Jun 2024 14:28:33 +0200 Subject: [PATCH 157/215] Generate omit v24.1.6.52-stable.md changelog --- docs/changelogs/v24.1.6.52-stable.md | 45 ++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) create mode 100644 docs/changelogs/v24.1.6.52-stable.md diff --git a/docs/changelogs/v24.1.6.52-stable.md b/docs/changelogs/v24.1.6.52-stable.md new file mode 100644 index 00000000000..341561e9a64 --- /dev/null +++ b/docs/changelogs/v24.1.6.52-stable.md @@ -0,0 +1,45 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.1.6.52-stable (fa09f677bc9) FIXME as compared to v24.1.5.6-stable (7f67181ff31) + +#### Improvement +* Backported in [#60292](https://github.com/ClickHouse/ClickHouse/issues/60292): Copy S3 file GCP fallback to buffer copy in case GCP returned `Internal Error` with `GATEWAY_TIMEOUT` HTTP error code. [#60164](https://github.com/ClickHouse/ClickHouse/pull/60164) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#60832](https://github.com/ClickHouse/ClickHouse/issues/60832): Update tzdata to 2024a. [#60768](https://github.com/ClickHouse/ClickHouse/pull/60768) ([Raúl Marín](https://github.com/Algunenano)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Backported in [#60413](https://github.com/ClickHouse/ClickHouse/issues/60413): Fix segmentation fault in KQL parser when the input query exceeds the `max_query_size`. Also re-enable the KQL dialect. Fixes [#59036](https://github.com/ClickHouse/ClickHouse/issues/59036) and [#59037](https://github.com/ClickHouse/ClickHouse/issues/59037). [#59626](https://github.com/ClickHouse/ClickHouse/pull/59626) ([Yong Wang](https://github.com/kashwy)). +* Backported in [#60074](https://github.com/ClickHouse/ClickHouse/issues/60074): Fix error `Read beyond last offset` for `AsynchronousBoundedReadBuffer`. [#59630](https://github.com/ClickHouse/ClickHouse/pull/59630) ([Vitaly Baranov](https://github.com/vitlibar)). +* Backported in [#60299](https://github.com/ClickHouse/ClickHouse/issues/60299): Fix having neigher acked nor nacked messages. If exception happens during read-write phase, messages will be nacked. [#59775](https://github.com/ClickHouse/ClickHouse/pull/59775) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#60066](https://github.com/ClickHouse/ClickHouse/issues/60066): Fix optimize_uniq_to_count removing the column alias. [#60026](https://github.com/ClickHouse/ClickHouse/pull/60026) ([Raúl Marín](https://github.com/Algunenano)). 
+* Backported in [#60638](https://github.com/ClickHouse/ClickHouse/issues/60638): Fixed a bug in parallel optimization for queries with `FINAL`, which could give an incorrect result in rare cases. [#60041](https://github.com/ClickHouse/ClickHouse/pull/60041) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#60177](https://github.com/ClickHouse/ClickHouse/issues/60177): Fix cosineDistance crash with Nullable. [#60150](https://github.com/ClickHouse/ClickHouse/pull/60150) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#60279](https://github.com/ClickHouse/ClickHouse/issues/60279): Hide sensitive info for `S3Queue` table engine. [#60233](https://github.com/ClickHouse/ClickHouse/pull/60233) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#61000](https://github.com/ClickHouse/ClickHouse/issues/61000): Reduce the number of read rows from `system.numbers`. Fixes [#59418](https://github.com/ClickHouse/ClickHouse/issues/59418). [#60546](https://github.com/ClickHouse/ClickHouse/pull/60546) ([JackyWoo](https://github.com/JackyWoo)). +* Backported in [#60791](https://github.com/ClickHouse/ClickHouse/issues/60791): Fix buffer overflow that can happen if the attacker asks the HTTP server to decompress data with a composition of codecs and size triggering numeric overflow. Fix buffer overflow that can happen inside codec NONE on wrong input data. This was submitted by TIANGONG research team through our [Bug Bounty program](https://github.com/ClickHouse/ClickHouse/issues/38986). [#60731](https://github.com/ClickHouse/ClickHouse/pull/60731) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#60783](https://github.com/ClickHouse/ClickHouse/issues/60783): Functions for SQL/JSON were able to read uninitialized memory. This closes [#60017](https://github.com/ClickHouse/ClickHouse/issues/60017). Found by Fuzzer. [#60738](https://github.com/ClickHouse/ClickHouse/pull/60738) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#60803](https://github.com/ClickHouse/ClickHouse/issues/60803): Do not set aws custom metadata `x-amz-meta-*` headers on UploadPart & CompleteMultipartUpload calls. [#60748](https://github.com/ClickHouse/ClickHouse/pull/60748) ([Francisco J. Jurado Moreno](https://github.com/Beetelbrox)). +* Backported in [#60820](https://github.com/ClickHouse/ClickHouse/issues/60820): Fix crash in arrayEnumerateRanked. [#60764](https://github.com/ClickHouse/ClickHouse/pull/60764) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#60841](https://github.com/ClickHouse/ClickHouse/issues/60841): Fix crash when using input() in INSERT SELECT JOIN. Closes [#60035](https://github.com/ClickHouse/ClickHouse/issues/60035). [#60765](https://github.com/ClickHouse/ClickHouse/pull/60765) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#60904](https://github.com/ClickHouse/ClickHouse/issues/60904): Avoid segfault if too many keys are skipped when reading from S3. [#60849](https://github.com/ClickHouse/ClickHouse/pull/60849) ([Antonio Andelic](https://github.com/antonio2368)). + +#### NO CL CATEGORY + +* Backported in [#60186](https://github.com/ClickHouse/ClickHouse/issues/60186):. [#60181](https://github.com/ClickHouse/ClickHouse/pull/60181) ([Alexey Milovidov](https://github.com/alexey-milovidov)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Backported in [#60333](https://github.com/ClickHouse/ClickHouse/issues/60333): CI: Fix job failures due to jepsen artifacts. 
[#59890](https://github.com/ClickHouse/ClickHouse/pull/59890) ([Max K.](https://github.com/maxknv)). +* Backported in [#60034](https://github.com/ClickHouse/ClickHouse/issues/60034): Fix mark release ready. [#59994](https://github.com/ClickHouse/ClickHouse/pull/59994) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#60326](https://github.com/ClickHouse/ClickHouse/issues/60326): Ability to detect undead ZooKeeper sessions. [#60044](https://github.com/ClickHouse/ClickHouse/pull/60044) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Backported in [#60363](https://github.com/ClickHouse/ClickHouse/issues/60363): CI: hot fix for gh statuses. [#60201](https://github.com/ClickHouse/ClickHouse/pull/60201) ([Max K.](https://github.com/maxknv)). +* Backported in [#60648](https://github.com/ClickHouse/ClickHouse/issues/60648): Detect io_uring in tests. [#60373](https://github.com/ClickHouse/ClickHouse/pull/60373) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#60569](https://github.com/ClickHouse/ClickHouse/issues/60569): Remove broken test while we fix it. [#60547](https://github.com/ClickHouse/ClickHouse/pull/60547) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#60756](https://github.com/ClickHouse/ClickHouse/issues/60756): Update shellcheck. [#60553](https://github.com/ClickHouse/ClickHouse/pull/60553) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#60584](https://github.com/ClickHouse/ClickHouse/issues/60584): CI: fix docker build job name. [#60554](https://github.com/ClickHouse/ClickHouse/pull/60554) ([Max K.](https://github.com/maxknv)). + From 4b90e0ecd4c3f4d6d7029415f220b0a9dc98cced Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 11 Jun 2024 13:23:36 +0200 Subject: [PATCH 158/215] Improve splitting search_issues, solving pylint issues --- tests/ci/github_helper.py | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tests/ci/github_helper.py b/tests/ci/github_helper.py index eb0f6c24527..3fe72214430 100644 --- a/tests/ci/github_helper.py +++ b/tests/ci/github_helper.py @@ -49,38 +49,43 @@ class GitHub(github.Github): """Wrapper around search method with throttling and splitting by date. 
We split only by the first""" - splittable = False + splittable_arg = "" + splittable_value = [] for arg, value in kwargs.items(): if arg in ["closed", "created", "merged", "updated"]: if hasattr(value, "__iter__") and not isinstance(value, str): - assert [True for v in value if isinstance(v, (date, datetime))] + assert all(True for v in value if isinstance(v, (date, datetime))) assert len(value) == 2 kwargs[arg] = f"{value[0].isoformat()}..{value[1].isoformat()}" - if not splittable: + if not splittable_arg: # We split only by the first met splittable argument - preserved_arg = arg - preserved_value = value middle_value = value[0] + (value[1] - value[0]) / 2 - splittable = middle_value not in value + if middle_value in value: + # When the middle value in itareble value, we can't use it + # to split by dates later + continue + splittable_arg = arg + splittable_value = value continue assert isinstance(value, (date, datetime, str)) inter_result = [] # type: Issues + exception = RateLimitExceededException(0) for i in range(self.retries): try: logger.debug("Search issues, args=%s, kwargs=%s", args, kwargs) result = super().search_issues(*args, **kwargs) - if result.totalCount == 1000 and splittable: + if result.totalCount == 1000 and splittable_arg: # The hard limit is 1000. If it's splittable, then we make # two subrequests requests with less time frames logger.debug( "The search result contain exactly 1000 results, " "splitting %s=%s by middle point %s", - preserved_arg, - kwargs[preserved_arg], + splittable_arg, + kwargs[splittable_arg], middle_value, ) - kwargs[preserved_arg] = [preserved_value[0], middle_value] + kwargs[splittable_arg] = [splittable_value[0], middle_value] inter_result.extend(self.search_issues(*args, **kwargs)) if isinstance(middle_value, date): # When middle_value is a date, 2022-01-01..2022-01-03 @@ -88,9 +93,10 @@ class GitHub(github.Github): # 2022-01-02..2022-01-03, so we have results for # 2022-01-02 twicely. We split it to # 2022-01-01..2022-01-02 and 2022-01-03..2022-01-03. - # 2022-01-01..2022-01-02 aren't split, see splittable + # 2022-01-01..2022-01-02 aren't split, see splittable_arg + # definition above for kwargs.items middle_value += timedelta(days=1) - kwargs[preserved_arg] = [middle_value, preserved_value[1]] + kwargs[splittable_arg] = [middle_value, splittable_value[1]] inter_result.extend(self.search_issues(*args, **kwargs)) return inter_result From c797c8105d52c385dc1f2872e888db0b5c1462cf Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Tue, 11 Jun 2024 13:24:29 +0200 Subject: [PATCH 159/215] Allow to pass tqdm.tqdm into get_pulls_from_search --- tests/ci/github_helper.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/ci/github_helper.py b/tests/ci/github_helper.py index 3fe72214430..b6407c5d531 100644 --- a/tests/ci/github_helper.py +++ b/tests/ci/github_helper.py @@ -6,7 +6,7 @@ from datetime import date, datetime, timedelta from os import path as p from pathlib import Path from time import sleep -from typing import List, Optional, Tuple, Union +from typing import Any, Callable, List, Optional, Tuple, Union import github import requests @@ -110,12 +110,15 @@ class GitHub(github.Github): raise exception # pylint: enable=signature-differs - def get_pulls_from_search(self, *args, **kwargs) -> PullRequests: # type: ignore + def get_pulls_from_search(self, *args: Any, **kwargs: Any) -> PullRequests: """The search api returns actually issues, so we need to fetch PullRequests""" issues = self.search_issues(*args, **kwargs) repos = {} prs = [] # type: PullRequests - for issue in issues: + progress_func = kwargs.pop( + "progress_func", lambda x: x + ) # type: Callable[[Issues], Issues] + for issue in progress_func(issues): # See https://github.com/PyGithub/PyGithub/issues/2202, # obj._rawData doesn't spend additional API requests # pylint: disable=protected-access From 94c82787c555237e823ceebcc75b24016aceba62 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Tue, 11 Jun 2024 13:26:35 +0200 Subject: [PATCH 160/215] Add a progress function for searching PRs in changelog.py --- docker/test/style/Dockerfile | 1 + tests/ci/changelog.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index 91768c8328d..54fab849301 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -33,6 +33,7 @@ RUN pip3 install \ flake8==4.0.1 \ requests \ thefuzz \ + tqdm==4.66.4 \ types-requests \ unidiff \ && rm -rf /root/.cache/pip diff --git a/tests/ci/changelog.py b/tests/ci/changelog.py index bc52a47fb38..fcb61d3f605 100755 --- a/tests/ci/changelog.py +++ b/tests/ci/changelog.py @@ -9,6 +9,7 @@ from pathlib import Path from subprocess import DEVNULL from typing import Any, Dict, List, Optional, TextIO +import tqdm # type: ignore from github.GithubException import RateLimitExceededException, UnknownObjectException from github.NamedUser import NamedUser from thefuzz.fuzz import ratio # type: ignore @@ -439,7 +440,9 @@ def main(): gh_cache = GitHubCache(gh.cache_path, temp_path, S3Helper()) gh_cache.download() query = f"type:pr repo:{args.repo} is:merged" - prs = gh.get_pulls_from_search(query=query, merged=merged, sort="created") + prs = gh.get_pulls_from_search( + query=query, merged=merged, sort="created", progress_func=tqdm.tqdm + ) descriptions = get_descriptions(prs) changelog_year = get_year(prs) From feef24b9a3e8fad38f2e6a7c52fe4a9267d1e2ce Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 12 Jun 2024 12:51:05 +0000 Subject: [PATCH 161/215] Rename submodule "abseil" to "contrib/abseil" --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 6d64c52ce00..0020bdd006a 100644 --- a/.gitmodules +++ b/.gitmodules @@ -161,7 +161,7 @@ [submodule "contrib/xz"] path = contrib/xz url = https://github.com/xz-mirror/xz -[submodule "abseil"] +[submodule "contrib/abseil"] path = contrib/abseil-cpp url = 
https://github.com/ClickHouse/abseil-cpp.git [submodule "contrib/dragonbox"] From 13b23e9e9c3465902ad57c6b1e802c2d54832e42 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 12 Jun 2024 13:05:33 +0000 Subject: [PATCH 162/215] Check submodule name in style check --- .gitmodules | 6 +++--- utils/check-style/check-style | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitmodules b/.gitmodules index 0020bdd006a..a3bb7eacabc 100644 --- a/.gitmodules +++ b/.gitmodules @@ -91,13 +91,13 @@ [submodule "contrib/aws"] path = contrib/aws url = https://github.com/ClickHouse/aws-sdk-cpp -[submodule "aws-c-event-stream"] +[submodule "contrib/aws-c-event-stream"] path = contrib/aws-c-event-stream url = https://github.com/awslabs/aws-c-event-stream -[submodule "aws-c-common"] +[submodule "contrib/aws-c-common"] path = contrib/aws-c-common url = https://github.com/awslabs/aws-c-common.git -[submodule "aws-checksums"] +[submodule "contrib/aws-checksums"] path = contrib/aws-checksums url = https://github.com/awslabs/aws-checksums [submodule "contrib/curl"] diff --git a/utils/check-style/check-style b/utils/check-style/check-style index 5c05907e9dd..4c6a4b9ea39 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -243,6 +243,10 @@ done # All the submodules should be from https://github.com/ find $ROOT_PATH -name '.gitmodules' | while read i; do grep -F 'url = ' $i | grep -v -F 'https://github.com/' && echo 'All the submodules should be from https://github.com/'; done +# All submodules should be of this form: [submodule "contrib/libxyz"] (for consistency, the submodule name does matter too much) +# - restrict the check to top-level .gitmodules file +find $ROOT_PATH -maxdepth 1 -name '.gitmodules' | while read i; do grep -F '[submodule ' $i | grep -v -F 'contrib' && echo 'All submodules should have form [submodule "contrib/libxyz"]'; done + # There shouldn't be any code snippets under GPL or LGPL find $ROOT_PATH/{src,base,programs} -name '*.h' -or -name '*.cpp' 2>/dev/null | xargs grep -i -F 'General Public License' && echo "There shouldn't be any code snippets under GPL or LGPL" From 84e81daa3ec51046d027188763e2e6c6d7a9c9f5 Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 12 Jun 2024 15:09:50 +0200 Subject: [PATCH 163/215] Second pass fix remaining broken links --- .../aggregate-functions/combinators.md | 8 +-- .../parametric-functions.md | 16 +++-- .../reference/stochasticlinearregression.md | 2 +- .../reference/stochasticlogisticregression.md | 2 +- .../aggregate-functions/reference/varpop.md | 8 +-- .../aggregate-functions/reference/varsamp.md | 6 +- docs/en/sql-reference/dictionaries/index.md | 60 +++++++++---------- .../functions/bitmap-functions.md | 6 +- .../functions/date-time-functions.md | 4 +- .../en/sql-reference/functions/geo/geohash.md | 8 +-- docs/en/sql-reference/functions/geo/h3.md | 44 +++++++------- docs/en/sql-reference/functions/geo/s2.md | 10 ++-- .../functions/other-functions.md | 10 ++-- .../functions/rounding-functions.md | 2 +- .../functions/string-search-functions.md | 2 +- .../functions/type-conversion-functions.md | 30 +++++----- docs/en/sql-reference/operators/in.md | 2 +- .../table-functions/fileCluster.md | 2 +- 18 files changed, 113 insertions(+), 109 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/combinators.md b/docs/en/sql-reference/aggregate-functions/combinators.md index 5351531afdb..e30aa66b3b3 100644 --- a/docs/en/sql-reference/aggregate-functions/combinators.md +++ 
b/docs/en/sql-reference/aggregate-functions/combinators.md @@ -106,14 +106,14 @@ To work with these states, use: - [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) table engine. - [finalizeAggregation](../../sql-reference/functions/other-functions.md#function-finalizeaggregation) function. - [runningAccumulate](../../sql-reference/functions/other-functions.md#runningaccumulate) function. -- [-Merge](#aggregate_functions_combinators_merge) combinator. -- [-MergeState](#aggregate_functions_combinators_mergestate) combinator. +- [-Merge](#-merge) combinator. +- [-MergeState](#-mergestate) combinator. -## -Merge {#aggregate_functions_combinators_merge} +## -Merge If you apply this combinator, the aggregate function takes the intermediate aggregation state as an argument, combines the states to finish aggregation, and returns the resulting value. -## -MergeState {#aggregate_functions_combinators_mergestate} +## -MergeState Merges the intermediate aggregation states in the same way as the -Merge combinator. However, it does not return the resulting value, but an intermediate aggregation state, similar to the -State combinator. diff --git a/docs/en/sql-reference/aggregate-functions/parametric-functions.md b/docs/en/sql-reference/aggregate-functions/parametric-functions.md index 43ded9df60a..093d88f939f 100644 --- a/docs/en/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/en/sql-reference/aggregate-functions/parametric-functions.md @@ -82,10 +82,12 @@ FROM In this case, you should remember that you do not know the histogram bin borders. -## sequenceMatch(pattern)(timestamp, cond1, cond2, ...) {#function-sequencematch} +## sequenceMatch Checks whether the sequence contains an event chain that matches the pattern. +**Syntax** + ``` sql sequenceMatch(pattern)(timestamp, cond1, cond2, ...) ``` @@ -102,7 +104,7 @@ Events that occur at the same second may lay in the sequence in an undefined ord **Parameters** -- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). +- `pattern` — Pattern string. See [Pattern syntax](#sequencematch). **Returned values** @@ -170,9 +172,9 @@ SELECT sequenceMatch('(?1)(?2)')(time, number = 1, number = 2, number = 4) FROM **See Also** -- [sequenceCount](#function-sequencecount) +- [sequenceCount](#sequencecount) -## sequenceCount(pattern)(time, cond1, cond2, ...) {#function-sequencecount} +## sequenceCount Counts the number of event chains that matched the pattern. The function searches event chains that do not overlap. It starts to search for the next chain after the current chain is matched. @@ -180,6 +182,8 @@ Counts the number of event chains that matched the pattern. The function searche Events that occur at the same second may lay in the sequence in an undefined order affecting the result. ::: +**Syntax** + ``` sql sequenceCount(pattern)(timestamp, cond1, cond2, ...) ``` @@ -192,7 +196,7 @@ sequenceCount(pattern)(timestamp, cond1, cond2, ...) **Parameters** -- `pattern` — Pattern string. See [Pattern syntax](#sequence-function-pattern-syntax). +- `pattern` — Pattern string. See [Pattern syntax](#sequencematch). 
**Returned values** @@ -229,7 +233,7 @@ SELECT sequenceCount('(?1).*(?2)')(time, number = 1, number = 2) FROM t **See Also** -- [sequenceMatch](#function-sequencematch) +- [sequenceMatch](#sequencematch) ## windowFunnel diff --git a/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md b/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md index 15533ba9fd7..7ab9e1d3256 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/stochasticlinearregression.md @@ -72,5 +72,5 @@ The query will return a column of predicted values. Note that first argument of **See Also** -- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#agg_functions_stochasticlinearregression_parameters) +- [stochasticLogisticRegression](../../../sql-reference/aggregate-functions/reference/stochasticlogisticregression.md#stochasticlogisticregression) - [Difference between linear and logistic regressions](https://stackoverflow.com/questions/12146914/what-is-the-difference-between-linear-regression-and-logistic-regression) diff --git a/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md b/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md index 0a040689681..4bf5529ddcb 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md +++ b/docs/en/sql-reference/aggregate-functions/reference/stochasticlogisticregression.md @@ -11,7 +11,7 @@ This function implements stochastic logistic regression. It can be used for bina Parameters are exactly the same as in stochasticLinearRegression: `learning rate`, `l2 regularization coefficient`, `mini-batch size`, `method for updating weights`. -For more information see [parameters](#agg_functions-stochasticlinearregression-parameters). +For more information see [parameters](../reference/stochasticlinearregression.md/#parameters). ``` text stochasticLogisticRegression(1.0, 1.0, 10, 'SGD') diff --git a/docs/en/sql-reference/aggregate-functions/reference/varpop.md b/docs/en/sql-reference/aggregate-functions/reference/varpop.md index d2b19fe2a3e..4e010248f6e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/varpop.md +++ b/docs/en/sql-reference/aggregate-functions/reference/varpop.md @@ -6,7 +6,7 @@ sidebar_position: 32 This page covers the `varPop` and `varPopStable` functions available in ClickHouse. -## varPop {#varPop} +## varPop Calculates the population covariance between two data columns. The population covariance measures the degree to which two variables vary together. Calculates the amount `Σ((x - x̅)^2) / n`, where `n` is the sample size and `x̅`is the average value of `x`. @@ -27,7 +27,7 @@ Returns an integer of type `Float64`. **Implementation details** -This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varPopStable`](#varPopStable) function. +This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varPopStable`](#varpopstable) function. **Example** @@ -55,7 +55,7 @@ Result: 3 ``` -## varPopStable {#varPopStable} +## varPopStable Calculates population covariance between two data columns using a stable, numerically accurate method to calculate the variance. 
This function is designed to provide reliable results even with large datasets or values that might cause numerical instability in other implementations. @@ -76,7 +76,7 @@ Returns an integer of type `Float64`. **Implementation details** -Unlike [`varPop`](#varPop), this function uses a stable, numerically accurate algorithm to calculate the population variance to avoid issues like catastrophic cancellation or loss of precision. This function also handles `NaN` and `Inf` values correctly, excluding them from calculations. +Unlike [`varPop`](#varpop), this function uses a stable, numerically accurate algorithm to calculate the population variance to avoid issues like catastrophic cancellation or loss of precision. This function also handles `NaN` and `Inf` values correctly, excluding them from calculations. **Example** diff --git a/docs/en/sql-reference/aggregate-functions/reference/varsamp.md b/docs/en/sql-reference/aggregate-functions/reference/varsamp.md index e9ec9ba2bc1..bd1cfa5742a 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/varsamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/varsamp.md @@ -40,7 +40,7 @@ Where: The function assumes that the input data set represents a sample from a larger population. If you want to calculate the variance of the entire population (when you have the complete data set), you should use the [`varPop()` function](./varpop#varpop) instead. -This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varSampStable`](#varSampStable) function. +This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varSampStable`](#varsampstable) function. **Example** @@ -66,7 +66,7 @@ Response: 0.8650000000000091 ``` -## varSampStable {#varSampStable} +## varSampStable Calculate the sample variance of a data set using a numerically stable algorithm. @@ -86,7 +86,7 @@ The `varSampStable` function returns a Float64 value representing the sample var **Implementation details** -The `varSampStable` function calculates the sample variance using the same formula as the [`varSamp`](#varSamp) function: +The `varSampStable` function calculates the sample variance using the same formula as the [`varSamp`](#varsamp) function: ```plaintext ∑(x - mean(x))^2 / (n - 1) diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md index 437b836ec0e..4c7421d57c0 100644 --- a/docs/en/sql-reference/dictionaries/index.md +++ b/docs/en/sql-reference/dictionaries/index.md @@ -23,7 +23,7 @@ ClickHouse supports: If you are getting started with Dictionaries in ClickHouse we have a tutorial that covers that topic. Take a look [here](/docs/en/tutorial.md). ::: -You can add your own dictionaries from various data sources. The source for a dictionary can be a ClickHouse table, a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Dictionary Sources](#dictionary_sources)”. +You can add your own dictionaries from various data sources. The source for a dictionary can be a ClickHouse table, a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Dictionary Sources](#dictionary-sources)”. ClickHouse: @@ -75,14 +75,14 @@ The dictionary configuration file has the following format: ``` -You can [configure](#configuring_a_dictionary) any number of dictionaries in the same file. 
+You can [configure](#configuring-a-dictionary) any number of dictionaries in the same file. :::note You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../../sql-reference/functions/other-functions.md) function). This functionality is not related to dictionaries. ::: -## Configuring a Dictionary {#configuring_a_dictionary} +## Configuring a Dictionary @@ -123,7 +123,7 @@ LAYOUT(...) -- Memory layout configuration LIFETIME(...) -- Lifetime of dictionary in memory ``` -## Storing Dictionaries in Memory {#storing-dictionaries-in-memory} +## Storing Dictionaries in Memory There are a variety of ways to store dictionaries in memory. @@ -415,7 +415,7 @@ or LAYOUT(COMPLEX_KEY_HASHED_ARRAY([SHARDS 1])) ``` -### range_hashed {#range_hashed} +### range_hashed The dictionary is stored in memory in the form of a hash table with an ordered array of ranges and their corresponding values. @@ -679,7 +679,7 @@ When searching for a dictionary, the cache is searched first. For each block of If keys are not found in dictionary, then update cache task is created and added into update queue. Update queue properties can be controlled with settings `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates`. -For cache dictionaries, the expiration [lifetime](#lifetime) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired. The key is re-requested the next time it needs to be used. This behaviour can be configured with setting `allow_read_expired_keys`. +For cache dictionaries, the expiration [lifetime](#refreshing-dictionary-data-using-lifetime) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired. The key is re-requested the next time it needs to be used. This behaviour can be configured with setting `allow_read_expired_keys`. This is the least effective of all the ways to store dictionaries. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary performs well only when the hit rates are high enough (recommended 99% and higher). You can view the average hit rate in the [system.dictionaries](../../operations/system-tables/dictionaries.md) table. @@ -771,7 +771,7 @@ The dictionary is not stored in memory and directly goes to the source during th The dictionary key has the [UInt64](../../sql-reference/data-types/int-uint.md) type. -All types of [sources](#dictionary_sources), except local files, are supported. +All types of [sources](#dictionary-sources), except local files, are supported. Configuration example: @@ -899,7 +899,7 @@ Other types are not supported yet. The function returns the attribute for the pr Data must completely fit into RAM. -## Refreshing dictionary data using LIFETIME {#lifetime} +## Refreshing dictionary data using LIFETIME ClickHouse periodically updates dictionaries based on the `LIFETIME` tag (defined in seconds). `LIFETIME` is the update interval for fully downloaded dictionaries and the invalidation interval for cached dictionaries. @@ -952,7 +952,7 @@ LIFETIME(MIN 300 MAX 360) If `0` and `0`, ClickHouse does not reload the dictionary by timeout. In this case, ClickHouse can reload the dictionary earlier if the dictionary configuration file was changed or the `SYSTEM RELOAD DICTIONARY` command was executed. 
-When updating the dictionaries, the ClickHouse server applies different logic depending on the type of [source](#dictionary_sources): +When updating the dictionaries, the ClickHouse server applies different logic depending on the type of [source](#dictionary-sources): - For a text file, it checks the time of modification. If the time differs from the previously recorded time, the dictionary is updated. - For MySQL source, the time of modification is checked using a `SHOW TABLE STATUS` query (in case of MySQL 8 you need to disable meta-information caching in MySQL by `set global information_schema_stats_expiry=0`). @@ -961,7 +961,7 @@ When updating the dictionaries, the ClickHouse server applies different logic de For other sources (ODBC, PostgreSQL, ClickHouse, etc), you can set up a query that will update the dictionaries only if they really changed, rather than each time. To do this, follow these steps: - The dictionary table must have a field that always changes when the source data is updated. -- The settings of the source must specify a query that retrieves the changing field. The ClickHouse server interprets the query result as a row, and if this row has changed relative to its previous state, the dictionary is updated. Specify the query in the `` field in the settings for the [source](#dictionary_sources). +- The settings of the source must specify a query that retrieves the changing field. The ClickHouse server interprets the query result as a row, and if this row has changed relative to its previous state, the dictionary is updated. Specify the query in the `` field in the settings for the [source](#dictionary-sources). Example of settings: @@ -1031,7 +1031,7 @@ SOURCE(CLICKHOUSE(... update_field 'added_time' update_lag 15)) ... ``` -## Dictionary Sources {#dictionary_sources} +## Dictionary Sources @@ -1065,7 +1065,7 @@ SOURCE(SOURCE_TYPE(param1 val1 ... paramN valN)) -- Source configuration The source is configured in the `source` section. -For source types [Local file](#local_file), [Executable file](#executable), [HTTP(s)](#https), [ClickHouse](#clickhouse) +For source types [Local file](#local-file), [Executable file](#executable-file), [HTTP(s)](#https), [ClickHouse](#clickhouse) optional settings are available: ``` xml @@ -1089,9 +1089,9 @@ SETTINGS(format_csv_allow_single_quotes = 0) Types of sources (`source_type`): -- [Local file](#local_file) -- [Executable File](#executable) -- [Executable Pool](#executable_pool) +- [Local file](#local-file) +- [Executable File](#executable-file) +- [Executable Pool](#executable-pool) - [HTTP(S)](#https) - DBMS - [ODBC](#odbc) @@ -1102,7 +1102,7 @@ Types of sources (`source_type`): - [Cassandra](#cassandra) - [PostgreSQL](#postgresql) -### Local File {#local_file} +### Local File Example of settings: @@ -1132,7 +1132,7 @@ When a dictionary with source `FILE` is created via DDL command (`CREATE DICTION - [Dictionary function](../../sql-reference/table-functions/dictionary.md#dictionary-function) -### Executable File {#executable} +### Executable File Working with executable files depends on [how the dictionary is stored in memory](#storing-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request to the executable file’s STDIN. Otherwise, ClickHouse starts the executable file and treats its output as dictionary data. @@ -1161,7 +1161,7 @@ Setting fields: That dictionary source can be configured only via XML configuration. 
Creating dictionaries with executable source via DDL is disabled; otherwise, the DB user would be able to execute arbitrary binaries on the ClickHouse node. -### Executable Pool {#executable_pool} +### Executable Pool Executable pool allows loading data from pool of processes. This source does not work with dictionary layouts that need to load all data from source. Executable pool works if the dictionary [is stored](#ways-to-store-dictionaries-in-memory) using `cache`, `complex_key_cache`, `ssd_cache`, `complex_key_ssd_cache`, `direct`, or `complex_key_direct` layouts. @@ -1196,9 +1196,9 @@ Setting fields: That dictionary source can be configured only via XML configuration. Creating dictionaries with executable source via DDL is disabled, otherwise, the DB user would be able to execute arbitrary binary on ClickHouse node. -### HTTP(S) {#https} +### HTTP(S) -Working with an HTTP(S) server depends on [how the dictionary is stored in memory](#storig-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request via the `POST` method. +Working with an HTTP(S) server depends on [how the dictionary is stored in memory](#storing-dictionaries-in-memory). If the dictionary is stored using `cache` and `complex_key_cache`, ClickHouse requests the necessary keys by sending a request via the `POST` method. Example of settings: @@ -1285,7 +1285,7 @@ Setting fields: - `db` – Name of the database. Omit it if the database name is set in the `` parameters. - `table` – Name of the table and schema if exists. - `connection_string` – Connection string. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#lifetime). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#refreshing-dictionary-data-using-lifetime). - `query` – The custom query. Optional parameter. :::note @@ -1575,7 +1575,7 @@ Setting fields: - `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in MySQL, for example, `id > 10 AND id < 20`. Optional parameter. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#lifetime). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#refreshing-dictionary-data-using-lifetime). - `fail_on_connection_loss` – The configuration parameter that controls behavior of the server on connection loss. If `true`, an exception is thrown immediately if the connection between client and server was lost. If `false`, the ClickHouse server retries to execute the query three times before throwing an exception. Note that retrying leads to increased response times. Default value: `false`. @@ -1672,7 +1672,7 @@ Setting fields: - `db` – Name of the database. - `table` – Name of the table. - `where` – The selection criteria. May be omitted. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#lifetime). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. 
Read more in the section [Refreshing dictionary data using LIFETIME](#refreshing-dictionary-data-using-lifetime). - `secure` - Use ssl for connection. - `query` – The custom query. Optional parameter. @@ -1849,7 +1849,7 @@ Setting fields: - `db` – Name of the database. - `table` – Name of the table. - `where` – The selection criteria. The syntax for conditions is the same as for `WHERE` clause in PostgreSQL. For example, `id > 10 AND id < 20`. Optional parameter. -- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#lifetime). +- `invalidate_query` – Query for checking the dictionary status. Optional parameter. Read more in the section [Refreshing dictionary data using LIFETIME](#refreshing-dictionary-data-using-lifetime). - `query` – The custom query. Optional parameter. :::note @@ -1873,7 +1873,7 @@ LAYOUT(FLAT()) LIFETIME(0); ``` -## Dictionary Key and Fields {#dictionary-key-and-fields} +## Dictionary Key and Fields @@ -1963,7 +1963,7 @@ PRIMARY KEY Id ### Composite Key -The key can be a `tuple` from any types of fields. The [layout](#storig-dictionaries-in-memory) in this case must be `complex_key_hashed` or `complex_key_cache`. +The key can be a `tuple` from any types of fields. The [layout](#storing-dictionaries-in-memory) in this case must be `complex_key_hashed` or `complex_key_cache`. :::tip A composite key can consist of a single element. This makes it possible to use a string as the key, for instance. @@ -2036,11 +2036,11 @@ Configuration fields: | `type` | ClickHouse data type: [UInt8](../../sql-reference/data-types/int-uint.md), [UInt16](../../sql-reference/data-types/int-uint.md), [UInt32](../../sql-reference/data-types/int-uint.md), [UInt64](../../sql-reference/data-types/int-uint.md), [Int8](../../sql-reference/data-types/int-uint.md), [Int16](../../sql-reference/data-types/int-uint.md), [Int32](../../sql-reference/data-types/int-uint.md), [Int64](../../sql-reference/data-types/int-uint.md), [Float32](../../sql-reference/data-types/float.md), [Float64](../../sql-reference/data-types/float.md), [UUID](../../sql-reference/data-types/uuid.md), [Decimal32](../../sql-reference/data-types/decimal.md), [Decimal64](../../sql-reference/data-types/decimal.md), [Decimal128](../../sql-reference/data-types/decimal.md), [Decimal256](../../sql-reference/data-types/decimal.md),[Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md), [String](../../sql-reference/data-types/string.md), [Array](../../sql-reference/data-types/array.md).
ClickHouse tries to cast value from dictionary to the specified data type. For example, for MySQL, the field might be `TEXT`, `VARCHAR`, or `BLOB` in the MySQL source table, but it can be uploaded as `String` in ClickHouse.
[Nullable](../../sql-reference/data-types/nullable.md) is currently supported for [Flat](#flat), [Hashed](#hashed), [ComplexKeyHashed](#complex_key_hashed), [Direct](#direct), [ComplexKeyDirect](#complex_key_direct), [RangeHashed](#range_hashed), Polygon, [Cache](#cache), [ComplexKeyCache](#complex_key_cache), [SSDCache](#ssd_cache), [SSDComplexKeyCache](#complex_key_ssd_cache) dictionaries. In [IPTrie](#ip_trie) dictionaries `Nullable` types are not supported. | Yes | | `null_value` | Default value for a non-existing element.
In the example, it is an empty string. [NULL](../syntax.md#null) value can be used only for the `Nullable` types (see the previous line with types description). | Yes | | `expression` | [Expression](../../sql-reference/syntax.md#expressions) that ClickHouse executes on the value.
The expression can be a column name in the remote SQL database. Thus, you can use it to create an alias for the remote column.

Default value: no expression. | No | -| `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](#hierarchical_dictionaries).

Default value: `false`. | No | +| `hierarchical` | If `true`, the attribute contains the value of a parent key for the current key. See [Hierarchical Dictionaries](#hierarchical-dictionaries).

Default value: `false`. | No | | `injective` | Flag that shows whether the `id -> attribute` image is [injective](https://en.wikipedia.org/wiki/Injective_function).
If `true`, ClickHouse can automatically place after the `GROUP BY` clause the requests to dictionaries with injection. Usually it significantly reduces the amount of such requests.

Default value: `false`. | No | | `is_object_id` | Flag that shows whether the query is executed for a MongoDB document by `ObjectID`.

Default value: `false`. -## Hierarchical Dictionaries {#hierarchical_dictionaries} +## Hierarchical Dictionaries ClickHouse supports hierarchical dictionaries with a [numeric key](#numeric-key). @@ -2165,7 +2165,7 @@ Points can be specified as an array or a tuple of their coordinates. In the curr The user can upload their own data in all formats supported by ClickHouse. -There are 3 types of [in-memory storage](#storig-dictionaries-in-memory) available: +There are 3 types of [in-memory storage](#storing-dictionaries-in-memory) available: - `POLYGON_SIMPLE`. This is a naive implementation, where a linear pass through all polygons is made for each query, and membership is checked for each one without using additional indexes. @@ -2435,7 +2435,7 @@ LIFETIME(0) LAYOUT(regexp_tree); ``` -## Embedded Dictionaries {#embedded-dictionaries} +## Embedded Dictionaries diff --git a/docs/en/sql-reference/functions/bitmap-functions.md b/docs/en/sql-reference/functions/bitmap-functions.md index d98d7d19d7c..d30c0f4dde4 100644 --- a/docs/en/sql-reference/functions/bitmap-functions.md +++ b/docs/en/sql-reference/functions/bitmap-functions.md @@ -104,7 +104,7 @@ bitmapSubsetLimit(bitmap, range_start, cardinality_limit) **Arguments** -- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – [Bitmap object](#bitmapbuild). - `range_start` – Start of the range (inclusive). [UInt32](../data-types/int-uint.md). - `cardinality_limit` – Maximum cardinality of the subset. [UInt32](../data-types/int-uint.md). @@ -134,7 +134,7 @@ subBitmap(bitmap, offset, cardinality_limit) **Arguments** -- `bitmap` – The bitmap. [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – The bitmap. [Bitmap object](#bitmapbuild). - `offset` – The position of the first element of the subset. [UInt32](../data-types/int-uint.md). - `cardinality_limit` – The maximum number of elements in the subset. [UInt32](../data-types/int-uint.md). @@ -162,7 +162,7 @@ bitmapContains(bitmap, needle) **Arguments** -- `bitmap` – [Bitmap object](#bitmap_functions-bitmapbuild). +- `bitmap` – [Bitmap object](#bitmapbuild). - `needle` – Searched bit value. [UInt32](../data-types/int-uint.md). **Returned values** diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index e8661b5f5c3..b532e0de8f0 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -1049,7 +1049,7 @@ toLastDayOfWeek(t[, mode[, timezone]]) **Arguments** - `t` - a [Date](../data-types/date.md), [Date32](../data-types/date32.md), [DateTime](../data-types/datetime.md) or [DateTime64](../data-types/datetime64.md) -- `mode` - determines the last day of the week as described in the [toWeek()](date-time-functions#toweek) function +- `mode` - determines the last day of the week as described in the [toWeek](#toweek) function - `timezone` - Optional parameter, it behaves like any other conversion function **Returned value** @@ -1763,7 +1763,7 @@ Result: ## fromDaysSinceYearZero32 -Like [fromDaysSinceYearZero](#fromDaysSinceYearZero) but returns a [Date32](../data-types/date32.md). +Like [fromDaysSinceYearZero](#fromdayssinceyearzero) but returns a [Date32](../data-types/date32.md). 
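A minimal usage sketch, assuming the standard `toDaysSinceYearZero` helper is available for the round trip:

``` sql
-- Round-trips a Date32 through its day count; the result type is Date32, not Date.
SELECT
    fromDaysSinceYearZero32(toDaysSinceYearZero(toDate32('2023-09-08'))) AS d,
    toTypeName(d) AS type;
```

The round trip should return `2023-09-08` with result type `Date32`.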
## age diff --git a/docs/en/sql-reference/functions/geo/geohash.md b/docs/en/sql-reference/functions/geo/geohash.md index 9a3d52824f6..b6ac7a74092 100644 --- a/docs/en/sql-reference/functions/geo/geohash.md +++ b/docs/en/sql-reference/functions/geo/geohash.md @@ -4,7 +4,7 @@ sidebar_label: Geohash title: "Functions for Working with Geohash" --- -## Geohash {#geohash_description} +## Geohash [Geohash](https://en.wikipedia.org/wiki/Geohash) is the geocode system, which subdivides Earth’s surface into buckets of grid shape and encodes each cell into a short string of letters and digits. It is a hierarchical data structure, so the longer is the geohash string, the more precise is the geographic location. @@ -12,7 +12,7 @@ If you need to manually convert geographic coordinates to geohash strings, you c ## geohashEncode -Encodes latitude and longitude as a [geohash](#geohash_description)-string. +Encodes latitude and longitude as a [geohash](#geohash)-string. ``` sql geohashEncode(longitude, latitude, [precision]) @@ -42,7 +42,7 @@ SELECT geohashEncode(-5.60302734375, 42.593994140625, 0) AS res; ## geohashDecode -Decodes any [geohash](#geohash_description)-encoded string into longitude and latitude. +Decodes any [geohash](#geohash)-encoded string into longitude and latitude. **Input values** @@ -66,7 +66,7 @@ SELECT geohashDecode('ezs42') AS res; ## geohashesInBox -Returns an array of [geohash](#geohash_description)-encoded strings of given precision that fall inside and intersect boundaries of given box, basically a 2D grid flattened into array. +Returns an array of [geohash](#geohash)-encoded strings of given precision that fall inside and intersect boundaries of given box, basically a 2D grid flattened into array. **Syntax** diff --git a/docs/en/sql-reference/functions/geo/h3.md b/docs/en/sql-reference/functions/geo/h3.md index 6fce91f4d8e..5fbc2adf2fa 100644 --- a/docs/en/sql-reference/functions/geo/h3.md +++ b/docs/en/sql-reference/functions/geo/h3.md @@ -4,7 +4,7 @@ sidebar_label: H3 Indexes title: "Functions for Working with H3 Indexes" --- -## H3 Index {#h3index} +## H3 Index [H3](https://eng.uber.com/h3/) is a geographical indexing system where Earth’s surface divided into a grid of even hexagonal cells. This system is hierarchical, i. e. each hexagon on the top level ("parent") can be split into seven even but smaller ones ("children"), and so on. @@ -18,7 +18,7 @@ The full description of the H3 system is available at [the Uber Engineering site ## h3IsValid -Verifies whether the number is a valid [H3](#h3index) index. +Verifies whether the number is a valid [H3](#h3-index) index. **Syntax** @@ -53,7 +53,7 @@ Result: ## h3GetResolution -Defines the resolution of the given [H3](#h3index) index. +Defines the resolution of the given [H3](#h3-index) index. **Syntax** @@ -88,7 +88,7 @@ Result: ## h3EdgeAngle -Calculates the average length of the [H3](#h3index) hexagon edge in grades. +Calculates the average length of the [H3](#h3-index) hexagon edge in grades. **Syntax** @@ -102,7 +102,7 @@ h3EdgeAngle(resolution) **Returned values** -- The average length of the [H3](#h3index) hexagon edge in grades. [Float64](../../data-types/float.md). +- The average length of the [H3](#h3-index) hexagon edge in grades. [Float64](../../data-types/float.md). **Example** @@ -122,7 +122,7 @@ Result: ## h3EdgeLengthM -Calculates the average length of the [H3](#h3index) hexagon edge in meters. +Calculates the average length of the [H3](#h3-index) hexagon edge in meters. 
**Syntax** @@ -136,7 +136,7 @@ h3EdgeLengthM(resolution) **Returned values** -- The average length of the [H3](#h3index) hexagon edge in meters. [Float64](../../data-types/float.md). +- The average length of the [H3](#h3-index) hexagon edge in meters. [Float64](../../data-types/float.md). **Example** @@ -156,7 +156,7 @@ Result: ## h3EdgeLengthKm -Calculates the average length of the [H3](#h3index) hexagon edge in kilometers. +Calculates the average length of the [H3](#h3-index) hexagon edge in kilometers. **Syntax** @@ -170,7 +170,7 @@ h3EdgeLengthKm(resolution) **Returned values** -- The average length of the [H3](#h3index) hexagon edge in kilometers. [Float64](../../data-types/float.md). +- The average length of the [H3](#h3-index) hexagon edge in kilometers. [Float64](../../data-types/float.md). **Example** @@ -190,7 +190,7 @@ Result: ## geoToH3 -Returns [H3](#h3index) point index `(lon, lat)` with specified resolution. +Returns [H3](#h3-index) point index `(lon, lat)` with specified resolution. **Syntax** @@ -227,7 +227,7 @@ Result: ## h3ToGeo -Returns the centroid longitude and latitude corresponding to the provided [H3](#h3index) index. +Returns the centroid longitude and latitude corresponding to the provided [H3](#h3-index) index. **Syntax** @@ -296,7 +296,7 @@ Result: ## h3kRing - Lists all the [H3](#h3index) hexagons in the raduis of `k` from the given hexagon in random order. + Lists all the [H3](#h3-index) hexagons in the raduis of `k` from the given hexagon in random order. **Syntax** @@ -337,7 +337,7 @@ Result: ## h3GetBaseCell -Returns the base cell number of the [H3](#h3index) index. +Returns the base cell number of the [H3](#h3-index) index. **Syntax** @@ -439,7 +439,7 @@ Result: ## h3IndexesAreNeighbors -Returns whether or not the provided [H3](#h3index) indexes are neighbors. +Returns whether or not the provided [H3](#h3-index) indexes are neighbors. **Syntax** @@ -475,7 +475,7 @@ Result: ## h3ToChildren -Returns an array of child indexes for the given [H3](#h3index) index. +Returns an array of child indexes for the given [H3](#h3-index) index. **Syntax** @@ -510,7 +510,7 @@ Result: ## h3ToParent -Returns the parent (coarser) index containing the given [H3](#h3index) index. +Returns the parent (coarser) index containing the given [H3](#h3-index) index. **Syntax** @@ -611,7 +611,7 @@ Result: ## h3GetResolution -Returns the resolution of the [H3](#h3index) index. +Returns the resolution of the [H3](#h3-index) index. **Syntax** @@ -645,7 +645,7 @@ Result: ## h3IsResClassIII -Returns whether [H3](#h3index) index has a resolution with Class III orientation. +Returns whether [H3](#h3-index) index has a resolution with Class III orientation. **Syntax** @@ -680,7 +680,7 @@ Result: ## h3IsPentagon -Returns whether this [H3](#h3index) index represents a pentagonal cell. +Returns whether this [H3](#h3-index) index represents a pentagonal cell. **Syntax** @@ -715,7 +715,7 @@ Result: ## h3GetFaces -Returns icosahedron faces intersected by a given [H3](#h3index) index. +Returns icosahedron faces intersected by a given [H3](#h3-index) index. **Syntax** @@ -817,7 +817,7 @@ Result: ## h3ToCenterChild -Returns the center child (finer) [H3](#h3index) index contained by given [H3](#h3index) at the given resolution. +Returns the center child (finer) [H3](#h3-index) index contained by given [H3](#h3-index) at the given resolution. 
**Syntax** @@ -832,7 +832,7 @@ h3ToCenterChild(index, resolution) **Returned values** -- [H3](#h3index) index of the center child contained by given [H3](#h3index) at the given resolution. [UInt64](../../data-types/int-uint.md). +- [H3](#h3-index) index of the center child contained by given [H3](#h3-index) at the given resolution. [UInt64](../../data-types/int-uint.md). **Example** diff --git a/docs/en/sql-reference/functions/geo/s2.md b/docs/en/sql-reference/functions/geo/s2.md index bcb6b2833c9..e022ce870b0 100644 --- a/docs/en/sql-reference/functions/geo/s2.md +++ b/docs/en/sql-reference/functions/geo/s2.md @@ -5,7 +5,7 @@ sidebar_label: S2 Geometry # Functions for Working with S2 Index -## S2Index {#s2_index} +## S2Index [S2](https://s2geometry.io/) is a geographical indexing system where all geographical data is represented on a three-dimensional sphere (similar to a globe). @@ -13,7 +13,7 @@ In the S2 library points are represented as the S2 Index - a specific number whi ## geoToS2 -Returns [S2](#s2_index) point index corresponding to the provided coordinates `(longitude, latitude)`. +Returns [S2](#s2index) point index corresponding to the provided coordinates `(longitude, latitude)`. **Syntax** @@ -48,7 +48,7 @@ Result: ## s2ToGeo -Returns geo coordinates `(longitude, latitude)` corresponding to the provided [S2](#s2_index) point index. +Returns geo coordinates `(longitude, latitude)` corresponding to the provided [S2](#s2index) point index. **Syntax** @@ -84,7 +84,7 @@ Result: ## s2GetNeighbors -Returns S2 neighbor indexes corresponding to the provided [S2](#s2_index). Each cell in the S2 system is a quadrilateral bounded by four geodesics. So, each cell has 4 neighbors. +Returns S2 neighbor indexes corresponding to the provided [S2](#s2index). Each cell in the S2 system is a quadrilateral bounded by four geodesics. So, each cell has 4 neighbors. **Syntax** @@ -118,7 +118,7 @@ Result: ## s2CellsIntersect -Determines if the two provided [S2](#s2_index) cells intersect or not. +Determines if the two provided [S2](#s2index) cells intersect or not. **Syntax** diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 5eae8b7905e..e22dd5d827c 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -762,7 +762,7 @@ LIMIT 10 Given a size (number of bytes), this function returns a readable, rounded size with suffix (KB, MB, etc.) as string. -The opposite operations of this function are [parseReadableSize](#parseReadableSize), [parseReadableSizeOrZero](#parsereadablesizeorzero), and [parseReadableSizeOrNull](#parsereadablesizeornull). +The opposite operations of this function are [parseReadableSize](#parsereadablesize), [parseReadableSizeOrZero](#parsereadablesizeorzero), and [parseReadableSizeOrNull](#parsereadablesizeornull). **Syntax** @@ -795,7 +795,7 @@ Result: Given a size (number of bytes), this function returns a readable, rounded size with suffix (KiB, MiB, etc.) as string. -The opposite operations of this function are [parseReadableSize](#parseReadableSize), [parseReadableSizeOrZero](#parsereadablesizeorzero), and [parseReadableSizeOrNull](#parsereadablesizeornull). +The opposite operations of this function are [parseReadableSize](#parsereadablesize), [parseReadableSizeOrZero](#parsereadablesizeorzero), and [parseReadableSizeOrNull](#parsereadablesizeornull). 
**Syntax** @@ -921,7 +921,7 @@ SELECT └────────────────────┴────────────────────────────────────────────────┘ ``` -## parseReadableSize {#parseReadableSize} +## parseReadableSize Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. If the function is unable to parse the input value, it throws an exception. @@ -964,7 +964,7 @@ SELECT Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. If the function is unable to parse the input value, it returns `NULL`. -The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatReadableDecimalSize). +The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatreadabledecimalsize). **Syntax** @@ -1002,7 +1002,7 @@ SELECT Given a string containing a byte size and `B`, `KiB`, `KB`, `MiB`, `MB`, etc. as a unit (i.e. [ISO/IEC 80000-13](https://en.wikipedia.org/wiki/ISO/IEC_80000) or decimal byte unit), this function returns the corresponding number of bytes. If the function is unable to parse the input value, it returns `0`. -The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatReadableDecimalSize). +The inverse operations of this function are [formatReadableSize](#formatreadablesize) and [formatReadableDecimalSize](#formatreadabledecimalsize). **Syntax** diff --git a/docs/en/sql-reference/functions/rounding-functions.md b/docs/en/sql-reference/functions/rounding-functions.md index c3a915ca195..e2f471d47eb 100644 --- a/docs/en/sql-reference/functions/rounding-functions.md +++ b/docs/en/sql-reference/functions/rounding-functions.md @@ -274,7 +274,7 @@ roundBankers(10.755, 2) = 10.76 **See Also** -- [round](#rounding_functions-round) +- [round](#round) ## roundToExp2 diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md index 5353bbf9b27..b7ba1d4feb7 100644 --- a/docs/en/sql-reference/functions/string-search-functions.md +++ b/docs/en/sql-reference/functions/string-search-functions.md @@ -1665,7 +1665,7 @@ Result: ## hasSubsequenceUTF8 -Like [hasSubsequence](#hasSubsequence) but assumes `haystack` and `needle` are UTF-8 encoded strings. +Like [hasSubsequence](#hassubsequence) but assumes `haystack` and `needle` are UTF-8 encoded strings. **Syntax** diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 86739ac0b8d..61e84ca72d1 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -10,7 +10,7 @@ sidebar_label: Type Conversion ClickHouse generally uses the [same behavior as C++ programs](https://en.cppreference.com/w/cpp/language/implicit_conversion). -`to` functions and [cast](#castx-t) behave differently in some cases, for example in case of [LowCardinality](../data-types/lowcardinality.md): [cast](#castx-t) removes [LowCardinality](../data-types/lowcardinality.md) trait `to` functions don't. 
The same with [Nullable](../data-types/nullable.md), this behaviour is not compatible with SQL standard, and it can be changed using [cast_keep_nullable](../../operations/settings/settings.md/#cast_keep_nullable) setting. +`to` functions and [cast](#cast) behave differently in some cases, for example in case of [LowCardinality](../data-types/lowcardinality.md): [cast](#cast) removes [LowCardinality](../data-types/lowcardinality.md) trait `to` functions don't. The same with [Nullable](../data-types/nullable.md), this behaviour is not compatible with SQL standard, and it can be changed using [cast_keep_nullable](../../operations/settings/settings.md/#cast_keep_nullable) setting. :::note Be aware of potential data loss if values of a datatype are converted to a smaller datatype (for example from `Int64` to `Int32`) or between @@ -996,7 +996,7 @@ Result: ## reinterpretAsUInt8 -Performs byte reinterpretation by treating the input value as a value of type UInt8. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt8. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1034,7 +1034,7 @@ Result: ## reinterpretAsUInt16 -Performs byte reinterpretation by treating the input value as a value of type UInt16. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt16. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1072,7 +1072,7 @@ Result: ## reinterpretAsUInt32 -Performs byte reinterpretation by treating the input value as a value of type UInt32. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt32. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1110,7 +1110,7 @@ Result: ## reinterpretAsUInt64 -Performs byte reinterpretation by treating the input value as a value of type UInt64. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt64. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1148,7 +1148,7 @@ Result: ## reinterpretAsUInt128 -Performs byte reinterpretation by treating the input value as a value of type UInt128. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. 
+Performs byte reinterpretation by treating the input value as a value of type UInt128. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1186,7 +1186,7 @@ Result: ## reinterpretAsUInt256 -Performs byte reinterpretation by treating the input value as a value of type UInt256. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type UInt256. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1224,7 +1224,7 @@ Result: ## reinterpretAsInt8 -Performs byte reinterpretation by treating the input value as a value of type Int8. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int8. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1262,7 +1262,7 @@ Result: ## reinterpretAsInt16 -Performs byte reinterpretation by treating the input value as a value of type Int16. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int16. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1300,7 +1300,7 @@ Result: ## reinterpretAsInt32 -Performs byte reinterpretation by treating the input value as a value of type Int32. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int32. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1338,7 +1338,7 @@ Result: ## reinterpretAsInt64 -Performs byte reinterpretation by treating the input value as a value of type Int64. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int64. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1376,7 +1376,7 @@ Result: ## reinterpretAsInt128 -Performs byte reinterpretation by treating the input value as a value of type Int128. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. 
+Performs byte reinterpretation by treating the input value as a value of type Int128. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1414,7 +1414,7 @@ Result: ## reinterpretAsInt256 -Performs byte reinterpretation by treating the input value as a value of type Int256. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Int256. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1452,7 +1452,7 @@ Result: ## reinterpretAsFloat32 -Performs byte reinterpretation by treating the input value as a value of type Float32. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Float32. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** @@ -1486,7 +1486,7 @@ Result: ## reinterpretAsFloat64 -Performs byte reinterpretation by treating the input value as a value of type Float64. Unlike [`CAST`](#castx-t), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. +Performs byte reinterpretation by treating the input value as a value of type Float64. Unlike [`CAST`](#cast), the function does not attempt to preserve the original value - if the target type is not able to represent the input type, the output is meaningless. **Syntax** diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index 5c83b2363e0..ed75b1802d8 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -235,7 +235,7 @@ If `some_predicate` is not selective enough, it will return a large amount of da ### Distributed Subqueries and max_parallel_replicas -When [max_parallel_replicas](#settings-max_parallel_replicas) is greater than 1, distributed queries are further transformed. +When [max_parallel_replicas](#distributed-subqueries-and-max_parallel_replicas) is greater than 1, distributed queries are further transformed. For example, the following: diff --git a/docs/en/sql-reference/table-functions/fileCluster.md b/docs/en/sql-reference/table-functions/fileCluster.md index 3060e6c151d..62b00fadd62 100644 --- a/docs/en/sql-reference/table-functions/fileCluster.md +++ b/docs/en/sql-reference/table-functions/fileCluster.md @@ -22,7 +22,7 @@ fileCluster(cluster_name, path[, format, structure, compression_method]) **Arguments** - `cluster_name` — Name of a cluster that is used to build a set of addresses and connection parameters to remote and local servers. -- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file also supports [globs](#globs_in_path). 
+- `path` — The relative path to the file from [user_files_path](/docs/en/operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file also supports [globs](#globs-in-path). - `format` — [Format](../../interfaces/formats.md#formats) of the files. Type: [String](../../sql-reference/data-types/string.md). - `structure` — Table structure in `'UserID UInt64, Name String'` format. Determines column names and types. Type: [String](../../sql-reference/data-types/string.md). - `compression_method` — Compression method. Supported compression types are `gz`, `br`, `xz`, `zst`, `lz4`, and `bz2`. From d92a2f3ef6c86c00953a8694294859c70d4a4732 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 12 Jun 2024 13:20:50 +0000 Subject: [PATCH 164/215] Remove obsolete fix from aws submodule --- contrib/aws | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/aws b/contrib/aws index deeaa9e7c5f..1c2946bfcb7 160000 --- a/contrib/aws +++ b/contrib/aws @@ -1 +1 @@ -Subproject commit deeaa9e7c5fe690e3dacc4005d7ecfa7a66a32bb +Subproject commit 1c2946bfcb7f1e3ae0a858de0b59d4f1a7b4ccaf From 3ff69a9a030f829bef7582e95618f63ec73b897f Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 12 Jun 2024 15:36:15 +0200 Subject: [PATCH 165/215] Update utils/check-style/check-style Co-authored-by: Mikhail f. Shiryaev --- utils/check-style/check-style | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/utils/check-style/check-style b/utils/check-style/check-style index 4c6a4b9ea39..f8c6d6b0fde 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -245,7 +245,12 @@ find $ROOT_PATH -name '.gitmodules' | while read i; do grep -F 'url = ' $i | gre # All submodules should be of this form: [submodule "contrib/libxyz"] (for consistency, the submodule name does matter too much) # - restrict the check to top-level .gitmodules file -find $ROOT_PATH -maxdepth 1 -name '.gitmodules' | while read i; do grep -F '[submodule ' $i | grep -v -F 'contrib' && echo 'All submodules should have form [submodule "contrib/libxyz"]'; done +git config --file "$ROOT_PATH/.gitmodules" --get-regexp 'submodule\..+\.path' | \ +while read -r line; do + name=${line#submodule.}; name=${name%.path*} + path=${line#* } + [ "$name" != "$path" ] && echo "Submodule name '$name' is not equal to it's path '$path'" +done # There shouldn't be any code snippets under GPL or LGPL find $ROOT_PATH/{src,base,programs} -name '*.h' -or -name '*.cpp' 2>/dev/null | xargs grep -i -F 'General Public License' && echo "There shouldn't be any code snippets under GPL or LGPL" From 9d1f64e8b302d9d069c6218046788a6b236c98c2 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Wed, 12 Jun 2024 16:11:21 +0200 Subject: [PATCH 166/215] Update utils/check-style/check-style --- utils/check-style/check-style | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/utils/check-style/check-style b/utils/check-style/check-style index f8c6d6b0fde..21325ece916 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -241,7 +241,13 @@ for test_case in "${tests_with_replicated_merge_tree[@]}"; do done # All the submodules should be from https://github.com/ -find $ROOT_PATH -name '.gitmodules' | while read i; do grep -F 'url = ' $i | grep -v -F 'https://github.com/' && echo 'All the submodules should be from https://github.com/'; done + +git config --file "$ROOT_PATH/.gitmodules" --get-regexp 'submodule\..+\.url' | \ +while read -r line; do + name=${line#submodule.}; name=${name%.url*} + url=${line#* } + [[ "$url" != 'https://github.com/'* ]] && echo "All the submodules should be from https://github.com/, submodule '$name' has '$url'" +done # All submodules should be of this form: [submodule "contrib/libxyz"] (for consistency, the submodule name does matter too much) # - restrict the check to top-level .gitmodules file From 3fd1918c3f6f92aeb14119a461cde2ed4038cedd Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 12 Jun 2024 16:21:08 +0200 Subject: [PATCH 167/215] Update check-style --- utils/check-style/check-style | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/utils/check-style/check-style b/utils/check-style/check-style index 21325ece916..db491c67f2c 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -240,13 +240,12 @@ for test_case in "${tests_with_replicated_merge_tree[@]}"; do esac done -# All the submodules should be from https://github.com/ - +# All submodules should be from https://github.com/ git config --file "$ROOT_PATH/.gitmodules" --get-regexp 'submodule\..+\.url' | \ while read -r line; do name=${line#submodule.}; name=${name%.url*} url=${line#* } - [[ "$url" != 'https://github.com/'* ]] && echo "All the submodules should be from https://github.com/, submodule '$name' has '$url'" + [[ "$url" != 'https://github.com/'* ]] && echo "All submodules should be from https://github.com/, submodule '$name' has '$url'" done # All submodules should be of this form: [submodule "contrib/libxyz"] (for consistency, the submodule name does matter too much) From 49977e89e01afb8612332a4cd79fb89e95934921 Mon Sep 17 00:00:00 2001 From: Max Kainov Date: Wed, 12 Jun 2024 14:29:52 +0000 Subject: [PATCH 168/215] CI: Fix not-merged cherry-picks for backports --- tests/ci/cherry_pick.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/ci/cherry_pick.py b/tests/ci/cherry_pick.py index 629464d0422..459be12ada0 100644 --- a/tests/ci/cherry_pick.py +++ b/tests/ci/cherry_pick.py @@ -532,9 +532,9 @@ class Backport: for br in branches: br.process(self.dry_run) - for br in branches: - if br.backported: - self.mark_pr_backported(pr) + if all(br.backported for br in branches): + # And check it after the running + self.mark_pr_backported(pr) def mark_pr_backported(self, pr: PullRequest) -> None: if self.dry_run: From 208f32328c8a91c17e1b4bb8d93b0b3343b054a1 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 12 Jun 2024 16:56:09 +0200 Subject: [PATCH 169/215] Update tests/integration/test_replicated_database/test.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Raúl Marín --- 
tests/integration/test_replicated_database/test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 73b7ae265e4..f23384b5c04 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -404,7 +404,8 @@ def test_alter_detach_part(started_cluster, engine): main_node.query(f"INSERT INTO {database}.alter_detach VALUES (123)") if engine == "MergeTree": dummy_node.query(f"INSERT INTO {database}.alter_detach VALUES (456)") - main_node.query(f"SYSTEM SYNC REPLICA {database}.alter_detach PULL") + else: + main_node.query(f"SYSTEM SYNC REPLICA {database}.alter_detach PULL") main_node.query(f"ALTER TABLE {database}.alter_detach DETACH PART '{part_name}'") detached_parts_query = f"SELECT name FROM system.detached_parts WHERE database='{database}' AND table='alter_detach'" assert main_node.query(detached_parts_query) == f"{part_name}\n" From b8bb1675476276da200ee7c971749a0b75a9d7b6 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 12 Jun 2024 17:27:17 +0200 Subject: [PATCH 170/215] Update .gitmodules --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index a3bb7eacabc..12d865307d8 100644 --- a/.gitmodules +++ b/.gitmodules @@ -161,7 +161,7 @@ [submodule "contrib/xz"] path = contrib/xz url = https://github.com/xz-mirror/xz -[submodule "contrib/abseil"] +[submodule "contrib/abseil-cpp"] path = contrib/abseil-cpp url = https://github.com/ClickHouse/abseil-cpp.git [submodule "contrib/dragonbox"] From 5f5a6d5f107d09e22fb1466aeef1aec395eb4b6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 12 Jun 2024 17:51:17 +0200 Subject: [PATCH 171/215] Fix bug in short circuit optimization with cache dictionaries --- src/Dictionaries/CacheDictionary.cpp | 3 ++ ...e_complex_dict_short_circuit_bug.reference | 0 ...9_cache_complex_dict_short_circuit_bug.sql | 31 +++++++++++++++++++ 3 files changed, 34 insertions(+) create mode 100644 tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.reference create mode 100644 tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.sql diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index 2842e2b8799..1816324a93b 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -511,7 +511,10 @@ MutableColumns CacheDictionary::aggregateColumns( if (default_mask) { if (key_state_from_storage.isDefault()) + { (*default_mask)[key_index] = 1; + aggregated_column->insertDefault(); + } else { (*default_mask)[key_index] = 0; diff --git a/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.reference b/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.sql b/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.sql new file mode 100644 index 00000000000..8463d13d251 --- /dev/null +++ b/tests/queries/0_stateless/03169_cache_complex_dict_short_circuit_bug.sql @@ -0,0 +1,31 @@ +DROP TABLE IF EXISTS complex_key_simple_attributes_source_short_circuit_table; +DROP DICTIONARY IF EXISTS cache_dictionary_complex_key_simple_attributes_short_circuit; + +CREATE TABLE complex_key_simple_attributes_source_short_circuit_table +( + id UInt64, 
+ id_key String, + value_first String, + value_second String +) + ENGINE = TinyLog; + +INSERT INTO complex_key_simple_attributes_source_short_circuit_table VALUES(0, 'id_key_0', 'value_0', 'value_second_0'); + +CREATE DICTIONARY cache_dictionary_complex_key_simple_attributes_short_circuit +( + `id` UInt64, + `id_key` String, + `value_first` String DEFAULT 'value_first_default', + `value_second` String DEFAULT 'value_second_default' +) +PRIMARY KEY id, id_key +SOURCE(CLICKHOUSE(TABLE 'complex_key_simple_attributes_source_short_circuit_table')) +LIFETIME(MIN 1 MAX 1000) +LAYOUT(COMPLEX_KEY_CACHE(SIZE_IN_CELLS 10)); + +SELECT dictGetOrDefault('cache_dictionary_complex_key_simple_attributes_short_circuit', 'value_first', (number, concat(toString(number))), toString(materialize('default'))) AS value_first FROM system.numbers LIMIT 20 FORMAT Null; +SELECT dictGetOrDefault('cache_dictionary_complex_key_simple_attributes_short_circuit', 'value_first', (number, concat(toString(number))), toString(materialize('default'))) AS value_first FROM system.numbers LIMIT 20 FORMAT Null; + +DROP TABLE IF EXISTS complex_key_simple_attributes_source_short_circuit_table; +DROP DICTIONARY IF EXISTS cache_dictionary_complex_key_simple_attributes_short_circuit; From bf04aebc948b078ba2f00dcb601e89158367422c Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 12 Jun 2024 18:00:19 +0200 Subject: [PATCH 172/215] Update ReplicatedMergeTreeQueue.cpp --- src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 9a368bd44f5..e30d63c343a 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -2004,8 +2004,7 @@ MutationCommands ReplicatedMergeTreeQueue::getMutationCommands( MutationCommands commands; for (auto it = begin; it != end; ++it) { - /// FIXME uncomment this assertion after relesing 23.5 (currently it fails in Upgrade check) - /// chassert(mutation_pointer < it->second->entry->znode_name); + chassert(mutation_pointer < it->second->entry->znode_name); mutation_ids.push_back(it->second->entry->znode_name); const auto & commands_from_entry = it->second->entry->commands; commands.insert(commands.end(), commands_from_entry.begin(), commands_from_entry.end()); From b7161b77d177680187b489b971f35ab856a4004b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 12 Jun 2024 19:08:23 +0200 Subject: [PATCH 173/215] Fix UniqInjectiveFunctionsEliminationPass with uniqCombined --- .../UniqInjectiveFunctionsEliminationPass.cpp | 14 +++++++------ ...tive_functions_inside_uniq_crash.reference | 2 ++ ..._injective_functions_inside_uniq_crash.sql | 21 +++++++++++++++++++ 3 files changed, 31 insertions(+), 6 deletions(-) create mode 100644 tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.reference create mode 100644 tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.sql diff --git a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp index d087fe1c7b9..2360cd3f0c2 100644 --- a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp +++ b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp @@ -41,9 +41,9 @@ public: return; bool replaced_argument = false; - auto & uniq_function_arguments_nodes = 
function_node->getArguments().getNodes(); + auto replaced_uniq_function_arguments_nodes = function_node->getArguments().getNodes(); - for (auto & uniq_function_argument_node : uniq_function_arguments_nodes) + for (auto & uniq_function_argument_node : replaced_uniq_function_arguments_nodes) { auto * uniq_function_argument_node_typed = uniq_function_argument_node->as(); if (!uniq_function_argument_node_typed || !uniq_function_argument_node_typed->isOrdinaryFunction()) @@ -67,12 +67,10 @@ public: if (!replaced_argument) return; - const auto & function_node_argument_nodes = function_node->getArguments().getNodes(); - DataTypes argument_types; - argument_types.reserve(function_node_argument_nodes.size()); + argument_types.reserve(replaced_uniq_function_arguments_nodes.size()); - for (const auto & function_node_argument : function_node_argument_nodes) + for (const auto & function_node_argument : replaced_uniq_function_arguments_nodes) argument_types.emplace_back(function_node_argument->getResultType()); AggregateFunctionProperties properties; @@ -83,6 +81,10 @@ public: function_node->getAggregateFunction()->getParameters(), properties); + /// uniqCombined returns nullable with nullable arguments so the result type might change which breaks the pass + if (!aggregate_function->getResultType()->equals(*function_node->getAggregateFunction()->getResultType())) + return; + function_node->resolveAsAggregateFunction(std::move(aggregate_function)); } }; diff --git a/tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.reference b/tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.reference new file mode 100644 index 00000000000..e58e9764b39 --- /dev/null +++ b/tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.reference @@ -0,0 +1,2 @@ +100 +100 diff --git a/tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.sql b/tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.sql new file mode 100644 index 00000000000..50d99b851a6 --- /dev/null +++ b/tests/queries/0_stateless/03169_optimize_injective_functions_inside_uniq_crash.sql @@ -0,0 +1,21 @@ +SELECT sum(u) +FROM +( + SELECT + intDiv(number, 4096) AS k, + uniqCombined(tuple(materialize(toLowCardinality(toNullable(16))))) AS u + FROM numbers(4096 * 100) + GROUP BY k +) +SETTINGS allow_experimental_analyzer = 1, optimize_injective_functions_inside_uniq=0; + +SELECT sum(u) +FROM +( + SELECT + intDiv(number, 4096) AS k, + uniqCombined(tuple(materialize(toLowCardinality(toNullable(16))))) AS u + FROM numbers(4096 * 100) + GROUP BY k +) +SETTINGS allow_experimental_analyzer = 1, optimize_injective_functions_inside_uniq=1; From 35252c4eda20d6dfcb878d7b1522e7842d826f60 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Wed, 12 Jun 2024 18:41:46 +0100 Subject: [PATCH 174/215] impl --- .../0_stateless/03168_loop_engine_with_parallel_replicas.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/03168_loop_engine_with_parallel_replicas.sql b/tests/queries/0_stateless/03168_loop_engine_with_parallel_replicas.sql index dfcb5de9f2a..da4626ad897 100644 --- a/tests/queries/0_stateless/03168_loop_engine_with_parallel_replicas.sql +++ b/tests/queries/0_stateless/03168_loop_engine_with_parallel_replicas.sql @@ -1,3 +1,5 @@ +-- Tags: no-parallel + DROP DATABASE IF EXISTS 03147_db; CREATE DATABASE IF NOT EXISTS 03147_db; CREATE TABLE 03147_db.t (n Int8) ENGINE=MergeTree ORDER BY n; From 
6d48962ca09f582478e6caa160463b34ead0091b Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 10 Jun 2024 23:54:19 +0100 Subject: [PATCH 175/215] impl --- .../QueryPlan/ReadFromMergeTree.cpp | 9 ++++ src/Storages/StorageMerge.cpp | 6 +++ .../03155_test_move_to_prewhere.reference | 1 + .../03155_test_move_to_prewhere.sh | 46 +++++++++++++++++++ 4 files changed, 62 insertions(+) create mode 100644 tests/queries/0_stateless/03155_test_move_to_prewhere.reference create mode 100755 tests/queries/0_stateless/03155_test_move_to_prewhere.sh diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index e469062d7e7..4fad1bbb653 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -1515,6 +1515,15 @@ static void buildIndexes( void ReadFromMergeTree::applyFilters(ActionDAGNodes added_filter_nodes) { + /// Sometimes a really dumb problem may happen. + /// For ReadFromMerge for example we may recursively call `applyFilters` for child reading steps (with no filters added so far). + /// Then later `optimizePrimaryKeyCondition` will try to apply filters to those child reading steps, but with no luck, + /// because we already made an `applyFilters` call that could lead to indexes initialization few lines below. + /// So effectively the right set of filters will be just ignored. + /// This is not an ultimate solution, of course, we're better to have more structured way of applying filters. + if (added_filter_nodes.nodes.empty()) + return; + if (!indexes) { filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes, query_info.buildNodeNameToInputNodeColumn()); diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index ed3f43367dd..55cfd1ffcd7 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -1573,8 +1573,14 @@ QueryPlanRawPtrs ReadFromMerge::getChildPlans() QueryPlanRawPtrs plans; for (auto & child_plan : *child_plans) + { if (child_plan.plan.isInitialized()) + { + /// So we will see the optimized plan in EXPLAIN output + child_plan.plan.optimize(QueryPlanOptimizationSettings::fromContext(context)); plans.push_back(&child_plan.plan); + } + } return plans; } diff --git a/tests/queries/0_stateless/03155_test_move_to_prewhere.reference b/tests/queries/0_stateless/03155_test_move_to_prewhere.reference new file mode 100644 index 00000000000..0cfbf08886f --- /dev/null +++ b/tests/queries/0_stateless/03155_test_move_to_prewhere.reference @@ -0,0 +1 @@ +2 diff --git a/tests/queries/0_stateless/03155_test_move_to_prewhere.sh b/tests/queries/0_stateless/03155_test_move_to_prewhere.sh new file mode 100755 index 00000000000..b6980b3a23a --- /dev/null +++ b/tests/queries/0_stateless/03155_test_move_to_prewhere.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -nq " + CREATE TABLE event_envoy + ( + timestamp_interval DateTime CODEC(DoubleDelta), + region LowCardinality(String), + cluster LowCardinality(String) + ) + ENGINE = MergeTree + ORDER BY (timestamp_interval) + SETTINGS index_granularity = 8192; + + INSERT INTO event_envoy SELECT now() - number, 'us-east-1', 'ch_super_fast' FROM numbers_mt(1e5); +" + +${CLICKHOUSE_CLIENT} -nq " + CREATE TABLE event_envoy_remote + ( + timestamp_interval DateTime CODEC(DoubleDelta), + region LowCardinality(String), + cluster LowCardinality(String) + ) AS remote('127.0.0.1', '${CLICKHOUSE_DATABASE}', event_envoy); +" + +${CLICKHOUSE_CLIENT} -q " + CREATE TABLE global_event_envoy + ( + timestamp_interval DateTime, + region LowCardinality(String), + cluster LowCardinality(String) + ) + ENGINE = Merge('${CLICKHOUSE_DATABASE}', 'event_envoy.*'); +" + +${CLICKHOUSE_CLIENT} --prefer_localhost_replica 1 -q " + EXPLAIN indexes=1 + SELECT timestamp_interval + FROM global_event_envoy + WHERE timestamp_interval <= now() - 54321 AND region = 'us-east-1' +" | grep -c 'Condition.*timestamp_interval' + From 8a49c1614e9bef2859b405408597957c73eb06bf Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 12 Jun 2024 21:45:00 +0200 Subject: [PATCH 176/215] fix --- src/Core/ServerUUID.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Core/ServerUUID.cpp b/src/Core/ServerUUID.cpp index 159aa8faadf..9dfaf4fecf2 100644 --- a/src/Core/ServerUUID.cpp +++ b/src/Core/ServerUUID.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -16,7 +17,9 @@ namespace ErrorCodes UUID ServerUUID::get() { - if (server_uuid == UUIDHelpers::Nil) + if (server_uuid == UUIDHelpers::Nil && + (Context::getGlobalContextInstance()->getApplicationType() == Context::ApplicationType::SERVER || + Context::getGlobalContextInstance()->getApplicationType() == Context::ApplicationType::KEEPER)) throw Exception(ErrorCodes::LOGICAL_ERROR, "ServerUUID is not initialized yet"); return server_uuid; } From 55e0c668ae58ada0071916853aa07952c51fa15d Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Wed, 12 Jun 2024 20:49:26 +0100 Subject: [PATCH 177/215] fix --- src/Processors/QueryPlan/ReadFromMergeTree.cpp | 9 --------- src/Storages/StorageMerge.cpp | 6 ------ 2 files changed, 15 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp index 4fad1bbb653..e469062d7e7 100644 --- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -1515,15 +1515,6 @@ static void buildIndexes( void ReadFromMergeTree::applyFilters(ActionDAGNodes added_filter_nodes) { - /// Sometimes a really dumb problem may happen. - /// For ReadFromMerge for example we may recursively call `applyFilters` for child reading steps (with no filters added so far). - /// Then later `optimizePrimaryKeyCondition` will try to apply filters to those child reading steps, but with no luck, - /// because we already made an `applyFilters` call that could lead to indexes initialization few lines below. - /// So effectively the right set of filters will be just ignored. - /// This is not an ultimate solution, of course, we're better to have more structured way of applying filters. 
- if (added_filter_nodes.nodes.empty()) - return; - if (!indexes) { filter_actions_dag = ActionsDAG::buildFilterActionsDAG(added_filter_nodes.nodes, query_info.buildNodeNameToInputNodeColumn()); diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 55cfd1ffcd7..ed3f43367dd 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -1573,14 +1573,8 @@ QueryPlanRawPtrs ReadFromMerge::getChildPlans() QueryPlanRawPtrs plans; for (auto & child_plan : *child_plans) - { if (child_plan.plan.isInitialized()) - { - /// So we will see the optimized plan in EXPLAIN output - child_plan.plan.optimize(QueryPlanOptimizationSettings::fromContext(context)); plans.push_back(&child_plan.plan); - } - } return plans; } From f030b220272e040f6d56048eef0e4de2d1ffd2c5 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 13 Jun 2024 00:05:46 +0200 Subject: [PATCH 178/215] fix build --- src/Coordination/Standalone/Context.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Coordination/Standalone/Context.h b/src/Coordination/Standalone/Context.h index 79a3e32a72d..38b810e4bfc 100644 --- a/src/Coordination/Standalone/Context.h +++ b/src/Coordination/Standalone/Context.h @@ -130,7 +130,8 @@ public: enum class ApplicationType : uint8_t { - KEEPER + KEEPER, + SERVER, }; void setApplicationType(ApplicationType) {} From 857a412e3b274bb5a309bebe9bfd284ae5ac8ad8 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Thu, 13 Jun 2024 01:27:54 +0000 Subject: [PATCH 179/215] address some review comments Signed-off-by: Duc Canh Le --- src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp | 4 ++-- src/Storages/Kafka/KafkaConsumer.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp index 56c774782c2..6a3475a1830 100644 --- a/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/BSONEachRowRowInputFormat.cpp @@ -421,7 +421,7 @@ void BSONEachRowRowInputFormat::readTuple(IColumn & column, const DataTypePtr & "Cannot parse tuple column with type {} from BSON array/embedded document field: " "tuple doesn't have element with name \"{}\"", data_type->getName(), - name.toView()); + name); index = *try_get_index; } @@ -806,7 +806,7 @@ bool BSONEachRowRowInputFormat::readRow(MutableColumns & columns, RowReadExtensi else { if (seen_columns[index]) - throw Exception(ErrorCodes::INCORRECT_DATA, "Duplicate field found while parsing BSONEachRow format: {}", name.toView()); + throw Exception(ErrorCodes::INCORRECT_DATA, "Duplicate field found while parsing BSONEachRow format: {}", name); seen_columns[index] = true; read_columns[index] = readField(*columns[index], types[index], BSONType(type)); diff --git a/src/Storages/Kafka/KafkaConsumer.h b/src/Storages/Kafka/KafkaConsumer.h index a2d047933be..4daf8652c3b 100644 --- a/src/Storages/Kafka/KafkaConsumer.h +++ b/src/Storages/Kafka/KafkaConsumer.h @@ -1,14 +1,14 @@ #pragma once -#include -#include #include +#include #include #include #include #include +#include #include namespace CurrentMetrics From 07f93fe78d946c66df2dbe6e44efa0c971a853e5 Mon Sep 17 00:00:00 2001 From: morning-color Date: Thu, 13 Jun 2024 10:44:32 +0800 Subject: [PATCH 180/215] Fix docs --- docs/zh/guides/improving-query-performance/skipping-indexes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/guides/improving-query-performance/skipping-indexes.md 
b/docs/zh/guides/improving-query-performance/skipping-indexes.md index f9f43e46927..8eb88d859f2 100644 --- a/docs/zh/guides/improving-query-performance/skipping-indexes.md +++ b/docs/zh/guides/improving-query-performance/skipping-indexes.md @@ -123,7 +123,7 @@ Bloom filter是一种数据结构,它允许对集合成员进行高效的是 有三种基于Bloom过滤器的数据跳数索引类型: -* 基本的**bloom_filter**接受一个可选参数,该参数表示在0到1之间允许的“假阳性”率(如果未指定,则使用.025)。 +* 基本的**bloom_filter**接受一个可选参数,该参数表示在0到1之间允许的“假阳性”率(如果未指定,则使用0.025)。 * 更专业的**tokenbf_v1**。需要三个参数,用来优化布隆过滤器:(1)过滤器的大小字节(大过滤器有更少的假阳性,有更高的存储成本),(2)哈希函数的个数(更多的散列函数可以减少假阳性)。(3)布隆过滤器哈希函数的种子。有关这些参数如何影响布隆过滤器功能的更多细节,请参阅 [这里](https://hur.st/bloomfilter/) 。此索引仅适用于String、FixedString和Map类型的数据。输入表达式被分割为由非字母数字字符分隔的字符序列。例如,列值`This is a candidate for a "full text" search`将被分割为`This` `is` `a` `candidate` `for` `full` `text` `search`。它用于LIKE、EQUALS、in、hasToken()和类似的长字符串中单词和其他值的搜索。例如,一种可能的用途是在非结构的应用程序日志行列中搜索少量的类名或行号。 From b74f910aaf9b1a05fd909923afb7714d4070532d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 13 Jun 2024 12:22:41 +0200 Subject: [PATCH 181/215] Do the replacement of the arguments --- src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp index 2360cd3f0c2..8a6276008d8 100644 --- a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp +++ b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp @@ -85,6 +85,7 @@ public: if (!aggregate_function->getResultType()->equals(*function_node->getAggregateFunction()->getResultType())) return; + function_node->getArguments().getNodes() = replaced_uniq_function_arguments_nodes; function_node->resolveAsAggregateFunction(std::move(aggregate_function)); } }; From 98d92b3be23d13d3d5f0d1180483d2f444acad78 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 13 Jun 2024 12:54:31 +0200 Subject: [PATCH 182/215] Fix the descriptions on some server settings --- src/Core/ServerSettings.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index 45f235116ab..47ea5c29a50 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -97,11 +97,11 @@ namespace DB \ M(UInt64, max_table_size_to_drop, 50000000000lu, "If size of a table is greater than this value (in bytes) than table could not be dropped with any DROP query.", 0) \ M(UInt64, max_partition_size_to_drop, 50000000000lu, "Same as max_table_size_to_drop, but for the partitions.", 0) \ - M(UInt64, max_table_num_to_warn, 5000lu, "If number of tables is greater than this value, server will create a warning that will displayed to user.", 0) \ - M(UInt64, max_view_num_to_warn, 10000lu, "If number of views is greater than this value, server will create a warning that will displayed to user.", 0) \ - M(UInt64, max_dictionary_num_to_warn, 1000lu, "If number of dictionaries is greater than this value, server will create a warning that will displayed to user.", 0) \ - M(UInt64, max_database_num_to_warn, 1000lu, "If number of databases is greater than this value, server will create a warning that will displayed to user.", 0) \ - M(UInt64, max_part_num_to_warn, 100000lu, "If number of databases is greater than this value, server will create a warning that will displayed to user.", 0) \ + M(UInt64, max_table_num_to_warn, 5000lu, "If the number of tables is greater than this value, the server will create a 
warning that will displayed to user.", 0) \ + M(UInt64, max_view_num_to_warn, 10000lu, "If the number of views is greater than this value, the server will create a warning that will displayed to user.", 0) \ + M(UInt64, max_dictionary_num_to_warn, 1000lu, "If the number of dictionaries is greater than this value, the server will create a warning that will displayed to user.", 0) \ + M(UInt64, max_database_num_to_warn, 1000lu, "If the number of databases is greater than this value, the server will create a warning that will displayed to user.", 0) \ + M(UInt64, max_part_num_to_warn, 100000lu, "If the number of parts is greater than this value, the server will create a warning that will displayed to user.", 0) \ M(UInt64, concurrent_threads_soft_limit_num, 0, "Sets how many concurrent thread can be allocated before applying CPU pressure. Zero means unlimited.", 0) \ M(UInt64, concurrent_threads_soft_limit_ratio_to_cores, 0, "Same as concurrent_threads_soft_limit_num, but with ratio to cores.", 0) \ \ From 55319c760deecd2d63a4605c705e298f8290ebb8 Mon Sep 17 00:00:00 2001 From: Nikita Fomichev Date: Mon, 10 Jun 2024 17:34:09 +0200 Subject: [PATCH 183/215] stateless tests: add test for https://github.com/ClickHouse/ClickHouse/issues/42083 --- .../03169_modify_column_data_loss.reference | 4 ++++ .../03169_modify_column_data_loss.sql | 19 +++++++++++++++++++ 2 files changed, 23 insertions(+) create mode 100644 tests/queries/0_stateless/03169_modify_column_data_loss.reference create mode 100644 tests/queries/0_stateless/03169_modify_column_data_loss.sql diff --git a/tests/queries/0_stateless/03169_modify_column_data_loss.reference b/tests/queries/0_stateless/03169_modify_column_data_loss.reference new file mode 100644 index 00000000000..2126a658c16 --- /dev/null +++ b/tests/queries/0_stateless/03169_modify_column_data_loss.reference @@ -0,0 +1,4 @@ +1 one 0 +2 two 0 +3 \N 0 +1 one 1 0 diff --git a/tests/queries/0_stateless/03169_modify_column_data_loss.sql b/tests/queries/0_stateless/03169_modify_column_data_loss.sql new file mode 100644 index 00000000000..def0a25a1b4 --- /dev/null +++ b/tests/queries/0_stateless/03169_modify_column_data_loss.sql @@ -0,0 +1,19 @@ +DROP TABLE IF EXISTS column_modify_test; + +CREATE TABLE column_modify_test (id UInt64, val String, other_col UInt64) engine=MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part=0; +INSERT INTO column_modify_test VALUES (1,'one',0); +INSERT INTO column_modify_test VALUES (2,'two',0); + +-- on 21.9 that was done via mutations mechanism +ALTER TABLE column_modify_test MODIFY COLUMN val Nullable(String); + +INSERT INTO column_modify_test VALUES (3,Null,0); + +-- till now everythings looks ok +SELECT * FROM column_modify_test order by id, val, other_col; + +-- Now we do mutation. It will affect one of the parts, and will update columns.txt to the latest / correct state w/o updating the column file! +alter table column_modify_test update other_col=1 where id = 1 SETTINGS mutations_sync=1; + +-- row 1 is damaged now the column file & columns.txt is out of sync! 
+SELECT *, throwIf(val <> 'one') as issue FROM column_modify_test WHERE id = 1; From f4493d2544d5453cead162f776969b22ec409763 Mon Sep 17 00:00:00 2001 From: Nikita Fomichev Date: Mon, 10 Jun 2024 18:33:00 +0200 Subject: [PATCH 184/215] stateless tests: add test for https://github.com/ClickHouse/ClickHouse/issues/59094 --- ...3170_part_offset_as_table_column.reference | 30 +++++++++++++++++++ .../03170_part_offset_as_table_column.sql | 25 ++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 tests/queries/0_stateless/03170_part_offset_as_table_column.reference create mode 100644 tests/queries/0_stateless/03170_part_offset_as_table_column.sql diff --git a/tests/queries/0_stateless/03170_part_offset_as_table_column.reference b/tests/queries/0_stateless/03170_part_offset_as_table_column.reference new file mode 100644 index 00000000000..435187cb39b --- /dev/null +++ b/tests/queries/0_stateless/03170_part_offset_as_table_column.reference @@ -0,0 +1,30 @@ +0 0 +1 0 +2 0 +3 0 +4 0 +5 0 +6 0 +7 0 +8 0 +9 0 +0 0 +1 0 +2 0 +3 0 +4 0 +5 0 +6 0 +7 0 +8 0 +9 0 +0 0 +1 0 +2 0 +3 0 +4 0 +5 0 +6 0 +7 0 +8 0 +9 0 diff --git a/tests/queries/0_stateless/03170_part_offset_as_table_column.sql b/tests/queries/0_stateless/03170_part_offset_as_table_column.sql new file mode 100644 index 00000000000..36cbc156744 --- /dev/null +++ b/tests/queries/0_stateless/03170_part_offset_as_table_column.sql @@ -0,0 +1,25 @@ +CREATE TABLE test_table +( + `key` UInt32, + `_part_offset` DEFAULT 0 +) +ENGINE = MergeTree +ORDER BY key; + +INSERT INTO test_table (key) SELECT number +FROM numbers(10); + +set allow_experimental_analyzer=0; + +SELECT * +FROM test_table; + +set allow_experimental_analyzer=1; + +SELECT * +FROM test_table; + +SELECT + key, + _part_offset +FROM test_table; From 3f0211a2f88c3f8ebae75f7cb39469b8769c86d1 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 13 Jun 2024 13:49:47 +0200 Subject: [PATCH 185/215] Update s3queue.md --- docs/en/engines/table-engines/integrations/s3queue.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/integrations/s3queue.md b/docs/en/engines/table-engines/integrations/s3queue.md index f72bc79c1e5..0958680dc56 100644 --- a/docs/en/engines/table-engines/integrations/s3queue.md +++ b/docs/en/engines/table-engines/integrations/s3queue.md @@ -183,7 +183,7 @@ Default value: `30000`. ### s3queue_buckets {#buckets} -For 'Ordered' mode. Available since `24.6`. If there are several replicas of S3Queue table, each working with the same metadata directory in keeper, the value of `s3queue_buckets` needs to be equal to at least the number of replicas. If `s3queue_processing_threads` setting is used as well, it makes sense to increase the value of `s3queue_buckets` setting even futher, as it defines the actual parallelism of `S3Queue` processing. +For 'Ordered' mode. Available since `24.6`. If there are several replicas of S3Queue table, each working with the same metadata directory in keeper, the value of `s3queue_buckets` needs to be equal to at least the number of replicas. If `s3queue_processing_threads` setting is used as well, it makes sense to increase the value of `s3queue_buckets` setting even further, as it defines the actual parallelism of `S3Queue` processing. 
## S3-related Settings {#s3-settings} From 64f8fedc7a11302015b946d9cc65ec5672e819ff Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 13 Jun 2024 14:23:32 +0200 Subject: [PATCH 186/215] Fix test --- src/Backups/BackupIO_S3.cpp | 24 ++++++++++++------------ src/IO/S3Settings.cpp | 15 +++++++++++++++ src/IO/S3Settings.h | 7 +++++++ 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 273d4b4ebe8..56544312c26 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -128,13 +128,13 @@ BackupReaderS3::BackupReaderS3( , s3_uri(s3_uri_) , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} { - auto endpoint_settings = context_->getStorageS3Settings().getSettings( - s3_uri.uri.toString(), - context_->getUserName(), - /*ignore_user=*/is_internal_backup); + s3_settings.loadFromConfig(context_->getConfigRef(), "s3", context_->getSettingsRef()); - if (endpoint_settings.has_value()) - s3_settings = endpoint_settings.value(); + if (auto endpoint_settings = context_->getStorageS3Settings().getSettings( + s3_uri.uri.toString(), context_->getUserName(), /*ignore_user=*/is_internal_backup)) + { + s3_settings.updateIfChanged(*endpoint_settings); + } s3_settings.request_settings.updateFromSettings(context_->getSettingsRef(), /* if_changed */true); s3_settings.request_settings.allow_native_copy = allow_s3_native_copy; @@ -226,13 +226,13 @@ BackupWriterS3::BackupWriterS3( , s3_uri(s3_uri_) , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} { - auto endpoint_settings = context_->getStorageS3Settings().getSettings( - s3_uri.uri.toString(), - context_->getUserName(), - /*ignore_user=*/is_internal_backup); + s3_settings.loadFromConfig(context_->getConfigRef(), "s3", context_->getSettingsRef()); - if (endpoint_settings.has_value()) - s3_settings = endpoint_settings.value(); + if (auto endpoint_settings = context_->getStorageS3Settings().getSettings( + s3_uri.uri.toString(), context_->getUserName(), /*ignore_user=*/is_internal_backup)) + { + s3_settings.updateIfChanged(*endpoint_settings); + } s3_settings.request_settings.updateFromSettings(context_->getSettingsRef(), /* if_changed */true); s3_settings.request_settings.allow_native_copy = allow_s3_native_copy; diff --git a/src/IO/S3Settings.cpp b/src/IO/S3Settings.cpp index e88d8133c32..a5a50c873cb 100644 --- a/src/IO/S3Settings.cpp +++ b/src/IO/S3Settings.cpp @@ -8,6 +8,21 @@ namespace DB { +void S3Settings::loadFromConfig( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings) +{ + auth_settings = S3::AuthSettings(config, settings, config_prefix); + request_settings = S3::RequestSettings(config, settings, config_prefix); +} + +void S3Settings::updateIfChanged(const S3Settings & settings) +{ + auth_settings.updateIfChanged(settings.auth_settings); + request_settings.updateIfChanged(settings.request_settings); +} + void S3SettingsByEndpoint::loadFromConfig( const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, diff --git a/src/IO/S3Settings.h b/src/IO/S3Settings.h index 58e12e48002..9eed0a5652f 100644 --- a/src/IO/S3Settings.h +++ b/src/IO/S3Settings.h @@ -21,6 +21,13 @@ struct S3Settings { S3::AuthSettings auth_settings; S3::RequestSettings request_settings; + + void loadFromConfig( + const 
Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const DB::Settings & settings); + + void updateIfChanged(const S3Settings & settings); }; class S3SettingsByEndpoint From fe378edb350e3c45d00e765fd3b024abce05f24b Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 6 Jun 2024 08:13:51 +0000 Subject: [PATCH 187/215] Fix THERE_IS_NO_COLUMN error in case move to PREWHERE applied to storage merge inside another table function (cherry picked from commit ac22904ff2b960b46b85b8197cbf814f26855049) --- ...3165_storage_merge_view_prewhere.reference | 7 ++++ .../03165_storage_merge_view_prewhere.sql | 41 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 tests/queries/0_stateless/03165_storage_merge_view_prewhere.reference create mode 100644 tests/queries/0_stateless/03165_storage_merge_view_prewhere.sql diff --git a/tests/queries/0_stateless/03165_storage_merge_view_prewhere.reference b/tests/queries/0_stateless/03165_storage_merge_view_prewhere.reference new file mode 100644 index 00000000000..3ee56295b2e --- /dev/null +++ b/tests/queries/0_stateless/03165_storage_merge_view_prewhere.reference @@ -0,0 +1,7 @@ +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever diff --git a/tests/queries/0_stateless/03165_storage_merge_view_prewhere.sql b/tests/queries/0_stateless/03165_storage_merge_view_prewhere.sql new file mode 100644 index 00000000000..97651d1b0fd --- /dev/null +++ b/tests/queries/0_stateless/03165_storage_merge_view_prewhere.sql @@ -0,0 +1,41 @@ +-- Tags: distributed + +DROP TABLE IF EXISTS ids; +DROP TABLE IF EXISTS data; +DROP TABLE IF EXISTS data2; + +CREATE TABLE ids (id UUID, whatever String) Engine=MergeTree ORDER BY tuple(); +INSERT INTO ids VALUES ('a1451105-722e-4fe7-bfaa-65ad2ae249c2', 'whatever'); + +CREATE TABLE data (id UUID, event_time DateTime, status String) Engine=MergeTree ORDER BY tuple(); +INSERT INTO data VALUES ('a1451105-722e-4fe7-bfaa-65ad2ae249c2', '2000-01-01', 'CREATED'); + +CREATE TABLE data2 (id UUID, event_time DateTime, status String) Engine=MergeTree ORDER BY tuple(); +INSERT INTO data2 VALUES ('a1451105-722e-4fe7-bfaa-65ad2ae249c2', '2000-01-02', 'CREATED'); + +SELECT + id, + whatever +FROM ids AS l +INNER JOIN merge(currentDatabase(), 'data*') AS s ON l.id = s.id +WHERE (status IN ['CREATED', 'CREATING']) +ORDER BY event_time DESC +; + +SELECT + id, + whatever +FROM ids AS l +INNER JOIN clusterAllReplicas(test_cluster_two_shards, merge(currentDatabase(), 'data*')) AS s ON l.id = s.id +WHERE (status IN ['CREATED', 'CREATING']) +ORDER BY event_time DESC +; + +SELECT + id, + whatever +FROM ids AS l +INNER JOIN view(SELECT * FROM merge(currentDatabase(), 'data*')) AS s ON l.id = s.id +WHERE (status IN ['CREATED', 'CREATING']) +ORDER BY event_time DESC +; From 1172b324e5ef3dcdbb751b066a546f4f0a81564d Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 13 Jun 2024 13:08:30 +0000 Subject: [PATCH 188/215] Update test. 
--- .../0_stateless/03165_storage_merge_view_prewhere.reference | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/03165_storage_merge_view_prewhere.reference b/tests/queries/0_stateless/03165_storage_merge_view_prewhere.reference index 3ee56295b2e..4cd7f2cb141 100644 --- a/tests/queries/0_stateless/03165_storage_merge_view_prewhere.reference +++ b/tests/queries/0_stateless/03165_storage_merge_view_prewhere.reference @@ -5,3 +5,4 @@ a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever +a1451105-722e-4fe7-bfaa-65ad2ae249c2 whatever From 88b6b2732a3f75444fcc6c578e16e61f7db9ae5b Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 13 Jun 2024 15:14:05 +0200 Subject: [PATCH 189/215] Sync changes --- src/Common/ProfileEvents.h | 1 + utils/check-style/check-style | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Common/ProfileEvents.h b/src/Common/ProfileEvents.h index e670b8907d2..f196ed5a04c 100644 --- a/src/Common/ProfileEvents.h +++ b/src/Common/ProfileEvents.h @@ -40,6 +40,7 @@ namespace ProfileEvents Timer(Counters & counters_, Event timer_event_, Event counter_event, Resolution resolution_); ~Timer() { end(); } void cancel() { watch.reset(); } + void restart() { watch.restart(); } void end(); UInt64 get(); diff --git a/utils/check-style/check-style b/utils/check-style/check-style index db491c67f2c..f5cb65eb879 100755 --- a/utils/check-style/check-style +++ b/utils/check-style/check-style @@ -65,6 +65,7 @@ EXTERN_TYPES_EXCLUDES=( ProfileEvents::increment ProfileEvents::incrementForLogMessage ProfileEvents::getName + ProfileEvents::Timer ProfileEvents::Type ProfileEvents::TypeEnum ProfileEvents::dumpToMapColumn @@ -242,7 +243,7 @@ done # All submodules should be from https://github.com/ git config --file "$ROOT_PATH/.gitmodules" --get-regexp 'submodule\..+\.url' | \ -while read -r line; do +while read -r line; do name=${line#submodule.}; name=${name%.url*} url=${line#* } [[ "$url" != 'https://github.com/'* ]] && echo "All submodules should be from https://github.com/, submodule '$name' has '$url'" From 82a5496f28569a46e468587a76e4686d324e5847 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 13 Jun 2024 15:28:04 +0200 Subject: [PATCH 190/215] fix --- src/Core/ServerUUID.cpp | 5 +++++ src/Core/ServerUUID.h | 2 ++ src/Interpreters/tests/gtest_filecache.cpp | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/src/Core/ServerUUID.cpp b/src/Core/ServerUUID.cpp index 9dfaf4fecf2..251b407e673 100644 --- a/src/Core/ServerUUID.cpp +++ b/src/Core/ServerUUID.cpp @@ -68,4 +68,9 @@ UUID loadServerUUID(const fs::path & server_uuid_file, Poco::Logger * log) } } +void ServerUUID::setRandomForUnitTests() +{ + server_uuid = UUIDHelpers::generateV4(); +} + } diff --git a/src/Core/ServerUUID.h b/src/Core/ServerUUID.h index 9b9963ceeeb..9c7f7d32acc 100644 --- a/src/Core/ServerUUID.h +++ b/src/Core/ServerUUID.h @@ -19,6 +19,8 @@ public: /// Loads server UUID from file or creates new one. Should be called on daemon startup. 
static void load(const fs::path & server_uuid_file, Poco::Logger * log); + + static void setRandomForUnitTests(); }; UUID loadServerUUID(const fs::path & server_uuid_file, Poco::Logger * log); diff --git a/src/Interpreters/tests/gtest_filecache.cpp b/src/Interpreters/tests/gtest_filecache.cpp index 41191ba1605..36acc319f4e 100644 --- a/src/Interpreters/tests/gtest_filecache.cpp +++ b/src/Interpreters/tests/gtest_filecache.cpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -333,6 +334,7 @@ public: TEST_F(FileCacheTest, LRUPolicy) { + ServerUUID::setRandomForUnitTests(); DB::ThreadStatus thread_status; /// To work with cache need query_id and query context. @@ -807,6 +809,7 @@ TEST_F(FileCacheTest, LRUPolicy) TEST_F(FileCacheTest, writeBuffer) { + ServerUUID::setRandomForUnitTests(); FileCacheSettings settings; settings.max_size = 100; settings.max_elements = 5; @@ -938,6 +941,7 @@ static size_t readAllTemporaryData(TemporaryFileStream & stream) TEST_F(FileCacheTest, temporaryData) { + ServerUUID::setRandomForUnitTests(); DB::FileCacheSettings settings; settings.max_size = 10_KiB; settings.max_file_segment_size = 1_KiB; @@ -1044,6 +1048,7 @@ TEST_F(FileCacheTest, temporaryData) TEST_F(FileCacheTest, CachedReadBuffer) { + ServerUUID::setRandomForUnitTests(); DB::ThreadStatus thread_status; /// To work with cache need query_id and query context. @@ -1120,6 +1125,7 @@ TEST_F(FileCacheTest, CachedReadBuffer) TEST_F(FileCacheTest, TemporaryDataReadBufferSize) { + ServerUUID::setRandomForUnitTests(); /// Temporary data stored in cache { DB::FileCacheSettings settings; @@ -1167,6 +1173,7 @@ TEST_F(FileCacheTest, TemporaryDataReadBufferSize) TEST_F(FileCacheTest, SLRUPolicy) { + ServerUUID::setRandomForUnitTests(); DB::ThreadStatus thread_status; std::string query_id = "query_id"; /// To work with cache need query_id and query context. From df607c535c88fa7faf5732f2f7056dc69787a6f6 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 13 Jun 2024 13:30:23 +0000 Subject: [PATCH 191/215] Docs: Recommend disabling ASLR for sanitizer builds --- docs/en/development/tests.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index bbc7dac0a2a..f01b4249472 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -229,6 +229,10 @@ For production builds, clang is used, but we also test make gcc builds. For deve ## Sanitizers {#sanitizers} +:::note +Make sure to have address space layout randomization disabled when running sanitizer builds locally: `sudo sysctl kernel.randomize_va_space=0` +::: + ### Address sanitizer We run functional, integration, stress and unit tests under ASan on per-commit basis. From f1707998d214832faa57e3ae35c7343ba6025fa3 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Thu, 13 Jun 2024 14:25:02 +0200 Subject: [PATCH 192/215] Fix search issues for progress_func, add zstd for GH cache --- docker/test/style/Dockerfile | 7 ++++--- tests/ci/github_helper.py | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index 54fab849301..6ad03852b66 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -10,14 +10,15 @@ RUN sed -i "s|http://archive.ubuntu.com|$apt_archive|g" /etc/apt/sources.list RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \ aspell \ curl \ - git \ - gh \ file \ + gh \ + git \ libxml2-utils \ + locales \ moreutils \ python3-pip \ yamllint \ - locales \ + zstd \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* /var/cache/debconf /tmp/* diff --git a/tests/ci/github_helper.py b/tests/ci/github_helper.py index b6407c5d531..431e6977091 100644 --- a/tests/ci/github_helper.py +++ b/tests/ci/github_helper.py @@ -112,12 +112,12 @@ class GitHub(github.Github): # pylint: enable=signature-differs def get_pulls_from_search(self, *args: Any, **kwargs: Any) -> PullRequests: """The search api returns actually issues, so we need to fetch PullRequests""" - issues = self.search_issues(*args, **kwargs) - repos = {} - prs = [] # type: PullRequests progress_func = kwargs.pop( "progress_func", lambda x: x ) # type: Callable[[Issues], Issues] + issues = self.search_issues(*args, **kwargs) + repos = {} + prs = [] # type: PullRequests for issue in progress_func(issues): # See https://github.com/PyGithub/PyGithub/issues/2202, # obj._rawData doesn't spend additional API requests From f48421fd61f426e246274f66f82bb35dfc1ebdb6 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 13 Jun 2024 14:30:11 +0200 Subject: [PATCH 193/215] Running a changelog.py with a host network for AMI --- .github/workflows/tags_stable.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tags_stable.yml b/.github/workflows/tags_stable.yml index e4fc9f0b1d3..2aa7694bc41 100644 --- a/.github/workflows/tags_stable.yml +++ b/.github/workflows/tags_stable.yml @@ -46,9 +46,10 @@ jobs: ./utils/list-versions/list-versions.sh > ./utils/list-versions/version_date.tsv ./utils/list-versions/update-docker-version.sh GID=$(id -g "${UID}") - docker run -u "${UID}:${GID}" -e PYTHONUNBUFFERED=1 \ + # --network=host and CI=1 are required for the S3 access from a container + docker run -u "${UID}:${GID}" -e PYTHONUNBUFFERED=1 -e CI=1 --network=host \ --volume="${GITHUB_WORKSPACE}:/ClickHouse" clickhouse/style-test \ - /ClickHouse/utils/changelog/changelog.py -v --debug-helpers \ + /ClickHouse/tests/ci/changelog.py -v --debug-helpers \ --gh-user-or-token="$GITHUB_TOKEN" --jobs=5 \ --output="/ClickHouse/docs/changelogs/${GITHUB_TAG}.md" "${GITHUB_TAG}" git add "./docs/changelogs/${GITHUB_TAG}.md" From e8ac8d46ff472cc8bbcb9e5cdbc47102cc533880 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 13 Jun 2024 15:43:30 +0200 Subject: [PATCH 194/215] Update docs/en/development/tests.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Raúl Marín --- docs/en/development/tests.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index f01b4249472..55330602377 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -230,7 +230,7 @@ For production builds, clang is used, but 
we also test make gcc builds. For deve ## Sanitizers {#sanitizers} :::note -Make sure to have address space layout randomization disabled when running sanitizer builds locally: `sudo sysctl kernel.randomize_va_space=0` +If the process crashes at the start when running it locally, you might need to disable address space layout randomization: `sudo sysctl kernel.randomize_va_space=0` ::: ### Address sanitizer From 5e1ef8aefaf3477566f91a3e92b6cc844800fc57 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Thu, 13 Jun 2024 15:44:36 +0200 Subject: [PATCH 195/215] Update tests.md --- docs/en/development/tests.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/development/tests.md b/docs/en/development/tests.md index 55330602377..8dff6f0ed1d 100644 --- a/docs/en/development/tests.md +++ b/docs/en/development/tests.md @@ -230,7 +230,7 @@ For production builds, clang is used, but we also test make gcc builds. For deve ## Sanitizers {#sanitizers} :::note -If the process crashes at the start when running it locally, you might need to disable address space layout randomization: `sudo sysctl kernel.randomize_va_space=0` +If the process (ClickHouse server or client) crashes at startup when running it locally, you might need to disable address space layout randomization: `sudo sysctl kernel.randomize_va_space=0` ::: ### Address sanitizer From 0d17f2cededd98d80d5d8f1486e62899ed953f43 Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 13 Jun 2024 16:14:57 +0200 Subject: [PATCH 196/215] CI: FinishCheck to set failure if workflow failed --- .github/workflows/backport_branches.yml | 2 +- .github/workflows/master.yml | 2 +- .github/workflows/merge_queue.yml | 2 +- .github/workflows/pull_request.yml | 2 +- .github/workflows/release_branches.yml | 2 +- tests/ci/finish_check.py | 42 ++++++++++++++++++------- 6 files changed, 36 insertions(+), 16 deletions(-) diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index b0380b939bb..c8c6ba30b0b 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -273,5 +273,5 @@ jobs: - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" - python3 finish_check.py + python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} python3 merge_pr.py diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 91dcb6a4968..f5c78a6b6a1 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -173,4 +173,4 @@ jobs: - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" - python3 finish_check.py + python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} diff --git a/.github/workflows/merge_queue.yml b/.github/workflows/merge_queue.yml index c8b2452829b..3f45daf0fb4 100644 --- a/.github/workflows/merge_queue.yml +++ b/.github/workflows/merge_queue.yml @@ -112,4 +112,4 @@ jobs: - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" - python3 finish_check.py ${{ (contains(needs.*.result, 'failure') && github.event_name == 'merge_group') && '--pipeline-failure' || '' }} + python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index e4deaf9f35e..079208eb65a 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -191,7 +191,7 @@ jobs: - name: Finish label run: | cd 
"$GITHUB_WORKSPACE/tests/ci" - python3 finish_check.py + python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} ############################################################################################# ###################################### JEPSEN TESTS ######################################### diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 4d45c8d8d4b..f9b8a4fa764 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -496,4 +496,4 @@ jobs: - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" - python3 finish_check.py + python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} diff --git a/tests/ci/finish_check.py b/tests/ci/finish_check.py index 12756599865..6a32ba71bed 100644 --- a/tests/ci/finish_check.py +++ b/tests/ci/finish_check.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +import argparse import logging import sys @@ -20,27 +21,38 @@ from report import FAILURE, PENDING, SUCCESS, StatusType from synchronizer_utils import SYNC_BRANCH_PREFIX +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + description="Script to merge the given PR. Additional checks for approved " + "status and green commit statuses could be done", + ) + parser.add_argument( + "--wf-status", + type=str, + default="", + help="overall workflow status [success|failure]", + ) + return parser.parse_args() + + def main(): logging.basicConfig(level=logging.INFO) + args = parse_args() - has_failure = False - - # FIXME: temporary hack to fail Mergeable Check in MQ if pipeline has any failed jobs - if len(sys.argv) > 1 and sys.argv[1] == "--pipeline-failure": - has_failure = True + has_workflow_failures = args.wf_status == FAILURE pr_info = PRInfo(need_orgs=True) gh = Github(get_best_robot_token(), per_page=100) commit = get_commit(gh, pr_info.sha) - statuses = None if pr_info.is_merge_queue: - # in MQ Mergeable check status must never be green if any failures in workflow - if has_failure: - set_mergeable_check(commit, "workflow failed", "failure") + # in MQ Mergeable check status must never be green if any failures in the workflow + if has_workflow_failures: + set_mergeable_check(commit, "workflow failed", FAILURE) else: # This must be the only place where green MCheck is set in the MQ (in the end of CI) to avoid early merge - set_mergeable_check(commit, "workflow passed", "success") + set_mergeable_check(commit, "workflow passed", SUCCESS) else: statuses = get_commit_filtered_statuses(commit) state = trigger_mergeable_check(commit, statuses, set_if_green=True) @@ -67,6 +79,7 @@ def main(): has_failure = False has_pending = False + error_cnt = 0 for status in statuses: if status.context in (StatusNames.MERGEABLE, StatusNames.CI): # do not account these statuses @@ -80,12 +93,19 @@ def main(): continue else: has_failure = True + error_cnt += 1 ci_state = SUCCESS # type: StatusType + description = "All checks finished" if has_failure: ci_state = FAILURE + description = f"All checks finished. {error_cnt} jobs failed" + elif has_workflow_failures: + ci_state = FAILURE + description = "All checks finished. Workflow has failures." 
elif has_pending: print("ERROR: CI must not have pending jobs by the time of finish check") + description = "ERROR: workflow has pending jobs" ci_state = FAILURE if ci_status.state == PENDING: @@ -93,7 +113,7 @@ def main(): commit, ci_state, ci_status.target_url, - "All checks finished", + description, StatusNames.CI, pr_info, dump_to_file=True, From a77eda6388bbb3c887d1b621e6a191269c6ba5c5 Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 13 Jun 2024 16:27:41 +0200 Subject: [PATCH 197/215] fix pylint --- tests/ci/finish_check.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/ci/finish_check.py b/tests/ci/finish_check.py index 6a32ba71bed..2f624fd91f8 100644 --- a/tests/ci/finish_check.py +++ b/tests/ci/finish_check.py @@ -1,7 +1,6 @@ #!/usr/bin/env python3 import argparse import logging -import sys from github import Github From fc255456cee541f70f54c4365c410d053d073c22 Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 13 Jun 2024 16:36:07 +0200 Subject: [PATCH 198/215] comment --- tests/ci/finish_check.py | 119 ++++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 59 deletions(-) diff --git a/tests/ci/finish_check.py b/tests/ci/finish_check.py index 2f624fd91f8..0d59c3b43a4 100644 --- a/tests/ci/finish_check.py +++ b/tests/ci/finish_check.py @@ -52,71 +52,72 @@ def main(): else: # This must be the only place where green MCheck is set in the MQ (in the end of CI) to avoid early merge set_mergeable_check(commit, "workflow passed", SUCCESS) - else: - statuses = get_commit_filtered_statuses(commit) - state = trigger_mergeable_check(commit, statuses, set_if_green=True) + return - # Process upstream StatusNames.SYNC - if ( - pr_info.head_ref.startswith(f"{SYNC_BRANCH_PREFIX}/pr/") - and GITHUB_REPOSITORY != GITHUB_UPSTREAM_REPOSITORY - ): - upstream_pr_number = int(pr_info.head_ref.split("/pr/", maxsplit=1)[1]) - update_upstream_sync_status( - upstream_pr_number, - pr_info.number, - gh, - state, - can_set_green_mergeable_status=True, - ) + statuses = get_commit_filtered_statuses(commit) + state = trigger_mergeable_check(commit, statuses, set_if_green=True) - ci_running_statuses = [s for s in statuses if s.context == StatusNames.CI] - if not ci_running_statuses: - return - # Take the latest status - ci_status = ci_running_statuses[-1] + # Process upstream StatusNames.SYNC + if ( + pr_info.head_ref.startswith(f"{SYNC_BRANCH_PREFIX}/pr/") + and GITHUB_REPOSITORY != GITHUB_UPSTREAM_REPOSITORY + ): + upstream_pr_number = int(pr_info.head_ref.split("/pr/", maxsplit=1)[1]) + update_upstream_sync_status( + upstream_pr_number, + pr_info.number, + gh, + state, + can_set_green_mergeable_status=True, + ) - has_failure = False - has_pending = False - error_cnt = 0 - for status in statuses: - if status.context in (StatusNames.MERGEABLE, StatusNames.CI): - # do not account these statuses + ci_running_statuses = [s for s in statuses if s.context == StatusNames.CI] + if not ci_running_statuses: + return + # Take the latest status + ci_status = ci_running_statuses[-1] + + has_failure = False + has_pending = False + error_cnt = 0 + for status in statuses: + if status.context in (StatusNames.MERGEABLE, StatusNames.CI): + # do not account these statuses + continue + if status.state == PENDING: + if status.context == StatusNames.SYNC: + # do not account sync status if pending - it's a different WF continue - if status.state == PENDING: - if status.context == StatusNames.SYNC: - # do not account sync status if pending - it's a different WF - continue - has_pending = True - elif status.state == 
SUCCESS: - continue - else: - has_failure = True - error_cnt += 1 + has_pending = True + elif status.state == SUCCESS: + continue + else: + has_failure = True + error_cnt += 1 - ci_state = SUCCESS # type: StatusType - description = "All checks finished" - if has_failure: - ci_state = FAILURE - description = f"All checks finished. {error_cnt} jobs failed" - elif has_workflow_failures: - ci_state = FAILURE - description = "All checks finished. Workflow has failures." - elif has_pending: - print("ERROR: CI must not have pending jobs by the time of finish check") - description = "ERROR: workflow has pending jobs" - ci_state = FAILURE + ci_state = SUCCESS # type: StatusType + description = "All checks finished" + if has_failure: + ci_state = FAILURE + description = f"All checks finished. {error_cnt} jobs failed" + elif has_workflow_failures: + ci_state = FAILURE + description = "All checks finished. Workflow has failures." + elif has_pending: + print("ERROR: CI must not have pending jobs by the time of finish check") + description = "ERROR: workflow has pending jobs" + ci_state = FAILURE - if ci_status.state == PENDING: - post_commit_status( - commit, - ci_state, - ci_status.target_url, - description, - StatusNames.CI, - pr_info, - dump_to_file=True, - ) + if ci_status.state == PENDING: + post_commit_status( + commit, + ci_state, + ci_status.target_url, + description, + StatusNames.CI, + pr_info, + dump_to_file=True, + ) if __name__ == "__main__": From 833d138383073cdeb431e1c647d0169b28750379 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 13 Jun 2024 16:28:12 +0000 Subject: [PATCH 199/215] Update version_date.tsv and changelogs after v24.3.4.147-lts --- docker/keeper/Dockerfile | 2 +- docker/server/Dockerfile.alpine | 2 +- docker/server/Dockerfile.ubuntu | 2 +- docs/changelogs/v24.3.4.147-lts.md | 100 +++++++++++++++++++++++++++ utils/list-versions/version_date.tsv | 2 + 5 files changed, 105 insertions(+), 3 deletions(-) create mode 100644 docs/changelogs/v24.3.4.147-lts.md diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index b3271d94184..7a57a592724 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -34,7 +34,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="24.5.1.1763" +ARG VERSION="24.5.2.34" ARG PACKAGES="clickhouse-keeper" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 3f3b880c8f3..3456a4ee2b9 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="24.5.1.1763" +ARG VERSION="24.5.2.34" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index 5fd22ee9b51..0c17cc10b08 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -28,7 +28,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="24.5.1.1763" +ARG VERSION="24.5.2.34" ARG PACKAGES="clickhouse-client clickhouse-server 
clickhouse-common-static" #docker-official-library:off diff --git a/docs/changelogs/v24.3.4.147-lts.md b/docs/changelogs/v24.3.4.147-lts.md new file mode 100644 index 00000000000..7d77fb29977 --- /dev/null +++ b/docs/changelogs/v24.3.4.147-lts.md @@ -0,0 +1,100 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.3.4.147-lts (31a7bdc346d) FIXME as compared to v24.3.3.102-lts (7e7f3bdd9be) + +#### Improvement +* Backported in [#63465](https://github.com/ClickHouse/ClickHouse/issues/63465): Make rabbitmq nack broken messages. Closes [#45350](https://github.com/ClickHouse/ClickHouse/issues/45350). [#60312](https://github.com/ClickHouse/ClickHouse/pull/60312) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#64290](https://github.com/ClickHouse/ClickHouse/issues/64290): Fix logical-error when undoing quorum insert transaction. [#61953](https://github.com/ClickHouse/ClickHouse/pull/61953) ([Han Fei](https://github.com/hanfei1991)). + +#### Build/Testing/Packaging Improvement +* Backported in [#63610](https://github.com/ClickHouse/ClickHouse/issues/63610): The Dockerfile is reviewed by the docker official library in https://github.com/docker-library/official-images/pull/15846. [#63400](https://github.com/ClickHouse/ClickHouse/pull/63400) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#65128](https://github.com/ClickHouse/ClickHouse/issues/65128): Decrease the `unit-test` image a few times. [#65102](https://github.com/ClickHouse/ClickHouse/pull/65102) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Backported in [#64277](https://github.com/ClickHouse/ClickHouse/issues/64277): Fix queries with FINAL give wrong result when table does not use adaptive granularity. [#62432](https://github.com/ClickHouse/ClickHouse/pull/62432) ([Duc Canh Le](https://github.com/canhld94)). +* Backported in [#63716](https://github.com/ClickHouse/ClickHouse/issues/63716): Fix excessive memory usage for queries with nested lambdas. Fixes [#62036](https://github.com/ClickHouse/ClickHouse/issues/62036). [#62462](https://github.com/ClickHouse/ClickHouse/pull/62462) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#63247](https://github.com/ClickHouse/ClickHouse/issues/63247): Fix size checks when updating materialized nested columns ( fixes [#62731](https://github.com/ClickHouse/ClickHouse/issues/62731) ). [#62773](https://github.com/ClickHouse/ClickHouse/pull/62773) ([Eliot Hautefeuille](https://github.com/hileef)). +* Backported in [#62984](https://github.com/ClickHouse/ClickHouse/issues/62984): Fix the `Unexpected return type` error for queries that read from `StorageBuffer` with `PREWHERE` when the source table has different types. Fixes [#62545](https://github.com/ClickHouse/ClickHouse/issues/62545). [#62916](https://github.com/ClickHouse/ClickHouse/pull/62916) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#63185](https://github.com/ClickHouse/ClickHouse/issues/63185): Sanity check: Clamp values instead of throwing. [#63119](https://github.com/ClickHouse/ClickHouse/pull/63119) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#63293](https://github.com/ClickHouse/ClickHouse/issues/63293): Fix crash with untuple and unresolved lambda. [#63131](https://github.com/ClickHouse/ClickHouse/pull/63131) ([Raúl Marín](https://github.com/Algunenano)). 
+* Backported in [#63411](https://github.com/ClickHouse/ClickHouse/issues/63411): Fix a misbehavior when SQL security defaults don't load for old tables during server startup. [#63209](https://github.com/ClickHouse/ClickHouse/pull/63209) ([pufit](https://github.com/pufit)). +* Backported in [#63616](https://github.com/ClickHouse/ClickHouse/issues/63616): Fix bug which could potentially lead to rare LOGICAL_ERROR during SELECT query with message: `Unexpected return type from materialize. Expected type_XXX. Got type_YYY.` Introduced in [#59379](https://github.com/ClickHouse/ClickHouse/issues/59379). [#63353](https://github.com/ClickHouse/ClickHouse/pull/63353) ([alesapin](https://github.com/alesapin)). +* Backported in [#63455](https://github.com/ClickHouse/ClickHouse/issues/63455): Fix `X-ClickHouse-Timezone` header returning wrong timezone when using `session_timezone` as query level setting. [#63377](https://github.com/ClickHouse/ClickHouse/pull/63377) ([Andrey Zvonov](https://github.com/zvonand)). +* Backported in [#63603](https://github.com/ClickHouse/ClickHouse/issues/63603): Fix backup of projection part in case projection was removed from table metadata, but part still has projection. [#63426](https://github.com/ClickHouse/ClickHouse/pull/63426) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Backported in [#63508](https://github.com/ClickHouse/ClickHouse/issues/63508): Fix 'Every derived table must have its own alias' error for MYSQL dictionary source, close [#63341](https://github.com/ClickHouse/ClickHouse/issues/63341). [#63481](https://github.com/ClickHouse/ClickHouse/pull/63481) ([vdimir](https://github.com/vdimir)). +* Backported in [#63595](https://github.com/ClickHouse/ClickHouse/issues/63595): Avoid segafult in `MergeTreePrefetchedReadPool` while fetching projection parts. [#63513](https://github.com/ClickHouse/ClickHouse/pull/63513) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#63748](https://github.com/ClickHouse/ClickHouse/issues/63748): Read only the necessary columns from VIEW (new analyzer). Closes [#62594](https://github.com/ClickHouse/ClickHouse/issues/62594). [#63688](https://github.com/ClickHouse/ClickHouse/pull/63688) ([Maksim Kita](https://github.com/kitaisreal)). +* Backported in [#63770](https://github.com/ClickHouse/ClickHouse/issues/63770): Fix [#63539](https://github.com/ClickHouse/ClickHouse/issues/63539). Forbid WINDOW redefinition in new analyzer. [#63694](https://github.com/ClickHouse/ClickHouse/pull/63694) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#64189](https://github.com/ClickHouse/ClickHouse/issues/64189): Fix `Not found column` and `CAST AS Map from array requires nested tuple of 2 elements` exceptions for distributed queries which use `Map(Nothing, Nothing)` type. Fixes [#63637](https://github.com/ClickHouse/ClickHouse/issues/63637). [#63753](https://github.com/ClickHouse/ClickHouse/pull/63753) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#63845](https://github.com/ClickHouse/ClickHouse/issues/63845): Fix possible `ILLEGAL_COLUMN` error in `partial_merge` join, close [#37928](https://github.com/ClickHouse/ClickHouse/issues/37928). [#63755](https://github.com/ClickHouse/ClickHouse/pull/63755) ([vdimir](https://github.com/vdimir)). +* Backported in [#63906](https://github.com/ClickHouse/ClickHouse/issues/63906): `query_plan_remove_redundant_distinct` can break queries with WINDOW FUNCTIONS (with `allow_experimental_analyzer` is on). 
Fixes [#62820](https://github.com/ClickHouse/ClickHouse/issues/62820). [#63776](https://github.com/ClickHouse/ClickHouse/pull/63776) ([Igor Nikonov](https://github.com/devcrafter)). +* Backported in [#63989](https://github.com/ClickHouse/ClickHouse/issues/63989): Fix incorrect select query result when parallel replicas were used to read from a Materialized View. [#63861](https://github.com/ClickHouse/ClickHouse/pull/63861) ([Nikita Taranov](https://github.com/nickitat)). +* Backported in [#64031](https://github.com/ClickHouse/ClickHouse/issues/64031): Fix a error `Database name is empty` for remote queries with lambdas over the cluster with modified default database. Fixes [#63471](https://github.com/ClickHouse/ClickHouse/issues/63471). [#63864](https://github.com/ClickHouse/ClickHouse/pull/63864) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#64559](https://github.com/ClickHouse/ClickHouse/issues/64559): Fix SIGSEGV due to CPU/Real (`query_profiler_real_time_period_ns`/`query_profiler_cpu_time_period_ns`) profiler (has been an issue since 2022, that leads to periodic server crashes, especially if you were using distributed engine). [#63865](https://github.com/ClickHouse/ClickHouse/pull/63865) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#64009](https://github.com/ClickHouse/ClickHouse/issues/64009): Fix analyzer - IN function with arbitrary deep sub-selects in materialized view to use insertion block. [#63930](https://github.com/ClickHouse/ClickHouse/pull/63930) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Backported in [#64236](https://github.com/ClickHouse/ClickHouse/issues/64236): Fix resolve of unqualified COLUMNS matcher. Preserve the input columns order and forbid usage of unknown identifiers. [#63962](https://github.com/ClickHouse/ClickHouse/pull/63962) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#64106](https://github.com/ClickHouse/ClickHouse/issues/64106): Deserialize untrusted binary inputs in a safer way. [#64024](https://github.com/ClickHouse/ClickHouse/pull/64024) ([Robert Schulze](https://github.com/rschu1ze)). +* Backported in [#64168](https://github.com/ClickHouse/ClickHouse/issues/64168): Add missing settings to recoverLostReplica. [#64040](https://github.com/ClickHouse/ClickHouse/pull/64040) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#64320](https://github.com/ClickHouse/ClickHouse/issues/64320): This fix will use a proper redefined context with the correct definer for each individual view in the query pipeline Closes [#63777](https://github.com/ClickHouse/ClickHouse/issues/63777). [#64079](https://github.com/ClickHouse/ClickHouse/pull/64079) ([pufit](https://github.com/pufit)). +* Backported in [#64380](https://github.com/ClickHouse/ClickHouse/issues/64380): Fix analyzer: "Not found column" error is fixed when using INTERPOLATE. [#64096](https://github.com/ClickHouse/ClickHouse/pull/64096) ([Yakov Olkhovskiy](https://github.com/yakov-olkhovskiy)). +* Backported in [#64567](https://github.com/ClickHouse/ClickHouse/issues/64567): Fix creating backups to S3 buckets with different credentials from the disk containing the file. [#64153](https://github.com/ClickHouse/ClickHouse/pull/64153) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#64270](https://github.com/ClickHouse/ClickHouse/issues/64270): Prevent LOGICAL_ERROR on CREATE TABLE as MaterializedView. 
[#64174](https://github.com/ClickHouse/ClickHouse/pull/64174) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#64339](https://github.com/ClickHouse/ClickHouse/issues/64339): The query cache now considers two identical queries against different databases as different. The previous behavior could be used to bypass missing privileges to read from a table. [#64199](https://github.com/ClickHouse/ClickHouse/pull/64199) ([Robert Schulze](https://github.com/rschu1ze)). +* Backported in [#64259](https://github.com/ClickHouse/ClickHouse/issues/64259): Ignore `text_log` config when using Keeper. [#64218](https://github.com/ClickHouse/ClickHouse/pull/64218) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#64688](https://github.com/ClickHouse/ClickHouse/issues/64688): Fix Query Tree size validation. Closes [#63701](https://github.com/ClickHouse/ClickHouse/issues/63701). [#64377](https://github.com/ClickHouse/ClickHouse/pull/64377) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#64725](https://github.com/ClickHouse/ClickHouse/issues/64725): Fixed `CREATE TABLE AS` queries for tables with default expressions. [#64455](https://github.com/ClickHouse/ClickHouse/pull/64455) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#64621](https://github.com/ClickHouse/ClickHouse/issues/64621): Fix an error `Cannot find column` in distributed queries with constant CTE in the `GROUP BY` key. [#64519](https://github.com/ClickHouse/ClickHouse/pull/64519) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#64678](https://github.com/ClickHouse/ClickHouse/issues/64678): Fix [#64612](https://github.com/ClickHouse/ClickHouse/issues/64612). Do not rewrite aggregation if `-If` combinator is already used. [#64638](https://github.com/ClickHouse/ClickHouse/pull/64638) ([Dmitry Novik](https://github.com/novikd)). +* Backported in [#64831](https://github.com/ClickHouse/ClickHouse/issues/64831): Fix bug which could lead to non-working TTLs with expressions. Fixes [#63700](https://github.com/ClickHouse/ClickHouse/issues/63700). [#64694](https://github.com/ClickHouse/ClickHouse/pull/64694) ([alesapin](https://github.com/alesapin)). +* Backported in [#64940](https://github.com/ClickHouse/ClickHouse/issues/64940): Fix OrderByLimitByDuplicateEliminationVisitor across subqueries. [#64766](https://github.com/ClickHouse/ClickHouse/pull/64766) ([Raúl Marín](https://github.com/Algunenano)). +* Backported in [#64869](https://github.com/ClickHouse/ClickHouse/issues/64869): Fixed memory possible incorrect memory tracking in several kinds of queries: queries that read any data from S3, queries via http protocol, asynchronous inserts. [#64844](https://github.com/ClickHouse/ClickHouse/pull/64844) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#64980](https://github.com/ClickHouse/ClickHouse/issues/64980): Fix the `Block structure mismatch` error for queries reading with `PREWHERE` from the materialized view when the materialized view has columns of different types than the source table. Fixes [#64611](https://github.com/ClickHouse/ClickHouse/issues/64611). [#64855](https://github.com/ClickHouse/ClickHouse/pull/64855) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#64972](https://github.com/ClickHouse/ClickHouse/issues/64972): Fix rare crash when table has TTL with subquery + database replicated + parallel replicas + analyzer. It's really rare, but please don't use TTLs with subqueries. 
[#64858](https://github.com/ClickHouse/ClickHouse/pull/64858) ([alesapin](https://github.com/alesapin)). +* Backported in [#65070](https://github.com/ClickHouse/ClickHouse/issues/65070): Fix `ALTER MODIFY COMMENT` query that was broken for parameterized VIEWs in https://github.com/ClickHouse/ClickHouse/pull/54211. [#65031](https://github.com/ClickHouse/ClickHouse/pull/65031) ([Nikolay Degterinsky](https://github.com/evillique)). +* Backported in [#65175](https://github.com/ClickHouse/ClickHouse/issues/65175): Fix the `Unknown expression identifier` error for remote queries with `INTERPOLATE (alias)` (new analyzer). Fixes [#64636](https://github.com/ClickHouse/ClickHouse/issues/64636). [#65090](https://github.com/ClickHouse/ClickHouse/pull/65090) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + +#### Critical Bug Fix (crash, LOGICAL_ERROR, data loss, RBAC) + +* Backported in [#64587](https://github.com/ClickHouse/ClickHouse/issues/64587): Disabled `enable_vertical_final` setting by default. This feature should not be used because it has a bug: [#64543](https://github.com/ClickHouse/ClickHouse/issues/64543). [#64544](https://github.com/ClickHouse/ClickHouse/pull/64544) ([Alexander Tokmakov](https://github.com/tavplubix)). +* Backported in [#64878](https://github.com/ClickHouse/ClickHouse/issues/64878): This PR fixes an error when a user in a specific situation can escalate their privileges on the default database without necessary grants. [#64769](https://github.com/ClickHouse/ClickHouse/pull/64769) ([pufit](https://github.com/pufit)). + +#### NO CL CATEGORY + +* Backported in [#63304](https://github.com/ClickHouse/ClickHouse/issues/63304):. [#63297](https://github.com/ClickHouse/ClickHouse/pull/63297) ([Alexey Milovidov](https://github.com/alexey-milovidov)). +* Backported in [#63708](https://github.com/ClickHouse/ClickHouse/issues/63708):. [#63415](https://github.com/ClickHouse/ClickHouse/pull/63415) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). + +#### NO CL ENTRY + +* NO CL ENTRY: 'Revert "Backport [#64363](https://github.com/ClickHouse/ClickHouse/issues/64363) to 24.3: Split tests 03039_dynamic_all_merge_algorithms to avoid timeouts"'. [#64907](https://github.com/ClickHouse/ClickHouse/pull/64907) ([Raúl Marín](https://github.com/Algunenano)). + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Backported in [#63751](https://github.com/ClickHouse/ClickHouse/issues/63751): group_by_use_nulls strikes back. [#62922](https://github.com/ClickHouse/ClickHouse/pull/62922) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Backported in [#63558](https://github.com/ClickHouse/ClickHouse/issues/63558): Try fix segfault in `MergeTreeReadPoolBase::createTask`. [#63323](https://github.com/ClickHouse/ClickHouse/pull/63323) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#63336](https://github.com/ClickHouse/ClickHouse/issues/63336): The commit url has different pattern. [#63331](https://github.com/ClickHouse/ClickHouse/pull/63331) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#63374](https://github.com/ClickHouse/ClickHouse/issues/63374): Add tags for the test 03000_traverse_shadow_system_data_paths.sql to make it stable. [#63366](https://github.com/ClickHouse/ClickHouse/pull/63366) ([Aleksei Filatov](https://github.com/aalexfvk)). +* Backported in [#63625](https://github.com/ClickHouse/ClickHouse/issues/63625): Workaround for `oklch()` inside canvas bug for firefox. 
[#63404](https://github.com/ClickHouse/ClickHouse/pull/63404) ([Sergei Trifonov](https://github.com/serxa)). +* Backported in [#63569](https://github.com/ClickHouse/ClickHouse/issues/63569): Add `jwcrypto` to integration tests runner. [#63551](https://github.com/ClickHouse/ClickHouse/pull/63551) ([Konstantin Bogdanov](https://github.com/thevar1able)). +* Backported in [#63649](https://github.com/ClickHouse/ClickHouse/issues/63649): Fix `02362_part_log_merge_algorithm` flaky test. [#63635](https://github.com/ClickHouse/ClickHouse/pull/63635) ([Miсhael Stetsyuk](https://github.com/mstetsyuk)). +* Backported in [#63762](https://github.com/ClickHouse/ClickHouse/issues/63762): Cancel S3 reads properly when parallel reads are used. [#63687](https://github.com/ClickHouse/ClickHouse/pull/63687) ([Antonio Andelic](https://github.com/antonio2368)). +* Backported in [#63741](https://github.com/ClickHouse/ClickHouse/issues/63741): Userspace page cache: don't collect stats if cache is unused. [#63730](https://github.com/ClickHouse/ClickHouse/pull/63730) ([Michael Kolupaev](https://github.com/al13n321)). +* Backported in [#63826](https://github.com/ClickHouse/ClickHouse/issues/63826): Fix `test_odbc_interaction` for arm64 on linux. [#63787](https://github.com/ClickHouse/ClickHouse/pull/63787) ([alesapin](https://github.com/alesapin)). +* Backported in [#63895](https://github.com/ClickHouse/ClickHouse/issues/63895): Fix `test_catboost_evaluate` for aarch64. [#63789](https://github.com/ClickHouse/ClickHouse/pull/63789) ([alesapin](https://github.com/alesapin)). +* Backported in [#63887](https://github.com/ClickHouse/ClickHouse/issues/63887): Fix `test_disk_types` for aarch64. [#63832](https://github.com/ClickHouse/ClickHouse/pull/63832) ([alesapin](https://github.com/alesapin)). +* Backported in [#63879](https://github.com/ClickHouse/ClickHouse/issues/63879): Fix `test_short_strings_aggregation` for arm. [#63836](https://github.com/ClickHouse/ClickHouse/pull/63836) ([alesapin](https://github.com/alesapin)). +* Backported in [#63916](https://github.com/ClickHouse/ClickHouse/issues/63916): Disable `test_non_default_compression/test.py::test_preconfigured_deflateqpl_codec` on arm. [#63839](https://github.com/ClickHouse/ClickHouse/pull/63839) ([alesapin](https://github.com/alesapin)). +* Backported in [#63969](https://github.com/ClickHouse/ClickHouse/issues/63969): fix 02124_insert_deduplication_token_multiple_blocks. [#63950](https://github.com/ClickHouse/ClickHouse/pull/63950) ([Han Fei](https://github.com/hanfei1991)). +* Backported in [#64047](https://github.com/ClickHouse/ClickHouse/issues/64047): Do not create new release in release branch automatically. [#64039](https://github.com/ClickHouse/ClickHouse/pull/64039) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#64076](https://github.com/ClickHouse/ClickHouse/issues/64076): Files without shebang have mime 'text/plain' or 'inode/x-empty'. [#64062](https://github.com/ClickHouse/ClickHouse/pull/64062) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#64142](https://github.com/ClickHouse/ClickHouse/issues/64142): Fix sanitizers. [#64090](https://github.com/ClickHouse/ClickHouse/pull/64090) ([Azat Khuzhin](https://github.com/azat)). +* Backported in [#64159](https://github.com/ClickHouse/ClickHouse/issues/64159): Add retries in `git submodule update`. [#64125](https://github.com/ClickHouse/ClickHouse/pull/64125) ([Alexey Milovidov](https://github.com/alexey-milovidov)). 
+* Backported in [#64473](https://github.com/ClickHouse/ClickHouse/issues/64473): Split tests 03039_dynamic_all_merge_algorithms to avoid timeouts. [#64363](https://github.com/ClickHouse/ClickHouse/pull/64363) ([Kruglov Pavel](https://github.com/Avogar)). +* Backported in [#65113](https://github.com/ClickHouse/ClickHouse/issues/65113): Adjust the `version_helper` and script to a new release scheme. [#64759](https://github.com/ClickHouse/ClickHouse/pull/64759) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#64999](https://github.com/ClickHouse/ClickHouse/issues/64999): Fix crash with DISTINCT and window functions. [#64767](https://github.com/ClickHouse/ClickHouse/pull/64767) ([Igor Nikonov](https://github.com/devcrafter)). + diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 2f96daf4887..fe4adf1b446 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,6 +1,8 @@ +v24.5.2.34-stable 2024-06-13 v24.5.1.1763-stable 2024-06-01 v24.4.2.141-stable 2024-06-07 v24.4.1.2088-stable 2024-05-01 +v24.3.4.147-lts 2024-06-13 v24.3.3.102-lts 2024-05-01 v24.3.2.23-lts 2024-04-03 v24.3.1.2672-lts 2024-03-27 From 11020f71f30389991bde012fc33d8dc0d05fde80 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 13 Jun 2024 13:38:53 +0000 Subject: [PATCH 200/215] Update version_date.tsv and changelogs after v24.5.2.34-stable --- docs/changelogs/v24.5.2.34-stable.md | 38 ++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 docs/changelogs/v24.5.2.34-stable.md diff --git a/docs/changelogs/v24.5.2.34-stable.md b/docs/changelogs/v24.5.2.34-stable.md new file mode 100644 index 00000000000..2db05a5f5dc --- /dev/null +++ b/docs/changelogs/v24.5.2.34-stable.md @@ -0,0 +1,38 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.5.2.34-stable (45589aeee49) FIXME as compared to v24.5.1.1763-stable (647c154a94d) + +#### Improvement +* Backported in [#65096](https://github.com/ClickHouse/ClickHouse/issues/65096): The setting `allow_experimental_join_condition` was accidentally marked as important which may prevent distributed queries in a mixed versions cluster from being executed successfully. [#65008](https://github.com/ClickHouse/ClickHouse/pull/65008) ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). + +#### Build/Testing/Packaging Improvement +* Backported in [#65132](https://github.com/ClickHouse/ClickHouse/issues/65132): Decrease the `unit-test` image a few times. [#65102](https://github.com/ClickHouse/ClickHouse/pull/65102) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). + +#### Bug Fix (user-visible misbehavior in an official stable release) + +* Backported in [#64729](https://github.com/ClickHouse/ClickHouse/issues/64729): Fixed `CREATE TABLE AS` queries for tables with default expressions. [#64455](https://github.com/ClickHouse/ClickHouse/pull/64455) ([Anton Popov](https://github.com/CurtizJ)). +* Backported in [#65061](https://github.com/ClickHouse/ClickHouse/issues/65061): Fix the `Expression nodes list expected 1 projection names` and `Unknown expression or identifier` errors for queries with aliases to `GLOBAL IN.` Fixes [#64445](https://github.com/ClickHouse/ClickHouse/issues/64445). [#64517](https://github.com/ClickHouse/ClickHouse/pull/64517) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). 
+* Backported in [#65088](https://github.com/ClickHouse/ClickHouse/issues/65088): Fix removing the `WHERE` and `PREWHERE` expressions, which are always true (for the new analyzer). Fixes [#64575](https://github.com/ClickHouse/ClickHouse/issues/64575). [#64695](https://github.com/ClickHouse/ClickHouse/pull/64695) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
+* Backported in [#64944](https://github.com/ClickHouse/ClickHouse/issues/64944): Fix OrderByLimitByDuplicateEliminationVisitor across subqueries. [#64766](https://github.com/ClickHouse/ClickHouse/pull/64766) ([Raúl Marín](https://github.com/Algunenano)).
+* Backported in [#64873](https://github.com/ClickHouse/ClickHouse/issues/64873): Fixed possible incorrect memory tracking in several kinds of queries: queries that read any data from S3, queries via http protocol, asynchronous inserts. [#64844](https://github.com/ClickHouse/ClickHouse/pull/64844) ([Anton Popov](https://github.com/CurtizJ)).
+* Backported in [#64984](https://github.com/ClickHouse/ClickHouse/issues/64984): Fix the `Block structure mismatch` error for queries reading with `PREWHERE` from the materialized view when the materialized view has columns of different types than the source table. Fixes [#64611](https://github.com/ClickHouse/ClickHouse/issues/64611). [#64855](https://github.com/ClickHouse/ClickHouse/pull/64855) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
+* Backported in [#64976](https://github.com/ClickHouse/ClickHouse/issues/64976): Fix rare crash when table has TTL with subquery + database replicated + parallel replicas + analyzer. It's really rare, but please don't use TTLs with subqueries. [#64858](https://github.com/ClickHouse/ClickHouse/pull/64858) ([alesapin](https://github.com/alesapin)).
+* Backported in [#65074](https://github.com/ClickHouse/ClickHouse/issues/65074): Fix `ALTER MODIFY COMMENT` query that was broken for parameterized VIEWs in https://github.com/ClickHouse/ClickHouse/pull/54211. [#65031](https://github.com/ClickHouse/ClickHouse/pull/65031) ([Nikolay Degterinsky](https://github.com/evillique)).
+* Backported in [#65179](https://github.com/ClickHouse/ClickHouse/issues/65179): Fix the `Unknown expression identifier` error for remote queries with `INTERPOLATE (alias)` (new analyzer). Fixes [#64636](https://github.com/ClickHouse/ClickHouse/issues/64636). [#65090](https://github.com/ClickHouse/ClickHouse/pull/65090) ([Nikolai Kochetov](https://github.com/KochetovNicolai)).
+* Backported in [#65163](https://github.com/ClickHouse/ClickHouse/issues/65163): Fix pushing arithmetic operations out of aggregation. In the new analyzer, optimization was applied only once. Part of [#62245](https://github.com/ClickHouse/ClickHouse/issues/62245). [#65104](https://github.com/ClickHouse/ClickHouse/pull/65104) ([Dmitry Novik](https://github.com/novikd)).
+
+#### Critical Bug Fix (crash, LOGICAL_ERROR, data loss, RBAC)
+
+* Backported in [#64882](https://github.com/ClickHouse/ClickHouse/issues/64882): This PR fixes an error when a user in a specific situation can escalate their privileges on the default database without necessary grants. [#64769](https://github.com/ClickHouse/ClickHouse/pull/64769) ([pufit](https://github.com/pufit)).
+
+#### NOT FOR CHANGELOG / INSIGNIFICANT
+
+* Backported in [#65002](https://github.com/ClickHouse/ClickHouse/issues/65002): Be more graceful with existing tables with `inverted` indexes.
[#64656](https://github.com/ClickHouse/ClickHouse/pull/64656) ([Robert Schulze](https://github.com/rschu1ze)). +* Backported in [#65115](https://github.com/ClickHouse/ClickHouse/issues/65115): Adjust the `version_helper` and script to a new release scheme. [#64759](https://github.com/ClickHouse/ClickHouse/pull/64759) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Backported in [#64796](https://github.com/ClickHouse/ClickHouse/issues/64796): Fix crash with DISTINCT and window functions. [#64767](https://github.com/ClickHouse/ClickHouse/pull/64767) ([Igor Nikonov](https://github.com/devcrafter)). + From 3bba8e80f1dc55226bf99c7a2e238585e1afed60 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 13 Jun 2024 17:51:46 +0000 Subject: [PATCH 201/215] Update version_date.tsv and changelogs after v24.5.3.5-stable --- docker/keeper/Dockerfile | 2 +- docker/server/Dockerfile.alpine | 2 +- docker/server/Dockerfile.ubuntu | 2 +- docs/changelogs/v24.5.3.5-stable.md | 14 ++++++++++++++ utils/list-versions/version_date.tsv | 3 +++ 5 files changed, 20 insertions(+), 3 deletions(-) create mode 100644 docs/changelogs/v24.5.3.5-stable.md diff --git a/docker/keeper/Dockerfile b/docker/keeper/Dockerfile index b3271d94184..24f38740ff5 100644 --- a/docker/keeper/Dockerfile +++ b/docker/keeper/Dockerfile @@ -34,7 +34,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="24.5.1.1763" +ARG VERSION="24.5.3.5" ARG PACKAGES="clickhouse-keeper" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 3f3b880c8f3..c71319a2a7e 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -32,7 +32,7 @@ RUN arch=${TARGETARCH:-amd64} \ # lts / testing / prestable / etc ARG REPO_CHANNEL="stable" ARG REPOSITORY="https://packages.clickhouse.com/tgz/${REPO_CHANNEL}" -ARG VERSION="24.5.1.1763" +ARG VERSION="24.5.3.5" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" ARG DIRECT_DOWNLOAD_URLS="" diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index 5fd22ee9b51..ed8cf3d657d 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -28,7 +28,7 @@ RUN sed -i "s|http://archive.ubuntu.com|${apt_archive}|g" /etc/apt/sources.list ARG REPO_CHANNEL="stable" ARG REPOSITORY="deb [signed-by=/usr/share/keyrings/clickhouse-keyring.gpg] https://packages.clickhouse.com/deb ${REPO_CHANNEL} main" -ARG VERSION="24.5.1.1763" +ARG VERSION="24.5.3.5" ARG PACKAGES="clickhouse-client clickhouse-server clickhouse-common-static" #docker-official-library:off diff --git a/docs/changelogs/v24.5.3.5-stable.md b/docs/changelogs/v24.5.3.5-stable.md new file mode 100644 index 00000000000..4606e58d0a4 --- /dev/null +++ b/docs/changelogs/v24.5.3.5-stable.md @@ -0,0 +1,14 @@ +--- +sidebar_position: 1 +sidebar_label: 2024 +--- + +# 2024 Changelog + +### ClickHouse release v24.5.3.5-stable (e0eb66f8e17) FIXME as compared to v24.5.2.34-stable (45589aeee49) + +#### NOT FOR CHANGELOG / INSIGNIFICANT + +* Backported in [#65227](https://github.com/ClickHouse/ClickHouse/issues/65227): Capture weak_ptr of ContextAccess for safety. [#65051](https://github.com/ClickHouse/ClickHouse/pull/65051) ([Alexander Gololobov](https://github.com/davenger)). 
+* Backported in [#65219](https://github.com/ClickHouse/ClickHouse/issues/65219): Fix false positives leaky memory warnings in OpenSSL. [#65125](https://github.com/ClickHouse/ClickHouse/pull/65125) ([Robert Schulze](https://github.com/rschu1ze)). + diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 2f96daf4887..719c25bdc95 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,6 +1,9 @@ +v24.5.3.5-stable 2024-06-13 +v24.5.2.34-stable 2024-06-13 v24.5.1.1763-stable 2024-06-01 v24.4.2.141-stable 2024-06-07 v24.4.1.2088-stable 2024-05-01 +v24.3.4.147-lts 2024-06-13 v24.3.3.102-lts 2024-05-01 v24.3.2.23-lts 2024-04-03 v24.3.1.2672-lts 2024-03-27 From 5bc879c07c77f4ddaf4498ba6db52ecffc49fb3e Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 13 Jun 2024 16:40:01 +0200 Subject: [PATCH 202/215] comment --- tests/ci/finish_check.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/tests/ci/finish_check.py b/tests/ci/finish_check.py index 0d59c3b43a4..904b565ad86 100644 --- a/tests/ci/finish_check.py +++ b/tests/ci/finish_check.py @@ -81,17 +81,12 @@ def main(): has_pending = False error_cnt = 0 for status in statuses: - if status.context in (StatusNames.MERGEABLE, StatusNames.CI): + if status.context in (StatusNames.MERGEABLE, StatusNames.CI, StatusNames.SYNC): # do not account these statuses continue if status.state == PENDING: - if status.context == StatusNames.SYNC: - # do not account sync status if pending - it's a different WF - continue has_pending = True - elif status.state == SUCCESS: - continue - else: + elif status.state != SUCCESS: has_failure = True error_cnt += 1 @@ -108,16 +103,15 @@ def main(): description = "ERROR: workflow has pending jobs" ci_state = FAILURE - if ci_status.state == PENDING: - post_commit_status( - commit, - ci_state, - ci_status.target_url, - description, - StatusNames.CI, - pr_info, - dump_to_file=True, - ) + post_commit_status( + commit, + ci_state, + ci_status.target_url, + description, + StatusNames.CI, + pr_info, + dump_to_file=True, + ) if __name__ == "__main__": From 02677892e33fef568df1b1317d766e81e9c8d9ba Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 13 Jun 2024 21:03:02 +0200 Subject: [PATCH 203/215] Fix libunwind in CI --- docker/test/fuzzer/run-fuzzer.sh | 1 + docker/test/stateless/attach_gdb.lib | 1 + 2 files changed, 2 insertions(+) diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index b8f967ed9c2..6191aeaf304 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -208,6 +208,7 @@ handle SIGPIPE nostop noprint pass handle SIGTERM nostop noprint pass handle SIGUSR1 nostop noprint pass handle SIGUSR2 nostop noprint pass +handle SIGSEGV nostop pass handle SIG$RTMIN nostop noprint pass info signals continue diff --git a/docker/test/stateless/attach_gdb.lib b/docker/test/stateless/attach_gdb.lib index d288288bb17..eb54f920b98 100644 --- a/docker/test/stateless/attach_gdb.lib +++ b/docker/test/stateless/attach_gdb.lib @@ -20,6 +20,7 @@ handle SIGPIPE nostop noprint pass handle SIGTERM nostop noprint pass handle SIGUSR1 nostop noprint pass handle SIGUSR2 nostop noprint pass +handle SIGSEGV nostop pass handle SIG$RTMIN nostop noprint pass info signals continue From dfcb36506273b9c401447a3fdc0eabc2dee90f6a Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 13 Jun 2024 21:53:55 +0200 Subject: [PATCH 204/215] CI: Do not skip FinishCheck in Merge Queue --- 
.github/workflows/merge_queue.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/merge_queue.yml b/.github/workflows/merge_queue.yml index c8b2452829b..5f8ff407e93 100644 --- a/.github/workflows/merge_queue.yml +++ b/.github/workflows/merge_queue.yml @@ -99,7 +99,7 @@ jobs: ################################# Stage Final ################################# # FinishCheck: - if: ${{ !failure() && !cancelled() }} + if: ${{ !cancelled() }} needs: [RunConfig, BuildDockers, StyleCheck, FastTest, Builds_1, Tests_1] runs-on: [self-hosted, style-checker-aarch64] steps: From 99ce17fb2b0a1b5c7787470690f8e14c11005101 Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 13 Jun 2024 22:15:39 +0200 Subject: [PATCH 205/215] style fix --- .github/workflows/pull_request.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 079208eb65a..66ca3381a40 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -191,7 +191,7 @@ jobs: - name: Finish label run: | cd "$GITHUB_WORKSPACE/tests/ci" - python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} + python3 finish_check.py --wf-status ${{ contains(needs.*.result, 'failure') && 'failure' || 'success' }} ############################################################################################# ###################################### JEPSEN TESTS ######################################### From 87f451d7641e1f2b5392eeddf4c0655bae58236b Mon Sep 17 00:00:00 2001 From: Max K Date: Thu, 13 Jun 2024 22:36:57 +0200 Subject: [PATCH 206/215] Revert "Change default s3_throw_on_zero_files_match to true, document that presigned S3 URLs are not supported" --- docs/en/sql-reference/table-functions/s3.md | 11 +--------- src/Core/Settings.h | 6 +++--- src/Core/SettingsChangesHistory.h | 6 ++---- .../ObjectStorage/Azure/Configuration.cpp | 1 - .../ObjectStorage/HDFS/Configuration.cpp | 1 - .../ObjectStorage/S3/Configuration.cpp | 1 - .../ObjectStorage/StorageObjectStorage.h | 1 - .../StorageObjectStorageSource.cpp | 12 +++-------- .../StorageObjectStorageSource.h | 2 -- src/Storages/S3Queue/StorageS3Queue.cpp | 2 +- tests/integration/test_storage_hdfs/test.py | 21 ++++++------------- ...02481_s3_throw_if_mismatch_files.reference | 4 ++-- .../02481_s3_throw_if_mismatch_files.sql | 4 ++-- ...ed_url_and_url_with_special_characters.sql | 4 ++-- .../aspell-ignore/en/aspell-dict.txt | 5 ++--- 15 files changed, 24 insertions(+), 57 deletions(-) diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 7538d66996f..1a7e2b8d66a 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -248,6 +248,7 @@ FROM s3( LIMIT 5; ``` + ## Working with archives Suppose that we have several archive files with following URIs on S3: @@ -265,16 +266,6 @@ FROM s3( ); ``` -## Presigned URL - -Presigned URLs are currently not supported. 
Use `url()` table function instead: -```sql -SELECT * -FROM url( - 'https://example.amazonaws.com/f.csv?X-Amz-Security-Token=[...]' -) -``` - ## Virtual Columns {#virtual-columns} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index efa84f19f78..b3e83092a77 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -115,9 +115,9 @@ class IColumn; M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \ M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage", 0) \ M(Bool, azure_allow_parallel_part_upload, true, "Use multiple threads for azure multipart upload.", 0) \ - M(Bool, s3_throw_on_zero_files_match, true, "Throw an error, when ListObjects request cannot match any files", 0) \ - M(Bool, hdfs_throw_on_zero_files_match, true, "Throw an error, when ListObjects request cannot match any files", 0) \ - M(Bool, azure_throw_on_zero_files_match, true, "Throw an error, when ListObjects request cannot match any files", 0) \ + M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ + M(Bool, hdfs_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ + M(Bool, azure_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \ M(Bool, s3_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in S3 table engine", 0) \ M(Bool, hdfs_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in HDFS table engine", 0) \ M(Bool, azure_ignore_file_doesnt_exist, false, "Return 0 rows when the requested files don't exist, instead of throwing an exception in AzureBlobStorage table engine", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 31da77fddaf..69bc8c5d207 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -88,9 +88,8 @@ static const std::map StorageObjectStorageSourc iterator = std::make_unique( object_storage, configuration, predicate, virtual_columns, local_context, is_archive ? 
nullptr : read_keys, settings.list_object_keys_size, - settings.throw_on_zero_files_match, settings.throw_on_zero_files_match_setting_name, - file_progress_callback); + settings.throw_on_zero_files_match, file_progress_callback); } else { @@ -426,7 +425,6 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( ObjectInfos * read_keys_, size_t list_object_keys_size, bool throw_on_zero_files_match_, - const char * throw_on_zero_files_match_setting_name_, std::function file_progress_callback_) : IIterator("GlobIterator") , WithContext(context_) @@ -434,7 +432,6 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( , configuration(configuration_) , virtual_columns(virtual_columns_) , throw_on_zero_files_match(throw_on_zero_files_match_) - , throw_on_zero_files_match_setting_name(throw_on_zero_files_match_setting_name_) , read_keys(read_keys_) , file_progress_callback(file_progress_callback_) { @@ -487,11 +484,8 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne if (first_iteration && !object_info && throw_on_zero_files_match) { throw Exception(ErrorCodes::FILE_DOESNT_EXIST, - "Can not match any files with path {}{}", - configuration->getPath(), - throw_on_zero_files_match_setting_name - ? fmt::format(" (this error can be suppressed by setting {} = false)", throw_on_zero_files_match_setting_name) - : ""); + "Can not match any files with path {}", + configuration->getPath()); } first_iteration = false; return object_info; diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 5e76d8e979f..fd7c7aa7102 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -168,7 +168,6 @@ public: ObjectInfos * read_keys_, size_t list_object_keys_size, bool throw_on_zero_files_match_, - const char * throw_on_zero_files_match_setting_name_, std::function file_progress_callback_ = {}); ~GlobIterator() override = default; @@ -185,7 +184,6 @@ private: const ConfigurationPtr configuration; const NamesAndTypesList virtual_columns; const bool throw_on_zero_files_match; - const char * throw_on_zero_files_match_setting_name; size_t index = 0; diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 7e26335c691..afb75a21b21 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -486,7 +486,7 @@ std::shared_ptr StorageS3Queue::createFileIterator { auto settings = configuration->getQuerySettings(local_context); auto glob_iterator = std::make_unique( - object_storage, configuration, predicate, getVirtualsList(), local_context, nullptr, settings.list_object_keys_size, settings.throw_on_zero_files_match, settings.throw_on_zero_files_match_setting_name); + object_storage, configuration, predicate, getVirtualsList(), local_context, nullptr, settings.list_object_keys_size, settings.throw_on_zero_files_match); return std::make_shared(files_metadata, std::move(glob_iterator), shutdown_called, log); } diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 818a1e7447d..47d8f44c0b7 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -111,7 +111,7 @@ def test_storage_with_multidirectory_glob(started_cluster): try: node1.query( - "SELECT * FROM hdfs('hdfs://hdfs1:9000/multiglob/{p4/path1,p2/path3}/postfix/data{1,2}.nonexist', TSV) SETTINGS 
hdfs_throw_on_zero_files_match=0" + "SELECT * FROM hdfs('hdfs://hdfs1:9000/multiglob/{p4/path1,p2/path3}/postfix/data{1,2}.nonexist', TSV)" ) assert False, "Exception have to be thrown" except Exception as ex: @@ -220,22 +220,14 @@ def test_globs_in_read_table(started_cluster): ) print("inside_table_func ", inside_table_func) assert ( - node1.query( - "select * from hdfs(" - + inside_table_func - + ") settings hdfs_throw_on_zero_files_match=0" - ) + node1.query("select * from hdfs(" + inside_table_func + ")") == paths_amount * some_data ) assert node1.query( - "select count(distinct _path) from hdfs(" - + inside_table_func - + ") settings hdfs_throw_on_zero_files_match=0" + "select count(distinct _path) from hdfs(" + inside_table_func + ")" ).rstrip() == str(paths_amount) assert node1.query( - "select count(distinct _file) from hdfs(" - + inside_table_func - + ") settings hdfs_throw_on_zero_files_match=0" + "select count(distinct _file) from hdfs(" + inside_table_func + ")" ).rstrip() == str(files_amount) @@ -643,7 +635,6 @@ def test_cluster_join(started_cluster): SELECT l.id,r.id FROM hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') as l JOIN hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') as r ON l.id = r.id - SETTINGS hdfs_throw_on_zero_files_match=0 """ ) assert "AMBIGUOUS_COLUMN_NAME" not in result @@ -652,13 +643,13 @@ def test_cluster_join(started_cluster): def test_cluster_macro(started_cluster): with_macro = node1.query( """ - SELECT id FROM hdfsCluster('{default_cluster_macro}', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') SETTINGS hdfs_throw_on_zero_files_match=0 + SELECT id FROM hdfsCluster('{default_cluster_macro}', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') """ ) no_macro = node1.query( """ - SELECT id FROM hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') SETTINGS hdfs_throw_on_zero_files_match=0 + SELECT id FROM hdfsCluster('test_cluster_two_shards', 'hdfs://hdfs1:9000/test_hdfsCluster/file*', 'TSV', 'id UInt32') """ ) diff --git a/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.reference b/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.reference index 752b12ff3bd..a7096a686f5 100644 --- a/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.reference +++ b/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.reference @@ -3,5 +3,5 @@ drop table if exists test_02481_mismatch_files; create table test_02481_mismatch_files (a UInt64, b String) engine = S3(s3_conn, filename='test_02481_mismatch_files_{_partition_id}', format=Parquet) partition by a; set s3_truncate_on_insert=1; insert into test_02481_mismatch_files values (1, 'a'), (22, 'b'), (333, 'c'); -select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet) settings s3_throw_on_zero_files_match=0; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } -select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet); -- { serverError FILE_DOESNT_EXIST } +select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } +select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet) settings s3_throw_on_zero_files_match=1; -- { serverError FILE_DOESNT_EXIST } diff --git a/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.sql 
b/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.sql index cd500b58946..7ec1d3ebd5f 100644 --- a/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.sql +++ b/tests/queries/0_stateless/02481_s3_throw_if_mismatch_files.sql @@ -7,6 +7,6 @@ create table test_02481_mismatch_files (a UInt64, b String) engine = S3(s3_conn, set s3_truncate_on_insert=1; insert into test_02481_mismatch_files values (1, 'a'), (22, 'b'), (333, 'c'); -select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet) settings s3_throw_on_zero_files_match=0; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } +select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } -select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet); -- { serverError FILE_DOESNT_EXIST } +select a, b from s3(s3_conn, filename='test_02481_mismatch_filesxxx*', format=Parquet) settings s3_throw_on_zero_files_match=1; -- { serverError FILE_DOESNT_EXIST } diff --git a/tests/queries/0_stateless/02873_s3_presigned_url_and_url_with_special_characters.sql b/tests/queries/0_stateless/02873_s3_presigned_url_and_url_with_special_characters.sql index 078a5701aca..1e99eb8b83d 100644 --- a/tests/queries/0_stateless/02873_s3_presigned_url_and_url_with_special_characters.sql +++ b/tests/queries/0_stateless/02873_s3_presigned_url_and_url_with_special_characters.sql @@ -1,5 +1,5 @@ -- Tags: no-fasttest -select * from s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/MyPrefix/BU%20-%20UNIT%20-%201/*.parquet', NOSIGN) settings s3_throw_on_zero_files_match=0; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } +select * from s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/MyPrefix/BU%20-%20UNIT%20-%201/*.parquet'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } -select * from s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/MyPrefix/*.parquet?some_tocken=ABCD', NOSIGN) settings s3_throw_on_zero_files_match=0; -- { serverError CANNOT_DETECT_FORMAT } +select * from s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/MyPrefix/*.parquet?some_tocken=ABCD'); -- { serverError CANNOT_DETECT_FORMAT } diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index c4b70de1f65..84682689934 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1,4 +1,4 @@ -personal_ws-1.1 en 2912 +personal_ws-1.1 en 2758 AArch ACLs ALTERs @@ -722,7 +722,6 @@ Postgres PostgresSQL Precompiled Preprocess -Presigned PrettyCompact PrettyCompactMonoBlock PrettyCompactNoEscapes @@ -1937,9 +1936,9 @@ loghouse london lookups loongarch +lowcardinality lowCardinalityIndices lowCardinalityKeys -lowcardinality lowerUTF lowercased lttb From cd2d825dd10d3b9ed9a13cb6d7df803a1ee84d5f Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Thu, 13 Jun 2024 18:52:40 +0100 Subject: [PATCH 207/215] impl --- src/Interpreters/HashJoin.cpp | 12 +++++++++--- src/Interpreters/HashJoin.h | 3 +++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 75da8bbc3e7..1c2a054b2a5 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -869,6 +869,7 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits) || (min_rows_to_compress && getTotalRowCount() >= min_rows_to_compress))) { 
block_to_save = block_to_save.compress(); + have_compressed = true; } data->blocks_allocated_size += block_to_save.allocatedBytes(); @@ -2317,14 +2318,19 @@ void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed) } }; - for (const Block & compressed_block_right : data->blocks) + for (const Block & block_right : data->blocks) { ++block_number; if (block_number < start_right_block) continue; - auto block_right = compressed_block_right.decompress(); - process_right_block(block_right); + /// The following statement cannot be substituted with `process_right_block(!have_compressed ? block_right : block_right.decompress())` + /// because it will lead to copying of `block_right` even if its branch is taken (because common type of `block_right` and `block_right.decompress()` is `Block`). + if (!have_compressed) + process_right_block(block_right); + else + process_right_block(block_right.decompress()); + if (rows_added > max_joined_block_rows) { break; diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index a0996556f9a..56a1768a7ff 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -434,7 +434,10 @@ private: /// Changes in hash table broke correspondence, /// so we must guarantee constantness of hash table during HashJoin lifetime (using method setLock) mutable JoinStuff::JoinUsedFlags used_flags; + RightTableDataPtr data; + bool have_compressed = false; + std::vector key_sizes; /// Needed to do external cross join From 04897eb0faf2c489197a2727f41dd9d9b2033fa8 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 12 Jun 2024 13:42:28 +0000 Subject: [PATCH 208/215] Done --- src/Core/Settings.h | 3 + src/Core/SettingsChangesHistory.h | 3 + .../Serializations/SerializationTuple.cpp | 96 +++++++- src/Formats/EscapingRuleUtils.cpp | 2 +- src/Formats/FormatFactory.cpp | 3 + src/Formats/FormatSettings.h | 2 + tests/queries/0_stateless/00300_csv.reference | 6 +- .../01016_input_null_as_default.sh | 4 +- ...csv_best_effort_schema_inference.reference | 80 +++---- .../02969_auto_format_detection.reference | 209 +++++++++--------- .../02977_csv_format_support_tuple.sql | 4 + 11 files changed, 250 insertions(+), 162 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index b3e83092a77..bb5e8411a23 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1009,6 +1009,8 @@ class IColumn; M(Char, format_csv_delimiter, ',', "The character to be considered as a delimiter in CSV data. 
If setting with a string, a string has to have a length of 1.", 0) \ M(Bool, format_csv_allow_single_quotes, false, "If it is set to true, allow strings in single quotes.", 0) \ M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \ + M(Bool, format_csv_serialize_tuple_into_separate_columns, true, "If it set to true, then Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost)", 0) \ + M(Bool, format_csv_deserialize_separate_columns_into_tuple, true, "if it set to true, then separate columns written in CSV format can be deserialized to Tuple column.", 0) \ M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \ M(Bool, input_format_csv_allow_cr_end_of_line, false, "If it is set true, \\r will be allowed at end of line not followed by \\n", 0) \ M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices", 0) \ @@ -1047,6 +1049,7 @@ class IColumn; M(UInt64, input_format_max_bytes_to_read_for_schema_inference, 32 * 1024 * 1024, "The maximum bytes of data to read for automatic schema inference", 0) \ M(Bool, input_format_csv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in CSV format", 0) \ M(Bool, input_format_csv_try_infer_numbers_from_strings, false, "Try to infer numbers from string fields while schema inference in CSV format", 0) \ + M(Bool, input_format_csv_try_infer_strings_from_quoted_tuples, true, "Interpret quoted tuples in the input data as a value of type String.", 0) \ M(Bool, input_format_tsv_use_best_effort_in_schema_inference, true, "Use some tweaks and heuristics to infer schema in TSV format", 0) \ M(Bool, input_format_csv_detect_header, true, "Automatically detect header with names and types in CSV format", 0) \ M(Bool, input_format_csv_allow_whitespace_or_tab_as_delimiter, false, "Allow to use spaces and tabs(\\t) as field delimiter in the CSV strings", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 69bc8c5d207..c47893d79f8 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -172,6 +172,9 @@ static const std::mapserializeTextCSV(extractElementColumn(column, i), row_num, ostr, settings); + } + } + else + { + WriteBufferFromOwnString wb; + serializeText(column, row_num, wb, settings); + writeCSV(wb.str(), ostr); + } } void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - String s; - readCSV(s, istr, settings.csv); - ReadBufferFromString rb(s); - deserializeText(column, rb, settings, true); + if (settings.csv.deserialize_separate_columns_into_tuple) + { + addElementSafe(elems.size(), column, [&] + { + const size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + { + if (i != 0) + { + skipWhitespaceIfAny(istr); + assertChar(settings.csv.tuple_delimiter, istr); + skipWhitespaceIfAny(istr); + } + + auto & element_column = extractElementColumn(column, i); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + SerializationNullable::deserializeNullAsDefaultOrNestedTextCSV(element_column, istr, settings, elems[i]); + else + elems[i]->deserializeTextCSV(element_column, istr, settings); + } + return true; + }); + } + else + { + String s; + readCSV(s, istr, settings.csv); + ReadBufferFromString rb(s); + 
deserializeText(column, rb, settings, true); + } } bool SerializationTuple::tryDeserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const { - String s; - if (!tryReadCSV(s, istr, settings.csv)) - return false; - ReadBufferFromString rb(s); - return tryDeserializeText(column, rb, settings, true); + if (settings.csv.deserialize_separate_columns_into_tuple) + { + return addElementSafe(elems.size(), column, [&] + { + const size_t size = elems.size(); + for (size_t i = 0; i < size; ++i) + { + if (i != 0) + { + skipWhitespaceIfAny(istr); + if (!checkChar(settings.csv.tuple_delimiter, istr)) + return false; + skipWhitespaceIfAny(istr); + } + + auto & element_column = extractElementColumn(column, i); + if (settings.null_as_default && !isColumnNullableOrLowCardinalityNullable(element_column)) + { + if (!SerializationNullable::tryDeserializeNullAsDefaultOrNestedTextCSV(element_column, istr, settings, elems[i])) + return false; + } + else + { + if (!elems[i]->tryDeserializeTextCSV(element_column, istr, settings)) + return false; + } + } + + return true; + }); + } + else + { + String s; + if (!tryReadCSV(s, istr, settings.csv)) + return false; + ReadBufferFromString rb(s); + return tryDeserializeText(column, rb, settings, true); + } } struct SerializeBinaryBulkStateTuple : public ISerialization::SerializeBinaryBulkState diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 9577ca2a8df..10cf279bf7e 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -303,7 +303,7 @@ DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSet auto type = tryInferDataTypeForSingleField(data, format_settings); /// If we couldn't infer any type or it's a number and csv.try_infer_numbers_from_strings = 0, we determine it as a string. 
- if (!type || (isNumber(type) && !format_settings.csv.try_infer_numbers_from_strings)) + if (!type || (isTuple(type) && format_settings.csv.try_infer_strings_from_quoted_tuples) || (isNumber(type) && !format_settings.csv.try_infer_numbers_from_strings)) return std::make_shared(); return type; diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index e90986f2236..d2a9c723d4a 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -77,6 +77,8 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.avro.output_rows_in_file = settings.output_format_avro_rows_in_file; format_settings.csv.allow_double_quotes = settings.format_csv_allow_double_quotes; format_settings.csv.allow_single_quotes = settings.format_csv_allow_single_quotes; + format_settings.csv.serialize_tuple_into_separate_columns = settings.format_csv_serialize_tuple_into_separate_columns; + format_settings.csv.deserialize_separate_columns_into_tuple = settings.format_csv_deserialize_separate_columns_into_tuple; format_settings.csv.crlf_end_of_line = settings.output_format_csv_crlf_end_of_line; format_settings.csv.allow_cr_end_of_line = settings.input_format_csv_allow_cr_end_of_line; format_settings.csv.delimiter = settings.format_csv_delimiter; @@ -94,6 +96,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se format_settings.csv.allow_variable_number_of_columns = settings.input_format_csv_allow_variable_number_of_columns; format_settings.csv.use_default_on_bad_values = settings.input_format_csv_use_default_on_bad_values; format_settings.csv.try_infer_numbers_from_strings = settings.input_format_csv_try_infer_numbers_from_strings; + format_settings.csv.try_infer_strings_from_quoted_tuples = settings.input_format_csv_try_infer_strings_from_quoted_tuples; format_settings.hive_text.fields_delimiter = settings.input_format_hive_text_fields_delimiter; format_settings.hive_text.collection_items_delimiter = settings.input_format_hive_text_collection_items_delimiter; format_settings.hive_text.map_keys_delimiter = settings.input_format_hive_text_map_keys_delimiter; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 337aafbbe9c..8c571987e74 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -153,6 +153,8 @@ struct FormatSettings char delimiter = ','; bool allow_single_quotes = true; bool allow_double_quotes = true; + bool serialize_tuple_into_separate_columns = true; + bool deserialize_separate_columns_into_tuple = true; bool empty_as_default = false; bool crlf_end_of_line = false; bool allow_cr_end_of_line = false; diff --git a/tests/queries/0_stateless/00300_csv.reference b/tests/queries/0_stateless/00300_csv.reference index e7966a9e8d9..42cd22078c4 100644 --- a/tests/queries/0_stateless/00300_csv.reference +++ b/tests/queries/0_stateless/00300_csv.reference @@ -1,11 +1,11 @@ -"Hello, ""World""",123,"[1,2,3]","(456,['abc','def'])","Newline +"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline here" "x","y","z","a","b" -"Hello, ""World""",123,"[1,2,3]","(456,['abc','def'])","Newline +"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline here" "x","y","z","a","b" "String","UInt8","Array(UInt8)","Tuple(UInt16, Array(String))","String" -"Hello, ""World""",123,"[1,2,3]","(456,['abc','def'])","Newline +"Hello, ""World""",123,"[1,2,3]",456,"['abc','def']","Newline here" 0,"0","[]","2000-01-01","2000-01-01 00:00:00" 1,"1","[0]","2000-01-02","2000-01-01 
00:00:01" diff --git a/tests/queries/0_stateless/01016_input_null_as_default.sh b/tests/queries/0_stateless/01016_input_null_as_default.sh index 8d6a9a07435..24d93b2703c 100755 --- a/tests/queries/0_stateless/01016_input_null_as_default.sh +++ b/tests/queries/0_stateless/01016_input_null_as_default.sh @@ -11,8 +11,8 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE default_by_other_column (a Float32 DEFA echo 'CSV' echo '\N, 1, \N, "2019-07-22", "[10, 20, 30]", \N -1, world, 3, "2019-07-23", \N, "('\''tuple'\'', 3.14)" -2, \N, 123, \N, "[]", "('\''test'\'', 2.71828)" +1, world, 3, "2019-07-23", \N, tuple, 3.14 +2, \N, 123, \N, "[]", test, 2.71828 3, \N, \N, \N, \N, \N' | $CLICKHOUSE_CLIENT --input_format_null_as_default=1 --query="INSERT INTO null_as_default FORMAT CSV"; $CLICKHOUSE_CLIENT --query="SELECT * FROM null_as_default ORDER BY i"; $CLICKHOUSE_CLIENT --query="TRUNCATE TABLE null_as_default"; diff --git a/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference index 8ad0a566c62..777f1155f0c 100644 --- a/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference +++ b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference @@ -1,107 +1,107 @@ TSV -c1 Nullable(Int64) -c2 Nullable(String) -c3 Array(Nullable(Int64)) -c4 Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)) +c1 Nullable(Int64) +c2 Nullable(String) +c3 Array(Nullable(Int64)) +c4 Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)) 42 Some string [1,2,3,4] (1,2,3) 42 abcd [] (4,5,6) -c1 Nullable(String) +c1 Nullable(String) [({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)] [] [({}, [], 0)] [({}, [NULL], NULL)] [({}, [\'String3\'], NULL)] [({\'key3\': NULL}, []), NULL] -c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64))) +c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64))) [({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)] [] [({},[],0)] [({},[NULL],NULL)] [({},['String3'],NULL)] [({'key3':NULL},[],NULL)] -c1 Nullable(Bool) +c1 Nullable(Bool) true false \N -c1 Array(Nullable(Bool)) +c1 Array(Nullable(Bool)) [true,NULL] [] [NULL] [false] -c1 Nullable(String) +c1 Nullable(String) [] -c1 Nullable(String) +c1 Nullable(String) {} -c1 Nullable(String) +c1 Nullable(String) () -c1 Nullable(String) +c1 Nullable(String) [1, 2, 3 -c1 Nullable(String) +c1 Nullable(String) [(1, 2, 3 4)] -c1 Nullable(String) +c1 Nullable(String) [1, 2, 3 + 4] -c1 Nullable(String) +c1 Nullable(String) (1, 2, -c1 Nullable(String) +c1 Nullable(String) [1, Some trash, 42.2] -c1 Nullable(String) +c1 Nullable(String) [1, \'String\', {\'key\' : 2}] -c1 Nullable(String) +c1 Nullable(String) {\'key\' : 1, [1] : 10} -c1 Nullable(String) +c1 Nullable(String) {}{} -c1 Nullable(String) +c1 Nullable(String) [1, 2, 3 -c1 Nullable(String) +c1 Nullable(String) [abc, def] -c1 Array(Nullable(String)) +c1 Array(Nullable(String)) ['abc','def'] -c1 Nullable(String) +c1 Nullable(String) [\'string] -c1 Nullable(String) +c1 Nullable(String) \'string -c1 Nullable(Float64) +c1 Nullable(Float64) 42.42 -c1 Nullable(String) +c1 Nullable(String) 42.42sometrash -c1 Nullable(String) +c1 Nullable(String) [42.42sometrash, 42.42] CSV -c1 Nullable(String) -c2 Nullable(String) -c3 Array(Nullable(Int64)) -c4 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) +c1 
Nullable(String) +c2 Nullable(String) +c3 Array(Nullable(Int64)) +c4 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) 42 Some string [1,2,3,4] [(1,2,3)] 42\\ abcd [] [(4,5,6)] -c1 Nullable(String) +c1 Nullable(String) [({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)] [] [({}, [], 0)] [({}, [NULL], NULL)] [({}, [\'String3\'], NULL)] [({\'key3\': NULL}, []), NULL] -c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64))) +c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64))) [({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)] [] [({},[],0)] [({},[NULL],NULL)] [({},['String3'],NULL)] [({'key3':NULL},[],NULL)] -c1 Nullable(Bool) +c1 Nullable(Bool) true false \N -c1 Array(Nullable(Bool)) +c1 Array(Nullable(Bool)) [true,NULL] [] [NULL] [false] -c1 Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)) -(1,2,3) -c1 Nullable(String) +c1 Nullable(String) +(1, 2, 3) +c1 Nullable(String) 123.123 -c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) +c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) [(1,2,3)] -c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) +c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) [(1,2,3)] diff --git a/tests/queries/0_stateless/02969_auto_format_detection.reference b/tests/queries/0_stateless/02969_auto_format_detection.reference index 865db11defc..9fcaef54db1 100644 --- a/tests/queries/0_stateless/02969_auto_format_detection.reference +++ b/tests/queries/0_stateless/02969_auto_format_detection.reference @@ -1,122 +1,123 @@ Parquet -a Nullable(UInt64) -b Nullable(String) -c Array(Nullable(UInt64)) -d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) ORC -a Nullable(Int64) -b Nullable(String) -c Array(Nullable(Int64)) -d Tuple(\n a Nullable(Int64),\n b Nullable(String)) +a Nullable(Int64) +b Nullable(String) +c Array(Nullable(Int64)) +d Tuple(\n a Nullable(Int64),\n b Nullable(String)) Arrow -a Nullable(UInt64) -b Nullable(String) -c Array(Nullable(UInt64)) -d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) ArrowStream -a Nullable(UInt64) -b Nullable(String) -c Array(Nullable(UInt64)) -d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) Avro -a Int64 -b String -c Array(Int64) -d Tuple(\n a Int64,\n b String) +a Int64 +b String +c Array(Int64) +d Tuple(\n a Int64,\n b String) Native -a UInt64 -b String -c Array(UInt64) -d Tuple(\n a UInt64,\n b String) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) BSONEachRow -a Nullable(Int64) -b Nullable(String) -c Array(Nullable(Int64)) -d Tuple(\n a Nullable(Int64),\n b Nullable(String)) +a Nullable(Int64) +b Nullable(String) +c Array(Nullable(Int64)) +d Tuple(\n a Nullable(Int64),\n b Nullable(String)) JSONCompact -a UInt64 -b String -c Array(UInt64) -d Tuple(\n a UInt64,\n b String) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) Values -c1 Nullable(UInt64) -c2 Nullable(String) -c3 Array(Nullable(UInt64)) -c4 Tuple(Nullable(UInt64), Nullable(String)) +c1 Nullable(UInt64) +c2 
Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Tuple(Nullable(UInt64), Nullable(String)) TSKV -a Nullable(String) -b Nullable(String) -c Array(Nullable(UInt64)) -d Nullable(String) +a Nullable(String) +b Nullable(String) +c Array(Nullable(UInt64)) +d Nullable(String) JSONObjectEachRow -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) JSONColumns -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) JSONCompactColumns -c1 Nullable(String) -c2 Nullable(String) -c3 Array(Nullable(String)) -c4 Tuple(\n a Nullable(String),\n b Nullable(String)) +c1 Nullable(String) +c2 Nullable(String) +c3 Array(Nullable(String)) +c4 Tuple(\n a Nullable(String),\n b Nullable(String)) JSONCompact -a UInt64 -b String -c Array(UInt64) -d Tuple(\n a UInt64,\n b String) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) JSON -a UInt64 -b String -c Array(UInt64) -d Tuple(\n a UInt64,\n b String) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) TSV -c1 Nullable(UInt64) -c2 Nullable(String) -c3 Array(Nullable(UInt64)) -c4 Tuple(Nullable(UInt64), Nullable(String)) +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Tuple(Nullable(UInt64), Nullable(String)) CSV -c1 Nullable(UInt64) -c2 Nullable(String) -c3 Array(Nullable(UInt64)) -c4 Tuple(Nullable(UInt64), Nullable(String)) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) -a UInt64 -b String -c Array(UInt64) -d Tuple(\n a UInt64,\n b String) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Nullable(UInt64) +c5 Nullable(String) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) 1 -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) -a 
Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) diff --git a/tests/queries/0_stateless/02977_csv_format_support_tuple.sql b/tests/queries/0_stateless/02977_csv_format_support_tuple.sql index d00cc00e097..3e9a51049c7 100644 --- a/tests/queries/0_stateless/02977_csv_format_support_tuple.sql +++ b/tests/queries/0_stateless/02977_csv_format_support_tuple.sql @@ -1,5 +1,9 @@ -- Tags: no-parallel +SET format_csv_serialize_tuple_into_separate_columns = false; +SET format_csv_deserialize_separate_columns_into_tuple = false; +SET input_format_csv_try_infer_strings_from_quoted_tuples = false; + insert into function file('02977_1.csv') select '20240305', 1, ['s', 'd'], map('a', 2), tuple('222', 33, map('abc', 5)) SETTINGS engine_file_truncate_on_insert=1; desc file('02977_1.csv'); select * from file('02977_1.csv') settings max_threads=1; From a98ec2e0e9531446fd231ceb8a4fbc0c0ef0f62b Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 12 Jun 2024 15:58:29 +0200 Subject: [PATCH 209/215] Update src/Formats/EscapingRuleUtils.cpp Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- src/Formats/EscapingRuleUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index 10cf279bf7e..36d16d8d154 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -303,7 +303,7 @@ DataTypePtr tryInferDataTypeByEscapingRule(const String & field, const FormatSet auto type = tryInferDataTypeForSingleField(data, format_settings); /// If we couldn't infer any type or it's a number and csv.try_infer_numbers_from_strings = 0, we determine it as a string. - if (!type || (isTuple(type) && format_settings.csv.try_infer_strings_from_quoted_tuples) || (isNumber(type) && !format_settings.csv.try_infer_numbers_from_strings)) + if (!type || (format_settings.csv.try_infer_strings_from_quoted_tuples && isTuple(type)) || (!format_settings.csv.try_infer_numbers_from_strings && isNumber(type))) return std::make_shared(); return type; From e48ebf55293f8dceaf42b30dc4bbb4615efad92b Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 12 Jun 2024 14:34:12 +0000 Subject: [PATCH 210/215] Fixed build and tests --- src/Core/Settings.h | 4 ++-- src/Core/SettingsChangesHistory.h | 4 ++-- src/Formats/FormatFactory.cpp | 4 ++-- src/Formats/FormatSettings.h | 1 + .../0_stateless/00309_formats.reference | Bin 18545 -> 18537 bytes .../02977_csv_format_support_tuple.sql | 4 ++-- 6 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index bb5e8411a23..70cd5bf2a62 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1009,8 +1009,8 @@ class IColumn; M(Char, format_csv_delimiter, ',', "The character to be considered as a delimiter in CSV data. 
If setting with a string, a string has to have a length of 1.", 0) \ M(Bool, format_csv_allow_single_quotes, false, "If it is set to true, allow strings in single quotes.", 0) \ M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \ - M(Bool, format_csv_serialize_tuple_into_separate_columns, true, "If it set to true, then Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost)", 0) \ - M(Bool, format_csv_deserialize_separate_columns_into_tuple, true, "if it set to true, then separate columns written in CSV format can be deserialized to Tuple column.", 0) \ + M(Bool, output_format_csv_serialize_tuple_into_separate_columns, true, "If it set to true, then Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost)", 0) \ + M(Bool, input_format_csv_deserialize_separate_columns_into_tuple, true, "if it set to true, then separate columns written in CSV format can be deserialized to Tuple column.", 0) \ M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \ M(Bool, input_format_csv_allow_cr_end_of_line, false, "If it is set true, \\r will be allowed at end of line not followed by \\n", 0) \ M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index c47893d79f8..e73527c063d 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -172,8 +172,8 @@ static const std::map Date: Wed, 12 Jun 2024 15:37:16 +0000 Subject: [PATCH 211/215] Fix test --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 70cd5bf2a62..db560570f67 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1010,7 +1010,7 @@ class IColumn; M(Bool, format_csv_allow_single_quotes, false, "If it is set to true, allow strings in single quotes.", 0) \ M(Bool, format_csv_allow_double_quotes, true, "If it is set to true, allow strings in double quotes.", 0) \ M(Bool, output_format_csv_serialize_tuple_into_separate_columns, true, "If it set to true, then Tuples in CSV format are serialized as separate columns (that is, their nesting in the tuple is lost)", 0) \ - M(Bool, input_format_csv_deserialize_separate_columns_into_tuple, true, "if it set to true, then separate columns written in CSV format can be deserialized to Tuple column.", 0) \ + M(Bool, input_format_csv_deserialize_separate_columns_into_tuple, true, "If it set to true, then separate columns written in CSV format can be deserialized to Tuple column.", 0) \ M(Bool, output_format_csv_crlf_end_of_line, false, "If it is set true, end of line in CSV format will be \\r\\n instead of \\n.", 0) \ M(Bool, input_format_csv_allow_cr_end_of_line, false, "If it is set true, \\r will be allowed at end of line not followed by \\n", 0) \ M(Bool, input_format_csv_enum_as_number, false, "Treat inserted enum values in CSV formats as enum indices", 0) \ From 74dc6cdb5adc3913180df3aa8e9ae8a0dc101ea8 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 12 Jun 2024 19:45:58 +0000 Subject: [PATCH 212/215] Fix tests --- ...csv_best_effort_schema_inference.reference | 78 +++---- .../02969_auto_format_detection.reference | 210 +++++++++--------- 2 files changed, 144 insertions(+), 144 deletions(-) diff --git 
a/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference index 777f1155f0c..1c60e40942c 100644 --- a/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference +++ b/tests/queries/0_stateless/02246_tsv_csv_best_effort_schema_inference.reference @@ -1,107 +1,107 @@ TSV -c1 Nullable(Int64) -c2 Nullable(String) -c3 Array(Nullable(Int64)) -c4 Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)) +c1 Nullable(Int64) +c2 Nullable(String) +c3 Array(Nullable(Int64)) +c4 Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64)) 42 Some string [1,2,3,4] (1,2,3) 42 abcd [] (4,5,6) -c1 Nullable(String) +c1 Nullable(String) [({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)] [] [({}, [], 0)] [({}, [NULL], NULL)] [({}, [\'String3\'], NULL)] [({\'key3\': NULL}, []), NULL] -c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64))) +c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64))) [({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)] [] [({},[],0)] [({},[NULL],NULL)] [({},['String3'],NULL)] [({'key3':NULL},[],NULL)] -c1 Nullable(Bool) +c1 Nullable(Bool) true false \N -c1 Array(Nullable(Bool)) +c1 Array(Nullable(Bool)) [true,NULL] [] [NULL] [false] -c1 Nullable(String) +c1 Nullable(String) [] -c1 Nullable(String) +c1 Nullable(String) {} -c1 Nullable(String) +c1 Nullable(String) () -c1 Nullable(String) +c1 Nullable(String) [1, 2, 3 -c1 Nullable(String) +c1 Nullable(String) [(1, 2, 3 4)] -c1 Nullable(String) +c1 Nullable(String) [1, 2, 3 + 4] -c1 Nullable(String) +c1 Nullable(String) (1, 2, -c1 Nullable(String) +c1 Nullable(String) [1, Some trash, 42.2] -c1 Nullable(String) +c1 Nullable(String) [1, \'String\', {\'key\' : 2}] -c1 Nullable(String) +c1 Nullable(String) {\'key\' : 1, [1] : 10} -c1 Nullable(String) +c1 Nullable(String) {}{} -c1 Nullable(String) +c1 Nullable(String) [1, 2, 3 -c1 Nullable(String) +c1 Nullable(String) [abc, def] -c1 Array(Nullable(String)) +c1 Array(Nullable(String)) ['abc','def'] -c1 Nullable(String) +c1 Nullable(String) [\'string] -c1 Nullable(String) +c1 Nullable(String) \'string -c1 Nullable(Float64) +c1 Nullable(Float64) 42.42 -c1 Nullable(String) +c1 Nullable(String) 42.42sometrash -c1 Nullable(String) +c1 Nullable(String) [42.42sometrash, 42.42] CSV -c1 Nullable(String) -c2 Nullable(String) -c3 Array(Nullable(Int64)) -c4 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) +c1 Nullable(String) +c2 Nullable(String) +c3 Array(Nullable(Int64)) +c4 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) 42 Some string [1,2,3,4] [(1,2,3)] 42\\ abcd [] [(4,5,6)] -c1 Nullable(String) +c1 Nullable(String) [({\'key\' : 42.42}, [\'String\', \'String2\'], 42.42), ({}, [], -42), ({\'key2\' : NULL}, [NULL], NULL)] [] [({}, [], 0)] [({}, [NULL], NULL)] [({}, [\'String3\'], NULL)] [({\'key3\': NULL}, []), NULL] -c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64))) +c1 Array(Tuple(Map(String, Nullable(Float64)), Array(Nullable(String)), Nullable(Float64))) [({'key':42.42},['String','String2'],42.42),({},[],-42),({'key2':NULL},[NULL],NULL)] [] [({},[],0)] [({},[NULL],NULL)] [({},['String3'],NULL)] [({'key3':NULL},[],NULL)] -c1 Nullable(Bool) +c1 Nullable(Bool) true false \N -c1 Array(Nullable(Bool)) +c1 Array(Nullable(Bool)) [true,NULL] [] [NULL] 
[false] -c1 Nullable(String) +c1 Nullable(String) (1, 2, 3) -c1 Nullable(String) +c1 Nullable(String) 123.123 -c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) +c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) [(1,2,3)] -c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) +c1 Array(Tuple(Nullable(Int64), Nullable(Int64), Nullable(Int64))) [(1,2,3)] diff --git a/tests/queries/0_stateless/02969_auto_format_detection.reference b/tests/queries/0_stateless/02969_auto_format_detection.reference index 9fcaef54db1..4b86be04996 100644 --- a/tests/queries/0_stateless/02969_auto_format_detection.reference +++ b/tests/queries/0_stateless/02969_auto_format_detection.reference @@ -1,123 +1,123 @@ Parquet -a Nullable(UInt64) -b Nullable(String) -c Array(Nullable(UInt64)) -d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) ORC -a Nullable(Int64) -b Nullable(String) -c Array(Nullable(Int64)) -d Tuple(\n a Nullable(Int64),\n b Nullable(String)) +a Nullable(Int64) +b Nullable(String) +c Array(Nullable(Int64)) +d Tuple(\n a Nullable(Int64),\n b Nullable(String)) Arrow -a Nullable(UInt64) -b Nullable(String) -c Array(Nullable(UInt64)) -d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) ArrowStream -a Nullable(UInt64) -b Nullable(String) -c Array(Nullable(UInt64)) -d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) Avro -a Int64 -b String -c Array(Int64) -d Tuple(\n a Int64,\n b String) +a Int64 +b String +c Array(Int64) +d Tuple(\n a Int64,\n b String) Native -a UInt64 -b String -c Array(UInt64) -d Tuple(\n a UInt64,\n b String) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) BSONEachRow -a Nullable(Int64) -b Nullable(String) -c Array(Nullable(Int64)) -d Tuple(\n a Nullable(Int64),\n b Nullable(String)) +a Nullable(Int64) +b Nullable(String) +c Array(Nullable(Int64)) +d Tuple(\n a Nullable(Int64),\n b Nullable(String)) JSONCompact -a UInt64 -b String -c Array(UInt64) -d Tuple(\n a UInt64,\n b String) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) Values -c1 Nullable(UInt64) -c2 Nullable(String) -c3 Array(Nullable(UInt64)) -c4 Tuple(Nullable(UInt64), Nullable(String)) +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Tuple(Nullable(UInt64), Nullable(String)) TSKV -a Nullable(String) -b Nullable(String) -c Array(Nullable(UInt64)) -d Nullable(String) +a Nullable(String) +b Nullable(String) +c Array(Nullable(UInt64)) +d Nullable(String) JSONObjectEachRow -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) JSONColumns -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) JSONCompactColumns -c1 Nullable(String) -c2 Nullable(String) -c3 Array(Nullable(String)) -c4 Tuple(\n a Nullable(String),\n b Nullable(String)) +c1 Nullable(String) +c2 Nullable(String) +c3 
Array(Nullable(String)) +c4 Tuple(\n a Nullable(String),\n b Nullable(String)) JSONCompact -a UInt64 -b String -c Array(UInt64) -d Tuple(\n a UInt64,\n b String) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) JSON -a UInt64 -b String -c Array(UInt64) -d Tuple(\n a UInt64,\n b String) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) TSV -c1 Nullable(UInt64) -c2 Nullable(String) -c3 Array(Nullable(UInt64)) -c4 Tuple(Nullable(UInt64), Nullable(String)) +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Tuple(Nullable(UInt64), Nullable(String)) CSV -c1 Nullable(UInt64) -c2 Nullable(String) -c3 Array(Nullable(UInt64)) -c4 Nullable(UInt64) -c5 Nullable(String) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) -a UInt64 -b String -c Array(UInt64) -d Tuple(\n a UInt64,\n b String) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Nullable(UInt64) +c5 Nullable(String) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) 1 -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) -a Nullable(String) -b Nullable(String) -c Array(Nullable(String)) -d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) From ad0b396d9d04ddd97d651f272c1b6d2bc0744d9a Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 13 Jun 2024 15:14:15 +0000 Subject: [PATCH 213/215] Fix Upgrade Check --- src/Core/SettingsChangesHistory.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index e73527c063d..aa871184b8d 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -105,6 +105,9 @@ static const std::map Date: Fri, 14 Jun 2024 01:37:39 +0000 Subject: [PATCH 214/215] fix the bug --- 
...dictionary_short_circuit_bug_fix.reference | 6 ++++ ...ashed_dictionary_short_circuit_bug_fix.sql | 30 +++++++++++++++++++ 2 files changed, 36 insertions(+) create mode 100644 tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.reference create mode 100644 tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.sql diff --git a/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.reference b/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.reference new file mode 100644 index 00000000000..a2ac115060f --- /dev/null +++ b/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.reference @@ -0,0 +1,6 @@ +100 1 1 +300 3 0 +200 2 2 +100 1 1 +300 3 0 +200 2 2 diff --git a/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.sql b/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.sql new file mode 100644 index 00000000000..6fa5c694cda --- /dev/null +++ b/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.sql @@ -0,0 +1,30 @@ +-- Tags: no-parallel + +CREATE TABLE x ( hash_id UInt64, user_result Decimal(3, 2) ) ENGINE = Memory(); + +CREATE TABLE y ( hash_id UInt64, user_result DECIMAL(18, 6) ) ENGINE = Memory(); + +INSERT INTO x values (100, 1), (200, 2); +INSERT INTO y values (100, 1), (300, 3), (200, 2); + +CREATE DICTIONARY d1 (hash_id UInt64, user_result Decimal(3, 2) ) +PRIMARY KEY hash_id +SOURCE(CLICKHOUSE(QUERY `select * from x`)) +LIFETIME(0) +LAYOUT(HASHED()); + +SELECT hash_id, + dictGetOrDefault(d1, 'user_result', toUInt64(hash_id), toFloat64(user_result)), + dictGet(d1, 'user_result', toUInt64(hash_id)) +FROM y; + +CREATE DICTIONARY d2 (hash_id UInt64, user_result Decimal(3, 2) ) +PRIMARY KEY hash_id +SOURCE(CLICKHOUSE(QUERY `select * from x`)) +LIFETIME(0) +LAYOUT(HASHED_ARRAY()); + +SELECT hash_id, + dictGetOrDefault(d2, 'user_result', toUInt64(hash_id), toFloat64(user_result)), + dictGet(d2, 'user_result', toUInt64(hash_id)) +FROM y; From d32647a6cd1590fc6ae605fe616b64c238084a84 Mon Sep 17 00:00:00 2001 From: jsc0218 Date: Fri, 14 Jun 2024 02:15:21 +0000 Subject: [PATCH 215/215] fix --- .../03171_hashed_dictionary_short_circuit_bug_fix.sql | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.sql b/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.sql index 6fa5c694cda..e1b5531a442 100644 --- a/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.sql +++ b/tests/queries/0_stateless/03171_hashed_dictionary_short_circuit_bug_fix.sql @@ -9,7 +9,7 @@ INSERT INTO y values (100, 1), (300, 3), (200, 2); CREATE DICTIONARY d1 (hash_id UInt64, user_result Decimal(3, 2) ) PRIMARY KEY hash_id -SOURCE(CLICKHOUSE(QUERY `select * from x`)) +SOURCE(CLICKHOUSE(TABLE 'x')) LIFETIME(0) LAYOUT(HASHED()); @@ -20,7 +20,7 @@ FROM y; CREATE DICTIONARY d2 (hash_id UInt64, user_result Decimal(3, 2) ) PRIMARY KEY hash_id -SOURCE(CLICKHOUSE(QUERY `select * from x`)) +SOURCE(CLICKHOUSE(TABLE 'x')) LIFETIME(0) LAYOUT(HASHED_ARRAY());
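For orientation, a minimal usage sketch of the CSV tuple settings introduced in this series, assuming a server built with the patches above; the file name 02977_example.csv and the sample tuple are illustrative only and are not taken from any test in the series.

-- Write one Tuple value; disabling the new output setting keeps the pre-series behaviour
-- of a single quoted tuple column instead of flattening it into separate CSV columns.
insert into function file('02977_example.csv') select tuple('abc', 42) as t settings output_format_csv_serialize_tuple_into_separate_columns = 0, engine_file_truncate_on_insert = 1;

-- With the new default input_format_csv_try_infer_strings_from_quoted_tuples = 1 the quoted
-- tuple is inferred as Nullable(String); with the setting disabled it is inferred as a Tuple
-- again, matching the 02246 reference changes above.
desc file('02977_example.csv');
desc file('02977_example.csv') settings input_format_csv_try_infer_strings_from_quoted_tuples = 0;

-- With the default output_format_csv_serialize_tuple_into_separate_columns = 1 the same insert
-- would instead produce two plain CSV columns, which input_format_csv_deserialize_separate_columns_into_tuple = 1
-- allows to be read back into a Tuple column on input.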