diff --git a/CMakeLists.txt b/CMakeLists.txt
index d5f389cfa99..1423f3a0bc2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -516,9 +516,9 @@ include (cmake/find/fast_float.cmake)
 include (cmake/find/rapidjson.cmake)
 include (cmake/find/fastops.cmake)
 include (cmake/find/odbc.cmake)
+include (cmake/find/nanodbc.cmake)
 include (cmake/find/rocksdb.cmake)
 include (cmake/find/libpqxx.cmake)
-include (cmake/find/nanodbc.cmake)
 include (cmake/find/nuraft.cmake)
diff --git a/base/common/DateLUTImpl.h b/base/common/DateLUTImpl.h
index 363f281584e..9e60181e802 100644
--- a/base/common/DateLUTImpl.h
+++ b/base/common/DateLUTImpl.h
@@ -25,7 +25,7 @@
 
 #if defined(__PPC__)
-#if !__clang__
+#if !defined(__clang__)
 #pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
 #endif
 #endif
@@ -1266,7 +1266,7 @@ public:
 };
 
 #if defined(__PPC__)
-#if !__clang__
+#if !defined(__clang__)
 #pragma GCC diagnostic pop
 #endif
 #endif
diff --git a/base/mysqlxx/Pool.h b/base/mysqlxx/Pool.h
index b6189663f55..530e2c78cf2 100644
--- a/base/mysqlxx/Pool.h
+++ b/base/mysqlxx/Pool.h
@@ -159,9 +159,9 @@ public:
      */
    Pool(const std::string & db_,
         const std::string & server_,
-        const std::string & user_ = "",
-        const std::string & password_ = "",
-        unsigned port_ = 0,
+        const std::string & user_,
+        const std::string & password_,
+        unsigned port_,
         const std::string & socket_ = "",
         unsigned connect_timeout_ = MYSQLXX_DEFAULT_TIMEOUT,
         unsigned rw_timeout_ = MYSQLXX_DEFAULT_RW_TIMEOUT,
diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt
index 9d74179902d..51f4b974161 100644
--- a/cmake/autogenerated_versions.txt
+++ b/cmake/autogenerated_versions.txt
@@ -1,9 +1,9 @@
 # This strings autochanged from release_lib.sh:
-SET(VERSION_REVISION 54450)
+SET(VERSION_REVISION 54451)
 SET(VERSION_MAJOR 21)
-SET(VERSION_MINOR 5)
+SET(VERSION_MINOR 6)
 SET(VERSION_PATCH 1)
-SET(VERSION_GITHASH 3827789b3d8fd2021952e57e5110343d26daa1a1)
-SET(VERSION_DESCRIBE v21.5.1.1-prestable)
-SET(VERSION_STRING 21.5.1.1)
+SET(VERSION_GITHASH 96fced4c3cf432fb0b401d2ab01f0c56e5f74a96)
+SET(VERSION_DESCRIBE v21.6.1.1-prestable)
+SET(VERSION_STRING 21.6.1.1)
 # end of autochange
diff --git a/cmake/find/nanodbc.cmake b/cmake/find/nanodbc.cmake
index 2c913abb13e..894a2a60bad 100644
--- a/cmake/find/nanodbc.cmake
+++ b/cmake/find/nanodbc.cmake
@@ -1,35 +1,16 @@
-option(ENABLE_NANODBC "Enalbe nanodbc" ${ENABLE_LIBRARIES})
-
-if (NOT ENABLE_NANODBC)
-    set (USE_ODBC 0)
-    return()
-endif()
-
 if (NOT ENABLE_ODBC)
-    set (USE_NANODBC 0)
-    message (STATUS "Using nanodbc=${USE_NANODBC}")
-    return()
-endif()
+    return ()
+endif ()
+
+if (NOT USE_INTERNAL_NANODBC_LIBRARY)
+    message (FATAL_ERROR "Only the bundled nanodbc library can be used")
+endif ()
 
 if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/nanodbc/CMakeLists.txt")
-    message (WARNING "submodule contrib/nanodbc is missing. to fix try run: \n git submodule update --init --recursive")
-    message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal nanodbc library")
-    set (USE_NANODBC 0)
-    return()
+    message (FATAL_ERROR "submodule contrib/nanodbc is missing. to fix try run: \n git submodule update --init --recursive")
 endif()
 
-if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/unixodbc/include")
-    message (ERROR "submodule contrib/unixodbc is missing. to fix try run: \n git submodule update --init --recursive")
-    message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal unixodbc needed for nanodbc")
-    set (USE_NANODBC 0)
-    return()
-endif()
-
-set (USE_NANODBC 1)
-
 set (NANODBC_LIBRARY nanodbc)
+set (NANODBC_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/nanodbc/nanodbc")
 
-set (NANODBC_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/nanodbc/nanodbce")
-
-message (STATUS "Using nanodbc=${USE_NANODBC}: ${NANODBC_INCLUDE_DIR} : ${NANODBC_LIBRARY}")
-message (STATUS "Using unixodbc")
+message (STATUS "Using nanodbc: ${NANODBC_INCLUDE_DIR} : ${NANODBC_LIBRARY}")
diff --git a/cmake/find/odbc.cmake b/cmake/find/odbc.cmake
index a23f0c831e9..c475e600c0d 100644
--- a/cmake/find/odbc.cmake
+++ b/cmake/find/odbc.cmake
@@ -50,4 +50,6 @@ if (NOT EXTERNAL_ODBC_LIBRARY_FOUND)
     set (USE_INTERNAL_ODBC_LIBRARY 1)
 endif ()
 
+set (USE_INTERNAL_NANODBC_LIBRARY 1)
+
 message (STATUS "Using unixodbc")
diff --git a/cmake/warnings.cmake b/cmake/warnings.cmake
index a398c59e981..a85fe8963c7 100644
--- a/cmake/warnings.cmake
+++ b/cmake/warnings.cmake
@@ -171,6 +171,7 @@ elseif (COMPILER_GCC)
     add_cxx_compile_options(-Wtrampolines)
     # Obvious
     add_cxx_compile_options(-Wunused)
+    add_cxx_compile_options(-Wundef)
     # Warn if vector operation is not implemented via SIMD capabilities of the architecture
     add_cxx_compile_options(-Wvector-operation-performance)
     # XXX: libstdc++ has some of these for 3way compare
diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
index 42792784a14..087212ad3b0 100644
--- a/contrib/CMakeLists.txt
+++ b/contrib/CMakeLists.txt
@@ -47,6 +47,7 @@ add_subdirectory (lz4-cmake)
 add_subdirectory (murmurhash)
 add_subdirectory (replxx-cmake)
 add_subdirectory (unixodbc-cmake)
+add_subdirectory (nanodbc-cmake)
 
 if (USE_INTERNAL_XZ_LIBRARY)
     add_subdirectory (xz)
@@ -320,10 +321,6 @@ if (USE_LIBPQXX)
     add_subdirectory (libpqxx-cmake)
 endif()
 
-if (USE_NANODBC)
-    add_subdirectory (nanodbc-cmake)
-endif()
-
 if (USE_NURAFT)
     add_subdirectory(nuraft-cmake)
 endif()
diff --git a/contrib/NuRaft b/contrib/NuRaft
index d2feb5978b9..377f8e77491 160000
--- a/contrib/NuRaft
+++ b/contrib/NuRaft
@@ -1 +1 @@
-Subproject commit d2feb5978b979729a07c3ca76eaa4ab94cef4ceb
+Subproject commit 377f8e77491d9f66ce8e32e88aae19dffe8dc4d7
diff --git a/contrib/nanodbc-cmake/CMakeLists.txt b/contrib/nanodbc-cmake/CMakeLists.txt
index 5de46d52a61..1673b311c49 100644
--- a/contrib/nanodbc-cmake/CMakeLists.txt
+++ b/contrib/nanodbc-cmake/CMakeLists.txt
@@ -1,3 +1,7 @@
+if (NOT USE_INTERNAL_NANODBC_LIBRARY)
+    return ()
+endif ()
+
 set (LIBRARY_DIR ${ClickHouse_SOURCE_DIR}/contrib/nanodbc)
 
 if (NOT TARGET unixodbc)
diff --git a/contrib/zlib-ng b/contrib/zlib-ng
index 7f254522fd6..5cc4d232020 160000
--- a/contrib/zlib-ng
+++ b/contrib/zlib-ng
@@ -1 +1 @@
-Subproject commit 7f254522fd676ff4e906c6d4e9b30d4df4214c2d
+Subproject commit 5cc4d232020dc66d1d6c5438834457e2a2f6127b
diff --git a/debian/changelog b/debian/changelog
index be77dfdefe9..8b6626416a9 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,5 +1,5 @@
-clickhouse (21.5.1.1) unstable; urgency=low
+clickhouse (21.6.1.1) unstable; urgency=low
 
   * Modified source code
 
- -- clickhouse-release  Fri, 02 Apr 2021 18:34:26 +0300
+ -- clickhouse-release  Tue, 20 Apr 2021 01:48:16 +0300
diff --git a/docker/client/Dockerfile b/docker/client/Dockerfile
index 2efba9735ae..569025dec1c 100644
--- a/docker/client/Dockerfile
+++ b/docker/client/Dockerfile
@@ -1,7 +1,7 @@
 FROM ubuntu:18.04
 
 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
-ARG version=21.5.1.*
+ARG version=21.6.1.*
 
 RUN apt-get update \
     && apt-get install --yes --no-install-recommends \
diff --git a/docker/server/Dockerfile b/docker/server/Dockerfile
index 05ca29f22d4..48c978366c6 100644
--- a/docker/server/Dockerfile
+++ b/docker/server/Dockerfile
@@ -1,7 +1,7 @@
 FROM ubuntu:20.04
 
 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
-ARG version=21.5.1.*
+ARG version=21.6.1.*
 ARG gosu_ver=1.10
 
 # set non-empty deb_location_url url to create a docker image
diff --git a/docker/test/Dockerfile b/docker/test/Dockerfile
index 976c46ebe27..0e4646386ce 100644
--- a/docker/test/Dockerfile
+++ b/docker/test/Dockerfile
@@ -1,7 +1,7 @@
 FROM ubuntu:18.04
 
 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
-ARG version=21.5.1.*
+ARG version=21.6.1.*
 
 RUN apt-get update && \
     apt-get install -y apt-transport-https dirmngr && \
diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh
index 8d292e34eb3..a7cc398e5c9 100755
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@@ -308,10 +308,8 @@ function run_tests
        01354_order_by_tuple_collate_const
        01355_ilike
        01411_bayesian_ab_testing
-       01532_collate_in_low_cardinality
-       01533_collate_in_nullable
-       01542_collate_in_array
-       01543_collate_in_tuple
+       collate
+       collation
        _orc_
        arrow
        avro
diff --git a/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml b/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml
index 41bc7f777bf..63e23d8453c 100644
--- a/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml
+++ b/docker/test/performance-comparison/config/users.d/perf-comparison-tweaks-users.xml
@@ -17,6 +17,9 @@
         12
+
+
+        64Mi
diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py
index 4727f485943..2588b9f4213 100755
--- a/docker/test/performance-comparison/perf.py
+++ b/docker/test/performance-comparison/perf.py
@@ -66,7 +66,12 @@ reportStageEnd('parse')
 subst_elems = root.findall('substitutions/substitution')
 available_parameters = {} # { 'table': ['hits_10m', 'hits_100m'], ... }
 for e in subst_elems:
-    available_parameters[e.find('name').text] = [v.text for v in e.findall('values/value')]
+    name = e.find('name').text
+    values = [v.text for v in e.findall('values/value')]
+    if not values:
+        raise Exception(f'No values given for substitution {{{name}}}')
+
+    available_parameters[name] = values
 
 # Takes parallel lists of templates, substitutes them with all combos of
 # parameters. The set of parameters is determined based on the first list.
diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index f9bb4570740..e6f2d678aa9 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -104,6 +104,12 @@ clickhouse-client -q "system flush logs" ||: pigz < /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhouse-server.log.gz & clickhouse-client -q "select * from system.query_log format TSVWithNamesAndTypes" | pigz > /test_output/query-log.tsv.gz & clickhouse-client -q "select * from system.query_thread_log format TSVWithNamesAndTypes" | pigz > /test_output/query-thread-log.tsv.gz & +clickhouse-client --allow_introspection_functions=1 -q " + WITH + arrayMap(x -> concat(demangle(addressToSymbol(x)), ':', addressToLine(x)), trace) AS trace_array, + arrayStringConcat(trace_array, '\n') AS trace_string + SELECT * EXCEPT(trace), trace_string FROM system.trace_log FORMAT TSVWithNamesAndTypes +" | pigz > /test_output/trace-log.tsv.gz & wait ||: mv /var/log/clickhouse-server/stderr.log /test_output/ ||: diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index ad8ec731f23..74a88df21e0 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -136,6 +136,7 @@ pigz < /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhous tar -chf /test_output/coordination.tar /var/lib/clickhouse/coordination ||: mv /var/log/clickhouse-server/stderr.log /test_output/ tar -chf /test_output/query_log_dump.tar /var/lib/clickhouse/data/system/query_log ||: +tar -chf /test_output/trace_log_dump.tar /var/lib/clickhouse/data/system/trace_log ||: # Write check result into check_status.tsv clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' order by (lower(test) like '%hung%') LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv diff --git a/docs/en/engines/table-engines/integrations/s3.md b/docs/en/engines/table-engines/integrations/s3.md index 3d02aa13812..6592f8b9752 100644 --- a/docs/en/engines/table-engines/integrations/s3.md +++ b/docs/en/engines/table-engines/integrations/s3.md @@ -19,26 +19,26 @@ ENGINE = S3(path, [aws_access_key_id, aws_secret_access_key,] format, structure, - `path` — Bucket url with path to file. Supports following wildcards in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. For more information see [below](#wildcards-in-path). - `format` — The [format](../../../interfaces/formats.md#formats) of the file. - `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. -- `compression` — Compression type. Supported values: none, gzip/gz, brotli/br, xz/LZMA, zstd/zst. Parameter is optional. By default, it will autodetect compression by file extension. +- `compression` — Compression type. Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. Parameter is optional. By default, it will autodetect compression by file extension. -**Example:** +**Example** -**1.** Set up the `s3_engine_table` table: +1. Set up the `s3_engine_table` table: -```sql -CREATE TABLE s3_engine_table (name String, value UInt32) ENGINE=S3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip') +``` sql +CREATE TABLE s3_engine_table (name String, value UInt32) ENGINE=S3('https://storage.yandexcloud.net/my-test-bucket-768/test-data.csv.gz', 'CSV', 'name String, value UInt32', 'gzip'); ``` -**2.** Fill file: +2. 
Fill file: -```sql -INSERT INTO s3_engine_table VALUES ('one', 1), ('two', 2), ('three', 3) +``` sql +INSERT INTO s3_engine_table VALUES ('one', 1), ('two', 2), ('three', 3); ``` -**3.** Query the data: +3. Query the data: -```sql -SELECT * FROM s3_engine_table LIMIT 2 +``` sql +SELECT * FROM s3_engine_table LIMIT 2; ``` ```text @@ -73,13 +73,63 @@ For more information about virtual columns see [here](../../../engines/table-eng Constructions with `{}` are similar to the [remote](../../../sql-reference/table-functions/remote.md) table function. -## S3-related Settings {#s3-settings} +**Example** + +1. Suppose we have several files in CSV format with the following URIs on S3: + +- ‘https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_1.csv’ +- ‘https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_2.csv’ +- ‘https://storage.yandexcloud.net/my-test-bucket-768/some_prefix/some_file_3.csv’ +- ‘https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_1.csv’ +- ‘https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_2.csv’ +- ‘https://storage.yandexcloud.net/my-test-bucket-768/another_prefix/some_file_3.csv’ + +There are several ways to make a table consisting of all six files: + +The first way: + +``` sql +CREATE TABLE table_with_range (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_{1..3}', 'CSV'); +``` + +Another way: + +``` sql +CREATE TABLE table_with_question_mark (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/some_file_?', 'CSV'); +``` + +Table consists of all the files in both directories (all files should satisfy format and schema described in query): + +``` sql +CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/*', 'CSV'); +``` + +If the listing of files contains number ranges with leading zeros, use the construction with braces for each digit separately or use `?`. + +**Example** + +Create table with files named `file-000.csv`, `file-001.csv`, … , `file-999.csv`: + +``` sql +CREATE TABLE big_table (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV'); +``` + +## Virtual Columns {#virtual-columns} + +- `_path` — Path to the file. +- `_file` — Name of the file. + +**See Also** + +- [Virtual columns](../../../engines/table-engines/index.md#table_engines-virtual_columns) + +## S3-related settings {#settings} The following settings can be set before query execution or placed into configuration file. -- `s3_max_single_part_upload_size` — The maximum size of object to upload using singlepart upload to S3. Default value is `64Mb`. +- `s3_max_single_part_upload_size` — The maximum size of object to upload using singlepart upload to S3. Default value is `64Mb`. - `s3_min_upload_part_size` — The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `512Mb`. -- `s3_max_redirects` — Max number of S3 redirects hops allowed. Default value is `10`. +- `s3_max_redirects` — Max number of S3 redirects hops allowed. Default value is `10`. 
Security consideration: if malicious user can specify arbitrary S3 URLs, `s3_max_redirects` must be set to zero to avoid [SSRF](https://en.wikipedia.org/wiki/Server-side_request_forgery) attacks; or alternatively, `remote_host_filter` must be specified in server configuration. @@ -90,6 +140,7 @@ The following settings can be specified in configuration file for given endpoint - `endpoint` — Specifies prefix of an endpoint. Mandatory. - `access_key_id` and `secret_access_key` — Specifies credentials to use with given endpoint. Optional. - `use_environment_credentials` — If set to `true`, S3 client will try to obtain credentials from environment variables and Amazon EC2 metadata for given endpoint. Optional, default value is `false`. +- `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Optional, default value is `false`. - `header` — Adds specified HTTP header to a request to given endpoint. Optional, can be speficied multiple times. - `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. Optional. @@ -102,11 +153,13 @@ The following settings can be specified in configuration file for given endpoint + ``` + ## Usage {#usage-examples} Suppose we have several files in TSV format with the following URIs on HDFS: @@ -149,8 +202,7 @@ ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_p CREATE TABLE big_table (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file-{000..999}.csv', 'CSV'); ``` + ## See also - [S3 table function](../../../sql-reference/table-functions/s3.md) - -[Original article](https://clickhouse.tech/docs/en/engines/table-engines/integrations/s3/) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index a24b7229d17..9874e87be78 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -767,6 +767,7 @@ Required parameters: Optional parameters: - `use_environment_credentials` — Reads AWS credentials from the Environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN if they exist. Default value is `false`. +- `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Default value is `false`. - `proxy` — Proxy configuration for S3 endpoint. Each `uri` element inside `proxy` block should contain a proxy URL. - `connect_timeout_ms` — Socket connect timeout in milliseconds. Default value is `10 seconds`. - `request_timeout_ms` — Request timeout in milliseconds. Default value is `5 seconds`. diff --git a/docs/en/guides/apply-catboost-model.md b/docs/en/guides/apply-catboost-model.md index f614b121714..7c2c8a575ec 100644 --- a/docs/en/guides/apply-catboost-model.md +++ b/docs/en/guides/apply-catboost-model.md @@ -159,6 +159,9 @@ The fastest way to evaluate a CatBoost model is compile `libcatboostmodel./home/catboost/models/*_model.xml ``` +!!! note "Note" + You can change path to the CatBoost model configuration later without restarting server. + ## 4. Run the Model Inference from SQL {#run-model-inference} For test model run the ClickHouse client `$ clickhouse client`. 
diff --git a/docs/en/interfaces/third-party/gui.md b/docs/en/interfaces/third-party/gui.md index d2fd24ab9a4..e54e40441ca 100644 --- a/docs/en/interfaces/third-party/gui.md +++ b/docs/en/interfaces/third-party/gui.md @@ -169,24 +169,21 @@ Features: ### SeekTable {#seektable} -[SeekTable](https://www.seektable.com) is a self-service BI tool for data exploration and operational reporting. SeekTable is available both as a cloud service and a self-hosted version. SeekTable reports may be embedded into any web-app. +[SeekTable](https://www.seektable.com) is a self-service BI tool for data exploration and operational reporting. It is available both as a cloud service and a self-hosted version. Reports from SeekTable may be embedded into any web-app. Features: - Business users-friendly reports builder. - Powerful report parameters for SQL filtering and report-specific query customizations. - Can connect to ClickHouse both with a native TCP/IP endpoint and a HTTP(S) interface (2 different drivers). -- It is possible to use all power of CH SQL dialect in dimensions/measures definitions +- It is possible to use all power of ClickHouse SQL dialect in dimensions/measures definitions. - [Web API](https://www.seektable.com/help/web-api-integration) for automated reports generation. -- Supports reports development flow with account data [backup/restore](https://www.seektable.com/help/self-hosted-backup-restore), data models (cubes) / reports configuration is a human-readable XML and can be stored under version control. +- Supports reports development flow with account data [backup/restore](https://www.seektable.com/help/self-hosted-backup-restore); data models (cubes) / reports configuration is a human-readable XML and can be stored under version control system. SeekTable is [free](https://www.seektable.com/help/cloud-pricing) for personal/individual usage. [How to configure ClickHouse connection in SeekTable.](https://www.seektable.com/help/clickhouse-pivot-table) - ### Chadmin {#chadmin} [Chadmin](https://github.com/bun4uk/chadmin) is a simple UI where you can visualize your currently running queries on your ClickHouse cluster and info about them and kill them if you want. 
- -[Original article](https://clickhouse.tech/docs/en/interfaces/third-party/gui/) diff --git a/docs/en/introduction/adopters.md b/docs/en/introduction/adopters.md index 3b3c1203b50..fa257a84173 100644 --- a/docs/en/introduction/adopters.md +++ b/docs/en/introduction/adopters.md @@ -77,7 +77,8 @@ toc_title: Adopters | Marilyn | Advertising | Statistics | — | — | [Talk in Russian, June 2017](https://www.youtube.com/watch?v=iXlIgx2khwc) | | Mello | Marketing | Analytics | 1 server | — | [Article, Oct 2020](https://vc.ru/marketing/166180-razrabotka-tipovogo-otcheta-skvoznoy-analitiki) | | MessageBird | Telecommunications | Statistics | — | — | [Slides in English, November 2018](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup20/messagebird.pdf) | -| MindsDB | Machine Learning | Main Product | — | — | [Official Website](https://www.mindsdb.com/blog/machine-learning-models-as-tables-in-ch) |x +| Microsoft | Web Analytics | Clarity (Main Product) | — | — | [A question on GitHub](https://github.com/ClickHouse/ClickHouse/issues/21556) | +| MindsDB | Machine Learning | Main Product | — | — | [Official Website](https://www.mindsdb.com/blog/machine-learning-models-as-tables-in-ch) | | MUX | Online Video | Video Analytics | — | — | [Talk in English, August 2019](https://altinity.com/presentations/2019/8/13/how-clickhouse-became-the-default-analytics-database-for-mux/) | | MGID | Ad network | Web-analytics | — | — | [Blog post in Russian, April 2020](http://gs-studio.com/news-about-it/32777----clickhouse---c) | | Netskope | Network Security | — | — | — | [Job advertisement, March 2021](https://www.mendeley.com/careers/job/senior-software-developer-backend-developer-1346348) | diff --git a/docs/en/operations/system-tables/columns.md b/docs/en/operations/system-tables/columns.md index 92a6315d06b..9160dca9a1a 100644 --- a/docs/en/operations/system-tables/columns.md +++ b/docs/en/operations/system-tables/columns.md @@ -4,7 +4,9 @@ Contains information about columns in all the tables. You can use this table to get information similar to the [DESCRIBE TABLE](../../sql-reference/statements/misc.md#misc-describe-table) query, but for multiple tables at once. -The `system.columns` table contains the following columns (the column type is shown in brackets): +Columns from [temporary tables](../../sql-reference/statements/create/table.md#temporary-tables) are visible in the `system.columns` only in those session where they have been created. They are shown with the empty `database` field. + +Columns: - `database` ([String](../../sql-reference/data-types/string.md)) — Database name. - `table` ([String](../../sql-reference/data-types/string.md)) — Table name. @@ -26,7 +28,7 @@ The `system.columns` table contains the following columns (the column type is sh **Example** ```sql -:) select * from system.columns LIMIT 2 FORMAT Vertical; +SELECT * FROM system.columns LIMIT 2 FORMAT Vertical; ``` ```text @@ -65,8 +67,6 @@ is_in_sorting_key: 0 is_in_primary_key: 0 is_in_sampling_key: 0 compression_codec: - -2 rows in set. Elapsed: 0.002 sec. ``` [Original article](https://clickhouse.tech/docs/en/operations/system_tables/columns) diff --git a/docs/en/operations/system-tables/tables.md b/docs/en/operations/system-tables/tables.md index 6ad1425e032..ccc9ab94f8b 100644 --- a/docs/en/operations/system-tables/tables.md +++ b/docs/en/operations/system-tables/tables.md @@ -1,59 +1,65 @@ # system.tables {#system-tables} -Contains metadata of each table that the server knows about. 
Detached tables are not shown in `system.tables`. +Contains metadata of each table that the server knows about. -This table contains the following columns (the column type is shown in brackets): +[Detached](../../sql-reference/statements/detach.md) tables are not shown in `system.tables`. -- `database` (String) — The name of the database the table is in. +[Temporary tables](../../sql-reference/statements/create/table.md#temporary-tables) are visible in the `system.tables` only in those session where they have been created. They are shown with the empty `database` field and with the `is_temporary` flag switched on. -- `name` (String) — Table name. +Columns: -- `engine` (String) — Table engine name (without parameters). +- `database` ([String](../../sql-reference/data-types/string.md)) — The name of the database the table is in. -- `is_temporary` (UInt8) - Flag that indicates whether the table is temporary. +- `name` ([String](../../sql-reference/data-types/string.md)) — Table name. -- `data_path` (String) - Path to the table data in the file system. +- `engine` ([String](../../sql-reference/data-types/string.md)) — Table engine name (without parameters). -- `metadata_path` (String) - Path to the table metadata in the file system. +- `is_temporary` ([UInt8](../../sql-reference/data-types/int-uint.md)) - Flag that indicates whether the table is temporary. -- `metadata_modification_time` (DateTime) - Time of latest modification of the table metadata. +- `data_path` ([String](../../sql-reference/data-types/string.md)) - Path to the table data in the file system. -- `dependencies_database` (Array(String)) - Database dependencies. +- `metadata_path` ([String](../../sql-reference/data-types/string.md)) - Path to the table metadata in the file system. -- `dependencies_table` (Array(String)) - Table dependencies ([MaterializedView](../../engines/table-engines/special/materializedview.md) tables based on the current table). +- `metadata_modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) - Time of latest modification of the table metadata. -- `create_table_query` (String) - The query that was used to create the table. +- `dependencies_database` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - Database dependencies. -- `engine_full` (String) - Parameters of the table engine. +- `dependencies_table` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) - Table dependencies ([MaterializedView](../../engines/table-engines/special/materializedview.md) tables based on the current table). -- `partition_key` (String) - The partition key expression specified in the table. +- `create_table_query` ([String](../../sql-reference/data-types/string.md)) - The query that was used to create the table. -- `sorting_key` (String) - The sorting key expression specified in the table. +- `engine_full` ([String](../../sql-reference/data-types/string.md)) - Parameters of the table engine. -- `primary_key` (String) - The primary key expression specified in the table. +- `partition_key` ([String](../../sql-reference/data-types/string.md)) - The partition key expression specified in the table. -- `sampling_key` (String) - The sampling key expression specified in the table. +- `sorting_key` ([String](../../sql-reference/data-types/string.md)) - The sorting key expression specified in the table. 
-- `storage_policy` (String) - The storage policy: +- `primary_key` ([String](../../sql-reference/data-types/string.md)) - The primary key expression specified in the table. + +- `sampling_key` ([String](../../sql-reference/data-types/string.md)) - The sampling key expression specified in the table. + +- `storage_policy` ([String](../../sql-reference/data-types/string.md)) - The storage policy: - [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) - [Distributed](../../engines/table-engines/special/distributed.md#distributed) -- `total_rows` (Nullable(UInt64)) - Total number of rows, if it is possible to quickly determine exact number of rows in the table, otherwise `Null` (including underying `Buffer` table). +- `total_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of rows, if it is possible to quickly determine exact number of rows in the table, otherwise `NULL` (including underying `Buffer` table). -- `total_bytes` (Nullable(UInt64)) - Total number of bytes, if it is possible to quickly determine exact number of bytes for the table on storage, otherwise `Null` (**does not** includes any underlying storage). +- `total_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of bytes, if it is possible to quickly determine exact number of bytes for the table on storage, otherwise `NULL` (does not includes any underlying storage). - If the table stores data on disk, returns used space on disk (i.e. compressed). - If the table stores data in memory, returns approximated number of used bytes in memory. -- `lifetime_rows` (Nullable(UInt64)) - Total number of rows INSERTed since server start (only for `Buffer` tables). +- `lifetime_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of rows INSERTed since server start (only for `Buffer` tables). -- `lifetime_bytes` (Nullable(UInt64)) - Total number of bytes INSERTed since server start (only for `Buffer` tables). +- `lifetime_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - Total number of bytes INSERTed since server start (only for `Buffer` tables). The `system.tables` table is used in `SHOW TABLES` query implementation. +**Example** + ```sql -:) SELECT * FROM system.tables LIMIT 2 FORMAT Vertical; +SELECT * FROM system.tables LIMIT 2 FORMAT Vertical; ``` ```text @@ -100,8 +106,6 @@ sampling_key: storage_policy: total_rows: ᴺᵁᴸᴸ total_bytes: ᴺᵁᴸᴸ - -2 rows in set. Elapsed: 0.004 sec. 
``` [Original article](https://clickhouse.tech/docs/en/operations/system_tables/tables) diff --git a/docs/en/operations/update.md b/docs/en/operations/update.md index 22995705af9..dbcf9ae2b3e 100644 --- a/docs/en/operations/update.md +++ b/docs/en/operations/update.md @@ -29,6 +29,3 @@ $ sudo apt-get update $ sudo apt-get install clickhouse-server=xx.yy.a.b clickhouse-client=xx.yy.a.b clickhouse-common-static=xx.yy.a.b $ sudo service clickhouse-server restart ``` - - - diff --git a/docs/en/sql-reference/aggregate-functions/combinators.md b/docs/en/sql-reference/aggregate-functions/combinators.md index cddef68d49c..259202805d3 100644 --- a/docs/en/sql-reference/aggregate-functions/combinators.md +++ b/docs/en/sql-reference/aggregate-functions/combinators.md @@ -27,7 +27,37 @@ Example 2: `uniqArray(arr)` – Counts the number of unique elements in all ‘a ## -SimpleState {#agg-functions-combinator-simplestate} -If you apply this combinator, the aggregate function returns the same value but with a different type. This is an `SimpleAggregateFunction(...)` that can be stored in a table to work with [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) table engines. +If you apply this combinator, the aggregate function returns the same value but with a different type. This is a [SimpleAggregateFunction(...)](../../sql-reference/data-types/simpleaggregatefunction.md) that can be stored in a table to work with [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md) tables. + +**Syntax** + +``` sql +SimpleState(x) +``` + +**Arguments** + +- `x` — Aggregate function parameters. + +**Returned values** + +The value of an aggregate function with the `SimpleAggregateFunction(...)` type. + +**Example** + +Query: + +``` sql +WITH anySimpleState(number) AS c SELECT toTypeName(c), c FROM numbers(1); +``` + +Result: + +``` text +┌─toTypeName(c)────────────────────────┬─c─┐ +│ SimpleAggregateFunction(any, UInt64) │ 0 │ +└──────────────────────────────────────┴───┘ +``` ## -State {#agg-functions-combinator-state} @@ -249,4 +279,3 @@ FROM people └────────┴───────────────────────────┘ ``` - diff --git a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md index dcc665a68af..dd0d59978d1 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/quantiletdigest.md @@ -6,7 +6,7 @@ toc_priority: 207 Computes an approximate [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using the [t-digest](https://github.com/tdunning/t-digest/blob/master/docs/t-digest-paper/histo.pdf) algorithm. -The maximum error is 1%. Memory consumption is `log(n)`, where `n` is a number of values. The result depends on the order of running the query, and is nondeterministic. +Memory consumption is `log(n)`, where `n` is a number of values. The result depends on the order of running the query, and is nondeterministic. The performance of the function is lower than performance of [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md#quantile) or [quantileTiming](../../../sql-reference/aggregate-functions/reference/quantiletiming.md#quantiletiming). In terms of the ratio of State size to precision, this function is much better than `quantile`. 
diff --git a/docs/en/sql-reference/data-types/simpleaggregatefunction.md b/docs/en/sql-reference/data-types/simpleaggregatefunction.md index 244779c5ca8..af12a03ab51 100644 --- a/docs/en/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/en/sql-reference/data-types/simpleaggregatefunction.md @@ -2,6 +2,8 @@ `SimpleAggregateFunction(name, types_of_arguments…)` data type stores current value of the aggregate function, and does not store its full state as [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) does. This optimization can be applied to functions for which the following property holds: the result of applying a function `f` to a row set `S1 UNION ALL S2` can be obtained by applying `f` to parts of the row set separately, and then again applying `f` to the results: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. This property guarantees that partial aggregation results are enough to compute the combined one, so we don’t have to store and process any extra data. +The common way to produce an aggregate function value is by calling the aggregate function with the [-SimpleState](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-simplestate) suffix. + The following aggregate functions are supported: - [`any`](../../sql-reference/aggregate-functions/reference/any.md#agg_function-any) diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index ca6ef684faf..d545a0ae4e6 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -16,46 +16,60 @@ The following assumptions are made: ## visitParamHas(params, name) {#visitparamhasparams-name} -Checks whether there is a field with the ‘name’ name. +Checks whether there is a field with the `name` name. + +Alias: `simpleJSONHas`. ## visitParamExtractUInt(params, name) {#visitparamextractuintparams-name} -Parses UInt64 from the value of the field named ‘name’. If this is a string field, it tries to parse a number from the beginning of the string. If the field doesn’t exist, or it exists but doesn’t contain a number, it returns 0. +Parses UInt64 from the value of the field named `name`. If this is a string field, it tries to parse a number from the beginning of the string. If the field doesn’t exist, or it exists but doesn’t contain a number, it returns 0. + +Alias: `simpleJSONExtractUInt`. ## visitParamExtractInt(params, name) {#visitparamextractintparams-name} The same as for Int64. +Alias: `simpleJSONExtractInt`. + ## visitParamExtractFloat(params, name) {#visitparamextractfloatparams-name} The same as for Float64. +Alias: `simpleJSONExtractFloat`. + ## visitParamExtractBool(params, name) {#visitparamextractboolparams-name} Parses a true/false value. The result is UInt8. +Alias: `simpleJSONExtractBool`. + ## visitParamExtractRaw(params, name) {#visitparamextractrawparams-name} Returns the value of a field, including separators. +Alias: `simpleJSONExtractRaw`. + Examples: ``` sql -visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' -visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' +visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"'; +visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}'; ``` ## visitParamExtractString(params, name) {#visitparamextractstringparams-name} Parses the string in double quotes. The value is unescaped. If unescaping failed, it returns an empty string. 
+Alias: `simpleJSONExtractString`. + Examples: ``` sql -visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' -visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' -visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' -visitParamExtractString('{"abc":"hello}', 'abc') = '' +visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0'; +visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺'; +visitParamExtractString('{"abc":"\\u263"}', 'abc') = ''; +visitParamExtractString('{"abc":"hello}', 'abc') = ''; ``` There is currently no support for code points in the format `\uXXXX\uYYYY` that are not from the basic multilingual plane (they are converted to CESU-8 instead of UTF-8). diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index 3ece30be5b8..d661bd4cd59 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -74,6 +74,9 @@ Deletes the column with the name `name`. If the `IF EXISTS` clause is specified, Deletes data from the file system. Since this deletes entire files, the query is completed almost instantly. +!!! warning "Warning" + You can’t delete a column if it is referenced by [materialized view](../../../sql-reference/statements/create/view.md#materialized). Otherwise, it returns an error. + Example: ``` sql @@ -180,7 +183,7 @@ ALTER TABLE table_name MODIFY column_name REMOVE property; ALTER TABLE table_with_ttl MODIFY COLUMN column_ttl REMOVE TTL; ``` -## See Also +**See Also** - [REMOVE TTL](ttl.md). diff --git a/docs/en/sql-reference/statements/create/table.md b/docs/en/sql-reference/statements/create/table.md index bad99980191..5f1f0151350 100644 --- a/docs/en/sql-reference/statements/create/table.md +++ b/docs/en/sql-reference/statements/create/table.md @@ -50,15 +50,32 @@ Creates a table with the same result as that of the [table function](../../../sq ### From SELECT query {#from-select-query} ``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... +CREATE TABLE [IF NOT EXISTS] [db.]table_name[(name1 [type1], name2 [type2], ...)] ENGINE = engine AS SELECT ... ``` -Creates a table with a structure like the result of the `SELECT` query, with the `engine` engine, and fills it with data from SELECT. +Creates a table with a structure like the result of the `SELECT` query, with the `engine` engine, and fills it with data from `SELECT`. Also you can explicitly specify columns description. -In all cases, if `IF NOT EXISTS` is specified, the query won’t return an error if the table already exists. In this case, the query won’t do anything. +If the table already exists and `IF NOT EXISTS` is specified, the query won’t do anything. There can be other clauses after the `ENGINE` clause in the query. See detailed documentation on how to create tables in the descriptions of [table engines](../../../engines/table-engines/index.md#table_engines). +**Example** + +Query: + +``` sql +CREATE TABLE t1 (x String) ENGINE = Memory AS SELECT 1; +SELECT x, toTypeName(x) FROM t1; +``` + +Result: + +```text +┌─x─┬─toTypeName(x)─┐ +│ 1 │ String │ +└───┴───────────────┘ +``` + ## NULL Or NOT NULL Modifiers {#null-modifiers} `NULL` and `NOT NULL` modifiers after data type in column definition allow or do not allow it to be [Nullable](../../../sql-reference/data-types/nullable.md#data_type-nullable). 
diff --git a/docs/en/sql-reference/statements/optimize.md b/docs/en/sql-reference/statements/optimize.md index 49a7404d76e..247252d3f4e 100644 --- a/docs/en/sql-reference/statements/optimize.md +++ b/docs/en/sql-reference/statements/optimize.md @@ -5,13 +5,18 @@ toc_title: OPTIMIZE # OPTIMIZE Statement {#misc_operations-optimize} +This query tries to initialize an unscheduled merge of data parts for tables. + +!!! warning "Warning" + `OPTIMIZE` can’t fix the `Too many parts` error. + +**Syntax** + ``` sql OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]] ``` -This query tries to initialize an unscheduled merge of data parts for tables with a table engine from the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) family. - -The `OPTMIZE` query is also supported for the [MaterializedView](../../engines/table-engines/special/materializedview.md) and the [Buffer](../../engines/table-engines/special/buffer.md) engines. Other table engines aren’t supported. +The `OPTMIZE` query is supported for [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) family, the [MaterializedView](../../engines/table-engines/special/materializedview.md) and the [Buffer](../../engines/table-engines/special/buffer.md) engines. Other table engines aren’t supported. When `OPTIMIZE` is used with the [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replication.md) family of table engines, ClickHouse creates a task for merging and waits for execution on all nodes (if the `replication_alter_partitions_sync` setting is enabled). @@ -21,12 +26,13 @@ When `OPTIMIZE` is used with the [ReplicatedMergeTree](../../engines/table-engin - If you specify `DEDUPLICATE`, then completely identical rows (unless by-clause is specified) will be deduplicated (all columns are compared), it makes sense only for the MergeTree engine. -### BY expression {#by-expression} +## BY expression {#by-expression} If you want to perform deduplication on custom set of columns rather than on all, you can specify list of columns explicitly or use any combination of [`*`](../../sql-reference/statements/select/index.md#asterisk), [`COLUMNS`](../../sql-reference/statements/select/index.md#columns-expression) or [`EXCEPT`](../../sql-reference/statements/select/index.md#except-modifier) expressions. The explictly written or implicitly expanded list of columns must include all columns specified in row ordering expression (both primary and sorting keys) and partitioning expression (partitioning key). -Note that `*` behaves just like in `SELECT`: `MATERIALIZED`, and `ALIAS` columns are not used for expansion. -Also, it is an error to specify empty list of columns, or write an expression that results in an empty list of columns, or deduplicate by an ALIAS column. +!!! note "Note" + Notice that `*` behaves just like in `SELECT`: `MATERIALIZED` and `ALIAS` columns are not used for expansion. + Also, it is an error to specify empty list of columns, or write an expression that results in an empty list of columns, or deduplicate by an ALIAS column. ``` sql OPTIMIZE TABLE table DEDUPLICATE; -- the old one @@ -39,9 +45,10 @@ OPTIMIZE TABLE table DEDUPLICATE BY COLUMNS('column-matched-by-regex') EXCEPT co OPTIMIZE TABLE table DEDUPLICATE BY COLUMNS('column-matched-by-regex') EXCEPT (colX, colY); ``` -**Example:** +**Examples** + +Create a table: -A silly synthetic table. 
``` sql CREATE TABLE example ( primary_key Int32, @@ -56,31 +63,31 @@ PARTITION BY partition_key ORDER BY (primary_key, secondary_key); ``` +The 'old' deduplicate, all columns are taken into account, i.e. row is removed only if all values in all columns are equal to corresponding values in previous row. + ``` sql --- The 'old' deduplicate, all columns are taken into account, i.e. row is removed only if all values in all columns are equal to corresponding values in previous row. OPTIMIZE TABLE example FINAL DEDUPLICATE; ``` +Deduplicate by all columns that are not `ALIAS` or `MATERIALIZED`: `primary_key`, `secondary_key`, `value`, `partition_key`, and `materialized_value` columns. + ``` sql --- Deduplicate by all columns that are not `ALIAS` or `MATERIALIZED`: `primary_key`, `secondary_key`, `value`, `partition_key`, and `materialized_value` columns. OPTIMIZE TABLE example FINAL DEDUPLICATE BY *; ``` +Deduplicate by all columns that are not `ALIAS` or `MATERIALIZED` and explicitly not `materialized_value`: `primary_key`, `secondary_key`, `value`, and `partition_key` columns. + ``` sql --- Deduplicate by all columns that are not `ALIAS` or `MATERIALIZED` and explicitly not `materialized_value`: `primary_key`, `secondary_key`, `value`, and `partition_key` columns. OPTIMIZE TABLE example FINAL DEDUPLICATE BY * EXCEPT materialized_value; ``` +Deduplicate explicitly by `primary_key`, `secondary_key`, and `partition_key` columns. ``` sql --- Deduplicate explicitly by `primary_key`, `secondary_key`, and `partition_key` columns. OPTIMIZE TABLE example FINAL DEDUPLICATE BY primary_key, secondary_key, partition_key; ``` +Deduplicate by any column matching a regex: `primary_key`, `secondary_key`, and `partition_key` columns. + ``` sql --- Deduplicate by any column matching a regex: `primary_key`, `secondary_key`, and `partition_key` columns. OPTIMIZE TABLE example FINAL DEDUPLICATE BY COLUMNS('.*_key'); ``` - - -!!! warning "Warning" - `OPTIMIZE` can’t fix the “Too many parts” error. diff --git a/docs/en/sql-reference/table-functions/s3.md b/docs/en/sql-reference/table-functions/s3.md index 34f0607b94c..285ec862aab 100644 --- a/docs/en/sql-reference/table-functions/s3.md +++ b/docs/en/sql-reference/table-functions/s3.md @@ -18,7 +18,7 @@ s3(path, [aws_access_key_id, aws_secret_access_key,] format, structure, [compres - `path` — Bucket url with path to file. Supports following wildcards in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc'`, `'def'` — strings. For more information see [here](../../engines/table-engines/integrations/s3.md#wildcards-in-path). - `format` — The [format](../../interfaces/formats.md#formats) of the file. - `structure` — Structure of the table. Format `'column1_name column1_type, column2_name column2_type, ...'`. -- `compression` — Parameter is optional. Supported values: none, gzip/gz, brotli/br, xz/LZMA, zstd/zst. By default, it will autodetect compression by file extension. +- `compression` — Parameter is optional. Supported values: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. By default, it will autodetect compression by file extension. 
**Returned value** diff --git a/docs/ru/commercial/cloud.md b/docs/ru/commercial/cloud.md index 6c192a34f9f..e00fc3be673 100644 --- a/docs/ru/commercial/cloud.md +++ b/docs/ru/commercial/cloud.md @@ -39,4 +39,20 @@ toc_title: "Поставщики облачных услуг ClickHouse" - поддержка прав доступа, one-key восстановления, многоуровневая защита сети, шифрование облачного диска; - полная интеграция с облачными системами логирования, базами данных и инструментами обработки данных; - встроенная платформа для мониторинга и управления базами данных; -- техническая поддержка от экспертов по работе с базами данных. \ No newline at end of file +- техническая поддержка от экспертов по работе с базами данных. + +## SberCloud {#sbercloud} + +[Облачная платформа SberCloud.Advanced](https://sbercloud.ru/ru/advanced): + +- предоставляет более 50 высокотехнологичных сервисов; +- позволяет быстро создавать и эффективно управлять ИТ-инфраструктурой, приложениями и интернет-сервисами; +- радикально минимизирует ресурсы, требуемые для работы корпоративных ИТ-систем; +- в разы сокращает время вывода новых продуктов на рынок. + +SberCloud.Advanced предоставляет [MapReduce Service (MRS)](https://docs.sbercloud.ru/mrs/ug/topics/ug__clickhouse.html) — надежную, безопасную и простую в использовании платформу корпоративного уровня для хранения, обработки и анализа больших данных. MRS позволяет быстро создавать и управлять кластерами ClickHouse. + +- Инстанс ClickHouse состоит из трех узлов ZooKeeper и нескольких узлов ClickHouse. Выделенный режим реплики используется для обеспечения высокой надежности двойных копий данных. +- MRS предлагает возможности гибкого масштабирования при быстром росте сервисов в сценариях, когда емкости кластерного хранилища или вычислительных ресурсов процессора недостаточно. MRS в один клик предоставляет инструмент для балансировки данных при расширении узлов ClickHouse в кластере. Вы можете определить режим и время балансировки данных на основе характеристик сервиса, чтобы обеспечить доступность сервиса. +- MRS использует архитектуру развертывания высокой доступности на основе Elastic Load Balance (ELB) — сервиса для автоматического распределения трафика на несколько внутренних узлов. Благодаря ELB, данные записываются в локальные таблицы и считываются из распределенных таблиц на разных узлах. Такая архитектура повышает отказоустойчивость кластера и гарантирует высокую доступность приложений. + diff --git a/docs/ru/engines/table-engines/index.md b/docs/ru/engines/table-engines/index.md index a364a3cb972..b17b2124250 100644 --- a/docs/ru/engines/table-engines/index.md +++ b/docs/ru/engines/table-engines/index.md @@ -48,6 +48,14 @@ toc_title: "Введение" Движки семейства: +- [Kafka](integrations/kafka.md#kafka) +- [MySQL](integrations/mysql.md#mysql) +- [ODBC](integrations/odbc.md#table-engine-odbc) +- [JDBC](integrations/jdbc.md#table-engine-jdbc) +- [S3](integrations/s3.md#table-engine-s3) + +### Специальные движки {#spetsialnye-dvizhki} + - [ODBC](../../engines/table-engines/integrations/odbc.md) - [JDBC](../../engines/table-engines/integrations/jdbc.md) - [MySQL](../../engines/table-engines/integrations/mysql.md) @@ -84,4 +92,3 @@ toc_title: "Введение" Чтобы получить данные из виртуального столбца, необходимо указать его название в запросе `SELECT`. `SELECT *` не отображает данные из виртуальных столбцов. При создании таблицы со столбцом, имя которого совпадает с именем одного из виртуальных столбцов таблицы, виртуальный столбец становится недоступным. Не делайте так. 
Чтобы помочь избежать конфликтов, имена виртуальных столбцов обычно предваряются подчеркиванием. - diff --git a/docs/ru/engines/table-engines/integrations/s3.md b/docs/ru/engines/table-engines/integrations/s3.md index fa10e8ebc34..216db98077c 100644 --- a/docs/ru/engines/table-engines/integrations/s3.md +++ b/docs/ru/engines/table-engines/integrations/s3.md @@ -19,7 +19,7 @@ ENGINE = S3(path, [aws_access_key_id, aws_secret_access_key,] format, structure, - `path` — URL-адрес бакета с указанием пути к файлу. Поддерживает следующие подстановочные знаки в режиме "только чтение": `*`, `?`, `{abc,def}` и `{N..M}` где `N`, `M` — числа, `'abc'`, `'def'` — строки. Подробнее смотри [ниже](#wildcards-in-path). - `format` — [формат](../../../interfaces/formats.md#formats) файла. - `structure` — структура таблицы в формате `'column1_name column1_type, column2_name column2_type, ...'`. -- `compression` — тип сжатия. Возможные значения: none, gzip/gz, brotli/br, xz/LZMA, zstd/zst. Необязательный параметр. Если не указано, то тип сжатия определяется автоматически по расширению файла. +- `compression` — тип сжатия. Возможные значения: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. Необязательный параметр. Если не указано, то тип сжатия определяется автоматически по расширению файла. **Пример** @@ -73,17 +73,17 @@ SELECT * FROM s3_engine_table LIMIT 2; Соображение безопасности: если злонамеренный пользователь попробует указать произвольные URL-адреса S3, параметр `s3_max_redirects` должен быть установлен в ноль, чтобы избежать атак [SSRF] (https://en.wikipedia.org/wiki/Server-side_request_forgery). Как альтернатива, в конфигурации сервера должен быть указан `remote_host_filter`. -## Настройки конечных точек {#endpoint-settings} +## Настройки точки приема запроса {#endpoint-settings} -Для конечной точки (которая соответствует точному префиксу URL-адреса) в конфигурационном файле могут быть заданы следующие настройки: +Для точки приема запроса (которая соответствует точному префиксу URL-адреса) в конфигурационном файле могут быть заданы следующие настройки: Обязательная настройка: -- `endpoint` — указывает префикс конечной точки. +- `endpoint` — указывает префикс точки приема запроса. Необязательные настройки: -- `access_key_id` и `secret_access_key` — указывают учетные данные для использования с данной конечной точкой. -- `use_environment_credentials` — если `true`, S3-клиент будет пытаться получить учетные данные из переменных среды и метаданных Amazon EC2 для данной конечной точки. Значение по умолчанию - `false`. -- `header` — добавляет указанный HTTP-заголовок к запросу на заданную конечную точку. Может быть определен несколько раз. +- `access_key_id` и `secret_access_key` — указывают учетные данные для использования с данной точкой приема запроса. +- `use_environment_credentials` — если `true`, S3-клиент будет пытаться получить учетные данные из переменных среды и метаданных Amazon EC2 для данной точки приема запроса. Значение по умолчанию - `false`. +- `header` — добавляет указанный HTTP-заголовок к запросу на заданную точку приема запроса. Может быть определен несколько раз. - `server_side_encryption_customer_key_base64` — устанавливает необходимые заголовки для доступа к объектам S3 с шифрованием SSE-C. **Пример** @@ -133,8 +133,7 @@ CREATE TABLE table_with_asterisk (name String, value UInt32) ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/{some,another}_prefix/*', 'CSV'); ``` -!!! 
warning "Warning" - Если список файлов содержит диапазоны чисел с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры отдельно или используйте `?`. +Если список файлов содержит диапазоны чисел с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры отдельно или используйте `?`. 4. Создание таблицы из файлов с именами `file-000.csv`, `file-001.csv`, … , `file-999.csv`: @@ -145,6 +144,3 @@ ENGINE = S3('https://storage.yandexcloud.net/my-test-bucket-768/big_prefix/file- **Смотрите также** - [Табличная функция S3](../../../sql-reference/table-functions/s3.md) - -[Оригинальная статья](https://clickhouse.tech/docs/ru/engines/table-engines/integrations/s3/) - diff --git a/docs/ru/engines/table-engines/mergetree-family/mergetree.md b/docs/ru/engines/table-engines/mergetree-family/mergetree.md index 7d7641a417d..b8bd259167a 100644 --- a/docs/ru/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/ru/engines/table-engines/mergetree-family/mergetree.md @@ -753,7 +753,8 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd' Необязательные параметры: -- `use_environment_credentials` — признак, нужно ли считывать учетные данные AWS из переменных окружения `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` и `AWS_SESSION_TOKEN`, если они есть. Значение по умолчанию: `false`. +- `use_environment_credentials` — признак, нужно ли считывать учетные данные AWS из сетевого окружения, а также из переменных окружения `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` и `AWS_SESSION_TOKEN`, если они есть. Значение по умолчанию: `false`. +- `use_insecure_imds_request` — признак, нужно ли использовать менее безопасное соединение при выполнении запроса к IMDS при получении учётных данных из метаданных Amazon EC2. Значение по умолчанию: `false`. - `proxy` — конфигурация прокси-сервера для конечной точки S3. Каждый элемент `uri` внутри блока `proxy` должен содержать URL прокси-сервера. - `connect_timeout_ms` — таймаут подключения к сокету в миллисекундах. Значение по умолчанию: 10 секунд. - `request_timeout_ms` — таймаут выполнения запроса в миллисекундах. Значение по умолчанию: 5 секунд. diff --git a/docs/ru/guides/apply-catboost-model.md b/docs/ru/guides/apply-catboost-model.md index 11964c57fc7..db2be63692f 100644 --- a/docs/ru/guides/apply-catboost-model.md +++ b/docs/ru/guides/apply-catboost-model.md @@ -158,7 +158,9 @@ FROM amazon_train /home/catboost/data/libcatboostmodel.so /home/catboost/models/*_model.xml ``` - +!!! note "Примечание" + Вы можете позднее изменить путь к конфигурации модели CatBoost без перезагрузки сервера. + ## 4. Запустите вывод модели из SQL {#run-model-inference} Для тестирования модели запустите клиент ClickHouse `$ clickhouse client`. diff --git a/docs/ru/interfaces/third-party/gui.md b/docs/ru/interfaces/third-party/gui.md index f913a0ff2cc..156f7130bc5 100644 --- a/docs/ru/interfaces/third-party/gui.md +++ b/docs/ru/interfaces/third-party/gui.md @@ -166,4 +166,19 @@ toc_title: "Визуальные интерфейсы от сторонних р [Как сконфигурировать ClickHouse в Looker.](https://docs.looker.com/setup-and-management/database-config/clickhouse) -[Original article](https://clickhouse.tech/docs/ru/interfaces/third-party/gui/) +### SeekTable {#seektable} + +[SeekTable](https://www.seektable.com) — это аналитический инструмент для самостоятельного анализа и обработки данных бизнес-аналитики. Он доступен как в виде облачного сервиса, так и в виде локальной версии. Отчеты из SeekTable могут быть встроены в любое веб-приложение. 
+ +Основные возможности: + +- Удобный конструктор отчетов. +- Гибкая настройка отчетов SQL и создание запросов для специфичных отчетов. +- Интегрируется с ClickHouse, используя собственную точку приема запроса TCP/IP или интерфейс HTTP(S) (два разных драйвера). +- Поддерживает всю мощь диалекта ClickHouse SQL для построения запросов по различным измерениям и показателям. +- [WEB-API](https://www.seektable.com/help/web-api-integration) для автоматизированной генерации отчетов. +- Процесс разработки отчетов поддерживает [резервное копирование/восстановление данных](https://www.seektable.com/help/self-hosted-backup-restore); конфигурация моделей данных (кубов) / отчетов представляет собой удобочитаемый XML-файл, который может храниться в системе контроля версий. + +SeekTable [бесплатен](https://www.seektable.com/help/cloud-pricing) для личного/индивидуального использования. + +[Как сконфигурировать подключение ClickHouse в SeekTable.](https://www.seektable.com/help/clickhouse-pivot-table) diff --git a/docs/ru/operations/system-tables/columns.md b/docs/ru/operations/system-tables/columns.md index af4cff85439..b8a0aef2299 100644 --- a/docs/ru/operations/system-tables/columns.md +++ b/docs/ru/operations/system-tables/columns.md @@ -4,7 +4,9 @@ С помощью этой таблицы можно получить информацию аналогично запросу [DESCRIBE TABLE](../../sql-reference/statements/misc.md#misc-describe-table), но для многих таблиц сразу. -Таблица `system.columns` содержит столбцы (тип столбца указан в скобках): +Колонки [временных таблиц](../../sql-reference/statements/create/table.md#temporary-tables) содержатся в `system.columns` только в тех сессиях, в которых эти таблицы были созданы. Поле `database` у таких колонок пустое. + +Cтолбцы: - `database` ([String](../../sql-reference/data-types/string.md)) — имя базы данных. - `table` ([String](../../sql-reference/data-types/string.md)) — имя таблицы. @@ -23,3 +25,46 @@ - `is_in_sampling_key` ([UInt8](../../sql-reference/data-types/int-uint.md)) — флаг, показывающий включение столбца в ключ выборки. - `compression_codec` ([String](../../sql-reference/data-types/string.md)) — имя кодека сжатия. +**Пример** + +```sql +SELECT * FROM system.columns LIMIT 2 FORMAT Vertical; +``` + +```text +Row 1: +────── +database: system +table: aggregate_function_combinators +name: name +type: String +default_kind: +default_expression: +data_compressed_bytes: 0 +data_uncompressed_bytes: 0 +marks_bytes: 0 +comment: +is_in_partition_key: 0 +is_in_sorting_key: 0 +is_in_primary_key: 0 +is_in_sampling_key: 0 +compression_codec: + +Row 2: +────── +database: system +table: aggregate_function_combinators +name: is_internal +type: UInt8 +default_kind: +default_expression: +data_compressed_bytes: 0 +data_uncompressed_bytes: 0 +marks_bytes: 0 +comment: +is_in_partition_key: 0 +is_in_sorting_key: 0 +is_in_primary_key: 0 +is_in_sampling_key: 0 +compression_codec: +``` diff --git a/docs/ru/operations/system-tables/tables.md b/docs/ru/operations/system-tables/tables.md index 42e55b1f6b7..11bb6a9eda2 100644 --- a/docs/ru/operations/system-tables/tables.md +++ b/docs/ru/operations/system-tables/tables.md @@ -1,39 +1,94 @@ # system.tables {#system-tables} -Содержит метаданные каждой таблицы, о которой знает сервер. Отсоединённые таблицы не отображаются в `system.tables`. +Содержит метаданные каждой таблицы, о которой знает сервер. 
-Эта таблица содержит следующие столбцы (тип столбца показан в скобках): +Отсоединённые таблицы ([DETACH](../../sql-reference/statements/detach.md)) не отображаются в `system.tables`. -- `database String` — имя базы данных, в которой находится таблица. -- `name` (String) — имя таблицы. -- `engine` (String) — движок таблицы (без параметров). -- `is_temporary` (UInt8) — флаг, указывающий на то, временная это таблица или нет. -- `data_path` (String) — путь к данным таблицы в файловой системе. -- `metadata_path` (String) — путь к табличным метаданным в файловой системе. -- `metadata_modification_time` (DateTime) — время последней модификации табличных метаданных. -- `dependencies_database` (Array(String)) — зависимости базы данных. -- `dependencies_table` (Array(String)) — табличные зависимости (таблицы [MaterializedView](../../engines/table-engines/special/materializedview.md), созданные на базе текущей таблицы). -- `create_table_query` (String) — запрос, которым создавалась таблица. -- `engine_full` (String) — параметры табличного движка. -- `partition_key` (String) — ключ партиционирования таблицы. -- `sorting_key` (String) — ключ сортировки таблицы. -- `primary_key` (String) - первичный ключ таблицы. -- `sampling_key` (String) — ключ сэмплирования таблицы. -- `storage_policy` (String) - политика хранения данных: +Информация о [временных таблицах](../../sql-reference/statements/create/table.md#temporary-tables) содержится в `system.tables` только в тех сессиях, в которых эти таблицы были созданы. Поле `database` у таких таблиц пустое, а флаг `is_temporary` включен. + +Столбцы: + +- `database` ([String](../../sql-reference/data-types/string.md)) — имя базы данных, в которой находится таблица. +- `name` ([String](../../sql-reference/data-types/string.md)) — имя таблицы. +- `engine` ([String](../../sql-reference/data-types/string.md)) — движок таблицы (без параметров). +- `is_temporary` ([UInt8](../../sql-reference/data-types/int-uint.md)) — флаг, указывающий на то, временная это таблица или нет. +- `data_path` ([String](../../sql-reference/data-types/string.md)) — путь к данным таблицы в файловой системе. +- `metadata_path` ([String](../../sql-reference/data-types/string.md)) — путь к табличным метаданным в файловой системе. +- `metadata_modification_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — время последней модификации табличных метаданных. +- `dependencies_database` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — зависимости базы данных. +- `dependencies_table` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — табличные зависимости (таблицы [MaterializedView](../../engines/table-engines/special/materializedview.md), созданные на базе текущей таблицы). +- `create_table_query` ([String](../../sql-reference/data-types/string.md)) — запрос, при помощи которого создавалась таблица. +- `engine_full` ([String](../../sql-reference/data-types/string.md)) — параметры табличного движка. +- `partition_key` ([String](../../sql-reference/data-types/string.md)) — ключ партиционирования таблицы. +- `sorting_key` ([String](../../sql-reference/data-types/string.md)) — ключ сортировки таблицы. +- `primary_key` ([String](../../sql-reference/data-types/string.md)) - первичный ключ таблицы. +- `sampling_key` ([String](../../sql-reference/data-types/string.md)) — ключ сэмплирования таблицы. 
+- `storage_policy` ([String](../../sql-reference/data-types/string.md)) - политика хранения данных: - [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes) - [Distributed](../../engines/table-engines/special/distributed.md#distributed) -- `total_rows` (Nullable(UInt64)) - общее количество строк, если есть возможность быстро определить точное количество строк в таблице, в противном случае `Null` (включая базовую таблицу `Buffer`). +- `total_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - общее количество строк, если есть возможность быстро определить точное количество строк в таблице, в противном случае `NULL` (включая базовую таблицу `Buffer`). -- `total_bytes` (Nullable(UInt64)) - общее количество байт, если можно быстро определить точное количество байт для таблицы на накопителе, в противном случае `Null` (**не включает** в себя никакого базового хранилища). +- `total_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - общее количество байт, если можно быстро определить точное количество байт для таблицы на накопителе, в противном случае `NULL` (не включает в себя никакого базового хранилища). - Если таблица хранит данные на диске, возвращает используемое пространство на диске (т. е. сжатое). - Если таблица хранит данные в памяти, возвращает приблизительное количество используемых байт в памяти. -- `lifetime_rows` (Nullable(UInt64)) - общее количество строк, добавленных оператором `INSERT` с момента запуска сервера (только для таблиц `Buffer`). +- `lifetime_rows` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - общее количество строк, добавленных оператором `INSERT` с момента запуска сервера (только для таблиц `Buffer`). -- `lifetime_bytes` (Nullable(UInt64)) - общее количество байт, добавленных оператором `INSERT` с момента запуска сервера (только для таблиц `Buffer`). +- `lifetime_bytes` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt64](../../sql-reference/data-types/int-uint.md))) - общее количество байт, добавленных оператором `INSERT` с момента запуска сервера (только для таблиц `Buffer`). Таблица `system.tables` используется при выполнении запроса `SHOW TABLES`. 
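To complement the note above about temporary tables, here is a hedged sketch of how their visibility can be checked; the table name `tmp_example` is invented, and the query must run in the same session that created the table:

``` sql
-- Temporary tables appear in system.tables only within the session that created them.
CREATE TEMPORARY TABLE tmp_example (id UInt32, value String);

SELECT database, name, engine, is_temporary
FROM system.tables
WHERE is_temporary;
```

In that session the row for `tmp_example` is expected to have an empty `database` and `is_temporary = 1`; other sessions will not see it.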
+**Пример** + +```sql +SELECT * FROM system.tables LIMIT 2 FORMAT Vertical; +``` + +```text +Row 1: +────── +database: system +name: aggregate_function_combinators +uuid: 00000000-0000-0000-0000-000000000000 +engine: SystemAggregateFunctionCombinators +is_temporary: 0 +data_paths: [] +metadata_path: /var/lib/clickhouse/metadata/system/aggregate_function_combinators.sql +metadata_modification_time: 1970-01-01 03:00:00 +dependencies_database: [] +dependencies_table: [] +create_table_query: +engine_full: +partition_key: +sorting_key: +primary_key: +sampling_key: +storage_policy: +total_rows: ᴺᵁᴸᴸ +total_bytes: ᴺᵁᴸᴸ + +Row 2: +────── +database: system +name: asynchronous_metrics +uuid: 00000000-0000-0000-0000-000000000000 +engine: SystemAsynchronousMetrics +is_temporary: 0 +data_paths: [] +metadata_path: /var/lib/clickhouse/metadata/system/asynchronous_metrics.sql +metadata_modification_time: 1970-01-01 03:00:00 +dependencies_database: [] +dependencies_table: [] +create_table_query: +engine_full: +partition_key: +sorting_key: +primary_key: +sampling_key: +storage_policy: +total_rows: ᴺᵁᴸᴸ +total_bytes: ᴺᵁᴸᴸ +``` diff --git a/docs/ru/operations/update.md b/docs/ru/operations/update.md index 34f86656b61..a3e87b52ede 100644 --- a/docs/ru/operations/update.md +++ b/docs/ru/operations/update.md @@ -29,5 +29,3 @@ $ sudo apt-get update $ sudo apt-get install clickhouse-server=xx.yy.a.b clickhouse-client=xx.yy.a.b clickhouse-common-static=xx.yy.a.b $ sudo service clickhouse-server restart ``` - -[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/update/) diff --git a/docs/ru/sql-reference/aggregate-functions/combinators.md b/docs/ru/sql-reference/aggregate-functions/combinators.md index eb52fa9bc75..74f9d1c1c05 100644 --- a/docs/ru/sql-reference/aggregate-functions/combinators.md +++ b/docs/ru/sql-reference/aggregate-functions/combinators.md @@ -27,6 +27,40 @@ toc_title: "Комбинаторы агрегатных функций" Комбинаторы -If и -Array можно сочетать. При этом, должен сначала идти Array, а потом If. Примеры: `uniqArrayIf(arr, cond)`, `quantilesTimingArrayIf(level1, level2)(arr, cond)`. Из-за такого порядка получается, что аргумент cond не должен быть массивом. +## -SimpleState {#agg-functions-combinator-simplestate} + +При использовании этого комбинатора агрегатная функция возвращает то же значение, но типа [SimpleAggregateFunction(...)](../../sql-reference/data-types/simpleaggregatefunction.md). Текущее значение функции может храниться в таблице для последующей работы с таблицами семейства [AggregatingMergeTree](../../engines/table-engines/mergetree-family/aggregatingmergetree.md). + +**Синтаксис** + +``` sql +SimpleState(x) +``` + +**Аргументы** + +- `x` — параметры агрегатной функции. + +**Возвращаемое значение** + +Значение агрегатной функции типа `SimpleAggregateFunction(...)`. 
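Before the basic example below, here is a hedged sketch of the storage use case mentioned in the description: writing `-SimpleState` results into a `SimpleAggregateFunction` column of an `AggregatingMergeTree` table. The table name and schema are invented:

``` sql
-- A hypothetical table whose `hits` column keeps only the current value of sum().
CREATE TABLE simple_state_example
(
    key  UInt64,
    hits SimpleAggregateFunction(sum, UInt64)
)
ENGINE = AggregatingMergeTree
ORDER BY key;

-- sumSimpleState returns SimpleAggregateFunction(sum, UInt64),
-- which matches the column type and can be inserted directly.
INSERT INTO simple_state_example
SELECT number % 3 AS key, sumSimpleState(number) AS hits
FROM numbers(100)
GROUP BY key;

-- Partial values are combined with the ordinary sum() on read.
SELECT key, sum(hits) FROM simple_state_example GROUP BY key ORDER BY key;
```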
+ +**Пример** + +Запрос: + +``` sql +WITH anySimpleState(number) AS c SELECT toTypeName(c), c FROM numbers(1); +``` + +Результат: + +``` text +┌─toTypeName(c)────────────────────────┬─c─┐ +│ SimpleAggregateFunction(any, UInt64) │ 0 │ +└──────────────────────────────────────┴───┘ +``` + ## -State {#state} В случае применения этого комбинатора, агрегатная функция возвращает не готовое значение (например, в случае функции [uniq](reference/uniq.md#agg_function-uniq) — количество уникальных значений), а промежуточное состояние агрегации (например, в случае функции `uniq` — хэш-таблицу для расчёта количества уникальных значений), которое имеет тип `AggregateFunction(...)` и может использоваться для дальнейшей обработки или может быть сохранено в таблицу для последующей доагрегации. @@ -247,4 +281,3 @@ FROM people │ [3,2] │ [11.5,12.949999809265137] │ └────────┴───────────────────────────┘ ``` - diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index 33a64f6bf26..7b81c577762 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -3,6 +3,8 @@ Хранит только текущее значение агрегатной функции и не сохраняет ее полное состояние, как это делает [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md). Такая оптимизация может быть применена к функциям, которые обладают следующим свойством: результат выполнения функции `f` к набору строк `S1 UNION ALL S2` может быть получен путем выполнения `f` к отдельным частям набора строк, а затем повторного выполнения `f` к результатам: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. Это свойство гарантирует, что результатов частичной агрегации достаточно для вычисления комбинированной, поэтому хранить и обрабатывать какие-либо дополнительные данные не требуется. +Чтобы получить промежуточное значение, обычно используются агрегатные функции с суффиксом [-SimpleState](../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-simplestate). + Поддерживаются следующие агрегатные функции: - [`any`](../../sql-reference/aggregate-functions/reference/any.md#agg_function-any) diff --git a/docs/ru/sql-reference/functions/json-functions.md b/docs/ru/sql-reference/functions/json-functions.md index 5d419d26981..4de487c03ad 100644 --- a/docs/ru/sql-reference/functions/json-functions.md +++ b/docs/ru/sql-reference/functions/json-functions.md @@ -16,51 +16,65 @@ toc_title: JSON ## visitParamHas(params, name) {#visitparamhasparams-name} -Проверить наличие поля с именем name. +Проверяет наличие поля с именем `name`. + +Алиас: `simpleJSONHas`. ## visitParamExtractUInt(params, name) {#visitparamextractuintparams-name} -Распарсить UInt64 из значения поля с именем name. Если поле строковое - попытаться распарсить число из начала строки. Если такого поля нет, или если оно есть, но содержит не число, то вернуть 0. +Пытается выделить число типа UInt64 из значения поля с именем `name`. Если поле строковое, пытается выделить число из начала строки. Если такого поля нет, или если оно есть, но содержит не число, то возвращает 0. + +Алиас: `simpleJSONExtractUInt`. ## visitParamExtractInt(params, name) {#visitparamextractintparams-name} Аналогично для Int64. +Алиас: `simpleJSONExtractInt`. + ## visitParamExtractFloat(params, name) {#visitparamextractfloatparams-name} Аналогично для Float64. +Алиас: `simpleJSONExtractFloat`. 
+ ## visitParamExtractBool(params, name) {#visitparamextractboolparams-name} -Распарсить значение true/false. Результат - UInt8. +Пытается выделить значение true/false. Результат — UInt8. + +Алиас: `simpleJSONExtractBool`. ## visitParamExtractRaw(params, name) {#visitparamextractrawparams-name} -Вернуть значение поля, включая разделители. +Возвращает значение поля, включая разделители. + +Алиас: `simpleJSONExtractRaw`. Примеры: ``` sql -visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"' -visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}' +visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"'; +visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}'; ``` ## visitParamExtractString(params, name) {#visitparamextractstringparams-name} -Распарсить строку в двойных кавычках. У значения убирается экранирование. Если убрать экранированные символы не удалось, то возвращается пустая строка. +Разбирает строку в двойных кавычках. У значения убирается экранирование. Если убрать экранированные символы не удалось, то возвращается пустая строка. + +Алиас: `simpleJSONExtractString`. Примеры: ``` sql -visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0' -visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺' -visitParamExtractString('{"abc":"\\u263"}', 'abc') = '' -visitParamExtractString('{"abc":"hello}', 'abc') = '' +visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0'; +visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺'; +visitParamExtractString('{"abc":"\\u263"}', 'abc') = ''; +visitParamExtractString('{"abc":"hello}', 'abc') = ''; ``` -На данный момент, не поддерживаются записанные в формате `\uXXXX\uYYYY` кодовые точки не из basic multilingual plane (они переводятся не в UTF-8, а в CESU-8). +На данный момент не поддерживаются записанные в формате `\uXXXX\uYYYY` кодовые точки не из basic multilingual plane (они переводятся не в UTF-8, а в CESU-8). -Следующие функции используют [simdjson](https://github.com/lemire/simdjson) который разработан под более сложные требования для разбора JSON. Упомянутое выше предположение 2 по-прежнему применимо. +Следующие функции используют [simdjson](https://github.com/lemire/simdjson), который разработан под более сложные требования для разбора JSON. Упомянутое выше допущение 2 по-прежнему применимо. ## isValidJSON(json) {#isvalidjsonjson} @@ -292,4 +306,3 @@ SELECT JSONExtractKeysAndValuesRaw('{"a": [-100, 200.0], "b":{"c": {"d": "hello" │ [('d','"hello"'),('f','"world"')] │ └───────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` - diff --git a/docs/ru/sql-reference/statements/alter/column.md b/docs/ru/sql-reference/statements/alter/column.md index 87fc1c78cd0..158ab2e7385 100644 --- a/docs/ru/sql-reference/statements/alter/column.md +++ b/docs/ru/sql-reference/statements/alter/column.md @@ -63,6 +63,9 @@ DROP COLUMN [IF EXISTS] name Запрос удаляет данные из файловой системы. Так как это представляет собой удаление целых файлов, запрос выполняется почти мгновенно. +!!! warning "Предупреждение" + Вы не можете удалить столбец, используемый в [материализованном представлениии](../../../sql-reference/statements/create/view.md#materialized). В противном случае будет ошибка. + Пример: ``` sql @@ -155,7 +158,7 @@ ALTER TABLE table_name MODIFY column_name REMOVE property; ALTER TABLE table_with_ttl MODIFY COLUMN column_ttl REMOVE TTL; ``` -## Смотрите также +**Смотрите также** - [REMOVE TTL](ttl.md). 
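The warning added above for `DROP COLUMN` can be illustrated with a minimal hedged sketch; all object names are invented and the exact error text depends on the server version:

``` sql
-- A hypothetical source table and a materialized view that selects its `value` column.
CREATE TABLE drop_col_demo (id UInt64, value UInt32) ENGINE = MergeTree ORDER BY id;

CREATE MATERIALIZED VIEW drop_col_demo_mv
ENGINE = MergeTree ORDER BY id
AS SELECT id, value FROM drop_col_demo;

-- Expected to fail, because drop_col_demo_mv depends on the `value` column.
ALTER TABLE drop_col_demo DROP COLUMN value;
```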
diff --git a/docs/ru/sql-reference/statements/create/table.md b/docs/ru/sql-reference/statements/create/table.md index b998435bcd8..1ccd0a600f3 100644 --- a/docs/ru/sql-reference/statements/create/table.md +++ b/docs/ru/sql-reference/statements/create/table.md @@ -46,15 +46,32 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name AS table_function() ### Из запроса SELECT {#from-select-query} ``` sql -CREATE TABLE [IF NOT EXISTS] [db.]table_name ENGINE = engine AS SELECT ... +CREATE TABLE [IF NOT EXISTS] [db.]table_name[(name1 [type1], name2 [type2], ...)] ENGINE = engine AS SELECT ... ``` -Создаёт таблицу со структурой, как результат запроса `SELECT`, с движком engine, и заполняет её данными из SELECT-а. +Создаёт таблицу со структурой, как результат запроса `SELECT`, с движком `engine`, и заполняет её данными из `SELECT`. Также вы можете явно задать описание столбцов. -Во всех случаях, если указано `IF NOT EXISTS`, то запрос не будет возвращать ошибку, если таблица уже существует. В этом случае, запрос будет ничего не делать. +Если таблица уже существует и указано `IF NOT EXISTS`, то запрос ничего не делает. После секции `ENGINE` в запросе могут использоваться и другие секции в зависимости от движка. Подробную документацию по созданию таблиц смотрите в описаниях [движков таблиц](../../../engines/table-engines/index.md#table_engines). +**Пример** + +Запрос: + +``` sql +CREATE TABLE t1 (x String) ENGINE = Memory AS SELECT 1; +SELECT x, toTypeName(x) FROM t1; +``` + +Результат: + +```text +┌─x─┬─toTypeName(x)─┐ +│ 1 │ String │ +└───┴───────────────┘ +``` + ## Модификатор NULL или NOT NULL {#null-modifiers} Модификатор `NULL` или `NOT NULL`, указанный после типа данных в определении столбца, позволяет или не позволяет типу данных быть [Nullable](../../../sql-reference/data-types/nullable.md#data_type-nullable). @@ -230,7 +247,7 @@ CREATE TABLE codec_example ) ENGINE = MergeTree() ``` -## Временные таблицы {#vremennye-tablitsy} +## Временные таблицы {#temporary-tables} ClickHouse поддерживает временные таблицы со следующими характеристиками: diff --git a/docs/ru/sql-reference/statements/optimize.md b/docs/ru/sql-reference/statements/optimize.md index 44101910a6c..e1a9d613537 100644 --- a/docs/ru/sql-reference/statements/optimize.md +++ b/docs/ru/sql-reference/statements/optimize.md @@ -5,19 +5,83 @@ toc_title: OPTIMIZE # OPTIMIZE {#misc_operations-optimize} -``` sql -OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE] -``` - -Запрос пытается запустить внеплановый мёрж кусков данных для таблиц семейства [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md). Другие движки таблиц не поддерживаются. - -Если `OPTIMIZE` применяется к таблицам семейства [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replication.md), ClickHouse создаёт задачу на мёрж и ожидает её исполнения на всех узлах (если активирована настройка `replication_alter_partitions_sync`). - -- Если `OPTIMIZE` не выполняет мёрж по любой причине, ClickHouse не оповещает об этом клиента. Чтобы включить оповещения, используйте настройку [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop). -- Если указать `PARTITION`, то оптимизация выполняется только для указанной партиции. [Как задавать имя партиции в запросах](alter/index.md#alter-how-to-specify-part-expr). -- Если указать `FINAL`, то оптимизация выполняется даже в том случае, если все данные уже лежат в одном куске. 
Кроме того, слияние является принудительным, даже если выполняются параллельные слияния. -- Если указать `DEDUPLICATE`, то произойдет схлопывание полностью одинаковых строк (сравниваются значения во всех колонках), имеет смысл только для движка MergeTree. +Запрос пытается запустить внеплановое слияние кусков данных для таблиц. !!! warning "Внимание" - Запрос `OPTIMIZE` не может устранить причину появления ошибки «Too many parts». - + `OPTIMIZE` не устраняет причину появления ошибки `Too many parts`. + +**Синтаксис** + +``` sql +OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION ID 'partition_id'] [FINAL] [DEDUPLICATE [BY expression]] +``` + +Может применяться к таблицам семейства [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md), [MaterializedView](../../engines/table-engines/special/materializedview.md) и [Buffer](../../engines/table-engines/special/buffer.md). Другие движки таблиц не поддерживаются. + +Если запрос `OPTIMIZE` применяется к таблицам семейства [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/replication.md), ClickHouse создаёт задачу на слияние и ожидает её исполнения на всех узлах (если активирована настройка `replication_alter_partitions_sync`). + +- По умолчанию, если запросу `OPTIMIZE` не удалось выполнить слияние, то +ClickHouse не оповещает клиента. Чтобы включить оповещения, используйте настройку [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop). +- Если указать `PARTITION`, то оптимизация выполняется только для указанной партиции. [Как задавать имя партиции в запросах](alter/index.md#alter-how-to-specify-part-expr). +- Если указать `FINAL`, то оптимизация выполняется даже в том случае, если все данные уже лежат в одном куске данных. Кроме того, слияние является принудительным, даже если выполняются параллельные слияния. +- Если указать `DEDUPLICATE`, то произойдет схлопывание полностью одинаковых строк (сравниваются значения во всех столбцах), имеет смысл только для движка MergeTree. + +## Выражение BY {#by-expression} + +Чтобы выполнить дедупликацию по произвольному набору столбцов, вы можете явно указать список столбцов или использовать любую комбинацию подстановки [`*`](../../sql-reference/statements/select/index.md#asterisk), выражений [`COLUMNS`](../../sql-reference/statements/select/index.md#columns-expression) и [`EXCEPT`](../../sql-reference/statements/select/index.md#except-modifier). + + Список столбцов для дедупликации должен включать все столбцы, указанные в условиях сортировки (первичный ключ и ключ сортировки), а также в условиях партиционирования (ключ партиционирования). + + !!! note "Примечание" + Обратите внимание, что символ подстановки `*` обрабатывается так же, как и в запросах `SELECT`: столбцы `MATERIALIZED` и `ALIAS` не включаются в результат. + Если указать пустой список или выражение, которое возвращает пустой список, или дедуплицировать столбец по псевдониму (`ALIAS`), то сервер вернет ошибку. + + +**Примеры** + +Рассмотрим таблицу: + +``` sql +CREATE TABLE example ( + primary_key Int32, + secondary_key Int32, + value UInt32, + partition_key UInt32, + materialized_value UInt32 MATERIALIZED 12345, + aliased_value UInt32 ALIAS 2, + PRIMARY KEY primary_key +) ENGINE=MergeTree +PARTITION BY partition_key; +``` + +Прежний способ дедупликации, когда учитываются все столбцы. Строка удаляется только в том случае, если все значения во всех столбцах равны соответствующим значениям в предыдущей строке. 
+ +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE; +``` + +Дедупликация по всем столбцам, кроме `ALIAS` и `MATERIALIZED`: `primary_key`, `secondary_key`, `value`, `partition_key` и `materialized_value`. + + +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE BY *; +``` + +Дедупликация по всем столбцам, кроме `ALIAS`, `MATERIALIZED` и `materialized_value`: столбцы `primary_key`, `secondary_key`, `value` и `partition_key`. + + +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE BY * EXCEPT materialized_value; +``` + +Дедупликация по столбцам `primary_key`, `secondary_key` и `partition_key`. + +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE BY primary_key, secondary_key, partition_key; +``` + +Дедупликация по любому столбцу, соответствующему регулярному выражению: столбцам `primary_key`, `secondary_key` и `partition_key`. + +``` sql +OPTIMIZE TABLE example FINAL DEDUPLICATE BY COLUMNS('.*_key'); +``` diff --git a/docs/ru/sql-reference/table-functions/s3.md b/docs/ru/sql-reference/table-functions/s3.md index 1d3fc8cfdb7..e062e59c67c 100644 --- a/docs/ru/sql-reference/table-functions/s3.md +++ b/docs/ru/sql-reference/table-functions/s3.md @@ -18,7 +18,7 @@ s3(path, [aws_access_key_id, aws_secret_access_key,] format, structure, [compres - `path` — URL-адрес бакета с указанием пути к файлу. Поддерживает следующие подстановочные знаки в режиме "только чтение": `*, ?, {abc,def} и {N..M}` где `N, M` — числа, `'abc', 'def'` — строки. Подробнее смотри [здесь](../../engines/table-engines/integrations/s3.md#wildcards-in-path). - `format` — [формат](../../interfaces/formats.md#formats) файла. - `structure` — cтруктура таблицы. Формат `'column1_name column1_type, column2_name column2_type, ...'`. -- `compression` — автоматически обнаруживает сжатие по расширению файла. Возможные значения: none, gzip/gz, brotli/br, xz/LZMA, zstd/zst. Необязательный параметр. +- `compression` — автоматически обнаруживает сжатие по расширению файла. Возможные значения: `none`, `gzip/gz`, `brotli/br`, `xz/LZMA`, `zstd/zst`. Необязательный параметр. 
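As a hedged illustration of the parameters listed above (the bucket URL and column names are invented; `{0..9}` is one of the wildcard forms allowed in `path`):

``` sql
-- Count rows across ten gzip-compressed CSV files selected with the {N..M} wildcard.
SELECT count()
FROM s3(
    'https://storage.yandexcloud.net/my-test-bucket/data-{0..9}.csv.gz',
    'CSV',
    'name String, value UInt32',
    'gzip'
);
```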
**Возвращаемые значения** diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index c917dbe30a3..ad3ff84d8bf 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -33,8 +33,12 @@ option (ENABLE_CLICKHOUSE_OBFUSCATOR "Table data obfuscator (convert real data t ${ENABLE_CLICKHOUSE_ALL}) # https://clickhouse.tech/docs/en/operations/utilities/odbc-bridge/ -option (ENABLE_CLICKHOUSE_ODBC_BRIDGE "HTTP-server working like a proxy to ODBC driver" - ${ENABLE_CLICKHOUSE_ALL}) +if (ENABLE_ODBC) + option (ENABLE_CLICKHOUSE_ODBC_BRIDGE "HTTP-server working like a proxy to ODBC driver" + ${ENABLE_CLICKHOUSE_ALL}) +else () + option (ENABLE_CLICKHOUSE_ODBC_BRIDGE "HTTP-server working like a proxy to ODBC driver" OFF) +endif () option (ENABLE_CLICKHOUSE_LIBRARY_BRIDGE "HTTP-server working like a proxy to Library dictionary source" ${ENABLE_CLICKHOUSE_ALL}) diff --git a/programs/server/.gitignore b/programs/server/.gitignore index b774776e4be..ddc480e4b29 100644 --- a/programs/server/.gitignore +++ b/programs/server/.gitignore @@ -1,8 +1,11 @@ -/access -/dictionaries_lib -/flags -/format_schemas +/metadata /metadata_dropped +/data +/store +/access +/flags +/dictionaries_lib +/format_schemas /preprocessed_configs /shadow /tmp diff --git a/programs/server/CMakeLists.txt b/programs/server/CMakeLists.txt index 697851b294b..3a04228942b 100644 --- a/programs/server/CMakeLists.txt +++ b/programs/server/CMakeLists.txt @@ -19,6 +19,7 @@ set (CLICKHOUSE_SERVER_LINK clickhouse_storages_system clickhouse_table_functions string_utils + jemalloc ${LINK_RESOURCE_LIB} diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index e3b4316079c..e874122250c 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -101,6 +101,10 @@ # include #endif +#if USE_JEMALLOC +# include +#endif + namespace CurrentMetrics { extern const Metric Revision; @@ -109,11 +113,35 @@ namespace CurrentMetrics extern const Metric MaxDDLEntryID; } +#if USE_JEMALLOC +static bool jemallocOptionEnabled(const char *name) +{ + bool value; + size_t size = sizeof(value); + + if (mallctl(name, reinterpret_cast(&value), &size, /* newp= */ nullptr, /* newlen= */ 0)) + throw Poco::SystemException("mallctl() failed"); + + return value; +} +#else +static bool jemallocOptionEnabled(const char *) { return 0; } +#endif + int mainEntryClickHouseServer(int argc, char ** argv) { DB::Server app; + if (jemallocOptionEnabled("opt.background_thread")) + { + LOG_ERROR(&app.logger(), + "jemalloc.background_thread was requested, " + "however ClickHouse uses percpu_arena and background_thread most likely will not give any benefits, " + "and also background_thread is not compatible with ClickHouse watchdog " + "(that can be disabled with CLICKHOUSE_WATCHDOG_ENABLE=0)"); + } + /// Do not fork separate process from watchdog if we attached to terminal. /// Otherwise it breaks gdb usage. /// Can be overridden by environment variable (cannot use server config at this moment). 
diff --git a/programs/server/data/.gitignore b/programs/server/data/.gitignore deleted file mode 100644 index b9719d9d1d1..00000000000 --- a/programs/server/data/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -*.txt -*.dat -*.idx diff --git a/programs/server/metadata/.gitignore b/programs/server/metadata/.gitignore deleted file mode 100644 index d1b811b7de5..00000000000 --- a/programs/server/metadata/.gitignore +++ /dev/null @@ -1 +0,0 @@ -*.sql diff --git a/src/AggregateFunctions/AggregateFunctionAvg.h b/src/AggregateFunctions/AggregateFunctionAvg.h index 96d48941f8b..8a6491d9b61 100644 --- a/src/AggregateFunctions/AggregateFunctionAvg.h +++ b/src/AggregateFunctions/AggregateFunctionAvg.h @@ -96,7 +96,7 @@ public: UInt32 num_scale_ = 0, UInt32 denom_scale_ = 0) : Base(argument_types_, {}), num_scale(num_scale_), denom_scale(denom_scale_) {} - DataTypePtr getReturnType() const final { return std::make_shared>(); } + DataTypePtr getReturnType() const override { return std::make_shared>(); } bool allocatesMemoryInArena() const override { return false; } diff --git a/src/AggregateFunctions/AggregateFunctionSumCount.cpp b/src/AggregateFunctions/AggregateFunctionSumCount.cpp new file mode 100644 index 00000000000..b979779d907 --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionSumCount.cpp @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include "registerAggregateFunctions.h" + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +namespace +{ +bool allowType(const DataTypePtr& type) noexcept +{ + const WhichDataType t(type); + return t.isInt() || t.isUInt() || t.isFloat() || t.isDecimal(); +} + +AggregateFunctionPtr createAggregateFunctionSumCount(const std::string & name, const DataTypes & argument_types, const Array & parameters) +{ + assertNoParameters(name, parameters); + assertUnary(name, argument_types); + + AggregateFunctionPtr res; + DataTypePtr data_type = argument_types[0]; + if (!allowType(data_type)) + throw Exception("Illegal type " + data_type->getName() + " of argument for aggregate function " + name, + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + if (isDecimal(data_type)) + res.reset(createWithDecimalType( + *data_type, argument_types, getDecimalScale(*data_type))); + else + res.reset(createWithNumericType(*data_type, argument_types)); + + return res; +} + +} + +void registerAggregateFunctionSumCount(AggregateFunctionFactory & factory) +{ + factory.registerFunction("sumCount", createAggregateFunctionSumCount); +} + +} diff --git a/src/AggregateFunctions/AggregateFunctionSumCount.h b/src/AggregateFunctions/AggregateFunctionSumCount.h new file mode 100644 index 00000000000..1026b6272ba --- /dev/null +++ b/src/AggregateFunctions/AggregateFunctionSumCount.h @@ -0,0 +1,55 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ +template +using DecimalOrNumberDataType = std::conditional_t, DataTypeDecimal>, DataTypeNumber>>; +template +class AggregateFunctionSumCount final : public AggregateFunctionAvgBase, UInt64, AggregateFunctionSumCount> +{ +public: + using Base = AggregateFunctionAvgBase, UInt64, AggregateFunctionSumCount>; + + AggregateFunctionSumCount(const DataTypes & argument_types_, UInt32 num_scale_ = 0) + : Base(argument_types_, num_scale_), scale(num_scale_) {} + + DataTypePtr getReturnType() const override + { + DataTypes types; + if constexpr (IsDecimalNumber) + types.emplace_back(std::make_shared>(DecimalOrNumberDataType::maxPrecision(), scale)); + else + types.emplace_back(std::make_shared>()); + + 
types.emplace_back(std::make_shared()); + + return std::make_shared(types); + } + + void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const final + { + assert_cast> &>((assert_cast(to)).getColumn(0)).getData().push_back( + this->data(place).numerator); + + assert_cast((assert_cast(to)).getColumn(1)).getData().push_back( + this->data(place).denominator); + } + + void NO_SANITIZE_UNDEFINED add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const final + { + this->data(place).numerator += static_cast &>(*columns[0]).getData()[row_num]; + ++this->data(place).denominator; + } + + String getName() const final { return "sumCount"; } + +private: + UInt32 scale; +}; + +} diff --git a/src/AggregateFunctions/AggregateFunctionUniqUpTo.h b/src/AggregateFunctions/AggregateFunctionUniqUpTo.h index 88f26ab312e..853e1cb6447 100644 --- a/src/AggregateFunctions/AggregateFunctionUniqUpTo.h +++ b/src/AggregateFunctions/AggregateFunctionUniqUpTo.h @@ -17,7 +17,7 @@ #include -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" #endif @@ -280,7 +280,7 @@ public: } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/AggregateFunctions/ReservoirSamplerDeterministic.h b/src/AggregateFunctions/ReservoirSamplerDeterministic.h index 5cf97ae0f85..9c62160b964 100644 --- a/src/AggregateFunctions/ReservoirSamplerDeterministic.h +++ b/src/AggregateFunctions/ReservoirSamplerDeterministic.h @@ -163,7 +163,7 @@ public: sorted = false; } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wclass-memaccess" #endif @@ -191,7 +191,7 @@ public: } } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/AggregateFunctions/registerAggregateFunctions.cpp b/src/AggregateFunctions/registerAggregateFunctions.cpp index ae26fdc5d40..ceba1531e03 100644 --- a/src/AggregateFunctions/registerAggregateFunctions.cpp +++ b/src/AggregateFunctions/registerAggregateFunctions.cpp @@ -25,6 +25,7 @@ void registerAggregateFunctionsAny(AggregateFunctionFactory &); void registerAggregateFunctionsStatisticsStable(AggregateFunctionFactory &); void registerAggregateFunctionsStatisticsSimple(AggregateFunctionFactory &); void registerAggregateFunctionSum(AggregateFunctionFactory &); +void registerAggregateFunctionSumCount(AggregateFunctionFactory &); void registerAggregateFunctionSumMap(AggregateFunctionFactory &); void registerAggregateFunctionsUniq(AggregateFunctionFactory &); void registerAggregateFunctionUniqCombined(AggregateFunctionFactory &); @@ -83,6 +84,7 @@ void registerAggregateFunctions() registerAggregateFunctionsStatisticsStable(factory); registerAggregateFunctionsStatisticsSimple(factory); registerAggregateFunctionSum(factory); + registerAggregateFunctionSumCount(factory); registerAggregateFunctionSumMap(factory); registerAggregateFunctionsUniq(factory); registerAggregateFunctionUniqCombined(factory); diff --git a/src/AggregateFunctions/ya.make b/src/AggregateFunctions/ya.make index 3a8f0ad9fba..64605aee659 100644 --- a/src/AggregateFunctions/ya.make +++ b/src/AggregateFunctions/ya.make @@ -50,6 +50,7 @@ SRCS( AggregateFunctionStatisticsSimple.cpp AggregateFunctionStudentTTest.cpp AggregateFunctionSum.cpp + AggregateFunctionSumCount.cpp AggregateFunctionSumMap.cpp AggregateFunctionTopK.cpp AggregateFunctionUniq.cpp diff --git a/src/Client/MultiplexedConnections.cpp 
b/src/Client/MultiplexedConnections.cpp index 2992e991df7..350beffce28 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -13,6 +13,7 @@ namespace ErrorCodes extern const int MISMATCH_REPLICAS_DATA_SOURCES; extern const int NO_AVAILABLE_REPLICA; extern const int TIMEOUT_EXCEEDED; + extern const int UNKNOWN_PACKET_FROM_SERVER; } @@ -278,7 +279,22 @@ Packet MultiplexedConnections::receivePacketUnlocked(AsyncCallback async_callbac Packet packet; { AsyncCallbackSetter async_setter(current_connection, std::move(async_callback)); - packet = current_connection->receivePacket(); + + try + { + packet = current_connection->receivePacket(); + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::UNKNOWN_PACKET_FROM_SERVER) + { + /// Exception may happen when packet is received, e.g. when got unknown packet. + /// In this case, invalidate replica, so that we would not read from it anymore. + current_connection->disconnect(); + invalidateReplica(state); + } + throw; + } } switch (packet.type) diff --git a/src/Columns/ColumnString.h b/src/Columns/ColumnString.h index 1792491c60d..0814ebaa826 100644 --- a/src/Columns/ColumnString.h +++ b/src/Columns/ColumnString.h @@ -111,7 +111,7 @@ public: } /// Suppress gcc 7.3.1 warning: '*((void*)& +8)' may be used uninitialized in this function -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -128,7 +128,7 @@ public: offsets.push_back(new_size); } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/Common/Allocator.h b/src/Common/Allocator.h index e3c6ddf9ff4..ebfd654d558 100644 --- a/src/Common/Allocator.h +++ b/src/Common/Allocator.h @@ -277,7 +277,7 @@ private: * GCC 4.9 mistakenly assumes that we can call `free` from a pointer to the stack. * In fact, the combination of conditions inside AllocatorWithStackMemory does not allow this. 
*/ -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wfree-nonheap-object" #endif @@ -359,6 +359,6 @@ extern template class Allocator; extern template class Allocator; extern template class Allocator; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/Common/UInt128.h b/src/Common/UInt128.h index 06fddee8dc9..be96f409673 100644 --- a/src/Common/UInt128.h +++ b/src/Common/UInt128.h @@ -19,7 +19,7 @@ namespace DB struct UInt128 { /// Suppress gcc7 warnings: 'prev_key.DB::UInt128::low' may be used uninitialized in this function -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -92,7 +92,7 @@ struct UInt128 return static_cast(low); } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif @@ -150,7 +150,7 @@ struct DummyUInt256 { /// Suppress gcc7 warnings: 'prev_key.DB::UInt256::a' may be used uninitialized in this function -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -179,7 +179,7 @@ struct DummyUInt256 bool operator== (const UInt64 rhs) const { return a == rhs && b == 0 && c == 0 && d == 0; } bool operator!= (const UInt64 rhs) const { return !operator==(rhs); } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index 9ff37a7045d..2d947bb402c 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -116,6 +116,7 @@ struct Request virtual ~Request() = default; virtual String getPath() const = 0; virtual void addRootPath(const String & /* root_path */) {} + virtual size_t bytesSize() const { return 0; } }; struct Response; @@ -131,6 +132,7 @@ struct Response Response & operator=(const Response &) = default; virtual ~Response() = default; virtual void removeRootPath(const String & /* root_path */) {} + virtual size_t bytesSize() const { return 0; } }; struct WatchResponse : virtual Response @@ -140,6 +142,8 @@ struct WatchResponse : virtual Response String path; void removeRootPath(const String & root_path) override; + + size_t bytesSize() const override { return path.size() + sizeof(type) + sizeof(state); } }; using WatchCallback = std::function; @@ -154,6 +158,9 @@ struct CreateRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return path.size() + data.size() + + sizeof(is_ephemeral) + sizeof(is_sequential) + acls.size() * sizeof(ACL); } }; struct CreateResponse : virtual Response @@ -161,6 +168,8 @@ struct CreateResponse : virtual Response String path_created; void removeRootPath(const String & root_path) override; + + size_t bytesSize() const override { return path_created.size(); } }; struct RemoveRequest : virtual Request @@ -170,6 +179,8 @@ struct RemoveRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return path.size() + sizeof(version); } }; struct RemoveResponse : virtual Response @@ -182,11 +193,15 @@ struct ExistsRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return path.size(); } }; struct ExistsResponse : virtual Response { 
Stat stat; + + size_t bytesSize() const override { return sizeof(Stat); } }; struct GetRequest : virtual Request @@ -195,12 +210,16 @@ struct GetRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return path.size(); } }; struct GetResponse : virtual Response { String data; Stat stat; + + size_t bytesSize() const override { return data.size() + sizeof(stat); } }; struct SetRequest : virtual Request @@ -211,11 +230,15 @@ struct SetRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return data.size() + data.size() + sizeof(version); } }; struct SetResponse : virtual Response { Stat stat; + + size_t bytesSize() const override { return sizeof(stat); } }; struct ListRequest : virtual Request @@ -224,12 +247,22 @@ struct ListRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return path.size(); } }; struct ListResponse : virtual Response { std::vector names; Stat stat; + + size_t bytesSize() const override + { + size_t size = sizeof(stat); + for (const auto & name : names) + size += name.size(); + return size; + } }; struct CheckRequest : virtual Request @@ -239,6 +272,8 @@ struct CheckRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return path; } + + size_t bytesSize() const override { return path.size() + sizeof(version); } }; struct CheckResponse : virtual Response @@ -251,6 +286,14 @@ struct MultiRequest : virtual Request void addRootPath(const String & root_path) override; String getPath() const override { return {}; } + + size_t bytesSize() const override + { + size_t size = 0; + for (const auto & request : requests) + size += request->bytesSize(); + return size; + } }; struct MultiResponse : virtual Response @@ -258,6 +301,14 @@ struct MultiResponse : virtual Response Responses responses; void removeRootPath(const String & root_path) override; + + size_t bytesSize() const override + { + size_t size = 0; + for (const auto & response : responses) + size += response->bytesSize(); + return size; + } }; /// This response may be received only as an element of responses in MultiResponse. 
diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.cpp b/src/Common/ZooKeeper/ZooKeeperCommon.cpp index 56f9de31ec8..50bdc6c77ba 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.cpp +++ b/src/Common/ZooKeeper/ZooKeeperCommon.cpp @@ -455,6 +455,39 @@ ZooKeeperResponsePtr ZooKeeperCheckRequest::makeResponse() const { return std::m ZooKeeperResponsePtr ZooKeeperMultiRequest::makeResponse() const { return std::make_shared(requests); } ZooKeeperResponsePtr ZooKeeperCloseRequest::makeResponse() const { return std::make_shared(); } +void ZooKeeperSessionIDRequest::writeImpl(WriteBuffer & out) const +{ + Coordination::write(internal_id, out); + Coordination::write(session_timeout_ms, out); + Coordination::write(server_id, out); +} + +void ZooKeeperSessionIDRequest::readImpl(ReadBuffer & in) +{ + Coordination::read(internal_id, in); + Coordination::read(session_timeout_ms, in); + Coordination::read(server_id, in); +} + +Coordination::ZooKeeperResponsePtr ZooKeeperSessionIDRequest::makeResponse() const +{ + return std::make_shared(); +} + +void ZooKeeperSessionIDResponse::readImpl(ReadBuffer & in) +{ + Coordination::read(internal_id, in); + Coordination::read(session_id, in); + Coordination::read(server_id, in); +} + +void ZooKeeperSessionIDResponse::writeImpl(WriteBuffer & out) const +{ + Coordination::write(internal_id, out); + Coordination::write(session_id, out); + Coordination::write(server_id, out); +} + void ZooKeeperRequestFactory::registerRequest(OpNum op_num, Creator creator) { if (!op_num_to_request.try_emplace(op_num, creator).second) @@ -511,6 +544,7 @@ ZooKeeperRequestFactory::ZooKeeperRequestFactory() registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); registerZooKeeperRequest(*this); + registerZooKeeperRequest(*this); } } diff --git a/src/Common/ZooKeeper/ZooKeeperCommon.h b/src/Common/ZooKeeper/ZooKeeperCommon.h index 92b1e7c9858..dd95eaa6b67 100644 --- a/src/Common/ZooKeeper/ZooKeeperCommon.h +++ b/src/Common/ZooKeeper/ZooKeeperCommon.h @@ -84,6 +84,8 @@ struct ZooKeeperSyncRequest final : ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return false; } + + size_t bytesSize() const override { return ZooKeeperRequest::bytesSize() + path.size(); } }; struct ZooKeeperSyncResponse final : ZooKeeperResponse @@ -92,6 +94,8 @@ struct ZooKeeperSyncResponse final : ZooKeeperResponse void readImpl(ReadBuffer & in) override; void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::Sync; } + + size_t bytesSize() const override { return path.size(); } }; struct ZooKeeperHeartbeatResponse final : ZooKeeperResponse @@ -128,6 +132,9 @@ struct ZooKeeperAuthRequest final : ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return false; } + + size_t bytesSize() const override { return ZooKeeperRequest::bytesSize() + sizeof(xid) + + sizeof(type) + scheme.size() + data.size(); } }; struct ZooKeeperAuthResponse final : ZooKeeperResponse @@ -136,6 +143,8 @@ struct ZooKeeperAuthResponse final : ZooKeeperResponse void writeImpl(WriteBuffer &) const override {} OpNum getOpNum() const override { return OpNum::Auth; } + + size_t bytesSize() const override { return ZooKeeperResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperCloseRequest final : ZooKeeperRequest @@ -172,6 +181,8 @@ struct ZooKeeperCreateRequest final : public CreateRequest, ZooKeeperRequest ZooKeeperResponsePtr 
makeResponse() const override; bool isReadRequest() const override { return false; } + + size_t bytesSize() const override { return CreateRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); } }; struct ZooKeeperCreateResponse final : CreateResponse, ZooKeeperResponse @@ -181,6 +192,8 @@ struct ZooKeeperCreateResponse final : CreateResponse, ZooKeeperResponse void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::Create; } + + size_t bytesSize() const override { return CreateResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperRemoveRequest final : RemoveRequest, ZooKeeperRequest @@ -194,6 +207,8 @@ struct ZooKeeperRemoveRequest final : RemoveRequest, ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return false; } + + size_t bytesSize() const override { return RemoveRequest::bytesSize() + sizeof(xid); } }; struct ZooKeeperRemoveResponse final : RemoveResponse, ZooKeeperResponse @@ -201,6 +216,8 @@ struct ZooKeeperRemoveResponse final : RemoveResponse, ZooKeeperResponse void readImpl(ReadBuffer &) override {} void writeImpl(WriteBuffer &) const override {} OpNum getOpNum() const override { return OpNum::Remove; } + + size_t bytesSize() const override { return RemoveResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperExistsRequest final : ExistsRequest, ZooKeeperRequest @@ -211,6 +228,8 @@ struct ZooKeeperExistsRequest final : ExistsRequest, ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return !has_watch; } + + size_t bytesSize() const override { return ExistsRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); } }; struct ZooKeeperExistsResponse final : ExistsResponse, ZooKeeperResponse @@ -218,6 +237,8 @@ struct ZooKeeperExistsResponse final : ExistsResponse, ZooKeeperResponse void readImpl(ReadBuffer & in) override; void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::Exists; } + + size_t bytesSize() const override { return ExistsResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperGetRequest final : GetRequest, ZooKeeperRequest @@ -228,6 +249,8 @@ struct ZooKeeperGetRequest final : GetRequest, ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return !has_watch; } + + size_t bytesSize() const override { return GetRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); } }; struct ZooKeeperGetResponse final : GetResponse, ZooKeeperResponse @@ -235,6 +258,8 @@ struct ZooKeeperGetResponse final : GetResponse, ZooKeeperResponse void readImpl(ReadBuffer & in) override; void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::Get; } + + size_t bytesSize() const override { return GetResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperSetRequest final : SetRequest, ZooKeeperRequest @@ -247,6 +272,8 @@ struct ZooKeeperSetRequest final : SetRequest, ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return false; } + + size_t bytesSize() const override { return SetRequest::bytesSize() + sizeof(xid); } }; struct ZooKeeperSetResponse final : SetResponse, ZooKeeperResponse @@ -254,6 +281,8 @@ struct ZooKeeperSetResponse final : SetResponse, ZooKeeperResponse void readImpl(ReadBuffer & in) override; void 
writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::Set; } + + size_t bytesSize() const override { return SetResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperListRequest : ListRequest, ZooKeeperRequest @@ -263,6 +292,8 @@ struct ZooKeeperListRequest : ListRequest, ZooKeeperRequest void readImpl(ReadBuffer & in) override; ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return !has_watch; } + + size_t bytesSize() const override { return ListRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); } }; struct ZooKeeperSimpleListRequest final : ZooKeeperListRequest @@ -275,6 +306,8 @@ struct ZooKeeperListResponse : ListResponse, ZooKeeperResponse void readImpl(ReadBuffer & in) override; void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::List; } + + size_t bytesSize() const override { return ListResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperSimpleListResponse final : ZooKeeperListResponse @@ -293,6 +326,8 @@ struct ZooKeeperCheckRequest final : CheckRequest, ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override { return !has_watch; } + + size_t bytesSize() const override { return CheckRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); } }; struct ZooKeeperCheckResponse final : CheckResponse, ZooKeeperResponse @@ -300,6 +335,8 @@ struct ZooKeeperCheckResponse final : CheckResponse, ZooKeeperResponse void readImpl(ReadBuffer &) override {} void writeImpl(WriteBuffer &) const override {} OpNum getOpNum() const override { return OpNum::Check; } + + size_t bytesSize() const override { return CheckResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; /// This response may be received only as an element of responses in MultiResponse. @@ -309,6 +346,8 @@ struct ZooKeeperErrorResponse final : ErrorResponse, ZooKeeperResponse void writeImpl(WriteBuffer & out) const override; OpNum getOpNum() const override { return OpNum::Error; } + + size_t bytesSize() const override { return ErrorResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } }; struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest @@ -323,6 +362,8 @@ struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest ZooKeeperResponsePtr makeResponse() const override; bool isReadRequest() const override; + + size_t bytesSize() const override { return MultiRequest::bytesSize() + sizeof(xid) + sizeof(has_watch); } }; struct ZooKeeperMultiResponse final : MultiResponse, ZooKeeperResponse @@ -346,6 +387,41 @@ struct ZooKeeperMultiResponse final : MultiResponse, ZooKeeperResponse void writeImpl(WriteBuffer & out) const override; + size_t bytesSize() const override { return MultiResponse::bytesSize() + sizeof(xid) + sizeof(zxid); } +}; + +/// Fake internal coordination (keeper) response. Never received from client +/// and never send to client. 
+struct ZooKeeperSessionIDRequest final : ZooKeeperRequest +{ + int64_t internal_id; + int64_t session_timeout_ms; + /// Who requested this session + int32_t server_id; + + Coordination::OpNum getOpNum() const override { return OpNum::SessionID; } + String getPath() const override { return {}; } + void writeImpl(WriteBuffer & out) const override; + void readImpl(ReadBuffer & in) override; + + Coordination::ZooKeeperResponsePtr makeResponse() const override; + bool isReadRequest() const override { return false; } +}; + +/// Fake internal coordination (keeper) response. Never received from client +/// and never send to client. +struct ZooKeeperSessionIDResponse final : ZooKeeperResponse +{ + int64_t internal_id; + int64_t session_id; + /// Who requested this session + int32_t server_id; + + void readImpl(ReadBuffer & in) override; + + void writeImpl(WriteBuffer & out) const override; + + Coordination::OpNum getOpNum() const override { return OpNum::SessionID; } }; class ZooKeeperRequestFactory final : private boost::noncopyable diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.cpp b/src/Common/ZooKeeper/ZooKeeperConstants.cpp index 295094b336b..d2dde4c4cdd 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.cpp +++ b/src/Common/ZooKeeper/ZooKeeperConstants.cpp @@ -21,6 +21,7 @@ static const std::unordered_set VALID_OPERATIONS = static_cast(OpNum::Check), static_cast(OpNum::Multi), static_cast(OpNum::Auth), + static_cast(OpNum::SessionID), }; std::string toString(OpNum op_num) @@ -55,6 +56,8 @@ std::string toString(OpNum op_num) return "Heartbeat"; case OpNum::Auth: return "Auth"; + case OpNum::SessionID: + return "SessionID"; } int32_t raw_op = static_cast(op_num); throw Exception("Operation " + std::to_string(raw_op) + " is unknown", Error::ZUNIMPLEMENTED); diff --git a/src/Common/ZooKeeper/ZooKeeperConstants.h b/src/Common/ZooKeeper/ZooKeeperConstants.h index 81ca6c6a460..f91204693a0 100644 --- a/src/Common/ZooKeeper/ZooKeeperConstants.h +++ b/src/Common/ZooKeeper/ZooKeeperConstants.h @@ -30,6 +30,7 @@ enum class OpNum : int32_t Check = 13, Multi = 14, Auth = 100, + SessionID = 997, /// Special internal request }; std::string toString(OpNum op_num); diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.cpp b/src/Common/ZooKeeper/ZooKeeperImpl.cpp index 93ecaef8365..a717052a1ba 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.cpp +++ b/src/Common/ZooKeeper/ZooKeeperImpl.cpp @@ -1012,6 +1012,16 @@ void ZooKeeper::pushRequest(RequestInfo && info) ProfileEvents::increment(ProfileEvents::ZooKeeperTransactions); } +void ZooKeeper::executeGenericRequest( + const ZooKeeperRequestPtr & request, + ResponseCallback callback) +{ + RequestInfo request_info; + request_info.request = request; + request_info.callback = callback; + + pushRequest(std::move(request_info)); +} void ZooKeeper::create( const String & path, diff --git a/src/Common/ZooKeeper/ZooKeeperImpl.h b/src/Common/ZooKeeper/ZooKeeperImpl.h index afd2e89538f..2210fd98b18 100644 --- a/src/Common/ZooKeeper/ZooKeeperImpl.h +++ b/src/Common/ZooKeeper/ZooKeeperImpl.h @@ -121,6 +121,9 @@ public: /// Useful to check owner of ephemeral node. int64_t getSessionID() const override { return session_id; } + void executeGenericRequest( + const ZooKeeperRequestPtr & request, + ResponseCallback callback); /// See the documentation about semantics of these methods in IKeeper class. 
diff --git a/src/Common/parseRemoteDescription.cpp b/src/Common/parseRemoteDescription.cpp index 6bb1186bfea..477f5e0f250 100644 --- a/src/Common/parseRemoteDescription.cpp +++ b/src/Common/parseRemoteDescription.cpp @@ -181,7 +181,7 @@ std::vector> parseRemoteDescriptionForExternalDataba size_t colon = address.find(':'); if (colon == String::npos) { - LOG_WARNING(&Poco::Logger::get("ParseRemoteDescription"), "Port is not found for host: {}. Using default port {}", default_port); + LOG_WARNING(&Poco::Logger::get("ParseRemoteDescription"), "Port is not found for host: {}. Using default port {}", address, default_port); result.emplace_back(std::make_pair(address, default_port)); } else diff --git a/src/Common/tests/compact_array.cpp b/src/Common/tests/compact_array.cpp index a63859ac712..af6257e1963 100644 --- a/src/Common/tests/compact_array.cpp +++ b/src/Common/tests/compact_array.cpp @@ -1,5 +1,5 @@ /// Bug in GCC: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=59124 -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" #endif @@ -263,6 +263,6 @@ int main() return 0; } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/Common/tests/parallel_aggregation.cpp b/src/Common/tests/parallel_aggregation.cpp index e39be163619..045a385671b 100644 --- a/src/Common/tests/parallel_aggregation.cpp +++ b/src/Common/tests/parallel_aggregation.cpp @@ -69,7 +69,7 @@ static void aggregate1(Map & map, Source::const_iterator begin, Source::const_it ++map[*it]; } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -122,7 +122,7 @@ static void aggregate22(MapTwoLevel & map, Source::const_iterator begin, Source: } } -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/Common/tests/parallel_aggregation2.cpp b/src/Common/tests/parallel_aggregation2.cpp index 1f8e598b122..e2ad36232de 100644 --- a/src/Common/tests/parallel_aggregation2.cpp +++ b/src/Common/tests/parallel_aggregation2.cpp @@ -62,7 +62,7 @@ struct AggregateIndependent } }; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -115,7 +115,7 @@ struct AggregateIndependentWithSequentialKeysOptimization } }; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif @@ -265,7 +265,7 @@ struct Creator void operator()(Value &) const {} }; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -275,7 +275,7 @@ struct Updater void operator()(Value & x) const { ++x; } }; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 7fc24de1cb9..ba1664b23da 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -80,7 +80,7 @@ public: {} - off_t appendRecord(ChangelogRecord && record, bool sync) + off_t appendRecord(ChangelogRecord && record) { off_t result = plain_buf.count(); writeIntBinary(computeRecordChecksum(record), plain_buf); @@ -96,23 +96,21 @@ public: entries_written++; - if (sync) - plain_buf.sync(); - else - plain_buf.next(); return result; } void truncateToLength(off_t new_length) { - flush(); + plain_buf.next(); plain_buf.truncate(new_length); plain_buf.seek(new_length, SEEK_SET); } - void flush() + void flush(bool force_fsync) { - 
plain_buf.sync(); + plain_buf.next(); + if (force_fsync) + plain_buf.sync(); } uint64_t getEntriesWritten() const @@ -247,9 +245,14 @@ private: ReadBufferFromFile read_buf; }; -Changelog::Changelog(const std::string & changelogs_dir_, uint64_t rotate_interval_, Poco::Logger * log_) +Changelog::Changelog( + const std::string & changelogs_dir_, + uint64_t rotate_interval_, + bool force_sync_, + Poco::Logger * log_) : changelogs_dir(changelogs_dir_) , rotate_interval(rotate_interval_) + , force_sync(force_sync_) , log(log_) { namespace fs = std::filesystem; @@ -357,6 +360,9 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin void Changelog::rotate(uint64_t new_start_log_index) { + /// Flush previous log + flush(); + ChangelogFileDescription new_description; new_description.prefix = DEFAULT_PREFIX; new_description.from_log_index = new_start_log_index; @@ -387,7 +393,7 @@ ChangelogRecord Changelog::buildRecord(uint64_t index, const LogEntryPtr & log_e return record; } -void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry, bool force_sync) +void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry) { if (!current_writer) throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records"); @@ -398,14 +404,14 @@ void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry, bool if (current_writer->getEntriesWritten() == rotate_interval) rotate(index); - auto offset = current_writer->appendRecord(buildRecord(index, log_entry), force_sync); + auto offset = current_writer->appendRecord(buildRecord(index, log_entry)); if (!index_to_start_pos.try_emplace(index, offset).second) throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index); logs[index] = makeClone(log_entry); } -void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry, bool force_sync) +void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry) { if (index_to_start_pos.count(index) == 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't contain it", index); @@ -451,7 +457,7 @@ void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry, bool forc current_writer->setEntriesWritten(entries_written); - appendEntry(index, log_entry, force_sync); + appendEntry(index, log_entry); } void Changelog::compact(uint64_t up_to_log_index) @@ -540,7 +546,7 @@ nuraft::ptr Changelog::serializeEntriesToBuffer(uint64_t index, return buf_out; } -void Changelog::applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer, bool force_sync) +void Changelog::applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer) { buffer.pos(0); int num_logs = buffer.get_int(); @@ -555,23 +561,23 @@ void Changelog::applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer, LogEntryPtr log_entry = nuraft::log_entry::deserialize(*buf_local); if (i == 0 && logs.count(cur_index)) - writeAt(cur_index, log_entry, force_sync); + writeAt(cur_index, log_entry); else - appendEntry(cur_index, log_entry, force_sync); + appendEntry(cur_index, log_entry); } } void Changelog::flush() { - current_writer->flush(); + if (current_writer) + current_writer->flush(force_sync); } Changelog::~Changelog() { try { - if (current_writer) - current_writer->flush(); + flush(); } catch (...) 
{ diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 21d96cd1438..d669f56aded 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -63,17 +63,17 @@ class Changelog { public: - Changelog(const std::string & changelogs_dir_, uint64_t rotate_interval_, Poco::Logger * log_); + Changelog(const std::string & changelogs_dir_, uint64_t rotate_interval_, bool force_sync_, Poco::Logger * log_); /// Read changelog from files on changelogs_dir_ skipping all entries before from_log_index /// Truncate broken entries, remove files after broken entries. void readChangelogAndInitWriter(uint64_t last_commited_log_index, uint64_t logs_to_keep); - /// Add entry to log with index. Call fsync if force_sync true. - void appendEntry(uint64_t index, const LogEntryPtr & log_entry, bool force_sync); + /// Add entry to log with index. + void appendEntry(uint64_t index, const LogEntryPtr & log_entry); /// Write entry at index and truncate all subsequent entries. - void writeAt(uint64_t index, const LogEntryPtr & log_entry, bool force_sync); + void writeAt(uint64_t index, const LogEntryPtr & log_entry); /// Remove log files with to_log_index <= up_to_log_index. void compact(uint64_t up_to_log_index); @@ -101,9 +101,9 @@ public: BufferPtr serializeEntriesToBuffer(uint64_t index, int32_t count); /// Apply entries from buffer overriding existing entries - void applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer, bool force_sync); + void applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer); - /// Fsync log to disk + /// Fsync latest log to disk and flush buffer void flush(); uint64_t size() const @@ -124,6 +124,7 @@ private: private: const std::string changelogs_dir; const uint64_t rotate_interval; + const bool force_sync; Poco::Logger * log; std::map existing_changelogs; diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index 45eb1348ac6..7a98e3f200d 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -22,18 +22,19 @@ struct Settings; M(Milliseconds, heart_beat_interval_ms, 500, "Heartbeat interval between quorum nodes", 0) \ M(Milliseconds, election_timeout_lower_bound_ms, 1000, "Lower bound of election timer (avoid too often leader elections)", 0) \ M(Milliseconds, election_timeout_upper_bound_ms, 2000, "Lower bound of election timer (avoid too often leader elections)", 0) \ - M(UInt64, reserved_log_items, 10000, "How many log items to store (don't remove during compaction)", 0) \ - M(UInt64, snapshot_distance, 10000, "How many log items we have to collect to write new snapshot", 0) \ + M(UInt64, reserved_log_items, 100000, "How many log items to store (don't remove during compaction)", 0) \ + M(UInt64, snapshot_distance, 100000, "How many log items we have to collect to write new snapshot", 0) \ M(Bool, auto_forwarding, true, "Allow to forward write requests from followers to leader", 0) \ M(Milliseconds, shutdown_timeout, 5000, "How many time we will until RAFT shutdown", 0) \ M(Milliseconds, startup_timeout, 30000, "How many time we will until RAFT to start", 0) \ M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. 
Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \ - M(UInt64, rotate_log_storage_interval, 10000, "How many records will be stored in one log storage file", 0) \ + M(UInt64, rotate_log_storage_interval, 100000, "How many records will be stored in one log storage file", 0) \ M(UInt64, snapshots_to_keep, 3, "How many compressed snapshots to keep on disk", 0) \ M(UInt64, stale_log_gap, 10000, "When node became stale and should receive snapshots from leader", 0) \ M(UInt64, fresh_log_gap, 200, "When node became fresh", 0) \ + M(UInt64, max_requests_batch_size, 100, "Max size of batch in requests count before it will be sent to RAFT", 0) \ M(Bool, quorum_reads, false, "Execute read requests as writes through whole RAFT consesus with similar speed", 0) \ - M(Bool, force_sync, true, " Call fsync on each change in RAFT changelog", 0) + M(Bool, force_sync, true, "Call fsync on each change in RAFT changelog", 0) DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) diff --git a/src/Coordination/KeeperLogStore.cpp b/src/Coordination/KeeperLogStore.cpp index f78d57ee1ce..3896cf9b6fd 100644 --- a/src/Coordination/KeeperLogStore.cpp +++ b/src/Coordination/KeeperLogStore.cpp @@ -5,9 +5,12 @@ namespace DB KeeperLogStore::KeeperLogStore(const std::string & changelogs_path, uint64_t rotate_interval_, bool force_sync_) : log(&Poco::Logger::get("KeeperLogStore")) - , changelog(changelogs_path, rotate_interval_, log) - , force_sync(force_sync_) + , changelog(changelogs_path, rotate_interval_, force_sync_, log) { + if (force_sync_) + LOG_INFO(log, "force_sync enabled"); + else + LOG_INFO(log, "force_sync disabled"); } uint64_t KeeperLogStore::start_index() const @@ -38,7 +41,7 @@ uint64_t KeeperLogStore::append(nuraft::ptr & entry) { std::lock_guard lock(changelog_lock); uint64_t idx = changelog.getNextEntryIndex(); - changelog.appendEntry(idx, entry, force_sync); + changelog.appendEntry(idx, entry); return idx; } @@ -46,7 +49,7 @@ uint64_t KeeperLogStore::append(nuraft::ptr & entry) void KeeperLogStore::write_at(uint64_t index, nuraft::ptr & entry) { std::lock_guard lock(changelog_lock); - changelog.writeAt(index, entry, force_sync); + changelog.writeAt(index, entry); } nuraft::ptr>> KeeperLogStore::log_entries(uint64_t start, uint64_t end) @@ -93,7 +96,7 @@ bool KeeperLogStore::flush() void KeeperLogStore::apply_pack(uint64_t index, nuraft::buffer & pack) { std::lock_guard lock(changelog_lock); - changelog.applyEntriesFromBuffer(index, pack, force_sync); + changelog.applyEntriesFromBuffer(index, pack); } uint64_t KeeperLogStore::size() const @@ -102,4 +105,10 @@ uint64_t KeeperLogStore::size() const return changelog.size(); } +void KeeperLogStore::end_of_append_batch(uint64_t /*start_index*/, uint64_t /*count*/) +{ + std::lock_guard lock(changelog_lock); + changelog.flush(); +} + } diff --git a/src/Coordination/KeeperLogStore.h b/src/Coordination/KeeperLogStore.h index bc2ae719f0e..01315e6e879 100644 --- a/src/Coordination/KeeperLogStore.h +++ b/src/Coordination/KeeperLogStore.h @@ -42,11 +42,12 @@ public: uint64_t size() const; + void end_of_append_batch(uint64_t start_index, uint64_t count) override; + private: mutable std::mutex changelog_lock; Poco::Logger * log; Changelog changelog; - bool force_sync; }; } diff --git a/src/Coordination/KeeperServer.cpp b/src/Coordination/KeeperServer.cpp index 7827a25afdd..6557ad5504d 100644 --- a/src/Coordination/KeeperServer.cpp +++ b/src/Coordination/KeeperServer.cpp @@ -24,6 +24,7 @@ namespace 
ErrorCodes extern const int RAFT_ERROR; extern const int NO_ELEMENTS_IN_CONFIG; extern const int SUPPORT_IS_DISABLED; + extern const int LOGICAL_ERROR; } namespace @@ -73,7 +74,6 @@ KeeperServer::KeeperServer( config.getString("keeper_server.snapshot_storage_path", config.getString("path", DBMS_DEFAULT_PATH) + "coordination/snapshots"), coordination_settings)) , state_manager(nuraft::cs_new(server_id, "keeper_server", config, coordination_settings)) - , responses_queue(responses_queue_) , log(&Poco::Logger::get("KeeperServer")) { if (coordination_settings->quorum_reads) @@ -111,7 +111,7 @@ void KeeperServer::startup() params.auto_forwarding_ = coordination_settings->auto_forwarding; params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2; - params.return_method_ = nuraft::raft_params::blocking; + params.return_method_ = nuraft::raft_params::async_handler; nuraft::asio_service::options asio_opts{}; if (state_manager->isSecure()) @@ -222,75 +222,26 @@ nuraft::ptr getZooKeeperLogEntry(int64_t session_id, const Coord } -void KeeperServer::putRequest(const KeeperStorage::RequestForSession & request_for_session) + +void KeeperServer::putLocalReadRequest(const KeeperStorage::RequestForSession & request_for_session) { - auto [session_id, request] = request_for_session; - if (!coordination_settings->quorum_reads && isLeaderAlive() && request->isReadRequest()) - { - state_machine->processReadRequest(request_for_session); - } - else - { - std::vector> entries; - entries.push_back(getZooKeeperLogEntry(session_id, request)); + if (!request_for_session.request->isReadRequest()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot process non-read request locally"); - std::lock_guard lock(append_entries_mutex); - - auto result = raft_instance->append_entries(entries); - if (!result->get_accepted()) - { - KeeperStorage::ResponsesForSessions responses; - auto response = request->makeResponse(); - response->xid = request->xid; - response->zxid = 0; - response->error = Coordination::Error::ZOPERATIONTIMEOUT; - responses_queue.push(DB::KeeperStorage::ResponseForSession{session_id, response}); - } - - if (result->get_result_code() == nuraft::cmd_result_code::TIMEOUT) - { - KeeperStorage::ResponsesForSessions responses; - auto response = request->makeResponse(); - response->xid = request->xid; - response->zxid = 0; - response->error = Coordination::Error::ZOPERATIONTIMEOUT; - responses_queue.push(DB::KeeperStorage::ResponseForSession{session_id, response}); - } - else if (result->get_result_code() != nuraft::cmd_result_code::OK) - throw Exception(ErrorCodes::RAFT_ERROR, "Requests result failed with code {} and message: '{}'", result->get_result_code(), result->get_result_str()); - } + state_machine->processReadRequest(request_for_session); } -int64_t KeeperServer::getSessionID(int64_t session_timeout_ms) +RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForSessions & requests_for_sessions) { - /// Just some sanity check. We don't want to make a lot of clients wait with lock. 
- if (active_session_id_requests > 10) - throw Exception(ErrorCodes::RAFT_ERROR, "Too many concurrent SessionID requests already in flight"); - ++active_session_id_requests; - SCOPE_EXIT({ --active_session_id_requests; }); + std::vector> entries; + for (const auto & [session_id, request] : requests_for_sessions) + entries.push_back(getZooKeeperLogEntry(session_id, request)); - auto entry = nuraft::buffer::alloc(sizeof(int64_t)); - /// Just special session request - nuraft::buffer_serializer bs(entry); - bs.put_i64(session_timeout_ms); - - std::lock_guard lock(append_entries_mutex); - - auto result = raft_instance->append_entries({entry}); - - if (!result->get_accepted()) - throw Exception(ErrorCodes::RAFT_ERROR, "Cannot send session_id request to RAFT"); - - if (result->get_result_code() != nuraft::cmd_result_code::OK) - throw Exception(ErrorCodes::RAFT_ERROR, "session_id request failed to RAFT"); - - auto resp = result->get(); - if (resp == nullptr) - throw Exception(ErrorCodes::RAFT_ERROR, "Received nullptr as session_id"); - - nuraft::buffer_serializer bs_resp(resp); - return bs_resp.get_i64(); + { + std::lock_guard lock(append_entries_mutex); + return raft_instance->append_entries(entries); + } } bool KeeperServer::isLeader() const diff --git a/src/Coordination/KeeperServer.h b/src/Coordination/KeeperServer.h index 5af948305ef..11900ebb213 100644 --- a/src/Coordination/KeeperServer.h +++ b/src/Coordination/KeeperServer.h @@ -12,10 +12,12 @@ namespace DB { +using RaftAppendResult = nuraft::ptr>>; + class KeeperServer { private: - int server_id; + const int server_id; CoordinationSettingsPtr coordination_settings; @@ -29,13 +31,10 @@ private: std::mutex append_entries_mutex; - ResponsesQueue & responses_queue; - std::mutex initialized_mutex; std::atomic initialized_flag = false; std::condition_variable initialized_cv; std::atomic initial_batch_committed = false; - std::atomic active_session_id_requests = 0; Poco::Logger * log; @@ -60,9 +59,9 @@ public: void startup(); - void putRequest(const KeeperStorage::RequestForSession & request); + void putLocalReadRequest(const KeeperStorage::RequestForSession & request); - int64_t getSessionID(int64_t session_timeout_ms); + RaftAppendResult putRequestBatch(const KeeperStorage::RequestsForSessions & requests); std::unordered_set getDeadSessions(); @@ -73,6 +72,8 @@ public: void waitInit(); void shutdown(); + + int getServerID() const { return server_id; } }; } diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index c909cfc68e2..df68b8df266 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -90,25 +90,29 @@ void KeeperStateMachine::init() nuraft::ptr KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data) { - if (data.size() == sizeof(int64_t)) + auto request_for_session = parseRequest(data); + if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID) { - nuraft::buffer_serializer timeout_data(data); - int64_t session_timeout_ms = timeout_data.get_i64(); - auto response = nuraft::buffer::alloc(sizeof(int64_t)); + const Coordination::ZooKeeperSessionIDRequest & session_id_request = dynamic_cast(*request_for_session.request); int64_t session_id; - nuraft::buffer_serializer bs(response); { std::lock_guard lock(storage_lock); - session_id = storage->getSessionID(session_timeout_ms); - bs.put_i64(session_id); + session_id = storage->getSessionID(session_id_request.session_timeout_ms); } - LOG_DEBUG(log, "Session ID 
response {} with timeout {}", session_id, session_timeout_ms); - last_committed_idx = log_idx; - return response; + LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_id_request.session_timeout_ms); + + std::shared_ptr response = std::make_shared(); + response->internal_id = session_id_request.internal_id; + response->session_id = session_id; + response->server_id = session_id_request.server_id; + + KeeperStorage::ResponseForSession response_for_session; + response_for_session.session_id = -1; + response_for_session.response = response; + responses_queue.push(response_for_session); } else { - auto request_for_session = parseRequest(data); KeeperStorage::ResponsesForSessions responses_for_sessions; { std::lock_guard lock(storage_lock); @@ -116,10 +120,10 @@ nuraft::ptr KeeperStateMachine::commit(const uint64_t log_idx, n for (auto & response_for_session : responses_for_sessions) responses_queue.push(response_for_session); } - - last_committed_idx = log_idx; - return nullptr; } + + last_committed_idx = log_idx; + return nullptr; } bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s) diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index 3e9b64cd0a4..197cc4323cf 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -405,8 +405,6 @@ struct KeeperStorageListRequest final : public KeeperStorageRequest response.names.insert(response.names.end(), it->value.children.begin(), it->value.children.end()); - std::sort(response.names.begin(), response.names.end()); - response.stat = it->value.stat; response.error = Coordination::Error::ZOK; } diff --git a/src/Coordination/KeeperStorageDispatcher.cpp b/src/Coordination/KeeperStorageDispatcher.cpp index 7f9f9170dc2..fc79f5bee97 100644 --- a/src/Coordination/KeeperStorageDispatcher.cpp +++ b/src/Coordination/KeeperStorageDispatcher.cpp @@ -1,5 +1,9 @@ #include #include +#include +#include +#include +#include namespace DB { @@ -17,29 +21,116 @@ KeeperStorageDispatcher::KeeperStorageDispatcher() { } + void KeeperStorageDispatcher::requestThread() { setThreadName("KeeperReqT"); + + /// Result of the requests batch from the previous iteration + RaftAppendResult prev_result = nullptr; + /// Requests from the previous iteration. We store them to be able + /// to send errors to the client. + KeeperStorage::RequestsForSessions prev_batch; + while (!shutdown_called) { KeeperStorage::RequestForSession request; UInt64 max_wait = UInt64(coordination_settings->operation_timeout_ms.totalMilliseconds()); + uint64_t max_batch_size = coordination_settings->max_requests_batch_size; - if (requests_queue.tryPop(request, max_wait)) + /// The code below does a very simple thing: it batches all write (quorum) requests into a vector until + /// the previous write batch is finished or the max batch size is reached. The main complexity comes from + /// the ability to process read requests without a quorum (from local state). So when we are collecting + /// requests into a batch we must check that the new request is not a read request. Otherwise we have to + /// process all the write requests accumulated so far, wait for them synchronously and only after that process + /// the read request. So reads are some kind of "separator" for writes. + try { - if (shutdown_called) - break; + if (requests_queue->tryPop(request, max_wait)) + { + if (shutdown_called) + break; - try - { - server->putRequest(request); - } - catch (...)
- { - tryLogCurrentException(__PRETTY_FUNCTION__); + KeeperStorage::RequestsForSessions current_batch; + + bool has_read_request = false; + + /// If the new request is not a read request, or we must process it through the quorum, it is added to the batch. + /// Otherwise we will process it locally. + if (coordination_settings->quorum_reads || !request.request->isReadRequest()) + { + current_batch.emplace_back(request); + + /// Wait until the previous append succeeds, or until the batch is big enough. + /// has_result == false && get_result_code == OK means that our request is still not processed. + /// Sometimes NuRaft sets the error code without setting the result, so we check both here. + while (prev_result && (!prev_result->has_result() && prev_result->get_result_code() == nuraft::cmd_result_code::OK) && current_batch.size() <= max_batch_size) + { + /// Trying to collect batch requests as fast as possible + if (requests_queue->tryPop(request, 1)) + { + /// Don't append read requests to the batch, we have to process them separately + if (!coordination_settings->quorum_reads && request.request->isReadRequest()) + { + has_read_request = true; + break; + } + else + { + + current_batch.emplace_back(request); + } + } + + if (shutdown_called) + break; + } + } + else + has_read_request = true; + + if (shutdown_called) + break; + + /// Forcefully process all previous pending requests + if (prev_result) + forceWaitAndProcessResult(prev_result, prev_batch); + + /// Process the collected batch of write requests + if (!current_batch.empty()) + { + auto result = server->putRequestBatch(current_batch); + + if (result) + { + if (has_read_request) /// If we execute a read request next, then we have to process the result now + forceWaitAndProcessResult(result, current_batch); + } + else + { + addErrorResponses(current_batch, Coordination::Error::ZRUNTIMEINCONSISTENCY); + current_batch.clear(); + } + + prev_batch = current_batch; + prev_result = result; + } + + /// The read request always goes after the write batch (as the last request) + if (has_read_request) + { + if (server->isLeaderAlive()) + server->putLocalReadRequest(request); + else + addErrorResponses({request}, Coordination::Error::ZRUNTIMEINCONSISTENCY); + } } } + catch (...)
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } } } @@ -94,14 +185,32 @@ void KeeperStorageDispatcher::snapshotThread() void KeeperStorageDispatcher::setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response) { std::lock_guard lock(session_to_response_callback_mutex); - auto session_writer = session_to_response_callback.find(session_id); - if (session_writer == session_to_response_callback.end()) - return; + if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::SessionID) + { + const Coordination::ZooKeeperSessionIDResponse & session_id_resp = dynamic_cast(*response); - session_writer->second(response); - /// Session closed, no more writes - if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::Close) - session_to_response_callback.erase(session_writer); + /// Nobody waits for this session id + if (session_id_resp.server_id != server->getServerID() || !new_session_id_response_callback.count(session_id_resp.internal_id)) + return; + + auto callback = new_session_id_response_callback[session_id_resp.internal_id]; + callback(response); + new_session_id_response_callback.erase(session_id_resp.internal_id); + } + else + { + auto session_writer = session_to_response_callback.find(session_id); + if (session_writer == session_to_response_callback.end()) + return; + + session_writer->second(response); + + /// Session closed, no more writes + if (response->xid != Coordination::WATCH_XID && response->getOpNum() == Coordination::OpNum::Close) + { + session_to_response_callback.erase(session_writer); + } + } } bool KeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & request, int64_t session_id) @@ -119,8 +228,8 @@ bool KeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr std::lock_guard lock(push_request_mutex); /// Put close requests without timeouts if (request->getOpNum() == Coordination::OpNum::Close) - requests_queue.push(std::move(request_info)); - else if (!requests_queue.tryPush(std::move(request_info), coordination_settings->operation_timeout_ms.totalMilliseconds())) + requests_queue->push(std::move(request_info)); + else if (!requests_queue->tryPush(std::move(request_info), coordination_settings->operation_timeout_ms.totalMilliseconds())) throw Exception("Cannot push request to queue within operation timeout", ErrorCodes::TIMEOUT_EXCEEDED); return true; } @@ -131,6 +240,7 @@ void KeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfiguration int myid = config.getInt("keeper_server.server_id"); coordination_settings->loadFromConfig("keeper_server.coordination_settings", config); + requests_queue = std::make_unique(coordination_settings->max_requests_batch_size); request_thread = ThreadFromGlobalPool([this] { requestThread(); }); responses_thread = ThreadFromGlobalPool([this] { responseThread(); }); @@ -175,7 +285,7 @@ void KeeperStorageDispatcher::shutdown() session_cleaner_thread.join(); /// FIXME not the best way to notify - requests_queue.push({}); + requests_queue->push({}); if (request_thread.joinable()) request_thread.join(); @@ -192,7 +302,7 @@ void KeeperStorageDispatcher::shutdown() server->shutdown(); KeeperStorage::RequestForSession request_for_session; - while (requests_queue.tryPop(request_for_session)) + while (requests_queue->tryPop(request_for_session)) { if (request_for_session.request) { @@ -249,7 +359,7 @@ void KeeperStorageDispatcher::sessionCleanerTask() request_info.session_id = dead_session; { 
std::lock_guard lock(push_request_mutex); - requests_queue.push(std::move(request_info)); + requests_queue->push(std::move(request_info)); } finishSession(dead_session); LOG_INFO(log, "Dead session close request pushed"); @@ -273,4 +383,79 @@ void KeeperStorageDispatcher::finishSession(int64_t session_id) session_to_response_callback.erase(session_it); } +void KeeperStorageDispatcher::addErrorResponses(const KeeperStorage::RequestsForSessions & requests_for_sessions, Coordination::Error error) +{ + for (const auto & [session_id, request] : requests_for_sessions) + { + KeeperStorage::ResponsesForSessions responses; + auto response = request->makeResponse(); + response->xid = request->xid; + response->zxid = 0; + response->error = error; + responses_queue.push(DB::KeeperStorage::ResponseForSession{session_id, response}); + } +} + +void KeeperStorageDispatcher::forceWaitAndProcessResult(RaftAppendResult & result, KeeperStorage::RequestsForSessions & requests_for_sessions) +{ + if (!result->has_result()) + result->get(); + + /// If we get some errors, than send them to clients + if (!result->get_accepted() || result->get_result_code() == nuraft::cmd_result_code::TIMEOUT) + addErrorResponses(requests_for_sessions, Coordination::Error::ZOPERATIONTIMEOUT); + else if (result->get_result_code() != nuraft::cmd_result_code::OK) + addErrorResponses(requests_for_sessions, Coordination::Error::ZRUNTIMEINCONSISTENCY); + + result = nullptr; + requests_for_sessions.clear(); +} + +int64_t KeeperStorageDispatcher::getSessionID(int64_t session_timeout_ms) +{ + KeeperStorage::RequestForSession request_info; + std::shared_ptr request = std::make_shared(); + request->internal_id = internal_session_id_counter.fetch_add(1); + request->session_timeout_ms = session_timeout_ms; + request->server_id = server->getServerID(); + + request_info.request = request; + request_info.session_id = -1; + + auto promise = std::make_shared>(); + auto future = promise->get_future(); + { + std::lock_guard lock(session_to_response_callback_mutex); + new_session_id_response_callback[request->internal_id] = [promise, internal_id = request->internal_id] (const Coordination::ZooKeeperResponsePtr & response) + { + if (response->getOpNum() != Coordination::OpNum::SessionID) + promise->set_exception(std::make_exception_ptr(Exception(ErrorCodes::LOGICAL_ERROR, + "Incorrect response of type {} instead of SessionID response", Coordination::toString(response->getOpNum())))); + + auto session_id_response = dynamic_cast(*response); + if (session_id_response.internal_id != internal_id) + { + promise->set_exception(std::make_exception_ptr(Exception(ErrorCodes::LOGICAL_ERROR, + "Incorrect response with internal id {} instead of {}", session_id_response.internal_id, internal_id))); + } + + if (response->error != Coordination::Error::ZOK) + promise->set_exception(std::make_exception_ptr(zkutil::KeeperException("SessionID request failed with error", response->error))); + + promise->set_value(session_id_response.session_id); + }; + } + + { + std::lock_guard lock(push_request_mutex); + if (!requests_queue->tryPush(std::move(request_info), session_timeout_ms)) + throw Exception("Cannot push session id request to queue within session timeout", ErrorCodes::TIMEOUT_EXCEEDED); + } + + if (future.wait_for(std::chrono::milliseconds(session_timeout_ms)) != std::future_status::ready) + throw Exception("Cannot receive session id within session timeout", ErrorCodes::TIMEOUT_EXCEEDED); + + return future.get(); +} + } diff --git 
a/src/Coordination/KeeperStorageDispatcher.h b/src/Coordination/KeeperStorageDispatcher.h index 622b63be800..e4cfa620e6c 100644 --- a/src/Coordination/KeeperStorageDispatcher.h +++ b/src/Coordination/KeeperStorageDispatcher.h @@ -32,24 +32,42 @@ private: using RequestsQueue = ConcurrentBoundedQueue; using SessionToResponseCallback = std::unordered_map; - RequestsQueue requests_queue{1}; + /// Size depends on coordination settings + std::unique_ptr requests_queue; ResponsesQueue responses_queue; SnapshotsQueue snapshots_queue{1}; std::atomic shutdown_called{false}; std::mutex session_to_response_callback_mutex; + /// These two maps look similar, but serve different purposes. + /// The first map is a subscription map for normal responses like + /// (get, set, list, etc.). The dispatcher determines the callback for each response + /// using the session id from this map. SessionToResponseCallback session_to_response_callback; + /// But when a client connects to the server for the first time it doesn't + /// have a session_id yet. It requests one from the server. We give a temporary + /// internal id to such requests just to match the client with its response. + SessionToResponseCallback new_session_id_response_callback; + /// Reading and batching new requests from client handlers ThreadFromGlobalPool request_thread; + /// Pushing responses to clients through client handlers + /// using session_id. ThreadFromGlobalPool responses_thread; + /// Cleaning old dead sessions ThreadFromGlobalPool session_cleaner_thread; + /// Dumping new snapshots to disk ThreadFromGlobalPool snapshot_thread; + /// RAFT wrapper. Most important class. std::unique_ptr server; Poco::Logger * log; + /// Counter for new session_id requests. + std::atomic internal_session_id_counter{0}; + private: void requestThread(); void responseThread(); @@ -57,6 +75,14 @@ private: void snapshotThread(); void setResponse(int64_t session_id, const Coordination::ZooKeeperResponsePtr & response); + /// Add error responses for requests to the responses queue. + /// Clears requests. + void addErrorResponses(const KeeperStorage::RequestsForSessions & requests_for_sessions, Coordination::Error error); + + /// Forcefully wait for the result and set errors if something went wrong.
+ /// Clears both arguments + void forceWaitAndProcessResult(RaftAppendResult & result, KeeperStorage::RequestsForSessions & requests_for_sessions); + public: KeeperStorageDispatcher(); @@ -78,10 +104,7 @@ public: return server->isLeaderAlive(); } - int64_t getSessionID(long session_timeout_ms) - { - return server->getSessionID(session_timeout_ms); - } + int64_t getSessionID(int64_t session_timeout_ms); void registerSession(int64_t session_id, ZooKeeperResponseCallback callback); /// Call if we don't need any responses for this session no more (session was expired) diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 0e06af1a7f3..515565a3b9f 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -211,6 +211,8 @@ TEST(CoordinationTest, ChangelogTestSimple) changelog.init(1, 0); auto entry = getLogEntry("hello world", 77); changelog.append(entry); + changelog.end_of_append_batch(0, 0); + EXPECT_EQ(changelog.next_slot(), 2); EXPECT_EQ(changelog.start_index(), 1); EXPECT_EQ(changelog.last_entry()->get_term(), 77); @@ -225,6 +227,7 @@ TEST(CoordinationTest, ChangelogTestFile) changelog.init(1, 0); auto entry = getLogEntry("hello world", 77); changelog.append(entry); + changelog.end_of_append_batch(0, 0); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); for (const auto & p : fs::directory_iterator("./logs")) EXPECT_EQ(p.path(), "./logs/changelog_1_5.bin"); @@ -234,6 +237,7 @@ TEST(CoordinationTest, ChangelogTestFile) changelog.append(entry); changelog.append(entry); changelog.append(entry); + changelog.end_of_append_batch(0, 0); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -249,6 +253,8 @@ TEST(CoordinationTest, ChangelogReadWrite) auto entry = getLogEntry("hello world", i * 10); changelog.append(entry); } + changelog.end_of_append_batch(0, 0); + EXPECT_EQ(changelog.size(), 10); DB::KeeperLogStore changelog_reader("./logs", 1000, true); changelog_reader.init(1, 0); @@ -276,10 +282,14 @@ TEST(CoordinationTest, ChangelogWriteAt) auto entry = getLogEntry("hello world", i * 10); changelog.append(entry); } + + changelog.end_of_append_batch(0, 0); EXPECT_EQ(changelog.size(), 10); auto entry = getLogEntry("writer", 77); changelog.write_at(7, entry); + changelog.end_of_append_batch(0, 0); + EXPECT_EQ(changelog.size(), 7); EXPECT_EQ(changelog.last_entry()->get_term(), 77); EXPECT_EQ(changelog.entry_at(7)->get_term(), 77); @@ -305,6 +315,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) auto entry = getLogEntry("hello world", i * 10); changelog.append(entry); } + changelog.end_of_append_batch(0, 0); EXPECT_EQ(changelog.size(), 7); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); @@ -319,6 +330,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) auto entry = getLogEntry("hello world", i * 10); changelog_reader.append(entry); } + changelog_reader.end_of_append_batch(0, 0); EXPECT_EQ(changelog_reader.size(), 10); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -331,6 +343,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) auto entry = getLogEntry("someentry", 77); changelog_reader.append(entry); + changelog_reader.end_of_append_batch(0, 0); EXPECT_EQ(changelog_reader.size(), 11); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -354,6 +367,7 @@ TEST(CoordinationTest, 
ChangelogTestCompaction) auto entry = getLogEntry("hello world", i * 10); changelog.append(entry); } + changelog.end_of_append_batch(0, 0); EXPECT_EQ(changelog.size(), 3); @@ -373,6 +387,7 @@ TEST(CoordinationTest, ChangelogTestCompaction) changelog.append(e3); auto e4 = getLogEntry("hello world", 60); changelog.append(e4); + changelog.end_of_append_batch(0, 0); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -405,6 +420,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperations) auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); changelog.append(entry); } + changelog.end_of_append_batch(0, 0); EXPECT_EQ(changelog.size(), 10); @@ -420,6 +436,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperations) EXPECT_EQ(apply_changelog.size(), 10); apply_changelog.apply_pack(8, *entries); + apply_changelog.end_of_append_batch(0, 0); EXPECT_EQ(apply_changelog.size(), 12); EXPECT_EQ(apply_changelog.start_index(), 1); @@ -447,6 +464,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); changelog.append(entry); } + changelog.end_of_append_batch(0, 0); EXPECT_EQ(changelog.size(), 10); @@ -458,6 +476,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) EXPECT_EQ(changelog_new.size(), 0); changelog_new.apply_pack(5, *entries); + changelog_new.end_of_append_batch(0, 0); EXPECT_EQ(changelog_new.size(), 5); EXPECT_EQ(changelog_new.start_index(), 5); @@ -468,6 +487,8 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) auto e = getLogEntry("hello_world", 110); changelog_new.append(e); + changelog_new.end_of_append_batch(0, 0); + EXPECT_EQ(changelog_new.size(), 6); EXPECT_EQ(changelog_new.start_index(), 5); EXPECT_EQ(changelog_new.next_slot(), 11); @@ -488,6 +509,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); changelog.append(entry); } + changelog.end_of_append_batch(0, 0); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -501,6 +523,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) auto e1 = getLogEntry("helloworld", 5555); changelog.write_at(7, e1); + changelog.end_of_append_batch(0, 0); EXPECT_EQ(changelog.size(), 7); EXPECT_EQ(changelog.start_index(), 1); EXPECT_EQ(changelog.next_slot(), 8); @@ -534,6 +557,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); changelog.append(entry); } + changelog.end_of_append_batch(0, 0); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -547,6 +571,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) auto e1 = getLogEntry("helloworld", 5555); changelog.write_at(11, e1); + changelog.end_of_append_batch(0, 0); EXPECT_EQ(changelog.size(), 11); EXPECT_EQ(changelog.start_index(), 1); EXPECT_EQ(changelog.next_slot(), 12); @@ -580,6 +605,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtAllFiles) auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); changelog.append(entry); } + changelog.end_of_append_batch(0, 0); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -593,6 +619,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtAllFiles) auto e1 = getLogEntry("helloworld", 5555); changelog.write_at(1, e1); + changelog.end_of_append_batch(0, 0); 
EXPECT_EQ(changelog.size(), 1); EXPECT_EQ(changelog.start_index(), 1); EXPECT_EQ(changelog.next_slot(), 2); @@ -619,6 +646,7 @@ TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead) auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); changelog.append(entry); } + changelog.end_of_append_batch(0, 0); EXPECT_EQ(changelog.size(), 35); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -635,6 +663,7 @@ TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead) auto entry = getLogEntry("36_hello_world", 360); changelog_reader.append(entry); + changelog_reader.end_of_append_batch(0, 0); EXPECT_EQ(changelog_reader.size(), 36); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); @@ -660,6 +689,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); changelog.append(entry); } + changelog.end_of_append_batch(0, 0); EXPECT_EQ(changelog.size(), 35); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -674,6 +704,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) DB::KeeperLogStore changelog_reader("./logs", 5, true); changelog_reader.init(1, 0); + changelog_reader.end_of_append_batch(0, 0); EXPECT_EQ(changelog_reader.size(), 10); EXPECT_EQ(changelog_reader.last_entry()->get_term(), 90); @@ -689,6 +720,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) auto entry = getLogEntry("h", 7777); changelog_reader.append(entry); + changelog_reader.end_of_append_batch(0, 0); EXPECT_EQ(changelog_reader.size(), 11); EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777); @@ -719,6 +751,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2) auto entry = getLogEntry(std::to_string(i) + "_hello_world", (i + 44) * 10); changelog.append(entry); } + changelog.end_of_append_batch(0, 0); EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_21_40.bin")); @@ -735,6 +768,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2) EXPECT_FALSE(fs::exists("./logs/changelog_21_40.bin")); auto entry = getLogEntry("hello_world", 7777); changelog_reader.append(entry); + changelog_reader.end_of_append_batch(0, 0); EXPECT_EQ(changelog_reader.size(), 3); EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777); @@ -757,6 +791,7 @@ TEST(CoordinationTest, ChangelogTestLostFiles) auto entry = getLogEntry(std::to_string(i) + "_hello_world", (i + 44) * 10); changelog.append(entry); } + changelog.end_of_append_batch(0, 0); EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_21_40.bin")); @@ -1105,6 +1140,7 @@ void testLogAndStateMachine(Coordination::CoordinationSettingsPtr settings, uint request->path = "/hello_" + std::to_string(i); auto entry = getLogEntryFromZKRequest(0, 1, request); changelog.append(entry); + changelog.end_of_append_batch(0, 0); state_machine->commit(i, changelog.entry_at(i)->get_buf()); bool snapshot_created = false; diff --git a/src/Core/Block.cpp b/src/Core/Block.cpp index 0c9a470dc1d..0f470c10b81 100644 --- a/src/Core/Block.cpp +++ b/src/Core/Block.cpp @@ -484,7 +484,7 @@ DataTypes Block::getDataTypes() const template -static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, const std::string & context_description) +static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, const std::string & context_description, bool 
allow_remove_constants) { auto on_error = [](const std::string & message [[maybe_unused]], int code [[maybe_unused]]) { @@ -515,7 +515,16 @@ static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, cons if (!actual.column || !expected.column) continue; - if (actual.column->getName() != expected.column->getName()) + const IColumn * actual_column = actual.column.get(); + + /// If we allow removing constants and the expected column is not const, then unwrap the actual constant column. + if (allow_remove_constants && !isColumnConst(*expected.column)) + { + if (const auto * column_const = typeid_cast(actual_column)) + actual_column = &column_const->getDataColumn(); + } + + if (actual_column->getName() != expected.column->getName()) return on_error("Block structure mismatch in " + context_description + " stream: different columns:\n" + lhs.dumpStructure() + "\n" + rhs.dumpStructure(), ErrorCodes::LOGICAL_ERROR); @@ -537,13 +546,25 @@ static ReturnType checkBlockStructure(const Block & lhs, const Block & rhs, cons bool blocksHaveEqualStructure(const Block & lhs, const Block & rhs) { - return checkBlockStructure(lhs, rhs, {}); + return checkBlockStructure(lhs, rhs, {}, false); } void assertBlocksHaveEqualStructure(const Block & lhs, const Block & rhs, const std::string & context_description) { - checkBlockStructure(lhs, rhs, context_description); + checkBlockStructure(lhs, rhs, context_description, false); +} + + +bool isCompatibleHeader(const Block & actual, const Block & desired) +{ + return checkBlockStructure(actual, desired, {}, true); +} + + +void assertCompatibleHeader(const Block & actual, const Block & desired, const std::string & context_description) +{ + checkBlockStructure(actual, desired, context_description, true); } diff --git a/src/Core/Block.h b/src/Core/Block.h index 0cfd17b27dc..6a94034b8fd 100644 --- a/src/Core/Block.h +++ b/src/Core/Block.h @@ -184,6 +184,12 @@ bool blocksHaveEqualStructure(const Block & lhs, const Block & rhs); /// Throw exception when blocks are different. void assertBlocksHaveEqualStructure(const Block & lhs, const Block & rhs, const std::string & context_description); +/// The actual header is compatible with the desired one if the blocks have equal structure except for constants. +/// It is allowed for a column to be constant in the actual header but not in the desired one. +/// If both columns are constant, it is checked that they have the same value. +bool isCompatibleHeader(const Block & actual, const Block & desired); +void assertCompatibleHeader(const Block & actual, const Block & desired, const std::string & context_description); + /// Calculate difference in structure of blocks and write description into output strings. NOTE It doesn't compare values of constant columns.
void getBlocksDifference(const Block & lhs, const Block & rhs, std::string & out_lhs_diff, std::string & out_rhs_diff); diff --git a/src/Core/DecimalFunctions.h b/src/Core/DecimalFunctions.h index 2cd50ab8d08..2131a6e3c33 100644 --- a/src/Core/DecimalFunctions.h +++ b/src/Core/DecimalFunctions.h @@ -103,7 +103,10 @@ inline DecimalType decimalFromComponentsWithMultiplier( if (common::mulOverflow(whole, scale_multiplier, whole_scaled)) throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW); - const T value = whole_scaled + fractional_sign * (fractional % scale_multiplier); + T value; + if (common::addOverflow(whole_scaled, fractional_sign * (fractional % scale_multiplier), value)) + throw Exception("Decimal math overflow", ErrorCodes::DECIMAL_OVERFLOW); + return DecimalType(value); } diff --git a/src/Core/Field.h b/src/Core/Field.h index 77549854982..5c4c2e165ad 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -96,7 +96,7 @@ template bool decimalEqual(T x, T y, UInt32 x_scale, UInt32 y_scale template bool decimalLess(T x, T y, UInt32 x_scale, UInt32 y_scale); template bool decimalLessOrEqual(T x, T y, UInt32 x_scale, UInt32 y_scale); -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -159,7 +159,7 @@ private: T dec; UInt32 scale; }; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif @@ -563,7 +563,7 @@ public: { case Types::Null: return f(field.template get()); // gcc 8.2.1 -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -583,7 +583,7 @@ public: case Types::Int128: return f(field.template get()); case Types::UInt256: return f(field.template get()); case Types::Int256: return f(field.template get()); -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif } diff --git a/src/Core/Settings.h b/src/Core/Settings.h index c566d18ca4f..eaa5e2d34f8 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -70,6 +70,7 @@ class IColumn; M(UInt64, connections_with_failover_max_tries, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES, "The maximum number of attempts to connect to replicas.", 0) \ M(UInt64, s3_min_upload_part_size, 512*1024*1024, "The minimum size of part to upload during multipart upload to S3.", 0) \ M(UInt64, s3_max_single_part_upload_size, 64*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ + M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ M(Bool, extremes, false, "Calculate minimums and maximums of the result columns. They can be output in JSON-formats.", IMPORTANT) \ @@ -142,7 +143,7 @@ class IColumn; M(UInt64, optimize_min_equality_disjunction_chain_length, 3, "The minimum length of the expression `expr = x1 OR ... expr = xN` for optimization ", 0) \ \ M(UInt64, min_bytes_to_use_direct_io, 0, "The minimum number of bytes for reading the data with O_DIRECT option during SELECT queries execution. 0 - disabled.", 0) \ - M(UInt64, min_bytes_to_use_mmap_io, (64 * 1024 * 1024), "The minimum number of bytes for reading the data with mmap option during SELECT queries execution. 
0 - disabled.", 0) \ + M(UInt64, min_bytes_to_use_mmap_io, 0, "The minimum number of bytes for reading the data with mmap option during SELECT queries execution. 0 - disabled.", 0) \ M(Bool, checksum_on_read, true, "Validate checksums on reading. It is enabled by default and should be always enabled in production. Please do not expect any benefits in disabling this setting. It may only be used for experiments and benchmarks. The setting only applicable for tables of MergeTree family. Checksums are always validated for other table engines and when receiving data over network.", 0) \ \ M(Bool, force_index_by_date, 0, "Throw an exception if there is a partition key in a table, and it is not used.", 0) \ @@ -224,6 +225,7 @@ class IColumn; /** Settings for testing hedged requests */ \ M(Milliseconds, sleep_in_send_tables_status_ms, 0, "Time to sleep in sending tables status response in TCPHandler", 0) \ M(Milliseconds, sleep_in_send_data_ms, 0, "Time to sleep in sending data in TCPHandler", 0) \ + M(UInt64, unknown_packet_in_send_data, 0, "Send unknown packet instead of data Nth data packet", 0) \ \ M(Bool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.", 0) \ M(Seconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.", 0) \ @@ -424,6 +426,7 @@ class IColumn; M(Bool, allow_non_metadata_alters, true, "Allow to execute alters which affects not only tables metadata, but also data on disk", 0) \ M(Bool, enable_global_with_statement, true, "Propagate WITH statements to UNION queries and all subqueries", 0) \ M(Bool, aggregate_functions_null_for_empty, false, "Rewrite all aggregate functions in a query, adding -OrNull suffix to them", 0) \ + M(Bool, optimize_fuse_sum_count_avg, false, "Fuse aggregate functions sum(), avg(), count() with identical arguments into one sumCount() call, if the query has at least two different functions", 0) \ M(Bool, flatten_nested, true, "If true, columns of type Nested will be flatten to separate array columns instead of one array of tuples", 0) \ M(Bool, asterisk_include_materialized_columns, false, "Include MATERIALIZED columns for wildcard query", 0) \ M(Bool, asterisk_include_alias_columns, false, "Include ALIAS columns for wildcard query", 0) \ @@ -445,6 +448,8 @@ class IColumn; M(Bool, database_replicated_always_detach_permanently, false, "Execute DETACH TABLE as DETACH TABLE PERMANENTLY if database engine is Replicated", 0) \ M(DistributedDDLOutputMode, distributed_ddl_output_mode, DistributedDDLOutputMode::THROW, "Format of distributed DDL query result", 0) \ M(UInt64, distributed_ddl_entry_format_version, 1, "Version of DDL entry to write into ZooKeeper", 0) \ + M(UInt64, external_storage_max_read_rows, 0, "Limit maximum number of rows when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializeMySQL. If equal to 0, this setting is disabled", 0) \ + M(UInt64, external_storage_max_read_bytes, 0, "Limit maximum number of bytes when table with external engine should flush history data. Now supported only for MySQL table engine, database engine, dictionary and MaterializeMySQL. If equal to 0, this setting is disabled", 0) \ \ /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. 
*/ \ \ diff --git a/src/Core/Types.h b/src/Core/Types.h index a463455cbe1..b9ecda4a46d 100644 --- a/src/Core/Types.h +++ b/src/Core/Types.h @@ -15,7 +15,7 @@ namespace DB struct Null {}; /// Ignore strange gcc warning https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55776 -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" #endif @@ -59,7 +59,7 @@ enum class TypeIndex LowCardinality, Map, }; -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif diff --git a/src/DataTypes/DataTypeDateTime.h b/src/DataTypes/DataTypeDateTime.h index 84df8b21813..926d529a5d8 100644 --- a/src/DataTypes/DataTypeDateTime.h +++ b/src/DataTypes/DataTypeDateTime.h @@ -19,9 +19,12 @@ public: TimezoneMixin(const TimezoneMixin &) = default; const DateLUTImpl & getTimeZone() const { return time_zone; } + bool hasExplicitTimeZone() const { return has_explicit_time_zone; } protected: + /// true if time zone name was provided in data type parameters, false if it's using default time zone. bool has_explicit_time_zone; + const DateLUTImpl & time_zone; const DateLUTImpl & utc_time_zone; }; diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index c9e533dfafb..6d564bc29a3 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -365,8 +365,8 @@ void DatabaseAtomic::assertDetachedTableNotInUse(const UUID & uuid) /// 4. INSERT INTO table ...; (both Storage instances writes data without any synchronization) /// To avoid it, we remember UUIDs of detached tables and does not allow ATTACH table with such UUID until detached instance still in use. if (detached_tables.count(uuid)) - throw Exception("Cannot attach table with UUID " + toString(uuid) + - ", because it was detached but still used by some query. Retry later.", ErrorCodes::TABLE_ALREADY_EXISTS); + throw Exception(ErrorCodes::TABLE_ALREADY_EXISTS, "Cannot attach table with UUID {}, " + "because it was detached but still used by some query. Retry later.", toString(uuid)); } void DatabaseAtomic::setDetachedTableNotInUseForce(const UUID & uuid) @@ -573,12 +573,6 @@ void DatabaseAtomic::renameDictionaryInMemoryUnlocked(const StorageID & old_name } void DatabaseAtomic::waitDetachedTableNotInUse(const UUID & uuid) { - { - std::lock_guard lock{mutex}; - if (detached_tables.count(uuid) == 0) - return; - } - /// Table is in use while its shared_ptr counter is greater than 1. /// We cannot trigger condvar on shared_ptr destruction, so it's busy wait. 
while (true) @@ -594,5 +588,13 @@ void DatabaseAtomic::waitDetachedTableNotInUse(const UUID & uuid) } } +void DatabaseAtomic::checkDetachedTableNotInUse(const UUID & uuid) +{ + DetachedTables not_in_use; + std::lock_guard lock{mutex}; + not_in_use = cleanupDetachedTables(); + assertDetachedTableNotInUse(uuid); +} + } diff --git a/src/Databases/DatabaseAtomic.h b/src/Databases/DatabaseAtomic.h index d35495fc962..695d22360ca 100644 --- a/src/Databases/DatabaseAtomic.h +++ b/src/Databases/DatabaseAtomic.h @@ -58,6 +58,7 @@ public: void tryRemoveSymlink(const String & table_name); void waitDetachedTableNotInUse(const UUID & uuid) override; + void checkDetachedTableNotInUse(const UUID & uuid) override; void setDetachedTableNotInUseForce(const UUID & uuid); protected: diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 03c2849ac94..9d09ac731d2 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -158,7 +158,7 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String const auto & [remote_host_name, remote_port] = parseAddress(host_port, 3306); MySQLClient client(remote_host_name, remote_port, mysql_user_name, mysql_user_password); - auto mysql_pool = mysqlxx::Pool(mysql_database_name, remote_host_name, mysql_user_name, mysql_user_password); + auto mysql_pool = mysqlxx::Pool(mysql_database_name, remote_host_name, mysql_user_name, mysql_user_password, remote_port); auto materialize_mode_settings = std::make_unique(); diff --git a/src/Databases/DatabaseWithDictionaries.cpp b/src/Databases/DatabaseWithDictionaries.cpp index 5180b251a5f..c97417e292c 100644 --- a/src/Databases/DatabaseWithDictionaries.cpp +++ b/src/Databases/DatabaseWithDictionaries.cpp @@ -169,11 +169,22 @@ void DatabaseWithDictionaries::createDictionary(ContextPtr local_context, const } bool succeeded = false; + bool uuid_locked = false; SCOPE_EXIT({ if (!succeeded) + { + if (uuid_locked) + DatabaseCatalog::instance().removeUUIDMappingFinally(dict_id.uuid); Poco::File(dictionary_metadata_tmp_path).remove(); + } }); + if (dict_id.uuid != UUIDHelpers::Nil) + { + DatabaseCatalog::instance().addUUIDMapping(dict_id.uuid); + uuid_locked = true; + } + /// Add a temporary repository containing the dictionary. /// We need this temp repository to try loading the dictionary before actually attaching it to the database. auto temp_repository = external_loader.addConfigRepository(std::make_unique( diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 6cd0e468709..8c356b88460 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -345,7 +345,8 @@ public: virtual void assertCanBeDetached(bool /*cleanup*/) {} - virtual void waitDetachedTableNotInUse(const UUID & /*uuid*/) { assert(false); } + virtual void waitDetachedTableNotInUse(const UUID & /*uuid*/) { } + virtual void checkDetachedTableNotInUse(const UUID & /*uuid*/) { } /// Ask all tables to complete the background threads they are using and delete all table objects. 
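The new DatabaseAtomic::checkDetachedTableNotInUse() complements waitDetachedTableNotInUse(): instead of busy-waiting it takes the lock, cleans up detached tables that are no longer referenced, and throws if the UUID is still registered. A minimal standalone sketch of that check-and-throw pattern, assuming a simplified registry (DetachedTableRegistry and the integer UUID below are illustrative stand-ins, not the real ClickHouse types):

#include <cstdint>
#include <mutex>
#include <set>
#include <stdexcept>
#include <string>

using UUID = std::uint64_t;   // stand-in for the real UUID type

class DetachedTableRegistry
{
public:
    void markDetached(UUID uuid)
    {
        std::lock_guard lock{mutex};
        detached.insert(uuid);
    }

    /// Analogous to cleanupDetachedTables(): forget tables nobody references any more.
    void markReleased(UUID uuid)
    {
        std::lock_guard lock{mutex};
        detached.erase(uuid);
    }

    /// Fails fast (instead of waiting) if a table with this UUID was detached
    /// but is still referenced by some query.
    void checkDetachedTableNotInUse(UUID uuid)
    {
        std::lock_guard lock{mutex};
        if (detached.count(uuid))
            throw std::runtime_error(
                "Cannot attach table with UUID " + std::to_string(uuid)
                + ", because it was detached but is still used by some query. Retry later.");
    }

private:
    std::mutex mutex;
    std::set<UUID> detached;
};

int main()
{
    DetachedTableRegistry registry;
    registry.markDetached(42);
    registry.markReleased(42);
    registry.checkDetachedTableNotInUse(42);   // passes: 42 is no longer registered as detached
}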
virtual void shutdown() = 0; diff --git a/src/Databases/MySQL/DatabaseConnectionMySQL.cpp b/src/Databases/MySQL/DatabaseConnectionMySQL.cpp index 9615aa85e05..4e63bed9c6d 100644 --- a/src/Databases/MySQL/DatabaseConnectionMySQL.cpp +++ b/src/Databases/MySQL/DatabaseConnectionMySQL.cpp @@ -198,7 +198,7 @@ ASTPtr DatabaseConnectionMySQL::getCreateDatabaseQuery() const void DatabaseConnectionMySQL::fetchTablesIntoLocalCache(ContextPtr local_context) const { - const auto & tables_with_modification_time = fetchTablesWithModificationTime(); + const auto & tables_with_modification_time = fetchTablesWithModificationTime(local_context); destroyLocalCacheExtraTables(tables_with_modification_time); fetchLatestTablesStructureIntoCache(tables_with_modification_time, local_context); @@ -252,7 +252,7 @@ void DatabaseConnectionMySQL::fetchLatestTablesStructureIntoCache( } } -std::map DatabaseConnectionMySQL::fetchTablesWithModificationTime() const +std::map DatabaseConnectionMySQL::fetchTablesWithModificationTime(ContextPtr local_context) const { Block tables_status_sample_block { @@ -268,7 +268,8 @@ std::map DatabaseConnectionMySQL::fetchTablesWithModificationTim " WHERE TABLE_SCHEMA = " << quote << database_name_in_mysql; std::map tables_with_modification_time; - MySQLBlockInputStream result(mysql_pool.get(), query.str(), tables_status_sample_block, DEFAULT_BLOCK_SIZE); + StreamSettings mysql_input_stream_settings(local_context->getSettingsRef()); + MySQLBlockInputStream result(mysql_pool.get(), query.str(), tables_status_sample_block, mysql_input_stream_settings); while (Block block = result.read()) { @@ -292,7 +293,7 @@ DatabaseConnectionMySQL::fetchTablesColumnsList(const std::vector & tabl mysql_pool, database_name_in_mysql, tables_name, - settings.external_table_functions_use_nulls, + settings, database_settings->mysql_datatypes_support_level); } diff --git a/src/Databases/MySQL/DatabaseConnectionMySQL.h b/src/Databases/MySQL/DatabaseConnectionMySQL.h index a626924d6dd..7e81003e9a9 100644 --- a/src/Databases/MySQL/DatabaseConnectionMySQL.h +++ b/src/Databases/MySQL/DatabaseConnectionMySQL.h @@ -108,7 +108,7 @@ private: void fetchTablesIntoLocalCache(ContextPtr context) const; - std::map fetchTablesWithModificationTime() const; + std::map fetchTablesWithModificationTime(ContextPtr local_context) const; std::map fetchTablesColumnsList(const std::vector & tables_name, ContextPtr context) const; diff --git a/src/Databases/MySQL/FetchTablesColumnsList.cpp b/src/Databases/MySQL/FetchTablesColumnsList.cpp index 6344553ba69..cfd01d4ddc4 100644 --- a/src/Databases/MySQL/FetchTablesColumnsList.cpp +++ b/src/Databases/MySQL/FetchTablesColumnsList.cpp @@ -44,7 +44,7 @@ std::map fetchTablesColumnsList( mysqlxx::PoolWithFailover & pool, const String & database_name, const std::vector & tables_name, - bool external_table_functions_use_nulls, + const Settings & settings, MultiEnum type_support) { std::map tables_and_columns; @@ -72,13 +72,18 @@ std::map fetchTablesColumnsList( " IS_NULLABLE = 'YES' AS is_nullable," " COLUMN_TYPE LIKE '%unsigned' AS is_unsigned," " CHARACTER_MAXIMUM_LENGTH AS length," - " NUMERIC_PRECISION as ''," + " NUMERIC_PRECISION as numeric_precision," " IF(ISNULL(NUMERIC_SCALE), DATETIME_PRECISION, NUMERIC_SCALE) AS scale" // we know DATETIME_PRECISION as a scale in CH " FROM INFORMATION_SCHEMA.COLUMNS" - " WHERE TABLE_SCHEMA = " << quote << database_name - << " AND TABLE_NAME IN " << toQueryStringWithQuote(tables_name) << " ORDER BY ORDINAL_POSITION"; + " WHERE "; - MySQLBlockInputStream 
result(pool.get(), query.str(), tables_columns_sample_block, DEFAULT_BLOCK_SIZE); + if (!database_name.empty()) + query << " TABLE_SCHEMA = " << quote << database_name << " AND "; + + query << " TABLE_NAME IN " << toQueryStringWithQuote(tables_name) << " ORDER BY ORDINAL_POSITION"; + + StreamSettings mysql_input_stream_settings(settings); + MySQLBlockInputStream result(pool.get(), query.str(), tables_columns_sample_block, mysql_input_stream_settings); while (Block block = result.read()) { const auto & table_name_col = *block.getByPosition(0).column; @@ -99,7 +104,7 @@ std::map fetchTablesColumnsList( convertMySQLDataType( type_support, column_type_col[i].safeGet(), - external_table_functions_use_nulls && is_nullable_col[i].safeGet(), + settings.external_table_functions_use_nulls && is_nullable_col[i].safeGet(), is_unsigned_col[i].safeGet(), char_max_length_col[i].safeGet(), precision_col[i].safeGet(), diff --git a/src/Databases/MySQL/FetchTablesColumnsList.h b/src/Databases/MySQL/FetchTablesColumnsList.h index d609f4896e7..55f18e0115f 100644 --- a/src/Databases/MySQL/FetchTablesColumnsList.h +++ b/src/Databases/MySQL/FetchTablesColumnsList.h @@ -12,6 +12,7 @@ #include #include +#include namespace DB { @@ -20,7 +21,7 @@ std::map fetchTablesColumnsList( mysqlxx::PoolWithFailover & pool, const String & database_name, const std::vector & tables_name, - bool external_table_functions_use_nulls, + const Settings & settings, MultiEnum type_support); } diff --git a/src/Databases/MySQL/MaterializeMetadata.cpp b/src/Databases/MySQL/MaterializeMetadata.cpp index f5e648903ed..c389ab5a1b0 100644 --- a/src/Databases/MySQL/MaterializeMetadata.cpp +++ b/src/Databases/MySQL/MaterializeMetadata.cpp @@ -24,7 +24,8 @@ namespace ErrorCodes } static std::unordered_map fetchTablesCreateQuery( - const mysqlxx::PoolWithFailover::Entry & connection, const String & database_name, const std::vector & fetch_tables) + const mysqlxx::PoolWithFailover::Entry & connection, const String & database_name, + const std::vector & fetch_tables, const Settings & global_settings) { std::unordered_map tables_create_query; for (const auto & fetch_table_name : fetch_tables) @@ -34,9 +35,10 @@ static std::unordered_map fetchTablesCreateQuery( {std::make_shared(), "Create Table"}, }; + StreamSettings mysql_input_stream_settings(global_settings, false, true); MySQLBlockInputStream show_create_table( connection, "SHOW CREATE TABLE " + backQuoteIfNeed(database_name) + "." 
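fetchTablesColumnsList() now adds the TABLE_SCHEMA filter only when a database name is given, so the same INFORMATION_SCHEMA query can be reused in both cases. A hedged sketch of that conditional query assembly, using std::ostringstream instead of the WriteBuffer-based stream in the patch (quoting is simplified; the real code goes through quote/toQueryStringWithQuote):

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

/// Adds the TABLE_SCHEMA condition only when a database name is actually known.
std::string buildColumnsQuery(const std::string & database_name, const std::vector<std::string> & tables)
{
    std::ostringstream query;
    query << "SELECT TABLE_NAME, COLUMN_NAME FROM INFORMATION_SCHEMA.COLUMNS WHERE ";

    if (!database_name.empty())
        query << "TABLE_SCHEMA = '" << database_name << "' AND ";

    query << "TABLE_NAME IN (";
    for (std::size_t i = 0; i < tables.size(); ++i)
        query << (i ? ", '" : "'") << tables[i] << "'";
    query << ") ORDER BY ORDINAL_POSITION";

    return query.str();
}

int main()
{
    std::cout << buildColumnsQuery("", {"t1", "t2"}) << "\n";   // no TABLE_SCHEMA filter
    std::cout << buildColumnsQuery("db", {"t1"}) << "\n";       // filtered by database
}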
+ backQuoteIfNeed(fetch_table_name), - show_create_table_header, DEFAULT_BLOCK_SIZE, false, true); + show_create_table_header, mysql_input_stream_settings); Block create_query_block = show_create_table.read(); if (!create_query_block || create_query_block.rows() != 1) @@ -49,13 +51,14 @@ static std::unordered_map fetchTablesCreateQuery( } -static std::vector fetchTablesInDB(const mysqlxx::PoolWithFailover::Entry & connection, const std::string & database) +static std::vector fetchTablesInDB(const mysqlxx::PoolWithFailover::Entry & connection, const std::string & database, const Settings & global_settings) { Block header{{std::make_shared(), "table_name"}}; String query = "SELECT TABLE_NAME AS table_name FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE != 'VIEW' AND TABLE_SCHEMA = " + quoteString(database); std::vector tables_in_db; - MySQLBlockInputStream input(connection, query, header, DEFAULT_BLOCK_SIZE); + StreamSettings mysql_input_stream_settings(global_settings); + MySQLBlockInputStream input(connection, query, header, mysql_input_stream_settings); while (Block block = input.read()) { @@ -77,7 +80,8 @@ void MaterializeMetadata::fetchMasterStatus(mysqlxx::PoolWithFailover::Entry & c {std::make_shared(), "Executed_Gtid_Set"}, }; - MySQLBlockInputStream input(connection, "SHOW MASTER STATUS;", header, DEFAULT_BLOCK_SIZE, false, true); + StreamSettings mysql_input_stream_settings(settings, false, true); + MySQLBlockInputStream input(connection, "SHOW MASTER STATUS;", header, mysql_input_stream_settings); Block master_status = input.read(); if (!master_status || master_status.rows() != 1) @@ -99,7 +103,8 @@ void MaterializeMetadata::fetchMasterVariablesValue(const mysqlxx::PoolWithFailo }; const String & fetch_query = "SHOW VARIABLES WHERE Variable_name = 'binlog_checksum'"; - MySQLBlockInputStream variables_input(connection, fetch_query, variables_header, DEFAULT_BLOCK_SIZE, false, true); + StreamSettings mysql_input_stream_settings(settings, false, true); + MySQLBlockInputStream variables_input(connection, fetch_query, variables_header, mysql_input_stream_settings); while (Block variables_block = variables_input.read()) { @@ -114,7 +119,7 @@ void MaterializeMetadata::fetchMasterVariablesValue(const mysqlxx::PoolWithFailo } } -static bool checkSyncUserPrivImpl(const mysqlxx::PoolWithFailover::Entry & connection, WriteBuffer & out) +static bool checkSyncUserPrivImpl(const mysqlxx::PoolWithFailover::Entry & connection, const Settings & global_settings, WriteBuffer & out) { Block sync_user_privs_header { @@ -122,7 +127,8 @@ static bool checkSyncUserPrivImpl(const mysqlxx::PoolWithFailover::Entry & conne }; String grants_query, sub_privs; - MySQLBlockInputStream input(connection, "SHOW GRANTS FOR CURRENT_USER();", sync_user_privs_header, DEFAULT_BLOCK_SIZE); + StreamSettings mysql_input_stream_settings(global_settings); + MySQLBlockInputStream input(connection, "SHOW GRANTS FOR CURRENT_USER();", sync_user_privs_header, mysql_input_stream_settings); while (Block block = input.read()) { for (size_t index = 0; index < block.rows(); ++index) @@ -146,11 +152,11 @@ static bool checkSyncUserPrivImpl(const mysqlxx::PoolWithFailover::Entry & conne return false; } -static void checkSyncUserPriv(const mysqlxx::PoolWithFailover::Entry & connection) +static void checkSyncUserPriv(const mysqlxx::PoolWithFailover::Entry & connection, const Settings & global_settings) { WriteBufferFromOwnString out; - if (!checkSyncUserPrivImpl(connection, out)) + if (!checkSyncUserPrivImpl(connection, global_settings, 
out)) throw Exception("MySQL SYNC USER ACCESS ERR: mysql sync user needs " "at least GLOBAL PRIVILEGES:'RELOAD, REPLICATION SLAVE, REPLICATION CLIENT' " "and SELECT PRIVILEGE on MySQL Database." @@ -167,7 +173,8 @@ bool MaterializeMetadata::checkBinlogFileExists(const mysqlxx::PoolWithFailover: {std::make_shared(), "File_size"} }; - MySQLBlockInputStream input(connection, "SHOW MASTER LOGS", logs_header, DEFAULT_BLOCK_SIZE, false, true); + StreamSettings mysql_input_stream_settings(settings, false, true); + MySQLBlockInputStream input(connection, "SHOW MASTER LOGS", logs_header, mysql_input_stream_settings); while (Block block = input.read()) { @@ -222,7 +229,7 @@ void MaterializeMetadata::transaction(const MySQLReplication::Position & positio commitMetadata(std::move(fun), persistent_tmp_path, persistent_path); } -MaterializeMetadata::MaterializeMetadata(const String & path_) : persistent_path(path_) +MaterializeMetadata::MaterializeMetadata(const String & path_, const Settings & settings_) : persistent_path(path_), settings(settings_) { if (Poco::File(persistent_path).exists()) { @@ -244,7 +251,7 @@ void MaterializeMetadata::startReplication( mysqlxx::PoolWithFailover::Entry & connection, const String & database, bool & opened_transaction, std::unordered_map & need_dumping_tables) { - checkSyncUserPriv(connection); + checkSyncUserPriv(connection, settings); if (checkBinlogFileExists(connection)) return; @@ -263,7 +270,7 @@ void MaterializeMetadata::startReplication( connection->query("START TRANSACTION /*!40100 WITH CONSISTENT SNAPSHOT */;").execute(); opened_transaction = true; - need_dumping_tables = fetchTablesCreateQuery(connection, database, fetchTablesInDB(connection, database)); + need_dumping_tables = fetchTablesCreateQuery(connection, database, fetchTablesInDB(connection, database, settings), settings); connection->query("UNLOCK TABLES;").execute(); } catch (...) 
diff --git a/src/Databases/MySQL/MaterializeMetadata.h b/src/Databases/MySQL/MaterializeMetadata.h index 3a82d1349ba..079786c261c 100644 --- a/src/Databases/MySQL/MaterializeMetadata.h +++ b/src/Databases/MySQL/MaterializeMetadata.h @@ -10,6 +10,7 @@ #include #include #include +#include namespace DB { @@ -25,6 +26,7 @@ namespace DB struct MaterializeMetadata { const String persistent_path; + const Settings settings; String binlog_file; UInt64 binlog_position; @@ -50,7 +52,7 @@ struct MaterializeMetadata bool & opened_transaction, std::unordered_map & need_dumping_tables); - MaterializeMetadata(const String & path_); + MaterializeMetadata(const String & path_, const Settings & settings_); }; } diff --git a/src/Databases/MySQL/MaterializeMySQLSyncThread.cpp b/src/Databases/MySQL/MaterializeMySQLSyncThread.cpp index b8e135eef32..82161ac5c8d 100644 --- a/src/Databases/MySQL/MaterializeMySQLSyncThread.cpp +++ b/src/Databases/MySQL/MaterializeMySQLSyncThread.cpp @@ -90,7 +90,7 @@ MaterializeMySQLSyncThread::~MaterializeMySQLSyncThread() } } -static void checkMySQLVariables(const mysqlxx::Pool::Entry & connection) +static void checkMySQLVariables(const mysqlxx::Pool::Entry & connection, const Settings & settings) { Block variables_header{ {std::make_shared(), "Variable_name"}, @@ -104,19 +104,19 @@ static void checkMySQLVariables(const mysqlxx::Pool::Entry & connection) "OR (Variable_name = 'default_authentication_plugin' AND upper(Value) = 'MYSQL_NATIVE_PASSWORD') " "OR (Variable_name = 'log_bin_use_v1_row_events' AND upper(Value) = 'OFF');"; - MySQLBlockInputStream variables_input(connection, check_query, variables_header, DEFAULT_BLOCK_SIZE, false, true); + StreamSettings mysql_input_stream_settings(settings, false, true); + MySQLBlockInputStream variables_input(connection, check_query, variables_header, mysql_input_stream_settings); - Block variables_block = variables_input.read(); - if (!variables_block || variables_block.rows() != 5) + std::unordered_map variables_error_message{ + {"log_bin", "log_bin = 'ON'"}, + {"binlog_format", "binlog_format='ROW'"}, + {"binlog_row_image", "binlog_row_image='FULL'"}, + {"default_authentication_plugin", "default_authentication_plugin='mysql_native_password'"}, + {"log_bin_use_v1_row_events", "log_bin_use_v1_row_events='OFF'"} + }; + + while (Block variables_block = variables_input.read()) { - std::unordered_map variables_error_message{ - {"log_bin", "log_bin = 'ON'"}, - {"binlog_format", "binlog_format='ROW'"}, - {"binlog_row_image", "binlog_row_image='FULL'"}, - {"default_authentication_plugin", "default_authentication_plugin='mysql_native_password'"}, - {"log_bin_use_v1_row_events", "log_bin_use_v1_row_events='OFF'"} - }; - ColumnPtr variable_name_column = variables_block.getByName("Variable_name").column; for (size_t index = 0; index < variables_block.rows(); ++index) @@ -126,7 +126,10 @@ static void checkMySQLVariables(const mysqlxx::Pool::Entry & connection) if (error_message_it != variables_error_message.end()) variables_error_message.erase(error_message_it); } + } + if (!variables_error_message.empty()) + { bool first = true; WriteBufferFromOwnString error_message; error_message << "Illegal MySQL variables, the MaterializeMySQL engine requires "; @@ -167,7 +170,7 @@ void MaterializeMySQLSyncThread::synchronization() try { MaterializeMetadata metadata( - DatabaseCatalog::instance().getDatabase(database_name)->getMetadataPath() + "/.metadata"); + DatabaseCatalog::instance().getDatabase(database_name)->getMetadataPath() + "/.metadata", 
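The rewritten checkMySQLVariables() no longer expects all five variables in a single 5-row block: it streams result blocks, erases each expected variable from a requirements map as it is observed, and reports whatever remains. The same pattern in isolation (the names and the fake result blocks are illustrative):

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

int main()
{
    /// Requirements the MySQL server must satisfy, keyed by variable name.
    std::unordered_map<std::string, std::string> expected{
        {"log_bin", "log_bin = 'ON'"},
        {"binlog_format", "binlog_format='ROW'"},
        {"binlog_row_image", "binlog_row_image='FULL'"},
    };

    /// Variable names reported back by the server, possibly split across several result blocks.
    std::vector<std::vector<std::string>> blocks{{"log_bin"}, {"binlog_format"}};

    for (const auto & block : blocks)
        for (const auto & name : block)
            expected.erase(name);   // seen, so the requirement is satisfied

    if (!expected.empty())
    {
        std::cerr << "Illegal MySQL variables, required: ";
        bool first = true;
        for (const auto & kv : expected)
        {
            std::cerr << (first ? "" : ", ") << kv.second;
            first = false;
        }
        std::cerr << "\n";
        return 1;
    }
}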
getContext()->getSettingsRef()); bool need_reconnect = true; Stopwatch watch; @@ -240,7 +243,7 @@ void MaterializeMySQLSyncThread::assertMySQLAvailable() { try { - checkMySQLVariables(pool.get()); + checkMySQLVariables(pool.get(), getContext()->getSettingsRef()); } catch (const mysqlxx::ConnectionFailed & e) { @@ -326,9 +329,10 @@ static inline void dumpDataForTables( tryToExecuteQuery(query_prefix + " " + iterator->second, query_context, database_name, comment); /// create table. auto out = std::make_shared(getTableOutput(database_name, table_name, query_context)); + StreamSettings mysql_input_stream_settings(context->getSettingsRef()); MySQLBlockInputStream input( connection, "SELECT * FROM " + backQuoteIfNeed(mysql_database_name) + "." + backQuoteIfNeed(table_name), - out->getHeader(), DEFAULT_BLOCK_SIZE); + out->getHeader(), mysql_input_stream_settings); Stopwatch watch; copyData(input, *out, is_cancelled); @@ -375,7 +379,7 @@ bool MaterializeMySQLSyncThread::prepareSynchronized(MaterializeMetadata & metad opened_transaction = false; - checkMySQLVariables(connection); + checkMySQLVariables(connection, getContext()->getSettingsRef()); std::unordered_map need_dumping_tables; metadata.startReplication(connection, mysql_database_name, opened_transaction, need_dumping_tables); diff --git a/src/Dictionaries/MySQLDictionarySource.cpp b/src/Dictionaries/MySQLDictionarySource.cpp index a78da1c7e74..676863ae588 100644 --- a/src/Dictionaries/MySQLDictionarySource.cpp +++ b/src/Dictionaries/MySQLDictionarySource.cpp @@ -4,9 +4,15 @@ #include "DictionarySourceFactory.h" #include "DictionaryStructure.h" #include "registerDictionaries.h" +#include +#include namespace DB { + +[[maybe_unused]] +static const size_t default_num_tries_on_connection_loss = 3; + namespace ErrorCodes { extern const int SUPPORT_IS_DISABLED; @@ -14,20 +20,20 @@ namespace ErrorCodes void registerDictionarySourceMysql(DictionarySourceFactory & factory) { - auto create_table_source = [=](const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - Block & sample_block, - ContextPtr /* context */, + auto create_table_source = [=]([[maybe_unused]] const DictionaryStructure & dict_struct, + [[maybe_unused]] const Poco::Util::AbstractConfiguration & config, + [[maybe_unused]] const std::string & config_prefix, + [[maybe_unused]] Block & sample_block, + [[maybe_unused]] ContextPtr context, const std::string & /* default_database */, bool /* check_config */) -> DictionarySourcePtr { #if USE_MYSQL - return std::make_unique(dict_struct, config, config_prefix + ".mysql", sample_block); + StreamSettings mysql_input_stream_settings(context->getSettingsRef() + , config.getBool(config_prefix + ".mysql.close_connection", false) || config.getBool(config_prefix + ".mysql.share_connection", false) + , false + , config.getBool(config_prefix + ".mysql.fail_on_connection_loss", false) ? 
1 : default_num_tries_on_connection_loss); + return std::make_unique(dict_struct, config, config_prefix + ".mysql", sample_block, mysql_input_stream_settings); #else - (void)dict_struct; - (void)config; - (void)config_prefix; - (void)sample_block; throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Dictionary source of type `mysql` is disabled because ClickHouse was built without mysql support."); #endif @@ -45,22 +51,21 @@ void registerDictionarySourceMysql(DictionarySourceFactory & factory) # include # include # include -# include # include "readInvalidateQuery.h" # include # include +# include namespace DB { -static const UInt64 max_block_size = 8192; -static const size_t default_num_tries_on_connection_loss = 3; MySQLDictionarySource::MySQLDictionarySource( const DictionaryStructure & dict_struct_, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - const Block & sample_block_) + const Block & sample_block_, + const StreamSettings & settings_) : log(&Poco::Logger::get("MySQLDictionarySource")) , update_time{std::chrono::system_clock::from_time_t(0)} , dict_struct{dict_struct_} @@ -74,10 +79,7 @@ MySQLDictionarySource::MySQLDictionarySource( , query_builder{dict_struct, db, "", table, where, IdentifierQuotingStyle::Backticks} , load_all_query{query_builder.composeLoadAllQuery()} , invalidate_query{config.getString(config_prefix + ".invalidate_query", "")} - , close_connection( - config.getBool(config_prefix + ".close_connection", false) || config.getBool(config_prefix + ".share_connection", false)) - , max_tries_for_mysql_block_input_stream( - config.getBool(config_prefix + ".fail_on_connection_loss", false) ? 1 : default_num_tries_on_connection_loss) + , settings(settings_) { } @@ -98,8 +100,7 @@ MySQLDictionarySource::MySQLDictionarySource(const MySQLDictionarySource & other , last_modification{other.last_modification} , invalidate_query{other.invalidate_query} , invalidate_query_response{other.invalidate_query_response} - , close_connection{other.close_connection} - , max_tries_for_mysql_block_input_stream{other.max_tries_for_mysql_block_input_stream} + , settings(other.settings) { } @@ -122,7 +123,7 @@ std::string MySQLDictionarySource::getUpdateFieldAndDate() BlockInputStreamPtr MySQLDictionarySource::loadFromQuery(const String & query) { return std::make_shared( - pool, query, sample_block, max_block_size, close_connection, false, max_tries_for_mysql_block_input_stream); + pool, query, sample_block, settings); } BlockInputStreamPtr MySQLDictionarySource::loadAll() @@ -245,7 +246,7 @@ LocalDateTime MySQLDictionarySource::getLastModification(mysqlxx::Pool::Entry & ++fetched_rows; } - if (close_connection && allow_connection_closure) + if (settings.auto_close && allow_connection_closure) { connection.disconnect(); } @@ -269,7 +270,7 @@ std::string MySQLDictionarySource::doInvalidateQuery(const std::string & request Block invalidate_sample_block; ColumnPtr column(ColumnString::create()); invalidate_sample_block.insert(ColumnWithTypeAndName(column, std::make_shared(), "Sample Block")); - MySQLBlockInputStream block_input_stream(pool->get(), request, invalidate_sample_block, 1, close_connection); + MySQLBlockInputStream block_input_stream(pool->get(), request, invalidate_sample_block, settings); return readInvalidateQuery(block_input_stream); } diff --git a/src/Dictionaries/MySQLDictionarySource.h b/src/Dictionaries/MySQLDictionarySource.h index dcd911146aa..ef1d81b862f 100644 --- a/src/Dictionaries/MySQLDictionarySource.h +++ 
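For the mysql dictionary source, the close_connection/share_connection and fail_on_connection_loss options are now folded into one StreamSettings object rather than kept as separate members. A sketch of that mapping under simplified types (MySQLSourceConfig and StreamSettingsLike are stand-ins, not the real structs):

#include <cstddef>
#include <iostream>

/// Values as they would be read from the dictionary configuration (illustrative).
struct MySQLSourceConfig
{
    bool close_connection = false;
    bool share_connection = false;
    bool fail_on_connection_loss = false;
};

/// Stand-in for the real StreamSettings.
struct StreamSettingsLike
{
    bool auto_close = false;
    std::size_t num_tries_on_connection_loss = 3;   // default retry budget, as in the patch
};

StreamSettingsLike makeStreamSettings(const MySQLSourceConfig & cfg)
{
    StreamSettingsLike s;
    /// Either option means the stream should not keep the connection open after reading.
    s.auto_close = cfg.close_connection || cfg.share_connection;
    /// Failing fast means a single attempt; otherwise keep the default number of tries.
    s.num_tries_on_connection_loss = cfg.fail_on_connection_loss ? 1 : s.num_tries_on_connection_loss;
    return s;
}

int main()
{
    MySQLSourceConfig cfg;
    cfg.share_connection = true;
    cfg.fail_on_connection_loss = true;
    auto s = makeStreamSettings(cfg);
    std::cout << s.auto_close << ' ' << s.num_tries_on_connection_loss << '\n';   // prints "1 1"
}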
b/src/Dictionaries/MySQLDictionarySource.h @@ -12,7 +12,7 @@ # include "DictionaryStructure.h" # include "ExternalQueryBuilder.h" # include "IDictionarySource.h" - +# include namespace Poco { @@ -35,7 +35,8 @@ public: const DictionaryStructure & dict_struct_, const Poco::Util::AbstractConfiguration & config, const String & config_prefix, - const Block & sample_block_); + const Block & sample_block_, + const StreamSettings & settings_); /// copy-constructor is provided in order to support cloneability MySQLDictionarySource(const MySQLDictionarySource & other); @@ -87,8 +88,7 @@ private: LocalDateTime last_modification; std::string invalidate_query; mutable std::string invalidate_query_response; - const bool close_connection; - const size_t max_tries_for_mysql_block_input_stream; + const StreamSettings settings; }; } diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index f101de340f1..95034b8e107 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -1,7 +1,6 @@ #include "DiskCacheWrapper.h" #include #include -#include #include namespace DB @@ -114,7 +113,7 @@ DiskCacheWrapper::readFile( if (!cache_file_predicate(path)) return DiskDecorator::readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); - LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Read file {} from cache", backQuote(path)); + LOG_DEBUG(log, "Read file {} from cache", backQuote(path)); if (cache_disk->exists(path)) return cache_disk->readFile(path, buf_size, estimated_size, aio_threshold, mmap_threshold, mmap_cache); @@ -128,11 +127,11 @@ DiskCacheWrapper::readFile( { /// This thread will responsible for file downloading to cache. metadata->status = DOWNLOADING; - LOG_DEBUG(&Poco::Logger::get("DiskCache"), "File {} doesn't exist in cache. Will download it", backQuote(path)); + LOG_DEBUG(log, "File {} doesn't exist in cache. Will download it", backQuote(path)); } else if (metadata->status == DOWNLOADING) { - LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Waiting for file {} download to cache", backQuote(path)); + LOG_DEBUG(log, "Waiting for file {} download to cache", backQuote(path)); metadata->condition.wait(lock, [metadata] { return metadata->status == DOWNLOADED || metadata->status == ERROR; }); } } @@ -157,7 +156,7 @@ DiskCacheWrapper::readFile( } cache_disk->moveFile(tmp_path, path); - LOG_DEBUG(&Poco::Logger::get("DiskCache"), "File {} downloaded to cache", backQuote(path)); + LOG_DEBUG(log, "File {} downloaded to cache", backQuote(path)); } catch (...) { @@ -186,7 +185,7 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode if (!cache_file_predicate(path)) return DiskDecorator::writeFile(path, buf_size, mode); - LOG_DEBUG(&Poco::Logger::get("DiskCache"), "Write file {} to cache", backQuote(path)); + LOG_DEBUG(log, "Write file {} to cache", backQuote(path)); auto dir_path = directoryPath(path); if (!cache_disk->exists(dir_path)) diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index 7e627b0c3c3..6d58394640f 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "DiskDecorator.h" #include "DiskLocal.h" @@ -63,6 +64,8 @@ private: mutable std::unordered_map> file_downloads; /// Protects concurrent downloading files to cache. 
mutable std::mutex mutex; + + Poco::Logger * log = &Poco::Logger::get("DiskCache"); }; } diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index 9d2bad0373d..d0cf6a00344 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -2,13 +2,13 @@ #include #include "DiskFactory.h" +#include #include #include #include -#include #include -#include + #include @@ -96,7 +96,7 @@ bool DiskLocal::tryReserve(UInt64 bytes) std::lock_guard lock(DiskLocal::reservation_mutex); if (bytes == 0) { - LOG_DEBUG(&Poco::Logger::get("DiskLocal"), "Reserving 0 bytes on disk {}", backQuote(name)); + LOG_DEBUG(log, "Reserving 0 bytes on disk {}", backQuote(name)); ++reservation_count; return true; } @@ -105,7 +105,7 @@ bool DiskLocal::tryReserve(UInt64 bytes) UInt64 unreserved_space = available_space - std::min(available_space, reserved_bytes); if (unreserved_space >= bytes) { - LOG_DEBUG(&Poco::Logger::get("DiskLocal"), "Reserving {} on disk {}, having unreserved {}.", + LOG_DEBUG(log, "Reserving {} on disk {}, having unreserved {}.", ReadableSize(bytes), backQuote(name), ReadableSize(unreserved_space)); ++reservation_count; reserved_bytes += bytes; @@ -339,7 +339,7 @@ DiskLocalReservation::~DiskLocalReservation() if (disk->reserved_bytes < size) { disk->reserved_bytes = 0; - LOG_ERROR(&Poco::Logger::get("DiskLocal"), "Unbalanced reservations size for disk '{}'.", disk->getName()); + LOG_ERROR(disk->log, "Unbalanced reservations size for disk '{}'.", disk->getName()); } else { @@ -347,7 +347,7 @@ DiskLocalReservation::~DiskLocalReservation() } if (disk->reservation_count == 0) - LOG_ERROR(&Poco::Logger::get("DiskLocal"), "Unbalanced reservation count for disk '{}'.", disk->getName()); + LOG_ERROR(disk->log, "Unbalanced reservation count for disk '{}'.", disk->getName()); else --disk->reservation_count; } diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index d957fc6f847..567ca24eb50 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -115,6 +116,8 @@ private: UInt64 reservation_count = 0; static std::mutex reservation_mutex; + + Poco::Logger * log = &Poco::Logger::get("DiskLocal"); }; } diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 1de4ab843ac..14d3e1e5d5e 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -2,9 +2,11 @@ #include "Disks/DiskFactory.h" +#include #include #include #include +#include #include #include #include @@ -18,7 +20,6 @@ #include #include #include -#include #include #include @@ -249,8 +250,12 @@ class ReadIndirectBufferFromS3 final : public ReadBufferFromFileBase { public: ReadIndirectBufferFromS3( - std::shared_ptr client_ptr_, const String & bucket_, DiskS3::Metadata metadata_, size_t buf_size_) - : client_ptr(std::move(client_ptr_)), bucket(bucket_), metadata(std::move(metadata_)), buf_size(buf_size_) + std::shared_ptr client_ptr_, const String & bucket_, DiskS3::Metadata metadata_, UInt64 s3_max_single_read_retries_, size_t buf_size_) + : client_ptr(std::move(client_ptr_)) + , bucket(bucket_) + , metadata(std::move(metadata_)) + , s3_max_single_read_retries(s3_max_single_read_retries_) + , buf_size(buf_size_) { } @@ -306,7 +311,7 @@ private: const auto & [path, size] = metadata.s3_objects[i]; if (size > offset) { - auto buf = std::make_unique(client_ptr, bucket, metadata.s3_root_path + path, buf_size); + auto buf = std::make_unique(client_ptr, bucket, metadata.s3_root_path + path, s3_max_single_read_retries, buf_size); buf->seek(offset, 
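Several Disk classes now hold a Poco::Logger * log member initialised once, instead of calling Poco::Logger::get("DiskLocal")/get("DiskS3") at every LOG_* call site. The idea in miniature, with a toy logger registry so the sketch stays self-contained (Logger here is not Poco's class):

#include <iostream>
#include <map>
#include <string>

/// Toy registry standing in for Poco::Logger::get(): one logger object per name.
struct Logger
{
    std::string name;

    void debug(const std::string & msg) const { std::cout << '[' << name << "] " << msg << '\n'; }

    static Logger * get(const std::string & logger_name)
    {
        static std::map<std::string, Logger> registry;
        Logger & logger = registry[logger_name];
        logger.name = logger_name;
        return &logger;
    }
};

class DiskLocalLike
{
public:
    void reserve(unsigned bytes) const
    {
        /// The logger was looked up once per object, not once per log statement.
        log->debug("Reserving " + std::to_string(bytes) + " bytes");
    }

private:
    Logger * log = Logger::get("DiskLocal");
};

int main()
{
    DiskLocalLike disk;
    disk.reserve(0);
}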
SEEK_SET); return buf; } @@ -335,7 +340,7 @@ private: ++current_buf_idx; const auto & path = metadata.s3_objects[current_buf_idx].first; - current_buf = std::make_unique(client_ptr, bucket, metadata.s3_root_path + path, buf_size); + current_buf = std::make_unique(client_ptr, bucket, metadata.s3_root_path + path, s3_max_single_read_retries, buf_size); current_buf->next(); working_buffer = current_buf->buffer(); absolute_position += working_buffer.size(); @@ -346,6 +351,7 @@ private: std::shared_ptr client_ptr; const String & bucket; DiskS3::Metadata metadata; + UInt64 s3_max_single_read_retries; size_t buf_size; size_t absolute_position = 0; @@ -491,7 +497,7 @@ public: if (disk->reserved_bytes < size) { disk->reserved_bytes = 0; - LOG_ERROR(&Poco::Logger::get("DiskLocal"), "Unbalanced reservations size for disk '{}'.", disk->getName()); + LOG_ERROR(disk->log, "Unbalanced reservations size for disk '{}'.", disk->getName()); } else { @@ -499,7 +505,7 @@ public: } if (disk->reservation_count == 0) - LOG_ERROR(&Poco::Logger::get("DiskLocal"), "Unbalanced reservation count for disk '{}'.", disk->getName()); + LOG_ERROR(disk->log, "Unbalanced reservation count for disk '{}'.", disk->getName()); else --disk->reservation_count; } @@ -535,7 +541,7 @@ public: } catch (...) { - tryLogCurrentException(&Poco::Logger::get("DiskS3"), "Failed to run async task"); + tryLogCurrentException("DiskS3", "Failed to run async task"); try { @@ -559,6 +565,7 @@ DiskS3::DiskS3( String bucket_, String s3_root_path_, String metadata_path_, + UInt64 s3_max_single_read_retries_, size_t min_upload_part_size_, size_t max_single_part_upload_size_, size_t min_bytes_for_seek_, @@ -572,6 +579,7 @@ DiskS3::DiskS3( , bucket(std::move(bucket_)) , s3_root_path(std::move(s3_root_path_)) , metadata_path(std::move(metadata_path_)) + , s3_max_single_read_retries(s3_max_single_read_retries_) , min_upload_part_size(min_upload_part_size_) , max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) @@ -675,10 +683,10 @@ std::unique_ptr DiskS3::readFile(const String & path, si { auto metadata = readMeta(path); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Read from file by path: {}. Existing S3 objects: {}", + LOG_DEBUG(log, "Read from file by path: {}. Existing S3 objects: {}", backQuote(metadata_path + path), metadata.s3_objects.size()); - auto reader = std::make_unique(client, bucket, metadata, buf_size); + auto reader = std::make_unique(client, bucket, metadata, s3_max_single_read_retries, buf_size); return std::make_unique(std::move(reader), min_bytes_for_seek); } @@ -711,7 +719,7 @@ std::unique_ptr DiskS3::writeFile(const String & path, /// Save empty metadata to disk to have ability to get file size while buffer is not finalized. metadata.save(); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Write to file by path: {}. New S3 path: {}", backQuote(metadata_path + path), s3_root_path + s3_path); + LOG_DEBUG(log, "Write to file by path: {}. New S3 path: {}", backQuote(metadata_path + path), s3_root_path + s3_path); return std::make_unique( client, bucket, metadata, s3_path, object_metadata, min_upload_part_size, max_single_part_upload_size, buf_size); @@ -720,7 +728,7 @@ std::unique_ptr DiskS3::writeFile(const String & path, { auto metadata = readMeta(path); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Append to file by path: {}. New S3 path: {}. Existing S3 objects: {}.", + LOG_DEBUG(log, "Append to file by path: {}. New S3 path: {}. 
Existing S3 objects: {}.", backQuote(metadata_path + path), s3_root_path + s3_path, metadata.s3_objects.size()); return std::make_unique( @@ -730,7 +738,7 @@ std::unique_ptr DiskS3::writeFile(const String & path, void DiskS3::removeMeta(const String & path, AwsS3KeyKeeper & keys) { - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Remove file by path: {}", backQuote(metadata_path + path)); + LOG_DEBUG(log, "Remove file by path: {}", backQuote(metadata_path + path)); Poco::File file(metadata_path + path); @@ -762,7 +770,7 @@ void DiskS3::removeMeta(const String & path, AwsS3KeyKeeper & keys) if (e.code() == ErrorCodes::UNKNOWN_FORMAT) { LOG_WARNING( - &Poco::Logger::get("DiskS3"), + log, "Metadata file {} can't be read by reason: {}. Removing it forcibly.", backQuote(path), e.nested() ? e.nested()->message() : e.message()); @@ -846,7 +854,7 @@ bool DiskS3::tryReserve(UInt64 bytes) std::lock_guard lock(reservation_mutex); if (bytes == 0) { - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Reserving 0 bytes on s3 disk {}", backQuote(name)); + LOG_DEBUG(log, "Reserving 0 bytes on s3 disk {}", backQuote(name)); ++reservation_count; return true; } @@ -855,7 +863,7 @@ bool DiskS3::tryReserve(UInt64 bytes) UInt64 unreserved_space = available_space - std::min(available_space, reserved_bytes); if (unreserved_space >= bytes) { - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Reserving {} on disk {}, having unreserved {}.", + LOG_DEBUG(log, "Reserving {} on disk {}, having unreserved {}.", ReadableSize(bytes), backQuote(name), ReadableSize(unreserved_space)); ++reservation_count; reserved_bytes += bytes; @@ -940,40 +948,36 @@ void DiskS3::startup() if (!send_metadata) return; - LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting up disk {}", name); + LOG_INFO(log, "Starting up disk {}", name); if (readSchemaVersion(bucket, s3_root_path) < RESTORABLE_SCHEMA_VERSION) migrateToRestorableSchema(); findLastRevision(); - LOG_INFO(&Poco::Logger::get("DiskS3"), "Disk {} started up", name); + LOG_INFO(log, "Disk {} started up", name); } void DiskS3::findLastRevision() { - UInt64 l = 0, r = LATEST_REVISION; - while (l < r) + /// Construct revision number from high to low bits. + String revision; + revision.reserve(64); + for (int bit = 0; bit < 64; bit++) { - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Check revision in bounds {}-{}", l, r); + auto revision_prefix = revision + "1"; - auto revision = l + (r - l + 1) / 2; - if (revision == 0) - break; + LOG_DEBUG(log, "Check object exists with revision prefix {}", revision_prefix); - auto revision_str = revisionToString(revision); - - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Check object with revision {}", revision); - - /// Check file or operation with such revision exists. - if (checkObjectExists(bucket, s3_root_path + "r" + revision_str) - || checkObjectExists(bucket, s3_root_path + "operations/r" + revision_str)) - l = revision; + /// Check file or operation with such revision prefix exists. 
+ if (checkObjectExists(bucket, s3_root_path + "r" + revision_prefix) + || checkObjectExists(bucket, s3_root_path + "operations/r" + revision_prefix)) + revision += "1"; else - r = revision - 1; + revision += "0"; } - revision_counter = l; - LOG_INFO(&Poco::Logger::get("DiskS3"), "Found last revision number {} for disk {}", revision_counter, name); + revision_counter = static_cast(std::bitset<64>(revision).to_ullong()); + LOG_INFO(log, "Found last revision number {} for disk {}", revision_counter, name); } int DiskS3::readSchemaVersion(const String & source_bucket, const String & source_path) @@ -982,7 +986,7 @@ int DiskS3::readSchemaVersion(const String & source_bucket, const String & sourc if (!checkObjectExists(source_bucket, source_path + SCHEMA_VERSION_OBJECT)) return version; - ReadBufferFromS3 buffer (client, source_bucket, source_path + SCHEMA_VERSION_OBJECT); + ReadBufferFromS3 buffer(client, source_bucket, source_path + SCHEMA_VERSION_OBJECT, s3_max_single_read_retries); readIntText(version, buffer); return version; @@ -1010,7 +1014,7 @@ void DiskS3::updateObjectMetadata(const String & key, const ObjectMetadata & met void DiskS3::migrateFileToRestorableSchema(const String & path) { - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Migrate file {} to restorable schema", metadata_path + path); + LOG_DEBUG(log, "Migrate file {} to restorable schema", metadata_path + path); auto meta = readMeta(path); @@ -1027,7 +1031,7 @@ void DiskS3::migrateToRestorableSchemaRecursive(const String & path, Futures & r { checkStackSize(); /// This is needed to prevent stack overflow in case of cyclic symlinks. - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Migrate directory {} to restorable schema", metadata_path + path); + LOG_DEBUG(log, "Migrate directory {} to restorable schema", metadata_path + path); bool dir_contains_only_files = true; for (auto it = iterateDirectory(path); it->isValid(); it->next()) @@ -1070,7 +1074,7 @@ void DiskS3::migrateToRestorableSchema() { try { - LOG_INFO(&Poco::Logger::get("DiskS3"), "Start migration to restorable schema for disk {}", name); + LOG_INFO(log, "Start migration to restorable schema for disk {}", name); Futures results; @@ -1085,9 +1089,9 @@ void DiskS3::migrateToRestorableSchema() saveSchemaVersion(RESTORABLE_SCHEMA_VERSION); } - catch (const Exception & e) + catch (const Exception &) { - LOG_ERROR(&Poco::Logger::get("DiskS3"), "Failed to migrate to restorable schema. Code: {}, e.displayText() = {}, Stack trace:\n\n{}", e.code(), e.displayText(), e.getStackTraceString()); + tryLogCurrentException(log, fmt::format("Failed to migrate to restorable schema for disk {}", name)); throw; } @@ -1173,6 +1177,7 @@ struct DiskS3::RestoreInformation UInt64 revision = LATEST_REVISION; String source_bucket; String source_path; + bool detached = false; }; void DiskS3::readRestoreInformation(DiskS3::RestoreInformation & restore_information) @@ -1180,33 +1185,50 @@ void DiskS3::readRestoreInformation(DiskS3::RestoreInformation & restore_informa ReadBufferFromFile buffer(metadata_path + RESTORE_FILE_NAME, 512); buffer.next(); - /// Empty file - just restore all metadata. 
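findLastRevision() switches from a binary search over numeric revisions to reconstructing the revision bit by bit: for each of the 64 bits it probes whether any object exists whose key starts with the current binary prefix plus '1'. A standalone sketch of that prefix probing against an in-memory key set (the exists_with_prefix lambda stands in for checkObjectExists()):

#include <bitset>
#include <cstdint>
#include <iostream>
#include <set>
#include <string>

int main()
{
    // Pretend these revisions were recorded as 64-character binary strings (the new key format).
    std::set<std::string> keys;
    for (std::uint64_t rev : {5ULL, 42ULL, 1337ULL})
        keys.insert(std::bitset<64>(rev).to_string());

    // True if any stored key starts with the given binary prefix.
    auto exists_with_prefix = [&](const std::string & prefix)
    {
        auto it = keys.lower_bound(prefix);
        return it != keys.end() && it->compare(0, prefix.size(), prefix) == 0;
    };

    // Reconstruct the highest revision from the most significant bit down.
    std::string revision;
    revision.reserve(64);
    for (int bit = 0; bit < 64; ++bit)
        revision += exists_with_prefix(revision + "1") ? "1" : "0";

    std::uint64_t last_revision = std::bitset<64>(revision).to_ullong();
    std::cout << "last revision = " << last_revision << "\n";   // prints 1337
}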
- if (!buffer.hasPendingData()) - return; - try { - readIntText(restore_information.revision, buffer); - assertChar('\n', buffer); + std::map properties; - if (!buffer.hasPendingData()) - return; + while (buffer.hasPendingData()) + { + String property; + readText(property, buffer); + assertChar('\n', buffer); - readText(restore_information.source_bucket, buffer); - assertChar('\n', buffer); + auto pos = property.find('='); + if (pos == String::npos || pos == 0 || pos == property.length()) + throw Exception(fmt::format("Invalid property {} in restore file", property), ErrorCodes::UNKNOWN_FORMAT); - if (!buffer.hasPendingData()) - return; + auto key = property.substr(0, pos); + auto value = property.substr(pos + 1); - readText(restore_information.source_path, buffer); - assertChar('\n', buffer); + auto it = properties.find(key); + if (it != properties.end()) + throw Exception(fmt::format("Property key duplication {} in restore file", key), ErrorCodes::UNKNOWN_FORMAT); - if (buffer.hasPendingData()) - throw Exception("Extra information at the end of restore file", ErrorCodes::UNKNOWN_FORMAT); + properties[key] = value; + } + + for (const auto & [key, value] : properties) + { + ReadBufferFromString value_buffer (value); + + if (key == "revision") + readIntText(restore_information.revision, value_buffer); + else if (key == "source_bucket") + readText(restore_information.source_bucket, value_buffer); + else if (key == "source_path") + readText(restore_information.source_path, value_buffer); + else if (key == "detached") + readBoolTextWord(restore_information.detached, value_buffer); + else + throw Exception(fmt::format("Unknown key {} in restore file", key), ErrorCodes::UNKNOWN_FORMAT); + } } - catch (const Exception & e) + catch (const Exception &) { - throw Exception("Failed to read restore information", e, ErrorCodes::UNKNOWN_FORMAT); + tryLogCurrentException(log, "Failed to read restore information"); + throw; } } @@ -1239,43 +1261,43 @@ void DiskS3::restore() throw Exception("Restoring to the same bucket is allowed only if source path is not a sub-path of configured path in S3 disk", ErrorCodes::BAD_ARGUMENTS); } + LOG_INFO(log, "Starting to restore disk {}. Revision: {}, Source bucket: {}, Source path: {}", + name, information.revision, information.source_bucket, information.source_path); + if (readSchemaVersion(information.source_bucket, information.source_path) < RESTORABLE_SCHEMA_VERSION) throw Exception("Source bucket doesn't have restorable schema.", ErrorCodes::BAD_ARGUMENTS); - LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting to restore disk {}. 
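The restore file moves from a fixed positional layout to key=value lines (revision, source_bucket, source_path, detached), with unknown and duplicated keys rejected. A compact parser sketch over std::istream rather than the ReadBuffer used in the patch (RestoreInfo is an illustrative stand-in):

#include <cstdint>
#include <iostream>
#include <map>
#include <sstream>
#include <stdexcept>
#include <string>

struct RestoreInfo
{
    std::uint64_t revision = ~0ULL;   // "latest revision" by default, as in the patch
    std::string source_bucket;
    std::string source_path;
    bool detached = false;
};

RestoreInfo parseRestoreFile(std::istream & in)
{
    std::map<std::string, std::string> properties;

    std::string line;
    while (std::getline(in, line))
    {
        auto pos = line.find('=');
        if (pos == std::string::npos || pos == 0 || pos + 1 == line.size())
            throw std::runtime_error("Invalid property '" + line + "' in restore file");
        if (!properties.emplace(line.substr(0, pos), line.substr(pos + 1)).second)
            throw std::runtime_error("Property key duplication in restore file: " + line);
    }

    RestoreInfo info;
    for (const auto & [key, value] : properties)
    {
        if (key == "revision")
            info.revision = std::stoull(value);
        else if (key == "source_bucket")
            info.source_bucket = value;
        else if (key == "source_path")
            info.source_path = value;
        else if (key == "detached")
            info.detached = (value == "true" || value == "1");
        else
            throw std::runtime_error("Unknown key '" + key + "' in restore file");
    }
    return info;
}

int main()
{
    std::istringstream file("revision=42\nsource_bucket=backup\ndetached=true\n");
    auto info = parseRestoreFile(file);
    std::cout << info.revision << ' ' << info.source_bucket << ' ' << info.detached << '\n';
}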
Revision: {}, Source bucket: {}, Source path: {}", - name, information.revision, information.source_bucket, information.source_path); - - LOG_INFO(&Poco::Logger::get("DiskS3"), "Removing old metadata..."); + LOG_INFO(log, "Removing old metadata..."); bool cleanup_s3 = information.source_bucket != bucket || information.source_path != s3_root_path; for (const auto & root : data_roots) if (exists(root)) removeSharedRecursive(root + '/', !cleanup_s3); - restoreFiles(information.source_bucket, information.source_path, information.revision); - restoreFileOperations(information.source_bucket, information.source_path, information.revision); + restoreFiles(information); + restoreFileOperations(information); Poco::File restore_file(metadata_path + RESTORE_FILE_NAME); restore_file.remove(); saveSchemaVersion(RESTORABLE_SCHEMA_VERSION); - LOG_INFO(&Poco::Logger::get("DiskS3"), "Restore disk {} finished", name); + LOG_INFO(log, "Restore disk {} finished", name); } - catch (const Exception & e) + catch (const Exception &) { - LOG_ERROR(&Poco::Logger::get("DiskS3"), "Failed to restore disk. Code: {}, e.displayText() = {}, Stack trace:\n\n{}", e.code(), e.displayText(), e.getStackTraceString()); + tryLogCurrentException(log, fmt::format("Failed to restore disk {}", name)); throw; } } -void DiskS3::restoreFiles(const String & source_bucket, const String & source_path, UInt64 target_revision) +void DiskS3::restoreFiles(const RestoreInformation & restore_information) { - LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting restore files for disk {}", name); + LOG_INFO(log, "Starting restore files for disk {}", name); std::vector> results; - listObjects(source_bucket, source_path, [this, &source_bucket, &source_path, &target_revision, &results](auto list_result) + auto restore_files = [this, &restore_information, &results](auto list_result) { std::vector keys; for (const auto & row : list_result.GetContents()) @@ -1288,7 +1310,7 @@ void DiskS3::restoreFiles(const String & source_bucket, const String & source_pa const auto [revision, _] = extractRevisionAndOperationFromKey(key); /// Filter early if it's possible to get revision from key. - if (revision > target_revision) + if (revision > restore_information.revision) continue; keys.push_back(key); @@ -1296,23 +1318,26 @@ void DiskS3::restoreFiles(const String & source_bucket, const String & source_pa if (!keys.empty()) { - auto result = getExecutor().execute([this, &source_bucket, &source_path, keys]() + auto result = getExecutor().execute([this, &restore_information, keys]() { - processRestoreFiles(source_bucket, source_path, keys); + processRestoreFiles(restore_information.source_bucket, restore_information.source_path, keys); }); results.push_back(std::move(result)); } return true; - }); + }; + + /// Execute. + listObjects(restore_information.source_bucket, restore_information.source_path, restore_files); for (auto & result : results) result.wait(); for (auto & result : results) result.get(); - LOG_INFO(&Poco::Logger::get("DiskS3"), "Files are restored for disk {}", name); + LOG_INFO(log, "Files are restored for disk {}", name); } void DiskS3::processRestoreFiles(const String & source_bucket, const String & source_path, Strings keys) @@ -1327,7 +1352,7 @@ void DiskS3::processRestoreFiles(const String & source_bucket, const String & so if (path_entry == object_metadata.end()) { /// Such keys can remain after migration, we can skip them. 
- LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} because it doesn't have 'path' in metadata", key); + LOG_WARNING(log, "Skip key {} because it doesn't have 'path' in metadata", key); continue; } @@ -1344,18 +1369,19 @@ void DiskS3::processRestoreFiles(const String & source_bucket, const String & so metadata.addObject(relative_key, head_result.GetContentLength()); metadata.save(); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Restored file {}", path); + LOG_DEBUG(log, "Restored file {}", path); } } -void DiskS3::restoreFileOperations(const String & source_bucket, const String & source_path, UInt64 target_revision) +void DiskS3::restoreFileOperations(const RestoreInformation & restore_information) { - LOG_INFO(&Poco::Logger::get("DiskS3"), "Starting restore file operations for disk {}", name); + LOG_INFO(log, "Starting restore file operations for disk {}", name); /// Enable recording file operations if we restore to different bucket / path. - send_metadata = bucket != source_bucket || s3_root_path != source_path; + send_metadata = bucket != restore_information.source_bucket || s3_root_path != restore_information.source_path; - listObjects(source_bucket, source_path + "operations/", [this, &source_bucket, &target_revision](auto list_result) + std::set renames; + auto restore_file_operations = [this, &restore_information, &renames](auto list_result) { const String rename = "rename"; const String hardlink = "hardlink"; @@ -1367,20 +1393,20 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & const auto [revision, operation] = extractRevisionAndOperationFromKey(key); if (revision == UNKNOWN_REVISION) { - LOG_WARNING(&Poco::Logger::get("DiskS3"), "Skip key {} with unknown revision", key); + LOG_WARNING(log, "Skip key {} with unknown revision", key); continue; } /// S3 ensures that keys will be listed in ascending UTF-8 bytes order (revision order). /// We can stop processing if revision of the object is already more than required. - if (revision > target_revision) + if (revision > restore_information.revision) return false; /// Keep original revision if restore to different bucket / path. if (send_metadata) revision_counter = revision - 1; - auto object_metadata = headObject(source_bucket, key).GetMetadata(); + auto object_metadata = headObject(restore_information.source_bucket, key).GetMetadata(); if (operation == rename) { auto from_path = object_metadata["from_path"]; @@ -1388,7 +1414,23 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & if (exists(from_path)) { moveFile(from_path, to_path); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Revision {}. Restored rename {} -> {}", revision, from_path, to_path); + LOG_DEBUG(log, "Revision {}. Restored rename {} -> {}", revision, from_path, to_path); + + if (restore_information.detached && isDirectory(to_path)) + { + /// Sometimes directory paths are passed without trailing '/'. We should keep them in one consistent way. + if (!from_path.ends_with('/')) + from_path += '/'; + if (!to_path.ends_with('/')) + to_path += '/'; + + /// Always keep latest actual directory path to avoid 'detaching' not existing paths. 
+ auto it = renames.find(from_path); + if (it != renames.end()) + renames.erase(it); + + renames.insert(to_path); + } } } else if (operation == hardlink) @@ -1399,27 +1441,55 @@ void DiskS3::restoreFileOperations(const String & source_bucket, const String & { createDirectories(directoryPath(dst_path)); createHardLink(src_path, dst_path); - LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Revision {}. Restored hardlink {} -> {}", revision, src_path, dst_path); + LOG_DEBUG(log, "Revision {}. Restored hardlink {} -> {}", revision, src_path, dst_path); } } } return true; - }); + }; + + /// Execute. + listObjects(restore_information.source_bucket, restore_information.source_path + "operations/", restore_file_operations); + + if (restore_information.detached) + { + Strings not_finished_prefixes{"tmp_", "delete_tmp_", "attaching_", "deleting_"}; + + for (const auto & path : renames) + { + /// Skip already detached parts. + if (path.find("/detached/") != std::string::npos) + continue; + + /// Skip not finished parts. They shouldn't be in 'detached' directory, because CH wouldn't be able to finish processing them. + Poco::Path directory_path (path); + auto directory_name = directory_path.directory(directory_path.depth() - 1); + auto predicate = [&directory_name](String & prefix) { return directory_name.starts_with(prefix); }; + if (std::any_of(not_finished_prefixes.begin(), not_finished_prefixes.end(), predicate)) + continue; + + auto detached_path = pathToDetached(path); + + LOG_DEBUG(log, "Move directory to 'detached' {} -> {}", path, detached_path); + + Poco::File(metadata_path + path).moveTo(metadata_path + detached_path); + } + } send_metadata = true; - LOG_INFO(&Poco::Logger::get("DiskS3"), "File operations restored for disk {}", name); + LOG_INFO(log, "File operations restored for disk {}", name); } std::tuple DiskS3::extractRevisionAndOperationFromKey(const String & key) { - UInt64 revision = UNKNOWN_REVISION; + String revision_str; String operation; - re2::RE2::FullMatch(key, key_regexp, &revision, &operation); + re2::RE2::FullMatch(key, key_regexp, &revision_str, &operation); - return {revision, operation}; + return {(revision_str.empty() ? UNKNOWN_REVISION : static_cast(std::bitset<64>(revision_str).to_ullong())), operation}; } String DiskS3::shrinkKey(const String & path, const String & key) @@ -1432,15 +1502,12 @@ String DiskS3::shrinkKey(const String & path, const String & key) String DiskS3::revisionToString(UInt64 revision) { - static constexpr size_t max_digits = 19; /// UInt64 max digits in decimal representation. + return std::bitset<64>(revision).to_string(); +} - /// Align revision number with leading zeroes to have strict lexicographical order of them. 
- auto revision_str = std::to_string(revision); - auto digits_to_align = max_digits - revision_str.length(); - for (size_t i = 0; i < digits_to_align; ++i) - revision_str = "0" + revision_str; - - return revision_str; +String DiskS3::pathToDetached(const String & source_path) +{ + return Poco::Path(source_path).parent().append(Poco::Path("detached")).toString() + '/'; } void DiskS3::onFreeze(const String & path) diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 87aab71fc44..758d4055a3e 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "Disks/DiskFactory.h" #include "Disks/Executor.h" #include "ProxyConfiguration.h" @@ -40,6 +41,7 @@ public: String bucket_, String s3_root_path_, String metadata_path_, + UInt64 s3_max_single_read_retries_, size_t min_upload_part_size_, size_t max_single_part_upload_size_, size_t min_bytes_for_seek_, @@ -148,6 +150,7 @@ private: Metadata createMeta(const String & path) const; void createFileOperationObject(const String & operation_name, UInt64 revision, const ObjectMetadata & metadata); + /// Converts revision to binary string with leading zeroes (64 bit). static String revisionToString(UInt64 revision); bool checkObjectExists(const String & source_bucket, const String & prefix); @@ -165,21 +168,25 @@ private: void copyObject(const String & src_bucket, const String & src_key, const String & dst_bucket, const String & dst_key); void readRestoreInformation(RestoreInformation & restore_information); - void restoreFiles(const String & source_bucket, const String & source_path, UInt64 target_revision); + void restoreFiles(const RestoreInformation & restore_information); void processRestoreFiles(const String & source_bucket, const String & source_path, std::vector keys); - void restoreFileOperations(const String & source_bucket, const String & source_path, UInt64 target_revision); + void restoreFileOperations(const RestoreInformation & restore_information); /// Remove 'path' prefix from 'key' to get relative key. /// It's needed to store keys to metadata files in RELATIVE_PATHS version. static String shrinkKey(const String & path, const String & key); std::tuple extractRevisionAndOperationFromKey(const String & key); + /// Forms detached path '../../detached/part_name/' from '../../part_name/' + static String pathToDetached(const String & source_path); + const String name; std::shared_ptr client; std::shared_ptr proxy_configuration; const String bucket; const String s3_root_path; String metadata_path; + UInt64 s3_max_single_read_retries; size_t min_upload_part_size; size_t max_single_part_upload_size; size_t min_bytes_for_seek; @@ -207,6 +214,8 @@ private: static constexpr int RESTORABLE_SCHEMA_VERSION = 1; /// Directories with data. 
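revisionToString() now produces a fixed-width 64-character binary string and extractRevisionAndOperationFromKey() parses it back via std::bitset; the fixed width is what keeps S3's lexicographic key order aligned with numeric revision order. A short round-trip sketch of that encoding:

#include <bitset>
#include <cassert>
#include <cstdint>
#include <iostream>
#include <string>

std::string revisionToString(std::uint64_t revision)
{
    // Fixed 64-character width, so lexicographic order of keys == numeric order of revisions.
    return std::bitset<64>(revision).to_string();
}

std::uint64_t revisionFromString(const std::string & revision_str)
{
    return static_cast<std::uint64_t>(std::bitset<64>(revision_str).to_ullong());
}

int main()
{
    std::uint64_t a = 7, b = 40;
    assert(revisionToString(a) < revisionToString(b));        // ordering preserved
    assert(revisionFromString(revisionToString(b)) == b);     // lossless round trip
    std::cout << revisionToString(a) << '\n';
}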
const std::vector data_roots {"data", "store"}; + + Poco::Logger * log = &Poco::Logger::get("DiskS3"); }; } diff --git a/src/Disks/S3/registerDiskS3.cpp b/src/Disks/S3/registerDiskS3.cpp index 97acec5d632..352f7467ba0 100644 --- a/src/Disks/S3/registerDiskS3.cpp +++ b/src/Disks/S3/registerDiskS3.cpp @@ -148,7 +148,8 @@ void registerDiskS3(DiskFactory & factory) config.getString(config_prefix + ".secret_access_key", ""), config.getString(config_prefix + ".server_side_encryption_customer_key_base64", ""), {}, - config.getBool(config_prefix + ".use_environment_credentials", config.getBool("s3.use_environment_credentials", false)) + config.getBool(config_prefix + ".use_environment_credentials", config.getBool("s3.use_environment_credentials", false)), + config.getBool(config_prefix + ".use_insecure_imds_request", config.getBool("s3.use_insecure_imds_request", false)) ); String metadata_path = config.getString(config_prefix + ".metadata_path", context->getPath() + "disks/" + name + "/"); @@ -160,6 +161,7 @@ void registerDiskS3(DiskFactory & factory) uri.bucket, uri.key, metadata_path, + context->getSettingsRef().s3_max_single_read_retries, context->getSettingsRef().s3_min_upload_part_size, context->getSettingsRef().s3_max_single_part_upload_size, config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), diff --git a/src/Disks/tests/gtest_disk.cpp b/src/Disks/tests/gtest_disk.cpp index 525b5e6ce38..3b9dca63002 100644 --- a/src/Disks/tests/gtest_disk.cpp +++ b/src/Disks/tests/gtest_disk.cpp @@ -4,7 +4,7 @@ #include #include "gtest_disk.h" -#if !__clang__ +#if !defined(__clang__) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wsuggest-override" #endif diff --git a/src/Formats/MySQLBlockInputStream.cpp b/src/Formats/MySQLBlockInputStream.cpp index e2620c71fbb..be0cb31f22d 100644 --- a/src/Formats/MySQLBlockInputStream.cpp +++ b/src/Formats/MySQLBlockInputStream.cpp @@ -30,6 +30,15 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } +StreamSettings::StreamSettings(const Settings & settings, bool auto_close_, bool fetch_by_name_, size_t max_retry_) + : max_read_mysql_row_nums((settings.external_storage_max_read_rows) ? 
settings.external_storage_max_read_rows : settings.max_block_size) + , max_read_mysql_bytes_size(settings.external_storage_max_read_bytes) + , auto_close(auto_close_) + , fetch_by_name(fetch_by_name_) + , default_num_tries_on_connection_loss(max_retry_) +{ +} + MySQLBlockInputStream::Connection::Connection( const mysqlxx::PoolWithFailover::Entry & entry_, const std::string & query_str) @@ -44,29 +53,19 @@ MySQLBlockInputStream::MySQLBlockInputStream( const mysqlxx::PoolWithFailover::Entry & entry, const std::string & query_str, const Block & sample_block, - const UInt64 max_block_size_, - const bool auto_close_, - const bool fetch_by_name_) + const StreamSettings & settings_) : log(&Poco::Logger::get("MySQLBlockInputStream")) , connection{std::make_unique(entry, query_str)} - , max_block_size{max_block_size_} - , auto_close{auto_close_} - , fetch_by_name(fetch_by_name_) + , settings{std::make_unique(settings_)} { description.init(sample_block); initPositionMappingFromQueryResultStructure(); } /// For descendant MySQLWithFailoverBlockInputStream -MySQLBlockInputStream::MySQLBlockInputStream( - const Block & sample_block_, - UInt64 max_block_size_, - bool auto_close_, - bool fetch_by_name_) + MySQLBlockInputStream::MySQLBlockInputStream(const Block &sample_block_, const StreamSettings & settings_) : log(&Poco::Logger::get("MySQLBlockInputStream")) - , max_block_size(max_block_size_) - , auto_close(auto_close_) - , fetch_by_name(fetch_by_name_) + , settings(std::make_unique(settings_)) { description.init(sample_block_); } @@ -76,14 +75,10 @@ MySQLWithFailoverBlockInputStream::MySQLWithFailoverBlockInputStream( mysqlxx::PoolWithFailoverPtr pool_, const std::string & query_str_, const Block & sample_block_, - const UInt64 max_block_size_, - const bool auto_close_, - const bool fetch_by_name_, - const size_t max_tries_) - : MySQLBlockInputStream(sample_block_, max_block_size_, auto_close_, fetch_by_name_) - , pool(pool_) - , query_str(query_str_) - , max_tries(max_tries_) + const StreamSettings & settings_) +: MySQLBlockInputStream(sample_block_, settings_) +, pool(pool_) +, query_str(query_str_) { } @@ -101,12 +96,12 @@ void MySQLWithFailoverBlockInputStream::readPrefix() } catch (const mysqlxx::ConnectionLost & ecl) /// There are two retriable failures: CR_SERVER_GONE_ERROR, CR_SERVER_LOST { - LOG_WARNING(log, "Failed connection ({}/{}). Trying to reconnect... (Info: {})", count_connect_attempts, max_tries, ecl.displayText()); + LOG_WARNING(log, "Failed connection ({}/{}). Trying to reconnect... (Info: {})", count_connect_attempts, settings->default_num_tries_on_connection_loss, ecl.displayText()); } - if (++count_connect_attempts > max_tries) + if (++count_connect_attempts > settings->default_num_tries_on_connection_loss) { - LOG_ERROR(log, "Failed to create connection to MySQL. ({}/{})", count_connect_attempts, max_tries); + LOG_ERROR(log, "Failed to create connection to MySQL. 
({}/{})", count_connect_attempts, settings->default_num_tries_on_connection_loss); throw; } } @@ -118,45 +113,57 @@ namespace { using ValueType = ExternalResultDescription::ValueType; - void insertValue(const IDataType & data_type, IColumn & column, const ValueType type, const mysqlxx::Value & value) + void insertValue(const IDataType & data_type, IColumn & column, const ValueType type, const mysqlxx::Value & value, size_t & read_bytes_size) { switch (type) { case ValueType::vtUInt8: assert_cast(column).insertValue(value.getUInt()); + read_bytes_size += 1; break; case ValueType::vtUInt16: assert_cast(column).insertValue(value.getUInt()); + read_bytes_size += 2; break; case ValueType::vtUInt32: assert_cast(column).insertValue(value.getUInt()); + read_bytes_size += 4; break; case ValueType::vtUInt64: assert_cast(column).insertValue(value.getUInt()); + read_bytes_size += 8; break; case ValueType::vtInt8: assert_cast(column).insertValue(value.getInt()); + read_bytes_size += 1; break; case ValueType::vtInt16: assert_cast(column).insertValue(value.getInt()); + read_bytes_size += 2; break; case ValueType::vtInt32: assert_cast(column).insertValue(value.getInt()); + read_bytes_size += 4; break; case ValueType::vtInt64: assert_cast(column).insertValue(value.getInt()); + read_bytes_size += 8; break; case ValueType::vtFloat32: assert_cast(column).insertValue(value.getDouble()); + read_bytes_size += 4; break; case ValueType::vtFloat64: assert_cast(column).insertValue(value.getDouble()); + read_bytes_size += 8; break; case ValueType::vtString: assert_cast(column).insertData(value.data(), value.size()); + read_bytes_size += assert_cast(column).byteSize(); break; case ValueType::vtDate: assert_cast(column).insertValue(UInt16(value.getDate().getDayNum())); + read_bytes_size += 2; break; case ValueType::vtDateTime: { @@ -166,10 +173,12 @@ namespace if (time < 0) time = 0; assert_cast(column).insertValue(time); + read_bytes_size += 4; break; } case ValueType::vtUUID: assert_cast(column).insert(parse(value.data(), value.size())); + read_bytes_size += assert_cast(column).byteSize(); break; case ValueType::vtDateTime64:[[fallthrough]]; case ValueType::vtDecimal32: [[fallthrough]]; @@ -179,10 +188,12 @@ namespace { ReadBuffer buffer(const_cast(value.data()), value.size(), 0); data_type.getDefaultSerialization()->deserializeWholeText(column, buffer, FormatSettings{}); + read_bytes_size += column.sizeOfValueIfFixed(); break; } case ValueType::vtFixedString: assert_cast(column).insertData(value.data(), value.size()); + read_bytes_size += column.sizeOfValueIfFixed(); break; default: throw Exception("Unsupported value type", ErrorCodes::NOT_IMPLEMENTED); @@ -198,7 +209,7 @@ Block MySQLBlockInputStream::readImpl() auto row = connection->result.fetch(); if (!row) { - if (auto_close) + if (settings->auto_close) connection->entry.disconnect(); return {}; @@ -209,6 +220,8 @@ Block MySQLBlockInputStream::readImpl() columns[i] = description.sample_block.getByPosition(i).column->cloneEmpty(); size_t num_rows = 0; + size_t read_bytes_size = 0; + while (row) { for (size_t index = 0; index < position_mapping.size(); ++index) @@ -224,12 +237,12 @@ Block MySQLBlockInputStream::readImpl() { ColumnNullable & column_nullable = assert_cast(*columns[index]); const auto & data_type = assert_cast(*sample.type); - insertValue(*data_type.getNestedType(), column_nullable.getNestedColumn(), description.types[index].first, value); + insertValue(*data_type.getNestedType(), column_nullable.getNestedColumn(), description.types[index].first, 
value, read_bytes_size); column_nullable.getNullMapData().emplace_back(false); } else { - insertValue(*sample.type, *columns[index], description.types[index].first, value); + insertValue(*sample.type, *columns[index], description.types[index].first, value, read_bytes_size); } } else @@ -245,7 +258,7 @@ Block MySQLBlockInputStream::readImpl() } ++num_rows; - if (num_rows == max_block_size) + if (num_rows == settings->max_read_mysql_row_nums || (settings->max_read_mysql_bytes_size && read_bytes_size >= settings->max_read_mysql_bytes_size)) break; row = connection->result.fetch(); @@ -257,7 +270,7 @@ void MySQLBlockInputStream::initPositionMappingFromQueryResultStructure() { position_mapping.resize(description.sample_block.columns()); - if (!fetch_by_name) + if (!settings->fetch_by_name) { if (description.sample_block.columns() != connection->result.getNumFields()) throw Exception{"mysqlxx::UseQueryResult contains " + toString(connection->result.getNumFields()) + " columns while " diff --git a/src/Formats/MySQLBlockInputStream.h b/src/Formats/MySQLBlockInputStream.h index 86380c1597a..12deb9c3146 100644 --- a/src/Formats/MySQLBlockInputStream.h +++ b/src/Formats/MySQLBlockInputStream.h @@ -6,11 +6,24 @@ #include #include #include - +#include namespace DB { +struct StreamSettings +{ + /// Check if setting is enabled, otherwise use common `max_block_size` setting. + size_t max_read_mysql_row_nums; + size_t max_read_mysql_bytes_size; + bool auto_close; + bool fetch_by_name; + size_t default_num_tries_on_connection_loss; + + StreamSettings(const Settings & settings, bool auto_close_ = false, bool fetch_by_name_ = false, size_t max_retry_ = 5); + +}; + /// Allows processing results of a MySQL query as a sequence of Blocks, simplifies chaining class MySQLBlockInputStream : public IBlockInputStream { @@ -19,16 +32,14 @@ public: const mysqlxx::PoolWithFailover::Entry & entry, const std::string & query_str, const Block & sample_block, - const UInt64 max_block_size_, - const bool auto_close_ = false, - const bool fetch_by_name_ = false); + const StreamSettings & settings_); String getName() const override { return "MySQL"; } Block getHeader() const override { return description.sample_block.cloneEmpty(); } protected: - MySQLBlockInputStream(const Block & sample_block_, UInt64 max_block_size_, bool auto_close_, bool fetch_by_name_); + MySQLBlockInputStream(const Block & sample_block_, const StreamSettings & settings); Block readImpl() override; void initPositionMappingFromQueryResultStructure(); @@ -44,9 +55,7 @@ protected: Poco::Logger * log; std::unique_ptr connection; - const UInt64 max_block_size; - const bool auto_close; - const bool fetch_by_name; + const std::unique_ptr settings; std::vector position_mapping; ExternalResultDescription description; }; @@ -57,23 +66,18 @@ protected: class MySQLWithFailoverBlockInputStream final : public MySQLBlockInputStream { public: - static constexpr inline auto MAX_TRIES_MYSQL_CONNECT = 5; MySQLWithFailoverBlockInputStream( mysqlxx::PoolWithFailoverPtr pool_, const std::string & query_str_, const Block & sample_block_, - const UInt64 max_block_size_, - const bool auto_close_ = false, - const bool fetch_by_name_ = false, - const size_t max_tries_ = MAX_TRIES_MYSQL_CONNECT); + const StreamSettings & settings_); private: void readPrefix() override; mysqlxx::PoolWithFailoverPtr pool; std::string query_str; - size_t max_tries; }; } diff --git a/src/Functions/FunctionCustomWeekToSomething.h b/src/Functions/FunctionCustomWeekToSomething.h index 
4050bb512bd..c6f56fdc50a 100644 --- a/src/Functions/FunctionCustomWeekToSomething.h +++ b/src/Functions/FunctionCustomWeekToSomething.h @@ -5,7 +5,6 @@ #include #include #include -#include #include diff --git a/src/Functions/GatherUtils/Sources.h b/src/Functions/GatherUtils/Sources.h index e3eb5f6df75..4dbaff9f567 100644 --- a/src/Functions/GatherUtils/Sources.h +++ b/src/Functions/GatherUtils/Sources.h @@ -140,7 +140,7 @@ struct NumericArraySource : public ArraySourceImpl> /// The methods can be virtual or not depending on the template parameter. See IStringSource. -#if !__clang__ +#if !defined(__clang__) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wsuggest-override" #elif __clang_major__ >= 11 @@ -233,7 +233,7 @@ struct ConstSource : public Base } }; -#if !__clang__ || __clang_major__ >= 11 +#if !defined(__clang__) || __clang_major__ >= 11 # pragma GCC diagnostic pop #endif diff --git a/src/Functions/PolygonUtils.h b/src/Functions/PolygonUtils.h index c8e96f0b63a..27ee1d2a99f 100644 --- a/src/Functions/PolygonUtils.h +++ b/src/Functions/PolygonUtils.h @@ -12,7 +12,7 @@ /// Warning in boost::geometry during template strategy substitution. #pragma GCC diagnostic push -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -285,7 +285,7 @@ void PointInPolygonWithGrid::calcGridAttributes( const Point & max_corner = box.max_corner(); #pragma GCC diagnostic push -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif @@ -322,7 +322,7 @@ void PointInPolygonWithGrid::buildGrid() for (size_t row = 0; row < grid_size; ++row) { #pragma GCC diagnostic push -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif CoordinateType y_min = min_corner.y() + row * cell_height; diff --git a/src/Functions/extractTimeZoneFromFunctionArguments.cpp b/src/Functions/extractTimeZoneFromFunctionArguments.cpp index 0ba08b3c612..dbc014ad2eb 100644 --- a/src/Functions/extractTimeZoneFromFunctionArguments.cpp +++ b/src/Functions/extractTimeZoneFromFunctionArguments.cpp @@ -44,9 +44,9 @@ std::string extractTimeZoneNameFromFunctionArguments(const ColumnsWithTypeAndNam /// If time zone is attached to an argument of type DateTime. if (const auto * type = checkAndGetDataType(arguments[datetime_arg_num].type.get())) - return type->getTimeZone().getTimeZone(); + return type->hasExplicitTimeZone() ? type->getTimeZone().getTimeZone() : std::string(); if (const auto * type = checkAndGetDataType(arguments[datetime_arg_num].type.get())) - return type->getTimeZone().getTimeZone(); + return type->hasExplicitTimeZone() ? type->getTimeZone().getTimeZone() : std::string(); return {}; } diff --git a/src/Functions/extractTimeZoneFromFunctionArguments.h b/src/Functions/extractTimeZoneFromFunctionArguments.h index 6323b45f568..9dd3fdd4903 100644 --- a/src/Functions/extractTimeZoneFromFunctionArguments.h +++ b/src/Functions/extractTimeZoneFromFunctionArguments.h @@ -13,6 +13,7 @@ namespace DB class Block; /// Determine working timezone either from optional argument with time zone name or from time zone in DateTime type of argument. +/// Returns empty string if default time zone should be used. 
 std::string extractTimeZoneNameFromFunctionArguments(
     const ColumnsWithTypeAndName & arguments, size_t time_zone_arg_num, size_t datetime_arg_num);
diff --git a/src/Functions/registerFunctions.cpp b/src/Functions/registerFunctions.cpp
index d827cc40a86..6db0f9183b2 100644
--- a/src/Functions/registerFunctions.cpp
+++ b/src/Functions/registerFunctions.cpp
@@ -45,6 +45,7 @@ void registerFunctionsUnixTimestamp64(FunctionFactory & factory);
 void registerFunctionBitHammingDistance(FunctionFactory & factory);
 void registerFunctionTupleHammingDistance(FunctionFactory & factory);
 void registerFunctionsStringHash(FunctionFactory & factory);
+void registerFunctionValidateNestedArraySizes(FunctionFactory & factory);
 #if !defined(ARCADIA_BUILD)
 void registerFunctionBayesAB(FunctionFactory &);
 #endif
@@ -103,6 +104,7 @@ void registerFunctions()
     registerFunctionBitHammingDistance(factory);
     registerFunctionTupleHammingDistance(factory);
     registerFunctionsStringHash(factory);
+    registerFunctionValidateNestedArraySizes(factory);
 #if !defined(ARCADIA_BUILD)
     registerFunctionBayesAB(factory);
diff --git a/src/Functions/validateNestedArraySizes.cpp b/src/Functions/validateNestedArraySizes.cpp
new file mode 100644
index 00000000000..b589b1e2e5c
--- /dev/null
+++ b/src/Functions/validateNestedArraySizes.cpp
@@ -0,0 +1,113 @@
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+    extern const int SIZES_OF_ARRAYS_DOESNT_MATCH;
+}
+
+/** Function validateNestedArraySizes checks that the offsets of the subcolumns of a Nested data type stay consistent during UPDATE.
+  * Arguments: num > 2.
+  * The first argument is the WHERE condition of the UPDATE operation; the check is performed only for rows where it is true.
+  * The remaining arguments are the subcolumns of the Nested data type.
+ */ +class FunctionValidateNestedArraySizes : public IFunction +{ +public: + static constexpr auto name = "validateNestedArraySizes"; + static FunctionPtr create(ContextPtr) { return std::make_shared(); } + + String getName() const override { return name; } + bool isVariadic() const override { return true; } + size_t getNumberOfArguments() const override { return 0; } + bool useDefaultImplementationForConstants() const override { return true; } + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override; + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override; +}; + +DataTypePtr FunctionValidateNestedArraySizes::getReturnTypeImpl(const DataTypes & arguments) const +{ + size_t num_args = arguments.size(); + + if (num_args < 3) + throw Exception( + "Function " + getName() + " needs more than two arguments; passed " + toString(arguments.size()) + ".", + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + + if (!WhichDataType(arguments[0]).isUInt8()) + throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName() + " Must be UInt.", + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + for (size_t i = 1; i < num_args; ++i) + if (!WhichDataType(arguments[i]).isArray()) + throw Exception( + "Illegal type " + arguments[i]->getName() + " of " + toString(i) + " argument of function " + getName() + " Must be Array.", + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + + return std::make_shared(); +} + +ColumnPtr FunctionValidateNestedArraySizes::executeImpl( + const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t input_rows_count) const +{ + const ColumnUInt8 * condition_column = typeid_cast(arguments[0].column.get()); + + size_t args_num = arguments.size(); + + for (size_t i = 0; i < input_rows_count; ++i) + { + if (!condition_column->getData()[i]) + continue; + + /// The condition is true, then check the row in subcolumns in Nested Type has the same array size + size_t first_length = 0; + size_t length = 0; + for (size_t args_idx = 1; args_idx < args_num; ++args_idx) + { + const auto & current_arg = arguments[args_idx]; + const ColumnArray * current_column = nullptr; + if (const auto * const_array = checkAndGetColumnConst(current_arg.column.get())) + { + current_column = checkAndGetColumn(&const_array->getDataColumn()); + length = current_column->getOffsets()[0]; + } + else + { + current_column = checkAndGetColumn(current_arg.column.get()); + const auto & offsets = current_column->getOffsets(); + length = offsets[i] - offsets[i - 1]; + } + + if (args_idx == 1) + { + first_length = length; + } + else if (first_length != length) + { + throw Exception( + ErrorCodes::SIZES_OF_ARRAYS_DOESNT_MATCH, + "Elements '{}' and '{}' of Nested data structure (Array columns) " + "have different array sizes ({} and {} respectively) on row {}", + arguments[1].name, arguments[args_idx].name, first_length, length, i); + } + } + } + + return ColumnUInt8::create(input_rows_count, 1); +} + +void registerFunctionValidateNestedArraySizes(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} diff --git a/src/Functions/ya.make b/src/Functions/ya.make index 660f7b115bf..0f4f7a321b2 100644 --- a/src/Functions/ya.make +++ b/src/Functions/ya.make @@ -532,6 +532,7 @@ SRCS( upper.cpp upperUTF8.cpp uptime.cpp + validateNestedArraySizes.cpp version.cpp visibleWidth.cpp visitParamExtractBool.cpp diff --git a/src/IO/ReadBufferFromS3.cpp b/src/IO/ReadBufferFromS3.cpp index 
fd07a7f309a..1e27b0284b7 100644 --- a/src/IO/ReadBufferFromS3.cpp +++ b/src/IO/ReadBufferFromS3.cpp @@ -12,10 +12,12 @@ # include + namespace ProfileEvents { extern const Event S3ReadMicroseconds; extern const Event S3ReadBytes; + extern const Event S3ReadRequestsErrors; } namespace DB @@ -29,26 +31,58 @@ namespace ErrorCodes ReadBufferFromS3::ReadBufferFromS3( - std::shared_ptr client_ptr_, const String & bucket_, const String & key_, size_t buffer_size_) - : SeekableReadBuffer(nullptr, 0), client_ptr(std::move(client_ptr_)), bucket(bucket_), key(key_), buffer_size(buffer_size_) + std::shared_ptr client_ptr_, const String & bucket_, const String & key_, UInt64 s3_max_single_read_retries_, size_t buffer_size_) + : SeekableReadBuffer(nullptr, 0) + , client_ptr(std::move(client_ptr_)) + , bucket(bucket_) + , key(key_) + , s3_max_single_read_retries(s3_max_single_read_retries_) + , buffer_size(buffer_size_) { } - bool ReadBufferFromS3::nextImpl() { - if (!initialized) - { + /// Restoring valid value of `count()` during `nextImpl()`. See `ReadBuffer::next()`. + pos = working_buffer.begin(); + + if (!impl) impl = initialize(); - initialized = true; - } Stopwatch watch; - auto res = impl->next(); + bool next_result = false; + + for (Int64 attempt = static_cast(s3_max_single_read_retries); attempt >= 0; --attempt) + { + if (!impl) + impl = initialize(); + + try + { + next_result = impl->next(); + /// FIXME. 1. Poco `istream` cannot read less than buffer_size or this state is being discarded during + /// istream <-> iostream conversion. `gcount` always contains 0, + /// that's why we always have error "Cannot read from istream at offset 0". + + break; + } + catch (const Exception & e) + { + ProfileEvents::increment(ProfileEvents::S3ReadRequestsErrors, 1); + + LOG_INFO(log, "Caught exception while reading S3 object. Bucket: {}, Key: {}, Offset: {}, Remaining attempts: {}, Message: {}", + bucket, key, getPosition(), attempt, e.message()); + + impl.reset(); + + if (!attempt) + throw; + } + } + watch.stop(); ProfileEvents::increment(ProfileEvents::S3ReadMicroseconds, watch.elapsedMicroseconds()); - - if (!res) + if (!next_result) return false; internal_buffer = impl->buffer(); @@ -60,7 +94,7 @@ bool ReadBufferFromS3::nextImpl() off_t ReadBufferFromS3::seek(off_t offset_, int whence) { - if (initialized) + if (impl) throw Exception("Seek is allowed only before first read attempt from the buffer.", ErrorCodes::CANNOT_SEEK_THROUGH_FILE); if (whence != SEEK_SET) @@ -74,7 +108,6 @@ off_t ReadBufferFromS3::seek(off_t offset_, int whence) return offset; } - off_t ReadBufferFromS3::getPosition() { return offset + count(); @@ -82,13 +115,13 @@ off_t ReadBufferFromS3::getPosition() std::unique_ptr ReadBufferFromS3::initialize() { - LOG_TRACE(log, "Read S3 object. Bucket: {}, Key: {}, Offset: {}", bucket, key, std::to_string(offset)); + LOG_TRACE(log, "Read S3 object. 
Bucket: {}, Key: {}, Offset: {}", bucket, key, getPosition()); Aws::S3::Model::GetObjectRequest req; req.SetBucket(bucket); req.SetKey(key); - if (offset != 0) - req.SetRange("bytes=" + std::to_string(offset) + "-"); + if (getPosition()) + req.SetRange("bytes=" + std::to_string(getPosition()) + "-"); Aws::S3::Model::GetObjectOutcome outcome = client_ptr->GetObject(req); diff --git a/src/IO/ReadBufferFromS3.h b/src/IO/ReadBufferFromS3.h index 829b73d0af6..1f4124d909f 100644 --- a/src/IO/ReadBufferFromS3.h +++ b/src/IO/ReadBufferFromS3.h @@ -27,8 +27,8 @@ private: std::shared_ptr client_ptr; String bucket; String key; + UInt64 s3_max_single_read_retries; size_t buffer_size; - bool initialized = false; off_t offset = 0; Aws::S3::Model::GetObjectResult read_result; std::unique_ptr impl; @@ -40,6 +40,7 @@ public: std::shared_ptr client_ptr_, const String & bucket_, const String & key_, + UInt64 s3_max_single_read_retries_, size_t buffer_size_ = DBMS_DEFAULT_BUFFER_SIZE); bool nextImpl() override; diff --git a/src/IO/S3Common.cpp b/src/IO/S3Common.cpp index f9962735ddc..8e93c375573 100644 --- a/src/IO/S3Common.cpp +++ b/src/IO/S3Common.cpp @@ -6,16 +6,20 @@ # include # include +# include # include # include # include # include # include +# include +# include # include # include # include -# include # include +# include + # include # include # include @@ -91,28 +95,289 @@ private: std::unordered_map tag_loggers; }; +class AWSEC2MetadataClient : public Aws::Internal::AWSHttpResourceClient +{ + static constexpr char EC2_SECURITY_CREDENTIALS_RESOURCE[] = "/latest/meta-data/iam/security-credentials"; + static constexpr char EC2_IMDS_TOKEN_RESOURCE[] = "/latest/api/token"; + static constexpr char EC2_IMDS_TOKEN_HEADER[] = "x-aws-ec2-metadata-token"; + static constexpr char EC2_IMDS_TOKEN_TTL_DEFAULT_VALUE[] = "21600"; + static constexpr char EC2_IMDS_TOKEN_TTL_HEADER[] = "x-aws-ec2-metadata-token-ttl-seconds"; + + static constexpr char EC2_DEFAULT_METADATA_ENDPOINT[] = "http://169.254.169.254"; + +public: + /// See EC2MetadataClient. 
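The reworked nextImpl() above retries a failed read up to s3_max_single_read_retries times, dropping and re-creating the underlying request between attempts and rethrowing once the budget is exhausted. A standalone sketch of that control flow, with std::function placeholders standing in for the buffer internals (illustrative, not the ReadBufferFromS3 API):

#include <cstdint>
#include <functional>
#include <iostream>
#include <stdexcept>

// Retry `action` up to `max_retries` extra times; `reset` re-creates the underlying resource.
bool readWithRetries(int64_t max_retries, const std::function<bool()> & action, const std::function<void()> & reset)
{
    for (int64_t attempt = max_retries; attempt >= 0; --attempt)
    {
        try
        {
            return action();          // e.g. impl->next()
        }
        catch (const std::exception & e)
        {
            std::cerr << "read failed (" << e.what() << "), remaining attempts: " << attempt << '\n';
            reset();                  // e.g. impl.reset(); it is re-initialized from the current position
            if (attempt == 0)
                throw;                // out of attempts: propagate the last error
        }
    }
    return false;                     // not reached
}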
+ + explicit AWSEC2MetadataClient(const Aws::Client::ClientConfiguration & client_configuration) + : Aws::Internal::AWSHttpResourceClient(client_configuration) + , logger(&Poco::Logger::get("AWSEC2InstanceProfileConfigLoader")) + { + } + + AWSEC2MetadataClient& operator =(const AWSEC2MetadataClient & rhs) = delete; + AWSEC2MetadataClient(const AWSEC2MetadataClient & rhs) = delete; + AWSEC2MetadataClient& operator =(const AWSEC2MetadataClient && rhs) = delete; + AWSEC2MetadataClient(const AWSEC2MetadataClient && rhs) = delete; + + virtual ~AWSEC2MetadataClient() override = default; + + using Aws::Internal::AWSHttpResourceClient::GetResource; + + virtual Aws::String GetResource(const char * resource_path) const + { + return GetResource(endpoint.c_str(), resource_path, nullptr/*authToken*/); + } + + virtual Aws::String getDefaultCredentials() const + { + String credentials_string; + { + std::unique_lock locker(token_mutex); + + LOG_TRACE(logger, "Getting default credentials for EC2 instance."); + auto result = GetResourceWithAWSWebServiceResult(endpoint.c_str(), EC2_SECURITY_CREDENTIALS_RESOURCE, nullptr); + credentials_string = result.GetPayload(); + if (result.GetResponseCode() == Aws::Http::HttpResponseCode::UNAUTHORIZED) + { + return {}; + } + } + + String trimmed_credentials_string = Aws::Utils::StringUtils::Trim(credentials_string.c_str()); + if (trimmed_credentials_string.empty()) + return {}; + + std::vector security_credentials = Aws::Utils::StringUtils::Split(trimmed_credentials_string, '\n'); + + LOG_DEBUG(logger, "Calling EC2MetadataService resource, {} returned credential string {}.", + EC2_SECURITY_CREDENTIALS_RESOURCE, trimmed_credentials_string); + + if (security_credentials.empty()) + { + LOG_WARNING(logger, "Initial call to EC2MetadataService to get credentials failed."); + return {}; + } + + Aws::StringStream ss; + ss << EC2_SECURITY_CREDENTIALS_RESOURCE << "/" << security_credentials[0]; + LOG_DEBUG(logger, "Calling EC2MetadataService resource {}.", ss.str()); + return GetResource(ss.str().c_str()); + } + + static Aws::String awsComputeUserAgentString() + { + Aws::StringStream ss; + ss << "aws-sdk-cpp/" << Aws::Version::GetVersionString() << " " << Aws::OSVersionInfo::ComputeOSVersionString() + << " " << Aws::Version::GetCompilerVersionString(); + return ss.str(); + } + + virtual Aws::String getDefaultCredentialsSecurely() const + { + String user_agent_string = awsComputeUserAgentString(); + String new_token; + + { + std::unique_lock locker(token_mutex); + + Aws::StringStream ss; + ss << endpoint << EC2_IMDS_TOKEN_RESOURCE; + std::shared_ptr token_request(Aws::Http::CreateHttpRequest(ss.str(), Aws::Http::HttpMethod::HTTP_PUT, + Aws::Utils::Stream::DefaultResponseStreamFactoryMethod)); + token_request->SetHeaderValue(EC2_IMDS_TOKEN_TTL_HEADER, EC2_IMDS_TOKEN_TTL_DEFAULT_VALUE); + token_request->SetUserAgent(user_agent_string); + LOG_TRACE(logger, "Calling EC2MetadataService to get token."); + auto result = GetResourceWithAWSWebServiceResult(token_request); + const String & token_string = result.GetPayload(); + new_token = Aws::Utils::StringUtils::Trim(token_string.c_str()); + + if (result.GetResponseCode() == Aws::Http::HttpResponseCode::BAD_REQUEST) + { + return {}; + } + else if (result.GetResponseCode() != Aws::Http::HttpResponseCode::OK || new_token.empty()) + { + LOG_TRACE(logger, "Calling EC2MetadataService to get token failed, falling back to less secure way."); + return getDefaultCredentials(); + } + token = new_token; + } + + String url = endpoint + 
EC2_SECURITY_CREDENTIALS_RESOURCE; + std::shared_ptr profile_request(Aws::Http::CreateHttpRequest(url, + Aws::Http::HttpMethod::HTTP_GET, + Aws::Utils::Stream::DefaultResponseStreamFactoryMethod)); + profile_request->SetHeaderValue(EC2_IMDS_TOKEN_HEADER, new_token); + profile_request->SetUserAgent(user_agent_string); + String profile_string = GetResourceWithAWSWebServiceResult(profile_request).GetPayload(); + + String trimmed_profile_string = Aws::Utils::StringUtils::Trim(profile_string.c_str()); + std::vector security_credentials = Aws::Utils::StringUtils::Split(trimmed_profile_string, '\n'); + + LOG_DEBUG(logger, "Calling EC2MetadataService resource, {} with token returned profile string {}.", + EC2_SECURITY_CREDENTIALS_RESOURCE, trimmed_profile_string); + + if (security_credentials.empty()) + { + LOG_WARNING(logger, "Calling EC2Metadataservice to get profiles failed."); + return {}; + } + + Aws::StringStream ss; + ss << endpoint << EC2_SECURITY_CREDENTIALS_RESOURCE << "/" << security_credentials[0]; + std::shared_ptr credentials_request(Aws::Http::CreateHttpRequest(ss.str(), + Aws::Http::HttpMethod::HTTP_GET, + Aws::Utils::Stream::DefaultResponseStreamFactoryMethod)); + credentials_request->SetHeaderValue(EC2_IMDS_TOKEN_HEADER, new_token); + credentials_request->SetUserAgent(user_agent_string); + LOG_DEBUG(logger, "Calling EC2MetadataService resource {} with token.", ss.str()); + return GetResourceWithAWSWebServiceResult(credentials_request).GetPayload(); + } + + virtual Aws::String getCurrentRegion() const + { + return Aws::Region::AWS_GLOBAL; + } + +private: + const Aws::String endpoint = EC2_DEFAULT_METADATA_ENDPOINT; + mutable std::recursive_mutex token_mutex; + mutable Aws::String token; + Poco::Logger * logger; +}; + +class AWSEC2InstanceProfileConfigLoader : public Aws::Config::AWSProfileConfigLoader +{ +public: + explicit AWSEC2InstanceProfileConfigLoader(const std::shared_ptr & client_, bool use_secure_pull_) + : client(client_) + , use_secure_pull(use_secure_pull_) + , logger(&Poco::Logger::get("AWSEC2InstanceProfileConfigLoader")) + { + } + + virtual ~AWSEC2InstanceProfileConfigLoader() override = default; + +protected: + virtual bool LoadInternal() override + { + auto credentials_str = use_secure_pull ? client->getDefaultCredentialsSecurely() : client->getDefaultCredentials(); + + /// See EC2InstanceProfileConfigLoader. 
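The config loader above chooses between the token-protected (IMDSv2-style) credentials pull and the plain one, and getDefaultCredentialsSecurely() itself falls back to the plain request when no token can be obtained. A standalone sketch of that token-first-with-fallback flow, with std::function placeholders instead of the AWS HTTP client (purely illustrative):

#include <functional>
#include <iostream>
#include <optional>
#include <string>

// Try the token-based path first; if no token can be obtained, fall back to the plain request.
std::string pullCredentials(
    const std::function<std::optional<std::string>()> & fetch_token,
    const std::function<std::string(const std::string &)> & fetch_with_token,
    const std::function<std::string()> & fetch_plain)
{
    if (auto token = fetch_token())
        return fetch_with_token(*token);
    return fetch_plain();   // "falling back to less secure way"
}

int main()
{
    std::cout << pullCredentials(
        [] { return std::optional<std::string>{}; },                                // token request failed
        [] (const std::string &) { return std::string("token-based credentials"); },
        [] { return std::string("fallback credentials"); }) << '\n';                // prints the fallback
}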
+ if (credentials_str.empty()) + return false; + + Aws::Utils::Json::JsonValue credentials_doc(credentials_str); + if (!credentials_doc.WasParseSuccessful()) + { + LOG_ERROR(logger, "Failed to parse output from EC2MetadataService."); + return false; + } + String access_key, secret_key, token; + + auto credentials_view = credentials_doc.View(); + access_key = credentials_view.GetString("AccessKeyId"); + LOG_ERROR(logger, "Successfully pulled credentials from EC2MetadataService with access key {}.", access_key); + + secret_key = credentials_view.GetString("SecretAccessKey"); + token = credentials_view.GetString("Token"); + + auto region = client->getCurrentRegion(); + + Aws::Config::Profile profile; + profile.SetCredentials(Aws::Auth::AWSCredentials(access_key, secret_key, token)); + profile.SetRegion(region); + profile.SetName(Aws::Config::INSTANCE_PROFILE_KEY); + + m_profiles[Aws::Config::INSTANCE_PROFILE_KEY] = profile; + + return true; + } + +private: + std::shared_ptr client; + bool use_secure_pull; + Poco::Logger * logger; +}; + +class AWSInstanceProfileCredentialsProvider : public Aws::Auth::AWSCredentialsProvider +{ +public: + /// See InstanceProfileCredentialsProvider. + + explicit AWSInstanceProfileCredentialsProvider(const std::shared_ptr & config_loader) + : ec2_metadata_config_loader(config_loader) + , load_frequency_ms(Aws::Auth::REFRESH_THRESHOLD) + , logger(&Poco::Logger::get("AWSInstanceProfileCredentialsProvider")) + { + LOG_INFO(logger, "Creating Instance with injected EC2MetadataClient and refresh rate {}."); + } + + Aws::Auth::AWSCredentials GetAWSCredentials() override + { + refreshIfExpired(); + Aws::Utils::Threading::ReaderLockGuard guard(m_reloadLock); + auto profile_it = ec2_metadata_config_loader->GetProfiles().find(Aws::Config::INSTANCE_PROFILE_KEY); + + if (profile_it != ec2_metadata_config_loader->GetProfiles().end()) + { + return profile_it->second.GetCredentials(); + } + + return Aws::Auth::AWSCredentials(); + } + +protected: + void Reload() override + { + LOG_INFO(logger, "Credentials have expired attempting to repull from EC2 Metadata Service."); + ec2_metadata_config_loader->Load(); + AWSCredentialsProvider::Reload(); + } + +private: + void refreshIfExpired() + { + LOG_DEBUG(logger, "Checking if latest credential pull has expired."); + Aws::Utils::Threading::ReaderLockGuard guard(m_reloadLock); + if (!IsTimeToRefresh(load_frequency_ms)) + { + return; + } + + guard.UpgradeToWriterLock(); + if (!IsTimeToRefresh(load_frequency_ms)) // double-checked lock to avoid refreshing twice + { + return; + } + Reload(); + } + + std::shared_ptr ec2_metadata_config_loader; + Int64 load_frequency_ms; + Poco::Logger * logger; +}; + class S3CredentialsProviderChain : public Aws::Auth::AWSCredentialsProviderChain { public: - explicit S3CredentialsProviderChain(const DB::S3::PocoHTTPClientConfiguration & configuration, const Aws::Auth::AWSCredentials & credentials, bool use_environment_credentials) + explicit S3CredentialsProviderChain(const DB::S3::PocoHTTPClientConfiguration & configuration, const Aws::Auth::AWSCredentials & credentials, bool use_environment_credentials, bool use_insecure_imds_request) { + auto * logger = &Poco::Logger::get("S3CredentialsProviderChain"); + if (use_environment_credentials) { - const DB::RemoteHostFilter & remote_host_filter = configuration.remote_host_filter; - const unsigned int s3_max_redirects = configuration.s3_max_redirects; - static const char AWS_ECS_CONTAINER_CREDENTIALS_RELATIVE_URI[] = "AWS_CONTAINER_CREDENTIALS_RELATIVE_URI"; static 
const char AWS_ECS_CONTAINER_CREDENTIALS_FULL_URI[] = "AWS_CONTAINER_CREDENTIALS_FULL_URI"; static const char AWS_ECS_CONTAINER_AUTHORIZATION_TOKEN[] = "AWS_CONTAINER_AUTHORIZATION_TOKEN"; static const char AWS_EC2_METADATA_DISABLED[] = "AWS_EC2_METADATA_DISABLED"; - auto * logger = &Poco::Logger::get("S3CredentialsProviderChain"); - /// The only difference from DefaultAWSCredentialsProviderChain::DefaultAWSCredentialsProviderChain() /// is that this chain uses custom ClientConfiguration. AddProvider(std::make_shared()); AddProvider(std::make_shared()); + AddProvider(std::make_shared()); AddProvider(std::make_shared()); /// ECS TaskRole Credentials only available when ENVIRONMENT VARIABLE is set. @@ -145,7 +410,7 @@ public: } else if (Aws::Utils::StringUtils::ToLower(ec2_metadata_disabled.c_str()) != "true") { - DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(remote_host_filter, s3_max_redirects); + DB::S3::PocoHTTPClientConfiguration aws_client_configuration = DB::S3::ClientFactory::instance().createClientConfiguration(configuration.remote_host_filter, configuration.s3_max_redirects); /// See MakeDefaultHttpResourceClientConfiguration(). /// This is part of EC2 metadata client, but unfortunately it can't be accessed from outside @@ -163,13 +428,16 @@ public: /// EC2MetadataService throttles by delaying the response so the service client should set a large read timeout. /// EC2MetadataService delay is in order of seconds so it only make sense to retry after a couple of seconds. aws_client_configuration.connectTimeoutMs = 1000; + + /// FIXME. Somehow this timeout does not work in docker without --net=host. aws_client_configuration.requestTimeoutMs = 1000; + aws_client_configuration.retryStrategy = std::make_shared(1, 1000); - auto ec2_metadata_client = std::make_shared(aws_client_configuration); - auto config_loader = std::make_shared(ec2_metadata_client); + auto ec2_metadata_client = std::make_shared(aws_client_configuration); + auto config_loader = std::make_shared(ec2_metadata_client, !use_insecure_imds_request); - AddProvider(std::make_shared(config_loader)); + AddProvider(std::make_shared(config_loader)); LOG_INFO(logger, "Added EC2 metadata service credentials provider to the provider chain."); } } @@ -185,12 +453,14 @@ public: const Aws::Client::ClientConfiguration & client_configuration, const Aws::Auth::AWSCredentials & credentials, const DB::HeaderCollection & headers_, - bool use_environment_credentials) + bool use_environment_credentials, + bool use_insecure_imds_request) : Aws::Client::AWSAuthV4Signer( std::make_shared( static_cast(client_configuration), credentials, - use_environment_credentials), + use_environment_credentials, + use_insecure_imds_request), "s3", client_configuration.region, Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, @@ -281,7 +551,8 @@ namespace S3 const String & secret_access_key, const String & server_side_encryption_customer_key_base64, HeaderCollection headers, - bool use_environment_credentials) + bool use_environment_credentials, + bool use_insecure_imds_request) { PocoHTTPClientConfiguration client_configuration = cfg_; client_configuration.updateSchemeAndRegion(); @@ -308,7 +579,8 @@ namespace S3 client_configuration, std::move(credentials), std::move(headers), - use_environment_credentials); + use_environment_credentials, + use_insecure_imds_request); return std::make_shared( std::move(auth_signer), diff --git a/src/IO/S3Common.h b/src/IO/S3Common.h index 
53230d49f2b..981218643ea 100644 --- a/src/IO/S3Common.h +++ b/src/IO/S3Common.h @@ -38,7 +38,8 @@ public: const String & secret_access_key, const String & server_side_encryption_customer_key_base64, HeaderCollection headers, - bool use_environment_credentials); + bool use_environment_credentials, + bool use_insecure_imds_request); PocoHTTPClientConfiguration createClientConfiguration( const RemoteHostFilter & remote_host_filter, diff --git a/src/IO/parseDateTimeBestEffort.cpp b/src/IO/parseDateTimeBestEffort.cpp index 26745a8f138..a7a9eaaa4dd 100644 --- a/src/IO/parseDateTimeBestEffort.cpp +++ b/src/IO/parseDateTimeBestEffort.cpp @@ -46,7 +46,7 @@ inline size_t readAlpha(char * res, size_t max_chars, ReadBuffer & in) } #if defined(__PPC__) -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif #endif @@ -634,7 +634,7 @@ ReturnType parseDateTime64BestEffortImpl(DateTime64 & res, UInt32 scale, ReadBuf } #if defined(__PPC__) -#if !__clang__ +#if !defined(__clang__) #pragma GCC diagnostic pop #endif #endif diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 0a281436183..1fa0f4eab7c 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -1510,7 +1510,8 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( optimize_read_in_order = settings.optimize_read_in_order - && storage && query.orderBy() + && storage + && query.orderBy() && !query_analyzer.hasAggregation() && !query_analyzer.hasWindow() && !query.final() diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 8db4415822f..d155a0eea4b 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -996,6 +996,19 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, create.attach_from_path = std::nullopt; } + if (create.attach) + { + /// If table was detached it's not possible to attach it back while some threads are using + /// old instance of the storage. For example, AsynchronousMetrics may cause ATTACH to fail, + /// so we allow waiting here. If database_atomic_wait_for_drop_and_detach_synchronously is disabled + /// and old storage instance still exists it will throw exception. 
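The new ATTACH branch just below either fails immediately while the old storage instance is still referenced or waits for it to be released, depending on database_atomic_wait_for_drop_and_detach_synchronously. A minimal standalone illustration of that wait-or-throw choice (standard library only; the reference counter is a stand-in for the real table-in-use tracking):

#include <atomic>
#include <chrono>
#include <stdexcept>
#include <thread>

// Either throw immediately while the detached table is still referenced, or poll until it is released.
void ensureDetachedTableNotInUse(const std::atomic<int> & ref_count, bool throw_if_in_use)
{
    if (throw_if_in_use)
    {
        if (ref_count.load() > 0)
            throw std::runtime_error("Cannot attach: old storage instance is still in use");
        return;
    }
    while (ref_count.load() > 0)                       // wait until the last user drops the old instance
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
}

int main()
{
    std::atomic<int> refs{0};
    ensureDetachedTableNotInUse(refs, /*throw_if_in_use=*/true);   // no users left: attach may proceed
}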
+ bool throw_if_table_in_use = getContext()->getSettingsRef().database_atomic_wait_for_drop_and_detach_synchronously; + if (throw_if_table_in_use) + database->checkDetachedTableNotInUse(create.uuid); + else + database->waitDetachedTableNotInUse(create.uuid); + } + StoragePtr res; /// NOTE: CREATE query may be rewritten by Storage creator or table function if (create.as_table_function) diff --git a/src/Interpreters/InterpreterExplainQuery.cpp b/src/Interpreters/InterpreterExplainQuery.cpp index a0195ec85e6..e8578a07491 100644 --- a/src/Interpreters/InterpreterExplainQuery.cpp +++ b/src/Interpreters/InterpreterExplainQuery.cpp @@ -129,6 +129,7 @@ struct QueryPlanSettings {"header", query_plan_options.header}, {"description", query_plan_options.description}, {"actions", query_plan_options.actions}, + {"indexes", query_plan_options.indexes}, {"optimize", optimize}, }; }; diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index f44e55a3df9..06922527eaa 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1658,7 +1658,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc { /// Table. if (max_streams == 0) - throw Exception("Logical error: zero number of streams requested", ErrorCodes::LOGICAL_ERROR); + max_streams = 1; /// If necessary, we request more sources than the number of threads - to distribute the work evenly over the threads. if (max_streams > 1 && !is_remote) diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index f7872e0f742..1315f9efa05 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -26,6 +26,7 @@ #include #include #include +#include namespace DB @@ -349,6 +350,35 @@ static void validateUpdateColumns( } } +/// Returns ASTs of updated nested subcolumns, if all of subcolumns were updated. +/// They are used to validate sizes of nested arrays. +/// If some of subcolumns were updated and some weren't, +/// it makes sense to validate only updated columns with their old versions, +/// because their sizes couldn't change, since sizes of all nested subcolumns must be consistent. +static std::optional> getExpressionsOfUpdatedNestedSubcolumns( + const String & column_name, + const NamesAndTypesList & all_columns, + const std::unordered_map & column_to_update_expression) +{ + std::vector res; + auto source_name = Nested::splitName(column_name).first; + + /// Check this nested subcolumn + for (const auto & column : all_columns) + { + auto split = Nested::splitName(column.name); + if (isArray(column.type) && split.first == source_name && !split.second.empty()) + { + auto it = column_to_update_expression.find(column.name); + if (it == column_to_update_expression.end()) + return {}; + + res.push_back(it->second); + } + } + + return res; +} ASTPtr MutationsInterpreter::prepare(bool dry_run) { @@ -398,7 +428,7 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run) auto dependencies = getAllColumnDependencies(metadata_snapshot, updated_columns); /// First, break a sequence of commands into stages. - for (const auto & command : commands) + for (auto & command : commands) { if (command.type == MutationCommand::DELETE) { @@ -438,12 +468,43 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run) /// /// Outer CAST is added just in case if we don't trust the returning type of 'if'. 
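getExpressionsOfUpdatedNestedSubcolumns() above only needs the prefix of a nested name (n for n.x) to collect the sibling array subcolumns and to decide whether every one of them is being updated. A standalone sketch of that lookup over plain strings, not the ClickHouse AST and metadata types:

#include <map>
#include <optional>
#include <string>
#include <vector>

// Split "n.x" into {"n", "x"}; a name without a dot has an empty second part.
static std::pair<std::string, std::string> splitNestedName(const std::string & name)
{
    auto pos = name.find('.');
    if (pos == std::string::npos)
        return {name, ""};
    return {name.substr(0, pos), name.substr(pos + 1)};
}

// Return the update expressions of all subcolumns sharing the prefix of `column_name`,
// or nullopt if at least one sibling subcolumn is not updated (then the old sizes are reused).
static std::optional<std::vector<std::string>> updatedNestedSiblings(
    const std::string & column_name,
    const std::vector<std::string> & all_nested_subcolumns,
    const std::map<std::string, std::string> & column_to_update_expression)
{
    std::vector<std::string> res;
    auto prefix = splitNestedName(column_name).first;
    for (const auto & column : all_nested_subcolumns)
    {
        if (splitNestedName(column).first != prefix)
            continue;
        auto it = column_to_update_expression.find(column);
        if (it == column_to_update_expression.end())
            return std::nullopt;
        res.push_back(it->second);
    }
    return res;
}

int main()
{
    std::map<std::string, std::string> updates{{"n.x", "expr_x"}, {"n.y", "expr_y"}};
    auto exprs = updatedNestedSiblings("n.x", {"n.x", "n.y"}, updates);
    return exprs ? 0 : 1;   // both siblings of `n` are updated, so their new sizes must be validated
}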
- auto type_literal = std::make_shared(columns_desc.getPhysical(column).type->getName()); + const auto & type = columns_desc.getPhysical(column).type; + auto type_literal = std::make_shared(type->getName()); const auto & update_expr = kv.second; + + ASTPtr condition = getPartitionAndPredicateExpressionForMutationCommand(command); + + /// And new check validateNestedArraySizes for Nested subcolumns + if (isArray(type) && !Nested::splitName(column).second.empty()) + { + std::shared_ptr function = nullptr; + + auto nested_update_exprs = getExpressionsOfUpdatedNestedSubcolumns(column, all_columns, command.column_to_update_expression); + if (!nested_update_exprs) + { + function = makeASTFunction("validateNestedArraySizes", + condition, + update_expr->clone(), + std::make_shared(column)); + condition = makeASTFunction("and", condition, function); + } + else if (nested_update_exprs->size() > 1) + { + function = std::make_shared(); + function->name = "validateNestedArraySizes"; + function->arguments = std::make_shared(); + function->children.push_back(function->arguments); + function->arguments->children.push_back(condition); + for (const auto & it : *nested_update_exprs) + function->arguments->children.push_back(it->clone()); + condition = makeASTFunction("and", condition, function); + } + } + auto updated_column = makeASTFunction("CAST", makeASTFunction("if", - getPartitionAndPredicateExpressionForMutationCommand(command), + condition, makeASTFunction("CAST", update_expr->clone(), type_literal), diff --git a/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp b/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp index 4d1c0526910..399def00006 100644 --- a/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp +++ b/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace @@ -13,7 +14,7 @@ using namespace DB; Field executeFunctionOnField( const Field & field, const std::string & name, - const ExpressionActionsPtr & expr, + const ExpressionActionsPtr & sharding_expr, const std::string & sharding_key_column_name) { DataTypePtr type = applyVisitor(FieldToDataType{}, field); @@ -25,17 +26,23 @@ Field executeFunctionOnField( Block block{column}; size_t num_rows = 1; - expr->execute(block, num_rows); + sharding_expr->execute(block, num_rows); ColumnWithTypeAndName & ret = block.getByName(sharding_key_column_name); return (*ret.column)[0]; } -/// Return true if shard may contain such value (or it is unknown), otherwise false. +/// @param sharding_column_value - one of values from IN +/// @param sharding_column_name - name of that column +/// @param sharding_expr - expression of sharding_key for the Distributed() table +/// @param sharding_key_column_name - name of the column for sharding_expr +/// @param shard_info - info for the current shard (to compare shard_num with calculated) +/// @param slots - weight -> shard mapping +/// @return true if shard may contain such value (or it is unknown), otherwise false. 
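As the parameter comments above describe, shardContains() reduces to mapping the computed sharding value through the weight slots and comparing the result with the shard number. The arithmetic in isolation (standard library only, names illustrative):

#include <cstdint>
#include <iostream>
#include <vector>

// Build the weight -> shard mapping: a shard with weight w occupies w consecutive slots.
std::vector<size_t> buildSlots(const std::vector<size_t> & shard_weights)
{
    std::vector<size_t> slots;
    for (size_t shard = 0; shard < shard_weights.size(); ++shard)
        for (size_t i = 0; i < shard_weights[shard]; ++i)
            slots.push_back(shard);
    return slots;
}

// A shard may contain the value only if the sharding key maps onto one of its slots.
bool shardMayContain(uint64_t sharding_value, const std::vector<size_t> & slots, size_t shard_num /* 1-based */)
{
    return slots[sharding_value % slots.size()] + 1 == shard_num;
}

int main()
{
    auto slots = buildSlots({2, 1});                                      // shard 1 has weight 2, shard 2 weight 1
    std::cout << std::boolalpha << shardMayContain(5, slots, 1) << '\n';  // false: 5 % 3 maps onto shard 2
}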
bool shardContains( const Field & sharding_column_value, const std::string & sharding_column_name, - const ExpressionActionsPtr & expr, + const ExpressionActionsPtr & sharding_expr, const std::string & sharding_key_column_name, const Cluster::ShardInfo & shard_info, const Cluster::SlotToShard & slots) @@ -45,7 +52,14 @@ bool shardContains( if (sharding_column_value.isNull()) return false; - Field sharding_value = executeFunctionOnField(sharding_column_value, sharding_column_name, expr, sharding_key_column_name); + Field sharding_value = executeFunctionOnField(sharding_column_value, sharding_column_name, sharding_expr, sharding_key_column_name); + /// The value from IN can be non-numeric, + /// but in this case it should be convertible to numeric type, let's try. + sharding_value = convertFieldToType(sharding_value, DataTypeUInt64()); + /// In case of conversion is not possible (NULL), shard cannot contain the value anyway. + if (sharding_value.isNull()) + return false; + UInt64 value = sharding_value.get(); const auto shard_num = slots[value % slots.size()] + 1; return shard_info.shard_num == shard_num; @@ -78,10 +92,10 @@ void OptimizeShardingKeyRewriteInMatcher::visit(ASTFunction & function, Data & d if (!identifier) return; - const auto & expr = data.sharding_key_expr; + const auto & sharding_expr = data.sharding_key_expr; const auto & sharding_key_column_name = data.sharding_key_column_name; - if (!expr->getRequiredColumnsWithTypes().contains(identifier->name())) + if (!sharding_expr->getRequiredColumnsWithTypes().contains(identifier->name())) return; /// NOTE: that we should not take care about empty tuple, @@ -93,7 +107,7 @@ void OptimizeShardingKeyRewriteInMatcher::visit(ASTFunction & function, Data & d std::erase_if(tuple_elements->children, [&](auto & child) { auto * literal = child->template as(); - return literal && !shardContains(literal->value, identifier->name(), expr, sharding_key_column_name, data.shard_info, data.slots); + return literal && !shardContains(literal->value, identifier->name(), sharding_expr, sharding_key_column_name, data.shard_info, data.slots); }); } else if (auto * tuple_literal = right->as(); @@ -102,7 +116,7 @@ void OptimizeShardingKeyRewriteInMatcher::visit(ASTFunction & function, Data & d auto & tuple = tuple_literal->value.get(); std::erase_if(tuple, [&](auto & child) { - return !shardContains(child, identifier->name(), expr, sharding_key_column_name, data.shard_info, data.slots); + return !shardContains(child, identifier->name(), sharding_expr, sharding_key_column_name, data.shard_info, data.slots); }); } } diff --git a/src/Interpreters/QueryAliasesVisitor.cpp b/src/Interpreters/QueryAliasesVisitor.cpp index d395bfc20e9..bd0b2e88d2f 100644 --- a/src/Interpreters/QueryAliasesVisitor.cpp +++ b/src/Interpreters/QueryAliasesVisitor.cpp @@ -15,15 +15,22 @@ namespace ErrorCodes extern const int MULTIPLE_EXPRESSIONS_FOR_ALIAS; } -static String wrongAliasMessage(const ASTPtr & ast, const ASTPtr & prev_ast, const String & alias) +namespace { - WriteBufferFromOwnString message; - message << "Different expressions with the same alias " << backQuoteIfNeed(alias) << ":\n"; - formatAST(*ast, message, false, true); - message << "\nand\n"; - formatAST(*prev_ast, message, false, true); - message << '\n'; - return message.str(); + + constexpr auto dummy_subquery_name_prefix = "_subquery"; + + String wrongAliasMessage(const ASTPtr & ast, const ASTPtr & prev_ast, const String & alias) + { + WriteBufferFromOwnString message; + message << "Different expressions with the 
same alias " << backQuoteIfNeed(alias) << ":\n"; + formatAST(*ast, message, false, true); + message << "\nand\n"; + formatAST(*prev_ast, message, false, true); + message << '\n'; + return message.str(); + } + } @@ -99,7 +106,7 @@ void QueryAliasesMatcher::visit(const ASTSubquery & const_subquery, const AST String alias; do { - alias = "_subquery" + std::to_string(++subquery_index); + alias = dummy_subquery_name_prefix + std::to_string(++subquery_index); } while (aliases.count(alias)); @@ -124,6 +131,30 @@ void QueryAliasesMatcher::visitOther(const ASTPtr & ast, Data & data) aliases[alias] = ast; } + + /** QueryAliasesVisitor is executed before ExecuteScalarSubqueriesVisitor. + For example we have subquery in our query (SELECT sum(number) FROM numbers(10)). + + After running QueryAliasesVisitor it will be (SELECT sum(number) FROM numbers(10)) as _subquery_1 + and prefer_alias_to_column_name for this subquery will be true. + + After running ExecuteScalarSubqueriesVisitor it will be converted to (45 as _subquery_1) + and prefer_alias_to_column_name for ast literal will be true. + + But if we send such query on remote host with Distributed engine for example we cannot send prefer_alias_to_column_name + information for our ast node with query string. And this alias will be dropped because prefer_alias_to_column_name for ASTWIthAlias + by default is false. + + It is imporant that subquery can be converted to literal during ExecuteScalarSubqueriesVisitor. + And code below check if we previously set for subquery alias as _subquery, and if it is true + then set prefer_alias_to_column_name = true for node that was optimized during ExecuteScalarSubqueriesVisitor. + */ + + if (auto * ast_with_alias = dynamic_cast(ast.get())) + { + if (startsWith(alias, dummy_subquery_name_prefix)) + ast_with_alias->prefer_alias_to_column_name = true; + } } /// Explicit template instantiations diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index eb9602b4759..324a773fbc2 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -181,8 +182,72 @@ struct CustomizeAggregateFunctionsMoveSuffixData } }; +struct FuseSumCountAggregates +{ + std::vector sums {}; + std::vector counts {}; + std::vector avgs {}; + + void addFuncNode(ASTFunction * func) + { + if (func->name == "sum") + sums.push_back(func); + else if (func->name == "count") + counts.push_back(func); + else + { + assert(func->name == "avg"); + avgs.push_back(func); + } + } + + bool canBeFused() const + { + // Need at least two different kinds of functions to fuse. + if (sums.empty() && counts.empty()) + return false; + if (sums.empty() && avgs.empty()) + return false; + if (counts.empty() && avgs.empty()) + return false; + return true; + } +}; + +struct FuseSumCountAggregatesVisitorData +{ + using TypeToVisit = ASTFunction; + + std::unordered_map fuse_map; + + void visit(ASTFunction & func, ASTPtr &) + { + if (func.name == "sum" || func.name == "avg" || func.name == "count") + { + if (func.arguments->children.empty()) + return; + + // Probably we can extend it to match count() for non-nullable argument + // to sum/avg with any other argument. Now we require strict match. 
+ const auto argument = func.arguments->children.at(0)->getColumnName(); + auto it = fuse_map.find(argument); + if (it != fuse_map.end()) + { + it->second.addFuncNode(&func); + } + else + { + FuseSumCountAggregates funcs{}; + funcs.addFuncNode(&func); + fuse_map[argument] = funcs; + } + } + } +}; + using CustomizeAggregateFunctionsOrNullVisitor = InDepthNodeVisitor, true>; using CustomizeAggregateFunctionsMoveOrNullVisitor = InDepthNodeVisitor, true>; +using FuseSumCountAggregatesVisitor = InDepthNodeVisitor, true>; /// Translate qualified names such as db.table.column, table.column, table_alias.column to names' normal form. /// Expand asterisks and qualified asterisks with column names. @@ -200,6 +265,49 @@ void translateQualifiedNames(ASTPtr & query, const ASTSelectQuery & select_query throw Exception("Empty list of columns in SELECT query", ErrorCodes::EMPTY_LIST_OF_COLUMNS_QUERIED); } +// Replaces one avg/sum/count function with an appropriate expression with +// sumCount(). +void replaceWithSumCount(String column_name, ASTFunction & func) +{ + auto func_base = makeASTFunction("sumCount", std::make_shared(column_name)); + auto exp_list = std::make_shared(); + if (func.name == "sum" || func.name == "count") + { + /// Rewrite "sum" to sumCount().1, rewrite "count" to sumCount().2 + UInt8 idx = (func.name == "sum" ? 1 : 2); + func.name = "tupleElement"; + exp_list->children.push_back(func_base); + exp_list->children.push_back(std::make_shared(idx)); + } + else + { + /// Rewrite "avg" to sumCount().1 / sumCount().2 + auto new_arg1 = makeASTFunction("tupleElement", func_base, std::make_shared(UInt8(1))); + auto new_arg2 = makeASTFunction("tupleElement", func_base, std::make_shared(UInt8(2))); + func.name = "divide"; + exp_list->children.push_back(new_arg1); + exp_list->children.push_back(new_arg2); + } + func.arguments = exp_list; + func.children.push_back(func.arguments); +} + +void fuseSumCountAggregates(std::unordered_map & fuse_map) +{ + for (auto & it : fuse_map) + { + if (it.second.canBeFused()) + { + for (auto & func: it.second.sums) + replaceWithSumCount(it.first, *func); + for (auto & func: it.second.avgs) + replaceWithSumCount(it.first, *func); + for (auto & func: it.second.counts) + replaceWithSumCount(it.first, *func); + } + } +} + bool hasArrayJoin(const ASTPtr & ast) { if (const ASTFunction * function = ast->as()) @@ -910,7 +1018,18 @@ void TreeRewriter::normalize(ASTPtr & query, Aliases & aliases, const NameSet & CustomizeGlobalNotInVisitor(data_global_not_null_in).visit(query); } - // Rewrite all aggregate functions to add -OrNull suffix to them + // Try to fuse sum/avg/count with identical arguments to one sumCount call, + // if we have at least two different functions. E.g. we will replace sum(x) + // and count(x) with sumCount(x).1 and sumCount(x).2, and sumCount() will + // be calculated only once because of CSE. 
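The rewrite described above works because one (sum, count) pair answers all three aggregates: sum is the first tuple element, count the second, and avg their quotient. A small numeric demonstration (plain C++, not the sumCount aggregate itself):

#include <iostream>
#include <utility>
#include <vector>

int main()
{
    std::vector<double> x = {1, 2, 3, 4};

    // One pass computes the shared state, like a single sumCount(x) aggregate.
    std::pair<double, size_t> sum_count{0.0, 0};
    for (double v : x)
    {
        sum_count.first += v;
        ++sum_count.second;
    }

    std::cout << "sum(x)   = " << sum_count.first << '\n';                     // sumCount(x).1
    std::cout << "count(x) = " << sum_count.second << '\n';                    // sumCount(x).2
    std::cout << "avg(x)   = " << sum_count.first / sum_count.second << '\n';  // .1 / .2
}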
+ if (settings.optimize_fuse_sum_count_avg) + { + FuseSumCountAggregatesVisitor::Data data; + FuseSumCountAggregatesVisitor(data).visit(query); + fuseSumCountAggregates(data.fuse_map); + } + + /// Rewrite all aggregate functions to add -OrNull suffix to them if (settings.aggregate_functions_null_for_empty) { CustomizeAggregateFunctionsOrNullVisitor::Data data_or_null{"OrNull"}; diff --git a/src/Parsers/ASTFunction.cpp b/src/Parsers/ASTFunction.cpp index 18a47eb2126..6871a817351 100644 --- a/src/Parsers/ASTFunction.cpp +++ b/src/Parsers/ASTFunction.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -234,7 +235,11 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format * interpreted as a comment. Instead, negate the literal * in place. Another possible solution is to use parentheses, * but the old comment said it is impossible, without mentioning - * the reason. + * the reason. We should also negate the nonnegative literals, + * for symmetry. We print the negated value without parentheses, + * because they are not needed around a single literal. Also we + * use formatting from FieldVisitorToString, so that the type is + * preserved (e.g. -0. is printed with trailing period). */ if (literal && name == "negate") { @@ -251,26 +256,18 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format { // The parser doesn't create decimal literals, but // they can be produced by constant folding or the - // fuzzer. + // fuzzer. Decimals are always signed, so no need + // to deduce the result type like we do for ints. const auto int_value = value.getValue().value; - // We compare to zero so we don't care about scale. - if (int_value >= 0) - { - return false; - } - - settings.ostr << ValueType{-int_value, - value.getScale()}; + settings.ostr << FieldVisitorToString{}(ValueType{ + -int_value, + value.getScale()}); } else if constexpr (std::is_arithmetic_v) { - if (value >= 0) - { - return false; - } - // We don't need parentheses around a single - // literal. 
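As the extended comment above explains, the replacement just below prints a negated literal through a wider signed type (via NumberTraits::ResultOfNegate) instead of negating in the literal's own type, so unsigned and minimum-value literals do not wrap around. A standalone illustration of why the widening matters; the fixed int64_t widening here is a simplification of that trait:

#include <cstdint>
#include <iostream>

int main()
{
    uint32_t value = 5;

    // Negating in the original unsigned type wraps around (on a typical platform with 32-bit int) ...
    std::cout << -value << '\n';                          // 4294967291

    // ... negating after widening to a signed type keeps the intended literal.
    std::cout << -static_cast<int64_t>(value) << '\n';    // -5
}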
- settings.ostr << -value; + using ResultType = typename NumberTraits::ResultOfNegate::Type; + settings.ostr << FieldVisitorToString{}( + -static_cast(value)); return true; } diff --git a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp index 00381ab96d0..4ccc0db4cfe 100644 --- a/src/Processors/Formats/Impl/CSVRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/CSVRowInputFormat.cpp @@ -201,7 +201,10 @@ void CSVRowInputFormat::readPrefix() return; } else + { skipRow(in, format_settings.csv, num_columns); + setupAllColumnsByTableSchema(); + } } else if (!column_mapping->is_set) setupAllColumnsByTableSchema(); diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp index 9af03e93c32..6f43addc4ed 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.cpp @@ -19,6 +19,13 @@ namespace ErrorCodes extern const int CANNOT_READ_ALL_DATA; } +#define THROW_ARROW_NOT_OK(status) \ + do \ + { \ + if (::arrow::Status _s = (status); !_s.ok()) \ + throw Exception(_s.ToString(), ErrorCodes::BAD_ARGUMENTS); \ + } while (false) + ORCBlockInputFormat::ORCBlockInputFormat(ReadBuffer & in_, Block header_) : IInputFormat(std::move(header_), in_) { } @@ -28,21 +35,26 @@ Chunk ORCBlockInputFormat::generate() Chunk res; const Block & header = getPort().getHeader(); - if (file_reader) + if (!file_reader) + prepareReader(); + + if (stripe_current >= stripe_total) return res; - arrow::Status open_status = arrow::adapters::orc::ORCFileReader::Open(asArrowFile(in), arrow::default_memory_pool(), &file_reader); - if (!open_status.ok()) - throw Exception(open_status.ToString(), ErrorCodes::BAD_ARGUMENTS); + std::shared_ptr batch_result; + arrow::Status batch_status = file_reader->ReadStripe(stripe_current, include_indices, &batch_result); + if (!batch_status.ok()) + throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, + "Error while reading batch of ORC data: {}", batch_status.ToString()); - std::shared_ptr table; - arrow::Status read_status = file_reader->Read(&table); - if (!read_status.ok()) - throw ParsingException{"Error while reading ORC data: " + read_status.ToString(), - ErrorCodes::CANNOT_READ_ALL_DATA}; + auto table_result = arrow::Table::FromRecordBatches({batch_result}); + if (!table_result.ok()) + throw ParsingException(ErrorCodes::CANNOT_READ_ALL_DATA, + "Error while reading batch of ORC data: {}", table_result.status().ToString()); - ArrowColumnToCHColumn::arrowTableToCHChunk(res, table, header, "ORC"); + ++stripe_current; + ArrowColumnToCHColumn::arrowTableToCHChunk(res, *table_result, header, "ORC"); return res; } @@ -51,6 +63,26 @@ void ORCBlockInputFormat::resetParser() IInputFormat::resetParser(); file_reader.reset(); + include_indices.clear(); + stripe_current = 0; +} + +void ORCBlockInputFormat::prepareReader() +{ + THROW_ARROW_NOT_OK(arrow::adapters::orc::ORCFileReader::Open(asArrowFile(in), arrow::default_memory_pool(), &file_reader)); + stripe_total = file_reader->NumberOfStripes(); + stripe_current = 0; + + std::shared_ptr schema; + THROW_ARROW_NOT_OK(file_reader->ReadSchema(&schema)); + + for (int i = 0; i < schema->num_fields(); ++i) + { + if (getPort().getHeader().has(schema->field(i)->name())) + { + include_indices.push_back(i+1); + } + } } void registerInputFormatProcessorORC(FormatFactory &factory) diff --git a/src/Processors/Formats/Impl/ORCBlockInputFormat.h b/src/Processors/Formats/Impl/ORCBlockInputFormat.h 
index cff42560366..0c78290f3cc 100644 --- a/src/Processors/Formats/Impl/ORCBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ORCBlockInputFormat.h @@ -25,6 +25,15 @@ private: // TODO: check that this class implements every part of its parent std::unique_ptr file_reader; + + int stripe_total = 0; + + int stripe_current = 0; + + // indices of columns to read from ORC file + std::vector include_indices; + + void prepareReader(); }; } diff --git a/src/Processors/Pipe.cpp b/src/Processors/Pipe.cpp index 677956db299..044975448ad 100644 --- a/src/Processors/Pipe.cpp +++ b/src/Processors/Pipe.cpp @@ -8,6 +8,7 @@ #include #include #include +#include namespace DB { @@ -250,6 +251,47 @@ static Pipes removeEmptyPipes(Pipes pipes) return res; } +/// Calculate common header for pipes. +/// This function is needed only to remove ColumnConst from common header in case if some columns are const, and some not. +/// E.g. if the first header is `x, const y, const z` and the second is `const x, y, const z`, the common header will be `x, y, const z`. +static Block getCommonHeader(const Pipes & pipes) +{ + Block res; + + for (const auto & pipe : pipes) + { + if (const auto & header = pipe.getHeader()) + { + res = header; + break; + } + } + + for (const auto & pipe : pipes) + { + const auto & header = pipe.getHeader(); + for (size_t i = 0; i < res.columns(); ++i) + { + /// We do not check that headers are compatible here. Will do it later. + + if (i >= header.columns()) + break; + + auto & common = res.getByPosition(i).column; + const auto & cur = header.getByPosition(i).column; + + /// Only remove const from common header if it is not const for current pipe. + if (cur && common && !isColumnConst(*cur)) + { + if (const auto * column_const = typeid_cast(common.get())) + common = column_const->getDataColumnPtr(); + } + } + } + + return res; +} + Pipe Pipe::unitePipes(Pipes pipes) { return Pipe::unitePipes(std::move(pipes), nullptr, false); @@ -276,23 +318,12 @@ Pipe Pipe::unitePipes(Pipes pipes, Processors * collected_processors, bool allow OutputPortRawPtrs totals; OutputPortRawPtrs extremes; res.collected_processors = collected_processors; - res.header = pipes.front().header; - if (allow_empty_header && !res.header) - { - for (const auto & pipe : pipes) - { - if (const auto & header = pipe.getHeader()) - { - res.header = header; - break; - } - } - } + res.header = getCommonHeader(pipes); for (auto & pipe : pipes) { if (!allow_empty_header || pipe.header) - assertBlocksHaveEqualStructure(res.header, pipe.header, "Pipe::unitePipes"); + assertCompatibleHeader(pipe.header, res.header, "Pipe::unitePipes"); res.processors.insert(res.processors.end(), pipe.processors.begin(), pipe.processors.end()); res.output_ports.insert(res.output_ports.end(), pipe.output_ports.begin(), pipe.output_ports.end()); diff --git a/src/Processors/Port.cpp b/src/Processors/Port.cpp index 7e7ccb1adad..0a6026b27f2 100644 --- a/src/Processors/Port.cpp +++ b/src/Processors/Port.cpp @@ -16,7 +16,7 @@ void connect(OutputPort & output, InputPort & input) auto out_name = output.getProcessor().getName(); auto in_name = input.getProcessor().getName(); - assertBlocksHaveEqualStructure(input.getHeader(), output.getHeader(), " function connect between " + out_name + " and " + in_name); + assertCompatibleHeader(output.getHeader(), input.getHeader(), " function connect between " + out_name + " and " + in_name); input.output_port = &output; output.input_port = &input; diff --git a/src/Processors/QueryPipeline.cpp b/src/Processors/QueryPipeline.cpp index 
cabf5f19190..1b803ec0886 100644 --- a/src/Processors/QueryPipeline.cpp +++ b/src/Processors/QueryPipeline.cpp @@ -232,8 +232,6 @@ QueryPipeline QueryPipeline::unitePipelines( pipeline.checkInitialized(); pipeline.pipe.collected_processors = collected_processors; - assertBlocksHaveEqualStructure(pipeline.getHeader(), common_header, "QueryPipeline::unitePipelines"); - pipes.emplace_back(std::move(pipeline.pipe)); max_threads += pipeline.max_threads; diff --git a/src/Processors/QueryPlan/IQueryPlanStep.h b/src/Processors/QueryPlan/IQueryPlanStep.h index 8211b52a6c4..2974891e2bf 100644 --- a/src/Processors/QueryPlan/IQueryPlanStep.h +++ b/src/Processors/QueryPlan/IQueryPlanStep.h @@ -99,6 +99,9 @@ public: /// Get detailed description of step actions. This is shown in EXPLAIN query with options `actions = 1`. virtual void describeActions(FormatSettings & /*settings*/) const {} + /// Get detailed description of read-from-storage step indexes (if any). Shown in with options `indexes = 1`. + virtual void describeIndexes(FormatSettings & /*settings*/) const {} + /// Get description of processors added in current step. Should be called after updatePipeline(). virtual void describePipeline(FormatSettings & /*settings*/) const {} diff --git a/src/Processors/QueryPlan/QueryPlan.cpp b/src/Processors/QueryPlan/QueryPlan.cpp index 974da579d0c..ad3649385fd 100644 --- a/src/Processors/QueryPlan/QueryPlan.cpp +++ b/src/Processors/QueryPlan/QueryPlan.cpp @@ -243,6 +243,9 @@ static void explainStep( if (options.actions) step.describeActions(settings); + + if (options.indexes) + step.describeIndexes(settings); } std::string debugExplainStep(const IQueryPlanStep & step) diff --git a/src/Processors/QueryPlan/QueryPlan.h b/src/Processors/QueryPlan/QueryPlan.h index bf7ed81fdc1..901d83c3ab8 100644 --- a/src/Processors/QueryPlan/QueryPlan.h +++ b/src/Processors/QueryPlan/QueryPlan.h @@ -66,6 +66,8 @@ public: bool description = true; /// Add detailed information about step actions. bool actions = false; + /// Add information about indexes actions. 
+ bool indexes = false; }; struct ExplainPipelineOptions diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp new file mode 100644 index 00000000000..ebf9c9e4121 --- /dev/null +++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp @@ -0,0 +1,249 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +ReadFromMergeTree::ReadFromMergeTree( + const MergeTreeData & storage_, + StorageMetadataPtr metadata_snapshot_, + String query_id_, + Names required_columns_, + RangesInDataParts parts_, + IndexStatPtr index_stats_, + PrewhereInfoPtr prewhere_info_, + Names virt_column_names_, + Settings settings_, + size_t num_streams_, + ReadType read_type_) + : ISourceStep(DataStream{.header = MergeTreeBaseSelectProcessor::transformHeader( + metadata_snapshot_->getSampleBlockForColumns(required_columns_, storage_.getVirtuals(), storage_.getStorageID()), + prewhere_info_, + virt_column_names_)}) + , storage(storage_) + , metadata_snapshot(std::move(metadata_snapshot_)) + , query_id(std::move(query_id_)) + , required_columns(std::move(required_columns_)) + , parts(std::move(parts_)) + , index_stats(std::move(index_stats_)) + , prewhere_info(std::move(prewhere_info_)) + , virt_column_names(std::move(virt_column_names_)) + , settings(std::move(settings_)) + , num_streams(num_streams_) + , read_type(read_type_) +{ +} + +Pipe ReadFromMergeTree::readFromPool() +{ + Pipes pipes; + size_t sum_marks = 0; + size_t total_rows = 0; + + for (const auto & part : parts) + { + sum_marks += part.getMarksCount(); + total_rows += part.getRowsCount(); + } + + auto pool = std::make_shared( + num_streams, + sum_marks, + settings.min_marks_for_concurrent_read, + std::move(parts), + storage, + metadata_snapshot, + prewhere_info, + true, + required_columns, + settings.backoff_settings, + settings.preferred_block_size_bytes, + false); + + auto * logger = &Poco::Logger::get(storage.getLogName() + " (SelectExecutor)"); + LOG_DEBUG(logger, "Reading approx. {} rows with {} streams", total_rows, num_streams); + + for (size_t i = 0; i < num_streams; ++i) + { + auto source = std::make_shared( + i, pool, settings.min_marks_for_concurrent_read, settings.max_block_size, + settings.preferred_block_size_bytes, settings.preferred_max_column_in_block_size_bytes, + storage, metadata_snapshot, settings.use_uncompressed_cache, + prewhere_info, settings.reader_settings, virt_column_names); + + if (i == 0) + { + /// Set the approximate number of rows for the first source only + source->addTotalRowsApprox(total_rows); + } + + pipes.emplace_back(std::move(source)); + } + + return Pipe::unitePipes(std::move(pipes)); +} + +template +ProcessorPtr ReadFromMergeTree::createSource(const RangesInDataPart & part) +{ + return std::make_shared( + storage, metadata_snapshot, part.data_part, settings.max_block_size, settings.preferred_block_size_bytes, + settings.preferred_max_column_in_block_size_bytes, required_columns, part.ranges, settings.use_uncompressed_cache, + prewhere_info, true, settings.reader_settings, virt_column_names, part.part_index_in_query); +} + +Pipe ReadFromMergeTree::readInOrder() +{ + Pipes pipes; + for (const auto & part : parts) + { + auto source = read_type == ReadType::InReverseOrder + ? 
createSource(part) + : createSource(part); + + pipes.emplace_back(std::move(source)); + } + + auto pipe = Pipe::unitePipes(std::move(pipes)); + + if (read_type == ReadType::InReverseOrder) + { + pipe.addSimpleTransform([&](const Block & header) + { + return std::make_shared(header); + }); + } + + return pipe; +} + +Pipe ReadFromMergeTree::read() +{ + if (read_type == ReadType::Default && num_streams > 1) + return readFromPool(); + + auto pipe = readInOrder(); + + /// Use ConcatProcessor to concat sources together. + /// It is needed to read in parts order (and so in PK order) if single thread is used. + if (read_type == ReadType::Default && pipe.numOutputPorts() > 1) + pipe.addTransform(std::make_shared(pipe.getHeader(), pipe.numOutputPorts())); + + return pipe; +} + +void ReadFromMergeTree::initializePipeline(QueryPipeline & pipeline, const BuildQueryPipelineSettings &) +{ + Pipe pipe = read(); + + for (const auto & processor : pipe.getProcessors()) + processors.emplace_back(processor); + + // Attach QueryIdHolder if needed + if (!query_id.empty()) + pipe.addQueryIdHolder(std::make_shared(query_id, storage)); + + pipeline.init(std::move(pipe)); +} + +static const char * indexTypeToString(ReadFromMergeTree::IndexType type) +{ + switch (type) + { + case ReadFromMergeTree::IndexType::None: + return "None"; + case ReadFromMergeTree::IndexType::MinMax: + return "MinMax"; + case ReadFromMergeTree::IndexType::Partition: + return "Partition"; + case ReadFromMergeTree::IndexType::PrimaryKey: + return "PrimaryKey"; + case ReadFromMergeTree::IndexType::Skip: + return "Skip"; + } + + __builtin_unreachable(); +} + +static const char * readTypeToString(ReadFromMergeTree::ReadType type) +{ + switch (type) + { + case ReadFromMergeTree::ReadType::Default: + return "Default"; + case ReadFromMergeTree::ReadType::InOrder: + return "InOrder"; + case ReadFromMergeTree::ReadType::InReverseOrder: + return "InReverseOrder"; + } + + __builtin_unreachable(); +} + +void ReadFromMergeTree::describeActions(FormatSettings & format_settings) const +{ + std::string prefix(format_settings.offset, format_settings.indent_char); + format_settings.out << prefix << "ReadType: " << readTypeToString(read_type) << '\n'; + + if (index_stats && !index_stats->empty()) + { + format_settings.out << prefix << "Parts: " << index_stats->back().num_parts_after << '\n'; + format_settings.out << prefix << "Granules: " << index_stats->back().num_granules_after << '\n'; + } +} + +void ReadFromMergeTree::describeIndexes(FormatSettings & format_settings) const +{ + std::string prefix(format_settings.offset, format_settings.indent_char); + if (index_stats && !index_stats->empty()) + { + std::string indent(format_settings.indent, format_settings.indent_char); + + /// Do not print anything if no indexes is applied. 
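// Editorial sketch of what this block prints for EXPLAIN ... indexes = 1
// (index names, conditions and part/granule counts below are assumed, not taken from the patch):
//     Indexes:
//       MinMax
//         Keys:
//           d
//         Condition: (d in ['2021-01-01', +Inf))
//         Parts: 2/5
//         Granules: 12/40
//       PrimaryKey
//         Keys:
//           id
//         Condition: (id in [10, +Inf))
//         Parts: 1/2
//         Granules: 3/12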
+ if (index_stats->size() > 1 || index_stats->front().type != IndexType::None) + format_settings.out << prefix << "Indexes:\n"; + + for (size_t i = 0; i < index_stats->size(); ++i) + { + const auto & stat = (*index_stats)[i]; + if (stat.type == IndexType::None) + continue; + + format_settings.out << prefix << indent << indexTypeToString(stat.type) << '\n'; + + if (!stat.name.empty()) + format_settings.out << prefix << indent << indent << "Name: " << stat.name << '\n'; + + if (!stat.description.empty()) + format_settings.out << prefix << indent << indent << "Description: " << stat.description << '\n'; + + if (!stat.used_keys.empty()) + { + format_settings.out << prefix << indent << indent << "Keys: " << stat.name << '\n'; + for (const auto & used_key : stat.used_keys) + format_settings.out << prefix << indent << indent << indent << used_key << '\n'; + } + + if (!stat.condition.empty()) + format_settings.out << prefix << indent << indent << "Condition: " << stat.condition << '\n'; + + format_settings.out << prefix << indent << indent << "Parts: " << stat.num_parts_after; + if (i) + format_settings.out << '/' << (*index_stats)[i - 1].num_parts_after; + format_settings.out << '\n'; + + format_settings.out << prefix << indent << indent << "Granules: " << stat.num_granules_after; + if (i) + format_settings.out << '/' << (*index_stats)[i - 1].num_granules_after; + format_settings.out << '\n'; + } + } +} + +} diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.h b/src/Processors/QueryPlan/ReadFromMergeTree.h new file mode 100644 index 00000000000..1d6a4491588 --- /dev/null +++ b/src/Processors/QueryPlan/ReadFromMergeTree.h @@ -0,0 +1,113 @@ +#pragma once +#include +#include +#include +#include + +namespace DB +{ + +/// This step is created to read from MergeTree* table. +/// For now, it takes a list of parts and creates source from it. +class ReadFromMergeTree final : public ISourceStep +{ +public: + + enum class IndexType + { + None, + MinMax, + Partition, + PrimaryKey, + Skip, + }; + + /// This is a struct with information about applied indexes. + /// Is used for introspection only, in EXPLAIN query. + struct IndexStat + { + IndexType type; + std::string name; + std::string description; + std::string condition; + std::vector used_keys; + size_t num_parts_after; + size_t num_granules_after; + }; + + using IndexStats = std::vector; + using IndexStatPtr = std::unique_ptr; + + /// Part of settings which are needed for reading. + struct Settings + { + UInt64 max_block_size; + size_t preferred_block_size_bytes; + size_t preferred_max_column_in_block_size_bytes; + size_t min_marks_for_concurrent_read; + bool use_uncompressed_cache; + + MergeTreeReaderSettings reader_settings; + MergeTreeReadPool::BackoffSettings backoff_settings; + }; + + enum class ReadType + { + /// By default, read will use MergeTreeReadPool and return pipe with num_streams outputs. + /// If num_streams == 1, will read without pool, in order specified in parts. + Default, + /// Read in sorting key order. + /// Returned pipe will have the number of ports equals to parts.size(). + /// Parameter num_streams_ is ignored in this case. + /// User should add MergingSorted itself if needed. + InOrder, + /// The same as InOrder, but in reverse order. + /// For every part, read ranges and granules from end to begin. Also add ReverseTransform. 
+ InReverseOrder, + }; + + ReadFromMergeTree( + const MergeTreeData & storage_, + StorageMetadataPtr metadata_snapshot_, + String query_id_, + Names required_columns_, + RangesInDataParts parts_, + IndexStatPtr index_stats_, + PrewhereInfoPtr prewhere_info_, + Names virt_column_names_, + Settings settings_, + size_t num_streams_, + ReadType read_type_ + ); + + String getName() const override { return "ReadFromMergeTree"; } + + void initializePipeline(QueryPipeline & pipeline, const BuildQueryPipelineSettings &) override; + + void describeActions(FormatSettings & format_settings) const override; + void describeIndexes(FormatSettings & format_settings) const override; + +private: + const MergeTreeData & storage; + StorageMetadataPtr metadata_snapshot; + String query_id; + + Names required_columns; + RangesInDataParts parts; + IndexStatPtr index_stats; + PrewhereInfoPtr prewhere_info; + Names virt_column_names; + Settings settings; + + size_t num_streams; + ReadType read_type; + + Pipe read(); + Pipe readFromPool(); + Pipe readInOrder(); + + template + ProcessorPtr createSource(const RangesInDataPart & part); +}; + +} diff --git a/src/Processors/QueryPlan/ReverseRowsStep.cpp b/src/Processors/QueryPlan/ReverseRowsStep.cpp deleted file mode 100644 index 0a2e9f20cd9..00000000000 --- a/src/Processors/QueryPlan/ReverseRowsStep.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include -#include - -namespace DB -{ - -static ITransformingStep::Traits getTraits() -{ - return ITransformingStep::Traits - { - { - .preserves_distinct_columns = true, - .returns_single_stream = false, - .preserves_number_of_streams = true, - .preserves_sorting = false, - }, - { - .preserves_number_of_rows = true, - } - }; -} - -ReverseRowsStep::ReverseRowsStep(const DataStream & input_stream_) - : ITransformingStep(input_stream_, input_stream_.header, getTraits()) -{ -} - -void ReverseRowsStep::transformPipeline(QueryPipeline & pipeline, const BuildQueryPipelineSettings &) -{ - pipeline.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header); - }); -} - -} diff --git a/src/Processors/QueryPlan/ReverseRowsStep.h b/src/Processors/QueryPlan/ReverseRowsStep.h deleted file mode 100644 index 08d7833d130..00000000000 --- a/src/Processors/QueryPlan/ReverseRowsStep.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once -#include - -namespace DB -{ - -/// Reverse rows in chunk. 
-class ReverseRowsStep : public ITransformingStep -{ -public: - explicit ReverseRowsStep(const DataStream & input_stream_); - - String getName() const override { return "ReverseRows"; } - - void transformPipeline(QueryPipeline & pipeline, const BuildQueryPipelineSettings &) override; -}; - -} diff --git a/src/Processors/ya.make b/src/Processors/ya.make index ef1ff03568e..18f285e60a2 100644 --- a/src/Processors/ya.make +++ b/src/Processors/ya.make @@ -124,9 +124,9 @@ SRCS( QueryPlan/PartialSortingStep.cpp QueryPlan/QueryIdHolder.cpp QueryPlan/QueryPlan.cpp + QueryPlan/ReadFromMergeTree.cpp QueryPlan/ReadFromPreparedSource.cpp QueryPlan/ReadNothingStep.cpp - QueryPlan/ReverseRowsStep.cpp QueryPlan/RollupStep.cpp QueryPlan/SettingQuotaAndLimitsStep.cpp QueryPlan/TotalsHavingStep.cpp diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index c6cd74f6c6a..36bc8d0e391 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -158,6 +158,8 @@ void TCPHandler::runImpl() } Settings connection_settings = connection_context->getSettings(); + UInt64 idle_connection_timeout = connection_settings.idle_connection_timeout; + UInt64 poll_interval = connection_settings.poll_interval; sendHello(); @@ -168,10 +170,10 @@ void TCPHandler::runImpl() /// We are waiting for a packet from the client. Thus, every `poll_interval` seconds check whether we need to shut down. { Stopwatch idle_time; - while (!server.isCancelled() && !static_cast(*in).poll( - std::min(connection_settings.poll_interval, connection_settings.idle_connection_timeout) * 1000000)) + UInt64 timeout_ms = std::min(poll_interval, idle_connection_timeout) * 1000000; + while (!server.isCancelled() && !static_cast(*in).poll(timeout_ms)) { - if (idle_time.elapsedSeconds() > connection_settings.idle_connection_timeout) + if (idle_time.elapsedSeconds() > idle_connection_timeout) { LOG_TRACE(log, "Closing idle connection"); return; @@ -212,6 +214,15 @@ void TCPHandler::runImpl() if (!receivePacket()) continue; + /** If Query received, then settings in query_context has been updated + * So, update some other connection settings, for flexibility. + */ + { + const Settings & settings = query_context->getSettingsRef(); + idle_connection_timeout = settings.idle_connection_timeout; + poll_interval = settings.poll_interval; + } + /** If part_uuids got received in previous packet, trying to read again. 
*/ if (state.empty() && state.part_uuids && !receivePacket()) @@ -274,10 +285,10 @@ void TCPHandler::runImpl() if (context != query_context) throw Exception("Unexpected context in InputBlocksReader", ErrorCodes::LOGICAL_ERROR); - size_t poll_interval; + size_t poll_interval_ms; int receive_timeout; - std::tie(poll_interval, receive_timeout) = getReadTimeouts(connection_settings); - if (!readDataNext(poll_interval, receive_timeout)) + std::tie(poll_interval_ms, receive_timeout) = getReadTimeouts(connection_settings); + if (!readDataNext(poll_interval_ms, receive_timeout)) { state.block_in.reset(); state.maybe_compressed_in.reset(); @@ -300,6 +311,8 @@ void TCPHandler::runImpl() /// Processing Query state.io = executeQuery(state.query, query_context, false, state.stage, may_have_embedded_data); + unknown_packet_in_send_data = query_context->getSettingsRef().unknown_packet_in_send_data; + after_check_cancelled.restart(); after_send_progress.restart(); @@ -985,8 +998,6 @@ bool TCPHandler::receivePacket() switch (packet_type) { - case Protocol::Client::ReadTaskResponse: - throw Exception("ReadTaskResponse must be received only after requesting in callback", ErrorCodes::LOGICAL_ERROR); case Protocol::Client::IgnoredPartUUIDs: /// Part uuids packet if any comes before query. receiveIgnoredPartUUIDs(); @@ -1463,6 +1474,14 @@ void TCPHandler::sendData(const Block & block) try { + /// For testing hedged requests + if (unknown_packet_in_send_data) + { + --unknown_packet_in_send_data; + if (unknown_packet_in_send_data == 0) + writeVarUInt(UInt64(-1), *out); + } + writeVarUInt(Protocol::Server::Data, *out); /// Send external table name (empty name is the main table) writeStringBinary("", *out); diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 708d21c8251..ce0a4cee3ff 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -135,6 +135,8 @@ private: ContextPtr connection_context; ContextPtr query_context; + size_t unknown_packet_in_send_data = 0; + /// Streams for reading/writing from/to client connection socket. 
std::shared_ptr in; std::shared_ptr out; diff --git a/src/Storages/HDFS/HDFSCommon.cpp b/src/Storages/HDFS/HDFSCommon.cpp index e5ec8a06139..40f52921008 100644 --- a/src/Storages/HDFS/HDFSCommon.cpp +++ b/src/Storages/HDFS/HDFSCommon.cpp @@ -9,14 +9,15 @@ #include #include + namespace DB { namespace ErrorCodes { -extern const int BAD_ARGUMENTS; -extern const int NETWORK_ERROR; -extern const int EXCESSIVE_ELEMENT_IN_CONFIG; -extern const int NO_ELEMENTS_IN_CONFIG; + extern const int BAD_ARGUMENTS; + extern const int NETWORK_ERROR; + extern const int EXCESSIVE_ELEMENT_IN_CONFIG; + extern const int NO_ELEMENTS_IN_CONFIG; } const String HDFSBuilderWrapper::CONFIG_PREFIX = "hdfs"; diff --git a/src/Storages/HDFS/HDFSCommon.h b/src/Storages/HDFS/HDFSCommon.h index fa1ca88464e..154c253a76b 100644 --- a/src/Storages/HDFS/HDFSCommon.h +++ b/src/Storages/HDFS/HDFSCommon.h @@ -17,6 +17,7 @@ namespace DB { + namespace detail { struct HDFSFsDeleter @@ -28,16 +29,14 @@ namespace detail }; } + struct HDFSFileInfo { hdfsFileInfo * file_info; int length; - HDFSFileInfo() - : file_info(nullptr) - , length(0) - { - } + HDFSFileInfo() : file_info(nullptr) , length(0) {} + HDFSFileInfo(const HDFSFileInfo & other) = delete; HDFSFileInfo(HDFSFileInfo && other) = default; HDFSFileInfo & operator=(const HDFSFileInfo & other) = delete; @@ -49,17 +48,30 @@ struct HDFSFileInfo } }; + class HDFSBuilderWrapper { - hdfsBuilder * hdfs_builder; - String hadoop_kerberos_keytab; - String hadoop_kerberos_principal; - String hadoop_kerberos_kinit_command = "kinit"; - String hadoop_security_kerberos_ticket_cache_path; - static std::mutex kinit_mtx; +friend HDFSBuilderWrapper createHDFSBuilder(const String & uri_str, const Poco::Util::AbstractConfiguration &); - std::vector> config_stor; +static const String CONFIG_PREFIX; + +public: + HDFSBuilderWrapper() : hdfs_builder(hdfsNewBuilder()) {} + + ~HDFSBuilderWrapper() { hdfsFreeBuilder(hdfs_builder); } + + HDFSBuilderWrapper(const HDFSBuilderWrapper &) = delete; + HDFSBuilderWrapper(HDFSBuilderWrapper &&) = default; + + hdfsBuilder * get() { return hdfs_builder; } + +private: + void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const String & config_path, bool isUser = false); + + String getKinitCmd(); + + void runKinit(); // hdfs builder relies on an external config data storage std::pair& keep(const String & k, const String & v) @@ -67,48 +79,24 @@ class HDFSBuilderWrapper return config_stor.emplace_back(std::make_pair(k, v)); } + hdfsBuilder * hdfs_builder; + String hadoop_kerberos_keytab; + String hadoop_kerberos_principal; + String hadoop_kerberos_kinit_command = "kinit"; + String hadoop_security_kerberos_ticket_cache_path; + + static std::mutex kinit_mtx; + std::vector> config_stor; bool need_kinit{false}; - - static const String CONFIG_PREFIX; - -private: - - void loadFromConfig(const Poco::Util::AbstractConfiguration & config, const String & config_path, bool isUser = false); - - String getKinitCmd(); - - void runKinit(); - -public: - - hdfsBuilder * - get() - { - return hdfs_builder; - } - - HDFSBuilderWrapper() - : hdfs_builder(hdfsNewBuilder()) - { - } - - ~HDFSBuilderWrapper() - { - hdfsFreeBuilder(hdfs_builder); - - } - - HDFSBuilderWrapper(const HDFSBuilderWrapper &) = delete; - HDFSBuilderWrapper(HDFSBuilderWrapper &&) = default; - - friend HDFSBuilderWrapper createHDFSBuilder(const String & uri_str, const Poco::Util::AbstractConfiguration &); }; using HDFSFSPtr = std::unique_ptr, detail::HDFSFsDeleter>; + // set read/connect timeout, default 
value in libhdfs3 is about 1 hour, and too large /// TODO Allow to tune from query Settings. HDFSBuilderWrapper createHDFSBuilder(const String & uri_str, const Poco::Util::AbstractConfiguration &); HDFSFSPtr createHDFSFS(hdfsBuilder * builder); + } #endif diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.cpp b/src/Storages/HDFS/ReadBufferFromHDFS.cpp index affb76314b1..29ea46c7590 100644 --- a/src/Storages/HDFS/ReadBufferFromHDFS.cpp +++ b/src/Storages/HDFS/ReadBufferFromHDFS.cpp @@ -8,6 +8,7 @@ namespace DB { + namespace ErrorCodes { extern const int NETWORK_ERROR; @@ -21,34 +22,39 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl /// HDFS create/open functions are not thread safe static std::mutex hdfs_init_mutex; - std::string hdfs_uri; + String hdfs_uri; + String hdfs_file_path; + hdfsFile fin; HDFSBuilderWrapper builder; HDFSFSPtr fs; - ReadBufferFromHDFSImpl(const std::string & hdfs_name_, + explicit ReadBufferFromHDFSImpl( + const std::string & hdfs_uri_, + const std::string & hdfs_file_path_, const Poco::Util::AbstractConfiguration & config_) - : hdfs_uri(hdfs_name_), - builder(createHDFSBuilder(hdfs_uri, config_)) + : hdfs_uri(hdfs_uri_) + , hdfs_file_path(hdfs_file_path_) + , builder(createHDFSBuilder(hdfs_uri_, config_)) { std::lock_guard lock(hdfs_init_mutex); fs = createHDFSFS(builder.get()); - const size_t begin_of_path = hdfs_uri.find('/', hdfs_uri.find("//") + 2); - const std::string path = hdfs_uri.substr(begin_of_path); - fin = hdfsOpenFile(fs.get(), path.c_str(), O_RDONLY, 0, 0, 0); + fin = hdfsOpenFile(fs.get(), hdfs_file_path.c_str(), O_RDONLY, 0, 0, 0); if (fin == nullptr) - throw Exception("Unable to open HDFS file: " + path + " error: " + std::string(hdfsGetLastError()), - ErrorCodes::CANNOT_OPEN_FILE); + throw Exception(ErrorCodes::CANNOT_OPEN_FILE, + "Unable to open HDFS file: {}. Error: {}", + hdfs_uri + hdfs_file_path, std::string(hdfsGetLastError())); } int read(char * start, size_t size) const { int bytes_read = hdfsRead(fs.get(), fin, start, size); if (bytes_read < 0) - throw Exception("Fail to read HDFS file: " + hdfs_uri + " " + std::string(hdfsGetLastError()), - ErrorCodes::NETWORK_ERROR); + throw Exception(ErrorCodes::NETWORK_ERROR, + "Fail to read from HDFS: {}, file path: {}. 
Error: {}", + hdfs_uri, hdfs_file_path, std::string(hdfsGetLastError())); return bytes_read; } @@ -62,11 +68,13 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl std::mutex ReadBufferFromHDFS::ReadBufferFromHDFSImpl::hdfs_init_mutex; -ReadBufferFromHDFS::ReadBufferFromHDFS(const std::string & hdfs_name_, - const Poco::Util::AbstractConfiguration & config_, - size_t buf_size_) +ReadBufferFromHDFS::ReadBufferFromHDFS( + const String & hdfs_uri_, + const String & hdfs_file_path_, + const Poco::Util::AbstractConfiguration & config_, + size_t buf_size_) : BufferWithOwnMemory(buf_size_) - , impl(std::make_unique(hdfs_name_, config_)) + , impl(std::make_unique(hdfs_uri_, hdfs_file_path_, config_)) { } diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.h b/src/Storages/HDFS/ReadBufferFromHDFS.h index 8d26c001b2e..bd14e3d3792 100644 --- a/src/Storages/HDFS/ReadBufferFromHDFS.h +++ b/src/Storages/HDFS/ReadBufferFromHDFS.h @@ -7,11 +7,8 @@ #include #include #include - #include - #include - #include @@ -22,13 +19,19 @@ namespace DB */ class ReadBufferFromHDFS : public BufferWithOwnMemory { - struct ReadBufferFromHDFSImpl; - std::unique_ptr impl; +struct ReadBufferFromHDFSImpl; + public: - ReadBufferFromHDFS(const std::string & hdfs_name_, const Poco::Util::AbstractConfiguration &, size_t buf_size_ = DBMS_DEFAULT_BUFFER_SIZE); + ReadBufferFromHDFS(const String & hdfs_uri_, const String & hdfs_file_path_, + const Poco::Util::AbstractConfiguration &, size_t buf_size_ = DBMS_DEFAULT_BUFFER_SIZE); + ~ReadBufferFromHDFS() override; bool nextImpl() override; + +private: + std::unique_ptr impl; }; } + #endif diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 392495ff77f..ad2a63c44b1 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -122,7 +122,7 @@ public: current_path = uri + path; auto compression = chooseCompressionMethod(path, compression_method); - auto read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(current_path, getContext()->getGlobalContext()->getConfigRef()), compression); + auto read_buf = wrapReadBufferWithCompressionMethod(std::make_unique(uri, path, getContext()->getGlobalContext()->getConfigRef()), compression); auto input_format = FormatFactory::instance().getInput(format, *read_buf, sample_block, getContext(), max_block_size); auto input_stream = std::make_shared(input_format); @@ -271,7 +271,15 @@ Pipe StorageHDFS::read( size_t max_block_size, unsigned num_streams) { - const size_t begin_of_path = uri.find('/', uri.find("//") + 2); + size_t begin_of_path; + /// This uri is checked for correctness in constructor of StorageHDFS and never modified afterwards + auto two_slash = uri.find("//"); + + if (two_slash == std::string::npos) + begin_of_path = uri.find('/'); + else + begin_of_path = uri.find('/', two_slash + 2); + const String path_from_uri = uri.substr(begin_of_path); const String uri_without_path = uri.substr(0, begin_of_path); @@ -281,6 +289,9 @@ Pipe StorageHDFS::read( auto sources_info = std::make_shared(); sources_info->uris = LSWithRegexpMatching("/", fs, path_from_uri); + if (sources_info->uris.empty()) + LOG_WARNING(log, "No file in HDFS matches the path: {}", uri); + for (const auto & column : column_names) { if (column == "_path") diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index 0dd57685354..e3f235296ac 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -42,7 +42,7 @@ protected: const String & 
compression_method_); private: - String uri; + const String uri; String format_name; String compression_method; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 71a99aa2a87..36032f9208f 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1107,13 +1107,13 @@ void IMergeTreeDataPart::remove(bool keep_s3) const { /// Remove each expected file in directory, then remove directory itself. - #if !__clang__ + #if !defined(__clang__) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wunused-variable" #endif for (const auto & [file, _] : checksums.files) volume->getDisk()->removeSharedFile(to + "/" + file, keep_s3); - #if !__clang__ + #if !defined(__clang__) # pragma GCC diagnostic pop #endif diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index e4c9f0ae0cc..43419f9ce5e 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -938,6 +938,9 @@ public: return func->getMonotonicityForRange(type, left, right); } + Kind getKind() const { return kind; } + const ColumnWithTypeAndName & getConstArg() const { return const_arg; } + private: FunctionBasePtr func; ColumnWithTypeAndName const_arg; @@ -1308,6 +1311,235 @@ String KeyCondition::toString() const return res; } +KeyCondition::Description KeyCondition::getDescription() const +{ + /// This code may seem to be too difficult. + /// Here we want to convert RPN back to tree, and also simplify some logical expressions like `and(x, true) -> x`. + Description description; + + /// That's a binary tree. Explicit. + /// Build and optimize it simultaneously. + struct Node + { + enum class Type + { + /// Leaf, which is RPNElement. + Leaf, + /// Leafs, which are logical constants. + True, + False, + /// Binary operators. + And, + Or, + }; + + Type type; + + /// Only for Leaf + const RPNElement * element = nullptr; + /// This means that logical NOT is applied to leaf. + bool negate = false; + + std::unique_ptr left = nullptr; + std::unique_ptr right = nullptr; + }; + + /// The algorithm is the same as in KeyCondition::checkInHyperrectangle + /// We build a pair of trees on stack. For checking if key condition may be true, and if it may be false. + /// We need only `can_be_true` in result. + struct Frame + { + std::unique_ptr can_be_true; + std::unique_ptr can_be_false; + }; + + /// Combine two subtrees using logical operator. + auto combine = [](std::unique_ptr left, std::unique_ptr right, Node::Type type) + { + /// Simplify operators with for one constant condition. 
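// Summary of the simplifications implemented just below (editorial note, mirroring the code):
//     and(false, x) -> false      and(true, x) -> x
//     or(true, x)   -> true       or(false, x) -> x
// with the mirrored right-hand cases handled the same way; any other combination
// keeps both children under a new And/Or node.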
+ + if (type == Node::Type::And) + { + /// false AND right + if (left->type == Node::Type::False) + return left; + + /// left AND false + if (right->type == Node::Type::False) + return right; + + /// true AND right + if (left->type == Node::Type::True) + return right; + + /// left AND true + if (right->type == Node::Type::True) + return left; + } + + if (type == Node::Type::Or) + { + /// false OR right + if (left->type == Node::Type::False) + return right; + + /// left OR false + if (right->type == Node::Type::False) + return left; + + /// true OR right + if (left->type == Node::Type::True) + return left; + + /// left OR true + if (right->type == Node::Type::True) + return right; + } + + return std::make_unique(Node{ + .type = type, + .left = std::move(left), + .right = std::move(right) + }); + }; + + std::vector rpn_stack; + for (const auto & element : rpn) + { + if (element.function == RPNElement::FUNCTION_UNKNOWN) + { + auto can_be_true = std::make_unique(Node{.type = Node::Type::True}); + auto can_be_false = std::make_unique(Node{.type = Node::Type::True}); + rpn_stack.emplace_back(Frame{.can_be_true = std::move(can_be_true), .can_be_false = std::move(can_be_false)}); + } + else if ( + element.function == RPNElement::FUNCTION_IN_RANGE + || element.function == RPNElement::FUNCTION_NOT_IN_RANGE + || element.function == RPNElement::FUNCTION_IN_SET + || element.function == RPNElement::FUNCTION_NOT_IN_SET) + { + auto can_be_true = std::make_unique(Node{.type = Node::Type::Leaf, .element = &element, .negate = false}); + auto can_be_false = std::make_unique(Node{.type = Node::Type::Leaf, .element = &element, .negate = true}); + rpn_stack.emplace_back(Frame{.can_be_true = std::move(can_be_true), .can_be_false = std::move(can_be_false)}); + } + else if (element.function == RPNElement::FUNCTION_NOT) + { + assert(!rpn_stack.empty()); + + std::swap(rpn_stack.back().can_be_true, rpn_stack.back().can_be_false); + } + else if (element.function == RPNElement::FUNCTION_AND) + { + assert(!rpn_stack.empty()); + auto arg1 = std::move(rpn_stack.back()); + + rpn_stack.pop_back(); + + assert(!rpn_stack.empty()); + auto arg2 = std::move(rpn_stack.back()); + + Frame frame; + frame.can_be_true = combine(std::move(arg1.can_be_true), std::move(arg2.can_be_true), Node::Type::And); + frame.can_be_false = combine(std::move(arg1.can_be_false), std::move(arg2.can_be_false), Node::Type::Or); + + rpn_stack.back() = std::move(frame); + } + else if (element.function == RPNElement::FUNCTION_OR) + { + assert(!rpn_stack.empty()); + auto arg1 = std::move(rpn_stack.back()); + + rpn_stack.pop_back(); + + assert(!rpn_stack.empty()); + auto arg2 = std::move(rpn_stack.back()); + + Frame frame; + frame.can_be_true = combine(std::move(arg1.can_be_true), std::move(arg2.can_be_true), Node::Type::Or); + frame.can_be_false = combine(std::move(arg1.can_be_false), std::move(arg2.can_be_false), Node::Type::And); + + rpn_stack.back() = std::move(frame); + } + else if (element.function == RPNElement::ALWAYS_FALSE) + { + auto can_be_true = std::make_unique(Node{.type = Node::Type::False}); + auto can_be_false = std::make_unique(Node{.type = Node::Type::True}); + + rpn_stack.emplace_back(Frame{.can_be_true = std::move(can_be_true), .can_be_false = std::move(can_be_false)}); + } + else if (element.function == RPNElement::ALWAYS_TRUE) + { + auto can_be_true = std::make_unique(Node{.type = Node::Type::True}); + auto can_be_false = std::make_unique(Node{.type = Node::Type::False}); + rpn_stack.emplace_back(Frame{.can_be_true = 
std::move(can_be_true), .can_be_false = std::move(can_be_false)}); + } + else + throw Exception("Unexpected function type in KeyCondition::RPNElement", ErrorCodes::LOGICAL_ERROR); + } + + if (rpn_stack.size() != 1) + throw Exception("Unexpected stack size in KeyCondition::checkInRange", ErrorCodes::LOGICAL_ERROR); + + std::vector key_names(key_columns.size()); + std::vector is_key_used(key_columns.size(), false); + + for (const auto & key : key_columns) + key_names[key.second] = key.first; + + WriteBufferFromOwnString buf; + + std::function describe; + describe = [&describe, &key_names, &is_key_used, &buf](const Node * node) + { + switch (node->type) + { + case Node::Type::Leaf: + { + is_key_used[node->element->key_column] = true; + + /// Note: for condition with double negation, like `not(x not in set)`, + /// we can replace it to `x in set` here. + /// But I won't do it, because `cloneASTWithInversionPushDown` already push down `not`. + /// So, this seem to be impossible for `can_be_true` tree. + if (node->negate) + buf << "not("; + buf << node->element->toString(key_names[node->element->key_column], true); + if (node->negate) + buf << ")"; + break; + } + case Node::Type::True: + buf << "true"; + break; + case Node::Type::False: + buf << "false"; + break; + case Node::Type::And: + buf << "and("; + describe(node->left.get()); + buf << ", "; + describe(node->right.get()); + buf << ")"; + break; + case Node::Type::Or: + buf << "or("; + describe(node->left.get()); + buf << ", "; + describe(node->right.get()); + buf << ")"; + break; + } + }; + + describe(rpn_stack.front().can_be_true.get()); + description.condition = std::move(buf.str()); + + for (size_t i = 0; i < key_names.size(); ++i) + if (is_key_used[i]) + description.used_keys.emplace_back(key_names[i]); + + return description; +} /** Index is the value of key every `index_granularity` rows. * This value is called a "mark". That is, the index consists of marks. 
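As an editorial illustration of the Description built above (the table, key and filter are assumed examples, and the exact rendering may differ): for a primary key (CounterID, Date) and a filter CounterID = 111 AND Date >= '2021-01-01', the result would contain roughly

    used_keys: CounterID, Date
    condition: and((CounterID in [111, 111]), (Date in ['2021-01-01', +Inf)))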
@@ -1733,18 +1965,38 @@ bool KeyCondition::mayBeTrueAfter( return checkInRange(used_key_size, left_key, nullptr, data_types, false, BoolMask::consider_only_can_be_true).can_be_true; } - -String KeyCondition::RPNElement::toString() const +String KeyCondition::RPNElement::toString() const { return toString("column " + std::to_string(key_column), false); } +String KeyCondition::RPNElement::toString(const std::string_view & column_name, bool print_constants) const { - auto print_wrapped_column = [this](WriteBuffer & buf) + auto print_wrapped_column = [this, &column_name, print_constants](WriteBuffer & buf) { for (auto it = monotonic_functions_chain.rbegin(); it != monotonic_functions_chain.rend(); ++it) + { buf << (*it)->getName() << "("; + if (print_constants) + { + if (const auto * func = typeid_cast(it->get())) + { + if (func->getKind() == FunctionWithOptionalConstArg::Kind::LEFT_CONST) + buf << applyVisitor(FieldVisitorToString(), (*func->getConstArg().column)[0]) << ", "; + } + } + } - buf << "column " << key_column; + buf << column_name; for (auto it = monotonic_functions_chain.rbegin(); it != monotonic_functions_chain.rend(); ++it) + { + if (print_constants) + { + if (const auto * func = typeid_cast(it->get())) + { + if (func->getKind() == FunctionWithOptionalConstArg::Kind::RIGHT_CONST) + buf << ", " << applyVisitor(FieldVisitorToString(), (*func->getConstArg().column)[0]); + } + } buf << ")"; + } }; WriteBufferFromOwnString buf; diff --git a/src/Storages/MergeTree/KeyCondition.h b/src/Storages/MergeTree/KeyCondition.h index 631fb0b7cc4..bd51769ad1f 100644 --- a/src/Storages/MergeTree/KeyCondition.h +++ b/src/Storages/MergeTree/KeyCondition.h @@ -293,6 +293,16 @@ public: String toString() const; + /// Condition description for EXPLAIN query. + struct Description + { + /// Which columns from PK were used, in PK order. + std::vector used_keys; + /// Condition which was applied, mostly human-readable. + std::string condition; + }; + + Description getDescription() const; /** A chain of possibly monotone functions. 
* If the key column is wrapped in functions that can be monotonous in some value ranges @@ -345,6 +355,7 @@ private: : function(function_), range(range_), key_column(key_column_) {} String toString() const; + String toString(const std::string_view & column_name, bool print_constants) const; Function function = FUNCTION_UNKNOWN; diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index 6bf164dd824..41ad71c89ce 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -30,7 +30,7 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( const MergeTreeReaderSettings & reader_settings_, bool use_uncompressed_cache_, const Names & virt_column_names_) - : SourceWithProgress(getHeader(std::move(header), prewhere_info_, virt_column_names_)) + : SourceWithProgress(transformHeader(std::move(header), prewhere_info_, virt_column_names_)) , storage(storage_) , metadata_snapshot(metadata_snapshot_) , prewhere_info(prewhere_info_) @@ -370,7 +370,7 @@ void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const P } } -Block MergeTreeBaseSelectProcessor::getHeader( +Block MergeTreeBaseSelectProcessor::transformHeader( Block block, const PrewhereInfoPtr & prewhere_info, const Names & virtual_columns) { executePrewhereActions(block, prewhere_info); diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index 00ef131ae45..a4c55cbae45 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -33,6 +33,8 @@ public: ~MergeTreeBaseSelectProcessor() override; + static Block transformHeader(Block block, const PrewhereInfoPtr & prewhere_info, const Names & virtual_columns); + static void executePrewhereActions(Block & block, const PrewhereInfoPtr & prewhere_info); protected: @@ -49,8 +51,6 @@ protected: static void injectVirtualColumns(Block & block, MergeTreeReadTask * task, const Names & virtual_columns); static void injectVirtualColumns(Chunk & chunk, MergeTreeReadTask * task, const Names & virtual_columns); - static Block getHeader(Block block, const PrewhereInfoPtr & prewhere_info, const Names & virtual_columns); - void initializeRangeReaders(MergeTreeReadTask & task); protected: diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index a1691fe7931..57e8cca46cd 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { @@ -393,8 +394,9 @@ void MergeTreeDataPartWriterWide::validateColumnOfFixedSize(const String & name, throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot validate column of non fixed type {}", type.getName()); auto disk = data_part->volume->getDisk(); - String mrk_path = fullPath(disk, part_path + name + marks_file_extension); - String bin_path = fullPath(disk, part_path + name + DATA_FILE_EXTENSION); + String escaped_name = escapeForFileName(name); + String mrk_path = fullPath(disk, part_path + escaped_name + marks_file_extension); + String bin_path = fullPath(disk, part_path + escaped_name + DATA_FILE_EXTENSION); DB::ReadBufferFromFile mrk_in(mrk_path); DB::CompressedReadBufferFromFile bin_in(bin_path, 0, 0, 0, nullptr); bool must_be_last = false; diff --git 
a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 7f7370e6f1f..af72b3e53f2 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include @@ -282,11 +282,40 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( auto query_context = context->hasQueryContext() ? context->getQueryContext() : context; - if (query_context->getSettingsRef().allow_experimental_query_deduplication) - selectPartsToReadWithUUIDFilter(parts, part_values, minmax_idx_condition, minmax_columns_types, partition_pruner, max_block_numbers_to_read, query_context); - else - selectPartsToRead(parts, part_values, minmax_idx_condition, minmax_columns_types, partition_pruner, max_block_numbers_to_read); + PartFilterCounters part_filter_counters; + auto index_stats = std::make_unique(); + if (query_context->getSettingsRef().allow_experimental_query_deduplication) + selectPartsToReadWithUUIDFilter(parts, part_values, minmax_idx_condition, minmax_columns_types, partition_pruner, max_block_numbers_to_read, query_context, part_filter_counters); + else + selectPartsToRead(parts, part_values, minmax_idx_condition, minmax_columns_types, partition_pruner, max_block_numbers_to_read, part_filter_counters); + + index_stats->emplace_back(ReadFromMergeTree::IndexStat{ + .type = ReadFromMergeTree::IndexType::None, + .num_parts_after = part_filter_counters.num_initial_selected_parts, + .num_granules_after = part_filter_counters.num_initial_selected_granules}); + + if (minmax_idx_condition) + { + auto description = minmax_idx_condition->getDescription(); + index_stats->emplace_back(ReadFromMergeTree::IndexStat{ + .type = ReadFromMergeTree::IndexType::MinMax, + .condition = std::move(description.condition), + .used_keys = std::move(description.used_keys), + .num_parts_after = part_filter_counters.num_parts_after_minmax, + .num_granules_after = part_filter_counters.num_granules_after_minmax}); + } + + if (partition_pruner) + { + auto description = partition_pruner->getKeyCondition().getDescription(); + index_stats->emplace_back(ReadFromMergeTree::IndexStat{ + .type = ReadFromMergeTree::IndexType::Partition, + .condition = std::move(description.condition), + .used_keys = std::move(description.used_keys), + .num_parts_after = part_filter_counters.num_parts_after_partition_pruner, + .num_granules_after = part_filter_counters.num_granules_after_partition_pruner}); + } /// Sampling. 
Names column_names_to_read = real_column_names; @@ -568,6 +597,8 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( MergeTreeIndexConditionPtr condition; std::atomic total_granules{0}; std::atomic granules_dropped{0}; + std::atomic total_parts{0}; + std::atomic parts_dropped{0}; DataSkippingIndexAndCondition(MergeTreeIndexPtr index_, MergeTreeIndexConditionPtr condition_) : index(index_) @@ -620,6 +651,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( RangesInDataParts parts_with_ranges(parts.size()); size_t sum_marks = 0; std::atomic sum_marks_pk = 0; + std::atomic sum_parts_pk = 0; std::atomic total_marks_pk = 0; size_t sum_ranges = 0; @@ -642,25 +674,29 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( RangesInDataPart ranges(part, part_index); - total_marks_pk.fetch_add(part->index_granularity.getMarksCount(), std::memory_order_relaxed); + size_t total_marks_count = part->getMarksCount(); + if (total_marks_count && part->index_granularity.hasFinalMark()) + --total_marks_count; + + total_marks_pk.fetch_add(total_marks_count, std::memory_order_relaxed); if (metadata_snapshot->hasPrimaryKey()) ranges.ranges = markRangesFromPKRange(part, metadata_snapshot, key_condition, settings, log); - else - { - size_t total_marks_count = part->getMarksCount(); - if (total_marks_count) - { - if (part->index_granularity.hasFinalMark()) - --total_marks_count; - ranges.ranges = MarkRanges{MarkRange{0, total_marks_count}}; - } - } + else if (total_marks_count) + ranges.ranges = MarkRanges{MarkRange{0, total_marks_count}}; sum_marks_pk.fetch_add(ranges.getMarksCount(), std::memory_order_relaxed); + if (!ranges.ranges.empty()) + sum_parts_pk.fetch_add(1, std::memory_order_relaxed); + for (auto & index_and_condition : useful_indices) { + if (ranges.ranges.empty()) + break; + + index_and_condition.total_parts.fetch_add(1, std::memory_order_relaxed); + size_t total_granules = 0; size_t granules_dropped = 0; ranges.ranges = filterMarksUsingIndex( @@ -672,6 +708,9 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( index_and_condition.total_granules.fetch_add(total_granules, std::memory_order_relaxed); index_and_condition.granules_dropped.fetch_add(granules_dropped, std::memory_order_relaxed); + + if (ranges.ranges.empty()) + index_and_condition.parts_dropped.fetch_add(1, std::memory_order_relaxed); } if (!ranges.ranges.empty()) @@ -737,12 +776,34 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( parts_with_ranges.resize(next_part); } + if (metadata_snapshot->hasPrimaryKey()) + { + auto description = key_condition.getDescription(); + + index_stats->emplace_back(ReadFromMergeTree::IndexStat{ + .type = ReadFromMergeTree::IndexType::PrimaryKey, + .condition = std::move(description.condition), + .used_keys = std::move(description.used_keys), + .num_parts_after = sum_parts_pk.load(std::memory_order_relaxed), + .num_granules_after = sum_marks_pk.load(std::memory_order_relaxed)}); + } + for (const auto & index_and_condition : useful_indices) { const auto & index_name = index_and_condition.index->index.name; LOG_DEBUG(log, "Index {} has dropped {}/{} granules.", backQuote(index_name), index_and_condition.granules_dropped, index_and_condition.total_granules); + + std::string description = index_and_condition.index->index.type + + " GRANULARITY " + std::to_string(index_and_condition.index->index.granularity); + + index_stats->emplace_back(ReadFromMergeTree::IndexStat{ + .type = ReadFromMergeTree::IndexType::Skip, + .name = index_name, + .description = std::move(description), 
+ .num_parts_after = index_and_condition.total_parts - index_and_condition.parts_dropped, + .num_granules_after = index_and_condition.total_granules - index_and_condition.granules_dropped}); } LOG_DEBUG(log, "Selected {}/{} parts by partition key, {} parts by primary key, {}/{} marks by primary key, {} marks to read from {} ranges", @@ -809,6 +870,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( plan = spreadMarkRangesAmongStreamsFinal( std::move(parts_with_ranges), + std::move(index_stats), num_streams, column_names_to_read, metadata_snapshot, @@ -832,6 +894,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( plan = spreadMarkRangesAmongStreamsWithOrder( std::move(parts_with_ranges), + std::move(index_stats), num_streams, column_names_to_read, metadata_snapshot, @@ -849,6 +912,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( { plan = spreadMarkRangesAmongStreams( std::move(parts_with_ranges), + std::move(index_stats), num_streams, column_names_to_read, metadata_snapshot, @@ -960,25 +1024,9 @@ size_t minMarksForConcurrentRead( } -static QueryPlanPtr createPlanFromPipe(Pipe pipe, const String & query_id, const MergeTreeData & data, const std::string & description = "") -{ - auto plan = std::make_unique(); - - std::string storage_name = "MergeTree"; - if (!description.empty()) - storage_name += ' ' + description; - - // Attach QueryIdHolder if needed - if (!query_id.empty()) - pipe.addQueryIdHolder(std::make_shared(query_id, data)); - - auto step = std::make_unique(std::move(pipe), storage_name); - plan->addStep(std::move(step)); - return plan; -} - QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreams( RangesInDataParts && parts, + ReadFromMergeTree::IndexStatPtr index_stats, size_t num_streams, const Names & column_names, const StorageMetadataPtr & metadata_snapshot, @@ -1030,75 +1078,32 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreams( if (0 == sum_marks) return {}; + ReadFromMergeTree::Settings step_settings + { + .max_block_size = max_block_size, + .preferred_block_size_bytes = settings.preferred_block_size_bytes, + .preferred_max_column_in_block_size_bytes = settings.preferred_max_column_in_block_size_bytes, + .min_marks_for_concurrent_read = min_marks_for_concurrent_read, + .use_uncompressed_cache = use_uncompressed_cache, + .reader_settings = reader_settings, + .backoff_settings = MergeTreeReadPool::BackoffSettings(settings), + }; + if (num_streams > 1) { - /// Parallel query execution. - Pipes res; - /// Reduce the number of num_streams if the data is small. if (sum_marks < num_streams * min_marks_for_concurrent_read && parts.size() < num_streams) num_streams = std::max((sum_marks + min_marks_for_concurrent_read - 1) / min_marks_for_concurrent_read, parts.size()); - - MergeTreeReadPoolPtr pool = std::make_shared( - num_streams, - sum_marks, - min_marks_for_concurrent_read, - std::move(parts), - data, - metadata_snapshot, - query_info.prewhere_info, - true, - column_names, - MergeTreeReadPool::BackoffSettings(settings), - settings.preferred_block_size_bytes, - false); - - /// Let's estimate total number of rows for progress bar. - LOG_DEBUG(log, "Reading approx. 
{} rows with {} streams", total_rows, num_streams); - - for (size_t i = 0; i < num_streams; ++i) - { - auto source = std::make_shared( - i, pool, min_marks_for_concurrent_read, max_block_size, - settings.preferred_block_size_bytes, settings.preferred_max_column_in_block_size_bytes, - data, metadata_snapshot, use_uncompressed_cache, - query_info.prewhere_info, reader_settings, virt_columns); - - if (i == 0) - { - /// Set the approximate number of rows for the first source only - source->addTotalRowsApprox(total_rows); - } - - res.emplace_back(std::move(source)); - } - - return createPlanFromPipe(Pipe::unitePipes(std::move(res)), query_id, data); } - else - { - /// Sequential query execution. - Pipes res; - for (const auto & part : parts) - { - auto source = std::make_shared( - data, metadata_snapshot, part.data_part, max_block_size, settings.preferred_block_size_bytes, - settings.preferred_max_column_in_block_size_bytes, column_names, part.ranges, use_uncompressed_cache, - query_info.prewhere_info, true, reader_settings, virt_columns, part.part_index_in_query); + auto plan = std::make_unique(); + auto step = std::make_unique( + data, metadata_snapshot, query_id, + column_names, std::move(parts), std::move(index_stats), query_info.prewhere_info, virt_columns, + step_settings, num_streams, ReadFromMergeTree::ReadType::Default); - res.emplace_back(std::move(source)); - } - - auto pipe = Pipe::unitePipes(std::move(res)); - - /// Use ConcatProcessor to concat sources together. - /// It is needed to read in parts order (and so in PK order) if single thread is used. - if (pipe.numOutputPorts() > 1) - pipe.addTransform(std::make_shared(pipe.getHeader(), pipe.numOutputPorts())); - - return createPlanFromPipe(std::move(pipe), query_id, data); - } + plan->addStep(std::move(step)); + return plan; } static ActionsDAGPtr createProjection(const Block & header) @@ -1111,6 +1116,7 @@ static ActionsDAGPtr createProjection(const Block & header) QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsWithOrder( RangesInDataParts && parts, + ReadFromMergeTree::IndexStatPtr index_stats, size_t num_streams, const Names & column_names, const StorageMetadataPtr & metadata_snapshot, @@ -1218,8 +1224,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsWithOrder( for (size_t i = 0; i < num_streams && !parts.empty(); ++i) { size_t need_marks = min_marks_per_stream; - - Pipes pipes; + RangesInDataParts new_parts; /// Loop over parts. 
/// We will iteratively take part or some subrange of a part from the back @@ -1274,53 +1279,31 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsWithOrder( parts.emplace_back(part); } ranges_to_get_from_part = split_ranges(ranges_to_get_from_part, input_order_info->direction); - - if (input_order_info->direction == 1) - { - pipes.emplace_back(std::make_shared( - data, - metadata_snapshot, - part.data_part, - max_block_size, - settings.preferred_block_size_bytes, - settings.preferred_max_column_in_block_size_bytes, - column_names, - ranges_to_get_from_part, - use_uncompressed_cache, - query_info.prewhere_info, - true, - reader_settings, - virt_columns, - part.part_index_in_query)); - } - else - { - pipes.emplace_back(std::make_shared( - data, - metadata_snapshot, - part.data_part, - max_block_size, - settings.preferred_block_size_bytes, - settings.preferred_max_column_in_block_size_bytes, - column_names, - ranges_to_get_from_part, - use_uncompressed_cache, - query_info.prewhere_info, - true, - reader_settings, - virt_columns, - part.part_index_in_query)); - } + new_parts.emplace_back(part.data_part, part.part_index_in_query, std::move(ranges_to_get_from_part)); } - auto plan = createPlanFromPipe(Pipe::unitePipes(std::move(pipes)), query_id, data, "with order"); - - if (input_order_info->direction != 1) + ReadFromMergeTree::Settings step_settings { - auto reverse_step = std::make_unique(plan->getCurrentDataStream()); - plan->addStep(std::move(reverse_step)); - } + .max_block_size = max_block_size, + .preferred_block_size_bytes = settings.preferred_block_size_bytes, + .preferred_max_column_in_block_size_bytes = settings.preferred_max_column_in_block_size_bytes, + .min_marks_for_concurrent_read = min_marks_for_concurrent_read, + .use_uncompressed_cache = use_uncompressed_cache, + .reader_settings = reader_settings, + .backoff_settings = MergeTreeReadPool::BackoffSettings(settings), + }; + auto read_type = input_order_info->direction == 1 + ? ReadFromMergeTree::ReadType::InOrder + : ReadFromMergeTree::ReadType::InReverseOrder; + + auto plan = std::make_unique(); + auto step = std::make_unique( + data, metadata_snapshot, query_id, + column_names, std::move(new_parts), std::move(index_stats), query_info.prewhere_info, virt_columns, + step_settings, num_streams, read_type); + + plan->addStep(std::move(step)); plans.emplace_back(std::move(plan)); } @@ -1371,6 +1354,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsWithOrder( QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsFinal( RangesInDataParts && parts, + ReadFromMergeTree::IndexStatPtr index_stats, size_t num_streams, const Names & column_names, const StorageMetadataPtr & metadata_snapshot, @@ -1412,7 +1396,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsFinal( num_streams = settings.max_final_threads; /// If setting do_not_merge_across_partitions_select_final is true than we won't merge parts from different partitions. - /// We have all parts in parts vector, where parts with same partition are nerby. + /// We have all parts in parts vector, where parts with same partition are nearby. /// So we will store iterators pointed to the beginning of each partition range (and parts.end()), /// then we will create a pipe for each partition that will run selecting processor and merging processor /// for the parts with this partition. In the end we will unite all the pipes. 
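
Note on the comment block above: spreadMarkRangesAmongStreamsFinal walks a parts vector in which parts of the same partition are adjacent, records an iterator to the start of every partition range (plus parts.end()), and then builds one reading/merging pipeline per range. Below is a minimal, self-contained sketch of that boundary computation, using a hypothetical PartStub type instead of the executor's RangesInDataPart; it is an illustration of the idea only, not code from this patch.

#include <iostream>
#include <iterator>
#include <string>
#include <vector>

// Hypothetical stand-in for a data part; the real executor works on RangesInDataPart.
struct PartStub
{
    std::string partition_id;
    std::string name;
};

// Collect iterators to the first part of every partition (plus end()), assuming the
// input vector is already ordered so that parts of one partition are adjacent.
std::vector<std::vector<PartStub>::const_iterator>
partitionRanges(const std::vector<PartStub> & parts)
{
    std::vector<std::vector<PartStub>::const_iterator> bounds;
    for (auto it = parts.begin(); it != parts.end(); ++it)
        if (it == parts.begin() || it->partition_id != std::prev(it)->partition_id)
            bounds.push_back(it);
    bounds.push_back(parts.end());
    return bounds;
}

int main()
{
    std::vector<PartStub> parts = {
        {"2020-01-03", "p1"}, {"2020-01-03", "p2"}, {"2020-01-04", "p3"}};

    auto bounds = partitionRanges(parts);

    // Each adjacent pair [bounds[i], bounds[i + 1]) is one partition range; the
    // executor would build one reading/merging pipeline per such range.
    for (size_t i = 0; i + 1 < bounds.size(); ++i)
        std::cout << "partition " << bounds[i]->partition_id << ": "
                  << std::distance(bounds[i], bounds[i + 1]) << " part(s)\n";
}
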
@@ -1451,7 +1435,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsFinal( QueryPlanPtr plan; { - Pipes pipes; + RangesInDataParts new_parts; /// If do_not_merge_across_partitions_select_final is true and there is only one part in partition /// with level > 0 then we won't postprocess this part and if num_streams > 1 we @@ -1470,36 +1454,35 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsFinal( { for (auto part_it = parts_to_merge_ranges[range_index]; part_it != parts_to_merge_ranges[range_index + 1]; ++part_it) { - auto source_processor = std::make_shared( - data, - metadata_snapshot, - part_it->data_part, - max_block_size, - settings.preferred_block_size_bytes, - settings.preferred_max_column_in_block_size_bytes, - column_names, - part_it->ranges, - use_uncompressed_cache, - query_info.prewhere_info, - true, - reader_settings, - virt_columns, - part_it->part_index_in_query); - - pipes.emplace_back(std::move(source_processor)); + new_parts.emplace_back(part_it->data_part, part_it->part_index_in_query, part_it->ranges); } } - if (pipes.empty()) + if (new_parts.empty()) continue; - auto pipe = Pipe::unitePipes(std::move(pipes)); + ReadFromMergeTree::Settings step_settings + { + .max_block_size = max_block_size, + .preferred_block_size_bytes = settings.preferred_block_size_bytes, + .preferred_max_column_in_block_size_bytes = settings.preferred_max_column_in_block_size_bytes, + .min_marks_for_concurrent_read = 0, /// this setting is not used for reading in order + .use_uncompressed_cache = use_uncompressed_cache, + .reader_settings = reader_settings, + .backoff_settings = MergeTreeReadPool::BackoffSettings(settings), + }; + + plan = std::make_unique(); + auto step = std::make_unique( + data, metadata_snapshot, query_id, + column_names, std::move(new_parts), std::move(index_stats), query_info.prewhere_info, virt_columns, + step_settings, num_streams, ReadFromMergeTree::ReadType::InOrder); + + plan->addStep(std::move(step)); /// Drop temporary columns, added by 'sorting_key_expr' if (!out_projection) - out_projection = createProjection(pipe.getHeader()); - - plan = createPlanFromPipe(std::move(pipe), query_id, data, "with final"); + out_projection = createProjection(plan->getCurrentDataStream().header); } auto expression_step = std::make_unique( @@ -1546,7 +1529,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsFinal( if (!lonely_parts.empty()) { - Pipes pipes; + RangesInDataParts new_parts; size_t num_streams_for_lonely_parts = num_streams * lonely_parts.size(); @@ -1561,41 +1544,28 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsFinal( if (sum_marks_in_lonely_parts < num_streams_for_lonely_parts * min_marks_for_concurrent_read && lonely_parts.size() < num_streams_for_lonely_parts) num_streams_for_lonely_parts = std::max((sum_marks_in_lonely_parts + min_marks_for_concurrent_read - 1) / min_marks_for_concurrent_read, lonely_parts.size()); - - MergeTreeReadPoolPtr pool = std::make_shared( - num_streams_for_lonely_parts, - sum_marks_in_lonely_parts, - min_marks_for_concurrent_read, - std::move(lonely_parts), - data, - metadata_snapshot, - query_info.prewhere_info, - true, - column_names, - MergeTreeReadPool::BackoffSettings(settings), - settings.preferred_block_size_bytes, - false); - - LOG_DEBUG(log, "Reading approx. 
{} rows with {} streams", total_rows_in_lonely_parts, num_streams_for_lonely_parts); - - for (size_t i = 0; i < num_streams_for_lonely_parts; ++i) + ReadFromMergeTree::Settings step_settings { - auto source = std::make_shared( - i, pool, min_marks_for_concurrent_read, max_block_size, - settings.preferred_block_size_bytes, settings.preferred_max_column_in_block_size_bytes, - data, metadata_snapshot, use_uncompressed_cache, - query_info.prewhere_info, reader_settings, virt_columns); + .max_block_size = max_block_size, + .preferred_block_size_bytes = settings.preferred_block_size_bytes, + .preferred_max_column_in_block_size_bytes = settings.preferred_max_column_in_block_size_bytes, + .min_marks_for_concurrent_read = min_marks_for_concurrent_read, + .use_uncompressed_cache = use_uncompressed_cache, + .reader_settings = reader_settings, + .backoff_settings = MergeTreeReadPool::BackoffSettings(settings), + }; - pipes.emplace_back(std::move(source)); - } + auto plan = std::make_unique(); + auto step = std::make_unique( + data, metadata_snapshot, query_id, + column_names, std::move(lonely_parts), std::move(index_stats), query_info.prewhere_info, virt_columns, + step_settings, num_streams_for_lonely_parts, ReadFromMergeTree::ReadType::Default); - auto pipe = Pipe::unitePipes(std::move(pipes)); + plan->addStep(std::move(step)); /// Drop temporary columns, added by 'sorting_key_expr' if (!out_projection) - out_projection = createProjection(pipe.getHeader()); - - QueryPlanPtr plan = createPlanFromPipe(std::move(pipe), query_id, data, "with final"); + out_projection = createProjection(plan->getCurrentDataStream().header); auto expression_step = std::make_unique( plan->getCurrentDataStream(), @@ -1896,7 +1866,8 @@ void MergeTreeDataSelectExecutor::selectPartsToRead( const std::optional & minmax_idx_condition, const DataTypes & minmax_columns_types, std::optional & partition_pruner, - const PartitionIdToMaxBlock * max_block_numbers_to_read) + const PartitionIdToMaxBlock * max_block_numbers_to_read, + PartFilterCounters & counters) { auto prev_parts = parts; parts.clear(); @@ -1909,22 +1880,35 @@ void MergeTreeDataSelectExecutor::selectPartsToRead( if (part->isEmpty()) continue; + if (max_block_numbers_to_read) + { + auto blocks_iterator = max_block_numbers_to_read->find(part->info.partition_id); + if (blocks_iterator == max_block_numbers_to_read->end() || part->info.max_block > blocks_iterator->second) + continue; + } + + size_t num_granules = part->getMarksCount(); + if (num_granules && part->index_granularity.hasFinalMark()) + --num_granules; + + counters.num_initial_selected_parts += 1; + counters.num_initial_selected_granules += num_granules; + if (minmax_idx_condition && !minmax_idx_condition->checkInHyperrectangle( part->minmax_idx.hyperrectangle, minmax_columns_types).can_be_true) continue; + counters.num_parts_after_minmax += 1; + counters.num_granules_after_minmax += num_granules; + if (partition_pruner) { if (partition_pruner->canBePruned(part)) continue; } - if (max_block_numbers_to_read) - { - auto blocks_iterator = max_block_numbers_to_read->find(part->info.partition_id); - if (blocks_iterator == max_block_numbers_to_read->end() || part->info.max_block > blocks_iterator->second) - continue; - } + counters.num_parts_after_partition_pruner += 1; + counters.num_granules_after_partition_pruner += num_granules; parts.push_back(part); } @@ -1937,7 +1921,8 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter( const DataTypes & minmax_columns_types, std::optional & partition_pruner, 
const PartitionIdToMaxBlock * max_block_numbers_to_read, - ContextPtr query_context) const + ContextPtr query_context, + PartFilterCounters & counters) const { /// process_parts prepare parts that have to be read for the query, /// returns false if duplicated parts' UUID have been met @@ -1957,17 +1942,6 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter( if (part->isEmpty()) continue; - if (minmax_idx_condition - && !minmax_idx_condition->checkInHyperrectangle(part->minmax_idx.hyperrectangle, minmax_columns_types) - .can_be_true) - continue; - - if (partition_pruner) - { - if (partition_pruner->canBePruned(part)) - continue; - } - if (max_block_numbers_to_read) { auto blocks_iterator = max_block_numbers_to_read->find(part->info.partition_id); @@ -1975,13 +1949,37 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter( continue; } + /// Skip the part if its uuid is meant to be excluded + if (part->uuid != UUIDHelpers::Nil && ignored_part_uuids->has(part->uuid)) + continue; + + size_t num_granules = part->getMarksCount(); + if (num_granules && part->index_granularity.hasFinalMark()) + --num_granules; + + counters.num_initial_selected_parts += 1; + counters.num_initial_selected_granules += num_granules; + + if (minmax_idx_condition + && !minmax_idx_condition->checkInHyperrectangle(part->minmax_idx.hyperrectangle, minmax_columns_types) + .can_be_true) + continue; + + counters.num_parts_after_minmax += 1; + counters.num_granules_after_minmax += num_granules; + + if (partition_pruner) + { + if (partition_pruner->canBePruned(part)) + continue; + } + + counters.num_parts_after_partition_pruner += 1; + counters.num_granules_after_partition_pruner += num_granules; + /// populate UUIDs and exclude ignored parts if enabled if (part->uuid != UUIDHelpers::Nil) { - /// Skip the part if its uuid is meant to be excluded - if (ignored_part_uuids->has(part->uuid)) - continue; - auto result = temp_part_uuids.insert(part->uuid); if (!result.second) throw Exception("Found a part with the same UUID on the same replica.", ErrorCodes::LOGICAL_ERROR); @@ -2013,6 +2011,8 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter( { LOG_DEBUG(log, "Found duplicate uuids locally, will retry part selection without them"); + counters = PartFilterCounters(); + /// Second attempt didn't help, throw an exception if (!select_parts(parts)) throw Exception("Found duplicate UUIDs while processing query.", ErrorCodes::DUPLICATED_PART_UUIDS); diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h index 927a82b738f..4129b3ea2a0 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB @@ -57,6 +58,7 @@ private: QueryPlanPtr spreadMarkRangesAmongStreams( RangesInDataParts && parts, + ReadFromMergeTree::IndexStatPtr index_stats, size_t num_streams, const Names & column_names, const StorageMetadataPtr & metadata_snapshot, @@ -71,6 +73,7 @@ private: /// out_projection - save projection only with columns, requested to read QueryPlanPtr spreadMarkRangesAmongStreamsWithOrder( RangesInDataParts && parts, + ReadFromMergeTree::IndexStatPtr index_stats, size_t num_streams, const Names & column_names, const StorageMetadataPtr & metadata_snapshot, @@ -86,6 +89,7 @@ private: QueryPlanPtr spreadMarkRangesAmongStreamsFinal( RangesInDataParts && parts, + ReadFromMergeTree::IndexStatPtr index_stats, size_t 
num_streams, const Names & column_names, const StorageMetadataPtr & metadata_snapshot, @@ -123,6 +127,16 @@ private: size_t & granules_dropped, Poco::Logger * log); + struct PartFilterCounters + { + size_t num_initial_selected_parts = 0; + size_t num_initial_selected_granules = 0; + size_t num_parts_after_minmax = 0; + size_t num_granules_after_minmax = 0; + size_t num_parts_after_partition_pruner = 0; + size_t num_granules_after_partition_pruner = 0; + }; + /// Select the parts in which there can be data that satisfy `minmax_idx_condition` and that match the condition on `_part`, /// as well as `max_block_number_to_read`. static void selectPartsToRead( @@ -131,7 +145,8 @@ private: const std::optional & minmax_idx_condition, const DataTypes & minmax_columns_types, std::optional & partition_pruner, - const PartitionIdToMaxBlock * max_block_numbers_to_read); + const PartitionIdToMaxBlock * max_block_numbers_to_read, + PartFilterCounters & counters); /// Same as previous but also skip parts uuids if any to the query context, or skip parts which uuids marked as excluded. void selectPartsToReadWithUUIDFilter( @@ -141,7 +156,8 @@ private: const DataTypes & minmax_columns_types, std::optional & partition_pruner, const PartitionIdToMaxBlock * max_block_numbers_to_read, - ContextPtr query_context) const; + ContextPtr query_context, + PartFilterCounters & counters) const; }; } diff --git a/src/Storages/MergeTree/MergeTreeReadPool.h b/src/Storages/MergeTree/MergeTreeReadPool.h index 366e9a2381a..9949bdf86f8 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/src/Storages/MergeTree/MergeTreeReadPool.h @@ -100,7 +100,7 @@ private: const MergeTreeData & data; StorageMetadataPtr metadata_snapshot; - Names column_names; + const Names column_names; bool do_not_steal_tasks; bool predict_block_size_bytes; std::vector per_part_column_name_set; diff --git a/src/Storages/MergeTree/PartitionPruner.h b/src/Storages/MergeTree/PartitionPruner.h index 944461f4403..a4035087b89 100644 --- a/src/Storages/MergeTree/PartitionPruner.h +++ b/src/Storages/MergeTree/PartitionPruner.h @@ -32,6 +32,8 @@ public: bool canBePruned(const DataPartPtr & part); bool isUseless() const { return useless; } + + const KeyCondition & getKeyCondition() const { return partition_condition; } }; } diff --git a/src/Storages/ReadInOrderOptimizer.cpp b/src/Storages/ReadInOrderOptimizer.cpp index 5f2b5f4db96..3bb7034b588 100644 --- a/src/Storages/ReadInOrderOptimizer.cpp +++ b/src/Storages/ReadInOrderOptimizer.cpp @@ -44,7 +44,7 @@ InputOrderInfoPtr ReadInOrderOptimizer::getInputOrder(const StorageMetadataPtr & int read_direction = required_sort_description.at(0).direction; size_t prefix_size = std::min(required_sort_description.size(), sorting_key_columns.size()); - auto aliase_columns = metadata_snapshot->getColumns().getAliases(); + auto aliased_columns = metadata_snapshot->getColumns().getAliases(); for (size_t i = 0; i < prefix_size; ++i) { @@ -55,13 +55,18 @@ InputOrderInfoPtr ReadInOrderOptimizer::getInputOrder(const StorageMetadataPtr & /// or in some simple cases when order key element is wrapped into monotonic function. auto apply_order_judge = [&] (const ExpressionActions::Actions & actions, const String & sort_column) { + /// If required order depend on collation, it cannot be matched with primary key order. + /// Because primary keys cannot have collations. 
+ if (required_sort_description[i].collator) + return false; + int current_direction = required_sort_description[i].direction; - /// For the path: order by (sort_column, ...) + /// For the path: order by (sort_column, ...) if (sort_column == sorting_key_columns[i] && current_direction == read_direction) { return true; } - /// For the path: order by (function(sort_column), ...) + /// For the path: order by (function(sort_column), ...) /// Allow only one simple monotonic functions with one argument /// Why not allow multi monotonic functions? else @@ -125,7 +130,7 @@ InputOrderInfoPtr ReadInOrderOptimizer::getInputOrder(const StorageMetadataPtr & /// currently we only support alias column without any function wrapper /// ie: `order by aliased_column` can have this optimization, but `order by function(aliased_column)` can not. /// This suits most cases. - if (context->getSettingsRef().optimize_respect_aliases && aliase_columns.contains(required_sort_description[i].column_name)) + if (context->getSettingsRef().optimize_respect_aliases && aliased_columns.contains(required_sort_description[i].column_name)) { auto column_expr = metadata_snapshot->getColumns().get(required_sort_description[i].column_name).default_desc.expression->clone(); replaceAliasColumnsInQuery(column_expr, metadata_snapshot->getColumns(), forbidden_columns, context); diff --git a/src/Storages/StorageMaterializeMySQL.cpp b/src/Storages/StorageMaterializeMySQL.cpp index a15e4957abe..8e6f2e1ad63 100644 --- a/src/Storages/StorageMaterializeMySQL.cpp +++ b/src/Storages/StorageMaterializeMySQL.cpp @@ -92,7 +92,7 @@ Pipe StorageMaterializeMySQL::read( { Block pipe_header = pipe.getHeader(); auto syntax = TreeRewriter(context).analyze(expressions, pipe_header.getNamesAndTypesList()); - ExpressionActionsPtr expression_actions = ExpressionAnalyzer(expressions, syntax, context).getActions(true); + ExpressionActionsPtr expression_actions = ExpressionAnalyzer(expressions, syntax, context).getActions(true /* add_aliases */, false /* project_result */); pipe.addSimpleTransform([&](const Block & header) { diff --git a/src/Storages/StorageMySQL.cpp b/src/Storages/StorageMySQL.cpp index c635b0e939c..35eb85e41d2 100644 --- a/src/Storages/StorageMySQL.cpp +++ b/src/Storages/StorageMySQL.cpp @@ -71,7 +71,7 @@ Pipe StorageMySQL::read( SelectQueryInfo & query_info_, ContextPtr context_, QueryProcessingStage::Enum /*processed_stage*/, - size_t max_block_size_, + size_t /*max_block_size*/, unsigned) { metadata_snapshot->check(column_names_, getVirtuals(), getStorageID()); @@ -95,8 +95,10 @@ Pipe StorageMySQL::read( sample_block.insert({ column_data.type, column_data.name }); } + + StreamSettings mysql_input_stream_settings(context_->getSettingsRef(), true, false); return Pipe(std::make_shared( - std::make_shared(pool, query, sample_block, max_block_size_, /* auto_close = */ true))); + std::make_shared(pool, query, sample_block, mysql_input_stream_settings))); } @@ -144,7 +146,9 @@ public: { WriteBufferFromOwnString sqlbuf; sqlbuf << (storage.replace_query ? "REPLACE" : "INSERT") << " INTO "; - sqlbuf << backQuoteMySQL(remote_database_name) << "." 
<< backQuoteMySQL(remote_table_name); + if (!remote_database_name.empty()) + sqlbuf << backQuoteMySQL(remote_database_name) << "."; + sqlbuf << backQuoteMySQL(remote_table_name); sqlbuf << " (" << dumpNamesWithBackQuote(block) << ") VALUES "; auto writer = FormatFactory::instance().getOutputStream("Values", sqlbuf, metadata_snapshot->getSampleBlock(), storage.getContext()); diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index a5cbd004d55..8a42caf41b1 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -166,6 +166,7 @@ StorageS3Source::StorageS3Source( ContextPtr context_, const ColumnsDescription & columns_, UInt64 max_block_size_, + UInt64 s3_max_single_read_retries_, const String compression_hint_, const std::shared_ptr & client_, const String & bucket_, @@ -177,6 +178,7 @@ StorageS3Source::StorageS3Source( , format(format_) , columns_desc(columns_) , max_block_size(max_block_size_) + , s3_max_single_read_retries(s3_max_single_read_retries_) , compression_hint(compression_hint_) , client(client_) , sample_block(sample_block_) @@ -197,7 +199,7 @@ bool StorageS3Source::initialize() file_path = bucket + "/" + current_key; read_buf = wrapReadBufferWithCompressionMethod( - std::make_unique(client, bucket, current_key), chooseCompressionMethod(current_key, compression_hint)); + std::make_unique(client, bucket, current_key, s3_max_single_read_retries), chooseCompressionMethod(current_key, compression_hint)); auto input_format = FormatFactory::instance().getInput(format, *read_buf, sample_block, getContext(), max_block_size); reader = std::make_shared(input_format); @@ -312,6 +314,7 @@ StorageS3::StorageS3( const String & secret_access_key_, const StorageID & table_id_, const String & format_name_, + UInt64 s3_max_single_read_retries_, UInt64 min_upload_part_size_, UInt64 max_single_part_upload_size_, UInt64 max_connections_, @@ -323,6 +326,7 @@ StorageS3::StorageS3( : IStorage(table_id_) , client_auth{uri_, access_key_id_, secret_access_key_, max_connections_, {}, {}} /// Client and settings will be updated later , format_name(format_name_) + , s3_max_single_read_retries(s3_max_single_read_retries_) , min_upload_part_size(min_upload_part_size_) , max_single_part_upload_size(max_single_part_upload_size_) , compression_method(compression_method_) @@ -389,6 +393,7 @@ Pipe StorageS3::read( local_context, metadata_snapshot->getColumns(), max_block_size, + s3_max_single_read_retries, compression_method, client_auth.client, client_auth.uri.bucket, @@ -442,7 +447,8 @@ void StorageS3::updateClientAndAuthSettings(ContextPtr ctx, StorageS3::ClientAut credentials.GetAWSSecretKey(), settings.server_side_encryption_customer_key_base64, std::move(headers), - settings.use_environment_credentials.value_or(ctx->getConfigRef().getBool("s3.use_environment_credentials", false))); + settings.use_environment_credentials.value_or(ctx->getConfigRef().getBool("s3.use_environment_credentials", false)), + settings.use_insecure_imds_request.value_or(ctx->getConfigRef().getBool("s3.use_insecure_imds_request", false))); upd.auth_settings = std::move(settings); } @@ -473,6 +479,7 @@ void registerStorageS3Impl(const String & name, StorageFactory & factory) secret_access_key = engine_args[2]->as().value.safeGet(); } + UInt64 s3_max_single_read_retries = args.getLocalContext()->getSettingsRef().s3_max_single_read_retries; UInt64 min_upload_part_size = args.getLocalContext()->getSettingsRef().s3_min_upload_part_size; UInt64 max_single_part_upload_size = 
args.getLocalContext()->getSettingsRef().s3_max_single_part_upload_size; UInt64 max_connections = args.getLocalContext()->getSettingsRef().s3_max_connections; @@ -496,6 +503,7 @@ void registerStorageS3Impl(const String & name, StorageFactory & factory) secret_access_key, args.table_id, format_name, + s3_max_single_read_retries, min_upload_part_size, max_single_part_upload_size, max_connections, diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 1e1d76fa6e3..b068f82cfb1 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -55,6 +55,7 @@ public: ContextPtr context_, const ColumnsDescription & columns_, UInt64 max_block_size_, + UInt64 s3_max_single_read_retries_, const String compression_hint_, const std::shared_ptr & client_, const String & bucket, @@ -71,6 +72,7 @@ private: String format; ColumnsDescription columns_desc; UInt64 max_block_size; + UInt64 s3_max_single_read_retries; String compression_hint; std::shared_ptr client; Block sample_block; @@ -100,6 +102,7 @@ public: const String & secret_access_key, const StorageID & table_id_, const String & format_name_, + UInt64 s3_max_single_read_retries_, UInt64 min_upload_part_size_, UInt64 max_single_part_upload_size_, UInt64 max_connections_, @@ -145,6 +148,7 @@ private: ClientAuthentificaiton client_auth; String format_name; + UInt64 s3_max_single_read_retries; size_t min_upload_part_size; size_t max_single_part_upload_size; String compression_method; diff --git a/src/Storages/StorageS3Settings.cpp b/src/Storages/StorageS3Settings.cpp index 6d97e6fae95..8aafc12a688 100644 --- a/src/Storages/StorageS3Settings.cpp +++ b/src/Storages/StorageS3Settings.cpp @@ -36,6 +36,11 @@ void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::U { use_environment_credentials = config.getBool(config_elem + "." + key + ".use_environment_credentials"); } + std::optional use_insecure_imds_request; + if (config.has(config_elem + "." + key + ".use_insecure_imds_request")) + { + use_insecure_imds_request = config.getBool(config_elem + "." 
+ key + ".use_insecure_imds_request"); + } HeaderCollection headers; Poco::Util::AbstractConfiguration::Keys subconfig_keys; @@ -52,7 +57,7 @@ void StorageS3Settings::loadFromConfig(const String & config_elem, const Poco::U } } - settings.emplace(endpoint, S3AuthSettings{std::move(access_key_id), std::move(secret_access_key), std::move(server_side_encryption_customer_key_base64), std::move(headers), use_environment_credentials}); + settings.emplace(endpoint, S3AuthSettings{std::move(access_key_id), std::move(secret_access_key), std::move(server_side_encryption_customer_key_base64), std::move(headers), use_environment_credentials, use_insecure_imds_request}); } } } diff --git a/src/Storages/StorageS3Settings.h b/src/Storages/StorageS3Settings.h index 29c6c3bb415..66e776dbea2 100644 --- a/src/Storages/StorageS3Settings.h +++ b/src/Storages/StorageS3Settings.h @@ -33,12 +33,14 @@ struct S3AuthSettings HeaderCollection headers; std::optional use_environment_credentials; + std::optional use_insecure_imds_request; inline bool operator==(const S3AuthSettings & other) const { return access_key_id == other.access_key_id && secret_access_key == other.secret_access_key && server_side_encryption_customer_key_base64 == other.server_side_encryption_customer_key_base64 && headers == other.headers - && use_environment_credentials == other.use_environment_credentials; + && use_environment_credentials == other.use_environment_credentials + && use_insecure_imds_request == other.use_insecure_imds_request; } }; diff --git a/src/Storages/System/StorageSystemContributors.generated.cpp b/src/Storages/System/StorageSystemContributors.generated.cpp index 46ead225102..b8741e6951c 100644 --- a/src/Storages/System/StorageSystemContributors.generated.cpp +++ b/src/Storages/System/StorageSystemContributors.generated.cpp @@ -17,6 +17,7 @@ const char * auto_contributors[] { "Aleksei Semiglazov", "Aleksey", "Aleksey Akulovich", + "Alex", "Alex Bocharov", "Alex Karo", "Alex Krash", @@ -144,6 +145,7 @@ const char * auto_contributors[] { "Chao Wang", "Chen Yufei", "Chienlung Cheung", + "Christian", "Ciprian Hacman", "Clement Rodriguez", "Clément Rodriguez", @@ -175,6 +177,7 @@ const char * auto_contributors[] { "Dmitry Belyavtsev", "Dmitry Bilunov", "Dmitry Galuza", + "Dmitry Krylov", "Dmitry Luhtionov", "Dmitry Moskowski", "Dmitry Muzyka", @@ -185,6 +188,7 @@ const char * auto_contributors[] { "Dongdong Yang", "DoomzD", "Dr. 
Strange Looker", + "Egor O'Sten", "Ekaterina", "Eldar Zaitov", "Elena Baskakova", @@ -286,6 +290,7 @@ const char * auto_contributors[] { "Jochen Schalanda", "John", "John Hummel", + "John Skopis", "Jonatas Freitas", "Kang Liu", "Karl Pietrzak", @@ -395,6 +400,7 @@ const char * auto_contributors[] { "NeZeD [Mac Pro]", "Neeke Gao", "Neng Liu", + "Nickolay Yastrebov", "Nico Mandery", "Nico Piderman", "Nicolae Vartolomei", @@ -472,6 +478,7 @@ const char * auto_contributors[] { "Sami Kerola", "Samuel Chou", "Saulius Valatka", + "Serg Kulakov", "Serge Rider", "Sergei Bocharov", "Sergei Semin", @@ -606,6 +613,7 @@ const char * auto_contributors[] { "abyss7", "achimbab", "achulkov2", + "adevyatova", "ageraab", "akazz", "akonyaev", @@ -631,6 +639,7 @@ const char * auto_contributors[] { "artpaul", "asiana21", "avasiliev", + "avogar", "avsharapov", "awesomeleo", "benamazing", @@ -647,6 +656,8 @@ const char * auto_contributors[] { "centos7", "champtar", "chang.chen", + "changvvb", + "chasingegg", "chengy8934", "chenqi", "chenxing-xc", @@ -769,6 +780,7 @@ const char * auto_contributors[] { "maxim-babenko", "maxkuzn", "maxulan", + "mehanizm", "melin", "memo", "meo", @@ -831,6 +843,7 @@ const char * auto_contributors[] { "shangshujie", "shedx", "simon-says", + "songenjie", "spff", "spongedc", "spyros87", diff --git a/src/Storages/tests/gtest_storage_log.cpp b/src/Storages/tests/gtest_storage_log.cpp index de2139a4a5a..41c1b6ac75a 100644 --- a/src/Storages/tests/gtest_storage_log.cpp +++ b/src/Storages/tests/gtest_storage_log.cpp @@ -19,7 +19,7 @@ #include #include -#if !__clang__ +#if !defined(__clang__) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wsuggest-override" #endif diff --git a/src/TableFunctions/TableFunctionMySQL.cpp b/src/TableFunctions/TableFunctionMySQL.cpp index c7dda873a00..7d3fca58451 100644 --- a/src/TableFunctions/TableFunctionMySQL.cpp +++ b/src/TableFunctions/TableFunctionMySQL.cpp @@ -79,16 +79,21 @@ void TableFunctionMySQL::parseArguments(const ASTPtr & ast_function, ContextPtr ColumnsDescription TableFunctionMySQL::getActualTableStructure(ContextPtr context) const { const auto & settings = context->getSettingsRef(); - const auto tables_and_columns = fetchTablesColumnsList(*pool, remote_database_name, {remote_table_name}, settings.external_table_functions_use_nulls, settings.mysql_datatypes_support_level); + const auto tables_and_columns = fetchTablesColumnsList(*pool, remote_database_name, {remote_table_name}, settings, settings.mysql_datatypes_support_level); const auto columns = tables_and_columns.find(remote_table_name); if (columns == tables_and_columns.end()) - throw Exception("MySQL table " + backQuoteIfNeed(remote_database_name) + "." + backQuoteIfNeed(remote_table_name) + " doesn't exist.", ErrorCodes::UNKNOWN_TABLE); + throw Exception("MySQL table " + (remote_database_name.empty() ? 
"" : (backQuote(remote_database_name) + ".")) + + backQuote(remote_table_name) + " doesn't exist.", ErrorCodes::UNKNOWN_TABLE); return ColumnsDescription{columns->second}; } -StoragePtr TableFunctionMySQL::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +StoragePtr TableFunctionMySQL::executeImpl( + const ASTPtr & /*ast_function*/, + ContextPtr context, + const std::string & table_name, + ColumnsDescription /*cached_columns*/) const { auto columns = getActualTableStructure(context); diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index 2da597f49ff..973899d2101 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -83,6 +83,7 @@ StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, Context { Poco::URI uri (filename); S3::URI s3_uri (uri); + UInt64 s3_max_single_read_retries = context->getSettingsRef().s3_max_single_read_retries; UInt64 min_upload_part_size = context->getSettingsRef().s3_min_upload_part_size; UInt64 max_single_part_upload_size = context->getSettingsRef().s3_max_single_part_upload_size; UInt64 max_connections = context->getSettingsRef().s3_max_connections; @@ -93,6 +94,7 @@ StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, Context secret_access_key, StorageID(getDatabaseName(), table_name), format, + s3_max_single_read_retries, min_upload_part_size, max_single_part_upload_size, max_connections, diff --git a/src/TableFunctions/TableFunctionS3Cluster.cpp b/src/TableFunctions/TableFunctionS3Cluster.cpp index 26ef07ef97f..16f48c70608 100644 --- a/src/TableFunctions/TableFunctionS3Cluster.cpp +++ b/src/TableFunctions/TableFunctionS3Cluster.cpp @@ -109,12 +109,17 @@ StoragePtr TableFunctionS3Cluster::executeImpl( Poco::URI uri (filename); S3::URI s3_uri (uri); /// Actually this parameters are not used + UInt64 s3_max_single_read_retries = context->getSettingsRef().s3_max_single_read_retries; UInt64 min_upload_part_size = context->getSettingsRef().s3_min_upload_part_size; UInt64 max_single_part_upload_size = context->getSettingsRef().s3_max_single_part_upload_size; UInt64 max_connections = context->getSettingsRef().s3_max_connections; storage = StorageS3::create( s3_uri, access_key_id, secret_access_key, StorageID(getDatabaseName(), table_name), - format, min_upload_part_size, max_single_part_upload_size, max_connections, + format, + s3_max_single_read_retries, + min_upload_part_size, + max_single_part_upload_size, + max_connections, getActualTableStructure(context), ConstraintsDescription{}, context, compression_method, /*distributed_processing=*/true); } diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 112f30852ee..5ae894cc55f 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -132,7 +132,7 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std return ''.join(random.choice(alphabet) for _ in range(length)) database = 'test_{suffix}'.format(suffix=random_str()) - clickhouse_proc_create = Popen(shlex.split(client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) + clickhouse_proc_create = Popen(shlex.split(client), stdin=PIPE, stdout=PIPE, stderr=None, universal_newlines=True) try: clickhouse_proc_create.communicate(("CREATE DATABASE " + database + get_db_engine(args, database)), timeout=args.timeout) except TimeoutExpired: @@ -179,7 +179,7 @@ def run_single_test(args, ext, 
server_logs_level, client_options, case_file, std need_drop_database = not maybe_passed if need_drop_database: - clickhouse_proc_create = Popen(shlex.split(client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) + clickhouse_proc_create = Popen(shlex.split(client), stdin=PIPE, stdout=PIPE, stderr=None, universal_newlines=True) seconds_left = max(args.timeout - (datetime.now() - start_time).total_seconds(), 20) try: drop_database_query = "DROP DATABASE " + database @@ -706,10 +706,10 @@ def main(args): args.shard = False if args.database and args.database != "test": - clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) + clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=None, universal_newlines=True) clickhouse_proc_create.communicate(("CREATE DATABASE IF NOT EXISTS " + args.database + get_db_engine(args, args.database))) - clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) + clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=None, universal_newlines=True) clickhouse_proc_create.communicate(("CREATE DATABASE IF NOT EXISTS test" + get_db_engine(args, 'test'))) def is_test_from_dir(suite_dir, case): diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 3d85f494676..69a66a50b6d 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -1124,23 +1124,28 @@ class ClickHouseInstance: return self.http_query(sql=sql, data=data, params=params, user=user, password=password, expect_fail_and_get_error=True) - def stop_clickhouse(self, start_wait_sec=5, kill=False): + def stop_clickhouse(self, stop_wait_sec=30, kill=False): if not self.stay_alive: raise Exception("clickhouse can be stopped only with stay_alive=True instance") self.exec_in_container(["bash", "-c", "pkill {} clickhouse".format("-9" if kill else "")], user='root') - time.sleep(start_wait_sec) + deadline = time.time() + stop_wait_sec + while time.time() < deadline: + time.sleep(0.5) + if self.get_process_pid("clickhouse") is None: + break + assert self.get_process_pid("clickhouse") is None, "ClickHouse was not stopped" - def start_clickhouse(self, stop_wait_sec=5): + def start_clickhouse(self, start_wait_sec=30): if not self.stay_alive: raise Exception("clickhouse can be started again only with stay_alive=True instance") self.exec_in_container(["bash", "-c", "{} --daemon".format(CLICKHOUSE_START_COMMAND)], user=str(os.getuid())) # wait start from helpers.test_tools import assert_eq_with_retry - assert_eq_with_retry(self, "select 1", "1", retry_count=int(stop_wait_sec / 0.5), sleep_time=0.5) + assert_eq_with_retry(self, "select 1", "1", retry_count=int(start_wait_sec / 0.5), sleep_time=0.5) - def restart_clickhouse(self, stop_start_wait_sec=5, kill=False): + def restart_clickhouse(self, stop_start_wait_sec=30, kill=False): self.stop_clickhouse(stop_start_wait_sec, kill) self.start_clickhouse(stop_start_wait_sec) diff --git a/tests/integration/test_hedged_requests/configs/users.xml b/tests/integration/test_hedged_requests/configs/users.xml index a3ab176b811..ac42155a18a 100644 --- a/tests/integration/test_hedged_requests/configs/users.xml +++ b/tests/integration/test_hedged_requests/configs/users.xml @@ -5,6 +5,8 @@ in_order 100 2000 + 1 + 1 diff --git a/tests/integration/test_hedged_requests_parallel/configs/users.xml 
b/tests/integration/test_hedged_requests_parallel/configs/users.xml index 3f3578903b4..9600c0c7124 100644 --- a/tests/integration/test_hedged_requests_parallel/configs/users.xml +++ b/tests/integration/test_hedged_requests_parallel/configs/users.xml @@ -6,6 +6,8 @@ 2 100 2000 + 1 + 1 diff --git a/tests/integration/test_materialize_mysql_database/configs/users_disable_bytes_settings.xml b/tests/integration/test_materialize_mysql_database/configs/users_disable_bytes_settings.xml new file mode 100644 index 00000000000..4516cb80c17 --- /dev/null +++ b/tests/integration/test_materialize_mysql_database/configs/users_disable_bytes_settings.xml @@ -0,0 +1,21 @@ + + + + + 1 + Atomic + 1 + 0 + + + + + + + + ::/0 + + default + + + diff --git a/tests/integration/test_materialize_mysql_database/configs/users_disable_rows_settings.xml b/tests/integration/test_materialize_mysql_database/configs/users_disable_rows_settings.xml new file mode 100644 index 00000000000..dea20eb9e12 --- /dev/null +++ b/tests/integration/test_materialize_mysql_database/configs/users_disable_rows_settings.xml @@ -0,0 +1,21 @@ + + + + + 1 + Atomic + 0 + 1 + + + + + + + + ::/0 + + default + + + diff --git a/tests/integration/test_materialize_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialize_mysql_database/materialize_with_ddl.py index 38574a81d0a..813a654add3 100644 --- a/tests/integration/test_materialize_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialize_mysql_database/materialize_with_ddl.py @@ -842,3 +842,31 @@ def system_tables_test(clickhouse_node, mysql_node, service_name): mysql_node.query("CREATE TABLE system_tables_test.test (id int NOT NULL PRIMARY KEY) ENGINE=InnoDB") clickhouse_node.query("CREATE DATABASE system_tables_test ENGINE = MaterializeMySQL('{}:3306', 'system_tables_test', 'root', 'clickhouse')".format(service_name)) check_query(clickhouse_node, "SELECT partition_key, sorting_key, primary_key FROM system.tables WHERE database = 'system_tables_test' AND name = 'test'", "intDiv(id, 4294967)\tid\tid\n") + +def move_to_prewhere_and_column_filtering(clickhouse_node, mysql_node, service_name): + clickhouse_node.query("DROP DATABASE IF EXISTS cond_on_key_col") + mysql_node.query("DROP DATABASE IF EXISTS cond_on_key_col") + mysql_node.query("CREATE DATABASE cond_on_key_col") + clickhouse_node.query("CREATE DATABASE cond_on_key_col ENGINE = MaterializeMySQL('{}:3306', 'cond_on_key_col', 'root', 'clickhouse')".format(service_name)) + mysql_node.query("create table cond_on_key_col.products (id int primary key, product_id int not null, catalog_id int not null, brand_id int not null, name text)") + mysql_node.query("insert into cond_on_key_col.products (id, name, catalog_id, brand_id, product_id) values (915, 'ertyui', 5287, 15837, 0), (990, 'wer', 1053, 24390, 1), (781, 'qwerty', 1041, 1176, 2);") + check_query(clickhouse_node, "SELECT DISTINCT P.id, P.name, P.catalog_id FROM cond_on_key_col.products P WHERE P.name ILIKE '%e%' and P.catalog_id=5287", '915\tertyui\t5287\n') + clickhouse_node.query("DROP DATABASE cond_on_key_col") + mysql_node.query("DROP DATABASE cond_on_key_col") + +def mysql_settings_test(clickhouse_node, mysql_node, service_name): + mysql_node.query("DROP DATABASE IF EXISTS test_database") + clickhouse_node.query("DROP DATABASE IF EXISTS test_database") + mysql_node.query("CREATE DATABASE test_database") + mysql_node.query("CREATE TABLE test_database.a (id INT(11) NOT NULL PRIMARY KEY, value VARCHAR(255))") + mysql_node.query("INSERT INTO 
test_database.a VALUES(1, 'foo')") + mysql_node.query("INSERT INTO test_database.a VALUES(2, 'bar')") + + clickhouse_node.query("CREATE DATABASE test_database ENGINE = MaterializeMySQL('{}:3306', 'test_database', 'root', 'clickhouse')".format(service_name)) + check_query(clickhouse_node, "SELECT COUNT() FROM test_database.a FORMAT TSV", "2\n") + + assert clickhouse_node.query("SELECT COUNT(DISTINCT blockNumber()) FROM test_database.a FORMAT TSV") == "2\n" + + clickhouse_node.query("DROP DATABASE test_database") + mysql_node.query("DROP DATABASE test_database") + diff --git a/tests/integration/test_materialize_mysql_database/test.py b/tests/integration/test_materialize_mysql_database/test.py index 3c41c0a2177..6c777c7e6f8 100644 --- a/tests/integration/test_materialize_mysql_database/test.py +++ b/tests/integration/test_materialize_mysql_database/test.py @@ -16,7 +16,8 @@ cluster = ClickHouseCluster(__file__) node_db_ordinary = cluster.add_instance('node1', user_configs=["configs/users.xml"], with_mysql=False, stay_alive=True) node_db_atomic = cluster.add_instance('node2', user_configs=["configs/users_db_atomic.xml"], with_mysql=False, stay_alive=True) - +node_disable_bytes_settings = cluster.add_instance('node3', user_configs=["configs/users_disable_bytes_settings.xml"], with_mysql=False, stay_alive=True) +node_disable_rows_settings = cluster.add_instance('node4', user_configs=["configs/users_disable_rows_settings.xml"], with_mysql=False, stay_alive=True) @pytest.fixture(scope="module") def started_cluster(): @@ -152,6 +153,7 @@ def test_materialize_database_dml_with_mysql_5_7(started_cluster, started_mysql_ materialize_with_ddl.dml_with_materialize_mysql_database(clickhouse_node, started_mysql_5_7, "mysql1") materialize_with_ddl.materialize_mysql_database_with_views(clickhouse_node, started_mysql_5_7, "mysql1") materialize_with_ddl.materialize_mysql_database_with_datetime_and_decimal(clickhouse_node, started_mysql_5_7, "mysql1") + materialize_with_ddl.move_to_prewhere_and_column_filtering(clickhouse_node, started_mysql_5_7, "mysql1") @pytest.mark.parametrize(('clickhouse_node'), [node_db_ordinary, node_db_atomic]) @@ -159,6 +161,7 @@ def test_materialize_database_dml_with_mysql_8_0(started_cluster, started_mysql_ materialize_with_ddl.dml_with_materialize_mysql_database(clickhouse_node, started_mysql_8_0, "mysql8_0") materialize_with_ddl.materialize_mysql_database_with_views(clickhouse_node, started_mysql_8_0, "mysql8_0") materialize_with_ddl.materialize_mysql_database_with_datetime_and_decimal(clickhouse_node, started_mysql_8_0, "mysql8_0") + materialize_with_ddl.move_to_prewhere_and_column_filtering(clickhouse_node, started_mysql_8_0, "mysql8_0") @pytest.mark.parametrize(('clickhouse_node'), [node_db_ordinary, node_db_atomic]) @@ -289,5 +292,12 @@ def test_multi_table_update(started_cluster, started_mysql_8_0, started_mysql_5_ @pytest.mark.parametrize(('clickhouse_node'), [node_db_ordinary, node_db_ordinary]) -def test_system_tables_table(started_cluster, started_mysql_8_0, clickhouse_node): +def test_system_tables_table(started_cluster, started_mysql_8_0, started_mysql_5_7, clickhouse_node): + materialize_with_ddl.system_tables_test(clickhouse_node, started_mysql_5_7, "mysql1") materialize_with_ddl.system_tables_test(clickhouse_node, started_mysql_8_0, "mysql8_0") + + +@pytest.mark.parametrize(('clickhouse_node'), [node_disable_bytes_settings, node_disable_rows_settings]) +def test_mysql_settings(started_cluster, started_mysql_8_0, started_mysql_5_7, clickhouse_node): + 
materialize_with_ddl.mysql_settings_test(clickhouse_node, started_mysql_5_7, "mysql1") + materialize_with_ddl.mysql_settings_test(clickhouse_node, started_mysql_8_0, "mysql8_0") diff --git a/tests/integration/test_merge_tree_s3_restore/configs/config.d/clusters.xml b/tests/integration/test_merge_tree_s3_restore/configs/config.d/clusters.xml new file mode 100644 index 00000000000..4808ae4bc4a --- /dev/null +++ b/tests/integration/test_merge_tree_s3_restore/configs/config.d/clusters.xml @@ -0,0 +1,23 @@ + + + + + + true + + node + 9000 + + + + + + true + + node_another_bucket + 9000 + + + + + diff --git a/tests/integration/test_merge_tree_s3_restore/test.py b/tests/integration/test_merge_tree_s3_restore/test.py index c0ebce68480..0781f0b9ce9 100644 --- a/tests/integration/test_merge_tree_s3_restore/test.py +++ b/tests/integration/test_merge_tree_s3_restore/test.py @@ -7,20 +7,21 @@ import time import pytest from helpers.cluster import ClickHouseCluster + logging.getLogger().setLevel(logging.INFO) logging.getLogger().addHandler(logging.StreamHandler()) - SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) -CONFIG_PATH = os.path.join(SCRIPT_DIR, './_instances/node_not_restorable/configs/config.d/storage_conf_not_restorable.xml') +NOT_RESTORABLE_CONFIG_PATH = os.path.join(SCRIPT_DIR, './_instances/node_not_restorable/configs/config.d/storage_conf_not_restorable.xml') +COMMON_CONFIGS = ["configs/config.d/bg_processing_pool_conf.xml", "configs/config.d/log_conf.xml", "configs/config.d/clusters.xml"] def replace_config(old, new): - config = open(CONFIG_PATH, 'r') + config = open(NOT_RESTORABLE_CONFIG_PATH, 'r') config_lines = config.readlines() config.close() config_lines = [line.replace(old, new) for line in config_lines] - config = open(CONFIG_PATH, 'w') + config = open(NOT_RESTORABLE_CONFIG_PATH, 'w') config.writelines(config_lines) config.close() @@ -29,22 +30,22 @@ def replace_config(old, new): def cluster(): try: cluster = ClickHouseCluster(__file__) - cluster.add_instance("node", main_configs=[ - "configs/config.d/storage_conf.xml", - "configs/config.d/bg_processing_pool_conf.xml", - "configs/config.d/log_conf.xml"], user_configs=[], with_minio=True, stay_alive=True) - cluster.add_instance("node_another_bucket", main_configs=[ - "configs/config.d/storage_conf_another_bucket.xml", - "configs/config.d/bg_processing_pool_conf.xml", - "configs/config.d/log_conf.xml"], user_configs=[], stay_alive=True) - cluster.add_instance("node_another_bucket_path", main_configs=[ - "configs/config.d/storage_conf_another_bucket_path.xml", - "configs/config.d/bg_processing_pool_conf.xml", - "configs/config.d/log_conf.xml"], user_configs=[], stay_alive=True) - cluster.add_instance("node_not_restorable", main_configs=[ - "configs/config.d/storage_conf_not_restorable.xml", - "configs/config.d/bg_processing_pool_conf.xml", - "configs/config.d/log_conf.xml"], user_configs=[], stay_alive=True) + + cluster.add_instance("node", + main_configs=COMMON_CONFIGS + ["configs/config.d/storage_conf.xml"], + macros={"cluster": "node", "replica": "0"}, + with_minio=True, with_zookeeper=True, stay_alive=True) + cluster.add_instance("node_another_bucket", + main_configs=COMMON_CONFIGS + ["configs/config.d/storage_conf_another_bucket.xml"], + macros={"cluster": "node_another_bucket", "replica": "0"}, + with_zookeeper=True, stay_alive=True) + cluster.add_instance("node_another_bucket_path", + main_configs=COMMON_CONFIGS + ["configs/config.d/storage_conf_another_bucket_path.xml"], + stay_alive=True) + 
cluster.add_instance("node_not_restorable", + main_configs=COMMON_CONFIGS + ["configs/config.d/storage_conf_not_restorable.xml"], + stay_alive=True) + logging.info("Starting cluster...") cluster.start() logging.info("Cluster started") @@ -65,28 +66,26 @@ def generate_values(date_str, count, sign=1): return ",".join(["('{}',{},'{}',{})".format(x, y, z, 0) for x, y, z in data]) -def create_table(node, table_name, additional_settings=None): +def create_table(node, table_name, replicated=False): node.query("CREATE DATABASE IF NOT EXISTS s3 ENGINE = Ordinary") create_table_statement = """ - CREATE TABLE s3.{} ( + CREATE TABLE s3.{table_name} {on_cluster} ( dt Date, id Int64, data String, counter Int64, INDEX min_max (id) TYPE minmax GRANULARITY 3 - ) ENGINE=MergeTree() + ) ENGINE={engine} PARTITION BY dt ORDER BY (dt, id) SETTINGS storage_policy='s3', old_parts_lifetime=600, index_granularity=512 - """.format(table_name) - - if additional_settings: - create_table_statement += "," - create_table_statement += additional_settings + """.format(table_name=table_name, + on_cluster="ON CLUSTER '{}'".format(node.name) if replicated else "", + engine="ReplicatedMergeTree('/clickhouse/tables/{cluster}/test', '{replica}')" if replicated else "MergeTree()") node.query(create_table_statement) @@ -107,17 +106,23 @@ def drop_shadow_information(node): node.exec_in_container(['bash', '-c', 'rm -rf /var/lib/clickhouse/shadow/*'], user='root') -def create_restore_file(node, revision=0, bucket=None, path=None): - add_restore_option = 'echo -en "{}\n" >> /var/lib/clickhouse/disks/s3/restore' - node.exec_in_container(['bash', '-c', add_restore_option.format(revision)], user='root') +def create_restore_file(node, revision=None, bucket=None, path=None, detached=None): + node.exec_in_container(['bash', '-c', 'touch /var/lib/clickhouse/disks/s3/restore'], user='root') + + add_restore_option = 'echo -en "{}={}\n" >> /var/lib/clickhouse/disks/s3/restore' + if revision: + node.exec_in_container(['bash', '-c', add_restore_option.format('revision', revision)], user='root') if bucket: - node.exec_in_container(['bash', '-c', add_restore_option.format(bucket)], user='root') + node.exec_in_container(['bash', '-c', add_restore_option.format('source_bucket', bucket)], user='root') if path: - node.exec_in_container(['bash', '-c', add_restore_option.format(path)], user='root') + node.exec_in_container(['bash', '-c', add_restore_option.format('source_path', path)], user='root') + if detached: + node.exec_in_container(['bash', '-c', add_restore_option.format('detached', 'true')], user='root') def get_revision_counter(node, backup_number): - return int(node.exec_in_container(['bash', '-c', 'cat /var/lib/clickhouse/disks/s3/shadow/{}/revision.txt'.format(backup_number)], user='root')) + return int(node.exec_in_container( + ['bash', '-c', 'cat /var/lib/clickhouse/disks/s3/shadow/{}/revision.txt'.format(backup_number)], user='root')) @pytest.fixture(autouse=True) @@ -128,7 +133,8 @@ def drop_table(cluster): for node_name in node_names: node = cluster.instances[node_name] - node.query("DROP TABLE IF EXISTS s3.test NO DELAY") + node.query("DROP TABLE IF EXISTS s3.test SYNC") + node.query("DROP DATABASE IF EXISTS s3 SYNC") drop_s3_metadata(node) drop_shadow_information(node) @@ -138,32 +144,23 @@ def drop_table(cluster): purge_s3(cluster, bucket) -def test_full_restore(cluster): +@pytest.mark.parametrize( + "replicated", [False, True] +) +def test_full_restore(cluster, replicated): node = cluster.instances["node"] - create_table(node, 
"test") + create_table(node, "test", replicated) node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-03', 4096))) node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-04', 4096, -1))) node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-05', 4096))) node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-05', 4096, -1))) - # To ensure parts have merged - node.query("OPTIMIZE TABLE s3.test") - - assert node.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) - assert node.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) - node.stop_clickhouse() drop_s3_metadata(node) - node.start_clickhouse() - - # All data is removed. - assert node.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(0) - - node.stop_clickhouse() create_restore_file(node) - node.start_clickhouse(10) + node.start_clickhouse() assert node.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) assert node.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -191,7 +188,7 @@ def test_restore_another_bucket_path(cluster): node_another_bucket.stop_clickhouse() create_restore_file(node_another_bucket, bucket="root") - node_another_bucket.start_clickhouse(10) + node_another_bucket.start_clickhouse() assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -202,7 +199,7 @@ def test_restore_another_bucket_path(cluster): node_another_bucket_path.stop_clickhouse() create_restore_file(node_another_bucket_path, bucket="root2", path="data") - node_another_bucket_path.start_clickhouse(10) + node_another_bucket_path.start_clickhouse() assert node_another_bucket_path.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) assert node_another_bucket_path.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -244,7 +241,7 @@ def test_restore_different_revisions(cluster): drop_s3_metadata(node_another_bucket) purge_s3(cluster, cluster.minio_bucket_2) create_restore_file(node_another_bucket, revision=revision1, bucket="root") - node_another_bucket.start_clickhouse(10) + node_another_bucket.start_clickhouse() assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -255,7 +252,7 @@ def test_restore_different_revisions(cluster): drop_s3_metadata(node_another_bucket) purge_s3(cluster, cluster.minio_bucket_2) create_restore_file(node_another_bucket, revision=revision2, bucket="root") - node_another_bucket.start_clickhouse(10) + node_another_bucket.start_clickhouse() assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -266,7 +263,7 @@ def test_restore_different_revisions(cluster): drop_s3_metadata(node_another_bucket) purge_s3(cluster, cluster.minio_bucket_2) create_restore_file(node_another_bucket, revision=revision3, bucket="root") - node_another_bucket.start_clickhouse(10) + node_another_bucket.start_clickhouse() assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) assert node_another_bucket.query("SELECT sum(id) 
FROM s3.test FORMAT Values") == "({})".format(0) @@ -298,7 +295,7 @@ def test_restore_mutations(cluster): drop_s3_metadata(node_another_bucket) purge_s3(cluster, cluster.minio_bucket_2) create_restore_file(node_another_bucket, revision=revision_before_mutation, bucket="root") - node_another_bucket.start_clickhouse(10) + node_another_bucket.start_clickhouse() assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -309,7 +306,7 @@ def test_restore_mutations(cluster): drop_s3_metadata(node_another_bucket) purge_s3(cluster, cluster.minio_bucket_2) create_restore_file(node_another_bucket, revision=revision_after_mutation, bucket="root") - node_another_bucket.start_clickhouse(10) + node_another_bucket.start_clickhouse() assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 2) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) @@ -323,7 +320,7 @@ def test_restore_mutations(cluster): purge_s3(cluster, cluster.minio_bucket_2) revision = (revision_before_mutation + revision_after_mutation) // 2 create_restore_file(node_another_bucket, revision=revision, bucket="root") - node_another_bucket.start_clickhouse(10) + node_another_bucket.start_clickhouse() # Wait for unfinished mutation completion. time.sleep(3) @@ -365,7 +362,57 @@ def test_migrate_to_restorable_schema(cluster): drop_s3_metadata(node_another_bucket) purge_s3(cluster, cluster.minio_bucket_2) create_restore_file(node_another_bucket, revision=revision, bucket="root", path="another_data") - node_another_bucket.start_clickhouse(10) + node_another_bucket.start_clickhouse() assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 6) assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + + +@pytest.mark.parametrize( + "replicated", [False, True] +) +def test_restore_to_detached(cluster, replicated): + node = cluster.instances["node"] + + create_table(node, "test", replicated) + + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-03', 4096))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-04', 4096, -1))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-05', 4096))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-06', 4096, -1))) + node.query("INSERT INTO s3.test VALUES {}".format(generate_values('2020-01-07', 4096, 0))) + + # Add some mutation. + node.query("ALTER TABLE s3.test UPDATE counter = 1 WHERE 1", settings={"mutations_sync": 2}) + + # Detach some partition. 
+ node.query("ALTER TABLE s3.test DETACH PARTITION '2020-01-07'") + + node.query("ALTER TABLE s3.test FREEZE") + revision = get_revision_counter(node, 1) + + node_another_bucket = cluster.instances["node_another_bucket"] + + create_table(node_another_bucket, "test", replicated) + + node_another_bucket.stop_clickhouse() + create_restore_file(node_another_bucket, revision=revision, bucket="root", path="data", detached=True) + node_another_bucket.start_clickhouse() + + assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(0) + + node_another_bucket.query("ALTER TABLE s3.test ATTACH PARTITION '2020-01-03'") + node_another_bucket.query("ALTER TABLE s3.test ATTACH PARTITION '2020-01-04'") + node_another_bucket.query("ALTER TABLE s3.test ATTACH PARTITION '2020-01-05'") + node_another_bucket.query("ALTER TABLE s3.test ATTACH PARTITION '2020-01-06'") + + assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) + assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + assert node_another_bucket.query("SELECT sum(counter) FROM s3.test FORMAT Values") == "({})".format(4096 * 4) + + # Attach partition that was already detached before backup-restore. + node_another_bucket.query("ALTER TABLE s3.test ATTACH PARTITION '2020-01-07'") + + assert node_another_bucket.query("SELECT count(*) FROM s3.test FORMAT Values") == "({})".format(4096 * 5) + assert node_another_bucket.query("SELECT sum(id) FROM s3.test FORMAT Values") == "({})".format(0) + assert node_another_bucket.query("SELECT sum(counter) FROM s3.test FORMAT Values") == "({})".format(4096 * 5) diff --git a/tests/integration/test_secure_socket/test.py b/tests/integration/test_secure_socket/test.py index 0ca6e6a6e6b..65c789f9d02 100644 --- a/tests/integration/test_secure_socket/test.py +++ b/tests/integration/test_secure_socket/test.py @@ -64,7 +64,7 @@ def test(started_cluster): assert end - start < 10 start = time.time() - error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, send_timeout=5, use_hedged_requests=0;') + error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, send_timeout=5, use_hedged_requests=0, async_socket_for_remote=1;') end = time.time() assert end - start < 10 @@ -73,7 +73,7 @@ def test(started_cluster): assert error.find('DB::ReadBufferFromPocoSocket::nextImpl()') == -1 start = time.time() - error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, send_timeout=5;') + error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, send_timeout=5, use_hedged_requests=1, async_socket_for_remote=1;') end = time.time() assert end - start < 10 diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index a6c8b7e1ee9..a0dc342e910 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -201,6 +201,24 @@ def test_write_gzip_storage(started_cluster): assert started_cluster.hdfs_api.read_gzip_data("/gzip_storage") == "1\tMark\t72.53\n" assert node1.query("select * from GZIPHDFSStorage") == "1\tMark\t72.53\n" + +def test_virtual_columns(started_cluster): + node1.query("create table virtual_cols (id UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/file*', 'TSV')") + started_cluster.hdfs_api.write_data("/file1", "1\n") + 
started_cluster.hdfs_api.write_data("/file2", "2\n") + started_cluster.hdfs_api.write_data("/file3", "3\n") + expected = "1\tfile1\thdfs://hdfs1:9000//file1\n2\tfile2\thdfs://hdfs1:9000//file2\n3\tfile3\thdfs://hdfs1:9000//file3\n" + assert node1.query("select id, _file as file_name, _path as file_path from virtual_cols order by id") == expected + + +def test_read_files_with_spaces(started_cluster): + started_cluster.hdfs_api.write_data("/test test test 1.txt", "1\n") + started_cluster.hdfs_api.write_data("/test test test 2.txt", "2\n") + started_cluster.hdfs_api.write_data("/test test test 3.txt", "3\n") + node1.query("create table test (id UInt32) ENGINE = HDFS('hdfs://hdfs1:9000/test*', 'TSV')") + assert node1.query("select * from test order by id") == "1\n2\n3\n" + + if __name__ == '__main__': cluster.start() input("Cluster created, press any key to destroy...") diff --git a/tests/integration/test_storage_mysql/configs/users.xml b/tests/integration/test_storage_mysql/configs/users.xml new file mode 100644 index 00000000000..27c4d46984e --- /dev/null +++ b/tests/integration/test_storage_mysql/configs/users.xml @@ -0,0 +1,18 @@ + + + + 2 + + + + + + + + ::/0 + + default + + + diff --git a/tests/integration/test_storage_mysql/test.py b/tests/integration/test_storage_mysql/test.py index 4520b3f3837..9c3abd799af 100644 --- a/tests/integration/test_storage_mysql/test.py +++ b/tests/integration/test_storage_mysql/test.py @@ -9,6 +9,7 @@ cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance('node1', main_configs=['configs/remote_servers.xml'], with_mysql=True) node2 = cluster.add_instance('node2', main_configs=['configs/remote_servers.xml'], with_mysql_cluster=True) +node3 = cluster.add_instance('node3', main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml'], with_mysql=True) create_table_sql_template = """ CREATE TABLE `clickhouse`.`{}` ( @@ -260,6 +261,25 @@ def test_mysql_distributed(started_cluster): assert(result == 'host2\nhost4\n' or result == 'host3\nhost4\n') +def test_external_settings(started_cluster): + table_name = 'test_external_settings' + conn = get_mysql_conn() + create_mysql_table(conn, table_name) + + node3.query(''' +CREATE TABLE {}(id UInt32, name String, age UInt32, money UInt32) ENGINE = MySQL('mysql1:3306', 'clickhouse', '{}', 'root', 'clickhouse'); +'''.format(table_name, table_name)) + node3.query( + "INSERT INTO {}(id, name, money) select number, concat('name_', toString(number)), 3 from numbers(100) ".format( + table_name)) + assert node3.query("SELECT count() FROM {}".format(table_name)).rstrip() == '100' + assert node3.query("SELECT sum(money) FROM {}".format(table_name)).rstrip() == '300' + assert node3.query("select value from system.settings where name = 'max_block_size' FORMAT TSV") == "2\n" + assert node3.query("select value from system.settings where name = 'external_storage_max_read_rows' FORMAT TSV") == "0\n" + assert node3.query("SELECT COUNT(DISTINCT blockNumber()) FROM {} FORMAT TSV".format(table_name)) == '50\n' + conn.close() + + if __name__ == '__main__': with contextmanager(started_cluster)() as cluster: for name, instance in list(cluster.instances.items()): diff --git a/tests/integration/test_storage_s3/s3_mock/mock_s3.py b/tests/integration/test_storage_s3/s3_mocks/mock_s3.py similarity index 89% rename from tests/integration/test_storage_s3/s3_mock/mock_s3.py rename to tests/integration/test_storage_s3/s3_mocks/mock_s3.py index 088cc883e57..3e876689175 100644 --- a/tests/integration/test_storage_s3/s3_mock/mock_s3.py +++ 
b/tests/integration/test_storage_s3/s3_mocks/mock_s3.py @@ -1,3 +1,5 @@ +import sys + from bottle import abort, route, run, request, response @@ -21,4 +23,4 @@ def ping(): return 'OK' -run(host='0.0.0.0', port=8080) +run(host='0.0.0.0', port=int(sys.argv[1])) diff --git a/tests/integration/test_storage_s3/s3_mocks/unstable_server.py b/tests/integration/test_storage_s3/s3_mocks/unstable_server.py new file mode 100644 index 00000000000..4a27845ff9f --- /dev/null +++ b/tests/integration/test_storage_s3/s3_mocks/unstable_server.py @@ -0,0 +1,90 @@ +import http.server +import random +import re +import socket +import struct +import sys + + +def gen_n_digit_number(n): + assert 0 < n < 19 + return random.randint(10**(n-1), 10**n-1) + + +def gen_line(): + columns = 4 + + row = [] + def add_number(): + digits = random.randint(1, 18) + row.append(gen_n_digit_number(digits)) + + for i in range(columns // 2): + add_number() + row.append(1) + for i in range(columns - 1 - columns // 2): + add_number() + + line = ",".join(map(str, row)) + "\n" + return line.encode() + + +random.seed("Unstable server/1.0") +lines = b"".join((gen_line() for _ in range(500000))) + + +class RequestHandler(http.server.BaseHTTPRequestHandler): + def do_HEAD(self): + if self.path == "/root/test.csv": + self.from_bytes = 0 + self.end_bytes = len(lines) + self.size = self.end_bytes + self.send_block_size = 256 + self.stop_at = random.randint(900000, 1200000) // self.send_block_size # Block size is 1024**2. + + if "Range" in self.headers: + cr = self.headers["Range"] + parts = re.split("[ -/=]+", cr) + assert parts[0] == "bytes" + self.from_bytes = int(parts[1]) + if parts[2]: + self.end_bytes = int(parts[2])+1 + self.send_response(206) + self.send_header("Content-Range", f"bytes {self.from_bytes}-{self.end_bytes-1}/{self.size}") + else: + self.send_response(200) + + self.send_header("Accept-Ranges", "bytes") + self.send_header("Content-Type", "text/plain") + self.send_header("Content-Length", f"{self.end_bytes-self.from_bytes}") + self.end_headers() + + elif self.path == "/": + self.send_response(200) + self.send_header("Content-Type", "text/plain") + self.end_headers() + + else: + self.send_response(404) + self.send_header("Content-Type", "text/plain") + self.end_headers() + + + def do_GET(self): + self.do_HEAD() + if self.path == "/root/test.csv": + for c, i in enumerate(range(self.from_bytes, self.end_bytes, self.send_block_size)): + self.wfile.write(lines[i:min(i+self.send_block_size, self.end_bytes)]) + if (c + 1) % self.stop_at == 0: + #self.wfile._sock.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, struct.pack("ii", 0, 0)) + #self.wfile._sock.shutdown(socket.SHUT_RDWR) + #self.wfile._sock.close() + print('Dropping connection') + break + + elif self.path == "/": + self.wfile.write(b"OK") + + +httpd = http.server.HTTPServer(("0.0.0.0", int(sys.argv[1])), RequestHandler) +httpd.serve_forever() diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 9e91aae66b3..c239dc68810 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -96,7 +96,7 @@ def cluster(): prepare_s3_bucket(cluster) logging.info("S3 bucket created") - run_s3_mock(cluster) + run_s3_mocks(cluster) yield cluster finally: @@ -384,26 +384,32 @@ def test_s3_glob_scheherazade(cluster): assert run_query(instance, query).splitlines() == ["1001\t1001\t1001\t1001"] -def run_s3_mock(cluster): - logging.info("Starting s3 mock") - container_id = 
cluster.get_container_id('resolver') - current_dir = os.path.dirname(__file__) - cluster.copy_file_to_container(container_id, os.path.join(current_dir, "s3_mock", "mock_s3.py"), "mock_s3.py") - cluster.exec_in_container(container_id, ["python", "mock_s3.py"], detach=True) +def run_s3_mocks(cluster): + logging.info("Starting s3 mocks") + mocks = ( + ("mock_s3.py", "resolver", "8080"), + ("unstable_server.py", "resolver", "8081"), + ) + for mock_filename, container, port in mocks: + container_id = cluster.get_container_id(container) + current_dir = os.path.dirname(__file__) + cluster.copy_file_to_container(container_id, os.path.join(current_dir, "s3_mocks", mock_filename), mock_filename) + cluster.exec_in_container(container_id, ["python", mock_filename, port], detach=True) - # Wait for S3 mock start - for attempt in range(10): - ping_response = cluster.exec_in_container(cluster.get_container_id('resolver'), - ["curl", "-s", "http://resolver:8080/"], nothrow=True) - if ping_response != 'OK': - if attempt == 9: - assert ping_response == 'OK', 'Expected "OK", but got "{}"'.format(ping_response) + # Wait for S3 mocks to start + for mock_filename, container, port in mocks: + for attempt in range(10): + ping_response = cluster.exec_in_container(cluster.get_container_id(container), + ["curl", "-s", f"http://{container}:{port}/"], nothrow=True) + if ping_response != 'OK': + if attempt == 9: + assert ping_response == 'OK', 'Expected "OK", but got "{}"'.format(ping_response) + else: + time.sleep(1) else: - time.sleep(1) - else: - break + break - logging.info("S3 mock started") + logging.info("S3 mocks started") def replace_config(old, new): @@ -523,6 +529,15 @@ def test_storage_s3_get_gzip(cluster, extension, method): run_query(instance, f"DROP TABLE {name}") +def test_storage_s3_get_unstable(cluster): + bucket = cluster.minio_bucket + instance = cluster.instances["dummy"] + table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64" + get_query = f"SELECT count(), sum(column3) FROM s3('http://resolver:8081/{cluster.minio_bucket}/test.csv', 'CSV', '{table_format}') FORMAT CSV" + result = run_query(instance, get_query) + assert result.splitlines() == ["500000,500000"] + + def test_storage_s3_put_uncompressed(cluster): bucket = cluster.minio_bucket instance = cluster.instances["dummy"] diff --git a/tests/jepsen.clickhouse-keeper/resources/zoo.cfg b/tests/jepsen.clickhouse-keeper/resources/zoo.cfg new file mode 100644 index 00000000000..fd49be16d0f --- /dev/null +++ b/tests/jepsen.clickhouse-keeper/resources/zoo.cfg @@ -0,0 +1,23 @@ +# http://hadoop.apache.org/zookeeper/docs/current/zookeeperAdmin.html + +# The number of milliseconds of each tick +tickTime=2000 +# The number of ticks that the initial +# synchronization phase can take +initLimit=10 +# The number of ticks that can pass between +# sending a request and getting an acknowledgement +syncLimit=5 +# the directory where the snapshot is stored. +dataDir=/var/lib/zookeeper +# Place the dataLogDir to a separate physical disc for better performance +# dataLogDir=/disk2/zookeeper + +# the port at which the clients will connect +clientPort=2181 + +# Leader accepts client connections. Default value is "yes". The leader machine +# coordinates updates. For higher update throughput at thes slight expense of +# read throughput the leader can be configured to not accept clients and focus +# on coordination. 
+leaderServes=yes diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/bench.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/bench.clj new file mode 100644 index 00000000000..040d2eaa77b --- /dev/null +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/bench.clj @@ -0,0 +1,39 @@ +(ns jepsen.clickhouse-keeper.bench + (:require [clojure.tools.logging :refer :all] + [jepsen + [client :as client]]) + (:import (java.lang ProcessBuilder) + (java.lang ProcessBuilder$Redirect))) + +(defn exec-process-builder + [command & args] + (let [pbuilder (ProcessBuilder. (into-array (cons command args)))] + (.redirectOutput pbuilder ProcessBuilder$Redirect/INHERIT) + (.redirectError pbuilder ProcessBuilder$Redirect/INHERIT) + (let [p (.start pbuilder)] + (.waitFor p)))) + +(defrecord BenchClient [port] + client/Client + (open! [this test node] + this) + + (setup! [this test] + this) + + (invoke! [this test op] + (let [bench-opts (into [] (clojure.string/split (:bench-opts op) #" ")) + bench-path (:bench-path op) + nodes (into [] (flatten (map (fn [x] (identity ["-h" (str x ":" port)])) (:nodes test)))) + all-args (concat [bench-path] bench-opts nodes)] + (info "Running cmd" all-args) + (apply exec-process-builder all-args) + (assoc op :type :ok :value "ok"))) + + (teardown! [_ test]) + + (close! [_ test])) + +(defn bench-client + [port] + (BenchClient. port)) diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/db.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/db.clj index 5a13bf024ae..fdb6b233fec 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/db.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/db.clj @@ -19,12 +19,17 @@ [url] (non-precise-cached-wget! url)) +(defn get-clickhouse-scp + [path] + (c/upload path (str common-prefix "/clickhouse"))) + (defn download-clickhouse [source] (info "Downloading clickhouse from" source) (cond (clojure.string/starts-with? source "rbtorrent:") (get-clickhouse-sky source) (clojure.string/starts-with? source "http") (get-clickhouse-url source) + (.exists (io/file source)) (get-clickhouse-scp source) :else (throw (Exception. (str "Don't know how to download clickhouse from" source))))) (defn unpack-deb @@ -128,11 +133,11 @@ db/LogFiles (log-files [_ test node] (c/su - (if (cu/exists? pid-file-path) - (do - (info node "Collecting traces") - (collect-traces test node)) - (info node "Pid files doesn't exists")) + ;(if (cu/exists? pid-file-path) + ;(do + ; (info node "Collecting traces") + ; (collect-traces test node)) + ;(info node "Pid files doesn't exists")) (kill-clickhouse! node test) (if (cu/exists? 
coordination-data-dir) (do diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/main.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/main.clj index f88026500e6..0384d4d583a 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/main.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/main.clj @@ -4,11 +4,13 @@ [clojure.pprint :refer [pprint]] [jepsen.clickhouse-keeper.set :as set] [jepsen.clickhouse-keeper.db :refer :all] + [jepsen.clickhouse-keeper.zookeeperdb :refer :all] [jepsen.clickhouse-keeper.nemesis :as custom-nemesis] [jepsen.clickhouse-keeper.register :as register] [jepsen.clickhouse-keeper.unique :as unique] [jepsen.clickhouse-keeper.queue :as queue] [jepsen.clickhouse-keeper.counter :as counter] + [jepsen.clickhouse-keeper.bench :as bench] [jepsen.clickhouse-keeper.constants :refer :all] [clojure.string :as str] [jepsen @@ -72,12 +74,29 @@ :validate [pos? "Must be a positive integer."]] [nil, "--lightweight-run" "Subset of workloads/nemesises which is simple to validate"] [nil, "--reuse-binary" "Use already downloaded binary if it exists, don't remove it on shutdown"] + [nil, "--bench" "Run perf-test mode"] + [nil, "--zookeeper-version VERSION" "Run zookeeper with version" + :default ""] + [nil, "--bench-opts STR" "Run perf-test mode" + :default "--generator list_medium_nodes -c 30 -i 1000"] ["-c" "--clickhouse-source URL" "URL for clickhouse deb or tgz package" - :default "https://clickhouse-builds.s3.yandex.net/21677/ef82333089156907a0979669d9374c2e18daabe5/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/clickhouse-common-static_21.4.1.6313_amd64.deb"]]) + :default "https://clickhouse-builds.s3.yandex.net/21677/ef82333089156907a0979669d9374c2e18daabe5/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/clickhouse-common-static_21.4.1.6313_amd64.deb"] + [nil "--bench-path path" "Path to keeper-bench util" + :default "/home/alesap/code/cpp/BuildCH/utils/keeper-bench/keeper-bench"]]) -(defn clickhouse-keeper-test - "Given an options map from the command line runner (e.g. :nodes, :ssh, - :concurrency, ...), constructs a test map." +(defn get-db + [opts] + (if (empty? (:zookeeper-version opts)) + (db (:clickhouse-source opts) (boolean (:reuse-binary opts))) + (zookeeper-db (:zookeeper-version opts)))) + +(defn get-port + [opts] + (if (empty? (:zookeeper-version opts)) + 9181 + 2181)) + +(defn clickhouse-func-tests [opts] (info "Test opts\n" (with-out-str (pprint opts))) (let [quorum (boolean (:quorum opts)) @@ -87,7 +106,7 @@ opts {:name (str "clickhouse-keeper-quorum=" quorum "-" (name (:workload opts)) "-" (name (:nemesis opts))) :os ubuntu/os - :db (db (:clickhouse-source opts) (boolean (:reuse-binary opts))) + :db (get-db opts) :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) @@ -105,6 +124,30 @@ (gen/sleep 10) (gen/clients (:final-generator workload)))}))) +(defn clickhouse-perf-test + [opts] + (info "Starting performance test") + (let [dct {:type :invoke :bench-opts (:bench-opts opts) :bench-path (:bench-path opts)}] + (merge tests/noop-test + opts + {:name (str "clickhouse-keeper-perf") + :os ubuntu/os + :db (get-db opts) + :pure-generators true + :client (bench/bench-client (get-port opts)) + :nemesis nemesis/noop + :generator (->> dct + (gen/stagger 1) + (gen/nemesis nil))}))) + +(defn clickhouse-keeper-test + "Given an options map from the command line runner (e.g. 
:nodes, :ssh, + :concurrency, ...), constructs a test map." + [opts] + (if (boolean (:bench opts)) + (clickhouse-perf-test opts) + (clickhouse-func-tests opts))) + (def all-nemesises (keys custom-nemesis/custom-nemesises)) (def all-workloads (keys workloads)) diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/utils.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/utils.clj index 4abe797ea05..70813457251 100644 --- a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/utils.clj +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/utils.clj @@ -45,7 +45,7 @@ (defn zk-connect [host port timeout] - (exec-with-retries 15 (fn [] (zk/connect (str host ":" port) :timeout-msec timeout)))) + (exec-with-retries 30 (fn [] (zk/connect (str host ":" port) :timeout-msec timeout)))) (defn zk-create-range [conn n] diff --git a/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/zookeeperdb.clj b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/zookeeperdb.clj new file mode 100644 index 00000000000..7cb88cd1fd9 --- /dev/null +++ b/tests/jepsen.clickhouse-keeper/src/jepsen/clickhouse_keeper/zookeeperdb.clj @@ -0,0 +1,64 @@ +(ns jepsen.clickhouse-keeper.zookeeperdb + (:require [clojure.tools.logging :refer :all] + [jepsen.clickhouse-keeper.utils :refer :all] + [clojure.java.io :as io] + [jepsen + [control :as c] + [db :as db]] + [jepsen.os.ubuntu :as ubuntu])) + +(defn zk-node-ids + "Returns a map of node names to node ids." + [test] + (->> test + :nodes + (map-indexed (fn [i node] [node (inc i)])) + (into {}))) + +(defn zk-node-id + "Given a test and a node name from that test, returns the ID for that node." + [test node] + ((zk-node-ids test) node)) + +(defn zoo-cfg-servers + "Constructs a zoo.cfg fragment for servers." + [test mynode] + (->> (zk-node-ids test) + (map (fn [[node id]] + (str "server." id "=" (if (= (name node) mynode) "0.0.0.0" (name node)) ":2888:3888"))) + (clojure.string/join "\n"))) + +(defn zookeeper-db + "Zookeeper DB for a particular version." + [version] + (reify db/DB + (setup! [_ test node] + (c/su + (info node "Installing ZK" version) + (c/exec :apt-get :update) + (c/exec :apt-get :install (str "zookeeper=" version)) + (c/exec :apt-get :install (str "zookeeperd=" version)) + (c/exec :echo (zk-node-id test node) :> "/etc/zookeeper/conf/myid") + + (c/exec :echo (str (slurp (io/resource "zoo.cfg")) + "\n" + (zoo-cfg-servers test node)) + :> "/etc/zookeeper/conf/zoo.cfg") + + (info node "ZK restarting") + (c/exec :service :zookeeper :restart) + (info "Connecting to zk" (name node)) + (zk-connect (name node) 2181 1000) + (info node "ZK ready"))) + + (teardown! 
[_ test node] + (info node "tearing down ZK") + (c/su + (c/exec :service :zookeeper :stop :|| true) + (c/exec :rm :-rf + (c/lit "/var/lib/zookeeper/version-*") + (c/lit "/var/log/zookeeper/*")))) + + db/LogFiles + (log-files [_ test node] + ["/var/log/zookeeper/zookeeper.log"]))) diff --git a/tests/performance/agg_functions_min_max_any.xml b/tests/performance/agg_functions_min_max_any.xml index 79c9e2c6976..6ca9e3eb65a 100644 --- a/tests/performance/agg_functions_min_max_any.xml +++ b/tests/performance/agg_functions_min_max_any.xml @@ -6,7 +6,9 @@ group_scale - 1000000 + + 1000000 + diff --git a/tests/performance/fuse_sumcount.xml b/tests/performance/fuse_sumcount.xml new file mode 100644 index 00000000000..b2eb0e678e2 --- /dev/null +++ b/tests/performance/fuse_sumcount.xml @@ -0,0 +1,33 @@ + + + + 1 + + + + + key + + 1 + intHash32(number) % 1000 + + + + + SELECT sum(number) FROM numbers(1000000000) FORMAT Null + SELECT sum(number), count(number) FROM numbers(1000000000) FORMAT Null + SELECT sum(number), count(number) FROM numbers(1000000000) SETTINGS optimize_fuse_sum_count_avg = 0 FORMAT Null + SELECT sum(number), avg(number), count(number) FROM numbers(1000000000) FORMAT Null + SELECT sum(number), avg(number), count(number) FROM numbers(1000000000) SETTINGS optimize_fuse_sum_count_avg = 0 FORMAT Null + + SELECT sum(number) FROM numbers(100000000) GROUP BY intHash32(number) % 1000 FORMAT Null + SELECT sum(number), count(number) FROM numbers(100000000) GROUP BY intHash32(number) % 1000 FORMAT Null + SELECT sum(number), count(number) FROM numbers(100000000) GROUP BY intHash32(number) % 1000 SETTINGS optimize_fuse_sum_count_avg = 0 FORMAT Null + SELECT sum(number), avg(number), count(number) FROM numbers(100000000) GROUP BY intHash32(number) % 1000 FORMAT Null + SELECT sum(number), avg(number), count(number) FROM numbers(100000000) GROUP BY intHash32(number) % 1000 SETTINGS optimize_fuse_sum_count_avg = 0 FORMAT Null + diff --git a/tests/queries/0_stateless/00717_merge_and_distributed.sql b/tests/queries/0_stateless/00717_merge_and_distributed.sql index f0d34b5165f..35dad18937a 100644 --- a/tests/queries/0_stateless/00717_merge_and_distributed.sql +++ b/tests/queries/0_stateless/00717_merge_and_distributed.sql @@ -18,9 +18,9 @@ SELECT * FROM merge(currentDatabase(), 'test_local_1'); SELECT *, _table FROM merge(currentDatabase(), 'test_local_1') ORDER BY _table; SELECT sum(value), _table FROM merge(currentDatabase(), 'test_local_1') GROUP BY _table ORDER BY _table; SELECT * FROM merge(currentDatabase(), 'test_local_1') WHERE _table = 'test_local_1'; -SELECT * FROM merge(currentDatabase(), 'test_local_1') PREWHERE _table = 'test_local_1'; -- { serverError 16 } +SELECT * FROM merge(currentDatabase(), 'test_local_1') PREWHERE _table = 'test_local_1'; -- { serverError 10 } SELECT * FROM merge(currentDatabase(), 'test_local_1') WHERE _table in ('test_local_1', 'test_local_2'); -SELECT * FROM merge(currentDatabase(), 'test_local_1') PREWHERE _table in ('test_local_1', 'test_local_2'); -- { serverError 16 } +SELECT * FROM merge(currentDatabase(), 'test_local_1') PREWHERE _table in ('test_local_1', 'test_local_2'); -- { serverError 10 } SELECT '--------------Single Distributed------------'; SELECT * FROM merge(currentDatabase(), 'test_distributed_1'); @@ -36,9 +36,9 @@ SELECT * FROM merge(currentDatabase(), 'test_local_1|test_local_2') ORDER BY _ta SELECT *, _table FROM merge(currentDatabase(), 'test_local_1|test_local_2') ORDER BY _table; SELECT sum(value), _table FROM merge(currentDatabase(), 
'test_local_1|test_local_2') GROUP BY _table ORDER BY _table; SELECT * FROM merge(currentDatabase(), 'test_local_1|test_local_2') WHERE _table = 'test_local_1'; -SELECT * FROM merge(currentDatabase(), 'test_local_1|test_local_2') PREWHERE _table = 'test_local_1'; -- { serverError 16 } +SELECT * FROM merge(currentDatabase(), 'test_local_1|test_local_2') PREWHERE _table = 'test_local_1'; -- { serverError 10 } SELECT * FROM merge(currentDatabase(), 'test_local_1|test_local_2') WHERE _table in ('test_local_1', 'test_local_2') ORDER BY value; -SELECT * FROM merge(currentDatabase(), 'test_local_1|test_local_2') PREWHERE _table in ('test_local_1', 'test_local_2') ORDER BY value; -- { serverError 16 } +SELECT * FROM merge(currentDatabase(), 'test_local_1|test_local_2') PREWHERE _table in ('test_local_1', 'test_local_2') ORDER BY value; -- { serverError 10 } SELECT '--------------Local Merge Distributed------------'; SELECT * FROM merge(currentDatabase(), 'test_local_1|test_distributed_2') ORDER BY _table; diff --git a/tests/queries/0_stateless/01508_partition_pruning_long.reference b/tests/queries/0_stateless/01508_partition_pruning_long.reference index 70f529c6058..334ecb63164 100644 --- a/tests/queries/0_stateless/01508_partition_pruning_long.reference +++ b/tests/queries/0_stateless/01508_partition_pruning_long.reference @@ -5,11 +5,11 @@ Selected 0/6 parts by partition key, 0 parts by primary key, 0/0 marks by primar select uniqExact(_part), count() from tMM where toDate(d)=toDate('2020-09-01'); 2 2880 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from tMM where toDate(d)=toDate('2020-10-15'); 1 1440 -Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/6 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() from tMM where toDate(d)='2020-09-15'; 0 0 @@ -17,27 +17,27 @@ Selected 0/6 parts by partition key, 0 parts by primary key, 0/0 marks by primar select uniqExact(_part), count() from tMM where toYYYYMM(d)=202009; 2 10000 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from tMM where toYYYYMMDD(d)=20200816; 2 2880 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from tMM where toYYYYMMDD(d)=20201015; 1 1440 -Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/6 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() from tMM where toDate(d)='2020-10-15'; 1 1440 -Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/6 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() from 
tMM where d >= '2020-09-01 00:00:00' and d<'2020-10-15 00:00:00'; 3 15000 -Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges +Selected 3/6 parts by partition key, 3 parts by primary key, 3/3 marks by primary key, 3 marks to read from 3 ranges select uniqExact(_part), count() from tMM where d >= '2020-01-16 00:00:00' and d < toDateTime('2021-08-17 00:00:00'); 6 30000 -Selected 6/6 parts by partition key, 6 parts by primary key, 6/12 marks by primary key, 6 marks to read from 6 ranges +Selected 6/6 parts by partition key, 6 parts by primary key, 6/6 marks by primary key, 6 marks to read from 6 ranges select uniqExact(_part), count() from tMM where d >= '2020-09-16 00:00:00' and d < toDateTime('2020-10-01 00:00:00'); 0 0 @@ -45,117 +45,117 @@ Selected 0/6 parts by partition key, 0 parts by primary key, 0/0 marks by primar select uniqExact(_part), count() from tMM where d >= '2020-09-12 00:00:00' and d < '2020-10-16 00:00:00'; 2 6440 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from tMM where toStartOfDay(d) >= '2020-09-12 00:00:00'; 2 10000 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from tMM where toStartOfDay(d) = '2020-09-01 00:00:00'; 2 2880 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from tMM where toStartOfDay(d) = '2020-10-01 00:00:00'; 1 1440 -Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/6 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() from tMM where toStartOfDay(d) >= '2020-09-15 00:00:00' and d < '2020-10-16 00:00:00'; 2 6440 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from tMM where toYYYYMM(d) between 202009 and 202010; 4 20000 -Selected 4/6 parts by partition key, 4 parts by primary key, 4/8 marks by primary key, 4 marks to read from 4 ranges +Selected 4/6 parts by partition key, 4 parts by primary key, 4/4 marks by primary key, 4 marks to read from 4 ranges select uniqExact(_part), count() from tMM where toYYYYMM(d) between 202009 and 202009; 2 10000 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from tMM where toYYYYMM(d) between 202009 and 202010 and toStartOfDay(d) = '2020-10-01 00:00:00'; 1 1440 -Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/6 parts by 
partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() from tMM where toYYYYMM(d) >= 202009 and toStartOfDay(d) < '2020-10-02 00:00:00'; 3 11440 -Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges +Selected 3/6 parts by partition key, 3 parts by primary key, 3/3 marks by primary key, 3 marks to read from 3 ranges select uniqExact(_part), count() from tMM where toYYYYMM(d) > 202009 and toStartOfDay(d) < '2020-10-02 00:00:00'; 1 1440 -Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/6 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() from tMM where toYYYYMM(d)+1 > 202009 and toStartOfDay(d) < '2020-10-02 00:00:00'; 3 11440 -Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges +Selected 3/6 parts by partition key, 3 parts by primary key, 3/3 marks by primary key, 3 marks to read from 3 ranges select uniqExact(_part), count() from tMM where toYYYYMM(d)+1 > 202010 and toStartOfDay(d) < '2020-10-02 00:00:00'; 1 1440 -Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/6 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() from tMM where toYYYYMM(d)+1 > 202010; 2 10000 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from tMM where toYYYYMM(d-1)+1 = 202010; 3 9999 -Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges +Selected 3/6 parts by partition key, 3 parts by primary key, 3/3 marks by primary key, 3 marks to read from 3 ranges select uniqExact(_part), count() from tMM where toStartOfMonth(d) >= '2020-09-15'; 2 10000 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from tMM where toStartOfMonth(d) >= '2020-09-01'; 4 20000 -Selected 4/6 parts by partition key, 4 parts by primary key, 4/8 marks by primary key, 4 marks to read from 4 ranges +Selected 4/6 parts by partition key, 4 parts by primary key, 4/4 marks by primary key, 4 marks to read from 4 ranges select uniqExact(_part), count() from tMM where toStartOfMonth(d) >= '2020-09-01' and toStartOfMonth(d) < '2020-10-01'; 2 10000 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from tMM where toYYYYMM(d-1)+1 = 202010; 2 9999 -Selected 2/3 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/3 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from tMM where 
toYYYYMM(d)+1 > 202010; 1 10000 -Selected 1/3 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/3 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() from tMM where toYYYYMM(d) between 202009 and 202010; 2 20000 -Selected 2/3 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/3 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges --------- tDD ---------------------------- select uniqExact(_part), count() from tDD where toDate(d)=toDate('2020-09-24'); 1 10000 -Selected 1/4 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/4 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() FROM tDD WHERE toDate(d) = toDate('2020-09-24'); 1 10000 -Selected 1/4 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/4 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() FROM tDD WHERE toDate(d) = '2020-09-24'; 1 10000 -Selected 1/4 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/4 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() FROM tDD WHERE toDate(d) >= '2020-09-23' and toDate(d) <= '2020-09-26'; 3 40000 -Selected 3/4 parts by partition key, 3 parts by primary key, 4/7 marks by primary key, 4 marks to read from 3 ranges +Selected 3/4 parts by partition key, 3 parts by primary key, 4/4 marks by primary key, 4 marks to read from 3 ranges select uniqExact(_part), count() FROM tDD WHERE toYYYYMMDD(d) >= 20200923 and toDate(d) <= '2020-09-26'; 3 40000 -Selected 3/4 parts by partition key, 3 parts by primary key, 4/7 marks by primary key, 4 marks to read from 3 ranges +Selected 3/4 parts by partition key, 3 parts by primary key, 4/4 marks by primary key, 4 marks to read from 3 ranges --------- sDD ---------------------------- select uniqExact(_part), count() from sDD; 6 30000 -Selected 6/6 parts by partition key, 6 parts by primary key, 6/12 marks by primary key, 6 marks to read from 6 ranges +Selected 6/6 parts by partition key, 6 parts by primary key, 6/6 marks by primary key, 6 marks to read from 6 ranges select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC')-1)+1 = 202010; 3 9999 -Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges +Selected 3/6 parts by partition key, 3 parts by primary key, 3/3 marks by primary key, 3 marks to read from 3 ranges select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC')-1) = 202010; 2 9999 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC')-1) = 202110; 0 0 @@ -163,52 +163,52 @@ Selected 0/6 parts by partition key, 0 parts by primary key, 0/0 marks by primar select 
uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC'))+1 > 202009 and toStartOfDay(toDateTime(intDiv(d,1000),'UTC')) < toDateTime('2020-10-02 00:00:00','UTC'); 3 11440 -Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges +Selected 3/6 parts by partition key, 3 parts by primary key, 3/3 marks by primary key, 3 marks to read from 3 ranges select uniqExact(_part), count() from sDD where toYYYYMM(toDateTime(intDiv(d,1000),'UTC'))+1 > 202009 and toDateTime(intDiv(d,1000),'UTC') < toDateTime('2020-10-01 00:00:00','UTC'); 2 10000 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from sDD where d >= 1598918400000; 4 20000 -Selected 4/6 parts by partition key, 4 parts by primary key, 4/8 marks by primary key, 4 marks to read from 4 ranges +Selected 4/6 parts by partition key, 4 parts by primary key, 4/4 marks by primary key, 4 marks to read from 4 ranges select uniqExact(_part), count() from sDD where d >= 1598918400000 and toYYYYMM(toDateTime(intDiv(d,1000),'UTC')-1) < 202010; 3 10001 -Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges +Selected 3/6 parts by partition key, 3 parts by primary key, 3/3 marks by primary key, 3 marks to read from 3 ranges --------- xMM ---------------------------- select uniqExact(_part), count() from xMM where toStartOfDay(d) >= '2020-10-01 00:00:00'; 2 10000 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d <= '2020-10-01 00:00:00'; 3 10001 -Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges +Selected 3/6 parts by partition key, 3 parts by primary key, 3/3 marks by primary key, 3 marks to read from 3 ranges select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d < '2020-10-01 00:00:00'; 2 10000 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d <= '2020-10-01 00:00:00' and a=1; 1 1 -Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/6 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d <= '2020-10-01 00:00:00' and a<>3; 2 5001 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d < '2020-10-01 00:00:00' and a<>3; 1 5000 -Selected 1/6 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to 
read from 1 ranges +Selected 1/6 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d < '2020-11-01 00:00:00' and a = 1; 2 10000 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from xMM where a = 1; 3 15000 -Selected 3/6 parts by partition key, 3 parts by primary key, 3/6 marks by primary key, 3 marks to read from 3 ranges +Selected 3/6 parts by partition key, 3 parts by primary key, 3/3 marks by primary key, 3 marks to read from 3 ranges select uniqExact(_part), count() from xMM where a = 66; 0 0 @@ -216,29 +216,29 @@ Selected 0/6 parts by partition key, 0 parts by primary key, 0/0 marks by primar select uniqExact(_part), count() from xMM where a <> 66; 6 30000 -Selected 6/6 parts by partition key, 6 parts by primary key, 6/12 marks by primary key, 6 marks to read from 6 ranges +Selected 6/6 parts by partition key, 6 parts by primary key, 6/6 marks by primary key, 6 marks to read from 6 ranges select uniqExact(_part), count() from xMM where a = 2; 2 10000 -Selected 2/6 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/6 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from xMM where a = 1; 2 15000 -Selected 2/5 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/5 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from xMM where toStartOfDay(d) >= '2020-10-01 00:00:00'; 1 10000 -Selected 1/5 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/5 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges select uniqExact(_part), count() from xMM where a <> 66; 5 30000 -Selected 5/5 parts by partition key, 5 parts by primary key, 5/10 marks by primary key, 5 marks to read from 5 ranges +Selected 5/5 parts by partition key, 5 parts by primary key, 5/5 marks by primary key, 5 marks to read from 5 ranges select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d <= '2020-10-01 00:00:00' and a<>3; 2 5001 -Selected 2/5 parts by partition key, 2 parts by primary key, 2/4 marks by primary key, 2 marks to read from 2 ranges +Selected 2/5 parts by partition key, 2 parts by primary key, 2/2 marks by primary key, 2 marks to read from 2 ranges select uniqExact(_part), count() from xMM where d >= '2020-09-01 00:00:00' and d < '2020-10-01 00:00:00' and a<>3; 1 5000 -Selected 1/5 parts by partition key, 1 parts by primary key, 1/2 marks by primary key, 1 marks to read from 1 ranges +Selected 1/5 parts by partition key, 1 parts by primary key, 1/1 marks by primary key, 1 marks to read from 1 ranges diff --git a/tests/queries/0_stateless/01551_mergetree_read_in_order_spread.reference b/tests/queries/0_stateless/01551_mergetree_read_in_order_spread.reference index becc626c1bb..835e2af269a 100644 --- a/tests/queries/0_stateless/01551_mergetree_read_in_order_spread.reference +++ 
b/tests/queries/0_stateless/01551_mergetree_read_in_order_spread.reference @@ -13,16 +13,16 @@ ExpressionTransform (MergingSorted) (Expression) ExpressionTransform - (ReadFromStorage) + (ReadFromMergeTree) MergeTree 0 → 1 (MergingSorted) MergingSortedTransform 2 → 1 (Expression) ExpressionTransform × 2 - (ReadFromStorage) + (ReadFromMergeTree) MergeTree × 2 0 → 1 (MergingSorted) (Expression) ExpressionTransform - (ReadFromStorage) + (ReadFromMergeTree) MergeTree 0 → 1 diff --git a/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.reference b/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.reference index a1a1814a581..0eb7e06f724 100644 --- a/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.reference +++ b/tests/queries/0_stateless/01562_optimize_monotonous_functions_in_order_by.reference @@ -11,7 +11,7 @@ Expression (Projection) PartialSorting (Sort each block for ORDER BY) Expression (Before ORDER BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromStorage (MergeTree) + ReadFromMergeTree SELECT timestamp, key @@ -23,7 +23,7 @@ Expression (Projection) FinishSorting Expression (Before ORDER BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromStorage (MergeTree with order) + ReadFromMergeTree SELECT timestamp, key @@ -37,7 +37,7 @@ Expression (Projection) FinishSorting Expression (Before ORDER BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromStorage (MergeTree with order) + ReadFromMergeTree SELECT timestamp, key diff --git a/tests/queries/0_stateless/01566_negate_formatting.reference b/tests/queries/0_stateless/01566_negate_formatting.reference index b955d4cbbc5..69d79cf929a 100644 --- a/tests/queries/0_stateless/01566_negate_formatting.reference +++ b/tests/queries/0_stateless/01566_negate_formatting.reference @@ -9,12 +9,26 @@ SELECT explain syntax select negate(1.), negate(-1.), - -1., -(-1.), (-1.) in (-1.); SELECT -1., - 1, - 1, - 1, + 1., + 1., + 1., -1. IN (-1.) explain syntax select negate(-9223372036854775808), -(-9223372036854775808), - -9223372036854775808; SELECT -9223372036854775808, -9223372036854775808, -9223372036854775808 +explain syntax select negate(0), negate(-0), - -0, -(-0), (-0) in (-0); +SELECT + 0, + 0, + 0, + 0, + 0 IN (0) +explain syntax select negate(0.), negate(-0.), - -0., -(-0.), (-0.) in (-0.); +SELECT + -0., + 0., + 0., + 0., + -0. IN (-0.) diff --git a/tests/queries/0_stateless/01566_negate_formatting.sql b/tests/queries/0_stateless/01566_negate_formatting.sql index 035ff80e8d8..65e983fbdd1 100644 --- a/tests/queries/0_stateless/01566_negate_formatting.sql +++ b/tests/queries/0_stateless/01566_negate_formatting.sql @@ -2,3 +2,5 @@ explain syntax select negate(1), negate(-1), - -1, -(-1), (-1) in (-1); explain syntax select negate(1.), negate(-1.), - -1., -(-1.), (-1.) in (-1.); explain syntax select negate(-9223372036854775808), -(-9223372036854775808), - -9223372036854775808; +explain syntax select negate(0), negate(-0), - -0, -(-0), (-0) in (-0); +explain syntax select negate(0.), negate(-0.), - -0., -(-0.), (-0.) 
in (-0.); diff --git a/tests/queries/0_stateless/01576_alias_column_rewrite.reference b/tests/queries/0_stateless/01576_alias_column_rewrite.reference index 334ebc7eb1f..c5679544e1d 100644 --- a/tests/queries/0_stateless/01576_alias_column_rewrite.reference +++ b/tests/queries/0_stateless/01576_alias_column_rewrite.reference @@ -28,47 +28,47 @@ Expression (Projection) PartialSorting (Sort each block for ORDER BY) Expression ((Before ORDER BY + Add table aliases)) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromStorage (MergeTree) + ReadFromMergeTree Expression (Projection) Limit (preliminary LIMIT) FinishSorting Expression ((Before ORDER BY + Add table aliases)) SettingQuotaAndLimits (Set limits and quota after reading from storage) Union - ReadFromStorage (MergeTree with order) - ReadFromStorage (MergeTree with order) - ReadFromStorage (MergeTree with order) + ReadFromMergeTree + ReadFromMergeTree + ReadFromMergeTree Expression (Projection) Limit (preliminary LIMIT) FinishSorting Expression (Before ORDER BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) Union - ReadFromStorage (MergeTree with order) - ReadFromStorage (MergeTree with order) - ReadFromStorage (MergeTree with order) + ReadFromMergeTree + ReadFromMergeTree + ReadFromMergeTree optimize_aggregation_in_order Expression ((Projection + Before ORDER BY)) Aggregating Expression ((Before GROUP BY + Add table aliases)) SettingQuotaAndLimits (Set limits and quota after reading from storage) - ReadFromStorage (MergeTree) + ReadFromMergeTree Expression ((Projection + Before ORDER BY)) Aggregating Expression ((Before GROUP BY + Add table aliases)) SettingQuotaAndLimits (Set limits and quota after reading from storage) Union - ReadFromStorage (MergeTree with order) - ReadFromStorage (MergeTree with order) - ReadFromStorage (MergeTree with order) + ReadFromMergeTree + ReadFromMergeTree + ReadFromMergeTree Expression ((Projection + Before ORDER BY)) Aggregating Expression (Before GROUP BY) SettingQuotaAndLimits (Set limits and quota after reading from storage) Union - ReadFromStorage (MergeTree with order) - ReadFromStorage (MergeTree with order) - ReadFromStorage (MergeTree with order) + ReadFromMergeTree + ReadFromMergeTree + ReadFromMergeTree second-index 1 1 diff --git a/tests/queries/0_stateless/01602_max_distributed_connections.reference b/tests/queries/0_stateless/01602_max_distributed_connections.reference index e69de29bb2d..7326d960397 100644 --- a/tests/queries/0_stateless/01602_max_distributed_connections.reference +++ b/tests/queries/0_stateless/01602_max_distributed_connections.reference @@ -0,0 +1 @@ +Ok diff --git a/tests/queries/0_stateless/01602_max_distributed_connections.sh b/tests/queries/0_stateless/01602_max_distributed_connections.sh index 93c6071c091..772acb39344 100755 --- a/tests/queries/0_stateless/01602_max_distributed_connections.sh +++ b/tests/queries/0_stateless/01602_max_distributed_connections.sh @@ -4,13 +4,31 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -common_opts=( - "--format=Null" +# We check that even if max_threads is small, the setting max_distributed_connections +# will allow to process queries on multiple shards concurrently. - "--max_threads=1" - "--max_distributed_connections=3" -) +# We do sleep 1.5 seconds on ten machines. +# If concurrency is one (bad) the query will take at least 15 seconds and the following loops are guaranteed to be infinite. 
+# If concurrency is 10 (good), the query may take less than 10 second with non-zero probability +# and the following loops will finish with probability 1 assuming independent random variables. -# NOTE: the test use higher timeout to avoid flakiness. -timeout 9s ${CLICKHOUSE_CLIENT} "$@" "${common_opts[@]}" -q "select sleep(3) from remote('127.{1,2,3,4,5}', system.one)" --prefer_localhost_replica=0 -timeout 9s ${CLICKHOUSE_CLIENT} "$@" "${common_opts[@]}" -q "select sleep(3) from remote('127.{1,2,3,4,5}', system.one)" --prefer_localhost_replica=1 +while true; do + timeout 10 ${CLICKHOUSE_CLIENT} --max_threads 1 --max_distributed_connections 10 --query " + SELECT sleep(1.5) FROM remote('127.{1..10}', system.one) FORMAT Null" --prefer_localhost_replica=0 && break +done + +while true; do + timeout 10 ${CLICKHOUSE_CLIENT} --max_threads 1 --max_distributed_connections 10 --query " + SELECT sleep(1.5) FROM remote('127.{1..10}', system.one) FORMAT Null" --prefer_localhost_replica=1 && break +done + +# If max_distributed_connections is low and async_socket_for_remote is disabled, +# the concurrency of distributed queries will be also low. + +timeout 1 ${CLICKHOUSE_CLIENT} --max_threads 1 --max_distributed_connections 1 --async_socket_for_remote 0 --query " + SELECT sleep(0.15) FROM remote('127.{1..10}', system.one) FORMAT Null" --prefer_localhost_replica=0 && echo 'Fail' + +timeout 1 ${CLICKHOUSE_CLIENT} --max_threads 1 --max_distributed_connections 1 --async_socket_for_remote 0 --query " + SELECT sleep(0.15) FROM remote('127.{1..10}', system.one) FORMAT Null" --prefer_localhost_replica=1 && echo 'Fail' + +echo 'Ok' diff --git a/tests/queries/0_stateless/01666_merge_tree_max_query_limit.sh b/tests/queries/0_stateless/01666_merge_tree_max_query_limit.sh index 5bb93371483..c5fbb35a9cd 100755 --- a/tests/queries/0_stateless/01666_merge_tree_max_query_limit.sh +++ b/tests/queries/0_stateless/01666_merge_tree_max_query_limit.sh @@ -15,13 +15,13 @@ drop table if exists simple; create table simple (i int, j int) engine = MergeTree order by i settings index_granularity = 1, max_concurrent_queries = 1, min_marks_to_honor_max_concurrent_queries = 2; -insert into simple select number, number + 100 from numbers(1000); +insert into simple select number, number + 100 from numbers(5000); " query_id="long_running_query-$CLICKHOUSE_DATABASE" echo "Spin up a long running query" -${CLICKHOUSE_CLIENT} --query "select sleepEachRow(0.01) from simple settings max_block_size = 1 format Null" --query_id "$query_id" > /dev/null 2>&1 & +${CLICKHOUSE_CLIENT} --query "select sleepEachRow(0.1) from simple settings max_block_size = 1 format Null" --query_id "$query_id" > /dev/null 2>&1 & wait_for_query_to_start "$query_id" # query which reads marks >= min_marks_to_honor_max_concurrent_queries is throttled diff --git a/tests/queries/0_stateless/01744_fuse_sum_count_aggregate.reference b/tests/queries/0_stateless/01744_fuse_sum_count_aggregate.reference new file mode 100644 index 00000000000..70c19fc8ced --- /dev/null +++ b/tests/queries/0_stateless/01744_fuse_sum_count_aggregate.reference @@ -0,0 +1,12 @@ +210 230 20 +SELECT + sum(a), + sumCount(b).1, + sumCount(b).2 +FROM fuse_tbl +---------NOT trigger fuse-------- +210 11.5 +SELECT + sum(a), + avg(b) +FROM fuse_tbl diff --git a/tests/queries/0_stateless/01744_fuse_sum_count_aggregate.sql b/tests/queries/0_stateless/01744_fuse_sum_count_aggregate.sql new file mode 100644 index 00000000000..cad7b5803d4 --- /dev/null +++ 
b/tests/queries/0_stateless/01744_fuse_sum_count_aggregate.sql @@ -0,0 +1,11 @@ +DROP TABLE IF EXISTS fuse_tbl; +CREATE TABLE fuse_tbl(a Int8, b Int8) Engine = Log; +INSERT INTO fuse_tbl SELECT number, number + 1 FROM numbers(1, 20); + +SET optimize_fuse_sum_count_avg = 1; +SELECT sum(a), sum(b), count(b) from fuse_tbl; +EXPLAIN SYNTAX SELECT sum(a), sum(b), count(b) from fuse_tbl; +SELECT '---------NOT trigger fuse--------'; +SELECT sum(a), avg(b) from fuse_tbl; +EXPLAIN SYNTAX SELECT sum(a), avg(b) from fuse_tbl; +DROP TABLE fuse_tbl; diff --git a/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.reference b/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.reference index 494e9ca3237..a1bfcf043da 100644 --- a/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.reference +++ b/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.reference @@ -17,6 +17,9 @@ others 0 0 0 +different types -- prohibited +different types -- conversion +0 optimize_skip_unused_shards_limit 0 0 diff --git a/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql b/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql index 59e2ad75fcc..dc481ccca72 100644 --- a/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql +++ b/tests/queries/0_stateless/01756_optimize_skip_unused_shards_rewrite_in.sql @@ -5,6 +5,7 @@ drop table if exists dist_01756; drop table if exists dist_01756_str; +drop table if exists dist_01756_column; drop table if exists data_01756_str; -- SELECT @@ -90,8 +91,10 @@ select * from dist_01756 where dummy in (0); -- { serverError 507 } -- optimize_skip_unused_shards does not support non-constants select * from dist_01756 where dummy in (select * from system.one); -- { serverError 507 } select * from dist_01756 where dummy in (toUInt8(0)); -- { serverError 507 } --- wrong type +-- wrong type (tuple) select * from dist_01756 where dummy in ('0'); -- { serverError 507 } +-- intHash64 does not accept string +select * from dist_01756 where dummy in ('0', '2'); -- { serverError 43 } -- NOT IN does not supported select * from dist_01756 where dummy not in (0, 2); -- { serverError 507 } @@ -110,6 +113,7 @@ select (2 IN (2,)), * from dist_01756 where dummy in (0, 2) format Null; select (dummy IN (toUInt8(2),)), * from dist_01756 where dummy in (0, 2) format Null; -- different type +select 'different types -- prohibited'; create table data_01756_str (key String) engine=Memory(); create table dist_01756_str as data_01756_str engine=Distributed(test_cluster_two_shards, currentDatabase(), data_01756_str, cityHash64(key)); select * from dist_01756_str where key in ('0', '2'); @@ -117,6 +121,12 @@ select * from dist_01756_str where key in ('0', Null); -- { serverError 507 } select * from dist_01756_str where key in (0, 2); -- { serverError 53 } select * from dist_01756_str where key in (0, Null); -- { serverError 53 } +-- different type #2 +select 'different types -- conversion'; +create table dist_01756_column as system.one engine=Distributed(test_cluster_two_shards, system, one, dummy); +select * from dist_01756_column where dummy in (0, '255'); +select * from dist_01756_column where dummy in (0, '255foo'); -- { serverError 53 } + -- optimize_skip_unused_shards_limit select 'optimize_skip_unused_shards_limit'; select * from dist_01756 where dummy in (0, 2) settings optimize_skip_unused_shards_limit=1; -- { serverError 507 } @@ -124,4 +134,5 @@ select * from dist_01756 where 
dummy in (0, 2) settings optimize_skip_unused_sha drop table dist_01756; drop table dist_01756_str; +drop table dist_01756_column; drop table data_01756_str; diff --git a/tests/queries/0_stateless/01783_merge_engine_join_key_condition.sql b/tests/queries/0_stateless/01783_merge_engine_join_key_condition.sql index 115ee42fe11..372c1bd3572 100644 --- a/tests/queries/0_stateless/01783_merge_engine_join_key_condition.sql +++ b/tests/queries/0_stateless/01783_merge_engine_join_key_condition.sql @@ -10,7 +10,7 @@ CREATE TABLE foo_merge as foo ENGINE=Merge(currentDatabase(), '^foo'); CREATE TABLE t2 (Id Int32, Val Int32, X Int32) Engine=Memory; INSERT INTO t2 values (4, 3, 4); -SET force_primary_key = 1; +SET force_primary_key = 1, force_index_by_date=1; SELECT * FROM foo_merge WHERE Val = 3 AND Id = 3; SELECT count(), X FROM foo_merge JOIN t2 USING Val WHERE Val = 3 AND Id = 3 AND t2.X == 4 GROUP BY X; diff --git a/tests/queries/0_stateless/01786_explain_merge_tree.reference b/tests/queries/0_stateless/01786_explain_merge_tree.reference new file mode 100644 index 00000000000..51eb52688a3 --- /dev/null +++ b/tests/queries/0_stateless/01786_explain_merge_tree.reference @@ -0,0 +1,51 @@ + ReadFromMergeTree + Indexes: + MinMax + Keys: + y + Condition: (y in [1, +inf)) + Parts: 4/5 + Granules: 11/12 + Partition + Keys: + y + bitAnd(z, 3) + Condition: and((bitAnd(z, 3) not in [1, 1]), and((y in [1, +inf)), (bitAnd(z, 3) not in [1, 1]))) + Parts: 3/4 + Granules: 10/11 + PrimaryKey + Keys: + x + y + Condition: and((x in [11, +inf)), (y in [1, +inf))) + Parts: 2/3 + Granules: 6/10 + Skip + Name: t_minmax + Description: minmax GRANULARITY 2 + Parts: 1/2 + Granules: 2/6 + Skip + Name: t_set + Description: set GRANULARITY 2 + Parts: 1/1 + Granules: 1/2 +----------------- + ReadFromMergeTree + ReadType: InOrder + Parts: 1 + Granules: 3 +----------------- + ReadFromMergeTree + ReadType: InReverseOrder + Parts: 1 + Granules: 3 + ReadFromMergeTree + Indexes: + PrimaryKey + Keys: + x + plus(x, y) + Condition: or((x in 2-element set), (plus(plus(x, y), 1) in (-inf, 2])) + Parts: 1/1 + Granules: 1/1 diff --git a/tests/queries/0_stateless/01786_explain_merge_tree.sh b/tests/queries/0_stateless/01786_explain_merge_tree.sh new file mode 100755 index 00000000000..2791d0c6921 --- /dev/null +++ b/tests/queries/0_stateless/01786_explain_merge_tree.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +$CLICKHOUSE_CLIENT -q "drop table if exists test_index" +$CLICKHOUSE_CLIENT -q "drop table if exists idx" + +$CLICKHOUSE_CLIENT -q "create table test_index (x UInt32, y UInt32, z UInt32, t UInt32, index t_minmax t % 20 TYPE minmax GRANULARITY 2, index t_set t % 19 type set(4) granularity 2) engine = MergeTree order by (x, y) partition by (y, bitAnd(z, 3), intDiv(t, 15)) settings index_granularity = 2, min_bytes_for_wide_part = 0" +$CLICKHOUSE_CLIENT -q "insert into test_index select number, number > 3 ? 3 : number, number = 1 ? 
1 : 0, number from numbers(20)" + +$CLICKHOUSE_CLIENT -q " + explain indexes = 1 select *, _part from test_index where t % 19 = 16 and y > 0 and bitAnd(z, 3) != 1 and x > 10 and t % 20 > 14; + " | grep -A 100 "ReadFromMergeTree" # | grep -v "Description" + +echo "-----------------" + +$CLICKHOUSE_CLIENT -q " + explain actions = 1 select x from test_index where x > 15 order by x; + " | grep -A 100 "ReadFromMergeTree" + +echo "-----------------" + +$CLICKHOUSE_CLIENT -q " + explain actions = 1 select x from test_index where x > 15 order by x desc; + " | grep -A 100 "ReadFromMergeTree" + +$CLICKHOUSE_CLIENT -q "CREATE TABLE idx (x UInt32, y UInt32, z UInt32) ENGINE = MergeTree ORDER BY (x, x + y) settings min_bytes_for_wide_part = 0" +$CLICKHOUSE_CLIENT -q "insert into idx select number, number, number from numbers(10)" + +$CLICKHOUSE_CLIENT -q " + explain indexes = 1 select z from idx where not(x + y + 1 > 2 and x not in (4, 5)) + " | grep -A 100 "ReadFromMergeTree" + +$CLICKHOUSE_CLIENT -q "drop table if exists test_index" +$CLICKHOUSE_CLIENT -q "drop table if exists idx" diff --git a/tests/queries/0_stateless/01788_update_nested_type_subcolumn_check.reference b/tests/queries/0_stateless/01788_update_nested_type_subcolumn_check.reference new file mode 100644 index 00000000000..c6f75cab8b7 --- /dev/null +++ b/tests/queries/0_stateless/01788_update_nested_type_subcolumn_check.reference @@ -0,0 +1,21 @@ +1 [100,200] ['aa','bb'] [1,2] +0 [0,1] ['aa','bb'] [0,0] +1 [100,200] ['aa','bb'] [1,2] +2 [100,200,300] ['a','b','c'] [10,20,30] +3 [3,4] ['aa','bb'] [3,6] +4 [4,5] ['aa','bb'] [4,8] +0 [0,1] ['aa','bb'] [0,0] +1 [100,200] ['aa','bb'] [1,2] +2 [100,200,300] ['a','b','c'] [100,200,300] +3 [3,4] ['aa','bb'] [3,6] +4 [4,5] ['aa','bb'] [4,8] +0 [0,1] ['aa','bb'] [0,0] +1 [100,200] ['aa','bb'] [1,2] +2 [100,200,300] ['a','b','c'] [100,200,300] +3 [68,72] ['aa','bb'] [68,72] +4 [4,5] ['aa','bb'] [4,8] +0 0 aa 0 +1 1 bb 2 +2 2 aa 4 +3 3 aa 6 +4 4 aa 8 diff --git a/tests/queries/0_stateless/01788_update_nested_type_subcolumn_check.sql b/tests/queries/0_stateless/01788_update_nested_type_subcolumn_check.sql new file mode 100644 index 00000000000..8e850b70c24 --- /dev/null +++ b/tests/queries/0_stateless/01788_update_nested_type_subcolumn_check.sql @@ -0,0 +1,70 @@ +DROP TABLE IF EXISTS test_wide_nested; + +CREATE TABLE test_wide_nested +( + `id` Int, + `info.id` Array(Int), + `info.name` Array(String), + `info.age` Array(Int) +) +ENGINE = MergeTree +ORDER BY tuple() +SETTINGS min_bytes_for_wide_part = 0; + +set mutations_sync = 1; + +INSERT INTO test_wide_nested SELECT number, [number,number + 1], ['aa','bb'], [number,number * 2] FROM numbers(5); + +alter table test_wide_nested update `info.id` = [100,200] where id = 1; +select * from test_wide_nested where id = 1 order by id; + +alter table test_wide_nested update `info.id` = [100,200,300], `info.age` = [10,20,30], `info.name` = ['a','b','c'] where id = 2; +select * from test_wide_nested; + +alter table test_wide_nested update `info.id` = [100,200,300], `info.age` = `info.id`, `info.name` = ['a','b','c'] where id = 2; +select * from test_wide_nested; + +alter table test_wide_nested update `info.id` = [100,200], `info.age`=[68,72] where id = 3; +alter table test_wide_nested update `info.id` = `info.age` where id = 3; +select * from test_wide_nested; + +alter table test_wide_nested update `info.id` = [100,200], `info.age` = [10,20,30], `info.name` = ['a','b','c'] where id = 0; -- { serverError 341 } + +-- Recreate table, because KILL MUTATION is not 
suitable for parallel tests execution. +DROP TABLE test_wide_nested; + +CREATE TABLE test_wide_nested +( + `id` Int, + `info.id` Array(Int), + `info.name` Array(String), + `info.age` Array(Int) +) +ENGINE = MergeTree +ORDER BY tuple() +SETTINGS min_bytes_for_wide_part = 0; + +INSERT INTO test_wide_nested SELECT number, [number,number + 1], ['aa','bb'], [number,number * 2] FROM numbers(5); + +alter table test_wide_nested update `info.id` = [100,200,300], `info.age` = [10,20,30] where id = 1; -- { serverError 341 } + +DROP TABLE test_wide_nested; + +DROP TABLE IF EXISTS test_wide_not_nested; + +CREATE TABLE test_wide_not_nested +( + `id` Int, + `info.id` Int, + `info.name` String, + `info.age` Int +) +ENGINE = MergeTree +ORDER BY tuple() +SETTINGS min_bytes_for_wide_part = 0; + +INSERT INTO test_wide_not_nested SELECT number, number, 'aa', number * 2 FROM numbers(5); +ALTER TABLE test_wide_not_nested UPDATE `info.name` = 'bb' WHERE id = 1; +SELECT * FROM test_wide_not_nested ORDER BY id; + +DROP TABLE test_wide_not_nested; diff --git a/tests/queries/0_stateless/01813_distributed_scalar_subqueries_alias.reference b/tests/queries/0_stateless/01813_distributed_scalar_subqueries_alias.reference new file mode 100644 index 00000000000..5565ed6787f --- /dev/null +++ b/tests/queries/0_stateless/01813_distributed_scalar_subqueries_alias.reference @@ -0,0 +1,4 @@ +0 +1 +0 +1 diff --git a/tests/queries/0_stateless/01813_distributed_scalar_subqueries_alias.sql b/tests/queries/0_stateless/01813_distributed_scalar_subqueries_alias.sql new file mode 100644 index 00000000000..722bd4af5bb --- /dev/null +++ b/tests/queries/0_stateless/01813_distributed_scalar_subqueries_alias.sql @@ -0,0 +1,18 @@ +DROP TABLE IF EXISTS data; +CREATE TABLE data (a Int64, b Int64) ENGINE = TinyLog(); + +DROP TABLE IF EXISTS data_distributed; +CREATE TABLE data_distributed (a Int64, b Int64) ENGINE = Distributed(test_shard_localhost, currentDatabase(), 'data'); + +INSERT INTO data VALUES (0, 0); + +SET prefer_localhost_replica = 1; +SELECT a / (SELECT sum(number) FROM numbers(10)) FROM data_distributed; +SELECT a < (SELECT 1) FROM data_distributed; + +SET prefer_localhost_replica = 0; +SELECT a / (SELECT sum(number) FROM numbers(10)) FROM data_distributed; +SELECT a < (SELECT 1) FROM data_distributed; + +DROP TABLE data_distributed; +DROP TABLE data; diff --git a/tests/queries/0_stateless/01818_input_format_with_names_use_header.reference b/tests/queries/0_stateless/01818_input_format_with_names_use_header.reference new file mode 100644 index 00000000000..b7b577c4685 --- /dev/null +++ b/tests/queries/0_stateless/01818_input_format_with_names_use_header.reference @@ -0,0 +1,2 @@ +testdata1 +testdata2 diff --git a/tests/queries/0_stateless/01818_input_format_with_names_use_header.sh b/tests/queries/0_stateless/01818_input_format_with_names_use_header.sh new file mode 100755 index 00000000000..953c39a40a2 --- /dev/null +++ b/tests/queries/0_stateless/01818_input_format_with_names_use_header.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS \`01818_with_names\`;" + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE \`01818_with_names\` (t String) ENGINE = MergeTree ORDER BY t;" + +echo -ne "t\ntestdata1\ntestdata2" | ${CLICKHOUSE_CLIENT} --input_format_with_names_use_header 0 --query "INSERT INTO \`01818_with_names\` FORMAT CSVWithNames" + +${CLICKHOUSE_CLIENT} -q "SELECT * FROM \`01818_with_names\`;" + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS \`01818_with_names\`;" diff --git a/programs/server/data/default/.gitignore b/tests/queries/0_stateless/01821_to_date_time_ubsan.reference similarity index 100% rename from programs/server/data/default/.gitignore rename to tests/queries/0_stateless/01821_to_date_time_ubsan.reference diff --git a/tests/queries/0_stateless/01821_to_date_time_ubsan.sql b/tests/queries/0_stateless/01821_to_date_time_ubsan.sql new file mode 100644 index 00000000000..74226fc221f --- /dev/null +++ b/tests/queries/0_stateless/01821_to_date_time_ubsan.sql @@ -0,0 +1,2 @@ +SELECT toDateTime('9223372036854775806', 7); -- { serverError 407 } +SELECT toDateTime('9223372036854775806', 8); -- { serverError 407 } diff --git a/programs/server/metadata/default/.gitignore b/tests/queries/0_stateless/01822_async_read_from_socket_crash.reference similarity index 100% rename from programs/server/metadata/default/.gitignore rename to tests/queries/0_stateless/01822_async_read_from_socket_crash.reference diff --git a/tests/queries/0_stateless/01822_async_read_from_socket_crash.sh b/tests/queries/0_stateless/01822_async_read_from_socket_crash.sh new file mode 100755 index 00000000000..b4bb2228a91 --- /dev/null +++ b/tests/queries/0_stateless/01822_async_read_from_socket_crash.sh @@ -0,0 +1,9 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + + +for _ in {1..10}; do $CLICKHOUSE_CLIENT -q "select number from remote('127.0.0.{2,3}', numbers(20)) limit 8 settings max_block_size = 2, unknown_packet_in_send_data=4, sleep_in_send_data_ms=100, async_socket_for_remote=1 format Null" > /dev/null 2>&1 || true; done diff --git a/tests/queries/0_stateless/01822_union_and_constans_error.reference b/tests/queries/0_stateless/01822_union_and_constans_error.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/01822_union_and_constans_error.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/01822_union_and_constans_error.sql b/tests/queries/0_stateless/01822_union_and_constans_error.sql new file mode 100644 index 00000000000..38b7df700cd --- /dev/null +++ b/tests/queries/0_stateless/01822_union_and_constans_error.sql @@ -0,0 +1,20 @@ +drop table if exists t0; +CREATE TABLE t0 (c0 String) ENGINE = Log(); + +SELECT isNull(t0.c0) OR COUNT('\n?pVa') +FROM t0 +GROUP BY t0.c0 +HAVING isNull(t0.c0) +UNION ALL +SELECT isNull(t0.c0) OR COUNT('\n?pVa') +FROM t0 +GROUP BY t0.c0 +HAVING NOT isNull(t0.c0) +UNION ALL +SELECT isNull(t0.c0) OR COUNT('\n?pVa') +FROM t0 +GROUP BY t0.c0 +HAVING isNull(isNull(t0.c0)) +SETTINGS aggregate_functions_null_for_empty = 1, enable_optimize_predicate_expression = 0; + +drop table if exists t0; diff --git a/tests/queries/0_stateless/01823_array_low_cardinality_KuliginStepan.reference b/tests/queries/0_stateless/01823_array_low_cardinality_KuliginStepan.reference new file mode 100644 index 00000000000..2439021d2e0 --- /dev/null +++ b/tests/queries/0_stateless/01823_array_low_cardinality_KuliginStepan.reference @@ -0,0 +1 @@ +[['a'],['b','c']] diff --git a/tests/queries/0_stateless/01823_array_low_cardinality_KuliginStepan.sql b/tests/queries/0_stateless/01823_array_low_cardinality_KuliginStepan.sql new file mode 100644 index 00000000000..528a3b464b3 --- /dev/null +++ b/tests/queries/0_stateless/01823_array_low_cardinality_KuliginStepan.sql @@ -0,0 +1,7 @@ +create temporary table test ( + arr Array(Array(LowCardinality(String))) +); + +insert into test(arr) values ([['a'], ['b', 'c']]); + +select arrayFilter(x -> 1, arr) from test; diff --git a/tests/queries/0_stateless/01831_max_streams.reference b/tests/queries/0_stateless/01831_max_streams.reference new file mode 100644 index 00000000000..573541ac970 --- /dev/null +++ b/tests/queries/0_stateless/01831_max_streams.reference @@ -0,0 +1 @@ +0 diff --git a/tests/queries/0_stateless/01831_max_streams.sql b/tests/queries/0_stateless/01831_max_streams.sql new file mode 100644 index 00000000000..aa835dea5ac --- /dev/null +++ b/tests/queries/0_stateless/01831_max_streams.sql @@ -0,0 +1 @@ +select * from remote('127.1', system.one) settings max_distributed_connections=0; diff --git a/tests/queries/0_stateless/01833_test_collation_alvarotuso.reference b/tests/queries/0_stateless/01833_test_collation_alvarotuso.reference new file mode 100644 index 00000000000..c55134e07d3 --- /dev/null +++ b/tests/queries/0_stateless/01833_test_collation_alvarotuso.reference @@ -0,0 +1,6 @@ +a a +A A +b b +B B +c c +C C diff --git a/tests/queries/0_stateless/01833_test_collation_alvarotuso.sql b/tests/queries/0_stateless/01833_test_collation_alvarotuso.sql new file mode 100644 index 00000000000..65422731711 --- /dev/null +++ b/tests/queries/0_stateless/01833_test_collation_alvarotuso.sql @@ -0,0 +1,21 @@ +DROP TABLE IF EXISTS test_collation; + +CREATE TABLE test_collation +( + `v` String, + `v2` 
String +) +ENGINE = MergeTree +ORDER BY v +SETTINGS index_granularity = 8192; + +insert into test_collation values ('A', 'A'); +insert into test_collation values ('B', 'B'); +insert into test_collation values ('C', 'C'); +insert into test_collation values ('a', 'a'); +insert into test_collation values ('b', 'b'); +insert into test_collation values ('c', 'c'); + +SELECT * FROM test_collation ORDER BY v ASC COLLATE 'en'; + +DROP TABLE test_collation; diff --git a/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.reference b/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.reference new file mode 100644 index 00000000000..7326d960397 --- /dev/null +++ b/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.reference @@ -0,0 +1 @@ +Ok diff --git a/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.sh b/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.sh new file mode 100755 index 00000000000..793f477b3cb --- /dev/null +++ b/tests/queries/0_stateless/01834_alias_columns_laziness_filimonov.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --multiquery --query " +drop table if exists aliases_lazyness; +create table aliases_lazyness (x UInt32, y ALIAS sleepEachRow(0.1)) Engine=MergeTree ORDER BY x; +insert into aliases_lazyness(x) select * from numbers(40); +" + +# In very old ClickHouse versions the alias column was calculated for every row. +# If it still worked that way, the query would take at least 0.1 * 40 = 4 seconds. +# If the issue does not exist, the query should take slightly more than 0.1 seconds. +# The exact time is not guaranteed, so we check in a loop that at least once +# the query completes in less than one second, which proves that the old behaviour is gone.
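
A short editorial sketch of the same laziness check outside the test harness, assuming the aliases_lazyness table created above; the timings are indicative only and this snippet is not part of the patch:

    # Reading only x never touches the sleepEachRow() alias; selecting y for a single
    # row should evaluate it once (about 0.1 s), not once per row (about 4 s for 40 rows).
    clickhouse-client --query "SELECT x FROM aliases_lazyness FORMAT Null"
    time clickhouse-client --query "SELECT x, y FROM aliases_lazyness WHERE x = 1 FORMAT Null"
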
+ +while true +do + timeout 1 ${CLICKHOUSE_CLIENT} --query "SELECT x, y FROM aliases_lazyness WHERE x = 1 FORMAT Null" && break +done + +${CLICKHOUSE_CLIENT} --multiquery --query " +drop table aliases_lazyness; +SELECT 'Ok'; +" diff --git a/tests/queries/0_stateless/01835_alias_to_primary_key_cyfdecyf.reference b/tests/queries/0_stateless/01835_alias_to_primary_key_cyfdecyf.reference new file mode 100644 index 00000000000..1f49e6b362b --- /dev/null +++ b/tests/queries/0_stateless/01835_alias_to_primary_key_cyfdecyf.reference @@ -0,0 +1,2 @@ +2017-12-15 1 1 +2017-12-15 1 1 diff --git a/tests/queries/0_stateless/01835_alias_to_primary_key_cyfdecyf.sql b/tests/queries/0_stateless/01835_alias_to_primary_key_cyfdecyf.sql new file mode 100644 index 00000000000..54ffb7b4c1f --- /dev/null +++ b/tests/queries/0_stateless/01835_alias_to_primary_key_cyfdecyf.sql @@ -0,0 +1,21 @@ +DROP TABLE IF EXISTS db; + +CREATE TABLE tb +( + date Date, + `index` Int32, + value Int32, + idx Int32 ALIAS `index` +) +ENGINE = MergeTree +PARTITION BY date +ORDER BY (date, `index`); + +insert into tb values ('2017-12-15', 1, 1); + +SET force_primary_key = 1; + +select * from tb where `index` >= 0 AND `index` <= 2; +select * from tb where idx >= 0 AND idx <= 2; + +DROP TABLE tb; diff --git a/tests/queries/0_stateless/01836_date_time_keep_default_timezone_on_operations_den_crane.reference b/tests/queries/0_stateless/01836_date_time_keep_default_timezone_on_operations_den_crane.reference new file mode 100644 index 00000000000..fc624e3510f --- /dev/null +++ b/tests/queries/0_stateless/01836_date_time_keep_default_timezone_on_operations_den_crane.reference @@ -0,0 +1,6 @@ +DateTime +DateTime +DateTime(\'UTC\') +DateTime64(3) +DateTime64(3) +DateTime64(3, \'UTC\') diff --git a/tests/queries/0_stateless/01836_date_time_keep_default_timezone_on_operations_den_crane.sql b/tests/queries/0_stateless/01836_date_time_keep_default_timezone_on_operations_den_crane.sql new file mode 100644 index 00000000000..be47cfb0411 --- /dev/null +++ b/tests/queries/0_stateless/01836_date_time_keep_default_timezone_on_operations_den_crane.sql @@ -0,0 +1,26 @@ +SELECT toTypeName(now()); +SELECT toTypeName(now() - 1); +SELECT toTypeName(now('UTC') - 1); + +SELECT toTypeName(now64(3)); +SELECT toTypeName(now64(3) - 1); +SELECT toTypeName(toTimeZone(now64(3), 'UTC') - 1); + +DROP TABLE IF EXISTS tt_null; +DROP TABLE IF EXISTS tt; +DROP TABLE IF EXISTS tt_mv; + +create table tt_null(p String) engine = Null; + +create table tt(p String,tmin AggregateFunction(min, DateTime)) +engine = AggregatingMergeTree order by p; + +create materialized view tt_mv to tt as +select p, minState(now() - interval 30 minute) as tmin +from tt_null group by p; + +insert into tt_null values('x'); + +DROP TABLE tt_null; +DROP TABLE tt; +DROP TABLE tt_mv; diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt b/tests/queries/0_stateless/arcadia_skip_list.txt index f435c00a989..f7068c16edd 100644 --- a/tests/queries/0_stateless/arcadia_skip_list.txt +++ b/tests/queries/0_stateless/arcadia_skip_list.txt @@ -231,3 +231,4 @@ 01801_distinct_group_by_shard 01804_dictionary_decimal256_type 01801_s3_distributed +01833_test_collation_alvarotuso diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 128a881f414..08a66c7499d 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -392,6 +392,8 @@ "01475_read_subcolumns_storages", "01674_clickhouse_client_query_param_cte", "01666_merge_tree_max_query_limit", + "01786_explain_merge_tree", 
+ "01666_merge_tree_max_query_limit", "01802_test_postgresql_protocol_with_row_policy" /// It cannot parse DROP ROW POLICY ], "parallel": diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index 04d9a07008e..5b98e28c0c8 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -32,6 +32,7 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS) add_subdirectory (db-generator) add_subdirectory (wal-dump) add_subdirectory (check-mysql-binlog) + add_subdirectory (keeper-bench) if (USE_NURAFT) add_subdirectory (keeper-data-dumper) diff --git a/utils/keeper-bench/CMakeLists.txt b/utils/keeper-bench/CMakeLists.txt new file mode 100644 index 00000000000..2f12194d1b7 --- /dev/null +++ b/utils/keeper-bench/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(keeper-bench Generator.cpp Runner.cpp Stats.cpp main.cpp) +target_link_libraries(keeper-bench PRIVATE clickhouse_common_zookeeper) diff --git a/utils/keeper-bench/Generator.cpp b/utils/keeper-bench/Generator.cpp new file mode 100644 index 00000000000..852de07f2e1 --- /dev/null +++ b/utils/keeper-bench/Generator.cpp @@ -0,0 +1,238 @@ +#include "Generator.h" +#include +#include + +using namespace Coordination; +using namespace zkutil; + +namespace DB +{ +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} +} + +namespace +{ +std::string generateRandomString(size_t length) +{ + if (length == 0) + return ""; + + static const auto & chars = "0123456789" + "abcdefghijklmnopqrstuvwxyz" + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + + static pcg64 rng(randomSeed()); + static std::uniform_int_distribution pick(0, sizeof(chars) - 2); + + std::string s; + + s.reserve(length); + + while (length--) + s += chars[pick(rng)]; + + return s; +} +} + +std::string generateRandomPath(const std::string & prefix, size_t length) +{ + return std::filesystem::path(prefix) / generateRandomString(length); +} + +std::string generateRandomData(size_t size) +{ + return generateRandomString(size); +} + +void CreateRequestGenerator::startup(Coordination::ZooKeeper & zookeeper) +{ + auto promise = std::make_shared>(); + auto future = promise->get_future(); + auto create_callback = [promise] (const CreateResponse & response) + { + if (response.error != Coordination::Error::ZOK) + promise->set_exception(std::make_exception_ptr(zkutil::KeeperException(response.error))); + else + promise->set_value(); + }; + zookeeper.create(path_prefix, "", false, false, default_acls, create_callback); + future.get(); +} + +ZooKeeperRequestPtr CreateRequestGenerator::generate() +{ + auto request = std::make_shared(); + request->acls = default_acls; + size_t plength = 5; + if (path_length) + plength = *path_length; + auto path_candidate = generateRandomPath(path_prefix, plength); + + while (paths_created.count(path_candidate)) + path_candidate = generateRandomPath(path_prefix, plength); + + paths_created.insert(path_candidate); + + request->path = path_candidate; + if (data_size) + request->data = generateRandomData(*data_size); + + return request; +} + + +void GetRequestGenerator::startup(Coordination::ZooKeeper & zookeeper) +{ + auto promise = std::make_shared>(); + auto future = promise->get_future(); + auto create_callback = [promise] (const CreateResponse & response) + { + if (response.error != Coordination::Error::ZOK) + promise->set_exception(std::make_exception_ptr(zkutil::KeeperException(response.error))); + else + promise->set_value(); + }; + zookeeper.create(path_prefix, "", false, false, default_acls, create_callback); + future.get(); + size_t total_nodes = 1; + if (num_nodes) + total_nodes = 
*num_nodes; + + for (size_t i = 0; i < total_nodes; ++i) + { + auto path = generateRandomPath(path_prefix, 5); + while (std::find(paths_to_get.begin(), paths_to_get.end(), path) != paths_to_get.end()) + path = generateRandomPath(path_prefix, 5); + + auto create_promise = std::make_shared>(); + auto create_future = create_promise->get_future(); + auto callback = [create_promise] (const CreateResponse & response) + { + if (response.error != Coordination::Error::ZOK) + create_promise->set_exception(std::make_exception_ptr(zkutil::KeeperException(response.error))); + else + create_promise->set_value(); + }; + std::string data; + if (nodes_data_size) + data = generateRandomString(*nodes_data_size); + + zookeeper.create(path, data, false, false, default_acls, callback); + create_future.get(); + paths_to_get.push_back(path); + } +} + +Coordination::ZooKeeperRequestPtr GetRequestGenerator::generate() +{ + auto request = std::make_shared(); + + size_t path_index = distribution(rng); + request->path = paths_to_get[path_index]; + return request; +} + +void ListRequestGenerator::startup(Coordination::ZooKeeper & zookeeper) +{ + auto promise = std::make_shared>(); + auto future = promise->get_future(); + auto create_callback = [promise] (const CreateResponse & response) + { + if (response.error != Coordination::Error::ZOK) + promise->set_exception(std::make_exception_ptr(zkutil::KeeperException(response.error))); + else + promise->set_value(); + }; + zookeeper.create(path_prefix, "", false, false, default_acls, create_callback); + future.get(); + + size_t total_nodes = 1; + if (num_nodes) + total_nodes = *num_nodes; + + size_t path_length = 5; + if (paths_length) + path_length = *paths_length; + + for (size_t i = 0; i < total_nodes; ++i) + { + auto path = generateRandomPath(path_prefix, path_length); + + auto create_promise = std::make_shared>(); + auto create_future = create_promise->get_future(); + auto callback = [create_promise] (const CreateResponse & response) + { + if (response.error != Coordination::Error::ZOK) + create_promise->set_exception(std::make_exception_ptr(zkutil::KeeperException(response.error))); + else + create_promise->set_value(); + }; + zookeeper.create(path, "", false, false, default_acls, callback); + create_future.get(); + } +} + +Coordination::ZooKeeperRequestPtr ListRequestGenerator::generate() +{ + auto request = std::make_shared(); + request->path = path_prefix; + return request; +} + +std::unique_ptr getGenerator(const std::string & name) +{ + if (name == "create_no_data") + { + return std::make_unique(); + } + else if (name == "create_small_data") + { + return std::make_unique("/create_generator", 5, 32); + } + else if (name == "create_medium_data") + { + return std::make_unique("/create_generator", 5, 1024); + } + else if (name == "create_big_data") + { + return std::make_unique("/create_generator", 5, 512 * 1024); + } + else if (name == "get_no_data") + { + return std::make_unique("/get_generator", 10, 0); + } + else if (name == "get_small_data") + { + return std::make_unique("/get_generator", 10, 32); + } + else if (name == "get_medium_data") + { + return std::make_unique("/get_generator", 10, 1024); + } + else if (name == "get_big_data") + { + return std::make_unique("/get_generator", 10, 512 * 1024); + } + else if (name == "list_no_nodes") + { + return std::make_unique("/list_generator", 0, 1); + } + else if (name == "list_few_nodes") + { + return std::make_unique("/list_generator", 10, 5); + } + else if (name == "list_medium_nodes") + { + return 
std::make_unique("/list_generator", 1000, 5); + } + else if (name == "list_a_lot_nodes") + { + return std::make_unique("/list_generator", 100000, 5); + } + + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unknown generator {}", name); +} diff --git a/utils/keeper-bench/Generator.h b/utils/keeper-bench/Generator.h new file mode 100644 index 00000000000..d6cc0eec335 --- /dev/null +++ b/utils/keeper-bench/Generator.h @@ -0,0 +1,107 @@ +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include + + +std::string generateRandomPath(const std::string & prefix, size_t length = 5); + +std::string generateRandomData(size_t size); + +class IGenerator +{ +public: + IGenerator() + { + Coordination::ACL acl; + acl.permissions = Coordination::ACL::All; + acl.scheme = "world"; + acl.id = "anyone"; + default_acls.emplace_back(std::move(acl)); + } + virtual void startup(Coordination::ZooKeeper & /*zookeeper*/) {} + virtual Coordination::ZooKeeperRequestPtr generate() = 0; + + virtual ~IGenerator() = default; + + Coordination::ACLs default_acls; + +}; + +class CreateRequestGenerator final : public IGenerator +{ +public: + explicit CreateRequestGenerator( + std::string path_prefix_ = "/create_generator", + std::optional path_length_ = std::nullopt, + std::optional data_size_ = std::nullopt) + : path_prefix(path_prefix_) + , path_length(path_length_) + , data_size(data_size_) + {} + + void startup(Coordination::ZooKeeper & zookeeper) override; + Coordination::ZooKeeperRequestPtr generate() override; + +private: + std::string path_prefix; + std::optional path_length; + std::optional data_size; + std::unordered_set paths_created; +}; + + +class GetRequestGenerator final : public IGenerator +{ +public: + explicit GetRequestGenerator( + std::string path_prefix_ = "/get_generator", + std::optional num_nodes_ = std::nullopt, + std::optional nodes_data_size_ = std::nullopt) + : path_prefix(path_prefix_) + , num_nodes(num_nodes_) + , nodes_data_size(nodes_data_size_) + , rng(randomSeed()) + , distribution(0, num_nodes ? 
*num_nodes - 1 : 0) + {} + + void startup(Coordination::ZooKeeper & zookeeper) override; + Coordination::ZooKeeperRequestPtr generate() override; + +private: + std::string path_prefix; + std::optional num_nodes; + std::optional nodes_data_size; + std::vector paths_to_get; + + pcg64 rng; + std::uniform_int_distribution distribution; +}; + +class ListRequestGenerator final : public IGenerator +{ +public: + explicit ListRequestGenerator( + std::string path_prefix_ = "/list_generator", + std::optional num_nodes_ = std::nullopt, + std::optional paths_length_ = std::nullopt) + : path_prefix(path_prefix_) + , num_nodes(num_nodes_) + , paths_length(paths_length_) + {} + + void startup(Coordination::ZooKeeper & zookeeper) override; + Coordination::ZooKeeperRequestPtr generate() override; + +private: + std::string path_prefix; + std::optional num_nodes; + std::optional paths_length; +}; + +std::unique_ptr getGenerator(const std::string & name); diff --git a/utils/keeper-bench/Runner.cpp b/utils/keeper-bench/Runner.cpp new file mode 100644 index 00000000000..d3f51fb2356 --- /dev/null +++ b/utils/keeper-bench/Runner.cpp @@ -0,0 +1,188 @@ +#include "Runner.h" + +namespace DB +{ +namespace ErrorCodes +{ + extern const int CANNOT_BLOCK_SIGNAL; +} +} + +void Runner::thread(std::vector> & zookeepers) +{ + Coordination::ZooKeeperRequestPtr request; + /// Randomly choosing connection index + pcg64 rng(randomSeed()); + std::uniform_int_distribution distribution(0, zookeepers.size() - 1); + + /// In these threads we do not accept INT signal. + sigset_t sig_set; + if (sigemptyset(&sig_set) + || sigaddset(&sig_set, SIGINT) + || pthread_sigmask(SIG_BLOCK, &sig_set, nullptr)) + { + DB::throwFromErrno("Cannot block signal.", DB::ErrorCodes::CANNOT_BLOCK_SIGNAL); + } + + while (true) + { + bool extracted = false; + + while (!extracted) + { + extracted = queue.tryPop(request, 100); + + if (shutdown + || (max_iterations && requests_executed >= max_iterations)) + { + return; + } + } + + const auto connection_index = distribution(rng); + auto & zk = zookeepers[connection_index]; + + auto promise = std::make_shared>(); + auto future = promise->get_future(); + Coordination::ResponseCallback callback = [promise](const Coordination::Response & response) + { + if (response.error != Coordination::Error::ZOK) + promise->set_exception(std::make_exception_ptr(zkutil::KeeperException(response.error))); + else + promise->set_value(response.bytesSize()); + }; + + Stopwatch watch; + + zk->executeGenericRequest(request, callback); + + try + { + auto response_size = future.get(); + double seconds = watch.elapsedSeconds(); + + std::lock_guard lock(mutex); + + if (request->isReadRequest()) + info->addRead(seconds, 1, request->bytesSize() + response_size); + else + info->addWrite(seconds, 1, request->bytesSize() + response_size); + } + catch (...) + { + if (!continue_on_error) + { + shutdown = true; + throw; + } + std::cerr << DB::getCurrentExceptionMessage(true, true /*check embedded stack trace*/) << std::endl; + } + + ++requests_executed; + } +} + +bool Runner::tryPushRequestInteractively(const Coordination::ZooKeeperRequestPtr & request, DB::InterruptListener & interrupt_listener) +{ + bool inserted = false; + + while (!inserted) + { + inserted = queue.tryPush(request, 100); + + if (shutdown) + { + /// An exception occurred in a worker + return false; + } + + if (max_time > 0 && total_watch.elapsedSeconds() >= max_time) + { + std::cout << "Stopping launch of queries. 
Requested time limit is exhausted.\n"; + return false; + } + + if (interrupt_listener.check()) + { + std::cout << "Stopping launch of queries. SIGINT received." << std::endl; + return false; + } + + if (delay > 0 && delay_watch.elapsedSeconds() > delay) + { + printNumberOfRequestsExecuted(requests_executed); + + std::lock_guard lock(mutex); + report(info, concurrency); + delay_watch.restart(); + } + } + + return true; +} + + +void Runner::runBenchmark() +{ + auto aux_connections = getConnections(); + + std::cerr << "Preparing to run\n"; + generator->startup(*aux_connections[0]); + std::cerr << "Prepared\n"; + try + { + for (size_t i = 0; i < concurrency; ++i) + { + auto connections = getConnections(); + pool.scheduleOrThrowOnError([this, connections]() mutable { thread(connections); }); + } + } + catch (...) + { + pool.wait(); + throw; + } + + DB::InterruptListener interrupt_listener; + delay_watch.restart(); + + /// Push queries into queue + for (size_t i = 0; !max_iterations || i < max_iterations; ++i) + { + if (!tryPushRequestInteractively(generator->generate(), interrupt_listener)) + { + shutdown = true; + break; + } + } + + pool.wait(); + total_watch.stop(); + + printNumberOfRequestsExecuted(requests_executed); + + std::lock_guard lock(mutex); + report(info, concurrency); +} + + +std::vector> Runner::getConnections() +{ + std::vector> zookeepers; + for (const auto & host_string : hosts_strings) + { + Coordination::ZooKeeper::Node node{Poco::Net::SocketAddress{host_string}, false}; + std::vector nodes; + nodes.push_back(node); + zookeepers.emplace_back(std::make_shared( + nodes, + "", /*chroot*/ + "", /*identity type*/ + "", /*identity*/ + Poco::Timespan(0, 30000 * 1000), + Poco::Timespan(0, 1000 * 1000), + Poco::Timespan(0, 10000 * 1000))); + } + + return zookeepers; +} diff --git a/utils/keeper-bench/Runner.h b/utils/keeper-bench/Runner.h new file mode 100644 index 00000000000..bb83e790214 --- /dev/null +++ b/utils/keeper-bench/Runner.h @@ -0,0 +1,79 @@ +#pragma once +#include +#include "Generator.h" +#include +#include +#include +#include +#include +#include +#include + +#include +#include "Stats.h" + +using Ports = std::vector; +using Strings = std::vector; + +class Runner +{ +public: + Runner( + size_t concurrency_, + const std::string & generator_name, + const Strings & hosts_strings_, + double max_time_, + double delay_, + bool continue_on_error_, + size_t max_iterations_) + : concurrency(concurrency_) + , pool(concurrency) + , hosts_strings(hosts_strings_) + , generator(getGenerator(generator_name)) + , max_time(max_time_) + , delay(delay_) + , continue_on_error(continue_on_error_) + , max_iterations(max_iterations_) + , info(std::make_shared()) + , queue(concurrency) + { + } + + void thread(std::vector> & zookeepers); + + void printNumberOfRequestsExecuted(size_t num) + { + std::cerr << "Requests executed: " << num << ".\n"; + } + + bool tryPushRequestInteractively(const Coordination::ZooKeeperRequestPtr & request, DB::InterruptListener & interrupt_listener); + + void runBenchmark(); + + +private: + + size_t concurrency = 1; + + ThreadPool pool; + Strings hosts_strings; + std::unique_ptr generator; + double max_time = 0; + double delay = 1; + bool continue_on_error = false; + std::atomic max_iterations = 0; + std::atomic requests_executed = 0; + std::atomic shutdown = false; + + std::shared_ptr info; + + Stopwatch total_watch; + Stopwatch delay_watch; + + std::mutex mutex; + + using Queue = ConcurrentBoundedQueue; + Queue queue; + + std::vector> getConnections(); +}; diff 
--git a/utils/keeper-bench/Stats.cpp b/utils/keeper-bench/Stats.cpp new file mode 100644 index 00000000000..1f8b02ed09d --- /dev/null +++ b/utils/keeper-bench/Stats.cpp @@ -0,0 +1,67 @@ +#include "Stats.h" +#include + +void report(std::shared_ptr & info, size_t concurrency) +{ + std::cerr << "\n"; + + /// Avoid zeros, nans or exceptions + if (0 == info->read_requests && 0 == info->write_requests) + return; + + double read_seconds = info->read_work_time / concurrency; + double write_seconds = info->write_work_time / concurrency; + + std::cerr << "read requests " << info->read_requests << ", write requests " << info->write_requests << ", "; + if (info->errors) + { + std::cerr << "errors " << info->errors << ", "; + } + if (0 != info->read_requests) + { + std::cerr + << "Read RPS: " << (info->read_requests / read_seconds) << ", " + << "Read MiB/s: " << (info->requests_read_bytes / read_seconds / 1048576); + if (0 != info->write_requests) + std::cerr << ", "; + } + if (0 != info->write_requests) + { + std::cerr + << "Write RPS: " << (info->write_requests / write_seconds) << ", " + << "Write MiB/s: " << (info->requests_write_bytes / write_seconds / 1048576) << ". " + << "\n"; + } + std::cerr << "\n"; + + auto print_percentile = [&](double percent, Stats::Sampler & sampler) + { + std::cerr << percent << "%\t\t"; + std::cerr << sampler.quantileNearest(percent / 100.0) << " sec.\t"; + std::cerr << "\n"; + }; + + if (0 != info->read_requests) + { + std::cerr << "Read sampler:\n"; + for (int percent = 0; percent <= 90; percent += 10) + print_percentile(percent, info->read_sampler); + + print_percentile(95, info->read_sampler); + print_percentile(99, info->read_sampler); + print_percentile(99.9, info->read_sampler); + print_percentile(99.99, info->read_sampler); + } + + if (0 != info->write_requests) + { + std::cerr << "Write sampler:\n"; + for (int percent = 0; percent <= 90; percent += 10) + print_percentile(percent, info->write_sampler); + + print_percentile(95, info->write_sampler); + print_percentile(99, info->write_sampler); + print_percentile(99.9, info->write_sampler); + print_percentile(99.99, info->write_sampler); + } +} diff --git a/utils/keeper-bench/Stats.h b/utils/keeper-bench/Stats.h new file mode 100644 index 00000000000..1b9a31bb734 --- /dev/null +++ b/utils/keeper-bench/Stats.h @@ -0,0 +1,52 @@ +#pragma once + +#include +#include + +#include + +struct Stats +{ + std::atomic read_requests{0}; + std::atomic write_requests{0}; + size_t errors = 0; + size_t requests_write_bytes = 0; + size_t requests_read_bytes = 0; + double read_work_time = 0; + double write_work_time = 0; + + using Sampler = ReservoirSampler; + Sampler read_sampler {1 << 16}; + Sampler write_sampler {1 << 16}; + + void addRead(double seconds, size_t requests_inc, size_t bytes_inc) + { + read_work_time += seconds; + read_requests += requests_inc; + requests_read_bytes += bytes_inc; + read_sampler.insert(seconds); + } + + void addWrite(double seconds, size_t requests_inc, size_t bytes_inc) + { + write_work_time += seconds; + write_requests += requests_inc; + requests_write_bytes += bytes_inc; + write_sampler.insert(seconds); + } + + void clear() + { + read_requests = 0; + write_requests = 0; + read_work_time = 0; + write_work_time = 0; + requests_read_bytes = 0; + requests_write_bytes = 0; + read_sampler.clear(); + write_sampler.clear(); + } +}; + + +void report(std::shared_ptr & info, size_t concurrency); diff --git a/utils/keeper-bench/main.cpp b/utils/keeper-bench/main.cpp new file mode 100644 index 
00000000000..378d7c2f6e4 --- /dev/null +++ b/utils/keeper-bench/main.cpp @@ -0,0 +1,61 @@ +#include +#include +#include "Runner.h" +#include "Stats.h" +#include "Generator.h" +#include +#include + +using namespace std; + +int main(int argc, char *argv[]) +{ + + bool print_stacktrace = true; + + try + { + using boost::program_options::value; + + boost::program_options::options_description desc = createOptionsDescription("Allowed options", getTerminalWidth()); + desc.add_options() + ("help", "produce help message") + ("generator", value()->default_value("create_small_data"), "query to execute") + ("concurrency,c", value()->default_value(1), "number of parallel queries") + ("delay,d", value()->default_value(1), "delay between intermediate reports in seconds (set 0 to disable reports)") + ("iterations,i", value()->default_value(0), "amount of queries to be executed") + ("timelimit,t", value()->default_value(0.), "stop launch of queries after specified time limit") + ("hosts,h", value()->multitoken(), "") + ("continue_on_errors", "continue testing even if a query fails") + ("reconnect", "establish new connection for every query") + ; + + boost::program_options::variables_map options; + boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options); + boost::program_options::notify(options); + + if (options.count("help")) + { + std::cout << "Usage: " << argv[0] << " [options] < queries.txt\n"; + std::cout << desc << "\n"; + return 1; + } + + Runner runner(options["concurrency"].as(), + options["generator"].as(), + options["hosts"].as(), + options["timelimit"].as(), + options["delay"].as(), + options.count("continue_on_errors"), + options["iterations"].as()); + + runner.runBenchmark(); + + return 0; + } + catch (...) + { + std::cerr << DB::getCurrentExceptionMessage(print_stacktrace, true) << std::endl; + return DB::getCurrentExceptionCode(); + } +}
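
For reference, a hedged usage sketch of the new keeper-bench tool, based only on the options registered in main.cpp above; the endpoint localhost:9181 and the chosen values are illustrative, not taken from the patch:

    # Benchmark a Keeper endpoint with the "get_small_data" generator, 10 parallel
    # connections, a 30 second time limit and an intermediate report every 5 seconds.
    ./keeper-bench --generator get_small_data \
                   --concurrency 10 \
                   --timelimit 30 \
                   --delay 5 \
                   --hosts localhost:9181 \
                   --continue_on_errors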