diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index f0cef54b0b8..3e3bb7ec2b2 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -16,7 +16,9 @@ endmacro() if (SANITIZE) if (SANITIZE STREQUAL "address") - set (ASAN_FLAGS "-fsanitize=address -fsanitize-address-use-after-scope") + # LLVM-15 has a bug in Address Sanitizer, preventing the usage of 'sanitize-address-use-after-scope', + # see https://github.com/llvm/llvm-project/issues/58633 + set (ASAN_FLAGS "-fsanitize=address -fno-sanitize-address-use-after-scope") set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}") set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}") diff --git a/docker/docs/builder/run.sh b/docker/docs/builder/run.sh index a4f678b2f24..87e6218547f 100755 --- a/docker/docs/builder/run.sh +++ b/docker/docs/builder/run.sh @@ -25,6 +25,7 @@ done sed -i '/onBrokenMarkdownLinks:/ s/ignore/error/g' docusaurus.config.js if [[ $# -lt 1 ]] || [[ "$1" == "--"* ]]; then + export CI=true exec yarn build "$@" fi diff --git a/docker/server/Dockerfile.ubuntu b/docker/server/Dockerfile.ubuntu index d5fc5d8e0d3..f50160321e1 100644 --- a/docker/server/Dockerfile.ubuntu +++ b/docker/server/Dockerfile.ubuntu @@ -80,6 +80,16 @@ RUN arch=${TARGETARCH:-amd64} \ && mkdir -p /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client \ && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client +# Remove as much of Ubuntu as possible. +# ClickHouse does not need Ubuntu. It can run on top of Linux kernel without any OS distribution. +# ClickHouse does not need Docker at all. ClickHouse is above all that. +# It does not care about Ubuntu, Docker, or other cruft and you should neither. +# The fact that this Docker image is based on Ubuntu is just a misconception. +# Some vulnerability scanners are arguing about Ubuntu, which is not relevant to ClickHouse at all. +# ClickHouse does not care when you report false vulnerabilities by running some Docker scanners. + +RUN apt-get remove --purge -y libksba8 && apt-get autoremove -y + # we need to allow "others" access to clickhouse folder, because docker container # can be started with arbitrary uid (openshift usecase) diff --git a/docs/en/engines/table-engines/integrations/index.md b/docs/en/engines/table-engines/integrations/index.md index 7e67bcb6249..09e89209ea9 100644 --- a/docs/en/engines/table-engines/integrations/index.md +++ b/docs/en/engines/table-engines/integrations/index.md @@ -6,7 +6,7 @@ sidebar_label: Integrations # Table Engines for Integrations -ClickHouse provides various means for integrating with external systems, including table engines. Like with all other table engines, the configuration is done using `CREATE TABLE` or `ALTER TABLE` queries. Then from a user perspective, the configured integration looks like a normal table, but queries to it are proxied to the external system. This transparent querying is one of the key advantages of this approach over alternative integration methods, like external dictionaries or table functions, which require to use custom query methods on each use. +ClickHouse provides various means for integrating with external systems, including table engines. Like with all other table engines, the configuration is done using `CREATE TABLE` or `ALTER TABLE` queries. Then from a user perspective, the configured integration looks like a normal table, but queries to it are proxied to the external system. 
This transparent querying is one of the key advantages of this approach over alternative integration methods, like dictionaries or table functions, which require to use custom query methods on each use. List of supported integrations: diff --git a/docs/en/engines/table-engines/integrations/mysql.md b/docs/en/engines/table-engines/integrations/mysql.md index 7c9c4cfea53..9f637c50989 100644 --- a/docs/en/engines/table-engines/integrations/mysql.md +++ b/docs/en/engines/table-engines/integrations/mysql.md @@ -180,6 +180,6 @@ Default value: `300`. ## See Also {#see-also} - [The mysql table function](../../../sql-reference/table-functions/mysql.md) -- [Using MySQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) +- [Using MySQL as a dictionary source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) [Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/mysql/) diff --git a/docs/en/engines/table-engines/integrations/odbc.md b/docs/en/engines/table-engines/integrations/odbc.md index 043d5170654..e21a64bc5b2 100644 --- a/docs/en/engines/table-engines/integrations/odbc.md +++ b/docs/en/engines/table-engines/integrations/odbc.md @@ -126,7 +126,7 @@ SELECT * FROM odbc_t ## See Also {#see-also} -- [ODBC external dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) +- [ODBC dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) - [ODBC table function](../../../sql-reference/table-functions/odbc.md) [Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/odbc/) diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index 4bb8033de9c..c07512cf0ce 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -174,6 +174,6 @@ CREATE TABLE pg_table_schema_with_dots (a UInt32) **See Also** - [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md) -- [Using PostgreSQL as a source of external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) +- [Using PostgreSQL as a dictionary source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) [Original article](https://clickhouse.com/docs/en/engines/table-engines/integrations/postgresql/) diff --git a/docs/en/getting-started/example-datasets/cell-towers.md b/docs/en/getting-started/example-datasets/cell-towers.md index 67ee8cdb7e2..c9fb78205d7 100644 --- a/docs/en/getting-started/example-datasets/cell-towers.md +++ b/docs/en/getting-started/example-datasets/cell-towers.md @@ -163,7 +163,7 @@ SELECT mcc, count() FROM cell_towers GROUP BY mcc ORDER BY count() DESC LIMIT 10 Based on the above query and the [MCC list](https://en.wikipedia.org/wiki/Mobile_country_code), the countries with the most cell towers are: the USA, Germany, and Russia. 
-You may want to create an [External Dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) in ClickHouse to decode these values. +You may want to create a [Dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) in ClickHouse to decode these values. ## Use case: Incorporate geo data {#use-case} diff --git a/docs/en/getting-started/example-datasets/github.md b/docs/en/getting-started/example-datasets/github.md index fbc65d36ba9..9a4fbb7da06 100644 --- a/docs/en/getting-started/example-datasets/github.md +++ b/docs/en/getting-started/example-datasets/github.md @@ -10,8 +10,8 @@ This dataset contains all of the commits and changes for the ClickHouse reposito The generated data provides a `tsv` file for each of the following tables: -- `commits` - commits with statistics; -- `file_changes` - files changed in every commit with the info about the change and statistics; +- `commits` - commits with statistics. +- `file_changes` - files changed in every commit with the info about the change and statistics. - `line_changes` - every changed line in every changed file in every commit with full info about the line and the information about the previous change of this line. As of November 8th, 2022, each TSV is approximately the following size and number of rows: @@ -59,13 +59,15 @@ As of November 8th, 2022, each TSV is approximately the following size and numbe # Generating the data +This is optional. We distribute the data freely - see [Downloading and inserting the data](#downloading-and-inserting-the-data). + ```bash git clone git@github.com:ClickHouse/ClickHouse.git cd ClickHouse clickhouse git-import --skip-paths 'generated\.cpp|^(contrib|docs?|website|libs/(libcityhash|liblz4|libdivide|libvectorclass|libdouble-conversion|libcpuid|libzstd|libfarmhash|libmetrohash|libpoco|libwidechar_width))/' --skip-commits-with-messages '^Merge branch ' ``` -This will take around 3 minutes (as of November 8th 2022) to complete for the ClickHouse repository. +This will take around 3 minutes (as of November 8th 2022 on a MacBook Pro 2021) to complete for the ClickHouse repository. A full list of available options can be obtained from the tools native help. @@ -100,16 +102,18 @@ CREATE TABLE git.commits # Downloading and inserting the data +The following data can be used to reproduce a working environment. Alternatively, this dataset is available in play.clickhouse.com - see [Queries](#queries) for further details. 
+ Generated files for the following repositories can be found below: - ClickHouse (Nov 8th 2022) - - https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/commits/clickhouse/commits.tsv.xz - 2.5 MB - - https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/commits/clickhouse/file_changes.tsv.xz - 4.5MB - - https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/commits/clickhouse/line_changes.tsv.xz - 127.4 MB + - https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/commits.tsv.xz - 2.5 MB + - https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/file_changes.tsv.xz - 4.5MB + - https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/line_changes.tsv.xz - 127.4 MB - Linux (Nov 8th 2022) - - https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/commits/linux/commits.tsv.xz - 2.5 MB - - https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/commits/linux/file_changes.tsv.xz - 4.5MB - - https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/commits/linux/line_changes.tsv.xz - 127.4 MB + - https://datasets-documentation.s3.amazonaws.com/github/commits/linux/commits.tsv.xz - 44 MB + - https://datasets-documentation.s3.amazonaws.com/github/commits/linux/file_changes.tsv.xz - 467MB + - https://datasets-documentation.s3.amazonaws.com/github/commits/linux/line_changes.tsv.xz - 1.1G To insert this data, prepare the database by executing the following queries: @@ -212,7 +216,7 @@ Insert the data using `INSERT INTO SELECT` and the [s3 function](https://clickho ```sql INSERT INTO git.commits SELECT * -FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/commits/clickhouse/commits.tsv.xz', 'TSV', 'hash String,author LowCardinality(String), time DateTime, message String, files_added UInt32, files_deleted UInt32, files_renamed UInt32, files_modified UInt32, lines_added UInt32, lines_deleted UInt32, hunks_added UInt32, hunks_removed UInt32, hunks_changed UInt32') +FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/commits.tsv.xz', 'TSV', 'hash String,author LowCardinality(String), time DateTime, message String, files_added UInt32, files_deleted UInt32, files_renamed UInt32, files_modified UInt32, lines_added UInt32, lines_deleted UInt32, hunks_added UInt32, hunks_removed UInt32, hunks_changed UInt32') 0 rows in set. Elapsed: 1.826 sec. Processed 62.78 thousand rows, 8.50 MB (34.39 thousand rows/s., 4.66 MB/s.) 
``` @@ -221,7 +225,7 @@ FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/commit ```sql INSERT INTO git.file_changes SELECT * -FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/commits/clickhouse/file_changes.tsv.xz', 'TSV', 'change_type Enum(\'Add\' = 1, \'Delete\' = 2, \'Modify\' = 3, \'Rename\' = 4, \'Copy\' = 5, \'Type\' = 6), path LowCardinality(String), old_path LowCardinality(String), file_extension LowCardinality(String), lines_added UInt32, lines_deleted UInt32, hunks_added UInt32, hunks_removed UInt32, hunks_changed UInt32, commit_hash String, author LowCardinality(String), time DateTime, commit_message String, commit_files_added UInt32, commit_files_deleted UInt32, commit_files_renamed UInt32, commit_files_modified UInt32, commit_lines_added UInt32, commit_lines_deleted UInt32, commit_hunks_added UInt32, commit_hunks_removed UInt32, commit_hunks_changed UInt32') +FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/file_changes.tsv.xz', 'TSV', 'change_type Enum(\'Add\' = 1, \'Delete\' = 2, \'Modify\' = 3, \'Rename\' = 4, \'Copy\' = 5, \'Type\' = 6), path LowCardinality(String), old_path LowCardinality(String), file_extension LowCardinality(String), lines_added UInt32, lines_deleted UInt32, hunks_added UInt32, hunks_removed UInt32, hunks_changed UInt32, commit_hash String, author LowCardinality(String), time DateTime, commit_message String, commit_files_added UInt32, commit_files_deleted UInt32, commit_files_renamed UInt32, commit_files_modified UInt32, commit_lines_added UInt32, commit_lines_deleted UInt32, commit_hunks_added UInt32, commit_hunks_removed UInt32, commit_hunks_changed UInt32') 0 rows in set. Elapsed: 2.688 sec. Processed 266.05 thousand rows, 48.30 MB (98.97 thousand rows/s., 17.97 MB/s.) 
``` @@ -230,56 +234,23 @@ FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/commit ```sql INSERT INTO git.line_changes SELECT * -FROM s3('https://datasets-documentation.s3.eu-west-3.amazonaws.com/github/commits/clickhouse/line_changes.tsv.xz', 'TSV', ' sign Int8, line_number_old UInt32, line_number_new UInt32, hunk_num UInt32, hunk_start_line_number_old UInt32, hunk_start_line_number_new UInt32, hunk_lines_added UInt32,\n hunk_lines_deleted UInt32, hunk_context LowCardinality(String), line LowCardinality(String), indent UInt8, line_type Enum(\'Empty\' = 0, \'Comment\' = 1, \'Punct\' = 2, \'Code\' = 3), prev_commit_hash String, prev_author LowCardinality(String), prev_time DateTime, file_change_type Enum(\'Add\' = 1, \'Delete\' = 2, \'Modify\' = 3, \'Rename\' = 4, \'Copy\' = 5, \'Type\' = 6),\n path LowCardinality(String), old_path LowCardinality(String), file_extension LowCardinality(String), file_lines_added UInt32, file_lines_deleted UInt32, file_hunks_added UInt32, file_hunks_removed UInt32, file_hunks_changed UInt32, commit_hash String,\n author LowCardinality(String), time DateTime, commit_message String, commit_files_added UInt32, commit_files_deleted UInt32, commit_files_renamed UInt32, commit_files_modified UInt32, commit_lines_added UInt32, commit_lines_deleted UInt32, commit_hunks_added UInt32, commit_hunks_removed UInt32, commit_hunks_changed UInt32') +FROM s3('https://datasets-documentation.s3.amazonaws.com/github/commits/clickhouse/line_changes.tsv.xz', 'TSV', ' sign Int8, line_number_old UInt32, line_number_new UInt32, hunk_num UInt32, hunk_start_line_number_old UInt32, hunk_start_line_number_new UInt32, hunk_lines_added UInt32,\n hunk_lines_deleted UInt32, hunk_context LowCardinality(String), line LowCardinality(String), indent UInt8, line_type Enum(\'Empty\' = 0, \'Comment\' = 1, \'Punct\' = 2, \'Code\' = 3), prev_commit_hash String, prev_author LowCardinality(String), prev_time DateTime, file_change_type Enum(\'Add\' = 1, \'Delete\' = 2, \'Modify\' = 3, \'Rename\' = 4, \'Copy\' = 5, \'Type\' = 6),\n path LowCardinality(String), old_path LowCardinality(String), file_extension LowCardinality(String), file_lines_added UInt32, file_lines_deleted UInt32, file_hunks_added UInt32, file_hunks_removed UInt32, file_hunks_changed UInt32, commit_hash String,\n author LowCardinality(String), time DateTime, commit_message String, commit_files_added UInt32, commit_files_deleted UInt32, commit_files_renamed UInt32, commit_files_modified UInt32, commit_lines_added UInt32, commit_lines_deleted UInt32, commit_hunks_added UInt32, commit_hunks_removed UInt32, commit_hunks_changed UInt32') 0 rows in set. Elapsed: 50.535 sec. Processed 7.54 million rows, 2.09 GB (149.11 thousand rows/s., 41.40 MB/s.) ``` # Queries -The tool suggests several queries via its help output. We have answered these in addition to some additional supplementary questions of interest: +The tool suggests several queries via its help output. We have answered these in addition to some additional supplementary questions of interest. These queries are of approximately increasing complexity vs. the tool's arbitrary order. 
-- [ClickHouse GitHub data](#clickhouse-github-data) -- [Table of Contents](#table-of-contents) -- [Generating the data](#generating-the-data) -- [Downloading and inserting the data](#downloading-and-inserting-the-data) -- [Queries](#queries) - - [History of a single file](#history-of-a-single-file) - - [Find the current active files](#find-the-current-active-files) - - [List files with most modifications](#list-files-with-most-modifications) - - [What day of the week do commits usually occur?](#what-day-of-the-week-do-commits-usually-occur) - - [History of subdirectory/file - number of lines, commits and contributors over time](#history-of-subdirectoryfile---number-of-lines-commits-and-contributors-over-time) - - [List files with maximum number of authors](#list-files-with-maximum-number-of-authors) - - [Oldest lines of code in the repository](#oldest-lines-of-code-in-the-repository) - - [Files with longest history](#files-with-longest-history) - - [Distribution of contributors with respect to docs and code over the month](#distribution-of-contributors-with-respect-to-docs-and-code-over-the-month) - - [Authors with the most diverse impact](#authors-with-the-most-diverse-impact) - - [Favorite files for an author](#favorite-files-for-an-author) - - [Largest files with lowest number of authors](#largest-files-with-lowest-number-of-authors) - - [Commits and lines of code distribution by time; by weekday, by author; for specific subdirectories](#commits-and-lines-of-code-distribution-by-time-by-weekday-by-author-for-specific-subdirectories) - - [Matrix of authors that shows what authors tends to rewrite another authors code](#matrix-of-authors-that-shows-what-authors-tends-to-rewrite-another-authors-code) - - [Who is the highest percentage contributor per day of week?](#who-is-the-highest-percentage-contributor-per-day-of-week) - - [Distribution of code age across repository](#distribution-of-code-age-across-repository) - - [What percentage of code for an author has been removed by other authors?](#what-percentage-of-code-for-an-author-has-been-removed-by-other-authors) - - [List files that were rewritten most number of times?](#list-files-that-were-rewritten-most-number-of-times) - - [What weekday does the code have the highest chance to stay in the repository?](#what-weekday-does-the-code-have-the-highest-chance-to-stay-in-the-repository) - - [Files sorted by average code age](#files-sorted-by-average-code-age) - - [Who tends to write more tests / CPP code / comments?](#who-tends-to-write-more-tests--cpp-code--comments) - - [How does an authors commits change over time with respect to code/comments percentage?](#how-does-an-authors-commits-change-over-time-with-respect-to-codecomments-percentage) - - [What is the average time before code will be rewritten and the median (half-life of code decay)?](#what-is-the-average-time-before-code-will-be-rewritten-and-the-median-half-life-of-code-decay) - - [What is the worst time to write code in sense that the code has highest chance to be re-written?](#what-is-the-worst-time-to-write-code-in-sense-that-the-code-has-highest-chance-to-be-re-written) - - [Which authors code is the most sticky?](#which-authors-code-is-the-most-sticky) - - [Most consecutive days of commits by an author](#most-consecutive-days-of-commits-by-an-author) - - [Line by line commit history of a file](#line-by-line-commit-history-of-a-file) -- [Unsolved Questions](#unsolved-questions) - - [Git blame](#git-blame) - -These queries are of approximately increasing complexity vs. 
the tool's arbitrary order. +This dataset is available in [play.clickhouse.com](https://play.clickhouse.com/play?user=play#U0hPVyBUQUJMRVMgSU4gZ2l0X2NsaWNraG91c2U=) in the `git_clickhouse` databases. We provide a link to this environment for all queries, adapting the database name as required. Note that play results may vary from the those presented here due to differences in time of data collection. ## History of a single file The simplest of queries. Here we look at all commit messages for the `StorageReplicatedMergeTree.cpp`. Since these are likely more interesting, we sort by the most recent messages first. +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0aW1lLAogICAgc3Vic3RyaW5nKGNvbW1pdF9oYXNoLCAxLCAxMSkgQVMgY29tbWl0LAogICAgY2hhbmdlX3R5cGUsCiAgICBhdXRob3IsCiAgICBwYXRoLAogICAgb2xkX3BhdGgsCiAgICBsaW5lc19hZGRlZCwKICAgIGxpbmVzX2RlbGV0ZWQsCiAgICBjb21taXRfbWVzc2FnZQpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSBwYXRoID0gJ3NyYy9TdG9yYWdlcy9TdG9yYWdlUmVwbGljYXRlZE1lcmdlVHJlZS5jcHAnCk9SREVSIEJZIHRpbWUgREVTQwpMSU1JVCAxMA==) + ```sql SELECT time, @@ -312,8 +283,11 @@ LIMIT 10 10 rows in set. Elapsed: 0.006 sec. Processed 12.10 thousand rows, 1.60 MB (1.93 million rows/s., 255.40 MB/s.) ``` + We can also review the line changes, excluding renames i.e. we won't show changes before a rename event when the file existed under a different name: +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0aW1lLAogICAgc3Vic3RyaW5nKGNvbW1pdF9oYXNoLCAxLCAxMSkgQVMgY29tbWl0LAogICAgc2lnbiwKICAgIGxpbmVfbnVtYmVyX29sZCwKICAgIGxpbmVfbnVtYmVyX25ldywKICAgIGF1dGhvciwKICAgIGxpbmUKRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKV0hFUkUgcGF0aCA9ICdzcmMvU3RvcmFnZXMvU3RvcmFnZVJlcGxpY2F0ZWRNZXJnZVRyZWUuY3BwJwpPUkRFUiBCWSBsaW5lX251bWJlcl9uZXcgQVNDCkxJTUlUIDEw) + ```sql SELECT time, @@ -352,6 +326,8 @@ This is important for later analysis when we only want to consider the current f **Note there appears to have been a broken commit history in relation to files under the `dbms`, `libs`, `tests/testflows/` directories during their renames. We also thus exclude these.** +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUIHBhdGgKRlJPTQooCiAgICBTRUxFQ1QKICAgICAgICBvbGRfcGF0aCBBUyBwYXRoLAogICAgICAgIG1heCh0aW1lKSBBUyBsYXN0X3RpbWUsCiAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgIEdST1VQIEJZIG9sZF9wYXRoCiAgICBVTklPTiBBTEwKICAgIFNFTEVDVAogICAgICAgIHBhdGgsCiAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICBhcmdNYXgoY2hhbmdlX3R5cGUsIHRpbWUpIEFTIGNoYW5nZV90eXBlCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgR1JPVVAgQlkgcGF0aAopCkdST1VQIEJZIHBhdGgKSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIE5PVCBtYXRjaChwYXRoLCAnKF5kYm1zLyl8KF5saWJzLyl8KF50ZXN0cy90ZXN0Zmxvd3MvKXwoXnByb2dyYW1zL3NlcnZlci9zdG9yZS8pJykgT1JERVIgQlkgcGF0aApMSU1JVCAxMA==) + ```sql SELECT path FROM @@ -392,6 +368,8 @@ LIMIT 10 Note that this allows for files to be renamed and then re-renamed to their original values. First we aggregate `old_path` for a list of deleted files as a result of renaming. We union this with the last operation for every `path`. Finally, we filter this list to those where the final event is not a `Delete`. 
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUIHVuaXEocGF0aCkKRlJPTQooCiAgICBTRUxFQ1QgcGF0aAogICAgRlJPTQogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBvbGRfcGF0aCBBUyBwYXRoLAogICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAyIEFTIGNoYW5nZV90eXBlCiAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgIFVOSU9OIEFMTAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICBhcmdNYXgoY2hhbmdlX3R5cGUsIHRpbWUpIEFTIGNoYW5nZV90eXBlCiAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICApCiAgICBHUk9VUCBCWSBwYXRoCiAgICBIQVZJTkcgKGFyZ01heChjaGFuZ2VfdHlwZSwgbGFzdF90aW1lKSAhPSAyKSBBTkQgTk9UIG1hdGNoKHBhdGgsICcoXmRibXMvKXwoXmxpYnMvKXwoXnRlc3RzL3Rlc3RmbG93cy8pfChecHJvZ3JhbXMvc2VydmVyL3N0b3JlLyknKSBPUkRFUiBCWSBwYXRoCikK) + ```sql SELECT uniq(path) FROM @@ -440,7 +418,9 @@ The difference here is caused by a few factors: - A rename can occur alongside other modifications to the file. These are listed as separate events in file_changes but with the same time. The `argMax` function has no way of distinguishing these - it picks the first value. The natural ordering of the inserts (the only means of knowing the correct order) is not maintained across the union so modified events can be selected. For example, below the `src/Functions/geometryFromColumn.h` file has several modifications before being renamed to `src/Functions/geometryConverters.h`. Our current solution may pick a Modify event as the latest change causing `src/Functions/geometryFromColumn.h` to be retained. - ```sql +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICAgIGNoYW5nZV90eXBlLAogICAgICBwYXRoLAogICAgICBvbGRfcGF0aCwKICAgICAgdGltZSwKICAgICAgY29tbWl0X2hhc2gKICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogIFdIRVJFIChwYXRoID0gJ3NyYy9GdW5jdGlvbnMvZ2VvbWV0cnlGcm9tQ29sdW1uLmgnKSBPUiAob2xkX3BhdGggPSAnc3JjL0Z1bmN0aW9ucy9nZW9tZXRyeUZyb21Db2x1bW4uaCcpCg==) + +```sql SELECT change_type, path, @@ -464,7 +444,7 @@ The difference here is caused by a few factors: │ Rename │ src/Functions/geometryConverters.h │ src/Functions/geometryFromColumn.h │ 2021-03-11 12:08:16 │ 125945769586baf6ffd15919b29565b1b2a63218 │ └─────────────┴────────────────────────────────────┴────────────────────────────────────┴─────────────────────┴──────────────────────────────────────────┘ 11 rows in set. Elapsed: 0.030 sec. Processed 266.05 thousand rows, 6.61 MB (8.89 million rows/s., 220.82 MB/s.) - ``` +``` - Broken commit history - missing delete events. Source and cause TBD. These differences shouldn't meaningfully impact our analysis. **We welcome improved versions of this query**. @@ -473,6 +453,8 @@ These differences shouldn't meaningfully impact our analysis. **We welcome impro Limiting to current files, we consider the number of modifications to be the sum of deletes and additions. 
+[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIHN1bShsaW5lc19hZGRlZCkgKyBzdW0obGluZXNfZGVsZXRlZCkgQVMgbW9kaWZpY2F0aW9ucwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSAocGF0aCBJTiAoY3VycmVudF9maWxlcykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKR1JPVVAgQlkgcGF0aApPUkRFUiBCWSBtb2RpZmljYXRpb25zIERFU0MKTElNSVQgMTA=) + ```sql WITH current_files AS ( @@ -524,17 +506,23 @@ LIMIT 10 ## What day of the week do commits usually occur? +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlfb2Zfd2VlaywKICAgIGNvdW50KCkgQVMgYwpGUk9NIGdpdF9jbGlja2hvdXNlLmNvbW1pdHMKR1JPVVAgQlkgZGF5T2ZXZWVrKHRpbWUpIEFTIGRheV9vZl93ZWVrCg==) + ```sql -SELECT day_of_week, count() as c FROM git.commits GROUP BY dayOfWeek(time) as day_of_week +SELECT + day_of_week, + count() AS c +FROM git.commits +GROUP BY dayOfWeek(time) AS day_of_week ┌─day_of_week─┬─────c─┐ -│ 1 │ 10291 │ -│ 2 │ 10519 │ -│ 3 │ 10598 │ -│ 4 │ 10812 │ -│ 5 │ 9964 │ -│ 6 │ 4583 │ -│ 7 │ 5134 │ +│ 1 │ 10575 │ +│ 2 │ 10645 │ +│ 3 │ 10748 │ +│ 4 │ 10944 │ +│ 5 │ 10090 │ +│ 6 │ 4617 │ +│ 7 │ 5166 │ └─────────────┴───────┘ 7 rows in set. Elapsed: 0.262 sec. Processed 62.78 thousand rows, 251.14 KB (239.73 thousand rows/s., 958.93 KB/s.) ``` @@ -545,6 +533,8 @@ This makes sense with some productivity drop-off on Fridays. Great to see people This would produce a large query result that is unrealistic to show or visualize if unfiltered. We, therefore, allow a file or subdirectory to be filtered in the following example. Here we group by week using the `toStartOfWeek` function - adapt as required. +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB3ZWVrLAogICAgc3VtKGxpbmVzX2FkZGVkKSBBUyBsaW5lc19hZGRlZCwKICAgIHN1bShsaW5lc19kZWxldGVkKSBBUyBsaW5lc19kZWxldGVkLAogICAgdW5pcShjb21taXRfaGFzaCkgQVMgbnVtX2NvbW1pdHMsCiAgICB1bmlxKGF1dGhvcikgQVMgYXV0aG9ycwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSBwYXRoIExJS0UgJ3NyYy9TdG9yYWdlcyUnCkdST1VQIEJZIHRvU3RhcnRPZldlZWsodGltZSkgQVMgd2VlawpPUkRFUiBCWSB3ZWVrIEFTQwpMSU1JVCAxMAo=) + ```sql SELECT week, @@ -587,6 +577,8 @@ This data visualizes well. Below we use Superset. Limit to current files only. 
+[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIHVuaXEoYXV0aG9yKSBBUyBudW1fYXV0aG9ycwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSBwYXRoIElOIChjdXJyZW50X2ZpbGVzKQpHUk9VUCBCWSBwYXRoCk9SREVSIEJZIG51bV9hdXRob3JzIERFU0MKTElNSVQgMTA=) + ```sql WITH current_files AS ( @@ -640,6 +632,8 @@ LIMIT 10 Limited to current files only. +[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgYW55KHBhdGgpIEFTIGZpbGVfcGF0aCwKICAgIGxpbmUsCiAgICBtYXgodGltZSkgQVMgbGF0ZXN0X2NoYW5nZSwKICAgIGFueShmaWxlX2NoYW5nZV90eXBlKQpGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwpXSEVSRSBwYXRoIElOIChjdXJyZW50X2ZpbGVzKQpHUk9VUCBCWSBsaW5lCk9SREVSIEJZIGxhdGVzdF9jaGFuZ2UgQVNDCkxJTUlUIDEw) + ```sql WITH current_files AS ( @@ -695,6 +689,8 @@ LIMIT 10 Limited to current files only. 
+[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgY291bnQoKSBBUyBjLAogICAgcGF0aCwKICAgIG1heCh0aW1lKSBBUyBsYXRlc3RfY2hhbmdlCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpCkdST1VQIEJZIHBhdGgKT1JERVIgQlkgYyBERVNDCkxJTUlUIDEw) + ```sql WITH current_files AS ( @@ -749,10 +745,12 @@ Our core data structure, the Merge Tree, is obviously under constant evolution w ## Distribution of contributors with respect to docs and code over the month -**During data capture the changes on the `docs/` folder have been filtered out so this is an estimate only** +**During data capture the changes on the `docs/` folder have been filtered out due to a very commit dirty history. The results of this query are therefore not accurate.** Do we write more docs at certain times of the month e.g., around release dates? We can use the `countIf` function to compute a simple ratio, visualizing the result using the `bar` function. 
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXksCiAgICBiYXIoZG9jc19yYXRpbyAqIDEwMDAsIDAsIDEwMCwgMTAwKSBBUyBiYXIKRlJPTQooCiAgICBTRUxFQ1QKICAgICAgICBkYXksCiAgICAgICAgY291bnRJZihmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcpKSBBUyBjb2RlLAogICAgICAgIGNvdW50SWYoZmlsZV9leHRlbnNpb24gPSAnbWQnKSBBUyBkb2NzLAogICAgICAgIGRvY3MgLyAoY29kZSArIGRvY3MpIEFTIGRvY3NfcmF0aW8KICAgIEZST00gZ2l0X2NsaWNraG91c2UubGluZV9jaGFuZ2VzCiAgICBXSEVSRSAoc2lnbiA9IDEpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnLCAnbWQnKSkKICAgIEdST1VQIEJZIGRheU9mTW9udGgodGltZSkgQVMgZGF5CikK) + ```sql SELECT day, @@ -761,7 +759,7 @@ FROM ( SELECT day, - countIf(file_extension IN ('h', 'cpp', 'sql', 'md')) AS code, + countIf(file_extension IN ('h', 'cpp', 'sql')) AS code, countIf(file_extension = 'md') AS docs, docs / (code + docs) AS docs_ratio FROM git.line_changes @@ -769,49 +767,51 @@ FROM GROUP BY dayOfMonth(time) AS day ) -┌─day─┬─bar──────────────────────────────────────────────────────────┐ -│ 1 │ ██████████████████████████████████▎ │ -│ 2 │ ███████████████████████▏ │ -│ 3 │ ███████████████████████████████▋ │ -│ 4 │ ████████████▊ │ -│ 5 │ ████████████████████▊ │ -│ 6 │ ███████▊ │ -│ 7 │ ███▋ │ -│ 8 │ ████████▍ │ -│ 9 │ ██████████████ │ -│ 10 │ ████████████████▋ │ -│ 11 │ █████████████▏ │ -│ 12 │ ██████████████████████████████████▌ │ -│ 13 │ ████████████████████████████▌ │ -│ 14 │ ██████▋ │ -│ 15 │ ████████████████████████████████████████▎ │ -│ 16 │ ██████████▏ │ -│ 17 │ █████████████████████████████████████▏ │ -│ 18 │ ████████████████████████████████▍ │ -│ 19 │ ██████████▊ │ -│ 20 │ ████████████████████████████████▋ │ -│ 21 │ █████ │ -│ 22 │ ███████████████████████▏ │ -│ 23 │ ██████████████████████████▋ │ -│ 24 │ ███████▌ │ -│ 25 │ █████████████████████████████████▏ │ -│ 26 │ ███████████ │ -│ 27 │ ███████████████████████████████████████████████████████████▎ │ -│ 28 │ █████████████████████████████████████████████████▌ │ -│ 29 │ ███▌ │ -│ 30 │ ██████████████████████████████████████▋ │ -│ 31 │ ████████████████████████████████▏ │ -└─────┴──────────────────────────────────────────────────────────────┘ +┌─day─┬─bar─────────────────────────────────────────────────────────────┐ +│ 1 │ ███████████████████████████████████▍ │ +│ 2 │ ███████████████████████▋ │ +│ 3 │ ████████████████████████████████▋ │ +│ 4 │ █████████████ │ +│ 5 │ █████████████████████▎ │ +│ 6 │ ████████ │ +│ 7 │ ███▋ │ +│ 8 │ ████████▌ │ +│ 9 │ ██████████████▎ │ +│ 10 │ █████████████████▏ │ +│ 11 │ █████████████▎ │ +│ 12 │ ███████████████████████████████████▋ │ +│ 13 │ █████████████████████████████▎ │ +│ 14 │ ██████▋ │ +│ 15 │ █████████████████████████████████████████▊ │ +│ 16 │ ██████████▎ │ +│ 17 │ ██████████████████████████████████████▋ │ +│ 18 │ █████████████████████████████████▌ │ +│ 19 │ ███████████ │ +│ 20 │ █████████████████████████████████▊ │ +│ 21 │ █████ │ +│ 22 │ ███████████████████████▋ │ +│ 23 │ ███████████████████████████▌ │ +│ 24 │ ███████▌ │ +│ 25 │ ██████████████████████████████████▎ │ +│ 26 │ ███████████▏ │ +│ 27 │ ███████████████████████████████████████████████████████████████ │ +│ 28 │ ████████████████████████████████████████████████████▏ │ +│ 29 │ ███▌ │ +│ 30 │ ████████████████████████████████████████▎ │ +│ 31 │ █████████████████████████████████▏ │ +└─────┴─────────────────────────────────────────────────────────────────┘ -31 rows in set. Elapsed: 0.043 sec. Processed 7.54 million rows, 40.53 MB (176.56 million rows/s., 949.58 MB/s.) +31 rows in set. Elapsed: 0.043 sec. 
Processed 7.54 million rows, 40.53 MB (176.71 million rows/s., 950.40 MB/s.) ``` -Maybe a little more near the end of the month, but overall we keep a good even distribution. +Maybe a little more near the end of the month, but overall we keep a good even distribution. Again this is unrealiable due to the filtering of the docs filter during data insertion. ## Authors with the most diverse impact We consider diversity here to be the number of unique files an author has contributed to. +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICB1bmlxKHBhdGgpIEFTIG51bV9maWxlcwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSAoY2hhbmdlX3R5cGUgSU4gKCdBZGQnLCAnTW9kaWZ5JykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKR1JPVVAgQlkgYXV0aG9yCk9SREVSIEJZIG51bV9maWxlcyBERVNDCkxJTUlUIDEw) + ```sql SELECT author, @@ -838,10 +838,57 @@ LIMIT 10 10 rows in set. Elapsed: 0.041 sec. Processed 266.05 thousand rows, 4.92 MB (6.56 million rows/s., 121.21 MB/s.) ``` +Let's see who has the most diverse commits in their recent work. Rather than limit by date, we'll restrict to an author's last N commits (in this case, we've used 3 but feel free to modify): + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICBzdW0obnVtX2ZpbGVzX2NvbW1pdCkgQVMgbnVtX2ZpbGVzCkZST00KKAogICAgU0VMRUNUCiAgICAgICAgYXV0aG9yLAogICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgIHVuaXEocGF0aCkgQVMgbnVtX2ZpbGVzX2NvbW1pdCwKICAgICAgICBtYXgodGltZSkgQVMgY29tbWl0X3RpbWUKICAgIEZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCiAgICBXSEVSRSAoY2hhbmdlX3R5cGUgSU4gKCdBZGQnLCAnTW9kaWZ5JykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKICAgIEdST1VQIEJZCiAgICAgICAgYXV0aG9yLAogICAgICAgIGNvbW1pdF9oYXNoCiAgICBPUkRFUiBCWQogICAgICAgIGF1dGhvciBBU0MsCiAgICAgICAgY29tbWl0X3RpbWUgREVTQwogICAgTElNSVQgMyBCWSBhdXRob3IKKQpHUk9VUCBCWSBhdXRob3IKT1JERVIgQlkgbnVtX2ZpbGVzIERFU0MKTElNSVQgMTA=) + +```sql +SELECT + author, + sum(num_files_commit) AS num_files +FROM +( + SELECT + author, + commit_hash, + uniq(path) AS num_files_commit, + max(time) AS commit_time + FROM git.file_changes + WHERE (change_type IN ('Add', 'Modify')) AND (file_extension IN ('h', 'cpp', 'sql')) + GROUP BY + author, + commit_hash + ORDER BY + author ASC, + commit_time DESC + LIMIT 3 BY author +) +GROUP BY author +ORDER BY num_files DESC +LIMIT 10 + +┌─author───────────────┬─num_files─┐ +│ Mikhail │ 782 │ +│ Li Yin │ 553 │ +│ Roman Peshkurov │ 119 │ +│ Vladimir Smirnov │ 88 │ +│ f1yegor │ 65 │ +│ maiha │ 54 │ +│ Vitaliy Lyudvichenko │ 53 │ +│ Pradeep Chhetri │ 40 │ +│ Orivej Desh │ 38 │ +│ liyang │ 36 │ +└──────────────────────┴───────────┘ + +10 rows in set. Elapsed: 0.106 sec. Processed 266.05 thousand rows, 21.04 MB (2.52 million rows/s., 198.93 MB/s.) +``` + ## Favorite files for an author Here we select our founder [Alexey Milovidov](https://github.com/alexey-milovidov) and limit our analysis to current files. 
+[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIGNvdW50KCkgQVMgYwpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSAoYXV0aG9yID0gJ0FsZXhleSBNaWxvdmlkb3YnKSBBTkQgKHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpKQpHUk9VUCBCWSBwYXRoCk9SREVSIEJZIGMgREVTQwpMSU1JVCAxMA==) + ```sql WITH current_files AS ( @@ -870,52 +917,53 @@ SELECT path, count() AS c FROM git.file_changes -WHERE (author = 'alexey-milovidov') AND (path IN (current_files)) +WHERE (author = 'Alexey Milovidov') AND (path IN (current_files)) GROUP BY path ORDER BY c DESC LIMIT 10 -┌─path────────────────────────────────────────────┬───c─┐ -│ CHANGELOG.md │ 174 │ -│ CMakeLists.txt │ 22 │ -│ src/Common/HashTable/HashTable.h │ 8 │ -│ .github/PULL_REQUEST_TEMPLATE.md │ 8 │ -│ src/Core/Settings.h │ 8 │ -│ src/Storages/StorageReplicatedMergeTree.cpp │ 7 │ -│ README.md │ 7 │ -│ docker/test/fuzzer/run-fuzzer.sh │ 7 │ -│ programs/install/Install.cpp │ 7 │ -│ src/Dictionaries/ExecutableDictionarySource.cpp │ 6 │ -└─────────────────────────────────────────────────┴─────┘ +┌─path────────────────────────────────────────┬───c─┐ +│ CMakeLists.txt │ 165 │ +│ CHANGELOG.md │ 126 │ +│ programs/server/Server.cpp │ 73 │ +│ src/Storages/MergeTree/MergeTreeData.cpp │ 71 │ +│ src/Storages/StorageReplicatedMergeTree.cpp │ 68 │ +│ src/Core/Settings.h │ 65 │ +│ programs/client/Client.cpp │ 57 │ +│ programs/server/play.html │ 48 │ +│ .gitmodules │ 47 │ +│ programs/install/Install.cpp │ 37 │ +└─────────────────────────────────────────────┴─────┘ 10 rows in set. Elapsed: 0.106 sec. Processed 798.15 thousand rows, 13.97 MB (7.51 million rows/s., 131.41 MB/s.) ``` This makes sense because Alexey has been responsible for maintaining the Change log. But what if we use the basename of the file to identify his popular files - this allows for renames and should focus on code contributions. 
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBiYXNlLAogICAgY291bnQoKSBBUyBjCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIChhdXRob3IgPSAnQWxleGV5IE1pbG92aWRvdicpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKR1JPVVAgQlkgYmFzZW5hbWUocGF0aCkgQVMgYmFzZQpPUkRFUiBCWSBjIERFU0MKTElNSVQgMTA=) ```sql SELECT base, count() AS c FROM git.file_changes -WHERE (author = 'alexey-milovidov') AND (file_extension IN ('h', 'cpp', 'sql')) +WHERE (author = 'Alexey Milovidov') AND (file_extension IN ('h', 'cpp', 'sql')) GROUP BY basename(path) AS base ORDER BY c DESC LIMIT 10 -┌─base───────────────────────────┬──c─┐ -│ StorageReplicatedMergeTree.cpp │ 22 │ -│ Settings.h │ 22 │ -│ InterpreterSelectQuery.cpp │ 19 │ -│ MergeTreeData.cpp │ 18 │ -│ Client.cpp │ 17 │ -│ Context.cpp │ 17 │ -│ Server.cpp │ 12 │ -│ ExecutableDictionarySource.cpp │ 12 │ -│ ExpressionAnalyzer.cpp │ 12 │ -│ PODArray.h │ 12 │ -└────────────────────────────────┴────┘ +┌─base───────────────────────────┬───c─┐ +│ StorageReplicatedMergeTree.cpp │ 393 │ +│ InterpreterSelectQuery.cpp │ 299 │ +│ Aggregator.cpp │ 297 │ +│ Client.cpp │ 280 │ +│ MergeTreeData.cpp │ 274 │ +│ Server.cpp │ 264 │ +│ ExpressionAnalyzer.cpp │ 259 │ +│ StorageMergeTree.cpp │ 239 │ +│ Settings.h │ 225 │ +│ TCPHandler.cpp │ 205 │ +└────────────────────────────────┴─────┘ 10 rows in set. Elapsed: 0.032 sec. Processed 266.05 thousand rows, 5.68 MB (8.22 million rows/s., 175.50 MB/s.) ``` @@ -927,6 +975,8 @@ For this, we first need to identify the largest files. Estimating this via a ful To estimate, assuming we restrict to current files, we sum line additions and subtract deletions. We can then compute a ratio of length to the number of authors. +[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIHN1bShsaW5lc19hZGRlZCkgLSBzdW0obGluZXNfZGVsZXRlZCkgQVMgbnVtX2xpbmVzLAogICAgdW5pcUV4YWN0KGF1dGhvcikgQVMgbnVtX2F1dGhvcnMsCiAgICBudW1fbGluZXMgLyBudW1fYXV0aG9ycyBBUyBsaW5lc19hdXRob3JfcmF0aW8KRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKV0hFUkUgcGF0aCBJTiAoY3VycmVudF9maWxlcykKR1JPVVAgQlkgcGF0aApPUkRFUiBCWSBsaW5lc19hdXRob3JfcmF0aW8gREVTQwpMSU1JVCAxMA==) + ```sql WITH current_files AS ( @@ -980,6 +1030,8 @@ LIMIT 10 Text dictionaries aren't maybe realistic, so lets restrict to code only via a file extension filter! 
+[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIHN1bShsaW5lc19hZGRlZCkgLSBzdW0obGluZXNfZGVsZXRlZCkgQVMgbnVtX2xpbmVzLAogICAgdW5pcUV4YWN0KGF1dGhvcikgQVMgbnVtX2F1dGhvcnMsCiAgICBudW1fbGluZXMgLyBudW1fYXV0aG9ycyBBUyBsaW5lc19hdXRob3JfcmF0aW8KRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKV0hFUkUgKHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpCkdST1VQIEJZIHBhdGgKT1JERVIgQlkgbGluZXNfYXV0aG9yX3JhdGlvIERFU0MKTElNSVQgMTA=) + ```sql WITH current_files AS ( @@ -1032,6 +1084,8 @@ LIMIT 10 There is some recency bias in this - newer files have fewer opportunities for commits. What about if we restrict to files at least 1 yr old? +[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgbWluKHRpbWUpIEFTIG1pbl9kYXRlLAogICAgcGF0aCwKICAgIHN1bShsaW5lc19hZGRlZCkgLSBzdW0obGluZXNfZGVsZXRlZCkgQVMgbnVtX2xpbmVzLAogICAgdW5pcUV4YWN0KGF1dGhvcikgQVMgbnVtX2F1dGhvcnMsCiAgICBudW1fbGluZXMgLyBudW1fYXV0aG9ycyBBUyBsaW5lc19hdXRob3JfcmF0aW8KRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKV0hFUkUgKHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpCkdST1VQIEJZIHBhdGgKSEFWSU5HIG1pbl9kYXRlIDw9IChub3coKSAtIHRvSW50ZXJ2YWxZZWFyKDEpKQpPUkRFUiBCWSBsaW5lc19hdXRob3JfcmF0aW8gREVTQwpMSU1JVCAxMA==) + ```sql WITH current_files AS ( @@ -1089,6 +1143,8 @@ LIMIT 10 We interpret this as the number of lines added and removed by the day of the week. 
In this case, we focus on the [Functions directory](https://github.com/ClickHouse/ClickHouse/tree/master/src/Functions) +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlPZldlZWssCiAgICB1bmlxKGNvbW1pdF9oYXNoKSBBUyBjb21taXRzLAogICAgc3VtKGxpbmVzX2FkZGVkKSBBUyBsaW5lc19hZGRlZCwKICAgIHN1bShsaW5lc19kZWxldGVkKSBBUyBsaW5lc19kZWxldGVkCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIHBhdGggTElLRSAnc3JjL0Z1bmN0aW9ucyUnCkdST1VQIEJZIHRvRGF5T2ZXZWVrKHRpbWUpIEFTIGRheU9mV2Vlaw==) + ```sql SELECT dayOfWeek, @@ -1114,6 +1170,8 @@ GROUP BY toDayOfWeek(time) AS dayOfWeek And by time of day, +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBob3VyT2ZEYXksCiAgICB1bmlxKGNvbW1pdF9oYXNoKSBBUyBjb21taXRzLAogICAgc3VtKGxpbmVzX2FkZGVkKSBBUyBsaW5lc19hZGRlZCwKICAgIHN1bShsaW5lc19kZWxldGVkKSBBUyBsaW5lc19kZWxldGVkCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCldIRVJFIHBhdGggTElLRSAnc3JjL0Z1bmN0aW9ucyUnCkdST1VQIEJZIHRvSG91cih0aW1lKSBBUyBob3VyT2ZEYXk=) + ```sql SELECT hourOfDay, @@ -1156,6 +1214,8 @@ GROUP BY toHour(time) AS hourOfDay This distribution makes sense given most of our development team is in Amsterdam. The `bar` functions helps us visualize these distributions: +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBob3VyT2ZEYXksCiAgICBiYXIoY29tbWl0cywgMCwgNDAwLCA1MCkgQVMgY29tbWl0cywKICAgIGJhcihsaW5lc19hZGRlZCwgMCwgMzAwMDAsIDUwKSBBUyBsaW5lc19hZGRlZCwKICAgIGJhcihsaW5lc19kZWxldGVkLCAwLCAxNTAwMCwgNTApIEFTIGxpbmVzX2RlbGV0ZWQKRlJPTQooCiAgICBTRUxFQ1QKICAgICAgICBob3VyT2ZEYXksCiAgICAgICAgdW5pcShjb21taXRfaGFzaCkgQVMgY29tbWl0cywKICAgICAgICBzdW0obGluZXNfYWRkZWQpIEFTIGxpbmVzX2FkZGVkLAogICAgICAgIHN1bShsaW5lc19kZWxldGVkKSBBUyBsaW5lc19kZWxldGVkCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgV0hFUkUgcGF0aCBMSUtFICdzcmMvRnVuY3Rpb25zJScKICAgIEdST1VQIEJZIHRvSG91cih0aW1lKSBBUyBob3VyT2ZEYXkKKQ==) + ```sql SELECT hourOfDay, @@ -1208,10 +1268,12 @@ FROM The `sign = -1` indicates a code deletion. We exclude punctuation and the insertion of empty lines. +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBwcmV2X2F1dGhvciB8fCAnKGEpJyBhcyBhZGRfYXV0aG9yLAogICAgYXV0aG9yICB8fCAnKGQpJyBhcyBkZWxldGVfYXV0aG9yLAogICAgY291bnQoKSBBUyBjCkZST00gZ2l0X2NsaWNraG91c2UubGluZV9jaGFuZ2VzCldIRVJFIChzaWduID0gLTEpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcpKSBBTkQgKGxpbmVfdHlwZSBOT1QgSU4gKCdQdW5jdCcsICdFbXB0eScpKSBBTkQgKGF1dGhvciAhPSBwcmV2X2F1dGhvcikgQU5EIChwcmV2X2F1dGhvciAhPSAnJykKR1JPVVAgQlkKICAgIHByZXZfYXV0aG9yLAogICAgYXV0aG9yCk9SREVSIEJZIGMgREVTQwpMSU1JVCAxIEJZIHByZXZfYXV0aG9yCkxJTUlUIDEwMA==) + ```sql SELECT - prev_author, - author, + prev_author || '(a)' as add_author, + author || '(d)' as delete_author, count() AS c FROM git.line_changes WHERE (sign = -1) AND (file_extension IN ('h', 'cpp')) AND (line_type NOT IN ('Punct', 'Empty')) AND (author != prev_author) AND (prev_author != '') @@ -1220,7 +1282,7 @@ GROUP BY author ORDER BY c DESC LIMIT 1 BY prev_author -LIMIT 20 +LIMIT 100 ┌─prev_author──────────┬─author───────────┬─────c─┐ │ Ivan │ Alexey Milovidov │ 18554 │ @@ -1248,10 +1310,22 @@ LIMIT 20 20 rows in set. Elapsed: 0.098 sec. Processed 7.54 million rows, 42.16 MB (76.67 million rows/s., 428.99 MB/s.) ``` +A Sankey chart (SuperSet) allows this to be visualized nicely. Note we increase our `LIMIT BY` to 3, to get the top 3 code removers for each author, to improve the variety in the visual. + + +![](./images/superset-authors-matrix.png) + + +Alexey clearly likes removing other peoples code. Lets exclude him for a more balanced view of code removal. 
+ +![](./images/superset-authors-matrix_v2.png) + ## Who is the highest percentage contributor per day of week? If we consider by just number of commits: +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlfb2Zfd2VlaywKICAgIGF1dGhvciwKICAgIGNvdW50KCkgQVMgYwpGUk9NIGdpdF9jbGlja2hvdXNlLmNvbW1pdHMKR1JPVVAgQlkKICAgIGRheU9mV2Vlayh0aW1lKSBBUyBkYXlfb2Zfd2VlaywKICAgIGF1dGhvcgpPUkRFUiBCWQogICAgZGF5X29mX3dlZWsgQVNDLAogICAgYyBERVNDCkxJTUlUIDEgQlkgZGF5X29mX3dlZWs=) + ```sql SELECT day_of_week, @@ -1281,6 +1355,7 @@ LIMIT 1 BY day_of_week OK, some possible advantages here to the longest contributor - our founder Alexey. Lets limit our analysis to the last year. +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlfb2Zfd2VlaywKICAgIGF1dGhvciwKICAgIGNvdW50KCkgQVMgYwpGUk9NIGdpdF9jbGlja2hvdXNlLmNvbW1pdHMKV0hFUkUgdGltZSA+IChub3coKSAtIHRvSW50ZXJ2YWxZZWFyKDEpKQpHUk9VUCBCWQogICAgZGF5T2ZXZWVrKHRpbWUpIEFTIGRheV9vZl93ZWVrLAogICAgYXV0aG9yCk9SREVSIEJZCiAgICBkYXlfb2Zfd2VlayBBU0MsCiAgICBjIERFU0MKTElNSVQgMSBCWSBkYXlfb2Zfd2Vlaw==) ```sql SELECT @@ -1314,6 +1389,8 @@ This is still a little simple and doesn't reflect people's work. A better metric might be who is the top contributor each day as a fraction of the total work performed in the last year. Note that we treat the deletion and adding code equally. +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0b3BfYXV0aG9yLmRheV9vZl93ZWVrLAogICAgdG9wX2F1dGhvci5hdXRob3IsCiAgICB0b3BfYXV0aG9yLmF1dGhvcl93b3JrIC8gYWxsX3dvcmsudG90YWxfd29yayBBUyB0b3BfYXV0aG9yX3BlcmNlbnQKRlJPTQooCiAgICBTRUxFQ1QKICAgICAgICBkYXlfb2Zfd2VlaywKICAgICAgICBhdXRob3IsCiAgICAgICAgc3VtKGxpbmVzX2FkZGVkKSArIHN1bShsaW5lc19kZWxldGVkKSBBUyBhdXRob3Jfd29yawogICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgIFdIRVJFIHRpbWUgPiAobm93KCkgLSB0b0ludGVydmFsWWVhcigxKSkKICAgIEdST1VQIEJZCiAgICAgICAgYXV0aG9yLAogICAgICAgIGRheU9mV2Vlayh0aW1lKSBBUyBkYXlfb2Zfd2VlawogICAgT1JERVIgQlkKICAgICAgICBkYXlfb2Zfd2VlayBBU0MsCiAgICAgICAgYXV0aG9yX3dvcmsgREVTQwogICAgTElNSVQgMSBCWSBkYXlfb2Zfd2VlawopIEFTIHRvcF9hdXRob3IKSU5ORVIgSk9JTgooCiAgICBTRUxFQ1QKICAgICAgICBkYXlfb2Zfd2VlaywKICAgICAgICBzdW0obGluZXNfYWRkZWQpICsgc3VtKGxpbmVzX2RlbGV0ZWQpIEFTIHRvdGFsX3dvcmsKICAgIEZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCiAgICBXSEVSRSB0aW1lID4gKG5vdygpIC0gdG9JbnRlcnZhbFllYXIoMSkpCiAgICBHUk9VUCBCWSBkYXlPZldlZWsodGltZSkgQVMgZGF5X29mX3dlZWsKKSBBUyBhbGxfd29yayBVU0lORyAoZGF5X29mX3dlZWsp) + ```sql SELECT top_author.day_of_week, @@ -1362,6 +1439,8 @@ INNER JOIN We limit the analysis to the current files. For brevity, we restrict the results to a depth of 2 with 5 files per root folder. Adjust as required. 
+[play](https://play.clickhouse.com/play?user=play#V0lUSCBjdXJyZW50X2ZpbGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUIHBhdGgKICAgICAgICBGUk9NCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIG9sZF9wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgbGFzdF90aW1lLAogICAgICAgICAgICAgICAgMiBBUyBjaGFuZ2VfdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBvbGRfcGF0aAogICAgICAgICAgICBVTklPTiBBTEwKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIGFyZ01heChjaGFuZ2VfdHlwZSwgdGltZSkgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgICkKICAgICAgICBHUk9VUCBCWSBwYXRoCiAgICAgICAgSEFWSU5HIChhcmdNYXgoY2hhbmdlX3R5cGUsIGxhc3RfdGltZSkgIT0gMikgQU5EIChOT1QgbWF0Y2gocGF0aCwgJyheZGJtcy8pfChebGlicy8pfChedGVzdHMvdGVzdGZsb3dzLyl8KF5wcm9ncmFtcy9zZXJ2ZXIvc3RvcmUvKScpKQogICAgICAgIE9SREVSIEJZIHBhdGggQVNDCiAgICApClNFTEVDVAogICAgY29uY2F0KHJvb3QsICcvJywgc3ViX2ZvbGRlcikgQVMgZm9sZGVyLAogICAgcm91bmQoYXZnKGRheXNfcHJlc2VudCkpIEFTIGF2Z19hZ2Vfb2ZfZmlsZXMsCiAgICBtaW4oZGF5c19wcmVzZW50KSBBUyBtaW5fYWdlX2ZpbGVzLAogICAgbWF4KGRheXNfcHJlc2VudCkgQVMgbWF4X2FnZV9maWxlcywKICAgIGNvdW50KCkgQVMgYwpGUk9NCigKICAgIFNFTEVDVAogICAgICAgIHBhdGgsCiAgICAgICAgZGF0ZURpZmYoJ2RheScsIG1pbih0aW1lKSwgdG9EYXRlKCcyMDIyLTExLTAzJykpIEFTIGRheXNfcHJlc2VudAogICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgIFdIRVJFIChwYXRoIElOIChjdXJyZW50X2ZpbGVzKSkgQU5EIChmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcpKQogICAgR1JPVVAgQlkgcGF0aAopCkdST1VQIEJZCiAgICBzcGxpdEJ5Q2hhcignLycsIHBhdGgpWzFdIEFTIHJvb3QsCiAgICBzcGxpdEJ5Q2hhcignLycsIHBhdGgpWzJdIEFTIHN1Yl9mb2xkZXIKT1JERVIgQlkKICAgIHJvb3QgQVNDLAogICAgYyBERVNDCkxJTUlUIDUgQlkgcm9vdAo=) + ```sql WITH current_files AS ( @@ -1443,6 +1522,8 @@ LIMIT 5 BY root For this question, we need the number of lines written by an author divided by the total number of lines they have had removed by another contributor. +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBrLAogICAgd3JpdHRlbl9jb2RlLmMsCiAgICByZW1vdmVkX2NvZGUuYywKICAgIHJlbW92ZWRfY29kZS5jIC8gd3JpdHRlbl9jb2RlLmMgQVMgcmVtb3ZlX3JhdGlvCkZST00KKAogICAgU0VMRUNUCiAgICAgICAgYXV0aG9yIEFTIGssCiAgICAgICAgY291bnQoKSBBUyBjCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgV0hFUkUgKHNpZ24gPSAxKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnKSkgQU5EIChsaW5lX3R5cGUgTk9UIElOICgnUHVuY3QnLCAnRW1wdHknKSkKICAgIEdST1VQIEJZIGsKKSBBUyB3cml0dGVuX2NvZGUKSU5ORVIgSk9JTgooCiAgICBTRUxFQ1QKICAgICAgICBwcmV2X2F1dGhvciBBUyBrLAogICAgICAgIGNvdW50KCkgQVMgYwogICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgIFdIRVJFIChzaWduID0gLTEpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcpKSBBTkQgKGxpbmVfdHlwZSBOT1QgSU4gKCdQdW5jdCcsICdFbXB0eScpKSBBTkQgKGF1dGhvciAhPSBwcmV2X2F1dGhvcikKICAgIEdST1VQIEJZIGsKKSBBUyByZW1vdmVkX2NvZGUgVVNJTkcgKGspCldIRVJFIHdyaXR0ZW5fY29kZS5jID4gMTAwMApPUkRFUiBCWSByZW1vdmVfcmF0aW8gREVTQwpMSU1JVCAxMAo=) + ```sql SELECT k, @@ -1489,10 +1570,63 @@ LIMIT 10 ## List files that were rewritten most number of times? -We consider a rewrite to be when over 50% of the file are deleted, and 50% added. Adjust the query to your own interpretation of what constitutes this. 
+ +The simplest approach to this question might be to simply count the most number of line modifications per path (restricted to current files) e.g.: + +```sql +WITH current_files AS + ( + SELECT path + FROM + ( + SELECT + old_path AS path, + max(time) AS last_time, + 2 AS change_type + FROM git.file_changes + GROUP BY old_path + UNION ALL + SELECT + path, + max(time) AS last_time, + argMax(change_type, time) AS change_type + FROM git.file_changes + GROUP BY path + ) + GROUP BY path + HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)')) + ORDER BY path ASC + ) +SELECT + path, + count() AS c +FROM git.line_changes +WHERE (file_extension IN ('h', 'cpp', 'sql')) AND (path IN (current_files)) +GROUP BY path +ORDER BY c DESC +LIMIT 10 + +┌─path───────────────────────────────────────────────────┬─────c─┐ +│ src/Storages/StorageReplicatedMergeTree.cpp │ 21871 │ +│ src/Storages/MergeTree/MergeTreeData.cpp │ 17709 │ +│ programs/client/Client.cpp │ 15882 │ +│ src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp │ 14249 │ +│ src/Interpreters/InterpreterSelectQuery.cpp │ 12636 │ +│ src/Parsers/ExpressionListParsers.cpp │ 11794 │ +│ src/Analyzer/QueryAnalysisPass.cpp │ 11760 │ +│ src/Coordination/KeeperStorage.cpp │ 10225 │ +│ src/Functions/FunctionsConversion.h │ 9247 │ +│ src/Parsers/ExpressionElementParsers.cpp │ 8197 │ +└────────────────────────────────────────────────────────┴───────┘ + +10 rows in set. Elapsed: 0.160 sec. Processed 8.07 million rows, 98.99 MB (50.49 million rows/s., 619.49 MB/s.) +``` + +This doesn't capture the notion of a "re-write" however, where a large portion of the file changes in any commit. This requires a more complex query. If we consider a rewrite to be when over 50% of the file are deleted, and 50% added. You can adjust the query to your own interpretation of what constitutes this. The query is limited to the current files only. We list all file changes by grouping by `path` and `commit_hash`, returning the number of lines added and removed. Using a window function, we estimate the file's total size at any moment in time by performing a cumulative sum and estimating the impact of any change on file size as `lines added - lines removed`. Using this statistic, we can calculate the percentage of the file that has been added or removed for each change. Finally, we count the number of file changes that constitute a rewrite per file i.e. `(percent_add >= 0.5) AND (percent_delete >= 0.5) AND current_size > 50`. Note we require files to be more than 50 lines to avoid early contributions to a file being counted as a rewrite. This also avoids a bias to very small files, which may be more likely to be rewritten. 
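The moving part here is the running file-size estimate. Before the full query, the window function can be checked in isolation; a simplified sketch, restricted to a single file (the same path used later in this guide) for readability:

```sql
-- the cumulative sum of (lines added - lines deleted) per commit approximates the file size over time
SELECT
    path,
    max_time,
    num_added,
    num_deleted,
    sum(num_added - num_deleted) OVER (PARTITION BY path ORDER BY max_time ASC) AS current_size
FROM
(
    SELECT
        path,
        commit_hash,
        max(time) AS max_time,
        any(lines_added) AS num_added,
        any(lines_deleted) AS num_deleted
    FROM git.file_changes
    WHERE path = 'src/Storages/StorageReplicatedMergeTree.cpp'
    GROUP BY path, commit_hash
)
ORDER BY max_time ASC
LIMIT 10
```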
+[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY3VycmVudF9maWxlcyBBUwogICAgKAogICAgICAgIFNFTEVDVCBwYXRoCiAgICAgICAgRlJPTQogICAgICAgICgKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBvbGRfcGF0aCBBUyBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIDIgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgb2xkX3BhdGgKICAgICAgICAgICAgVU5JT04gQUxMCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIG1heCh0aW1lKSBBUyBsYXN0X3RpbWUsCiAgICAgICAgICAgICAgICBhcmdNYXgoY2hhbmdlX3R5cGUsIHRpbWUpIEFTIGNoYW5nZV90eXBlCiAgICAgICAgICAgIEZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCiAgICAgICAgICAgIEdST1VQIEJZIHBhdGgKICAgICAgICApCiAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgIEhBVklORyAoYXJnTWF4KGNoYW5nZV90eXBlLCBsYXN0X3RpbWUpICE9IDIpIEFORCAoTk9UIG1hdGNoKHBhdGgsICcoXmRibXMvKXwoXmxpYnMvKXwoXnRlc3RzL3Rlc3RmbG93cy8pfChecHJvZ3JhbXMvc2VydmVyL3N0b3JlLyknKSkKICAgICAgICBPUkRFUiBCWSBwYXRoIEFTQwogICAgKSwKICAgIGNoYW5nZXMgQVMKICAgICgKICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgbWF4KHRpbWUpIEFTIG1heF90aW1lLAogICAgICAgICAgICBjb21taXRfaGFzaCwKICAgICAgICAgICAgYW55KGxpbmVzX2FkZGVkKSBBUyBudW1fYWRkZWQsCiAgICAgICAgICAgIGFueShsaW5lc19kZWxldGVkKSBBUyBudW1fZGVsZXRlZCwKICAgICAgICAgICAgYW55KGNoYW5nZV90eXBlKSBBUyB0eXBlCiAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICBXSEVSRSAoY2hhbmdlX3R5cGUgSU4gKCdBZGQnLCAnTW9kaWZ5JykpIEFORCAocGF0aCBJTiAoY3VycmVudF9maWxlcykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKICAgICAgICBHUk9VUCBCWQogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBjb21taXRfaGFzaAogICAgICAgIE9SREVSIEJZCiAgICAgICAgICAgIHBhdGggQVNDLAogICAgICAgICAgICBtYXhfdGltZSBBU0MKICAgICksCiAgICByZXdyaXRlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBjb21taXRfaGFzaCwKICAgICAgICAgICAgbWF4X3RpbWUsCiAgICAgICAgICAgIHR5cGUsCiAgICAgICAgICAgIG51bV9hZGRlZCwKICAgICAgICAgICAgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgIHN1bShudW1fYWRkZWQgLSBudW1fZGVsZXRlZCkgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDKSBBUyBjdXJyZW50X3NpemUsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9hZGRlZCAvIGN1cnJlbnRfc2l6ZSwgMCkgQVMgcGVyY2VudF9hZGQsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9kZWxldGVkIC8gY3VycmVudF9zaXplLCAwKSBBUyBwZXJjZW50X2RlbGV0ZQogICAgICAgIEZST00gY2hhbmdlcwogICAgKQpTRUxFQ1QKICAgIHBhdGgsCiAgICBjb3VudCgpIEFTIG51bV9yZXdyaXRlcwpGUk9NIHJld3JpdGVzCldIRVJFICh0eXBlID0gJ01vZGlmeScpIEFORCAocGVyY2VudF9hZGQgPj0gMC41KSBBTkQgKHBlcmNlbnRfZGVsZXRlID49IDAuNSkgQU5EIChjdXJyZW50X3NpemUgPiA1MCkKR1JPVVAgQlkgcGF0aApPUkRFUiBCWSBudW1fcmV3cml0ZXMgREVTQwpMSU1JVCAxMA==) ```sql WITH @@ -1519,7 +1653,7 @@ WITH HAVING (argMax(change_type, last_time) != 2) AND (NOT match(path, '(^dbms/)|(^libs/)|(^tests/testflows/)|(^programs/server/store/)')) ORDER BY path ASC ), - file_changes AS + changes AS ( SELECT path, @@ -1549,7 +1683,7 @@ WITH sum(num_added - num_deleted) OVER (PARTITION BY path ORDER BY max_time ASC) AS current_size, if(current_size > 0, num_added / current_size, 0) AS percent_add, if(current_size > 0, num_deleted / current_size, 0) AS percent_delete - FROM file_changes + FROM changes ) SELECT path, @@ -1584,6 +1718,8 @@ We query for lines added, joining this with the lines removed - filtering to cas Finally, we aggregate across this dataset to compute the average number of days lines stay in the repository by the day of the week. 
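Since added and removed lines are paired purely on `(path, line)`, the pairing can also be sketched with conditional aggregates (`maxIf`) rather than the self-join used below; a small spot check for a single file, using a path that appears elsewhere in this guide:

```sql
-- lines that were never removed keep the default DateTime value and are filtered out by the HAVING clause
SELECT
    line,
    maxIf(time, sign = 1) AS last_added,
    maxIf(time, sign = -1) AS last_removed,
    dateDiff('day', last_added, last_removed) AS days_present
FROM git.line_changes
WHERE (path = 'src/Storages/StorageReplicatedMergeTree.cpp') AND (line_type NOT IN ('Punct', 'Empty'))
GROUP BY line
HAVING last_removed > last_added
ORDER BY days_present DESC
LIMIT 10
```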
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBkYXlfb2Zfd2Vla19hZGRlZCwKICAgIGNvdW50KCkgQVMgbnVtLAogICAgYXZnKGRheXNfcHJlc2VudCkgQVMgYXZnX2RheXNfcHJlc2VudApGUk9NCigKICAgIFNFTEVDVAogICAgICAgIGFkZGVkX2NvZGUubGluZSwKICAgICAgICBhZGRlZF9jb2RlLnRpbWUgQVMgYWRkZWRfZGF5LAogICAgICAgIGRhdGVEaWZmKCdkYXknLCBhZGRlZF9jb2RlLnRpbWUsIHJlbW92ZWRfY29kZS50aW1lKSBBUyBkYXlzX3ByZXNlbnQKICAgIEZST00KICAgICgKICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgbGluZSwKICAgICAgICAgICAgbWF4KHRpbWUpIEFTIHRpbWUKICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgICAgIFdIRVJFIChzaWduID0gMSkgQU5EIChsaW5lX3R5cGUgTk9UIElOICgnUHVuY3QnLCAnRW1wdHknKSkKICAgICAgICBHUk9VUCBCWQogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBsaW5lCiAgICApIEFTIGFkZGVkX2NvZGUKICAgIElOTkVSIEpPSU4KICAgICgKICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgbGluZSwKICAgICAgICAgICAgbWF4KHRpbWUpIEFTIHRpbWUKICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgICAgIFdIRVJFIChzaWduID0gLTEpIEFORCAobGluZV90eXBlIE5PVCBJTiAoJ1B1bmN0JywgJ0VtcHR5JykpCiAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgbGluZQogICAgKSBBUyByZW1vdmVkX2NvZGUgVVNJTkcgKHBhdGgsIGxpbmUpCiAgICBXSEVSRSByZW1vdmVkX2NvZGUudGltZSA+IGFkZGVkX2NvZGUudGltZQopCkdST1VQIEJZIGRheU9mV2VlayhhZGRlZF9kYXkpIEFTIGRheV9vZl93ZWVrX2FkZGVk) + ```sql SELECT day_of_week_added, @@ -1641,6 +1777,8 @@ GROUP BY dayOfWeek(added_day) AS day_of_week_added This query uses the same principle as [What weekday does the code have the highest chance to stay in the repository](#what-weekday-does-the-code-have-the-highest-chance-to-stay-in-the-repository) - by aiming to uniquely identify a line of code using the path and line contents. This allows us to identify the time between when a line was added and removed. We filter to current files and code only, however, and average the time for each file across lines. 
+[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY3VycmVudF9maWxlcyBBUwogICAgKAogICAgICAgIFNFTEVDVCBwYXRoCiAgICAgICAgRlJPTQogICAgICAgICgKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBvbGRfcGF0aCBBUyBwYXRoLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIGxhc3RfdGltZSwKICAgICAgICAgICAgICAgIDIgQVMgY2hhbmdlX3R5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgR1JPVVAgQlkgb2xkX3BhdGgKICAgICAgICAgICAgVU5JT04gQUxMCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIG1heCh0aW1lKSBBUyBsYXN0X3RpbWUsCiAgICAgICAgICAgICAgICBhcmdNYXgoY2hhbmdlX3R5cGUsIHRpbWUpIEFTIGNoYW5nZV90eXBlCiAgICAgICAgICAgIEZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCiAgICAgICAgICAgIEdST1VQIEJZIHBhdGgKICAgICAgICApCiAgICAgICAgR1JPVVAgQlkgcGF0aAogICAgICAgIEhBVklORyAoYXJnTWF4KGNoYW5nZV90eXBlLCBsYXN0X3RpbWUpICE9IDIpIEFORCAoTk9UIG1hdGNoKHBhdGgsICcoXmRibXMvKXwoXmxpYnMvKXwoXnRlc3RzL3Rlc3RmbG93cy8pfChecHJvZ3JhbXMvc2VydmVyL3N0b3JlLyknKSkKICAgICAgICBPUkRFUiBCWSBwYXRoIEFTQwogICAgKSwKICAgIGxpbmVzX3JlbW92ZWQgQVMKICAgICgKICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgYWRkZWRfY29kZS5wYXRoIEFTIHBhdGgsCiAgICAgICAgICAgIGFkZGVkX2NvZGUubGluZSwKICAgICAgICAgICAgYWRkZWRfY29kZS50aW1lIEFTIGFkZGVkX2RheSwKICAgICAgICAgICAgZGF0ZURpZmYoJ2RheScsIGFkZGVkX2NvZGUudGltZSwgcmVtb3ZlZF9jb2RlLnRpbWUpIEFTIGRheXNfcHJlc2VudAogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIGxpbmUsCiAgICAgICAgICAgICAgICBtYXgodGltZSkgQVMgdGltZSwKICAgICAgICAgICAgICAgIGFueShmaWxlX2V4dGVuc2lvbikgQVMgZmlsZV9leHRlbnNpb24KICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgKHNpZ24gPSAxKSBBTkQgKGxpbmVfdHlwZSBOT1QgSU4gKCdQdW5jdCcsICdFbXB0eScpKQogICAgICAgICAgICBHUk9VUCBCWQogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIGxpbmUKICAgICAgICApIEFTIGFkZGVkX2NvZGUKICAgICAgICBJTk5FUiBKT0lOCiAgICAgICAgKAogICAgICAgICAgICBTRUxFQ1QKICAgICAgICAgICAgICAgIHBhdGgsCiAgICAgICAgICAgICAgICBsaW5lLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIHRpbWUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgKHNpZ24gPSAtMSkgQU5EIChsaW5lX3R5cGUgTk9UIElOICgnUHVuY3QnLCAnRW1wdHknKSkKICAgICAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgICAgIHBhdGgsCiAgICAgICAgICAgICAgICBsaW5lCiAgICAgICAgKSBBUyByZW1vdmVkX2NvZGUgVVNJTkcgKHBhdGgsIGxpbmUpCiAgICAgICAgV0hFUkUgKHJlbW92ZWRfY29kZS50aW1lID4gYWRkZWRfY29kZS50aW1lKSBBTkQgKHBhdGggSU4gKGN1cnJlbnRfZmlsZXMpKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpCiAgICApClNFTEVDVAogICAgcGF0aCwKICAgIGF2ZyhkYXlzX3ByZXNlbnQpIEFTIGF2Z19jb2RlX2FnZQpGUk9NIGxpbmVzX3JlbW92ZWQKR1JPVVAgQlkgcGF0aApPUkRFUiBCWSBhdmdfY29kZV9hZ2UgREVTQwpMSU1JVCAxMA==) + ```sql WITH current_files AS @@ -1730,10 +1868,12 @@ There are a few ways we can address this question. Focusing on the code to test Note we limit to users with more than 20 changes to focus on regular committers and avoid a bias to one-off contributions. 
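The classification itself is nothing more than `countIf` plus a `LIKE` on the path, so it can be spot-checked for a single author before looking at the full ranking below (the author name is taken from the results in this guide):

```sql
-- quick spot check of the test vs. non-test split for one author
SELECT
    countIf(path LIKE '%tests%') AS test_files,
    countIf(path NOT LIKE '%tests%') AS other_files
FROM git.file_changes
WHERE author = 'Alexey Milovidov'
```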
+[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICBjb3VudElmKChmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcsICdzaCcsICdweScsICdleHBlY3QnKSkgQU5EIChwYXRoIExJS0UgJyV0ZXN0cyUnKSkgQVMgdGVzdCwKICAgIGNvdW50SWYoKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpIEFORCAoTk9UIChwYXRoIExJS0UgJyV0ZXN0cyUnKSkpIEFTIGNvZGUsCiAgICBjb2RlIC8gKGNvZGUgKyB0ZXN0KSBBUyByYXRpb19jb2RlCkZST00gZ2l0X2NsaWNraG91c2UuZmlsZV9jaGFuZ2VzCkdST1VQIEJZIGF1dGhvcgpIQVZJTkcgY29kZSA+IDIwCk9SREVSIEJZIGNvZGUgREVTQwpMSU1JVCAyMA==) + ```sql SELECT author, - countIf((file_extension NOT IN ('h', 'cpp')) AND (path LIKE '%tests%')) AS test, + countIf((file_extension IN ('h', 'cpp', 'sql', 'sh', 'py', 'expect')) AND (path LIKE '%tests%')) AS test, countIf((file_extension IN ('h', 'cpp', 'sql')) AND (NOT (path LIKE '%tests%'))) AS code, code / (code + test) AS ratio_code FROM git.file_changes @@ -1743,26 +1883,26 @@ ORDER BY code DESC LIMIT 20 ┌─author───────────────┬─test─┬──code─┬─────────ratio_code─┐ -│ Alexey Milovidov │ 9016 │ 41799 │ 0.8225720751746531 │ -│ Nikolai Kochetov │ 1376 │ 13361 │ 0.9066295718260161 │ -│ alesapin │ 3704 │ 8796 │ 0.70368 │ -│ kssenii │ 1257 │ 6769 │ 0.843384001993521 │ -│ Maksim Kita │ 1352 │ 5862 │ 0.8125866370945384 │ -│ Alexander Tokmakov │ 2011 │ 5727 │ 0.740113724476609 │ -│ Vitaly Baranov │ 2245 │ 5521 │ 0.7109193922225083 │ -│ Ivan Lezhankin │ 803 │ 4698 │ 0.8540265406289765 │ -│ Anton Popov │ 1056 │ 4346 │ 0.8045168456127361 │ -│ Ivan │ 4937 │ 4269 │ 0.4637193134912014 │ -│ Azat Khuzhin │ 2651 │ 3697 │ 0.5823881537492124 │ -│ Amos Bird │ 702 │ 2901 │ 0.8051623646960866 │ -│ proller │ 1662 │ 2377 │ 0.5885120079227532 │ -│ chertus │ 706 │ 2359 │ 0.7696574225122349 │ -│ alexey-milovidov │ 330 │ 2321 │ 0.8755186721991701 │ -│ Alexey Arno │ 207 │ 2310 │ 0.9177592371871275 │ -│ Vitaliy Lyudvichenko │ 479 │ 2283 │ 0.8265749456915279 │ -│ Robert Schulze │ 328 │ 2196 │ 0.8700475435816165 │ -│ CurtizJ │ 812 │ 2158 │ 0.7265993265993266 │ -│ Alexander Kuzmenkov │ 1198 │ 2092 │ 0.6358662613981763 │ +│ Alexey Milovidov │ 6617 │ 41799 │ 0.8633303040317251 │ +│ Nikolai Kochetov │ 916 │ 13361 │ 0.9358408629263851 │ +│ alesapin │ 2408 │ 8796 │ 0.785076758300607 │ +│ kssenii │ 869 │ 6769 │ 0.8862267609321812 │ +│ Maksim Kita │ 799 │ 5862 │ 0.8800480408347096 │ +│ Alexander Tokmakov │ 1472 │ 5727 │ 0.7955271565495208 │ +│ Vitaly Baranov │ 1764 │ 5521 │ 0.7578586135895676 │ +│ Ivan Lezhankin │ 843 │ 4698 │ 0.8478613968597726 │ +│ Anton Popov │ 599 │ 4346 │ 0.8788675429726996 │ +│ Ivan │ 2630 │ 4269 │ 0.6187853312074214 │ +│ Azat Khuzhin │ 1664 │ 3697 │ 0.689610147360567 │ +│ Amos Bird │ 400 │ 2901 │ 0.8788245986064829 │ +│ proller │ 1207 │ 2377 │ 0.6632254464285714 │ +│ chertus │ 453 │ 2359 │ 0.8389046941678521 │ +│ alexey-milovidov │ 303 │ 2321 │ 0.8845274390243902 │ +│ Alexey Arno │ 169 │ 2310 │ 0.9318273497377975 │ +│ Vitaliy Lyudvichenko │ 334 │ 2283 │ 0.8723729461215132 │ +│ Robert Schulze │ 182 │ 2196 │ 0.9234650967199327 │ +│ CurtizJ │ 460 │ 2158 │ 0.8242933537051184 │ +│ Alexander Kuzmenkov │ 298 │ 2092 │ 0.8753138075313808 │ └──────────────────────┴──────┴───────┴────────────────────┘ 20 rows in set. Elapsed: 0.034 sec. Processed 266.05 thousand rows, 4.65 MB (7.93 million rows/s., 138.76 MB/s.) @@ -1770,6 +1910,8 @@ LIMIT 20 We can plot this distribution as a histogram. 
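`histogram(N)` returns an array of `(lower, upper, height)` tuples and `bar` renders a value as a unicode bar; a tiny illustration on synthetic data before applying the same pattern to the code ratios:

```sql
-- histogram() is a parametric aggregate function; bar(value, min, max, width) draws each bucket height
WITH (
        SELECT histogram(5)(number % 10)
        FROM numbers(1000)
    ) AS hist
SELECT
    arrayJoin(hist).1 AS lower,
    arrayJoin(hist).2 AS upper,
    bar(arrayJoin(hist).3, 0, 300, 20) AS bar
```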
+[play](https://play.clickhouse.com/play?user=play#V0lUSCAoCiAgICAgICAgU0VMRUNUIGhpc3RvZ3JhbSgxMCkocmF0aW9fY29kZSkgQVMgaGlzdAogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgYXV0aG9yLAogICAgICAgICAgICAgICAgY291bnRJZigoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnLCAnc2gnLCAncHknLCAnZXhwZWN0JykpIEFORCAocGF0aCBMSUtFICcldGVzdHMlJykpIEFTIHRlc3QsCiAgICAgICAgICAgICAgICBjb3VudElmKChmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcpKSBBTkQgKE5PVCAocGF0aCBMSUtFICcldGVzdHMlJykpKSBBUyBjb2RlLAogICAgICAgICAgICAgICAgY29kZSAvIChjb2RlICsgdGVzdCkgQVMgcmF0aW9fY29kZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwogICAgICAgICAgICBHUk9VUCBCWSBhdXRob3IKICAgICAgICAgICAgSEFWSU5HIGNvZGUgPiAyMAogICAgICAgICAgICBPUkRFUiBCWSBjb2RlIERFU0MKICAgICAgICAgICAgTElNSVQgMjAKICAgICAgICApCiAgICApIEFTIGhpc3QKU0VMRUNUCiAgICBhcnJheUpvaW4oaGlzdCkuMSBBUyBsb3dlciwKICAgIGFycmF5Sm9pbihoaXN0KS4yIEFTIHVwcGVyLAogICAgYmFyKGFycmF5Sm9pbihoaXN0KS4zLCAwLCAxMDAsIDUwMCkgQVMgYmFy) + ```sql WITH ( SELECT histogram(10)(ratio_code) AS hist @@ -1777,74 +1919,86 @@ WITH ( ( SELECT author, - countIf((file_extension NOT IN ('h', 'cpp')) AND (path LIKE '%tests%')) AS test, + countIf((file_extension IN ('h', 'cpp', 'sql', 'sh', 'py', 'expect')) AND (path LIKE '%tests%')) AS test, countIf((file_extension IN ('h', 'cpp', 'sql')) AND (NOT (path LIKE '%tests%'))) AS code, code / (code + test) AS ratio_code FROM git.file_changes GROUP BY author HAVING code > 20 + ORDER BY code DESC + LIMIT 20 ) ) AS hist SELECT arrayJoin(hist).1 AS lower, arrayJoin(hist).2 AS upper, - bar(arrayJoin(hist).3, 0, 100, 20) AS bar + bar(arrayJoin(hist).3, 0, 100, 500) AS bar -┌────────────────lower─┬───────────────upper─┬─bar───────────┐ -│ 0.033562166285278416 │ 0.08337307389808846 │ ▏ │ -│ 0.08337307389808846 │ 0.17470067710547066 │ ▍ │ -│ 0.17470067710547066 │ 0.25909878535992237 │ ▍ │ -│ 0.25909878535992237 │ 0.3775444108257119 │ ▋ │ -│ 0.3775444108257119 │ 0.5108436376911997 │ ███▏ │ -│ 0.5108436376911997 │ 0.627700343453621 │ █████▋ │ -│ 0.627700343453621 │ 0.7417374581723406 │ ███████████▊ │ -│ 0.7417374581723406 │ 0.8467725898688147 │ ████████████▏ │ -│ 0.8467725898688147 │ 0.9427852671078976 │ ██████████▌ │ -│ 0.9427852671078976 │ 1 │ █████████▊ │ -└──────────────────────┴─────────────────────┴───────────────┘ - -10 rows in set. Elapsed: 0.053 sec. Processed 266.05 thousand rows, 4.65 MB (5.01 million rows/s., 87.61 MB/s.) +┌──────────────lower─┬──────────────upper─┬─bar───────────────────────────┐ +│ 0.6187853312074214 │ 0.6410053888179964 │ █████ │ +│ 0.6410053888179964 │ 0.6764177968945693 │ █████ │ +│ 0.6764177968945693 │ 0.7237343804750673 │ █████ │ +│ 0.7237343804750673 │ 0.7740802855073157 │ █████▋ │ +│ 0.7740802855073157 │ 0.807297655565091 │ ████████▋ │ +│ 0.807297655565091 │ 0.8338381996094653 │ ██████▎ │ +│ 0.8338381996094653 │ 0.8533566747727687 │ ████████▋ │ +│ 0.8533566747727687 │ 0.871392376017531 │ █████████▍ │ +│ 0.871392376017531 │ 0.904916108899021 │ ████████████████████████████▋ │ +│ 0.904916108899021 │ 0.9358408629263851 │ █████████████████▌ │ +└────────────────────┴────────────────────┴───────────────────────────────┘ +10 rows in set. Elapsed: 0.051 sec. Processed 266.05 thousand rows, 4.65 MB (5.24 million rows/s., 91.64 MB/s.) ``` Most contributors write more code than tests, as you'd expect. -What about who adds the most comments when contributing code? +What about who adds the most comments when contributing code? 
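The query below leans on the `line_type` column, which classifies every changed line as `Code`, `Comment`, `Punct` or `Empty`; the overall split is worth a quick look first:

```sql
-- distribution of line classifications across all line changes
SELECT
    line_type,
    count() AS c
FROM git.line_changes
GROUP BY line_type
ORDER BY c DESC
```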
+ +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBhdXRob3IsCiAgICBhdmcocmF0aW9fY29tbWVudHMpIEFTIGF2Z19yYXRpb19jb21tZW50cywKICAgIHN1bShjb2RlKSBBUyBjb2RlCkZST00KKAogICAgU0VMRUNUCiAgICAgICAgYXV0aG9yLAogICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgIGNvdW50SWYobGluZV90eXBlID0gJ0NvbW1lbnQnKSBBUyBjb21tZW50cywKICAgICAgICBjb3VudElmKGxpbmVfdHlwZSA9ICdDb2RlJykgQVMgY29kZSwKICAgICAgICBpZihjb21tZW50cyA+IDAsIGNvbW1lbnRzIC8gKGNvbW1lbnRzICsgY29kZSksIDApIEFTIHJhdGlvX2NvbW1lbnRzCiAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgR1JPVVAgQlkKICAgICAgICBhdXRob3IsCiAgICAgICAgY29tbWl0X2hhc2gKKQpHUk9VUCBCWSBhdXRob3IKT1JERVIgQlkgY29kZSBERVNDCkxJTUlUIDEwCg==) ```sql SELECT author, - countIf((line_type = 'Comment') OR (line_type = 'Punct')) AS comments, - countIf(line_type = 'Code') AS code, - comments / (comments + code) AS ratio_comments -FROM git.line_changes -WHERE (file_extension IN ('h', 'cpp', 'sql')) AND (sign = 1) + avg(ratio_comments) AS avg_ratio_comments, + sum(code) AS code +FROM +( + SELECT + author, + commit_hash, + countIf(line_type = 'Comment') AS comments, + countIf(line_type = 'Code') AS code, + if(comments > 0, comments / (comments + code), 0) AS ratio_comments + FROM git.line_changes + GROUP BY + author, + commit_hash +) GROUP BY author -HAVING code > 20 ORDER BY code DESC LIMIT 10 - -┌─author─────────────┬─comments─┬───code─┬──────ratio_comments─┐ -│ Alexey Milovidov │ 100869 │ 356978 │ 0.22031158880586746 │ -│ Nikolai Kochetov │ 34057 │ 113261 │ 0.2311801680717903 │ -│ Vitaly Baranov │ 24994 │ 84504 │ 0.22825987689272864 │ -│ Maksim Kita │ 23338 │ 78778 │ 0.22854400877433506 │ -│ alesapin │ 21976 │ 72279 │ 0.23315473980160203 │ -│ kssenii │ 19465 │ 61852 │ 0.23937184106644366 │ -│ Alexey Arno │ 16469 │ 61674 │ 0.21075464213045314 │ -│ Alexander Tokmakov │ 10760 │ 41964 │ 0.20408163265306123 │ -│ Anton Popov │ 9158 │ 38448 │ 0.1923707095744234 │ -│ Ivan │ 6480 │ 33711 │ 0.161230126147645 │ -└────────────────────┴──────────┴────────┴─────────────────────┘ - -10 rows in set. Elapsed: 0.136 sec. Processed 7.54 million rows, 31.57 MB (55.33 million rows/s., 231.83 MB/s.) +┌─author─────────────┬──avg_ratio_comments─┬────code─┐ +│ Alexey Milovidov │ 0.1034915408309902 │ 1147196 │ +│ s-kat │ 0.1361718900215362 │ 614224 │ +│ Nikolai Kochetov │ 0.08722993407690126 │ 218328 │ +│ alesapin │ 0.1040477684726504 │ 198082 │ +│ Vitaly Baranov │ 0.06446875712939285 │ 161801 │ +│ Maksim Kita │ 0.06863376297549255 │ 156381 │ +│ Alexey Arno │ 0.11252677608033655 │ 146642 │ +│ Vitaliy Zakaznikov │ 0.06199215397180561 │ 138530 │ +│ kssenii │ 0.07455322590796751 │ 131143 │ +│ Artur │ 0.12383737231074826 │ 121484 │ +└────────────────────┴─────────────────────┴─────────┘ +10 rows in set. Elapsed: 0.290 sec. Processed 7.54 million rows, 394.57 MB (26.00 million rows/s., 1.36 GB/s.) ``` -Surprisingly high % for all our contributors and part of what makes our code so readable. +Note we sort by code contributions. Surprisingly high % for all our largest contributors and part of what makes our code so readable. ## How does an authors commits change over time with respect to code/comments percentage? 
To compute this by author is trivial, +[play](#U0VMRUNUCiAgICBhdXRob3IsCiAgICBjb3VudElmKGxpbmVfdHlwZSA9ICdDb2RlJykgQVMgY29kZV9saW5lcywKICAgIGNvdW50SWYoKGxpbmVfdHlwZSA9ICdDb21tZW50JykgT1IgKGxpbmVfdHlwZSA9ICdQdW5jdCcpKSBBUyBjb21tZW50cywKICAgIGNvZGVfbGluZXMgLyAoY29tbWVudHMgKyBjb2RlX2xpbmVzKSBBUyByYXRpb19jb2RlLAogICAgdG9TdGFydE9mV2Vlayh0aW1lKSBBUyB3ZWVrCkZST00gZ2l0X2NsaWNraG91c2UubGluZV9jaGFuZ2VzCkdST1VQIEJZCiAgICB0aW1lLAogICAgYXV0aG9yCk9SREVSIEJZCiAgICBhdXRob3IgQVNDLAogICAgdGltZSBBU0MKTElNSVQgMTA=) + ```sql SELECT author, @@ -1883,6 +2037,8 @@ To compute this, we first work out each author's comments ratio over time - simi After calculating the average by-week offset across all authors, we sample these results by selecting every 10th week. +[play](https://play.clickhouse.com/play?user=play#V0lUSCBhdXRob3JfcmF0aW9zX2J5X29mZnNldCBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBhdXRob3IsCiAgICAgICAgICAgIGRhdGVEaWZmKCd3ZWVrJywgc3RhcnRfZGF0ZXMuc3RhcnRfZGF0ZSwgY29udHJpYnV0aW9ucy53ZWVrKSBBUyB3ZWVrX29mZnNldCwKICAgICAgICAgICAgcmF0aW9fY29kZQogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgYXV0aG9yLAogICAgICAgICAgICAgICAgdG9TdGFydE9mV2VlayhtaW4odGltZSkpIEFTIHN0YXJ0X2RhdGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKQogICAgICAgICAgICBHUk9VUCBCWSBhdXRob3IgQVMgc3RhcnRfZGF0ZXMKICAgICAgICApIEFTIHN0YXJ0X2RhdGVzCiAgICAgICAgSU5ORVIgSk9JTgogICAgICAgICgKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBhdXRob3IsCiAgICAgICAgICAgICAgICBjb3VudElmKGxpbmVfdHlwZSA9ICdDb2RlJykgQVMgY29kZSwKICAgICAgICAgICAgICAgIGNvdW50SWYoKGxpbmVfdHlwZSA9ICdDb21tZW50JykgT1IgKGxpbmVfdHlwZSA9ICdQdW5jdCcpKSBBUyBjb21tZW50cywKICAgICAgICAgICAgICAgIGNvbW1lbnRzIC8gKGNvbW1lbnRzICsgY29kZSkgQVMgcmF0aW9fY29kZSwKICAgICAgICAgICAgICAgIHRvU3RhcnRPZldlZWsodGltZSkgQVMgd2VlawogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgICAgICAgICBXSEVSRSAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkgQU5EIChzaWduID0gMSkKICAgICAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgICAgIHRpbWUsCiAgICAgICAgICAgICAgICBhdXRob3IKICAgICAgICAgICAgSEFWSU5HIGNvZGUgPiAyMAogICAgICAgICAgICBPUkRFUiBCWQogICAgICAgICAgICAgICAgYXV0aG9yIEFTQywKICAgICAgICAgICAgICAgIHRpbWUgQVNDCiAgICAgICAgKSBBUyBjb250cmlidXRpb25zIFVTSU5HIChhdXRob3IpCiAgICApClNFTEVDVAogICAgd2Vla19vZmZzZXQsCiAgICBhdmcocmF0aW9fY29kZSkgQVMgYXZnX2NvZGVfcmF0aW8KRlJPTSBhdXRob3JfcmF0aW9zX2J5X29mZnNldApHUk9VUCBCWSB3ZWVrX29mZnNldApIQVZJTkcgKHdlZWtfb2Zmc2V0ICUgMTApID0gMApPUkRFUiBCWSB3ZWVrX29mZnNldCBBU0MKTElNSVQgMjAK) + ```sql WITH author_ratios_by_offset AS ( @@ -1959,8 +2115,9 @@ Encouragingly, our comment % is pretty constant and doesn't degrade the longer a We can use the same principle as [List files that were rewritten most number of time or by most of authors](#list-files-that-were-rewritten-most-number-of-time-or-by-most-of-authors) to identify rewrites but consider all files. A window function is used to compute the time between rewrites for each file. From this, we can calculate an average and median across all files. 
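The "previous rewrite" lookup is done with a one-row window frame: `any(...) OVER (... ROWS BETWEEN 1 PRECEDING AND CURRENT ROW)` reads the value from the preceding row, falling back to the current row when there is none. The trick in isolation, on synthetic dates:

```sql
-- a one-row frame exposes the previous row's value; the first row falls back to itself (0 days)
SELECT
    event_time,
    any(event_time) OVER (ORDER BY event_time ASC ROWS BETWEEN 1 PRECEDING AND CURRENT ROW) AS previous_event_time,
    dateDiff('day', previous_event_time, event_time) AS days_since_previous
FROM
(
    SELECT toDate('2022-01-01') + (number * 10) AS event_time
    FROM numbers(5)
)
```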
-```sql +[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY2hhbmdlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBjb21taXRfaGFzaCwKICAgICAgICAgICAgbWF4X3RpbWUsCiAgICAgICAgICAgIHR5cGUsCiAgICAgICAgICAgIG51bV9hZGRlZCwKICAgICAgICAgICAgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgIHN1bShudW1fYWRkZWQgLSBudW1fZGVsZXRlZCkgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDKSBBUyBjdXJyZW50X3NpemUsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9hZGRlZCAvIGN1cnJlbnRfc2l6ZSwgMCkgQVMgcGVyY2VudF9hZGQsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9kZWxldGVkIC8gY3VycmVudF9zaXplLCAwKSBBUyBwZXJjZW50X2RlbGV0ZQogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIG1heCh0aW1lKSBBUyBtYXhfdGltZSwKICAgICAgICAgICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgICAgICAgICAgYW55KGxpbmVzX2FkZGVkKSBBUyBudW1fYWRkZWQsCiAgICAgICAgICAgICAgICBhbnkobGluZXNfZGVsZXRlZCkgQVMgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgICAgICBhbnkoY2hhbmdlX3R5cGUpIEFTIHR5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgKGNoYW5nZV90eXBlIElOICgnQWRkJywgJ01vZGlmeScpKSBBTkQgKGZpbGVfZXh0ZW5zaW9uIElOICgnaCcsICdjcHAnLCAnc3FsJykpCiAgICAgICAgICAgIEdST1VQIEJZCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgY29tbWl0X2hhc2gKICAgICAgICAgICAgT1JERVIgQlkKICAgICAgICAgICAgICAgIHBhdGggQVNDLAogICAgICAgICAgICAgICAgbWF4X3RpbWUgQVNDCiAgICAgICAgKQogICAgKSwKICAgIHJld3JpdGVzIEFTCiAgICAoCiAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICosCiAgICAgICAgICAgIGFueShtYXhfdGltZSkgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDIFJPV1MgQkVUV0VFTiAxIFBSRUNFRElORyBBTkQgQ1VSUkVOVCBST1cpIEFTIHByZXZpb3VzX3Jld3JpdGUsCiAgICAgICAgICAgIGRhdGVEaWZmKCdkYXknLCBwcmV2aW91c19yZXdyaXRlLCBtYXhfdGltZSkgQVMgcmV3cml0ZV9kYXlzCiAgICAgICAgRlJPTSBjaGFuZ2VzCiAgICAgICAgV0hFUkUgKHR5cGUgPSAnTW9kaWZ5JykgQU5EIChwZXJjZW50X2FkZCA+PSAwLjUpIEFORCAocGVyY2VudF9kZWxldGUgPj0gMC41KSBBTkQgKGN1cnJlbnRfc2l6ZSA+IDUwKQogICAgKQpTRUxFQ1QKICAgIGF2Z0lmKHJld3JpdGVfZGF5cywgcmV3cml0ZV9kYXlzID4gMCkgQVMgYXZnX3Jld3JpdGVfdGltZSwKICAgIHF1YW50aWxlc1RpbWluZ0lmKDAuNSkocmV3cml0ZV9kYXlzLCByZXdyaXRlX2RheXMgPiAwKSBBUyBoYWxmX2xpZmUKRlJPTSByZXdyaXRlcw==) +```sql WITH changes AS ( @@ -2018,6 +2175,8 @@ FROM rewrites Similar to [What is the average time before code will be rewritten and the median (half-life of code decay)?](#what-is-the-average-time-before-code-will-be-rewritten-and-the-median-half-life-of-code-decay) and [List files that were rewritten most number of time or by most of authors](#list-files-that-were-rewritten-most-number-of-time-or-by-most-of-authors), except we aggregate by day of week. Adjust as required e.g. month of year. 
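One thing to keep in mind when reading the weekday buckets: `dayOfWeek` is 1-based and starts on Monday. A one-liner to confirm the numbering (2022-11-07 is a Monday):

```sql
-- dayOfWeek: 1 = Monday ... 7 = Sunday
SELECT
    toDate('2022-11-07') + number AS day,
    dayOfWeek(day) AS day_of_week
FROM numbers(7)
```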
+[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY2hhbmdlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBjb21taXRfaGFzaCwKICAgICAgICAgICAgbWF4X3RpbWUsCiAgICAgICAgICAgIHR5cGUsCiAgICAgICAgICAgIG51bV9hZGRlZCwKICAgICAgICAgICAgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgIHN1bShudW1fYWRkZWQgLSBudW1fZGVsZXRlZCkgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDKSBBUyBjdXJyZW50X3NpemUsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9hZGRlZCAvIGN1cnJlbnRfc2l6ZSwgMCkgQVMgcGVyY2VudF9hZGQsCiAgICAgICAgICAgIGlmKGN1cnJlbnRfc2l6ZSA+IDAsIG51bV9kZWxldGVkIC8gY3VycmVudF9zaXplLCAwKSBBUyBwZXJjZW50X2RlbGV0ZQogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIG1heCh0aW1lKSBBUyBtYXhfdGltZSwKICAgICAgICAgICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgICAgICAgICAgYW55KGZpbGVfbGluZXNfYWRkZWQpIEFTIG51bV9hZGRlZCwKICAgICAgICAgICAgICAgIGFueShmaWxlX2xpbmVzX2RlbGV0ZWQpIEFTIG51bV9kZWxldGVkLAogICAgICAgICAgICAgICAgYW55KGZpbGVfY2hhbmdlX3R5cGUpIEFTIHR5cGUKICAgICAgICAgICAgRlJPTSBnaXRfY2xpY2tob3VzZS5saW5lX2NoYW5nZXMKICAgICAgICAgICAgV0hFUkUgKGZpbGVfY2hhbmdlX3R5cGUgSU4gKCdBZGQnLCAnTW9kaWZ5JykpIEFORCAoZmlsZV9leHRlbnNpb24gSU4gKCdoJywgJ2NwcCcsICdzcWwnKSkKICAgICAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgICAgIHBhdGgsCiAgICAgICAgICAgICAgICBjb21taXRfaGFzaAogICAgICAgICAgICBPUkRFUiBCWQogICAgICAgICAgICAgICAgcGF0aCBBU0MsCiAgICAgICAgICAgICAgICBtYXhfdGltZSBBU0MKICAgICAgICApCiAgICApLAogICAgcmV3cml0ZXMgQVMKICAgICgKICAgICAgICBTRUxFQ1QgYW55KG1heF90aW1lKSBPVkVSIChQQVJUSVRJT04gQlkgcGF0aCBPUkRFUiBCWSBtYXhfdGltZSBBU0MgUk9XUyBCRVRXRUVOIDEgUFJFQ0VESU5HIEFORCBDVVJSRU5UIFJPVykgQVMgcHJldmlvdXNfcmV3cml0ZQogICAgICAgIEZST00gY2hhbmdlcwogICAgICAgIFdIRVJFICh0eXBlID0gJ01vZGlmeScpIEFORCAocGVyY2VudF9hZGQgPj0gMC41KSBBTkQgKHBlcmNlbnRfZGVsZXRlID49IDAuNSkgQU5EIChjdXJyZW50X3NpemUgPiA1MCkKICAgICkKU0VMRUNUCiAgICBkYXlPZldlZWsocHJldmlvdXNfcmV3cml0ZSkgQVMgZGF5T2ZXZWVrLAogICAgY291bnQoKSBBUyBudW1fcmVfd3JpdGVzCkZST00gcmV3cml0ZXMKR1JPVVAgQlkgZGF5T2ZXZWVr) + ```sql WITH changes AS @@ -2080,6 +2239,8 @@ GROUP BY dayOfWeek We define "sticky" as how long does an author's code stay before its rewritten. Similar to the previous question [What is the average time before code will be rewritten and the median (half-life of code decay)?](#what-is-the-average-time-before-code-will-be-rewritten-and-the-median-half-life-of-code-decay) - using the same metric for rewrites i.e. 50% additions and 50% deletions to the file. We compute the average rewrite time per author and only consider contributors with more than two files. 
+[play](https://play.clickhouse.com/play?user=play#V0lUSAogICAgY2hhbmdlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBwYXRoLAogICAgICAgICAgICBhdXRob3IsCiAgICAgICAgICAgIGNvbW1pdF9oYXNoLAogICAgICAgICAgICBtYXhfdGltZSwKICAgICAgICAgICAgdHlwZSwKICAgICAgICAgICAgbnVtX2FkZGVkLAogICAgICAgICAgICBudW1fZGVsZXRlZCwKICAgICAgICAgICAgc3VtKG51bV9hZGRlZCAtIG51bV9kZWxldGVkKSBPVkVSIChQQVJUSVRJT04gQlkgcGF0aCBPUkRFUiBCWSBtYXhfdGltZSBBU0MpIEFTIGN1cnJlbnRfc2l6ZSwKICAgICAgICAgICAgaWYoY3VycmVudF9zaXplID4gMCwgbnVtX2FkZGVkIC8gY3VycmVudF9zaXplLCAwKSBBUyBwZXJjZW50X2FkZCwKICAgICAgICAgICAgaWYoY3VycmVudF9zaXplID4gMCwgbnVtX2RlbGV0ZWQgLyBjdXJyZW50X3NpemUsIDApIEFTIHBlcmNlbnRfZGVsZXRlCiAgICAgICAgRlJPTQogICAgICAgICgKICAgICAgICAgICAgU0VMRUNUCiAgICAgICAgICAgICAgICBwYXRoLAogICAgICAgICAgICAgICAgYW55KGF1dGhvcikgQVMgYXV0aG9yLAogICAgICAgICAgICAgICAgbWF4KHRpbWUpIEFTIG1heF90aW1lLAogICAgICAgICAgICAgICAgY29tbWl0X2hhc2gsCiAgICAgICAgICAgICAgICBhbnkoZmlsZV9saW5lc19hZGRlZCkgQVMgbnVtX2FkZGVkLAogICAgICAgICAgICAgICAgYW55KGZpbGVfbGluZXNfZGVsZXRlZCkgQVMgbnVtX2RlbGV0ZWQsCiAgICAgICAgICAgICAgICBhbnkoZmlsZV9jaGFuZ2VfdHlwZSkgQVMgdHlwZQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwogICAgICAgICAgICBXSEVSRSAoZmlsZV9jaGFuZ2VfdHlwZSBJTiAoJ0FkZCcsICdNb2RpZnknKSkgQU5EIChmaWxlX2V4dGVuc2lvbiBJTiAoJ2gnLCAnY3BwJywgJ3NxbCcpKQogICAgICAgICAgICBHUk9VUCBCWQogICAgICAgICAgICAgICAgcGF0aCwKICAgICAgICAgICAgICAgIGNvbW1pdF9oYXNoCiAgICAgICAgICAgIE9SREVSIEJZCiAgICAgICAgICAgICAgICBwYXRoIEFTQywKICAgICAgICAgICAgICAgIG1heF90aW1lIEFTQwogICAgICAgICkKICAgICksCiAgICByZXdyaXRlcyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICAqLAogICAgICAgICAgICBhbnkobWF4X3RpbWUpIE9WRVIgKFBBUlRJVElPTiBCWSBwYXRoIE9SREVSIEJZIG1heF90aW1lIEFTQyBST1dTIEJFVFdFRU4gMSBQUkVDRURJTkcgQU5EIENVUlJFTlQgUk9XKSBBUyBwcmV2aW91c19yZXdyaXRlLAogICAgICAgICAgICBkYXRlRGlmZignZGF5JywgcHJldmlvdXNfcmV3cml0ZSwgbWF4X3RpbWUpIEFTIHJld3JpdGVfZGF5cywKICAgICAgICAgICAgYW55KGF1dGhvcikgT1ZFUiAoUEFSVElUSU9OIEJZIHBhdGggT1JERVIgQlkgbWF4X3RpbWUgQVNDIFJPV1MgQkVUV0VFTiAxIFBSRUNFRElORyBBTkQgQ1VSUkVOVCBST1cpIEFTIHByZXZfYXV0aG9yCiAgICAgICAgRlJPTSBjaGFuZ2VzCiAgICAgICAgV0hFUkUgKHR5cGUgPSAnTW9kaWZ5JykgQU5EIChwZXJjZW50X2FkZCA+PSAwLjUpIEFORCAocGVyY2VudF9kZWxldGUgPj0gMC41KSBBTkQgKGN1cnJlbnRfc2l6ZSA+IDUwKQogICAgKQpTRUxFQ1QKICAgIHByZXZfYXV0aG9yLAogICAgYXZnKHJld3JpdGVfZGF5cykgQVMgYywKICAgIHVuaXEocGF0aCkgQVMgbnVtX2ZpbGVzCkZST00gcmV3cml0ZXMKR1JPVVAgQlkgcHJldl9hdXRob3IKSEFWSU5HIG51bV9maWxlcyA+IDIKT1JERVIgQlkgYyBERVNDCkxJTUlUIDEwCg==) + ```sql WITH changes AS @@ -2157,6 +2318,8 @@ This query first requires us to calculate the days when an author has committed. Our subsequent array functions compute each author's longest sequence of consecutive ones. First, the `groupArray` function is used to collate all `consecutive_day` values for an author. This array of 1s and 0s, is then split on 0 values into subarrays. Finally, we calculate the longest subarray. 
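The array machinery is easier to follow on a literal array first. `arraySplit` starts a new subarray at every element where the lambda returns non-zero, so every subarray except the first still carries its leading `0`; this is why the final query subtracts one from the longest length.

```sql
-- runs: [[1,1],[0,1,1,1],[0,1]]; run_lengths: [2,4,2]; longest: 4 (one more than the real streak of 3)
SELECT
    arraySplit(x -> (x = 0), [1, 1, 0, 1, 1, 1, 0, 1]) AS runs,
    arrayMap(x -> length(x), runs) AS run_lengths,
    arrayMax(run_lengths) AS longest
```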
+[play](https://play.clickhouse.com/play?user=play#V0lUSCBjb21taXRfZGF5cyBBUwogICAgKAogICAgICAgIFNFTEVDVAogICAgICAgICAgICBhdXRob3IsCiAgICAgICAgICAgIGRheSwKICAgICAgICAgICAgYW55KGRheSkgT1ZFUiAoUEFSVElUSU9OIEJZIGF1dGhvciBPUkRFUiBCWSBkYXkgQVNDIFJPV1MgQkVUV0VFTiAxIFBSRUNFRElORyBBTkQgQ1VSUkVOVCBST1cpIEFTIHByZXZpb3VzX2NvbW1pdCwKICAgICAgICAgICAgZGF0ZURpZmYoJ2RheScsIHByZXZpb3VzX2NvbW1pdCwgZGF5KSBBUyBkYXlzX3NpbmNlX2xhc3QsCiAgICAgICAgICAgIGlmKGRheXNfc2luY2VfbGFzdCA9IDEsIDEsIDApIEFTIGNvbnNlY3V0aXZlX2RheQogICAgICAgIEZST00KICAgICAgICAoCiAgICAgICAgICAgIFNFTEVDVAogICAgICAgICAgICAgICAgYXV0aG9yLAogICAgICAgICAgICAgICAgdG9TdGFydE9mRGF5KHRpbWUpIEFTIGRheQogICAgICAgICAgICBGUk9NIGdpdF9jbGlja2hvdXNlLmNvbW1pdHMKICAgICAgICAgICAgR1JPVVAgQlkKICAgICAgICAgICAgICAgIGF1dGhvciwKICAgICAgICAgICAgICAgIGRheQogICAgICAgICAgICBPUkRFUiBCWQogICAgICAgICAgICAgICAgYXV0aG9yIEFTQywKICAgICAgICAgICAgICAgIGRheSBBU0MKICAgICAgICApCiAgICApClNFTEVDVAogICAgYXV0aG9yLAogICAgYXJyYXlNYXgoYXJyYXlNYXAoeCAtPiBsZW5ndGgoeCksIGFycmF5U3BsaXQoeCAtPiAoeCA9IDApLCBncm91cEFycmF5KGNvbnNlY3V0aXZlX2RheSkpKSkgQVMgbWF4X2NvbnNlY3V0aXZlX2RheXMKRlJPTSBjb21taXRfZGF5cwpHUk9VUCBCWSBhdXRob3IKT1JERVIgQlkgbWF4X2NvbnNlY3V0aXZlX2RheXMgREVTQwpMSU1JVCAxMA==) + ```sql WITH commit_days AS ( @@ -2182,23 +2345,23 @@ WITH commit_days AS ) SELECT author, - arrayMax(arrayMap(x -> length(x), arraySplit(x -> (x = 0), groupArray(consecutive_day)))) AS max_consecutive_days + arrayMax(arrayMap(x -> length(x), arraySplit(x -> (x = 0), groupArray(consecutive_day)))) - 1 AS max_consecutive_days FROM commit_days GROUP BY author ORDER BY max_consecutive_days DESC LIMIT 10 ┌─author───────────┬─max_consecutive_days─┐ -│ kssenii │ 33 │ -│ Alexey Milovidov │ 31 │ -│ alesapin │ 27 │ -│ Azat Khuzhin │ 24 │ -│ Nikolai Kochetov │ 16 │ -│ Igor Nikonov │ 12 │ -│ feng lv │ 12 │ -│ alexey-milovidov │ 12 │ -│ Maksim Kita │ 12 │ -│ Nikita Vasilev │ 12 │ +│ kssenii │ 32 │ +│ Alexey Milovidov │ 30 │ +│ alesapin │ 26 │ +│ Azat Khuzhin │ 23 │ +│ Nikolai Kochetov │ 15 │ +│ feng lv │ 11 │ +│ alexey-milovidov │ 11 │ +│ Igor Nikonov │ 11 │ +│ Maksim Kita │ 11 │ +│ Nikita Vasilev │ 11 │ └──────────────────┴──────────────────────┘ 10 rows in set. Elapsed: 0.025 sec. Processed 62.78 thousand rows, 395.47 KB (2.54 million rows/s., 16.02 MB/s.) @@ -2208,6 +2371,7 @@ LIMIT 10 Files can be renamed. When this occurs, we get a rename event, where the `path` column is set to the new path of the file and the `old_path` represents the previous location e.g. +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0aW1lLAogICAgcGF0aCwKICAgIG9sZF9wYXRoLAogICAgY29tbWl0X2hhc2gsCiAgICBjb21taXRfbWVzc2FnZQpGUk9NIGdpdF9jbGlja2hvdXNlLmZpbGVfY2hhbmdlcwpXSEVSRSAocGF0aCA9ICdzcmMvU3RvcmFnZXMvU3RvcmFnZVJlcGxpY2F0ZWRNZXJnZVRyZWUuY3BwJykgQU5EIChjaGFuZ2VfdHlwZSA9ICdSZW5hbWUnKQ==) ```sql SELECT @@ -2233,7 +2397,7 @@ To address this, we can use User Defined Functions (UDFs). These cannot, current This means we can only track renames to a maximum depth - the below example is 5 deep. It is unlikely a file will be renamed more times than this, so for now, this is sufficient. 
```sql -CREATE FUNCTION file_path_history AS (n) -> if(empty(n), [], arrayConcat([n], file_history_01((SELECT if(empty(old_path), Null, old_path) FROM git.file_changes WHERE path = n AND (change_type = 'Rename' OR change_type = 'Add') LIMIT 1)))); +CREATE FUNCTION file_path_history AS (n) -> if(empty(n), [], arrayConcat([n], file_path_history_01((SELECT if(empty(old_path), Null, old_path) FROM git.file_changes WHERE path = n AND (change_type = 'Rename' OR change_type = 'Add') LIMIT 1)))); CREATE FUNCTION file_path_history_01 AS (n) -> if(isNull(n), [], arrayConcat([n], file_path_history_02((SELECT if(empty(old_path), Null, old_path) FROM git.file_changes WHERE path = n AND (change_type = 'Rename' OR change_type = 'Add') LIMIT 1)))); CREATE FUNCTION file_path_history_02 AS (n) -> if(isNull(n), [], arrayConcat([n], file_path_history_03((SELECT if(empty(old_path), Null, old_path) FROM git.file_changes WHERE path = n AND (change_type = 'Rename' OR change_type = 'Add') LIMIT 1)))); CREATE FUNCTION file_path_history_03 AS (n) -> if(isNull(n), [], arrayConcat([n], file_path_history_04((SELECT if(empty(old_path), Null, old_path) FROM git.file_changes WHERE path = n AND (change_type = 'Rename' OR change_type = 'Add') LIMIT 1)))); @@ -2245,6 +2409,8 @@ By calling `file_path_history('src/Storages/StorageReplicatedMergeTree.cpp')` we For example, +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUIGZpbGVfcGF0aF9oaXN0b3J5KCdzcmMvU3RvcmFnZXMvU3RvcmFnZVJlcGxpY2F0ZWRNZXJnZVRyZWUuY3BwJykgQVMgcGF0aHMK) + ```sql SELECT file_path_history('src/Storages/StorageReplicatedMergeTree.cpp') AS paths @@ -2257,6 +2423,8 @@ SELECT file_path_history('src/Storages/StorageReplicatedMergeTree.cpp') AS paths We can use this capability to now assemble the commits for the entire history of a file. In this example, we show one commit for each of the `path` values. +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICB0aW1lLAogICAgc3Vic3RyaW5nKGNvbW1pdF9oYXNoLCAxLCAxMSkgQVMgY29tbWl0LAogICAgY2hhbmdlX3R5cGUsCiAgICBhdXRob3IsCiAgICBwYXRoLAogICAgY29tbWl0X21lc3NhZ2UKRlJPTSBnaXRfY2xpY2tob3VzZS5maWxlX2NoYW5nZXMKV0hFUkUgcGF0aCBJTiBmaWxlX3BhdGhfaGlzdG9yeSgnc3JjL1N0b3JhZ2VzL1N0b3JhZ2VSZXBsaWNhdGVkTWVyZ2VUcmVlLmNwcCcpCk9SREVSIEJZIHRpbWUgREVTQwpMSU1JVCAxIEJZIHBhdGgKRk9STUFUIFByZXR0eUNvbXBhY3RNb25vQmxvY2s=) + ```sql SELECT time, @@ -2286,4 +2454,46 @@ FORMAT PrettyCompactMonoBlock This is particularly difficult to get an exact result due to the inability to currently keep state in array functions. This will be possible with an `arrayFold` or `arrayReduce`, which allows state to be held on each iteration. -We welcome solutions here. 
+An approximate solution, sufficient for a high-level analysis, may look something like this: + +[play](https://play.clickhouse.com/play?user=play#U0VMRUNUCiAgICBsaW5lX251bWJlcl9uZXcsCiAgICBhcmdNYXgoYXV0aG9yLCB0aW1lKSwKICAgIGFyZ01heChsaW5lLCB0aW1lKQpGUk9NIGdpdF9jbGlja2hvdXNlLmxpbmVfY2hhbmdlcwpXSEVSRSBwYXRoIElOIGZpbGVfcGF0aF9oaXN0b3J5KCdzcmMvU3RvcmFnZXMvU3RvcmFnZVJlcGxpY2F0ZWRNZXJnZVRyZWUuY3BwJykKR1JPVVAgQlkgbGluZV9udW1iZXJfbmV3Ck9SREVSIEJZIGxpbmVfbnVtYmVyX25ldyBBU0MKTElNSVQgMjA=) + +```sql +SELECT + line_number_new, + argMax(author, time), + argMax(line, time) +FROM git.line_changes +WHERE path IN file_path_history('src/Storages/StorageReplicatedMergeTree.cpp') +GROUP BY line_number_new +ORDER BY line_number_new ASC +LIMIT 20 + +┌─line_number_new─┬─argMax(author, time)─┬─argMax(line, time)────────────────────────────────────────────┐ +│ 1 │ Alexey Milovidov │ #include │ +│ 2 │ s-kat │ #include │ +│ 3 │ Anton Popov │ #include │ +│ 4 │ Alexander Burmak │ #include │ +│ 5 │ avogar │ #include │ +│ 6 │ Alexander Burmak │ #include │ +│ 7 │ Alexander Burmak │ #include │ +│ 8 │ Alexander Burmak │ #include │ +│ 9 │ Alexander Burmak │ #include │ +│ 10 │ Alexander Burmak │ #include │ +│ 11 │ Alexander Burmak │ #include │ +│ 12 │ Nikolai Kochetov │ #include │ +│ 13 │ alesapin │ #include │ +│ 14 │ alesapin │ │ +│ 15 │ Alexey Milovidov │ #include │ +│ 16 │ Alexey Zatelepin │ #include │ +│ 17 │ CurtizJ │ #include │ +│ 18 │ Kirill Shvakov │ #include │ +│ 19 │ s-kat │ #include │ +│ 20 │ Nikita Mikhaylov │ #include │ +└─────────────────┴──────────────────────┴───────────────────────────────────────────────────────────────┘ +20 rows in set. Elapsed: 0.547 sec. Processed 7.88 million rows, 679.20 MB (14.42 million rows/s., 1.24 GB/s.) +``` + +We welcome exact and improved solutions here. + + diff --git a/docs/en/getting-started/example-datasets/images/superset-authors-matrix.png b/docs/en/getting-started/example-datasets/images/superset-authors-matrix.png new file mode 100644 index 00000000000..bdfc6b6f304 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-authors-matrix.png differ diff --git a/docs/en/getting-started/example-datasets/images/superset-authors-matrix_v2.png b/docs/en/getting-started/example-datasets/images/superset-authors-matrix_v2.png new file mode 100644 index 00000000000..aad98b5b077 Binary files /dev/null and b/docs/en/getting-started/example-datasets/images/superset-authors-matrix_v2.png differ diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 58e986cc2f3..56708def497 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -5,7 +5,7 @@ sidebar_label: Input and Output Formats title: Formats for Input and Output Data --- -ClickHouse can accept and return data in various formats. A format supported for input can be used to parse the data provided to `INSERT`s, to perform `SELECT`s from a file-backed table such as File, URL or HDFS, or to read an external dictionary. A format supported for output can be used to arrange the +ClickHouse can accept and return data in various formats. A format supported for input can be used to parse the data provided to `INSERT`s, to perform `SELECT`s from a file-backed table such as File, URL or HDFS, or to read a dictionary. A format supported for output can be used to arrange the results of a `SELECT`, and to perform `INSERT`s into a file-backed table. 
The supported formats are: diff --git a/docs/en/operations/named-collections.md b/docs/en/operations/named-collections.md index f605045a0ad..cbb8d0a4c02 100644 --- a/docs/en/operations/named-collections.md +++ b/docs/en/operations/named-collections.md @@ -130,7 +130,7 @@ SHOW TABLES FROM mydatabase; └────────┘ ``` -### Example of using named collections with an external dictionary with source MySQL +### Example of using named collections with a dictionary with source MySQL ```sql CREATE DICTIONARY dict (A Int64, B String) @@ -213,7 +213,7 @@ SHOW TABLES FROM mydatabase └──────┘ ``` -### Example of using named collections with an external dictionary with source POSTGRESQL +### Example of using named collections with a dictionary with source POSTGRESQL ```sql CREATE DICTIONARY dict (a Int64, b String) @@ -270,7 +270,7 @@ SELECT * FROM remote(remote1, database = default, table = test); └───┴───┘ ``` -### Example of using named collections with an external dictionary with source ClickHouse +### Example of using named collections with a dictionary with source ClickHouse ```sql CREATE DICTIONARY dict(a Int64, b String) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index dcda7536935..5faf3819d7e 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -268,14 +268,14 @@ The path to the table in ZooKeeper. ## dictionaries_config {#server_configuration_parameters-dictionaries_config} -The path to the config file for external dictionaries. +The path to the config file for dictionaries. Path: - Specify the absolute path or the path relative to the server config file. - The path can contain wildcards \* and ?. -See also “[External dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md)”. +See also “[Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md)”. **Example** diff --git a/docs/en/operations/settings/permissions-for-queries.md b/docs/en/operations/settings/permissions-for-queries.md index 3ba62b78cfe..c565de9b21a 100644 --- a/docs/en/operations/settings/permissions-for-queries.md +++ b/docs/en/operations/settings/permissions-for-queries.md @@ -16,44 +16,54 @@ Queries in ClickHouse can be divided into several types: The following settings regulate user permissions by the type of query: -- [readonly](#settings_readonly) — Restricts permissions for all types of queries except DDL queries. -- [allow_ddl](#settings_allow_ddl) — Restricts permissions for DDL queries. +## readonly +Restricts permissions for read data, write data, and change settings queries. -`KILL QUERY` can be performed with any settings. +When set to 1, allows: -## readonly {#settings_readonly} +- All types of read queries (like SELECT and equivalent queries). +- Queries that modify only session context (like USE). -Restricts permissions for reading data, write data and change settings queries. +When set to 2, allows the above plus: +- SET and CREATE TEMPORARY TABLE -See how the queries are divided into types [above](#permissions_for_queries). + :::tip + Queries like EXISTS, DESCRIBE, EXPLAIN, SHOW PROCESSLIST, etc are equivalent to SELECT, because they just do select from system tables. + ::: Possible values: -- 0 — All queries are allowed. -- 1 — Only read data queries are allowed. -- 2 — Read data and change settings queries are allowed. 
+- 0 — Read, Write, and Change settings queries are allowed. +- 1 — Only Read data queries are allowed. +- 2 — Read data and Change settings queries are allowed. +Default value: 0 + +:::note After setting `readonly = 1`, the user can’t change `readonly` and `allow_ddl` settings in the current session. When using the `GET` method in the [HTTP interface](../../interfaces/http.md), `readonly = 1` is set automatically. To modify data, use the `POST` method. -Setting `readonly = 1` prohibit the user from changing all the settings. There is a way to prohibit the user from changing only specific settings. Also there is a way to allow changing only specific settings under `readonly = 1` restrictions. For details see [constraints on settings](../../operations/settings/constraints-on-settings.md). +Setting `readonly = 1` prohibits the user from changing settings. There is a way to prohibit the user from changing only specific settings. Also there is a way to allow changing only specific settings under `readonly = 1` restrictions. For details see [constraints on settings](../../operations/settings/constraints-on-settings.md). +::: -Default value: 0 ## allow_ddl {#settings_allow_ddl} Allows or denies [DDL](https://en.wikipedia.org/wiki/Data_definition_language) queries. -See how the queries are divided into types [above](#permissions_for_queries). - Possible values: - 0 — DDL queries are not allowed. - 1 — DDL queries are allowed. -You can’t execute `SET allow_ddl = 1` if `allow_ddl = 0` for the current session. - Default value: 1 -[Original article](https://clickhouse.com/docs/en/operations/settings/permissions_for_queries/) +:::note +You cannot run `SET allow_ddl = 1` if `allow_ddl = 0` for the current session. +::: + + +:::note KILL QUERY +`KILL QUERY` can be performed with any combination of readonly and allow_ddl settings. +::: diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index d2f0f46f637..7abe4affbd1 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3399,6 +3399,17 @@ Use schema from cache for URL with last modification time validation (for urls w Default value: `true`. +## use_structure_from_insertion_table_in_table_functions {use_structure_from_insertion_table_in_table_functions} + +Use structure from insertion table instead of schema inference from data. + +Possible values: +- 0 - disabled +- 1 - enabled +- 2 - auto + +Default value: 2. + ## compatibility {#compatibility} This setting changes other settings according to provided ClickHouse version. diff --git a/docs/en/operations/system-tables/crash-log.md b/docs/en/operations/system-tables/crash-log.md index 0c0a4cd967d..a44b0db8e9b 100644 --- a/docs/en/operations/system-tables/crash-log.md +++ b/docs/en/operations/system-tables/crash-log.md @@ -7,8 +7,8 @@ Contains information about stack traces for fatal errors. The table does not exi Columns: -- `event_date` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date of the event. -- `event_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Time of the event. +- `event_date` ([DateTime](../../sql-reference/data-types/datetime.md)) — Date of the event. +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Time of the event. - `timestamp_ns` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Timestamp of the event with nanoseconds. - `signal` ([Int32](../../sql-reference/data-types/int-uint.md)) — Signal number. 
- `thread_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Thread ID. diff --git a/docs/en/operations/system-tables/dictionaries.md b/docs/en/operations/system-tables/dictionaries.md index 112e2cc2cdf..4b256f0de97 100644 --- a/docs/en/operations/system-tables/dictionaries.md +++ b/docs/en/operations/system-tables/dictionaries.md @@ -3,7 +3,7 @@ slug: /en/operations/system-tables/dictionaries --- # dictionaries -Contains information about [external dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). +Contains information about [dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). Columns: @@ -33,7 +33,7 @@ Columns: - `lifetime_min` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Minimum [lifetime](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of the dictionary in memory, after which ClickHouse tries to reload the dictionary (if `invalidate_query` is set, then only if it has changed). Set in seconds. - `lifetime_max` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Maximum [lifetime](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of the dictionary in memory, after which ClickHouse tries to reload the dictionary (if `invalidate_query` is set, then only if it has changed). Set in seconds. - `loading_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Start time for loading the dictionary. -- `last_successful_update_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — End time for loading or updating the dictionary. Helps to monitor some troubles with external sources and investigate causes. +- `last_successful_update_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — End time for loading or updating the dictionary. Helps to monitor some troubles with dictionary sources and investigate the causes. - `loading_duration` ([Float32](../../sql-reference/data-types/float.md)) — Duration of a dictionary loading. - `last_exception` ([String](../../sql-reference/data-types/string.md)) — Text of the error that occurs when creating or reloading the dictionary if the dictionary couldn’t be created. - `comment` ([String](../../sql-reference/data-types/string.md)) — Text of the comment to dictionary. diff --git a/docs/en/operations/system-tables/mutations.md b/docs/en/operations/system-tables/mutations.md index 782d7c42ad2..0d3b764846b 100644 --- a/docs/en/operations/system-tables/mutations.md +++ b/docs/en/operations/system-tables/mutations.md @@ -15,7 +15,7 @@ Columns: - `command` ([String](/docs/en/sql-reference/data-types/string.md)) — The mutation command string (the part of the query after `ALTER TABLE [db.]table`). -- `create_time` ([Datetime](/docs/en/sql-reference/data-types/datetime.md)) — Date and time when the mutation command was submitted for execution. +- `create_time` ([DateTime](/docs/en/sql-reference/data-types/datetime.md)) — Date and time when the mutation command was submitted for execution. - `block_numbers.partition_id` ([Array](/docs/en/sql-reference/data-types/array.md)([String](/docs/en/sql-reference/data-types/string.md))) — For mutations of replicated tables, the array contains the partitions' IDs (one record for each partition). For mutations of non-replicated tables the array is empty. 
@@ -39,7 +39,7 @@ If there were problems with mutating some data parts, the following columns cont - `latest_failed_part` ([String](/docs/en/sql-reference/data-types/string.md)) — The name of the most recent part that could not be mutated. -- `latest_fail_time` ([Datetime](/docs/en/sql-reference/data-types/datetime.md)) — The date and time of the most recent part mutation failure. +- `latest_fail_time` ([DateTime](/docs/en/sql-reference/data-types/datetime.md)) — The date and time of the most recent part mutation failure. - `latest_fail_reason` ([String](/docs/en/sql-reference/data-types/string.md)) — The exception message that caused the most recent part mutation failure. diff --git a/docs/en/operations/system-tables/replication_queue.md b/docs/en/operations/system-tables/replication_queue.md index ced20b0048a..dff3bce246a 100644 --- a/docs/en/operations/system-tables/replication_queue.md +++ b/docs/en/operations/system-tables/replication_queue.md @@ -29,7 +29,7 @@ Columns: - `MUTATE_PART` — Apply one or several mutations to the part. - `ALTER_METADATA` — Apply alter modification according to global /metadata and /columns paths. -- `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was submitted for execution. +- `create_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was submitted for execution. - `required_quorum` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of replicas waiting for the task to complete with confirmation of completion. This column is only relevant for the `GET_PARTS` task. @@ -47,13 +47,13 @@ Columns: - `last_exception` ([String](../../sql-reference/data-types/string.md)) — Text message about the last error that occurred (if any). -- `last_attempt_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was last attempted. +- `last_attempt_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was last attempted. - `num_postponed` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of postponed tasks. - `postpone_reason` ([String](../../sql-reference/data-types/string.md)) — The reason why the task was postponed. -- `last_postpone_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was last postponed. +- `last_postpone_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Date and time when the task was last postponed. - `merge_type` ([String](../../sql-reference/data-types/string.md)) — Type of the current merge. Empty if it's a mutation. diff --git a/docs/en/sql-reference/data-types/date32.md b/docs/en/sql-reference/data-types/date32.md index ff1a745785b..c8c7470d2cb 100644 --- a/docs/en/sql-reference/data-types/date32.md +++ b/docs/en/sql-reference/data-types/date32.md @@ -6,7 +6,7 @@ sidebar_label: Date32 # Date32 -A date. Supports the date range same with [Datetime64](../../sql-reference/data-types/datetime64.md). Stored in four bytes as the number of days since 1900-01-01. Allows storing values till 2299-12-31. +A date. Supports the date range same with [DateTime64](../../sql-reference/data-types/datetime64.md). Stored in four bytes as the number of days since 1900-01-01. Allows storing values till 2299-12-31. 
**Examples** diff --git a/docs/en/sql-reference/data-types/datetime.md b/docs/en/sql-reference/data-types/datetime.md index 85587882e01..7f7f21ded54 100644 --- a/docs/en/sql-reference/data-types/datetime.md +++ b/docs/en/sql-reference/data-types/datetime.md @@ -4,7 +4,7 @@ sidebar_position: 48 sidebar_label: DateTime --- -# Datetime +# DateTime Allows to store an instant in time, that can be expressed as a calendar date and a time of a day. diff --git a/docs/en/sql-reference/data-types/datetime64.md b/docs/en/sql-reference/data-types/datetime64.md index c7372e4b064..fa3a1eecd46 100644 --- a/docs/en/sql-reference/data-types/datetime64.md +++ b/docs/en/sql-reference/data-types/datetime64.md @@ -4,7 +4,7 @@ sidebar_position: 49 sidebar_label: DateTime64 --- -# Datetime64 +# DateTime64 Allows to store an instant in time, that can be expressed as a calendar date and a time of a day, with defined sub-second precision diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml b/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml index 1f98223c54c..af79ff9af23 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/_category_.yml @@ -1,8 +1,8 @@ position: 37 -label: 'External Dictionaries' +label: 'Dictionaries' collapsible: true collapsed: true link: type: generated-index - title: External Dictionaries + title: Dictionaries slug: /en/sql-reference/dictionaries/external-dictionaries diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md b/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md new file mode 100644 index 00000000000..e6a0dac7afb --- /dev/null +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md @@ -0,0 +1,4 @@ +:::tip +If you are using a dictionary with ClickHouse Cloud please use the DDL query option to create your dictionaries, and create your dictionary as user `default`. +Also, verify the list of supported dictionary sources in the [Cloud Compatibility guide](/docs/en/whats-new/cloud-capabilities.md). +::: diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index 02a4ad57a3b..aac0db208c6 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -3,6 +3,7 @@ slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-l sidebar_position: 41 sidebar_label: Storing Dictionaries in Memory --- +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; # Storing Dictionaries in Memory @@ -22,7 +23,9 @@ ClickHouse generates an exception for errors with dictionaries. Examples of erro - The dictionary being accessed could not be loaded. - Error querying a `cached` dictionary. -You can view the list of external dictionaries and their statuses in the [system.dictionaries](../../../operations/system-tables/dictionaries.md) table. +You can view the list of dictionaries and their statuses in the [system.dictionaries](../../../operations/system-tables/dictionaries.md) table. 
+ + The configuration looks like this: diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md index 6e4c8c4b94e..e4edad4d9a1 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md @@ -3,6 +3,7 @@ slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-l sidebar_position: 42 sidebar_label: Dictionary Updates --- +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; # Dictionary Updates @@ -12,6 +13,8 @@ Dictionary updates (other than loading for first use) do not block queries. Duri Example of settings: + + ``` xml ... diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md index e5ee48c9166..366d88e07c7 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-polygon.md @@ -4,12 +4,15 @@ sidebar_position: 46 sidebar_label: Polygon Dictionaries With Grids title: "Polygon dictionaries" --- +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; Polygon dictionaries allow you to efficiently search for the polygon containing specified points. For example: defining a city area by geographical coordinates. Example of a polygon dictionary configuration: + + ``` xml @@ -78,7 +81,7 @@ To respond to the query, there is a corresponding cell, and the index for the po - `POLYGON`. Synonym to `POLYGON_INDEX_CELL`. -Dictionary queries are carried out using standard [functions](../../../sql-reference/functions/ext-dict-functions.md) for working with external dictionaries. +Dictionary queries are carried out using standard [functions](../../../sql-reference/functions/ext-dict-functions.md) for working with dictionaries. An important difference is that here the keys will be the points for which you want to find the polygon containing them. **Example** diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md index d457f327e7a..4eb96fe80a2 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md @@ -1,12 +1,15 @@ --- slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources sidebar_position: 43 -sidebar_label: Sources of External Dictionaries +sidebar_label: Dictionary Sources --- +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; -# Sources of External Dictionaries +# Dictionary Sources -An external dictionary can be connected to ClickHouse from many different sources. + + +A dictionary can be connected to ClickHouse from many different sources. 
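With DDL, the source is declared in the `SOURCE` clause. As a minimal sketch, a dictionary backed by a local file might look like the following (the dictionary name, file path, columns and layout are assumptions for illustration):

``` sql
CREATE DICTIONARY regions_dict
(
    id UInt64,
    name String
)
PRIMARY KEY id
SOURCE(FILE(path '/var/lib/clickhouse/user_files/regions.tsv' format 'TabSeparated'))
LAYOUT(FLAT())
LIFETIME(MIN 0 MAX 300)
```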
If the dictionary is configured using an xml-file, the configuration looks like this: @@ -65,13 +68,13 @@ Types of sources (`source_type`): - [Executable Pool](#dicts-external_dicts_dict_sources-executable_pool) - [HTTP(s)](#dicts-external_dicts_dict_sources-http) - DBMS - - [ODBC](#dicts-external_dicts_dict_sources-odbc) - - [MySQL](#dicts-external_dicts_dict_sources-mysql) - - [ClickHouse](#dicts-external_dicts_dict_sources-clickhouse) - - [MongoDB](#dicts-external_dicts_dict_sources-mongodb) - - [Redis](#dicts-external_dicts_dict_sources-redis) - - [Cassandra](#dicts-external_dicts_dict_sources-cassandra) - - [PostgreSQL](#dicts-external_dicts_dict_sources-postgresql) + - [ODBC](#odbc) + - [MySQL](#mysql) + - [ClickHouse](#clickhouse) + - [MongoDB](#mongodb) + - [Redis](#redis) + - [Cassandra](#cassandra) + - [PostgreSQL](#postgresql) ## Local File diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md index 895743c3b50..881630167e3 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md @@ -3,9 +3,12 @@ slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-s sidebar_position: 44 sidebar_label: Dictionary Key and Fields --- +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; # Dictionary Key and Fields + + The `structure` clause describes the dictionary key and fields available for queries. XML description: @@ -171,5 +174,5 @@ Configuration fields: **See Also** -- [Functions for working with external dictionaries](../../../sql-reference/functions/ext-dict-functions.md). +- [Functions for working with dictionaries](../../../sql-reference/functions/ext-dict-functions.md). 
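Those functions are how the declared structure is queried at runtime; for example, reading the `name` attribute with a fallback value (a sketch, reusing the `regions_dict` assumed above):

``` sql
SELECT dictGetOrDefault('regions_dict', 'name', toUInt64(42), 'unknown') AS region_name;
```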
diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md index 5c237eea8c7..76ca3ac978f 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md @@ -1,10 +1,13 @@ --- slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict sidebar_position: 40 -sidebar_label: Configuring an External Dictionary +sidebar_label: Configuring a Dictionary --- +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; -# Configuring an External Dictionary +# Configuring a Dictionary + + If dictionary is configured using xml file, than dictionary configuration has the following structure: diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md index 095fb6360cd..06b5b8a6746 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts.md @@ -3,18 +3,21 @@ slug: /en/sql-reference/dictionaries/external-dictionaries/external-dicts sidebar_position: 39 sidebar_label: General Description --- +import CloudDetails from '@site/docs/en/sql-reference/dictionaries/external-dictionaries/_snippet_dictionary_in_cloud.md'; -# External Dictionaries +# Dictionaries -You can add your own dictionaries from various data sources. The data source for a dictionary can be a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Sources for external dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md)”. + + +You can add your own dictionaries from various data sources. The source for a dictionary can be a ClickHouse table, a local text or executable file, an HTTP(s) resource, or another DBMS. For more information, see “[Dictionary Sources](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md)”. ClickHouse: - Fully or partially stores dictionaries in RAM. - Periodically updates dictionaries and dynamically loads missing values. In other words, dictionaries can be loaded dynamically. -- Allows to create external dictionaries with xml files or [DDL queries](../../../sql-reference/statements/create/dictionary.md). +- Allows creating dictionaries with xml files or [DDL queries](../../../sql-reference/statements/create/dictionary.md). -The configuration of external dictionaries can be located in one or more xml-files. The path to the configuration is specified in the [dictionaries_config](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_config) parameter. +The configuration of dictionaries can be located in one or more xml-files. The path to the configuration is specified in the [dictionaries_config](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_config) parameter. Dictionaries can be loaded at server startup or at first use, depending on the [dictionaries_lazy_load](../../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-dictionaries_lazy_load) setting. 
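Whichever loading mode is configured, a dictionary can also be refreshed on demand; for example, assuming a dictionary named `regions_dict` as sketched earlier:

``` sql
SYSTEM RELOAD DICTIONARY regions_dict;
```

`SYSTEM RELOAD DICTIONARIES` does the same for all dictionaries that have been loaded at least once.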
@@ -24,6 +27,20 @@ The [dictionaries](../../../operations/system-tables/dictionaries.md#system_tabl - Configuration parameters. - Metrics like amount of RAM allocated for the dictionary or a number of queries since the dictionary was successfully loaded. +## Creating a dictionary with a DDL query + +Dictionaries can be created with [DDL queries](../../../sql-reference/statements/create/dictionary.md), and this is the recommended method because with DDL created dictionaries: +- No additional records are added to server configuration files +- The dictionaries can be worked with as first-class entities, like tables or views +- Data can be read directly, using familiar SELECT rather than dictionary table functions +- The dictionaries can be easily renamed + +## Creating a dictionary with a configuration file + +:::note +Creating a dictionary with a configuration file is not applicable to ClickHouse Cloud. Please use DDL (see above), and create your dictionary as user `default`. +::: + The dictionary configuration file has the following format: ``` xml @@ -44,18 +61,17 @@ The dictionary configuration file has the following format: You can [configure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md) any number of dictionaries in the same file. -[DDL queries for dictionaries](../../../sql-reference/statements/create/dictionary.md) does not require any additional records in server configuration. They allow to work with dictionaries as first-class entities, like tables or views. :::note -You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../../../sql-reference/functions/other-functions.md) function). This functionality is not related to external dictionaries. +You can convert values for a small dictionary by describing it in a `SELECT` query (see the [transform](../../../sql-reference/functions/other-functions.md) function). This functionality is not related to dictionaries. 
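For example, a minimal sketch of that approach (the mapping values are arbitrary):

``` sql
SELECT transform(toUInt8(number), [0, 1], ['zero', 'one'], 'other') AS word
FROM numbers(4);
```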
::: ## See Also -- [Configuring an External Dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md) +- [Configuring a Dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict.md) - [Storing Dictionaries in Memory](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) - [Dictionary Updates](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) -- [Sources of External Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md) +- [Dictionary Sources](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md) - [Dictionary Key and Fields](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md) -- [Functions for Working with External Dictionaries](../../../sql-reference/functions/ext-dict-functions.md) +- [Functions for Working with Dictionaries](../../../sql-reference/functions/ext-dict-functions.md) diff --git a/docs/en/sql-reference/dictionaries/index.md b/docs/en/sql-reference/dictionaries/index.md index eccd1215e30..b6aa62bdb47 100644 --- a/docs/en/sql-reference/dictionaries/index.md +++ b/docs/en/sql-reference/dictionaries/index.md @@ -12,6 +12,6 @@ ClickHouse supports special functions for working with dictionaries that can be ClickHouse supports: -- [Built-in dictionaries](../../sql-reference/dictionaries/internal-dicts.md#internal_dicts) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md). -- [Plug-in (external) dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md#dicts-external-dicts) with a [set of functions](../../sql-reference/functions/ext-dict-functions.md). +- [Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md#dicts-external-dicts) with a [set of functions](../../sql-reference/functions/ext-dict-functions.md). +- [Embedded dictionaries](../../sql-reference/dictionaries/internal-dicts.md#internal_dicts) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md). diff --git a/docs/en/sql-reference/dictionaries/internal-dicts.md b/docs/en/sql-reference/dictionaries/internal-dicts.md index dbc12a576f7..f26c60880a4 100644 --- a/docs/en/sql-reference/dictionaries/internal-dicts.md +++ b/docs/en/sql-reference/dictionaries/internal-dicts.md @@ -1,10 +1,13 @@ --- slug: /en/sql-reference/dictionaries/internal-dicts sidebar_position: 39 -sidebar_label: Internal Dictionaries +sidebar_label: Embedded Dictionaries --- +import SelfManaged from '@site/docs/en/_snippets/_self_managed_only_no_roadmap.md'; -# Internal Dictionaries +# Embedded Dictionaries + + ClickHouse contains a built-in feature for working with a geobase. diff --git a/docs/en/sql-reference/functions/arithmetic-functions.md b/docs/en/sql-reference/functions/arithmetic-functions.md index 9059facb0c6..ece50591ef9 100644 --- a/docs/en/sql-reference/functions/arithmetic-functions.md +++ b/docs/en/sql-reference/functions/arithmetic-functions.md @@ -65,6 +65,11 @@ An exception is thrown when dividing by zero or when dividing a minimal negative Differs from [modulo](#modulo) in that it returns zero when the divisor is zero. +## positive_modulo(a, b) +Calculates the remainder when dividing `a` by `b`. Similar to function `modulo` except that `positive_modulo` always return non-negative number. 
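A quick comparison of the two functions on a negative dividend (a sketch; `modulo` follows the sign of the dividend, so the expected results are `-1` and `9`):

``` sql
SELECT modulo(-1, 10) AS m, positive_modulo(-1, 10) AS pm;
```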
+ +Notice that `positive_modulo` is 4-5 times slower than `modulo`. You should not use `positive_modulo` unless you want to get positive result and don't care about performance too much. + ## negate(a), -a operator Calculates a number with the reverse sign. The result is always signed. diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index f7ea2690b21..6156a823d58 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -550,7 +550,7 @@ Alias: `dateTrunc`. - Value, truncated to the specified part of date. -Type: [Datetime](../../sql-reference/data-types/datetime.md). +Type: [DateTime](../../sql-reference/data-types/datetime.md). **Example** @@ -881,7 +881,7 @@ now([timezone]) - Current date and time. -Type: [Datetime](../../sql-reference/data-types/datetime.md). +Type: [DateTime](../../sql-reference/data-types/datetime.md). **Example** @@ -932,7 +932,7 @@ now64([scale], [timezone]) - Current date and time with sub-second precision. -Type: [Datetime64](../../sql-reference/data-types/datetime64.md). +Type: [DateTime64](../../sql-reference/data-types/datetime64.md). **Example** @@ -968,7 +968,7 @@ nowInBlock([timezone]) - Current date and time at the moment of processing of each block of data. -Type: [Datetime](../../sql-reference/data-types/datetime.md). +Type: [DateTime](../../sql-reference/data-types/datetime.md). **Example** diff --git a/docs/en/sql-reference/functions/ext-dict-functions.md b/docs/en/sql-reference/functions/ext-dict-functions.md index 728e26d6958..1c33638da09 100644 --- a/docs/en/sql-reference/functions/ext-dict-functions.md +++ b/docs/en/sql-reference/functions/ext-dict-functions.md @@ -1,20 +1,20 @@ --- slug: /en/sql-reference/functions/ext-dict-functions sidebar_position: 58 -sidebar_label: External Dictionaries +sidebar_label: Dictionaries --- +# Functions for Working with Dictionaries + :::note For dictionaries created with [DDL queries](../../sql-reference/statements/create/dictionary.md), the `dict_name` parameter must be fully specified, like `.`. Otherwise, the current database is used. ::: -# Functions for Working with External Dictionaries - -For information on connecting and configuring external dictionaries, see [External dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). +For information on connecting and configuring dictionaries, see [Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). ## dictGet, dictGetOrDefault, dictGetOrNull -Retrieves values from an external dictionary. +Retrieves values from a dictionary. ``` sql dictGet('dict_name', attr_names, id_expr) @@ -52,7 +52,7 @@ Create a text file `ext-dict-test.csv` containing the following: The first column is `id`, the second column is `c1`. -Configure the external dictionary: +Configure the dictionary: ``` xml @@ -112,7 +112,7 @@ Create a text file `ext-dict-mult.csv` containing the following: The first column is `id`, the second is `c1`, the third is `c2`. 
-Configure the external dictionary: +Configure the dictionary: ``` xml @@ -185,7 +185,7 @@ INSERT INTO range_key_dictionary_source_table VALUES(2, toDate('2019-05-20'), to INSERT INTO range_key_dictionary_source_table VALUES(3, toDate('2019-05-20'), toDate('2019-05-20'), 'Third', 'Third'); ``` -Create the external dictionary: +Create the dictionary: ```sql CREATE DICTIONARY range_key_dictionary @@ -226,7 +226,7 @@ Result: **See Also** -- [External Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) +- [Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) ## dictHas diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index 430762a1885..bcd118ce0be 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -549,3 +549,33 @@ Result: │ 3.141592653589793 │ └───────────────────┘ ``` + + +## factorial(n) + +Computes the factorial of an integer value. It works with any native integer type including UInt(8|16|32|64) and Int(8|16|32|64). The return type is UInt64. + +The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. The maximum positive value for the input argument is 20, a value of 21 or greater will cause exception throw. + + +**Syntax** + +``` sql +factorial(n) +``` + +**Example** + +Query: + +``` sql +SELECT factorial(10); +``` + +Result: + +``` text +┌─factorial(10)─┐ +│ 3628800 │ +└───────────────┘ +``` diff --git a/docs/en/sql-reference/functions/random-functions.md b/docs/en/sql-reference/functions/random-functions.md index d77cc55e5eb..08f2620a009 100644 --- a/docs/en/sql-reference/functions/random-functions.md +++ b/docs/en/sql-reference/functions/random-functions.md @@ -24,6 +24,11 @@ Returns a pseudo-random UInt64 number, evenly distributed among all UInt64-type Uses a linear congruential generator. +## canonicalRand +The function generates pseudo random results with independent and identically distributed uniformly distributed values in [0, 1). + +Non-deterministic. Return type is Float64. + ## randConstant Produces a constant column with a random value. diff --git a/docs/en/sql-reference/functions/ym-dict-functions.md b/docs/en/sql-reference/functions/ym-dict-functions.md index 04df3db571e..f92ad5db2ad 100644 --- a/docs/en/sql-reference/functions/ym-dict-functions.md +++ b/docs/en/sql-reference/functions/ym-dict-functions.md @@ -131,7 +131,7 @@ Type: `UInt32`. ### regionToPopulation(id\[, geobase\]) Gets the population for a region. -The population can be recorded in files with the geobase. See the section “External dictionaries”. +The population can be recorded in files with the geobase. See the section “Dictionaries”. If the population is not recorded for the region, it returns 0. In the geobase, the population might be recorded for child regions, but not for parent regions. 
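For example (a sketch that assumes the embedded geobase is configured; in the default geobase the id 213 denotes Moscow, and the returned value depends on the geobase files in use):

``` sql
SELECT regionToPopulation(toUInt32(213)) AS population;
```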
diff --git a/docs/en/sql-reference/statements/alter/projection.md b/docs/en/sql-reference/statements/alter/projection.md index 7913c7cb7e4..99cb8fb8fd1 100644 --- a/docs/en/sql-reference/statements/alter/projection.md +++ b/docs/en/sql-reference/statements/alter/projection.md @@ -2,9 +2,134 @@ slug: /en/sql-reference/statements/alter/projection sidebar_position: 49 sidebar_label: PROJECTION -title: "Manipulating Projections" +title: "Projections" --- +Projections store data in a format that optimizes query execution, this feature is useful for: +- Running queries on a column that is not a part of the primary key +- Pre-aggregating columns, it will reduce both computation and IO + +You can define one or more projections for a table, and during the query analysis the projection with the least data to scan will be selected by ClickHouse without modifying the query provided by the user. + +## Example filtering without using primary keys + +Creating the table: +``` +CREATE TABLE visits_order +( + `user_id` UInt64, + `user_name` String, + `pages_visited` Nullable(Float64), + `user_agent` String +) +ENGINE = MergeTree() +PRIMARY KEY user_agent +``` +Using `ALTER TABLE`, we could add the Projection to an existing table: +``` +ALTER TABLE visits_order ADD PROJECTION user_name_projection ( +SELECT +* +ORDER BY user_name +) + +ALTER TABLE visits_order MATERIALIZE PROJECTION user_name_projection +``` +Inserting the data: +``` +INSERT INTO visits_order SELECT + number, + 'test', + 1.5 * (number / 2), + 'Android' +FROM numbers(1, 100); +``` + +The Projection will allow us to filter by `user_name` fast even if in the original Table `user_name` was not defined as a `PRIMARY_KEY`. +At query time ClickHouse determined that less data will be processed if the projection is used, as the data is ordered by `user_name`. +``` +SELECT + * +FROM visits_order +WHERE user_name='test' +LIMIT 2 +``` + +To verify that a query is using the projection, we could review the `system.query_log` table. On the `projections` field we have the name of the projection used or empty if none has been used: +``` +SELECT query, projections FROM system.query_log WHERE query_id='' +``` + +## Example pre-aggregation query + +Creating the table with the Projection: +``` +CREATE TABLE visits +( + `user_id` UInt64, + `user_name` String, + `pages_visited` Nullable(Float64), + `user_agent` String, + PROJECTION projection_visits_by_user + ( + SELECT + user_agent, + sum(pages_visited) + GROUP BY user_id, user_agent + ) +) +ENGINE = MergeTree() +ORDER BY user_agent +``` +Inserting the data: +``` +INSERT INTO visits SELECT + number, + 'test', + 1.5 * (number / 2), + 'Android' +FROM numbers(1, 100); +``` +``` +INSERT INTO visits SELECT + number, + 'test', + 1. * (number / 2), + 'IOS' +FROM numbers(100, 500); +``` +We will execute a first query using `GROUP BY` using the field `user_agent`, this query will not use the projection defined as the pre-aggregation does not match. +``` +SELECT + user_agent, + count(DISTINCT user_id) +FROM visits +GROUP BY user_agent +``` + +To use the projection we could execute queries that select part of, or all of the pre-aggregation and `GROUP BY` fields. +``` +SELECT + user_agent +FROM visits +WHERE user_id > 50 AND user_id < 150 +GROUP BY user_agent +``` +``` +SELECT + user_agent, + sum(pages_visited) +FROM visits +GROUP BY user_id +``` + +As mentioned before, we could review the `system.query_log` table. 
On the `projections` field we have the name of the projection used or empty if none has been used: +``` +SELECT query, projections FROM system.query_log WHERE query_id='' +``` + +# Manipulating Projections + The following operations with [projections](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#projections) are available: ## ADD PROJECTION diff --git a/docs/en/sql-reference/statements/create/dictionary.md b/docs/en/sql-reference/statements/create/dictionary.md index b24ff480c2d..a470b071971 100644 --- a/docs/en/sql-reference/statements/create/dictionary.md +++ b/docs/en/sql-reference/statements/create/dictionary.md @@ -5,9 +5,9 @@ sidebar_label: DICTIONARY title: "CREATE DICTIONARY" --- -Creates a new [external dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) with given [structure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md), [source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md), [layout](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) and [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). +Creates a new [dictionary](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) with given [structure](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md), [source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md), [layout](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) and [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md). -**Syntax** +## Syntax ``` sql CREATE [OR REPLACE] DICTIONARY [IF NOT EXISTS] [db.]dictionary_name [ON CLUSTER cluster] @@ -25,17 +25,21 @@ SETTINGS(setting_name = setting_value, setting_name = setting_value, ...) COMMENT 'Comment' ``` -External dictionary structure consists of attributes. Dictionary attributes are specified similarly to table columns. The only required attribute property is its type, all other properties may have default values. +The dictionary structure consists of attributes. Dictionary attributes are specified similarly to table columns. The only required attribute property is its type, all other properties may have default values. `ON CLUSTER` clause allows creating dictionary on a cluster, see [Distributed DDL](../../../sql-reference/distributed-ddl.md). Depending on dictionary [layout](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md) one or more attributes can be specified as dictionary keys. -For more information, see [External Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) section. +## SOURCE -You can add a comment to the dictionary when you creating it using `COMMENT` clause. 
+The source for a dictionary can be a: +- table in the current ClickHouse service +- table in a remote ClickHouse service +- file available by HTTP(S) +- another database -**Example** +### Create a dictionary from a table in the current ClickHouse service Input table `source_table`: @@ -49,51 +53,81 @@ Input table `source_table`: Creating the dictionary: ``` sql -CREATE DICTIONARY dictionary_with_comment +CREATE DICTIONARY id_value_dictionary ( id UInt64, value String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'source_table')) +SOURCE(CLICKHOUSE(TABLE 'source_table')) LAYOUT(FLAT()) LIFETIME(MIN 0 MAX 1000) -COMMENT 'The temporary dictionary'; ``` Output the dictionary: ``` sql -SHOW CREATE DICTIONARY dictionary_with_comment; +SHOW CREATE DICTIONARY id_value_dictionary; ``` -```text -┌─statement───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐ -│ CREATE DICTIONARY default.dictionary_with_comment +```response +CREATE DICTIONARY default.id_value_dictionary ( `id` UInt64, `value` String ) PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() TABLE 'source_table')) +SOURCE(CLICKHOUSE(TABLE 'source_table')) LIFETIME(MIN 0 MAX 1000) LAYOUT(FLAT()) -COMMENT 'The temporary dictionary' │ -└─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` -Output the comment to dictionary: +### Create a dictionary from a table in a remote ClickHouse service + +Input table (in the remote ClickHouse service) `source_table`: + +``` text +┌─id─┬─value──┐ +│ 1 │ First │ +│ 2 │ Second │ +└────┴────────┘ +``` + +Creating the dictionary: ``` sql -SELECT comment FROM system.dictionaries WHERE name == 'dictionary_with_comment' AND database == currentDatabase(); +CREATE DICTIONARY id_value_dictionary +( + id UInt64, + value String +) +PRIMARY KEY id +SOURCE(CLICKHOUSE(HOST 'HOSTNAME' PORT 9000 USER 'default' PASSWORD 'PASSWORD' TABLE 'source_table' DB 'default')) +LAYOUT(FLAT()) +LIFETIME(MIN 0 MAX 1000) ``` -```text -┌─comment──────────────────┐ -│ The temporary dictionary │ -└──────────────────────────┘ +### Create a dictionary from a file available by HTTP(S) + +```sql +statement: CREATE DICTIONARY default.taxi_zone_dictionary +( + `LocationID` UInt16 DEFAULT 0, + `Borough` String, + `Zone` String, + `service_zone` String +) +PRIMARY KEY LocationID +SOURCE(HTTP(URL 'https://datasets-documentation.s3.eu-west-3.amazonaws.com/nyc-taxi/taxi_zone_lookup.csv' FORMAT 'CSVWithNames')) +LIFETIME(MIN 0 MAX 0) +LAYOUT(HASHED()) ``` +### Create a dictionary from another database + +Please see the details in [Dictionary sources](/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md/#dbms). + **See Also** -- [system.dictionaries](../../../operations/system-tables/dictionaries.md) — This table contains information about [external dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). +- For more information, see the [Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) section. 
+- [system.dictionaries](../../../operations/system-tables/dictionaries.md) — This table contains information about [Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). diff --git a/docs/en/sql-reference/statements/select/join.md b/docs/en/sql-reference/statements/select/join.md index 1890ff081d8..62d3e9fd69a 100644 --- a/docs/en/sql-reference/statements/select/join.md +++ b/docs/en/sql-reference/statements/select/join.md @@ -282,7 +282,7 @@ Each time a query is run with the same `JOIN`, the subquery is run again because In some cases, it is more efficient to use [IN](../../../sql-reference/operators/in.md) instead of `JOIN`. -If you need a `JOIN` for joining with dimension tables (these are relatively small tables that contain dimension properties, such as names for advertising campaigns), a `JOIN` might not be very convenient due to the fact that the right table is re-accessed for every query. For such cases, there is an “external dictionaries” feature that you should use instead of `JOIN`. For more information, see the [External dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) section. +If you need a `JOIN` for joining with dimension tables (these are relatively small tables that contain dimension properties, such as names for advertising campaigns), a `JOIN` might not be very convenient due to the fact that the right table is re-accessed for every query. For such cases, there is a “dictionaries” feature that you should use instead of `JOIN`. For more information, see the [Dictionaries](../../../sql-reference/dictionaries/external-dictionaries/external-dicts.md) section. ### Memory Limitations diff --git a/docs/en/sql-reference/statements/set-role.md b/docs/en/sql-reference/statements/set-role.md index bf998d7841e..e017160623e 100644 --- a/docs/en/sql-reference/statements/set-role.md +++ b/docs/en/sql-reference/statements/set-role.md @@ -41,7 +41,7 @@ Purge default roles from a user: SET DEFAULT ROLE NONE TO user ``` -Set all the granted roles as default excepting some of them: +Set all the granted roles as default except for specific roles `role1` and `role2`: ``` sql SET DEFAULT ROLE ALL EXCEPT role1, role2 TO user diff --git a/docs/en/sql-reference/statements/show.md b/docs/en/sql-reference/statements/show.md index 87248bb115b..0efad3d460f 100644 --- a/docs/en/sql-reference/statements/show.md +++ b/docs/en/sql-reference/statements/show.md @@ -198,7 +198,7 @@ Result: ## SHOW DICTIONARIES -Displays a list of [external dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). +Displays a list of [Dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts.md). 
``` sql SHOW DICTIONARIES [FROM ] [LIKE ''] [LIMIT ] [INTO OUTFILE ] [FORMAT ] diff --git a/docs/en/sql-reference/table-functions/mysql.md b/docs/en/sql-reference/table-functions/mysql.md index f867cda45bd..de1567c052e 100644 --- a/docs/en/sql-reference/table-functions/mysql.md +++ b/docs/en/sql-reference/table-functions/mysql.md @@ -110,5 +110,5 @@ SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123'); **See Also** - [The ‘MySQL’ table engine](../../engines/table-engines/integrations/mysql.md) -- [Using MySQL as a source of external dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) +- [Using MySQL as a dictionary source](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-mysql) diff --git a/docs/en/sql-reference/table-functions/odbc.md b/docs/en/sql-reference/table-functions/odbc.md index f8c46fe44d8..7e13424bc8a 100644 --- a/docs/en/sql-reference/table-functions/odbc.md +++ b/docs/en/sql-reference/table-functions/odbc.md @@ -101,5 +101,5 @@ SELECT * FROM odbc('DSN=mysqlconn', 'test', 'test') ## See Also -- [ODBC external dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) +- [ODBC dictionaries](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-odbc) - [ODBC table engine](../../engines/table-engines/integrations/odbc.md). diff --git a/docs/en/sql-reference/table-functions/postgresql.md b/docs/en/sql-reference/table-functions/postgresql.md index 367edbe9a00..e98869de739 100644 --- a/docs/en/sql-reference/table-functions/postgresql.md +++ b/docs/en/sql-reference/table-functions/postgresql.md @@ -130,6 +130,6 @@ CREATE TABLE pg_table_schema_with_dots (a UInt32) **See Also** - [The PostgreSQL table engine](../../engines/table-engines/integrations/postgresql.md) -- [Using PostgreSQL as a source of external dictionary](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) +- [Using PostgreSQL as a dictionary source](../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql) [Original article](https://clickhouse.com/docs/en/sql-reference/table-functions/postgresql/) diff --git a/docs/ru/operations/system-tables/crash-log.md b/docs/ru/operations/system-tables/crash-log.md index 4ca8be5a199..68148fec6bd 100644 --- a/docs/ru/operations/system-tables/crash-log.md +++ b/docs/ru/operations/system-tables/crash-log.md @@ -7,8 +7,8 @@ slug: /ru/operations/system-tables/crash-log Колонки: -- `event_date` ([Datetime](../../sql-reference/data-types/datetime.md)) — Дата события. -- `event_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — Время события. +- `event_date` ([DateTime](../../sql-reference/data-types/datetime.md)) — Дата события. +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Время события. - `timestamp_ns` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Время события с наносекундами. - `signal` ([Int32](../../sql-reference/data-types/int-uint.md)) — Номер сигнала, пришедшего в поток. - `thread_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Идентификатор треда. 
diff --git a/docs/ru/operations/system-tables/mutations.md b/docs/ru/operations/system-tables/mutations.md index 20e4ebfdaf1..bb0bd44ed7a 100644 --- a/docs/ru/operations/system-tables/mutations.md +++ b/docs/ru/operations/system-tables/mutations.md @@ -15,7 +15,7 @@ slug: /ru/operations/system-tables/mutations - `command` ([String](../../sql-reference/data-types/string.md)) — команда мутации (часть запроса после `ALTER TABLE [db.]table`). -- `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — дата и время создания мутации. +- `create_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — дата и время создания мутации. - `block_numbers.partition_id` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — Для мутаций реплицированных таблиц массив содержит содержит номера партиций (по одной записи для каждой партиции). Для мутаций нереплицированных таблиц массив пустой. @@ -39,7 +39,7 @@ slug: /ru/operations/system-tables/mutations - `latest_failed_part` ([String](../../sql-reference/data-types/string.md)) — имя последнего куска, мутация которого не удалась. -- `latest_fail_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — дата и время последней ошибки мутации. +- `latest_fail_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — дата и время последней ошибки мутации. - `latest_fail_reason` ([String](../../sql-reference/data-types/string.md)) — причина последней ошибки мутации. diff --git a/docs/ru/operations/system-tables/replication_queue.md b/docs/ru/operations/system-tables/replication_queue.md index 25de174e98f..60d42133153 100644 --- a/docs/ru/operations/system-tables/replication_queue.md +++ b/docs/ru/operations/system-tables/replication_queue.md @@ -29,7 +29,7 @@ slug: /ru/operations/system-tables/replication_queue - `MUTATE_PART` — применить одну или несколько мутаций к куску. - `ALTER_METADATA` — применить изменения структуры таблицы в результате запросов с выражением `ALTER`. -- `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — дата и время отправки задачи на выполнение. +- `create_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — дата и время отправки задачи на выполнение. - `required_quorum` ([UInt32](../../sql-reference/data-types/int-uint.md)) — количество реплик, ожидающих завершения задачи, с подтверждением о завершении. Этот столбец актуален только для задачи `GET_PARTS`. @@ -47,13 +47,13 @@ slug: /ru/operations/system-tables/replication_queue - `last_exception` ([String](../../sql-reference/data-types/string.md)) — текст сообщения о последней возникшей ошибке, если таковые имеются. -- `last_attempt_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — дата и время последней попытки выполнить задачу. +- `last_attempt_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — дата и время последней попытки выполнить задачу. - `num_postponed` ([UInt32](../../sql-reference/data-types/int-uint.md)) — количество отложенных задач. - `postpone_reason` ([String](../../sql-reference/data-types/string.md)) — причина, по которой была отложена задача. -- `last_postpone_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — дата и время, когда была отложена задача в последний раз. +- `last_postpone_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — дата и время, когда была отложена задача в последний раз. - `merge_type` ([String](../../sql-reference/data-types/string.md)) — тип текущего слияния. 
Пусто, если это мутация. diff --git a/docs/ru/sql-reference/data-types/date32.md b/docs/ru/sql-reference/data-types/date32.md index fcb7d688c20..958b8e9763e 100644 --- a/docs/ru/sql-reference/data-types/date32.md +++ b/docs/ru/sql-reference/data-types/date32.md @@ -6,7 +6,7 @@ sidebar_label: Date32 # Date32 {#data_type-datetime32} -Дата. Поддерживается такой же диапазон дат, как для типа [Datetime64](../../sql-reference/data-types/datetime64.md). Значение хранится в четырех байтах и соответствует числу дней с 1900-01-01 по 2299-12-31. +Дата. Поддерживается такой же диапазон дат, как для типа [DateTime64](../../sql-reference/data-types/datetime64.md). Значение хранится в четырех байтах и соответствует числу дней с 1900-01-01 по 2299-12-31. **Пример** diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index f18c2ea258a..f430f5cae51 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -602,7 +602,7 @@ date_trunc(unit, value[, timezone]) - Дата и время, отсеченные до указанной части. -Тип: [Datetime](../../sql-reference/data-types/datetime.md). +Тип: [DateTime](../../sql-reference/data-types/datetime.md). **Примеры** @@ -913,7 +913,7 @@ now([timezone]) - Текущие дата и время. -Тип: [Datetime](../../sql-reference/data-types/datetime.md). +Тип: [DateTime](../../sql-reference/data-types/datetime.md). **Пример** diff --git a/docs/zh/operations/system-tables/crash-log.md b/docs/zh/operations/system-tables/crash-log.md index d0ed406fa0c..06087a34f35 100644 --- a/docs/zh/operations/system-tables/crash-log.md +++ b/docs/zh/operations/system-tables/crash-log.md @@ -7,8 +7,8 @@ slug: /zh/operations/system-tables/crash-log 列信息: -- `event_date` ([Datetime](../../sql-reference/data-types/datetime.md)) — 事件日期. -- `event_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — 事件时间. +- `event_date` ([DateTime](../../sql-reference/data-types/datetime.md)) — 事件日期. +- `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — 事件时间. - `timestamp_ns` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 以纳秒为单位的事件时间戳. - `signal` ([Int32](../../sql-reference/data-types/int-uint.md)) — 信号编号. - `thread_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — 线程ID. 
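For reference, a typical query over these columns might look like this (a sketch; the table stays empty unless the server has actually crashed):

``` sql
SELECT event_time, signal, thread_id, query_id
FROM system.crash_log
ORDER BY event_time DESC
LIMIT 5;
```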
diff --git a/docs/zh/operations/system-tables/mutations.md b/docs/zh/operations/system-tables/mutations.md index dbce0a59063..f5f82c1717a 100644 --- a/docs/zh/operations/system-tables/mutations.md +++ b/docs/zh/operations/system-tables/mutations.md @@ -15,7 +15,7 @@ slug: /zh/operations/system-tables/mutations - `command` ([String](../../sql-reference/data-types/string.md)) — mutation命令字符串(`ALTER TABLE [db.]table`语句之后的部分)。 -- `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — mutation命令提交执行的日期和时间。 +- `create_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — mutation命令提交执行的日期和时间。 - `block_numbers.partition_id` ([Array](../../sql-reference/data-types/array.md)([String](../../sql-reference/data-types/string.md))) — 对于复制表的mutation,该数组包含分区的ID(每个分区都有一条记录)。对于非复制表的mutation,该数组为空。 @@ -39,7 +39,7 @@ slug: /zh/operations/system-tables/mutations - `latest_failed_part`([String](../../sql-reference/data-types/string.md)) — 最近不能mutation的part的名称。 -- `latest_fail_time`([Datetime](../../sql-reference/data-types/datetime.md)) — 最近的一个mutation失败的时间。 +- `latest_fail_time`([DateTime](../../sql-reference/data-types/datetime.md)) — 最近的一个mutation失败的时间。 - `latest_fail_reason`([String](../../sql-reference/data-types/string.md)) — 导致最近part的mutation失败的异常消息。 diff --git a/docs/zh/operations/system-tables/replication_queue.md b/docs/zh/operations/system-tables/replication_queue.md index e82569e378d..95a183cf9f7 100644 --- a/docs/zh/operations/system-tables/replication_queue.md +++ b/docs/zh/operations/system-tables/replication_queue.md @@ -29,7 +29,7 @@ slug: /zh/operations/system-tables/replication_queue - `MUTATE_PART` — 对分片应用一个或多个突变. - `ALTER_METADATA` — 根据全局 /metadata 和 /columns 路径应用alter修改. -- `create_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — 提交任务执行的日期和时间. +- `create_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — 提交任务执行的日期和时间. - `required_quorum` ([UInt32](../../sql-reference/data-types/int-uint.md)) — 等待任务完成并确认完成的副本数. 此列仅与 `GET_PARTS` 任务相关. @@ -47,13 +47,13 @@ slug: /zh/operations/system-tables/replication_queue - `last_exception` ([String](../../sql-reference/data-types/string.md)) — 发生的最后一个错误的短信(如果有). -- `last_attempt_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — 上次尝试任务的日期和时间. +- `last_attempt_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — 上次尝试任务的日期和时间. - `num_postponed` ([UInt32](../../sql-reference/data-types/int-uint.md)) — 延期任务数. - `postpone_reason` ([String](../../sql-reference/data-types/string.md)) — 任务延期的原因. -- `last_postpone_time` ([Datetime](../../sql-reference/data-types/datetime.md)) — 上次推迟任务的日期和时间. +- `last_postpone_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — 上次推迟任务的日期和时间. - `merge_type` ([String](../../sql-reference/data-types/string.md)) — 当前合并的类型. 如果是突变则为空. 
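A common use of these columns is to look for queue entries that keep getting postponed; a sketch (the filter is arbitrary):

``` sql
SELECT database, table, type, num_tries, num_postponed, postpone_reason, last_postpone_time
FROM system.replication_queue
WHERE num_postponed > 0
ORDER BY last_postpone_time DESC;
```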
diff --git a/docs/zh/sql-reference/ansi.md b/docs/zh/sql-reference/ansi.md index 9cf335f89ef..cdccee0084f 100644 --- a/docs/zh/sql-reference/ansi.md +++ b/docs/zh/sql-reference/ansi.md @@ -152,7 +152,7 @@ sidebar_label: "ANSI\u517C\u5BB9\u6027" | F051-02 | TIME(时间)数据类型(并支持用于表达时间的字面量),小数秒精度至少为0 | 否 {.text-danger} | | | F051-03 | 时间戳数据类型(并支持用于表达时间戳的字面量),小数秒精度至少为0和6 | 是 {.text-danger} | | | F051-04 | 日期、时间和时间戳数据类型的比较谓词 | 是 {.text-success} | | -| F051-05 | Datetime 类型和字符串形式表达的时间之间的显式转换 | 是 {.text-success} | | +| F051-05 | DateTime 类型和字符串形式表达的时间之间的显式转换 | 是 {.text-success} | | | F051-06 | CURRENT_DATE | 否 {.text-danger} | 使用`today()`替代 | | F051-07 | LOCALTIME | 否 {.text-danger} | 使用`now()`替代 | | F051-08 | LOCALTIMESTAMP | 否 {.text-danger} | | diff --git a/docs/zh/sql-reference/data-types/datetime64.md b/docs/zh/sql-reference/data-types/datetime64.md index ee2d7a6f258..24888645cba 100644 --- a/docs/zh/sql-reference/data-types/datetime64.md +++ b/docs/zh/sql-reference/data-types/datetime64.md @@ -6,7 +6,7 @@ sidebar_position: 49 sidebar_label: DateTime64 --- -# Datetime64 {#data_type-datetime64} +# DateTime64 {#data_type-datetime64} 此类型允许以日期(date)加时间(time)的形式来存储一个时刻的时间值,具有定义的亚秒精度 diff --git a/docs/zh/sql-reference/functions/date-time-functions.md b/docs/zh/sql-reference/functions/date-time-functions.md index c666d01d15f..4bbd0e5b69b 100644 --- a/docs/zh/sql-reference/functions/date-time-functions.md +++ b/docs/zh/sql-reference/functions/date-time-functions.md @@ -539,7 +539,7 @@ date_trunc(unit, value[, timezone]) - 按指定的单位向前取整后的DateTime。 -类型: [Datetime](../../sql-reference/data-types/datetime.md). +类型: [DateTime](../../sql-reference/data-types/datetime.md). **示例** @@ -850,7 +850,7 @@ now([timezone]) - 当前日期和时间。 -类型: [Datetime](../../sql-reference/data-types/datetime.md). +类型: [DateTime](../../sql-reference/data-types/datetime.md). **示例** diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index 93136df2a5b..6e289b57845 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -243,6 +243,7 @@ try registerAggregateFunctions(); processConfig(); + initTtyBuffer(toProgressOption(config().getString("progress", "default"))); /// Includes delayed_interactive. 
if (is_interactive) @@ -1088,8 +1089,6 @@ void Client::processConfig() } else { - std::string progress = config().getString("progress", "tty"); - need_render_progress = (Poco::icompare(progress, "off") && Poco::icompare(progress, "no") && Poco::icompare(progress, "false") && Poco::icompare(progress, "0")); echo_queries = config().getBool("echo", false); ignore_error = config().getBool("ignore-error", false); diff --git a/programs/git-import/git-import.cpp b/programs/git-import/git-import.cpp index 030ddd263fa..9e464164da6 100644 --- a/programs/git-import/git-import.cpp +++ b/programs/git-import/git-import.cpp @@ -351,7 +351,7 @@ struct LineChange ++pos; } - indent = std::max(255U, num_spaces); + indent = std::min(255U, num_spaces); line.assign(pos, end); if (pos == end) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index e1d03b40b66..a1bf324f482 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -149,19 +149,7 @@ std::string getUserName(uid_t user_id) Poco::Net::SocketAddress Keeper::socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure) const { auto address = makeSocketAddress(host, port, &logger()); -#if !defined(POCO_CLICKHOUSE_PATCH) || POCO_VERSION < 0x01090100 - if (secure) - /// Bug in old (<1.9.1) poco, listen() after bind() with reusePort param will fail because have no implementation in SecureServerSocketImpl - /// https://github.com/pocoproject/poco/pull/2257 - socket.bind(address, /* reuseAddress = */ true); - else -#endif -#if POCO_VERSION < 0x01080000 - socket.bind(address, /* reuseAddress = */ true); -#else socket.bind(address, /* reuseAddress = */ true, /* reusePort = */ config().getBool("listen_reuse_port", false)); -#endif - socket.listen(/* backlog = */ config().getUInt("listen_backlog", 64)); return address; diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 3ac9c1e7c37..4c07fa0a02d 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -118,6 +119,8 @@ void LocalServer::initialize(Poco::Util::Application & self) config().getUInt("max_io_thread_pool_size", 100), config().getUInt("max_io_thread_pool_free_size", 0), config().getUInt("io_thread_pool_queue_size", 10000)); + + NamedCollectionFactory::instance().initialize(config()); } @@ -414,6 +417,8 @@ try registerFormats(); processConfig(); + initTtyBuffer(toProgressOption(config().getString("progress", "default"))); + applyCmdSettings(global_context); if (is_interactive) @@ -489,8 +494,6 @@ void LocalServer::processConfig() } else { - std::string progress = config().getString("progress", "tty"); - need_render_progress = (Poco::icompare(progress, "off") && Poco::icompare(progress, "no") && Poco::icompare(progress, "false") && Poco::icompare(progress, "0")); echo_queries = config().hasOption("echo") || config().hasOption("verbose"); ignore_error = config().getBool("ignore-error", false); is_multiquery = true; diff --git a/programs/server/MetricsTransmitter.cpp b/programs/server/MetricsTransmitter.cpp index 8ad519ba5aa..f7829a49a39 100644 --- a/programs/server/MetricsTransmitter.cpp +++ b/programs/server/MetricsTransmitter.cpp @@ -123,7 +123,7 @@ void MetricsTransmitter::transmit(std::vector & prev_count { for (const auto & name_value : async_metrics_values) { - key_vals.emplace_back(asynchronous_metrics_path_prefix + name_value.first, name_value.second); + 
key_vals.emplace_back(asynchronous_metrics_path_prefix + name_value.first, name_value.second.value); } } diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index b412b579539..a5321997779 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -341,19 +342,7 @@ Poco::Net::SocketAddress Server::socketBindListen( [[maybe_unused]] bool secure) const { auto address = makeSocketAddress(host, port, &logger()); -#if !defined(POCO_CLICKHOUSE_PATCH) || POCO_VERSION < 0x01090100 - if (secure) - /// Bug in old (<1.9.1) poco, listen() after bind() with reusePort param will fail because have no implementation in SecureServerSocketImpl - /// https://github.com/pocoproject/poco/pull/2257 - socket.bind(address, /* reuseAddress = */ true); - else -#endif -#if POCO_VERSION < 0x01080000 - socket.bind(address, /* reuseAddress = */ true); -#else socket.bind(address, /* reuseAddress = */ true, /* reusePort = */ config.getBool("listen_reuse_port", false)); -#endif - /// If caller requests any available port from the OS, discover it after binding. if (port == 0) { @@ -732,6 +721,8 @@ int Server::main(const std::vector & /*args*/) config().getUInt("max_io_thread_pool_free_size", 0), config().getUInt("io_thread_pool_queue_size", 10000)); + NamedCollectionFactory::instance().initialize(config()); + /// Initialize global local cache for remote filesystem. if (config().has("local_cache_for_remote_fs")) { @@ -805,41 +796,43 @@ int Server::main(const std::vector & /*args*/) /// that are interpreted (not executed) but can alter the behaviour of the program as well. /// Please keep the below log messages in-sync with the ones in daemon/BaseDaemon.cpp - - String calculated_binary_hash = getHashOfLoadedBinaryHex(); - if (stored_binary_hash.empty()) { - LOG_WARNING(log, "Integrity check of the executable skipped because the reference checksum could not be read." - " (calculated checksum: {})", calculated_binary_hash); - } - else if (calculated_binary_hash == stored_binary_hash) - { - LOG_INFO(log, "Integrity check of the executable successfully passed (checksum: {})", calculated_binary_hash); + LOG_WARNING(log, "Integrity check of the executable skipped because the reference checksum could not be read."); } else { - /// If program is run under debugger, ptrace will fail. - if (ptrace(PTRACE_TRACEME, 0, nullptr, nullptr) == -1) + String calculated_binary_hash = getHashOfLoadedBinaryHex(); + if (calculated_binary_hash == stored_binary_hash) { - /// Program is run under debugger. Modification of it's binary image is ok for breakpoints. - global_context->addWarningMessage( - fmt::format("Server is run under debugger and its binary image is modified (most likely with breakpoints).", - calculated_binary_hash) - ); + LOG_INFO(log, "Integrity check of the executable successfully passed (checksum: {})", calculated_binary_hash); } else { - throw Exception(ErrorCodes::CORRUPTED_DATA, - "Calculated checksum of the executable ({0}) does not correspond" - " to the reference checksum stored in the executable ({1})." - " This may indicate one of the following:" - " - the executable {2} was changed just after startup;" - " - the executable {2} was corrupted on disk due to faulty hardware;" - " - the loaded executable was corrupted in memory due to faulty hardware;" - " - the file {2} was intentionally modified;" - " - a logical error in the code." 
- , calculated_binary_hash, stored_binary_hash, executable_path); + /// If program is run under debugger, ptrace will fail. + if (ptrace(PTRACE_TRACEME, 0, nullptr, nullptr) == -1) + { + /// Program is run under debugger. Modification of it's binary image is ok for breakpoints. + global_context->addWarningMessage(fmt::format( + "Server is run under debugger and its binary image is modified (most likely with breakpoints).", + calculated_binary_hash)); + } + else + { + throw Exception( + ErrorCodes::CORRUPTED_DATA, + "Calculated checksum of the executable ({0}) does not correspond" + " to the reference checksum stored in the executable ({1})." + " This may indicate one of the following:" + " - the executable {2} was changed just after startup;" + " - the executable {2} was corrupted on disk due to faulty hardware;" + " - the loaded executable was corrupted in memory due to faulty hardware;" + " - the file {2} was intentionally modified;" + " - a logical error in the code.", + calculated_binary_hash, + stored_binary_hash, + executable_path); + } } } } @@ -1279,6 +1272,7 @@ int Server::main(const std::vector & /*args*/) #if USE_SSL CertificateReloader::instance().tryLoad(*config); #endif + NamedCollectionFactory::instance().reload(*config); ProfileEvents::increment(ProfileEvents::MainConfigLoads); /// Must be the last. @@ -1486,11 +1480,6 @@ int Server::main(const std::vector & /*args*/) #endif SCOPE_EXIT({ - /// Stop reloading of the main config. This must be done before `global_context->shutdown()` because - /// otherwise the reloading may pass a changed config to some destroyed parts of ContextSharedPart. - main_config_reloader.reset(); - access_control.stopPeriodicReloading(); - async_metrics.stop(); /** Ask to cancel background jobs all table engines, @@ -1789,10 +1778,17 @@ int Server::main(const std::vector & /*args*/) SCOPE_EXIT_SAFE({ LOG_DEBUG(log, "Received termination signal."); - LOG_DEBUG(log, "Waiting for current connections to close."); + + /// Stop reloading of the main config. This must be done before everything else because it + /// can try to access/modify already deleted objects. + /// E.g. it can recreate new servers or it may pass a changed config to some destroyed parts of ContextSharedPart. + main_config_reloader.reset(); + access_control.stopPeriodicReloading(); is_cancelled = true; + LOG_DEBUG(log, "Waiting for current connections to close."); + size_t current_connections = 0; { std::lock_guard lock(servers_lock); diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 5c85c93c98f..8263f50d1b0 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -130,6 +130,7 @@ enum class AccessType M(SHOW_ROW_POLICIES, "SHOW POLICIES, SHOW CREATE ROW POLICY, SHOW CREATE POLICY", TABLE, SHOW_ACCESS) \ M(SHOW_QUOTAS, "SHOW CREATE QUOTA", GLOBAL, SHOW_ACCESS) \ M(SHOW_SETTINGS_PROFILES, "SHOW PROFILES, SHOW CREATE SETTINGS PROFILE, SHOW CREATE PROFILE", GLOBAL, SHOW_ACCESS) \ + M(SHOW_NAMED_COLLECTIONS, "SHOW NAMED COLLECTIONS", GLOBAL, SHOW_ACCESS) \ M(SHOW_ACCESS, "", GROUP, ACCESS_MANAGEMENT) \ M(ACCESS_MANAGEMENT, "", GROUP, ALL) \ \ diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index 7584dd2f2b1..f4c8acbebab 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -465,6 +465,17 @@ std::shared_ptr ContextAccess::getAccessRightsWithImplicit() template bool ContextAccess::checkAccessImplHelper(AccessFlags flags, const Args &... 
args) const { + if (user_was_dropped) + { + /// If the current user has been dropped we always throw an exception (even if `throw_if_denied` is false) + /// because dropping of the current user is considered as a situation which is exceptional enough to stop + /// query execution. + throw Exception(getUserName() + ": User has been dropped", ErrorCodes::UNKNOWN_USER); + } + + if (is_full_access) + return true; + auto access_granted = [&] { if (trace_log) @@ -483,12 +494,6 @@ bool ContextAccess::checkAccessImplHelper(AccessFlags flags, const Args &... arg return false; }; - if (is_full_access) - return true; - - if (user_was_dropped) - return access_denied("User has been dropped", ErrorCodes::UNKNOWN_USER); - if (flags & AccessType::CLUSTER && !access_control->doesOnClusterQueriesRequireClusterGrant()) flags &= ~AccessType::CLUSTER; diff --git a/src/AggregateFunctions/AggregateFunctionAggThrow.cpp b/src/AggregateFunctions/AggregateFunctionAggThrow.cpp index ae1bbfddf75..432b1f39f84 100644 --- a/src/AggregateFunctions/AggregateFunctionAggThrow.cpp +++ b/src/AggregateFunctions/AggregateFunctionAggThrow.cpp @@ -95,7 +95,7 @@ public: void deserialize(AggregateDataPtr __restrict /* place */, ReadBuffer & buf, std::optional /* version */, Arena *) const override { char c = 0; - buf.read(c); + buf.readStrict(c); } void insertResultInto(AggregateDataPtr __restrict, IColumn & to, Arena *) const override diff --git a/src/AggregateFunctions/AggregateFunctionCategoricalInformationValue.cpp b/src/AggregateFunctions/AggregateFunctionCategoricalInformationValue.cpp index 89ffdfa6109..93b5de0c5ab 100644 --- a/src/AggregateFunctions/AggregateFunctionCategoricalInformationValue.cpp +++ b/src/AggregateFunctions/AggregateFunctionCategoricalInformationValue.cpp @@ -118,7 +118,7 @@ public: void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional /* version */, Arena *) const override { - buf.read(place, sizeOfData()); + buf.readStrict(place, sizeOfData()); } DataTypePtr getReturnType() const override diff --git a/src/AggregateFunctions/AggregateFunctionGroupArray.h b/src/AggregateFunctions/AggregateFunctionGroupArray.h index 6888c113556..89b382de819 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArray.h +++ b/src/AggregateFunctions/AggregateFunctionGroupArray.h @@ -270,7 +270,7 @@ public: auto & value = this->data(place).value; value.resize(size, arena); - buf.read(reinterpret_cast(value.data()), size * sizeof(value[0])); + buf.readStrict(reinterpret_cast(value.data()), size * sizeof(value[0])); if constexpr (Trait::sampler == Sampler::RNG) { @@ -343,7 +343,7 @@ struct GroupArrayNodeBase Node * node = reinterpret_cast(arena->alignedAlloc(sizeof(Node) + size, alignof(Node))); node->size = size; - buf.read(node->data(), size); + buf.readStrict(node->data(), size); return node; } }; diff --git a/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h b/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h index 1fa568024af..40867b1949a 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h +++ b/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h @@ -144,7 +144,7 @@ public: { auto & value = this->data(place).value; value.resize(size, arena); - buf.read(reinterpret_cast(value.data()), size * sizeof(value[0])); + buf.readStrict(reinterpret_cast(value.data()), size * sizeof(value[0])); this->data(place).sum = value.back(); } } diff --git a/src/AggregateFunctions/AggregateFunctionHistogram.h b/src/AggregateFunctions/AggregateFunctionHistogram.h index 
18bfc085ba3..fbd92aa8220 100644 --- a/src/AggregateFunctions/AggregateFunctionHistogram.h +++ b/src/AggregateFunctions/AggregateFunctionHistogram.h @@ -293,7 +293,7 @@ public: if (size > max_bins * 2) throw Exception("Too many bins", ErrorCodes::TOO_LARGE_ARRAY_SIZE); - buf.read(reinterpret_cast(points), size * sizeof(WeightedValue)); + buf.readStrict(reinterpret_cast(points), size * sizeof(WeightedValue)); } }; diff --git a/src/AggregateFunctions/AggregateFunctionMaxIntersections.h b/src/AggregateFunctions/AggregateFunctionMaxIntersections.h index 19547bdd247..d2f553172c9 100644 --- a/src/AggregateFunctions/AggregateFunctionMaxIntersections.h +++ b/src/AggregateFunctions/AggregateFunctionMaxIntersections.h @@ -130,7 +130,7 @@ public: auto & value = this->data(place).value; value.resize(size, arena); - buf.read(reinterpret_cast(value.data()), size * sizeof(value[0])); + buf.readStrict(reinterpret_cast(value.data()), size * sizeof(value[0])); } void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override diff --git a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h index f8d252cf8e9..46be7331195 100644 --- a/src/AggregateFunctions/AggregateFunctionMinMaxAny.h +++ b/src/AggregateFunctions/AggregateFunctionMinMaxAny.h @@ -515,7 +515,7 @@ public: size = rhs_size; if (size > 0) - buf.read(small_data, size); + buf.readStrict(small_data, size); } else { @@ -527,7 +527,7 @@ public: } size = rhs_size; - buf.read(large_data, size); + buf.readStrict(large_data, size); } } else diff --git a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.h b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.h index c29055ae8db..90caaee4d94 100644 --- a/src/AggregateFunctions/AggregateFunctionSequenceNextNode.h +++ b/src/AggregateFunctions/AggregateFunctionSequenceNextNode.h @@ -29,6 +29,11 @@ namespace DB { struct Settings; +namespace ErrorCodes +{ + extern const int TOO_LARGE_ARRAY_SIZE; +} + enum class SequenceDirection { Forward, @@ -43,6 +48,9 @@ enum SequenceBase LastMatch, }; +/// This is for security +static const UInt64 max_node_size_deserialize = 0xFFFFFF; + /// NodeBase used to implement a linked list for storage of SequenceNextNodeImpl template struct NodeBase @@ -78,10 +86,12 @@ struct NodeBase { UInt64 size; readVarUInt(size, buf); + if unlikely (size > max_node_size_deserialize) + throw Exception("Too large node state size", ErrorCodes::TOO_LARGE_ARRAY_SIZE); Node * node = reinterpret_cast(arena->alignedAlloc(sizeof(Node) + size, alignof(Node))); node->size = size; - buf.read(node->data(), size); + buf.readStrict(node->data(), size); readBinary(node->event_time, buf); UInt64 ulong_bitset; diff --git a/src/AggregateFunctions/AggregateFunctionUniqUpTo.h b/src/AggregateFunctions/AggregateFunctionUniqUpTo.h index 48b4c0f2c68..99f36b664d7 100644 --- a/src/AggregateFunctions/AggregateFunctionUniqUpTo.h +++ b/src/AggregateFunctions/AggregateFunctionUniqUpTo.h @@ -108,7 +108,7 @@ struct AggregateFunctionUniqUpToData readBinary(count, rb); if (count <= threshold) - rb.read(data_ptr, count * sizeof(T)); + rb.readStrict(data_ptr, count * sizeof(T)); } /// ALWAYS_INLINE is required to have better code layout for uniqUpTo function diff --git a/src/AggregateFunctions/QuantileExact.h b/src/AggregateFunctions/QuantileExact.h index eb1865d0355..bec7bd4c6c3 100644 --- a/src/AggregateFunctions/QuantileExact.h +++ b/src/AggregateFunctions/QuantileExact.h @@ -55,7 +55,7 @@ struct QuantileExactBase size_t size = 0; 
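
The hunks around this point replace `buf.read(...)` with `buf.readStrict(...)` when restoring aggregate-function state, and `SequenceNextNode` now also rejects oversized length prefixes before allocating. A minimal standalone sketch of why both steps matter (the `ToyBuffer` type and names below are illustrative, not ClickHouse's ReadBuffer):

#include <algorithm>
#include <cstring>
#include <stdexcept>
#include <string>
#include <vector>

/// Toy input buffer: read() may return fewer bytes than requested, while
/// readStrict() turns a short read into an error instead of silently leaving
/// the destination half-filled.
struct ToyBuffer
{
    std::string data;
    size_t pos = 0;

    size_t read(char * to, size_t n)
    {
        size_t available = std::min(n, data.size() - pos);
        std::memcpy(to, data.data() + pos, available);
        pos += available;
        return available;                 /// the caller has to check this
    }

    void readStrict(char * to, size_t n)
    {
        if (read(to, n) != n)
            throw std::runtime_error("Cannot read all data");
    }
};

/// Deserializing a length-prefixed array: validate the size field before
/// allocating, then require exactly that many payload bytes.
std::vector<int> deserializeArray(ToyBuffer & buf, size_t max_elements)
{
    size_t size = 0;
    buf.readStrict(reinterpret_cast<char *>(&size), sizeof(size));
    if (size > max_elements)
        throw std::runtime_error("Too large array size");   /// corrupted or hostile state

    std::vector<int> values(size);
    if (size > 0)
        buf.readStrict(reinterpret_cast<char *>(values.data()), size * sizeof(int));
    return values;
}
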
readVarUInt(size, buf); array.resize(size); - buf.read(reinterpret_cast(array.data()), size * sizeof(array[0])); + buf.readStrict(reinterpret_cast(array.data()), size * sizeof(array[0])); } Value get(Float64 level) diff --git a/src/AggregateFunctions/QuantileTDigest.h b/src/AggregateFunctions/QuantileTDigest.h index 5e9261919bd..109a98815ae 100644 --- a/src/AggregateFunctions/QuantileTDigest.h +++ b/src/AggregateFunctions/QuantileTDigest.h @@ -316,7 +316,7 @@ public: centroids.resize(size); // From now, TDigest will be in invalid state if exception is thrown. - buf.read(reinterpret_cast(centroids.data()), size * sizeof(centroids[0])); + buf.readStrict(reinterpret_cast(centroids.data()), size * sizeof(centroids[0])); for (const auto & c : centroids) { diff --git a/src/AggregateFunctions/StatCommon.h b/src/AggregateFunctions/StatCommon.h index ff824ca11b8..407c1a1cd67 100644 --- a/src/AggregateFunctions/StatCommon.h +++ b/src/AggregateFunctions/StatCommon.h @@ -112,8 +112,8 @@ struct StatisticalSample readVarUInt(size_y, buf); x.resize(size_x, arena); y.resize(size_y, arena); - buf.read(reinterpret_cast(x.data()), size_x * sizeof(x[0])); - buf.read(reinterpret_cast(y.data()), size_y * sizeof(y[0])); + buf.readStrict(reinterpret_cast(x.data()), size_x * sizeof(x[0])); + buf.readStrict(reinterpret_cast(y.data()), size_y * sizeof(y[0])); } }; diff --git a/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp b/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp index 8c92ecc3900..b91fc94e6cd 100644 --- a/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp +++ b/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp @@ -33,18 +33,27 @@ public: if (function_node->getFunctionName() == "count" && !first_argument_constant_literal.isNull()) { + resolveAsCountAggregateFunction(*function_node); function_node->getArguments().getNodes().clear(); } - else if (function_node->getFunctionName() == "sum" && first_argument_constant_literal.getType() == Field::Types::UInt64 && + else if (function_node->getFunctionName() == "sum" && + first_argument_constant_literal.getType() == Field::Types::UInt64 && first_argument_constant_literal.get() == 1) { - auto result_type = function_node->getResultType(); - AggregateFunctionProperties properties; - auto aggregate_function = AggregateFunctionFactory::instance().get("count", {}, {}, properties); - function_node->resolveAsAggregateFunction(std::move(aggregate_function), std::move(result_type)); + resolveAsCountAggregateFunction(*function_node); function_node->getArguments().getNodes().clear(); } } +private: + static inline void resolveAsCountAggregateFunction(FunctionNode & function_node) + { + auto function_result_type = function_node.getResultType(); + + AggregateFunctionProperties properties; + auto aggregate_function = AggregateFunctionFactory::instance().get("count", {}, {}, properties); + + function_node.resolveAsAggregateFunction(std::move(aggregate_function), std::move(function_result_type)); + } }; } diff --git a/src/Backups/BackupEntriesCollector.cpp b/src/Backups/BackupEntriesCollector.cpp index 73f78a13765..c1f7f14960b 100644 --- a/src/Backups/BackupEntriesCollector.cpp +++ b/src/Backups/BackupEntriesCollector.cpp @@ -442,7 +442,7 @@ void BackupEntriesCollector::gatherTablesMetadata() if (it != database_info.tables.end()) { const auto & partitions = it->second.partitions; - if (partitions && !storage->supportsBackupPartition()) + if (partitions && storage && !storage->supportsBackupPartition()) { throw Exception( ErrorCodes::CANNOT_BACKUP_TABLE, diff --git 
a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 95018f8a632..1ed9ff58fdc 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -119,22 +119,27 @@ namespace ProfileEvents namespace DB { +ProgressOption toProgressOption(std::string progress) +{ + boost::to_upper(progress); + + if (progress == "OFF" || progress == "FALSE" || progress == "0" || progress == "NO") + return ProgressOption::OFF; + if (progress == "TTY" || progress == "ON" || progress == "TRUE" || progress == "1" || progress == "YES") + return ProgressOption::TTY; + if (progress == "ERR") + return ProgressOption::ERR; + if (progress == "DEFAULT") + return ProgressOption::DEFAULT; + + throw boost::program_options::validation_error(boost::program_options::validation_error::invalid_option_value); +} + std::istream& operator>> (std::istream & in, ProgressOption & progress) { std::string token; in >> token; - - boost::to_upper(token); - - if (token == "OFF" || token == "FALSE" || token == "0" || token == "NO") - progress = ProgressOption::OFF; - else if (token == "TTY" || token == "ON" || token == "TRUE" || token == "1" || token == "YES") - progress = ProgressOption::TTY; - else if (token == "ERR") - progress = ProgressOption::ERR; - else - throw boost::program_options::validation_error(boost::program_options::validation_error::invalid_option_value); - + progress = toProgressOption(token); return in; } @@ -662,56 +667,62 @@ void ClientBase::initLogsOutputStream() } } -void ClientBase::initTtyBuffer(bool to_err) +void ClientBase::initTtyBuffer(ProgressOption progress) { - if (!tty_buf) + if (tty_buf) + return; + + if (progress == ProgressOption::OFF || (!is_interactive && progress == ProgressOption::DEFAULT)) { - static constexpr auto tty_file_name = "/dev/tty"; + need_render_progress = false; + return; + } - /// Output all progress bar commands to terminal at once to avoid flicker. - /// This size is usually greater than the window size. - static constexpr size_t buf_size = 1024; + static constexpr auto tty_file_name = "/dev/tty"; - if (!to_err) + /// Output all progress bar commands to terminal at once to avoid flicker. + /// This size is usually greater than the window size. + static constexpr size_t buf_size = 1024; + + if (is_interactive || progress == ProgressOption::TTY) + { + std::error_code ec; + std::filesystem::file_status tty = std::filesystem::status(tty_file_name, ec); + + if (!ec && exists(tty) && is_character_file(tty) + && (tty.permissions() & std::filesystem::perms::others_write) != std::filesystem::perms::none) { - std::error_code ec; - std::filesystem::file_status tty = std::filesystem::status(tty_file_name, ec); - - if (!ec && exists(tty) && is_character_file(tty) - && (tty.permissions() & std::filesystem::perms::others_write) != std::filesystem::perms::none) + try { - try - { - tty_buf = std::make_unique(tty_file_name, buf_size); + tty_buf = std::make_unique(tty_file_name, buf_size); - /// It is possible that the terminal file has writeable permissions - /// but we cannot write anything there. Check it with invisible character. - tty_buf->write('\0'); - tty_buf->next(); + /// It is possible that the terminal file has writeable permissions + /// but we cannot write anything there. Check it with invisible character. 
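
The rewritten `initTtyBuffer` around this point only attaches progress output to `/dev/tty` after checking that the path is a character device writable by "others" and that a test byte can actually be written, otherwise falling back to stderr or disabling rendering. A rough standalone sketch of that probe using plain POSIX calls (not the WriteBuffer classes used in the real code):

#include <filesystem>
#include <system_error>
#include <fcntl.h>
#include <unistd.h>

/// Try to open a writable terminal for progress output.
/// Returns an owned file descriptor, or -1 to signal "fall back to stderr / disable progress".
int openProgressTty()
{
    namespace fs = std::filesystem;
    const char * tty_file_name = "/dev/tty";

    std::error_code ec;
    fs::file_status tty = fs::status(tty_file_name, ec);

    /// The file must exist, be a character device and be writable by "others";
    /// otherwise opening it is pointless (e.g. a daemonized or containerized process).
    if (ec || !fs::exists(tty) || !fs::is_character_file(tty)
        || (tty.permissions() & fs::perms::others_write) == fs::perms::none)
        return -1;

    int fd = ::open(tty_file_name, O_WRONLY);
    if (fd < 0)
        return -1;

    /// The permissions may lie: probe with an invisible character.
    if (::write(fd, "\0", 1) != 1)
    {
        ::close(fd);
        return -1;
    }
    return fd;
}
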
+ tty_buf->write('\0'); + tty_buf->next(); - return; - } - catch (const Exception & e) - { - if (tty_buf) - tty_buf.reset(); + return; + } + catch (const Exception & e) + { + if (tty_buf) + tty_buf.reset(); - if (e.code() != ErrorCodes::CANNOT_OPEN_FILE) - throw; + if (e.code() != ErrorCodes::CANNOT_OPEN_FILE) + throw; - /// It is normal if file exists, indicated as writeable but still cannot be opened. - /// Fallback to other options. - } + /// It is normal if file exists, indicated as writeable but still cannot be opened. + /// Fallback to other options. } } - - if (stderr_is_a_tty) - { - tty_buf = std::make_unique(STDERR_FILENO, buf_size); - } - else - need_render_progress = false; } + + if (stderr_is_a_tty || progress == ProgressOption::ERR) + { + tty_buf = std::make_unique(STDERR_FILENO, buf_size); + } + else + need_render_progress = false; } void ClientBase::updateSuggest(const ASTPtr & ast) @@ -1617,6 +1628,14 @@ void ClientBase::processParsedSingleQuery(const String & full_query, const Strin global_context->applySettingChange(change); } global_context->resetSettingsToDefaultValue(set_query->default_settings); + + /// Query parameters inside SET queries should be also saved on the client side + /// to override their previous definitions set with --param_* arguments + /// and for substitutions to work inside INSERT ... VALUES queries + for (const auto & [name, value] : set_query->query_parameters) + query_parameters.insert_or_assign(name, value); + + global_context->addQueryParameters(set_query->query_parameters); } if (const auto * use_query = parsed_query->as()) { @@ -2324,7 +2343,7 @@ void ClientBase::init(int argc, char ** argv) ("stage", po::value()->default_value("complete"), "Request query processing up to specified stage: complete,fetch_columns,with_mergeable_state,with_mergeable_state_after_aggregation,with_mergeable_state_after_aggregation_and_limit") ("query_kind", po::value()->default_value("initial_query"), "One of initial_query/secondary_query/no_query") ("query_id", po::value(), "query_id") - ("progress", po::value()->implicit_value(ProgressOption::TTY, "tty")->default_value(ProgressOption::TTY, "tty"), "Print progress of queries execution - to TTY (default): tty|on|1|true|yes; to STDERR: err; OFF: off|0|false|no") + ("progress", po::value()->implicit_value(ProgressOption::TTY, "tty")->default_value(ProgressOption::DEFAULT, "default"), "Print progress of queries execution - to TTY: tty|on|1|true|yes; to STDERR non-interactive mode: err; OFF: off|0|false|no; DEFAULT - interactive to TTY, non-interactive is off") ("disable_suggestion,A", "Disable loading suggestion data. Note that suggestion data is loaded asynchronously through a second connection to ClickHouse server. Also it is reasonable to disable suggestion if you want to paste a query with TAB characters. 
Shorthand option -A is for those who get used to mysql client.") ("time,t", "print query execution time to stderr in non-interactive mode (for benchmarks)") @@ -2379,11 +2398,6 @@ void ClientBase::init(int argc, char ** argv) parseAndCheckOptions(options_description, options, common_arguments); po::notify(options); - if (options["progress"].as() == ProgressOption::OFF) - need_render_progress = false; - else - initTtyBuffer(options["progress"].as() == ProgressOption::ERR); - if (options.count("version") || options.count("V")) { showClientVersion(); @@ -2437,6 +2451,9 @@ void ClientBase::init(int argc, char ** argv) { switch (options["progress"].as()) { + case DEFAULT: + config().setString("progress", "default"); + break; case OFF: config().setString("progress", "off"); break; diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 9ea66881cf6..6c85d6a5f2b 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -38,10 +38,12 @@ enum MultiQueryProcessingStage enum ProgressOption { + DEFAULT, OFF, TTY, ERR, }; +ProgressOption toProgressOption(std::string progress); std::istream& operator>> (std::istream & in, ProgressOption & progress); void interruptSignalHandler(int signum); @@ -153,7 +155,6 @@ private: void initOutputFormat(const Block & block, ASTPtr parsed_query); void initLogsOutputStream(); - void initTtyBuffer(bool to_err = false); String prompt() const; @@ -168,6 +169,8 @@ protected: static bool isSyncInsertWithData(const ASTInsertQuery & insert_query, const ContextPtr & context); bool processMultiQueryFromFile(const String & file_name); + void initTtyBuffer(ProgressOption progress); + bool is_interactive = false; /// Use either interactive line editing interface or batch mode. bool is_multiquery = false; bool delayed_interactive = false; diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index e80ad5c141a..2bc5d70421a 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -637,6 +637,8 @@ M(666, CANNOT_USE_CACHE) \ M(667, NOT_INITIALIZED) \ M(668, INVALID_STATE) \ + M(669, UNKNOWN_NAMED_COLLECTION) \ + M(670, NAMED_COLLECTION_ALREADY_EXISTS) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/Exception.h b/src/Common/Exception.h index c5259d157b2..62121cc22e1 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -12,6 +12,7 @@ #include + namespace Poco { class Logger; } @@ -121,11 +122,7 @@ public: } - std::string displayText() const -#if defined(POCO_CLICKHOUSE_PATCH) - override -#endif - ; + std::string displayText() const override; ssize_t getLineNumber() const { return line_number; } void setLineNumber(int line_number_) { line_number = line_number_;} diff --git a/src/Common/Stopwatch.h b/src/Common/Stopwatch.h index cabc6d8ba1e..32d1fca337d 100644 --- a/src/Common/Stopwatch.h +++ b/src/Common/Stopwatch.h @@ -63,6 +63,8 @@ private: using StopwatchUniquePtr = std::unique_ptr; +/// Allows to obtain the elapsed time concurrently with restarting the stopwatch. +/// Allows to atomically compare the elapsed time with a threshold and restart the watch if the elapsed time is not less. 
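
The new comment on `AtomicStopwatch` describes two guarantees: the elapsed time can be read concurrently with a restart, and "compare with a threshold and restart if not less" happens as one atomic step. A small illustrative sketch of such a contract with `std::atomic` (assumed names, not the actual implementation that follows):

#include <atomic>
#include <chrono>
#include <cstdint>

class ToyAtomicStopwatch
{
public:
    ToyAtomicStopwatch() : start_ns(nowNs()) {}

    /// Safe to call concurrently with restart(): reads a single atomic value.
    uint64_t elapsedNs() const { return nowNs() - start_ns.load(std::memory_order_relaxed); }

    void restart() { start_ns.store(nowNs(), std::memory_order_relaxed); }

    /// Atomically: if at least threshold_ns elapsed, restart and return true.
    /// compare_exchange ensures that only one of several racing threads wins the restart.
    bool compareAndRestart(uint64_t threshold_ns)
    {
        uint64_t old_start = start_ns.load(std::memory_order_relaxed);
        while (true)
        {
            uint64_t now = nowNs();
            if (now < old_start || now - old_start < threshold_ns)
                return false;             /// not enough time elapsed (or a concurrent restart just happened)
            if (start_ns.compare_exchange_weak(old_start, now, std::memory_order_relaxed))
                return true;              /// we won the race and restarted the watch
            /// otherwise old_start now holds the fresher value stored by a concurrent restart; re-check
        }
    }

private:
    static uint64_t nowNs()
    {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(
                   std::chrono::steady_clock::now().time_since_epoch()).count();
    }

    std::atomic<uint64_t> start_ns;
};
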
class AtomicStopwatch { public: diff --git a/src/Common/ZooKeeper/ZooKeeperIO.cpp b/src/Common/ZooKeeper/ZooKeeperIO.cpp index c84a8624d78..3bfa5585d87 100644 --- a/src/Common/ZooKeeper/ZooKeeperIO.cpp +++ b/src/Common/ZooKeeper/ZooKeeperIO.cpp @@ -143,7 +143,10 @@ void read(std::string & s, ReadBuffer & in) throw Exception("Too large string size while reading from ZooKeeper", Error::ZMARSHALLINGERROR); s.resize(size); - in.read(s.data(), size); + size_t read_bytes = in.read(s.data(), size); + if (read_bytes != static_cast(size)) + throw Exception( + Error::ZMARSHALLINGERROR, "Buffer size read from Zookeeper is not big enough. Expected {}. Got {}", size, read_bytes); } void read(ACL & acl, ReadBuffer & in) diff --git a/src/Common/ZooKeeper/ZooKeeperIO.h b/src/Common/ZooKeeper/ZooKeeperIO.h index ec77b46f3d9..2c5fdd5d8a3 100644 --- a/src/Common/ZooKeeper/ZooKeeperIO.h +++ b/src/Common/ZooKeeper/ZooKeeperIO.h @@ -67,7 +67,7 @@ void read(std::array & s, ReadBuffer & in) read(size, in); if (size != N) throw Exception("Unexpected array size while reading from ZooKeeper", Error::ZMARSHALLINGERROR); - in.read(s.data(), N); + in.readStrict(s.data(), N); } template diff --git a/src/Compression/CompressedReadBuffer.h b/src/Compression/CompressedReadBuffer.h index 4148f4fe4d4..1d338303c84 100644 --- a/src/Compression/CompressedReadBuffer.h +++ b/src/Compression/CompressedReadBuffer.h @@ -21,7 +21,7 @@ public: { } - size_t readBig(char * to, size_t n) override; + [[nodiscard]] size_t readBig(char * to, size_t n) override; /// The compressed size of the current block. size_t getSizeCompressed() const diff --git a/src/Compression/CompressedReadBufferFromFile.h b/src/Compression/CompressedReadBufferFromFile.h index 719959b96f4..d307503fb99 100644 --- a/src/Compression/CompressedReadBufferFromFile.h +++ b/src/Compression/CompressedReadBufferFromFile.h @@ -53,7 +53,7 @@ public: /// we store this offset inside nextimpl_working_buffer_offset. 
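
`readBig` here, like `ReadBuffer::read` later in this change, is annotated with `[[nodiscard]]`, so dropping the returned byte count becomes a compiler warning instead of a silent short read. A tiny sketch of the effect (names are illustrative):

#include <cstddef>
#include <cstdio>

/// Returns how many bytes were actually produced; the caller must not ignore it.
[[nodiscard]] size_t readSome(char * to, size_t n)
{
    size_t produced = n / 2;              /// pretend only half was available
    for (size_t i = 0; i < produced; ++i)
        to[i] = 'x';
    return produced;
}

int main()
{
    char buf[16];

    // readSome(buf, sizeof(buf));        /// with [[nodiscard]]: -Wunused-result warning
    size_t n = readSome(buf, sizeof(buf));   /// correct: check how much actually arrived
    std::printf("got %zu bytes\n", n);
}
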
void seek(size_t offset_in_compressed_file, size_t offset_in_decompressed_block) override; - size_t readBig(char * to, size_t n) override; + [[nodiscard]] size_t readBig(char * to, size_t n) override; void setProfileCallback(const ReadBufferFromFileBase::ProfileCallback & profile_callback_, clockid_t clock_type_ = CLOCK_MONOTONIC_COARSE) { diff --git a/src/Compression/CompressionCodecDeflateQpl.cpp b/src/Compression/CompressionCodecDeflateQpl.cpp index 87c0ead0795..348496a2753 100644 --- a/src/Compression/CompressionCodecDeflateQpl.cpp +++ b/src/Compression/CompressionCodecDeflateQpl.cpp @@ -240,7 +240,7 @@ Int32 HardwareCodecDeflateQpl::doDecompressDataAsynchronous(const char * source, void HardwareCodecDeflateQpl::flushAsynchronousDecompressRequests() { - UInt32 n_jobs_processing = decomp_async_job_map.size(); + auto n_jobs_processing = decomp_async_job_map.size(); std::map::iterator it = decomp_async_job_map.begin(); while (n_jobs_processing) diff --git a/src/Coordination/KeeperStateManager.cpp b/src/Coordination/KeeperStateManager.cpp index 9b6aab5533e..9a3b423d4ac 100644 --- a/src/Coordination/KeeperStateManager.cpp +++ b/src/Coordination/KeeperStateManager.cpp @@ -349,7 +349,7 @@ nuraft::ptr KeeperStateManager::read_state() auto buffer_size = content_size - sizeof read_checksum - sizeof version; auto state_buf = nuraft::buffer::alloc(buffer_size); - read_buf.read(reinterpret_cast(state_buf->data_begin()), buffer_size); + read_buf.readStrict(reinterpret_cast(state_buf->data_begin()), buffer_size); SipHash hash; hash.update(version); diff --git a/src/Core/MySQL/MySQLReplication.cpp b/src/Core/MySQL/MySQLReplication.cpp index b211d746a2f..f4785875be0 100644 --- a/src/Core/MySQL/MySQLReplication.cpp +++ b/src/Core/MySQL/MySQLReplication.cpp @@ -116,7 +116,8 @@ namespace MySQLReplication if (!query.starts_with("XA COMMIT")) transaction_complete = false; } - else if (query.starts_with("SAVEPOINT")) + else if (query.starts_with("SAVEPOINT") || query.starts_with("ROLLBACK") + || query.starts_with("RELEASE SAVEPOINT")) { typ = QUERY_SAVEPOINT; } @@ -941,6 +942,7 @@ namespace MySQLReplication { case QUERY_EVENT_MULTI_TXN_FLAG: case QUERY_EVENT_XA: + /// Ignore queries that have no impact on the data. case QUERY_SAVEPOINT: { event = std::make_shared(std::move(query->header)); diff --git a/src/Core/PostgreSQLProtocol.h b/src/Core/PostgreSQLProtocol.h index 994494fc92f..a20151ec167 100644 --- a/src/Core/PostgreSQLProtocol.h +++ b/src/Core/PostgreSQLProtocol.h @@ -175,7 +175,7 @@ public: FrontMessageType receiveMessageType() { char type = 0; - in->read(type); + in->readStrict(type); return static_cast(type); } diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 7e38a88cd22..83252b6f0a9 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -614,7 +614,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value) \ M(Bool, load_marks_asynchronously, false, "Load MergeTree marks asynchronously", 0) \ \ - M(Bool, use_structure_from_insertion_table_in_table_functions, false, "Use structure from insertion table instead of schema inference from data", 0) \ + M(UInt64, use_structure_from_insertion_table_in_table_functions, 2, "Use structure from insertion table instead of schema inference from data. 
Possible values: 0 - disabled, 1 - enabled, 2 - auto", 0) \ \ M(UInt64, http_max_tries, 10, "Max attempts to read via http.", 0) \ M(UInt64, http_retry_initial_backoff_ms, 100, "Min milliseconds for backoff, when retrying read via http", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index b78b812da86..7635e121f8e 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -78,6 +78,7 @@ namespace SettingsChangesHistory /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) static std::map settings_changes_history = { + {"22.11", {{"use_structure_from_insertion_table_in_table_functions", 0, 2, "Improve using structure from insertion table in table functions"}}}, {"22.9", {{"force_grouping_standard_compatibility", false, true, "Make GROUPING function output the same as in SQL standard and other DBMS"}}}, {"22.7", {{"cross_to_inner_join_rewrite", 1, 2, "Force rewrite comma join to inner"}, {"enable_positional_arguments", false, true, "Enable positional arguments feature by default"}, diff --git a/src/Daemon/BaseDaemon.cpp b/src/Daemon/BaseDaemon.cpp index 758f85e688f..7283973007b 100644 --- a/src/Daemon/BaseDaemon.cpp +++ b/src/Daemon/BaseDaemon.cpp @@ -355,27 +355,33 @@ private: #if defined(OS_LINUX) /// Write information about binary checksum. It can be difficult to calculate, so do it only after printing stack trace. /// Please keep the below log messages in-sync with the ones in programs/server/Server.cpp - String calculated_binary_hash = getHashOfLoadedBinaryHex(); + if (daemon.stored_binary_hash.empty()) { - LOG_FATAL(log, "Integrity check of the executable skipped because the reference checksum could not be read." - " (calculated checksum: {})", calculated_binary_hash); - } - else if (calculated_binary_hash == daemon.stored_binary_hash) - { - LOG_FATAL(log, "Integrity check of the executable successfully passed (checksum: {})", calculated_binary_hash); + LOG_FATAL(log, "Integrity check of the executable skipped because the reference checksum could not be read."); } else { - LOG_FATAL(log, "Calculated checksum of the executable ({0}) does not correspond" - " to the reference checksum stored in the executable ({1})." - " This may indicate one of the following:" - " - the executable was changed just after startup;" - " - the executable was corrupted on disk due to faulty hardware;" - " - the loaded executable was corrupted in memory due to faulty hardware;" - " - the file was intentionally modified;" - " - a logical error in the code." - , calculated_binary_hash, daemon.stored_binary_hash); + String calculated_binary_hash = getHashOfLoadedBinaryHex(); + if (calculated_binary_hash == daemon.stored_binary_hash) + { + LOG_FATAL(log, "Integrity check of the executable successfully passed (checksum: {})", calculated_binary_hash); + } + else + { + LOG_FATAL( + log, + "Calculated checksum of the executable ({0}) does not correspond" + " to the reference checksum stored in the executable ({1})." 
+ " This may indicate one of the following:" + " - the executable was changed just after startup;" + " - the executable was corrupted on disk due to faulty hardware;" + " - the loaded executable was corrupted in memory due to faulty hardware;" + " - the file was intentionally modified;" + " - a logical error in the code.", + calculated_binary_hash, + daemon.stored_binary_hash); + } } #endif diff --git a/src/Daemon/BaseDaemon.h b/src/Daemon/BaseDaemon.h index d248ad9cec9..ae64651caed 100644 --- a/src/Daemon/BaseDaemon.h +++ b/src/Daemon/BaseDaemon.h @@ -136,11 +136,7 @@ protected: /// fork the main process and watch if it was killed void setupWatchdog(); - void waitForTerminationRequest() -#if defined(POCO_CLICKHOUSE_PATCH) || POCO_VERSION >= 0x02000000 // in old upstream poco not vitrual - override -#endif - ; + void waitForTerminationRequest() override; /// thread safe virtual void onInterruptSignals(int signal_id); diff --git a/src/DataTypes/NumberTraits.h b/src/DataTypes/NumberTraits.h index e63c5bcdcb7..6b068b0d8b1 100644 --- a/src/DataTypes/NumberTraits.h +++ b/src/DataTypes/NumberTraits.h @@ -116,6 +116,15 @@ template struct ResultOfModulo using Type = std::conditional_t || std::is_floating_point_v, Float64, Type0>; }; +template struct ResultOfPositiveModulo +{ + /// function positive_modulo always return non-negative number. + static constexpr size_t size_of_result = sizeof(B); + using Type0 = typename Construct::Type; + using Type = std::conditional_t || std::is_floating_point_v, Float64, Type0>; +}; + + template struct ResultOfModuloLegacy { using Type0 = typename Construct || is_signed_v, false, sizeof(B)>::Type; diff --git a/src/DataTypes/Serializations/SerializationIP.cpp b/src/DataTypes/Serializations/SerializationIP.cpp index ed0e9d54415..c89c2d7c8ac 100644 --- a/src/DataTypes/Serializations/SerializationIP.cpp +++ b/src/DataTypes/Serializations/SerializationIP.cpp @@ -47,7 +47,7 @@ void SerializationIPv4::deserializeText(IColumn & column, ReadBuffer & istr, con } char buffer[IPV4_MAX_TEXT_LENGTH + 1] = {'\0'}; - istr.read(buffer, sizeof(buffer) - 1); + [[maybe_unused]] size_t read_bytes = istr.read(buffer, sizeof(buffer) - 1); UInt32 ipv4_value = 0; bool parse_result = parseIPv4(buffer, reinterpret_cast(&ipv4_value)); @@ -90,7 +90,7 @@ void SerializationIPv6::deserializeText(IColumn & column, ReadBuffer & istr, con } char buffer[IPV6_MAX_TEXT_LENGTH + 1] = {'\0'}; - istr.read(buffer, sizeof(buffer) - 1); + [[maybe_unused]] size_t read_bytes = istr.read(buffer, sizeof(buffer) - 1); std::string ipv6_value(IPV6_BINARY_LENGTH, '\0'); diff --git a/src/DataTypes/convertMySQLDataType.cpp b/src/DataTypes/convertMySQLDataType.cpp index 64633c6fd7b..307ff317204 100644 --- a/src/DataTypes/convertMySQLDataType.cpp +++ b/src/DataTypes/convertMySQLDataType.cpp @@ -55,7 +55,7 @@ DataTypePtr convertMySQLDataType(MultiEnum type_support, else res = std::make_shared(); } - else if (type_name == "int" || type_name == "mediumint") + else if (type_name == "int" || type_name == "mediumint" || type_name == "integer") { if (is_unsigned) res = std::make_shared(); diff --git a/src/Databases/DatabaseMemory.cpp b/src/Databases/DatabaseMemory.cpp index 8540c785419..99d88597385 100644 --- a/src/Databases/DatabaseMemory.cpp +++ b/src/Databases/DatabaseMemory.cpp @@ -177,6 +177,7 @@ std::vector> DatabaseMemory::getTablesForBackup(co if (create.getTable() != table_name) throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "Got a create query with unexpected name {} for temporary table {}", 
backQuoteIfNeed(create.getTable()), backQuoteIfNeed(table_name)); + chassert(storage); storage->adjustCreateQueryForBackup(create_table_query); res.emplace_back(create_table_query, storage); } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 4b7599affb0..20fa11e90e2 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -1214,6 +1214,7 @@ DatabaseReplicated::getTablesForBackup(const FilterByNameFunction & filter, cons String table_name = unescapeForFileName(escaped_table_name); if (!filter(table_name)) continue; + String zk_metadata; if (!zookeeper->tryGet(zookeeper_path + "/metadata/" + escaped_table_name, zk_metadata)) throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "Metadata for table {} was not found in ZooKeeper", table_name); @@ -1233,6 +1234,10 @@ DatabaseReplicated::getTablesForBackup(const FilterByNameFunction & filter, cons if (storage) storage->adjustCreateQueryForBackup(create_table_query); } + + /// `storage` is allowed to be null here. In this case it means that this storage exists on other replicas + /// but it has not been created on this replica yet. + res.emplace_back(create_table_query, storage); } diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index 93a9523d115..37fd055456e 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -329,6 +329,10 @@ std::vector> DatabaseWithOwnTablesBase::getTablesF for (auto it = getTablesIterator(local_context, filter); it->isValid(); it->next()) { + auto storage = it->table(); + if (!storage) + continue; /// Probably the table has been just dropped. + auto create_table_query = tryGetCreateTableQuery(it->name(), local_context); if (!create_table_query) throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "Couldn't get a create query for table {}.{}", backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(it->name())); @@ -337,7 +341,6 @@ std::vector> DatabaseWithOwnTablesBase::getTablesF if (create.getTable() != it->name()) throw Exception(ErrorCodes::INCONSISTENT_METADATA_FOR_BACKUP, "Got a create query with unexpected name {} for table {}.{}", backQuoteIfNeed(create.getTable()), backQuoteIfNeed(getDatabaseName()), backQuoteIfNeed(it->name())); - auto storage = it->table(); storage->adjustCreateQueryForBackup(create_table_query); res.emplace_back(create_table_query, storage); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 590433e91e5..53712639f46 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -147,7 +147,8 @@ std::unique_ptr getAzureBlobStorageSettings(const Po config.getUInt64(config_prefix + ".max_single_part_upload_size", 100 * 1024 * 1024), config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), config.getInt(config_prefix + ".max_single_read_retries", 3), - config.getInt(config_prefix + ".max_single_download_retries", 3) + config.getInt(config_prefix + ".max_single_download_retries", 3), + config.getInt(config_prefix + ".list_object_keys_size", 1000) ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index c3549701ec1..15ddbe551da 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ 
b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -141,18 +141,31 @@ std::unique_ptr AzureObjectStorage::writeObject( /// NO return std::make_unique(std::move(buffer), std::move(finalize_callback), object.absolute_path); } -void AzureObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSize & children) const +void AzureObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const { auto client_ptr = client.get(); Azure::Storage::Blobs::ListBlobsOptions blobs_list_options; blobs_list_options.Prefix = path; + if (max_keys) + blobs_list_options.PageSizeHint = max_keys; + else + blobs_list_options.PageSizeHint = settings.get()->list_object_keys_size; auto blobs_list_response = client_ptr->ListBlobs(blobs_list_options); - auto blobs_list = blobs_list_response.Blobs; + for (;;) + { + auto blobs_list = blobs_list_response.Blobs; - for (const auto & blob : blobs_list) - children.emplace_back(blob.Name, blob.BlobSize); + for (const auto & blob : blobs_list) + children.emplace_back(blob.Name, blob.BlobSize); + + if (max_keys && children.size() >= static_cast(max_keys)) + break; + if (!blobs_list_response.HasPage()) + break; + blobs_list_response.MoveToNextPage(); + } } /// Remove file. Throws exception if file doesn't exists or it's a directory. diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 6fd41dae2ec..648016fb732 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -29,11 +29,13 @@ struct AzureObjectStorageSettings uint64_t max_single_part_upload_size_, uint64_t min_bytes_for_seek_, int max_single_read_retries_, - int max_single_download_retries_) + int max_single_download_retries_, + int list_object_keys_size_) : max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) , max_single_read_retries(max_single_read_retries_) , max_single_download_retries(max_single_download_retries_) + , list_object_keys_size(list_object_keys_size_) { } @@ -41,6 +43,7 @@ struct AzureObjectStorageSettings uint64_t min_bytes_for_seek; size_t max_single_read_retries; size_t max_single_download_retries; + int list_object_keys_size; }; using AzureClient = Azure::Storage::Blobs::BlobContainerClient; @@ -84,7 +87,7 @@ public: size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, const WriteSettings & write_settings = {}) override; - void findAllFiles(const std::string & path, RelativePathsWithSize & children) const override; + void findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const override; /// Remove file. Throws exception if file doesn't exists or it's a directory. 
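
The updated `findAllFiles` treats `max_keys == 0` as "no client-side cap, use the disk's `list_object_keys_size` batch size" and otherwise stops paging as soon as enough keys have been collected. A standalone sketch of that loop over a mocked paged listing (the `Page` type is invented for illustration; the real code drives the Azure/S3 SDK cursors shown in these hunks):

#include <string>
#include <utility>
#include <vector>

using RelativePathsWithSize = std::vector<std::pair<std::string, size_t>>;

/// Mock of one page of a paginated "list objects" response; the real code uses
/// HasPage()/MoveToNextPage() or continuation tokens from the SDK.
struct Page
{
    RelativePathsWithSize blobs;
};

/// max_keys == 0 means "no client-side cap"; the real implementation then falls back
/// to the disk-level list_object_keys_size setting for the page size hint.
void findAllFiles(const std::vector<Page> & pages, RelativePathsWithSize & children, int max_keys)
{
    for (const auto & page : pages)
    {
        for (const auto & blob : page.blobs)
            children.emplace_back(blob.first, blob.second);

        /// Stop paging as soon as enough keys were collected; in this mock the result
        /// may slightly exceed max_keys, but no extra page is ever fetched.
        if (max_keys && children.size() >= static_cast<size_t>(max_keys))
            break;
    }
}
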
void removeObject(const StoredObject & object) override; diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp index f3d3f049dc1..505b26ebb3a 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.cpp @@ -282,9 +282,9 @@ std::unique_ptr CachedObjectStorage::cloneObjectStorage( return object_storage->cloneObjectStorage(new_namespace, config, config_prefix, context); } -void CachedObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSize & children) const +void CachedObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const { - object_storage->findAllFiles(path, children); + object_storage->findAllFiles(path, children, max_keys); } ObjectMetadata CachedObjectStorage::getObjectMetadata(const std::string & path) const diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index 64e6eed45bb..b84382a762a 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -72,7 +72,7 @@ public: const std::string & config_prefix, ContextPtr context) override; - void findAllFiles(const std::string & path, RelativePathsWithSize & children) const override; + void findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const override; ObjectMetadata getObjectMetadata(const std::string & path) const override; diff --git a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp index 4ea42616ba2..2ae5e46eb66 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorageRemoteMetadataRestoreHelper.cpp @@ -390,7 +390,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFiles(IObjectStorage * }; RelativePathsWithSize children; - source_object_storage->findAllFiles(restore_information.source_path, children); + source_object_storage->findAllFiles(restore_information.source_path, children, /* max_keys= */ 0); restore_files(children); @@ -540,7 +540,7 @@ void DiskObjectStorageRemoteMetadataRestoreHelper::restoreFileOperations(IObject }; RelativePathsWithSize children; - source_object_storage->findAllFiles(restore_information.source_path + "operations/", children); + source_object_storage->findAllFiles(restore_information.source_path + "operations/", children, /* max_keys= */ 0); restore_file_operations(children); if (restore_information.detached) diff --git a/src/Disks/ObjectStorages/IObjectStorage.cpp b/src/Disks/ObjectStorages/IObjectStorage.cpp index 3f8ac566603..45ecad35747 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.cpp +++ b/src/Disks/ObjectStorages/IObjectStorage.cpp @@ -14,7 +14,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -void IObjectStorage::findAllFiles(const std::string &, RelativePathsWithSize &) const +void IObjectStorage::findAllFiles(const std::string &, RelativePathsWithSize &, int) const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "findAllFiles() is not supported"); } diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 9451ae31b07..c570dfb6e9b 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -73,13 +73,17 @@ public: 
/// find . -type f /// /// @param children - out files (relative paths) with their sizes. + /// @param max_keys - return no more than max_keys children + /// NOTE: max_keys is not the same as list_object_keys_size (disk property) + /// - if max_keys is set, no more than max_keys keys should be returned + /// - however list_object_keys_size determines the size of the batch, and all keys should still be returned /// /// NOTE: It makes sense only for real object storages (S3, Azure), since /// it is used only for one of the following: /// - send_metadata (to restore metadata) /// - see DiskObjectStorage::restoreMetadataIfNeeded() /// - MetadataStorageFromPlainObjectStorage - only for s3_plain disk - virtual void findAllFiles(const std::string & path, RelativePathsWithSize & children) const; + virtual void findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const; /// Analog of directory content for object storage (object storage does not /// have "directory" definition, but it can be emulated with usage of diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp index 259f6e01fd7..34a9ae021b7 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp @@ -39,8 +39,11 @@ std::filesystem::path MetadataStorageFromPlainObjectStorage::getAbsolutePath(con bool MetadataStorageFromPlainObjectStorage::exists(const std::string & path) const { - auto object = StoredObject::create(*object_storage, getAbsolutePath(path)); - return object_storage->exists(object); + RelativePathsWithSize children; + /// NOTE: exists() cannot be used here since it works only for an existing + /// key, and does not work for some intermediate path.
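
For the plain metadata storage, `exists()` is re-implemented as "list at most one key under this prefix", because an object store has no directory entries: an intermediate path exists only implicitly when some key starts with it. A small sketch of that idea over a mock flat key set (assumed names, not the real IObjectStorage API):

#include <map>
#include <string>

/// A sorted map of object keys stands in here for the bucket listing.
using ObjectStore = std::map<std::string, size_t>;   /// key -> object size

/// "Does anything live under this prefix?": the object-store equivalent of exists()
/// for both real objects and intermediate "directory" paths.
bool existsByPrefix(const ObjectStore & store, const std::string & prefix)
{
    auto it = store.lower_bound(prefix);              /// first key >= prefix
    return it != store.end() && it->first.starts_with(prefix);
}

/// With keys like {"data/db/table/part/checksums.txt"}:
///   existsByPrefix(store, "data/db/table")                     -> true  (intermediate path)
///   existsByPrefix(store, "data/db/table/part/checksums.txt")  -> true  (real object)
///   existsByPrefix(store, "data/other")                        -> false
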
+ object_storage->findAllFiles(getAbsolutePath(path), children, 1); + return !children.empty(); } bool MetadataStorageFromPlainObjectStorage::isFile(const std::string & path) const @@ -66,7 +69,7 @@ bool MetadataStorageFromPlainObjectStorage::isDirectory(const std::string & path uint64_t MetadataStorageFromPlainObjectStorage::getFileSize(const String & path) const { RelativePathsWithSize children; - object_storage->findAllFiles(getAbsolutePath(path), children); + object_storage->findAllFiles(getAbsolutePath(path), children, 1); if (children.empty()) return 0; if (children.size() != 1) @@ -85,6 +88,11 @@ std::vector MetadataStorageFromPlainObjectStorage::listDirectory(co result.push_back(path_size.relative_path); for (const auto & directory : directories) result.push_back(directory); + for (auto & row : result) + { + chassert(row.starts_with(object_storage_root_path)); + row.erase(0, object_storage_root_path.size()); + } return result; } diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 0c421ee03d7..099a7d458d0 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -248,7 +248,7 @@ std::unique_ptr S3ObjectStorage::writeObject( /// NOLIN std::move(s3_buffer), std::move(finalize_callback), object.absolute_path); } -void S3ObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSize & children) const +void S3ObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const { auto settings_ptr = s3_settings.get(); auto client_ptr = client.get(); @@ -256,7 +256,10 @@ void S3ObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSi Aws::S3::Model::ListObjectsV2Request request; request.SetBucket(bucket); request.SetPrefix(path); - request.SetMaxKeys(settings_ptr->list_object_keys_size); + if (max_keys) + request.SetMaxKeys(max_keys); + else + request.SetMaxKeys(settings_ptr->list_object_keys_size); Aws::S3::Model::ListObjectsV2Outcome outcome; do @@ -275,6 +278,14 @@ void S3ObjectStorage::findAllFiles(const std::string & path, RelativePathsWithSi for (const auto & object : objects) children.emplace_back(object.GetKey(), object.GetSize()); + if (max_keys) + { + int keys_left = max_keys - static_cast(children.size()); + if (keys_left <= 0) + break; + request.SetMaxKeys(keys_left); + } + request.SetContinuationToken(outcome.GetResult().GetNextContinuationToken()); } while (outcome.GetResult().GetIsTruncated()); } @@ -288,7 +299,12 @@ void S3ObjectStorage::getDirectoryContents(const std::string & path, Aws::S3::Model::ListObjectsV2Request request; request.SetBucket(bucket); - request.SetPrefix(path); + /// NOTE: if you do "ls /foo" instead of "ls /foo/" over S3 with this API + /// it will return only "/foo" itself without any underlying nodes. 
+ if (path.ends_with("/")) + request.SetPrefix(path); + else + request.SetPrefix(path + "/"); request.SetMaxKeys(settings_ptr->list_object_keys_size); request.SetDelimiter("/"); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index 56f1c895924..447ca034aac 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -105,7 +105,7 @@ public: size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, const WriteSettings & write_settings = {}) override; - void findAllFiles(const std::string & path, RelativePathsWithSize & children) const override; + void findAllFiles(const std::string & path, RelativePathsWithSize & children, int max_keys) const override; void getDirectoryContents(const std::string & path, RelativePathsWithSize & files, std::vector & directories) const override; diff --git a/src/Formats/EscapingRuleUtils.cpp b/src/Formats/EscapingRuleUtils.cpp index e80ab50968d..bba94e98e49 100644 --- a/src/Formats/EscapingRuleUtils.cpp +++ b/src/Formats/EscapingRuleUtils.cpp @@ -464,6 +464,9 @@ bool tryInferDate(const std::string_view & field) bool tryInferDateTime(const std::string_view & field, const FormatSettings & settings) { + if (field.empty()) + return false; + ReadBufferFromString buf(field); Float64 tmp_float; /// Check if it's just a number, and if so, don't try to infer DateTime from it, diff --git a/src/Functions/DivisionUtils.h b/src/Functions/DivisionUtils.h index 98e5c690eb9..f8cffab9f48 100644 --- a/src/Functions/DivisionUtils.h +++ b/src/Functions/DivisionUtils.h @@ -178,4 +178,32 @@ struct ModuloLegacyImpl : ModuloImpl using ResultType = typename NumberTraits::ResultOfModuloLegacy::Type; }; +template +struct PositiveModuloImpl : ModuloImpl +{ + using OriginResultType = typename ModuloImpl::ResultType; + using ResultType = typename NumberTraits::ResultOfPositiveModulo::Type; + + template + static inline Result apply(A a, B b) + { + auto res = ModuloImpl::template apply(a, b); + if constexpr (is_signed_v) + { + if (res < 0) + { + if constexpr (is_unsigned_v) + res += static_cast(b); + else + { + if (b == std::numeric_limits::lowest()) + throw Exception("Division by the most negative number", ErrorCodes::ILLEGAL_DIVISION); + res += b >= 0 ? 
static_cast(b) : static_cast(-b); + } + } + } + return static_cast(res); + } +}; + } diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index 7e4dc387b66..0a79ac3b0d9 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -131,50 +131,53 @@ public: using ResultDataType = Switch< /// Decimal cases Case || IsDataTypeDecimal), InvalidType>, - Case && IsDataTypeDecimal && UseLeftDecimal, LeftDataType>, + Case< + IsDataTypeDecimal && IsDataTypeDecimal && UseLeftDecimal, + LeftDataType>, Case && IsDataTypeDecimal, RightDataType>, Case && IsIntegralOrExtended, LeftDataType>, Case && IsIntegralOrExtended, RightDataType>, /// e.g Decimal +-*/ Float, least(Decimal, Float), greatest(Decimal, Float) = Float64 - Case::allow_decimal && IsDataTypeDecimal && IsFloatingPoint, - DataTypeFloat64>, - Case::allow_decimal && IsDataTypeDecimal && IsFloatingPoint, - DataTypeFloat64>, + Case::allow_decimal && IsDataTypeDecimal && IsFloatingPoint, DataTypeFloat64>, + Case::allow_decimal && IsDataTypeDecimal && IsFloatingPoint, DataTypeFloat64>, - Case::bit_hamming_distance && IsIntegral && IsIntegral, - DataTypeUInt8>, + Case::bit_hamming_distance && IsIntegral && IsIntegral, DataTypeUInt8>, /// Decimal Real is not supported (traditional DBs convert Decimal Real to Real) Case && !IsIntegralOrExtendedOrDecimal, InvalidType>, Case && !IsIntegralOrExtendedOrDecimal, InvalidType>, /// number number -> see corresponding impl - Case && !IsDateOrDateTime, - DataTypeFromFieldType>, + Case && !IsDateOrDateTime, DataTypeFromFieldType>, /// Date + Integral -> Date /// Integral + Date -> Date - Case::plus, Switch< - Case, LeftDataType>, - Case, RightDataType>>>, + Case< + IsOperation::plus, + Switch, LeftDataType>, Case, RightDataType>>>, /// Date - Date -> Int32 /// Date - Integral -> Date - Case::minus, Switch< - Case, DataTypeInt32>, - Case && IsIntegral, LeftDataType>>>, + Case< + IsOperation::minus, + Switch< + Case, DataTypeInt32>, + Case && IsIntegral, LeftDataType>>>, /// least(Date, Date) -> Date /// greatest(Date, Date) -> Date - Case && (IsOperation::least || IsOperation::greatest), + Case< + std::is_same_v && (IsOperation::least || IsOperation::greatest), LeftDataType>, /// Date % Int32 -> Int32 /// Date % Float -> Float64 - Case::modulo, Switch< - Case && IsIntegral, RightDataType>, - Case && IsFloatingPoint, DataTypeFloat64>>>>; + Case< + IsOperation::modulo || IsOperation::positive_modulo, + Switch< + Case && IsIntegral, RightDataType>, + Case && IsFloatingPoint, DataTypeFloat64>>>>; }; } @@ -1176,8 +1179,9 @@ public: bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & arguments) const override { - return ((IsOperation::div_int || IsOperation::modulo) && !arguments[1].is_const) - || (IsOperation::div_floating && (isDecimalOrNullableDecimal(arguments[0].type) || isDecimalOrNullableDecimal(arguments[1].type))); + return ((IsOperation::div_int || IsOperation::modulo || IsOperation::positive_modulo) && !arguments[1].is_const) + || (IsOperation::div_floating + && (isDecimalOrNullableDecimal(arguments[0].type) || isDecimalOrNullableDecimal(arguments[1].type))); } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override @@ -2080,7 +2084,7 @@ public: /// Check the case when operation is divide, intDiv or modulo and denominator is Nullable(Something). /// For divide operation we should check only Nullable(Decimal), because only this case can throw division by zero error. 
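
`PositiveModuloImpl` reuses `ModuloImpl` and then shifts a negative remainder up by |b|, so the result always lands in [0, |b|). A standalone sketch of the arithmetic (zero-divisor handling is omitted here; the overflow guard mirrors the exception thrown above for the most negative divisor):

#include <cstdint>
#include <limits>
#include <stdexcept>

/// C++ '%' truncates toward zero, so the remainder takes the sign of the dividend:
///   -1000 % 32 == -8, while positive_modulo(-1000, 32) == 24.
int64_t positive_modulo(int64_t a, int64_t b)
{
    int64_t res = a % b;
    if (res < 0)
    {
        if (b == std::numeric_limits<int64_t>::lowest())
            throw std::runtime_error("Division by the most negative number");  /// -b would overflow
        res += (b >= 0 ? b : -b);
    }
    return res;
}
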
bool division_by_nullable = !arguments[0].type->onlyNull() && !arguments[1].type->onlyNull() && arguments[1].type->isNullable() - && (IsOperation::div_int || IsOperation::modulo + && (IsOperation::div_int || IsOperation::modulo || IsOperation::positive_modulo || (IsOperation::div_floating && (isDecimalOrNullableDecimal(arguments[0].type) || isDecimalOrNullableDecimal(arguments[1].type)))); diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index dd494d821bf..208da8a78fe 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -2828,6 +2828,31 @@ private: }; } +#define GENERATE_INTERVAL_CASE(INTERVAL_KIND) \ + case IntervalKind::INTERVAL_KIND: \ + return createFunctionAdaptor(FunctionConvert::create(), from_type); + + static WrapperType createIntervalWrapper(const DataTypePtr & from_type, IntervalKind kind) + { + switch (kind) + { + GENERATE_INTERVAL_CASE(Nanosecond) + GENERATE_INTERVAL_CASE(Microsecond) + GENERATE_INTERVAL_CASE(Millisecond) + GENERATE_INTERVAL_CASE(Second) + GENERATE_INTERVAL_CASE(Minute) + GENERATE_INTERVAL_CASE(Hour) + GENERATE_INTERVAL_CASE(Day) + GENERATE_INTERVAL_CASE(Week) + GENERATE_INTERVAL_CASE(Month) + GENERATE_INTERVAL_CASE(Quarter) + GENERATE_INTERVAL_CASE(Year) + } + throw Exception{ErrorCodes::CANNOT_CONVERT_TYPE, "Conversion to unexpected IntervalKind: {}", kind.toString()}; + } + +#undef GENERATE_INTERVAL_CASE + template requires IsDataTypeDecimal WrapperType createDecimalWrapper(const DataTypePtr & from_type, const ToDataType * to_type, bool requested_result_is_nullable) const @@ -3853,6 +3878,8 @@ private: return createObjectWrapper(from_type, checkAndGetDataType(to_type.get())); case TypeIndex::AggregateFunction: return createAggregateFunctionWrapper(from_type, checkAndGetDataType(to_type.get())); + case TypeIndex::Interval: + return createIntervalWrapper(from_type, checkAndGetDataType(to_type.get())->getKind()); default: break; } diff --git a/src/Functions/IsOperation.h b/src/Functions/IsOperation.h index de7701db59a..39f9114f5e0 100644 --- a/src/Functions/IsOperation.h +++ b/src/Functions/IsOperation.h @@ -15,6 +15,7 @@ template struct DivideIntegralOrZeroImpl; template struct LeastBaseImpl; template struct GreatestBaseImpl; template struct ModuloImpl; +template struct PositiveModuloImpl; template struct EqualsOp; template struct NotEqualsOp; template struct LessOrEqualsOp; @@ -53,6 +54,7 @@ struct IsOperation static constexpr bool div_int = IsSameOperation::value; static constexpr bool div_int_or_zero = IsSameOperation::value; static constexpr bool modulo = IsSameOperation::value; + static constexpr bool positive_modulo = IsSameOperation::value; static constexpr bool least = IsSameOperation::value; static constexpr bool greatest = IsSameOperation::value; diff --git a/src/Functions/canonicalRand.cpp b/src/Functions/canonicalRand.cpp new file mode 100644 index 00000000000..d0b8c655e14 --- /dev/null +++ b/src/Functions/canonicalRand.cpp @@ -0,0 +1,59 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace +{ + +struct CanonicalRandImpl +{ + static void execute(char * output, size_t size) + { + pcg64_fast rng1(randomSeed()); + pcg64_fast rng2(randomSeed()); + std::uniform_real_distribution distribution1(min, max); + std::uniform_real_distribution distribution2(min, max); + + for (const char * end = output + size; output < end; output += 16) + { + unalignedStore(output, distribution1(rng1)); + unalignedStore(output + 8, distribution2(rng2)); + } + } + /// It is 
guaranteed (by PaddedPODArray) that we can overwrite up to 15 bytes after end. + +private: + const static constexpr Float64 min = 0; + const static constexpr Float64 max = 1; +}; + + +struct NameCanonicalRand +{ + static constexpr auto name = "canonicalRand"; +}; + +class FunctionCanonicalRand : public FunctionRandomImpl +{ +public: + static FunctionPtr create(ContextPtr /*context*/) { return std::make_shared(); } +}; + +} + +REGISTER_FUNCTION(CanonicalRand) +{ + factory.registerFunction({ + R"( +The function generates pseudo random results with independent and identically distributed uniformly distributed values in [0, 1). +Non-deterministic. Return type is Float64. + )", + Documentation::Examples{{"canonicalRand", "SELECT canonicalRand()"}}, + Documentation::Categories{"Mathematical"}}); +} + +} diff --git a/src/Functions/factorial.cpp b/src/Functions/factorial.cpp new file mode 100644 index 00000000000..4e96391bccd --- /dev/null +++ b/src/Functions/factorial.cpp @@ -0,0 +1,113 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int BAD_ARGUMENTS; +} + +template +struct FactorialImpl +{ + using ResultType = UInt64; + static const constexpr bool allow_decimal = false; + static const constexpr bool allow_fixed_string = false; + static const constexpr bool allow_string_integer = false; + + static inline NO_SANITIZE_UNDEFINED ResultType apply(A a) + { + if constexpr (std::is_floating_point_v || is_over_big_int) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type of argument of function factorial, should not be floating point or big int"); + + if constexpr (is_integer) + { + if (a > 20) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "The maximum value for the input argument of function factorial is 20"); + + if constexpr (is_unsigned_v) + return factorials[a]; + else if constexpr (is_signed_v) + return a >= 0 ? factorials[a] : 1; + } + } + +#if USE_EMBEDDED_COMPILER + static constexpr bool compilable = false; /// special type handling, some other time +#endif + +private: + static const constexpr ResultType factorials[21] + = {1, + 1, + 2, + 6, + 24, + 120, + 720, + 5040, + 40320, + 362880, + 3628800, + 39916800, + 479001600, + 6227020800L, + 87178291200L, + 1307674368000L, + 20922789888000L, + 355687428096000L, + 6402373705728000L, + 121645100408832000L, + 2432902008176640000L}; +}; + +struct NameFactorial { static constexpr auto name = "factorial"; }; +using FunctionFactorial = FunctionUnaryArithmetic; + +template <> struct FunctionUnaryArithmeticMonotonicity +{ + static bool has() { return true; } + + static IFunction::Monotonicity get(const Field & left, const Field & right) + { + bool is_strict = false; + if (!left.isNull() && !right.isNull()) + { + auto left_value = applyVisitor(FieldVisitorConvertToNumber(), left); + auto right_value = applyVisitor(FieldVisitorConvertToNumber(), left); + if (1 <= left_value && left_value <= right_value && right_value <= 20) + is_strict = true; + } + + return { + .is_monotonic = true, + .is_positive = true, + .is_always_monotonic = true, + .is_strict = is_strict, + }; + } +}; + + +REGISTER_FUNCTION(Factorial) +{ + factory.registerFunction( + { + R"( +Computes the factorial of an integer value. It works with any native integer type including UInt(8|16|32|64) and Int(8|16|32|64). The return type is UInt64. + +The factorial of 0 is 1. Likewise, the factorial() function returns 1 for any negative value. 
The maximum positive value for the input argument is 20; a value of 21 or greater will throw an exception.
+)",
+        Documentation::Examples{{"factorial", "SELECT factorial(10)"}},
+        Documentation::Categories{"Mathematical"}},
+        FunctionFactory::CaseInsensitive);
+}
+
+}
diff --git a/src/Functions/if.cpp b/src/Functions/if.cpp
index 86707fc62d6..f3401713834 100644
--- a/src/Functions/if.cpp
+++ b/src/Functions/if.cpp
@@ -904,6 +904,7 @@ private:
         if (cond_col)
         {
+            arg_else_column = arg_else_column->convertToFullColumnIfConst();
             auto result_column = IColumn::mutate(std::move(arg_else_column));
             if (else_is_short)
                 result_column->expand(cond_col->getData(), true);
@@ -941,6 +942,7 @@ private:
         if (cond_col)
        {
+            arg_then_column = arg_then_column->convertToFullColumnIfConst();
             auto result_column = IColumn::mutate(std::move(arg_then_column));
             if (then_is_short)
                 result_column->expand(cond_col->getData(), false);
diff --git a/src/Functions/modulo.cpp b/src/Functions/modulo.cpp
index b2411899160..9cd104cd1dc 100644
--- a/src/Functions/modulo.cpp
+++ b/src/Functions/modulo.cpp
@@ -133,6 +133,7 @@ struct ModuloLegacyByConstantImpl : ModuloByConstantImpl
 {
     using Op = ModuloLegacyImpl;
 };
+
 }
 /** Specializations are specified for dividing numbers of the type UInt64 and UInt32 by the numbers of the same sign.
@@ -179,4 +180,22 @@ REGISTER_FUNCTION(ModuloLegacy)
     factory.registerFunction();
 }
+struct NamePositiveModulo
+{
+    static constexpr auto name = "positive_modulo";
+};
+using FunctionPositiveModulo = BinaryArithmeticOverloadResolver;
+
+REGISTER_FUNCTION(PositiveModulo)
+{
+    factory.registerFunction(
+        {
+            R"(
+Calculates the remainder when dividing `a` by `b`. Similar to the function `modulo`, except that `positive_modulo` always returns a non-negative number.
+            )",
+            Documentation::Examples{{"positive_modulo", "SELECT positive_modulo(-1000, 32);"}},
+            Documentation::Categories{"Arithmetic"}},
+        FunctionFactory::CaseInsensitive);
+}
+
 }
diff --git a/src/IO/HTTPCommon.cpp b/src/IO/HTTPCommon.cpp
index f33b2399492..c015d4566d6 100644
--- a/src/IO/HTTPCommon.cpp
+++ b/src/IO/HTTPCommon.cpp
@@ -49,11 +49,7 @@ namespace
 {
     void setTimeouts(Poco::Net::HTTPClientSession & session, const ConnectionTimeouts & timeouts)
     {
-#if defined(POCO_CLICKHOUSE_PATCH) || POCO_VERSION >= 0x02000000
         session.setTimeout(timeouts.connection_timeout, timeouts.send_timeout, timeouts.receive_timeout);
-#else
-        session.setTimeout(std::max({timeouts.connection_timeout, timeouts.send_timeout, timeouts.receive_timeout}));
-#endif
         session.setKeepAliveTimeout(timeouts.http_keep_alive_timeout);
     }
@@ -93,12 +89,7 @@ namespace
         ProfileEvents::increment(ProfileEvents::CreatedHTTPConnections);
         /// doesn't work properly without patch
-#if defined(POCO_CLICKHOUSE_PATCH)
         session->setKeepAlive(keep_alive);
-#else
-        (void)keep_alive; // Avoid warning: unused parameter
-#endif
-
         return session;
     }
@@ -122,12 +113,10 @@ namespace
             session->setProxyHost(proxy_host);
             session->setProxyPort(proxy_port);
-#if defined(POCO_CLICKHOUSE_PATCH)
             session->setProxyProtocol(proxy_scheme);
             /// Turn on tunnel mode if proxy scheme is HTTP while endpoint scheme is HTTPS.
session->setProxyTunnel(!proxy_https && https); -#endif } return session; } diff --git a/src/IO/MySQLPacketPayloadReadBuffer.cpp b/src/IO/MySQLPacketPayloadReadBuffer.cpp index 9ca7845b2ae..ab58624d0fa 100644 --- a/src/IO/MySQLPacketPayloadReadBuffer.cpp +++ b/src/IO/MySQLPacketPayloadReadBuffer.cpp @@ -30,7 +30,7 @@ bool MySQLPacketPayloadReadBuffer::nextImpl() "Received packet with payload larger than max_packet_size: {}", payload_length); size_t packet_sequence_id = 0; - in.read(reinterpret_cast(packet_sequence_id)); + in.readStrict(reinterpret_cast(packet_sequence_id)); if (packet_sequence_id != sequence_id) throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_CLIENT, "Received packet with wrong sequence-id: {}. Expected: {}.", packet_sequence_id, static_cast(sequence_id)); diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h index 8d697710081..182eb0b7105 100644 --- a/src/IO/ReadBuffer.h +++ b/src/IO/ReadBuffer.h @@ -149,7 +149,7 @@ public: } /// Reads a single byte. - bool ALWAYS_INLINE read(char & c) + [[nodiscard]] bool ALWAYS_INLINE read(char & c) { if (peek(c)) { @@ -168,7 +168,7 @@ public: } /** Reads as many as there are, no more than n bytes. */ - size_t read(char * to, size_t n) + [[nodiscard]] size_t read(char * to, size_t n) { size_t bytes_copied = 0; @@ -197,10 +197,7 @@ public: * By default - the same as read. * Don't use for small reads. */ - virtual size_t readBig(char * to, size_t n) - { - return read(to, n); - } + [[nodiscard]] virtual size_t readBig(char * to, size_t n) { return read(to, n); } /** Do something to allow faster subsequent call to 'nextImpl' if possible. * It's used for asynchronous readers with double-buffering. diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 27a24eef804..a7227811261 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -964,15 +964,16 @@ inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, Re components.whole = components.whole / common::exp10_i32(scale); } + bool is_ok = true; if constexpr (std::is_same_v) datetime64 = DecimalUtils::decimalFromComponents(components, scale); else - DecimalUtils::tryGetDecimalFromComponents(components, scale, datetime64); + is_ok = DecimalUtils::tryGetDecimalFromComponents(components, scale, datetime64); datetime64 *= negative_multiplier; - return ReturnType(true); + return ReturnType(is_ok); } inline void readDateTimeText(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance()) @@ -1032,6 +1033,15 @@ template requires is_arithmetic_v inline void readBinary(T & x, ReadBuffer & buf) { readPODBinary(x, buf); } +inline void readBinary(bool & x, ReadBuffer & buf) +{ + /// When deserializing a bool it might trigger UBSAN if the input is not 0 or 1, so it's better to treat it as an Int8 + static_assert(sizeof(bool) == sizeof(Int8)); + Int8 flag = 0; + readBinary(flag, buf); + x = (flag != 0); +} + inline void readBinary(String & x, ReadBuffer & buf) { readStringBinary(x, buf); } inline void readBinary(Int128 & x, ReadBuffer & buf) { readPODBinary(x, buf); } inline void readBinary(Int256 & x, ReadBuffer & buf) { readPODBinary(x, buf); } diff --git a/src/IO/parseDateTimeBestEffort.cpp b/src/IO/parseDateTimeBestEffort.cpp index 1dd06401bf1..e0cba169e81 100644 --- a/src/IO/parseDateTimeBestEffort.cpp +++ b/src/IO/parseDateTimeBestEffort.cpp @@ -659,6 +659,9 @@ ReturnType parseDateTime64BestEffortImpl(DateTime64 & res, UInt32 scale, ReadBuf fractional *= common::exp10_i64(scale - subsecond.digits); } + if constexpr (std::is_same_v) + 
return DecimalUtils::tryGetDecimalFromComponents(whole, fractional, scale, res); + res = DecimalUtils::decimalFromComponents(whole, fractional, scale); return ReturnType(true); } diff --git a/src/IO/tests/gtest_file_encryption.cpp b/src/IO/tests/gtest_file_encryption.cpp index 5353faa6086..6a090ff0810 100644 --- a/src/IO/tests/gtest_file_encryption.cpp +++ b/src/IO/tests/gtest_file_encryption.cpp @@ -251,7 +251,7 @@ TEST(FileEncryptionPositionUpdateTest, Decryption) rb.seek(0, SEEK_SET); ASSERT_EQ(rb.getPosition(), 0); res.resize(5); - rb.read(res.data(), res.size()); + ASSERT_EQ(rb.read(res.data(), res.size()), 5); ASSERT_EQ(res, data.substr(0, 5)); res.clear(); diff --git a/src/Interpreters/AsynchronousMetricLog.cpp b/src/Interpreters/AsynchronousMetricLog.cpp index 228934d5f4d..6176bb781ab 100644 --- a/src/Interpreters/AsynchronousMetricLog.cpp +++ b/src/Interpreters/AsynchronousMetricLog.cpp @@ -47,7 +47,7 @@ void AsynchronousMetricLog::addValues(const AsynchronousMetricValues & values) for (const auto & [key, value] : values) { element.metric_name = key; - element.value = round(value * precision) / precision; + element.value = round(value.value * precision) / precision; add(element); } diff --git a/src/Interpreters/AsynchronousMetricLog.h b/src/Interpreters/AsynchronousMetricLog.h index 900d84868bd..8a19fae29e9 100644 --- a/src/Interpreters/AsynchronousMetricLog.h +++ b/src/Interpreters/AsynchronousMetricLog.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -14,12 +15,8 @@ namespace DB { -using AsynchronousMetricValue = double; -using AsynchronousMetricValues = std::unordered_map; - /** AsynchronousMetricLog is a log of metric values measured at regular time interval. */ - struct AsynchronousMetricLogElement { UInt16 event_date; diff --git a/src/Interpreters/AsynchronousMetrics.cpp b/src/Interpreters/AsynchronousMetrics.cpp index 488ac77e956..291bca4277d 100644 --- a/src/Interpreters/AsynchronousMetrics.cpp +++ b/src/Interpreters/AsynchronousMetrics.cpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -24,15 +23,16 @@ #include #include #include +#include #include - #include "config.h" #if USE_JEMALLOC # include #endif + namespace DB { @@ -123,9 +123,9 @@ void AsynchronousMetrics::openSensors() { LOG_WARNING( &Poco::Logger::get("AsynchronousMetrics"), - "Thermal monitor '{}' exists but could not be read, error {}.", + "Thermal monitor '{}' exists but could not be read: {}.", thermal_device_index, - e.getErrno()); + errnoToString(e.getErrno())); continue; } @@ -252,10 +252,10 @@ void AsynchronousMetrics::openSensorsChips() { LOG_WARNING( &Poco::Logger::get("AsynchronousMetrics"), - "Hardware monitor '{}', sensor '{}' exists but could not be read, error {}.", + "Hardware monitor '{}', sensor '{}' exists but could not be read: {}.", hwmon_name, sensor_name, - e.getErrno()); + errnoToString(e.getErrno())); continue; } @@ -386,14 +386,15 @@ uint64_t updateJemallocEpoch() } template -static Value saveJemallocMetricImpl(AsynchronousMetricValues & values, +static Value saveJemallocMetricImpl( + AsynchronousMetricValues & values, const std::string & jemalloc_full_name, const std::string & clickhouse_full_name) { Value value{}; size_t size = sizeof(value); mallctl(jemalloc_full_name.c_str(), &value, &size, nullptr, 0); - values[clickhouse_full_name] = value; + values[clickhouse_full_name] = AsynchronousMetricValue(value, "An internal metric of the low-level memory allocator (jemalloc). 
See https://jemalloc.net/jemalloc.3.html"); return value; } @@ -570,85 +571,93 @@ void AsynchronousMetrics::update(TimePoint update_time) previous_update_time = update_time; /// This is also a good indicator of system responsiveness. - new_values["Jitter"] = std::chrono::duration_cast(current_time - update_time).count() / 1e9; + new_values["Jitter"] = { std::chrono::duration_cast(current_time - update_time).count() / 1e9, + "The difference in time the thread for calculation of the asynchronous metrics was scheduled to wake up and the time it was in fact, woken up." + " A proxy-indicator of overall system latency and responsiveness." }; + if (auto mark_cache = getContext()->getMarkCache()) { - if (auto mark_cache = getContext()->getMarkCache()) - { - new_values["MarkCacheBytes"] = mark_cache->weight(); - new_values["MarkCacheFiles"] = mark_cache->count(); - } + new_values["MarkCacheBytes"] = { mark_cache->weight(), "Total size of mark cache in bytes" }; + new_values["MarkCacheFiles"] = { mark_cache->count(), "Total number of mark files cached in the mark cache" }; } + if (auto uncompressed_cache = getContext()->getUncompressedCache()) { - if (auto uncompressed_cache = getContext()->getUncompressedCache()) - { - new_values["UncompressedCacheBytes"] = uncompressed_cache->weight(); - new_values["UncompressedCacheCells"] = uncompressed_cache->count(); - } + new_values["UncompressedCacheBytes"] = { uncompressed_cache->weight(), + "Total size of uncompressed cache in bytes. Uncompressed cache does not usually improve the performance and should be mostly avoided." }; + new_values["UncompressedCacheCells"] = { uncompressed_cache->count(), + "Total number of entries in the uncompressed cache. Each entry represents a decompressed block of data. Uncompressed cache does not usually improve performance and should be mostly avoided." }; } + if (auto index_mark_cache = getContext()->getIndexMarkCache()) { - if (auto index_mark_cache = getContext()->getIndexMarkCache()) - { - new_values["IndexMarkCacheBytes"] = index_mark_cache->weight(); - new_values["IndexMarkCacheFiles"] = index_mark_cache->count(); - } + new_values["IndexMarkCacheBytes"] = { index_mark_cache->weight(), "Total size of mark cache for secondary indices in bytes." }; + new_values["IndexMarkCacheFiles"] = { index_mark_cache->count(), "Total number of mark files cached in the mark cache for secondary indices." }; } + if (auto index_uncompressed_cache = getContext()->getIndexUncompressedCache()) { - if (auto index_uncompressed_cache = getContext()->getIndexUncompressedCache()) - { - new_values["IndexUncompressedCacheBytes"] = index_uncompressed_cache->weight(); - new_values["IndexUncompressedCacheCells"] = index_uncompressed_cache->count(); - } + new_values["IndexUncompressedCacheBytes"] = { index_uncompressed_cache->weight(), + "Total size of uncompressed cache in bytes for secondary indices. Uncompressed cache does not usually improve the performance and should be mostly avoided." }; + new_values["IndexUncompressedCacheCells"] = { index_uncompressed_cache->count(), + "Total number of entries in the uncompressed cache for secondary indices. Each entry represents a decompressed block of data. Uncompressed cache does not usually improve performance and should be mostly avoided." 
}; } + if (auto mmap_cache = getContext()->getMMappedFileCache()) { - if (auto mmap_cache = getContext()->getMMappedFileCache()) - { - new_values["MMapCacheCells"] = mmap_cache->count(); - } + new_values["MMapCacheCells"] = { mmap_cache->count(), + "The number of files opened with `mmap` (mapped in memory)." + " This is used for queries with the setting `local_filesystem_read_method` set to `mmap`." + " The files opened with `mmap` are kept in the cache to avoid costly TLB flushes."}; } { auto caches = FileCacheFactory::instance().getAll(); + size_t total_bytes = 0; + size_t total_files = 0; + for (const auto & [_, cache_data] : caches) { - new_values["FilesystemCacheBytes"] = cache_data->cache->getUsedCacheSize(); - new_values["FilesystemCacheFiles"] = cache_data->cache->getFileSegmentsNum(); + total_bytes += cache_data->cache->getUsedCacheSize(); + total_files += cache_data->cache->getFileSegmentsNum(); } + + new_values["FilesystemCacheBytes"] = { total_bytes, + "Total bytes in the `cache` virtual filesystem. This cache is hold on disk." }; + new_values["FilesystemCacheFiles"] = { total_files, + "Total number of cached file segments in the `cache` virtual filesystem. This cache is hold on disk." }; } #if USE_ROCKSDB + if (auto metadata_cache = getContext()->tryGetMergeTreeMetadataCache()) { - if (auto metadata_cache = getContext()->tryGetMergeTreeMetadataCache()) - { - new_values["MergeTreeMetadataCacheSize"] = metadata_cache->getEstimateNumKeys(); - } + new_values["MergeTreeMetadataCacheSize"] = { metadata_cache->getEstimateNumKeys(), + "The size of the metadata cache for tables. This cache is experimental and not used in production." }; } #endif #if USE_EMBEDDED_COMPILER + if (auto * compiled_expression_cache = CompiledExpressionCacheFactory::instance().tryGetCache()) { - if (auto * compiled_expression_cache = CompiledExpressionCacheFactory::instance().tryGetCache()) - { - new_values["CompiledExpressionCacheBytes"] = compiled_expression_cache->weight(); - new_values["CompiledExpressionCacheCount"] = compiled_expression_cache->count(); - } + new_values["CompiledExpressionCacheBytes"] = { compiled_expression_cache->weight(), + "Total bytes used for the cache of JIT-compiled code." }; + new_values["CompiledExpressionCacheCount"] = { compiled_expression_cache->count(), + "Total entries in the cache of JIT-compiled code." }; } #endif + new_values["Uptime"] = { getContext()->getUptimeSeconds(), + "The server uptime in seconds. It includes the time spent for server initialization before accepting connections." }; - new_values["Uptime"] = getContext()->getUptimeSeconds(); - + if (const auto stats = getHashTablesCacheStatistics()) { - if (const auto stats = getHashTablesCacheStatistics()) - { - new_values["HashTableStatsCacheEntries"] = stats->entries; - new_values["HashTableStatsCacheHits"] = stats->hits; - new_values["HashTableStatsCacheMisses"] = stats->misses; - } + new_values["HashTableStatsCacheEntries"] = { stats->entries, + "The number of entries in the cache of hash table sizes." + " The cache for hash table sizes is used for predictive optimization of GROUP BY." }; + new_values["HashTableStatsCacheHits"] = { stats->hits, + "The number of times the prediction of a hash table size was correct." }; + new_values["HashTableStatsCacheMisses"] = { stats->misses, + "The number of times the prediction of a hash table size was incorrect." 
}; } #if defined(OS_LINUX) || defined(OS_FREEBSD) @@ -660,7 +669,7 @@ void AsynchronousMetrics::update(TimePoint update_time) // the following calls will return stale values. It increments and returns // the current epoch number, which might be useful to log as a sanity check. auto epoch = updateJemallocEpoch(); - new_values["jemalloc.epoch"] = epoch; + new_values["jemalloc.epoch"] = { epoch, "An internal incremental update number of the statistics of jemalloc (Jason Evans' memory allocator), used in all other `jemalloc` metrics." }; // Collect the statistics themselves. saveJemallocMetric(new_values, "allocated"); @@ -685,13 +694,24 @@ void AsynchronousMetrics::update(TimePoint update_time) { MemoryStatisticsOS::Data & data = memory_statistics_data; - new_values["MemoryVirtual"] = data.virt; - new_values["MemoryResident"] = data.resident; + new_values["MemoryVirtual"] = { data.virt, + "The size of the virtual address space allocated by the server process, in bytes." + " The size of the virtual address space is usually much greater than the physical memory consumption, and should not be used as an estimate for the memory consumption." + " The large values of this metric are totally normal, and makes only technical sense."}; + new_values["MemoryResident"] = { data.resident, + "The amount of physical memory used by the server process, in bytes." }; #if !defined(OS_FREEBSD) - new_values["MemoryShared"] = data.shared; + new_values["MemoryShared"] = { data.shared, + "The amount of memory used by the server process, that is also shared by another processes, in bytes." + " ClickHouse does not use shared memory, but some memory can be labeled by OS as shared for its own reasons." + " This metric does not make a lot of sense to watch, and it exists only for completeness reasons."}; #endif - new_values["MemoryCode"] = data.code; - new_values["MemoryDataAndStack"] = data.data_and_stack; + new_values["MemoryCode"] = { data.code, + "The amount of virtual memory mapped for the pages of machine code of the server process, in bytes." }; + new_values["MemoryDataAndStack"] = { data.data_and_stack, + "The amount of virtual memory mapped for the use of stack and for the allocated memory, in bytes." + " It is unspecified whether it includes the per-thread stacks and most of the allocated memory, that is allocated with the 'mmap' system call." + " This metric exists only for completeness reasons. I recommend to use the `MemoryResident` metric for monitoring."}; /// We must update the value of total_memory_tracker periodically. /// Otherwise it might be calculated incorrectly - it can include a "drift" of memory amount. @@ -754,11 +774,22 @@ void AsynchronousMetrics::update(TimePoint update_time) assertChar('/', *loadavg); readText(threads_total, *loadavg); - new_values["LoadAverage1"] = loadavg1; - new_values["LoadAverage5"] = loadavg5; - new_values["LoadAverage15"] = loadavg15; - new_values["OSThreadsRunnable"] = threads_runnable; - new_values["OSThreadsTotal"] = threads_total; +#define LOAD_AVERAGE_DOCUMENTATION \ + " The load represents the number of threads across all the processes (the scheduling entities of the OS kernel)," \ + " that are currently running by CPU or waiting for IO, or ready to run but not being scheduled at this point of time." \ + " This number includes all the processes, not only clickhouse-server. The number can be greater than the number of CPU cores," \ + " if the system is overloaded, and many processes are ready to run but waiting for CPU or IO." 
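For context, the hunk above fills `LoadAverage1/5/15`, `OSThreadsRunnable` and `OSThreadsTotal` by parsing `/proc/loadavg`, whose single line looks like `0.52 0.58 0.59 2/1345 67890` (three exponentially smoothed load averages, runnable/total kernel scheduling entities, last created PID). Below is a minimal standalone sketch of that parsing, using plain iostreams rather than ClickHouse's `ReadBuffer`/`readText` helpers; it is illustrative only and not part of the patch.

```cpp
// Standalone sketch (not ClickHouse code): read /proc/loadavg and split it into
// the same five fields the patch exports as asynchronous metrics.
#include <fstream>
#include <iostream>
#include <string>

int main()
{
    std::ifstream loadavg("/proc/loadavg");
    if (!loadavg)
    {
        std::cerr << "Cannot open /proc/loadavg (Linux only)\n";
        return 1;
    }

    double load1 = 0, load5 = 0, load15 = 0;
    std::string runnable_slash_total;   // e.g. "2/1345"
    loadavg >> load1 >> load5 >> load15 >> runnable_slash_total;

    const auto slash = runnable_slash_total.find('/');
    if (slash == std::string::npos)
    {
        std::cerr << "Unexpected /proc/loadavg format\n";
        return 1;
    }
    const unsigned long threads_runnable = std::stoul(runnable_slash_total.substr(0, slash));
    const unsigned long threads_total = std::stoul(runnable_slash_total.substr(slash + 1));

    std::cout << "LoadAverage1: " << load1 << "\n"
              << "LoadAverage5: " << load5 << "\n"
              << "LoadAverage15: " << load15 << "\n"
              << "OSThreadsRunnable: " << threads_runnable << "\n"
              << "OSThreadsTotal: " << threads_total << "\n";
    return 0;
}
```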
+ + new_values["LoadAverage1"] = { loadavg1, + "The whole system load, averaged with exponential smoothing over 1 minute." LOAD_AVERAGE_DOCUMENTATION }; + new_values["LoadAverage5"] = { loadavg5, + "The whole system load, averaged with exponential smoothing over 5 minutes." LOAD_AVERAGE_DOCUMENTATION }; + new_values["LoadAverage15"] = { loadavg15, + "The whole system load, averaged with exponential smoothing over 15 minutes." LOAD_AVERAGE_DOCUMENTATION }; + new_values["OSThreadsRunnable"] = { threads_runnable, + "The total number of 'runnable' threads, as the OS kernel scheduler seeing it." }; + new_values["OSThreadsTotal"] = { threads_total, + "The total number of threads, as the OS kernel scheduler seeing it." }; } catch (...) { @@ -775,7 +806,7 @@ void AsynchronousMetrics::update(TimePoint update_time) Float64 uptime_seconds = 0; readText(uptime_seconds, *uptime); - new_values["OSUptime"] = uptime_seconds; + new_values["OSUptime"] = { uptime_seconds, "The uptime of the host server (the machine where ClickHouse is running), in seconds." }; } catch (...) { @@ -838,16 +869,43 @@ void AsynchronousMetrics::update(TimePoint update_time) else delta_values_all_cpus = delta_values; - new_values["OSUserTime" + cpu_suffix] = delta_values.user * multiplier; - new_values["OSNiceTime" + cpu_suffix] = delta_values.nice * multiplier; - new_values["OSSystemTime" + cpu_suffix] = delta_values.system * multiplier; - new_values["OSIdleTime" + cpu_suffix] = delta_values.idle * multiplier; - new_values["OSIOWaitTime" + cpu_suffix] = delta_values.iowait * multiplier; - new_values["OSIrqTime" + cpu_suffix] = delta_values.irq * multiplier; - new_values["OSSoftIrqTime" + cpu_suffix] = delta_values.softirq * multiplier; - new_values["OSStealTime" + cpu_suffix] = delta_values.steal * multiplier; - new_values["OSGuestTime" + cpu_suffix] = delta_values.guest * multiplier; - new_values["OSGuestNiceTime" + cpu_suffix] = delta_values.guest_nice * multiplier; + new_values["OSUserTime" + cpu_suffix] = { delta_values.user * multiplier, + "The ratio of time the CPU core was running userspace code. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." + " This includes also the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline stalls, branch mispredictions, running another SMT core)." + " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."}; + new_values["OSNiceTime" + cpu_suffix] = { delta_values.nice * multiplier, + "The ratio of time the CPU core was running userspace code with higher priority. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." + " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."}; + new_values["OSSystemTime" + cpu_suffix] = { delta_values.system * multiplier, + "The ratio of time the CPU core was running OS kernel (system) code. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." + " The value for a single CPU core will be in the interval [0..1]. 
The value for all CPU cores is calculated as a sum across them [0..num cores]."}; + new_values["OSIdleTime" + cpu_suffix] = { delta_values.idle * multiplier, + "The ratio of time the CPU core was idle (not even ready to run a process waiting for IO) from the OS kernel standpoint. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." + " This does not include the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline stalls, branch mispredictions, running another SMT core)." + " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."}; + new_values["OSIOWaitTime" + cpu_suffix] = { delta_values.iowait * multiplier, + "The ratio of time the CPU core was not running the code but when the OS kernel did not run any other process on this CPU as the processes were waiting for IO. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." + " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."}; + new_values["OSIrqTime" + cpu_suffix] = { delta_values.irq * multiplier, + "The ratio of time spent for running hardware interrupt requests on the CPU. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." + " A high number of this metric may indicate hardware misconfiguration or a very high network load." + " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."}; + new_values["OSSoftIrqTime" + cpu_suffix] = { delta_values.softirq * multiplier, + "The ratio of time spent for running software interrupt requests on the CPU. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." + " A high number of this metric may indicate inefficient software running on the system." + " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."}; + new_values["OSStealTime" + cpu_suffix] = { delta_values.steal * multiplier, + "The ratio of time spent in other operating systems by the CPU when running in a virtualized environment. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." + " Not every virtualized environments present this metric, and most of them don't." + " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."}; + new_values["OSGuestTime" + cpu_suffix] = { delta_values.guest * multiplier, + "The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel (See `man procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." + " This metric is irrelevant for ClickHouse, but still exists for completeness." + " The value for a single CPU core will be in the interval [0..1]. 
The value for all CPU cores is calculated as a sum across them [0..num cores]."}; + new_values["OSGuestNiceTime" + cpu_suffix] = { delta_values.guest_nice * multiplier, + "The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel, when a guest was set to a higher priority (See `man procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." + " This metric is irrelevant for ClickHouse, but still exists for completeness." + " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."}; } prev_values = current_values; @@ -872,14 +930,18 @@ void AsynchronousMetrics::update(TimePoint update_time) UInt64 processes_running = 0; readText(processes_running, *proc_stat); skipToNextLineOrEOF(*proc_stat); - new_values["OSProcessesRunning"] = processes_running; + new_values["OSProcessesRunning"] = { processes_running, + "The number of runnable (running or ready to run) threads by the operating system." + " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." }; } else if (name == "procs_blocked") { UInt64 processes_blocked = 0; readText(processes_blocked, *proc_stat); skipToNextLineOrEOF(*proc_stat); - new_values["OSProcessesBlocked"] = processes_blocked; + new_values["OSProcessesBlocked"] = { processes_blocked, + "Number of threads blocked waiting for I/O to complete (`man procfs`)." + " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." }; } else skipToNextLineOrEOF(*proc_stat); @@ -889,25 +951,45 @@ void AsynchronousMetrics::update(TimePoint update_time) { ProcStatValuesOther delta_values = current_other_values - proc_stat_values_other; - new_values["OSInterrupts"] = delta_values.interrupts; - new_values["OSContextSwitches"] = delta_values.context_switches; - new_values["OSProcessesCreated"] = delta_values.processes_created; + new_values["OSInterrupts"] = { delta_values.interrupts, "The number of interrupts on the host machine. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." }; + new_values["OSContextSwitches"] = { delta_values.context_switches, "The number of context switches that the system underwent on the host machine. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." }; + new_values["OSProcessesCreated"] = { delta_values.processes_created, "The number of processes created. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." }; /// Also write values normalized to 0..1 by diving to the number of CPUs. /// These values are good to be averaged across the cluster of non-uniform servers. 
if (num_cpus) { - new_values["OSUserTimeNormalized"] = delta_values_all_cpus.user * multiplier / num_cpus; - new_values["OSNiceTimeNormalized"] = delta_values_all_cpus.nice * multiplier / num_cpus; - new_values["OSSystemTimeNormalized"] = delta_values_all_cpus.system * multiplier / num_cpus; - new_values["OSIdleTimeNormalized"] = delta_values_all_cpus.idle * multiplier / num_cpus; - new_values["OSIOWaitTimeNormalized"] = delta_values_all_cpus.iowait * multiplier / num_cpus; - new_values["OSIrqTimeNormalized"] = delta_values_all_cpus.irq * multiplier / num_cpus; - new_values["OSSoftIrqTimeNormalized"] = delta_values_all_cpus.softirq * multiplier / num_cpus; - new_values["OSStealTimeNormalized"] = delta_values_all_cpus.steal * multiplier / num_cpus; - new_values["OSGuestTimeNormalized"] = delta_values_all_cpus.guest * multiplier / num_cpus; - new_values["OSGuestNiceTimeNormalized"] = delta_values_all_cpus.guest_nice * multiplier / num_cpus; + new_values["OSUserTimeNormalized"] = { delta_values_all_cpus.user * multiplier / num_cpus, + "The value is similar to `OSUserTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." + " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; + new_values["OSNiceTimeNormalized"] = { delta_values_all_cpus.nice * multiplier / num_cpus, + "The value is similar to `OSNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." + " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; + new_values["OSSystemTimeNormalized"] = { delta_values_all_cpus.system * multiplier / num_cpus, + "The value is similar to `OSSystemTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." + " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; + new_values["OSIdleTimeNormalized"] = { delta_values_all_cpus.idle * multiplier / num_cpus, + "The value is similar to `OSIdleTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." + " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; + new_values["OSIOWaitTimeNormalized"] = { delta_values_all_cpus.iowait * multiplier / num_cpus, + "The value is similar to `OSIOWaitTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." + " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; + new_values["OSIrqTimeNormalized"] = { delta_values_all_cpus.irq * multiplier / num_cpus, + "The value is similar to `OSIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." 
+ " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; + new_values["OSSoftIrqTimeNormalized"] = { delta_values_all_cpus.softirq * multiplier / num_cpus, + "The value is similar to `OSSoftIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." + " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; + new_values["OSStealTimeNormalized"] = { delta_values_all_cpus.steal * multiplier / num_cpus, + "The value is similar to `OSStealTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." + " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; + new_values["OSGuestTimeNormalized"] = { delta_values_all_cpus.guest * multiplier / num_cpus, + "The value is similar to `OSGuestTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." + " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; + new_values["OSGuestNiceTimeNormalized"] = { delta_values_all_cpus.guest_nice * multiplier / num_cpus, + "The value is similar to `OSGuestNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores." + " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."}; } } @@ -962,39 +1044,47 @@ void AsynchronousMetrics::update(TimePoint update_time) if (name == "MemTotal:") { - new_values["OSMemoryTotal"] = bytes; + new_values["OSMemoryTotal"] = { bytes, "The total amount of memory on the host system, in bytes." }; } else if (name == "MemFree:") { - /// We cannot simply name this metric "Free", because it confuses users. - /// See https://www.linuxatemyram.com/ - /// For convenience we also provide OSMemoryFreePlusCached, that should be somewhat similar to OSMemoryAvailable. - free_plus_cached_bytes += bytes; - new_values["OSMemoryFreeWithoutCached"] = bytes; + new_values["OSMemoryFreeWithoutCached"] = { bytes, + "The amount of free memory on the host system, in bytes." + " This does not include the memory used by the OS page cache memory, in bytes." + " The page cache memory is also available for usage by programs, so the value of this metric can be confusing." + " See the `OSMemoryAvailable` metric instead." + " For convenience we also provide the `OSMemoryFreePlusCached` metric, that should be somewhat similar to OSMemoryAvailable." + " See also https://www.linuxatemyram.com/." + " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." }; } else if (name == "MemAvailable:") { - new_values["OSMemoryAvailable"] = bytes; + new_values["OSMemoryAvailable"] = { bytes, "The amount of memory available to be used by programs, in bytes. This is very similar to the `OSMemoryFreePlusCached` metric." 
+ " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." }; } else if (name == "Buffers:") { - new_values["OSMemoryBuffers"] = bytes; + new_values["OSMemoryBuffers"] = { bytes, "The amount of memory used by OS kernel buffers, in bytes. This should be typically small, and large values may indicate a misconfiguration of the OS." + " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." }; } else if (name == "Cached:") { free_plus_cached_bytes += bytes; - new_values["OSMemoryCached"] = bytes; + new_values["OSMemoryCached"] = { bytes, "The amount of memory used by the OS page cache, in bytes. Typically, almost all available memory is used by the OS page cache - high values of this metric are normal and expected." + " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." }; } else if (name == "SwapCached:") { - new_values["OSMemorySwapCached"] = bytes; + new_values["OSMemorySwapCached"] = { bytes, "The amount of memory in swap that was also loaded in RAM. Swap should be disabled on production systems. If the value of this metric is large, it indicates a misconfiguration." + " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." }; } skipToNextLineOrEOF(*meminfo); } - new_values["OSMemoryFreePlusCached"] = free_plus_cached_bytes; + new_values["OSMemoryFreePlusCached"] = { free_plus_cached_bytes, "The amount of free memory plus OS page cache memory on the host system, in bytes. This memory is available to be used by programs. The value should be very similar to `OSMemoryAvailable`." + " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." }; } catch (...) { @@ -1043,7 +1133,7 @@ void AsynchronousMetrics::update(TimePoint update_time) if (auto colon = s.find_first_of(':')) { auto mhz = std::stod(s.substr(colon + 2)); - new_values[fmt::format("CPUFrequencyMHz_{}", core_id)] = mhz; + new_values[fmt::format("CPUFrequencyMHz_{}", core_id)] = { mhz, "The current frequency of the CPU, in MHz. Most of the modern CPUs adjust the frequency dynamically for power saving and Turbo Boosting." }; } } } @@ -1062,7 +1152,8 @@ void AsynchronousMetrics::update(TimePoint update_time) uint64_t open_files = 0; readText(open_files, *file_nr); - new_values["OSOpenFiles"] = open_files; + new_values["OSOpenFiles"] = { open_files, "The total number of opened files on the host machine." + " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." }; } catch (...) { @@ -1083,7 +1174,17 @@ void AsynchronousMetrics::update(TimePoint update_time) BlockDeviceStatValues current_values{}; BlockDeviceStatValues & prev_values = block_device_stats[name]; - current_values.read(*device); + + try + { + current_values.read(*device); + } + catch (const ErrnoException & e) + { + LOG_DEBUG(log, "Cannot read statistics about the block device '{}': {}.", + name, errnoToString(e.getErrno())); + continue; + } BlockDeviceStatValues delta_values = current_values - prev_values; prev_values = current_values; @@ -1097,42 +1198,89 @@ void AsynchronousMetrics::update(TimePoint update_time) /// Always in milliseconds according to the docs. 
static constexpr double time_multiplier = 1e-6; - new_values["BlockReadOps_" + name] = delta_values.read_ios; - new_values["BlockWriteOps_" + name] = delta_values.write_ios; - new_values["BlockDiscardOps_" + name] = delta_values.discard_ops; +#define BLOCK_DEVICE_EXPLANATION \ + " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." \ + " Source: `/sys/block`. See https://www.kernel.org/doc/Documentation/block/stat.txt" - new_values["BlockReadMerges_" + name] = delta_values.read_merges; - new_values["BlockWriteMerges_" + name] = delta_values.write_merges; - new_values["BlockDiscardMerges_" + name] = delta_values.discard_merges; + new_values["BlockReadOps_" + name] = { delta_values.read_ios, + "Number of read operations requested from the block device." + BLOCK_DEVICE_EXPLANATION }; + new_values["BlockWriteOps_" + name] = { delta_values.write_ios, + "Number of write operations requested from the block device." + BLOCK_DEVICE_EXPLANATION }; + new_values["BlockDiscardOps_" + name] = { delta_values.discard_ops, + "Number of discard operations requested from the block device. These operations are relevant for SSD." + " Discard operations are not used by ClickHouse, but can be used by other processes on the system." + BLOCK_DEVICE_EXPLANATION }; - new_values["BlockReadBytes_" + name] = delta_values.read_sectors * sector_size; - new_values["BlockWriteBytes_" + name] = delta_values.write_sectors * sector_size; - new_values["BlockDiscardBytes_" + name] = delta_values.discard_sectors * sector_size; + new_values["BlockReadMerges_" + name] = { delta_values.read_merges, + "Number of read operations requested from the block device and merged together by the OS IO scheduler." + BLOCK_DEVICE_EXPLANATION }; + new_values["BlockWriteMerges_" + name] = { delta_values.write_merges, + "Number of write operations requested from the block device and merged together by the OS IO scheduler." + BLOCK_DEVICE_EXPLANATION }; + new_values["BlockDiscardMerges_" + name] = { delta_values.discard_merges, + "Number of discard operations requested from the block device and merged together by the OS IO scheduler." + " These operations are relevant for SSD. Discard operations are not used by ClickHouse, but can be used by other processes on the system." + BLOCK_DEVICE_EXPLANATION }; - new_values["BlockReadTime_" + name] = delta_values.read_ticks * time_multiplier; - new_values["BlockWriteTime_" + name] = delta_values.write_ticks * time_multiplier; - new_values["BlockDiscardTime_" + name] = delta_values.discard_ticks * time_multiplier; + new_values["BlockReadBytes_" + name] = { delta_values.read_sectors * sector_size, + "Number of bytes read from the block device." + " It can be lower than the number of bytes read from the filesystem due to the usage of the OS page cache, that saves IO." + BLOCK_DEVICE_EXPLANATION }; + new_values["BlockWriteBytes_" + name] = { delta_values.write_sectors * sector_size, + "Number of bytes written to the block device." + " It can be lower than the number of bytes written to the filesystem due to the usage of the OS page cache, that saves IO." + " A write to the block device may happen later than the corresponding write to the filesystem due to write-through caching." + BLOCK_DEVICE_EXPLANATION }; + new_values["BlockDiscardBytes_" + name] = { delta_values.discard_sectors * sector_size, + "Number of discarded bytes on the block device." + " These operations are relevant for SSD. 
Discard operations are not used by ClickHouse, but can be used by other processes on the system." + BLOCK_DEVICE_EXPLANATION }; - new_values["BlockInFlightOps_" + name] = delta_values.in_flight_ios; + new_values["BlockReadTime_" + name] = { delta_values.read_ticks * time_multiplier, + "Time in seconds spend in read operations requested from the block device, summed across all the operations." + BLOCK_DEVICE_EXPLANATION }; + new_values["BlockWriteTime_" + name] = { delta_values.write_ticks * time_multiplier, + "Time in seconds spend in write operations requested from the block device, summed across all the operations." + BLOCK_DEVICE_EXPLANATION }; + new_values["BlockDiscardTime_" + name] = { delta_values.discard_ticks * time_multiplier, + "Time in seconds spend in discard operations requested from the block device, summed across all the operations." + " These operations are relevant for SSD. Discard operations are not used by ClickHouse, but can be used by other processes on the system." + BLOCK_DEVICE_EXPLANATION }; - new_values["BlockActiveTime_" + name] = delta_values.io_ticks * time_multiplier; - new_values["BlockQueueTime_" + name] = delta_values.time_in_queue * time_multiplier; + new_values["BlockInFlightOps_" + name] = { delta_values.in_flight_ios, + "This value counts the number of I/O requests that have been issued to" + " the device driver but have not yet completed. It does not include IO" + " requests that are in the queue but not yet issued to the device driver." + BLOCK_DEVICE_EXPLANATION }; + new_values["BlockActiveTime_" + name] = { delta_values.io_ticks * time_multiplier, + "Time in seconds the block device had the IO requests queued." + BLOCK_DEVICE_EXPLANATION }; + new_values["BlockQueueTime_" + name] = { delta_values.time_in_queue * time_multiplier, + "This value counts the number of milliseconds that IO requests have waited" + " on this block device. If there are multiple IO requests waiting, this" + " value will increase as the product of the number of milliseconds times the" + " number of requests waiting." + BLOCK_DEVICE_EXPLANATION }; if (delta_values.in_flight_ios) { /// TODO Check if these values are meaningful. - new_values["BlockActiveTimePerOp_" + name] = delta_values.io_ticks * time_multiplier / delta_values.in_flight_ios; - new_values["BlockQueueTimePerOp_" + name] = delta_values.time_in_queue * time_multiplier / delta_values.in_flight_ios; + new_values["BlockActiveTimePerOp_" + name] = { delta_values.io_ticks * time_multiplier / delta_values.in_flight_ios, + "Similar to the `BlockActiveTime` metrics, but the value is divided to the number of IO operations to count the per-operation time." }; + new_values["BlockQueueTimePerOp_" + name] = { delta_values.time_in_queue * time_multiplier / delta_values.in_flight_ios, + "Similar to the `BlockQueueTime` metrics, but the value is divided to the number of IO operations to count the per-operation time." }; } } } catch (...) { - tryLogCurrentException(__PRETTY_FUNCTION__); + LOG_DEBUG(log, "Cannot read statistics from block devices: {}", getCurrentExceptionMessage(false)); /// Try to reopen block devices in case of error - /// (i.e. ENOENT means that some disk had been replaced, and it may apperas with a new name) + /// (i.e. 
ENOENT or ENODEV means that some disk had been replaced, and it may appear with a new name)
         try
         {
             openBlockDevices();
@@ -1211,15 +1359,31 @@ void AsynchronousMetrics::update(TimePoint update_time)
                 if (!first_run)
                 {
-                    new_values["NetworkReceiveBytes_" + interface_name] = delta_values.recv_bytes;
-                    new_values["NetworkReceivePackets_" + interface_name] = delta_values.recv_packets;
-                    new_values["NetworkReceiveErrors_" + interface_name] = delta_values.recv_errors;
-                    new_values["NetworkReceiveDrop_" + interface_name] = delta_values.recv_drop;
+                    new_values["NetworkReceiveBytes_" + interface_name] = { delta_values.recv_bytes,
+                        " Number of bytes received via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                    new_values["NetworkReceivePackets_" + interface_name] = { delta_values.recv_packets,
+                        " Number of network packets received via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                    new_values["NetworkReceiveErrors_" + interface_name] = { delta_values.recv_errors,
+                        " Number of times an error occurred while receiving via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                    new_values["NetworkReceiveDrop_" + interface_name] = { delta_values.recv_drop,
+                        " Number of times a packet was dropped while being received via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
-                    new_values["NetworkSendBytes_" + interface_name] = delta_values.send_bytes;
-                    new_values["NetworkSendPackets_" + interface_name] = delta_values.send_packets;
-                    new_values["NetworkSendErrors_" + interface_name] = delta_values.send_errors;
-                    new_values["NetworkSendDrop_" + interface_name] = delta_values.send_drop;
+                    new_values["NetworkSendBytes_" + interface_name] = { delta_values.send_bytes,
+                        " Number of bytes sent via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                    new_values["NetworkSendPackets_" + interface_name] = { delta_values.send_packets,
+                        " Number of network packets sent via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                    new_values["NetworkSendErrors_" + interface_name] = { delta_values.send_errors,
+                        " Number of times an error (e.g. TCP retransmit) occurred while sending via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                    new_values["NetworkSendDrop_" + interface_name] = { delta_values.send_drop,
+                        " Number of times a packet was dropped while sending via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
                 }
             }
         }
@@ -1238,7 +1402,8 @@ void AsynchronousMetrics::update(TimePoint update_time)
                 in.rewind();
                 Int64 temperature = 0;
                 readText(temperature, in);
-                new_values[fmt::format("Temperature{}", i)] = temperature * 0.001;
+                new_values[fmt::format("Temperature{}", i)] = { temperature * 0.001,
+                    "The temperature of the corresponding device in ℃. A sensor can return an unrealistic value. Source: `/sys/class/thermal`" };
             }
         }
         catch (...)
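The `Temperature{N}` metrics in the hunk above come from the Linux `thermal` sysfs interface, where each `/sys/class/thermal/thermal_zone*/temp` file reports the temperature in millidegrees Celsius, hence the `* 0.001` in the patch. The following self-contained sketch shows that read loop with the plain standard library instead of ClickHouse's `ReadBufferFromFilePRead`, and without the error handling the patch adds; it is illustrative only.

```cpp
// Standalone sketch (not part of the patch): enumerate thermal zones and print
// their temperatures in degrees Celsius, the same data behind Temperature{N}.
#include <cstdio>
#include <fstream>
#include <string>

int main()
{
    for (size_t i = 0;; ++i)
    {
        const std::string path = "/sys/class/thermal/thermal_zone" + std::to_string(i) + "/temp";
        std::ifstream in(path);
        if (!in)
            break;  // No more thermal zones.

        long long millidegrees = 0;
        in >> millidegrees;
        // A sensor can return an unrealistic value; real monitoring code should tolerate that.
        std::printf("Temperature%zu: %.3f\n", i, millidegrees * 0.001);
    }
    return 0;
}
```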
@@ -1271,13 +1436,17 @@ void AsynchronousMetrics::update(TimePoint update_time) } catch (const ErrnoException & e) { - LOG_DEBUG(&Poco::Logger::get("AsynchronousMetrics"), "Hardware monitor '{}', sensor '{}' exists but could not be read, error {}.", hwmon_name, sensor_name, e.getErrno()); + LOG_DEBUG(log, "Hardware monitor '{}', sensor '{}' exists but could not be read: {}.", + hwmon_name, sensor_name, errnoToString(e.getErrno())); + continue; } if (sensor_name.empty()) - new_values[fmt::format("Temperature_{}", hwmon_name)] = temperature * 0.001; + new_values[fmt::format("Temperature_{}", hwmon_name)] = { temperature * 0.001, + "The temperature reported by the corresponding hardware monitor in ℃. A sensor can return an unrealistic value. Source: `/sys/class/hwmon`" }; else - new_values[fmt::format("Temperature_{}_{}", hwmon_name, sensor_name)] = temperature * 0.001; + new_values[fmt::format("Temperature_{}_{}", hwmon_name, sensor_name)] = { temperature * 0.001, + "The temperature reported by the corresponding hardware monitor and the corresponding sensor in ℃. A sensor can return an unrealistic value. Source: `/sys/class/hwmon`" }; } } } @@ -1313,7 +1482,11 @@ void AsynchronousMetrics::update(TimePoint update_time) in.rewind(); uint64_t errors = 0; readText(errors, in); - new_values[fmt::format("EDAC{}_Correctable", i)] = errors; + new_values[fmt::format("EDAC{}_Correctable", i)] = { errors, + "The number of correctable ECC memory errors." + " A high number of this value indicates bad RAM which has to be immediately replaced," + " because in presence of a high number of corrected errors, a number of silent errors may happen as well, leading to data corruption." + " Source: `/sys/devices/system/edac/mc/`" }; } if (edac[i].second) @@ -1322,7 +1495,11 @@ void AsynchronousMetrics::update(TimePoint update_time) in.rewind(); uint64_t errors = 0; readText(errors, in); - new_values[fmt::format("EDAC{}_Uncorrectable", i)] = errors; + new_values[fmt::format("EDAC{}_Uncorrectable", i)] = { errors, + "The number of uncorrectable ECC memory errors." + " A non-zero number of this value indicates bad RAM which has to be immediately replaced," + " because it indicates potential data corruption." + " Source: `/sys/devices/system/edac/mc/`" }; } } } @@ -1346,24 +1523,36 @@ void AsynchronousMetrics::update(TimePoint update_time) { auto stat = getStatVFS(getContext()->getPath()); - new_values["FilesystemMainPathTotalBytes"] = stat.f_blocks * stat.f_frsize; - new_values["FilesystemMainPathAvailableBytes"] = stat.f_bavail * stat.f_frsize; - new_values["FilesystemMainPathUsedBytes"] = (stat.f_blocks - stat.f_bavail) * stat.f_frsize; - new_values["FilesystemMainPathTotalINodes"] = stat.f_files; - new_values["FilesystemMainPathAvailableINodes"] = stat.f_favail; - new_values["FilesystemMainPathUsedINodes"] = stat.f_files - stat.f_favail; + new_values["FilesystemMainPathTotalBytes"] = { stat.f_blocks * stat.f_frsize, + "The size of the volume where the main ClickHouse path is mounted, in bytes." }; + new_values["FilesystemMainPathAvailableBytes"] = { stat.f_bavail * stat.f_frsize, + "Available bytes on the volume where the main ClickHouse path is mounted." }; + new_values["FilesystemMainPathUsedBytes"] = { (stat.f_blocks - stat.f_bavail) * stat.f_frsize, + "Used bytes on the volume where the main ClickHouse path is mounted." }; + new_values["FilesystemMainPathTotalINodes"] = { stat.f_files, + "The total number of inodes on the volume where the main ClickHouse path is mounted. 
If it is less than 25 million, it indicates a misconfiguration." }; + new_values["FilesystemMainPathAvailableINodes"] = { stat.f_favail, + "The number of available inodes on the volume where the main ClickHouse path is mounted. If it is close to zero, it indicates a misconfiguration, and you will get 'no space left on device' even when the disk is not full." }; + new_values["FilesystemMainPathUsedINodes"] = { stat.f_files - stat.f_favail, + "The number of used inodes on the volume where the main ClickHouse path is mounted. This value mostly corresponds to the number of files." }; } { /// Current working directory of the server is the directory with logs. auto stat = getStatVFS("."); - new_values["FilesystemLogsPathTotalBytes"] = stat.f_blocks * stat.f_frsize; - new_values["FilesystemLogsPathAvailableBytes"] = stat.f_bavail * stat.f_frsize; - new_values["FilesystemLogsPathUsedBytes"] = (stat.f_blocks - stat.f_bavail) * stat.f_frsize; - new_values["FilesystemLogsPathTotalINodes"] = stat.f_files; - new_values["FilesystemLogsPathAvailableINodes"] = stat.f_favail; - new_values["FilesystemLogsPathUsedINodes"] = stat.f_files - stat.f_favail; + new_values["FilesystemLogsPathTotalBytes"] = { stat.f_blocks * stat.f_frsize, + "The size of the volume where ClickHouse logs path is mounted, in bytes. It's recommended to have at least 10 GB for logs." }; + new_values["FilesystemLogsPathAvailableBytes"] = { stat.f_bavail * stat.f_frsize, + "Available bytes on the volume where ClickHouse logs path is mounted. If this value approaches zero, you should tune the log rotation in the configuration file." }; + new_values["FilesystemLogsPathUsedBytes"] = { (stat.f_blocks - stat.f_bavail) * stat.f_frsize, + "Used bytes on the volume where ClickHouse logs path is mounted." }; + new_values["FilesystemLogsPathTotalINodes"] = { stat.f_files, + "The total number of inodes on the volume where ClickHouse logs path is mounted." }; + new_values["FilesystemLogsPathAvailableINodes"] = { stat.f_favail, + "The number of available inodes on the volume where ClickHouse logs path is mounted." }; + new_values["FilesystemLogsPathUsedINodes"] = { stat.f_files - stat.f_favail, + "The number of used inodes on the volume where ClickHouse logs path is mounted." }; } /// Free and total space on every configured disk. @@ -1380,10 +1569,14 @@ void AsynchronousMetrics::update(TimePoint update_time) auto available = disk->getAvailableSpace(); auto unreserved = disk->getUnreservedSpace(); - new_values[fmt::format("DiskTotal_{}", name)] = total; - new_values[fmt::format("DiskUsed_{}", name)] = total - available; - new_values[fmt::format("DiskAvailable_{}", name)] = available; - new_values[fmt::format("DiskUnreserved_{}", name)] = unreserved; + new_values[fmt::format("DiskTotal_{}", name)] = { total, + "The total size in bytes of the disk (virtual filesystem). Remote filesystems can show a large value like 16 EiB." }; + new_values[fmt::format("DiskUsed_{}", name)] = { total - available, + "Used bytes on the disk (virtual filesystem). Remote filesystems not always provide this information." }; + new_values[fmt::format("DiskAvailable_{}", name)] = { available, + "Available bytes on the disk (virtual filesystem). Remote filesystems can show a large value like 16 EiB." }; + new_values[fmt::format("DiskUnreserved_{}", name)] = { unreserved, + "Available bytes on the disk (virtual filesystem) without the reservations for merges, fetches, and moves. Remote filesystems can show a large value like 16 EiB." 
}; } } @@ -1463,44 +1656,46 @@ void AsynchronousMetrics::update(TimePoint update_time) } } - new_values["ReplicasMaxQueueSize"] = max_queue_size; - new_values["ReplicasMaxInsertsInQueue"] = max_inserts_in_queue; - new_values["ReplicasMaxMergesInQueue"] = max_merges_in_queue; + new_values["ReplicasMaxQueueSize"] = { max_queue_size, "Maximum queue size (in the number of operations like get, merge) across Replicated tables." }; + new_values["ReplicasMaxInsertsInQueue"] = { max_inserts_in_queue, "Maximum number of INSERT operations in the queue (still to be replicated) across Replicated tables." }; + new_values["ReplicasMaxMergesInQueue"] = { max_merges_in_queue, "Maximum number of merge operations in the queue (still to be applied) across Replicated tables." }; - new_values["ReplicasSumQueueSize"] = sum_queue_size; - new_values["ReplicasSumInsertsInQueue"] = sum_inserts_in_queue; - new_values["ReplicasSumMergesInQueue"] = sum_merges_in_queue; + new_values["ReplicasSumQueueSize"] = { sum_queue_size, "Sum queue size (in the number of operations like get, merge) across Replicated tables." }; + new_values["ReplicasSumInsertsInQueue"] = { sum_inserts_in_queue, "Sum of INSERT operations in the queue (still to be replicated) across Replicated tables." }; + new_values["ReplicasSumMergesInQueue"] = { sum_merges_in_queue, "Sum of merge operations in the queue (still to be applied) across Replicated tables." }; - new_values["ReplicasMaxAbsoluteDelay"] = max_absolute_delay; - new_values["ReplicasMaxRelativeDelay"] = max_relative_delay; + new_values["ReplicasMaxAbsoluteDelay"] = { max_absolute_delay, "Maximum difference in seconds between the most fresh replicated part and the most fresh data part still to be replicated, across Replicated tables. A very high value indicates a replica with no data." }; + new_values["ReplicasMaxRelativeDelay"] = { max_relative_delay, "Maximum difference between the replica delay and the delay of the most up-to-date replica of the same table, across Replicated tables." }; - new_values["MaxPartCountForPartition"] = max_part_count_for_partition; + new_values["MaxPartCountForPartition"] = { max_part_count_for_partition, "Maximum number of parts per partition across all partitions of all tables of MergeTree family. Values larger than 300 indicates misconfiguration, overload, or massive data loading." }; - new_values["NumberOfDatabases"] = number_of_databases; - new_values["NumberOfTables"] = total_number_of_tables; + new_values["NumberOfDatabases"] = { number_of_databases, "Total number of databases on the server." }; + new_values["NumberOfTables"] = { total_number_of_tables, "Total number of tables summed across the databases on the server, excluding the databases that cannot contain MergeTree tables." + " The excluded database engines are those who generate the set of tables on the fly, like `Lazy`, `MySQL`, `PostgreSQL`, `SQlite`."}; - new_values["TotalBytesOfMergeTreeTables"] = total_number_of_bytes; - new_values["TotalRowsOfMergeTreeTables"] = total_number_of_rows; - new_values["TotalPartsOfMergeTreeTables"] = total_number_of_parts; + new_values["TotalBytesOfMergeTreeTables"] = { total_number_of_bytes, "Total amount of bytes (compressed, including data and indices) stored in all tables of MergeTree family." }; + new_values["TotalRowsOfMergeTreeTables"] = { total_number_of_rows, "Total amount of rows (records) stored in all tables of MergeTree family." 
}; + new_values["TotalPartsOfMergeTreeTables"] = { total_number_of_parts, "Total amount of data parts in all tables of MergeTree family." + " Numbers larger than 10 000 will negatively affect the server startup time and it may indicate unreasonable choice of the partition key." }; - auto get_metric_name = [](const String & name) -> const char * + auto get_metric_name_doc = [](const String & name) -> std::pair { - static std::map metric_map = + static std::map> metric_map = { - {"tcp_port", "TCPThreads"}, - {"tcp_port_secure", "TCPSecureThreads"}, - {"http_port", "HTTPThreads"}, - {"https_port", "HTTPSecureThreads"}, - {"interserver_http_port", "InterserverThreads"}, - {"interserver_https_port", "InterserverSecureThreads"}, - {"mysql_port", "MySQLThreads"}, - {"postgresql_port", "PostgreSQLThreads"}, - {"grpc_port", "GRPCThreads"}, - {"prometheus.port", "PrometheusThreads"} + {"tcp_port", {"TCPThreads", "Number of threads in the server of the TCP protocol (without TLS)."}}, + {"tcp_port_secure", {"TCPSecureThreads", "Number of threads in the server of the TCP protocol (with TLS)."}}, + {"http_port", {"HTTPThreads", "Number of threads in the server of the HTTP interface (without TLS)."}}, + {"https_port", {"HTTPSecureThreads", "Number of threads in the server of the HTTPS interface."}}, + {"interserver_http_port", {"InterserverThreads", "Number of threads in the server of the replicas communication protocol (without TLS)."}}, + {"interserver_https_port", {"InterserverSecureThreads", "Number of threads in the server of the replicas communication protocol (with TLS)."}}, + {"mysql_port", {"MySQLThreads", "Number of threads in the server of the MySQL compatibility protocol."}}, + {"postgresql_port", {"PostgreSQLThreads", "Number of threads in the server of the PostgreSQL compatibility protocol."}}, + {"grpc_port", {"GRPCThreads", "Number of threads in the server of the GRPC protocol."}}, + {"prometheus.port", {"PrometheusThreads", "Number of threads in the server of the Prometheus endpoint. 
Note: prometheus endpoints can be also used via the usual HTTP/HTTPs ports."}} }; auto it = metric_map.find(name); if (it == metric_map.end()) - return nullptr; + return { nullptr, nullptr }; else return it->second; }; @@ -1508,8 +1703,8 @@ void AsynchronousMetrics::update(TimePoint update_time) const auto server_metrics = protocol_server_metrics_func(); for (const auto & server_metric : server_metrics) { - if (const auto * name = get_metric_name(server_metric.port_name)) - new_values[name] = server_metric.current_threads; + if (auto name_doc = get_metric_name_doc(server_metric.port_name); name_doc.first != nullptr) + new_values[name_doc.first] = { server_metric.current_threads, name_doc.second }; } } #if USE_NURAFT @@ -1522,14 +1717,14 @@ void AsynchronousMetrics::update(TimePoint update_time) size_t is_observer = 0; size_t is_standalone = 0; size_t znode_count = 0; - size_t watch_count =0; + size_t watch_count = 0; size_t ephemerals_count = 0; - size_t approximate_data_size =0; + size_t approximate_data_size = 0; size_t key_arena_size = 0; - size_t latest_snapshot_size =0; - size_t open_file_descriptor_count =0; - size_t max_file_descriptor_count =0; - size_t followers =0; + size_t latest_snapshot_size = 0; + size_t open_file_descriptor_count = 0; + size_t max_file_descriptor_count = 0; + size_t followers = 0; size_t synced_followers = 0; size_t zxid = 0; size_t session_with_watches = 0; @@ -1570,29 +1765,29 @@ void AsynchronousMetrics::update(TimePoint update_time) } } - new_values["KeeperIsLeader"] = is_leader; - new_values["KeeperIsFollower"] = is_follower; - new_values["KeeperIsObserver"] = is_observer; - new_values["KeeperIsStandalone"] = is_standalone; + new_values["KeeperIsLeader"] = { is_leader, "1 if ClickHouse Keeper is a leader, 0 otherwise." }; + new_values["KeeperIsFollower"] = { is_follower, "1 if ClickHouse Keeper is a follower, 0 otherwise." }; + new_values["KeeperIsObserver"] = { is_observer, "1 if ClickHouse Keeper is an observer, 0 otherwise." }; + new_values["KeeperIsStandalone"] = { is_standalone, "1 if ClickHouse Keeper is in a standalone mode, 0 otherwise." }; - new_values["KeeperZnodeCount"] = znode_count; - new_values["KeeperWatchCount"] = watch_count; - new_values["KeeperEphemeralsCount"] = ephemerals_count; + new_values["KeeperZnodeCount"] = { znode_count, "The number of nodes (data entries) in ClickHouse Keeper." }; + new_values["KeeperWatchCount"] = { watch_count, "The number of watches in ClickHouse Keeper." }; + new_values["KeeperEphemeralsCount"] = { ephemerals_count, "The number of ephemeral nodes in ClickHouse Keeper." }; - new_values["KeeperApproximateDataSize"] = approximate_data_size; - new_values["KeeperKeyArenaSize"] = key_arena_size; - new_values["KeeperLatestSnapshotSize"] = latest_snapshot_size; + new_values["KeeperApproximateDataSize"] = { approximate_data_size, "The approximate data size of ClickHouse Keeper, in bytes." }; + new_values["KeeperKeyArenaSize"] = { key_arena_size, "The size in bytes of the memory arena for keys in ClickHouse Keeper." }; + new_values["KeeperLatestSnapshotSize"] = { latest_snapshot_size, "The uncompressed size in bytes of the latest snapshot created by ClickHouse Keeper." }; - new_values["KeeperOpenFileDescriptorCount"] = open_file_descriptor_count; - new_values["KeeperMaxFileDescriptorCount"] = max_file_descriptor_count; + new_values["KeeperOpenFileDescriptorCount"] = { open_file_descriptor_count, "The number of open file descriptors in ClickHouse Keeper." 
}; + new_values["KeeperMaxFileDescriptorCount"] = { max_file_descriptor_count, "The maximum number of open file descriptors in ClickHouse Keeper." }; - new_values["KeeperFollowers"] = followers; - new_values["KeeperSyncedFollowers"] = synced_followers; - new_values["KeeperZxid"] = zxid; - new_values["KeeperSessionWithWatches"] = session_with_watches; - new_values["KeeperPathsWatched"] = paths_watched; - new_values["KeeperSnapshotDirSize"] = snapshot_dir_size; - new_values["KeeperLogDirSize"] = log_dir_size; + new_values["KeeperFollowers"] = { followers, "The number of followers of ClickHouse Keeper." }; + new_values["KeeperSyncedFollowers"] = { synced_followers, "The number of followers of ClickHouse Keeper who are also in-sync." }; + new_values["KeeperZxid"] = { zxid, "The current transaction id number (zxid) in ClickHouse Keeper." }; + new_values["KeeperSessionWithWatches"] = { session_with_watches, "The number of client sessions of ClickHouse Keeper having watches." }; + new_values["KeeperPathsWatched"] = { paths_watched, "The number of different paths watched by the clients of ClickHouse Keeper." }; + new_values["KeeperSnapshotDirSize"] = { snapshot_dir_size, "The size of the snapshots directory of ClickHouse Keeper, in bytes." }; + new_values["KeeperLogDirSize"] = { log_dir_size, "The size of the logs directory of ClickHouse Keeper, in bytes." }; } } #endif @@ -1601,7 +1796,7 @@ void AsynchronousMetrics::update(TimePoint update_time) /// Add more metrics as you wish. - new_values["AsynchronousMetricsCalculationTimeSpent"] = watch.elapsedSeconds(); + new_values["AsynchronousMetricsCalculationTimeSpent"] = { watch.elapsedSeconds(), "Time in seconds spent for calculation of asynchronous metrics (this is the overhead of asynchronous metrics)." }; /// Log the new metrics. if (auto asynchronous_metric_log = getContext()->getAsynchronousMetricLog()) @@ -1681,11 +1876,10 @@ void AsynchronousMetrics::updateHeavyMetricsIfNeeded(TimePoint current_time, Tim update_period.count(), heavy_metric_update_period.count(), watch.elapsedSeconds()); - } - new_values["NumberOfDetachedParts"] = detached_parts_stats.count; - new_values["NumberOfDetachedByUserParts"] = detached_parts_stats.detached_by_user; + new_values["NumberOfDetachedParts"] = { detached_parts_stats.count, "The total number of parts detached from MergeTree tables. A part can be detached by a user with the `ALTER TABLE DETACH` query or by the server itself it the part is broken, unexpected or unneeded. The server does not care about detached parts and they can be removed." }; + new_values["NumberOfDetachedByUserParts"] = { detached_parts_stats.detached_by_user, "The total number of parts detached from MergeTree tables by users with the `ALTER TABLE DETACH` query (as opposed to unexpected, broken or ignored parts). The server does not care about detached parts and they can be removed." 
}; } } diff --git a/src/Interpreters/AsynchronousMetrics.h b/src/Interpreters/AsynchronousMetrics.h index 6e32bdb43b8..22ed2e862ea 100644 --- a/src/Interpreters/AsynchronousMetrics.h +++ b/src/Interpreters/AsynchronousMetrics.h @@ -18,16 +18,25 @@ namespace Poco { -class Logger; + class Logger; } namespace DB { -class ProtocolServerAdapter; class ReadBuffer; -using AsynchronousMetricValue = double; +struct AsynchronousMetricValue +{ + double value; + const char * documentation; + + template + AsynchronousMetricValue(T value_, const char * documentation_) + : value(static_cast(value_)), documentation(documentation_) {} + AsynchronousMetricValue() = default; /// For std::unordered_map::operator[]. +}; + using AsynchronousMetricValues = std::unordered_map; struct ProtocolServerMetrics @@ -42,6 +51,9 @@ struct ProtocolServerMetrics * * This includes both ClickHouse-related metrics (like memory usage of ClickHouse process) * and common OS-related metrics (like total memory usage on the server). + * + * All the values are either gauge type (like the total number of tables, the current memory usage). + * Or delta-counters representing some accumulation during the interval of time. */ class AsynchronousMetrics : WithContext { diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index c0537a252d8..f53985acdae 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -79,6 +79,8 @@ #include #include #include +#include +#include #include #include #include @@ -1233,7 +1235,7 @@ void Context::addQueryFactoriesInfo(QueryLogFactories factory_type, const String } -StoragePtr Context::executeTableFunction(const ASTPtr & table_expression) +StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const ASTSelectQuery * select_query_hint) { ASTFunction * function = assert_cast(table_expression.get()); String database_name = getCurrentDatabase(); @@ -1278,15 +1280,61 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression) } throw; } - if (getSettingsRef().use_structure_from_insertion_table_in_table_functions && table_function_ptr->needStructureHint()) + if (getSettingsRef().use_structure_from_insertion_table_in_table_functions && table_function_ptr->needStructureHint() && hasInsertionTable()) { - const auto & insertion_table = getInsertionTable(); - if (!insertion_table.empty()) + const auto & structure_hint = DatabaseCatalog::instance().getTable(getInsertionTable(), shared_from_this())->getInMemoryMetadataPtr()->getColumns(); + bool use_columns_from_insert_query = true; + + /// use_structure_from_insertion_table_in_table_functions=2 means `auto` + if (select_query_hint && getSettingsRef().use_structure_from_insertion_table_in_table_functions == 2) { - const auto & structure_hint - = DatabaseCatalog::instance().getTable(insertion_table, shared_from_this())->getInMemoryMetadataPtr()->columns; - table_function_ptr->setStructureHint(structure_hint); + const auto * expression_list = select_query_hint->select()->as(); + Names columns_names; + bool have_asterisk = false; + /// First, check if we have only identifiers, asterisk and literals in select expression, + /// and if no, we cannot use the structure from insertion table. 
+ for (const auto & expression : expression_list->children) + { + if (auto * identifier = expression->as()) + { + columns_names.push_back(identifier->name()); + } + else if (expression->as()) + { + have_asterisk = true; + } + else if (!expression->as()) + { + use_columns_from_insert_query = false; + break; + } + } + + /// Check that all identifiers are column names from insertion table. + for (const auto & column_name : columns_names) + { + if (!structure_hint.has(column_name)) + { + use_columns_from_insert_query = false; + break; + } + } + + /// If we don't have asterisk but only subset of columns, we should use + /// structure from insertion table only in case when table function + /// supports reading subset of columns from data. + if (use_columns_from_insert_query && !have_asterisk && !columns_names.empty()) + { + /// For input function we should check if input format supports reading subset of columns. + if (table_function_ptr->getName() == "input") + use_columns_from_insert_query = FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(getInsertFormat()); + else + use_columns_from_insert_query = table_function_ptr->supportsReadingSubsetOfColumns(); + } } + + if (use_columns_from_insert_query) + table_function_ptr->setStructureHint(structure_hint); } res = table_function_ptr->execute(table_expression, shared_from_this(), table_function_ptr->getName()); @@ -1520,12 +1568,21 @@ String Context::getDefaultFormat() const return default_format.empty() ? "TabSeparated" : default_format; } - void Context::setDefaultFormat(const String & name) { default_format = name; } +String Context::getInsertFormat() const +{ + return insert_format; +} + +void Context::setInsertFormat(const String & name) +{ + insert_format = name; +} + MultiVersion::Version Context::getMacros() const { return shared->macros.get(); diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 0eaec215588..a0b62da364e 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -239,6 +240,9 @@ private: String default_format; /// Format, used when server formats data by itself and if query does not have FORMAT specification. /// Thus, used in HTTP interface. If not specified - then some globally default format is used. + + String insert_format; /// Format, used in insert query. + TemporaryTablesMapping external_tables_mapping; Scalars scalars; /// Used to store constant values which are different on each instance during distributed plan, such as _shard_num. @@ -602,7 +606,9 @@ public: const QueryFactoriesInfo & getQueryFactoriesInfo() const { return query_factories_info; } void addQueryFactoriesInfo(QueryLogFactories factory_type, const String & created_object) const; - StoragePtr executeTableFunction(const ASTPtr & table_expression); + /// For table functions s3/file/url/hdfs/input we can use structure from + /// insertion table depending on select expression. + StoragePtr executeTableFunction(const ASTPtr & table_expression, const ASTSelectQuery * select_query_hint = nullptr); void addViewSource(const StoragePtr & storage); StoragePtr getViewSource() const; @@ -631,6 +637,9 @@ public: String getDefaultFormat() const; /// If default_format is not specified, some global default format is returned. 
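A toy model of the new `insert_format` plumbing introduced here (hypothetical `ToyContext`; the real `Context` carries far more state):

#include <string>

class ToyContext
{
public:
    void setInsertFormat(const std::string & name) { insert_format = name; }
    std::string getInsertFormat() const { return insert_format; }

private:
    std::string insert_format; /// e.g. "CSVWithNames", taken from the INSERT query
};

int main()
{
    ToyContext context;
    /// executeQueryImpl() stores the format named in the INSERT query ...
    context.setInsertFormat("CSVWithNames");
    /// ... so the input() path can later ask whether that format supports
    /// reading a subset of columns (cf. executeTableFunction above).
    return context.getInsertFormat().empty() ? 1 : 0;
}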
void setDefaultFormat(const String & name); + String getInsertFormat() const; + void setInsertFormat(const String & name); + MultiVersion::Version getMacros() const; void setMacros(std::unique_ptr && macros); diff --git a/src/Interpreters/JoinedTables.cpp b/src/Interpreters/JoinedTables.cpp index f4a98ada199..b88bb5d1caf 100644 --- a/src/Interpreters/JoinedTables.cpp +++ b/src/Interpreters/JoinedTables.cpp @@ -173,12 +173,13 @@ using RenameQualifiedIdentifiersVisitor = InDepthNodeVisitorgetQueryContext()->executeTableFunction(left_table_expression); + return context->getQueryContext()->executeTableFunction(left_table_expression, &select_query); StorageID table_id = StorageID::createEmpty(); if (left_db_and_table) diff --git a/src/Interpreters/JoinedTables.h b/src/Interpreters/JoinedTables.h index 9d01c081e9f..7562dbc9ac5 100644 --- a/src/Interpreters/JoinedTables.h +++ b/src/Interpreters/JoinedTables.h @@ -22,7 +22,7 @@ using StorageMetadataPtr = std::shared_ptr; class JoinedTables { public: - JoinedTables(ContextPtr context, const ASTSelectQuery & select_query, bool include_all_columns_ = false); + JoinedTables(ContextPtr context, const ASTSelectQuery & select_query_, bool include_all_columns_ = false); void reset(const ASTSelectQuery & select_query); @@ -52,6 +52,7 @@ private: /// Legacy (duplicated left table values) ASTPtr left_table_expression; std::optional left_db_and_table; + const ASTSelectQuery & select_query; }; } diff --git a/src/Interpreters/TraceCollector.cpp b/src/Interpreters/TraceCollector.cpp index d277763a141..41a7fcf8389 100644 --- a/src/Interpreters/TraceCollector.cpp +++ b/src/Interpreters/TraceCollector.cpp @@ -72,7 +72,7 @@ void TraceCollector::run() UInt8 query_id_size = 0; readBinary(query_id_size, in); query_id.resize(query_id_size); - in.read(query_id.data(), query_id_size); + in.readStrict(query_id.data(), query_id_size); UInt8 trace_size = 0; readIntBinary(trace_size, in); diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 8a4352ae152..44723546006 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -455,6 +455,7 @@ static std::tuple executeQueryImpl( } else if (auto * insert_query = ast->as()) { + context->setInsertFormat(insert_query->format); if (insert_query->settings_ast) InterpreterSetQuery(insert_query->settings_ast, context).executeForCurrentContext(); insert_query->tail = istr; @@ -534,7 +535,7 @@ static std::tuple executeQueryImpl( insert_query->tryFindInputFunction(input_function); if (input_function) { - StoragePtr storage = context->executeTableFunction(input_function); + StoragePtr storage = context->executeTableFunction(input_function, insert_query->select->as()); auto & input_storage = dynamic_cast(*storage); auto input_metadata_snapshot = input_storage.getInMemoryMetadataPtr(); auto pipe = getSourceFromASTInsertQuery( diff --git a/src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp b/src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp index c69f91394b9..fe05283eef5 100644 --- a/src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp +++ b/src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp @@ -164,7 +164,7 @@ namespace /// MongoDB('host:port', 'database', 'collection', 'user', 'password', ...) wipePasswordFromArgument(*storage.engine, data, 4); } - else if (engine_name == "S3" || engine_name == "COSN") + else if (engine_name == "S3" || engine_name == "COSN" || engine_name == "OSS") { /// S3('url', ['aws_access_key_id', 'aws_secret_access_key',] ...) 
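The `read` → `readStrict` changes in this patch matter because a plain read may legally return fewer bytes than requested; a generic illustration of the difference (this sketch uses `std::istream`, not ClickHouse's `ReadBuffer` API):

#include <cstddef>
#include <istream>
#include <stdexcept>

/// A short read silently leaves the tail of the buffer uninitialised;
/// a "strict" read either fills the whole buffer or fails loudly.
void readExact(std::istream & in, char * data, size_t n)
{
    in.read(data, static_cast<std::streamsize>(n));
    if (static_cast<size_t>(in.gcount()) != n)
        throw std::runtime_error("Unexpected end of stream: short read");
}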
wipePasswordFromS3TableEngineArguments(*storage.engine, data); @@ -222,7 +222,7 @@ namespace /// mongodb('host:port', 'database', 'collection', 'user', 'password', ...) wipePasswordFromArgument(function, data, 4); } - else if (function.name == "s3" || function.name == "cosn") + else if (function.name == "s3" || function.name == "cosn" || function.name == "oss") { /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...) wipePasswordFromS3FunctionArguments(function, data, /* is_cluster_function= */ false); diff --git a/src/Interpreters/tests/gtest_merge_tree_set_index.cpp b/src/Interpreters/tests/gtest_merge_tree_set_index.cpp index 4bdbc9c9fc5..bae29fb4f26 100644 --- a/src/Interpreters/tests/gtest_merge_tree_set_index.cpp +++ b/src/Interpreters/tests/gtest_merge_tree_set_index.cpp @@ -82,11 +82,11 @@ TEST(MergeTreeSetIndex, checkInRangeTuple) std::vector ranges = {Range(1), Range("a", true, "c", true)}; ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, true) << "Range(1), Range('a', true, 'c', true)"; - ranges = {Range(1, false, 3, false), Range()}; - ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, false) << "Range(1, false, 3, false), Range()"; + ranges = {Range(1, false, 3, false), Range::createWholeUniverseWithoutNull()}; + ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, false) << "Range(1, false, 3, false), Range::createWholeUniverseWithoutNull()"; - ranges = {Range(2, false, 5, false), Range()}; - ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, true) << "Range(2, false, 5, false), Range()"; + ranges = {Range(2, false, 5, false), Range::createWholeUniverseWithoutNull()}; + ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, true) << "Range(2, false, 5, false), Range::createWholeUniverseWithoutNull()"; ranges = {Range(3), Range::createLeftBounded("a", true)}; ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, true) << "Range(3), Range::createLeftBounded('a', true)"; @@ -106,7 +106,7 @@ TEST(MergeTreeSetIndex, checkInRangeTuple) ranges = {Range(1), Range("c")}; ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, false) << "Range(1), Range('c')"; - ranges = {Range(2, true, 3, true), Range()}; + ranges = {Range(2, true, 3, true), Range::createWholeUniverseWithoutNull()}; ASSERT_EQ(set->checkInRange(ranges, types).can_be_true, true) << "Range(2, true, 3, true), Range('x', true, 'z', true)"; ranges = {Range(2), Range("a", true, "z", true)}; diff --git a/src/Parsers/ASTProjectionSelectQuery.cpp b/src/Parsers/ASTProjectionSelectQuery.cpp index 7df0b53da6f..9b85fcb2dac 100644 --- a/src/Parsers/ASTProjectionSelectQuery.cpp +++ b/src/Parsers/ASTProjectionSelectQuery.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -125,10 +126,22 @@ ASTPtr ASTProjectionSelectQuery::cloneToASTSelect() const if (with()) select_query->setExpression(ASTSelectQuery::Expression::WITH, with()->clone()); if (select()) - select_query->setExpression(ASTSelectQuery::Expression::SELECT, select()->clone()); + { + ASTPtr select_list = select()->clone(); + if (orderBy()) + { + /// Add ORDER BY list to SELECT for simplicity. It is Ok because we only uses this to find all required columns. 
+ auto * expressions = select_list->as(); + if (!expressions) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Unexpected structure of SELECT clause in projection definition {}; Expression list expected", + select_list->dumpTree(0)); + expressions->children.emplace_back(orderBy()->clone()); + } + select_query->setExpression(ASTSelectQuery::Expression::SELECT, std::move(select_list)); + } if (groupBy()) select_query->setExpression(ASTSelectQuery::Expression::GROUP_BY, groupBy()->clone()); - // Get rid of orderBy. It's used for projection definition only return node; } diff --git a/src/Parsers/ExpressionListParsers.cpp b/src/Parsers/ExpressionListParsers.cpp index ee212a1993b..44bf1023e6e 100644 --- a/src/Parsers/ExpressionListParsers.cpp +++ b/src/Parsers/ExpressionListParsers.cpp @@ -2201,40 +2201,40 @@ std::vector> ParserExpressionImpl::operators_t {"AND", Operator("and", 4, 2, OperatorType::Mergeable)}, {"BETWEEN", Operator("", 6, 0, OperatorType::StartBetween)}, {"NOT BETWEEN", Operator("", 6, 0, OperatorType::StartNotBetween)}, - {"IS NULL", Operator("isNull", 8, 1, OperatorType::IsNull)}, - {"IS NOT NULL", Operator("isNotNull", 8, 1, OperatorType::IsNull)}, - {"==", Operator("equals", 9, 2, OperatorType::Comparison)}, - {"!=", Operator("notEquals", 9, 2, OperatorType::Comparison)}, - {"<>", Operator("notEquals", 9, 2, OperatorType::Comparison)}, - {"<=", Operator("lessOrEquals", 9, 2, OperatorType::Comparison)}, - {">=", Operator("greaterOrEquals", 9, 2, OperatorType::Comparison)}, - {"<", Operator("less", 9, 2, OperatorType::Comparison)}, - {">", Operator("greater", 9, 2, OperatorType::Comparison)}, - {"=", Operator("equals", 9, 2, OperatorType::Comparison)}, - {"LIKE", Operator("like", 9, 2)}, - {"ILIKE", Operator("ilike", 9, 2)}, - {"NOT LIKE", Operator("notLike", 9, 2)}, - {"NOT ILIKE", Operator("notILike", 9, 2)}, - {"IN", Operator("in", 9, 2)}, - {"NOT IN", Operator("notIn", 9, 2)}, - {"GLOBAL IN", Operator("globalIn", 9, 2)}, - {"GLOBAL NOT IN", Operator("globalNotIn", 9, 2)}, - {"||", Operator("concat", 10, 2, OperatorType::Mergeable)}, - {"+", Operator("plus", 11, 2)}, - {"-", Operator("minus", 11, 2)}, - {"*", Operator("multiply", 12, 2)}, - {"/", Operator("divide", 12, 2)}, - {"%", Operator("modulo", 12, 2)}, - {"MOD", Operator("modulo", 12, 2)}, - {"DIV", Operator("intDiv", 12, 2)}, - {".", Operator("tupleElement", 14, 2, OperatorType::TupleElement)}, - {"[", Operator("arrayElement", 14, 2, OperatorType::ArrayElement)}, - {"::", Operator("CAST", 14, 2, OperatorType::Cast)}, + {"==", Operator("equals", 8, 2, OperatorType::Comparison)}, + {"!=", Operator("notEquals", 8, 2, OperatorType::Comparison)}, + {"<>", Operator("notEquals", 8, 2, OperatorType::Comparison)}, + {"<=", Operator("lessOrEquals", 8, 2, OperatorType::Comparison)}, + {">=", Operator("greaterOrEquals", 8, 2, OperatorType::Comparison)}, + {"<", Operator("less", 8, 2, OperatorType::Comparison)}, + {">", Operator("greater", 8, 2, OperatorType::Comparison)}, + {"=", Operator("equals", 8, 2, OperatorType::Comparison)}, + {"LIKE", Operator("like", 8, 2)}, + {"ILIKE", Operator("ilike", 8, 2)}, + {"NOT LIKE", Operator("notLike", 8, 2)}, + {"NOT ILIKE", Operator("notILike", 8, 2)}, + {"IN", Operator("in", 8, 2)}, + {"NOT IN", Operator("notIn", 8, 2)}, + {"GLOBAL IN", Operator("globalIn", 8, 2)}, + {"GLOBAL NOT IN", Operator("globalNotIn", 8, 2)}, + {"||", Operator("concat", 9, 2, OperatorType::Mergeable)}, + {"+", Operator("plus", 10, 2)}, + {"-", Operator("minus", 10, 2)}, + {"*", Operator("multiply", 11, 2)}, + {"/", 
Operator("divide", 11, 2)}, + {"%", Operator("modulo", 11, 2)}, + {"MOD", Operator("modulo", 11, 2)}, + {"DIV", Operator("intDiv", 11, 2)}, + {".", Operator("tupleElement", 13, 2, OperatorType::TupleElement)}, + {"[", Operator("arrayElement", 13, 2, OperatorType::ArrayElement)}, + {"::", Operator("CAST", 13, 2, OperatorType::Cast)}, + {"IS NULL", Operator("isNull", 13, 1, OperatorType::IsNull)}, + {"IS NOT NULL", Operator("isNotNull", 13, 1, OperatorType::IsNull)}, }); std::vector> ParserExpressionImpl::unary_operators_table({ {"NOT", Operator("not", 5, 1)}, - {"-", Operator("negate", 13, 1)} + {"-", Operator("negate", 12, 1)} }); Operator ParserExpressionImpl::finish_between_operator = Operator("", 7, 0, OperatorType::FinishBetween); diff --git a/src/Parsers/MySQL/ASTAlterCommand.cpp b/src/Parsers/MySQL/ASTAlterCommand.cpp index b5b36ff3c74..10446d40172 100644 --- a/src/Parsers/MySQL/ASTAlterCommand.cpp +++ b/src/Parsers/MySQL/ASTAlterCommand.cpp @@ -267,7 +267,12 @@ static inline bool parseRenameCommand(IParser::Pos & pos, ASTPtr & node, Expecte } else { - return false; + if (!ParserCompoundIdentifier(true).parse(pos, new_name, expected)) + return false; + auto new_table_id = new_name->as()->getTableId(); + alter_command->type = ASTAlterCommand::RENAME_TABLE; + alter_command->new_table_name = new_table_id.table_name; + alter_command->new_database_name = new_table_id.database_name; } node = alter_command; @@ -306,6 +311,7 @@ static inline bool parseOtherCommand(IParser::Pos & pos, ASTPtr & node, Expected OptionDescribe("CONVERT TO CHARACTER SET", "charset", std::make_shared()), OptionDescribe("CHARACTER SET", "charset", std::make_shared()), OptionDescribe("DEFAULT CHARACTER SET", "charset", std::make_shared()), + OptionDescribe("COMMENT", "", std::make_shared()), OptionDescribe("LOCK", "lock", std::make_shared()) } }; diff --git a/src/Parsers/MySQL/ASTDeclareColumn.cpp b/src/Parsers/MySQL/ASTDeclareColumn.cpp index 89085ef989d..e585dcb670c 100644 --- a/src/Parsers/MySQL/ASTDeclareColumn.cpp +++ b/src/Parsers/MySQL/ASTDeclareColumn.cpp @@ -52,6 +52,7 @@ static inline bool parseColumnDeclareOptions(IParser::Pos & pos, ASTPtr & node, OptionDescribe("KEY", "primary_key", std::make_unique()), OptionDescribe("COMMENT", "comment", std::make_unique()), OptionDescribe("CHARACTER SET", "charset_name", std::make_unique()), + OptionDescribe("CHARSET", "charset", std::make_unique()), OptionDescribe("COLLATE", "collate", std::make_unique()), OptionDescribe("COLUMN_FORMAT", "column_format", std::make_unique()), OptionDescribe("STORAGE", "storage", std::make_unique()), @@ -59,6 +60,7 @@ static inline bool parseColumnDeclareOptions(IParser::Pos & pos, ASTPtr & node, OptionDescribe("GENERATED ALWAYS AS", "generated", std::make_unique()), OptionDescribe("STORED", "is_stored", std::make_unique()), OptionDescribe("VIRTUAL", "is_stored", std::make_unique()), + OptionDescribe("INVISIBLE", "", std::make_unique()), OptionDescribe("", "reference", std::make_unique()), OptionDescribe("", "constraint", std::make_unique()), } diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 28a79270133..0566b579be1 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -227,7 +227,11 @@ QueryPlan buildQueryPlanForJoinNode(QueryTreeNodePtr join_tree_node, JoinClausesAndActions join_clauses_and_actions; JoinKind join_kind = join_node.getKind(); - auto join_constant = tryExtractConstantFromJoinNode(join_tree_node); + std::optional join_constant; + + if 
(join_node.getStrictness() == JoinStrictness::All) + join_constant = tryExtractConstantFromJoinNode(join_tree_node); + if (join_constant) { /** If there is JOIN with always true constant, we transform it to cross. diff --git a/src/Planner/PlannerJoins.cpp b/src/Planner/PlannerJoins.cpp index f62517eaaad..53b9cfc5d99 100644 --- a/src/Planner/PlannerJoins.cpp +++ b/src/Planner/PlannerJoins.cpp @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -76,6 +77,23 @@ void JoinClause::dump(WriteBuffer & buffer) const if (!right_filter_condition_nodes.empty()) buffer << " right_condition_nodes: " + dump_dag_nodes(right_filter_condition_nodes); + + if (!asof_conditions.empty()) + { + buffer << " asof_conditions: "; + size_t asof_conditions_size = asof_conditions.size(); + + for (size_t i = 0; i < asof_conditions_size; ++i) + { + const auto & asof_condition = asof_conditions[i]; + + buffer << "key_index: " << asof_condition.key_index; + buffer << "inequality: " << toString(asof_condition.asof_inequality); + + if (i + 1 != asof_conditions_size) + buffer << ','; + } + } } String JoinClause::dump() const @@ -249,9 +267,7 @@ void buildJoinClause(ActionsDAGPtr join_expression_dag, join_node); if (!expression_side_optional) - throw Exception(ErrorCodes::INVALID_JOIN_ON_EXPRESSION, - "JOIN {} with constants is not supported", - join_node.formatASTForErrorMessage()); + expression_side_optional = JoinTableSide::Right; auto expression_side = *expression_side_optional; join_clause.addCondition(expression_side, join_expressions_actions_node); @@ -277,6 +293,22 @@ JoinClausesAndActions buildJoinClausesAndActions(const ColumnsWithTypeAndName & for (const auto & node : join_expression_actions_nodes) join_expression_dag_input_nodes.insert(&node); + auto * function_node = join_node.getJoinExpression()->as(); + if (!function_node) + throw Exception(ErrorCodes::INVALID_JOIN_ON_EXPRESSION, + "JOIN {} join expression expected function", + join_node.formatASTForErrorMessage()); + + /** It is possible to have constant value in JOIN ON section, that we need to ignore during DAG construction. + * If we do not ignore it, this function will be replaced by underlying constant. + * For example ASOF JOIN does not support JOIN with constants, and we should process it like ordinary JOIN. 
+ * + * Example: SELECT * FROM (SELECT 1 AS id, 1 AS value) AS t1 ASOF LEFT JOIN (SELECT 1 AS id, 1 AS value) AS t2 + * ON (t1.id = t2.id) AND 1 != 1 AND (t1.value >= t1.value); + */ + auto constant_value = function_node->getConstantValueOrNull(); + function_node->performConstantFolding({}); + PlannerActionsVisitor join_expression_visitor(planner_context); auto join_expression_dag_node_raw_pointers = join_expression_visitor.visit(join_expression_actions, join_node.getJoinExpression()); if (join_expression_dag_node_raw_pointers.size() != 1) @@ -284,6 +316,8 @@ JoinClausesAndActions buildJoinClausesAndActions(const ColumnsWithTypeAndName & "JOIN {} ON clause contains multiple expressions", join_node.formatASTForErrorMessage()); + function_node->performConstantFolding(std::move(constant_value)); + const auto * join_expressions_actions_root_node = join_expression_dag_node_raw_pointers[0]; if (!join_expressions_actions_root_node->function) throw Exception(ErrorCodes::INVALID_JOIN_ON_EXPRESSION, diff --git a/src/QueryPipeline/RemoteQueryExecutor.h b/src/QueryPipeline/RemoteQueryExecutor.h index 78bc9f611ab..8b8f21a3ae4 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.h +++ b/src/QueryPipeline/RemoteQueryExecutor.h @@ -45,9 +45,9 @@ public: /// decide whether to deny or to accept that request. struct Extension { - std::shared_ptr task_iterator{nullptr}; - std::shared_ptr parallel_reading_coordinator; - std::optional replica_info; + std::shared_ptr task_iterator; + std::shared_ptr parallel_reading_coordinator; + std::optional replica_info; }; /// Takes already set connection. diff --git a/src/Server/PrometheusMetricsWriter.cpp b/src/Server/PrometheusMetricsWriter.cpp index 9168eb5f24d..843d1e64463 100644 --- a/src/Server/PrometheusMetricsWriter.cpp +++ b/src/Server/PrometheusMetricsWriter.cpp @@ -108,11 +108,16 @@ void PrometheusMetricsWriter::write(WriteBuffer & wb) const if (!replaceInvalidChars(key)) continue; + auto value = name_value.second; + std::string metric_doc{value.documentation}; + convertHelpToSingleLine(metric_doc); + // TODO: add HELP section? 
asynchronous_metrics contains only key and value + writeOutLine(wb, "# HELP", key, metric_doc); writeOutLine(wb, "# TYPE", key, "gauge"); - writeOutLine(wb, key, value); + writeOutLine(wb, key, value.value); } } diff --git a/src/Server/PrometheusMetricsWriter.h b/src/Server/PrometheusMetricsWriter.h index 4422ced625e..0c2dde1f66f 100644 --- a/src/Server/PrometheusMetricsWriter.h +++ b/src/Server/PrometheusMetricsWriter.h @@ -3,11 +3,11 @@ #include #include - #include #include + namespace DB { diff --git a/src/Server/ProtocolServerAdapter.h b/src/Server/ProtocolServerAdapter.h index 850640ab70a..514354f9723 100644 --- a/src/Server/ProtocolServerAdapter.h +++ b/src/Server/ProtocolServerAdapter.h @@ -6,8 +6,10 @@ #include #include + namespace DB { + class GRPCServer; class TCPServer; diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index 9c8d3ca60f3..0b296aaef4e 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -19,8 +19,6 @@ #include #include -#include - #include "IServer.h" #include "Server/TCPProtocolStackData.h" #include "base/types.h" diff --git a/src/Storages/Hive/HiveFile.cpp b/src/Storages/Hive/HiveFile.cpp index 8f5b1b5f5fd..219fe562f2c 100644 --- a/src/Storages/Hive/HiveFile.cpp +++ b/src/Storages/Hive/HiveFile.cpp @@ -54,7 +54,7 @@ Range createRangeFromOrcStatistics(const StatisticsType * stats) } else { - return Range(); + return Range::createWholeUniverseWithoutNull(); } } @@ -64,14 +64,14 @@ Range createRangeFromParquetStatistics(std::shared_ptr stats) /// We must check if there are minimum or maximum values in statistics in case of /// null values or NaN/Inf values of double type. if (!stats->HasMinMax()) - return Range(); + return Range::createWholeUniverseWithoutNull(); return Range(FieldType(stats->min()), true, FieldType(stats->max()), true); } Range createRangeFromParquetStatistics(std::shared_ptr stats) { if (!stats->HasMinMax()) - return Range(); + return Range::createWholeUniverseWithoutNull(); String min_val(reinterpret_cast(stats->min().ptr), stats->min().len); String max_val(reinterpret_cast(stats->max().ptr), stats->max().len); return Range(min_val, true, max_val, true); @@ -116,7 +116,7 @@ void IHiveFile::loadSplitMinMaxIndexes() Range HiveORCFile::buildRange(const orc::ColumnStatistics * col_stats) { if (!col_stats || col_stats->hasNull()) - return {}; + return Range::createWholeUniverseWithoutNull(); if (const auto * int_stats = dynamic_cast(col_stats)) { @@ -155,7 +155,7 @@ Range HiveORCFile::buildRange(const orc::ColumnStatistics * col_stats) { return createRangeFromOrcStatistics(date_stats); } - return {}; + return Range::createWholeUniverseWithoutNull(); } void HiveORCFile::prepareReader() @@ -194,7 +194,7 @@ std::unique_ptr HiveORCFile::buildMinMaxIndex(c size_t range_num = index_names_and_types.size(); auto idx = std::make_unique(); - idx->hyperrectangle.resize(range_num); + idx->hyperrectangle.resize(range_num, Range::createWholeUniverseWithoutNull()); size_t i = 0; for (const auto & name_type : index_names_and_types) @@ -308,7 +308,7 @@ void HiveParquetFile::loadSplitMinMaxIndexesImpl() { auto row_group_meta = meta->RowGroup(static_cast(i)); split_minmax_idxes[i] = std::make_shared(); - split_minmax_idxes[i]->hyperrectangle.resize(num_cols); + split_minmax_idxes[i]->hyperrectangle.resize(num_cols, Range::createWholeUniverseWithoutNull()); size_t j = 0; auto it = index_names_and_types.begin(); diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 1d688427a57..f053c072dfa 
100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -438,8 +438,8 @@ const KeyCondition::AtomMap KeyCondition::atom_map [] (RPNElement & out, const Field &) { out.function = RPNElement::FUNCTION_IS_NOT_NULL; - // isNotNull means (-Inf, +Inf), which is the default Range - out.range = Range(); + // isNotNull means (-Inf, +Inf) + out.range = Range::createWholeUniverseWithoutNull(); return true; } }, @@ -448,9 +448,10 @@ const KeyCondition::AtomMap KeyCondition::atom_map [] (RPNElement & out, const Field &) { out.function = RPNElement::FUNCTION_IS_NULL; - // isNull means +Inf (NULLS_LAST) or -Inf (NULLS_FIRST), - // which is equivalent to not in Range (-Inf, +Inf) - out.range = Range(); + // isNull means +Inf (NULLS_LAST) or -Inf (NULLS_FIRST), We don't support discrete + // ranges, instead will use the inverse of (-Inf, +Inf). The inversion happens in + // checkInHyperrectangle. + out.range = Range::createWholeUniverseWithoutNull(); return true; } } @@ -1938,6 +1939,7 @@ static BoolMask forAnyHyperrectangle( bool left_bounded, bool right_bounded, std::vector & hyperrectangle, + const DataTypes & data_types, size_t prefix_size, BoolMask initial_mask, F && callback) @@ -1981,12 +1983,17 @@ static BoolMask forAnyHyperrectangle( if (left_bounded && right_bounded) hyperrectangle[prefix_size] = Range(left_keys[prefix_size], false, right_keys[prefix_size], false); else if (left_bounded) - hyperrectangle[prefix_size] = Range::createLeftBounded(left_keys[prefix_size], false); + hyperrectangle[prefix_size] = Range::createLeftBounded(left_keys[prefix_size], false, data_types[prefix_size]->isNullable()); else if (right_bounded) - hyperrectangle[prefix_size] = Range::createRightBounded(right_keys[prefix_size], false); + hyperrectangle[prefix_size] = Range::createRightBounded(right_keys[prefix_size], false, data_types[prefix_size]->isNullable()); for (size_t i = prefix_size + 1; i < key_size; ++i) - hyperrectangle[i] = Range(); + { + if (data_types[i]->isNullable()) + hyperrectangle[i] = Range::createWholeUniverse(); + else + hyperrectangle[i] = Range::createWholeUniverseWithoutNull(); + } BoolMask result = initial_mask; @@ -2004,7 +2011,9 @@ static BoolMask forAnyHyperrectangle( if (left_bounded) { hyperrectangle[prefix_size] = Range(left_keys[prefix_size]); - result = result | forAnyHyperrectangle(key_size, left_keys, right_keys, true, false, hyperrectangle, prefix_size + 1, initial_mask, callback); + result = result + | forAnyHyperrectangle( + key_size, left_keys, right_keys, true, false, hyperrectangle, data_types, prefix_size + 1, initial_mask, callback); if (result.isComplete()) return result; } @@ -2014,7 +2023,9 @@ static BoolMask forAnyHyperrectangle( if (right_bounded) { hyperrectangle[prefix_size] = Range(right_keys[prefix_size]); - result = result | forAnyHyperrectangle(key_size, left_keys, right_keys, false, true, hyperrectangle, prefix_size + 1, initial_mask, callback); + result = result + | forAnyHyperrectangle( + key_size, left_keys, right_keys, false, true, hyperrectangle, data_types, prefix_size + 1, initial_mask, callback); if (result.isComplete()) return result; } @@ -2030,7 +2041,16 @@ BoolMask KeyCondition::checkInRange( const DataTypes & data_types, BoolMask initial_mask) const { - std::vector key_ranges(used_key_size, Range()); + std::vector key_ranges; + + key_ranges.reserve(used_key_size); + for (size_t i = 0; i < used_key_size; ++i) + { + if (data_types[i]->isNullable()) + key_ranges.push_back(Range::createWholeUniverse()); + else 
+ key_ranges.push_back(Range::createWholeUniverseWithoutNull()); + } // std::cerr << "Checking for: ["; // for (size_t i = 0; i != used_key_size; ++i) @@ -2041,7 +2061,7 @@ BoolMask KeyCondition::checkInRange( // std::cerr << (i != 0 ? ", " : "") << applyVisitor(FieldVisitorToString(), right_keys[i]); // std::cerr << "]\n"; - return forAnyHyperrectangle(used_key_size, left_keys, right_keys, true, true, key_ranges, 0, initial_mask, + return forAnyHyperrectangle(used_key_size, left_keys, right_keys, true, true, key_ranges, data_types, 0, initial_mask, [&] (const std::vector & key_ranges_hyperrectangle) { auto res = checkInHyperrectangle(key_ranges_hyperrectangle, data_types); @@ -2193,7 +2213,7 @@ BoolMask KeyCondition::checkInHyperrectangle( const Range * key_range = &hyperrectangle[element.key_column]; /// The case when the column is wrapped in a chain of possibly monotonic functions. - Range transformed_range; + Range transformed_range = Range::createWholeUniverse(); if (!element.monotonic_functions_chain.empty()) { std::optional new_range = applyMonotonicFunctionsChainToRange( diff --git a/src/Storages/MergeTree/KeyCondition.h b/src/Storages/MergeTree/KeyCondition.h index 138dce83db9..6aa0ae737c8 100644 --- a/src/Storages/MergeTree/KeyCondition.h +++ b/src/Storages/MergeTree/KeyCondition.h @@ -60,13 +60,10 @@ private: static bool less(const Field & lhs, const Field & rhs); public: - FieldRef left = NEGATIVE_INFINITY; /// the left border - FieldRef right = POSITIVE_INFINITY; /// the right border - bool left_included = false; /// includes the left border - bool right_included = false; /// includes the right border - - /// The whole universe (not null). - Range() {} /// NOLINT + FieldRef left; /// the left border + FieldRef right; /// the right border + bool left_included; /// includes the left border + bool right_included; /// includes the right border /// One point. Range(const FieldRef & point) /// NOLINT @@ -82,9 +79,19 @@ public: shrinkToIncludedIfPossible(); } - static Range createRightBounded(const FieldRef & right_point, bool right_included) + static Range createWholeUniverse() { - Range r; + return Range(NEGATIVE_INFINITY, true, POSITIVE_INFINITY, true); + } + + static Range createWholeUniverseWithoutNull() + { + return Range(NEGATIVE_INFINITY, false, POSITIVE_INFINITY, false); + } + + static Range createRightBounded(const FieldRef & right_point, bool right_included, bool with_null = false) + { + Range r = with_null ? createWholeUniverse() : createWholeUniverseWithoutNull(); r.right = right_point; r.right_included = right_included; r.shrinkToIncludedIfPossible(); @@ -94,9 +101,9 @@ public: return r; } - static Range createLeftBounded(const FieldRef & left_point, bool left_included) + static Range createLeftBounded(const FieldRef & left_point, bool left_included, bool with_null = false) { - Range r; + Range r = with_null ? createWholeUniverse() : createWholeUniverseWithoutNull(); r.left = left_point; r.left_included = left_included; r.shrinkToIncludedIfPossible(); @@ -367,7 +374,7 @@ private: Function function = FUNCTION_UNKNOWN; /// For FUNCTION_IN_RANGE and FUNCTION_NOT_IN_RANGE. 
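The two factory methods above differ only in whether the infinite ends, where NULLs are sorted (NULLS_FIRST/NULLS_LAST), are included. A toy model, with doubles standing in for ClickHouse's `Field` infinities:

#include <limits>

struct ToyRange
{
    double left;
    double right;
    bool left_included;
    bool right_included;
};

/// Closed at both infinities: NULLs, which sort to an infinite end, are covered.
ToyRange wholeUniverse()
{
    constexpr double inf = std::numeric_limits<double>::infinity();
    return {-inf, inf, true, true};
}

/// Open at both infinities: the infinite ends, and therefore NULLs, are excluded.
ToyRange wholeUniverseWithoutNull()
{
    constexpr double inf = std::numeric_limits<double>::infinity();
    return {-inf, inf, false, false};
}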
- Range range; + Range range = Range::createWholeUniverse(); size_t key_column = 0; /// For FUNCTION_IN_SET, FUNCTION_NOT_IN_SET using MergeTreeSetIndexPtr = std::shared_ptr; diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index b63e08b733d..227a5c2a0ca 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -669,12 +669,11 @@ MergeTreeBaseSelectProcessor::Status MergeTreeBaseSelectProcessor::performReques if (task->data_part->isProjectionPart()) { part_name = task->data_part->getParentPart()->name; - projection_name = task->data_part->name; + projection_name = task->data_part->name; } else { part_name = task->data_part->name; - projection_name = ""; } PartBlockRange block_range @@ -691,8 +690,9 @@ MergeTreeBaseSelectProcessor::Status MergeTreeBaseSelectProcessor::performReques .block_range = std::move(block_range), .mark_ranges = std::move(requested_ranges) }; + String request_description = request.toString(); - /// Constistent hashing won't work with reading in order, because at the end of the execution + /// Consistent hashing won't work with reading in order, because at the end of the execution /// we could possibly seek back if (!delayed && canUseConsistentHashingForParallelReading()) { @@ -702,6 +702,7 @@ MergeTreeBaseSelectProcessor::Status MergeTreeBaseSelectProcessor::performReques auto delayed_task = std::make_unique(*task); // Create a copy delayed_task->mark_ranges = std::move(request.mark_ranges); delayed_tasks.emplace_back(std::move(delayed_task)); + LOG_TRACE(log, "Request delayed by hash: {}", request_description); return Status::Denied; } } @@ -709,17 +710,24 @@ MergeTreeBaseSelectProcessor::Status MergeTreeBaseSelectProcessor::performReques auto optional_response = extension.value().callback(std::move(request)); if (!optional_response.has_value()) + { + LOG_TRACE(log, "Request cancelled: {}", request_description); return Status::Cancelled; + } auto response = optional_response.value(); task->mark_ranges = std::move(response.mark_ranges); if (response.denied || task->mark_ranges.empty()) + { + LOG_TRACE(log, "Request rejected: {}", request_description); return Status::Denied; + } finalizeNewTask(); + LOG_TRACE(log, "Request accepted: {}", request_description); return Status::Accepted; } diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index 051854d8bc1..e385f5f4d25 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -45,7 +45,7 @@ public: const MergeTreeReaderSettings & reader_settings_, bool use_uncompressed_cache_, const Names & virt_column_names_ = {}, - std::optional extension = {}); + std::optional extension_ = {}); ~MergeTreeBaseSelectProcessor() override; @@ -189,12 +189,11 @@ private: /// It won't work with reading in order or reading in reverse order, because we can possibly seek back. 
bool getDelayedTasks(); - /// It will form a request a request to coordinator and + /// It will form a request to coordinator and /// then reinitialize the mark ranges of this->task object Status performRequestToCoordinator(MarkRanges requested_ranges, bool delayed); void splitCurrentTaskRangesAndFillBuffer(); - }; } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 59171165704..0318fc0648c 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -686,7 +686,8 @@ MergeTreeDataSelectSamplingData MergeTreeDataSelectExecutor::getSampling( if (has_lower_limit) { - if (!key_condition.addCondition(sampling_key.column_names[0], Range::createLeftBounded(lower, true))) + if (!key_condition.addCondition( + sampling_key.column_names[0], Range::createLeftBounded(lower, true, sampling_key.data_types[0]->isNullable()))) throw Exception("Sampling column not in primary key", ErrorCodes::ILLEGAL_COLUMN); ASTPtr args = std::make_shared(); @@ -703,7 +704,8 @@ MergeTreeDataSelectSamplingData MergeTreeDataSelectExecutor::getSampling( if (has_upper_limit) { - if (!key_condition.addCondition(sampling_key.column_names[0], Range::createRightBounded(upper, false))) + if (!key_condition.addCondition( + sampling_key.column_names[0], Range::createRightBounded(upper, false, sampling_key.data_types[0]->isNullable()))) throw Exception("Sampling column not in primary key", ErrorCodes::ILLEGAL_COLUMN); ASTPtr args = std::make_shared(); diff --git a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp index 052834358bb..743bb504dbd 100644 --- a/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp @@ -44,7 +44,7 @@ void AnnoyIndex::deserialize(ReadBuffer& istr) readIntBinary(Base::_seed, istr); readVectorBinary(Base::_roots, istr); Base::_nodes = realloc(Base::_nodes, Base::_s * Base::_n_nodes); - istr.read(reinterpret_cast(Base::_nodes), Base::_s * Base::_n_nodes); + istr.readStrict(reinterpret_cast(Base::_nodes), Base::_s * Base::_n_nodes); Base::_fd = 0; // set flags diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index b96d40f5759..03335d9ca98 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -59,8 +59,7 @@ void MergeTreeIndexGranuleFullText::deserializeBinary(ReadBuffer & istr, MergeTr for (auto & bloom_filter : bloom_filters) { - istr.read(reinterpret_cast( - bloom_filter.getFilter().data()), params.filter_size); + istr.readStrict(reinterpret_cast(bloom_filter.getFilter().data()), params.filter_size); } has_elems = true; } diff --git a/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp b/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp index 7efaf0866db..deed9b3f071 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexGranuleBloomFilter.cpp @@ -96,7 +96,7 @@ void MergeTreeIndexGranuleBloomFilter::deserializeBinary(ReadBuffer & istr, Merg static size_t atom_size = 8; size_t bytes_size = (bits_per_row * total_rows + atom_size - 1) / atom_size; filter = std::make_shared(bytes_size, hash_functions, 0); - istr.read(reinterpret_cast(filter->getFilter().data()), bytes_size); + istr.readStrict(reinterpret_cast(filter->getFilter().data()), bytes_size); } } diff --git 
a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp index 730f9a05814..e07f19fb64c 100644 --- a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp +++ b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp @@ -1,24 +1,14 @@ #include -#include -#include -#include -#include -#include #include -#include -#include -#include - #include -#include #include #include -#include "IO/WriteBufferFromString.h" -#include +#include #include + namespace DB { @@ -45,15 +35,14 @@ public: PartitionReadResponse ParallelReplicasReadingCoordinator::Impl::handleRequest(PartitionReadRequest request) { - AtomicStopwatch watch; + auto * log = &Poco::Logger::get("ParallelReplicasReadingCoordinator"); + Stopwatch watch; + + String request_description = request.toString(); std::lock_guard lock(mutex); auto partition_it = partitions.find(request.partition_id); - SCOPE_EXIT({ - LOG_TRACE(&Poco::Logger::get("ParallelReplicasReadingCoordinator"), "Time for handling request: {}ns", watch.elapsed()); - }); - PartToRead::PartAndProjectionNames part_and_projection { .part = request.part_name, @@ -80,6 +69,7 @@ PartitionReadResponse ParallelReplicasReadingCoordinator::Impl::handleRequest(Pa partition_reading.mark_ranges_in_part.insert({part_and_projection, std::move(mark_ranges_index)}); partitions.insert({request.partition_id, std::move(partition_reading)}); + LOG_TRACE(log, "Request is first in partition, accepted in {} ns: {}", watch.elapsed(), request_description); return {.denied = false, .mark_ranges = std::move(request.mark_ranges)}; } @@ -95,6 +85,7 @@ PartitionReadResponse ParallelReplicasReadingCoordinator::Impl::handleRequest(Pa { case PartSegments::IntersectionResult::REJECT: { + LOG_TRACE(log, "Request rejected in {} ns: {}", watch.elapsed(), request_description); return {.denied = true, .mark_ranges = {}}; } case PartSegments::IntersectionResult::EXACTLY_ONE_INTERSECTION: @@ -110,6 +101,12 @@ PartitionReadResponse ParallelReplicasReadingCoordinator::Impl::handleRequest(Pa auto result_ranges = result.convertToMarkRangesFinal(); const bool denied = result_ranges.empty(); + + if (denied) + LOG_TRACE(log, "Request rejected due to intersection in {} ns: {}", watch.elapsed(), request_description); + else + LOG_TRACE(log, "Request accepted partially in {} ns: {}", watch.elapsed(), request_description); + return {.denied = denied, .mark_ranges = std::move(result_ranges)}; } case PartSegments::IntersectionResult::NO_INTERSECTION: @@ -121,6 +118,7 @@ PartitionReadResponse ParallelReplicasReadingCoordinator::Impl::handleRequest(Pa ); partition_reading.mark_ranges_in_part.insert({part_and_projection, std::move(mark_ranges_index)}); + LOG_TRACE(log, "Request accepted in {} ns: {}", watch.elapsed(), request_description); return {.denied = false, .mark_ranges = std::move(request.mark_ranges)}; } } diff --git a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.h b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.h index bd2082be6c2..4800533e919 100644 --- a/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.h +++ b/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.h @@ -3,6 +3,7 @@ #include #include + namespace DB { diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp index 7f91ffee1fe..47f10acb157 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp +++ 
b/src/Storages/MergeTree/ReplicatedMergeTreeAttachThread.cpp @@ -182,6 +182,7 @@ void ReplicatedMergeTreeAttachThread::runImpl() storage.createNewZooKeeperNodes(); storage.syncPinnedPartUUIDs(); + std::lock_guard lock(storage.table_shared_id_mutex); storage.createTableSharedID(); }; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index 99946e9d938..2d7afeafd0d 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -1015,7 +1015,8 @@ bool ReplicatedMergeTreeQueue::checkReplaceRangeCanBeRemoved(const MergeTreePart void ReplicatedMergeTreeQueue::removePartProducingOpsInRange( zkutil::ZooKeeperPtr zookeeper, const MergeTreePartInfo & part_info, - const std::optional & covering_entry) + const std::optional & covering_entry, + const String & fetch_entry_znode) { /// TODO is it possible to simplify it? Queue to_wait; @@ -1029,22 +1030,40 @@ void ReplicatedMergeTreeQueue::removePartProducingOpsInRange( [[maybe_unused]] bool called_from_alter_query_directly = covering_entry && covering_entry->replace_range_entry && covering_entry->replace_range_entry->columns_version < 0; [[maybe_unused]] bool called_for_broken_part = !covering_entry; - assert(currently_executing_drop_replace_ranges.contains(part_info) || called_from_alter_query_directly || called_for_broken_part); + assert(currently_executing_drop_replace_ranges.contains(part_info) || called_from_alter_query_directly || called_for_broken_part || !fetch_entry_znode.empty()); + + auto is_simple_part_producing_op = [](const ReplicatedMergeTreeLogEntryData & data) + { + return data.type == LogEntry::GET_PART || + data.type == LogEntry::ATTACH_PART || + data.type == LogEntry::MERGE_PARTS || + data.type == LogEntry::MUTATE_PART; + }; for (Queue::iterator it = queue.begin(); it != queue.end();) { - auto type = (*it)->type; - bool is_simple_producing_op = type == LogEntry::GET_PART || - type == LogEntry::ATTACH_PART || - type == LogEntry::MERGE_PARTS || - type == LogEntry::MUTATE_PART; + /// Skipping currently processing entry + if (!fetch_entry_znode.empty() && (*it)->znode_name == fetch_entry_znode) + { + ++it; + continue; + } + + bool is_simple_producing_op = is_simple_part_producing_op(**it); bool simple_op_covered = is_simple_producing_op && part_info.contains(MergeTreePartInfo::fromPartName((*it)->new_part_name, format_version)); bool replace_range_covered = covering_entry && checkReplaceRangeCanBeRemoved(part_info, *it, *covering_entry); if (simple_op_covered || replace_range_covered) { if ((*it)->currently_executing) + { + bool is_covered_by_simple_op = covering_entry && is_simple_part_producing_op(*covering_entry); + bool is_fetching_covering_part = !fetch_entry_znode.empty(); + if (is_covered_by_simple_op || is_fetching_covering_part) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot remove covered entry {} producing parts {}, it's a bug", + (*it)->znode_name, fmt::join((*it)->getVirtualPartNames(format_version), ", ")); to_wait.push_back(*it); + } auto code = zookeeper->tryRemove(fs::path(replica_path) / "queue" / (*it)->znode_name); if (code != Coordination::Error::ZOK) LOG_INFO(log, "Couldn't remove {}: {}", (fs::path(replica_path) / "queue" / (*it)->znode_name).string(), Coordination::errorMessage(code)); @@ -1110,7 +1129,12 @@ bool ReplicatedMergeTreeQueue::isCoveredByFuturePartsImpl(const LogEntry & entry /// Parts are not disjoint. 
They can be even intersecting and it's not a problem, /// because we may have two queue entries producing intersecting parts if there's DROP_RANGE between them (so virtual_parts are ok). - /// We cannot execute `entry` (or upgrade its actual_part_name to `new_part_name`) + /// Give priority to DROP_RANGEs and allow processing them even if covered entries are currently executing. + /// DROP_RANGE will cancel covered operations and will wait for them in removePartProducingOpsInRange. + if (result_part.isFakeDropRangePart() && result_part.contains(future_part)) + continue; + + /// In other cases we cannot execute `entry` (or upgrade its actual_part_name to `new_part_name`) /// while any covered or covering parts are processed. /// But we also cannot simply return true and postpone entry processing, because it may lead to kind of livelock. /// Since queue is processed in multiple threads, it's likely that there will be at least one thread diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h index 2ebdd604af2..32421f91b04 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h @@ -336,8 +336,10 @@ public: * And also wait for the completion of their execution, if they are now being executed. * covering_entry is as an entry that caused removal of entries in range (usually, DROP_RANGE) */ - void removePartProducingOpsInRange(zkutil::ZooKeeperPtr zookeeper, const MergeTreePartInfo & part_info, - const std::optional & covering_entry); + void removePartProducingOpsInRange(zkutil::ZooKeeperPtr zookeeper, + const MergeTreePartInfo & part_info, + const std::optional & covering_entry, + const String & fetch_entry_znode); /** In the case where there are not enough parts to perform the merge in part_name * - move actions with merged parts to the end of the queue diff --git a/src/Storages/MergeTree/RequestResponse.cpp b/src/Storages/MergeTree/RequestResponse.cpp index a266540b99a..2ea6b0c9f9f 100644 --- a/src/Storages/MergeTree/RequestResponse.cpp +++ b/src/Storages/MergeTree/RequestResponse.cpp @@ -4,24 +4,27 @@ #include #include #include +#include #include + namespace DB { namespace ErrorCodes { extern const int UNKNOWN_PROTOCOL; + extern const int BAD_ARGUMENTS; } -static void readMarkRangesBinary(MarkRanges & ranges, ReadBuffer & buf, size_t MAX_RANGES_SIZE = DEFAULT_MAX_STRING_SIZE) +static void readMarkRangesBinary(MarkRanges & ranges, ReadBuffer & buf) { size_t size = 0; readVarUInt(size, buf); - if (size > MAX_RANGES_SIZE) - throw Poco::Exception("Too large ranges size."); + if (size > DEFAULT_MAX_STRING_SIZE) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Too large ranges size: {}.", size); ranges.resize(size); for (size_t i = 0; i < size; ++i) @@ -60,20 +63,28 @@ void PartitionReadRequest::serialize(WriteBuffer & out) const } -void PartitionReadRequest::describe(WriteBuffer & out) const +String PartitionReadRequest::toString() const { - String result; - result += fmt::format("partition_id: {} \n", partition_id); - result += fmt::format("part_name: {} \n", part_name); - result += fmt::format("projection_name: {} \n", projection_name); - result += fmt::format("block_range: ({}, {}) \n", block_range.begin, block_range.end); - result += "mark_ranges: "; - for (const auto & range : mark_ranges) - result += fmt::format("({}, {}) ", range.begin, range.end); - result += '\n'; - out.write(result.c_str(), result.size()); + WriteBufferFromOwnString out; + out << "partition: " << 
partition_id << ", part: " << part_name; + if (!projection_name.empty()) + out << ", projection: " << projection_name; + out << ", block range: [" << block_range.begin << ", " << block_range.end << "]"; + out << ", mark ranges: "; + + bool is_first = true; + for (const auto & [begin, end] : mark_ranges) + { + if (!is_first) + out << ", "; + out << "[" << begin << ", " << end << ")"; + is_first = false; + } + + return out.str(); } + void PartitionReadRequest::deserialize(ReadBuffer & in) { UInt64 version; @@ -95,14 +106,21 @@ void PartitionReadRequest::deserialize(ReadBuffer & in) UInt64 PartitionReadRequest::getConsistentHash(size_t buckets) const { - auto hash = SipHash(); + SipHash hash; + + hash.update(partition_id.size()); hash.update(partition_id); + + hash.update(part_name.size()); hash.update(part_name); + + hash.update(projection_name.size()); hash.update(projection_name); hash.update(block_range.begin); hash.update(block_range.end); + hash.update(mark_ranges.size()); for (const auto & range : mark_ranges) { hash.update(range.begin); @@ -118,7 +136,7 @@ void PartitionReadResponse::serialize(WriteBuffer & out) const /// Must be the first writeVarUInt(DBMS_PARALLEL_REPLICAS_PROTOCOL_VERSION, out); - writeVarUInt(static_cast(denied), out); + writeBinary(denied, out); writeMarkRangesBinary(mark_ranges, out); } diff --git a/src/Storages/MergeTree/RequestResponse.h b/src/Storages/MergeTree/RequestResponse.h index 85c8f7181af..ce9dc55f479 100644 --- a/src/Storages/MergeTree/RequestResponse.h +++ b/src/Storages/MergeTree/RequestResponse.h @@ -14,7 +14,7 @@ namespace DB { -/// Represents a segment [left; right] +/// Represents a segment [left; right] of part's block numbers. struct PartBlockRange { Int64 begin; @@ -35,10 +35,12 @@ struct PartitionReadRequest MarkRanges mark_ranges; void serialize(WriteBuffer & out) const; - void describe(WriteBuffer & out) const; void deserialize(ReadBuffer & in); UInt64 getConsistentHash(size_t buckets) const; + + /// Describe it for debugging purposes. + String toString() const; }; struct PartitionReadResponse diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 7e2d5e1727b..05aa8f469b0 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -307,8 +307,8 @@ static StoragePtr create(const StorageFactory::Arguments & args) else if (!args.attach && !args.getLocalContext()->getSettingsRef().allow_deprecated_syntax_for_merge_tree) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "This syntax for *MergeTree engine is deprecated. " - "Use extended storage definition syntax with ORDER BY/PRIMARY KEY clause." - "See also allow_deprecated_syntax_for_merge_tree setting."); + "Use extended storage definition syntax with ORDER BY/PRIMARY KEY clause. " + "See also `allow_deprecated_syntax_for_merge_tree` setting."); } /// For Replicated. 
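
Note: the getConsistentHash() change above mixes each string field's length into the hash before the field's bytes, and also hashes mark_ranges.size(). A minimal sketch of why length-prefixing matters, using a plain FNV-1a combiner as a stand-in for ClickHouse's SipHash (function names here are illustrative, not the real API):

#include <cstddef>
#include <cstdint>
#include <string>

/// Without the length prefix, ("ab", "c") and ("a", "bc") feed the same byte
/// stream to the hasher and collide, so two different requests could land in
/// the same replica bucket for the wrong reason.
static void feed(uint64_t & h, const void * data, size_t len)
{
    const auto * p = static_cast<const unsigned char *>(data);
    for (size_t i = 0; i < len; ++i)
    {
        h ^= p[i];
        h *= 1099511628211ULL;          /// FNV-1a prime, stand-in for SipHash rounds
    }
}

static uint64_t consistentHashSketch(const std::string & partition_id, const std::string & part_name, size_t buckets)
{
    uint64_t h = 14695981039346656037ULL;
    size_t size = partition_id.size();
    feed(h, &size, sizeof(size));        /// length prefix removes the ambiguity
    feed(h, partition_id.data(), size);
    size = part_name.size();
    feed(h, &size, sizeof(size));
    feed(h, part_name.data(), size);
    return h % buckets;
}

The same reasoning applies to hashing mark_ranges.size() before the ranges themselves.
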
diff --git a/src/Storages/NamedCollections.cpp b/src/Storages/NamedCollections.cpp new file mode 100644 index 00000000000..67847635f3f --- /dev/null +++ b/src/Storages/NamedCollections.cpp @@ -0,0 +1,545 @@ +#include "NamedCollections.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_NAMED_COLLECTION; + extern const int NAMED_COLLECTION_ALREADY_EXISTS; + extern const int BAD_ARGUMENTS; + extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; +} + +namespace +{ + constexpr auto NAMED_COLLECTIONS_CONFIG_PREFIX = "named_collections"; + + std::string getCollectionPrefix(const std::string & collection_name) + { + return fmt::format("{}.{}", NAMED_COLLECTIONS_CONFIG_PREFIX, collection_name); + } + + /// Enumerate keys paths of the config recursively. + /// E.g. if `enumerate_paths` = {"root.key1"} and config like + /// + /// + /// + /// + /// + /// + /// + /// + /// + /// the `result` will contain two strings: "root.key1.key2" and "root.key1.key3.key4" + void collectKeys( + const Poco::Util::AbstractConfiguration & config, + std::queue enumerate_paths, + std::set & result) + { + if (enumerate_paths.empty()) + return; + + auto initial_paths = std::move(enumerate_paths); + enumerate_paths = {}; + while (!initial_paths.empty()) + { + auto path = initial_paths.front(); + initial_paths.pop(); + + Poco::Util::AbstractConfiguration::Keys keys; + config.keys(path, keys); + + if (keys.empty()) + { + result.insert(path); + } + else + { + for (const auto & key : keys) + enumerate_paths.emplace(path + '.' + key); + } + } + + collectKeys(config, enumerate_paths, result); + } +} + +NamedCollectionFactory & NamedCollectionFactory::instance() +{ + static NamedCollectionFactory instance; + return instance; +} + +void NamedCollectionFactory::initialize(const Poco::Util::AbstractConfiguration & config_) +{ + std::lock_guard lock(mutex); + if (is_initialized) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Named collection factory already initialized"); + } + + config = &config_; + is_initialized = true; +} + +void NamedCollectionFactory::reload(const Poco::Util::AbstractConfiguration & config_) +{ + std::lock_guard lock(mutex); + config = &config_; + loaded_named_collections.clear(); +} + +void NamedCollectionFactory::assertInitialized( + std::lock_guard & /* lock */) const +{ + if (!is_initialized) + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Named collection factory must be initialized before being used"); + } +} + +bool NamedCollectionFactory::exists(const std::string & collection_name) const +{ + std::lock_guard lock(mutex); + return existsUnlocked(collection_name, lock); +} + +bool NamedCollectionFactory::existsUnlocked( + const std::string & collection_name, + std::lock_guard & lock) const +{ + assertInitialized(lock); + /// Named collections can be added via SQL command or via config. + /// Named collections from config are loaded on first access, + /// therefore it might not be in `named_collections` map yet. 
+ return loaded_named_collections.contains(collection_name) + || config->has(getCollectionPrefix(collection_name)); +} + +NamedCollectionPtr NamedCollectionFactory::get(const std::string & collection_name) const +{ + std::lock_guard lock(mutex); + assertInitialized(lock); + + if (!existsUnlocked(collection_name, lock)) + { + throw Exception( + ErrorCodes::UNKNOWN_NAMED_COLLECTION, + "There is no named collection `{}`", + collection_name); + } + + return getImpl(collection_name, lock); +} + +NamedCollectionPtr NamedCollectionFactory::tryGet(const std::string & collection_name) const +{ + std::lock_guard lock(mutex); + assertInitialized(lock); + + if (!existsUnlocked(collection_name, lock)) + return nullptr; + + return getImpl(collection_name, lock); +} + +NamedCollectionPtr NamedCollectionFactory::getImpl( + const std::string & collection_name, + std::lock_guard & /* lock */) const +{ + auto it = loaded_named_collections.find(collection_name); + if (it == loaded_named_collections.end()) + { + it = loaded_named_collections.emplace( + collection_name, + NamedCollection::create(*config, collection_name)).first; + } + return it->second; +} + +void NamedCollectionFactory::add( + const std::string & collection_name, + NamedCollectionPtr collection) +{ + std::lock_guard lock(mutex); + auto [it, inserted] = loaded_named_collections.emplace(collection_name, collection); + if (!inserted) + { + throw Exception( + ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS, + "A named collection `{}` already exists", + collection_name); + } +} + +void NamedCollectionFactory::remove(const std::string & collection_name) +{ + std::lock_guard lock(mutex); + assertInitialized(lock); + + if (!existsUnlocked(collection_name, lock)) + { + throw Exception( + ErrorCodes::UNKNOWN_NAMED_COLLECTION, + "There is no named collection `{}`", + collection_name); + } + + if (config->has(collection_name)) + { + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, + "Collection {} is defined in config and cannot be removed", + collection_name); + } + + [[maybe_unused]] auto removed = loaded_named_collections.erase(collection_name); + assert(removed); +} + +NamedCollectionFactory::NamedCollections NamedCollectionFactory::getAll() const +{ + std::lock_guard lock(mutex); + assertInitialized(lock); + + NamedCollections result(loaded_named_collections); + + Poco::Util::AbstractConfiguration::Keys config_collections_names; + config->keys(NAMED_COLLECTIONS_CONFIG_PREFIX, config_collections_names); + + for (const auto & collection_name : config_collections_names) + { + if (result.contains(collection_name)) + continue; + + result.emplace(collection_name, NamedCollection::create(*config, collection_name)); + } + + return result; +} + +class NamedCollection::Impl +{ +private: + using ConfigurationPtr = Poco::AutoPtr; + + /// Named collection configuration + /// + /// ... + /// + ConfigurationPtr config; + Keys keys; + +public: + Impl(const Poco::Util::AbstractConfiguration & config_, + const std::string & collection_name_, + const Keys & keys_) + : config(createEmptyConfiguration(collection_name_)) + , keys(keys_) + { + auto collection_path = getCollectionPrefix(collection_name_); + for (const auto & key : keys) + copyConfigValue(config_, collection_path + '.' 
+ key, *config, key); + } + + template T get(const Key & key) const + { + return getConfigValue(*config, key); + } + + template T getOrDefault(const Key & key, const T & default_value) const + { + return getConfigValueOrDefault(*config, key, &default_value); + } + + template void set(const Key & key, const T & value, bool update_if_exists) + { + setConfigValue(*config, key, value, update_if_exists); + if (!keys.contains(key)) + keys.insert(key); + } + + void remove(const Key & key) + { + removeConfigValue(*config, key); + [[maybe_unused]] auto removed = keys.erase(key); + assert(removed); + } + + Keys getKeys() const + { + return keys; + } + + ImplPtr copy() const + { + return std::make_unique(*this); + } + + std::string dumpStructure() const + { + /// Convert a collection config like + /// + /// value0 + /// + /// value2 + /// + /// value3 + /// + /// + /// + /// to a string: + /// "key0: value0 + /// key1: + /// key2: value2 + /// key3: + /// key4: value3" + WriteBufferFromOwnString wb; + Strings prev_key_parts; + for (const auto & key : keys) + { + Strings key_parts; + splitInto<'.'>(key_parts, key); + size_t tab_cnt = 0; + + auto it = key_parts.begin(); + auto prev_key_parts_it = prev_key_parts.begin(); + while (it != key_parts.end() + && prev_key_parts_it != prev_key_parts.end() + && *it == *prev_key_parts_it) + { + ++it; + ++prev_key_parts_it; + ++tab_cnt; + } + + auto start_it = it; + for (; it != key_parts.end(); ++it) + { + if (it != start_it) + wb << '\n'; + wb << std::string(tab_cnt++, '\t'); + wb << *it << ':'; + } + wb << '\t' << get(key) << '\n'; + prev_key_parts = key_parts; + } + return wb.str(); + } + +private: + template static T getConfigValue( + const Poco::Util::AbstractConfiguration & config, + const std::string & path) + { + return getConfigValueOrDefault(config, path); + } + + template static T getConfigValueOrDefault( + const Poco::Util::AbstractConfiguration & config, + const std::string & path, + const T * default_value = nullptr) + { + if (!config.has(path)) + { + if (!default_value) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "No such key `{}`", path); + return *default_value; + } + + if constexpr (std::is_same_v) + return config.getString(path); + else if constexpr (std::is_same_v) + return config.getUInt64(path); + else if constexpr (std::is_same_v) + return config.getInt64(path); + else if constexpr (std::is_same_v) + return config.getDouble(path); + else + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, + "Unsupported type in getConfigValueOrDefault(). " + "Supported types are String, UInt64, Int64, Float64"); + } + + template static void setConfigValue( + Poco::Util::AbstractConfiguration & config, + const std::string & path, + const T & value, + bool update = false) + { + if (!update && config.has(path)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key `{}` already exists", path); + + if constexpr (std::is_same_v) + config.setString(path, value); + else if constexpr (std::is_same_v) + config.setUInt64(path, value); + else if constexpr (std::is_same_v) + config.setInt64(path, value); + else if constexpr (std::is_same_v) + config.setDouble(path, value); + else + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, + "Unsupported type in setConfigValue(). 
" + "Supported types are String, UInt64, Int64, Float64"); + } + + template static void copyConfigValue( + const Poco::Util::AbstractConfiguration & from_config, + const std::string & from_path, + Poco::Util::AbstractConfiguration & to_config, + const std::string & to_path) + { + if (!from_config.has(from_path)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "No such key `{}`", from_path); + + if (to_config.has(to_path)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key `{}` already exists", to_path); + + if constexpr (std::is_same_v) + to_config.setString(to_path, from_config.getString(from_path)); + else if constexpr (std::is_same_v) + to_config.setString(to_path, from_config.getString(from_path)); + else if constexpr (std::is_same_v) + to_config.setUInt64(to_path, from_config.getUInt64(from_path)); + else if constexpr (std::is_same_v) + to_config.setInt64(to_path, from_config.getInt64(from_path)); + else if constexpr (std::is_same_v) + to_config.setDouble(to_path, from_config.getDouble(from_path)); + else + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, + "Unsupported type in copyConfigValue(). " + "Supported types are String, UInt64, Int64, Float64"); + } + + static void removeConfigValue( + Poco::Util::AbstractConfiguration & config, + const std::string & path) + { + if (!config.has(path)) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "No such key `{}`", path); + config.remove(path); + } + + static ConfigurationPtr createEmptyConfiguration(const std::string & root_name) + { + using DocumentPtr = Poco::AutoPtr; + DocumentPtr xml_document(new Poco::XML::Document()); + xml_document->appendChild(xml_document->createElement(root_name)); + ConfigurationPtr config(new Poco::Util::XMLConfiguration(xml_document)); + return config; + } +}; + +NamedCollection::NamedCollection( + const Poco::Util::AbstractConfiguration & config, + const std::string & collection_path, + const Keys & keys) + : NamedCollection(std::make_unique(config, collection_path, keys)) +{ +} + +NamedCollection::NamedCollection(ImplPtr pimpl_) + : pimpl(std::move(pimpl_)) +{ +} + +NamedCollectionPtr NamedCollection::create( + const Poco::Util::AbstractConfiguration & config, + const std::string & collection_name) +{ + const auto collection_prefix = getCollectionPrefix(collection_name); + std::queue enumerate_input; + std::set enumerate_result; + + enumerate_input.push(collection_prefix); + collectKeys(config, std::move(enumerate_input), enumerate_result); + + /// Collection does not have any keys. + /// (`enumerate_result` == ). + const bool collection_is_empty = enumerate_result.size() == 1; + std::set keys; + if (!collection_is_empty) + { + /// Skip collection prefix and add +1 to avoid '.' in the beginning. 
+ for (const auto & path : enumerate_result) + keys.emplace(path.substr(collection_prefix.size() + 1)); + } + return std::make_unique(config, collection_name, keys); +} + +template T NamedCollection::get(const Key & key) const +{ + return pimpl->get(key); +} + +template T NamedCollection::getOrDefault(const Key & key, const T & default_value) const +{ + return pimpl->getOrDefault(key, default_value); +} + +template void NamedCollection::set(const Key & key, const T & value, bool update_if_exists) +{ + pimpl->set(key, value, update_if_exists); +} + +void NamedCollection::remove(const Key & key) +{ + pimpl->remove(key); +} + +std::shared_ptr NamedCollection::duplicate() const +{ + return std::make_shared(pimpl->copy()); +} + +NamedCollection::Keys NamedCollection::getKeys() const +{ + return pimpl->getKeys(); +} + +std::string NamedCollection::dumpStructure() const +{ + return pimpl->dumpStructure(); +} + +template String NamedCollection::get(const NamedCollection::Key & key) const; +template UInt64 NamedCollection::get(const NamedCollection::Key & key) const; +template Int64 NamedCollection::get(const NamedCollection::Key & key) const; +template Float64 NamedCollection::get(const NamedCollection::Key & key) const; + +template String NamedCollection::getOrDefault(const NamedCollection::Key & key, const String & default_value) const; +template UInt64 NamedCollection::getOrDefault(const NamedCollection::Key & key, const UInt64 & default_value) const; +template Int64 NamedCollection::getOrDefault(const NamedCollection::Key & key, const Int64 & default_value) const; +template Float64 NamedCollection::getOrDefault(const NamedCollection::Key & key, const Float64 & default_value) const; + +template void NamedCollection::set(const NamedCollection::Key & key, const String & value, bool update_if_exists); +template void NamedCollection::set(const NamedCollection::Key & key, const UInt64 & value, bool update_if_exists); +template void NamedCollection::set(const NamedCollection::Key & key, const Int64 & value, bool update_if_exists); +template void NamedCollection::set(const NamedCollection::Key & key, const Float64 & value, bool update_if_exists); + +} diff --git a/src/Storages/NamedCollections.h b/src/Storages/NamedCollections.h new file mode 100644 index 00000000000..83bb1dd964e --- /dev/null +++ b/src/Storages/NamedCollections.h @@ -0,0 +1,107 @@ +#pragma once + +#include +#include + + +namespace DB +{ + +class NamedCollection; +using NamedCollectionPtr = std::shared_ptr; + +/** + * Class to represent arbitrary-structured named collection object. + * It can be defined via config or via SQL command. + * + * + * ... + * + * ... + * + */ +class NamedCollection +{ +private: + class Impl; + using ImplPtr = std::unique_ptr; + + ImplPtr pimpl; + +public: + using Key = std::string; + using Keys = std::set; + + static NamedCollectionPtr create( + const Poco::Util::AbstractConfiguration & config, + const std::string & collection_name); + + NamedCollection( + const Poco::Util::AbstractConfiguration & config, + const std::string & collection_path, + const Keys & keys); + + explicit NamedCollection(ImplPtr pimpl_); + + template T get(const Key & key) const; + + template T getOrDefault(const Key & key, const T & default_value) const; + + template void set(const Key & key, const T & value, bool update_if_exists = false); + + void remove(const Key & key); + + std::shared_ptr duplicate() const; + + Keys getKeys() const; + + std::string dumpStructure() const; +}; + +/** + * A factory of immutable named collections. 
+ */ +class NamedCollectionFactory : boost::noncopyable +{ +public: + static NamedCollectionFactory & instance(); + + void initialize(const Poco::Util::AbstractConfiguration & config_); + + void reload(const Poco::Util::AbstractConfiguration & config_); + + bool exists(const std::string & collection_name) const; + + NamedCollectionPtr get(const std::string & collection_name) const; + + NamedCollectionPtr tryGet(const std::string & collection_name) const; + + void add( + const std::string & collection_name, + NamedCollectionPtr collection); + + void remove(const std::string & collection_name); + + using NamedCollections = std::unordered_map; + NamedCollections getAll() const; + +private: + void assertInitialized(std::lock_guard & lock) const; + + NamedCollectionPtr getImpl( + const std::string & collection_name, + std::lock_guard & lock) const; + + bool existsUnlocked( + const std::string & collection_name, + std::lock_guard & lock) const; + + mutable NamedCollections loaded_named_collections; + + const Poco::Util::AbstractConfiguration * config; + + bool is_initialized = false; + mutable std::mutex mutex; +}; + +} diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index a450a9ef3a9..22e416384aa 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -227,7 +227,7 @@ void StorageMergeTree::read( bool enable_parallel_reading = local_context->getClientInfo().collaborate_with_initiator; if (enable_parallel_reading) - LOG_TRACE(log, "Parallel reading from replicas enabled {}", enable_parallel_reading); + LOG_TRACE(log, "Parallel reading from replicas enabled: {}", enable_parallel_reading); if (auto plan = reader.read( column_names, storage_snapshot, query_info, local_context, max_block_size, num_streams, processed_stage, nullptr, enable_parallel_reading)) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index dde844c0ee0..8b4788c8d55 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1750,8 +1750,18 @@ bool StorageReplicatedMergeTree::executeFetch(LogEntry & entry, bool need_to_che if (!entry.actual_new_part_name.empty()) LOG_DEBUG(log, "Will fetch part {} instead of {}", entry.actual_new_part_name, entry.new_part_name); - if (!fetchPart(part_name, metadata_snapshot, fs::path(zookeeper_path) / "replicas" / replica, false, entry.quorum)) + String source_replica_path = fs::path(zookeeper_path) / "replicas" / replica; + if (!fetchPart(part_name, + metadata_snapshot, + source_replica_path, + /* to_detached= */ false, + entry.quorum, + /* zookeeper_ */ nullptr, + /* try_fetch_shared= */ true, + entry.znode_name)) + { return false; + } } catch (Exception & e) { @@ -1834,7 +1844,7 @@ void StorageReplicatedMergeTree::executeDropRange(const LogEntry & entry) LOG_TRACE(log, "Executing DROP_RANGE {}", entry.new_part_name); auto drop_range_info = MergeTreePartInfo::fromPartName(entry.new_part_name, format_version); getContext()->getMergeList().cancelInPartition(getStorageID(), drop_range_info.partition_id, drop_range_info.max_block); - queue.removePartProducingOpsInRange(getZooKeeper(), drop_range_info, entry); + queue.removePartProducingOpsInRange(getZooKeeper(), drop_range_info, entry, /* fetch_entry_znode= */ {}); part_check_thread.cancelRemovedPartsCheck(drop_range_info); /// Delete the parts contained in the range to be deleted. 
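
Note: the new fetch_entry_znode argument threaded through removePartProducingOpsInRange() lets a fetch clean up queue entries covered by the part it just downloaded while skipping its own, still-executing entry. A simplified sketch of that skip-the-caller pattern, with Entry and the covers predicate as illustrative stand-ins for the real ReplicatedMergeTreeQueue types:

#include <functional>
#include <list>
#include <string>

struct Entry
{
    std::string znode_name;
    std::string new_part_name;
};

/// Remove entries whose parts are covered by the part the caller just fetched,
/// but never the caller's own entry: it is currently being executed and will be
/// finished (and removed) by the caller itself.
static void removeCoveredEntries(
    std::list<Entry> & queue,
    const std::string & fetch_entry_znode,
    const std::function<bool(const std::string &)> & covers)
{
    for (auto it = queue.begin(); it != queue.end();)
    {
        if (!fetch_entry_znode.empty() && it->znode_name == fetch_entry_znode)
        {
            ++it;                       /// skip the currently processing entry
            continue;
        }

        if (covers(it->new_part_name))
            it = queue.erase(it);       /// covered by the fetched part, now redundant
        else
            ++it;
    }
}

The DROP_RANGE callers pass an empty znode name, so for them nothing is skipped.
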
@@ -1906,7 +1916,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) if (replace) { getContext()->getMergeList().cancelInPartition(getStorageID(), drop_range.partition_id, drop_range.max_block); - queue.removePartProducingOpsInRange(getZooKeeper(), drop_range, entry); + queue.removePartProducingOpsInRange(getZooKeeper(), drop_range, entry, /* fetch_entry_znode= */ {}); part_check_thread.cancelRemovedPartsCheck(drop_range); } else @@ -3450,7 +3460,7 @@ void StorageReplicatedMergeTree::removePartAndEnqueueFetch(const String & part_n /// so GET_PART all_1_42_5 (and all source parts) is useless. The only thing we can do is to fetch all_1_42_5_63. /// 2. If all_1_42_5_63 is lost, then replication may stuck waiting for all_1_42_5_63 to appear, /// because we may have some covered parts (more precisely, parts with the same min and max blocks) - queue.removePartProducingOpsInRange(zookeeper, broken_part_info, {}); + queue.removePartProducingOpsInRange(zookeeper, broken_part_info, /* covering_entry= */ {}, /* fetch_entry_znode= */ {}); String part_path = fs::path(replica_path) / "parts" / part_name; @@ -3850,8 +3860,15 @@ bool StorageReplicatedMergeTree::partIsLastQuorumPart(const MergeTreePartInfo & } -bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const StorageMetadataPtr & metadata_snapshot, - const String & source_replica_path, bool to_detached, size_t quorum, zkutil::ZooKeeper::Ptr zookeeper_, bool try_fetch_shared) +bool StorageReplicatedMergeTree::fetchPart( + const String & part_name, + const StorageMetadataPtr & metadata_snapshot, + const String & source_replica_path, + bool to_detached, + size_t quorum, + zkutil::ZooKeeper::Ptr zookeeper_, + bool try_fetch_shared, + String entry_znode) { auto zookeeper = zookeeper_ ? zookeeper_ : getZooKeeper(); const auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version); @@ -4049,6 +4066,17 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora ProfileEvents::increment(ProfileEvents::ObsoleteReplicatedParts); } + /// It is possible that fetched parts may cover other parts (see + /// findReplicaHavingCoveringPart()), and if those covered parts + /// cannot be executed right now (due to MERGE_PARTS that covers + /// them is in progress), replica delay will be increased until + /// those entries will be executed (if covered operations + /// finishes) in other words until MERGE_PARTS is in progress, + /// while this can take awhile. + /// + /// So let's just remove them from the queue. 
+ queue.removePartProducingOpsInRange(zookeeper, part->info, /* covering_entry= */ {}, entry_znode); + write_part_log({}); } else @@ -7581,8 +7609,6 @@ std::unique_ptr StorageReplicatedMergeTree::getDefaultSetting String StorageReplicatedMergeTree::getTableSharedID() const { - /// Lock is not required in other places because createTableSharedID() - /// can be called only during table initialization std::lock_guard lock(table_shared_id_mutex); /// Can happen if table was partially initialized before drop by DatabaseCatalog @@ -7609,8 +7635,12 @@ String StorageReplicatedMergeTree::getTableSharedID() const void StorageReplicatedMergeTree::createTableSharedID() const { LOG_DEBUG(log, "Creating shared ID for table {}", getStorageID().getNameForLogs()); + // can be set by the call to getTableSharedID if (table_shared_id != UUIDHelpers::Nil) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Table shared id already initialized"); + { + LOG_INFO(log, "Shared ID already set to {}", table_shared_id); + return; + } auto zookeeper = getZooKeeper(); String zookeeper_table_id_path = fs::path(zookeeper_path) / "table_shared_id"; diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index d767d94889c..a048de27080 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -690,11 +690,12 @@ private: bool fetchPart( const String & part_name, const StorageMetadataPtr & metadata_snapshot, - const String & replica_path, + const String & source_replica_path, bool to_detached, size_t quorum, zkutil::ZooKeeper::Ptr zookeeper_ = nullptr, - bool try_fetch_shared = true); + bool try_fetch_shared = true, + String entry_znode = ""); /** Download the specified part from the specified replica. * Used for replace local part on the same s3-shared part in hybrid storage. 
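
Note: the createTableSharedID() change turns a LOGICAL_ERROR into an early return, because the attach thread and getTableSharedID() can now race to initialize the same value under table_shared_id_mutex. A condensed model of that idempotent initialization (locking is folded into the class here for brevity; the real code takes the mutex at the call sites, and the id is a UUID, not a string):

#include <mutex>
#include <optional>
#include <string>

class SharedIdHolder
{
    std::mutex mutex;                   /// stand-in for table_shared_id_mutex
    std::optional<std::string> id;      /// stand-in for table_shared_id

    static std::string createId() { return "generated-uuid"; }   /// placeholder

public:
    std::string get()
    {
        std::lock_guard lock(mutex);
        if (!id)
            id = createId();            /// first caller initializes
        return *id;
    }

    void create()
    {
        std::lock_guard lock(mutex);
        if (id)
            return;                     /// already set by a concurrent caller: no-op
        id = createId();
    }
};

Returning instead of throwing matches the "Shared ID already set to ..." log message added above.
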
diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index d759c339dea..ab9b71f5ff3 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -1314,6 +1314,11 @@ void registerStorageCOS(StorageFactory & factory) return registerStorageS3Impl("COSN", factory); } +void registerStorageOSS(StorageFactory & factory) +{ + return registerStorageS3Impl("OSS", factory); +} + NamesAndTypesList StorageS3::getVirtuals() const { return virtual_columns; diff --git a/src/Storages/StorageS3Cluster.cpp b/src/Storages/StorageS3Cluster.cpp index 3b8c8b1cb92..350e942f972 100644 --- a/src/Storages/StorageS3Cluster.cpp +++ b/src/Storages/StorageS3Cluster.cpp @@ -117,32 +117,24 @@ Pipe StorageS3Cluster::read( addColumnsStructureToQueryWithClusterEngine( query_to_send, StorageDictionary::generateNamesAndTypesDescription(storage_snapshot->metadata->getColumns().getAll()), 5, getName()); - for (const auto & replicas : cluster->getShardsAddresses()) + const auto & current_settings = context->getSettingsRef(); + auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings); + for (const auto & shard_info : cluster->getShardsInfo()) { - /// There will be only one replica, because we consider each replica as a shard - for (const auto & node : replicas) + auto try_results = shard_info.pool->getMany(timeouts, ¤t_settings, PoolMode::GET_MANY); + for (auto & try_result : try_results) { - auto connection = std::make_shared( - node.host_name, node.port, context->getGlobalContext()->getCurrentDatabase(), - node.user, node.password, node.quota_key, node.cluster, node.cluster_secret, - "S3ClusterInititiator", - node.compression, - node.secure - ); - - - /// For unknown reason global context is passed to IStorage::read() method - /// So, task_identifier is passed as constructor argument. It is more obvious. 
auto remote_query_executor = std::make_shared( - connection, - queryToString(query_to_send), - header, - context, - /*throttler=*/nullptr, - scalars, - Tables(), - processed_stage, - RemoteQueryExecutor::Extension{.task_iterator = callback}); + shard_info.pool, + std::vector{try_result}, + queryToString(query_to_send), + header, + context, + /*throttler=*/nullptr, + scalars, + Tables(), + processed_stage, + RemoteQueryExecutor::Extension{.task_iterator = callback}); pipes.emplace_back(std::make_shared(remote_query_executor, add_agg_info, false)); } diff --git a/src/Storages/System/InformationSchema/columns.sql b/src/Storages/System/InformationSchema/columns.sql index 80cf2f911be..b01352145ff 100644 --- a/src/Storages/System/InformationSchema/columns.sql +++ b/src/Storages/System/InformationSchema/columns.sql @@ -3,10 +3,12 @@ ATTACH VIEW columns `table_catalog` String, `table_schema` String, `table_name` String, + `TABLE_SCHEMA` String, + `TABLE_NAME` String, `column_name` String, `ordinal_position` UInt64, `column_default` String, - `is_nullable` UInt8, + `is_nullable` String, `data_type` String, `character_maximum_length` Nullable(UInt64), `character_octet_length` Nullable(UInt64), @@ -26,12 +28,10 @@ ATTACH VIEW columns `column_comment` String, `column_type` String, `TABLE_CATALOG` String ALIAS table_catalog, - `TABLE_SCHEMA` String ALIAS table_schema, - `TABLE_NAME` String ALIAS table_name, `COLUMN_NAME` String ALIAS column_name, `ORDINAL_POSITION` UInt64 ALIAS ordinal_position, `COLUMN_DEFAULT` String ALIAS column_default, - `IS_NULLABLE` UInt8 ALIAS is_nullable, + `IS_NULLABLE` String ALIAS is_nullable, `DATA_TYPE` String ALIAS data_type, `CHARACTER_MAXIMUM_LENGTH` Nullable(UInt64) ALIAS character_maximum_length, `CHARACTER_OCTET_LENGTH` Nullable(UInt64) ALIAS character_octet_length, @@ -54,7 +54,9 @@ ATTACH VIEW columns SELECT database AS table_catalog, database AS table_schema, + database AS TABLE_SCHEMA, table AS table_name, + table AS TABLE_NAME, name AS column_name, position AS ordinal_position, default_expression AS column_default, diff --git a/src/Storages/System/StorageSystemAsynchronousMetrics.cpp b/src/Storages/System/StorageSystemAsynchronousMetrics.cpp index 70e12440678..e2f62b902b7 100644 --- a/src/Storages/System/StorageSystemAsynchronousMetrics.cpp +++ b/src/Storages/System/StorageSystemAsynchronousMetrics.cpp @@ -12,6 +12,7 @@ NamesAndTypesList StorageSystemAsynchronousMetrics::getNamesAndTypes() return { {"metric", std::make_shared()}, {"value", std::make_shared()}, + {"description", std::make_shared()}, }; } @@ -27,7 +28,8 @@ void StorageSystemAsynchronousMetrics::fillData(MutableColumns & res_columns, Co for (const auto & name_value : async_metrics_values) { res_columns[0]->insert(name_value.first); - res_columns[1]->insert(name_value.second); + res_columns[1]->insert(name_value.second.value); + res_columns[2]->insert(name_value.second.documentation); } } diff --git a/src/Storages/System/StorageSystemNamedCollections.cpp b/src/Storages/System/StorageSystemNamedCollections.cpp new file mode 100644 index 00000000000..6f4078369d2 --- /dev/null +++ b/src/Storages/System/StorageSystemNamedCollections.cpp @@ -0,0 +1,58 @@ +#include "StorageSystemNamedCollections.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +NamesAndTypesList StorageSystemNamedCollections::getNamesAndTypes() +{ + return { + {"name", std::make_shared()}, + {"collection", std::make_shared(std::make_shared(), std::make_shared())}, + }; 
+} + +StorageSystemNamedCollections::StorageSystemNamedCollections(const StorageID & table_id_) + : IStorageSystemOneBlock(table_id_) +{ +} + +void StorageSystemNamedCollections::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const +{ + context->checkAccess(AccessType::SHOW_NAMED_COLLECTIONS); + + auto collections = NamedCollectionFactory::instance().getAll(); + for (const auto & [name, collection] : collections) + { + res_columns[0]->insert(name); + + auto * column_map = typeid_cast(res_columns[1].get()); + + auto & offsets = column_map->getNestedColumn().getOffsets(); + auto & tuple_column = column_map->getNestedData(); + auto & key_column = tuple_column.getColumn(0); + auto & value_column = tuple_column.getColumn(1); + + size_t size = 0; + for (const auto & key : collection->getKeys()) + { + key_column.insertData(key.data(), key.size()); + value_column.insert(collection->get(key)); + size++; + } + + offsets.push_back(offsets.back() + size); + } +} + +} diff --git a/src/Storages/System/StorageSystemNamedCollections.h b/src/Storages/System/StorageSystemNamedCollections.h new file mode 100644 index 00000000000..d20fa62d30b --- /dev/null +++ b/src/Storages/System/StorageSystemNamedCollections.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +namespace DB +{ + +class StorageSystemNamedCollections final : public IStorageSystemOneBlock +{ +public: + explicit StorageSystemNamedCollections(const StorageID & table_id_); + + std::string getName() const override { return "SystemNamedCollections"; } + + static NamesAndTypesList getNamesAndTypes(); + +protected: + void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override; +}; + +} diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index d3b81f4d1f9..068f7ddce46 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -174,6 +175,7 @@ void attachSystemTablesServer(ContextPtr context, IDatabase & system_database, b attach(context, system_database, "filesystem_cache"); attach(context, system_database, "remote_data_paths"); attach(context, system_database, "certificates"); + attach(context, system_database, "named_collections"); if (has_zookeeper) attach(context, system_database, "zookeeper"); diff --git a/src/Storages/registerStorages.cpp b/src/Storages/registerStorages.cpp index fd81b55ef61..200b8e637da 100644 --- a/src/Storages/registerStorages.cpp +++ b/src/Storages/registerStorages.cpp @@ -32,6 +32,7 @@ void registerStorageMeiliSearch(StorageFactory& factory); #if USE_AWS_S3 void registerStorageS3(StorageFactory & factory); void registerStorageCOS(StorageFactory & factory); +void registerStorageOSS(StorageFactory & factory); void registerStorageHudi(StorageFactory & factory); void registerStorageDelta(StorageFactory & factory); #endif @@ -120,6 +121,7 @@ void registerStorages() #if USE_AWS_S3 registerStorageS3(factory); registerStorageCOS(factory); + registerStorageOSS(factory); registerStorageHudi(factory); registerStorageDelta(factory); #endif diff --git a/src/Storages/tests/gtest_named_collections.cpp b/src/Storages/tests/gtest_named_collections.cpp new file mode 100644 index 00000000000..5ba9156bcd9 --- /dev/null +++ b/src/Storages/tests/gtest_named_collections.cpp @@ -0,0 +1,143 @@ +#include +#include +#include +#include +#include + +using namespace DB; + 
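
Note: in fillData() above each named collection becomes one row of a Map(String, String) column: all key/value pairs are appended to the nested key and value columns and a running offset marks where each row ends. A plain-vector model of that layout (illustrative only, not the ColumnMap API):

#include <cstdint>
#include <string>
#include <utility>
#include <vector>

struct MapColumnModel
{
    std::vector<std::string> keys;      /// nested tuple column 0
    std::vector<std::string> values;    /// nested tuple column 1
    std::vector<uint64_t> offsets;      /// offsets[i] = total pairs after row i

    void insertRow(const std::vector<std::pair<std::string, std::string>> & row)
    {
        for (const auto & [k, v] : row)
        {
            keys.push_back(k);
            values.push_back(v);
        }
        const uint64_t prev = offsets.empty() ? 0 : offsets.back();
        offsets.push_back(prev + row.size());
    }
};

Reading row i back means taking the pairs in the half-open range [offsets[i - 1], offsets[i]).
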
+TEST(NamedCollections, SimpleConfig) +{ + std::string xml(R"CONFIG( + + + value1 + 2 + 3.3 + -4 + + + value4 + 5 + 6.6 + + +)CONFIG"); + + Poco::XML::DOMParser dom_parser; + Poco::AutoPtr document = dom_parser.parseString(xml); + Poco::AutoPtr config = new Poco::Util::XMLConfiguration(document); + + NamedCollectionFactory::instance().initialize(*config); + + ASSERT_TRUE(NamedCollectionFactory::instance().exists("collection1")); + ASSERT_TRUE(NamedCollectionFactory::instance().exists("collection2")); + ASSERT_TRUE(NamedCollectionFactory::instance().tryGet("collection3") == nullptr); + + auto collections = NamedCollectionFactory::instance().getAll(); + ASSERT_EQ(collections.size(), 2); + ASSERT_TRUE(collections.contains("collection1")); + ASSERT_TRUE(collections.contains("collection2")); + + ASSERT_EQ(collections["collection1"]->dumpStructure(), + R"CONFIG(key1: value1 +key2: 2 +key3: 3.3 +key4: -4 +)CONFIG"); + + auto collection1 = NamedCollectionFactory::instance().get("collection1"); + ASSERT_TRUE(collection1 != nullptr); + + ASSERT_TRUE(collection1->get("key1") == "value1"); + ASSERT_TRUE(collection1->get("key2") == 2); + ASSERT_TRUE(collection1->get("key3") == 3.3); + ASSERT_TRUE(collection1->get("key4") == -4); + + ASSERT_EQ(collections["collection2"]->dumpStructure(), + R"CONFIG(key4: value4 +key5: 5 +key6: 6.6 +)CONFIG"); + + auto collection2 = NamedCollectionFactory::instance().get("collection2"); + ASSERT_TRUE(collection2 != nullptr); + + ASSERT_TRUE(collection2->get("key4") == "value4"); + ASSERT_TRUE(collection2->get("key5") == 5); + ASSERT_TRUE(collection2->get("key6") == 6.6); + + auto collection2_copy = collections["collection2"]->duplicate(); + NamedCollectionFactory::instance().add("collection2_copy", collection2_copy); + ASSERT_TRUE(NamedCollectionFactory::instance().exists("collection2_copy")); + ASSERT_EQ(NamedCollectionFactory::instance().get("collection2_copy")->dumpStructure(), + R"CONFIG(key4: value4 +key5: 5 +key6: 6.6 +)CONFIG"); + + collection2_copy->set("key4", "value44", true); + ASSERT_TRUE(collection2_copy->get("key4") == "value44"); + ASSERT_TRUE(collection2->get("key4") == "value4"); + + collection2_copy->remove("key4"); + ASSERT_TRUE(collection2_copy->getOrDefault("key4", "N") == "N"); + ASSERT_TRUE(collection2->getOrDefault("key4", "N") == "value4"); + + collection2_copy->set("key4", "value45"); + ASSERT_TRUE(collection2_copy->getOrDefault("key4", "N") == "value45"); + + NamedCollectionFactory::instance().remove("collection2_copy"); + ASSERT_FALSE(NamedCollectionFactory::instance().exists("collection2_copy")); + + config.reset(); +} + +TEST(NamedCollections, NestedConfig) +{ + std::string xml(R"CONFIG( + + + + value1 + + + value2_1 + + + 4 + 5 + + + + + +)CONFIG"); + + Poco::XML::DOMParser dom_parser; + Poco::AutoPtr document = dom_parser.parseString(xml); + Poco::AutoPtr config = new Poco::Util::XMLConfiguration(document); + NamedCollectionFactory::instance().reload(*config); + + ASSERT_TRUE(NamedCollectionFactory::instance().exists("collection1")); + + auto collection1 = NamedCollectionFactory::instance().get("collection1"); + ASSERT_TRUE(collection1 != nullptr); + + ASSERT_EQ(collection1->dumpStructure(), + R"CONFIG(key1: + key1_1: value1 +key2: + key2_1: value2_1 + key2_2: + key2_3: + key2_4: 4 + key2_5: 5 +)CONFIG"); + + ASSERT_EQ(collection1->get("key1.key1_1"), "value1"); + ASSERT_EQ(collection1->get("key2.key2_1"), "value2_1"); + ASSERT_EQ(collection1->get("key2.key2_2.key2_3.key2_4"), 4); + ASSERT_EQ(collection1->get("key2.key2_2.key2_3.key2_5"), 
5); + +} diff --git a/src/TableFunctions/ITableFunction.h b/src/TableFunctions/ITableFunction.h index a05edcd32c8..79c58083020 100644 --- a/src/TableFunctions/ITableFunction.h +++ b/src/TableFunctions/ITableFunction.h @@ -55,15 +55,17 @@ public: virtual ColumnsDescription getActualTableStructure(ContextPtr /*context*/) const = 0; /// Check if table function needs a structure hint from SELECT query in case of - /// INSERT INTO FUNCTION ... SELECT ... + /// INSERT INTO FUNCTION ... SELECT ... and INSERT INTO ... SELECT ... FROM table_function(...) /// It's used for schema inference. virtual bool needStructureHint() const { return false; } /// Set a structure hint from SELECT query in case of - /// INSERT INTO FUNCTION ... SELECT ... + /// INSERT INTO FUNCTION ... SELECT ... and INSERT INTO ... SELECT ... FROM table_function(...) /// This hint could be used not to repeat schema in function arguments. virtual void setStructureHint(const ColumnsDescription &) {} + virtual bool supportsReadingSubsetOfColumns() { return true; } + /// Create storage according to the query. StoragePtr execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}, bool use_global_context = false) const; diff --git a/src/TableFunctions/ITableFunctionFileLike.cpp b/src/TableFunctions/ITableFunctionFileLike.cpp index 8be2341b81d..d62e44a16cc 100644 --- a/src/TableFunctions/ITableFunctionFileLike.cpp +++ b/src/TableFunctions/ITableFunctionFileLike.cpp @@ -34,6 +34,11 @@ String ITableFunctionFileLike::getFormatFromFirstArgument() return FormatFactory::instance().getFormatFromFileName(filename, true); } +bool ITableFunctionFileLike::supportsReadingSubsetOfColumns() +{ + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format); +} + void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, ContextPtr context) { /// Parse args diff --git a/src/TableFunctions/ITableFunctionFileLike.h b/src/TableFunctions/ITableFunctionFileLike.h index c2f32eb0aa3..589fce67638 100644 --- a/src/TableFunctions/ITableFunctionFileLike.h +++ b/src/TableFunctions/ITableFunctionFileLike.h @@ -18,6 +18,8 @@ public: void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; } + bool supportsReadingSubsetOfColumns() override; + protected: void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; virtual void parseFirstArguments(const ASTPtr & arg, const ContextPtr & context); diff --git a/src/TableFunctions/TableFunctionDelta.cpp b/src/TableFunctions/TableFunctionDelta.cpp new file mode 100644 index 00000000000..25ea2aaa77f --- /dev/null +++ b/src/TableFunctions/TableFunctionDelta.cpp @@ -0,0 +1,170 @@ +#include "config.h" + +#if USE_AWS_S3 + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include "registerTableFunctions.h" + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + + +void TableFunctionDelta::parseArgumentsImpl( + const String & error_message, ASTs & args, ContextPtr context, StorageS3Configuration & base_configuration) +{ + if (args.empty() || args.size() > 6) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, error_message); + + auto header_it = StorageURL::collectHeaders(args, base_configuration, context); + if (header_it != args.end()) + args.erase(header_it); + + for (auto & arg : args) + arg = 
evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + + /// Size -> argument indexes + static auto size_to_args = std::map>{ + {1, {{}}}, + {2, {{"format", 1}}}, + {5, {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}}}, + {6, {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}, {"compression_method", 5}}}}; + + std::map args_to_idx; + /// For 4 arguments we support 2 possible variants: + /// deltaLake(source, format, structure, compression_method) and deltaLake(source, access_key_id, access_key_id, format) + /// We can distinguish them by looking at the 2-nd argument: check if it's a format name or not. + if (args.size() == 4) + { + auto second_arg = checkAndGetLiteralArgument(args[1], "format/access_key_id"); + if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) + args_to_idx = {{"format", 1}, {"structure", 2}, {"compression_method", 3}}; + + else + args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}}; + } + /// For 3 arguments we support 2 possible variants: + /// deltaLake(source, format, structure) and deltaLake(source, access_key_id, access_key_id) + /// We can distinguish them by looking at the 2-nd argument: check if it's a format name or not. + else if (args.size() == 3) + { + auto second_arg = checkAndGetLiteralArgument(args[1], "format/access_key_id"); + if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) + args_to_idx = {{"format", 1}, {"structure", 2}}; + else + args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}}; + } + else + { + args_to_idx = size_to_args[args.size()]; + } + + /// This argument is always the first + base_configuration.url = checkAndGetLiteralArgument(args[0], "url"); + + if (args_to_idx.contains("format")) + base_configuration.format = checkAndGetLiteralArgument(args[args_to_idx["format"]], "format"); + else + base_configuration.format = "Parquet"; + + if (args_to_idx.contains("structure")) + base_configuration.structure = checkAndGetLiteralArgument(args[args_to_idx["structure"]], "structure"); + + if (args_to_idx.contains("compression_method")) + base_configuration.compression_method + = checkAndGetLiteralArgument(args[args_to_idx["compression_method"]], "compression_method"); + + if (args_to_idx.contains("access_key_id")) + base_configuration.auth_settings.access_key_id + = checkAndGetLiteralArgument(args[args_to_idx["access_key_id"]], "access_key_id"); + + if (args_to_idx.contains("secret_access_key")) + base_configuration.auth_settings.secret_access_key + = checkAndGetLiteralArgument(args[args_to_idx["secret_access_key"]], "secret_access_key"); +} + +void TableFunctionDelta::parseArguments(const ASTPtr & ast_function, ContextPtr context) +{ + /// Parse args + ASTs & args_func = ast_function->children; + + const auto message = fmt::format( + "The signature of table function {} could be the following:\n" \ + " - url\n" \ + " - url, format\n" \ + " - url, format, structure\n" \ + " - url, access_key_id, secret_access_key\n" \ + " - url, format, structure, compression_method\n" \ + " - url, access_key_id, secret_access_key, format\n" \ + " - url, access_key_id, secret_access_key, format, structure\n" \ + " - url, access_key_id, secret_access_key, format, structure, compression_method", + getName()); + + if (args_func.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' must have arguments", getName()); + + auto & args = 
args_func.at(0)->children; + + parseArgumentsImpl(message, args, context, configuration); +} + +ColumnsDescription TableFunctionDelta::getActualTableStructure(ContextPtr context) const +{ + if (configuration.structure == "auto") + { + context->checkAccess(getSourceAccessType()); + return StorageS3::getTableStructureFromData(configuration, false, std::nullopt, context); + } + + return parseColumnsListFromString(configuration.structure, context); +} + +StoragePtr TableFunctionDelta::executeImpl( + const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +{ + Poco::URI uri(configuration.url); + S3::URI s3_uri(uri); + + ColumnsDescription columns; + if (configuration.structure != "auto") + columns = parseColumnsListFromString(configuration.structure, context); + + StoragePtr storage = std::make_shared( + configuration, StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, String{}, context, std::nullopt); + + storage->startup(); + + return storage; +} + + +void registerTableFunctionDelta(TableFunctionFactory & factory) +{ + factory.registerFunction( + {.documentation + = {R"(The table function can be used to read the DeltaLake table stored on object store.)", + Documentation::Examples{{"hudi", "SELECT * FROM deltaLake(url, access_key_id, secret_access_key)"}}, + Documentation::Categories{"DataLake"}}, + .allow_readonly = true}); +} + +} + +#endif diff --git a/src/TableFunctions/TableFunctionDelta.h b/src/TableFunctions/TableFunctionDelta.h new file mode 100644 index 00000000000..badfd63f431 --- /dev/null +++ b/src/TableFunctions/TableFunctionDelta.h @@ -0,0 +1,44 @@ +#pragma once + +#include "config.h" + +#if USE_AWS_S3 + +#include +#include + + +namespace DB +{ + +class Context; +class TableFunctionS3Cluster; + +/* deltaLake(source, [access_key_id, secret_access_key,] format, structure[, compression]) - creates a temporary DeltaLake table on S3. 
+ */ +class TableFunctionDelta : public ITableFunction +{ +public: + static constexpr auto name = "deltaLake"; + std::string getName() const override + { + return name; + } + +protected: + StoragePtr executeImpl( + const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + + const char * getStorageTypeName() const override { return name; } + + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; + + static void parseArgumentsImpl(const String & error_message, ASTs & args, ContextPtr context, StorageS3Configuration & configuration); + + StorageS3Configuration configuration; +}; + +} + +#endif diff --git a/src/TableFunctions/TableFunctionHudi.cpp b/src/TableFunctions/TableFunctionHudi.cpp new file mode 100644 index 00000000000..b1db90da550 --- /dev/null +++ b/src/TableFunctions/TableFunctionHudi.cpp @@ -0,0 +1,169 @@ +#include "config.h" + +#if USE_AWS_S3 + +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include "registerTableFunctions.h" + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +} + + +void TableFunctionHudi::parseArgumentsImpl( + const String & error_message, ASTs & args, ContextPtr context, StorageS3Configuration & base_configuration) +{ + if (args.empty() || args.size() > 6) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, error_message); + + auto header_it = StorageURL::collectHeaders(args, base_configuration, context); + if (header_it != args.end()) + args.erase(header_it); + + for (auto & arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + + /// Size -> argument indexes + static auto size_to_args = std::map>{ + {1, {{}}}, + {2, {{"format", 1}}}, + {5, {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}}}, + {6, {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}, {"structure", 4}, {"compression_method", 5}}}}; + + std::map args_to_idx; + /// For 4 arguments we support 2 possible variants: + /// hudi(source, format, structure, compression_method) and hudi(source, access_key_id, access_key_id, format) + /// We can distinguish them by looking at the 2-nd argument: check if it's a format name or not. + if (args.size() == 4) + { + auto second_arg = checkAndGetLiteralArgument(args[1], "format/access_key_id"); + if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) + args_to_idx = {{"format", 1}, {"structure", 2}, {"compression_method", 3}}; + + else + args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}, {"format", 3}}; + } + /// For 3 arguments we support 2 possible variants: + /// hudi(source, format, structure) and hudi(source, access_key_id, access_key_id) + /// We can distinguish them by looking at the 2-nd argument: check if it's a format name or not. 
+ else if (args.size() == 3) + { + auto second_arg = checkAndGetLiteralArgument(args[1], "format/access_key_id"); + if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) + args_to_idx = {{"format", 1}, {"structure", 2}}; + else + args_to_idx = {{"access_key_id", 1}, {"secret_access_key", 2}}; + } + else + { + args_to_idx = size_to_args[args.size()]; + } + + /// This argument is always the first + base_configuration.url = checkAndGetLiteralArgument(args[0], "url"); + + if (args_to_idx.contains("format")) + base_configuration.format = checkAndGetLiteralArgument(args[args_to_idx["format"]], "format"); + else + base_configuration.format = "Parquet"; + + if (args_to_idx.contains("structure")) + base_configuration.structure = checkAndGetLiteralArgument(args[args_to_idx["structure"]], "structure"); + + if (args_to_idx.contains("compression_method")) + base_configuration.compression_method + = checkAndGetLiteralArgument(args[args_to_idx["compression_method"]], "compression_method"); + + if (args_to_idx.contains("access_key_id")) + base_configuration.auth_settings.access_key_id + = checkAndGetLiteralArgument(args[args_to_idx["access_key_id"]], "access_key_id"); + + if (args_to_idx.contains("secret_access_key")) + base_configuration.auth_settings.secret_access_key + = checkAndGetLiteralArgument(args[args_to_idx["secret_access_key"]], "secret_access_key"); +} + +void TableFunctionHudi::parseArguments(const ASTPtr & ast_function, ContextPtr context) +{ + /// Parse args + ASTs & args_func = ast_function->children; + + const auto message = fmt::format( + "The signature of table function {} could be the following:\n" \ + " - url\n" \ + " - url, format\n" \ + " - url, format, structure\n" \ + " - url, access_key_id, secret_access_key\n" \ + " - url, format, structure, compression_method\n" \ + " - url, access_key_id, secret_access_key, format\n" \ + " - url, access_key_id, secret_access_key, format, structure\n" \ + " - url, access_key_id, secret_access_key, format, structure, compression_method", + getName()); + + if (args_func.size() != 1) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' must have arguments", getName()); + + auto & args = args_func.at(0)->children; + + parseArgumentsImpl(message, args, context, configuration); +} + +ColumnsDescription TableFunctionHudi::getActualTableStructure(ContextPtr context) const +{ + if (configuration.structure == "auto") + { + context->checkAccess(getSourceAccessType()); + return StorageS3::getTableStructureFromData(configuration, false, std::nullopt, context); + } + + return parseColumnsListFromString(configuration.structure, context); +} + +StoragePtr TableFunctionHudi::executeImpl( + const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const +{ + Poco::URI uri(configuration.url); + S3::URI s3_uri(uri); + + ColumnsDescription columns; + if (configuration.structure != "auto") + columns = parseColumnsListFromString(configuration.structure, context); + + StoragePtr storage = std::make_shared( + configuration, StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, String{}, context, std::nullopt); + + storage->startup(); + + return storage; +} + + +void registerTableFunctionHudi(TableFunctionFactory & factory) +{ + factory.registerFunction( + {.documentation + = {R"(The table function can be used to read the Hudi table stored on object store.)", + Documentation::Examples{{"hudi", "SELECT * FROM 
hudi(url, access_key_id, secret_access_key)"}}, + Documentation::Categories{"DataLake"}}, + .allow_readonly = true}); +} +} + +#endif diff --git a/src/TableFunctions/TableFunctionHudi.h b/src/TableFunctions/TableFunctionHudi.h new file mode 100644 index 00000000000..a370bca8c45 --- /dev/null +++ b/src/TableFunctions/TableFunctionHudi.h @@ -0,0 +1,44 @@ +#pragma once + +#include "config.h" + +#if USE_AWS_S3 + +#include +#include + + +namespace DB +{ + +class Context; +class TableFunctionS3Cluster; + +/* hudi(source, [access_key_id, secret_access_key,] format, structure[, compression]) - creates a temporary Hudi table on S3. + */ +class TableFunctionHudi : public ITableFunction +{ +public: + static constexpr auto name = "hudi"; + std::string getName() const override + { + return name; + } + +protected: + StoragePtr executeImpl( + const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns) const override; + + const char * getStorageTypeName() const override { return name; } + + ColumnsDescription getActualTableStructure(ContextPtr context) const override; + void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; + + static void parseArgumentsImpl(const String & error_message, ASTs & args, ContextPtr context, StorageS3Configuration & configuration); + + StorageS3Configuration configuration; +}; + +} + +#endif diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index 4c0b5352545..be6dc6d28c5 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -146,10 +146,15 @@ ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context) return parseColumnsListFromString(configuration.structure, context); } +bool TableFunctionS3::supportsReadingSubsetOfColumns() +{ + return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format); +} + StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { - Poco::URI uri (configuration.url); - S3::URI s3_uri (uri); + Poco::URI uri(configuration.url); + S3::URI s3_uri(uri); ColumnsDescription columns; if (configuration.structure != "auto") @@ -183,6 +188,11 @@ void registerTableFunctionCOS(TableFunctionFactory & factory) factory.registerFunction(); } +void registerTableFunctionOSS(TableFunctionFactory & factory) +{ + factory.registerFunction(); +} + } #endif diff --git a/src/TableFunctions/TableFunctionS3.h b/src/TableFunctions/TableFunctionS3.h index 5c12c2a3975..b2eb03e8839 100644 --- a/src/TableFunctions/TableFunctionS3.h +++ b/src/TableFunctions/TableFunctionS3.h @@ -30,6 +30,8 @@ public: void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; } + bool supportsReadingSubsetOfColumns() override; + protected: friend class TableFunctionS3Cluster; @@ -62,6 +64,18 @@ private: const char * getStorageTypeName() const override { return "COSN"; } }; +class TableFunctionOSS : public TableFunctionS3 +{ +public: + static constexpr auto name = "oss"; + std::string getName() const override + { + return name; + } +private: + const char * getStorageTypeName() const override { return "OSS"; } +}; + } #endif diff --git a/src/TableFunctions/registerTableFunctions.cpp b/src/TableFunctions/registerTableFunctions.cpp index 9328c12c122..e6c32766559 100644 --- a/src/TableFunctions/registerTableFunctions.cpp +++ 
b/src/TableFunctions/registerTableFunctions.cpp @@ -27,6 +27,10 @@ void registerTableFunctions() registerTableFunctionS3(factory); registerTableFunctionS3Cluster(factory); registerTableFunctionCOS(factory); + registerTableFunctionHudi(factory); + registerTableFunctionDelta(factory); + registerTableFunctionOSS(factory); + #endif #if USE_HDFS diff --git a/src/TableFunctions/registerTableFunctions.h b/src/TableFunctions/registerTableFunctions.h index 3bc9e3a85da..12a26bec70a 100644 --- a/src/TableFunctions/registerTableFunctions.h +++ b/src/TableFunctions/registerTableFunctions.h @@ -24,6 +24,9 @@ void registerTableFunctionMeiliSearch(TableFunctionFactory & factory); void registerTableFunctionS3(TableFunctionFactory & factory); void registerTableFunctionS3Cluster(TableFunctionFactory & factory); void registerTableFunctionCOS(TableFunctionFactory & factory); +void registerTableFunctionHudi(TableFunctionFactory & factory); +void registerTableFunctionDelta(TableFunctionFactory & factory); +void registerTableFunctionOSS(TableFunctionFactory & factory); #endif #if USE_HDFS diff --git a/tests/ci/commit_status_helper.py b/tests/ci/commit_status_helper.py index 8b9d28502c1..185dc64daa9 100644 --- a/tests/ci/commit_status_helper.py +++ b/tests/ci/commit_status_helper.py @@ -3,19 +3,21 @@ import csv import os import time -from typing import Optional +from typing import List import logging from ci_config import CI_CONFIG, REQUIRED_CHECKS from env_helper import GITHUB_REPOSITORY, GITHUB_RUN_URL from github import Github from github.Commit import Commit -from pr_info import SKIP_MERGEABLE_CHECK_LABEL +from github.CommitStatus import CommitStatus +from pr_info import PRInfo, SKIP_MERGEABLE_CHECK_LABEL RETRY = 5 +CommitStatuses = List[CommitStatus] -def override_status(status, check_name, invert=False): +def override_status(status: str, check_name: str, invert=False) -> str: if CI_CONFIG["tests_config"].get(check_name, {}).get("force_tests", False): return "success" @@ -27,24 +29,23 @@ def override_status(status, check_name, invert=False): return status -def get_commit( - gh: Github, commit_sha: str, retry_count: int = RETRY -) -> Optional[Commit]: +def get_commit(gh: Github, commit_sha: str, retry_count: int = RETRY) -> Commit: for i in range(retry_count): try: repo = gh.get_repo(GITHUB_REPOSITORY) commit = repo.get_commit(commit_sha) - return commit + break except Exception as ex: if i == retry_count - 1: raise ex time.sleep(i) - # just suppress warning - return None + return commit -def post_commit_status(gh, sha, check_name, description, state, report_url): +def post_commit_status( + gh: Github, sha: str, check_name: str, description: str, state: str, report_url: str +): for i in range(RETRY): try: commit = get_commit(gh, sha, 1) @@ -61,7 +62,9 @@ def post_commit_status(gh, sha, check_name, description, state, report_url): time.sleep(i) -def post_commit_status_to_file(file_path, description, state, report_url): +def post_commit_status_to_file( + file_path: str, description: str, state: str, report_url: str +): if os.path.exists(file_path): raise Exception(f'File "{file_path}" already exists!') with open(file_path, "w", encoding="utf-8") as f: @@ -69,21 +72,37 @@ def post_commit_status_to_file(file_path, description, state, report_url): out.writerow([state, report_url, description]) -def remove_labels(gh, pr_info, labels_names): +def get_commit_filtered_statuses(commit: Commit) -> CommitStatuses: + """ + Squash statuses to latest state + 1. context="first", state="success", update_time=1 + 2. 
context="second", state="success", update_time=2 + 3. context="first", stat="failure", update_time=3 + =========> + 1. context="second", state="success" + 2. context="first", stat="failure" + """ + filtered = {} + for status in sorted(commit.get_statuses(), key=lambda x: x.updated_at): + filtered[status.context] = status + return list(filtered.values()) + + +def remove_labels(gh: Github, pr_info: PRInfo, labels_names: List[str]): repo = gh.get_repo(GITHUB_REPOSITORY) pull_request = repo.get_pull(pr_info.number) for label in labels_names: pull_request.remove_from_labels(label) -def post_labels(gh, pr_info, labels_names): +def post_labels(gh: Github, pr_info: PRInfo, labels_names: List[str]): repo = gh.get_repo(GITHUB_REPOSITORY) pull_request = repo.get_pull(pr_info.number) for label in labels_names: pull_request.add_to_labels(label) -def fail_mergeable_check(commit, description): +def fail_mergeable_check(commit: Commit, description: str): commit.create_status( context="Mergeable Check", description=description, @@ -92,7 +111,7 @@ def fail_mergeable_check(commit, description): ) -def reset_mergeable_check(commit, description=""): +def reset_mergeable_check(commit: Commit, description: str = ""): commit.create_status( context="Mergeable Check", description=description, @@ -101,7 +120,7 @@ def reset_mergeable_check(commit, description=""): ) -def update_mergeable_check(gh, pr_info, check_name): +def update_mergeable_check(gh: Github, pr_info: PRInfo, check_name: str): if SKIP_MERGEABLE_CHECK_LABEL in pr_info.labels: return diff --git a/tests/ci/rerun_helper.py b/tests/ci/rerun_helper.py index c4ae70eadb9..fa73256d759 100644 --- a/tests/ci/rerun_helper.py +++ b/tests/ci/rerun_helper.py @@ -1,14 +1,13 @@ #!/usr/bin/env python3 -from typing import List, Optional +from typing import Optional -from commit_status_helper import get_commit +from commit_status_helper import get_commit, get_commit_filtered_statuses from github import Github from github.CommitStatus import CommitStatus from pr_info import PRInfo -CommitStatuses = List[CommitStatus] - +# TODO: move it to commit_status_helper class RerunHelper: def __init__(self, gh: Github, pr_info: PRInfo, check_name: str): self.gh = gh @@ -18,7 +17,7 @@ class RerunHelper: if commit is None: raise ValueError(f"unable to receive commit for {pr_info.sha}") self.pygh_commit = commit - self.statuses = self.ger_filtered_statuses() + self.statuses = get_commit_filtered_statuses(commit) def is_already_finished_by_status(self) -> bool: # currently we agree even for failed statuses @@ -35,20 +34,3 @@ class RerunHelper: if self.check_name in status.context: return status return None - - def ger_filtered_statuses(self) -> CommitStatuses: - """ - Squash statuses to latest state - 1. context="first", state="success", update_time=1 - 2. context="second", state="success", update_time=2 - 3. context="first", stat="failure", update_time=3 - =========> - 1. context="second", state="success" - 2. 
context="first", stat="failure" - """ - filt = {} - for status in sorted( - self.pygh_commit.get_statuses(), key=lambda x: x.updated_at - ): - filt[status.context] = status - return list(filt.values()) diff --git a/tests/ci/sqlancer_check.py b/tests/ci/sqlancer_check.py index 51c95e50746..63c7d18fe46 100644 --- a/tests/ci/sqlancer_check.py +++ b/tests/ci/sqlancer_check.py @@ -150,7 +150,7 @@ if __name__ == "__main__": os.path.join(workspace_path, "summary.tsv"), "r", encoding="utf-8" ) as summary_f: for line in summary_f: - l = line.split("\t") + l = line.rstrip("\n").split("\t") test_results.append((l[0], l[1])) with open( diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 666833013c8..a190126a8ff 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -2070,10 +2070,12 @@ class ClickHouseCluster: logging.debug("All instances of ZooKeeper started") return except Exception as ex: - logging.debug("Can't connect to ZooKeeper " + str(ex)) + logging.debug(f"Can't connect to ZooKeeper {instance}: {ex}") time.sleep(0.5) - raise Exception("Cannot wait ZooKeeper container") + raise Exception( + "Cannot wait ZooKeeper container (probably it's a `iptables-nft` issue, you may try to `sudo iptables -P FORWARD ACCEPT`)" + ) def make_hdfs_api(self, timeout=180, kerberized=False): if kerberized: diff --git a/tests/integration/parallel_skip.json b/tests/integration/parallel_skip.json index 3b4d1f2f29a..ba593b76bbf 100644 --- a/tests/integration/parallel_skip.json +++ b/tests/integration/parallel_skip.json @@ -48,6 +48,8 @@ "test_system_replicated_fetches/test.py::test_system_replicated_fetches", "test_zookeeper_config_load_balancing/test.py::test_round_robin", + "test_global_overcommit_tracker/test.py::test_global_overcommit", + "test_user_ip_restrictions/test.py::test_ipv4", "test_user_ip_restrictions/test.py::test_ipv6" ] diff --git a/tests/integration/test_attach_backup_from_s3_plain/configs/disk_s3.xml b/tests/integration/test_attach_backup_from_s3_plain/configs/disk_s3.xml index 67278694d39..e4f481d3325 100644 --- a/tests/integration/test_attach_backup_from_s3_plain/configs/disk_s3.xml +++ b/tests/integration/test_attach_backup_from_s3_plain/configs/disk_s3.xml @@ -9,23 +9,38 @@ minio123 33554432 - + s3_plain - - http://minio1:9001/root/data/disks/disk_s3_plain/backup/ + + http://minio1:9001/root/data/disks/disk_s3_plain/backup_compact/ minio minio123 33554432 - + + + s3_plain + + http://minio1:9001/root/data/disks/disk_s3_plain/backup_wide/ + minio + minio123 + 33554432 + - +
- attach_disk_s3_plain
+ s3_backup_compact
-
+ + + +
+ s3_backup_wide +
+
+
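For reference, once a server has loaded the configuration above, both new policies should be visible in system.storage_policies. A minimal check (a sketch; the policy names come from the config above, the column names from the standard system table):

    SELECT policy_name, volume_name, disks
    FROM system.storage_policies
    WHERE policy_name IN ('s3_backup_compact', 's3_backup_wide');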
diff --git a/tests/integration/test_attach_backup_from_s3_plain/test.py b/tests/integration/test_attach_backup_from_s3_plain/test.py index 35d53d5b8bd..f544a0c6e0a 100644 --- a/tests/integration/test_attach_backup_from_s3_plain/test.py +++ b/tests/integration/test_attach_backup_from_s3_plain/test.py @@ -21,20 +21,51 @@ def start_cluster(): cluster.shutdown() -def test_attach_backup(): +@pytest.mark.parametrize( + "table_name,backup_name,storage_policy,min_bytes_for_wide_part", + [ + pytest.param( + "compact", "backup_compact", "s3_backup_compact", int(1e9), id="compact" + ), + pytest.param("wide", "backup_wide", "s3_backup_wide", int(0), id="wide"), + ], +) +def test_attach_compact_part( + table_name, backup_name, storage_policy, min_bytes_for_wide_part +): node.query( f""" + -- Catch any errors (NOTE: warnings are ok) + set send_logs_level='error'; + -- BACKUP writes Ordinary like structure set allow_deprecated_database_ordinary=1; - create database ordinary engine=Ordinary; - create table ordinary.test_backup_attach engine=MergeTree() order by tuple() as select * from numbers(100); + create database ordinary_db engine=Ordinary; + + create table ordinary_db.{table_name} engine=MergeTree() order by tuple() as select * from numbers(100); -- NOTE: name of backup ("backup") is significant. - backup table ordinary.test_backup_attach TO Disk('backup_disk_s3_plain', 'backup'); + backup table ordinary_db.{table_name} TO Disk('backup_disk_s3_plain', '{backup_name}'); - drop table ordinary.test_backup_attach; - attach table ordinary.test_backup_attach (number UInt64) engine=MergeTree() order by tuple() settings storage_policy='attach_policy_s3_plain'; + drop table ordinary_db.{table_name}; + attach table ordinary_db.{table_name} (number UInt64) + engine=MergeTree() + order by tuple() + settings + min_bytes_for_wide_part={min_bytes_for_wide_part}, + storage_policy='{storage_policy}'; """ ) - assert int(node.query("select count() from ordinary.test_backup_attach")) == 100 + assert int(node.query(f"select count() from ordinary_db.{table_name}")) == 100 + + node.query( + f""" + -- NOTE: be aware not to DROP the table, but DETACH first to keep it in S3. 
+ detach table ordinary_db.{table_name}; + + -- NOTE: DROP DATABASE cannot be done w/o this due to metadata leftovers + set force_remove_data_recursively_on_drop=1; + drop database ordinary_db sync; + """ + ) diff --git a/tests/integration/test_keeper_four_word_command/test.py b/tests/integration/test_keeper_four_word_command/test.py index 4559904f8b7..c425c18158b 100644 --- a/tests/integration/test_keeper_four_word_command/test.py +++ b/tests/integration/test_keeper_four_word_command/test.py @@ -367,7 +367,7 @@ def test_cmd_stat(started_cluster): assert result["Received"] == "10" assert result["Sent"] == "10" assert int(result["Connections"]) == 1 - assert int(result["Zxid"]) > 14 + assert int(result["Zxid"]) >= 10 assert result["Mode"] == "leader" assert result["Node count"] == "13" diff --git a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py index c6f2f80c2fe..bed7772a3dd 100644 --- a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py +++ b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py @@ -875,6 +875,22 @@ def alter_rename_table_with_materialized_mysql_database( "1\n2\n3\n4\n5\n", ) + mysql_node.query( + "ALTER TABLE test_database_rename_table.test_table_4 RENAME test_database_rename_table.test_table_5" + ) + mysql_node.query( + "ALTER TABLE test_database_rename_table.test_table_5 RENAME TO test_database_rename_table.test_table_6" + ) + mysql_node.query( + "ALTER TABLE test_database_rename_table.test_table_6 RENAME AS test_database_rename_table.test_table_7" + ) + + check_query( + clickhouse_node, + "SELECT * FROM test_database_rename_table.test_table_7 ORDER BY id FORMAT TSV", + "1\n2\n3\n4\n5\n", + ) + clickhouse_node.query("DROP DATABASE test_database_rename_table") mysql_node.query("DROP DATABASE test_database_rename_table") diff --git a/tests/integration/test_s3_cluster/configs/cluster.xml b/tests/integration/test_s3_cluster/configs/cluster.xml index 18f15763633..3059340cfe4 100644 --- a/tests/integration/test_s3_cluster/configs/cluster.xml +++ b/tests/integration/test_s3_cluster/configs/cluster.xml @@ -20,6 +20,21 @@ + + + + s0_0_0 + 9000 + + + + + s0_0_0 + 19000 + + + + cluster_simple diff --git a/tests/integration/test_s3_cluster/test.py b/tests/integration/test_s3_cluster/test.py index 2cbb36fcf06..8e082f7d86a 100644 --- a/tests/integration/test_s3_cluster/test.py +++ b/tests/integration/test_s3_cluster/test.py @@ -195,3 +195,32 @@ def test_ambiguous_join(started_cluster): """ ) assert "AMBIGUOUS_COLUMN_NAME" not in result + + +def test_skip_unavailable_shards(started_cluster): + node = started_cluster.instances["s0_0_0"] + result = node.query( + """ + SELECT count(*) from s3Cluster( + 'cluster_non_existent_port', + 'http://minio1:9001/root/data/clickhouse/part1.csv', + 'minio', 'minio123', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') + SETTINGS skip_unavailable_shards = 1 + """ + ) + + assert result == "10\n" + + +def test_unskip_unavailable_shards(started_cluster): + node = started_cluster.instances["s0_0_0"] + error = node.query_and_get_error( + """ + SELECT count(*) from s3Cluster( + 'cluster_non_existent_port', + 'http://minio1:9001/root/data/clickhouse/part1.csv', + 'minio', 'minio123', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') + """ + ) + + assert "NETWORK_ERROR" in error diff --git a/tests/integration/test_storage_delta/test.py 
b/tests/integration/test_storage_delta/test.py index 76dab554a57..a63244df814 100644 --- a/tests/integration/test_storage_delta/test.py +++ b/tests/integration/test_storage_delta/test.py @@ -126,7 +126,20 @@ def test_select_query(started_cluster): run_query(instance, create_query) select_query = "SELECT {} FROM deltalake FORMAT TSV" + select_table_function_query = "SELECT {col} FROM deltaLake('http://{ip}:{port}/{bucket}/test_table/', 'minio', 'minio123') FORMAT TSV" for column_name in columns: result = run_query(instance, select_query.format(column_name)).splitlines() assert len(result) > 0 + + for column_name in columns: + result = run_query( + instance, + select_table_function_query.format( + col=column_name, + ip=started_cluster.minio_ip, + port=started_cluster.minio_port, + bucket=bucket, + ), + ).splitlines() + assert len(result) > 0 diff --git a/tests/integration/test_storage_hudi/test.py b/tests/integration/test_storage_hudi/test.py index 549421afd89..dd870aae42e 100644 --- a/tests/integration/test_storage_hudi/test.py +++ b/tests/integration/test_storage_hudi/test.py @@ -133,15 +133,38 @@ def test_select_query(started_cluster): select_query = "SELECT {} FROM hudi FORMAT TSV" + select_table_function_query = "SELECT {col} FROM hudi('http://{ip}:{port}/{bucket}/test_table/', 'minio', 'minio123') FORMAT TSV" + for column_name in columns: result = run_query(instance, select_query.format(column_name)).splitlines() assert len(result) > 0 + for column_name in columns: + result = run_query( + instance, + select_table_function_query.format( + col=column_name, + ip=started_cluster.minio_ip, + port=started_cluster.minio_port, + bucket=bucket, + ), + ).splitlines() + assert len(result) > 0 + # test if all partition paths is presented in result distinct_select_query = ( "SELECT DISTINCT partitionpath FROM hudi ORDER BY partitionpath FORMAT TSV" ) + + distinct_select_table_function_query = "SELECT DISTINCT partitionpath FROM hudi('http://{ip}:{port}/{bucket}/test_table/', 'minio', 'minio123') ORDER BY partitionpath FORMAT TSV" + result = run_query(instance, distinct_select_query) + result_table_function = run_query( + instance, + distinct_select_query.format( + ip=started_cluster.minio_ip, port=started_cluster.minio_port, bucket=bucket + ), + ) expected = [ "americas/brazil/sao_paulo", "americas/united_states/san_francisco", @@ -149,3 +172,4 @@ def test_select_query(started_cluster): ] assert TSV(result) == TSV(expected) + assert TSV(result_table_function) == TSV(expected) diff --git a/tests/performance/rand.xml b/tests/performance/rand.xml index 32c97bb77d6..807b811c147 100644 --- a/tests/performance/rand.xml +++ b/tests/performance/rand.xml @@ -10,5 +10,6 @@ SELECT count() FROM (SELECT rand() FROM zeros(1000000000)) SELECT count() FROM (SELECT rand64() FROM zeros(1000000000)) + SELECT count() FROM (SELECT randUniform(0, 1) FROM zeros(100000000)) SELECT count() FROM (SELECT generateUUIDv4() FROM zeros( 100000000)) diff --git a/tests/queries/0_stateless/01047_nullable_rand.reference b/tests/queries/0_stateless/01047_nullable_rand.reference index aa75dc9a81a..e9993590a84 100644 --- a/tests/queries/0_stateless/01047_nullable_rand.reference +++ b/tests/queries/0_stateless/01047_nullable_rand.reference @@ -1,8 +1,12 @@ UInt32 +Float64 +UInt32 UInt32 -UInt32 +Float64 UInt32 0 0 0 0 +0 +0 diff --git a/tests/queries/0_stateless/01047_nullable_rand.sql b/tests/queries/0_stateless/01047_nullable_rand.sql index 865647aa0fb..9d3c361c543 100644 --- a/tests/queries/0_stateless/01047_nullable_rand.sql +++ 
b/tests/queries/0_stateless/01047_nullable_rand.sql @@ -1,9 +1,13 @@ select toTypeName(rand(cast(4 as Nullable(UInt8)))); +select toTypeName(canonicalRand(CAST(4 as Nullable(UInt8)))); select toTypeName(randConstant(CAST(4 as Nullable(UInt8)))); select toTypeName(rand(Null)); +select toTypeName(canonicalRand(Null)); select toTypeName(randConstant(Null)); select rand(cast(4 as Nullable(UInt8))) * 0; +select canonicalRand(cast(4 as Nullable(UInt8))) * 0; select randConstant(CAST(4 as Nullable(UInt8))) * 0; select rand(Null) * 0; +select canonicalRand(Null) * 0; select randConstant(Null) * 0; diff --git a/tests/queries/0_stateless/01161_information_schema.reference b/tests/queries/0_stateless/01161_information_schema.reference index 3be800888c7..5331e30b899 100644 --- a/tests/queries/0_stateless/01161_information_schema.reference +++ b/tests/queries/0_stateless/01161_information_schema.reference @@ -14,14 +14,14 @@ default default v VIEW tmp LOCAL TEMPORARY default default mv SELECT * FROM system.one NONE NO YES NO NO NO default default v SELECT n, f FROM default.t NONE NO NO NO NO NO -default default mv dummy 1 0 UInt8 \N \N 8 2 0 \N \N \N \N \N \N \N \N \N \N UInt8 -default default t n 1 0 UInt64 \N \N 64 2 0 \N \N \N \N \N \N \N \N \N \N UInt64 -default default t f 2 0 Float32 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N Float32 -default default t s 3 0 String \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N String -default default t fs 4 0 FixedString(42) 42 42 \N \N \N \N \N \N \N \N \N \N \N \N \N FixedString(42) -default default t d 5 0 Decimal(9, 6) \N \N 9 10 6 \N \N \N \N \N \N \N \N \N \N Decimal(9, 6) -default default v n 1 1 Nullable(Int32) \N \N 32 2 0 \N \N \N \N \N \N \N \N \N \N Nullable(Int32) -default default v f 2 0 Float64 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N Float64 - tmp d 1 0 Date \N \N \N \N \N 0 \N \N \N \N \N \N \N \N \N Date - tmp dt 2 0 DateTime \N \N \N \N \N 0 \N \N \N \N \N \N \N \N \N DateTime - tmp dtms 3 0 DateTime64(3) \N \N \N \N \N 3 \N \N \N \N \N \N \N \N \N DateTime64(3) +default default mv default mv dummy 1 0 UInt8 \N \N 8 2 0 \N \N \N \N \N \N \N \N \N \N UInt8 +default default t default t n 1 0 UInt64 \N \N 64 2 0 \N \N \N \N \N \N \N \N \N \N UInt64 +default default t default t f 2 0 Float32 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N Float32 +default default t default t s 3 0 String \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N String +default default t default t fs 4 0 FixedString(42) 42 42 \N \N \N \N \N \N \N \N \N \N \N \N \N FixedString(42) +default default t default t d 5 0 Decimal(9, 6) \N \N 9 10 6 \N \N \N \N \N \N \N \N \N \N Decimal(9, 6) +default default v default v n 1 1 Nullable(Int32) \N \N 32 2 0 \N \N \N \N \N \N \N \N \N \N Nullable(Int32) +default default v default v f 2 0 Float64 \N \N \N \N \N \N \N \N \N \N \N \N \N \N \N Float64 + tmp tmp d 1 0 Date \N \N \N \N \N 0 \N \N \N \N \N \N \N \N \N Date + tmp tmp dt 2 0 DateTime \N \N \N \N \N 0 \N \N \N \N \N \N \N \N \N DateTime + tmp tmp dtms 3 0 DateTime64(3) \N \N \N \N \N 3 \N \N \N \N \N \N \N \N \N DateTime64(3) diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index b332a7d71f7..59c7d978493 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -85,6 +85,7 @@ SHOW ROLES ['SHOW CREATE ROLE'] GLOBAL SHOW ACCESS SHOW ROW POLICIES ['SHOW POLICIES','SHOW CREATE ROW POLICY','SHOW CREATE POLICY'] TABLE SHOW 
ACCESS SHOW QUOTAS ['SHOW CREATE QUOTA'] GLOBAL SHOW ACCESS SHOW SETTINGS PROFILES ['SHOW PROFILES','SHOW CREATE SETTINGS PROFILE','SHOW CREATE PROFILE'] GLOBAL SHOW ACCESS +SHOW NAMED COLLECTIONS ['SHOW NAMED COLLECTIONS'] GLOBAL SHOW ACCESS SHOW ACCESS [] \N ACCESS MANAGEMENT ACCESS MANAGEMENT [] \N ALL SYSTEM SHUTDOWN ['SYSTEM KILL','SHUTDOWN'] GLOBAL SYSTEM diff --git a/tests/queries/0_stateless/01606_git_import.reference b/tests/queries/0_stateless/01606_git_import.reference index 6b599307ba1..44ae4a3e039 100644 --- a/tests/queries/0_stateless/01606_git_import.reference +++ b/tests/queries/0_stateless/01606_git_import.reference @@ -1,4 +1,4 @@ 913 888 2931 -160553 +160553 4.6 diff --git a/tests/queries/0_stateless/01606_git_import.sh b/tests/queries/0_stateless/01606_git_import.sh index 6d425c9bceb..8a2091a99a8 100755 --- a/tests/queries/0_stateless/01606_git_import.sh +++ b/tests/queries/0_stateless/01606_git_import.sh @@ -118,7 +118,7 @@ ${CLICKHOUSE_CLIENT} --query "INSERT INTO line_changes FORMAT TSV" < line_change ${CLICKHOUSE_CLIENT} --query "SELECT count() FROM commits" ${CLICKHOUSE_CLIENT} --query "SELECT count() FROM file_changes" -${CLICKHOUSE_CLIENT} --query "SELECT count() FROM line_changes" +${CLICKHOUSE_CLIENT} --query "SELECT count(), round(avg(indent), 1) FROM line_changes" ${CLICKHOUSE_CLIENT} --multiline --multiquery --query " DROP TABLE commits; diff --git a/tests/queries/0_stateless/01825_type_json_schema_inference.sh b/tests/queries/0_stateless/01825_type_json_schema_inference.sh index 36991bd8069..a524eaec20a 100755 --- a/tests/queries/0_stateless/01825_type_json_schema_inference.sh +++ b/tests/queries/0_stateless/01825_type_json_schema_inference.sh @@ -46,7 +46,7 @@ ${CLICKHOUSE_CLIENT} -q "CREATE TABLE t_json_inference (obj JSON, map Map(String echo '{"map": {"k1": 1, "k2": 2}, "obj": {"k1": 1, "k2": 2}}' > $filename -${CLICKHOUSE_CLIENT} -q "INSERT INTO t_json_inference SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow')" --allow_experimental_object_type 1 +${CLICKHOUSE_CLIENT} -q "INSERT INTO t_json_inference SELECT * FROM file('${CLICKHOUSE_TEST_UNIQUE_NAME}/data.json', 'JSONEachRow')" --allow_experimental_object_type 1 --use_structure_from_insertion_table_in_table_functions 0 ${CLICKHOUSE_CLIENT} -q "SELECT * FROM t_json_inference FORMAT JSONEachRow" --output_format_json_named_tuples_as_objects 1 ${CLICKHOUSE_CLIENT} -q "SELECT toTypeName(obj) FROM t_json_inference LIMIT 1" diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 4ffa31a5618..7a0c383b3fb 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -23,7 +23,8 @@ COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.asynchronous_metrics ( `metric` String, - `value` Float64 + `value` Float64, + `description` String ) ENGINE = SystemAsynchronousMetrics COMMENT 'SYSTEM TABLE is built on the fly.' 
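The hunk above adds a `description` column to system.asynchronous_metrics. A minimal query illustrating the new column (a sketch, not part of the test expectations):

    SELECT metric, value, description
    FROM system.asynchronous_metrics
    ORDER BY metric
    LIMIT 3;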
@@ -281,7 +282,7 @@ CREATE TABLE system.grants ( `user_name` Nullable(String), `role_name` Nullable(String), - `access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER TABLE' = 41, 'ALTER DATABASE' = 42, 'ALTER VIEW REFRESH' = 43, 'ALTER VIEW MODIFY QUERY' = 44, 'ALTER VIEW' = 45, 'ALTER' = 46, 'CREATE DATABASE' = 47, 'CREATE TABLE' = 48, 'CREATE VIEW' = 49, 'CREATE DICTIONARY' = 50, 'CREATE TEMPORARY TABLE' = 51, 'CREATE FUNCTION' = 52, 'CREATE' = 53, 'DROP DATABASE' = 54, 'DROP TABLE' = 55, 'DROP VIEW' = 56, 'DROP DICTIONARY' = 57, 'DROP FUNCTION' = 58, 'DROP' = 59, 'TRUNCATE' = 60, 'OPTIMIZE' = 61, 'BACKUP' = 62, 'KILL QUERY' = 63, 'KILL TRANSACTION' = 64, 'MOVE PARTITION BETWEEN SHARDS' = 65, 'CREATE USER' = 66, 'ALTER USER' = 67, 'DROP USER' = 68, 'CREATE ROLE' = 69, 'ALTER ROLE' = 70, 'DROP ROLE' = 71, 'ROLE ADMIN' = 72, 'CREATE ROW POLICY' = 73, 'ALTER ROW POLICY' = 74, 'DROP ROW POLICY' = 75, 'CREATE QUOTA' = 76, 'ALTER QUOTA' = 77, 'DROP QUOTA' = 78, 'CREATE SETTINGS PROFILE' = 79, 'ALTER SETTINGS PROFILE' = 80, 'DROP SETTINGS PROFILE' = 81, 'SHOW USERS' = 82, 'SHOW ROLES' = 83, 'SHOW ROW POLICIES' = 84, 'SHOW QUOTAS' = 85, 'SHOW SETTINGS PROFILES' = 86, 'SHOW ACCESS' = 87, 'ACCESS MANAGEMENT' = 88, 'SYSTEM SHUTDOWN' = 89, 'SYSTEM DROP DNS CACHE' = 90, 'SYSTEM DROP MARK CACHE' = 91, 'SYSTEM DROP UNCOMPRESSED CACHE' = 92, 'SYSTEM DROP MMAP CACHE' = 93, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 94, 'SYSTEM DROP FILESYSTEM CACHE' = 95, 'SYSTEM DROP SCHEMA CACHE' = 96, 'SYSTEM DROP CACHE' = 97, 'SYSTEM RELOAD CONFIG' = 98, 'SYSTEM RELOAD USERS' = 99, 'SYSTEM RELOAD SYMBOLS' = 100, 'SYSTEM RELOAD DICTIONARY' = 101, 'SYSTEM RELOAD MODEL' = 102, 'SYSTEM RELOAD FUNCTION' = 103, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 104, 'SYSTEM RELOAD' = 105, 'SYSTEM RESTART DISK' = 106, 'SYSTEM MERGES' = 107, 'SYSTEM TTL MERGES' = 108, 'SYSTEM FETCHES' = 109, 'SYSTEM MOVES' = 110, 'SYSTEM DISTRIBUTED SENDS' = 111, 'SYSTEM REPLICATED SENDS' = 112, 'SYSTEM SENDS' = 113, 'SYSTEM REPLICATION QUEUES' = 114, 'SYSTEM DROP REPLICA' = 115, 'SYSTEM SYNC REPLICA' = 116, 'SYSTEM RESTART REPLICA' = 117, 'SYSTEM RESTORE REPLICA' = 118, 'SYSTEM SYNC DATABASE REPLICA' = 119, 'SYSTEM SYNC TRANSACTION LOG' = 120, 'SYSTEM FLUSH DISTRIBUTED' = 121, 'SYSTEM FLUSH LOGS' = 122, 'SYSTEM FLUSH' = 123, 'SYSTEM THREAD FUZZER' = 124, 'SYSTEM UNFREEZE' = 125, 'SYSTEM' = 126, 'dictGet' = 127, 'addressToLine' = 128, 'addressToLineWithInlines' = 129, 'addressToSymbol' = 130, 'demangle' = 131, 'INTROSPECTION' = 132, 'FILE' = 133, 'URL' = 
134, 'REMOTE' = 135, 'MONGO' = 136, 'MEILISEARCH' = 137, 'MYSQL' = 138, 'POSTGRES' = 139, 'SQLITE' = 140, 'ODBC' = 141, 'JDBC' = 142, 'HDFS' = 143, 'S3' = 144, 'HIVE' = 145, 'SOURCES' = 146, 'CLUSTER' = 147, 'ALL' = 148, 'NONE' = 149), + `access_type` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER TABLE' = 41, 'ALTER DATABASE' = 42, 'ALTER VIEW REFRESH' = 43, 'ALTER VIEW MODIFY QUERY' = 44, 'ALTER VIEW' = 45, 'ALTER' = 46, 'CREATE DATABASE' = 47, 'CREATE TABLE' = 48, 'CREATE VIEW' = 49, 'CREATE DICTIONARY' = 50, 'CREATE TEMPORARY TABLE' = 51, 'CREATE FUNCTION' = 52, 'CREATE' = 53, 'DROP DATABASE' = 54, 'DROP TABLE' = 55, 'DROP VIEW' = 56, 'DROP DICTIONARY' = 57, 'DROP FUNCTION' = 58, 'DROP' = 59, 'TRUNCATE' = 60, 'OPTIMIZE' = 61, 'BACKUP' = 62, 'KILL QUERY' = 63, 'KILL TRANSACTION' = 64, 'MOVE PARTITION BETWEEN SHARDS' = 65, 'CREATE USER' = 66, 'ALTER USER' = 67, 'DROP USER' = 68, 'CREATE ROLE' = 69, 'ALTER ROLE' = 70, 'DROP ROLE' = 71, 'ROLE ADMIN' = 72, 'CREATE ROW POLICY' = 73, 'ALTER ROW POLICY' = 74, 'DROP ROW POLICY' = 75, 'CREATE QUOTA' = 76, 'ALTER QUOTA' = 77, 'DROP QUOTA' = 78, 'CREATE SETTINGS PROFILE' = 79, 'ALTER SETTINGS PROFILE' = 80, 'DROP SETTINGS PROFILE' = 81, 'SHOW USERS' = 82, 'SHOW ROLES' = 83, 'SHOW ROW POLICIES' = 84, 'SHOW QUOTAS' = 85, 'SHOW SETTINGS PROFILES' = 86, 'SHOW NAMED COLLECTIONS' = 87, 'SHOW ACCESS' = 88, 'ACCESS MANAGEMENT' = 89, 'SYSTEM SHUTDOWN' = 90, 'SYSTEM DROP DNS CACHE' = 91, 'SYSTEM DROP MARK CACHE' = 92, 'SYSTEM DROP UNCOMPRESSED CACHE' = 93, 'SYSTEM DROP MMAP CACHE' = 94, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 95, 'SYSTEM DROP FILESYSTEM CACHE' = 96, 'SYSTEM DROP SCHEMA CACHE' = 97, 'SYSTEM DROP CACHE' = 98, 'SYSTEM RELOAD CONFIG' = 99, 'SYSTEM RELOAD USERS' = 100, 'SYSTEM RELOAD SYMBOLS' = 101, 'SYSTEM RELOAD DICTIONARY' = 102, 'SYSTEM RELOAD MODEL' = 103, 'SYSTEM RELOAD FUNCTION' = 104, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 105, 'SYSTEM RELOAD' = 106, 'SYSTEM RESTART DISK' = 107, 'SYSTEM MERGES' = 108, 'SYSTEM TTL MERGES' = 109, 'SYSTEM FETCHES' = 110, 'SYSTEM MOVES' = 111, 'SYSTEM DISTRIBUTED SENDS' = 112, 'SYSTEM REPLICATED SENDS' = 113, 'SYSTEM SENDS' = 114, 'SYSTEM REPLICATION QUEUES' = 115, 'SYSTEM DROP REPLICA' = 116, 'SYSTEM SYNC REPLICA' = 117, 'SYSTEM RESTART REPLICA' = 118, 'SYSTEM RESTORE REPLICA' = 119, 'SYSTEM SYNC DATABASE REPLICA' = 120, 'SYSTEM SYNC TRANSACTION LOG' = 121, 'SYSTEM FLUSH DISTRIBUTED' = 122, 'SYSTEM FLUSH LOGS' = 123, 'SYSTEM FLUSH' = 124, 'SYSTEM THREAD FUZZER' = 125, 'SYSTEM UNFREEZE' = 126, 'SYSTEM' = 127, 
'dictGet' = 128, 'addressToLine' = 129, 'addressToLineWithInlines' = 130, 'addressToSymbol' = 131, 'demangle' = 132, 'INTROSPECTION' = 133, 'FILE' = 134, 'URL' = 135, 'REMOTE' = 136, 'MONGO' = 137, 'MEILISEARCH' = 138, 'MYSQL' = 139, 'POSTGRES' = 140, 'SQLITE' = 141, 'ODBC' = 142, 'JDBC' = 143, 'HDFS' = 144, 'S3' = 145, 'HIVE' = 146, 'SOURCES' = 147, 'CLUSTER' = 148, 'ALL' = 149, 'NONE' = 150), `database` Nullable(String), `table` Nullable(String), `column` Nullable(String), @@ -544,10 +545,10 @@ ENGINE = SystemPartsColumns COMMENT 'SYSTEM TABLE is built on the fly.' CREATE TABLE system.privileges ( - `privilege` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER TABLE' = 41, 'ALTER DATABASE' = 42, 'ALTER VIEW REFRESH' = 43, 'ALTER VIEW MODIFY QUERY' = 44, 'ALTER VIEW' = 45, 'ALTER' = 46, 'CREATE DATABASE' = 47, 'CREATE TABLE' = 48, 'CREATE VIEW' = 49, 'CREATE DICTIONARY' = 50, 'CREATE TEMPORARY TABLE' = 51, 'CREATE FUNCTION' = 52, 'CREATE' = 53, 'DROP DATABASE' = 54, 'DROP TABLE' = 55, 'DROP VIEW' = 56, 'DROP DICTIONARY' = 57, 'DROP FUNCTION' = 58, 'DROP' = 59, 'TRUNCATE' = 60, 'OPTIMIZE' = 61, 'BACKUP' = 62, 'KILL QUERY' = 63, 'KILL TRANSACTION' = 64, 'MOVE PARTITION BETWEEN SHARDS' = 65, 'CREATE USER' = 66, 'ALTER USER' = 67, 'DROP USER' = 68, 'CREATE ROLE' = 69, 'ALTER ROLE' = 70, 'DROP ROLE' = 71, 'ROLE ADMIN' = 72, 'CREATE ROW POLICY' = 73, 'ALTER ROW POLICY' = 74, 'DROP ROW POLICY' = 75, 'CREATE QUOTA' = 76, 'ALTER QUOTA' = 77, 'DROP QUOTA' = 78, 'CREATE SETTINGS PROFILE' = 79, 'ALTER SETTINGS PROFILE' = 80, 'DROP SETTINGS PROFILE' = 81, 'SHOW USERS' = 82, 'SHOW ROLES' = 83, 'SHOW ROW POLICIES' = 84, 'SHOW QUOTAS' = 85, 'SHOW SETTINGS PROFILES' = 86, 'SHOW ACCESS' = 87, 'ACCESS MANAGEMENT' = 88, 'SYSTEM SHUTDOWN' = 89, 'SYSTEM DROP DNS CACHE' = 90, 'SYSTEM DROP MARK CACHE' = 91, 'SYSTEM DROP UNCOMPRESSED CACHE' = 92, 'SYSTEM DROP MMAP CACHE' = 93, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 94, 'SYSTEM DROP FILESYSTEM CACHE' = 95, 'SYSTEM DROP SCHEMA CACHE' = 96, 'SYSTEM DROP CACHE' = 97, 'SYSTEM RELOAD CONFIG' = 98, 'SYSTEM RELOAD USERS' = 99, 'SYSTEM RELOAD SYMBOLS' = 100, 'SYSTEM RELOAD DICTIONARY' = 101, 'SYSTEM RELOAD MODEL' = 102, 'SYSTEM RELOAD FUNCTION' = 103, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 104, 'SYSTEM RELOAD' = 105, 'SYSTEM RESTART DISK' = 106, 'SYSTEM MERGES' = 107, 'SYSTEM TTL MERGES' = 108, 'SYSTEM FETCHES' = 109, 'SYSTEM MOVES' = 110, 'SYSTEM DISTRIBUTED SENDS' = 111, 'SYSTEM REPLICATED SENDS' = 112, 'SYSTEM SENDS' = 113, 'SYSTEM REPLICATION QUEUES' = 114, 'SYSTEM DROP 
REPLICA' = 115, 'SYSTEM SYNC REPLICA' = 116, 'SYSTEM RESTART REPLICA' = 117, 'SYSTEM RESTORE REPLICA' = 118, 'SYSTEM SYNC DATABASE REPLICA' = 119, 'SYSTEM SYNC TRANSACTION LOG' = 120, 'SYSTEM FLUSH DISTRIBUTED' = 121, 'SYSTEM FLUSH LOGS' = 122, 'SYSTEM FLUSH' = 123, 'SYSTEM THREAD FUZZER' = 124, 'SYSTEM UNFREEZE' = 125, 'SYSTEM' = 126, 'dictGet' = 127, 'addressToLine' = 128, 'addressToLineWithInlines' = 129, 'addressToSymbol' = 130, 'demangle' = 131, 'INTROSPECTION' = 132, 'FILE' = 133, 'URL' = 134, 'REMOTE' = 135, 'MONGO' = 136, 'MEILISEARCH' = 137, 'MYSQL' = 138, 'POSTGRES' = 139, 'SQLITE' = 140, 'ODBC' = 141, 'JDBC' = 142, 'HDFS' = 143, 'S3' = 144, 'HIVE' = 145, 'SOURCES' = 146, 'CLUSTER' = 147, 'ALL' = 148, 'NONE' = 149), + `privilege` Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER TABLE' = 41, 'ALTER DATABASE' = 42, 'ALTER VIEW REFRESH' = 43, 'ALTER VIEW MODIFY QUERY' = 44, 'ALTER VIEW' = 45, 'ALTER' = 46, 'CREATE DATABASE' = 47, 'CREATE TABLE' = 48, 'CREATE VIEW' = 49, 'CREATE DICTIONARY' = 50, 'CREATE TEMPORARY TABLE' = 51, 'CREATE FUNCTION' = 52, 'CREATE' = 53, 'DROP DATABASE' = 54, 'DROP TABLE' = 55, 'DROP VIEW' = 56, 'DROP DICTIONARY' = 57, 'DROP FUNCTION' = 58, 'DROP' = 59, 'TRUNCATE' = 60, 'OPTIMIZE' = 61, 'BACKUP' = 62, 'KILL QUERY' = 63, 'KILL TRANSACTION' = 64, 'MOVE PARTITION BETWEEN SHARDS' = 65, 'CREATE USER' = 66, 'ALTER USER' = 67, 'DROP USER' = 68, 'CREATE ROLE' = 69, 'ALTER ROLE' = 70, 'DROP ROLE' = 71, 'ROLE ADMIN' = 72, 'CREATE ROW POLICY' = 73, 'ALTER ROW POLICY' = 74, 'DROP ROW POLICY' = 75, 'CREATE QUOTA' = 76, 'ALTER QUOTA' = 77, 'DROP QUOTA' = 78, 'CREATE SETTINGS PROFILE' = 79, 'ALTER SETTINGS PROFILE' = 80, 'DROP SETTINGS PROFILE' = 81, 'SHOW USERS' = 82, 'SHOW ROLES' = 83, 'SHOW ROW POLICIES' = 84, 'SHOW QUOTAS' = 85, 'SHOW SETTINGS PROFILES' = 86, 'SHOW NAMED COLLECTIONS' = 87, 'SHOW ACCESS' = 88, 'ACCESS MANAGEMENT' = 89, 'SYSTEM SHUTDOWN' = 90, 'SYSTEM DROP DNS CACHE' = 91, 'SYSTEM DROP MARK CACHE' = 92, 'SYSTEM DROP UNCOMPRESSED CACHE' = 93, 'SYSTEM DROP MMAP CACHE' = 94, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 95, 'SYSTEM DROP FILESYSTEM CACHE' = 96, 'SYSTEM DROP SCHEMA CACHE' = 97, 'SYSTEM DROP CACHE' = 98, 'SYSTEM RELOAD CONFIG' = 99, 'SYSTEM RELOAD USERS' = 100, 'SYSTEM RELOAD SYMBOLS' = 101, 'SYSTEM RELOAD DICTIONARY' = 102, 'SYSTEM RELOAD MODEL' = 103, 'SYSTEM RELOAD FUNCTION' = 104, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 105, 'SYSTEM RELOAD' = 106, 'SYSTEM RESTART DISK' = 107, 'SYSTEM MERGES' = 108, 'SYSTEM TTL MERGES' = 109, 'SYSTEM FETCHES' = 110, 
'SYSTEM MOVES' = 111, 'SYSTEM DISTRIBUTED SENDS' = 112, 'SYSTEM REPLICATED SENDS' = 113, 'SYSTEM SENDS' = 114, 'SYSTEM REPLICATION QUEUES' = 115, 'SYSTEM DROP REPLICA' = 116, 'SYSTEM SYNC REPLICA' = 117, 'SYSTEM RESTART REPLICA' = 118, 'SYSTEM RESTORE REPLICA' = 119, 'SYSTEM SYNC DATABASE REPLICA' = 120, 'SYSTEM SYNC TRANSACTION LOG' = 121, 'SYSTEM FLUSH DISTRIBUTED' = 122, 'SYSTEM FLUSH LOGS' = 123, 'SYSTEM FLUSH' = 124, 'SYSTEM THREAD FUZZER' = 125, 'SYSTEM UNFREEZE' = 126, 'SYSTEM' = 127, 'dictGet' = 128, 'addressToLine' = 129, 'addressToLineWithInlines' = 130, 'addressToSymbol' = 131, 'demangle' = 132, 'INTROSPECTION' = 133, 'FILE' = 134, 'URL' = 135, 'REMOTE' = 136, 'MONGO' = 137, 'MEILISEARCH' = 138, 'MYSQL' = 139, 'POSTGRES' = 140, 'SQLITE' = 141, 'ODBC' = 142, 'JDBC' = 143, 'HDFS' = 144, 'S3' = 145, 'HIVE' = 146, 'SOURCES' = 147, 'CLUSTER' = 148, 'ALL' = 149, 'NONE' = 150), `aliases` Array(String), `level` Nullable(Enum8('GLOBAL' = 0, 'DATABASE' = 1, 'TABLE' = 2, 'DICTIONARY' = 3, 'VIEW' = 4, 'COLUMN' = 5)), - `parent_group` Nullable(Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER TABLE' = 41, 'ALTER DATABASE' = 42, 'ALTER VIEW REFRESH' = 43, 'ALTER VIEW MODIFY QUERY' = 44, 'ALTER VIEW' = 45, 'ALTER' = 46, 'CREATE DATABASE' = 47, 'CREATE TABLE' = 48, 'CREATE VIEW' = 49, 'CREATE DICTIONARY' = 50, 'CREATE TEMPORARY TABLE' = 51, 'CREATE FUNCTION' = 52, 'CREATE' = 53, 'DROP DATABASE' = 54, 'DROP TABLE' = 55, 'DROP VIEW' = 56, 'DROP DICTIONARY' = 57, 'DROP FUNCTION' = 58, 'DROP' = 59, 'TRUNCATE' = 60, 'OPTIMIZE' = 61, 'BACKUP' = 62, 'KILL QUERY' = 63, 'KILL TRANSACTION' = 64, 'MOVE PARTITION BETWEEN SHARDS' = 65, 'CREATE USER' = 66, 'ALTER USER' = 67, 'DROP USER' = 68, 'CREATE ROLE' = 69, 'ALTER ROLE' = 70, 'DROP ROLE' = 71, 'ROLE ADMIN' = 72, 'CREATE ROW POLICY' = 73, 'ALTER ROW POLICY' = 74, 'DROP ROW POLICY' = 75, 'CREATE QUOTA' = 76, 'ALTER QUOTA' = 77, 'DROP QUOTA' = 78, 'CREATE SETTINGS PROFILE' = 79, 'ALTER SETTINGS PROFILE' = 80, 'DROP SETTINGS PROFILE' = 81, 'SHOW USERS' = 82, 'SHOW ROLES' = 83, 'SHOW ROW POLICIES' = 84, 'SHOW QUOTAS' = 85, 'SHOW SETTINGS PROFILES' = 86, 'SHOW ACCESS' = 87, 'ACCESS MANAGEMENT' = 88, 'SYSTEM SHUTDOWN' = 89, 'SYSTEM DROP DNS CACHE' = 90, 'SYSTEM DROP MARK CACHE' = 91, 'SYSTEM DROP UNCOMPRESSED CACHE' = 92, 'SYSTEM DROP MMAP CACHE' = 93, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 94, 'SYSTEM DROP FILESYSTEM CACHE' = 95, 'SYSTEM DROP SCHEMA CACHE' = 96, 'SYSTEM DROP CACHE' = 97, 'SYSTEM RELOAD CONFIG' = 98, 'SYSTEM RELOAD USERS' = 99, 'SYSTEM RELOAD SYMBOLS' 
= 100, 'SYSTEM RELOAD DICTIONARY' = 101, 'SYSTEM RELOAD MODEL' = 102, 'SYSTEM RELOAD FUNCTION' = 103, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 104, 'SYSTEM RELOAD' = 105, 'SYSTEM RESTART DISK' = 106, 'SYSTEM MERGES' = 107, 'SYSTEM TTL MERGES' = 108, 'SYSTEM FETCHES' = 109, 'SYSTEM MOVES' = 110, 'SYSTEM DISTRIBUTED SENDS' = 111, 'SYSTEM REPLICATED SENDS' = 112, 'SYSTEM SENDS' = 113, 'SYSTEM REPLICATION QUEUES' = 114, 'SYSTEM DROP REPLICA' = 115, 'SYSTEM SYNC REPLICA' = 116, 'SYSTEM RESTART REPLICA' = 117, 'SYSTEM RESTORE REPLICA' = 118, 'SYSTEM SYNC DATABASE REPLICA' = 119, 'SYSTEM SYNC TRANSACTION LOG' = 120, 'SYSTEM FLUSH DISTRIBUTED' = 121, 'SYSTEM FLUSH LOGS' = 122, 'SYSTEM FLUSH' = 123, 'SYSTEM THREAD FUZZER' = 124, 'SYSTEM UNFREEZE' = 125, 'SYSTEM' = 126, 'dictGet' = 127, 'addressToLine' = 128, 'addressToLineWithInlines' = 129, 'addressToSymbol' = 130, 'demangle' = 131, 'INTROSPECTION' = 132, 'FILE' = 133, 'URL' = 134, 'REMOTE' = 135, 'MONGO' = 136, 'MEILISEARCH' = 137, 'MYSQL' = 138, 'POSTGRES' = 139, 'SQLITE' = 140, 'ODBC' = 141, 'JDBC' = 142, 'HDFS' = 143, 'S3' = 144, 'HIVE' = 145, 'SOURCES' = 146, 'CLUSTER' = 147, 'ALL' = 148, 'NONE' = 149)) + `parent_group` Nullable(Enum16('SHOW DATABASES' = 0, 'SHOW TABLES' = 1, 'SHOW COLUMNS' = 2, 'SHOW DICTIONARIES' = 3, 'SHOW' = 4, 'SHOW FILESYSTEM CACHES' = 5, 'SELECT' = 6, 'INSERT' = 7, 'ALTER UPDATE' = 8, 'ALTER DELETE' = 9, 'ALTER ADD COLUMN' = 10, 'ALTER MODIFY COLUMN' = 11, 'ALTER DROP COLUMN' = 12, 'ALTER COMMENT COLUMN' = 13, 'ALTER CLEAR COLUMN' = 14, 'ALTER RENAME COLUMN' = 15, 'ALTER MATERIALIZE COLUMN' = 16, 'ALTER COLUMN' = 17, 'ALTER MODIFY COMMENT' = 18, 'ALTER ORDER BY' = 19, 'ALTER SAMPLE BY' = 20, 'ALTER ADD INDEX' = 21, 'ALTER DROP INDEX' = 22, 'ALTER MATERIALIZE INDEX' = 23, 'ALTER CLEAR INDEX' = 24, 'ALTER INDEX' = 25, 'ALTER ADD PROJECTION' = 26, 'ALTER DROP PROJECTION' = 27, 'ALTER MATERIALIZE PROJECTION' = 28, 'ALTER CLEAR PROJECTION' = 29, 'ALTER PROJECTION' = 30, 'ALTER ADD CONSTRAINT' = 31, 'ALTER DROP CONSTRAINT' = 32, 'ALTER CONSTRAINT' = 33, 'ALTER TTL' = 34, 'ALTER MATERIALIZE TTL' = 35, 'ALTER SETTINGS' = 36, 'ALTER MOVE PARTITION' = 37, 'ALTER FETCH PARTITION' = 38, 'ALTER FREEZE PARTITION' = 39, 'ALTER DATABASE SETTINGS' = 40, 'ALTER TABLE' = 41, 'ALTER DATABASE' = 42, 'ALTER VIEW REFRESH' = 43, 'ALTER VIEW MODIFY QUERY' = 44, 'ALTER VIEW' = 45, 'ALTER' = 46, 'CREATE DATABASE' = 47, 'CREATE TABLE' = 48, 'CREATE VIEW' = 49, 'CREATE DICTIONARY' = 50, 'CREATE TEMPORARY TABLE' = 51, 'CREATE FUNCTION' = 52, 'CREATE' = 53, 'DROP DATABASE' = 54, 'DROP TABLE' = 55, 'DROP VIEW' = 56, 'DROP DICTIONARY' = 57, 'DROP FUNCTION' = 58, 'DROP' = 59, 'TRUNCATE' = 60, 'OPTIMIZE' = 61, 'BACKUP' = 62, 'KILL QUERY' = 63, 'KILL TRANSACTION' = 64, 'MOVE PARTITION BETWEEN SHARDS' = 65, 'CREATE USER' = 66, 'ALTER USER' = 67, 'DROP USER' = 68, 'CREATE ROLE' = 69, 'ALTER ROLE' = 70, 'DROP ROLE' = 71, 'ROLE ADMIN' = 72, 'CREATE ROW POLICY' = 73, 'ALTER ROW POLICY' = 74, 'DROP ROW POLICY' = 75, 'CREATE QUOTA' = 76, 'ALTER QUOTA' = 77, 'DROP QUOTA' = 78, 'CREATE SETTINGS PROFILE' = 79, 'ALTER SETTINGS PROFILE' = 80, 'DROP SETTINGS PROFILE' = 81, 'SHOW USERS' = 82, 'SHOW ROLES' = 83, 'SHOW ROW POLICIES' = 84, 'SHOW QUOTAS' = 85, 'SHOW SETTINGS PROFILES' = 86, 'SHOW NAMED COLLECTIONS' = 87, 'SHOW ACCESS' = 88, 'ACCESS MANAGEMENT' = 89, 'SYSTEM SHUTDOWN' = 90, 'SYSTEM DROP DNS CACHE' = 91, 'SYSTEM DROP MARK CACHE' = 92, 'SYSTEM DROP UNCOMPRESSED CACHE' = 93, 'SYSTEM DROP MMAP CACHE' = 94, 'SYSTEM DROP COMPILED EXPRESSION CACHE' = 95, 'SYSTEM 
DROP FILESYSTEM CACHE' = 96, 'SYSTEM DROP SCHEMA CACHE' = 97, 'SYSTEM DROP CACHE' = 98, 'SYSTEM RELOAD CONFIG' = 99, 'SYSTEM RELOAD USERS' = 100, 'SYSTEM RELOAD SYMBOLS' = 101, 'SYSTEM RELOAD DICTIONARY' = 102, 'SYSTEM RELOAD MODEL' = 103, 'SYSTEM RELOAD FUNCTION' = 104, 'SYSTEM RELOAD EMBEDDED DICTIONARIES' = 105, 'SYSTEM RELOAD' = 106, 'SYSTEM RESTART DISK' = 107, 'SYSTEM MERGES' = 108, 'SYSTEM TTL MERGES' = 109, 'SYSTEM FETCHES' = 110, 'SYSTEM MOVES' = 111, 'SYSTEM DISTRIBUTED SENDS' = 112, 'SYSTEM REPLICATED SENDS' = 113, 'SYSTEM SENDS' = 114, 'SYSTEM REPLICATION QUEUES' = 115, 'SYSTEM DROP REPLICA' = 116, 'SYSTEM SYNC REPLICA' = 117, 'SYSTEM RESTART REPLICA' = 118, 'SYSTEM RESTORE REPLICA' = 119, 'SYSTEM SYNC DATABASE REPLICA' = 120, 'SYSTEM SYNC TRANSACTION LOG' = 121, 'SYSTEM FLUSH DISTRIBUTED' = 122, 'SYSTEM FLUSH LOGS' = 123, 'SYSTEM FLUSH' = 124, 'SYSTEM THREAD FUZZER' = 125, 'SYSTEM UNFREEZE' = 126, 'SYSTEM' = 127, 'dictGet' = 128, 'addressToLine' = 129, 'addressToLineWithInlines' = 130, 'addressToSymbol' = 131, 'demangle' = 132, 'INTROSPECTION' = 133, 'FILE' = 134, 'URL' = 135, 'REMOTE' = 136, 'MONGO' = 137, 'MEILISEARCH' = 138, 'MYSQL' = 139, 'POSTGRES' = 140, 'SQLITE' = 141, 'ODBC' = 142, 'JDBC' = 143, 'HDFS' = 144, 'S3' = 145, 'HIVE' = 146, 'SOURCES' = 147, 'CLUSTER' = 148, 'ALL' = 149, 'NONE' = 150)) ) ENGINE = SystemPrivileges COMMENT 'SYSTEM TABLE is built on the fly.' diff --git a/tests/queries/0_stateless/02206_information_schema_show_database.reference b/tests/queries/0_stateless/02206_information_schema_show_database.reference index af437aca989..821fddbb933 100644 --- a/tests/queries/0_stateless/02206_information_schema_show_database.reference +++ b/tests/queries/0_stateless/02206_information_schema_show_database.reference @@ -1,4 +1,4 @@ CREATE DATABASE INFORMATION_SCHEMA\nENGINE = Memory -CREATE VIEW INFORMATION_SCHEMA.COLUMNS\n(\n `table_catalog` String,\n `table_schema` String,\n `table_name` String,\n `column_name` String,\n `ordinal_position` UInt64,\n `column_default` String,\n `is_nullable` UInt8,\n `data_type` String,\n `character_maximum_length` Nullable(UInt64),\n `character_octet_length` Nullable(UInt64),\n `numeric_precision` Nullable(UInt64),\n `numeric_precision_radix` Nullable(UInt64),\n `numeric_scale` Nullable(UInt64),\n `datetime_precision` Nullable(UInt64),\n `character_set_catalog` Nullable(String),\n `character_set_schema` Nullable(String),\n `character_set_name` Nullable(String),\n `collation_catalog` Nullable(String),\n `collation_schema` Nullable(String),\n `collation_name` Nullable(String),\n `domain_catalog` Nullable(String),\n `domain_schema` Nullable(String),\n `domain_name` Nullable(String),\n `column_comment` String,\n `column_type` String,\n `TABLE_CATALOG` String ALIAS table_catalog,\n `TABLE_SCHEMA` String ALIAS table_schema,\n `TABLE_NAME` String ALIAS table_name,\n `COLUMN_NAME` String ALIAS column_name,\n `ORDINAL_POSITION` UInt64 ALIAS ordinal_position,\n `COLUMN_DEFAULT` String ALIAS column_default,\n `IS_NULLABLE` UInt8 ALIAS is_nullable,\n `DATA_TYPE` String ALIAS data_type,\n `CHARACTER_MAXIMUM_LENGTH` Nullable(UInt64) ALIAS character_maximum_length,\n `CHARACTER_OCTET_LENGTH` Nullable(UInt64) ALIAS character_octet_length,\n `NUMERIC_PRECISION` Nullable(UInt64) ALIAS numeric_precision,\n `NUMERIC_PRECISION_RADIX` Nullable(UInt64) ALIAS numeric_precision_radix,\n `NUMERIC_SCALE` Nullable(UInt64) ALIAS numeric_scale,\n `DATETIME_PRECISION` Nullable(UInt64) ALIAS datetime_precision,\n `CHARACTER_SET_CATALOG` Nullable(String) 
ALIAS character_set_catalog,\n `CHARACTER_SET_SCHEMA` Nullable(String) ALIAS character_set_schema,\n `CHARACTER_SET_NAME` Nullable(String) ALIAS character_set_name,\n `COLLATION_CATALOG` Nullable(String) ALIAS collation_catalog,\n `COLLATION_SCHEMA` Nullable(String) ALIAS collation_schema,\n `COLLATION_NAME` Nullable(String) ALIAS collation_name,\n `DOMAIN_CATALOG` Nullable(String) ALIAS domain_catalog,\n `DOMAIN_SCHEMA` Nullable(String) ALIAS domain_schema,\n `DOMAIN_NAME` Nullable(String) ALIAS domain_name,\n `COLUMN_COMMENT` String ALIAS column_comment,\n `COLUMN_TYPE` String ALIAS column_type\n) AS\nSELECT\n database AS table_catalog,\n database AS table_schema,\n table AS table_name,\n name AS column_name,\n position AS ordinal_position,\n default_expression AS column_default,\n type LIKE \'Nullable(%)\' AS is_nullable,\n type AS data_type,\n character_octet_length AS character_maximum_length,\n character_octet_length,\n numeric_precision,\n numeric_precision_radix,\n numeric_scale,\n datetime_precision,\n NULL AS character_set_catalog,\n NULL AS character_set_schema,\n NULL AS character_set_name,\n NULL AS collation_catalog,\n NULL AS collation_schema,\n NULL AS collation_name,\n NULL AS domain_catalog,\n NULL AS domain_schema,\n NULL AS domain_name,\n comment AS column_comment,\n type AS column_type\nFROM system.columns +CREATE VIEW INFORMATION_SCHEMA.COLUMNS\n(\n `table_catalog` String,\n `table_schema` String,\n `table_name` String,\n `TABLE_SCHEMA` String,\n `TABLE_NAME` String,\n `column_name` String,\n `ordinal_position` UInt64,\n `column_default` String,\n `is_nullable` String,\n `data_type` String,\n `character_maximum_length` Nullable(UInt64),\n `character_octet_length` Nullable(UInt64),\n `numeric_precision` Nullable(UInt64),\n `numeric_precision_radix` Nullable(UInt64),\n `numeric_scale` Nullable(UInt64),\n `datetime_precision` Nullable(UInt64),\n `character_set_catalog` Nullable(String),\n `character_set_schema` Nullable(String),\n `character_set_name` Nullable(String),\n `collation_catalog` Nullable(String),\n `collation_schema` Nullable(String),\n `collation_name` Nullable(String),\n `domain_catalog` Nullable(String),\n `domain_schema` Nullable(String),\n `domain_name` Nullable(String),\n `column_comment` String,\n `column_type` String,\n `TABLE_CATALOG` String ALIAS table_catalog,\n `COLUMN_NAME` String ALIAS column_name,\n `ORDINAL_POSITION` UInt64 ALIAS ordinal_position,\n `COLUMN_DEFAULT` String ALIAS column_default,\n `IS_NULLABLE` String ALIAS is_nullable,\n `DATA_TYPE` String ALIAS data_type,\n `CHARACTER_MAXIMUM_LENGTH` Nullable(UInt64) ALIAS character_maximum_length,\n `CHARACTER_OCTET_LENGTH` Nullable(UInt64) ALIAS character_octet_length,\n `NUMERIC_PRECISION` Nullable(UInt64) ALIAS numeric_precision,\n `NUMERIC_PRECISION_RADIX` Nullable(UInt64) ALIAS numeric_precision_radix,\n `NUMERIC_SCALE` Nullable(UInt64) ALIAS numeric_scale,\n `DATETIME_PRECISION` Nullable(UInt64) ALIAS datetime_precision,\n `CHARACTER_SET_CATALOG` Nullable(String) ALIAS character_set_catalog,\n `CHARACTER_SET_SCHEMA` Nullable(String) ALIAS character_set_schema,\n `CHARACTER_SET_NAME` Nullable(String) ALIAS character_set_name,\n `COLLATION_CATALOG` Nullable(String) ALIAS collation_catalog,\n `COLLATION_SCHEMA` Nullable(String) ALIAS collation_schema,\n `COLLATION_NAME` Nullable(String) ALIAS collation_name,\n `DOMAIN_CATALOG` Nullable(String) ALIAS domain_catalog,\n `DOMAIN_SCHEMA` Nullable(String) ALIAS domain_schema,\n `DOMAIN_NAME` Nullable(String) ALIAS domain_name,\n 
`COLUMN_COMMENT` String ALIAS column_comment,\n `COLUMN_TYPE` String ALIAS column_type\n) AS\nSELECT\n database AS table_catalog,\n database AS table_schema,\n database AS TABLE_SCHEMA,\n table AS table_name,\n table AS TABLE_NAME,\n name AS column_name,\n position AS ordinal_position,\n default_expression AS column_default,\n type LIKE \'Nullable(%)\' AS is_nullable,\n type AS data_type,\n character_octet_length AS character_maximum_length,\n character_octet_length,\n numeric_precision,\n numeric_precision_radix,\n numeric_scale,\n datetime_precision,\n NULL AS character_set_catalog,\n NULL AS character_set_schema,\n NULL AS character_set_name,\n NULL AS collation_catalog,\n NULL AS collation_schema,\n NULL AS collation_name,\n NULL AS domain_catalog,\n NULL AS domain_schema,\n NULL AS domain_name,\n comment AS column_comment,\n type AS column_type\nFROM system.columns CREATE VIEW INFORMATION_SCHEMA.TABLES (`table_catalog` String, `table_schema` String, `table_name` String, `table_type` Enum8(\'BASE TABLE\' = 1, \'VIEW\' = 2, \'FOREIGN TABLE\' = 3, \'LOCAL TEMPORARY\' = 4, \'SYSTEM VIEW\' = 5), `TABLE_CATALOG` String ALIAS table_catalog, `TABLE_SCHEMA` String ALIAS table_schema, `TABLE_NAME` String ALIAS table_name, `TABLE_TYPE` Enum8(\'BASE TABLE\' = 1, \'VIEW\' = 2, \'FOREIGN TABLE\' = 3, \'LOCAL TEMPORARY\' = 4, \'SYSTEM VIEW\' = 5) ALIAS table_type) AS SELECT database AS table_catalog, database AS table_schema, name AS table_name, multiIf(is_temporary, 4, engine LIKE \'%View\', 2, engine LIKE \'System%\', 5, has_own_data = 0, 3, 1) AS table_type FROM system.tables CREATE VIEW information_schema.tables (`table_catalog` String, `table_schema` String, `table_name` String, `table_type` Enum8(\'BASE TABLE\' = 1, \'VIEW\' = 2, \'FOREIGN TABLE\' = 3, \'LOCAL TEMPORARY\' = 4, \'SYSTEM VIEW\' = 5), `TABLE_CATALOG` String ALIAS table_catalog, `TABLE_SCHEMA` String ALIAS table_schema, `TABLE_NAME` String ALIAS table_name, `TABLE_TYPE` Enum8(\'BASE TABLE\' = 1, \'VIEW\' = 2, \'FOREIGN TABLE\' = 3, \'LOCAL TEMPORARY\' = 4, \'SYSTEM VIEW\' = 5) ALIAS table_type) AS SELECT database AS table_catalog, database AS table_schema, name AS table_name, multiIf(is_temporary, 4, engine LIKE \'%View\', 2, engine LIKE \'System%\', 5, has_own_data = 0, 3, 1) AS table_type FROM system.tables diff --git a/tests/queries/0_stateless/02310_clickhouse_client_INSERT_progress_profile_events.expect b/tests/queries/0_stateless/02310_clickhouse_client_INSERT_progress_profile_events.expect index 20333ae7960..07815e57610 100755 --- a/tests/queries/0_stateless/02310_clickhouse_client_INSERT_progress_profile_events.expect +++ b/tests/queries/0_stateless/02310_clickhouse_client_INSERT_progress_profile_events.expect @@ -24,7 +24,7 @@ expect_after { spawn bash send "source $basedir/../shell_config.sh\r" -send "yes | head -n10000000 | \$CLICKHOUSE_CLIENT --query \"insert into function null('foo String') format TSV\" >/dev/null\r" +send "yes | head -n10000000 | \$CLICKHOUSE_CLIENT --progress --query \"insert into function null('foo String') format TSV\" >/dev/null\r" expect "Progress: " send "\3" diff --git a/tests/queries/0_stateless/02310_clickhouse_local_INSERT_progress_profile_events.expect b/tests/queries/0_stateless/02310_clickhouse_local_INSERT_progress_profile_events.expect index 5c95e17aefd..3333ee93468 100755 --- a/tests/queries/0_stateless/02310_clickhouse_local_INSERT_progress_profile_events.expect +++ b/tests/queries/0_stateless/02310_clickhouse_local_INSERT_progress_profile_events.expect @@ -24,7 +24,7 @@ expect_after 
{ spawn bash send "source $basedir/../shell_config.sh\r" -send "yes | head -n10000000 | \$CLICKHOUSE_LOCAL --query \"insert into function null('foo String') format TSV\" >/dev/null\r" +send "yes | head -n10000000 | \$CLICKHOUSE_LOCAL --progress --query \"insert into function null('foo String') format TSV\" >/dev/null\r" expect "Progress: " send "\3" diff --git a/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.sql b/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.sql index 5d43ec6f0c2..a73993f6a5a 100644 --- a/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.sql +++ b/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.sql @@ -2,5 +2,5 @@ -- Please help shorten this list down to zero elements. SELECT name FROM system.table_functions WHERE length(description) < 10 AND name NOT IN ( - 'cosn', 'hdfs', 'hdfsCluster', 'hive', 'mysql', 'postgresql', 's3', 's3Cluster', 'sqlite' -- these functions are not enabled in fast test + 'cosn', 'oss', 'hdfs', 'hdfsCluster', 'hive', 'mysql', 'postgresql', 's3', 's3Cluster', 'sqlite' -- these functions are not enabled in fast test ) ORDER BY name; diff --git a/tests/queries/0_stateless/02448_clone_replica_lost_part.sql b/tests/queries/0_stateless/02448_clone_replica_lost_part.sql index 0db816332a1..67a329ee1f0 100644 --- a/tests/queries/0_stateless/02448_clone_replica_lost_part.sql +++ b/tests/queries/0_stateless/02448_clone_replica_lost_part.sql @@ -5,9 +5,9 @@ SET insert_keeper_fault_injection_probability=0; -- disable fault injection; par drop table if exists rmt1; drop table if exists rmt2; create table rmt1 (n int) engine=ReplicatedMergeTree('/test/02448/{database}/rmt', '1') order by tuple() - settings min_replicated_logs_to_keep=1, max_replicated_logs_to_keep=2, cleanup_delay_period=0, cleanup_delay_period_random_add=1, old_parts_lifetime=0, max_parts_to_merge_at_once=5; + settings min_replicated_logs_to_keep=1, max_replicated_logs_to_keep=2, cleanup_delay_period=0, cleanup_delay_period_random_add=1, old_parts_lifetime=0, max_parts_to_merge_at_once=4; create table rmt2 (n int) engine=ReplicatedMergeTree('/test/02448/{database}/rmt', '2') order by tuple() - settings min_replicated_logs_to_keep=1, max_replicated_logs_to_keep=2, cleanup_delay_period=0, cleanup_delay_period_random_add=1, old_parts_lifetime=0, max_parts_to_merge_at_once=5; + settings min_replicated_logs_to_keep=1, max_replicated_logs_to_keep=2, cleanup_delay_period=0, cleanup_delay_period_random_add=1, old_parts_lifetime=0, max_parts_to_merge_at_once=4; -- insert part only on one replica system stop replicated sends rmt1; diff --git a/tests/queries/0_stateless/02449_check_dependencies_and_table_shutdown.reference b/tests/queries/0_stateless/02449_check_dependencies_and_table_shutdown.reference index 0ecea03c64e..37cd3d93e39 100644 --- a/tests/queries/0_stateless/02449_check_dependencies_and_table_shutdown.reference +++ b/tests/queries/0_stateless/02449_check_dependencies_and_table_shutdown.reference @@ -1,4 +1,4 @@ -CREATE DICTIONARY default.dict\n(\n `id` UInt32,\n `value` String\n)\nPRIMARY KEY id\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' DB \'default\' TABLE \'view\'))\nLIFETIME(MIN 0 MAX 600)\nLAYOUT(HASHED()) +CREATE DICTIONARY default.dict\n(\n `id` UInt32,\n `value` String\n)\nPRIMARY KEY id\nSOURCE(CLICKHOUSE(HOST \'localhost\' PORT 9000 USER \'default\' DB \'default\' TABLE \'view\'))\nLIFETIME(MIN 600 MAX 600)\nLAYOUT(HASHED()) CREATE TABLE 
default.table\n(\n `col` String MATERIALIZED dictGet(\'default.dict\', \'value\', toUInt32(1))\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 1 v 1 v diff --git a/tests/queries/0_stateless/02449_check_dependencies_and_table_shutdown.sql b/tests/queries/0_stateless/02449_check_dependencies_and_table_shutdown.sql index 49e1e36acc9..c586db447c0 100644 --- a/tests/queries/0_stateless/02449_check_dependencies_and_table_shutdown.sql +++ b/tests/queries/0_stateless/02449_check_dependencies_and_table_shutdown.sql @@ -8,7 +8,7 @@ INSERT INTO view VALUES (1, 'v'); CREATE DICTIONARY dict (id UInt32, value String) PRIMARY KEY id SOURCE(CLICKHOUSE(host 'localhost' port tcpPort() user 'default' db currentDatabase() table 'view')) -LAYOUT (HASHED()) LIFETIME (600); +LAYOUT (HASHED()) LIFETIME (MIN 600 MAX 600); SHOW CREATE dict; diff --git a/tests/queries/0_stateless/02456_datetime_schema_inference.reference b/tests/queries/0_stateless/02456_datetime_schema_inference.reference index 2b88f2783dc..0719f0eb44e 100644 --- a/tests/queries/0_stateless/02456_datetime_schema_inference.reference +++ b/tests/queries/0_stateless/02456_datetime_schema_inference.reference @@ -9,3 +9,4 @@ 2022-04-22 03:45:06.381000000 2022-04-22 03:45:06.381000000 1925-01-12 00:00:00.000000000 + diff --git a/tests/queries/0_stateless/02456_datetime_schema_inference.sql b/tests/queries/0_stateless/02456_datetime_schema_inference.sql index 34749dbd412..123bb324f87 100644 --- a/tests/queries/0_stateless/02456_datetime_schema_inference.sql +++ b/tests/queries/0_stateless/02456_datetime_schema_inference.sql @@ -12,4 +12,4 @@ set date_time_input_format = 'best_effort_us'; select * from format('TSV', '2022-04-22T03:45:06.381'); select toTimeZone(c1, 'UTC') from format('TSV', '2022-04-22T03:45:06.381Z'); select * from format('TSV', '01/12/1925'); - +select * from format(CSV, '""'); diff --git a/tests/queries/0_stateless/02456_progress_tty.expect b/tests/queries/0_stateless/02456_progress_tty.expect index 9daa6caa3fa..ba6cc0537eb 100755 --- a/tests/queries/0_stateless/02456_progress_tty.expect +++ b/tests/queries/0_stateless/02456_progress_tty.expect @@ -17,19 +17,18 @@ expect_after { spawn bash send "source $basedir/../shell_config.sh\r" -# Progress is displayed by default -send "\$CLICKHOUSE_LOCAL --query 'SELECT sum(sleep(1) = 0) FROM numbers(3) SETTINGS max_block_size = 1' >/dev/null\r" -expect "Progress: " -expect "█" -send "\3" - -# It is true even if we redirect both stdout and stderr to /dev/null -send "\$CLICKHOUSE_LOCAL --query 'SELECT sum(sleep(1) = 0) FROM numbers(3) SETTINGS max_block_size = 1' >/dev/null 2>&1\r" -expect "Progress: " -expect "█" +# Progress is not displayed by default +send "\$CLICKHOUSE_LOCAL --query 'SELECT sleep(1), \$\$Hello\$\$ FROM numbers(3) SETTINGS max_block_size = 1' 2>/dev/null\r" +expect -exact "0\tHello\r\n" send "\3" # The option --progress has implicit value of true +send "\$CLICKHOUSE_LOCAL --progress --query 'SELECT sum(sleep(1) = 0) FROM numbers(3) SETTINGS max_block_size = 1' >/dev/null\r" +expect "Progress: " +expect "█" +send "\3" + +# It works even if we redirect both stdout and stderr to /dev/null send "\$CLICKHOUSE_LOCAL --progress --query 'SELECT sum(sleep(1) = 0) FROM numbers(3) SETTINGS max_block_size = 1' >/dev/null 2>&1\r" expect "Progress: " expect "█" diff --git a/tests/queries/0_stateless/02458_use_structure_from_insertion_table.reference b/tests/queries/0_stateless/02458_use_structure_from_insertion_table.reference new file mode 100644 index 
00000000000..0ca28640270 --- /dev/null +++ b/tests/queries/0_stateless/02458_use_structure_from_insertion_table.reference @@ -0,0 +1,9 @@ +\N 1 +1 2 +\N 42 +\N 42 +\N 42 +\N 42 +\N +\N +\N diff --git a/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql b/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql new file mode 100644 index 00000000000..a609dc361fe --- /dev/null +++ b/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql @@ -0,0 +1,39 @@ +-- Tags: no-parallel, no-fasttest + +insert into function file(02458_data.jsonl) select NULL as x, 42 as y settings engine_file_truncate_on_insert=1; +insert into function file(02458_data.jsoncompacteachrow) select NULL as x, 42 as y settings engine_file_truncate_on_insert=1; +drop table if exists test; +create table test (x Nullable(UInt32), y UInt32) engine=Memory(); + +set use_structure_from_insertion_table_in_table_functions=2; +insert into test select * from file(02458_data.jsonl); +insert into test select x, 1 from file(02458_data.jsonl); +insert into test select x, y from file(02458_data.jsonl); +insert into test select x + 1, y from file(02458_data.jsonl); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +insert into test select x, z from file(02458_data.jsonl); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} + +insert into test select * from file(02458_data.jsoncompacteachrow); +insert into test select x, 1 from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +insert into test select x, y from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +insert into test select x + 1, y from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +insert into test select x, z from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} + +insert into test select * from input() format CSV 1,2 +insert into test select x, y from input() format CSV 1,2 -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +insert into test select x, y from input() format JSONEachRow {"x" : null, "y" : 42} + +select * from test order by y; + +drop table test; +create table test (x Nullable(UInt32)) engine=Memory(); +insert into test select * from file(02458_data.jsonl); +insert into test select x from file(02458_data.jsonl); +insert into test select y from file(02458_data.jsonl); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +insert into test select y as x from file(02458_data.jsonl); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} + +insert into test select c1 from input() format CSV 1,2; -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +insert into test select x from input() format JSONEachRow {"x" : null, "y" : 42} + +select * from test order by x; + +drop table test; diff --git a/tests/queries/0_stateless/02475_date_time_schema_inference_bug.reference b/tests/queries/0_stateless/02475_date_time_schema_inference_bug.reference new file mode 100644 index 00000000000..8b137891791 --- /dev/null +++ b/tests/queries/0_stateless/02475_date_time_schema_inference_bug.reference @@ -0,0 +1 @@ + diff --git a/tests/queries/0_stateless/02475_date_time_schema_inference_bug.sql b/tests/queries/0_stateless/02475_date_time_schema_inference_bug.sql new file mode 100644 index 00000000000..1aea4a8024c --- /dev/null +++ b/tests/queries/0_stateless/02475_date_time_schema_inference_bug.sql @@ -0,0 +1 @@ +select * from format(CSV, '""'); diff --git a/tests/queries/0_stateless/02475_positive_modulo.reference 
b/tests/queries/0_stateless/02475_positive_modulo.reference
new file mode 100644
index 00000000000..1e707fe2cc8
--- /dev/null
+++ b/tests/queries/0_stateless/02475_positive_modulo.reference
@@ -0,0 +1,4 @@
+8
+8
+24
+24
diff --git a/tests/queries/0_stateless/02475_positive_modulo.sql b/tests/queries/0_stateless/02475_positive_modulo.sql
new file mode 100644
index 00000000000..5f1fdad3c1a
--- /dev/null
+++ b/tests/queries/0_stateless/02475_positive_modulo.sql
@@ -0,0 +1,4 @@
+SELECT positive_modulo(1000, 32);
+SELECT positive_modulo(1000, -32);
+SELECT positive_modulo(-1000, -32);
+SELECT positive_modulo(-1000, 32);
\ No newline at end of file
diff --git a/tests/queries/0_stateless/02476_query_parameters_insert.reference b/tests/queries/0_stateless/02476_query_parameters_insert.reference
new file mode 100644
index 00000000000..d00491fd7e5
--- /dev/null
+++ b/tests/queries/0_stateless/02476_query_parameters_insert.reference
@@ -0,0 +1 @@
+1
diff --git a/tests/queries/0_stateless/02476_query_parameters_insert.sql b/tests/queries/0_stateless/02476_query_parameters_insert.sql
new file mode 100644
index 00000000000..de866ccbc4d
--- /dev/null
+++ b/tests/queries/0_stateless/02476_query_parameters_insert.sql
@@ -0,0 +1,8 @@
+DROP TABLE IF EXISTS 02476_query_parameters_insert;
+CREATE TABLE 02476_query_parameters_insert (x Int32) ENGINE=MergeTree() ORDER BY tuple();
+
+SET param_x = 1;
+INSERT INTO 02476_query_parameters_insert VALUES ({x: Int32});
+SELECT * FROM 02476_query_parameters_insert;
+
+DROP TABLE 02476_query_parameters_insert;
diff --git a/tests/queries/0_stateless/02477_invalid_reads.reference b/tests/queries/0_stateless/02477_invalid_reads.reference
new file mode 100644
index 00000000000..a04038dea65
--- /dev/null
+++ b/tests/queries/0_stateless/02477_invalid_reads.reference
@@ -0,0 +1 @@
+ubsan 30313233343536373839303132333435363738393031323334353637383930313233343536373839303132333435363738393031323334353637383930313233
diff --git a/tests/queries/0_stateless/02477_invalid_reads.sql b/tests/queries/0_stateless/02477_invalid_reads.sql
new file mode 100644
index 00000000000..08748af3378
--- /dev/null
+++ b/tests/queries/0_stateless/02477_invalid_reads.sql
@@ -0,0 +1,61 @@
+-- MIN, MAX AND FAMILY should check for errors in its input
+SELECT finalizeAggregation(CAST(unhex('0F00000030'), 'AggregateFunction(min, String)')); -- { serverError 33 }
+SELECT finalizeAggregation(CAST(unhex('FFFF000030'), 'AggregateFunction(min, String)')); -- { serverError 33 }
+
+-- UBSAN
+SELECT 'ubsan', hex(finalizeAggregation(CAST(unhex('4000000030313233343536373839303132333435363738393031323334353637383930313233343536373839303132333435363738393031323334353637383930313233010000000000000000'),
+ 'AggregateFunction(argMax, String, UInt64)')));
+
+-- aggThrow should check for errors in its input
+SELECT finalizeAggregation(CAST('', 'AggregateFunction(aggThrow(0.), UInt8)')); -- { serverError 32 }
+
+-- categoricalInformationValue should check for errors in its input
+SELECT finalizeAggregation(CAST(unhex('01000000000000000100000000000000'),
+ 'AggregateFunction(categoricalInformationValue, UInt8, UInt8)')); -- { serverError 33 }
+SELECT finalizeAggregation(CAST(unhex('0101000000000000000100000000000000020000000000000001000000000000'),
+ 'AggregateFunction(categoricalInformationValue, Nullable(UInt8), UInt8)')); -- { serverError 33 }
+
+-- groupArray should check for errors in its input
+SELECT finalizeAggregation(CAST(unhex('5FF3001310132'), 'AggregateFunction(groupArray, String)')); -- { serverError 33 }
+SELECT finalizeAggregation(CAST(unhex('FF000000000000000001000000000000000200000000000000'), 'AggregateFunction(groupArray, UInt64)')); -- { serverError 33 } + +-- Same for groupArrayMovingXXXX +SELECT finalizeAggregation(CAST(unhex('0FF00000000000000001000000000000000300000000000000'), 'AggregateFunction(groupArrayMovingSum, UInt64)')); -- { serverError 33 } +SELECT finalizeAggregation(CAST(unhex('0FF00000000000000001000000000000000300000000000000'), 'AggregateFunction(groupArrayMovingAvg, UInt64)')); -- { serverError 33 } + +-- Histogram +SELECT finalizeAggregation(CAST(unhex('00000000000024C000000000000018C00500000000000024C0000000000000F03F00000000000022C0000000000000F03F00000000000020C0000000000000'), + 'AggregateFunction(histogram(5), Int64)')); -- { serverError 33 } + +-- StatisticalSample +SELECT finalizeAggregation(CAST(unhex('0F01000000000000244000000000000026400000000000002840000000000000244000000000000026400000000000002840000000000000F03F'), + 'AggregateFunction(mannWhitneyUTest, Float64, UInt8)')); -- { serverError 33 } + +-- maxIntersections +SELECT finalizeAggregation(CAST(unhex('0F010000000000000001000000000000000300000000000000FFFFFFFFFFFFFFFF03340B9B047F000001000000000000000500000065000000FFFFFFFFFFFFFFFF'), + 'AggregateFunction(maxIntersections, UInt8, UInt8)')); -- { serverError 33 } + +-- sequenceNextNode (This was fine because it would fail in the next readBinary call, but better to add a test) +SELECT finalizeAggregation(CAST(unhex('FFFFFFF014181056F38010000000000000001FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'), + 'AggregateFunction(sequenceNextNode(''forward'', ''head''), DateTime, Nullable(String), UInt8, Nullable(UInt8))')) + SETTINGS allow_experimental_funnel_functions=1; -- { serverError 33 } + +-- Fuzzer (ALL) +SELECT finalizeAggregation(CAST(unhex('FFFFFFF014181056F38010000000000000001FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF014181056F38010000000000000001FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'), + 'AggregateFunction(sequenceNextNode(\'forward\', \'head\'), DateTime, Nullable(String), UInt8, Nullable(UInt8))')) + SETTINGS allow_experimental_funnel_functions = 1; -- { serverError 128 } + +-- Fuzzer 2 (UBSAN) +SELECT finalizeAggregation(CAST(unhex('FFFFFFF014181056F38010000000000000001FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF'), + 'AggregateFunction(sequenceNextNode(\'forward\', \'head\'), DateTime, Nullable(String), UInt8, Nullable(UInt8))')) + SETTINGS allow_experimental_funnel_functions = 1; -- { serverError 33 } + +-- uniqUpTo +SELECT finalizeAggregation(CAST(unhex('04128345AA2BC97190'), + 'AggregateFunction(uniqUpTo(10), String)')); -- { serverError 33 } + +-- quantiles +SELECT finalizeAggregation(CAST(unhex('0F0000000000000000'), + 'AggregateFunction(quantileExact, UInt64)')); -- { serverError 33 } +SELECT finalizeAggregation(CAST(unhex('0F000000000000803F'), + 'AggregateFunction(quantileTDigest, UInt64)')); -- { serverError 33 } diff --git a/tests/queries/0_stateless/02477_is_null_parser.reference b/tests/queries/0_stateless/02477_is_null_parser.reference new file mode 100644 index 00000000000..57d96862011 --- /dev/null +++ b/tests/queries/0_stateless/02477_is_null_parser.reference @@ -0,0 +1,3 @@ +SELECT (\'a\' IS NULL) + (\'b\' IS NOT NULL) +SELECT (\'a\' IS NULL) = 0 +SELECT CAST(1 IS NULL, \'Int32\') diff --git a/tests/queries/0_stateless/02477_is_null_parser.sql b/tests/queries/0_stateless/02477_is_null_parser.sql new file mode 100644 index 
00000000000..b95a35fde21
--- /dev/null
+++ b/tests/queries/0_stateless/02477_is_null_parser.sql
@@ -0,0 +1,3 @@
+EXPLAIN SYNTAX SELECT 'a' IS NULL + 'b' IS NOT NULL;
+EXPLAIN SYNTAX SELECT 'a' IS NULL = 0;
+EXPLAIN SYNTAX SELECT 1 IS NULL :: Int32;
diff --git a/tests/queries/0_stateless/02478_factorial.reference b/tests/queries/0_stateless/02478_factorial.reference
new file mode 100644
index 00000000000..e8183f05f5d
--- /dev/null
+++ b/tests/queries/0_stateless/02478_factorial.reference
@@ -0,0 +1,3 @@
+1
+1
+1
diff --git a/tests/queries/0_stateless/02478_factorial.sql b/tests/queries/0_stateless/02478_factorial.sql
new file mode 100644
index 00000000000..e1a0f7d60e5
--- /dev/null
+++ b/tests/queries/0_stateless/02478_factorial.sql
@@ -0,0 +1,7 @@
+select factorial(-1) = 1;
+select factorial(0) = 1;
+select factorial(10) = 3628800;
+
+select factorial(100); -- { serverError 36 }
+select factorial('100'); -- { serverError 43 }
+select factorial(100.1234); -- { serverError 43 }
diff --git a/tests/queries/0_stateless/02478_projection_with_group_by_alter.reference b/tests/queries/0_stateless/02478_projection_with_group_by_alter.reference
new file mode 100644
index 00000000000..6ad2c8ec8db
--- /dev/null
+++ b/tests/queries/0_stateless/02478_projection_with_group_by_alter.reference
@@ -0,0 +1,60 @@
+-- { echoOn }
+
+OPTIMIZE TABLE testing FINAL;
+SELECT c FROM testing ORDER BY d;
+0
+1
+2
+3
+4
+SELECT c FROM testing ORDER BY e, d;
+0
+2
+4
+1
+3
+-- update all columns used by proj_1
+ALTER TABLE testing UPDATE c = c+1, d = d+2 WHERE True SETTINGS mutations_sync=2;
+SELECT * FROM system.mutations WHERE database = currentDatabase() AND table = 'testing' AND not is_done;
+SELECT c FROM testing ORDER BY d;
+1
+2
+3
+4
+5
+SELECT c FROM testing ORDER BY e, d;
+1
+3
+5
+2
+4
+-- update only one column
+ALTER TABLE testing UPDATE d = d-1 WHERE True SETTINGS mutations_sync=2;
+SELECT * FROM system.mutations WHERE database = currentDatabase() AND table = 'testing' AND not is_done;
+SELECT c FROM testing ORDER BY d;
+1
+2
+3
+4
+5
+SELECT c FROM testing ORDER BY e, d;
+1
+3
+5
+2
+4
+-- update only another one column
+ALTER TABLE testing UPDATE c = c-1 WHERE True SETTINGS mutations_sync=2;
+SELECT * FROM system.mutations WHERE database = currentDatabase() AND table = 'testing' AND not is_done;
+SELECT c FROM testing ORDER BY d;
+0
+1
+2
+3
+4
+SELECT c FROM testing ORDER BY e, d;
+0
+2
+4
+1
+3
diff --git a/tests/queries/0_stateless/02478_projection_with_group_by_alter.sql b/tests/queries/0_stateless/02478_projection_with_group_by_alter.sql
new file mode 100644
index 00000000000..9ed644fd7da
--- /dev/null
+++ b/tests/queries/0_stateless/02478_projection_with_group_by_alter.sql
@@ -0,0 +1,56 @@
+CREATE TABLE testing
+(
+ a String,
+ b String,
+ c Int32,
+ d Int32,
+ e Int32,
+ PROJECTION proj_1
+ (
+ SELECT c ORDER BY d
+ ),
+ PROJECTION proj_2
+ (
+ SELECT c ORDER BY e, d
+ )
+)
+ENGINE = MergeTree() PRIMARY KEY (a) SETTINGS min_bytes_for_wide_part = 0;
+
+INSERT INTO testing SELECT number, number, number, number, number%2 FROM numbers(5);
+
+-- { echoOn }
+
+OPTIMIZE TABLE testing FINAL;
+
+SELECT c FROM testing ORDER BY d;
+SELECT c FROM testing ORDER BY e, d;
+
+-- update all columns used by proj_1
+ALTER TABLE testing UPDATE c = c+1, d = d+2 WHERE True SETTINGS mutations_sync=2;
+
+SELECT * FROM system.mutations WHERE database = currentDatabase() AND table = 'testing' AND not is_done;
+
+SELECT c FROM testing ORDER BY d;
+SELECT c FROM testing ORDER BY e, d;
+
+
+-- update only one column
+ALTER TABLE
testing UPDATE d = d-1 WHERE True SETTINGS mutations_sync=2; + +SELECT * FROM system.mutations WHERE database = currentDatabase() AND table = 'testing' AND not is_done; + +SELECT c FROM testing ORDER BY d; +SELECT c FROM testing ORDER BY e, d; + + +-- update only another one column +ALTER TABLE testing UPDATE c = c-1 WHERE True SETTINGS mutations_sync=2; + +SELECT * FROM system.mutations WHERE database = currentDatabase() AND table = 'testing' AND not is_done; + +SELECT c FROM testing ORDER BY d; +SELECT c FROM testing ORDER BY e, d; + +-- { echoOff } + +DROP TABLE testing; diff --git a/tests/queries/0_stateless/02479_analyzer_aggregation_crash.reference b/tests/queries/0_stateless/02479_analyzer_aggregation_crash.reference new file mode 100644 index 00000000000..73d811f75f3 --- /dev/null +++ b/tests/queries/0_stateless/02479_analyzer_aggregation_crash.reference @@ -0,0 +1,2 @@ +10 123456789 +10 123456789 diff --git a/tests/queries/0_stateless/02479_analyzer_aggregation_crash.sql b/tests/queries/0_stateless/02479_analyzer_aggregation_crash.sql new file mode 100644 index 00000000000..c931a3ab634 --- /dev/null +++ b/tests/queries/0_stateless/02479_analyzer_aggregation_crash.sql @@ -0,0 +1,13 @@ +SET allow_experimental_analyzer = 1; +SET compile_aggregate_expressions = 1; +SET min_count_to_compile_aggregate_expression = 0; + +DROP TABLE IF EXISTS lc_00906__fuzz_46; +CREATE TABLE lc_00906__fuzz_46 (`b` Int64) ENGINE = MergeTree ORDER BY b; +INSERT INTO lc_00906__fuzz_46 SELECT '0123456789' FROM numbers(10); + +SELECT count(3.4028234663852886e38), b FROM lc_00906__fuzz_46 GROUP BY b; + +SELECT count(1), b FROM lc_00906__fuzz_46 GROUP BY b; + +DROP TABLE lc_00906__fuzz_46; diff --git a/tests/queries/0_stateless/02479_analyzer_join_with_constants.reference b/tests/queries/0_stateless/02479_analyzer_join_with_constants.reference new file mode 100644 index 00000000000..2a428d5d927 --- /dev/null +++ b/tests/queries/0_stateless/02479_analyzer_join_with_constants.reference @@ -0,0 +1,15 @@ +1 1 +-- +-- +-- +1 2 +-- +1 1 1 1 +-- +1 1 0 0 +-- +0 +0 +0 +0 +0 diff --git a/tests/queries/0_stateless/02479_analyzer_join_with_constants.sql b/tests/queries/0_stateless/02479_analyzer_join_with_constants.sql new file mode 100644 index 00000000000..0cc3ff3ab00 --- /dev/null +++ b/tests/queries/0_stateless/02479_analyzer_join_with_constants.sql @@ -0,0 +1,27 @@ +SET allow_experimental_analyzer = 1; + +SELECT * FROM (SELECT 1 AS id) AS t1 INNER JOIN (SELECT 1 AS id) AS t2 ON t1.id = t2.id AND 1; + +SELECT '--'; + +SELECT * FROM (SELECT 1 AS id) AS t1 INNER JOIN (SELECT 2 AS id) AS t2 ON t1.id = t2.id AND 1; + +SELECT '--'; + +SELECT * FROM (SELECT 1 AS id) AS t1 INNER JOIN (SELECT 1 AS id) AS t2 ON t1.id = t2.id AND 0; + +SELECT '--'; + +SELECT * FROM (SELECT 1 AS id) AS t1 INNER JOIN (SELECT 2 AS id) AS t2 ON t1.id = t2.id OR 1; + +SELECT '--'; + +SELECT * FROM (SELECT 1 AS id, 1 AS value) AS t1 ASOF LEFT JOIN (SELECT 1 AS id, 1 AS value) AS t2 ON (t1.id = t2.id) AND 1 == 1 AND (t1.value >= t2.value); + +SELECT '--'; + +SELECT * FROM (SELECT 1 AS id, 1 AS value) AS t1 ASOF LEFT JOIN (SELECT 1 AS id, 1 AS value) AS t2 ON (t1.id = t2.id) AND 1 != 1 AND (t1.value >= t2.value); + +SELECT '--'; + +SELECT b.dt FROM (SELECT NULL > NULL AS pk, 1 AS dt FROM numbers(5)) AS a ASOF LEFT JOIN (SELECT NULL AS pk, 1 AS dt) AS b ON (a.pk = b.pk) AND 1 != 1 AND (a.dt >= b.dt); diff --git a/tests/queries/0_stateless/02479_if_with_null_and_cullable_const.reference 
b/tests/queries/0_stateless/02479_if_with_null_and_cullable_const.reference new file mode 100644 index 00000000000..376364af7b4 --- /dev/null +++ b/tests/queries/0_stateless/02479_if_with_null_and_cullable_const.reference @@ -0,0 +1,4 @@ +1 +\N +\N +1 diff --git a/tests/queries/0_stateless/02479_if_with_null_and_cullable_const.sql b/tests/queries/0_stateless/02479_if_with_null_and_cullable_const.sql new file mode 100644 index 00000000000..b684de88cb2 --- /dev/null +++ b/tests/queries/0_stateless/02479_if_with_null_and_cullable_const.sql @@ -0,0 +1,3 @@ +SELECT if(number % 2, NULL, toNullable(1)) FROM numbers(2); +SELECT if(number % 2, toNullable(1), NULL) FROM numbers(2); + diff --git a/tests/queries/0_stateless/02479_mysql_connect_to_self.reference b/tests/queries/0_stateless/02479_mysql_connect_to_self.reference new file mode 100644 index 00000000000..573541ac970 --- /dev/null +++ b/tests/queries/0_stateless/02479_mysql_connect_to_self.reference @@ -0,0 +1 @@ +0 diff --git a/tests/queries/0_stateless/02479_mysql_connect_to_self.sql b/tests/queries/0_stateless/02479_mysql_connect_to_self.sql new file mode 100644 index 00000000000..a7aa6a96c1d --- /dev/null +++ b/tests/queries/0_stateless/02479_mysql_connect_to_self.sql @@ -0,0 +1,3 @@ +-- Tags: no-fasttest +SELECT * +FROM mysql('127.0.0.1:9004', system, one, 'default', '') diff --git a/tests/queries/0_stateless/02479_nullable_primary_key_second_column.reference b/tests/queries/0_stateless/02479_nullable_primary_key_second_column.reference new file mode 100644 index 00000000000..f0227e1a41e --- /dev/null +++ b/tests/queries/0_stateless/02479_nullable_primary_key_second_column.reference @@ -0,0 +1 @@ +a \N diff --git a/tests/queries/0_stateless/02479_nullable_primary_key_second_column.sql b/tests/queries/0_stateless/02479_nullable_primary_key_second_column.sql new file mode 100644 index 00000000000..ad0c09222c2 --- /dev/null +++ b/tests/queries/0_stateless/02479_nullable_primary_key_second_column.sql @@ -0,0 +1,9 @@ +drop table if exists test_table; + +create table test_table (A Nullable(String), B Nullable(String)) engine MergeTree order by (A,B) settings index_granularity = 1, allow_nullable_key=1; + +insert into test_table values ('a', 'b'), ('a', null), (null, 'b'); + +select * from test_table where B is null; + +drop table test_table; diff --git a/tests/queries/0_stateless/02480_every_asynchronous_metric_must_have_documentation.reference b/tests/queries/0_stateless/02480_every_asynchronous_metric_must_have_documentation.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02480_every_asynchronous_metric_must_have_documentation.sql b/tests/queries/0_stateless/02480_every_asynchronous_metric_must_have_documentation.sql new file mode 100644 index 00000000000..3f0ab58cc43 --- /dev/null +++ b/tests/queries/0_stateless/02480_every_asynchronous_metric_must_have_documentation.sql @@ -0,0 +1 @@ +SELECT metric FROM system.asynchronous_metrics WHERE length(description) < 10; diff --git a/tests/queries/0_stateless/02480_interval_casting_and_subquery.reference b/tests/queries/0_stateless/02480_interval_casting_and_subquery.reference new file mode 100644 index 00000000000..74df309720b --- /dev/null +++ b/tests/queries/0_stateless/02480_interval_casting_and_subquery.reference @@ -0,0 +1,25 @@ +5 2017-01-01 00:00:05 +5 2017-01-01 00:05:00 +5 2017-01-01 05:00:00 +5 2017-01-06 00:00:00 +5 2017-06-01 00:00:00 +5 2018-04-01 00:00:00 +5 2022-01-01 00:00:00 +5 +5 +5 +5 +5 +5 +5 +5 +5 +5 +5 +5 2017-01-01 
00:00:05 +5 2017-01-01 00:05:00 +5 2017-01-01 05:00:00 +5 2017-01-06 00:00:00 +5 2017-06-01 00:00:00 +5 2018-04-01 00:00:00 +5 2022-01-01 00:00:00 diff --git a/tests/queries/0_stateless/02480_interval_casting_and_subquery.sql b/tests/queries/0_stateless/02480_interval_casting_and_subquery.sql new file mode 100644 index 00000000000..cb6eccb06c0 --- /dev/null +++ b/tests/queries/0_stateless/02480_interval_casting_and_subquery.sql @@ -0,0 +1,25 @@ +SELECT toIntervalSecond(5) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT toIntervalMinute(5) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT toIntervalHour(5) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT toIntervalDay(5) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT toIntervalMonth(5) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT toIntervalQuarter(5) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT toIntervalYear(5) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT CAST(5 AS IntervalNanosecond); +SELECT CAST(5 AS IntervalMicrosecond); +SELECT CAST(5 AS IntervalMillisecond); +SELECT CAST(5 AS IntervalSecond); +SELECT CAST(5 AS IntervalMinute); +SELECT CAST(5 AS IntervalHour); +SELECT CAST(5 AS IntervalDay); +SELECT CAST(5 AS IntervalWeek); +SELECT CAST(5 AS IntervalMonth); +SELECT CAST(5 AS IntervalQuarter); +SELECT CAST(5 AS IntervalYear); +SELECT (SELECT toIntervalSecond(5)) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT (SELECT toIntervalMinute(5)) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT (SELECT toIntervalHour(5)) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT (SELECT toIntervalDay(5)) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT (SELECT toIntervalMonth(5)) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT (SELECT toIntervalQuarter(5)) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; +SELECT (SELECT toIntervalYear(5)) AS interval, toDateTime('2017-01-01 00:00:00') + interval AS res; diff --git a/tests/queries/0_stateless/02480_parse_date_time_best_effort_math_overflow.reference b/tests/queries/0_stateless/02480_parse_date_time_best_effort_math_overflow.reference new file mode 100644 index 00000000000..1515932af18 --- /dev/null +++ b/tests/queries/0_stateless/02480_parse_date_time_best_effort_math_overflow.reference @@ -0,0 +1,3 @@ +9279104479c7da1114861274de32208ead91b60e +\N +\N diff --git a/tests/queries/0_stateless/02480_parse_date_time_best_effort_math_overflow.sql b/tests/queries/0_stateless/02480_parse_date_time_best_effort_math_overflow.sql new file mode 100644 index 00000000000..5102fb47204 --- /dev/null +++ b/tests/queries/0_stateless/02480_parse_date_time_best_effort_math_overflow.sql @@ -0,0 +1,3 @@ +select * from format(TSV, '9279104479c7da1114861274de32208ead91b60e') settings date_time_input_format='best_effort'; +select parseDateTime64BestEffortOrNull('9279104477', 9); +select toDateTime64OrNull('9279104477', 9); diff --git a/tests/queries/0_stateless/02480_tlp_nan.reference b/tests/queries/0_stateless/02480_tlp_nan.reference new file mode 100644 index 00000000000..ea4aa44fa89 --- /dev/null +++ b/tests/queries/0_stateless/02480_tlp_nan.reference @@ -0,0 +1,10 @@ +nan 0 1 0 +nan 0 1 0 +-inf 0 1 0 +-inf 0 1 0 +\N \N \N 1 +\N \N \N 1 +inf 0 1 0 +inf 0 1 0 +nan 0 1 0 +nan 0 1 0 diff --git 
a/tests/queries/0_stateless/02480_tlp_nan.sql b/tests/queries/0_stateless/02480_tlp_nan.sql
new file mode 100644
index 00000000000..e24bc9a9830
--- /dev/null
+++ b/tests/queries/0_stateless/02480_tlp_nan.sql
@@ -0,0 +1,15 @@
+-- {echo}
+SELECT sqrt(-1) as x, not(x), not(not(x)), (not(x)) IS NULL SETTINGS allow_experimental_analyzer=1;
+SELECT sqrt(-1) as x, not(x), not(not(x)), (not(x)) IS NULL SETTINGS allow_experimental_analyzer=0;
+
+SELECT -inf as x, not(x), not(not(x)), (not(x)) IS NULL SETTINGS allow_experimental_analyzer=1;
+SELECT -inf as x, not(x), not(not(x)), (not(x)) IS NULL SETTINGS allow_experimental_analyzer=0;
+
+SELECT NULL as x, not(x), not(not(x)), (not(x)) IS NULL SETTINGS allow_experimental_analyzer=1;
+SELECT NULL as x, not(x), not(not(x)), (not(x)) IS NULL SETTINGS allow_experimental_analyzer=0;
+
+SELECT inf as x, not(x), not(not(x)), (not(x)) IS NULL SETTINGS allow_experimental_analyzer=1;
+SELECT inf as x, not(x), not(not(x)), (not(x)) IS NULL SETTINGS allow_experimental_analyzer=0;
+
+SELECT nan as x, not(x), not(not(x)), (not(x)) IS NULL SETTINGS allow_experimental_analyzer=1;
+SELECT nan as x, not(x), not(not(x)), (not(x)) IS NULL SETTINGS allow_experimental_analyzer=0;
diff --git a/utils/check-style/check-style b/utils/check-style/check-style
index a0556d971e8..ed397a4a162 100755
--- a/utils/check-style/check-style
+++ b/utils/check-style/check-style
@@ -13,7 +13,7 @@
 # and then to run formatter only for the specified files.
 
 ROOT_PATH=$(git rev-parse --show-toplevel)
-EXCLUDE_DIRS='build/|integration/|widechar_width/|glibc-compatibility/|memcpy/|consistent-hashing|benchmark'
+EXCLUDE_DIRS='build/|integration/|widechar_width/|glibc-compatibility/|memcpy/|consistent-hashing|benchmark|tests/'
 
 # From [1]:
 # But since array_to_string_internal() in array.c still loops over array