From e160e834c97ed56b0c108e82f34134838a02e5bb Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 22 Nov 2023 12:24:21 +0000 Subject: [PATCH 001/133] Add a test from fuzzer --- tests/queries/0_stateless/02915_analyzer_fuzz_1.reference | 1 + tests/queries/0_stateless/02915_analyzer_fuzz_1.sql | 2 ++ 2 files changed, 3 insertions(+) create mode 100644 tests/queries/0_stateless/02915_analyzer_fuzz_1.reference create mode 100644 tests/queries/0_stateless/02915_analyzer_fuzz_1.sql diff --git a/tests/queries/0_stateless/02915_analyzer_fuzz_1.reference b/tests/queries/0_stateless/02915_analyzer_fuzz_1.reference new file mode 100644 index 00000000000..ac3f57c1a2e --- /dev/null +++ b/tests/queries/0_stateless/02915_analyzer_fuzz_1.reference @@ -0,0 +1 @@ +With ba\0 diff --git a/tests/queries/0_stateless/02915_analyzer_fuzz_1.sql b/tests/queries/0_stateless/02915_analyzer_fuzz_1.sql new file mode 100644 index 00000000000..94849453063 --- /dev/null +++ b/tests/queries/0_stateless/02915_analyzer_fuzz_1.sql @@ -0,0 +1,2 @@ +set allow_experimental_analyzer=1; +SELECT concat('With ', materialize(_CAST('ba\0', 'LowCardinality(FixedString(3))'))) AS `concat('With ', materialize(CAST('ba\\0', 'LowCardinality(FixedString(3))')))` FROM system.one GROUP BY 'With '; From 9f9bb182c0d8aca1dc12209648cc35c5f3cb1948 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 22 Nov 2023 12:31:26 +0000 Subject: [PATCH 002/133] Add a test from fuzzer --- .../02271_fix_column_matcher_and_column_transformer.sql | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql index 245b2cc97e3..20a1f5a439f 100644 --- a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql +++ b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql @@ -63,4 +63,10 @@ ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at); with top_repos as ( select repo_name from github_events where event_type = 'WatchEvent' and toDate(created_at) = today() - 1 group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toMonday(created_at) = toMonday(today() - interval 1 week) group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toYear(created_at) = toYear(today()) - 1 group by repo_name order by count() desc limit 100 ), last_day as ( select repo_name, count() as count_last_day, rowNumberInAllBlocks() + 1 as position_last_day from github_events where repo_name in (select repo_name from top_repos) and toDate(created_at) = today() - 1 group by repo_name order by count_last_day desc ), last_week as ( select repo_name, count() as count_last_week, rowNumberInAllBlocks() + 1 as position_last_week from github_events where repo_name in (select repo_name from top_repos) and toMonday(created_at) = toMonday(today()) - interval 1 week group by repo_name order by count_last_week desc ), last_month as ( select repo_name, count() as count_last_month, rowNumberInAllBlocks() + 1 as position_last_month from github_events where repo_name in (select repo_name from top_repos) and 
toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count_last_month desc ) select d.repo_name, columns(count) from last_day d join last_week w on d.repo_name = w.repo_name join last_month m on d.repo_name = m.repo_name; +set allow_suspicious_low_cardinality_types=1; + +CREATE TABLE github_events__fuzz_0 (`file_time` Int64, `event_type` Enum8('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, 'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15, 'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19, 'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22), `actor_login` LowCardinality(String), `repo_name` LowCardinality(Nullable(String)), `created_at` DateTime, `updated_at` DateTime, `action` Array(Enum8('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 'unassigned' = 9, 'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19, 'merged' = 20)), `comment_id` UInt64, `body` String, `path` LowCardinality(String), `position` Int32, `line` Int32, `ref` String, `ref_type` Enum8('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3, 'unknown' = 4), `creator_user_login` Int16, `number` UInt32, `title` String, `labels` Array(Array(LowCardinality(String))), `state` Enum8('none' = 0, 'open' = 1, 'closed' = 2), `locked` UInt8, `assignee` Array(LowCardinality(String)), `assignees` Array(LowCardinality(String)), `comments` UInt32, `author_association` Array(Enum8('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5)), `closed_at` UUID, `merged_at` DateTime, `merge_commit_sha` Nullable(String), `requested_reviewers` Array(LowCardinality(Int64)), `requested_teams` Array(String), `head_ref` String, `head_sha` String, `base_ref` String, `base_sha` String, `merged` Nullable(UInt8), `mergeable` Nullable(UInt8), `rebaseable` LowCardinality(UInt8), `mergeable_state` Array(Enum8('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4)), `merged_by` LowCardinality(String), `review_comments` UInt32, `maintainer_can_modify` Nullable(UInt8), `commits` UInt32, `additions` Nullable(UInt32), `deletions` UInt32, `changed_files` UInt32, `diff_hunk` Nullable(String), `original_position` UInt32, `commit_id` String, `original_commit_id` String, `push_size` UInt32, `push_distinct_size` UInt32, `member_login` LowCardinality(String), `release_tag_name` LowCardinality(String), `release_name` String, `review_state` Int16) ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at) settings allow_nullable_key=1; + +EXPLAIN PIPELINE header = true, compact = true WITH top_repos AS (SELECT repo_name FROM github_events__fuzz_0 WHERE (event_type = 'WatchEvent') AND (toDate(created_at) = (today() - 1)) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events__fuzz_0 WHERE (event_type = 'WatchEvent') AND (toMonday(created_at) = toMonday(today() - toIntervalWeek(1))) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events__fuzz_0 PREWHERE (event_type = 'WatchEvent') AND 
(toStartOfMonth(created_at) = (toStartOfMonth(today()) - toIntervalMonth(1))) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events WHERE (event_type = 'WatchEvent') AND (toYear(created_at) = (toYear(today()) - 1)) GROUP BY repo_name ORDER BY count() DESC LIMIT 100), last_day AS (SELECT repo_name, count() AS count_last_day, rowNumberInAllBlocks() + 1 AS position_last_day FROM github_events WHERE (repo_name IN (SELECT repo_name FROM top_repos)) AND (toDate(created_at) = (today() - 1)) GROUP BY repo_name ORDER BY count_last_day DESC), last_week AS (SELECT repo_name, count() AS count_last_week, rowNumberInAllBlocks() + 1 AS position_last_week FROM github_events WHERE (repo_name IN (SELECT repo_name FROM top_repos)) AND (toMonday(created_at) = (toMonday(today()) - toIntervalWeek(2))) GROUP BY repo_name ORDER BY count_last_week DESC), last_month AS (SELECT repo_name, count() AS count_last_month, rowNumberInAllBlocks() + 1 AS position_last_month FROM github_events__fuzz_0 WHERE ('deleted' = 4) AND in(repo_name) AND (toStartOfMonth(created_at) = (toStartOfMonth(today()) - toIntervalMonth(1))) GROUP BY repo_name ORDER BY count_last_month DESC) SELECT d.repo_name, COLUMNS(count) FROM last_day AS d INNER JOIN last_week AS w ON d.repo_name = w.repo_name INNER JOIN last_month AS m ON d.repo_name = m.repo_name format Null; + DROP TABLE github_events; From c039b71abe75284059b247b652c291f23d04b637 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 1 Dec 2023 16:17:25 +0000 Subject: [PATCH 003/133] Update test. --- .../02271_fix_column_matcher_and_column_transformer.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql index 20a1f5a439f..2ad3732ee03 100644 --- a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql +++ b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql @@ -67,6 +67,6 @@ set allow_suspicious_low_cardinality_types=1; CREATE TABLE github_events__fuzz_0 (`file_time` Int64, `event_type` Enum8('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, 'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15, 'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19, 'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22), `actor_login` LowCardinality(String), `repo_name` LowCardinality(Nullable(String)), `created_at` DateTime, `updated_at` DateTime, `action` Array(Enum8('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 'unassigned' = 9, 'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19, 'merged' = 20)), `comment_id` UInt64, `body` String, `path` LowCardinality(String), `position` Int32, `line` Int32, `ref` String, `ref_type` Enum8('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3, 'unknown' = 4), `creator_user_login` Int16, `number` UInt32, `title` String, `labels` Array(Array(LowCardinality(String))), `state` Enum8('none' = 0, 'open' = 1, 'closed' = 2), 
`locked` UInt8, `assignee` Array(LowCardinality(String)), `assignees` Array(LowCardinality(String)), `comments` UInt32, `author_association` Array(Enum8('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5)), `closed_at` UUID, `merged_at` DateTime, `merge_commit_sha` Nullable(String), `requested_reviewers` Array(LowCardinality(Int64)), `requested_teams` Array(String), `head_ref` String, `head_sha` String, `base_ref` String, `base_sha` String, `merged` Nullable(UInt8), `mergeable` Nullable(UInt8), `rebaseable` LowCardinality(UInt8), `mergeable_state` Array(Enum8('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4)), `merged_by` LowCardinality(String), `review_comments` UInt32, `maintainer_can_modify` Nullable(UInt8), `commits` UInt32, `additions` Nullable(UInt32), `deletions` UInt32, `changed_files` UInt32, `diff_hunk` Nullable(String), `original_position` UInt32, `commit_id` String, `original_commit_id` String, `push_size` UInt32, `push_distinct_size` UInt32, `member_login` LowCardinality(String), `release_tag_name` LowCardinality(String), `release_name` String, `review_state` Int16) ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at) settings allow_nullable_key=1; -EXPLAIN PIPELINE header = true, compact = true WITH top_repos AS (SELECT repo_name FROM github_events__fuzz_0 WHERE (event_type = 'WatchEvent') AND (toDate(created_at) = (today() - 1)) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events__fuzz_0 WHERE (event_type = 'WatchEvent') AND (toMonday(created_at) = toMonday(today() - toIntervalWeek(1))) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events__fuzz_0 PREWHERE (event_type = 'WatchEvent') AND (toStartOfMonth(created_at) = (toStartOfMonth(today()) - toIntervalMonth(1))) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events WHERE (event_type = 'WatchEvent') AND (toYear(created_at) = (toYear(today()) - 1)) GROUP BY repo_name ORDER BY count() DESC LIMIT 100), last_day AS (SELECT repo_name, count() AS count_last_day, rowNumberInAllBlocks() + 1 AS position_last_day FROM github_events WHERE (repo_name IN (SELECT repo_name FROM top_repos)) AND (toDate(created_at) = (today() - 1)) GROUP BY repo_name ORDER BY count_last_day DESC), last_week AS (SELECT repo_name, count() AS count_last_week, rowNumberInAllBlocks() + 1 AS position_last_week FROM github_events WHERE (repo_name IN (SELECT repo_name FROM top_repos)) AND (toMonday(created_at) = (toMonday(today()) - toIntervalWeek(2))) GROUP BY repo_name ORDER BY count_last_week DESC), last_month AS (SELECT repo_name, count() AS count_last_month, rowNumberInAllBlocks() + 1 AS position_last_month FROM github_events__fuzz_0 WHERE ('deleted' = 4) AND in(repo_name) AND (toStartOfMonth(created_at) = (toStartOfMonth(today()) - toIntervalMonth(1))) GROUP BY repo_name ORDER BY count_last_month DESC) SELECT d.repo_name, COLUMNS(count) FROM last_day AS d INNER JOIN last_week AS w ON d.repo_name = w.repo_name INNER JOIN last_month AS m ON d.repo_name = m.repo_name format Null; +EXPLAIN PIPELINE header = true, compact = true WITH top_repos AS (SELECT repo_name FROM github_events__fuzz_0 WHERE (event_type = 'WatchEvent') AND (toDate(created_at) = (today() - 1)) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events__fuzz_0 WHERE (event_type = 'WatchEvent') AND (toMonday(created_at) = 
toMonday(today() - toIntervalWeek(1))) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events__fuzz_0 PREWHERE (event_type = 'WatchEvent') AND (toStartOfMonth(created_at) = (toStartOfMonth(today()) - toIntervalMonth(1))) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events WHERE (event_type = 'WatchEvent') AND (toYear(created_at) = (toYear(today()) - 1)) GROUP BY repo_name ORDER BY count() DESC LIMIT 100), last_day AS (SELECT repo_name, count() AS count_last_day, rowNumberInAllBlocks() + 1 AS position_last_day FROM github_events WHERE (repo_name IN (SELECT repo_name FROM top_repos)) AND (toDate(created_at) = (today() - 1)) GROUP BY repo_name ORDER BY count_last_day DESC), last_week AS (SELECT repo_name, count() AS count_last_week, rowNumberInAllBlocks() + 1 AS position_last_week FROM github_events WHERE (repo_name IN (SELECT repo_name FROM top_repos)) AND (toMonday(created_at) = (toMonday(today()) - toIntervalWeek(2))) GROUP BY repo_name ORDER BY count_last_week DESC), last_month AS (SELECT repo_name, count() AS count_last_month, rowNumberInAllBlocks() + 1 AS position_last_month FROM github_events__fuzz_0 WHERE ('deleted' = 4) AND in(repo_name) AND (toStartOfMonth(created_at) = (toStartOfMonth(today()) - toIntervalMonth(1))) GROUP BY repo_name ORDER BY count_last_month DESC) SELECT d.repo_name, COLUMNS(count) FROM last_day AS d INNER JOIN last_week AS w ON d.repo_name = w.repo_name INNER JOIN last_month AS m ON d.repo_name = m.repo_name format Null; -- { serverError TYPE_MISMATCH } DROP TABLE github_events. From 552e1acf18865a7d7042e8bdbbafc44c85fed962 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Mon, 29 Jan 2024 19:49:10 +0100 Subject: [PATCH 004/133] support uniq for statistics --- .../mergetree-family/mergetree.md | 10 +- .../statements/alter/statistic.md | 10 +- src/Access/Common/AccessType.h | 1 + src/AggregateFunctions/QuantileTDigest.h | 12 ++ src/Interpreters/InterpreterAlterQuery.cpp | 5 + src/Interpreters/InterpreterCreateQuery.cpp | 8 +- src/Interpreters/MutationsInterpreter.cpp | 2 +- src/Parsers/ASTAlterQuery.h | 1 + src/Parsers/ASTStatisticDeclaration.cpp | 42 ----- src/Parsers/ASTStatisticsDeclaration.cpp | 57 ++++++ ...claration.h => ASTStatisticsDeclaration.h} | 6 +- src/Parsers/ParserAlterQuery.cpp | 19 +- src/Parsers/ParserCreateQuery.cpp | 29 +++- src/Parsers/ParserCreateQuery.h | 11 ++ src/Storages/AlterCommands.cpp | 69 +++++--- src/Storages/AlterCommands.h | 3 +- src/Storages/ColumnsDescription.cpp | 6 +- src/Storages/ColumnsDescription.h | 2 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 4 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 4 +- src/Storages/MergeTree/MergeTask.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 10 +- .../MergeTree/MergeTreeDataPartCompact.cpp | 2 +- .../MergeTree/MergeTreeDataPartCompact.h | 2 +- .../MergeTree/MergeTreeDataPartInMemory.cpp | 2 +- .../MergeTree/MergeTreeDataPartInMemory.h | 2 +- .../MergeTree/MergeTreeDataPartWide.cpp | 2 +- .../MergeTree/MergeTreeDataPartWide.h | 2 +- .../MergeTreeDataPartWriterCompact.cpp | 2 +- .../MergeTreeDataPartWriterCompact.h | 2 +- .../MergeTreeDataPartWriterOnDisk.cpp | 2 +- .../MergeTree/MergeTreeDataPartWriterOnDisk.h | 4 +- .../MergeTree/MergeTreeDataPartWriterWide.cpp | 2 +- .../MergeTree/MergeTreeDataPartWriterWide.h | 2 +- .../MergeTree/MergeTreeDataWriter.cpp | 3 +- .../MergeTree/MergeTreeWhereOptimizer.cpp | 4 +- .../MergeTree/MergeTreeWhereOptimizer.h | 4 +-
.../MergeTree/MergedBlockOutputStream.cpp | 2 +- .../MergeTree/MergedBlockOutputStream.h | 2 +- .../MergedColumnOnlyOutputStream.cpp | 2 +- .../MergeTree/MergedColumnOnlyOutputStream.h | 2 +- src/Storages/MergeTree/MutateTask.cpp | 20 +-- src/Storages/MutationCommands.cpp | 7 +- src/Storages/MutationCommands.h | 1 + src/Storages/Statistics/Estimator.cpp | 84 +++++++-- src/Storages/Statistics/Estimator.h | 80 ++------- src/Storages/Statistics/Statistics.cpp | 149 ++++++++++++++-- src/Storages/Statistics/Statistics.h | 54 ++++-- src/Storages/Statistics/TDigestStatistic.cpp | 10 +- src/Storages/Statistics/TDigestStatistic.h | 7 +- src/Storages/Statistics/UniqStatistic.h | 61 +++++++ src/Storages/Statistics/tests/gtest_stats.cpp | 2 +- src/Storages/StatisticsDescription.cpp | 163 +++++++++++++----- src/Storages/StatisticsDescription.h | 55 +++++- .../test_manipulate_statistic/test.py | 8 +- .../0_stateless/02864_statistic_exception.sql | 10 +- .../0_stateless/02864_statistic_operate.sql | 4 +- .../02864_statistic_uniq.reference | 29 ++++ .../0_stateless/02864_statistic_uniq.sql | 43 +++++ 59 files changed, 834 insertions(+), 311 deletions(-) delete mode 100644 src/Parsers/ASTStatisticDeclaration.cpp create mode 100644 src/Parsers/ASTStatisticsDeclaration.cpp rename src/Parsers/{ASTStatisticDeclaration.h => ASTStatisticsDeclaration.h} (74%) create mode 100644 src/Storages/Statistics/UniqStatistic.h create mode 100644 tests/queries/0_stateless/02864_statistic_uniq.reference create mode 100644 tests/queries/0_stateless/02864_statistic_uniq.sql diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index ed413959ca6..a90e9a2698c 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -1365,7 +1365,7 @@ The statistic declaration is in the columns section of the `CREATE` query for ta ``` sql CREATE TABLE example_table ( - a Int64 STATISTIC(tdigest), + a Int64 STATISTIC(tdigest, uniq), b Float64 ) ENGINE = MergeTree ORDER BY a ``` We can also manipulate statistics with `ALTER` statements. ```sql -ALTER TABLE example_table ADD STATISTIC b TYPE tdigest; -ALTER TABLE example_table DROP STATISTIC a TYPE tdigest; +ALTER TABLE example_table ADD STATISTIC b TYPE tdigest, uniq; +ALTER TABLE example_table DROP STATISTIC a; ``` These lightweight statistics aggregate information about distribution of values in columns. @@ -1387,3 +1387,7 @@ They can be used for query optimization when we enable `set allow_statistic_opti - `tdigest` Stores distribution of values from numeric columns in [TDigest](https://github.com/tdunning/t-digest) sketch. + +- `uniq` + + Estimates the number of distinct values of a column.
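A minimal end-to-end sketch of the feature documented above (hypothetical table and data; the two settings are the ones the docs reference — `allow_experimental_statistic` gates the DDL and `allow_statistic_optimize` gates the optimization):

```sql
SET allow_experimental_statistic = 1;
SET allow_statistic_optimize = 1;

CREATE TABLE example_table
(
    a Int64 STATISTIC(tdigest, uniq),
    b Float64
)
ENGINE = MergeTree
ORDER BY a;

INSERT INTO example_table SELECT number, number / 7 FROM numbers(1000000);

-- Both sketches on `a` are written per part; with allow_statistic_optimize
-- enabled they feed condition-selectivity estimates during planning.
SELECT count() FROM example_table WHERE b > 0 AND a < 100;
```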
diff --git a/docs/en/sql-reference/statements/alter/statistic.md b/docs/en/sql-reference/statements/alter/statistic.md index 1c2e45b23fd..08010a3911d 100644 --- a/docs/en/sql-reference/statements/alter/statistic.md +++ b/docs/en/sql-reference/statements/alter/statistic.md @@ -8,13 +8,15 @@ sidebar_label: STATISTIC The following operations are available: -- `ALTER TABLE [db].table ADD STATISTIC (columns list) TYPE type` - Adds statistic description to tables metadata. +- `ALTER TABLE [db].table ADD STATISTIC (columns list) TYPE (type list)` - Adds a statistic description to the table's metadata. -- `ALTER TABLE [db].table DROP STATISTIC (columns list) TYPE type` - Removes statistic description from tables metadata and deletes statistic files from disk. +- `ALTER TABLE [db].table MODIFY STATISTIC (columns list) TYPE (type list)` - Modifies the statistic description in the table's metadata. -- `ALTER TABLE [db].table CLEAR STATISTIC (columns list) TYPE type` - Deletes statistic files from disk. +- `ALTER TABLE [db].table DROP STATISTIC (columns list)` - Removes the statistic description from the table's metadata and deletes statistic files from disk. -- `ALTER TABLE [db.]table MATERIALIZE STATISTIC (columns list) TYPE type` - Rebuilds the statistic for columns. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). +- `ALTER TABLE [db].table CLEAR STATISTIC (columns list)` - Deletes statistic files from disk. + +- `ALTER TABLE [db.]table MATERIALIZE STATISTIC (columns list)` - Rebuilds the statistic for columns. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). The first two commands are lightweight in a sense that they only change metadata or remove files. diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 45d427a7c55..0e2ff7247f0 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -53,6 +53,7 @@ enum class AccessType \ M(ALTER_ADD_STATISTIC, "ALTER ADD STATISTIC", TABLE, ALTER_STATISTIC) \ M(ALTER_DROP_STATISTIC, "ALTER DROP STATISTIC", TABLE, ALTER_STATISTIC) \ + M(ALTER_MODIFY_STATISTIC, "ALTER MODIFY STATISTIC", TABLE, ALTER_STATISTIC) \ M(ALTER_MATERIALIZE_STATISTIC, "ALTER MATERIALIZE STATISTIC", TABLE, ALTER_STATISTIC) \ M(ALTER_STATISTIC, "STATISTIC", GROUP, ALTER_TABLE) /* allows to execute ALTER STATISTIC */\ \ diff --git a/src/AggregateFunctions/QuantileTDigest.h b/src/AggregateFunctions/QuantileTDigest.h index 979c3f2af15..cc03e477645 100644 --- a/src/AggregateFunctions/QuantileTDigest.h +++ b/src/AggregateFunctions/QuantileTDigest.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -335,6 +336,17 @@ public: compress(); // Allows reading/writing TDigests with different epsilon/max_centroids params } + Float64 getCountEqual(Float64 value) const + { + Float64 result = 0; + for (const auto & c : centroids) + { + if (value == c.mean) + result += c.count; + } + return result; + } + Float64 getCountLessThan(Float64 value) const { bool first = true; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index db93467c0a4..089784d79d0 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -329,6 +329,11 @@ AccessRightsElements InterpreterAlterQuery::getRequiredAccessForCommand(const AS required_access.emplace_back(AccessType::ALTER_ADD_STATISTIC, database, table); break; } + case ASTAlterCommand::MODIFY_STATISTIC: + { + required_access.emplace_back(AccessType::ALTER_MODIFY_STATISTIC, database, table); + break; + } case ASTAlterCommand::DROP_STATISTIC: { required_access.emplace_back(AccessType::ALTER_DROP_STATISTIC, database, table);
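The new `ALTER_MODIFY_STATISTIC` access type above sits under the existing `ALTER_STATISTIC` group, so it should be grantable both directly and via the group; a hedged sketch (user and table names are hypothetical):

```sql
-- Grant just the new right...
GRANT ALTER MODIFY STATISTIC ON db.example_table TO stats_admin;
-- ...or the whole ALTER STATISTIC group, which now covers MODIFY as well.
GRANT ALTER STATISTIC ON db.example_table TO stats_admin;
```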
diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 1eadb325e95..767010f566b 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -450,9 +450,9 @@ ASTPtr InterpreterCreateQuery::formatColumns(const ColumnsDescription & columns) column_declaration->children.push_back(column_declaration->codec); } - if (column.stat) + if (!column.stats.empty()) { - column_declaration->stat_type = column.stat->ast; + column_declaration->stat_type = column.stats.getAST(); column_declaration->children.push_back(column_declaration->stat_type); } @@ -658,11 +658,13 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs, enable_deflate_qpl_codec); } + column.stats.column_name = column.name; /// We assign column name here for better exception error message. if (col_decl.stat_type) { if (!attach && !context_->getSettingsRef().allow_experimental_statistic) throw Exception(ErrorCodes::INCORRECT_QUERY, "Create table with statistic is now disabled. Turn on allow_experimental_statistic"); - column.stat = StatisticDescription::getStatisticFromColumnDeclaration(col_decl); + column.stats = StatisticsDescription::getStatisticFromColumnDeclaration(col_decl); + column.stats.data_type = column.type; } if (col_decl.ttl) diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index bf50766c165..a9c9c8774b9 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -797,7 +797,7 @@ void MutationsInterpreter::prepare(bool dry_run) mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); for (const auto & stat_column_name: command.statistic_columns) { - if (!columns_desc.has(stat_column_name) || !columns_desc.get(stat_column_name).stat) + if (!columns_desc.has(stat_column_name) || columns_desc.get(stat_column_name).stats.empty()) throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Unknown statistic column: {}", stat_column_name); dependencies.emplace(stat_column_name, ColumnDependency::STATISTIC); materialized_statistics.emplace(stat_column_name); diff --git a/src/Parsers/ASTAlterQuery.h b/src/Parsers/ASTAlterQuery.h index 77c540aed33..1f82933c687 100644 --- a/src/Parsers/ASTAlterQuery.h +++ b/src/Parsers/ASTAlterQuery.h @@ -56,6 +56,7 @@ public: ADD_STATISTIC, DROP_STATISTIC, + MODIFY_STATISTIC, MATERIALIZE_STATISTIC, DROP_PARTITION, diff --git a/src/Parsers/ASTStatisticDeclaration.cpp b/src/Parsers/ASTStatisticDeclaration.cpp deleted file mode 100644 index 0e20b020ab3..00000000000 --- a/src/Parsers/ASTStatisticDeclaration.cpp +++ /dev/null @@ -1,42 +0,0 @@ -#include -#include - -#include -#include -#include - - -namespace DB -{ - -ASTPtr ASTStatisticDeclaration::clone() const -{ - auto res = std::make_shared<ASTStatisticDeclaration>(); - - res->set(res->columns, columns->clone()); - res->type = type; - - return res; -} - -std::vector<String> ASTStatisticDeclaration::getColumnNames() const -{ - std::vector<String> result; - result.reserve(columns->children.size()); - for (const ASTPtr & column_ast : columns->children) - { - result.push_back(column_ast->as<ASTIdentifier &>().name()); - } - return result; - -} - -void ASTStatisticDeclaration::formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const -{ - columns->formatImpl(s, state, frame); - s.ostr << (s.hilite ? hilite_keyword : "") << " TYPE " << (s.hilite ?
hilite_none : ""); - s.ostr << backQuoteIfNeed(type); -} - -} - diff --git a/src/Parsers/ASTStatisticsDeclaration.cpp b/src/Parsers/ASTStatisticsDeclaration.cpp new file mode 100644 index 00000000000..ed80de54655 --- /dev/null +++ b/src/Parsers/ASTStatisticsDeclaration.cpp @@ -0,0 +1,57 @@ +#include +#include + +#include +#include +#include + + +namespace DB +{ + +ASTPtr ASTStatisticsDeclaration::clone() const +{ + auto res = std::make_shared<ASTStatisticsDeclaration>(); + + res->set(res->columns, columns->clone()); + if (types) + res->set(res->types, types->clone()); + + return res; +} + +std::vector<String> ASTStatisticsDeclaration::getColumnNames() const +{ + std::vector<String> result; + result.reserve(columns->children.size()); + for (const ASTPtr & column_ast : columns->children) + { + result.push_back(column_ast->as<ASTIdentifier &>().name()); + } + return result; + +} + +std::vector<String> ASTStatisticsDeclaration::getTypeNames() const +{ + chassert(types != nullptr); + std::vector<String> result; + result.reserve(types->children.size()); + for (const ASTPtr & column_ast : types->children) + { + result.push_back(column_ast->as<ASTFunction &>().name); + } + return result; + +} + +void ASTStatisticsDeclaration::formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const +{ + columns->formatImpl(s, state, frame); + s.ostr << (s.hilite ? hilite_keyword : "") << " TYPE " << (s.hilite ? hilite_none : ""); + if (types) + types->formatImpl(s, state, frame); +} + +} + diff --git a/src/Parsers/ASTStatisticDeclaration.h b/src/Parsers/ASTStatisticsDeclaration.h similarity index 74% rename from src/Parsers/ASTStatisticDeclaration.h rename to src/Parsers/ASTStatisticsDeclaration.h index f936c93f2ba..f43567b3c70 100644 --- a/src/Parsers/ASTStatisticDeclaration.h +++ b/src/Parsers/ASTStatisticsDeclaration.h @@ -9,17 +9,17 @@ class ASTFunction; /** name BY columns TYPE typename(args) in create query */ -class ASTStatisticDeclaration : public IAST +class ASTStatisticsDeclaration : public IAST { public: IAST * columns; - /// TODO type should be a list of ASTFunction, for example, 'tdigest(256), hyperloglog(128)', etc. - String type; + IAST * types; /** Get the text that identifies this element.
*/ String getID(char) const override { return "Stat"; } std::vector<String> getColumnNames() const; + std::vector<String> getTypeNames() const; ASTPtr clone() const override; void formatImpl(const FormatSettings & s, FormatState & state, FormatStateStacked frame) const override; diff --git a/src/Parsers/ParserAlterQuery.cpp b/src/Parsers/ParserAlterQuery.cpp index 2a0060f20f2..bf93bd64bc8 100644 --- a/src/Parsers/ParserAlterQuery.cpp +++ b/src/Parsers/ParserAlterQuery.cpp @@ -46,6 +46,7 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserKeyword s_add_statistic("ADD STATISTIC"); ParserKeyword s_drop_statistic("DROP STATISTIC"); + ParserKeyword s_modify_statistic("MODIFY STATISTIC"); ParserKeyword s_clear_statistic("CLEAR STATISTIC"); ParserKeyword s_materialize_statistic("MATERIALIZE STATISTIC"); @@ -119,6 +120,7 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserCompoundColumnDeclaration parser_col_decl; ParserIndexDeclaration parser_idx_decl; ParserStatisticDeclaration parser_stat_decl; + ParserStatisticDeclarationWithoutTypes parser_stat_decl_for_drop; ParserConstraintDeclaration parser_constraint_decl; ParserProjectionDeclaration parser_projection_decl; ParserCompoundColumnDeclaration parser_modify_col_decl(false, false, true); @@ -344,12 +346,19 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected command->type = ASTAlterCommand::ADD_STATISTIC; } + else if (s_modify_statistic.ignore(pos, expected)) + { + if (!parser_stat_decl.parse(pos, command->statistic_decl, expected)) + return false; + + command->type = ASTAlterCommand::MODIFY_STATISTIC; + } else if (s_drop_statistic.ignore(pos, expected)) { if (s_if_exists.ignore(pos, expected)) command->if_exists = true; - if (!parser_stat_decl.parse(pos, command->statistic_decl, expected)) + if (!parser_stat_decl_for_drop.parse(pos, command->statistic_decl, expected)) return false; command->type = ASTAlterCommand::DROP_STATISTIC; @@ -359,13 +368,13 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected if (s_if_exists.ignore(pos, expected)) command->if_exists = true; - if (!parser_stat_decl.parse(pos, command->statistic_decl, expected)) - return false; - command->type = ASTAlterCommand::DROP_STATISTIC; command->clear_statistic = true; command->detach = false; + if (!parser_stat_decl_for_drop.parse(pos, command->statistic_decl, expected)) + return false; + if (s_in_partition.ignore(pos, expected)) { if (!parser_partition.parse(pos, command->partition, expected)) @@ -377,7 +386,7 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected if (s_if_exists.ignore(pos, expected)) command->if_exists = true; - if (!parser_stat_decl.parse(pos, command->statistic_decl, expected)) + if (!parser_stat_decl_for_drop.parse(pos, command->statistic_decl, expected)) return false; command->type = ASTAlterCommand::MATERIALIZE_STATISTIC; diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index f79850467e4..4fa6406a77e 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -6,7 +6,7 @@ #include #include #include -#include <Parsers/ASTStatisticDeclaration.h> +#include <Parsers/ASTStatisticsDeclaration.h> #include #include #include @@ -167,10 +167,10 @@ bool ParserStatisticDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & ParserKeyword s_type("TYPE"); ParserList columns_p(std::make_unique<ParserIdentifier>(), std::make_unique<ParserToken>(TokenType::Comma), false); - ParserIdentifier type_p; + ParserList types_p(std::make_unique<ParserDataType>(),
std::make_unique<ParserToken>(TokenType::Comma), false); ASTPtr columns; - ASTPtr type; + ASTPtr types; if (!columns_p.parse(pos, columns, expected)) return false; @@ -178,12 +178,29 @@ bool ParserStatisticDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & if (!s_type.ignore(pos, expected)) return false; - if (!type_p.parse(pos, type, expected)) + if (!types_p.parse(pos, types, expected)) return false; - auto stat = std::make_shared<ASTStatisticDeclaration>(); + auto stat = std::make_shared<ASTStatisticsDeclaration>(); + stat->set(stat->columns, columns); + stat->set(stat->types, types); + node = stat; + + return true; +} + +bool ParserStatisticDeclarationWithoutTypes::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +{ + + ParserList columns_p(std::make_unique<ParserIdentifier>(), std::make_unique<ParserToken>(TokenType::Comma), false); + + ASTPtr columns; + + if (!columns_p.parse(pos, columns, expected)) + return false; + + auto stat = std::make_shared<ASTStatisticsDeclaration>(); stat->set(stat->columns, columns); - stat->type = type->as<ASTIdentifier &>().name(); node = stat; return true; diff --git a/src/Parsers/ParserCreateQuery.h b/src/Parsers/ParserCreateQuery.h index 910ee048442..8dd398766a8 100644 --- a/src/Parsers/ParserCreateQuery.h +++ b/src/Parsers/ParserCreateQuery.h @@ -414,6 +414,17 @@ protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; +class ParserStatisticDeclarationWithoutTypes : public IParserBase +{ +public: + ParserStatisticDeclarationWithoutTypes() = default; + +protected: + const char * getName() const override { return "statistics declaration"; } + bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; +}; + + class ParserConstraintDeclaration : public IParserBase { protected:
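Taken together, `ParserStatisticDeclaration` (columns, then `TYPE`, then a type list) and the new `ParserStatisticDeclarationWithoutTypes` (a bare column list) accept roughly the following shapes; an illustrative sketch against a hypothetical table:

```sql
-- ParserStatisticDeclaration: ADD and MODIFY keep the TYPE clause, now with a type list.
ALTER TABLE example_table ADD STATISTIC a, b TYPE tdigest, uniq;
ALTER TABLE example_table MODIFY STATISTIC a TYPE tdigest, uniq;

-- ParserStatisticDeclarationWithoutTypes: DROP, CLEAR and MATERIALIZE take only columns.
ALTER TABLE example_table DROP STATISTIC a;
ALTER TABLE example_table CLEAR STATISTIC a;
ALTER TABLE example_table MATERIALIZE STATISTIC a;
```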
diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 6f93cb3c370..fd0295c4a2c 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -25,7 +25,7 @@ #include #include #include -#include <Parsers/ASTStatisticDeclaration.h> +#include <Parsers/ASTStatisticsDeclaration.h> #include #include #include @@ -250,10 +250,25 @@ std::optional<AlterCommand> AlterCommand::parse(const ASTAlterCommand * command_ command.statistic_decl = command_ast->statistic_decl; command.type = AlterCommand::ADD_STATISTIC; - const auto & ast_stat_decl = command_ast->statistic_decl->as<ASTStatisticDeclaration &>(); + const auto & ast_stat_decl = command_ast->statistic_decl->as<ASTStatisticsDeclaration &>(); command.statistic_columns = ast_stat_decl.getColumnNames(); - command.statistic_type = ast_stat_decl.type; + command.statistic_types = ast_stat_decl.getTypeNames(); + command.if_not_exists = command_ast->if_not_exists; + + return command; + } + else if (command_ast->type == ASTAlterCommand::MODIFY_STATISTIC) + { + AlterCommand command; + command.ast = command_ast->clone(); + command.statistic_decl = command_ast->statistic_decl; + command.type = AlterCommand::MODIFY_STATISTIC; + + const auto & ast_stat_decl = command_ast->statistic_decl->as<ASTStatisticsDeclaration &>(); + + command.statistic_columns = ast_stat_decl.getColumnNames(); + command.statistic_types = ast_stat_decl.getTypeNames(); command.if_not_exists = command_ast->if_not_exists; return command; @@ -321,11 +336,11 @@ std::optional<AlterCommand> AlterCommand::parse(const ASTAlterCommand * command_ { AlterCommand command; command.ast = command_ast->clone(); + command.statistic_decl = command_ast->statistic_decl; command.type = AlterCommand::DROP_STATISTIC; - const auto & ast_stat_decl = command_ast->statistic_decl->as<ASTStatisticDeclaration &>(); + const auto & ast_stat_decl = command_ast->statistic_decl->as<ASTStatisticsDeclaration &>(); command.statistic_columns = ast_stat_decl.getColumnNames(); - command.statistic_type = ast_stat_decl.type; command.if_exists = command_ast->if_exists; command.clear =
command_ast->clear_statistic; @@ -626,35 +641,49 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) { if (!metadata.columns.has(statistic_column_name)) { - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Cannot add statistic {} with type {}: this column is not found", statistic_column_name, statistic_type); + throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Cannot add statistic for column {}: this column is not found", statistic_column_name); } - if (!if_exists && metadata.columns.get(statistic_column_name).stat) - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Cannot add statistic {} with type {}: statistic on this column with this type already exists", statistic_column_name, statistic_type); } - auto stats = StatisticDescription::getStatisticsFromAST(statistic_decl, metadata.columns); - for (auto && stat : stats) + auto stats_vec = StatisticsDescription::getStatisticsFromAST(statistic_decl, metadata.columns); + for (const auto & stats : stats_vec) { - metadata.columns.modify(stat.column_name, - [&](ColumnDescription & column) { column.stat = std::move(stat); }); + metadata.columns.modify(stats.column_name, + [&](ColumnDescription & column) { column.stats.merge(stats, column, if_not_exists); }); } } else if (type == DROP_STATISTIC) { - for (const auto & stat_column_name : statistic_columns) + for (const auto & statistic_column_name : statistic_columns) { - if (!metadata.columns.has(stat_column_name) || !metadata.columns.get(stat_column_name).stat) + if (!metadata.columns.has(statistic_column_name)) { if (if_exists) return; - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Wrong statistic name. Cannot find statistic {} with type {} to drop", backQuote(stat_column_name), statistic_type); + throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Wrong statistic name. 
Cannot find statistic {} to drop", backQuote(statistic_column_name)); } - if (!partition && !clear) + + if (!clear && !partition) + metadata.columns.modify(statistic_column_name, + [&](ColumnDescription & column) { column.stats.clear(); }); + } + } + else if (type == MODIFY_STATISTIC) + { + for (const auto & statistic_column_name : statistic_columns) + { + if (!metadata.columns.has(statistic_column_name)) { - metadata.columns.modify(stat_column_name, - [&](ColumnDescription & column) { column.stat = std::nullopt; }); + throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Cannot add statistic for column {}: this column is not found", statistic_column_name); } } + + auto stats_vec = StatisticsDescription::getStatisticsFromAST(statistic_decl, metadata.columns); + for (const auto & stats : stats_vec) + { + metadata.columns.modify(stats.column_name, + [&](ColumnDescription & column) { column.stats.modify(stats); }); + } } else if (type == ADD_CONSTRAINT) { @@ -773,8 +802,8 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) rename_visitor.visit(column_to_modify.default_desc.expression); if (column_to_modify.ttl) rename_visitor.visit(column_to_modify.ttl); - if (column_to_modify.name == column_name && column_to_modify.stat) - column_to_modify.stat->column_name = rename_to; + if (column_to_modify.name == column_name && !column_to_modify.stats.empty()) + column_to_modify.stats.column_name = rename_to; }); } if (metadata.table_ttl.definition_ast) diff --git a/src/Storages/AlterCommands.h b/src/Storages/AlterCommands.h index 26c20995991..5a5d77a0670 100644 --- a/src/Storages/AlterCommands.h +++ b/src/Storages/AlterCommands.h @@ -40,6 +40,7 @@ struct AlterCommand DROP_PROJECTION, ADD_STATISTIC, DROP_STATISTIC, + MODIFY_STATISTIC, MODIFY_TTL, MODIFY_SETTING, RESET_SETTING, @@ -122,7 +123,7 @@ struct AlterCommand ASTPtr statistic_decl = nullptr; std::vector<String> statistic_columns; - String statistic_type; + std::vector<String> statistic_types; /// For MODIFY TTL ASTPtr ttl = nullptr; diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 33d8b309750..00cd3669a63 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -60,7 +60,7 @@ bool ColumnDescription::operator==(const ColumnDescription & other) const return name == other.name && type->equals(*other.type) && default_desc == other.default_desc - && stat == other.stat + && stats == other.stats && ast_to_str(codec) == ast_to_str(other.codec) && ast_to_str(ttl) == ast_to_str(other.ttl); } @@ -94,10 +94,10 @@ void ColumnDescription::writeText(WriteBuffer & buf) const writeEscapedString(queryToString(codec), buf); } - if (stat) + if (!stats.empty()) { writeChar('\t', buf); - writeEscapedString(queryToString(stat->ast), buf); + writeEscapedString(queryToString(stats.getAST()), buf); } if (ttl) diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index 4de8aa11de3..0e6709262af 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -84,7 +84,7 @@ struct ColumnDescription String comment; ASTPtr codec; ASTPtr ttl; - std::optional<StatisticDescription> stat; + StatisticsDescription stats; ColumnDescription() = default; ColumnDescription(ColumnDescription &&) = default; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 87f23b0da2a..cb12379529a 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -629,13 +629,13 @@ String IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize(bool with_subc return *minimum_size_column; } -Statistics IMergeTreeDataPart::loadStatistics() const +std::vector IMergeTreeDataPart::loadStatistics() const { const auto & metadata_snaphost = storage.getInMemoryMetadata(); auto total_statistics = MergeTreeStatisticsFactory::instance().getMany(metadata_snaphost.getColumns()); - Statistics result; + std::vector result; for (auto & stat : total_statistics) { String file_name = stat->getFileName() + STAT_FILE_SUFFIX; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 640a1f1d0a3..91350ef695a 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -104,7 +104,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector<MergeTreeIndexPtr> & indices_to_recalc, - const Statistics & stats_to_recalc_, + const std::vector & stats_to_recalc_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) = 0; @@ -170,7 +170,7 @@ public: void remove(); - Statistics loadStatistics() const; + std::vector loadStatistics() const; /// Initialize columns (from columns.txt if exists, or create from column files if not). /// Load various metadata into memory: checksums from checksums.txt, index if required, etc. diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index a8b657d0e3e..0afd4ddc760 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -609,7 +609,7 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const /// because all of them were already recalculated and written /// as key part of vertical merge std::vector<MergeTreeIndexPtr>{}, - std::vector{}, /// TODO: think about it + std::vector{}, /// TODO(hanfei) &global_ctx->written_offset_columns, global_ctx->to->getIndexGranularity());
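Statistics are stored per part, and `loadStatistics()` reads them back when the condition estimator is built (see the MergeTreeData.cpp hunk that follows). Parts created before an `ADD STATISTIC` therefore need an explicit rebuild; a sketch, using synchronous mutations (hypothetical table, and it assumes `allow_experimental_statistic` is already enabled as in the earlier sketch):

```sql
ALTER TABLE example_table ADD STATISTIC b TYPE tdigest, uniq;

-- Existing parts have no statistic files for `b` yet; rebuild them and wait.
SET mutations_sync = 2;
ALTER TABLE example_table MATERIALIZE STATISTIC b;

-- Parts written by new INSERTs (and by merges) get the files automatically.
INSERT INTO example_table SELECT number, number / 3 FROM numbers(1000);
```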
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 450bf10bdcb..078563b5f65 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -470,7 +470,7 @@ ConditionEstimator MergeTreeData::getConditionEstimatorByPredicate(const SelectQ { auto stats = part->loadStatistics(); /// TODO: We only have one stats file for every part. - for (const auto & stat : stats) + for (const auto stat : stats) result.merge(part->info.getPartNameV1(), part->rows_count, stat); } } @@ -663,8 +663,8 @@ void MergeTreeData::checkProperties( for (const auto & col : new_metadata.columns) { - if (col.stat) - MergeTreeStatisticsFactory::instance().validate(*col.stat, col.type); + if (!col.stats.empty()) + MergeTreeStatisticsFactory::instance().validate(col.stats, col.type); } checkKeyExpression(*new_sorting_key.expression, new_sorting_key.sample_block, "Sorting", allow_nullable_key_); @@ -3194,7 +3194,7 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context new_metadata.getColumns().getPhysical(command.column_name)); const auto & old_column = old_metadata.getColumns().get(command.column_name); - if (old_column.stat) + if (!old_column.stats.empty()) { const auto & new_column = new_metadata.getColumns().get(command.column_name); if (!old_column.type->equals(*new_column.type)) @@ -8290,7 +8290,7 @@ std::pair MergeTreeData::createE const auto & index_factory = MergeTreeIndexFactory::instance(); MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, index_factory.getMany(metadata_snapshot->getSecondaryIndices()), - Statistics{}, + std::vector{}, compression_codec, txn); bool sync_on_insert = settings->fsync_after_insert; diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 0ecd7abe183..f1c6b0b0ec2 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -53,7 +53,7 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartCompact::getWriter( const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector<MergeTreeIndexPtr> & indices_to_recalc, - const Statistics & stats_to_recalc_, + const std::vector & stats_to_recalc_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index 35a358b3720..d2096d6158e 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -43,7 +43,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector<MergeTreeIndexPtr> & indices_to_recalc, - const Statistics & stats_to_recalc_, + const std::vector & stats_to_recalc_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp index 2f01dbfe04b..1add899e94c 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.cpp @@ -51,7 +51,7 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartInMemory::getWriter( const NamesAndTypesList &, const StorageMetadataPtr &, const std::vector<MergeTreeIndexPtr> &, - const Statistics &, + const std::vector &, const CompressionCodecPtr &, const MergeTreeWriterSettings &, const MergeTreeIndexGranularity &) diff --git a/src/Storages/MergeTree/MergeTreeDataPartInMemory.h b/src/Storages/MergeTree/MergeTreeDataPartInMemory.h index 27f8ba4bccb..7f2c099bf6e 100644 ---
a/src/Storages/MergeTree/MergeTreeDataPartInMemory.h +++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.h @@ -32,7 +32,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector<MergeTreeIndexPtr> & indices_to_recalc, - const Statistics & stats_to_recalc_, + const std::vector & stats_to_recalc_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index dc6c1f0019d..7b2f00af0de 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -50,7 +50,7 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartWide::getWriter( const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector<MergeTreeIndexPtr> & indices_to_recalc, - const Statistics & stats_to_recalc_, + const std::vector & stats_to_recalc_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.h b/src/Storages/MergeTree/MergeTreeDataPartWide.h index 14147c4ad56..1242bd5e00f 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.h @@ -38,7 +38,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector<MergeTreeIndexPtr> & indices_to_recalc, - const Statistics & stats_to_recalc_, + const std::vector & stats_to_recalc_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index b05b4584259..bc1616d084e 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -24,7 +24,7 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const std::vector<MergeTreeIndexPtr> & indices_to_recalc_, - const Statistics & stats_to_recalc, + const std::vector & stats_to_recalc, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h index ddb6178dce6..81bf3d39f97 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h @@ -15,7 +15,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, const std::vector<MergeTreeIndexPtr> & indices_to_recalc, - const Statistics & stats_to_recalc, + const std::vector & stats_to_recalc, const String & marks_file_extension, const CompressionCodecPtr & default_codec, const MergeTreeWriterSettings & settings, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index 6e544b4a35a..c6823f93b0a 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -136,7 +136,7 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( const NamesAndTypesList 
& columns_list_, const StorageMetadataPtr & metadata_snapshot_, const MergeTreeIndices & indices_to_recalc_, - const Statistics & stats_to_recalc_, + const std::vector & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h index 4d081778e68..7f96ceedb36 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h @@ -108,7 +108,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, const std::vector<MergeTreeIndexPtr> & indices_to_recalc, - const Statistics & stats_to_recalc_, + const std::vector & stats_to_recalc_, const String & marks_file_extension, const CompressionCodecPtr & default_codec, const MergeTreeWriterSettings & settings, @@ -152,7 +152,7 @@ protected: const MergeTreeIndices skip_indices; - const Statistics stats; + const std::vector stats; std::vector stats_streams; const String marks_file_extension; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index d86ff3a17ff..16a400a5398 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -81,7 +81,7 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const std::vector<MergeTreeIndexPtr> & indices_to_recalc_, - const Statistics & stats_to_recalc_, + const std::vector & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index ae40eb03649..25765ca7f73 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -22,7 +22,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector<MergeTreeIndexPtr> & indices_to_recalc, - const Statistics & stats_to_recalc_, + const std::vector & stats_to_recalc_, const String & marks_file_extension, const CompressionCodecPtr & default_codec, const MergeTreeWriterSettings & settings, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index f63394a4d48..3d03d41375d 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -702,7 +702,8 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( metadata_snapshot, columns, MergeTreeIndices{}, - Statistics{}, /// TODO(hanfei): It should be helpful to write statistics for projection result. + /// TODO(hanfei): It should be helpful to write statistics for projection result. 
+ std::vector{}, compression_codec, NO_TRANSACTION_PTR, false, false, data.getContext()->getWriteSettings()); diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 0cac051bb2c..e4f4b5d9f2a 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -272,10 +272,10 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const RPNBuilderTree { cond.good = cond.viable; - cond.selectivity = estimator.estimateSelectivity(node); + cond.estimated_row_count = estimator.estimateRowCount(node); if (node.getASTNode() != nullptr) - LOG_TEST(log, "Condition {} has selectivity {}", node.getASTNode()->dumpTree(), cond.selectivity); + LOG_DEBUG(log, "Condition {} has estimated row count {}", node.getASTNode()->dumpTree(), cond.estimated_row_count); } if (where_optimizer_context.move_primary_key_columns_to_end_of_prewhere) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 0ef7ac9efff..b561938c817 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -75,7 +75,7 @@ private: bool good = false; /// the lower the better - Float64 selectivity = 1.0; + Float64 estimated_row_count = 0; /// Does the condition contain primary key column? /// If so, it is better to move it further to the end of PREWHERE chain depending on minimal position in PK of any @@ -84,7 +84,7 @@ private: auto tuple() const { - return std::make_tuple(!viable, !good, -min_position_in_primary_key, selectivity, columns_size, table_columns.size()); + return std::make_tuple(!viable, !good, -min_position_in_primary_key, estimated_row_count, columns_size, table_columns.size()); } /// Is condition a better candidate for moving to PREWHERE? 
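With the optimizer now ranking conditions by estimated row count rather than a selectivity fraction (lower is better, per the `tuple()` comparator above), the per-condition estimates can be observed via the new `LOG_DEBUG` line; a sketch, assuming a table with materialized tdigest/uniq statistics on both columns:

```sql
SET allow_statistic_optimize = 1;
SET send_logs_level = 'debug';

-- The condition with the smaller estimated row count (here, presumably b = 10)
-- is preferred when deciding what to move to PREWHERE.
SELECT count() FROM example_table WHERE a < 1000000 AND b = 10;
```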
diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 8b34c221eec..55978ca1978 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -19,7 +19,7 @@ MergedBlockOutputStream::MergedBlockOutputStream( const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & columns_list_, const MergeTreeIndices & skip_indices, - const Statistics & statistics, + const std::vector & statistics, CompressionCodecPtr default_codec_, const MergeTreeTransactionPtr & txn, bool reset_columns_, diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.h b/src/Storages/MergeTree/MergedBlockOutputStream.h index 540b3b3bffa..0d6c76794bd 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.h +++ b/src/Storages/MergeTree/MergedBlockOutputStream.h @@ -20,7 +20,7 @@ public: const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & columns_list_, const MergeTreeIndices & skip_indices, - const Statistics & statistics, + const std::vector & statistics, CompressionCodecPtr default_codec_, const MergeTreeTransactionPtr & txn, bool reset_columns_ = false, diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp index 728b2e38833..74f6eb020b3 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp @@ -16,7 +16,7 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( const Block & header_, CompressionCodecPtr default_codec, const MergeTreeIndices & indices_to_recalc, - const Statistics & stats_to_recalc_, + const std::vector & stats_to_recalc_, WrittenOffsetColumns * offset_columns_, const MergeTreeIndexGranularity & index_granularity, const MergeTreeIndexGranularityInfo * index_granularity_info) diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h index ad3cabe459e..c734acf71c7 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h @@ -20,7 +20,7 @@ public: const Block & header_, CompressionCodecPtr default_codec_, const MergeTreeIndices & indices_to_recalc_, - const Statistics & stats_to_recalc_, + const std::vector & stats_to_recalc_, WrittenOffsetColumns * offset_columns_ = nullptr, const MergeTreeIndexGranularity & index_granularity = {}, const MergeTreeIndexGranularityInfo * index_granularity_info_ = nullptr); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 8c896edab14..1c7849e6950 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -462,16 +462,16 @@ static ExecuteTTLType shouldExecuteTTL(const StorageMetadataPtr & metadata_snaps return has_ttl_expression ? 
ExecuteTTLType::RECALCULATE : ExecuteTTLType::NONE; } -static std::set getStatisticsToRecalculate(const StorageMetadataPtr & metadata_snapshot, const NameSet & materialized_stats) +static std::set getStatisticsToRecalculate(const StorageMetadataPtr & metadata_snapshot, const NameSet & materialized_stats) { const auto & stats_factory = MergeTreeStatisticsFactory::instance(); - std::set stats_to_recalc; + std::set stats_to_recalc; const auto & columns = metadata_snapshot->getColumns(); for (const auto & col_desc : columns) { - if (col_desc.stat && materialized_stats.contains(col_desc.name)) + if (!col_desc.stats.empty() && materialized_stats.contains(col_desc.name)) { - stats_to_recalc.insert(stats_factory.get(*col_desc.stat)); + stats_to_recalc.insert(stats_factory.get(col_desc.stats)); } } return stats_to_recalc; @@ -583,7 +583,7 @@ static NameSet collectFilesToSkip( const std::set & indices_to_recalc, const String & mrk_extension, const std::set & projections_to_recalc, - const std::set & stats_to_recalc) + const std::set & stats_to_recalc) { NameSet files_to_skip = source_part->getFileNamesWithoutChecksums(); @@ -939,7 +939,7 @@ struct MutationContext IMergeTreeDataPart::MinMaxIndexPtr minmax_idx{nullptr}; std::set indices_to_recalc; - std::set stats_to_recalc; + std::set stats_to_recalc; std::set projections_to_recalc; MergeTreeData::DataPart::Checksums existing_indices_stats_checksums; NameSet files_to_skip; @@ -1409,16 +1409,16 @@ private: } } - Statistics stats_to_rewrite; + std::vector stats_to_rewrite; const auto & columns = ctx->metadata_snapshot->getColumns(); for (const auto & col : columns) { - if (!col.stat || removed_stats.contains(col.name)) + if (col.stats.empty() || removed_stats.contains(col.name)) continue; if (ctx->materialized_statistics.contains(col.name)) { - stats_to_rewrite.push_back(MergeTreeStatisticsFactory::instance().get(*col.stat)); + stats_to_rewrite.push_back(MergeTreeStatisticsFactory::instance().get(col.stats)); } else { @@ -1771,7 +1771,7 @@ private: ctx->updated_header, ctx->compression_codec, std::vector(ctx->indices_to_recalc.begin(), ctx->indices_to_recalc.end()), - Statistics(ctx->stats_to_recalc.begin(), ctx->stats_to_recalc.end()), + std::vector(ctx->stats_to_recalc.begin(), ctx->stats_to_recalc.end()), nullptr, ctx->source_part->index_granularity, &ctx->source_part->index_granularity_info diff --git a/src/Storages/MutationCommands.cpp b/src/Storages/MutationCommands.cpp index 36388a32b41..f27f7adc9dd 100644 --- a/src/Storages/MutationCommands.cpp +++ b/src/Storages/MutationCommands.cpp @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include @@ -85,7 +85,7 @@ std::optional MutationCommand::parse(ASTAlterCommand * command, res.type = MATERIALIZE_STATISTIC; res.partition = command->partition; res.predicate = nullptr; - res.statistic_columns = command->statistic_decl->as().getColumnNames(); + res.statistic_columns = command->statistic_decl->as().getColumnNames(); return res; } else if (command->type == ASTAlterCommand::MATERIALIZE_PROJECTION) @@ -151,7 +151,8 @@ std::optional MutationCommand::parse(ASTAlterCommand * command, res.partition = command->partition; if (command->clear_index) res.clear = true; - res.statistic_columns = command->statistic_decl->as().getColumnNames(); + res.statistic_columns = command->statistic_decl->as().getColumnNames(); + res.statistic_types = command->statistic_decl->as().getTypeNames(); return res; } else if (parse_alter_commands && command->type == ASTAlterCommand::DROP_PROJECTION) 
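The getStatisticsToRecalculate() hunk above encodes a simple rule: a column is recalculated only if its table metadata declares at least one statistic (col_desc.stats is non-empty) and the MATERIALIZE STATISTIC command names that column. A hedged sketch of the rule with simplified types (ColumnDesc and statisticsToRecalculate are illustrative only; the real code inserts factory-built objects rather than column names):

#include <set>
#include <string>
#include <vector>

/// Illustrative stand-in for ColumnDescription: a name plus declared statistic types.
struct ColumnDesc
{
    std::string name;
    std::vector<std::string> stats;  /// e.g. {"tdigest", "uniq"}; empty means none declared
};

std::set<std::string> statisticsToRecalculate(
    const std::vector<ColumnDesc> & columns,
    const std::set<std::string> & materialized_stats)
{
    std::set<std::string> to_recalc;
    for (const auto & col : columns)
        if (!col.stats.empty() && materialized_stats.contains(col.name))
            to_recalc.insert(col.name);  /// the real code inserts MergeTreeStatisticsFactory products
    return to_recalc;
}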
diff --git a/src/Storages/MutationCommands.h b/src/Storages/MutationCommands.h index 6e10f7d9b2d..9d5e02db1b4 100644 --- a/src/Storages/MutationCommands.h +++ b/src/Storages/MutationCommands.h @@ -55,6 +55,7 @@ struct MutationCommand String index_name = {}; String projection_name = {}; std::vector statistic_columns = {}; + std::vector statistic_types = {}; /// For MATERIALIZE INDEX, UPDATE and DELETE. ASTPtr partition = {}; diff --git a/src/Storages/Statistics/Estimator.cpp b/src/Storages/Statistics/Estimator.cpp index 7e0e465c7bf..34a0c61aeda 100644 --- a/src/Storages/Statistics/Estimator.cpp +++ b/src/Storages/Statistics/Estimator.cpp @@ -4,6 +4,56 @@ namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +void ConditionEstimator::ColumnEstimator::merge(std::string part_name, ColumnStatisticsPtr stats) +{ + if (estimators.contains(part_name)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "part {} has been added in column {}", part_name, stats->columnName()); + estimators[part_name] = stats; +} + +Float64 ConditionEstimator::ColumnEstimator::estimateLess(Float64 val, Float64 total) const +{ + if (estimators.empty()) + return default_normal_cond_factor * total; + Float64 result = 0; + Float64 partial_cnt = 0; + for (const auto & [key, estimator] : estimators) + { + result += estimator->estimateLess(val); + partial_cnt += estimator->count(); + } + return result * total / partial_cnt; +} + +Float64 ConditionEstimator::ColumnEstimator::estimateGreater(Float64 val, Float64 total) const +{ + return total - estimateLess(val, total); +} + +Float64 ConditionEstimator::ColumnEstimator::estimateEqual(Float64 val, Float64 total) const +{ + if (estimators.empty()) + { + if (val < - threshold || val > threshold) + return default_normal_cond_factor * total; + else + return default_good_cond_factor * total; + } + Float64 result = 0; + Float64 partial_cnt = 0; + for (const auto & [key, estimator] : estimators) + { + result += estimator->estimateEqual(val); + partial_cnt += estimator->count(); + } + return result * total / partial_cnt; +} + /// second return value represents how many columns in the node. 
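/// Editorial sketch (not part of the patch): the estimateLess()/estimateEqual()
/// bodies above extrapolate from the parts that actually carry statistics to the
/// whole table. Only covered parts contribute to partial_cnt, so the summed
/// per-part estimate is scaled up, assuming the covered parts are representative.
/// The helper name below is hypothetical; `double` stands in for DB::Float64.
static double extrapolateEstimate(double summed_estimate, double covered_rows, double total_rows)
{
    /// e.g. extrapolateEstimate(250, 1000, 4000) == 1000: if 250 of the 1000
    /// covered rows match, the same ratio is assumed for all 4000 rows.
    /// Callers must ensure covered_rows != 0, which the code above does by
    /// returning default_normal_cond_factor * total when `estimators` is empty.
    return summed_estimate * total_rows / covered_rows;
}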
static std::pair tryToExtractSingleColumn(const RPNBuilderTreeNode & node) { @@ -87,7 +137,7 @@ std::pair ConditionEstimator::extractBinaryOp(const RPNBui return std::make_pair(function_name, value); } -Float64 ConditionEstimator::estimateSelectivity(const RPNBuilderTreeNode & node) const +Float64 ConditionEstimator::estimateRowCount(const RPNBuilderTreeNode & node) const { auto result = tryToExtractSingleColumn(node); if (result.second != 1) @@ -112,26 +162,40 @@ Float64 ConditionEstimator::estimateSelectivity(const RPNBuilderTreeNode & node) auto [op, val] = extractBinaryOp(node, col); if (op == "equals") { - if (val < - threshold || val > threshold) - return default_normal_cond_factor; - else - return default_good_cond_factor; + if (dummy) + { + if (val < - threshold || val > threshold) + return default_normal_cond_factor * total_count; + else + return default_good_cond_factor * total_count; + } + return estimator.estimateEqual(val, total_count); } else if (op == "less" || op == "lessThan") { if (dummy) - return default_normal_cond_factor; - return estimator.estimateLess(val) / total_count; + return default_normal_cond_factor * total_count; + return estimator.estimateLess(val, total_count); } else if (op == "greater" || op == "greaterThan") { if (dummy) - return default_normal_cond_factor; - return estimator.estimateGreater(val) / total_count; + return default_normal_cond_factor * total_count; + return estimator.estimateGreater(val, total_count); } else - return default_unknown_cond_factor; + return default_unknown_cond_factor * total_count; } +void ConditionEstimator::merge(std::string part_name, UInt64 part_count, ColumnStatisticsPtr column_stat) +{ + if (!part_names.contains(part_name)) + { + total_count += part_count; + part_names.insert(part_name); + } + if (column_stat != nullptr) + column_estimators[column_stat->columnName()].merge(part_name, column_stat); +} } diff --git a/src/Storages/Statistics/Estimator.h b/src/Storages/Statistics/Estimator.h index 903bb57eb80..e7f8316e2bc 100644 --- a/src/Storages/Statistics/Estimator.h +++ b/src/Storages/Statistics/Estimator.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB { @@ -11,7 +11,7 @@ class RPNBuilderTreeNode; class ConditionEstimator { private: - + friend class ColumnStatistics; static constexpr auto default_good_cond_factor = 0.1; static constexpr auto default_normal_cond_factor = 0.5; static constexpr auto default_unknown_cond_factor = 1.0; @@ -21,75 +21,23 @@ private: UInt64 total_count = 0; - /// Minimum estimator for values in a part. It can contains multiple types of statistics. - /// But right now we only have tdigest; - struct PartColumnEstimator - { - UInt64 part_count = 0; - - std::shared_ptr tdigest; - - void merge(StatisticPtr statistic) - { - UInt64 cur_part_count = statistic->count(); - if (part_count == 0) - part_count = cur_part_count; - - if (typeid_cast(statistic.get())) - { - tdigest = std::static_pointer_cast(statistic); - } - } - - Float64 estimateLess(Float64 val) const - { - if (tdigest != nullptr) - return tdigest -> estimateLess(val); - return part_count * default_normal_cond_factor; - } - - Float64 estimateGreator(Float64 val) const - { - if (tdigest != nullptr) - return part_count - tdigest -> estimateLess(val); - return part_count * default_normal_cond_factor; - } - }; - /// An estimator for a column consists of several PartColumnEstimator. /// We simply get selectivity for every part estimator and combine the result. 
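/// Editorial sketch (not part of the patch): the rename above also changes the
/// estimator's contract. estimateSelectivity() returned a fraction in [0, 1],
/// while estimateRowCount() returns an absolute row count, which is why every
/// default branch now multiplies the old condition factor by total_count.
/// The helper name below is hypothetical; `double` stands in for DB::Float64.
static double defaultEstimate(double cond_factor, double total_count)
{
    /// old contract: return cond_factor;                e.g. 0.5
    /// new contract: return cond_factor * total_count;  e.g. 0.5 * 10000 = 5000 rows
    return cond_factor * total_count;
}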
struct ColumnEstimator { - std::map estimators; + std::map estimators; - void merge(std::string part_name, StatisticPtr statistic) - { - estimators[part_name].merge(statistic); - } + void merge(std::string part_name, ColumnStatisticsPtr stats); - Float64 estimateLess(Float64 val) const - { - if (estimators.empty()) - return default_normal_cond_factor; - Float64 result = 0; - for (const auto & [key, estimator] : estimators) - result += estimator.estimateLess(val); - return result; - } + Float64 estimateLess(Float64 val, Float64 total) const; - Float64 estimateGreater(Float64 val) const - { - if (estimators.empty()) - return default_normal_cond_factor; - Float64 result = 0; - for (const auto & [key, estimator] : estimators) - result += estimator.estimateGreator(val); - return result; - } + Float64 estimateGreater(Float64 val, Float64 total) const; + + Float64 estimateEqual(Float64 val, Float64 total) const; }; + std::set part_names; std::map column_estimators; - /// std::optional extractSingleColumn(const RPNBuilderTreeNode & node) const; std::pair extractBinaryOp(const RPNBuilderTreeNode & node, const std::string & column_name) const; public: @@ -97,15 +45,9 @@ public: /// TODO: Support the condition consists of CNF/DNF like (cond1 and cond2) or (cond3) ... /// Right now we only support simple condition like col = val / col < val - Float64 estimateSelectivity(const RPNBuilderTreeNode & node) const; + Float64 estimateRowCount(const RPNBuilderTreeNode & node) const; - void merge(std::string part_name, UInt64 part_count, StatisticPtr statistic) - { - total_count += part_count; - if (statistic != nullptr) - column_estimators[statistic->columnName()].merge(part_name, statistic); - } + void merge(std::string part_name, UInt64 part_count, ColumnStatisticsPtr column_stat); }; - } diff --git a/src/Storages/Statistics/Statistics.cpp b/src/Storages/Statistics/Statistics.cpp index 6619eac19dc..fa9058e8e7f 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -3,9 +3,13 @@ #include #include +#include #include +#include #include #include +#include +#include #include namespace DB @@ -18,6 +22,99 @@ namespace ErrorCodes extern const int ILLEGAL_STATISTIC; } +enum StatisticsFileVersion : UInt16 +{ + V0 = 0, +}; + +/// Version / bitmask of statistics / data of statistics / + +ColumnStatistics::ColumnStatistics(const StatisticsDescription & stats_desc_) + : stats_desc(stats_desc_), counter(0) +{ +} + +void ColumnStatistics::update(const ColumnPtr & column) +{ + counter += column->size(); + for (auto iter : stats) + { + iter.second->update(column); + } +} + +Float64 ColumnStatistics::estimateLess(Float64 val) const +{ + if (stats.contains(TDigest)) + return std::static_pointer_cast(stats.at(TDigest))->estimateLess(val); + return counter * ConditionEstimator::default_normal_cond_factor; +} + +Float64 ColumnStatistics::estimateGreater(Float64 val) const +{ + return counter - estimateLess(val); +} + +Float64 ColumnStatistics::estimateEqual(Float64 val) const +{ + if (stats.contains(Uniq) && stats.contains(TDigest)) + { + auto uniq_static = std::static_pointer_cast(stats.at(Uniq)); + Int64 ndv = uniq_static->getNDV(); + if (ndv < 2048) + { + auto tdigest_static = std::static_pointer_cast(stats.at(TDigest)); + return tdigest_static->estimateEqual(val); + } + } + if (val < - ConditionEstimator::threshold || val > ConditionEstimator::threshold) + return counter * ConditionEstimator::default_normal_cond_factor; + else + return counter * 
ConditionEstimator::default_good_cond_factor; +} + +void ColumnStatistics::serialize(WriteBuffer & buf) +{ + writeIntBinary(V0, buf); + UInt64 stat_types_mask = 0; + for (const auto & [type, _]: stats) + { + stat_types_mask |= 1 << type; + } + writeIntBinary(stat_types_mask, buf); + /// We write some basic statistics + writeIntBinary(counter, buf); + /// We write complex statistics + for (const auto & [type, stat_ptr]: stats) + { + stat_ptr->serialize(buf); + } +} + +void ColumnStatistics::deserialize(ReadBuffer &buf) +{ + UInt16 version; + readIntBinary(version, buf); + if (version != V0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown file format version: {}", version); + + UInt64 stat_types_mask = 0; + readIntBinary(stat_types_mask, buf); + readIntBinary(counter, buf); + for (auto it = stats.begin(); it != stats.end();) + { + if (!(stat_types_mask & 1 << (it->first))) + { + stats.erase(it ++); + } + else + { + it->second->deserialize(buf); + ++ it; + } + } +} + void MergeTreeStatisticsFactory::registerCreator(StatisticType stat_type, Creator creator) { if (!creators.emplace(stat_type, std::move(creator)).second) @@ -31,7 +128,7 @@ void MergeTreeStatisticsFactory::registerValidator(StatisticType stat_type, Vali } -StatisticPtr TDigestCreator(const StatisticDescription & stat) +StatisticPtr TDigestCreator(const StatisticDescription & stat, DataTypePtr) { return StatisticPtr(new TDigestStatistic(stat)); } @@ -43,11 +140,22 @@ void TDigestValidator(const StatisticDescription &, DataTypePtr data_type) throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "TDigest does not support type {}", data_type->getName()); } +void UniqValidator(const StatisticDescription &, DataTypePtr) +{ + /// TODO(hanfei): check something +} + +StatisticPtr UniqCreator(const StatisticDescription & stat, DataTypePtr data_type) +{ + return StatisticPtr(new UniqStatistic(stat, data_type)); +} MergeTreeStatisticsFactory::MergeTreeStatisticsFactory() { registerCreator(TDigest, TDigestCreator); + registerCreator(Uniq, UniqCreator); registerValidator(TDigest, TDigestValidator); + registerValidator(Uniq, UniqValidator); } MergeTreeStatisticsFactory & MergeTreeStatisticsFactory::instance() @@ -56,33 +164,42 @@ MergeTreeStatisticsFactory & MergeTreeStatisticsFactory::instance() return instance; } -void MergeTreeStatisticsFactory::validate(const StatisticDescription & stat, DataTypePtr data_type) const +void MergeTreeStatisticsFactory::validate(const StatisticsDescription & stats, DataTypePtr data_type) const { - auto it = validators.find(stat.type); - if (it == validators.end()) + for (const auto & [type, desc] : stats.stats) { - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown Statistic type '{}'", stat.type); + auto it = validators.find(type); + if (it == validators.end()) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown Statistic type '{}'", type); + } + it->second(desc, data_type); } - it->second(stat, data_type); } -StatisticPtr MergeTreeStatisticsFactory::get(const StatisticDescription & stat) const +ColumnStatisticsPtr MergeTreeStatisticsFactory::get(const StatisticsDescription & stats) const { - auto it = creators.find(stat.type); - if (it == creators.end()) + ColumnStatisticsPtr column_stat = std::make_shared(stats); + for (const auto & [type, desc] : stats.stats) { - throw Exception(ErrorCodes::INCORRECT_QUERY, - "Unknown Statistic type '{}'. 
Available types: tdigest", stat.type); + auto it = creators.find(type); + if (it == creators.end()) + { + throw Exception(ErrorCodes::INCORRECT_QUERY, + "Unknown Statistic type '{}'. Available types: tdigest", type); + } + auto stat_ptr = (it->second)(desc, stats.data_type); + column_stat->stats[type] = stat_ptr; } - return std::make_shared(stat); + return column_stat; } -Statistics MergeTreeStatisticsFactory::getMany(const ColumnsDescription & columns) const +std::vector MergeTreeStatisticsFactory::getMany(const ColumnsDescription & columns) const { - Statistics result; + std::vector result; for (const auto & col : columns) - if (col.stat) - result.push_back(get(*col.stat)); + if (!col.stats.empty()) + result.push_back(get(col.stats)); return result; } diff --git a/src/Storages/Statistics/Statistics.h b/src/Storages/Statistics/Statistics.h index e6d9666ce1c..f6cf3c90e92 100644 --- a/src/Storages/Statistics/Statistics.h +++ b/src/Storages/Statistics/Statistics.h @@ -6,7 +6,6 @@ #include -#include #include #include #include @@ -23,7 +22,7 @@ namespace DB class IStatistic; using StatisticPtr = std::shared_ptr; -using Statistics = std::vector; +/// using Statistics = std::vector; /// Statistic contains the distribution of values in a column. /// right now we support @@ -37,6 +36,34 @@ public: } virtual ~IStatistic() = default; + virtual void serialize(WriteBuffer & buf) = 0; + + virtual void deserialize(ReadBuffer & buf) = 0; + + virtual void update(const ColumnPtr & column) = 0; + + /// how many rows this statistics contain + /// virtual UInt64 count() = 0; + +protected: + + StatisticDescription stat; + +}; + +class ColumnStatistics; +using ColumnStatisticsPtr = std::shared_ptr; + +class ColumnStatistics +{ + friend class MergeTreeStatisticsFactory; + StatisticsDescription stats_desc; + std::map stats; + UInt64 counter; +public: + explicit ColumnStatistics(const StatisticsDescription & stats_); + void serialize(WriteBuffer & buf); + void deserialize(ReadBuffer & buf); String getFileName() const { return STAT_FILE_PREFIX + columnName(); @@ -44,21 +71,20 @@ public: const String & columnName() const { - return stat.column_name; + return stats_desc.column_name; } - virtual void serialize(WriteBuffer & buf) = 0; + UInt64 count() const { return counter; } - virtual void deserialize(ReadBuffer & buf) = 0; + void update(const ColumnPtr & column); - virtual void update(const ColumnPtr & column) = 0; + /// void merge(ColumnStatisticsPtr other_column_stats); - virtual UInt64 count() = 0; + Float64 estimateLess(Float64 val) const; -protected: - - StatisticDescription stat; + Float64 estimateGreater(Float64 val) const; + Float64 estimateEqual(Float64 val) const; }; class ColumnsDescription; @@ -68,15 +94,15 @@ class MergeTreeStatisticsFactory : private boost::noncopyable public: static MergeTreeStatisticsFactory & instance(); - void validate(const StatisticDescription & stat, DataTypePtr data_type) const; + void validate(const StatisticsDescription & stats, DataTypePtr data_type) const; - using Creator = std::function; + using Creator = std::function; using Validator = std::function; - StatisticPtr get(const StatisticDescription & stat) const; + ColumnStatisticsPtr get(const StatisticsDescription & stat) const; - Statistics getMany(const ColumnsDescription & columns) const; + std::vector getMany(const ColumnsDescription & columns) const; void registerCreator(StatisticType type, Creator creator); void registerValidator(StatisticType type, Validator validator); diff --git 
a/src/Storages/Statistics/TDigestStatistic.cpp b/src/Storages/Statistics/TDigestStatistic.cpp index efb4282d203..a3353595216 100644 --- a/src/Storages/Statistics/TDigestStatistic.cpp +++ b/src/Storages/Statistics/TDigestStatistic.cpp @@ -8,6 +8,11 @@ Float64 TDigestStatistic::estimateLess(Float64 val) const return data.getCountLessThan(val); } +Float64 TDigestStatistic::estimateEqual(Float64 val) const +{ + return data.getCountEqual(val); +} + void TDigestStatistic::serialize(WriteBuffer & buf) { data.serialize(buf); @@ -30,9 +35,4 @@ void TDigestStatistic::update(const ColumnPtr & column) } } -UInt64 TDigestStatistic::count() -{ - return static_cast(data.count); -} - } diff --git a/src/Storages/Statistics/TDigestStatistic.h b/src/Storages/Statistics/TDigestStatistic.h index 295b5f69900..24b33393aeb 100644 --- a/src/Storages/Statistics/TDigestStatistic.h +++ b/src/Storages/Statistics/TDigestStatistic.h @@ -1,13 +1,16 @@ #pragma once #include +#include namespace DB { + /// TDigestStatistic is a kind of histogram. class TDigestStatistic : public IStatistic { + friend class ColumnStatistics; QuantileTDigest data; public: explicit TDigestStatistic(const StatisticDescription & stat_) : IStatistic(stat_) @@ -16,13 +19,13 @@ public: Float64 estimateLess(Float64 val) const; + Float64 estimateEqual(Float64 val) const; + void serialize(WriteBuffer & buf) override; void deserialize(ReadBuffer & buf) override; void update(const ColumnPtr & column) override; - - UInt64 count() override; }; } diff --git a/src/Storages/Statistics/UniqStatistic.h b/src/Storages/Statistics/UniqStatistic.h new file mode 100644 index 00000000000..556539cfb45 --- /dev/null +++ b/src/Storages/Statistics/UniqStatistic.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +class UniqStatistic : public IStatistic +{ + std::unique_ptr arena; + AggregateFunctionPtr uniq_collector; + AggregateDataPtr data; + Int64 result; +public: + explicit UniqStatistic(const StatisticDescription & stat_, DataTypePtr data_type) : IStatistic(stat_), result(-1) + { + arena = std::make_unique(); + AggregateFunctionProperties property; + property.returns_default_when_only_null = true; + uniq_collector = AggregateFunctionFactory::instance().get("uniq", NullsAction::IGNORE_NULLS, {data_type}, Array(), property); + data = arena->alignedAlloc(uniq_collector->sizeOfData(), uniq_collector->alignOfData()); + uniq_collector->create(data); + } + + ~UniqStatistic() override + { + uniq_collector->destroy(data); + } + + Int64 getNDV() + { + if (result < 0) + { + auto column = DataTypeInt64().createColumn(); + uniq_collector->insertResultInto(data, *column, nullptr); + result = column->getInt(0); + } + return result; + } + + void serialize(WriteBuffer & buf) override + { + uniq_collector->serialize(data, buf); + } + + void deserialize(ReadBuffer & buf) override + { + uniq_collector->deserialize(data, buf); + } + + void update(const ColumnPtr & column) override + { + const IColumn * col_ptr = column.get(); + uniq_collector->add(data, &col_ptr, column->size(), nullptr); + } +}; + +} diff --git a/src/Storages/Statistics/tests/gtest_stats.cpp b/src/Storages/Statistics/tests/gtest_stats.cpp index 45f8271be97..1d0faf65f7d 100644 --- a/src/Storages/Statistics/tests/gtest_stats.cpp +++ b/src/Storages/Statistics/tests/gtest_stats.cpp @@ -1,6 +1,6 @@ #include -#include +#include TEST(Statistics, TDigestLessThan) { diff --git a/src/Storages/StatisticsDescription.cpp b/src/Storages/StatisticsDescription.cpp index 
a427fb6a7cd..232ec29c312 100644
--- a/src/Storages/StatisticsDescription.cpp
+++ b/src/Storages/StatisticsDescription.cpp
@@ -2,7 +2,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include
@@ -19,75 +19,160 @@ namespace DB
 namespace ErrorCodes
 {
     extern const int INCORRECT_QUERY;
+    extern const int ILLEGAL_STATISTIC;
     extern const int LOGICAL_ERROR;
 };

+String queryToString(const IAST & query);
+
 StatisticType stringToType(String type)
 {
     if (type == "tdigest")
         return TDigest;
+    if (type == "uniq")
+        return Uniq;
     throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type: {}. We only support statistic type `tdigest` right now.", type);
 }

 String StatisticDescription::getTypeName() const
 {
     if (type == TDigest)
-        return "tdigest";
+        return "TDigest";
+    if (type == Uniq)
+        return "Uniq";
     throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type: {}. We only support statistic type `tdigest` right now.", type);
 }

-std::vector<StatisticDescription> StatisticDescription::getStatisticsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns)
+static ASTPtr getASTForStatisticTypes(const std::unordered_map<StatisticType, StatisticDescription> & statistic_types)
 {
-    const auto * stat_definition = definition_ast->as<ASTStatisticDeclaration>();
-    if (!stat_definition)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create statistic from non ASTStatisticDeclaration AST");
-
-    std::vector<StatisticDescription> stats;
-    stats.reserve(stat_definition->columns->children.size());
-    for (const auto & column_ast : stat_definition->columns->children)
-    {
-        StatisticDescription stat;
-        stat.type = stringToType(Poco::toLower(stat_definition->type));
-        String column_name = column_ast->as<ASTIdentifier &>().name();
-
-        if (!columns.hasPhysical(column_name))
-            throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect column name {}", column_name);
-
-        const auto & column = columns.getPhysical(column_name);
-        stat.column_name = column.name;
-
     auto function_node = std::make_shared<ASTFunction>();
     function_node->name = "STATISTIC";
     function_node->arguments = std::make_shared<ASTExpressionList>();
-        function_node->arguments->children.push_back(std::make_shared<ASTIdentifier>(stat_definition->type));
+    for (const auto & [type, desc] : statistic_types)
+    {
+        if (desc.ast == nullptr)
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown ast");
+        function_node->arguments->children.push_back(desc.ast);
+    }
     function_node->children.push_back(function_node->arguments);
+    return function_node;
+}

-        stat.ast = function_node;
+bool StatisticsDescription::contains(const String & stat_type) const
+{
+    return stats.contains(stringToType(stat_type));
+}

-        stats.push_back(stat);
+void StatisticsDescription::merge(const StatisticsDescription & other, const ColumnDescription & column, bool if_not_exists)
+{
+    if (other.empty())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "We are merging empty stats in column {}", column.name);
+
+    if (column_name.empty())
+    {
+        column_name = column.name;
+        data_type = column.type;
     }

-    if (stats.empty())
-        throw Exception(ErrorCodes::INCORRECT_QUERY, "Empty statistic column list");
+    for (const auto & iter: other.stats)
+    {
+        if (!if_not_exists && stats.contains(iter.first))
+        {
+            throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Statistic type name {} already exists in column {}", iter.first, column_name);
+        }
+    }
+
+    for (const auto & iter: other.stats)
+        if (!stats.contains(iter.first))
+            stats[iter.first] = iter.second;
+}
+
+void StatisticsDescription::modify(const StatisticsDescription & other)
+{
+    if (other.column_name != column_name)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "unmatched statistic columns 
{} and {}", column_name, other.column_name); + + stats = other.stats; +} + +void StatisticsDescription::clear() +{ + stats.clear(); +} + +std::vector StatisticsDescription::getStatisticsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns) +{ + const auto * stat_definition = definition_ast->as(); + if (!stat_definition) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create statistic from non ASTStatisticDeclaration AST"); + + std::vector result; + result.reserve(stat_definition->columns->children.size()); + + std::unordered_map statistic_types; + for (const auto & stat_ast : stat_definition->types->children) + { + StatisticDescription stat; + + String stat_type_name = stat_ast->as().name; + if (statistic_types.contains(stat.type)) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Duplicated statistic type name: {} ", stat_type_name); + stat.type = stringToType(Poco::toLower(stat_type_name)); + stat.ast = stat_ast->clone(); + statistic_types[stat.type] = stat; + } + + for (const auto & column_ast : stat_definition->columns->children) + { + + StatisticsDescription stats_desc; + String physical_column_name = column_ast->as().name(); + + if (!columns.hasPhysical(physical_column_name)) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect column name {}", physical_column_name); + + const auto & column = columns.getPhysical(physical_column_name); + stats_desc.column_name = column.name; + stats_desc.stats = statistic_types; + result.push_back(stats_desc); + } + + if (result.empty()) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Empty statistic column list is not allowed."); + + return result; +} + +StatisticsDescription StatisticsDescription::getStatisticFromColumnDeclaration(const ASTColumnDeclaration & column) +{ + const auto & stat_type_list_ast = column.stat_type->as().arguments; + if (stat_type_list_ast->children.empty()) + throw Exception(ErrorCodes::INCORRECT_QUERY, "We expect at least one statistic type for column {}", queryToString(column)); + StatisticsDescription stats; + stats.column_name = column.name; + for (const auto & ast : stat_type_list_ast->children) + { + const auto & stat_type = ast->as().name; + + StatisticDescription stat; + stat.type = stringToType(Poco::toLower(stat_type)); + stat.ast = ast->clone(); + stats.add(stat.type, stat); + } return stats; } -String queryToString(const IAST & query); - -StatisticDescription StatisticDescription::getStatisticFromColumnDeclaration(const ASTColumnDeclaration & column) +void StatisticsDescription::add(StatisticType stat_type, const StatisticDescription & desc) { - const auto & stat_type_list_ast = column.stat_type->as().arguments; - if (stat_type_list_ast->children.size() != 1) - throw Exception(ErrorCodes::INCORRECT_QUERY, "We expect only one statistic type for column {}", queryToString(column)); - const auto & stat_type = stat_type_list_ast->children[0]->as().name; + if (stats.contains(stat_type)) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Statistic type {} duplicates", stat_type); + stats[stat_type] = desc; +} - StatisticDescription stat; - stat.type = stringToType(Poco::toLower(stat_type)); - stat.column_name = column.name; - stat.ast = column.stat_type; - - return stat; +ASTPtr StatisticsDescription::getAST() const +{ + return getASTForStatisticTypes(stats); } } diff --git a/src/Storages/StatisticsDescription.h b/src/Storages/StatisticsDescription.h index 9a66951ab52..d148879cdba 100644 --- a/src/Storages/StatisticsDescription.h +++ b/src/Storages/StatisticsDescription.h @@ -1,15 +1,20 @@ 
#pragma once +#include #include #include + #include namespace DB { -enum StatisticType +enum StatisticType : UInt8 { TDigest = 0, + Uniq = 1, + + UnknownStatistics = 63, }; class ColumnsDescription; @@ -19,9 +24,6 @@ struct StatisticDescription /// the type of statistic, right now it's only tdigest. StatisticType type; - /// Names of statistic columns - String column_name; - ASTPtr ast; String getTypeName() const; @@ -30,12 +32,51 @@ struct StatisticDescription bool operator==(const StatisticDescription & other) const { - return type == other.type && column_name == other.column_name; + return type == other.type; //&& column_name == other.column_name; + } +}; + +struct ColumnDescription; + +struct StatisticsDescription +{ + std::unordered_map stats; + + bool operator==(const StatisticsDescription & other) const + { + for (const auto & iter : stats) + { + if (!other.stats.contains(iter.first)) + return false; + if (!(iter.second == other.stats.at(iter.first))) + return false; + } + return stats.size() == other.stats.size(); } - static StatisticDescription getStatisticFromColumnDeclaration(const ASTColumnDeclaration & column); + bool empty() const + { + return stats.empty(); + } + + bool contains(const String & stat_type) const; + + void merge(const StatisticsDescription & other, const ColumnDescription & column, bool if_not_exists); + + void modify(const StatisticsDescription & other); + + void clear(); + + void add(StatisticType stat_type, const StatisticDescription & desc); + + ASTPtr getAST() const; + + String column_name; + DataTypePtr data_type; + + static std::vector getStatisticsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns); + static StatisticsDescription getStatisticFromColumnDeclaration(const ASTColumnDeclaration & column); - static std::vector getStatisticsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns); }; } diff --git a/tests/integration/test_manipulate_statistic/test.py b/tests/integration/test_manipulate_statistic/test.py index f1c00a61b07..19ca2607105 100644 --- a/tests/integration/test_manipulate_statistic/test.py +++ b/tests/integration/test_manipulate_statistic/test.py @@ -56,26 +56,26 @@ def run_test_single_node(started_cluster): check_stat_file_on_disk(node1, "test_stat", "all_1_1_0", "b", True) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0", "c", True) - node1.query("ALTER TABLE test_stat DROP STATISTIC a type tdigest") + node1.query("ALTER TABLE test_stat DROP STATISTIC a") check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_2", "a", False) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_2", "b", True) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_2", "c", True) - node1.query("ALTER TABLE test_stat CLEAR STATISTIC b, c type tdigest") + node1.query("ALTER TABLE test_stat CLEAR STATISTIC b, c") check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_3", "a", False) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_3", "b", False) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_3", "c", False) - node1.query("ALTER TABLE test_stat MATERIALIZE STATISTIC b, c type tdigest") + node1.query("ALTER TABLE test_stat MATERIALIZE STATISTIC b, c") check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_4", "a", False) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_4", "b", True) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_4", "c", True) node1.query("ALTER TABLE test_stat ADD STATISTIC a type tdigest") - node1.query("ALTER TABLE test_stat MATERIALIZE STATISTIC a type 
tdigest") + node1.query("ALTER TABLE test_stat MATERIALIZE STATISTIC a") check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_5", "a", True) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_5", "b", True) diff --git a/tests/queries/0_stateless/02864_statistic_exception.sql b/tests/queries/0_stateless/02864_statistic_exception.sql index 092fa9bda85..28aaf7d5caa 100644 --- a/tests/queries/0_stateless/02864_statistic_exception.sql +++ b/tests/queries/0_stateless/02864_statistic_exception.sql @@ -39,11 +39,11 @@ ALTER TABLE t1 ADD STATISTIC a TYPE xyz; -- { serverError INCORRECT_QUERY } ALTER TABLE t1 ADD STATISTIC a TYPE tdigest; ALTER TABLE t1 ADD STATISTIC a TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } ALTER TABLE t1 ADD STATISTIC pk TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } -ALTER TABLE t1 DROP STATISTIC b TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } -ALTER TABLE t1 DROP STATISTIC a TYPE tdigest; -ALTER TABLE t1 DROP STATISTIC a TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } -ALTER TABLE t1 CLEAR STATISTIC a TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } -ALTER TABLE t1 MATERIALIZE STATISTIC b TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } +ALTER TABLE t1 DROP STATISTIC b; +ALTER TABLE t1 DROP STATISTIC a; +ALTER TABLE t1 DROP STATISTIC a; +ALTER TABLE t1 CLEAR STATISTIC a; +ALTER TABLE t1 MATERIALIZE STATISTIC b; -- { serverError ILLEGAL_STATISTIC } ALTER TABLE t1 ADD STATISTIC a TYPE tdigest; ALTER TABLE t1 ADD STATISTIC b TYPE tdigest; diff --git a/tests/queries/0_stateless/02864_statistic_operate.sql b/tests/queries/0_stateless/02864_statistic_operate.sql index 29bd213f04a..7ff2e6fea62 100644 --- a/tests/queries/0_stateless/02864_statistic_operate.sql +++ b/tests/queries/0_stateless/02864_statistic_operate.sql @@ -20,7 +20,7 @@ EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and a < 10; SELECT count(*) FROM t1 WHERE b < 10 and a < 10; SELECT count(*) FROM t1 WHERE b < NULL and a < '10'; -ALTER TABLE t1 DROP STATISTIC a, b TYPE tdigest; +ALTER TABLE t1 DROP STATISTIC a, b; SELECT 'After drop statistic'; EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and a < 10; @@ -34,7 +34,7 @@ SELECT 'After add statistic'; SHOW CREATE TABLE t1; -ALTER TABLE t1 MATERIALIZE STATISTIC a, b TYPE tdigest; +ALTER TABLE t1 MATERIALIZE STATISTIC a, b; INSERT INTO t1 select number, -number, generateUUIDv4() FROM system.numbers LIMIT 10000; SELECT 'After materialize statistic'; diff --git a/tests/queries/0_stateless/02864_statistic_uniq.reference b/tests/queries/0_stateless/02864_statistic_uniq.reference new file mode 100644 index 00000000000..86a0abb44cb --- /dev/null +++ b/tests/queries/0_stateless/02864_statistic_uniq.reference @@ -0,0 +1,29 @@ +CREATE TABLE default.t1\n(\n `a` Float64 STATISTIC(tdigest),\n `b` Int64 STATISTIC(tdigest),\n `c` Int64 STATISTIC(uniq, tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 +After insert +SELECT count() +FROM t1 +PREWHERE (a < 10) AND (c = 0) AND (b < 10) +SELECT count() +FROM t1 +PREWHERE (c = 11) AND (a < 10) AND (b < 10) +After merge +SELECT count() +FROM t1 +PREWHERE (a < 10) AND (c = 0) AND (b < 10) +SELECT count() +FROM t1 +PREWHERE (c = 11) AND (a < 10) AND (b < 10) +After modify TDigest +SELECT count() +FROM t1 +PREWHERE (a < 10) AND (c = 0) AND (c = 11) AND (b < 10) +SELECT count() +FROM t1 +PREWHERE (c < -1) AND (a < 10) AND (b < 10) +After drop +SELECT count() +FROM t1 +PREWHERE (a < 10) AND (c = 0) AND (c = 11) AND (b < 10) +SELECT 
count() +FROM t1 +PREWHERE (a < 10) AND (c < -1) AND (b < 10) diff --git a/tests/queries/0_stateless/02864_statistic_uniq.sql b/tests/queries/0_stateless/02864_statistic_uniq.sql new file mode 100644 index 00000000000..435ae9bb35b --- /dev/null +++ b/tests/queries/0_stateless/02864_statistic_uniq.sql @@ -0,0 +1,43 @@ +DROP TABLE IF EXISTS t1; + +SET allow_experimental_statistic = 1; +SET allow_statistic_optimize = 1; + +CREATE TABLE t1 +( + a Float64 STATISTIC(tdigest), + b Int64 STATISTIC(tdigest), + c Int64 STATISTIC(tdigest, uniq), + pk String, +) Engine = MergeTree() ORDER BY pk +SETTINGS min_bytes_for_wide_part = 0; + +SHOW CREATE TABLE t1; + +INSERT INTO t1 select number, -number, number/1000, generateUUIDv4() FROM system.numbers LIMIT 10000; +INSERT INTO t1 select 0, 0, 11, generateUUIDv4(); + +SELECT 'After insert'; +EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10; +EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10; +OPTIMIZE TABLE t1 FINAL; + +SELECT 'After merge'; +EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10; +EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10; + +SELECT 'After modify TDigest'; +ALTER TABLE t1 MODIFY STATISTIC c TYPE TDigest; +ALTER TABLE t1 MATERIALIZE STATISTIC c; + +EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and c = 0 and a < 10; +EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10; + + +ALTER TABLE t1 DROP STATISTIC c; + +SELECT 'After drop'; +EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and c = 0 and a < 10; +EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10; + +DROP TABLE IF EXISTS t1; From b755db627924e5a579cc1bb9137f550b08893f12 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Mon, 29 Jan 2024 23:02:36 +0100 Subject: [PATCH 005/133] fix style --- src/AggregateFunctions/QuantileTDigest.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/AggregateFunctions/QuantileTDigest.h b/src/AggregateFunctions/QuantileTDigest.h index cc03e477645..731a8ac474a 100644 --- a/src/AggregateFunctions/QuantileTDigest.h +++ b/src/AggregateFunctions/QuantileTDigest.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include #include @@ -341,7 +340,7 @@ public: Float64 result = 0; for (const auto & c : centroids) { - std::cerr << "c "<< c.mean << " "<< c.count << std::endl; + /// std::cerr << "c "<< c.mean << " "<< c.count << std::endl; if (value == c.mean) result += c.count; } From 95abcaf183655766dbacbe32562a7ac820d454df Mon Sep 17 00:00:00 2001 From: Han Fei Date: Tue, 30 Jan 2024 10:30:30 +0100 Subject: [PATCH 006/133] address comments --- docs/en/engines/table-engines/mergetree-family/mergetree.md | 1 - src/Storages/Statistics/Statistics.cpp | 3 +-- src/Storages/Statistics/UniqStatistic.h | 2 +- tests/queries/0_stateless/01271_show_privileges.reference | 1 + 4 files changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 9d8f9ed018a..b43de4bea86 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -1433,4 +1433,3 @@ ALTER TABLE tab MODIFY COLUMN document MODIFY SETTING min_compress_block_size = ```sql ALTER TABLE tab MODIFY COLUMN document RESET SETTING min_compress_block_size; ``` ->>>>>>> master diff --git a/src/Storages/Statistics/Statistics.cpp 
b/src/Storages/Statistics/Statistics.cpp index fa9058e8e7f..b38e1d8a68e 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -60,8 +60,7 @@ Float64 ColumnStatistics::estimateEqual(Float64 val) const if (stats.contains(Uniq) && stats.contains(TDigest)) { auto uniq_static = std::static_pointer_cast(stats.at(Uniq)); - Int64 ndv = uniq_static->getNDV(); - if (ndv < 2048) + if (uniq_static->getCardinality() < 2048) { auto tdigest_static = std::static_pointer_cast(stats.at(TDigest)); return tdigest_static->estimateEqual(val); diff --git a/src/Storages/Statistics/UniqStatistic.h b/src/Storages/Statistics/UniqStatistic.h index 556539cfb45..14b1ce8523e 100644 --- a/src/Storages/Statistics/UniqStatistic.h +++ b/src/Storages/Statistics/UniqStatistic.h @@ -30,7 +30,7 @@ public: uniq_collector->destroy(data); } - Int64 getNDV() + Int64 getCardinality() { if (result < 0) { diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index 6a7e4748130..3d8bac1bb9e 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -26,6 +26,7 @@ ALTER CLEAR INDEX ['CLEAR INDEX'] TABLE ALTER INDEX ALTER INDEX ['INDEX'] \N ALTER TABLE ALTER ADD STATISTIC ['ALTER ADD STATISTIC'] TABLE ALTER STATISTIC ALTER DROP STATISTIC ['ALTER DROP STATISTIC'] TABLE ALTER STATISTIC +ALTER MODIFY STATISTIC ['ALTER MODIFY STATISTIC'] TABLE ALTER STATISTIC ALTER MATERIALIZE STATISTIC ['ALTER MATERIALIZE STATISTIC'] TABLE ALTER STATISTIC ALTER STATISTIC ['STATISTIC'] \N ALTER TABLE ALTER ADD PROJECTION ['ADD PROJECTION'] TABLE ALTER PROJECTION From 3b798b51e340815c836b5e8e90b4a36d08bad42d Mon Sep 17 00:00:00 2001 From: Han Fei Date: Tue, 30 Jan 2024 16:44:16 +0100 Subject: [PATCH 007/133] try to fix tests --- src/Storages/Statistics/Statistics.cpp | 8 +++++--- src/Storages/Statistics/Statistics.h | 4 +--- src/Storages/Statistics/UniqStatistic.h | 12 ++++++------ 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/Storages/Statistics/Statistics.cpp b/src/Storages/Statistics/Statistics.cpp index b38e1d8a68e..e05147e3a4a 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -37,7 +37,7 @@ ColumnStatistics::ColumnStatistics(const StatisticsDescription & stats_desc_) void ColumnStatistics::update(const ColumnPtr & column) { counter += column->size(); - for (auto iter : stats) + for (const auto & iter : stats) { iter.second->update(column); } @@ -139,9 +139,11 @@ void TDigestValidator(const StatisticDescription &, DataTypePtr data_type) throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "TDigest does not support type {}", data_type->getName()); } -void UniqValidator(const StatisticDescription &, DataTypePtr) +void UniqValidator(const StatisticDescription &, DataTypePtr data_type) { - /// TODO(hanfei): check something + data_type = removeNullable(data_type); + if (!data_type->isValueRepresentedByNumber()) + throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Uniq does not support type {}", data_type->getName()); } StatisticPtr UniqCreator(const StatisticDescription & stat, DataTypePtr data_type) diff --git a/src/Storages/Statistics/Statistics.h b/src/Storages/Statistics/Statistics.h index f6cf3c90e92..96992a254d2 100644 --- a/src/Storages/Statistics/Statistics.h +++ b/src/Storages/Statistics/Statistics.h @@ -27,6 +27,7 @@ using StatisticPtr = std::shared_ptr; /// Statistic contains the 
distribution of values in a column.
 /// right now we support
 /// - tdigest
+/// - uniq(hyperloglog)
 class IStatistic
 {
 public:
@@ -42,9 +43,6 @@ public:

     virtual void update(const ColumnPtr & column) = 0;

-    /// how many rows this statistics contain
-    /// virtual UInt64 count() = 0;
-
 protected:

     StatisticDescription stat;

diff --git a/src/Storages/Statistics/UniqStatistic.h b/src/Storages/Statistics/UniqStatistic.h
index 14b1ce8523e..0df3bcb66df 100644
--- a/src/Storages/Statistics/UniqStatistic.h
+++ b/src/Storages/Statistics/UniqStatistic.h
@@ -13,9 +13,9 @@ class UniqStatistic : public IStatistic
     std::unique_ptr<Arena> arena;
     AggregateFunctionPtr uniq_collector;
     AggregateDataPtr data;
-    Int64 result;
+    UInt64 result;
 public:
-    explicit UniqStatistic(const StatisticDescription & stat_, DataTypePtr data_type) : IStatistic(stat_), result(-1)
+    explicit UniqStatistic(const StatisticDescription & stat_, DataTypePtr data_type) : IStatistic(stat_), result(0)
     {
         arena = std::make_unique<Arena>();
         AggregateFunctionProperties property;
@@ -30,13 +30,13 @@ public:
         uniq_collector->destroy(data);
     }

-    Int64 getCardinality()
+    UInt64 getCardinality()
     {
-        if (result < 0)
+        if (!result)
         {
-            auto column = DataTypeInt64().createColumn();
+            auto column = DataTypeUInt64().createColumn();
             uniq_collector->insertResultInto(data, *column, nullptr);
-            result = column->getInt(0);
+            result = column->getUInt(0);
         }
         return result;
     }

From 2b5b9589a4884a615d23224baaa41d1588e3d3ba Mon Sep 17 00:00:00 2001
From: Han Fei
Date: Thu, 1 Feb 2024 16:28:56 +0100
Subject: [PATCH 008/133] make tests great again

---
 src/Storages/Statistics/UniqStatistic.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Storages/Statistics/UniqStatistic.h b/src/Storages/Statistics/UniqStatistic.h
index 0df3bcb66df..00c1f51eefc 100644
--- a/src/Storages/Statistics/UniqStatistic.h
+++ b/src/Storages/Statistics/UniqStatistic.h
@@ -54,7 +54,7 @@ public:
     void update(const ColumnPtr & column) override
     {
         const IColumn * col_ptr = column.get();
-        uniq_collector->add(data, &col_ptr, column->size(), nullptr);
+        uniq_collector->addBatchSinglePlace(0, column->size(), data, &col_ptr, nullptr);
     }
 };

From 7ec3c48ccbb34a829b618e2c0e0462d468260c38 Mon Sep 17 00:00:00 2001
From: Han Fei
Date: Thu, 21 Mar 2024 17:28:56 +0100
Subject: [PATCH 009/133] fix tests

---
 .../02864_statistic_uniq.reference | 48 +++++++++----------
 .../0_stateless/02864_statistic_uniq.sql | 16 +++----
 2 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/tests/queries/0_stateless/02864_statistic_uniq.reference b/tests/queries/0_stateless/02864_statistic_uniq.reference
index 86a0abb44cb..d0c97596b01 100644
--- a/tests/queries/0_stateless/02864_statistic_uniq.reference
+++ b/tests/queries/0_stateless/02864_statistic_uniq.reference
@@ -1,29 +1,29 @@
 CREATE TABLE default.t1\n(\n `a` Float64 STATISTIC(tdigest),\n `b` Int64 STATISTIC(tdigest),\n `c` Int64 STATISTIC(uniq, tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192
 After insert
-SELECT count()
-FROM t1
-PREWHERE (a < 10) AND (c = 0) AND (b < 10)
-SELECT count()
-FROM t1
-PREWHERE (c = 11) AND (a < 10) AND (b < 10)
+  Prewhere info
+  Prewhere filter
+  Prewhere filter column: and(less(a, 10), equals(c, 0), less(b, 10)) (removed)
+  Prewhere info
+  Prewhere filter
+  Prewhere filter column: and(equals(c, 11), less(a, 10), less(b, 10)) (removed)
 After merge
-SELECT count()
-FROM t1
-PREWHERE (a < 10) AND (c = 0) AND (b < 10)
-SELECT count()
-FROM t1
-PREWHERE (c 
= 11) AND (a < 10) AND (b < 10) + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), equals(c, 0), less(b, 10)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(equals(c, 11), less(a, 10), less(b, 10)) (removed) After modify TDigest -SELECT count() -FROM t1 -PREWHERE (a < 10) AND (c = 0) AND (c = 11) AND (b < 10) -SELECT count() -FROM t1 -PREWHERE (c < -1) AND (a < 10) AND (b < 10) + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), equals(c, 0), equals(c, 11), less(b, 10)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(less(c, -1), less(a, 10), less(b, 10)) (removed) After drop -SELECT count() -FROM t1 -PREWHERE (a < 10) AND (c = 0) AND (c = 11) AND (b < 10) -SELECT count() -FROM t1 -PREWHERE (a < 10) AND (c < -1) AND (b < 10) + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), equals(c, 0), equals(c, 11), less(b, 10)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), less(c, -1), less(b, 10)) (removed) diff --git a/tests/queries/0_stateless/02864_statistic_uniq.sql b/tests/queries/0_stateless/02864_statistic_uniq.sql index 435ae9bb35b..7e996db6ad7 100644 --- a/tests/queries/0_stateless/02864_statistic_uniq.sql +++ b/tests/queries/0_stateless/02864_statistic_uniq.sql @@ -18,26 +18,26 @@ INSERT INTO t1 select number, -number, number/1000, generateUUIDv4() FROM system INSERT INTO t1 select 0, 0, 11, generateUUIDv4(); SELECT 'After insert'; -EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10; -EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10; +SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; OPTIMIZE TABLE t1 FINAL; SELECT 'After merge'; -EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10; -EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10; +SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; SELECT 'After modify TDigest'; ALTER TABLE t1 MODIFY STATISTIC c TYPE TDigest; ALTER TABLE t1 MATERIALIZE STATISTIC c; -EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and c = 0 and a < 10; -EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10; +SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; ALTER TABLE t1 DROP STATISTIC c; SELECT 'After drop'; -EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and c = 0 and a < 10; -EXPLAIN SYNTAX SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10; +SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter 
+SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%';
 
 DROP TABLE IF EXISTS t1;

From dc677f0f18343c1afb66e23de4bc08b01dca995b Mon Sep 17 00:00:00 2001
From: Sean Haynes
Date: Wed, 20 Dec 2023 10:32:59 +0000
Subject: [PATCH 010/133] Use scheduleOrThrow in MergeTree data selector
 thread pool

At the moment, the use of scheduleOrThrowOnError doesn't have a
timeout. So if you reach a point of saturation and use all threads
available in the global pool, threads block infinitely and lead to a
deadlock.

This changes that behaviour so that MergeTree data selector threads
will have a timeout and return a "No threads available" exception to
clients.

Credit to Nikita Mikhaylov for the proposition here:
https://github.com/ClickHouse/ClickHouse/pull/56431
---
 src/Databases/DatabaseOnDisk.cpp              |  4 +-
 src/Server/HTTPHandler.cpp                    |  9 ++-
 .../MergeTree/MergeTreeDataSelectExecutor.cpp | 11 ++-
 .../__init__.py                               |  0
 .../configs/settings.xml                      |  6 ++
 .../test.py                                   | 68 +++++++++++++++++++
 6 files changed, 92 insertions(+), 6 deletions(-)
 create mode 100644 tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/__init__.py
 create mode 100644 tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/configs/settings.xml
 create mode 100644 tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/test.py

diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp
index 642a7148487..335562f0630 100644
--- a/src/Databases/DatabaseOnDisk.cpp
+++ b/src/Databases/DatabaseOnDisk.cpp
@@ -648,13 +648,13 @@ void DatabaseOnDisk::iterateMetadataFiles(ContextPtr local_context, const Iterat
     ThreadPool pool(CurrentMetrics::DatabaseOnDiskThreads, CurrentMetrics::DatabaseOnDiskThreadsActive, CurrentMetrics::DatabaseOnDiskThreadsScheduled);
     for (const auto & file : metadata_files)
     {
-        pool.scheduleOrThrowOnError([&]()
+        pool.scheduleOrThrow([&]()
         {
             if (file.second)
                 process_metadata_file(file.first);
             else
                 process_tmp_drop_metadata_file(file.first);
-        });
+        }, Priority{}, getContext()->getSettingsRef().lock_acquire_timeout.totalMicroseconds());
     }
     pool.wait();
 }
diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp
index 72e7c5552f8..f855dd4a6ee 100644
--- a/src/Server/HTTPHandler.cpp
+++ b/src/Server/HTTPHandler.cpp
@@ -64,6 +64,8 @@ namespace ErrorCodes
 {
     extern const int BAD_ARGUMENTS;
     extern const int LOGICAL_ERROR;
+    extern const int CANNOT_COMPILE_REGEXP;
+    extern const int CANNOT_OPEN_FILE;
     extern const int CANNOT_PARSE_TEXT;
     extern const int CANNOT_PARSE_ESCAPE_SEQUENCE;
     extern const int CANNOT_PARSE_QUOTED_STRING;
@@ -75,8 +77,7 @@ namespace ErrorCodes
     extern const int CANNOT_PARSE_IPV6;
     extern const int CANNOT_PARSE_UUID;
     extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED;
-    extern const int CANNOT_OPEN_FILE;
-    extern const int CANNOT_COMPILE_REGEXP;
+    extern const int CANNOT_SCHEDULE_TASK;
     extern const int DUPLICATE_COLUMN;
     extern const int ILLEGAL_COLUMN;
     extern const int THERE_IS_NO_COLUMN;
@@ -260,6 +261,10 @@ static Poco::Net::HTTPResponse::HTTPStatus exceptionCodeToHTTPStatus(int excepti
     {
         return HTTPResponse::HTTP_REQUEST_TIMEOUT;
     }
+    else if (exception_code == ErrorCodes::CANNOT_SCHEDULE_TASK)
+    {
+        return HTTPResponse::HTTP_SERVICE_UNAVAILABLE;
+    }
 
     return HTTPResponse::HTTP_INTERNAL_SERVER_ERROR;
 }
diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
index a76d370d057..585a4ca8722 100644
--- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
@@ -746,8 +746,15 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd
         CurrentMetrics::MergeTreeDataSelectExecutorThreadsScheduled,
         num_threads);
 
+
+    /// Instances of ThreadPool "borrow" threads from the global thread pool.
+    /// We intentionally use scheduleOrThrow here to avoid a deadlock.
+    /// For example, queries can already be running with threads from the
+    /// global pool, and if we saturate max_thread_pool_size whilst requesting
+    /// more in this loop, queries will block infinitely.
+    /// So we wait until lock_acquire_timeout, and then raise an exception.
     for (size_t part_index = 0; part_index < parts.size(); ++part_index)
-        pool.scheduleOrThrowOnError([&, part_index, thread_group = CurrentThread::getGroup()]
+        pool.scheduleOrThrow([&, part_index, thread_group = CurrentThread::getGroup()]
         {
             SCOPE_EXIT_SAFE(
                 if (thread_group)
@@ -757,7 +764,7 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd
                 CurrentThread::attachToGroupIfDetached(thread_group);
 
             process_part(part_index);
-        });
+        }, Priority{}, context->getSettingsRef().lock_acquire_timeout.totalMicroseconds());
 
     pool.wait();
 }
diff --git a/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/__init__.py b/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/configs/settings.xml b/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/configs/settings.xml
new file mode 100644
index 00000000000..0a390937413
--- /dev/null
+++ b/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/configs/settings.xml
@@ -0,0 +1,6 @@
+
+ 300
+ 1
+ 128
+
diff --git a/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/test.py b/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/test.py
new file mode 100644
index 00000000000..515d9530424
--- /dev/null
+++ b/tests/integration/test_replicated_merge_tree_thread_schedule_timeouts/test.py
@@ -0,0 +1,68 @@
+import concurrent.futures
+
+import pytest
+from helpers.cluster import ClickHouseCluster
+
+
+MAX_THREADS = 60
+
+cluster = ClickHouseCluster(__file__)
+
+node1 = cluster.add_instance(
+    "node1",
+    macros={"cluster": "test-cluster", "replica": "node1"},
+    main_configs=["configs/settings.xml"],
+    with_zookeeper=True,
+)
+
+
+def prepare_cluster():
+    node1.query("DROP TABLE IF EXISTS test_threads_busy SYNC")
+    node1.query(
+        """
+        CREATE TABLE test_threads_busy(d Date, i Int64, s String) ENGINE=MergeTree PARTITION BY toYYYYMMDD(d) ORDER BY d
+        """
+    )
+
+
+@pytest.fixture(scope="module")
+def started_cluster():
+    try:
+        cluster.start()
+        yield cluster
+
+    finally:
+        cluster.shutdown()
+
+
+def do_slow_select():
+    # Do a bunch of slow queries that use a large number of threads to saturate max_thread_pool_size
+    # explicitly set max_threads as otherwise it's relative to the number of CPU cores
+    query = (
+        "SELECT d, i, s, sleepEachRow(3) from test_threads_busy SETTINGS max_threads=40"
+    )
+    node1.query(query)
+
+
+def test_query_exception_on_thread_pool_full(started_cluster):
+    prepare_cluster()
+    # Generate some sample data so sleepEachRow in do_slow_select works
+    node1.query(
+        "INSERT INTO test_threads_busy VALUES ('2024-01-01', 1, 'thread-test')"
+    )
+
+    futures = []
+    errors = []
+    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_THREADS) as executor:
+        for _ in range(MAX_THREADS):
+            futures.append(executor.submit(do_slow_select))
+
+        for f in futures:
+            try:
+                f.result()
+            except Exception as err:
+                errors.append(str(err))
+    assert len(errors) > 0, "Should be 'Cannot schedule a task' exceptions"
+    assert all(
+        "Cannot schedule a task" in err for err in errors
+    ), "Query threads are stuck, or returned an unexpected error"
hilite_none : ""); types->formatImpl(s, state, frame); + } } } diff --git a/src/Storages/MutationCommands.cpp b/src/Storages/MutationCommands.cpp index 8e823b815d5..a41c5833109 100644 --- a/src/Storages/MutationCommands.cpp +++ b/src/Storages/MutationCommands.cpp @@ -160,7 +160,6 @@ std::optional MutationCommand::parse(ASTAlterCommand * command, if (command->clear_index) res.clear = true; res.statistic_columns = command->statistic_decl->as().getColumnNames(); - res.statistic_types = command->statistic_decl->as().getTypeNames(); return res; } else if (parse_alter_commands && command->type == ASTAlterCommand::DROP_PROJECTION) diff --git a/tests/queries/0_stateless/02864_statistic_uniq.reference b/tests/queries/0_stateless/02864_statistic_uniq.reference index d0c97596b01..56d44e825e8 100644 --- a/tests/queries/0_stateless/02864_statistic_uniq.reference +++ b/tests/queries/0_stateless/02864_statistic_uniq.reference @@ -16,14 +16,20 @@ After merge After modify TDigest Prewhere info Prewhere filter - Prewhere filter column: and(less(a, 10), equals(c, 0), equals(c, 11), less(b, 10)) (removed) + Prewhere filter column: and(less(a, 10), equals(c, 11), less(b, 10)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), equals(c, 0), less(b, 10)) (removed) Prewhere info Prewhere filter Prewhere filter column: and(less(c, -1), less(a, 10), less(b, 10)) (removed) After drop Prewhere info Prewhere filter - Prewhere filter column: and(less(a, 10), equals(c, 0), equals(c, 11), less(b, 10)) (removed) + Prewhere filter column: and(less(a, 10), equals(c, 11), less(b, 10)) (removed) + Prewhere info + Prewhere filter + Prewhere filter column: and(less(a, 10), equals(c, 0), less(b, 10)) (removed) Prewhere info Prewhere filter Prewhere filter column: and(less(a, 10), less(c, -1), less(b, 10)) (removed) diff --git a/tests/queries/0_stateless/02864_statistic_uniq.sql b/tests/queries/0_stateless/02864_statistic_uniq.sql index 7e996db6ad7..cbb24269fac 100644 --- a/tests/queries/0_stateless/02864_statistic_uniq.sql +++ b/tests/queries/0_stateless/02864_statistic_uniq.sql @@ -18,26 +18,28 @@ INSERT INTO t1 select number, -number, number/1000, generateUUIDv4() FROM system INSERT INTO t1 select 0, 0, 11, generateUUIDv4(); SELECT 'After insert'; -SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; -SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; OPTIMIZE TABLE t1 FINAL; SELECT 'After merge'; -SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; -SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 
and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; SELECT 'After modify TDigest'; ALTER TABLE t1 MODIFY STATISTIC c TYPE TDigest; ALTER TABLE t1 MATERIALIZE STATISTIC c; -SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; -SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; ALTER TABLE t1 DROP STATISTIC c; SELECT 'After drop'; -SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; -SELECT explain FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; +SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; DROP TABLE IF EXISTS t1; From 547f99381cac142ca7c171217027be9ecc4d0fd8 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Thu, 4 Apr 2024 18:21:28 +0200 Subject: [PATCH 012/133] try to fix tests --- src/Storages/StatisticsDescription.cpp | 4 ++-- src/Storages/StatisticsDescription.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/StatisticsDescription.cpp b/src/Storages/StatisticsDescription.cpp index 232ec29c312..567c4090b97 100644 --- a/src/Storages/StatisticsDescription.cpp +++ b/src/Storages/StatisticsDescription.cpp @@ -43,7 +43,7 @@ String StatisticDescription::getTypeName() const throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type: {}. 
We only support statistic type `tdigest` right now.", type); } -static ASTPtr getASTForStatisticTypes(const std::unordered_map & statistic_types) +static ASTPtr getASTForStatisticTypes(const std::map & statistic_types) { auto function_node = std::make_shared(); function_node->name = "STATISTIC"; @@ -109,7 +109,7 @@ std::vector StatisticsDescription::getStatisticsFromAST(c std::vector result; result.reserve(stat_definition->columns->children.size()); - std::unordered_map statistic_types; + std::map statistic_types; for (const auto & stat_ast : stat_definition->types->children) { StatisticDescription stat; diff --git a/src/Storages/StatisticsDescription.h b/src/Storages/StatisticsDescription.h index d148879cdba..a39dd76226a 100644 --- a/src/Storages/StatisticsDescription.h +++ b/src/Storages/StatisticsDescription.h @@ -40,7 +40,7 @@ struct ColumnDescription; struct StatisticsDescription { - std::unordered_map stats; + std::map stats; bool operator==(const StatisticsDescription & other) const { From e38ab18e16f575371b1b5da6c52f808fa3d4ce94 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Thu, 4 Apr 2024 22:14:57 +0200 Subject: [PATCH 013/133] fix tests --- tests/queries/0_stateless/02864_statistic_uniq.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02864_statistic_uniq.reference b/tests/queries/0_stateless/02864_statistic_uniq.reference index 56d44e825e8..8a828352dd2 100644 --- a/tests/queries/0_stateless/02864_statistic_uniq.reference +++ b/tests/queries/0_stateless/02864_statistic_uniq.reference @@ -1,4 +1,4 @@ -CREATE TABLE default.t1\n(\n `a` Float64 STATISTIC(tdigest),\n `b` Int64 STATISTIC(tdigest),\n `c` Int64 STATISTIC(uniq, tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 +CREATE TABLE default.t1\n(\n `a` Float64 STATISTIC(tdigest),\n `b` Int64 STATISTIC(tdigest),\n `c` Int64 STATISTIC(tdigest, uniq),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 After insert Prewhere info Prewhere filter From 1979ea5e8f7f5a05909092fcc46dfa8491d97047 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Fri, 5 Apr 2024 09:41:57 +0200 Subject: [PATCH 014/133] fix clang tidy --- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 6317a26bfd4..14c58eac3ec 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -494,7 +494,7 @@ ConditionEstimator MergeTreeData::getConditionEstimatorByPredicate(const SelectQ { auto stats = part->loadStatistics(); /// TODO: We only have one stats file for every part. 
- for (const auto stat : stats) + for (const auto & stat : stats) result.merge(part->info.getPartNameV1(), part->rows_count, stat); } } From 3a380642cc2d06059557116e8462f3ce51e5887a Mon Sep 17 00:00:00 2001 From: Han Fei Date: Wed, 24 Apr 2024 18:20:48 +0200 Subject: [PATCH 015/133] address comments --- src/Common/ErrorCodes.cpp | 2 +- src/Interpreters/InterpreterAlterQuery.cpp | 14 +- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- src/Interpreters/MutationsInterpreter.cpp | 30 ++-- src/Interpreters/MutationsInterpreter.h | 2 +- src/Parsers/ASTAlterQuery.cpp | 30 ++-- src/Parsers/ASTAlterQuery.h | 12 +- src/Parsers/CommonParsers.h | 12 +- src/Parsers/ExpressionElementParsers.cpp | 4 +- src/Parsers/ExpressionElementParsers.h | 2 +- src/Parsers/ParserAlterQuery.cpp | 52 +++--- src/Parsers/ParserCreateQuery.cpp | 4 +- src/Parsers/ParserCreateQuery.h | 12 +- src/Storages/AlterCommands.cpp | 78 ++++---- src/Storages/AlterCommands.h | 12 +- src/Storages/ColumnDependency.h | 4 +- src/Storages/ColumnsDescription.h | 2 +- src/Storages/IStorage.cpp | 4 +- src/Storages/IStorage.h | 4 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 4 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 4 +- src/Storages/MergeTree/MergeTask.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 18 +- src/Storages/MergeTree/MergeTreeData.h | 2 +- .../MergeTree/MergeTreeDataPartCompact.cpp | 2 +- .../MergeTree/MergeTreeDataPartCompact.h | 2 +- .../MergeTree/MergeTreeDataPartWide.cpp | 2 +- .../MergeTree/MergeTreeDataPartWide.h | 2 +- .../MergeTreeDataPartWriterCompact.cpp | 2 +- .../MergeTreeDataPartWriterCompact.h | 2 +- .../MergeTreeDataPartWriterOnDisk.cpp | 2 +- .../MergeTree/MergeTreeDataPartWriterOnDisk.h | 4 +- .../MergeTree/MergeTreeDataPartWriterWide.cpp | 2 +- .../MergeTree/MergeTreeDataPartWriterWide.h | 2 +- .../MergeTree/MergeTreeDataWriter.cpp | 2 +- .../MergeTree/MergeTreeWhereOptimizer.cpp | 2 +- .../MergeTree/MergeTreeWhereOptimizer.h | 6 +- .../MergeTree/MergedBlockOutputStream.cpp | 2 +- .../MergeTree/MergedBlockOutputStream.h | 2 +- .../MergedColumnOnlyOutputStream.cpp | 2 +- .../MergeTree/MergedColumnOnlyOutputStream.h | 2 +- src/Storages/MergeTree/MutateTask.cpp | 30 ++-- src/Storages/MutationCommands.cpp | 12 +- src/Storages/MutationCommands.h | 10 +- .../{Estimator.cpp => ConditionEstimator.cpp} | 76 ++++---- .../{Estimator.h => ConditionEstimator.h} | 43 +++-- src/Storages/Statistics/Statistics.cpp | 107 +++++------ src/Storages/Statistics/Statistics.h | 62 +++---- src/Storages/Statistics/TDigestStatistic.cpp | 38 ---- src/Storages/Statistics/TDigestStatistics.cpp | 55 ++++++ ...TDigestStatistic.h => TDigestStatistics.h} | 11 +- src/Storages/Statistics/UniqStatistic.h | 61 ------- src/Storages/Statistics/UniqStatistics.cpp | 63 +++++++ src/Storages/Statistics/UniqStatistics.h | 34 ++++ src/Storages/Statistics/tests/gtest_stats.cpp | 2 +- src/Storages/StatisticsDescription.cpp | 170 ++++++++++-------- src/Storages/StatisticsDescription.h | 54 ++---- .../__init__.py | 0 .../config/config.xml | 0 .../test.py | 14 +- .../0_stateless/02864_statistic_exception.sql | 36 ++-- .../02864_statistic_operate.reference | 6 +- .../0_stateless/02864_statistic_operate.sql | 10 +- .../02864_statistic_uniq.reference | 2 +- .../0_stateless/02864_statistic_uniq.sql | 12 +- tests/sqllogic/test_parser.py | 2 +- 66 files changed, 657 insertions(+), 605 deletions(-) rename src/Storages/Statistics/{Estimator.cpp => ConditionEstimator.cpp} (63%) rename src/Storages/Statistics/{Estimator.h => ConditionEstimator.h} (50%) 
delete mode 100644 src/Storages/Statistics/TDigestStatistic.cpp create mode 100644 src/Storages/Statistics/TDigestStatistics.cpp rename src/Storages/Statistics/{TDigestStatistic.h => TDigestStatistics.h} (60%) delete mode 100644 src/Storages/Statistics/UniqStatistic.h create mode 100644 src/Storages/Statistics/UniqStatistics.cpp create mode 100644 src/Storages/Statistics/UniqStatistics.h rename tests/integration/{test_manipulate_statistic => test_manipulate_statistics}/__init__.py (100%) rename tests/integration/{test_manipulate_statistic => test_manipulate_statistics}/config/config.xml (100%) rename tests/integration/{test_manipulate_statistic => test_manipulate_statistics}/test.py (86%) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index af609fabb8f..f7c777e6760 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -586,7 +586,7 @@ M(705, TABLE_NOT_EMPTY) \ M(706, LIBSSH_ERROR) \ M(707, GCP_ERROR) \ - M(708, ILLEGAL_STATISTIC) \ + M(708, ILLEGAL_STATISTICS) \ M(709, CANNOT_GET_REPLICATED_DATABASE_SNAPSHOT) \ M(710, FAULT_INJECTED) \ M(711, FILECACHE_ACCESS_DENIED) \ diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 41c3c112ef9..e2a924808e8 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -176,9 +176,9 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong parameter type in ALTER query"); if (!getContext()->getSettings().allow_experimental_statistic && ( - command_ast->type == ASTAlterCommand::ADD_STATISTIC || - command_ast->type == ASTAlterCommand::DROP_STATISTIC || - command_ast->type == ASTAlterCommand::MATERIALIZE_STATISTIC)) + command_ast->type == ASTAlterCommand::ADD_STATISTICS || + command_ast->type == ASTAlterCommand::DROP_STATISTICS || + command_ast->type == ASTAlterCommand::MATERIALIZE_STATISTICS)) throw Exception(ErrorCodes::INCORRECT_QUERY, "Alter table with statistic is now disabled. Turn on allow_experimental_statistic"); } @@ -343,22 +343,22 @@ AccessRightsElements InterpreterAlterQuery::getRequiredAccessForCommand(const AS required_access.emplace_back(AccessType::ALTER_SAMPLE_BY, database, table); break; } - case ASTAlterCommand::ADD_STATISTIC: + case ASTAlterCommand::ADD_STATISTICS: { required_access.emplace_back(AccessType::ALTER_ADD_STATISTIC, database, table); break; } - case ASTAlterCommand::MODIFY_STATISTIC: + case ASTAlterCommand::MODIFY_STATISTICS: { required_access.emplace_back(AccessType::ALTER_MODIFY_STATISTIC, database, table); break; } - case ASTAlterCommand::DROP_STATISTIC: + case ASTAlterCommand::DROP_STATISTICS: { required_access.emplace_back(AccessType::ALTER_DROP_STATISTIC, database, table); break; } - case ASTAlterCommand::MATERIALIZE_STATISTIC: + case ASTAlterCommand::MATERIALIZE_STATISTICS: { required_access.emplace_back(AccessType::ALTER_MATERIALIZE_STATISTIC, database, table); break; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index f1b15270d70..df80e1d5fbf 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -679,7 +679,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( { if (!skip_checks && !context_->getSettingsRef().allow_experimental_statistic) throw Exception(ErrorCodes::INCORRECT_QUERY, "Create table with statistic is now disabled. 
Turn on allow_experimental_statistic"); - column.stats = StatisticsDescription::getStatisticFromColumnDeclaration(col_decl); + column.stats = ColumnStatisticsDescription::getStatisticFromColumnDeclaration(col_decl); column.stats.data_type = column.type; } diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index de9e663d869..0a6c873bcac 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -55,7 +55,7 @@ namespace ErrorCodes extern const int CANNOT_UPDATE_COLUMN; extern const int UNEXPECTED_EXPRESSION; extern const int THERE_IS_NO_COLUMN; - extern const int ILLEGAL_STATISTIC; + extern const int ILLEGAL_STATISTICS; } @@ -773,7 +773,7 @@ void MutationsInterpreter::prepare(bool dry_run) } else if (command.type == MutationCommand::MATERIALIZE_INDEX) { - mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); + mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); auto it = std::find_if( std::cbegin(indices_desc), std::end(indices_desc), [&](const IndexDescription & index) @@ -793,20 +793,20 @@ void MutationsInterpreter::prepare(bool dry_run) materialized_indices.emplace(command.index_name); } } - else if (command.type == MutationCommand::MATERIALIZE_STATISTIC) + else if (command.type == MutationCommand::MATERIALIZE_STATISTICS) { - mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); - for (const auto & stat_column_name: command.statistic_columns) + mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); + for (const auto & stat_column_name: command.statistics_columns) { if (!columns_desc.has(stat_column_name) || columns_desc.get(stat_column_name).stats.empty()) - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Unknown statistic column: {}", stat_column_name); - dependencies.emplace(stat_column_name, ColumnDependency::STATISTIC); + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Unknown statistics column: {}", stat_column_name); + dependencies.emplace(stat_column_name, ColumnDependency::STATISTICS); materialized_statistics.emplace(stat_column_name); } } else if (command.type == MutationCommand::MATERIALIZE_PROJECTION) { - mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); + mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); const auto & projection = projections_desc.get(command.projection_name); if (!source.hasProjection(projection.name) || source.hasBrokenProjection(projection.name)) { @@ -817,18 +817,18 @@ void MutationsInterpreter::prepare(bool dry_run) } else if (command.type == MutationCommand::DROP_INDEX) { - mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); + mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); materialized_indices.erase(command.index_name); } - else if (command.type == MutationCommand::DROP_STATISTIC) + else if (command.type == MutationCommand::DROP_STATISTICS) { - mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); - for (const auto & stat_column_name: command.statistic_columns) + mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); + for (const auto & stat_column_name: command.statistics_columns) materialized_statistics.erase(stat_column_name); } else if (command.type == MutationCommand::DROP_PROJECTION) { - mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); + mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); materialized_projections.erase(command.projection_name); } else if 
(command.type == MutationCommand::MATERIALIZE_TTL) @@ -880,7 +880,7 @@ void MutationsInterpreter::prepare(bool dry_run) { if (dependency.kind == ColumnDependency::SKIP_INDEX || dependency.kind == ColumnDependency::PROJECTION - || dependency.kind == ColumnDependency::STATISTIC) + || dependency.kind == ColumnDependency::STATISTICS) dependencies.insert(dependency); } } @@ -1352,7 +1352,7 @@ QueryPipelineBuilder MutationsInterpreter::execute() Block MutationsInterpreter::getUpdatedHeader() const { // If it's an index/projection materialization, we don't write any data columns, thus empty header is used - return mutation_kind.mutation_kind == MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION ? Block{} : *updated_header; + return mutation_kind.mutation_kind == MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION ? Block{} : *updated_header; } const ColumnDependencies & MutationsInterpreter::getColumnDependencies() const diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index 2d01c7154c8..6aaa233cda3 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -102,7 +102,7 @@ public: enum MutationKindEnum { MUTATE_UNKNOWN, - MUTATE_INDEX_STATISTIC_PROJECTION, + MUTATE_INDEX_STATISTICS_PROJECTION, MUTATE_OTHER, } mutation_kind = MUTATE_UNKNOWN; diff --git a/src/Parsers/ASTAlterQuery.cpp b/src/Parsers/ASTAlterQuery.cpp index e1d3937d8fb..90b63d2ce6f 100644 --- a/src/Parsers/ASTAlterQuery.cpp +++ b/src/Parsers/ASTAlterQuery.cpp @@ -42,8 +42,8 @@ ASTPtr ASTAlterCommand::clone() const res->projection_decl = res->children.emplace_back(projection_decl->clone()).get(); if (projection) res->projection = res->children.emplace_back(projection->clone()).get(); - if (statistic_decl) - res->statistic_decl = res->children.emplace_back(statistic_decl->clone()).get(); + if (statistics_decl) + res->statistics_decl = res->children.emplace_back(statistics_decl->clone()).get(); if (partition) res->partition = res->children.emplace_back(partition->clone()).get(); if (predicate) @@ -200,33 +200,33 @@ void ASTAlterCommand::formatImpl(const FormatSettings & settings, FormatState & partition->formatImpl(settings, state, frame); } } - else if (type == ASTAlterCommand::ADD_STATISTIC) + else if (type == ASTAlterCommand::ADD_STATISTICS) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << "ADD STATISTIC " << (if_not_exists ? "IF NOT EXISTS " : "") + settings.ostr << (settings.hilite ? hilite_keyword : "") << "ADD STATISTICS " << (if_not_exists ? "IF NOT EXISTS " : "") << (settings.hilite ? hilite_none : ""); - statistic_decl->formatImpl(settings, state, frame); + statistics_decl->formatImpl(settings, state, frame); } - else if (type == ASTAlterCommand::MODIFY_STATISTIC) + else if (type == ASTAlterCommand::MODIFY_STATISTICS) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << "MODIFY STATISTIC " + settings.ostr << (settings.hilite ? hilite_keyword : "") << "MODIFY STATISTICS " << (settings.hilite ? hilite_none : ""); - statistic_decl->formatImpl(settings, state, frame); + statistics_decl->formatImpl(settings, state, frame); } - else if (type == ASTAlterCommand::DROP_STATISTIC) + else if (type == ASTAlterCommand::DROP_STATISTICS) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << (clear_statistic ? "CLEAR " : "DROP ") << "STATISTIC " + settings.ostr << (settings.hilite ? hilite_keyword : "") << (clear_statistics ? "CLEAR " : "DROP ") << "STATISTICS " << (if_exists ? "IF EXISTS " : "") << (settings.hilite ? 
hilite_none : ""); - statistic_decl->formatImpl(settings, state, frame); + statistics_decl->formatImpl(settings, state, frame); if (partition) { settings.ostr << (settings.hilite ? hilite_keyword : "") << " IN PARTITION " << (settings.hilite ? hilite_none : ""); partition->formatImpl(settings, state, frame); } } - else if (type == ASTAlterCommand::MATERIALIZE_STATISTIC) + else if (type == ASTAlterCommand::MATERIALIZE_STATISTICS) { - settings.ostr << (settings.hilite ? hilite_keyword : "") << "MATERIALIZE STATISTIC " << (settings.hilite ? hilite_none : ""); - statistic_decl->formatImpl(settings, state, frame); + settings.ostr << (settings.hilite ? hilite_keyword : "") << "MATERIALIZE STATISTICS " << (settings.hilite ? hilite_none : ""); + statistics_decl->formatImpl(settings, state, frame); if (partition) { settings.ostr << (settings.hilite ? hilite_keyword : "") << " IN PARTITION " << (settings.hilite ? hilite_none : ""); @@ -513,7 +513,7 @@ void ASTAlterCommand::forEachPointerToChild(std::function f) f(reinterpret_cast(&constraint)); f(reinterpret_cast(&projection_decl)); f(reinterpret_cast(&projection)); - f(reinterpret_cast(&statistic_decl)); + f(reinterpret_cast(&statistics_decl)); f(reinterpret_cast(&partition)); f(reinterpret_cast(&predicate)); f(reinterpret_cast(&update_assignments)); diff --git a/src/Parsers/ASTAlterQuery.h b/src/Parsers/ASTAlterQuery.h index c2a23114f6a..f23351211b1 100644 --- a/src/Parsers/ASTAlterQuery.h +++ b/src/Parsers/ASTAlterQuery.h @@ -55,10 +55,10 @@ public: DROP_PROJECTION, MATERIALIZE_PROJECTION, - ADD_STATISTIC, - DROP_STATISTIC, - MODIFY_STATISTIC, - MATERIALIZE_STATISTIC, + ADD_STATISTICS, + DROP_STATISTICS, + MODIFY_STATISTICS, + MATERIALIZE_STATISTICS, DROP_PARTITION, DROP_DETACHED_PARTITION, @@ -136,7 +136,7 @@ public: */ IAST * projection = nullptr; - IAST * statistic_decl = nullptr; + IAST * statistics_decl = nullptr; /** Used in DROP PARTITION, ATTACH PARTITION FROM, FORGET PARTITION, UPDATE, DELETE queries. * The value or ID of the partition is stored here. 
@@ -181,7 +181,7 @@ public: bool clear_index = false; /// for CLEAR INDEX (do not drop index from metadata) - bool clear_statistic = false; /// for CLEAR STATISTIC (do not drop statistic from metadata) + bool clear_statistics = false; /// for CLEAR STATISTICS (do not drop statistics from metadata) bool clear_projection = false; /// for CLEAR PROJECTION (do not drop projection from metadata) diff --git a/src/Parsers/CommonParsers.h b/src/Parsers/CommonParsers.h index fc77020a94a..f88ecfd502c 100644 --- a/src/Parsers/CommonParsers.h +++ b/src/Parsers/CommonParsers.h @@ -13,7 +13,7 @@ namespace DB MR_MACROS(ADD_CONSTRAINT, "ADD CONSTRAINT") \ MR_MACROS(ADD_INDEX, "ADD INDEX") \ MR_MACROS(ADD_PROJECTION, "ADD PROJECTION") \ - MR_MACROS(ADD_STATISTIC, "ADD STATISTIC") \ + MR_MACROS(ADD_STATISTICS, "ADD STATISTICS") \ MR_MACROS(ADD, "ADD") \ MR_MACROS(ADMIN_OPTION_FOR, "ADMIN OPTION FOR") \ MR_MACROS(AFTER, "AFTER") \ @@ -83,7 +83,7 @@ namespace DB MR_MACROS(CLEAR_COLUMN, "CLEAR COLUMN") \ MR_MACROS(CLEAR_INDEX, "CLEAR INDEX") \ MR_MACROS(CLEAR_PROJECTION, "CLEAR PROJECTION") \ - MR_MACROS(CLEAR_STATISTIC, "CLEAR STATISTIC") \ + MR_MACROS(CLEAR_STATISTICS, "CLEAR STATISTICS") \ MR_MACROS(CLUSTER, "CLUSTER") \ MR_MACROS(CLUSTERS, "CLUSTERS") \ MR_MACROS(CN, "CN") \ @@ -150,7 +150,7 @@ namespace DB MR_MACROS(DROP_PART, "DROP PART") \ MR_MACROS(DROP_PARTITION, "DROP PARTITION") \ MR_MACROS(DROP_PROJECTION, "DROP PROJECTION") \ - MR_MACROS(DROP_STATISTIC, "DROP STATISTIC") \ + MR_MACROS(DROP_STATISTICS, "DROP STATISTICS") \ MR_MACROS(DROP_TABLE, "DROP TABLE") \ MR_MACROS(DROP_TEMPORARY_TABLE, "DROP TEMPORARY TABLE") \ MR_MACROS(DROP, "DROP") \ @@ -279,7 +279,7 @@ namespace DB MR_MACROS(MATERIALIZE_COLUMN, "MATERIALIZE COLUMN") \ MR_MACROS(MATERIALIZE_INDEX, "MATERIALIZE INDEX") \ MR_MACROS(MATERIALIZE_PROJECTION, "MATERIALIZE PROJECTION") \ - MR_MACROS(MATERIALIZE_STATISTIC, "MATERIALIZE STATISTIC") \ + MR_MACROS(MATERIALIZE_STATISTICS, "MATERIALIZE STATISTICS") \ MR_MACROS(MATERIALIZE_TTL, "MATERIALIZE TTL") \ MR_MACROS(MATERIALIZE, "MATERIALIZE") \ MR_MACROS(MATERIALIZED, "MATERIALIZED") \ @@ -304,7 +304,7 @@ namespace DB MR_MACROS(MODIFY_QUERY, "MODIFY QUERY") \ MR_MACROS(MODIFY_REFRESH, "MODIFY REFRESH") \ MR_MACROS(MODIFY_SAMPLE_BY, "MODIFY SAMPLE BY") \ - MR_MACROS(MODIFY_STATISTIC, "MODIFY STATISTIC") \ + MR_MACROS(MODIFY_STATISTICS, "MODIFY STATISTICS") \ MR_MACROS(MODIFY_SETTING, "MODIFY SETTING") \ MR_MACROS(MODIFY_SQL_SECURITY, "MODIFY SQL SECURITY") \ MR_MACROS(MODIFY_TTL, "MODIFY TTL") \ @@ -448,7 +448,7 @@ namespace DB MR_MACROS(SQL_SECURITY, "SQL SECURITY") \ MR_MACROS(SS, "SS") \ MR_MACROS(START_TRANSACTION, "START TRANSACTION") \ - MR_MACROS(STATISTIC, "STATISTIC") \ + MR_MACROS(STATISTICS, "STATISTICS") \ MR_MACROS(STEP, "STEP") \ MR_MACROS(STORAGE, "STORAGE") \ MR_MACROS(STRICT, "STRICT") \ diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 2c8ab65d1fc..4911357c48c 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -685,7 +685,7 @@ bool ParserCodec::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) return true; } -bool ParserStatisticType::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +bool ParserStatisticsType::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ParserList stat_type_parser(std::make_unique(), std::make_unique(TokenType::Comma), false); @@ -704,7 +704,7 @@ bool ParserStatisticType::parseImpl(Pos & pos, ASTPtr & node, Expected & expecte 
++pos; auto function_node = std::make_shared(); - function_node->name = "STATISTIC"; + function_node->name = "STATISTICS"; function_node->arguments = stat_type; function_node->children.push_back(function_node->arguments); diff --git a/src/Parsers/ExpressionElementParsers.h b/src/Parsers/ExpressionElementParsers.h index b29f5cc4251..d44e3af2a9c 100644 --- a/src/Parsers/ExpressionElementParsers.h +++ b/src/Parsers/ExpressionElementParsers.h @@ -198,7 +198,7 @@ protected: }; /// STATISTIC(tdigest(200)) -class ParserStatisticType : public IParserBase +class ParserStatisticsType : public IParserBase { protected: const char * getName() const override { return "statistic"; } diff --git a/src/Parsers/ParserAlterQuery.cpp b/src/Parsers/ParserAlterQuery.cpp index 731a74f9b6d..c289102bc03 100644 --- a/src/Parsers/ParserAlterQuery.cpp +++ b/src/Parsers/ParserAlterQuery.cpp @@ -49,11 +49,11 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserKeyword s_clear_index(Keyword::CLEAR_INDEX); ParserKeyword s_materialize_index(Keyword::MATERIALIZE_INDEX); - ParserKeyword s_add_statistic(Keyword::ADD_STATISTIC); - ParserKeyword s_drop_statistic(Keyword::DROP_STATISTIC); - ParserKeyword s_modify_statistic(Keyword::MODIFY_STATISTIC); - ParserKeyword s_clear_statistic(Keyword::CLEAR_STATISTIC); - ParserKeyword s_materialize_statistic(Keyword::MATERIALIZE_STATISTIC); + ParserKeyword s_add_statistics(Keyword::ADD_STATISTICS); + ParserKeyword s_drop_statistics(Keyword::DROP_STATISTICS); + ParserKeyword s_modify_statistics(Keyword::MODIFY_STATISTICS); + ParserKeyword s_clear_statistics(Keyword::CLEAR_STATISTICS); + ParserKeyword s_materialize_statistics(Keyword::MATERIALIZE_STATISTICS); ParserKeyword s_add_constraint(Keyword::ADD_CONSTRAINT); ParserKeyword s_drop_constraint(Keyword::DROP_CONSTRAINT); @@ -127,8 +127,8 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ParserIdentifier parser_remove_property; ParserCompoundColumnDeclaration parser_col_decl; ParserIndexDeclaration parser_idx_decl; - ParserStatisticDeclaration parser_stat_decl; - ParserStatisticDeclarationWithoutTypes parser_stat_decl_without_types; + ParserStatisticsDeclaration parser_stat_decl; + ParserStatisticsDeclarationWithoutTypes parser_stat_decl_without_types; ParserConstraintDeclaration parser_constraint_decl; ParserProjectionDeclaration parser_projection_decl; ParserCompoundColumnDeclaration parser_modify_col_decl(false, false, true); @@ -156,7 +156,7 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected ASTPtr command_constraint; ASTPtr command_projection_decl; ASTPtr command_projection; - ASTPtr command_statistic_decl; + ASTPtr command_statistics_decl; ASTPtr command_partition; ASTPtr command_predicate; ASTPtr command_update_assignments; @@ -370,43 +370,43 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected return false; } } - else if (s_add_statistic.ignore(pos, expected)) + else if (s_add_statistics.ignore(pos, expected)) { if (s_if_not_exists.ignore(pos, expected)) command->if_not_exists = true; - if (!parser_stat_decl.parse(pos, command_statistic_decl, expected)) + if (!parser_stat_decl.parse(pos, command_statistics_decl, expected)) return false; - command->type = ASTAlterCommand::ADD_STATISTIC; + command->type = ASTAlterCommand::ADD_STATISTICS; } - else if (s_modify_statistic.ignore(pos, expected)) + else if (s_modify_statistics.ignore(pos, expected)) { - if (!parser_stat_decl.parse(pos, 
command_statistic_decl, expected)) + if (!parser_stat_decl.parse(pos, command_statistics_decl, expected)) return false; - command->type = ASTAlterCommand::MODIFY_STATISTIC; + command->type = ASTAlterCommand::MODIFY_STATISTICS; } - else if (s_drop_statistic.ignore(pos, expected)) + else if (s_drop_statistics.ignore(pos, expected)) { if (s_if_exists.ignore(pos, expected)) command->if_exists = true; - if (!parser_stat_decl_without_types.parse(pos, command_statistic_decl, expected)) + if (!parser_stat_decl_without_types.parse(pos, command_statistics_decl, expected)) return false; - command->type = ASTAlterCommand::DROP_STATISTIC; + command->type = ASTAlterCommand::DROP_STATISTICS; } - else if (s_clear_statistic.ignore(pos, expected)) + else if (s_clear_statistics.ignore(pos, expected)) { if (s_if_exists.ignore(pos, expected)) command->if_exists = true; - if (!parser_stat_decl_without_types.parse(pos, command_statistic_decl, expected)) + if (!parser_stat_decl_without_types.parse(pos, command_statistics_decl, expected)) return false; - command->type = ASTAlterCommand::DROP_STATISTIC; - command->clear_statistic = true; + command->type = ASTAlterCommand::DROP_STATISTICS; + command->clear_statistics = true; command->detach = false; if (s_in_partition.ignore(pos, expected)) @@ -415,15 +415,15 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected return false; } } - else if (s_materialize_statistic.ignore(pos, expected)) + else if (s_materialize_statistics.ignore(pos, expected)) { if (s_if_exists.ignore(pos, expected)) command->if_exists = true; - if (!parser_stat_decl_without_types.parse(pos, command_statistic_decl, expected)) + if (!parser_stat_decl_without_types.parse(pos, command_statistics_decl, expected)) return false; - command->type = ASTAlterCommand::MATERIALIZE_STATISTIC; + command->type = ASTAlterCommand::MATERIALIZE_STATISTICS; command->detach = false; if (s_in_partition.ignore(pos, expected)) @@ -940,8 +940,8 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected command->projection_decl = command->children.emplace_back(std::move(command_projection_decl)).get(); if (command_projection) command->projection = command->children.emplace_back(std::move(command_projection)).get(); - if (command_statistic_decl) - command->statistic_decl = command->children.emplace_back(std::move(command_statistic_decl)).get(); + if (command_statistics_decl) + command->statistics_decl = command->children.emplace_back(std::move(command_statistics_decl)).get(); if (command_partition) command->partition = command->children.emplace_back(std::move(command_partition)).get(); if (command_predicate) diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp index 91082a02c59..27bf0c79d3f 100644 --- a/src/Parsers/ParserCreateQuery.cpp +++ b/src/Parsers/ParserCreateQuery.cpp @@ -225,7 +225,7 @@ bool ParserIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expe return true; } -bool ParserStatisticDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +bool ParserStatisticsDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ParserKeyword s_type(Keyword::TYPE); @@ -252,7 +252,7 @@ bool ParserStatisticDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & return true; } -bool ParserStatisticDeclarationWithoutTypes::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) +bool ParserStatisticsDeclarationWithoutTypes::parseImpl(Pos & pos, ASTPtr & node, Expected & expected) { ParserList 
columns_p(std::make_unique(), std::make_unique(TokenType::Comma), false); diff --git a/src/Parsers/ParserCreateQuery.h b/src/Parsers/ParserCreateQuery.h index ba17c796f00..27bb524970d 100644 --- a/src/Parsers/ParserCreateQuery.h +++ b/src/Parsers/ParserCreateQuery.h @@ -138,7 +138,7 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E ParserKeyword s_auto_increment{Keyword::AUTO_INCREMENT}; ParserKeyword s_comment{Keyword::COMMENT}; ParserKeyword s_codec{Keyword::CODEC}; - ParserKeyword s_stat{Keyword::STATISTIC}; + ParserKeyword s_stat{Keyword::STATISTICS}; ParserKeyword s_ttl{Keyword::TTL}; ParserKeyword s_remove{Keyword::REMOVE}; ParserKeyword s_modify_setting(Keyword::MODIFY_SETTING); @@ -155,7 +155,7 @@ bool IParserColumnDeclaration::parseImpl(Pos & pos, ASTPtr & node, E ParserLiteral literal_parser; ParserCodec codec_parser; ParserCollation collation_parser; - ParserStatisticType stat_type_parser; + ParserStatisticsType stat_type_parser; ParserExpression expression_parser; ParserSetQuery settings_parser(true); @@ -452,20 +452,20 @@ protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; -class ParserStatisticDeclaration : public IParserBase +class ParserStatisticsDeclaration : public IParserBase { public: - ParserStatisticDeclaration() = default; + ParserStatisticsDeclaration() = default; protected: const char * getName() const override { return "statistics declaration"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; -class ParserStatisticDeclarationWithoutTypes : public IParserBase +class ParserStatisticsDeclarationWithoutTypes : public IParserBase { public: - ParserStatisticDeclarationWithoutTypes() = default; + ParserStatisticsDeclarationWithoutTypes() = default; protected: const char * getName() const override { return "statistics declaration"; } diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 5b3881ba036..e768a3f362a 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -44,7 +44,7 @@ namespace DB namespace ErrorCodes { extern const int ILLEGAL_COLUMN; - extern const int ILLEGAL_STATISTIC; + extern const int ILLEGAL_STATISTICS; extern const int BAD_ARGUMENTS; extern const int NOT_FOUND_COLUMN_IN_BLOCK; extern const int LOGICAL_ERROR; @@ -263,32 +263,32 @@ std::optional AlterCommand::parse(const ASTAlterCommand * command_ return command; } - else if (command_ast->type == ASTAlterCommand::ADD_STATISTIC) + else if (command_ast->type == ASTAlterCommand::ADD_STATISTICS) { AlterCommand command; command.ast = command_ast->clone(); - command.statistic_decl = command_ast->statistic_decl->clone(); - command.type = AlterCommand::ADD_STATISTIC; + command.statistics_decl = command_ast->statistics_decl->clone(); + command.type = AlterCommand::ADD_STATISTICS; - const auto & ast_stat_decl = command_ast->statistic_decl->as(); + const auto & ast_stat_decl = command_ast->statistics_decl->as(); - command.statistic_columns = ast_stat_decl.getColumnNames(); - command.statistic_types = ast_stat_decl.getTypeNames(); + command.statistics_columns = ast_stat_decl.getColumnNames(); + command.statistics_types = ast_stat_decl.getTypeNames(); command.if_not_exists = command_ast->if_not_exists; return command; } - else if (command_ast->type == ASTAlterCommand::MODIFY_STATISTIC) + else if (command_ast->type == ASTAlterCommand::MODIFY_STATISTICS) { AlterCommand command; command.ast = command_ast->clone(); - command.statistic_decl = command_ast->statistic_decl->clone(); - 
command.type = AlterCommand::MODIFY_STATISTIC; + command.statistics_decl = command_ast->statistics_decl->clone(); + command.type = AlterCommand::MODIFY_STATISTICS; - const auto & ast_stat_decl = command_ast->statistic_decl->as(); + const auto & ast_stat_decl = command_ast->statistics_decl->as(); - command.statistic_columns = ast_stat_decl.getColumnNames(); - command.statistic_types = ast_stat_decl.getTypeNames(); + command.statistics_columns = ast_stat_decl.getColumnNames(); + command.statistics_types = ast_stat_decl.getTypeNames(); command.if_not_exists = command_ast->if_not_exists; return command; @@ -352,17 +352,17 @@ std::optional AlterCommand::parse(const ASTAlterCommand * command_ return command; } - else if (command_ast->type == ASTAlterCommand::DROP_STATISTIC) + else if (command_ast->type == ASTAlterCommand::DROP_STATISTICS) { AlterCommand command; command.ast = command_ast->clone(); - command.statistic_decl = command_ast->statistic_decl->clone(); - command.type = AlterCommand::DROP_STATISTIC; - const auto & ast_stat_decl = command_ast->statistic_decl->as(); + command.statistics_decl = command_ast->statistics_decl->clone(); + command.type = AlterCommand::DROP_STATISTICS; + const auto & ast_stat_decl = command_ast->statistics_decl->as(); - command.statistic_columns = ast_stat_decl.getColumnNames(); + command.statistics_columns = ast_stat_decl.getColumnNames(); command.if_exists = command_ast->if_exists; - command.clear = command_ast->clear_statistic; + command.clear = command_ast->clear_statistics; if (command_ast->partition) command.partition = command_ast->partition->clone(); @@ -691,54 +691,54 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) metadata.secondary_indices.erase(erase_it); } } - else if (type == ADD_STATISTIC) + else if (type == ADD_STATISTICS) { - for (const auto & statistic_column_name : statistic_columns) + for (const auto & statistics_column_name : statistics_columns) { - if (!metadata.columns.has(statistic_column_name)) + if (!metadata.columns.has(statistics_column_name)) { - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Cannot add statistic for column {}: this column is not found", statistic_column_name); + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Cannot add statistics for column {}: this column is not found", statistics_column_name); } } - auto stats_vec = StatisticsDescription::getStatisticsFromAST(statistic_decl, metadata.columns); + auto stats_vec = ColumnStatisticsDescription::getStatisticsDescriptionsFromAST(statistics_decl, metadata.columns); for (const auto & stats : stats_vec) { metadata.columns.modify(stats.column_name, [&](ColumnDescription & column) { column.stats.merge(stats, column, if_not_exists); }); } } - else if (type == DROP_STATISTIC) + else if (type == DROP_STATISTICS) { - for (const auto & statistic_column_name : statistic_columns) + for (const auto & statistics_column_name : statistics_columns) { - if (!metadata.columns.has(statistic_column_name)) + if (!metadata.columns.has(statistics_column_name)) { if (if_exists) return; - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Wrong statistic name. Cannot find statistic {} to drop", backQuote(statistic_column_name)); + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Wrong statistics name. 
Cannot find statistics {} to drop", backQuote(statistics_column_name)); } if (!clear && !partition) - metadata.columns.modify(statistic_column_name, + metadata.columns.modify(statistics_column_name, [&](ColumnDescription & column) { column.stats.clear(); }); } } - else if (type == MODIFY_STATISTIC) + else if (type == MODIFY_STATISTICS) { - for (const auto & statistic_column_name : statistic_columns) + for (const auto & statistics_column_name : statistics_columns) { - if (!metadata.columns.has(statistic_column_name)) + if (!metadata.columns.has(statistics_column_name)) { - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Cannot add statistic for column {}: this column is not found", statistic_column_name); + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Cannot add statistics for column {}: this column is not found", statistics_column_name); } } - auto stats_vec = StatisticsDescription::getStatisticsFromAST(statistic_decl, metadata.columns); + auto stats_vec = ColumnStatisticsDescription::getStatisticsDescriptionsFromAST(statistics_decl, metadata.columns); for (const auto & stats : stats_vec) { metadata.columns.modify(stats.column_name, - [&](ColumnDescription & column) { column.stats.modify(stats); }); + [&](ColumnDescription & column) { column.stats.assign(stats); }); } } else if (type == ADD_CONSTRAINT) @@ -987,7 +987,7 @@ bool AlterCommand::isRequireMutationStage(const StorageInMemoryMetadata & metada if (isRemovingProperty() || type == REMOVE_TTL || type == REMOVE_SAMPLE_BY) return false; - if (type == DROP_INDEX || type == DROP_PROJECTION || type == RENAME_COLUMN || type == DROP_STATISTIC) + if (type == DROP_INDEX || type == DROP_PROJECTION || type == RENAME_COLUMN || type == DROP_STATISTICS) return true; /// Drop alias is metadata alter, in other case mutation is required. @@ -1094,10 +1094,10 @@ std::optional AlterCommand::tryConvertToMutationCommand(Storage result.predicate = nullptr; } - else if (type == DROP_STATISTIC) + else if (type == DROP_STATISTICS) { - result.type = MutationCommand::Type::DROP_STATISTIC; - result.statistic_columns = statistic_columns; + result.type = MutationCommand::Type::DROP_STATISTICS; + result.statistics_columns = statistics_columns; if (clear) result.clear = true; diff --git a/src/Storages/AlterCommands.h b/src/Storages/AlterCommands.h index 10de4ec1a77..68c366b10c5 100644 --- a/src/Storages/AlterCommands.h +++ b/src/Storages/AlterCommands.h @@ -38,9 +38,9 @@ struct AlterCommand DROP_CONSTRAINT, ADD_PROJECTION, DROP_PROJECTION, - ADD_STATISTIC, - DROP_STATISTIC, - MODIFY_STATISTIC, + ADD_STATISTICS, + DROP_STATISTICS, + MODIFY_STATISTICS, MODIFY_TTL, MODIFY_SETTING, RESET_SETTING, @@ -124,9 +124,9 @@ struct AlterCommand /// For ADD/DROP PROJECTION String projection_name; - ASTPtr statistic_decl = nullptr; - std::vector statistic_columns; - std::vector statistic_types; + ASTPtr statistics_decl = nullptr; + std::vector statistics_columns; + std::vector statistics_types; /// For MODIFY TTL ASTPtr ttl = nullptr; diff --git a/src/Storages/ColumnDependency.h b/src/Storages/ColumnDependency.h index b9088dd0227..dcbda7a4b86 100644 --- a/src/Storages/ColumnDependency.h +++ b/src/Storages/ColumnDependency.h @@ -26,8 +26,8 @@ struct ColumnDependency /// TTL is set for @column_name. 
TTL_TARGET, - /// Exists any statistic, that requires @column_name - STATISTIC, + /// Exists any statistics, that requires @column_name + STATISTICS, }; ColumnDependency(const String & column_name_, Kind kind_) diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h index f3798c557b1..63f617a91cd 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -89,7 +89,7 @@ struct ColumnDescription ASTPtr codec; SettingsChanges settings; ASTPtr ttl; - StatisticsDescription stats; + ColumnStatisticsDescription stats; ColumnDescription() = default; ColumnDescription(ColumnDescription &&) = default; diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index b532abc9074..d0db2c02738 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include @@ -233,7 +233,7 @@ StorageID IStorage::getStorageID() const return storage_id; } -ConditionEstimator IStorage::getConditionEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const +ConditionSelectivityEstimator IStorage::getConditionEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const { return {}; } diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 87a04c3fcc6..99f6897a8f5 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -69,7 +69,7 @@ using DatabaseAndTableName = std::pair; class BackupEntriesCollector; class RestorerFromBackup; -class ConditionEstimator; +class ConditionSelectivityEstimator; struct ColumnSize { @@ -136,7 +136,7 @@ public: /// Returns true if the storage supports queries with the PREWHERE section. virtual bool supportsPrewhere() const { return false; } - virtual ConditionEstimator getConditionEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const; + virtual ConditionSelectivityEstimator getConditionEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const; /// Returns which columns supports PREWHERE, or empty std::nullopt if all columns is supported. /// This is needed for engines whose aggregates data from multiple tables, like Merge. 
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index e3765ca43d3..162ce9e1d27 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -657,13 +657,13 @@ String IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize(bool with_subc return *minimum_size_column; } -std::vector IMergeTreeDataPart::loadStatistics() const +ColumnsStatistics IMergeTreeDataPart::loadStatistics() const { const auto & metadata_snaphost = storage.getInMemoryMetadata(); auto total_statistics = MergeTreeStatisticsFactory::instance().getMany(metadata_snaphost.getColumns()); - std::vector result; + ColumnsStatistics result; for (auto & stat : total_statistics) { String file_name = stat->getFileName() + STAT_FILE_SUFFIX; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 1afb7e64fc8..f788e493ca5 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -110,7 +110,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, - const std::vector & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) = 0; @@ -176,7 +176,7 @@ public: void remove(); - std::vector loadStatistics() const; + ColumnsStatistics loadStatistics() const; /// Initialize columns (from columns.txt if exists, or create from column files if not). /// Load various metadata into memory: checksums from checksums.txt, index if required, etc. diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index 7f59b8c674e..ba01ffabd3d 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -632,7 +632,7 @@ void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const /// because all of them were already recalculated and written /// as key part of vertical merge std::vector{}, - std::vector{}, /// TODO(hanfei) + ColumnsStatistics{}, /// TODO(hanfei) &global_ctx->written_offset_columns, global_ctx->to->getIndexGranularity()); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 4b6c7ddf027..c55b7555050 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -73,7 +73,7 @@ #include #include #include -#include +#include #include #include #include @@ -470,7 +470,7 @@ StoragePolicyPtr MergeTreeData::getStoragePolicy() const return storage_policy; } -ConditionEstimator MergeTreeData::getConditionEstimatorByPredicate(const SelectQueryInfo & query_info, const StorageSnapshotPtr & storage_snapshot, ContextPtr local_context) const +ConditionSelectivityEstimator MergeTreeData::getConditionEstimatorByPredicate(const SelectQueryInfo & query_info, const StorageSnapshotPtr & storage_snapshot, ContextPtr local_context) const { if (!local_context->getSettings().allow_statistic_optimize) return {}; @@ -484,23 +484,29 @@ ConditionEstimator MergeTreeData::getConditionEstimatorByPredicate(const SelectQ ASTPtr expression_ast; - ConditionEstimator result; + ConditionSelectivityEstimator result; PartitionPruner partition_pruner(storage_snapshot->metadata, query_info, local_context, true /* strict */); if (partition_pruner.isUseless()) { /// Read 
all partitions. for (const auto & part : parts) + try { auto stats = part->loadStatistics(); /// TODO: We only have one stats file for every part. for (const auto & stat : stats) result.merge(part->info.getPartNameV1(), part->rows_count, stat); } + catch(...) + { + tryLogCurrentException(log, fmt::format("while loading statistics on part {}", part->info.getPartNameV1())); + } } else { for (const auto & part : parts) + try { if (!partition_pruner.canBePruned(*part)) { @@ -509,6 +515,10 @@ ConditionEstimator MergeTreeData::getConditionEstimatorByPredicate(const SelectQ result.merge(part->info.getPartNameV1(), part->rows_count, stat); } } + catch(...) + { + tryLogCurrentException(log, fmt::format("while loading statistics on part {}", part->info.getPartNameV1())); + } } return result; @@ -8354,7 +8364,7 @@ std::pair MergeTreeData::createE const auto & index_factory = MergeTreeIndexFactory::instance(); MergedBlockOutputStream out(new_data_part, metadata_snapshot, columns, index_factory.getMany(metadata_snapshot->getSecondaryIndices()), - std::vector{}, + ColumnsStatistics{}, compression_codec, txn); bool sync_on_insert = settings->fsync_after_insert; diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 0d56b902f1a..501801b93e3 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -426,7 +426,7 @@ public: bool supportsPrewhere() const override { return true; } - ConditionEstimator getConditionEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const override; + ConditionSelectivityEstimator getConditionEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const override; bool supportsFinal() const override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index c5c7d8a1c19..0dec70f4eb1 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -51,7 +51,7 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartCompact::getWriter( const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, - const std::vector & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index 7302aef9d74..560ca5e5425 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -44,7 +44,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, - const std::vector & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index 0f5522ab62e..49e0d09d569 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -57,7 +57,7 @@ IMergeTreeDataPart::MergeTreeWriterPtr MergeTreeDataPartWide::getWriter( const 
NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, - const std::vector & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.h b/src/Storages/MergeTree/MergeTreeDataPartWide.h index 84a566bc9ac..989c8f14e91 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.h @@ -39,7 +39,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, - const std::vector & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, const MergeTreeIndexGranularity & computed_index_granularity) override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp index 3d33d99fe79..eaccfc80d3d 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp @@ -14,7 +14,7 @@ MergeTreeDataPartWriterCompact::MergeTreeDataPartWriterCompact( const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc_, - const std::vector & stats_to_recalc, + const ColumnsStatistics & stats_to_recalc, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h index 81bf3d39f97..e80054675bf 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.h @@ -15,7 +15,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc, - const std::vector & stats_to_recalc, + const ColumnsStatistics & stats_to_recalc, const String & marks_file_extension, const CompressionCodecPtr & default_codec, const MergeTreeWriterSettings & settings, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index 3ca83594d51..e1bdf73bbcf 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -144,7 +144,7 @@ MergeTreeDataPartWriterOnDisk::MergeTreeDataPartWriterOnDisk( const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const MergeTreeIndices & indices_to_recalc_, - const std::vector & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h index 232f013475d..d6802e2b0ab 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.h @@ -108,7 +108,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc, - const 
std::vector & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension, const CompressionCodecPtr & default_codec, const MergeTreeWriterSettings & settings, @@ -152,7 +152,7 @@ protected: const MergeTreeIndices skip_indices; - const std::vector stats; + const ColumnsStatistics stats; std::vector stats_streams; const String marks_file_extension; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp index be8b7b5e9f0..fd978e3de73 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.cpp @@ -80,7 +80,7 @@ MergeTreeDataPartWriterWide::MergeTreeDataPartWriterWide( const NamesAndTypesList & columns_list_, const StorageMetadataPtr & metadata_snapshot_, const std::vector & indices_to_recalc_, - const std::vector & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & settings_, diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h index 5827332195c..3eaef4437fe 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterWide.h @@ -25,7 +25,7 @@ public: const NamesAndTypesList & columns_list, const StorageMetadataPtr & metadata_snapshot, const std::vector & indices_to_recalc, - const std::vector & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension, const CompressionCodecPtr & default_codec, const MergeTreeWriterSettings & settings, diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index bfa5aa23ba8..d95fb33e647 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -735,7 +735,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeProjectionPartImpl( columns, MergeTreeIndices{}, /// TODO(hanfei): It should be helpful to write statistics for projection result. 
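Note on the writer-interface hunks above: each one performs the same alias substitution. A minimal sketch of the alias, assuming the expansion implied by the parameters it replaces (the alias itself is declared in Statistics.h further down in this series):

    using ColumnStatisticsPtr = std::shared_ptr<ColumnStatistics>;
    using ColumnsStatistics = std::vector<ColumnStatisticsPtr>;

With that alias, a spelled-out vector-of-ColumnStatisticsPtr parameter becomes const ColumnsStatistics &, and the empty vector literals become ColumnsStatistics{}, with no behavioural change.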
- std::vector{}, + ColumnsStatistics{}, compression_codec, NO_TRANSACTION_PTR, false, false, data.getContext()->getWriteSettings()); diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 21bde79873f..3309a5fcb92 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -53,7 +53,7 @@ static Int64 findMinPosition(const NameSet & condition_table_columns, const Name MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( std::unordered_map column_sizes_, const StorageMetadataPtr & metadata_snapshot, - const ConditionEstimator & estimator_, + const ConditionSelectivityEstimator & estimator_, const Names & queried_columns_, const std::optional & supported_columns_, LoggerPtr log_) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index fa1724f6c8c..813f4a78ea4 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include @@ -38,7 +38,7 @@ public: MergeTreeWhereOptimizer( std::unordered_map column_sizes_, const StorageMetadataPtr & metadata_snapshot, - const ConditionEstimator & estimator_, + const ConditionSelectivityEstimator & estimator_, const Names & queried_columns_, const std::optional & supported_columns_, LoggerPtr log_); @@ -147,7 +147,7 @@ private: static NameSet determineArrayJoinedNames(const ASTSelectQuery & select); - const ConditionEstimator estimator; + const ConditionSelectivityEstimator estimator; const NameSet table_columns; const Names queried_columns; diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 72e05d12ae6..2c0b0a29012 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -19,7 +19,7 @@ MergedBlockOutputStream::MergedBlockOutputStream( const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & columns_list_, const MergeTreeIndices & skip_indices, - const std::vector & statistics, + const ColumnsStatistics & statistics, CompressionCodecPtr default_codec_, const MergeTreeTransactionPtr & txn, bool reset_columns_, diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.h b/src/Storages/MergeTree/MergedBlockOutputStream.h index 0d6c76794bd..001767320f2 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.h +++ b/src/Storages/MergeTree/MergedBlockOutputStream.h @@ -20,7 +20,7 @@ public: const StorageMetadataPtr & metadata_snapshot_, const NamesAndTypesList & columns_list_, const MergeTreeIndices & skip_indices, - const std::vector & statistics, + const ColumnsStatistics & statistics, CompressionCodecPtr default_codec_, const MergeTreeTransactionPtr & txn, bool reset_columns_ = false, diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp index 74f6eb020b3..95f186d1b86 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.cpp @@ -16,7 +16,7 @@ MergedColumnOnlyOutputStream::MergedColumnOnlyOutputStream( const Block & header_, CompressionCodecPtr default_codec, const MergeTreeIndices & indices_to_recalc, - const std::vector & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, WrittenOffsetColumns * offset_columns_, const 
MergeTreeIndexGranularity & index_granularity, const MergeTreeIndexGranularityInfo * index_granularity_info) diff --git a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h index c734acf71c7..16a54ff33b6 100644 --- a/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h +++ b/src/Storages/MergeTree/MergedColumnOnlyOutputStream.h @@ -20,7 +20,7 @@ public: const Block & header_, CompressionCodecPtr default_codec_, const MergeTreeIndices & indices_to_recalc_, - const std::vector & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, WrittenOffsetColumns * offset_columns_ = nullptr, const MergeTreeIndexGranularity & index_granularity = {}, const MergeTreeIndexGranularityInfo * index_granularity_info_ = nullptr); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 0e84d002320..ebb71e1e2a4 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -114,7 +114,7 @@ static void splitAndModifyMutationCommands( } } if (command.type == MutationCommand::Type::MATERIALIZE_INDEX - || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC + || command.type == MutationCommand::Type::MATERIALIZE_STATISTICS || command.type == MutationCommand::Type::MATERIALIZE_PROJECTION || command.type == MutationCommand::Type::MATERIALIZE_TTL || command.type == MutationCommand::Type::DELETE @@ -127,7 +127,7 @@ static void splitAndModifyMutationCommands( } else if (command.type == MutationCommand::Type::DROP_INDEX || command.type == MutationCommand::Type::DROP_PROJECTION - || command.type == MutationCommand::Type::DROP_STATISTIC) + || command.type == MutationCommand::Type::DROP_STATISTICS) { for_file_renames.push_back(command); } @@ -242,7 +242,7 @@ static void splitAndModifyMutationCommands( for_interpreter.push_back(command); } else if (command.type == MutationCommand::Type::MATERIALIZE_INDEX - || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC + || command.type == MutationCommand::Type::MATERIALIZE_STATISTICS || command.type == MutationCommand::Type::MATERIALIZE_PROJECTION || command.type == MutationCommand::Type::MATERIALIZE_TTL || command.type == MutationCommand::Type::DELETE @@ -253,7 +253,7 @@ static void splitAndModifyMutationCommands( } else if (command.type == MutationCommand::Type::DROP_INDEX || command.type == MutationCommand::Type::DROP_PROJECTION - || command.type == MutationCommand::Type::DROP_STATISTIC) + || command.type == MutationCommand::Type::DROP_STATISTICS) { for_file_renames.push_back(command); } @@ -756,11 +756,11 @@ static NameToNameVector collectFilesForRenames( if (source_part->checksums.has(command.column_name + ".proj")) add_rename(command.column_name + ".proj", ""); } - else if (command.type == MutationCommand::Type::DROP_STATISTIC) + else if (command.type == MutationCommand::Type::DROP_STATISTICS) { - for (const auto & statistic_column_name : command.statistic_columns) - if (source_part->checksums.has(STAT_FILE_PREFIX + statistic_column_name + STAT_FILE_SUFFIX)) - add_rename(STAT_FILE_PREFIX + statistic_column_name + STAT_FILE_SUFFIX, ""); + for (const auto & statistics_column_name : command.statistics_columns) + if (source_part->checksums.has(STAT_FILE_PREFIX + statistics_column_name + STAT_FILE_SUFFIX)) + add_rename(STAT_FILE_PREFIX + statistics_column_name + STAT_FILE_SUFFIX, ""); } else if (isWidePart(source_part)) { @@ -781,7 +781,7 @@ static NameToNameVector collectFilesForRenames( if (auto serialization = 
source_part->tryGetSerialization(command.column_name)) serialization->enumerateStreams(callback); - /// if we drop a column with statistic, we should also drop the stat file. + /// if we drop a column with statistics, we should also drop the stat file. if (source_part->checksums.has(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) add_rename(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX, ""); } @@ -817,7 +817,7 @@ static NameToNameVector collectFilesForRenames( if (auto serialization = source_part->tryGetSerialization(command.column_name)) serialization->enumerateStreams(callback); - /// if we rename a column with statistic, we should also rename the stat file. + /// if we rename a column with statistics, we should also rename the stat file. if (source_part->checksums.has(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) add_rename(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX, STAT_FILE_PREFIX + command.rename_to + STAT_FILE_SUFFIX); } @@ -1457,8 +1457,8 @@ private: { if (command.type == MutationCommand::DROP_INDEX) removed_indices.insert(command.column_name); - else if (command.type == MutationCommand::DROP_STATISTIC) - for (const auto & column_name : command.statistic_columns) + else if (command.type == MutationCommand::DROP_STATISTICS) + for (const auto & column_name : command.statistics_columns) removed_stats.insert(column_name); else if (command.type == MutationCommand::RENAME_COLUMN && ctx->source_part->checksums.files.contains(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) @@ -1498,7 +1498,7 @@ private: } } - std::vector stats_to_rewrite; + ColumnsStatistics stats_to_rewrite; const auto & columns = ctx->metadata_snapshot->getColumns(); for (const auto & col : columns) { @@ -1512,7 +1512,7 @@ private: else { /// We do not hard-link statistics which - /// 1. In `DROP STATISTIC` statement. It is filtered by `removed_stats` + /// 1. In `DROP STATISTICS` statement. It is filtered by `removed_stats` /// 2. Not in column list anymore, including `DROP COLUMN`. It is not touched by this loop. 
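For orientation in the rename bookkeeping above: each column's statistics live in a single per-part file addressed purely by name. A hedged sketch of the scheme, using the constants from Statistics.h (STAT_FILE_SUFFIX is ".stat"; STAT_FILE_PREFIX is defined beside it):

    String stats_file = STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX;

DROP STATISTICS maps that file to an empty target, i.e. removal, while RENAME COLUMN maps it to the same expression built from command.rename_to, exactly as the add_rename calls above do.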
String stat_file_name = STAT_FILE_PREFIX + col.name + STAT_FILE_SUFFIX; auto it = ctx->source_part->checksums.files.find(stat_file_name); @@ -1888,7 +1888,7 @@ private: ctx->updated_header, ctx->compression_codec, std::vector(ctx->indices_to_recalc.begin(), ctx->indices_to_recalc.end()), - std::vector(ctx->stats_to_recalc.begin(), ctx->stats_to_recalc.end()), + ColumnsStatistics(ctx->stats_to_recalc.begin(), ctx->stats_to_recalc.end()), nullptr, ctx->source_part->index_granularity, &ctx->source_part->index_granularity_info diff --git a/src/Storages/MutationCommands.cpp b/src/Storages/MutationCommands.cpp index a41c5833109..f736c863eee 100644 --- a/src/Storages/MutationCommands.cpp +++ b/src/Storages/MutationCommands.cpp @@ -83,15 +83,15 @@ std::optional MutationCommand::parse(ASTAlterCommand * command, res.index_name = command->index->as().name(); return res; } - else if (command->type == ASTAlterCommand::MATERIALIZE_STATISTIC) + else if (command->type == ASTAlterCommand::MATERIALIZE_STATISTICS) { MutationCommand res; res.ast = command->ptr(); - res.type = MATERIALIZE_STATISTIC; + res.type = MATERIALIZE_STATISTICS; if (command->partition) res.partition = command->partition->clone(); res.predicate = nullptr; - res.statistic_columns = command->statistic_decl->as().getColumnNames(); + res.statistics_columns = command->statistics_decl->as().getColumnNames(); return res; } else if (command->type == ASTAlterCommand::MATERIALIZE_PROJECTION) @@ -150,16 +150,16 @@ std::optional MutationCommand::parse(ASTAlterCommand * command, res.clear = true; return res; } - else if (parse_alter_commands && command->type == ASTAlterCommand::DROP_STATISTIC) + else if (parse_alter_commands && command->type == ASTAlterCommand::DROP_STATISTICS) { MutationCommand res; res.ast = command->ptr(); - res.type = MutationCommand::Type::DROP_STATISTIC; + res.type = MutationCommand::Type::DROP_STATISTICS; if (command->partition) res.partition = command->partition->clone(); if (command->clear_index) res.clear = true; - res.statistic_columns = command->statistic_decl->as().getColumnNames(); + res.statistics_columns = command->statistics_decl->as().getColumnNames(); return res; } else if (parse_alter_commands && command->type == ASTAlterCommand::DROP_PROJECTION) diff --git a/src/Storages/MutationCommands.h b/src/Storages/MutationCommands.h index 9d5e02db1b4..f999aab1f4d 100644 --- a/src/Storages/MutationCommands.h +++ b/src/Storages/MutationCommands.h @@ -30,12 +30,12 @@ struct MutationCommand UPDATE, MATERIALIZE_INDEX, MATERIALIZE_PROJECTION, - MATERIALIZE_STATISTIC, + MATERIALIZE_STATISTICS, READ_COLUMN, /// Read column and apply conversions (MODIFY COLUMN alter query). DROP_COLUMN, DROP_INDEX, DROP_PROJECTION, - DROP_STATISTIC, + DROP_STATISTICS, MATERIALIZE_TTL, RENAME_COLUMN, MATERIALIZE_COLUMN, @@ -51,11 +51,11 @@ struct MutationCommand /// Columns with corresponding actions std::unordered_map column_to_update_expression = {}; - /// For MATERIALIZE INDEX and PROJECTION and STATISTIC + /// For MATERIALIZE INDEX and PROJECTION and STATISTICS String index_name = {}; String projection_name = {}; - std::vector statistic_columns = {}; - std::vector statistic_types = {}; + std::vector statistics_columns = {}; + std::vector statistics_types = {}; /// For MATERIALIZE INDEX, UPDATE and DELETE. 
ASTPtr partition = {}; diff --git a/src/Storages/Statistics/Estimator.cpp b/src/Storages/Statistics/ConditionEstimator.cpp similarity index 63% rename from src/Storages/Statistics/Estimator.cpp rename to src/Storages/Statistics/ConditionEstimator.cpp index 34a0c61aeda..05ea5bc62a5 100644 --- a/src/Storages/Statistics/Estimator.cpp +++ b/src/Storages/Statistics/ConditionEstimator.cpp @@ -1,4 +1,4 @@ -#include +#include #include namespace DB @@ -9,53 +9,53 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -void ConditionEstimator::ColumnEstimator::merge(std::string part_name, ColumnStatisticsPtr stats) +void ConditionSelectivityEstimator::ColumnSelectivityEstimator::merge(String part_name, ColumnStatisticsPtr stats) { - if (estimators.contains(part_name)) + if (part_statistics.contains(part_name)) throw Exception(ErrorCodes::LOGICAL_ERROR, "part {} has been added in column {}", part_name, stats->columnName()); - estimators[part_name] = stats; + part_statistics[part_name] = stats; } -Float64 ConditionEstimator::ColumnEstimator::estimateLess(Float64 val, Float64 total) const +Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateLess(Float64 val, Float64 rows) const { - if (estimators.empty()) - return default_normal_cond_factor * total; + if (part_statistics.empty()) + return default_normal_cond_factor * rows; Float64 result = 0; - Float64 partial_cnt = 0; - for (const auto & [key, estimator] : estimators) + Float64 part_rows = 0; + for (const auto & [key, estimator] : part_statistics) { result += estimator->estimateLess(val); - partial_cnt += estimator->count(); + part_rows += estimator->count(); } - return result * total / partial_cnt; + return result * rows / part_rows; } -Float64 ConditionEstimator::ColumnEstimator::estimateGreater(Float64 val, Float64 total) const +Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateGreater(Float64 val, Float64 rows) const { - return total - estimateLess(val, total); + return rows - estimateLess(val, rows); } -Float64 ConditionEstimator::ColumnEstimator::estimateEqual(Float64 val, Float64 total) const +Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateEqual(Float64 val, Float64 rows) const { - if (estimators.empty()) + if (part_statistics.empty()) { if (val < - threshold || val > threshold) - return default_normal_cond_factor * total; + return default_normal_cond_factor * rows; else - return default_good_cond_factor * total; + return default_good_cond_factor * rows; } Float64 result = 0; Float64 partial_cnt = 0; - for (const auto & [key, estimator] : estimators) + for (const auto & [key, estimator] : part_statistics) { result += estimator->estimateEqual(val); partial_cnt += estimator->count(); } - return result * total / partial_cnt; + return result * rows / partial_cnt; } /// second return value represents how many columns in the node. 
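The merged estimator above extrapolates from whichever parts actually carry statistics: if those parts hold part_rows rows in total and their statistics predict result matching rows, the estimate for a table of rows rows is result * rows / part_rows. A worked example with assumed numbers: parts with statistics cover 1,000,000 rows and their TDigests predict 10,000 matches, so for a 4,000,000-row table estimateLess returns 10000 * 4000000 / 1000000 = 40,000.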
-static std::pair tryToExtractSingleColumn(const RPNBuilderTreeNode & node) +static std::pair tryToExtractSingleColumn(const RPNBuilderTreeNode & node) { if (node.isConstant()) { @@ -70,7 +70,7 @@ static std::pair tryToExtractSingleColumn(const RPNBuilderTr auto function_node = node.toFunctionNode(); size_t arguments_size = function_node.getArgumentsSize(); - std::pair result; + std::pair result; for (size_t i = 0; i < arguments_size; ++i) { auto function_argument = function_node.getArgumentAt(i); @@ -87,7 +87,7 @@ static std::pair tryToExtractSingleColumn(const RPNBuilderTr return result; } -std::pair ConditionEstimator::extractBinaryOp(const RPNBuilderTreeNode & node, const std::string & column_name) const +std::pair ConditionSelectivityEstimator::extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const { if (!node.isFunction()) return {}; @@ -96,7 +96,7 @@ std::pair ConditionEstimator::extractBinaryOp(const RPNBui if (function_node.getArgumentsSize() != 2) return {}; - std::string function_name = function_node.getFunctionName(); + String function_name = function_node.getFunctionName(); auto lhs_argument = function_node.getArgumentAt(0); auto rhs_argument = function_node.getArgumentAt(1); @@ -137,7 +137,7 @@ std::pair ConditionEstimator::extractBinaryOp(const RPNBui return std::make_pair(function_name, value); } -Float64 ConditionEstimator::estimateRowCount(const RPNBuilderTreeNode & node) const +Float64 ConditionSelectivityEstimator::estimateRowCount(const RPNBuilderTreeNode & node) const { auto result = tryToExtractSingleColumn(node); if (result.second != 1) @@ -149,8 +149,8 @@ Float64 ConditionEstimator::estimateRowCount(const RPNBuilderTreeNode & node) co /// If there the estimator of the column is not found or there are no data at all, /// we use dummy estimation. 
- bool dummy = total_count == 0; - ColumnEstimator estimator; + bool dummy = total_rows == 0; + ColumnSelectivityEstimator estimator; if (it != column_estimators.end()) { estimator = it->second; @@ -165,33 +165,33 @@ Float64 ConditionEstimator::estimateRowCount(const RPNBuilderTreeNode & node) co if (dummy) { if (val < - threshold || val > threshold) - return default_normal_cond_factor * total_count; + return default_normal_cond_factor * total_rows; else - return default_good_cond_factor * total_count; + return default_good_cond_factor * total_rows; } - return estimator.estimateEqual(val, total_count); + return estimator.estimateEqual(val, total_rows); } - else if (op == "less" || op == "lessThan") + else if (op == "less" || op == "lessOrEquals") { if (dummy) - return default_normal_cond_factor * total_count; - return estimator.estimateLess(val, total_count); + return default_normal_cond_factor * total_rows; + return estimator.estimateLess(val, total_rows); } - else if (op == "greater" || op == "greaterThan") + else if (op == "greater" || op == "greaterOrEquals") { if (dummy) - return default_normal_cond_factor * total_count; - return estimator.estimateGreater(val, total_count); + return default_normal_cond_factor * total_rows; + return estimator.estimateGreater(val, total_rows); } else - return default_unknown_cond_factor * total_count; + return default_unknown_cond_factor * total_rows; } -void ConditionEstimator::merge(std::string part_name, UInt64 part_count, ColumnStatisticsPtr column_stat) +void ConditionSelectivityEstimator::merge(String part_name, UInt64 part_rows, ColumnStatisticsPtr column_stat) { if (!part_names.contains(part_name)) { - total_count += part_count; + total_rows += part_rows; part_names.insert(part_name); } if (column_stat != nullptr) diff --git a/src/Storages/Statistics/Estimator.h b/src/Storages/Statistics/ConditionEstimator.h similarity index 50% rename from src/Storages/Statistics/Estimator.h rename to src/Storages/Statistics/ConditionEstimator.h index e7f8316e2bc..4e5b12194d2 100644 --- a/src/Storages/Statistics/Estimator.h +++ b/src/Storages/Statistics/ConditionEstimator.h @@ -8,10 +8,25 @@ namespace DB class RPNBuilderTreeNode; /// It estimates the selectivity of a condition. -class ConditionEstimator +class ConditionSelectivityEstimator { private: friend class ColumnStatistics; + struct ColumnSelectivityEstimator + { + /// We store the part_name and part_statistics. + /// then simply get selectivity for every part_statistics and combine them. + std::map part_statistics; + + void merge(String part_name, ColumnStatisticsPtr stats); + + Float64 estimateLess(Float64 val, Float64 rows) const; + + Float64 estimateGreater(Float64 val, Float64 rows) const; + + Float64 estimateEqual(Float64 val, Float64 rows) const; + }; + static constexpr auto default_good_cond_factor = 0.1; static constexpr auto default_normal_cond_factor = 0.5; static constexpr auto default_unknown_cond_factor = 1.0; @@ -19,35 +34,19 @@ private: /// This is used to assume that condition is likely to have good selectivity. static constexpr auto threshold = 2; - UInt64 total_count = 0; - - /// An estimator for a column consists of several PartColumnEstimator. - /// We simply get selectivity for every part estimator and combine the result. 
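When the column has no usable statistics, estimateRowCount falls back to the constant factors declared in this header. A condensed summary of the branches (constants and threshold as defined above):

    col = v, |v| >  2    ->  0.5 * total_rows   (default_normal_cond_factor)
    col = v, |v| <= 2    ->  0.1 * total_rows   (default_good_cond_factor)
    col < v or col > v   ->  0.5 * total_rows
    any other predicate  ->  1.0 * total_rows   (default_unknown_cond_factor)

The threshold of 2 encodes the assumption that equality against a small constant is likely to be selective.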
- struct ColumnEstimator - { - std::map estimators; - - void merge(std::string part_name, ColumnStatisticsPtr stats); - - Float64 estimateLess(Float64 val, Float64 total) const; - - Float64 estimateGreater(Float64 val, Float64 total) const; - - Float64 estimateEqual(Float64 val, Float64 total) const; - }; - + UInt64 total_rows = 0; std::set part_names; - std::map column_estimators; - std::pair extractBinaryOp(const RPNBuilderTreeNode & node, const std::string & column_name) const; + std::map column_estimators; + std::pair extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const; public: - ConditionEstimator() = default; + ConditionSelectivityEstimator() = default; /// TODO: Support the condition consists of CNF/DNF like (cond1 and cond2) or (cond3) ... /// Right now we only support simple condition like col = val / col < val Float64 estimateRowCount(const RPNBuilderTreeNode & node) const; - void merge(std::string part_name, UInt64 part_count, ColumnStatisticsPtr column_stat); + void merge(String part_name, UInt64 part_rows, ColumnStatisticsPtr column_stat); }; } diff --git a/src/Storages/Statistics/Statistics.cpp b/src/Storages/Statistics/Statistics.cpp index e05147e3a4a..933de06fa97 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -1,11 +1,10 @@ #include #include -#include #include -#include -#include -#include +#include +#include +#include #include #include #include @@ -19,7 +18,6 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int INCORRECT_QUERY; - extern const int ILLEGAL_STATISTIC; } enum StatisticsFileVersion : UInt16 @@ -29,14 +27,14 @@ enum StatisticsFileVersion : UInt16 /// Version / bitmask of statistics / data of statistics / -ColumnStatistics::ColumnStatistics(const StatisticsDescription & stats_desc_) - : stats_desc(stats_desc_), counter(0) +ColumnStatistics::ColumnStatistics(const ColumnStatisticsDescription & stats_desc_) + : stats_desc(stats_desc_), rows(0) { } void ColumnStatistics::update(const ColumnPtr & column) { - counter += column->size(); + rows += column->size(); for (const auto & iter : stats) { iter.second->update(column); @@ -45,31 +43,31 @@ void ColumnStatistics::update(const ColumnPtr & column) Float64 ColumnStatistics::estimateLess(Float64 val) const { - if (stats.contains(TDigest)) - return std::static_pointer_cast(stats.at(TDigest))->estimateLess(val); - return counter * ConditionEstimator::default_normal_cond_factor; + if (stats.contains(StatisticsType::TDigest)) + return std::static_pointer_cast(stats.at(StatisticsType::TDigest))->estimateLess(val); + return rows * ConditionSelectivityEstimator::default_normal_cond_factor; } Float64 ColumnStatistics::estimateGreater(Float64 val) const { - return counter - estimateLess(val); + return rows - estimateLess(val); } Float64 ColumnStatistics::estimateEqual(Float64 val) const { - if (stats.contains(Uniq) && stats.contains(TDigest)) + if (stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest)) { - auto uniq_static = std::static_pointer_cast(stats.at(Uniq)); + auto uniq_static = std::static_pointer_cast(stats.at(StatisticsType::Uniq)); if (uniq_static->getCardinality() < 2048) { - auto tdigest_static = std::static_pointer_cast(stats.at(TDigest)); + auto tdigest_static = std::static_pointer_cast(stats.at(StatisticsType::TDigest)); return tdigest_static->estimateEqual(val); } } - if (val < - ConditionEstimator::threshold || val > ConditionEstimator::threshold) - return counter * 
ConditionEstimator::default_normal_cond_factor; + if (val < - ConditionSelectivityEstimator::threshold || val > ConditionSelectivityEstimator::threshold) + return rows * ConditionSelectivityEstimator::default_normal_cond_factor; else - return counter * ConditionEstimator::default_good_cond_factor; + return rows * ConditionSelectivityEstimator::default_good_cond_factor; } void ColumnStatistics::serialize(WriteBuffer & buf) @@ -78,11 +76,11 @@ void ColumnStatistics::serialize(WriteBuffer & buf) UInt64 stat_types_mask = 0; for (const auto & [type, _]: stats) { - stat_types_mask |= 1 << type; + stat_types_mask |= 1 << UInt8(type); } writeIntBinary(stat_types_mask, buf); /// We write some basic statistics - writeIntBinary(counter, buf); + writeIntBinary(rows, buf); /// We write complex statistics for (const auto & [type, stat_ptr]: stats) { @@ -99,10 +97,10 @@ void ColumnStatistics::deserialize(ReadBuffer &buf) UInt64 stat_types_mask = 0; readIntBinary(stat_types_mask, buf); - readIntBinary(counter, buf); + readIntBinary(rows, buf); for (auto it = stats.begin(); it != stats.end();) { - if (!(stat_types_mask & 1 << (it->first))) + if (!(stat_types_mask & 1 << UInt8(it->first))) { stats.erase(it ++); } @@ -114,49 +112,40 @@ void ColumnStatistics::deserialize(ReadBuffer &buf) } } -void MergeTreeStatisticsFactory::registerCreator(StatisticType stat_type, Creator creator) +String ColumnStatistics::getFileName() const +{ + return STAT_FILE_PREFIX + columnName(); +} + +const String & ColumnStatistics::columnName() const +{ + return stats_desc.column_name; +} + +UInt64 ColumnStatistics::count() const +{ + return rows; +} + +void MergeTreeStatisticsFactory::registerCreator(StatisticsType stat_type, Creator creator) { if (!creators.emplace(stat_type, std::move(creator)).second) throw Exception(ErrorCodes::LOGICAL_ERROR, "MergeTreeStatisticsFactory: the statistic creator type {} is not unique", stat_type); } -void MergeTreeStatisticsFactory::registerValidator(StatisticType stat_type, Validator validator) +void MergeTreeStatisticsFactory::registerValidator(StatisticsType stat_type, Validator validator) { if (!validators.emplace(stat_type, std::move(validator)).second) throw Exception(ErrorCodes::LOGICAL_ERROR, "MergeTreeStatisticsFactory: the statistic validator type {} is not unique", stat_type); } -StatisticPtr TDigestCreator(const StatisticDescription & stat, DataTypePtr) -{ - return StatisticPtr(new TDigestStatistic(stat)); -} - -void TDigestValidator(const StatisticDescription &, DataTypePtr data_type) -{ - data_type = removeNullable(data_type); - if (!data_type->isValueRepresentedByNumber()) - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "TDigest does not support type {}", data_type->getName()); -} - -void UniqValidator(const StatisticDescription &, DataTypePtr data_type) -{ - data_type = removeNullable(data_type); - if (!data_type->isValueRepresentedByNumber()) - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Uniq does not support type {}", data_type->getName()); -} - -StatisticPtr UniqCreator(const StatisticDescription & stat, DataTypePtr data_type) -{ - return StatisticPtr(new UniqStatistic(stat, data_type)); -} - MergeTreeStatisticsFactory::MergeTreeStatisticsFactory() { - registerCreator(TDigest, TDigestCreator); - registerCreator(Uniq, UniqCreator); - registerValidator(TDigest, TDigestValidator); - registerValidator(Uniq, UniqValidator); + registerCreator(StatisticsType::TDigest, TDigestCreator); + registerCreator(StatisticsType::Uniq, UniqCreator); + 
registerValidator(StatisticsType::TDigest, TDigestValidator); + registerValidator(StatisticsType::Uniq, UniqValidator); } MergeTreeStatisticsFactory & MergeTreeStatisticsFactory::instance() @@ -165,9 +154,9 @@ MergeTreeStatisticsFactory & MergeTreeStatisticsFactory::instance() return instance; } -void MergeTreeStatisticsFactory::validate(const StatisticsDescription & stats, DataTypePtr data_type) const +void MergeTreeStatisticsFactory::validate(const ColumnStatisticsDescription & stats, DataTypePtr data_type) const { - for (const auto & [type, desc] : stats.stats) + for (const auto & [type, desc] : stats.types_to_desc) { auto it = validators.find(type); if (it == validators.end()) @@ -178,16 +167,16 @@ void MergeTreeStatisticsFactory::validate(const StatisticsDescription & stats, D } } -ColumnStatisticsPtr MergeTreeStatisticsFactory::get(const StatisticsDescription & stats) const +ColumnStatisticsPtr MergeTreeStatisticsFactory::get(const ColumnStatisticsDescription & stats) const { ColumnStatisticsPtr column_stat = std::make_shared(stats); - for (const auto & [type, desc] : stats.stats) + for (const auto & [type, desc] : stats.types_to_desc) { auto it = creators.find(type); if (it == creators.end()) { throw Exception(ErrorCodes::INCORRECT_QUERY, - "Unknown Statistic type '{}'. Available types: tdigest", type); + "Unknown Statistic type '{}'. Available types: tdigest, uniq", type); } auto stat_ptr = (it->second)(desc, stats.data_type); column_stat->stats[type] = stat_ptr; @@ -195,9 +184,9 @@ ColumnStatisticsPtr MergeTreeStatisticsFactory::get(const StatisticsDescription return column_stat; } -std::vector MergeTreeStatisticsFactory::getMany(const ColumnsDescription & columns) const +ColumnsStatistics MergeTreeStatisticsFactory::getMany(const ColumnsDescription & columns) const { - std::vector result; + ColumnsStatistics result; for (const auto & col : columns) if (!col.stats.empty()) result.push_back(get(col.stats)); diff --git a/src/Storages/Statistics/Statistics.h b/src/Storages/Statistics/Statistics.h index 96992a254d2..1c111ba3a93 100644 --- a/src/Storages/Statistics/Statistics.h +++ b/src/Storages/Statistics/Statistics.h @@ -20,22 +20,18 @@ constexpr auto STAT_FILE_SUFFIX = ".stat"; namespace DB { -class IStatistic; -using StatisticPtr = std::shared_ptr; -/// using Statistics = std::vector; - -/// Statistic contains the distribution of values in a column. +/// Statistics contains the distribution of values in a column. 
/// right now we support /// - tdigest /// - uniq(hyperloglog) -class IStatistic +class IStatistics { public: - explicit IStatistic(const StatisticDescription & stat_) + explicit IStatistics(const SingleStatisticsDescription & stat_) : stat(stat_) { } - virtual ~IStatistic() = default; + virtual ~IStatistics() = default; virtual void serialize(WriteBuffer & buf) = 0; @@ -45,44 +41,42 @@ public: protected: - StatisticDescription stat; + SingleStatisticsDescription stat; }; +using StatisticsPtr = std::shared_ptr; + class ColumnStatistics; using ColumnStatisticsPtr = std::shared_ptr; +using ColumnsStatistics = std::vector; class ColumnStatistics { - friend class MergeTreeStatisticsFactory; - StatisticsDescription stats_desc; - std::map stats; - UInt64 counter; public: - explicit ColumnStatistics(const StatisticsDescription & stats_); + explicit ColumnStatistics(const ColumnStatisticsDescription & stats_); void serialize(WriteBuffer & buf); void deserialize(ReadBuffer & buf); - String getFileName() const - { - return STAT_FILE_PREFIX + columnName(); - } + String getFileName() const; - const String & columnName() const - { - return stats_desc.column_name; - } + const String & columnName() const; - UInt64 count() const { return counter; } + UInt64 count() const; void update(const ColumnPtr & column); - /// void merge(ColumnStatisticsPtr other_column_stats); - Float64 estimateLess(Float64 val) const; Float64 estimateGreater(Float64 val) const; Float64 estimateEqual(Float64 val) const; + +private: + + friend class MergeTreeStatisticsFactory; + ColumnStatisticsDescription stats_desc; + std::map stats; + UInt64 rows; /// the number of rows of the column }; class ColumnsDescription; @@ -92,25 +86,25 @@ class MergeTreeStatisticsFactory : private boost::noncopyable public: static MergeTreeStatisticsFactory & instance(); - void validate(const StatisticsDescription & stats, DataTypePtr data_type) const; + void validate(const ColumnStatisticsDescription & stats, DataTypePtr data_type) const; - using Creator = std::function; + using Creator = std::function; - using Validator = std::function; + using Validator = std::function; - ColumnStatisticsPtr get(const StatisticsDescription & stat) const; + ColumnStatisticsPtr get(const ColumnStatisticsDescription & stat) const; - std::vector getMany(const ColumnsDescription & columns) const; + ColumnsStatistics getMany(const ColumnsDescription & columns) const; - void registerCreator(StatisticType type, Creator creator); - void registerValidator(StatisticType type, Validator validator); + void registerCreator(StatisticsType type, Creator creator); + void registerValidator(StatisticsType type, Validator validator); protected: MergeTreeStatisticsFactory(); private: - using Creators = std::unordered_map; - using Validators = std::unordered_map; + using Creators = std::unordered_map; + using Validators = std::unordered_map; Creators creators; Validators validators; }; diff --git a/src/Storages/Statistics/TDigestStatistic.cpp b/src/Storages/Statistics/TDigestStatistic.cpp deleted file mode 100644 index a3353595216..00000000000 --- a/src/Storages/Statistics/TDigestStatistic.cpp +++ /dev/null @@ -1,38 +0,0 @@ -#include - -namespace DB -{ - -Float64 TDigestStatistic::estimateLess(Float64 val) const -{ - return data.getCountLessThan(val); -} - -Float64 TDigestStatistic::estimateEqual(Float64 val) const -{ - return data.getCountEqual(val); -} - -void TDigestStatistic::serialize(WriteBuffer & buf) -{ - data.serialize(buf); -} - -void TDigestStatistic::deserialize(ReadBuffer & 
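Read together with ColumnStatistics::estimateEqual in Statistics.cpp above, the two statistics cooperate: Uniq gates whether the TDigest answer is trusted for equality predicates. A condensed sketch, where has_uniq and has_tdigest stand in for the stats-map lookups:

    Float64 estimateEqual(Float64 val)
    {
        if (has_uniq && has_tdigest && uniq->getCardinality() < 2048)
            return tdigest->estimateEqual(val); /// low cardinality: trust the histogram
        /// otherwise fall back to the constant factors
        return (val < -2 || val > 2 ? 0.5 : 0.1) * rows;
    }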
buf) -{ - data.deserialize(buf); -} - -void TDigestStatistic::update(const ColumnPtr & column) -{ - size_t size = column->size(); - - for (size_t i = 0; i < size; ++i) - { - /// TODO: support more types. - Float64 value = column->getFloat64(i); - data.add(value, 1); - } -} - -} diff --git a/src/Storages/Statistics/TDigestStatistics.cpp b/src/Storages/Statistics/TDigestStatistics.cpp new file mode 100644 index 00000000000..0cb0282f015 --- /dev/null +++ b/src/Storages/Statistics/TDigestStatistics.cpp @@ -0,0 +1,55 @@ +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_STATISTICS; +} + +Float64 TDigestStatistics::estimateLess(Float64 val) const +{ + return data.getCountLessThan(val); +} + +Float64 TDigestStatistics::estimateEqual(Float64 val) const +{ + return data.getCountEqual(val); +} + +void TDigestStatistics::serialize(WriteBuffer & buf) +{ + data.serialize(buf); +} + +void TDigestStatistics::deserialize(ReadBuffer & buf) +{ + data.deserialize(buf); +} + +void TDigestStatistics::update(const ColumnPtr & column) +{ + size_t size = column->size(); + + for (size_t i = 0; i < size; ++i) + { + /// TODO: support more types. + Float64 value = column->getFloat64(i); + data.add(value, 1); + } +} + +StatisticsPtr TDigestCreator(const SingleStatisticsDescription & stat, DataTypePtr) +{ + return std::make_shared(stat); +} + +void TDigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type) +{ + data_type = removeNullable(data_type); + if (!data_type->isValueRepresentedByNumber()) + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "TDigest does not support type {}", data_type->getName()); +} + +} diff --git a/src/Storages/Statistics/TDigestStatistic.h b/src/Storages/Statistics/TDigestStatistics.h similarity index 60% rename from src/Storages/Statistics/TDigestStatistic.h rename to src/Storages/Statistics/TDigestStatistics.h index 24b33393aeb..bcf4b15fd60 100644 --- a/src/Storages/Statistics/TDigestStatistic.h +++ b/src/Storages/Statistics/TDigestStatistics.h @@ -8,12 +8,10 @@ namespace DB /// TDigestStatistic is a kind of histogram. 
-class TDigestStatistic : public IStatistic +class TDigestStatistics : public IStatistics { - friend class ColumnStatistics; - QuantileTDigest data; public: - explicit TDigestStatistic(const StatisticDescription & stat_) : IStatistic(stat_) + explicit TDigestStatistics(const SingleStatisticsDescription & stat_) : IStatistics(stat_) { } @@ -26,6 +24,11 @@ public: void deserialize(ReadBuffer & buf) override; void update(const ColumnPtr & column) override; +private: + QuantileTDigest data; }; +StatisticsPtr TDigestCreator(const SingleStatisticsDescription & stat, DataTypePtr); +void TDigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type); + } diff --git a/src/Storages/Statistics/UniqStatistic.h b/src/Storages/Statistics/UniqStatistic.h deleted file mode 100644 index 00c1f51eefc..00000000000 --- a/src/Storages/Statistics/UniqStatistic.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -namespace DB -{ - -class UniqStatistic : public IStatistic -{ - std::unique_ptr arena; - AggregateFunctionPtr uniq_collector; - AggregateDataPtr data; - UInt64 result; -public: - explicit UniqStatistic(const StatisticDescription & stat_, DataTypePtr data_type) : IStatistic(stat_), result(0) - { - arena = std::make_unique(); - AggregateFunctionProperties property; - property.returns_default_when_only_null = true; - uniq_collector = AggregateFunctionFactory::instance().get("uniq", NullsAction::IGNORE_NULLS, {data_type}, Array(), property); - data = arena->alignedAlloc(uniq_collector->sizeOfData(), uniq_collector->alignOfData()); - uniq_collector->create(data); - } - - ~UniqStatistic() override - { - uniq_collector->destroy(data); - } - - UInt64 getCardinality() - { - if (!result) - { - auto column = DataTypeUInt64().createColumn(); - uniq_collector->insertResultInto(data, *column, nullptr); - result = column->getUInt(0); - } - return result; - } - - void serialize(WriteBuffer & buf) override - { - uniq_collector->serialize(data, buf); - } - - void deserialize(ReadBuffer & buf) override - { - uniq_collector->deserialize(data, buf); - } - - void update(const ColumnPtr & column) override - { - const IColumn * col_ptr = column.get(); - uniq_collector->addBatchSinglePlace(0, column->size(), data, &col_ptr, nullptr); - } -}; - -} diff --git a/src/Storages/Statistics/UniqStatistics.cpp b/src/Storages/Statistics/UniqStatistics.cpp new file mode 100644 index 00000000000..3d0645a9553 --- /dev/null +++ b/src/Storages/Statistics/UniqStatistics.cpp @@ -0,0 +1,63 @@ +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_STATISTICS; +} + +UniqStatistics::UniqStatistics(const SingleStatisticsDescription & stat_, const DataTypePtr & data_type) + : IStatistics(stat_) +{ + arena = std::make_unique(); + AggregateFunctionProperties property; + property.returns_default_when_only_null = true; + uniq_collector = AggregateFunctionFactory::instance().get("uniq", NullsAction::IGNORE_NULLS, {data_type}, Array(), property); + data = arena->alignedAlloc(uniq_collector->sizeOfData(), uniq_collector->alignOfData()); + uniq_collector->create(data); +} + +UniqStatistics::~UniqStatistics() +{ + uniq_collector->destroy(data); +} + +UInt64 UniqStatistics::getCardinality() +{ + auto column = DataTypeUInt64().createColumn(); + uniq_collector->insertResultInto(data, *column, nullptr); + return column->getUInt(0); +} + +void UniqStatistics::serialize(WriteBuffer & buf) +{ + uniq_collector->serialize(data, buf); +} + +void 
UniqStatistics::deserialize(ReadBuffer & buf) +{ + uniq_collector->deserialize(data, buf); +} + +void UniqStatistics::update(const ColumnPtr & column) +{ + const IColumn * col_ptr = column.get(); + uniq_collector->addBatchSinglePlace(0, column->size(), data, &col_ptr, nullptr); +} + +void UniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type) +{ + data_type = removeNullable(data_type); + if (!data_type->isValueRepresentedByNumber()) + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type Uniq does not support type {}", data_type->getName()); +} + +StatisticsPtr UniqCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type) +{ + return std::make_shared(stat, data_type); +} + +} diff --git a/src/Storages/Statistics/UniqStatistics.h b/src/Storages/Statistics/UniqStatistics.h new file mode 100644 index 00000000000..75a893c080c --- /dev/null +++ b/src/Storages/Statistics/UniqStatistics.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +class UniqStatistics : public IStatistics +{ + std::unique_ptr arena; + AggregateFunctionPtr uniq_collector; + AggregateDataPtr data; + +public: + UniqStatistics(const SingleStatisticsDescription & stat_, const DataTypePtr & data_type); + + ~UniqStatistics() override; + + UInt64 getCardinality(); + + void serialize(WriteBuffer & buf) override; + + void deserialize(ReadBuffer & buf) override; + + void update(const ColumnPtr & column) override; +}; + +StatisticsPtr UniqCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type); +void UniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type); + +} diff --git a/src/Storages/Statistics/tests/gtest_stats.cpp b/src/Storages/Statistics/tests/gtest_stats.cpp index 1d0faf65f7d..f94f310be56 100644 --- a/src/Storages/Statistics/tests/gtest_stats.cpp +++ b/src/Storages/Statistics/tests/gtest_stats.cpp @@ -1,6 +1,6 @@ #include -#include +#include TEST(Statistics, TDigestLessThan) { diff --git a/src/Storages/StatisticsDescription.cpp b/src/Storages/StatisticsDescription.cpp index 567c4090b97..29761fd1ded 100644 --- a/src/Storages/StatisticsDescription.cpp +++ b/src/Storages/StatisticsDescription.cpp @@ -1,3 +1,5 @@ +#include + #include #include #include @@ -5,10 +7,10 @@ #include #include #include +#include #include #include #include -#include #include #include @@ -19,122 +21,134 @@ namespace DB namespace ErrorCodes { extern const int INCORRECT_QUERY; - extern const int ILLEGAL_STATISTIC; + extern const int ILLEGAL_STATISTICS; extern const int LOGICAL_ERROR; }; -String queryToString(const IAST & query); - -StatisticType stringToType(String type) +static StatisticsType stringToStatisticType(String type) { if (type == "tdigest") - return TDigest; + return StatisticsType::TDigest; if (type == "uniq") - return Uniq; - throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type: {}. We only support statistic type `tdigest` right now.", type); + return StatisticsType::Uniq; + throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type: {}. Supported statistic types are `tdigest` and `uniq`.", type); } -String StatisticDescription::getTypeName() const +String SingleStatisticsDescription::getTypeName() const { - if (type == TDigest) - return "TDigest"; - if (type == Uniq) - return "Uniq"; - throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type: {}. 
We only support statistic type `tdigest` right now.", type); + switch (type) + { + case StatisticsType::TDigest: + return "TDigest"; + case StatisticsType::Uniq: + return "Uniq"; + default: + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown statistic type: {}. Supported statistic types are `tdigest` and `uniq`.", type); + } } -static ASTPtr getASTForStatisticTypes(const std::map & statistic_types) +SingleStatisticsDescription::SingleStatisticsDescription(StatisticsType type_, ASTPtr ast_) + : type(type_), ast(ast_) +{} + +bool SingleStatisticsDescription::operator==(const SingleStatisticsDescription & other) const { - auto function_node = std::make_shared(); - function_node->name = "STATISTIC"; - function_node->arguments = std::make_shared(); - for (const auto & [type, desc] : statistic_types) - { - if (desc.ast == nullptr) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown ast"); - function_node->arguments->children.push_back(desc.ast); - } - function_node->children.push_back(function_node->arguments); - return function_node; + return type == other.type; } -bool StatisticsDescription::contains(const String & stat_type) const +bool ColumnStatisticsDescription::operator==(const ColumnStatisticsDescription & other) const { - return stats.contains(stringToType(stat_type)); + if (types_to_desc.size() != other.types_to_desc.size()) + return false; + + for (const auto & s : types_to_desc) + { + StatisticsType stats_type = s.first; + if (!other.types_to_desc.contains(stats_type)) + return false; + if (!(s.second == other.types_to_desc.at(stats_type))) + return false; + } + + return true; } -void StatisticsDescription::merge(const StatisticsDescription & other, const ColumnDescription & column, bool if_not_exists) +bool ColumnStatisticsDescription::empty() const { - if (other.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "We are merging empty stats in column {}", column.name); + return types_to_desc.empty(); +} +bool ColumnStatisticsDescription::contains(const String & stat_type) const +{ + return types_to_desc.contains(stringToStatisticType(stat_type)); +} + +void ColumnStatisticsDescription::merge(const ColumnStatisticsDescription & other, const ColumnDescription & column, bool if_not_exists) +{ if (column_name.empty()) { column_name = column.name; data_type = column.type; } - for (const auto & iter: other.stats) + for (const auto & iter: other.types_to_desc) { - if (!if_not_exists && stats.contains(iter.first)) + if (!if_not_exists && types_to_desc.contains(iter.first)) { - throw Exception(ErrorCodes::ILLEGAL_STATISTIC, "Statistic type name {} has existed in column {}", iter.first, column_name); + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistic type name {} has existed in column {}", iter.first, column_name); } + else if (!types_to_desc.contains(iter.first)) + types_to_desc.emplace(iter.first, iter.second); } - - for (const auto & iter: other.stats) - if (!stats.contains(iter.first)) - stats[iter.first] = iter.second; } -void StatisticsDescription::modify(const StatisticsDescription & other) +void ColumnStatisticsDescription::assign(const ColumnStatisticsDescription & other) { if (other.column_name != column_name) - throw Exception(ErrorCodes::LOGICAL_ERROR, "unmactched statistic columns {} and {}", column_name, other.column_name); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot assign statistics from column {} to {}", column_name, other.column_name); - stats = other.stats; + types_to_desc = other.types_to_desc; } -void StatisticsDescription::clear() +void 
ColumnStatisticsDescription::clear()
{
-    stats.clear();
+    types_to_desc.clear();
}

-std::vector StatisticsDescription::getStatisticsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns)
+std::vector ColumnStatisticsDescription::getStatisticsDescriptionsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns)
 {
-    const auto * stat_definition = definition_ast->as();
-    if (!stat_definition)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot create statistic from non ASTStatisticDeclaration AST");
+    const auto * stat_definition_ast = definition_ast->as();
+    if (!stat_definition_ast)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot cast AST to ASTStatisticDeclaration");

-    std::vector result;
-    result.reserve(stat_definition->columns->children.size());
+    std::vector result;
+    result.reserve(stat_definition_ast->columns->children.size());

-    std::map statistic_types;
-    for (const auto & stat_ast : stat_definition->types->children)
+    StatisticsTypeDescMap statistic_types;
+    for (const auto & stat_ast : stat_definition_ast->types->children)
     {
-        StatisticDescription stat;
         String stat_type_name = stat_ast->as().name;
-        if (statistic_types.contains(stat.type))
-            throw Exception(ErrorCodes::INCORRECT_QUERY, "Duplicated statistic type name: {} ", stat_type_name);
-        stat.type = stringToType(Poco::toLower(stat_type_name));
-        stat.ast = stat_ast->clone();
-        statistic_types[stat.type] = stat;
+        auto stat_type = stringToStatisticType(Poco::toLower(stat_type_name));
+        if (statistic_types.contains(stat_type))
+            throw Exception(ErrorCodes::INCORRECT_QUERY, "Statistic type {} was specified more than once", stat_type_name);
+        SingleStatisticsDescription stat(stat_type, stat_ast->clone());
+
+        statistic_types.emplace(stat.type, stat);
     }

     for (const auto & column_ast : stat_definition_ast->columns->children)
     {
-        StatisticsDescription stats_desc;
+        ColumnStatisticsDescription types_to_desc_desc;
         String physical_column_name = column_ast->as().name();

         if (!columns.hasPhysical(physical_column_name))
             throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect column name {}", physical_column_name);

         const auto & column = columns.getPhysical(physical_column_name);
-        stats_desc.column_name = column.name;
-        stats_desc.stats = statistic_types;
-        result.push_back(stats_desc);
+        types_to_desc_desc.column_name = column.name;
+        types_to_desc_desc.types_to_desc = statistic_types;
+        result.push_back(types_to_desc_desc);
     }

     if (result.empty())
@@ -143,36 +157,44 @@ std::vector StatisticsDescription::getStatisticsFromAST(c
-StatisticsDescription StatisticsDescription::getStatisticFromColumnDeclaration(const ASTColumnDeclaration & column)
+ColumnStatisticsDescription ColumnStatisticsDescription::getStatisticFromColumnDeclaration(const ASTColumnDeclaration & column)
 {
     const auto & stat_type_list_ast = column.stat_type->as().arguments;
     if (stat_type_list_ast->children.empty())
         throw Exception(ErrorCodes::INCORRECT_QUERY, "We expect at least one statistic type for column {}", queryToString(column));
-    StatisticsDescription stats;
+    ColumnStatisticsDescription stats;
     stats.column_name = column.name;
     for (const auto & ast : stat_type_list_ast->children)
     {
         const auto & stat_type = ast->as().name;
-        StatisticDescription stat;
-        stat.type = stringToType(Poco::toLower(stat_type));
-        stat.ast = ast->clone();
+        SingleStatisticsDescription stat(stringToStatisticType(Poco::toLower(stat_type)), ast->clone());
stats.add(stat.type, stat); } return stats; } -void StatisticsDescription::add(StatisticType stat_type, const StatisticDescription & desc) +void ColumnStatisticsDescription::add(StatisticsType stat_type, const SingleStatisticsDescription & desc) { - if (stats.contains(stat_type)) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Statistic type {} duplicates", stat_type); - stats[stat_type] = desc; + if (types_to_desc.contains(stat_type)) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Column {} already contains statistic type {}", column_name, stat_type); + types_to_desc.emplace(stat_type, desc); } -ASTPtr StatisticsDescription::getAST() const +ASTPtr ColumnStatisticsDescription::getAST() const { - return getASTForStatisticTypes(stats); + auto function_node = std::make_shared(); + function_node->name = "STATISTICS"; + function_node->arguments = std::make_shared(); + for (const auto & [type, desc] : types_to_desc) + { + if (desc.ast == nullptr) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown ast"); + function_node->arguments->children.push_back(desc.ast); + } + function_node->children.push_back(function_node->arguments); + return function_node; } } diff --git a/src/Storages/StatisticsDescription.h b/src/Storages/StatisticsDescription.h index a39dd76226a..da362b9b47d 100644 --- a/src/Storages/StatisticsDescription.h +++ b/src/Storages/StatisticsDescription.h @@ -9,7 +9,7 @@ namespace DB { -enum StatisticType : UInt8 +enum class StatisticsType : UInt8 { TDigest = 0, Uniq = 1, @@ -17,66 +17,48 @@ enum StatisticType : UInt8 UnknownStatistics = 63, }; -class ColumnsDescription; - -struct StatisticDescription +struct SingleStatisticsDescription { - /// the type of statistic, right now it's only tdigest. - StatisticType type; + StatisticsType type; ASTPtr ast; String getTypeName() const; - StatisticDescription() = default; + SingleStatisticsDescription() = delete; + SingleStatisticsDescription(StatisticsType type_, ASTPtr ast_); - bool operator==(const StatisticDescription & other) const - { - return type == other.type; //&& column_name == other.column_name; - } + bool operator==(const SingleStatisticsDescription & other) const; }; struct ColumnDescription; +class ColumnsDescription; -struct StatisticsDescription +struct ColumnStatisticsDescription { - std::map stats; + bool operator==(const ColumnStatisticsDescription & other) const; - bool operator==(const StatisticsDescription & other) const - { - for (const auto & iter : stats) - { - if (!other.stats.contains(iter.first)) - return false; - if (!(iter.second == other.stats.at(iter.first))) - return false; - } - return stats.size() == other.stats.size(); - } - - bool empty() const - { - return stats.empty(); - } + bool empty() const; bool contains(const String & stat_type) const; - void merge(const StatisticsDescription & other, const ColumnDescription & column, bool if_not_exists); + void merge(const ColumnStatisticsDescription & other, const ColumnDescription & column, bool if_not_exists); - void modify(const StatisticsDescription & other); + void assign(const ColumnStatisticsDescription & other); void clear(); - void add(StatisticType stat_type, const StatisticDescription & desc); + void add(StatisticsType stat_type, const SingleStatisticsDescription & desc); ASTPtr getAST() const; + static std::vector getStatisticsDescriptionsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns); + static ColumnStatisticsDescription getStatisticFromColumnDeclaration(const ASTColumnDeclaration & column); + + using 
StatisticsTypeDescMap = std::map; + StatisticsTypeDescMap types_to_desc; String column_name; DataTypePtr data_type; - - static std::vector getStatisticsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns); - static StatisticsDescription getStatisticFromColumnDeclaration(const ASTColumnDeclaration & column); - }; } diff --git a/tests/integration/test_manipulate_statistic/__init__.py b/tests/integration/test_manipulate_statistics/__init__.py similarity index 100% rename from tests/integration/test_manipulate_statistic/__init__.py rename to tests/integration/test_manipulate_statistics/__init__.py diff --git a/tests/integration/test_manipulate_statistic/config/config.xml b/tests/integration/test_manipulate_statistics/config/config.xml similarity index 100% rename from tests/integration/test_manipulate_statistic/config/config.xml rename to tests/integration/test_manipulate_statistics/config/config.xml diff --git a/tests/integration/test_manipulate_statistic/test.py b/tests/integration/test_manipulate_statistics/test.py similarity index 86% rename from tests/integration/test_manipulate_statistic/test.py rename to tests/integration/test_manipulate_statistics/test.py index 8454e6f1796..e6291024e76 100644 --- a/tests/integration/test_manipulate_statistic/test.py +++ b/tests/integration/test_manipulate_statistics/test.py @@ -56,26 +56,26 @@ def run_test_single_node(started_cluster): check_stat_file_on_disk(node1, "test_stat", "all_1_1_0", "b", True) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0", "c", True) - node1.query("ALTER TABLE test_stat DROP STATISTIC a") + node1.query("ALTER TABLE test_stat DROP STATISTICS a") check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_2", "a", False) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_2", "b", True) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_2", "c", True) - node1.query("ALTER TABLE test_stat CLEAR STATISTIC b, c") + node1.query("ALTER TABLE test_stat CLEAR STATISTICS b, c") check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_3", "a", False) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_3", "b", False) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_3", "c", False) - node1.query("ALTER TABLE test_stat MATERIALIZE STATISTIC b, c") + node1.query("ALTER TABLE test_stat MATERIALIZE STATISTICS b, c") check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_4", "a", False) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_4", "b", True) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_4", "c", True) - node1.query("ALTER TABLE test_stat ADD STATISTIC a type tdigest") - node1.query("ALTER TABLE test_stat MATERIALIZE STATISTIC a") + node1.query("ALTER TABLE test_stat ADD STATISTICS a type tdigest") + node1.query("ALTER TABLE test_stat MATERIALIZE STATISTICS a") check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_5", "a", True) check_stat_file_on_disk(node1, "test_stat", "all_1_1_0_5", "b", True) @@ -104,7 +104,7 @@ def test_single_node_wide(started_cluster): node1.query( """ - CREATE TABLE test_stat(a Int64 STATISTIC(tdigest), b Int64 STATISTIC(tdigest), c Int64 STATISTIC(tdigest)) + CREATE TABLE test_stat(a Int64 STATISTICS(tdigest), b Int64 STATISTICS(tdigest), c Int64 STATISTICS(tdigest)) ENGINE = MergeTree() ORDER BY a SETTINGS min_bytes_for_wide_part = 0; """ @@ -117,7 +117,7 @@ def test_single_node_normal(started_cluster): node1.query( """ - CREATE TABLE test_stat(a Int64 STATISTIC(tdigest), b Int64 STATISTIC(tdigest), c Int64 STATISTIC(tdigest)) + CREATE TABLE test_stat(a 
Int64 STATISTICS(tdigest), b Int64 STATISTICS(tdigest), c Int64 STATISTICS(tdigest)) ENGINE = MergeTree() ORDER BY a; """ ) diff --git a/tests/queries/0_stateless/02864_statistic_exception.sql b/tests/queries/0_stateless/02864_statistic_exception.sql index 28aaf7d5caa..4597ed11d4d 100644 --- a/tests/queries/0_stateless/02864_statistic_exception.sql +++ b/tests/queries/0_stateless/02864_statistic_exception.sql @@ -2,8 +2,8 @@ DROP TABLE IF EXISTS t1; CREATE TABLE t1 ( - a Float64 STATISTIC(tdigest), - b Int64 STATISTIC(tdigest), + a Float64 STATISTICS(tdigest), + b Int64 STATISTICS(tdigest), pk String, ) Engine = MergeTree() ORDER BY pk; -- { serverError INCORRECT_QUERY } @@ -11,20 +11,20 @@ SET allow_experimental_statistic = 1; CREATE TABLE t1 ( - a Float64 STATISTIC(tdigest), + a Float64 STATISTICS(tdigest), b Int64, - pk String STATISTIC(tdigest), -) Engine = MergeTree() ORDER BY pk; -- { serverError ILLEGAL_STATISTIC } + pk String STATISTICS(tdigest), +) Engine = MergeTree() ORDER BY pk; -- { serverError ILLEGAL_STATISTICS } CREATE TABLE t1 ( - a Float64 STATISTIC(tdigest, tdigest(10)), + a Float64 STATISTICS(tdigest, tdigest(10)), b Int64, ) Engine = MergeTree() ORDER BY pk; -- { serverError INCORRECT_QUERY } CREATE TABLE t1 ( - a Float64 STATISTIC(xyz), + a Float64 STATISTICS(xyz), b Int64, ) Engine = MergeTree() ORDER BY pk; -- { serverError INCORRECT_QUERY } @@ -35,18 +35,18 @@ CREATE TABLE t1 pk String, ) Engine = MergeTree() ORDER BY pk; -ALTER TABLE t1 ADD STATISTIC a TYPE xyz; -- { serverError INCORRECT_QUERY } -ALTER TABLE t1 ADD STATISTIC a TYPE tdigest; -ALTER TABLE t1 ADD STATISTIC a TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } -ALTER TABLE t1 ADD STATISTIC pk TYPE tdigest; -- { serverError ILLEGAL_STATISTIC } -ALTER TABLE t1 DROP STATISTIC b; -ALTER TABLE t1 DROP STATISTIC a; -ALTER TABLE t1 DROP STATISTIC a; -ALTER TABLE t1 CLEAR STATISTIC a; -ALTER TABLE t1 MATERIALIZE STATISTIC b; -- { serverError ILLEGAL_STATISTIC } +ALTER TABLE t1 ADD STATISTICS a TYPE xyz; -- { serverError INCORRECT_QUERY } +ALTER TABLE t1 ADD STATISTICS a TYPE tdigest; +ALTER TABLE t1 ADD STATISTICS a TYPE tdigest; -- { serverError ILLEGAL_STATISTICS } +ALTER TABLE t1 ADD STATISTICS pk TYPE tdigest; -- { serverError ILLEGAL_STATISTICS } +ALTER TABLE t1 DROP STATISTICS b; +ALTER TABLE t1 DROP STATISTICS a; +ALTER TABLE t1 DROP STATISTICS a; +ALTER TABLE t1 CLEAR STATISTICS a; +ALTER TABLE t1 MATERIALIZE STATISTICS b; -- { serverError ILLEGAL_STATISTICS } -ALTER TABLE t1 ADD STATISTIC a TYPE tdigest; -ALTER TABLE t1 ADD STATISTIC b TYPE tdigest; +ALTER TABLE t1 ADD STATISTICS a TYPE tdigest; +ALTER TABLE t1 ADD STATISTICS b TYPE tdigest; ALTER TABLE t1 MODIFY COLUMN a Float64 TTL toDateTime(b) + INTERVAL 1 MONTH; ALTER TABLE t1 MODIFY COLUMN a Int64; -- { serverError ALTER_OF_COLUMN_IS_FORBIDDEN } diff --git a/tests/queries/0_stateless/02864_statistic_operate.reference b/tests/queries/0_stateless/02864_statistic_operate.reference index 3e291485031..6398a9bd000 100644 --- a/tests/queries/0_stateless/02864_statistic_operate.reference +++ b/tests/queries/0_stateless/02864_statistic_operate.reference @@ -1,4 +1,4 @@ -CREATE TABLE default.t1\n(\n `a` Float64 STATISTIC(tdigest),\n `b` Int64 STATISTIC(tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 +CREATE TABLE default.t1\n(\n `a` Float64 STATISTICS(tdigest),\n `b` Int64 STATISTICS(tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 
0, index_granularity = 8192 After insert Prewhere info Prewhere filter @@ -12,7 +12,7 @@ After drop statistic 10 CREATE TABLE default.t1\n(\n `a` Float64,\n `b` Int64,\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 After add statistic -CREATE TABLE default.t1\n(\n `a` Float64 STATISTIC(tdigest),\n `b` Int64 STATISTIC(tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 +CREATE TABLE default.t1\n(\n `a` Float64 STATISTICS(tdigest),\n `b` Int64 STATISTICS(tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 After materialize statistic Prewhere info Prewhere filter @@ -23,7 +23,7 @@ After merge Prewhere filter Prewhere filter column: and(less(a, 10), less(b, 10)) (removed) 20 -CREATE TABLE default.t1\n(\n `a` Float64 STATISTIC(tdigest),\n `c` Int64 STATISTIC(tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 +CREATE TABLE default.t1\n(\n `a` Float64 STATISTICS(tdigest),\n `c` Int64 STATISTICS(tdigest),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 After rename Prewhere info Prewhere filter diff --git a/tests/queries/0_stateless/02864_statistic_operate.sql b/tests/queries/0_stateless/02864_statistic_operate.sql index 665bdc17f1f..914e58d7d3a 100644 --- a/tests/queries/0_stateless/02864_statistic_operate.sql +++ b/tests/queries/0_stateless/02864_statistic_operate.sql @@ -5,8 +5,8 @@ SET allow_statistic_optimize = 1; CREATE TABLE t1 ( - a Float64 STATISTIC(tdigest), - b Int64 STATISTIC(tdigest), + a Float64 STATISTICS(tdigest), + b Int64 STATISTICS(tdigest), pk String, ) Engine = MergeTree() ORDER BY pk SETTINGS min_bytes_for_wide_part = 0; @@ -20,7 +20,7 @@ SELECT replaceRegexpAll(explain, '__table1\.|_UInt8', '') FROM (EXPLAIN actions= SELECT count(*) FROM t1 WHERE b < 10 and a < 10; SELECT count(*) FROM t1 WHERE b < NULL and a < '10'; -ALTER TABLE t1 DROP STATISTIC a, b; +ALTER TABLE t1 DROP STATISTICS a, b; SELECT 'After drop statistic'; SELECT replaceRegexpAll(explain, '__table1\.|_UInt8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; @@ -28,13 +28,13 @@ SELECT count(*) FROM t1 WHERE b < 10 and a < 10; SHOW CREATE TABLE t1; -ALTER TABLE t1 ADD STATISTIC a, b TYPE tdigest; +ALTER TABLE t1 ADD STATISTICS a, b TYPE tdigest; SELECT 'After add statistic'; SHOW CREATE TABLE t1; -ALTER TABLE t1 MATERIALIZE STATISTIC a, b; +ALTER TABLE t1 MATERIALIZE STATISTICS a, b; INSERT INTO t1 select number, -number, generateUUIDv4() FROM system.numbers LIMIT 10000; SELECT 'After materialize statistic'; diff --git a/tests/queries/0_stateless/02864_statistic_uniq.reference b/tests/queries/0_stateless/02864_statistic_uniq.reference index 8a828352dd2..77786dbdd8c 100644 --- a/tests/queries/0_stateless/02864_statistic_uniq.reference +++ b/tests/queries/0_stateless/02864_statistic_uniq.reference @@ -1,4 +1,4 @@ -CREATE TABLE default.t1\n(\n `a` Float64 STATISTIC(tdigest),\n `b` Int64 STATISTIC(tdigest),\n `c` Int64 STATISTIC(tdigest, uniq),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 +CREATE TABLE default.t1\n(\n `a` Float64 STATISTICS(tdigest),\n `b` Int64 STATISTICS(tdigest),\n `c` Int64 
STATISTICS(tdigest, uniq),\n `pk` String\n)\nENGINE = MergeTree\nORDER BY pk\nSETTINGS min_bytes_for_wide_part = 0, index_granularity = 8192 After insert Prewhere info Prewhere filter diff --git a/tests/queries/0_stateless/02864_statistic_uniq.sql b/tests/queries/0_stateless/02864_statistic_uniq.sql index cbb24269fac..79bd9a50732 100644 --- a/tests/queries/0_stateless/02864_statistic_uniq.sql +++ b/tests/queries/0_stateless/02864_statistic_uniq.sql @@ -5,9 +5,9 @@ SET allow_statistic_optimize = 1; CREATE TABLE t1 ( - a Float64 STATISTIC(tdigest), - b Int64 STATISTIC(tdigest), - c Int64 STATISTIC(tdigest, uniq), + a Float64 STATISTICS(tdigest), + b Int64 STATISTICS(tdigest), + c Int64 STATISTICS(tdigest, uniq), pk String, ) Engine = MergeTree() ORDER BY pk SETTINGS min_bytes_for_wide_part = 0; @@ -27,15 +27,15 @@ SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN act SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; SELECT 'After modify TDigest'; -ALTER TABLE t1 MODIFY STATISTIC c TYPE TDigest; -ALTER TABLE t1 MATERIALIZE STATISTIC c; +ALTER TABLE t1 MODIFY STATISTICS c TYPE TDigest; +ALTER TABLE t1 MATERIALIZE STATISTICS c; SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 0 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; -ALTER TABLE t1 DROP STATISTIC c; +ALTER TABLE t1 DROP STATISTICS c; SELECT 'After drop'; SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c = 11 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; diff --git a/tests/sqllogic/test_parser.py b/tests/sqllogic/test_parser.py index 648fa9f6bf6..1c963450ba4 100755 --- a/tests/sqllogic/test_parser.py +++ b/tests/sqllogic/test_parser.py @@ -525,7 +525,7 @@ class QueryResult: for row in rows: res_row = [] for c, t in zip(row, types): - logger.debug("Builging row. c:%s t:%s", c, t) + logger.debug("Building row. c:%s t:%s", c, t) if c is None: res_row.append("NULL") continue From b150d83cbb87aaf77bd9dff13063a6728c38c58b Mon Sep 17 00:00:00 2001 From: Han Fei Date: Wed, 24 Apr 2024 19:24:31 +0200 Subject: [PATCH 016/133] fix style --- src/Storages/MergeTree/MergeTreeData.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index c55b7555050..683998ffc38 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -498,7 +498,7 @@ ConditionSelectivityEstimator MergeTreeData::getConditionEstimatorByPredicate(co for (const auto & stat : stats) result.merge(part->info.getPartNameV1(), part->rows_count, stat); } - catch(...) + catch (...) 
{ tryLogCurrentException(log, fmt::format("while loading statistics on part {}", part->info.getPartNameV1())); } @@ -515,7 +515,7 @@ ConditionSelectivityEstimator MergeTreeData::getConditionEstimatorByPredicate(co result.merge(part->info.getPartNameV1(), part->rows_count, stat); } } - catch(...) + catch (...) { tryLogCurrentException(log, fmt::format("while loading statistics on part {}", part->info.getPartNameV1())); } From bb29c3b7b4a1631653e8068221887313e348429b Mon Sep 17 00:00:00 2001 From: Han Fei Date: Tue, 14 May 2024 18:16:01 +0200 Subject: [PATCH 017/133] address part of comments --- src/Core/Settings.h | 4 +- src/Interpreters/InterpreterAlterQuery.cpp | 4 +- src/Interpreters/InterpreterCreateQuery.cpp | 4 +- src/Interpreters/InterpreterExplainQuery.cpp | 4 +- src/Parsers/ExpressionElementParsers.h | 4 +- src/Storages/AlterCommands.cpp | 2 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 6 +- .../MergeTree/MergeTreeWhereOptimizer.cpp | 6 +- .../MergeTree/MergeTreeWhereOptimizer.h | 2 +- src/Storages/Statistics/Statistics.cpp | 12 +-- src/Storages/Statistics/Statistics.h | 10 +-- src/Storages/Statistics/TDigestStatistics.cpp | 7 +- src/Storages/Statistics/TDigestStatistics.h | 4 +- src/Storages/Statistics/UniqStatistics.cpp | 23 ++--- src/Storages/Statistics/UniqStatistics.h | 12 +-- src/Storages/StatisticsDescription.cpp | 84 +++++++++---------- src/Storages/StatisticsDescription.h | 5 +- 18 files changed, 97 insertions(+), 98 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 375bdb1c516..e270f6642a2 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -151,8 +151,8 @@ class IColumn; M(Bool, enable_multiple_prewhere_read_steps, true, "Move more conditions from WHERE to PREWHERE and do reads from disk and filtering in multiple steps if there are multiple conditions combined with AND", 0) \ M(Bool, move_primary_key_columns_to_end_of_prewhere, true, "Move PREWHERE conditions containing primary key columns to the end of AND chain. It is likely that these conditions are taken into account during primary key analysis and thus will not contribute a lot to PREWHERE filtering.", 0) \ \ - M(Bool, allow_statistic_optimize, false, "Allows using statistic to optimize queries", 0) \ - M(Bool, allow_experimental_statistic, false, "Allows using statistic", 0) \ + M(Bool, allow_statistics_optimize, false, "Allows using statistics to optimize queries", 0) \ + M(Bool, allow_experimental_statistics, false, "Allows using statistics", 0) \ \ M(UInt64, alter_sync, 1, "Wait for actions to manipulate the partitions. 0 - do not wait, 1 - wait for execution only of itself, 2 - wait for everyone.", 0) ALIAS(replication_alter_partitions_sync) \ M(Int64, replication_wait_for_inactive_replica_timeout, 120, "Wait for inactive replica to execute ALTER/OPTIMIZE. 
Time in seconds, 0 - do not wait, negative - wait for unlimited time.", 0) \ diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 1e0706f728d..d2017bc3766 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -175,11 +175,11 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) else throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong parameter type in ALTER query"); - if (!getContext()->getSettings().allow_experimental_statistic && ( + if (!getContext()->getSettings().allow_experimental_statistics && ( command_ast->type == ASTAlterCommand::ADD_STATISTICS || command_ast->type == ASTAlterCommand::DROP_STATISTICS || command_ast->type == ASTAlterCommand::MATERIALIZE_STATISTICS)) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Alter table with statistic is now disabled. Turn on allow_experimental_statistic"); + throw Exception(ErrorCodes::INCORRECT_QUERY, "Alter table with statistics is now disabled. Turn on allow_experimental_statistics"); } if (typeid_cast(database.get())) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 28441843ab1..475490ec35f 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -686,8 +686,8 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( column.stats.column_name = column.name; /// We assign column name here for better exception error message. if (col_decl.stat_type) { - if (!skip_checks && !context_->getSettingsRef().allow_experimental_statistic) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Create table with statistic is now disabled. Turn on allow_experimental_statistic"); + if (!skip_checks && !context_->getSettingsRef().allow_experimental_statistics) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Create table with statistics is now disabled. Turn on allow_experimental_statistics"); column.stats = ColumnStatisticsDescription::getStatisticFromColumnDeclaration(col_decl); column.stats.data_type = column.type; } diff --git a/src/Interpreters/InterpreterExplainQuery.cpp b/src/Interpreters/InterpreterExplainQuery.cpp index 458be843b59..3a06e1b2301 100644 --- a/src/Interpreters/InterpreterExplainQuery.cpp +++ b/src/Interpreters/InterpreterExplainQuery.cpp @@ -67,8 +67,8 @@ namespace static void visit(ASTSelectQuery & select, ASTPtr & node, Data & data) { - /// we need to read statistic when `allow_statistic_optimize` is enabled. - bool only_analyze = !data.getContext()->getSettings().allow_statistic_optimize; + /// we need to read statistic when `allow_statistics_optimize` is enabled. 
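(Usage sketch, assembled from the stateless tests later in this series rather than from this hunk: after the rename, a session opts in with the new setting names.)

    SET allow_experimental_statistics = 1;  -- formerly allow_experimental_statistic
    SET allow_statistics_optimize = 1;      -- formerly allow_statistic_optimize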
+ bool only_analyze = !data.getContext()->getSettings().allow_statistics_optimize; InterpreterSelectQuery interpreter( node, data.getContext(), SelectQueryOptions(QueryProcessingStage::FetchColumns).analyze(only_analyze).modify()); diff --git a/src/Parsers/ExpressionElementParsers.h b/src/Parsers/ExpressionElementParsers.h index becbd724a25..a28f40a00e3 100644 --- a/src/Parsers/ExpressionElementParsers.h +++ b/src/Parsers/ExpressionElementParsers.h @@ -201,11 +201,11 @@ protected: bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; -/// STATISTIC(tdigest(200)) +/// STATISTICS(tdigest(200)) class ParserStatisticsType : public IParserBase { protected: - const char * getName() const override { return "statistic"; } + const char * getName() const override { return "statistics"; } bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override; }; diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index e768a3f362a..bf00fae933b 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -705,7 +705,7 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) for (const auto & stats : stats_vec) { metadata.columns.modify(stats.column_name, - [&](ColumnDescription & column) { column.stats.merge(stats, column, if_not_exists); }); + [&](ColumnDescription & column) { column.stats.merge(stats, column.name, column.type, if_not_exists); }); } } else if (type == DROP_STATISTICS) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index ce2b8f9efd7..ae9d32fb5a2 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -666,7 +666,7 @@ ColumnsStatistics IMergeTreeDataPart::loadStatistics() const ColumnsStatistics result; for (auto & stat : total_statistics) { - String file_name = stat->getFileName() + STAT_FILE_SUFFIX; + String file_name = stat->getFileName() + STATS_FILE_SUFFIX; String file_path = fs::path(getDataPartStorage().getRelativePath()) / file_name; if (!metadata_manager->exists(file_name)) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 91f16d69a3d..12b361392e0 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -471,7 +471,7 @@ StoragePolicyPtr MergeTreeData::getStoragePolicy() const ConditionSelectivityEstimator MergeTreeData::getConditionEstimatorByPredicate(const SelectQueryInfo & query_info, const StorageSnapshotPtr & storage_snapshot, ContextPtr local_context) const { - if (!local_context->getSettings().allow_statistic_optimize) + if (!local_context->getSettings().allow_statistics_optimize) return {}; const auto & parts = assert_cast(*storage_snapshot->data).parts; @@ -3242,8 +3242,8 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context const auto & new_column = new_metadata.getColumns().get(command.column_name); if (!old_column.type->equals(*new_column.type)) throw Exception(ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN, - "ALTER types of column {} with statistic is not not safe " - "because it can change the representation of statistic", + "ALTER types of column {} with statistics is not safe " + "because it can change the representation of statistics", backQuoteIfNeed(command.column_name)); } } diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 3309a5fcb92..ab2ed7725d8 
100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -92,7 +92,7 @@ void MergeTreeWhereOptimizer::optimize(SelectQueryInfo & select_query_info, cons where_optimizer_context.move_all_conditions_to_prewhere = context->getSettingsRef().move_all_conditions_to_prewhere; where_optimizer_context.move_primary_key_columns_to_end_of_prewhere = context->getSettingsRef().move_primary_key_columns_to_end_of_prewhere; where_optimizer_context.is_final = select.final(); - where_optimizer_context.use_statistic = context->getSettingsRef().allow_statistic_optimize; + where_optimizer_context.use_statistics = context->getSettingsRef().allow_statistics_optimize; RPNBuilderTreeContext tree_context(context, std::move(block_with_constants), {} /*prepared_sets*/); RPNBuilderTreeNode node(select.where().get(), tree_context); @@ -123,7 +123,7 @@ MergeTreeWhereOptimizer::FilterActionsOptimizeResult MergeTreeWhereOptimizer::op where_optimizer_context.move_all_conditions_to_prewhere = context->getSettingsRef().move_all_conditions_to_prewhere; where_optimizer_context.move_primary_key_columns_to_end_of_prewhere = context->getSettingsRef().move_primary_key_columns_to_end_of_prewhere; where_optimizer_context.is_final = is_final; - where_optimizer_context.use_statistic = context->getSettingsRef().allow_statistic_optimize; + where_optimizer_context.use_statistics = context->getSettingsRef().allow_statistics_optimize; RPNBuilderTreeContext tree_context(context); RPNBuilderTreeNode node(&filter_dag->findInOutputs(filter_column_name), tree_context); @@ -276,7 +276,7 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const RPNBuilderTree if (cond.viable) cond.good = isConditionGood(node, table_columns); - if (where_optimizer_context.use_statistic) + if (where_optimizer_context.use_statistics) { cond.good = cond.viable; diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 813f4a78ea4..92a692ab148 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -104,7 +104,7 @@ private: bool move_all_conditions_to_prewhere = false; bool move_primary_key_columns_to_end_of_prewhere = false; bool is_final = false; - bool use_statistic = false; + bool use_statistics = false; }; struct OptimizeResult diff --git a/src/Storages/Statistics/Statistics.cpp b/src/Storages/Statistics/Statistics.cpp index 933de06fa97..0f63a286f75 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -127,16 +127,16 @@ UInt64 ColumnStatistics::count() const return rows; } -void MergeTreeStatisticsFactory::registerCreator(StatisticsType stat_type, Creator creator) +void MergeTreeStatisticsFactory::registerCreator(StatisticsType stats_type, Creator creator) { - if (!creators.emplace(stat_type, std::move(creator)).second) - throw Exception(ErrorCodes::LOGICAL_ERROR, "MergeTreeStatisticsFactory: the statistic creator type {} is not unique", stat_type); + if (!creators.emplace(stats_type, std::move(creator)).second) + throw Exception(ErrorCodes::LOGICAL_ERROR, "MergeTreeStatisticsFactory: the statistics creator type {} is not unique", stats_type); } -void MergeTreeStatisticsFactory::registerValidator(StatisticsType stat_type, Validator validator) +void MergeTreeStatisticsFactory::registerValidator(StatisticsType stats_type, Validator validator) { - if (!validators.emplace(stat_type, std::move(validator)).second) - throw 
Exception(ErrorCodes::LOGICAL_ERROR, "MergeTreeStatisticsFactory: the statistic validator type {} is not unique", stat_type); + if (!validators.emplace(stats_type, std::move(validator)).second) + throw Exception(ErrorCodes::LOGICAL_ERROR, "MergeTreeStatisticsFactory: the statistics validator type {} is not unique", stats_type); } diff --git a/src/Storages/Statistics/Statistics.h b/src/Storages/Statistics/Statistics.h index 1c111ba3a93..1415f0a5d2f 100644 --- a/src/Storages/Statistics/Statistics.h +++ b/src/Storages/Statistics/Statistics.h @@ -14,8 +14,8 @@ /// this is for user-defined statistic. -constexpr auto STAT_FILE_PREFIX = "statistic_"; -constexpr auto STAT_FILE_SUFFIX = ".stat"; +constexpr auto STATS_FILE_PREFIX = "statistics_"; +constexpr auto STATS_FILE_SUFFIX = ".stats"; namespace DB { @@ -88,11 +88,11 @@ public: void validate(const ColumnStatisticsDescription & stats, DataTypePtr data_type) const; - using Creator = std::function; + using Creator = std::function; - using Validator = std::function; + using Validator = std::function; - ColumnStatisticsPtr get(const ColumnStatisticsDescription & stat) const; + ColumnStatisticsPtr get(const ColumnStatisticsDescription & stats) const; ColumnsStatistics getMany(const ColumnsDescription & columns) const; diff --git a/src/Storages/Statistics/TDigestStatistics.cpp b/src/Storages/Statistics/TDigestStatistics.cpp index 0cb0282f015..aa5662c979d 100644 --- a/src/Storages/Statistics/TDigestStatistics.cpp +++ b/src/Storages/Statistics/TDigestStatistics.cpp @@ -8,6 +8,11 @@ namespace ErrorCodes extern const int ILLEGAL_STATISTICS; } +TDigestStatistics::TDigestStatistics(const SingleStatisticsDescription & stat_): + IStatistics(stat_) +{ +} + Float64 TDigestStatistics::estimateLess(Float64 val) const { return data.getCountLessThan(val); @@ -49,7 +54,7 @@ void TDigestValidator(const SingleStatisticsDescription &, DataTypePtr data_type { data_type = removeNullable(data_type); if (!data_type->isValueRepresentedByNumber()) - throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "TDigest does not support type {}", data_type->getName()); + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'tdigest' does not support type {}", data_type->getName()); } } diff --git a/src/Storages/Statistics/TDigestStatistics.h b/src/Storages/Statistics/TDigestStatistics.h index bcf4b15fd60..7c361b8751f 100644 --- a/src/Storages/Statistics/TDigestStatistics.h +++ b/src/Storages/Statistics/TDigestStatistics.h @@ -11,9 +11,7 @@ namespace DB class TDigestStatistics : public IStatistics { public: - explicit TDigestStatistics(const SingleStatisticsDescription & stat_) : IStatistics(stat_) - { - } + explicit TDigestStatistics(const SingleStatisticsDescription & stat_); Float64 estimateLess(Float64 val) const; diff --git a/src/Storages/Statistics/UniqStatistics.cpp b/src/Storages/Statistics/UniqStatistics.cpp index 3d0645a9553..7f99a91cf86 100644 --- a/src/Storages/Statistics/UniqStatistics.cpp +++ b/src/Storages/Statistics/UniqStatistics.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace DB @@ -13,46 +14,46 @@ UniqStatistics::UniqStatistics(const SingleStatisticsDescription & stat_, const : IStatistics(stat_) { arena = std::make_unique(); - AggregateFunctionProperties property; - property.returns_default_when_only_null = true; - uniq_collector = AggregateFunctionFactory::instance().get("uniq", NullsAction::IGNORE_NULLS, {data_type}, Array(), property); - data = arena->alignedAlloc(uniq_collector->sizeOfData(), uniq_collector->alignOfData()); - 
uniq_collector->create(data); + AggregateFunctionProperties properties; + properties.returns_default_when_only_null = true; + collector = AggregateFunctionFactory::instance().get("uniq", NullsAction::IGNORE_NULLS, {data_type}, Array(), properties); + data = arena->alignedAlloc(collector->sizeOfData(), collector->alignOfData()); + collector->create(data); } UniqStatistics::~UniqStatistics() { - uniq_collector->destroy(data); + collector->destroy(data); } UInt64 UniqStatistics::getCardinality() { auto column = DataTypeUInt64().createColumn(); - uniq_collector->insertResultInto(data, *column, nullptr); + collector->insertResultInto(data, *column, nullptr); return column->getUInt(0); } void UniqStatistics::serialize(WriteBuffer & buf) { - uniq_collector->serialize(data, buf); + collector->serialize(data, buf); } void UniqStatistics::deserialize(ReadBuffer & buf) { - uniq_collector->deserialize(data, buf); + collector->deserialize(data, buf); } void UniqStatistics::update(const ColumnPtr & column) { const IColumn * col_ptr = column.get(); - uniq_collector->addBatchSinglePlace(0, column->size(), data, &col_ptr, nullptr); + collector->addBatchSinglePlace(0, column->size(), data, &col_ptr, nullptr); } void UniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type) { data_type = removeNullable(data_type); if (!data_type->isValueRepresentedByNumber()) - throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type Uniq does not support type {}", data_type->getName()); + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics of type 'uniq' does not support type {}", data_type->getName()); } StatisticsPtr UniqCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type) diff --git a/src/Storages/Statistics/UniqStatistics.h b/src/Storages/Statistics/UniqStatistics.h index 75a893c080c..0d86a6e458a 100644 --- a/src/Storages/Statistics/UniqStatistics.h +++ b/src/Storages/Statistics/UniqStatistics.h @@ -2,7 +2,6 @@ #include #include -#include #include namespace DB @@ -10,10 +9,6 @@ namespace DB class UniqStatistics : public IStatistics { - std::unique_ptr arena; - AggregateFunctionPtr uniq_collector; - AggregateDataPtr data; - public: UniqStatistics(const SingleStatisticsDescription & stat_, const DataTypePtr & data_type); @@ -26,6 +21,13 @@ public: void deserialize(ReadBuffer & buf) override; void update(const ColumnPtr & column) override; + +private: + + std::unique_ptr arena; + AggregateFunctionPtr collector; + AggregateDataPtr data; + }; StatisticsPtr UniqCreator(const SingleStatisticsDescription & stat, DataTypePtr data_type); diff --git a/src/Storages/StatisticsDescription.cpp b/src/Storages/StatisticsDescription.cpp index 29761fd1ded..3de7b8159b7 100644 --- a/src/Storages/StatisticsDescription.cpp +++ b/src/Storages/StatisticsDescription.cpp @@ -25,13 +25,13 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; }; -static StatisticsType stringToStatisticType(String type) +static StatisticsType stringToStatisticsType(String type) { if (type == "tdigest") return StatisticsType::TDigest; if (type == "uniq") return StatisticsType::Uniq; - throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistic type: {}. Supported statistic types are `tdigest` and `uniq`.", type); + throw Exception(ErrorCodes::INCORRECT_QUERY, "Unknown statistics type: {}. 
Supported statistics types are `tdigest` and `uniq`.", type); } String SingleStatisticsDescription::getTypeName() const @@ -43,7 +43,7 @@ String SingleStatisticsDescription::getTypeName() const case StatisticsType::Uniq: return "Uniq"; default: - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown statistic type: {}. Supported statistic types are `tdigest` and `uniq`.", type); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown statistics type: {}. Supported statistics types are `tdigest` and `uniq`.", type); } } @@ -61,12 +61,12 @@ bool ColumnStatisticsDescription::operator==(const ColumnStatisticsDescription & if (types_to_desc.size() != other.types_to_desc.size()) return false; - for (const auto & s : types_to_desc) + for (const auto & [type, desc] : types_to_desc) { - StatisticsType stats_type = s.first; + StatisticsType stats_type = type; if (!other.types_to_desc.contains(stats_type)) return false; - if (!(s.second == other.types_to_desc.at(stats_type))) + if (!(desc == other.types_to_desc.at(stats_type))) return false; } @@ -80,25 +80,27 @@ bool ColumnStatisticsDescription::empty() const bool ColumnStatisticsDescription::contains(const String & stat_type) const { - return types_to_desc.contains(stringToStatisticType(stat_type)); + return types_to_desc.contains(stringToStatisticsType(stat_type)); } -void ColumnStatisticsDescription::merge(const ColumnStatisticsDescription & other, const ColumnDescription & column, bool if_not_exists) +void ColumnStatisticsDescription::merge(const ColumnStatisticsDescription & other, const String & merging_column_name, DataTypePtr merging_column_type, bool if_not_exists) { + chassert(merging_column_type); + if (column_name.empty()) { - column_name = column.name; - data_type = column.type; + column_name = merging_column_name; + data_type = merging_column_type; } - for (const auto & iter: other.types_to_desc) + for (const auto & [stats_type, stats_desc]: other.types_to_desc) { - if (!if_not_exists && types_to_desc.contains(iter.first)) + if (!if_not_exists && types_to_desc.contains(stats_type)) { - throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistic type name {} has existed in column {}", iter.first, column_name); + throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Statistics type {} already exists in column {}", stats_type, column_name); } - else if (!types_to_desc.contains(iter.first)) - types_to_desc.emplace(iter.first, iter.second); + else if (!types_to_desc.contains(stats_type)) + types_to_desc.emplace(stats_type, stats_desc); } } @@ -119,40 +121,39 @@ std::vector ColumnStatisticsDescription::getStatist { const auto * stat_definition_ast = definition_ast->as(); if (!stat_definition_ast) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot AST to ASTStatisticDeclaration"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot cast AST to ASTStatisticDeclaration"); + + StatisticsTypeDescMap statistics_types; + for (const auto & stat_ast : stat_definition_ast->types->children) + { + String stat_type_name = stat_ast->as().name; + auto stat_type = stringToStatisticsType(Poco::toLower(stat_type_name)); + if (statistics_types.contains(stat_type)) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Statistics type {} was specified more than once", stat_type_name); + SingleStatisticsDescription stat(stat_type, stat_ast->clone()); + + statistics_types.emplace(stat.type, stat); + } std::vector result; result.reserve(stat_definition_ast->columns->children.size()); - StatisticsTypeDescMap statistic_types; - for (const auto & stat_ast : 
stat_definition_ast->types->children) - { - String stat_type_name = stat_ast->as().name; - auto stat_type = stringToStatisticType(Poco::toLower(stat_type_name)); - if (statistic_types.contains(stat_type)) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Statistic type {} was specified more than once", stat_type_name); - SingleStatisticsDescription stat(stat_type, stat_ast->clone()); - - statistic_types.emplace(stat.type, stat); - } - for (const auto & column_ast : stat_definition_ast->columns->children) { - - ColumnStatisticsDescription types_to_desc_desc; + ColumnStatisticsDescription stats; String physical_column_name = column_ast->as().name(); if (!columns.hasPhysical(physical_column_name)) throw Exception(ErrorCodes::INCORRECT_QUERY, "Incorrect column name {}", physical_column_name); const auto & column = columns.getPhysical(physical_column_name); - types_to_desc_desc.column_name = column.name; - types_to_desc_desc.types_to_desc = statistic_types; - result.push_back(types_to_desc_desc); + stats.column_name = column.name; + stats.types_to_desc = statistics_types; + result.push_back(stats); } if (result.empty()) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Empty statistic column list is not allowed."); + throw Exception(ErrorCodes::INCORRECT_QUERY, "Empty statistics column list is not allowed."); return result; } @@ -161,27 +162,22 @@ ColumnStatisticsDescription ColumnStatisticsDescription::getStatisticFromColumnD { const auto & stat_type_list_ast = column.stat_type->as().arguments; if (stat_type_list_ast->children.empty()) - throw Exception(ErrorCodes::INCORRECT_QUERY, "We expect at least one statistic type for column {}", queryToString(column)); + throw Exception(ErrorCodes::INCORRECT_QUERY, "We expect at least one statistics type for column {}", queryToString(column)); ColumnStatisticsDescription stats; stats.column_name = column.name; for (const auto & ast : stat_type_list_ast->children) { const auto & stat_type = ast->as().name; - SingleStatisticsDescription stat(stringToStatisticType(Poco::toLower(stat_type)), ast->clone()); - stats.add(stat.type, stat); + SingleStatisticsDescription stat(stringToStatisticsType(Poco::toLower(stat_type)), ast->clone()); + if (stats.types_to_desc.contains(stat.type)) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Column {} already contains statistics type {}", stats.column_name, stat_type); + stats.types_to_desc.emplace(stat.type, std::move(stat)); } return stats; } -void ColumnStatisticsDescription::add(StatisticsType stat_type, const SingleStatisticsDescription & desc) -{ - if (types_to_desc.contains(stat_type)) - throw Exception(ErrorCodes::INCORRECT_QUERY, "Column {} already contains statistic type {}", column_name, stat_type); - types_to_desc.emplace(stat_type, desc); -} - ASTPtr ColumnStatisticsDescription::getAST() const { auto function_node = std::make_shared(); diff --git a/src/Storages/StatisticsDescription.h b/src/Storages/StatisticsDescription.h index da362b9b47d..b064644c020 100644 --- a/src/Storages/StatisticsDescription.h +++ b/src/Storages/StatisticsDescription.h @@ -31,7 +31,6 @@ struct SingleStatisticsDescription bool operator==(const SingleStatisticsDescription & other) const; }; -struct ColumnDescription; class ColumnsDescription; struct ColumnStatisticsDescription @@ -42,14 +41,12 @@ struct ColumnStatisticsDescription bool contains(const String & stat_type) const; - void merge(const ColumnStatisticsDescription & other, const ColumnDescription & column, bool if_not_exists); + void merge(const ColumnStatisticsDescription & 
other, const String & column_name, DataTypePtr column_type, bool if_not_exists); void assign(const ColumnStatisticsDescription & other); void clear(); - void add(StatisticsType stat_type, const SingleStatisticsDescription & desc); - ASTPtr getAST() const; static std::vector getStatisticsDescriptionsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns); From e9cfdc9c5643910b330fff5b29e3759b4dc3b807 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Fri, 17 May 2024 17:16:49 +0200 Subject: [PATCH 018/133] address comments --- src/Access/Common/AccessType.h | 10 +++--- src/Interpreters/InterpreterAlterQuery.cpp | 8 ++--- src/Interpreters/InterpreterCreateQuery.cpp | 10 +++--- src/Interpreters/InterpreterSelectQuery.cpp | 2 +- src/Interpreters/MutationsInterpreter.cpp | 2 +- .../Optimizations/optimizePrewhere.cpp | 2 +- .../QueryPlan/Optimizations/optimizeTree.cpp | 2 +- src/Storages/AlterCommands.cpp | 14 ++++---- src/Storages/ColumnsDescription.cpp | 6 ++-- src/Storages/ColumnsDescription.h | 2 +- src/Storages/IStorage.cpp | 4 +-- src/Storages/IStorage.h | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 10 +++--- src/Storages/MergeTree/MergeTreeData.h | 2 +- .../MergeTreeDataPartWriterOnDisk.cpp | 2 +- .../MergeTree/MergeTreeWhereOptimizer.h | 2 +- src/Storages/MergeTree/MutateTask.cpp | 28 ++++++++-------- ....cpp => ConditionSelectivityEstimator.cpp} | 6 ++-- ...ator.h => ConditionSelectivityEstimator.h} | 2 -- src/Storages/Statistics/Statistics.cpp | 23 +++++++------ src/Storages/Statistics/Statistics.h | 33 +++++++------------ src/Storages/Statistics/UniqStatistics.cpp | 1 - src/Storages/StatisticsDescription.cpp | 18 ++-------- src/Storages/StatisticsDescription.h | 6 ++-- .../config/config.xml | 2 +- .../01271_show_privileges.reference | 10 +++--- .../0_stateless/02864_statistic_exception.sql | 2 +- .../0_stateless/02864_statistic_operate.sql | 4 +-- .../0_stateless/02864_statistic_uniq.sql | 4 +-- .../0_stateless/02995_baseline_23_12_1.tsv | 4 +-- 30 files changed, 99 insertions(+), 124 deletions(-) rename src/Storages/Statistics/{ConditionEstimator.cpp => ConditionSelectivityEstimator.cpp} (97%) rename src/Storages/Statistics/{ConditionEstimator.h => ConditionSelectivityEstimator.h} (97%) diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 2c5e0f06cdc..e9f24a8c685 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -51,11 +51,11 @@ enum class AccessType : uint8_t M(ALTER_CLEAR_INDEX, "CLEAR INDEX", TABLE, ALTER_INDEX) \ M(ALTER_INDEX, "INDEX", GROUP, ALTER_TABLE) /* allows to execute ALTER ORDER BY or ALTER {ADD|DROP...} INDEX */\ \ - M(ALTER_ADD_STATISTIC, "ALTER ADD STATISTIC", TABLE, ALTER_STATISTIC) \ - M(ALTER_DROP_STATISTIC, "ALTER DROP STATISTIC", TABLE, ALTER_STATISTIC) \ - M(ALTER_MODIFY_STATISTIC, "ALTER MODIFY STATISTIC", TABLE, ALTER_STATISTIC) \ - M(ALTER_MATERIALIZE_STATISTIC, "ALTER MATERIALIZE STATISTIC", TABLE, ALTER_STATISTIC) \ - M(ALTER_STATISTIC, "STATISTIC", GROUP, ALTER_TABLE) /* allows to execute ALTER STATISTIC */\ + M(ALTER_ADD_STATISTICS, "ALTER ADD STATISTIC", TABLE, ALTER_STATISTICS) \ + M(ALTER_DROP_STATISTICS, "ALTER DROP STATISTIC", TABLE, ALTER_STATISTICS) \ + M(ALTER_MODIFY_STATISTICS, "ALTER MODIFY STATISTIC", TABLE, ALTER_STATISTICS) \ + M(ALTER_MATERIALIZE_STATISTICS, "ALTER MATERIALIZE STATISTIC", TABLE, ALTER_STATISTICS) \ + M(ALTER_STATISTICS, "STATISTIC", GROUP, ALTER_TABLE) /* allows to execute ALTER STATISTIC */\ \ M(ALTER_ADD_PROJECTION, "ADD PROJECTION", TABLE, 
ALTER_PROJECTION) \ M(ALTER_DROP_PROJECTION, "DROP PROJECTION", TABLE, ALTER_PROJECTION) \ diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index d2017bc3766..c70a3397f4e 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -345,22 +345,22 @@ AccessRightsElements InterpreterAlterQuery::getRequiredAccessForCommand(const AS } case ASTAlterCommand::ADD_STATISTICS: { - required_access.emplace_back(AccessType::ALTER_ADD_STATISTIC, database, table); + required_access.emplace_back(AccessType::ALTER_ADD_STATISTICS, database, table); break; } case ASTAlterCommand::MODIFY_STATISTICS: { - required_access.emplace_back(AccessType::ALTER_MODIFY_STATISTIC, database, table); + required_access.emplace_back(AccessType::ALTER_MODIFY_STATISTICS, database, table); break; } case ASTAlterCommand::DROP_STATISTICS: { - required_access.emplace_back(AccessType::ALTER_DROP_STATISTIC, database, table); + required_access.emplace_back(AccessType::ALTER_DROP_STATISTICS, database, table); break; } case ASTAlterCommand::MATERIALIZE_STATISTICS: { - required_access.emplace_back(AccessType::ALTER_MATERIALIZE_STATISTIC, database, table); + required_access.emplace_back(AccessType::ALTER_MATERIALIZE_STATISTICS, database, table); break; } case ASTAlterCommand::ADD_INDEX: diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 117e7a27699..d0563dc7054 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -448,9 +448,9 @@ ASTPtr InterpreterCreateQuery::formatColumns(const ColumnsDescription & columns) column_declaration->children.push_back(column_declaration->codec); } - if (!column.stats.empty()) + if (!column.statistics.empty()) { - column_declaration->stat_type = column.stats.getAST(); + column_declaration->stat_type = column.statistics.getAST(); column_declaration->children.push_back(column_declaration->stat_type); } @@ -675,13 +675,13 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( col_decl.codec, column.type, sanity_check_compression_codecs, allow_experimental_codecs, enable_deflate_qpl_codec, enable_zstd_qat_codec); } - column.stats.column_name = column.name; /// We assign column name here for better exception error message. + column.statistics.column_name = column.name; /// We assign column name here for better exception error message. if (col_decl.stat_type) { if (!skip_checks && !context_->getSettingsRef().allow_experimental_statistics) throw Exception(ErrorCodes::INCORRECT_QUERY, "Create table with statistics is now disabled. 
Turn on allow_experimental_statistics"); - column.stats = ColumnStatisticsDescription::getStatisticFromColumnDeclaration(col_decl); - column.stats.data_type = column.type; + column.statistics = ColumnStatisticsDescription::fromColumnDeclaration(col_decl); + column.statistics.data_type = column.type; } if (col_decl.ttl) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index ffe45d55643..1033a0d7ca4 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -657,7 +657,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( MergeTreeWhereOptimizer where_optimizer{ std::move(column_compressed_sizes), metadata_snapshot, - storage->getConditionEstimatorByPredicate(query_info, storage_snapshot, context), + storage->getConditionSelectivityEstimatorByPredicate(query_info, storage_snapshot, context), queried_columns, supported_prewhere_columns, log}; diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 5b3247e5005..ba33b70b59c 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -806,7 +806,7 @@ void MutationsInterpreter::prepare(bool dry_run) mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTICS_PROJECTION); for (const auto & stat_column_name: command.statistics_columns) { - if (!columns_desc.has(stat_column_name) || columns_desc.get(stat_column_name).stats.empty()) + if (!columns_desc.has(stat_column_name) || columns_desc.get(stat_column_name).statistics.empty()) throw Exception(ErrorCodes::ILLEGAL_STATISTICS, "Unknown statistics column: {}", stat_column_name); dependencies.emplace(stat_column_name, ColumnDependency::STATISTICS); materialized_statistics.emplace(stat_column_name); diff --git a/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp b/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp index 8c5839a9803..3d898cd4453 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizePrewhere.cpp @@ -83,7 +83,7 @@ void optimizePrewhere(Stack & stack, QueryPlan::Nodes &) MergeTreeWhereOptimizer where_optimizer{ std::move(column_compressed_sizes), storage_metadata, - storage.getConditionEstimatorByPredicate(source_step_with_filter->getQueryInfo(), storage_snapshot, context), + storage.getConditionSelectivityEstimatorByPredicate(source_step_with_filter->getQueryInfo(), storage_snapshot, context), queried_columns, storage.supportedPrewhereColumns(), getLogger("QueryPlanOptimizePrewhere")}; diff --git a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp index 915e664ea8f..cd069e41022 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp @@ -117,7 +117,7 @@ void optimizeTreeSecondPass(const QueryPlanOptimizationSettings & optimization_s optimizePrimaryKeyCondition(stack); /// NOTE: optimizePrewhere can modify the stack. 
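For intuition about what this dependency buys, here is a minimal sketch assembled from the 02864_statistic_operate test above; the table name and predicates are illustrative, not part of this commit.

    SET allow_experimental_statistics = 1, allow_statistics_optimize = 1;
    CREATE TABLE t (a Float64 STATISTICS(tdigest), b Int64 STATISTICS(tdigest), pk String)
    ENGINE = MergeTree ORDER BY pk;
    INSERT INTO t SELECT number, -number, generateUUIDv4() FROM system.numbers LIMIT 10000;
    -- With per-column statistics available, the selectivity estimator ranks the
    -- conjuncts and decides which of them are moved to PREWHERE:
    EXPLAIN actions = 1 SELECT count(*) FROM t WHERE b < 10 AND a < 10;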
- /// Prewhere optimization relies on PK optimization (getConditionEstimatorByPredicate) + /// Prewhere optimization relies on PK optimization (getConditionSelectivityEstimatorByPredicate) if (optimization_settings.optimize_prewhere) optimizePrewhere(stack, nodes); diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index d5621d4fc5a..59b96f9817c 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -701,11 +701,11 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) } } - auto stats_vec = ColumnStatisticsDescription::getStatisticsDescriptionsFromAST(statistics_decl, metadata.columns); + auto stats_vec = ColumnStatisticsDescription::fromAST(statistics_decl, metadata.columns); for (const auto & stats : stats_vec) { metadata.columns.modify(stats.column_name, - [&](ColumnDescription & column) { column.stats.merge(stats, column.name, column.type, if_not_exists); }); + [&](ColumnDescription & column) { column.statistics.merge(stats, column.name, column.type, if_not_exists); }); } } else if (type == DROP_STATISTICS) @@ -721,7 +721,7 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) if (!clear && !partition) metadata.columns.modify(statistics_column_name, - [&](ColumnDescription & column) { column.stats.clear(); }); + [&](ColumnDescription & column) { column.statistics.clear(); }); } } else if (type == MODIFY_STATISTICS) @@ -734,11 +734,11 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) } } - auto stats_vec = ColumnStatisticsDescription::getStatisticsDescriptionsFromAST(statistics_decl, metadata.columns); + auto stats_vec = ColumnStatisticsDescription::fromAST(statistics_decl, metadata.columns); for (const auto & stats : stats_vec) { metadata.columns.modify(stats.column_name, - [&](ColumnDescription & column) { column.stats.assign(stats); }); + [&](ColumnDescription & column) { column.statistics.assign(stats); }); } } else if (type == ADD_CONSTRAINT) @@ -862,8 +862,8 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) rename_visitor.visit(column_to_modify.default_desc.expression); if (column_to_modify.ttl) rename_visitor.visit(column_to_modify.ttl); - if (column_to_modify.name == column_name && !column_to_modify.stats.empty()) - column_to_modify.stats.column_name = rename_to; + if (column_to_modify.name == column_name && !column_to_modify.statistics.empty()) + column_to_modify.statistics.column_name = rename_to; }); } if (metadata.table_ttl.definition_ast) diff --git a/src/Storages/ColumnsDescription.cpp b/src/Storages/ColumnsDescription.cpp index 23c3c52af5e..0a5e7437a40 100644 --- a/src/Storages/ColumnsDescription.cpp +++ b/src/Storages/ColumnsDescription.cpp @@ -67,7 +67,7 @@ bool ColumnDescription::operator==(const ColumnDescription & other) const return name == other.name && type->equals(*other.type) && default_desc == other.default_desc - && stats == other.stats + && statistics == other.statistics && ast_to_str(codec) == ast_to_str(other.codec) && settings == other.settings && ast_to_str(ttl) == ast_to_str(other.ttl); @@ -114,10 +114,10 @@ void ColumnDescription::writeText(WriteBuffer & buf) const DB::writeText(")", buf); } - if (!stats.empty()) + if (!statistics.empty()) { writeChar('\t', buf); - writeEscapedString(queryToString(stats.getAST()), buf); + writeEscapedString(queryToString(statistics.getAST()), buf); } if (ttl) diff --git a/src/Storages/ColumnsDescription.h b/src/Storages/ColumnsDescription.h 
index 63f617a91cd..14ea40afab6 100644 --- a/src/Storages/ColumnsDescription.h +++ b/src/Storages/ColumnsDescription.h @@ -89,7 +89,7 @@ struct ColumnDescription ASTPtr codec; SettingsChanges settings; ASTPtr ttl; - ColumnStatisticsDescription stats; + ColumnStatisticsDescription statistics; ColumnDescription() = default; ColumnDescription(ColumnDescription &&) = default; diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index d0db2c02738..41b254300b1 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include @@ -233,7 +233,7 @@ StorageID IStorage::getStorageID() const return storage_id; } -ConditionSelectivityEstimator IStorage::getConditionEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const +ConditionSelectivityEstimator IStorage::getConditionSelectivityEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const { return {}; } diff --git a/src/Storages/IStorage.h b/src/Storages/IStorage.h index 99f6897a8f5..1aa7f503421 100644 --- a/src/Storages/IStorage.h +++ b/src/Storages/IStorage.h @@ -136,7 +136,7 @@ public: /// Returns true if the storage supports queries with the PREWHERE section. virtual bool supportsPrewhere() const { return false; } - virtual ConditionSelectivityEstimator getConditionEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const; + virtual ConditionSelectivityEstimator getConditionSelectivityEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const; /// Returns which columns supports PREWHERE, or empty std::nullopt if all columns is supported. /// This is needed for engines whose aggregates data from multiple tables, like Merge. 
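To ground the rename: `getConditionSelectivityEstimatorByPredicate` is the hook through which the PREWHERE optimizer asks a storage how selective a predicate is, and column statistics are what make that estimate non-trivial. A hedged SQL sketch of the scenario it serves (the table name and data are illustrative, modeled on the stateless tests later in this series, not taken from this diff):

``` sql
-- A sketch, assuming the experimental statistics feature; both settings appear in this patch series.
SET allow_experimental_statistics = 1;
SET allow_statistics_optimize = 1;

CREATE TABLE t_estimate
(
    a Float64 STATISTICS(tdigest),
    b Int64 STATISTICS(tdigest, uniq),
    pk String
) ENGINE = MergeTree() ORDER BY pk;

INSERT INTO t_estimate SELECT number, number % 1000, generateUUIDv4() FROM system.numbers LIMIT 10000;

-- With statistics available, the selectivity estimator lets the planner move the
-- more selective of the two conditions into PREWHERE instead of relying on heuristics.
SELECT count(*) FROM t_estimate WHERE a < 10 AND b = 0;
```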
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 270a1f5f667..e86a4bd98cc 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -72,7 +72,7 @@ #include #include #include -#include +#include #include #include #include @@ -469,7 +469,7 @@ StoragePolicyPtr MergeTreeData::getStoragePolicy() const return storage_policy; } -ConditionSelectivityEstimator MergeTreeData::getConditionEstimatorByPredicate(const SelectQueryInfo & query_info, const StorageSnapshotPtr & storage_snapshot, ContextPtr local_context) const +ConditionSelectivityEstimator MergeTreeData::getConditionSelectivityEstimatorByPredicate(const SelectQueryInfo & query_info, const StorageSnapshotPtr & storage_snapshot, ContextPtr local_context) const { if (!local_context->getSettings().allow_statistics_optimize) return {}; @@ -698,8 +698,8 @@ void MergeTreeData::checkProperties( for (const auto & col : new_metadata.columns) { - if (!col.stats.empty()) - MergeTreeStatisticsFactory::instance().validate(col.stats, col.type); + if (!col.statistics.empty()) + MergeTreeStatisticsFactory::instance().validate(col.statistics, col.type); } checkKeyExpression(*new_sorting_key.expression, new_sorting_key.sample_block, "Sorting", allow_nullable_key_); @@ -3475,7 +3475,7 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, Context new_metadata.getColumns().getPhysical(command.column_name)); const auto & old_column = old_metadata.getColumns().get(command.column_name); - if (!old_column.stats.empty()) + if (!old_column.statistics.empty()) { const auto & new_column = new_metadata.getColumns().get(command.column_name); if (!old_column.type->equals(*new_column.type)) diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index a1f1b2a7f31..43a13206921 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -426,7 +426,7 @@ public: bool supportsPrewhere() const override { return true; } - ConditionSelectivityEstimator getConditionEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const override; + ConditionSelectivityEstimator getConditionSelectivityEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const override; bool supportsFinal() const override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp index f3417975374..7ffca6db13f 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp @@ -257,7 +257,7 @@ void MergeTreeDataPartWriterOnDisk::initStatistics() stats_streams.emplace_back(std::make_unique>( stats_name, data_part->getDataPartStoragePtr(), - stats_name, STAT_FILE_SUFFIX, + stats_name, STATS_FILE_SUFFIX, default_codec, settings.max_compress_block_size, settings.query_write_settings)); } diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 92a692ab148..ba6b4660924 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index c62d925fda0..01d4d857496 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ 
b/src/Storages/MergeTree/MutateTask.cpp @@ -523,9 +523,9 @@ static std::set getStatisticsToRecalculate(const StorageMet const auto & columns = metadata_snapshot->getColumns(); for (const auto & col_desc : columns) { - if (!col_desc.stats.empty() && materialized_stats.contains(col_desc.name)) + if (!col_desc.statistics.empty() && materialized_stats.contains(col_desc.name)) { - stats_to_recalc.insert(stats_factory.get(col_desc.stats)); + stats_to_recalc.insert(stats_factory.get(col_desc.statistics)); } } return stats_to_recalc; @@ -667,7 +667,7 @@ static NameSet collectFilesToSkip( files_to_skip.insert(projection->getDirectoryName()); for (const auto & stat : stats_to_recalc) - files_to_skip.insert(stat->getFileName() + STAT_FILE_SUFFIX); + files_to_skip.insert(stat->getFileName() + STATS_FILE_SUFFIX); if (isWidePart(source_part)) { @@ -759,8 +759,8 @@ static NameToNameVector collectFilesForRenames( else if (command.type == MutationCommand::Type::DROP_STATISTICS) { for (const auto & statistics_column_name : command.statistics_columns) - if (source_part->checksums.has(STAT_FILE_PREFIX + statistics_column_name + STAT_FILE_SUFFIX)) - add_rename(STAT_FILE_PREFIX + statistics_column_name + STAT_FILE_SUFFIX, ""); + if (source_part->checksums.has(STATS_FILE_PREFIX + statistics_column_name + STATS_FILE_SUFFIX)) + add_rename(STATS_FILE_PREFIX + statistics_column_name + STATS_FILE_SUFFIX, ""); } else if (isWidePart(source_part)) { @@ -782,8 +782,8 @@ static NameToNameVector collectFilesForRenames( serialization->enumerateStreams(callback); /// if we drop a column with statistics, we should also drop the stat file. - if (source_part->checksums.has(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) - add_rename(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX, ""); + if (source_part->checksums.has(STATS_FILE_PREFIX + command.column_name + STATS_FILE_SUFFIX)) + add_rename(STATS_FILE_PREFIX + command.column_name + STATS_FILE_SUFFIX, ""); } else if (command.type == MutationCommand::Type::RENAME_COLUMN) { @@ -818,8 +818,8 @@ static NameToNameVector collectFilesForRenames( serialization->enumerateStreams(callback); /// if we rename a column with statistics, we should also rename the stat file. 
- if (source_part->checksums.has(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) - add_rename(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX, STAT_FILE_PREFIX + command.rename_to + STAT_FILE_SUFFIX); + if (source_part->checksums.has(STATS_FILE_PREFIX + command.column_name + STATS_FILE_SUFFIX)) + add_rename(STATS_FILE_PREFIX + command.column_name + STATS_FILE_SUFFIX, STATS_FILE_PREFIX + command.rename_to + STATS_FILE_SUFFIX); } else if (command.type == MutationCommand::Type::READ_COLUMN) { @@ -1461,8 +1461,8 @@ private: for (const auto & column_name : command.statistics_columns) removed_stats.insert(column_name); else if (command.type == MutationCommand::RENAME_COLUMN - && ctx->source_part->checksums.files.contains(STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX)) - renamed_stats[STAT_FILE_PREFIX + command.column_name + STAT_FILE_SUFFIX] = STAT_FILE_PREFIX + command.rename_to + STAT_FILE_SUFFIX; + && ctx->source_part->checksums.files.contains(STATS_FILE_PREFIX + command.column_name + STATS_FILE_SUFFIX)) + renamed_stats[STATS_FILE_PREFIX + command.column_name + STATS_FILE_SUFFIX] = STATS_FILE_PREFIX + command.rename_to + STATS_FILE_SUFFIX; } bool is_full_part_storage = isFullPartStorage(ctx->new_data_part->getDataPartStorage()); @@ -1502,19 +1502,19 @@ private: const auto & columns = ctx->metadata_snapshot->getColumns(); for (const auto & col : columns) { - if (col.stats.empty() || removed_stats.contains(col.name)) + if (col.statistics.empty() || removed_stats.contains(col.name)) continue; if (ctx->materialized_statistics.contains(col.name)) { - stats_to_rewrite.push_back(MergeTreeStatisticsFactory::instance().get(col.stats)); + stats_to_rewrite.push_back(MergeTreeStatisticsFactory::instance().get(col.statistics)); } else { /// We do not hard-link statistics which /// 1. In `DROP STATISTICS` statement. It is filtered by `removed_stats` /// 2. Not in column list anymore, including `DROP COLUMN`. It is not touched by this loop. 
- String stat_file_name = STAT_FILE_PREFIX + col.name + STAT_FILE_SUFFIX; + String stat_file_name = STATS_FILE_PREFIX + col.name + STATS_FILE_SUFFIX; auto it = ctx->source_part->checksums.files.find(stat_file_name); if (it != ctx->source_part->checksums.files.end()) { diff --git a/src/Storages/Statistics/ConditionEstimator.cpp b/src/Storages/Statistics/ConditionSelectivityEstimator.cpp similarity index 97% rename from src/Storages/Statistics/ConditionEstimator.cpp rename to src/Storages/Statistics/ConditionSelectivityEstimator.cpp index 05ea5bc62a5..757136fdf42 100644 --- a/src/Storages/Statistics/ConditionEstimator.cpp +++ b/src/Storages/Statistics/ConditionSelectivityEstimator.cpp @@ -1,4 +1,4 @@ -#include +#include #include namespace DB @@ -25,7 +25,7 @@ Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateLess( for (const auto & [key, estimator] : part_statistics) { result += estimator->estimateLess(val); - part_rows += estimator->count(); + part_rows += estimator->rowCount(); } return result * rows / part_rows; } @@ -49,7 +49,7 @@ Float64 ConditionSelectivityEstimator::ColumnSelectivityEstimator::estimateEqual for (const auto & [key, estimator] : part_statistics) { result += estimator->estimateEqual(val); - partial_cnt += estimator->count(); + partial_cnt += estimator->rowCount(); } return result * rows / partial_cnt; } diff --git a/src/Storages/Statistics/ConditionEstimator.h b/src/Storages/Statistics/ConditionSelectivityEstimator.h similarity index 97% rename from src/Storages/Statistics/ConditionEstimator.h rename to src/Storages/Statistics/ConditionSelectivityEstimator.h index 4e5b12194d2..f0599742276 100644 --- a/src/Storages/Statistics/ConditionEstimator.h +++ b/src/Storages/Statistics/ConditionSelectivityEstimator.h @@ -40,8 +40,6 @@ private: std::pair extractBinaryOp(const RPNBuilderTreeNode & node, const String & column_name) const; public: - ConditionSelectivityEstimator() = default; - /// TODO: Support the condition consists of CNF/DNF like (cond1 and cond2) or (cond3) ... /// Right now we only support simple condition like col = val / col < val Float64 estimateRowCount(const RPNBuilderTreeNode & node) const; diff --git a/src/Storages/Statistics/Statistics.cpp b/src/Storages/Statistics/Statistics.cpp index 0f63a286f75..fed0bd61c03 100644 --- a/src/Storages/Statistics/Statistics.cpp +++ b/src/Storages/Statistics/Statistics.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include @@ -20,12 +20,13 @@ namespace ErrorCodes extern const int INCORRECT_QUERY; } +/// Version / bitmask of statistics / data of statistics / enum StatisticsFileVersion : UInt16 { V0 = 0, }; -/// Version / bitmask of statistics / data of statistics / +IStatistics::IStatistics(const SingleStatisticsDescription & stat_) : stat(stat_) {} ColumnStatistics::ColumnStatistics(const ColumnStatisticsDescription & stats_desc_) : stats_desc(stats_desc_), rows(0) @@ -58,6 +59,8 @@ Float64 ColumnStatistics::estimateEqual(Float64 val) const if (stats.contains(StatisticsType::Uniq) && stats.contains(StatisticsType::TDigest)) { auto uniq_static = std::static_pointer_cast(stats.at(StatisticsType::Uniq)); + /// 2048 is the default number of buckets in TDigest. In this case, TDigest stores exactly one value (with many rows) + /// for every bucket. 
if (uniq_static->getCardinality() < 2048) { auto tdigest_static = std::static_pointer_cast(stats.at(StatisticsType::TDigest)); @@ -75,17 +78,13 @@ void ColumnStatistics::serialize(WriteBuffer & buf) writeIntBinary(V0, buf); UInt64 stat_types_mask = 0; for (const auto & [type, _]: stats) - { stat_types_mask |= 1 << UInt8(type); - } writeIntBinary(stat_types_mask, buf); /// We write some basic statistics writeIntBinary(rows, buf); /// We write complex statistics for (const auto & [type, stat_ptr]: stats) - { stat_ptr->serialize(buf); - } } void ColumnStatistics::deserialize(ReadBuffer &buf) @@ -102,19 +101,19 @@ void ColumnStatistics::deserialize(ReadBuffer &buf) { if (!(stat_types_mask & 1 << UInt8(it->first))) { - stats.erase(it ++); + stats.erase(it++); } else { it->second->deserialize(buf); - ++ it; + ++it; } } } String ColumnStatistics::getFileName() const { - return STAT_FILE_PREFIX + columnName(); + return STATS_FILE_PREFIX + columnName(); } const String & ColumnStatistics::columnName() const @@ -122,7 +121,7 @@ const String & ColumnStatistics::columnName() const return stats_desc.column_name; } -UInt64 ColumnStatistics::count() const +UInt64 ColumnStatistics::rowCount() const { return rows; } @@ -188,8 +187,8 @@ ColumnsStatistics MergeTreeStatisticsFactory::getMany(const ColumnsDescription & { ColumnsStatistics result; for (const auto & col : columns) - if (!col.stats.empty()) - result.push_back(get(col.stats)); + if (!col.statistics.empty()) + result.push_back(get(col.statistics)); return result; } diff --git a/src/Storages/Statistics/Statistics.h b/src/Storages/Statistics/Statistics.h index 1415f0a5d2f..2ab1337af02 100644 --- a/src/Storages/Statistics/Statistics.h +++ b/src/Storages/Statistics/Statistics.h @@ -1,9 +1,6 @@ #pragma once -#include #include -#include - #include #include @@ -13,24 +10,22 @@ #include +namespace DB +{ + /// this is for user-defined statistic. constexpr auto STATS_FILE_PREFIX = "statistics_"; constexpr auto STATS_FILE_SUFFIX = ".stats"; -namespace DB -{ - -/// Statistics contains the distribution of values in a column. -/// right now we support -/// - tdigest -/// - uniq(hyperloglog) +/// Statistics describe properties of the values in the column, +/// e.g. how many unique values exist, +/// what are the N most frequent values, +/// how frequent is a value V, etc. 
class IStatistics { public: - explicit IStatistics(const SingleStatisticsDescription & stat_) - : stat(stat_) - { - } + explicit IStatistics(const SingleStatisticsDescription & stat_); + virtual ~IStatistics() = default; virtual void serialize(WriteBuffer & buf) = 0; @@ -40,17 +35,11 @@ public: virtual void update(const ColumnPtr & column) = 0; protected: - SingleStatisticsDescription stat; - }; using StatisticsPtr = std::shared_ptr; -class ColumnStatistics; -using ColumnStatisticsPtr = std::shared_ptr; -using ColumnsStatistics = std::vector; - class ColumnStatistics { public: @@ -61,7 +50,7 @@ public: const String & columnName() const; - UInt64 count() const; + UInt64 rowCount() const; void update(const ColumnPtr & column); @@ -80,6 +69,8 @@ private: }; class ColumnsDescription; +using ColumnStatisticsPtr = std::shared_ptr; +using ColumnsStatistics = std::vector; class MergeTreeStatisticsFactory : private boost::noncopyable { diff --git a/src/Storages/Statistics/UniqStatistics.cpp b/src/Storages/Statistics/UniqStatistics.cpp index 7f99a91cf86..59d71c5aff6 100644 --- a/src/Storages/Statistics/UniqStatistics.cpp +++ b/src/Storages/Statistics/UniqStatistics.cpp @@ -15,7 +15,6 @@ UniqStatistics::UniqStatistics(const SingleStatisticsDescription & stat_, const { arena = std::make_unique(); AggregateFunctionProperties properties; - properties.returns_default_when_only_null = true; collector = AggregateFunctionFactory::instance().get("uniq", NullsAction::IGNORE_NULLS, {data_type}, Array(), properties); data = arena->alignedAlloc(collector->sizeOfData(), collector->alignOfData()); collector->create(data); diff --git a/src/Storages/StatisticsDescription.cpp b/src/Storages/StatisticsDescription.cpp index 3de7b8159b7..b7d2507e21a 100644 --- a/src/Storages/StatisticsDescription.cpp +++ b/src/Storages/StatisticsDescription.cpp @@ -58,19 +58,7 @@ bool SingleStatisticsDescription::operator==(const SingleStatisticsDescription & bool ColumnStatisticsDescription::operator==(const ColumnStatisticsDescription & other) const { - if (types_to_desc.size() != other.types_to_desc.size()) - return false; - - for (const auto & [type, desc] : types_to_desc) - { - StatisticsType stats_type = type; - if (!other.types_to_desc.contains(stats_type)) - return false; - if (!(desc == other.types_to_desc.at(stats_type))) - return false; - } - - return true; + return types_to_desc == other.types_to_desc; } bool ColumnStatisticsDescription::empty() const @@ -117,7 +105,7 @@ void ColumnStatisticsDescription::clear() types_to_desc.clear(); } -std::vector ColumnStatisticsDescription::getStatisticsDescriptionsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns) +std::vector ColumnStatisticsDescription::fromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns) { const auto * stat_definition_ast = definition_ast->as(); if (!stat_definition_ast) @@ -158,7 +146,7 @@ std::vector ColumnStatisticsDescription::getStatist return result; } -ColumnStatisticsDescription ColumnStatisticsDescription::getStatisticFromColumnDeclaration(const ASTColumnDeclaration & column) +ColumnStatisticsDescription ColumnStatisticsDescription::fromColumnDeclaration(const ASTColumnDeclaration & column) { const auto & stat_type_list_ast = column.stat_type->as().arguments; if (stat_type_list_ast->children.empty()) diff --git a/src/Storages/StatisticsDescription.h b/src/Storages/StatisticsDescription.h index b064644c020..c26cb91020b 100644 --- a/src/Storages/StatisticsDescription.h +++ b/src/Storages/StatisticsDescription.h @@ 
-14,7 +14,7 @@ enum class StatisticsType : UInt8 TDigest = 0, Uniq = 1, - UnknownStatistics = 63, + Max = 63, }; struct SingleStatisticsDescription @@ -49,8 +49,8 @@ struct ColumnStatisticsDescription ASTPtr getAST() const; - static std::vector getStatisticsDescriptionsFromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns); - static ColumnStatisticsDescription getStatisticFromColumnDeclaration(const ASTColumnDeclaration & column); + static std::vector fromAST(const ASTPtr & definition_ast, const ColumnsDescription & columns); + static ColumnStatisticsDescription fromColumnDeclaration(const ASTColumnDeclaration & column); using StatisticsTypeDescMap = std::map; StatisticsTypeDescMap types_to_desc; diff --git a/tests/integration/test_manipulate_statistics/config/config.xml b/tests/integration/test_manipulate_statistics/config/config.xml index b47f8123499..c448798a7c1 100644 --- a/tests/integration/test_manipulate_statistics/config/config.xml +++ b/tests/integration/test_manipulate_statistics/config/config.xml @@ -1,7 +1,7 @@ - 1 + 1 diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index a7a0509fbd2..17554f5c8a5 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -24,11 +24,11 @@ ALTER DROP INDEX ['DROP INDEX'] TABLE ALTER INDEX ALTER MATERIALIZE INDEX ['MATERIALIZE INDEX'] TABLE ALTER INDEX ALTER CLEAR INDEX ['CLEAR INDEX'] TABLE ALTER INDEX ALTER INDEX ['INDEX'] \N ALTER TABLE -ALTER ADD STATISTIC ['ALTER ADD STATISTIC'] TABLE ALTER STATISTIC -ALTER DROP STATISTIC ['ALTER DROP STATISTIC'] TABLE ALTER STATISTIC -ALTER MODIFY STATISTIC ['ALTER MODIFY STATISTIC'] TABLE ALTER STATISTIC -ALTER MATERIALIZE STATISTIC ['ALTER MATERIALIZE STATISTIC'] TABLE ALTER STATISTIC -ALTER STATISTIC ['STATISTIC'] \N ALTER TABLE +ALTER ADD STATISTICS ['ALTER ADD STATISTIC'] TABLE ALTER STATISTICS +ALTER DROP STATISTICS ['ALTER DROP STATISTIC'] TABLE ALTER STATISTICS +ALTER MODIFY STATISTICS ['ALTER MODIFY STATISTIC'] TABLE ALTER STATISTICS +ALTER MATERIALIZE STATISTICS ['ALTER MATERIALIZE STATISTIC'] TABLE ALTER STATISTICS +ALTER STATISTICS ['STATISTIC'] \N ALTER TABLE ALTER ADD PROJECTION ['ADD PROJECTION'] TABLE ALTER PROJECTION ALTER DROP PROJECTION ['DROP PROJECTION'] TABLE ALTER PROJECTION ALTER MATERIALIZE PROJECTION ['MATERIALIZE PROJECTION'] TABLE ALTER PROJECTION diff --git a/tests/queries/0_stateless/02864_statistic_exception.sql b/tests/queries/0_stateless/02864_statistic_exception.sql index 4597ed11d4d..8dde46af887 100644 --- a/tests/queries/0_stateless/02864_statistic_exception.sql +++ b/tests/queries/0_stateless/02864_statistic_exception.sql @@ -7,7 +7,7 @@ CREATE TABLE t1 pk String, ) Engine = MergeTree() ORDER BY pk; -- { serverError INCORRECT_QUERY } -SET allow_experimental_statistic = 1; +SET allow_experimental_statistics = 1; CREATE TABLE t1 ( diff --git a/tests/queries/0_stateless/02864_statistic_operate.sql b/tests/queries/0_stateless/02864_statistic_operate.sql index 914e58d7d3a..bf69c11bc91 100644 --- a/tests/queries/0_stateless/02864_statistic_operate.sql +++ b/tests/queries/0_stateless/02864_statistic_operate.sql @@ -1,7 +1,7 @@ DROP TABLE IF EXISTS t1; -SET allow_experimental_statistic = 1; -SET allow_statistic_optimize = 1; +SET allow_experimental_statistics = 1; +SET allow_statistics_optimize = 1; CREATE TABLE t1 ( diff --git a/tests/queries/0_stateless/02864_statistic_uniq.sql 
b/tests/queries/0_stateless/02864_statistic_uniq.sql index 79bd9a50732..818d2f973c8 100644 --- a/tests/queries/0_stateless/02864_statistic_uniq.sql +++ b/tests/queries/0_stateless/02864_statistic_uniq.sql @@ -1,7 +1,7 @@ DROP TABLE IF EXISTS t1; -SET allow_experimental_statistic = 1; -SET allow_statistic_optimize = 1; +SET allow_experimental_statistics = 1; +SET allow_statistics_optimize = 1; CREATE TABLE t1 ( diff --git a/tests/queries/0_stateless/02995_baseline_23_12_1.tsv b/tests/queries/0_stateless/02995_baseline_23_12_1.tsv index 4c0c9125b46..a391473e7c9 100644 --- a/tests/queries/0_stateless/02995_baseline_23_12_1.tsv +++ b/tests/queries/0_stateless/02995_baseline_23_12_1.tsv @@ -41,7 +41,7 @@ allow_experimental_query_deduplication 0 allow_experimental_refreshable_materialized_view 0 allow_experimental_s3queue 1 allow_experimental_shared_merge_tree 0 -allow_experimental_statistic 0 +allow_experimental_statistics 0 allow_experimental_undrop_table_query 1 allow_experimental_usearch_index 0 allow_experimental_window_functions 1 @@ -58,7 +58,7 @@ allow_prefetched_read_pool_for_remote_filesystem 1 allow_push_predicate_when_subquery_contains_with 1 allow_settings_after_format_in_insert 0 allow_simdjson 1 -allow_statistic_optimize 0 +allow_statistics_optimize 0 allow_suspicious_codecs 0 allow_suspicious_fixed_string_types 0 allow_suspicious_indices 0 From 2e2d20717b1dda7075e99d16a06fa7f45790eeb0 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Fri, 17 May 2024 17:37:16 +0200 Subject: [PATCH 019/133] refine docs --- .../mergetree-family/mergetree.md | 18 +++++----- docs/en/operations/settings/settings.md | 2 +- .../sql-reference/statements/alter/index.md | 2 +- .../statements/alter/statistic.md | 27 --------------- .../statements/alter/statistics.md | 33 +++++++++++++++++++ 5 files changed, 44 insertions(+), 38 deletions(-) delete mode 100644 docs/en/sql-reference/statements/alter/statistic.md create mode 100644 docs/en/sql-reference/statements/alter/statistics.md diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index c009a648b44..0a9f6202a51 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -1039,12 +1039,12 @@ ClickHouse versions 22.3 through 22.7 use a different cache configuration, see [ ## Column Statistics (Experimental) {#column-statistics} -The statistic declaration is in the columns section of the `CREATE` query for tables from the `*MergeTree*` Family when we enable `set allow_experimental_statistic = 1`. +The statistics declaration is in the columns section of the `CREATE` query for tables from the `*MergeTree*` Family when we enable `set allow_experimental_statistics = 1`. ``` sql CREATE TABLE tab ( - a Int64 STATISTIC(tdigest, uniq), + a Int64 STATISTICS(TDigest, Uniq), b Float64 ) ENGINE = MergeTree @@ -1054,22 +1054,22 @@ ORDER BY a We can also manipulate statistics with `ALTER` statements. ```sql -ALTER TABLE tab ADD STATISTIC b TYPE tdigest, uniq; -ALTER TABLE tab DROP STATISTIC a; +ALTER TABLE tab ADD STATISTICS b TYPE TDigest, Uniq; +ALTER TABLE tab DROP STATISTICS a; ``` -These lightweight statistics aggregate information about distribution of values in columns. -They can be used for query optimization when we enable `set allow_statistic_optimize = 1`. +These lightweight statistics aggregate information about distribution of values in columns. 
Statistics are stored in every part and are updated on every insert.
+They can be used for prewhere optimization only if we enable `set allow_statistics_optimize = 1`.
 
 #### Available Types of Column Statistics {#available-types-of-column-statistics}
 
-- `tdigest`
+- `TDigest`
 
     Stores distribution of values from numeric columns in [TDigest](https://github.com/tdunning/t-digest) sketch.
 
-- `uniq`
+- `Uniq`
 
-    Estimate the number of distinct values of a column.
+    Estimates the number of distinct values of a column using HyperLogLog.
 
 ## Column-level Settings {#column-level-settings}
diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index 91b544c6a82..c69cfcb75f9 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -5038,7 +5038,7 @@ a Tuple(
 )
 ```
 
-## allow_experimental_statistic {#allow_experimental_statistic}
+## allow_experimental_statistics {#allow_experimental_statistics}
 
 Allows defining columns with [statistics](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) and [manipulate statistics](../../engines/table-engines/mergetree-family/mergetree.md#column-statistics).
diff --git a/docs/en/sql-reference/statements/alter/index.md b/docs/en/sql-reference/statements/alter/index.md
index 7961315c193..edd976ae951 100644
--- a/docs/en/sql-reference/statements/alter/index.md
+++ b/docs/en/sql-reference/statements/alter/index.md
@@ -16,7 +16,7 @@ Most `ALTER TABLE` queries modify table settings or data:
 - [INDEX](/docs/en/sql-reference/statements/alter/skipping-index.md)
 - [CONSTRAINT](/docs/en/sql-reference/statements/alter/constraint.md)
 - [TTL](/docs/en/sql-reference/statements/alter/ttl.md)
-- [STATISTIC](/docs/en/sql-reference/statements/alter/statistic.md)
+- [STATISTICS](/docs/en/sql-reference/statements/alter/statistics.md)
 - [APPLY DELETED MASK](/docs/en/sql-reference/statements/alter/apply-deleted-mask.md)
 
 :::note
diff --git a/docs/en/sql-reference/statements/alter/statistic.md b/docs/en/sql-reference/statements/alter/statistic.md
deleted file mode 100644
index 08010a3911d..00000000000
--- a/docs/en/sql-reference/statements/alter/statistic.md
+++ /dev/null
@@ -1,27 +0,0 @@
----
-slug: /en/sql-reference/statements/alter/statistic
-sidebar_position: 45
-sidebar_label: STATISTIC
----
-
-# Manipulating Column Statistics
-
-The following operations are available:
-
-- `ALTER TABLE [db].table ADD STATISTIC (columns list) TYPE (type list)` - Adds statistic description to tables metadata.
-
-- `ALTER TABLE [db].table MODIFY STATISTIC (columns list) TYPE (type list)` - Modifies statistic description to tables metadata.
-
-- `ALTER TABLE [db].table DROP STATISTIC (columns list)` - Removes statistic description from tables metadata and deletes statistic files from disk.
-
-- `ALTER TABLE [db].table CLEAR STATISTIC (columns list)` - Deletes statistic files from disk.
-
-- `ALTER TABLE [db.]table MATERIALIZE STATISTIC (columns list)` - Rebuilds the statistic for columns. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
-
-The first two commands are lightweight in a sense that they only change metadata or remove files.
-
-Also, they are replicated, syncing statistics metadata via ZooKeeper.
-
-:::note
-Statistic manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants).
-:::
diff --git a/docs/en/sql-reference/statements/alter/statistics.md b/docs/en/sql-reference/statements/alter/statistics.md
new file mode 100644
index 00000000000..d8c107c46f9
--- /dev/null
+++ b/docs/en/sql-reference/statements/alter/statistics.md
@@ -0,0 +1,33 @@
+---
+slug: /en/sql-reference/statements/alter/statistics
+sidebar_position: 45
+sidebar_label: STATISTICS
+---
+
+# Manipulating Column Statistics
+
+The following operations are available:
+
+- `ALTER TABLE [db].table ADD STATISTICS (columns list) TYPE (type list)` - Adds statistic description to tables metadata.
+
+- `ALTER TABLE [db].table MODIFY STATISTICS (columns list) TYPE (type list)` - Modifies statistic description to tables metadata.
+
+- `ALTER TABLE [db].table DROP STATISTICS (columns list)` - Removes statistic description from tables metadata and deletes statistic files from disk.
+
+- `ALTER TABLE [db].table CLEAR STATISTICS (columns list)` - Deletes statistic files from disk.
+
+- `ALTER TABLE [db.]table MATERIALIZE STATISTICS (columns list)` - Rebuilds the statistic for columns. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations).
+
+The first two commands are lightweight in the sense that they only change metadata or remove files.
+
+Also, they are replicated, syncing statistics metadata via ZooKeeper.
+
+Here is an example that adds two statistics types to two columns:
+
+``` sql
+ALTER TABLE t1 MODIFY STATISTICS c, d TYPE TDigest, Uniq;
+```
+
+:::note
+Statistics manipulation is supported only for tables with [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md) engine (including [replicated](../../../engines/table-engines/mergetree-family/replication.md) variants).
+::: From 93a6c1e5a886737e3ddd0d52dba588feb8c56945 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Tue, 21 May 2024 16:03:38 +0200 Subject: [PATCH 020/133] fix tests --- .../integration/test_manipulate_statistics/config/config.xml | 2 +- tests/integration/test_manipulate_statistics/test.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_manipulate_statistics/config/config.xml b/tests/integration/test_manipulate_statistics/config/config.xml index c448798a7c1..24225173eeb 100644 --- a/tests/integration/test_manipulate_statistics/config/config.xml +++ b/tests/integration/test_manipulate_statistics/config/config.xml @@ -1,7 +1,7 @@ - 1 + 1 diff --git a/tests/integration/test_manipulate_statistics/test.py b/tests/integration/test_manipulate_statistics/test.py index e6291024e76..2b26af940d1 100644 --- a/tests/integration/test_manipulate_statistics/test.py +++ b/tests/integration/test_manipulate_statistics/test.py @@ -34,14 +34,14 @@ def check_stat_file_on_disk(node, table, part_name, column_name, exist): [ "bash", "-c", - "find {p} -type f -name statistic_{col}.stat".format( + "find {p} -type f -name statistics_{col}.stats".format( p=part_path, col=column_name ), ], privileged=True, ) logging.debug( - f"Checking stat file in {part_path} for column {column_name}, got {output}" + f"Checking stats file in {part_path} for column {column_name}, got {output}" ) if exist: assert len(output) != 0 From 6e15d6b3448da6fe9866736ca45e53abc221efbd Mon Sep 17 00:00:00 2001 From: Han Fei Date: Wed, 22 May 2024 15:24:18 +0200 Subject: [PATCH 021/133] address comments --- docs/en/sql-reference/statements/alter/statistics.md | 4 ++-- src/Storages/AlterCommands.cpp | 3 ++- ....reference => 02864_statistics_exception.reference} | 0 ...ic_exception.sql => 02864_statistics_exception.sql} | 10 +++++++--- ...te.reference => 02864_statistics_operate.reference} | 0 ...tistic_operate.sql => 02864_statistics_operate.sql} | 0 ..._uniq.reference => 02864_statistics_uniq.reference} | 0 ...64_statistic_uniq.sql => 02864_statistics_uniq.sql} | 0 8 files changed, 11 insertions(+), 6 deletions(-) rename tests/queries/0_stateless/{02864_statistic_exception.reference => 02864_statistics_exception.reference} (100%) rename tests/queries/0_stateless/{02864_statistic_exception.sql => 02864_statistics_exception.sql} (78%) rename tests/queries/0_stateless/{02864_statistic_operate.reference => 02864_statistics_operate.reference} (100%) rename tests/queries/0_stateless/{02864_statistic_operate.sql => 02864_statistics_operate.sql} (100%) rename tests/queries/0_stateless/{02864_statistic_uniq.reference => 02864_statistics_uniq.reference} (100%) rename tests/queries/0_stateless/{02864_statistic_uniq.sql => 02864_statistics_uniq.sql} (100%) diff --git a/docs/en/sql-reference/statements/alter/statistics.md b/docs/en/sql-reference/statements/alter/statistics.md index d8c107c46f9..80024781f88 100644 --- a/docs/en/sql-reference/statements/alter/statistics.md +++ b/docs/en/sql-reference/statements/alter/statistics.md @@ -12,9 +12,9 @@ The following operations are available: - `ALTER TABLE [db].table MODIFY STATISTICS (columns list) TYPE (type list)` - Modifies statistic description to tables metadata. -- `ALTER TABLE [db].table DROP STATISTICS (columns list)` - Removes statistic description from tables metadata and deletes statistic files from disk. 
+- `ALTER TABLE [db].table DROP STATISTICS (columns list)` - Removes statistics from the metadata of the specified columns and deletes all statistics objects in all parts for the specified columns. -- `ALTER TABLE [db].table CLEAR STATISTICS (columns list)` - Deletes statistic files from disk. +- `ALTER TABLE [db].table CLEAR STATISTICS (columns list)` - Deletes all statistics objects in all parts for the specified columns. Statistics objects can be rebuilt using `ALTER TABLE MATERIALIZE STATISTICS`. - `ALTER TABLE [db.]table MATERIALIZE STATISTICS (columns list)` - Rebuilds the statistic for columns. Implemented as a [mutation](../../../sql-reference/statements/alter/index.md#mutations). diff --git a/src/Storages/AlterCommands.cpp b/src/Storages/AlterCommands.cpp index 59b96f9817c..6628b7efc5d 100644 --- a/src/Storages/AlterCommands.cpp +++ b/src/Storages/AlterCommands.cpp @@ -712,7 +712,8 @@ void AlterCommand::apply(StorageInMemoryMetadata & metadata, ContextPtr context) { for (const auto & statistics_column_name : statistics_columns) { - if (!metadata.columns.has(statistics_column_name)) + if (!metadata.columns.has(statistics_column_name) + || metadata.columns.get(statistics_column_name).statistics.empty()) { if (if_exists) return; diff --git a/tests/queries/0_stateless/02864_statistic_exception.reference b/tests/queries/0_stateless/02864_statistics_exception.reference similarity index 100% rename from tests/queries/0_stateless/02864_statistic_exception.reference rename to tests/queries/0_stateless/02864_statistics_exception.reference diff --git a/tests/queries/0_stateless/02864_statistic_exception.sql b/tests/queries/0_stateless/02864_statistics_exception.sql similarity index 78% rename from tests/queries/0_stateless/02864_statistic_exception.sql rename to tests/queries/0_stateless/02864_statistics_exception.sql index 8dde46af887..c531d39cd69 100644 --- a/tests/queries/0_stateless/02864_statistic_exception.sql +++ b/tests/queries/0_stateless/02864_statistics_exception.sql @@ -37,12 +37,16 @@ CREATE TABLE t1 ALTER TABLE t1 ADD STATISTICS a TYPE xyz; -- { serverError INCORRECT_QUERY } ALTER TABLE t1 ADD STATISTICS a TYPE tdigest; +ALTER TABLE t1 ADD STATISTICS IF NOT EXISTS a TYPE tdigest; ALTER TABLE t1 ADD STATISTICS a TYPE tdigest; -- { serverError ILLEGAL_STATISTICS } +-- Statistics can be created only on integer columns +ALTER TABLE t1 MODIFY STATISTICS a TYPE tdigest; ALTER TABLE t1 ADD STATISTICS pk TYPE tdigest; -- { serverError ILLEGAL_STATISTICS } -ALTER TABLE t1 DROP STATISTICS b; +ALTER TABLE t1 DROP STATISTICS b; -- { serverError ILLEGAL_STATISTICS } ALTER TABLE t1 DROP STATISTICS a; -ALTER TABLE t1 DROP STATISTICS a; -ALTER TABLE t1 CLEAR STATISTICS a; +ALTER TABLE t1 DROP STATISTICS IF EXISTS a; +ALTER TABLE t1 CLEAR STATISTICS a; -- { serverError ILLEGAL_STATISTICS } +ALTER TABLE t1 CLEAR STATISTICS IF EXISTS a; ALTER TABLE t1 MATERIALIZE STATISTICS b; -- { serverError ILLEGAL_STATISTICS } ALTER TABLE t1 ADD STATISTICS a TYPE tdigest; diff --git a/tests/queries/0_stateless/02864_statistic_operate.reference b/tests/queries/0_stateless/02864_statistics_operate.reference similarity index 100% rename from tests/queries/0_stateless/02864_statistic_operate.reference rename to tests/queries/0_stateless/02864_statistics_operate.reference diff --git a/tests/queries/0_stateless/02864_statistic_operate.sql b/tests/queries/0_stateless/02864_statistics_operate.sql similarity index 100% rename from tests/queries/0_stateless/02864_statistic_operate.sql rename to
tests/queries/0_stateless/02864_statistics_operate.sql diff --git a/tests/queries/0_stateless/02864_statistic_uniq.reference b/tests/queries/0_stateless/02864_statistics_uniq.reference similarity index 100% rename from tests/queries/0_stateless/02864_statistic_uniq.reference rename to tests/queries/0_stateless/02864_statistics_uniq.reference diff --git a/tests/queries/0_stateless/02864_statistic_uniq.sql b/tests/queries/0_stateless/02864_statistics_uniq.sql similarity index 100% rename from tests/queries/0_stateless/02864_statistic_uniq.sql rename to tests/queries/0_stateless/02864_statistics_uniq.sql From a94845920f7fce05cfbc859ff663a4d14f7478b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Wed, 22 May 2024 21:18:58 +0200 Subject: [PATCH 022/133] Make `settings_changes_history` const --- src/Core/SettingsChangesHistory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 21552a336c0..ab6d040849e 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -83,7 +83,7 @@ namespace SettingsChangesHistory /// For newly added setting choose the most appropriate previous_value (for example, if new setting /// controls new feature and it's 'true' by default, use 'false' as previous_value). /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972) -static std::map settings_changes_history = +static const std::map settings_changes_history = { {"24.5", {{"allow_deprecated_functions", true, false, "Allow usage of deprecated functions"}, {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. 
t1.y < t2.y."}, From 21d6f9ef2232d87d4657eaed1c0a1ce7f88c3410 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=B8=D1=80=D0=B8=D0=BB=D0=BB=20=D0=93=D0=B0=D1=80?= =?UTF-8?q?=D0=B1=D0=B0=D1=80?= Date: Thu, 23 May 2024 03:13:25 +0300 Subject: [PATCH 023/133] Prevent conversion to Replicated if zookeeper path already exists --- src/Databases/DatabaseOrdinary.cpp | 15 ++++ .../test_zk_path_exists.py | 69 +++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 tests/integration/test_modify_engine_on_restart/test_zk_path_exists.py diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index 5d36f1cc3d6..10a8e06e8f0 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -44,6 +44,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int UNKNOWN_DATABASE_ENGINE; extern const int NOT_IMPLEMENTED; + extern const int UNEXPECTED_NODE_IN_ZOOKEEPER; } static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768; @@ -76,6 +77,20 @@ static void setReplicatedEngine(ASTCreateQuery * create_query, ContextPtr contex String replica_path = server_settings.default_replica_path; String replica_name = server_settings.default_replica_name; + /// Check that replica path doesn't exist + Macros::MacroExpansionInfo info; + StorageID table_id = StorageID(create_query->getDatabase(), create_query->getTable(), create_query->uuid); + info.table_id = table_id; + info.expand_special_macros_only = false; + + String zookeeper_path = context->getMacros()->expand(replica_path, info); + if (context->getZooKeeper()->exists(zookeeper_path)) + throw Exception( + ErrorCodes::UNEXPECTED_NODE_IN_ZOOKEEPER, + "Found existing ZooKeeper path {} while trying to convert table {} to replicated. Table will not be converted.", + zookeeper_path, backQuote(table_id.getFullTableName()) + ); + auto args = std::make_shared(); args->children.push_back(std::make_shared(replica_path)); args->children.push_back(std::make_shared(replica_name)); diff --git a/tests/integration/test_modify_engine_on_restart/test_zk_path_exists.py b/tests/integration/test_modify_engine_on_restart/test_zk_path_exists.py new file mode 100644 index 00000000000..3bf492cf69d --- /dev/null +++ b/tests/integration/test_modify_engine_on_restart/test_zk_path_exists.py @@ -0,0 +1,69 @@ +import pytest +from test_modify_engine_on_restart.common import ( + get_table_path, + set_convert_flags, +) +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +ch1 = cluster.add_instance( + "ch1", + main_configs=[ + "configs/config.d/clusters.xml", + "configs/config.d/distributed_ddl.xml", + ], + with_zookeeper=True, + macros={"replica": "node1"}, + stay_alive=True, +) + +database_name = "modify_engine_zk_path" + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + + finally: + cluster.shutdown() + + +def q(node, query): + return node.query(database=database_name, sql=query) + + +def test_modify_engine_fails_if_zk_path_exists(started_cluster): + ch1.query("CREATE DATABASE " + database_name) + + q( + ch1, + "CREATE TABLE already_exists_1 ( A Int64, D Date, S String ) ENGINE MergeTree() PARTITION BY toYYYYMM(D) ORDER BY A;", + ) + uuid = q( + ch1, + f"SELECT uuid FROM system.tables WHERE table = 'already_exists_1' and database = '{database_name}'", + ).strip("'[]\n") + + q( + ch1, + f"CREATE TABLE already_exists_2 ( A Int64, D Date, S String ) ENGINE ReplicatedMergeTree('/clickhouse/tables/{uuid}/{{shard}}', 'node2') 
PARTITION BY toYYYYMM(D) ORDER BY A;", + ) + + set_convert_flags(ch1, database_name, ["already_exists_1"]) + + table_data_path = get_table_path(ch1, "already_exists_1", database_name) + + ch1.stop_clickhouse() + ch1.start_clickhouse(start_wait_sec=120, expected_to_fail=True) + + # Check if we can cancel the conversion + ch1.exec_in_container( + [ + "bash", + "-c", + f"rm {table_data_path}convert_to_replicated", + ] + ) + ch1.start_clickhouse() From 76eae6269403e2e8d2baf5f0fd995d0045e6fe9c Mon Sep 17 00:00:00 2001 From: Han Fei Date: Thu, 23 May 2024 15:28:12 +0200 Subject: [PATCH 024/133] fix fuzzer --- src/Storages/Statistics/UniqStatistics.cpp | 4 ++- .../0_stateless/02864_statistics_uniq.sql | 26 +++++++++++++++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/Storages/Statistics/UniqStatistics.cpp b/src/Storages/Statistics/UniqStatistics.cpp index 59d71c5aff6..0a96d7bdc3f 100644 --- a/src/Storages/Statistics/UniqStatistics.cpp +++ b/src/Storages/Statistics/UniqStatistics.cpp @@ -44,7 +44,9 @@ void UniqStatistics::deserialize(ReadBuffer & buf) void UniqStatistics::update(const ColumnPtr & column) { - const IColumn * col_ptr = column.get(); + /// TODO(hanfei): For low cardinality, it's very slow to convert to full column. We can read the dictionary directly. + /// Here we intend to avoid crash in CI. + const IColumn * col_ptr = column->convertToFullColumnIfLowCardinality().get(); collector->addBatchSinglePlace(0, column->size(), data, &col_ptr, nullptr); } diff --git a/tests/queries/0_stateless/02864_statistics_uniq.sql b/tests/queries/0_stateless/02864_statistics_uniq.sql index 818d2f973c8..c6b51d2a377 100644 --- a/tests/queries/0_stateless/02864_statistics_uniq.sql +++ b/tests/queries/0_stateless/02864_statistics_uniq.sql @@ -43,3 +43,29 @@ SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN act SELECT replaceRegexpAll(explain, '__table1.|_UInt8|_Int8', '') FROM (EXPLAIN actions=1 SELECT count(*) FROM t1 WHERE b < 10 and c < -1 and a < 10) WHERE explain LIKE '%Prewhere%' OR explain LIKE '%Filter column%'; DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; +SET allow_suspicious_low_cardinality_types=1; +CREATE TABLE t2 +( + a Float64 STATISTICS(tdigest), + b Int64 STATISTICS(tdigest), + c LowCardinality(Int64) STATISTICS(tdigest, uniq), + pk String, +) Engine = MergeTree() ORDER BY pk +SETTINGS min_bytes_for_wide_part = 0; +INSERT INTO t2 select number, -number, number/1000, generateUUIDv4() FROM system.numbers LIMIT 10000; + +DROP TABLE IF EXISTS t2; +DROP TABLE IF EXISTS t3; + +CREATE TABLE t3 +( + a Float64 STATISTICS(tdigest), + b Int64 STATISTICS(tdigest), + c Nullable(Int64) STATISTICS(tdigest, uniq), + pk String, +) Engine = MergeTree() ORDER BY pk +SETTINGS min_bytes_for_wide_part = 0; +INSERT INTO t3 select number, -number, number/1000, generateUUIDv4() FROM system.numbers LIMIT 10000; + +DROP TABLE IF EXISTS t3; From 92dfaa7e49944aec4c45af094bfb2f76b6f8e3a5 Mon Sep 17 00:00:00 2001 From: wudidapaopao <664920313@qq.com> Date: Thu, 23 May 2024 15:00:03 +0800 Subject: [PATCH 025/133] Fix unexpected accurateCast from string to integer --- src/DataTypes/IDataType.h | 6 +++ src/Functions/FunctionsConversion.cpp | 47 ++++++++++++++----- .../0_stateless/01601_accurate_cast.reference | 5 ++ .../0_stateless/01601_accurate_cast.sql | 15 ++++++ 4 files changed, 61 insertions(+), 12 deletions(-) diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index 46c30240ef8..85fce671cbb 100644 --- a/src/DataTypes/IDataType.h +++
b/src/DataTypes/IDataType.h @@ -543,6 +543,7 @@ template constexpr bool IsDataTypeNumber = false; template constexpr bool IsDataTypeDateOrDateTime = false; template constexpr bool IsDataTypeDate = false; template constexpr bool IsDataTypeEnum = false; +template constexpr bool IsDataTypeStringOrFixedString = false; template constexpr bool IsDataTypeDecimalOrNumber = IsDataTypeDecimal || IsDataTypeNumber; @@ -556,6 +557,8 @@ class DataTypeDate; class DataTypeDate32; class DataTypeDateTime; class DataTypeDateTime64; +class DataTypeString; +class DataTypeFixedString; template constexpr bool IsDataTypeDecimal> = true; @@ -572,6 +575,9 @@ template <> inline constexpr bool IsDataTypeDateOrDateTime = tru template <> inline constexpr bool IsDataTypeDateOrDateTime = true; template <> inline constexpr bool IsDataTypeDateOrDateTime = true; +template <> inline constexpr bool IsDataTypeStringOrFixedString = true; +template <> inline constexpr bool IsDataTypeStringOrFixedString = true; + template class DataTypeEnum; diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 44d0b750af9..2a0b2f1d075 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -709,7 +709,7 @@ bool tryParseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateL else return tryReadFloatTextFast(x, rb); } - else /*if constexpr (is_integer_v)*/ + else /*if constexpr (is_integral_v)*/ return tryReadIntText(x, rb); } @@ -814,6 +814,16 @@ enum class ConvertFromStringParsingMode : uint8_t BestEffortUS }; +struct AccurateConvertStrategyAdditions +{ + UInt32 scale { 0 }; +}; + +struct AccurateOrNullConvertStrategyAdditions +{ + UInt32 scale { 0 }; +}; + template struct ConvertThroughParsing @@ -1020,7 +1030,13 @@ struct ConvertThroughParsing break; } } - parseImpl(vec_to[i], read_buffer, local_time_zone, precise_float_parsing); + if constexpr (std::is_same_v) + { + if (!tryParseImpl(vec_to[i], read_buffer, local_time_zone, precise_float_parsing)) + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "Cannot parse string to type {}", TypeName); + } + else + parseImpl(vec_to[i], read_buffer, local_time_zone, precise_float_parsing); } while (false); } } @@ -1120,16 +1136,6 @@ struct ConvertThroughParsing /// Function toUnixTimestamp has exactly the same implementation as toDateTime of String type. 
struct NameToUnixTimestamp { static constexpr auto name = "toUnixTimestamp"; }; -struct AccurateConvertStrategyAdditions -{ - UInt32 scale { 0 }; -}; - -struct AccurateOrNullConvertStrategyAdditions -{ - UInt32 scale { 0 }; -}; - enum class BehaviourOnErrorFromString : uint8_t { ConvertDefaultBehaviorTag, @@ -3174,8 +3180,11 @@ private: { TypeIndex from_type_index = from_type->getTypeId(); WhichDataType which(from_type_index); + TypeIndex to_type_index = to_type->getTypeId(); + WhichDataType to(to_type_index); bool can_apply_accurate_cast = (cast_type == CastType::accurate || cast_type == CastType::accurateOrNull) && (which.isInt() || which.isUInt() || which.isFloat()); + can_apply_accurate_cast |= cast_type == CastType::accurate && which.isStringOrFixedString() && to.isNativeInteger(); FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior = default_date_time_overflow_behavior; if (context) @@ -3260,6 +3269,20 @@ private: return true; } } + else if constexpr (IsDataTypeStringOrFixedString) + { + if constexpr (IsDataTypeNumber) + { + chassert(wrapper_cast_type == CastType::accurate); + result_column = ConvertImpl::execute( + arguments, + result_type, + input_rows_count, + BehaviourOnErrorFromString::ConvertDefaultBehaviorTag, + AccurateConvertStrategyAdditions()); + } + return true; + } return false; }); diff --git a/tests/queries/0_stateless/01601_accurate_cast.reference b/tests/queries/0_stateless/01601_accurate_cast.reference index 82138e6354a..6a438c49f13 100644 --- a/tests/queries/0_stateless/01601_accurate_cast.reference +++ b/tests/queries/0_stateless/01601_accurate_cast.reference @@ -4,6 +4,11 @@ 5 5 5 +5 +5 +5 +5 +5 1 12 2023-05-30 14:38:20 diff --git a/tests/queries/0_stateless/01601_accurate_cast.sql b/tests/queries/0_stateless/01601_accurate_cast.sql index 471e4e34a4a..3d418b5a36f 100644 --- a/tests/queries/0_stateless/01601_accurate_cast.sql +++ b/tests/queries/0_stateless/01601_accurate_cast.sql @@ -16,6 +16,21 @@ SELECT accurateCast(-129, 'Int8'); -- { serverError CANNOT_CONVERT_TYPE } SELECT accurateCast(5, 'Int8'); SELECT accurateCast(128, 'Int8'); -- { serverError CANNOT_CONVERT_TYPE } +SELECT accurateCast('-1', 'UInt8'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('5', 'UInt8'); +SELECT accurateCast('257', 'UInt8'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('-1', 'UInt16'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('5', 'UInt16'); +SELECT accurateCast('65536', 'UInt16'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('-1', 'UInt32'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('5', 'UInt32'); +SELECT accurateCast('4294967296', 'UInt32'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('-1', 'UInt64'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('5', 'UInt64'); +SELECT accurateCast('-129', 'Int8'); -- { serverError CANNOT_PARSE_TEXT } +SELECT accurateCast('5', 'Int8'); +SELECT accurateCast('128', 'Int8'); -- { serverError CANNOT_PARSE_TEXT } + SELECT accurateCast(10, 'Decimal32(9)'); -- { serverError DECIMAL_OVERFLOW } SELECT accurateCast(1, 'Decimal32(9)'); SELECT accurateCast(-10, 'Decimal32(9)'); -- { serverError DECIMAL_OVERFLOW }
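Condensing the test changes above: for string sources, `accurateCast` now parses the text and range-checks it against native integer targets, throwing instead of falling back to the default (wrapping) conversion. A short sketch, directly mirroring cases added to 01601_accurate_cast.sql:

``` sql
SELECT accurateCast('5', 'UInt8');    -- returns 5: the value parses and fits the target type
SELECT accurateCast('257', 'UInt8');  -- { serverError CANNOT_PARSE_TEXT }: out of range for UInt8
SELECT accurateCast('-1', 'UInt32');  -- { serverError CANNOT_PARSE_TEXT }: negative value, unsigned target
```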
From dc30cee58fefa1fcd7414f5faa1af97c3f334b45 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Sat, 25 May 2024 18:02:06 +0200 Subject: [PATCH 026/133] refine docs --- docs/en/engines/table-engines/mergetree-family/mergetree.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 081deccdfee..8576ba553dc 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -39,8 +39,8 @@ If you need to update rows frequently, we recommend using the [`ReplacingMergeTr ``` sql CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] ( - name1 [type1] [[NOT] NULL] [DEFAULT|MATERIALIZED|ALIAS|EPHEMERAL expr1] [COMMENT ...] [CODEC(codec1)] [STATISTIC(stat1)] [TTL expr1] [PRIMARY KEY] [SETTINGS (name = value, ...)], - name2 [type2] [[NOT] NULL] [DEFAULT|MATERIALIZED|ALIAS|EPHEMERAL expr2] [COMMENT ...] [CODEC(codec2)] [STATISTIC(stat2)] [TTL expr2] [PRIMARY KEY] [SETTINGS (name = value, ...)], + name1 [type1] [[NOT] NULL] [DEFAULT|MATERIALIZED|ALIAS|EPHEMERAL expr1] [COMMENT ...] [CODEC(codec1)] [STATISTICS(stat1)] [TTL expr1] [PRIMARY KEY] [SETTINGS (name = value, ...)], + name2 [type2] [[NOT] NULL] [DEFAULT|MATERIALIZED|ALIAS|EPHEMERAL expr2] [COMMENT ...] [CODEC(codec2)] [STATISTICS(stat2)] [TTL expr2] [PRIMARY KEY] [SETTINGS (name = value, ...)], ... INDEX index_name1 expr1 TYPE type1(...) [GRANULARITY value1], INDEX index_name2 expr2 TYPE type2(...) [GRANULARITY value2], From 25d974173b1493ae579badbdf558792ccf9b18b4 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Sun, 26 May 2024 11:05:53 +0200 Subject: [PATCH 027/133] fix --- src/Storages/Statistics/UniqStatistics.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/Statistics/UniqStatistics.cpp b/src/Storages/Statistics/UniqStatistics.cpp index 2e455cdff5c..fc748e769ca 100644 --- a/src/Storages/Statistics/UniqStatistics.cpp +++ b/src/Storages/Statistics/UniqStatistics.cpp @@ -47,7 +47,8 @@ void UniqStatistics::update(const ColumnPtr & column) { /// TODO(hanfei): For low cardinality, it's very slow to convert to full column. We can read the dictionary directly. /// Here we intend to avoid crash in CI.
- const IColumn * col_ptr = column->convertToFullColumnIfLowCardinality().get(); - collector->addBatchSinglePlace(0, column->size(), data, &col_ptr, nullptr); + auto col_ptr = column->convertToFullColumnIfLowCardinality(); + collector->addBatchSinglePlace(0, column->size(), data, &(col_ptr.get()), nullptr); } void UniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type) From e939e0a0367f7fd2ba49172ee24c3249f89d70d5 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Sun, 26 May 2024 11:55:27 +0200 Subject: [PATCH 028/133] fix build --- src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp | 6 +++--- src/Storages/MergeTree/IMergeTreeDataPartWriter.h | 2 +- src/Storages/Statistics/UniqStatistics.cpp | 3 ++- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp index 891ba1b9660..6152da78395 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.cpp @@ -119,7 +119,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartCompactWriter( const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, @@ -136,7 +136,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWideWriter( const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, @@ -156,7 +156,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr & virtual_columns, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension_, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, diff --git a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h index f04beb37ebb..d9e9a433827 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPartWriter.h +++ b/src/Storages/MergeTree/IMergeTreeDataPartWriter.h @@ -84,7 +84,7 @@ MergeTreeDataPartWriterPtr createMergeTreeDataPartWriter( const StorageMetadataPtr & metadata_snapshot, const VirtualsDescriptionPtr & virtual_columns_, const std::vector & indices_to_recalc, - const Statistics & stats_to_recalc_, + const ColumnsStatistics & stats_to_recalc_, const String & marks_file_extension, const CompressionCodecPtr & default_codec_, const MergeTreeWriterSettings & writer_settings, diff --git a/src/Storages/Statistics/UniqStatistics.cpp b/src/Storages/Statistics/UniqStatistics.cpp index 2e455cdff5c..fc748e769ca 100644 --- a/src/Storages/Statistics/UniqStatistics.cpp +++ b/src/Storages/Statistics/UniqStatistics.cpp @@ -47,7 +47,8 @@ void UniqStatistics::update(const ColumnPtr & column) /// TODO(hanfei): For low cardinality, it's very slow to convert to full column. We can read the dictionary directly. /// Here we intend to avoid crash in CI. 
auto col_ptr = column->convertToFullColumnIfLowCardinality(); - collector->addBatchSinglePlace(0, column->size(), data, &(col_ptr.get()), nullptr); + const IColumn * raw_ptr = col_ptr.get(); + collector->addBatchSinglePlace(0, column->size(), data, &(raw_ptr), nullptr); } void UniqValidator(const SingleStatisticsDescription &, DataTypePtr data_type) From cea82aab5970eeddad04cbabc27407c0c1dc0ff9 Mon Sep 17 00:00:00 2001 From: serxa Date: Sun, 26 May 2024 20:43:49 +0000 Subject: [PATCH 029/133] add dynamic untracked memory limits for more precise memory tracking --- src/Common/CurrentMemoryTracker.cpp | 9 +++++++++ src/Common/CurrentMemoryTracker.h | 2 ++ src/Common/ThreadStatus.h | 12 ++++++++++++ src/Core/Settings.h | 1 + src/Interpreters/ThreadStatusExt.cpp | 10 +++++++--- 5 files changed, 31 insertions(+), 3 deletions(-) diff --git a/src/Common/CurrentMemoryTracker.cpp b/src/Common/CurrentMemoryTracker.cpp index 02c7dc6e224..6166119eccf 100644 --- a/src/Common/CurrentMemoryTracker.cpp +++ b/src/Common/CurrentMemoryTracker.cpp @@ -57,6 +57,7 @@ AllocationTrace CurrentMemoryTracker::allocImpl(Int64 size, bool throw_if_memory { auto res = memory_tracker->allocImpl(will_be, throw_if_memory_exceeded); current_thread->untracked_memory = 0; + current_thread->updateUntrackedMemoryLimit(memory_tracker->get()); return res; } else @@ -84,6 +85,13 @@ void CurrentMemoryTracker::check() std::ignore = memory_tracker->allocImpl(0, true); } +Int64 CurrentMemoryTracker::get() +{ + if (auto * memory_tracker = getMemoryTracker()) + return memory_tracker->get(); + return 0; +} + AllocationTrace CurrentMemoryTracker::alloc(Int64 size) { bool throw_if_memory_exceeded = true; @@ -107,6 +115,7 @@ AllocationTrace CurrentMemoryTracker::free(Int64 size) { Int64 untracked_memory = current_thread->untracked_memory; current_thread->untracked_memory = 0; + current_thread->updateUntrackedMemoryLimit(memory_tracker->get() + untracked_memory); return memory_tracker->free(-untracked_memory); } } diff --git a/src/Common/CurrentMemoryTracker.h b/src/Common/CurrentMemoryTracker.h index 18a1e3f49b1..401eeed93dd 100644 --- a/src/Common/CurrentMemoryTracker.h +++ b/src/Common/CurrentMemoryTracker.h @@ -12,7 +12,9 @@ struct CurrentMemoryTracker /// This function should be called after memory deallocation. [[nodiscard]] static AllocationTrace free(Int64 size); + static void check(); + [[nodiscard]] static Int64 get(); /// Throws MEMORY_LIMIT_EXCEEDED (if it's allowed to throw exceptions) static void injectFault(); diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h index 0c02ab8fdb0..04fb568540b 100644 --- a/src/Common/ThreadStatus.h +++ b/src/Common/ThreadStatus.h @@ -183,6 +183,12 @@ public: Int64 untracked_memory = 0; /// Each thread could new/delete memory in range of (-untracked_memory_limit, untracked_memory_limit) without access to common counters. Int64 untracked_memory_limit = 4 * 1024 * 1024; + /// To keep total untracked memory limited to `untracked_memory_ratio * RSS` we have to account threads with small and large memory footprint differently. 
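+    /// E.g. with untracked_memory_ratio = 1/16 a thread that currently tracks 64 MiB may
+    /// batch up to 4 MiB of allocations locally before touching the shared counters, while
+    /// a nearly idle thread is clamped down to the lower bound and stays precisely tracked.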
+    /// For this purpose we dynamically change `untracked_memory_limit` after every tracking event using a simple formula:
+    /// untracked_memory_limit = clamp(untracked_memory_ratio * cur_memory_bytes, min_untracked_memory, max_untracked_memory)
+    /// Note that these values are updated when the thread is attached to a group
+    Int64 min_untracked_memory = 4 * 1024 * 1024;
+    Int64 max_untracked_memory = 4 * 1024;

     /// Statistics of read and write rows/bytes
     Progress progress_in;
@@ -309,6 +315,12 @@ public:

     void initGlobalProfiler(UInt64 global_profiler_real_time_period, UInt64 global_profiler_cpu_time_period);

+    void updateUntrackedMemoryLimit(Int64 current)
+    {
+        constexpr Int64 untracked_memory_ratio_bits = 4; // untracked_memory_ratio = 1.0 / (1 << untracked_memory_ratio_bits) = 1.0 / 16 = 6.25%
+        untracked_memory_limit = std::clamp(current >> untracked_memory_ratio_bits, min_untracked_memory, max_untracked_memory);
+    }
+
 private:
     void applyGlobalSettings();
     void applyQuerySettings();
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index f0389e7e2d5..28b068b9e37 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -489,6 +489,7 @@ class IColumn;
     M(UInt64, max_memory_usage_for_user, 0, "Maximum memory usage for processing all concurrently running queries for the user. Zero means unlimited.", 0) \
     M(UInt64, memory_overcommit_ratio_denominator_for_user, 1_GiB, "It represents soft memory limit on the global level. This value is used to compute query overcommit ratio.", 0) \
     M(UInt64, max_untracked_memory, (4 * 1024 * 1024), "Small allocations and deallocations are grouped in thread local variable and tracked or profiled only when amount (in absolute value) becomes larger than specified value. If the value is higher than 'memory_profiler_step' it will be effectively lowered to 'memory_profiler_step'.", 0) \
+    M(UInt64, min_untracked_memory, (4 * 1024), "Lower bound for untracked memory limit which is applied to threads with low memory consumption. Untracked memory limit equals thread_memory_usage/16 and is clamped between min_untracked_memory and max_untracked_memory for every thread.", 0) \
     M(UInt64, memory_profiler_step, (4 * 1024 * 1024), "Whenever query memory usage becomes larger than every next step in number of bytes the memory profiler will collect the allocating stack trace. Zero means disabled memory profiler. Values lower than a few megabytes will slow down query processing.", 0) \
     M(Float, memory_profiler_sample_probability, 0., "Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type. The probability is for every alloc/free regardless to the size of the allocation (can be changed with `memory_profiler_sample_min_allocation_size` and `memory_profiler_sample_max_allocation_size`). Note that sampling happens only when the amount of untracked memory exceeds 'max_untracked_memory'. You may want to set 'max_untracked_memory' to 0 for extra fine grained sampling.", 0) \
     M(UInt64, memory_profiler_sample_min_allocation_size, 0, "Collect random allocations of size greater or equal than specified value with probability equal to `memory_profiler_sample_probability`. 0 means disabled. You may want to set 'max_untracked_memory' to 0 to make this threshold to work as expected.", 0) \
diff --git a/src/Interpreters/ThreadStatusExt.cpp b/src/Interpreters/ThreadStatusExt.cpp
index 9ca521a4ab3..981c7d45d8e 100644
--- a/src/Interpreters/ThreadStatusExt.cpp
+++ b/src/Interpreters/ThreadStatusExt.cpp
@@ -11,6 +11,7 @@
 #include
 #include
 #include
+#include <Common/CurrentMemoryTracker.h>
 #include
 #include
 #include
@@ -210,9 +211,12 @@ void ThreadStatus::applyQuerySettings()
     query_id_from_query_context = query_context_ptr->getCurrentQueryId();
     initQueryProfiler();

-    untracked_memory_limit = settings.max_untracked_memory;
-    if (settings.memory_profiler_step && settings.memory_profiler_step < static_cast<UInt64>(untracked_memory_limit))
-        untracked_memory_limit = settings.memory_profiler_step;
+    max_untracked_memory = settings.max_untracked_memory;
+    if (settings.memory_profiler_step && settings.memory_profiler_step < static_cast<UInt64>(max_untracked_memory))
+        max_untracked_memory = settings.memory_profiler_step;
+    min_untracked_memory = std::min<Int64>(settings.min_untracked_memory, max_untracked_memory);
+
+    updateUntrackedMemoryLimit(CurrentMemoryTracker::get());

 #if defined(OS_LINUX)
     /// Set "nice" value if required.

From e0c8ae8f4baf9a9571aaa02e7d8a06610cf91d9e Mon Sep 17 00:00:00 2001
From: serxa
Date: Mon, 27 May 2024 10:44:14 +0000
Subject: [PATCH 030/133] fix tests

---
 src/Core/SettingsChangesHistory.h                             | 1 +
 tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h
index 16f28d94640..4c087060179 100644
--- a/src/Core/SettingsChangesHistory.h
+++ b/src/Core/SettingsChangesHistory.h
@@ -106,6 +106,7 @@ static std::map sett
     {"cast_string_to_dynamic_use_inference", false, false, "Add setting to allow converting String to Dynamic through parsing"},
     {"allow_experimental_dynamic_type", false, false, "Add new experimental Dynamic type"},
     {"azure_max_blocks_in_multipart_upload", 50000, 50000, "Maximum number of blocks in multipart upload for Azure."},
+    {"min_untracked_memory", 4_MiB, 4_KiB, "A new setting."},
 }},
 {"24.4", {{"input_format_json_throw_on_bad_escape_sequence", true, true, "Allow to save JSON strings with bad escape sequences"},
 {"max_parsing_threads", 0, 0, "Add a separate setting to control number of threads in parallel parsing from files"},
diff --git a/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql b/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql
index 69bd15e3f54..68472a93c9c 100644
--- a/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql
+++ b/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql
@@ -7,7 +7,8 @@
 -- sizeof(HLL) is (2^K * 6 / 8)
 -- hence max_memory_usage for 100 rows = (96<<10)*100 = 9830400

-SET use_uncompressed_cache = 0;
+SET use_uncompressed_cache = 0;
+SET min_untracked_memory = 4194304; -- 4MiB

 -- HashTable for UInt32 (used until (1<<13) elements), hence 8192 elements
 SELECT 'UInt32';

From 54735e6292ebbce528a4a0681d294ac56c71cbb5 Mon Sep 17 00:00:00 2001
From: serxa
Date: Mon, 27 May 2024 17:52:09 +0000
Subject: [PATCH 031/133] fix

---
 src/Common/ThreadStatus.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h
index 04fb568540b..49594116b91 100644
--- a/src/Common/ThreadStatus.h
+++ b/src/Common/ThreadStatus.h
@@ -187,8 +187,8 @@ public:
     /// For this purpose we dynamically change `untracked_memory_limit` after every tracking event using a simple formula:
     /// untracked_memory_limit = clamp(untracked_memory_ratio * cur_memory_bytes, min_untracked_memory, max_untracked_memory)
     /// Note that these values are updated when the thread is attached to a group
-    Int64 min_untracked_memory = 4 * 1024 * 1024;
-    Int64 max_untracked_memory = 4 * 1024;
+    Int64 min_untracked_memory = 4 * 1024;
+    Int64 max_untracked_memory = 4 * 1024 * 1024;

     /// Statistics of read and write rows/bytes
     Progress progress_in;

From c973addee64c4dba156ad6ea741afdf97e8a46cd Mon Sep 17 00:00:00 2001
From: serxa
Date: Mon, 27 May 2024 19:13:56 +0000
Subject: [PATCH 032/133] disable precise memory tracking for some tests

---
 tests/integration/test_settings_constraints_distributed/test.py | 2 +-
 .../0_stateless/03030_system_flush_distributed_settings.sql     | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/integration/test_settings_constraints_distributed/test.py b/tests/integration/test_settings_constraints_distributed/test.py
index fbebbac276e..295347192bd 100644
--- a/tests/integration/test_settings_constraints_distributed/test.py
+++ b/tests/integration/test_settings_constraints_distributed/test.py
@@ -136,7 +136,7 @@ def test_select_clamps_settings():
     )

     assert (
-        distributed.query(query, settings={"max_memory_usage": 1})
+        distributed.query(query, settings={"max_memory_usage": 1, "min_untracked_memory": 4194304})
         == "node1\tmax_memory_usage\t11111111\n"
         "node1\treadonly\t0\n"
         "node2\tmax_memory_usage\t0\n"
diff --git a/tests/queries/0_stateless/03030_system_flush_distributed_settings.sql b/tests/queries/0_stateless/03030_system_flush_distributed_settings.sql
index da2a387e07c..e8a3da174a6 100644
--- a/tests/queries/0_stateless/03030_system_flush_distributed_settings.sql
+++ b/tests/queries/0_stateless/03030_system_flush_distributed_settings.sql
@@ -13,6 +13,8 @@ create table dist_out as data engine=Distributed(test_shard_localhost, currentDa

 set prefer_localhost_replica=0;

+set min_untracked_memory='4Mi' -- Disable precise memory tracking
+
 insert into dist_in select number/100, number from system.numbers limit 1e6 settings max_memory_usage='20Mi';
 system flush distributed dist_in; -- { serverError MEMORY_LIMIT_EXCEEDED }
 system flush distributed dist_in settings max_memory_usage=0;

From 18dce4169f1b3a3692f4975fb688a3b137b547c4 Mon Sep 17 00:00:00 2001
From: robot-clickhouse
Date: Mon, 27 May 2024 19:22:24 +0000
Subject: [PATCH 033/133] Automatic style fix

---
 .../integration/test_settings_constraints_distributed/test.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/integration/test_settings_constraints_distributed/test.py b/tests/integration/test_settings_constraints_distributed/test.py
index 295347192bd..a1f44af1069 100644
--- a/tests/integration/test_settings_constraints_distributed/test.py
+++ b/tests/integration/test_settings_constraints_distributed/test.py
@@ -136,7 +136,9 @@ def test_select_clamps_settings():
     )

     assert (
-        distributed.query(query, settings={"max_memory_usage": 1, "min_untracked_memory": 4194304})
+        distributed.query(
+            query, settings={"max_memory_usage": 1, "min_untracked_memory": 4194304}
+        )
         == "node1\tmax_memory_usage\t11111111\n"
         "node1\treadonly\t0\n"
         "node2\tmax_memory_usage\t0\n"

From 1c9f4da6b081832c61842beb2a40c209beb2e5b7 Mon Sep 17 00:00:00 2001
From: serxa
Date: Tue, 28 May 2024 11:16:32 +0000
Subject: [PATCH 034/133] turn off dynamic untracked memory limit for not-attached threads (clients and tests)

---
 src/Common/ThreadStatus.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h
index 49594116b91..db4854da707 100644
--- a/src/Common/ThreadStatus.h
+++ b/src/Common/ThreadStatus.h
@@ -187,7 +187,7 @@ public:
     /// For this purpose we dynamically change `untracked_memory_limit` after every tracking event using a simple formula:
     /// untracked_memory_limit = clamp(untracked_memory_ratio * cur_memory_bytes, min_untracked_memory, max_untracked_memory)
     /// Note that these values are updated when the thread is attached to a group
-    Int64 min_untracked_memory = 4 * 1024;
+    Int64 min_untracked_memory = 4 * 1024 * 1024; // Default value is kept at 4MB mostly for tests and clients (should be changed to 4KB as the default value of the setting)
     Int64 max_untracked_memory = 4 * 1024 * 1024;

     /// Statistics of read and write rows/bytes

From d07c6461e2d480cad7d95aeceed070f78d42bfc5 Mon Sep 17 00:00:00 2001
From: serxa
Date: Tue, 28 May 2024 14:17:33 +0000
Subject: [PATCH 035/133] fix syntax error

---
 .../0_stateless/03030_system_flush_distributed_settings.sql | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/queries/0_stateless/03030_system_flush_distributed_settings.sql b/tests/queries/0_stateless/03030_system_flush_distributed_settings.sql
index e8a3da174a6..7961444dbc2 100644
--- a/tests/queries/0_stateless/03030_system_flush_distributed_settings.sql
+++ b/tests/queries/0_stateless/03030_system_flush_distributed_settings.sql
@@ -13,7 +13,7 @@ create table dist_out as data engine=Distributed(test_shard_localhost, currentDa

 set prefer_localhost_replica=0;

-set min_untracked_memory='4Mi' -- Disable precise memory tracking
+set min_untracked_memory='4Mi'; -- Disable precise memory tracking

 insert into dist_in select number/100, number from system.numbers limit 1e6 settings max_memory_usage='20Mi';
 system flush distributed dist_in; -- { serverError MEMORY_LIMIT_EXCEEDED }

From 28e71af95cb4008ce791dceaf381d84e32d716e5 Mon Sep 17 00:00:00 2001
From: serxa
Date: Wed, 29 May 2024 14:55:32 +0000
Subject: [PATCH 036/133] disable precise memory tracking to avoid memory_exceeded exception in test

---
 tests/integration/test_failed_async_inserts/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/test_failed_async_inserts/test.py b/tests/integration/test_failed_async_inserts/test.py
index ecb506c36bc..3a6159107ac 100644
--- a/tests/integration/test_failed_async_inserts/test.py
+++ b/tests/integration/test_failed_async_inserts/test.py
@@ -46,7 +46,7 @@ def test_failed_async_inserts(started_cluster):
     )

     select_query = (
-        "SELECT value FROM system.events WHERE event == 'FailedAsyncInsertQuery'"
+        "SELECT value FROM system.events WHERE event == 'FailedAsyncInsertQuery' SETTINGS min_untracked_memory = 4194304"
     )

     assert node.query(select_query) == "4\n"

From c083896c590d547e4ed3649259d4ef4b00fd91d0 Mon Sep 17 00:00:00 2001
From: robot-clickhouse
Date: Wed, 29 May 2024 15:06:31 +0000
Subject: [PATCH 037/133] Automatic style fix

---
 tests/integration/test_failed_async_inserts/test.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tests/integration/test_failed_async_inserts/test.py b/tests/integration/test_failed_async_inserts/test.py
index 3a6159107ac..2bb56b250ea 100644
--- a/tests/integration/test_failed_async_inserts/test.py
+++ b/tests/integration/test_failed_async_inserts/test.py
@@ -45,9 +45,7 @@ def test_failed_async_inserts(started_cluster):
         ignore_error=True,
     )

-    select_query = (
-        "SELECT value FROM system.events WHERE event == 'FailedAsyncInsertQuery'
SETTINGS min_untracked_memory = 4194304" - ) + select_query = "SELECT value FROM system.events WHERE event == 'FailedAsyncInsertQuery' SETTINGS min_untracked_memory = 4194304" assert node.query(select_query) == "4\n" From 2843aaaf24046bdfe81a9683a6adbc57bf0c9882 Mon Sep 17 00:00:00 2001 From: Thom O'Connor Date: Wed, 29 May 2024 18:21:57 -0600 Subject: [PATCH 038/133] Updated Advanced Dashboard for both open-source and ClickHouse Cloud versions to include a chart for 'Maximum concurrent network connections' --- .../System/StorageSystemDashboards.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/Storages/System/StorageSystemDashboards.cpp b/src/Storages/System/StorageSystemDashboards.cpp index 9682fbc74a1..0e92769764c 100644 --- a/src/Storages/System/StorageSystemDashboards.cpp +++ b/src/Storages/System/StorageSystemDashboards.cpp @@ -212,6 +212,20 @@ FROM merge('system', '^asynchronous_metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} AND metric = 'MaxPartCountForPartition' GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} +)EOQ") } + }, + { + { "dashboard", "Overview" }, + { "title", "Maximum concurrent network connections" }, + { "query", trim(R"EOQ( +SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(TCP_Connections), max(MySQL_Connections), max(HTTP_Connections) +FROM ( +SELECT event_time, sum(CurrentMetric_TCPConnection) AS TCP_Connections, sum(CurrentMetric_MySQLConnection) AS MySQL_Connections, sum(CurrentMetric_HTTPConnection) AS HTTP_Connections +FROM merge('system', '^metric_log') +WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} +GROUP BY event_time) +GROUP BY t +ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } }, /// Default dashboard for ClickHouse Cloud @@ -349,6 +363,11 @@ ORDER BY t WITH FILL STEP {rounding:UInt32} { "dashboard", "Cloud overview" }, { "title", "Network send bytes/sec" }, { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, avg(value)\nFROM (\n SELECT event_time, sum(value) AS value\n FROM clusterAllReplicas(default, merge('system', '^asynchronous_metric_log'))\n WHERE event_date >= toDate(now() - {seconds:UInt32})\n AND event_time >= now() - {seconds:UInt32}\n AND metric LIKE 'NetworkSendBytes%'\n GROUP BY event_time)\nGROUP BY t\nORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + }, + { + { "dashboard", "Cloud overview" }, + { "title", "Maximum concurrent network connections" }, + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(TCP_Connections), max(MySQL_Connections), max(HTTP_Connections) FROM ( SELECT event_time, sum(CurrentMetric_TCPConnection) AS TCP_Connections, sum(CurrentMetric_MySQLConnection) AS MySQL_Connections, sum(CurrentMetric_HTTPConnection) AS HTTP_Connections FROM clusterAllReplicas(default, merge('system', '^metric_log')) WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY event_time) GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } } }; From 47b45fdc1fc8521ad91a69677b1cb398771b2bfb Mon Sep 17 00:00:00 2001 From: serxa Date: Thu, 30 May 2024 16:46:13 +0000 Subject: [PATCH 039/133] add hysteresis of untracked memory --- src/Common/CurrentMemoryTracker.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/src/Common/CurrentMemoryTracker.cpp b/src/Common/CurrentMemoryTracker.cpp index 6166119eccf..b1dcded0b23 100644 --- a/src/Common/CurrentMemoryTracker.cpp +++ b/src/Common/CurrentMemoryTracker.cpp @@ -111,7 +111,8 @@ AllocationTrace CurrentMemoryTracker::free(Int64 size) if (current_thread) { current_thread->untracked_memory -= size; - if (current_thread->untracked_memory < -current_thread->untracked_memory_limit) + // Note that we use `max_untracked_memory` and not `untracked_memory_limit` to create hysteresis to avoid track/untrack cycles + if (current_thread->untracked_memory < -current_thread->max_untracked_memory) { Int64 untracked_memory = current_thread->untracked_memory; current_thread->untracked_memory = 0; From 6de079e10cbf8e2510dbe6cd45c8c84d40e70609 Mon Sep 17 00:00:00 2001 From: Thom O'Connor Date: Thu, 30 May 2024 18:00:03 -0600 Subject: [PATCH 040/133] Minor update: modified 'Maximum concurrent network connections' to 'Concurrent network connections' --- src/Storages/System/StorageSystemDashboards.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/System/StorageSystemDashboards.cpp b/src/Storages/System/StorageSystemDashboards.cpp index 0e92769764c..57f84e09857 100644 --- a/src/Storages/System/StorageSystemDashboards.cpp +++ b/src/Storages/System/StorageSystemDashboards.cpp @@ -216,7 +216,7 @@ ORDER BY t WITH FILL STEP {rounding:UInt32} }, { { "dashboard", "Overview" }, - { "title", "Maximum concurrent network connections" }, + { "title", "Concurrent network connections" }, { "query", trim(R"EOQ( SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(TCP_Connections), max(MySQL_Connections), max(HTTP_Connections) FROM ( @@ -366,7 +366,7 @@ ORDER BY t WITH FILL STEP {rounding:UInt32} }, { { "dashboard", "Cloud overview" }, - { "title", "Maximum concurrent network connections" }, + { "title", "Concurrent network connections" }, { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(TCP_Connections), max(MySQL_Connections), max(HTTP_Connections) FROM ( SELECT event_time, sum(CurrentMetric_TCPConnection) AS TCP_Connections, sum(CurrentMetric_MySQLConnection) AS MySQL_Connections, sum(CurrentMetric_HTTPConnection) AS HTTP_Connections FROM clusterAllReplicas(default, merge('system', '^metric_log')) WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY event_time) GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } } }; From 3f74783302f545971b0ec7bfec954e91209dc0b6 Mon Sep 17 00:00:00 2001 From: serxa Date: Fri, 31 May 2024 09:11:58 +0000 Subject: [PATCH 041/133] adjust settings history changes --- src/Core/SettingsChangesHistory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 4c087060179..ecb4960a06a 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -92,6 +92,7 @@ static std::map sett {"hdfs_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in HDFS table engine"}, {"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"}, {"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of 
throwing an exception in S3 table engine"}, + {"min_untracked_memory", 4_MiB, 4_KiB, "A new setting."}, }}, {"24.5", {{"allow_deprecated_functions", true, false, "Allow usage of deprecated functions"}, {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y."}, @@ -106,7 +107,6 @@ static std::map sett {"cast_string_to_dynamic_use_inference", false, false, "Add setting to allow converting String to Dynamic through parsing"}, {"allow_experimental_dynamic_type", false, false, "Add new experimental Dynamic type"}, {"azure_max_blocks_in_multipart_upload", 50000, 50000, "Maximum number of blocks in multipart upload for Azure."}, - {"min_untracked_memory", 4_MiB, 4_KiB, "A new setting."}, }}, {"24.4", {{"input_format_json_throw_on_bad_escape_sequence", true, true, "Allow to save JSON strings with bad escape sequences"}, {"max_parsing_threads", 0, 0, "Add a separate setting to control number of threads in parallel parsing from files"}, From 6a8adb6d487db7789f2c2f4f72103cb5e14b2281 Mon Sep 17 00:00:00 2001 From: Blargian Date: Mon, 3 Jun 2024 16:34:53 +0200 Subject: [PATCH 042/133] Refactor change and add failing test case --- src/Common/StringUtils.h | 12 ++++++++++++ src/Common/UTF8Helpers.cpp | 18 +++++++++++++++--- .../03142_skip_ANSI_in_UTF8_compute_width.sql | 7 ++++++- 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/src/Common/StringUtils.h b/src/Common/StringUtils.h index fe5fc3c058f..e4c7ab3e80c 100644 --- a/src/Common/StringUtils.h +++ b/src/Common/StringUtils.h @@ -140,6 +140,18 @@ inline bool isPrintableASCII(char c) return uc >= 32 && uc <= 126; /// 127 is ASCII DEL. } +inline bool isCSIParameterByte(char c) +{ + uint8_t uc = c; + return uc >= 0x30 && uc <= 0x3F; /// ASCII 0–9:;<=>? 
+} + +inline bool isCSIIntermediateByte(char c) +{ + uint8_t uc = c; + return uc >= 0x20 && uc <= 0x2F; /// ASCII !"#$%&'()*+,-./ +} + inline bool isCSIFinalByte(char c) { uint8_t uc = c; diff --git a/src/Common/UTF8Helpers.cpp b/src/Common/UTF8Helpers.cpp index 8c8c8e8327b..34eba832113 100644 --- a/src/Common/UTF8Helpers.cpp +++ b/src/Common/UTF8Helpers.cpp @@ -147,10 +147,22 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l while (i < size && isPrintableASCII(data[i])) { - if (!isEscapeSequence) + auto isParameterByte = isCSIParameterByte(data[i]); + auto isIntermediateByte = isCSIIntermediateByte(data[i]); + auto ignore_width = isEscapeSequence & (isParameterByte || isIntermediateByte); + + if (ignore_width || (data[i] == '[' && isEscapeSequence)) + { + /// don't count the width + } + else if (isEscapeSequence && isCSIFinalByte(data[i])) + { + isEscapeSequence = false; + } + else + { ++width; - else if (isCSIFinalByte(data[i]) && data[i - 1] != '\x1b') - isEscapeSequence = false; /// end of CSI escape sequence reached + } ++i; } diff --git a/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql b/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql index e37b0db08e9..f4b0bfe5888 100644 --- a/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql +++ b/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql @@ -1 +1,6 @@ -SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 0) AS x FORMAT Pretty; +SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x; +SELECT format('\x1b[38;2;{0};{1};{2}m█ test \x1b[0m', 255, 128, 128) AS x; +SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m test', 255, 128, 128) AS x; +SELECT format('test \x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x; +SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m test \x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x; +SELECT visibleWidth('0};{1};{2}m█'); \ No newline at end of file From acfe2876b57aa4766e15df4a955991c19eb9dc8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 3 Jun 2024 21:06:02 +0200 Subject: [PATCH 043/133] Fix OrderByLimitByDuplicateEliminationVisitor across subqueries --- ...OrderByLimitByDuplicateEliminationPass.cpp | 7 +--- .../03165_order_by_duplicate.reference | 39 +++++++++++++++++++ .../0_stateless/03165_order_by_duplicate.sql | 16 ++++++++ 3 files changed, 57 insertions(+), 5 deletions(-) create mode 100644 tests/queries/0_stateless/03165_order_by_duplicate.reference create mode 100644 tests/queries/0_stateless/03165_order_by_duplicate.sql diff --git a/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp b/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp index 26ca5984b49..15919c4a2fe 100644 --- a/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp +++ b/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp @@ -22,6 +22,7 @@ public: if (query_node->hasOrderBy()) { + QueryTreeNodeConstRawPtrWithHashSet unique_expressions_nodes_set; QueryTreeNodes result_nodes; auto & query_order_by_nodes = query_node->getOrderBy().getNodes(); @@ -45,10 +46,9 @@ public: query_order_by_nodes = std::move(result_nodes); } - unique_expressions_nodes_set.clear(); - if (query_node->hasLimitBy()) { + QueryTreeNodeConstRawPtrWithHashSet unique_expressions_nodes_set; QueryTreeNodes result_nodes; auto & query_limit_by_nodes = query_node->getLimitBy().getNodes(); @@ -63,9 +63,6 @@ public: query_limit_by_nodes = std::move(result_nodes); } } - -private: - 
QueryTreeNodeConstRawPtrWithHashSet unique_expressions_nodes_set; }; } diff --git a/tests/queries/0_stateless/03165_order_by_duplicate.reference b/tests/queries/0_stateless/03165_order_by_duplicate.reference new file mode 100644 index 00000000000..5d5e7a33f4a --- /dev/null +++ b/tests/queries/0_stateless/03165_order_by_duplicate.reference @@ -0,0 +1,39 @@ +QUERY id: 0 + PROJECTION COLUMNS + id UInt64 + PROJECTION + LIST id: 1, nodes: 1 + COLUMN id: 2, column_name: id, result_type: UInt64, source_id: 3 + JOIN TREE + TABLE id: 3, alias: __table1, table_name: default.test, final: 1 + WHERE + FUNCTION id: 4, function_name: in, function_type: ordinary, result_type: UInt8 + ARGUMENTS + LIST id: 5, nodes: 2 + COLUMN id: 2, column_name: id, result_type: UInt64, source_id: 3 + QUERY id: 6, is_subquery: 1, is_distinct: 1 + PROJECTION COLUMNS + id UInt64 + PROJECTION + LIST id: 7, nodes: 1 + COLUMN id: 8, column_name: id, result_type: UInt64, source_id: 9 + JOIN TREE + TABLE id: 9, alias: __table1, table_name: default.test, final: 1 + ORDER BY + LIST id: 10, nodes: 1 + SORT id: 11, sort_direction: ASCENDING, with_fill: 0 + EXPRESSION + COLUMN id: 8, column_name: id, result_type: UInt64, source_id: 9 + LIMIT + CONSTANT id: 12, constant_value: UInt64_4, constant_value_type: UInt64 + ORDER BY + LIST id: 13, nodes: 1 + SORT id: 14, sort_direction: ASCENDING, with_fill: 0 + EXPRESSION + COLUMN id: 2, column_name: id, result_type: UInt64, source_id: 3 + LIMIT BY LIMIT + CONSTANT id: 15, constant_value: UInt64_1, constant_value_type: UInt64 + LIMIT BY + LIST id: 16, nodes: 1 + COLUMN id: 2, column_name: id, result_type: UInt64, source_id: 3 + SETTINGS allow_experimental_analyzer=1 diff --git a/tests/queries/0_stateless/03165_order_by_duplicate.sql b/tests/queries/0_stateless/03165_order_by_duplicate.sql new file mode 100644 index 00000000000..0054cbc36a6 --- /dev/null +++ b/tests/queries/0_stateless/03165_order_by_duplicate.sql @@ -0,0 +1,16 @@ +CREATE TABLE test +ENGINE = ReplacingMergeTree +PRIMARY KEY id +AS SELECT number AS id FROM numbers(100); + +EXPLAIN QUERY TREE SELECT id +FROM test FINAL +WHERE id IN ( + SELECT DISTINCT id + FROM test FINAL + ORDER BY id ASC + LIMIT 4 +) +ORDER BY id ASC +LIMIT 1 BY id +SETTINGS allow_experimental_analyzer = 1; From 62aacc5539f4ba286d4a39905d00433fbba94390 Mon Sep 17 00:00:00 2001 From: pufit Date: Mon, 3 Jun 2024 18:43:08 -0400 Subject: [PATCH 044/133] Fix default database with grant on cluster --- src/Interpreters/Access/InterpreterGrantQuery.cpp | 9 +++++---- .../integration/test_access_control_on_cluster/test.py | 10 ++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/src/Interpreters/Access/InterpreterGrantQuery.cpp b/src/Interpreters/Access/InterpreterGrantQuery.cpp index a137404a669..6ad32ae5a31 100644 --- a/src/Interpreters/Access/InterpreterGrantQuery.cpp +++ b/src/Interpreters/Access/InterpreterGrantQuery.cpp @@ -438,6 +438,11 @@ BlockIO InterpreterGrantQuery::execute() RolesOrUsersSet roles_to_revoke; collectRolesToGrantOrRevoke(access_control, query, roles_to_grant, roles_to_revoke); + /// Check if the current user has corresponding access rights granted with grant option. + String current_database = getContext()->getCurrentDatabase(); + elements_to_grant.replaceEmptyDatabase(current_database); + elements_to_revoke.replaceEmptyDatabase(current_database); + /// Executing on cluster. 
if (!query.cluster.empty()) { @@ -452,10 +457,6 @@ BlockIO InterpreterGrantQuery::execute() return executeDDLQueryOnCluster(updated_query, getContext(), params); } - /// Check if the current user has corresponding access rights granted with grant option. - String current_database = getContext()->getCurrentDatabase(); - elements_to_grant.replaceEmptyDatabase(current_database); - elements_to_revoke.replaceEmptyDatabase(current_database); bool need_check_grantees_are_allowed = true; if (!query.current_grants) checkGrantOption(access_control, *current_user_access, grantees, need_check_grantees_are_allowed, elements_to_grant, elements_to_revoke); diff --git a/tests/integration/test_access_control_on_cluster/test.py b/tests/integration/test_access_control_on_cluster/test.py index 8dbb87c67d8..87298bcabd8 100644 --- a/tests/integration/test_access_control_on_cluster/test.py +++ b/tests/integration/test_access_control_on_cluster/test.py @@ -74,3 +74,13 @@ def test_grant_all_on_cluster(): assert ch2.query("SHOW GRANTS FOR Alex") == "GRANT ALL ON *.* TO Alex\n" ch1.query("DROP USER Alex ON CLUSTER 'cluster'") + + +def test_grant_current_database_on_cluster(): + ch1.query("CREATE DATABASE user_db ON CLUSTER 'cluster'") + ch1.query("CREATE USER IF NOT EXISTS test_user ON CLUSTER 'cluster' DEFAULT DATABASE user_db") + ch1.query("GRANT SELECT ON user_db.* TO test_user ON CLUSTER 'cluster' WITH GRANT OPTION") + + assert ch1.query("SHOW DATABASES", user="test_user") == "user_db\n" + ch1.query("GRANT SELECT ON * TO test_user ON CLUSTER 'cluster'", user="test_user") + assert ch1.query("SHOW DATABASES", user="test_user") == "user_db\n" From 4aa396d115029ef3fb963bedc2c873749dac24db Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Mon, 3 Jun 2024 22:45:48 +0000 Subject: [PATCH 045/133] Fix assert in IObjectStorageIteratorAsync --- .../ObjectStorageIteratorAsync.cpp | 43 ++++++++++--------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp index 0420de0f8dd..a249789df4b 100644 --- a/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageIteratorAsync.cpp @@ -36,30 +36,24 @@ void IObjectStorageIteratorAsync::deactivate() void IObjectStorageIteratorAsync::nextBatch() { std::lock_guard lock(mutex); + if (is_finished) { current_batch.clear(); current_batch_iterator = current_batch.begin(); + return; } - else - { - if (!is_initialized) - { - outcome_future = scheduleBatch(); - is_initialized = true; - } + if (!is_initialized) + { + outcome_future = scheduleBatch(); + is_initialized = true; + } + + try + { chassert(outcome_future.valid()); - BatchAndHasNext result; - try - { - result = outcome_future.get(); - } - catch (...) - { - is_finished = true; - throw; - } + BatchAndHasNext result = outcome_future.get(); current_batch = std::move(result.batch); current_batch_iterator = current_batch.begin(); @@ -71,6 +65,11 @@ void IObjectStorageIteratorAsync::nextBatch() else is_finished = true; } + catch (...) 
+    {
+        is_finished = true;
+        throw;
+    }
 }

 void IObjectStorageIteratorAsync::next()
@@ -95,35 +94,39 @@ std::future<IObjectStorageIteratorAsync::BatchAndHasNext> IObjectStorageIterator

 bool IObjectStorageIteratorAsync::isValid()
 {
+    std::lock_guard lock(mutex);
+
     if (!is_initialized)
         nextBatch();

-    std::lock_guard lock(mutex);
     return current_batch_iterator != current_batch.end();
 }

 RelativePathWithMetadataPtr IObjectStorageIteratorAsync::current()
 {
+    std::lock_guard lock(mutex);
+
     if (!isValid())
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator");

-    std::lock_guard lock(mutex);
     return *current_batch_iterator;
 }

 RelativePathsWithMetadata IObjectStorageIteratorAsync::currentBatch()
 {
+    std::lock_guard lock(mutex);
+
     if (!isValid())
         throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to access invalid iterator");

-    std::lock_guard lock(mutex);
     return current_batch;
 }

 std::optional<RelativePathsWithMetadata> IObjectStorageIteratorAsync::getCurrentBatchAndScheduleNext()
 {
     std::lock_guard lock(mutex);
+
     if (!is_initialized)
         nextBatch();

From c6108cf8f5b919061b2fe2d5b9730e6d9d119013 Mon Sep 17 00:00:00 2001
From: robot-clickhouse
Date: Mon, 3 Jun 2024 22:55:53 +0000
Subject: [PATCH 046/133] Automatic style fix

---
 tests/integration/test_access_control_on_cluster/test.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/integration/test_access_control_on_cluster/test.py b/tests/integration/test_access_control_on_cluster/test.py
index 87298bcabd8..73112b5deae 100644
--- a/tests/integration/test_access_control_on_cluster/test.py
+++ b/tests/integration/test_access_control_on_cluster/test.py
@@ -78,8 +78,12 @@ def test_grant_all_on_cluster():

 def test_grant_current_database_on_cluster():
     ch1.query("CREATE DATABASE user_db ON CLUSTER 'cluster'")
-    ch1.query("CREATE USER IF NOT EXISTS test_user ON CLUSTER 'cluster' DEFAULT DATABASE user_db")
-    ch1.query("GRANT SELECT ON user_db.* TO test_user ON CLUSTER 'cluster' WITH GRANT OPTION")
+    ch1.query(
+        "CREATE USER IF NOT EXISTS test_user ON CLUSTER 'cluster' DEFAULT DATABASE user_db"
+    )
+    ch1.query(
+        "GRANT SELECT ON user_db.* TO test_user ON CLUSTER 'cluster' WITH GRANT OPTION"
+    )

     assert ch1.query("SHOW DATABASES", user="test_user") == "user_db\n"
     ch1.query("GRANT SELECT ON * TO test_user ON CLUSTER 'cluster'", user="test_user")
     assert ch1.query("SHOW DATABASES", user="test_user") == "user_db\n"

From abdf0d5b5896d87302156199b3fbaeddd32c1d14 Mon Sep 17 00:00:00 2001
From: pufit
Date: Mon, 3 Jun 2024 21:29:08 -0400
Subject: [PATCH 047/133] fix test

---
 tests/integration/test_access_control_on_cluster/test.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/integration/test_access_control_on_cluster/test.py b/tests/integration/test_access_control_on_cluster/test.py
index 73112b5deae..1b480a39768 100644
--- a/tests/integration/test_access_control_on_cluster/test.py
+++ b/tests/integration/test_access_control_on_cluster/test.py
@@ -84,6 +84,7 @@ def test_grant_current_database_on_cluster():
     ch1.query(
         "GRANT SELECT ON user_db.* TO test_user ON CLUSTER 'cluster' WITH GRANT OPTION"
     )
+    ch1.query("GRANT CLUSTER ON * TO test_user ON CLUSTER 'cluster'")

     assert ch1.query("SHOW DATABASES", user="test_user") == "user_db\n"
     ch1.query("GRANT SELECT ON * TO test_user ON CLUSTER 'cluster'", user="test_user")

From e59d71be487378561826d49f48885bf83a27096d Mon Sep 17 00:00:00 2001
From: pufit
Date: Mon, 3 Jun 2024 23:58:39 -0400
Subject: [PATCH 048/133] fix test

---
 tests/integration/test_access_control_on_cluster/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/test_access_control_on_cluster/test.py
b/tests/integration/test_access_control_on_cluster/test.py index 1b480a39768..b12add7ad3f 100644 --- a/tests/integration/test_access_control_on_cluster/test.py +++ b/tests/integration/test_access_control_on_cluster/test.py @@ -84,7 +84,7 @@ def test_grant_current_database_on_cluster(): ch1.query( "GRANT SELECT ON user_db.* TO test_user ON CLUSTER 'cluster' WITH GRANT OPTION" ) - ch1.query("GRANT CLUSTER ON * TO test_user ON CLUSTER 'cluster'") + ch1.query("GRANT CLUSTER ON *.* TO test_user ON CLUSTER 'cluster'") assert ch1.query("SHOW DATABASES", user="test_user") == "user_db\n" ch1.query("GRANT SELECT ON * TO test_user ON CLUSTER 'cluster'", user="test_user") From c3fd58475a8ef619fa1ec119350330949c8c92b8 Mon Sep 17 00:00:00 2001 From: pufit Date: Tue, 4 Jun 2024 01:12:30 -0400 Subject: [PATCH 049/133] Add comment --- src/Interpreters/Access/InterpreterGrantQuery.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/Access/InterpreterGrantQuery.cpp b/src/Interpreters/Access/InterpreterGrantQuery.cpp index 6ad32ae5a31..b75c0bfb1c7 100644 --- a/src/Interpreters/Access/InterpreterGrantQuery.cpp +++ b/src/Interpreters/Access/InterpreterGrantQuery.cpp @@ -438,7 +438,7 @@ BlockIO InterpreterGrantQuery::execute() RolesOrUsersSet roles_to_revoke; collectRolesToGrantOrRevoke(access_control, query, roles_to_grant, roles_to_revoke); - /// Check if the current user has corresponding access rights granted with grant option. + /// Replacing empty database with the default. This step must be done before replication to avoid privilege escalation. String current_database = getContext()->getCurrentDatabase(); elements_to_grant.replaceEmptyDatabase(current_database); elements_to_revoke.replaceEmptyDatabase(current_database); @@ -457,6 +457,7 @@ BlockIO InterpreterGrantQuery::execute() return executeDDLQueryOnCluster(updated_query, getContext(), params); } + /// Check if the current user has corresponding access rights granted with grant option. 
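+    /// This check runs only for local execution: the ON CLUSTER branch above has already
+    /// returned after forwarding the query (with the database normalized) to the cluster.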
    bool need_check_grantees_are_allowed = true;
    if (!query.current_grants)
        checkGrantOption(access_control, *current_user_access, grantees, need_check_grantees_are_allowed, elements_to_grant, elements_to_revoke);

From 47b45fdc1fc8521ad91a69677b1cb398771b2bfb Mon Sep 17 00:00:00 2001
From: Eduard Karacharov
Date: Sat, 1 Jun 2024 17:39:05 +0300
Subject: [PATCH 050/133] fix: function filters with token-based text indexes

---
 src/Interpreters/ITokenExtractor.cpp          |  30 +++
 src/Interpreters/ITokenExtractor.h            |  33 +++
 .../MergeTreeIndexBloomFilterText.cpp         |  18 +-
 .../MergeTree/MergeTreeIndexFullText.cpp      |  10 +-
 ...6_fulltext_index_match_predicate.reference |  12 +-
 .../02346_fulltext_index_match_predicate.sql  |  28 +--
 .../02346_fulltext_index_search.reference     |  18 +-
 .../02346_fulltext_index_search.sql           |  14 +-
 ...f_indexes_support_match_function.reference |  12 +-
 ...ngrambf_indexes_support_match_function.sql |  23 +-
 ...unctions_with_token_text_indexes.reference |  83 +++++++
 ...ring_functions_with_token_text_indexes.sql | 227 ++++++++++++++++++
 12 files changed, 446 insertions(+), 62 deletions(-)
 create mode 100644 tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.reference
 create mode 100644 tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql

diff --git a/src/Interpreters/ITokenExtractor.cpp b/src/Interpreters/ITokenExtractor.cpp
index 1c5d0d4b6d4..f0bf90fcb5c 100644
--- a/src/Interpreters/ITokenExtractor.cpp
+++ b/src/Interpreters/ITokenExtractor.cpp
@@ -240,4 +240,34 @@ bool SplitTokenExtractor::nextInStringLike(const char * data, size_t length, siz
     return !bad_token && !token.empty();
 }

+void SplitTokenExtractor::substringToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter, bool is_prefix, bool is_suffix) const
+{
+    size_t cur = 0;
+    size_t token_start = 0;
+    size_t token_len = 0;
+
+    while (cur < length && nextInString(data, length, &cur, &token_start, &token_len))
+        // In order to avoid filter updates with incomplete tokens,
+        // the first token is ignored unless the substring is a prefix, and
+        // the last token is ignored unless the substring is a suffix
+        if ((token_start > 0 || is_prefix) && (token_start + token_len < length || is_suffix))
+            bloom_filter.add(data + token_start, token_len);
+}
+
+void SplitTokenExtractor::substringToGinFilter(const char * data, size_t length, GinFilter & gin_filter, bool is_prefix, bool is_suffix) const
+{
+    gin_filter.setQueryString(data, length);
+
+    size_t cur = 0;
+    size_t token_start = 0;
+    size_t token_len = 0;
+
+    while (cur < length && nextInString(data, length, &cur, &token_start, &token_len))
+        // In order to avoid filter updates with incomplete tokens,
+        // the first token is ignored unless the substring is a prefix, and
+        // the last token is ignored unless the substring is a suffix
+        if ((token_start > 0 || is_prefix) && (token_start + token_len < length || is_suffix))
+            gin_filter.addTerm(data + token_start, token_len);
+}
+
 }
diff --git a/src/Interpreters/ITokenExtractor.h b/src/Interpreters/ITokenExtractor.h
index 2423ef12311..76711606d09 100644
--- a/src/Interpreters/ITokenExtractor.h
+++ b/src/Interpreters/ITokenExtractor.h
@@ -28,8 +28,22 @@ struct ITokenExtractor
    /// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight.
    virtual bool nextInStringLike(const char * data, size_t length, size_t * pos, String & out) const = 0;

+    /// Updates Bloom filter from exact-match string filter value
    virtual void stringToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter) const = 0;

+    /// Updates Bloom filter from substring-match string filter value.
+    /// An `ITokenExtractor` implementation may decide to skip certain
+    /// tokens depending on whether the substring is a prefix or a suffix.
+    virtual void substringToBloomFilter(
+        const char * data,
+        size_t length,
+        BloomFilter & bloom_filter,
+        bool is_prefix [[maybe_unused]],
+        bool is_suffix [[maybe_unused]]) const
+    {
+        stringToBloomFilter(data, length, bloom_filter);
+    }
+
    virtual void stringPaddedToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter) const
    {
        stringToBloomFilter(data, length, bloom_filter);
@@ -37,8 +51,22 @@ struct ITokenExtractor

    virtual void stringLikeToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter) const = 0;

+    /// Updates GIN filter from exact-match string filter value
    virtual void stringToGinFilter(const char * data, size_t length, GinFilter & gin_filter) const = 0;

+    /// Updates GIN filter from substring-match string filter value.
+    /// An `ITokenExtractor` implementation may decide to skip certain
+    /// tokens depending on whether the substring is a prefix or a suffix.
+    virtual void substringToGinFilter(
+        const char * data,
+        size_t length,
+        GinFilter & gin_filter,
+        bool is_prefix [[maybe_unused]],
+        bool is_suffix [[maybe_unused]]) const
+    {
+        stringToGinFilter(data, length, gin_filter);
+    }
+
    virtual void stringPaddedToGinFilter(const char * data, size_t length, GinFilter & gin_filter) const
    {
        stringToGinFilter(data, length, gin_filter);
@@ -148,6 +176,11 @@ struct SplitTokenExtractor final : public ITokenExtractorHelper<SplitTokenExtractor>
+    void substringToBloomFilter(const char * data, size_t length, BloomFilter & bloom_filter, bool is_prefix, bool is_suffix) const override;
+
+    void substringToGinFilter(const char * data, size_t length, GinFilter & gin_filter, bool is_prefix, bool is_suffix) const override;
diff --git a/src/Storages/MergeTree/MergeTreeIndexBloomFilterText.cpp b/src/Storages/MergeTree/MergeTreeIndexBloomFilterText.cpp
[index line and first hunk header lost to markup stripping; the hunk patches the "startsWith" branch of MergeTreeConditionBloomFilterText::traverseTreeEquals]
     out.function = RPNElement::FUNCTION_EQUALS;
     out.bloom_filter = std::make_unique<BloomFilter>(params);
     const auto & value = const_value.get<String>();
-    token_extractor->stringToBloomFilter(value.data(), value.size(), *out.bloom_filter);
+    token_extractor->substringToBloomFilter(value.data(), value.size(), *out.bloom_filter, true, false);
     return true;
 }
 else if (function_name == "endsWith")
@@ -575,7 +575,7 @@ bool MergeTreeConditionBloomFilterText::traverseTreeEquals(
     out.function = RPNElement::FUNCTION_EQUALS;
     out.bloom_filter = std::make_unique<BloomFilter>(params);
     const auto & value = const_value.get<String>();
-    token_extractor->stringToBloomFilter(value.data(), value.size(), *out.bloom_filter);
+    token_extractor->substringToBloomFilter(value.data(), value.size(), *out.bloom_filter, false, true);
     return true;
 }
 else if (function_name == "multiSearchAny"
@@ -596,7 +596,15 @@ bool MergeTreeConditionBloomFilterText::traverseTreeEquals(
             bloom_filters.back().emplace_back(params);
             const auto & value = element.get<String>();
-            token_extractor->stringToBloomFilter(value.data(), value.size(), bloom_filters.back().back());
+
+            if (function_name == "multiSearchAny")
+            {
+                token_extractor->substringToBloomFilter(value.data(), value.size(), bloom_filters.back().back(), false, false);
+            }
+            else
+            {
+                token_extractor->stringToBloomFilter(value.data(), value.size(), bloom_filters.back().back());
+            }
         }
         out.set_bloom_filters = std::move(bloom_filters);
         return true;
@@ -625,12 +633,12 @@ bool MergeTreeConditionBloomFilterText::traverseTreeEquals(
         for (const auto & alternative : alternatives)
         {
             bloom_filters.back().emplace_back(params);
-            token_extractor->stringToBloomFilter(alternative.data(), alternative.size(), bloom_filters.back().back());
+            token_extractor->substringToBloomFilter(alternative.data(), alternative.size(), bloom_filters.back().back(), false, false);
         }
         out.set_bloom_filters = std::move(bloom_filters);
     }
     else
-        token_extractor->stringToBloomFilter(required_substring.data(), required_substring.size(), *out.bloom_filter);
+        token_extractor->substringToBloomFilter(required_substring.data(), required_substring.size(), *out.bloom_filter, false, false);

     return true;
 }
diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
index c5965415be5..653cfd8731a 100644
--- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
+++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp
@@ -594,7 +594,7 @@ bool MergeTreeConditionFullText::traverseASTEquals(
     out.function = RPNElement::FUNCTION_EQUALS;
     out.gin_filter = std::make_unique<GinFilter>(params);
     const auto & value = const_value.get<String>();
-    token_extractor->stringToGinFilter(value.data(), value.size(), *out.gin_filter);
+    token_extractor->substringToGinFilter(value.data(), value.size(), *out.gin_filter, true, false);
     return true;
 }
 else if (function_name == "endsWith")
@@ -603,7 +603,7 @@ bool MergeTreeConditionFullText::traverseASTEquals(
     out.function = RPNElement::FUNCTION_EQUALS;
     out.gin_filter = std::make_unique<GinFilter>(params);
     const auto & value = const_value.get<String>();
-    token_extractor->stringToGinFilter(value.data(), value.size(), *out.gin_filter);
+    token_extractor->substringToGinFilter(value.data(), value.size(), *out.gin_filter, false, true);
     return true;
 }
 else if (function_name == "multiSearchAny")
@@ -621,7 +621,7 @@ bool MergeTreeConditionFullText::traverseASTEquals(
             gin_filters.back().emplace_back(params);
             const auto & value = element.get<String>();
-            token_extractor->stringToGinFilter(value.data(), value.size(), gin_filters.back().back());
+            token_extractor->substringToGinFilter(value.data(), value.size(), gin_filters.back().back(), false, false);
         }
         out.set_gin_filters = std::move(gin_filters);
         return true;
@@ -649,14 +649,14 @@ bool MergeTreeConditionFullText::traverseASTEquals(
         for (const auto & alternative : alternatives)
         {
             gin_filters.back().emplace_back(params);
-            token_extractor->stringToGinFilter(alternative.data(), alternative.size(), gin_filters.back().back());
+            token_extractor->substringToGinFilter(alternative.data(), alternative.size(), gin_filters.back().back(), false, false);
         }
         out.set_gin_filters = std::move(gin_filters);
     }
     else
     {
         out.gin_filter = std::make_unique<GinFilter>(params);
-        token_extractor->stringToGinFilter(required_substring.data(), required_substring.size(), *out.gin_filter);
+        token_extractor->substringToGinFilter(required_substring.data(), required_substring.size(), *out.gin_filter, false, false);
     }

     return true;
diff --git a/tests/queries/0_stateless/02346_fulltext_index_match_predicate.reference b/tests/queries/0_stateless/02346_fulltext_index_match_predicate.reference
index 84fc422379c..e890eac1794 100644
--- a/tests/queries/0_stateless/02346_fulltext_index_match_predicate.reference
+++ b/tests/queries/0_stateless/02346_fulltext_index_match_predicate.reference
@@ -1,19 +1,19 @@
-1 Hello ClickHouse
-2 Hello World
+1 Well, Hello ClickHouse !
+2 Well, Hello World !
     Granules: 6/6
     Granules: 2/6
     Granules: 6/6
     Granules: 2/6
 ---
-1 Hello ClickHouse
-2 Hello World
-6 World Champion
+1 Well, Hello ClickHouse !
+2 Well, Hello World !
+6 True World Champion Granules: 6/6 Granules: 3/6 Granules: 6/6 Granules: 3/6 --- -5 OLAP Database +5 Its An OLAP Database Granules: 6/6 Granules: 1/6 Granules: 6/6 diff --git a/tests/queries/0_stateless/02346_fulltext_index_match_predicate.sql b/tests/queries/0_stateless/02346_fulltext_index_match_predicate.sql index 2233c8a1f31..3c558f07be2 100644 --- a/tests/queries/0_stateless/02346_fulltext_index_match_predicate.sql +++ b/tests/queries/0_stateless/02346_fulltext_index_match_predicate.sql @@ -14,19 +14,19 @@ ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 1; -INSERT INTO tab VALUES (1, 'Hello ClickHouse'), (2, 'Hello World'), (3, 'Good Weather'), (4, 'Say Hello'), (5, 'OLAP Database'), (6, 'World Champion'); +INSERT INTO tab VALUES (1, 'Well, Hello ClickHouse !'), (2, 'Well, Hello World !'), (3, 'Good Weather !'), (4, 'Say Hello !'), (5, 'Its An OLAP Database'), (6, 'True World Champion'); -SELECT * FROM tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id; +SELECT * FROM tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id; -- Read 2/6 granules --- Required string: 'Hello ' --- Alternatives: 'Hello ClickHouse', 'Hello World' +-- Required string: ' Hello ' +-- Alternatives: ' Hello ClickHouse ', ' Hello World ' SELECT * FROM ( EXPLAIN PLAN indexes=1 - SELECT * FROM tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id + SELECT * FROM tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -37,7 +37,7 @@ SELECT * FROM ( EXPLAIN PLAN indexes=1 - SELECT * FROM tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id + SELECT * FROM tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -46,17 +46,17 @@ SETTINGS SELECT '---'; -SELECT * FROM tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id; +SELECT * FROM tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id; -- Read 3/6 granules -- Required string: - --- Alternatives: 'ClickHouse', 'World' +-- Alternatives: ' ClickHouse ', ' World ' SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id + SELECT * FROM tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -67,7 +67,7 @@ SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id + SELECT * FROM tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -76,17 +76,17 @@ SETTINGS SELECT '---'; -SELECT * FROM tab WHERE match(str, 'OLAP.*') ORDER BY id; +SELECT * FROM tab WHERE match(str, ' OLAP .*') ORDER BY id; -- Read 1/6 granules --- Required string: 'OLAP' +-- Required string: ' OLAP ' -- Alternatives: - SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tab WHERE match(str, 'OLAP (.*?)*') ORDER BY id + SELECT * FROM tab WHERE match(str, ' OLAP (.*?)*') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -97,7 +97,7 @@ SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tab WHERE match(str, 'OLAP (.*?)*') ORDER BY id + SELECT * FROM tab WHERE match(str, ' OLAP (.*?)*') ORDER BY id ) WHERE explain LIKE '%Granules: %' diff --git a/tests/queries/0_stateless/02346_fulltext_index_search.reference b/tests/queries/0_stateless/02346_fulltext_index_search.reference index d742bbc77ec..d7c89d434e7 100644 --- a/tests/queries/0_stateless/02346_fulltext_index_search.reference +++ b/tests/queries/0_stateless/02346_fulltext_index_search.reference @@ -13,19 
+13,19 @@ af full_text 1 Test full_text() af full_text -101 Alick a01 -106 Alick a06 -111 Alick b01 -116 Alick b06 -101 Alick a01 -106 Alick a06 +101 x Alick a01 y +106 x Alick a06 y +111 x Alick b01 y +116 x Alick b06 y +101 x Alick a01 y +106 x Alick a06 y 1 -101 Alick a01 -111 Alick b01 +101 x Alick a01 y +111 x Alick b01 y 1 Test on array columns af full_text -3 ['Click a03','Click b03'] +3 ['x Click a03 y','x Click b03 y'] 1 Test on map columns af full_text diff --git a/tests/queries/0_stateless/02346_fulltext_index_search.sql b/tests/queries/0_stateless/02346_fulltext_index_search.sql index 62cd6073842..8506c512409 100644 --- a/tests/queries/0_stateless/02346_fulltext_index_search.sql +++ b/tests/queries/0_stateless/02346_fulltext_index_search.sql @@ -67,7 +67,7 @@ CREATE TABLE tab_x(k UInt64, s String, INDEX af(s) TYPE full_text()) ENGINE = MergeTree() ORDER BY k SETTINGS index_granularity = 2, index_granularity_bytes = '10Mi'; -INSERT INTO tab_x VALUES (101, 'Alick a01'), (102, 'Blick a02'), (103, 'Click a03'), (104, 'Dlick a04'), (105, 'Elick a05'), (106, 'Alick a06'), (107, 'Blick a07'), (108, 'Click a08'), (109, 'Dlick a09'), (110, 'Elick a10'), (111, 'Alick b01'), (112, 'Blick b02'), (113, 'Click b03'), (114, 'Dlick b04'), (115, 'Elick b05'), (116, 'Alick b06'), (117, 'Blick b07'), (118, 'Click b08'), (119, 'Dlick b09'), (120, 'Elick b10'); +INSERT INTO tab_x VALUES (101, 'x Alick a01 y'), (102, 'x Blick a02 y'), (103, 'x Click a03 y'), (104, 'x Dlick a04 y'), (105, 'x Elick a05 y'), (106, 'x Alick a06 y'), (107, 'x Blick a07 y'), (108, 'x Click a08 y'), (109, 'x Dlick a09 y'), (110, 'x Elick a10 y'), (111, 'x Alick b01 y'), (112, 'x Blick b02 y'), (113, 'x Click b03 y'), (114, 'x Dlick b04 y'), (115, 'x Elick b05 y'), (116, 'x Alick b06 y'), (117, 'x Blick b07 y'), (118, 'x Click b08 y'), (119, 'x Dlick b09 y'), (120, 'x Elick b10 y'); -- check full_text index was created SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab_x' AND database = currentDatabase() LIMIT 1; @@ -86,27 +86,27 @@ SELECT read_rows==8 from system.query_log LIMIT 1; -- search full_text index with IN operator -SELECT * FROM tab_x WHERE s IN ('Alick a01', 'Alick a06') ORDER BY k; +SELECT * FROM tab_x WHERE s IN ('x Alick a01 y', 'x Alick a06 y') ORDER BY k; -- check the query only read 2 granules (4 rows total; each granule has 2 rows) SYSTEM FLUSH LOGS; SELECT read_rows==4 from system.query_log WHERE query_kind ='Select' AND current_database = currentDatabase() - AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE s IN (\'Alick a01\', \'Alick a06\') ORDER BY k;') + AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE s IN (\'x Alick a01 y\', \'x Alick a06 y\') ORDER BY k;') AND type='QueryFinish' AND result_rows==2 LIMIT 1; -- search full_text index with multiSearch -SELECT * FROM tab_x WHERE multiSearchAny(s, ['a01', 'b01']) ORDER BY k; +SELECT * FROM tab_x WHERE multiSearchAny(s, [' a01 ', ' b01 ']) ORDER BY k; -- check the query only read 2 granules (4 rows total; each granule has 2 rows) SYSTEM FLUSH LOGS; SELECT read_rows==4 from system.query_log WHERE query_kind ='Select' AND current_database = currentDatabase() - AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE multiSearchAny(s, [\'a01\', \'b01\']) ORDER BY k;') + AND endsWith(trimRight(query), 'SELECT * FROM tab_x WHERE multiSearchAny(s, [\' a01 \', \' b01 \']) ORDER BY k;') AND type='QueryFinish' AND result_rows==2 LIMIT 1; @@ -126,14 +126,14 @@ INSERT INTO tab SELECT rowNumberInBlock(), groupArray(s) FROM 
tab_x GROUP BY k%1 SELECT name, type FROM system.data_skipping_indices WHERE table == 'tab' AND database = currentDatabase() LIMIT 1; -- search full_text index with has -SELECT * FROM tab WHERE has(s, 'Click a03') ORDER BY k; +SELECT * FROM tab WHERE has(s, 'x Click a03 y') ORDER BY k; -- check the query must read all 10 granules (20 rows total; each granule has 2 rows) SYSTEM FLUSH LOGS; SELECT read_rows==2 from system.query_log WHERE query_kind ='Select' AND current_database = currentDatabase() - AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE has(s, \'Click a03\') ORDER BY k;') + AND endsWith(trimRight(query), 'SELECT * FROM tab WHERE has(s, \'x Click a03 y\') ORDER BY k;') AND type='QueryFinish' AND result_rows==1 LIMIT 1; diff --git a/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.reference b/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.reference index 0e1954cde62..5b7ad7ddce0 100644 --- a/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.reference +++ b/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.reference @@ -1,5 +1,5 @@ -1 Hello ClickHouse -2 Hello World +1 Well, Hello ClickHouse ! +2 Well, Hello World ! 1 Hello ClickHouse 2 Hello World Granules: 6/6 @@ -11,9 +11,9 @@ Granules: 6/6 Granules: 2/6 --- -1 Hello ClickHouse -2 Hello World -6 World Champion +1 Well, Hello ClickHouse ! +2 Well, Hello World ! +6 True World Champion 1 Hello ClickHouse 2 Hello World 6 World Champion @@ -26,7 +26,7 @@ Granules: 6/6 Granules: 3/6 --- -5 OLAP Database +5 Its An OLAP Database 5 OLAP Database Granules: 6/6 Granules: 1/6 diff --git a/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.sql b/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.sql index 49d39c601ef..42175cbb2c6 100644 --- a/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.sql +++ b/tests/queries/0_stateless/02943_tokenbf_and_ngrambf_indexes_support_match_function.sql @@ -21,21 +21,22 @@ ENGINE = MergeTree ORDER BY id SETTINGS index_granularity = 1; -INSERT INTO tokenbf_tab VALUES (1, 'Hello ClickHouse'), (2, 'Hello World'), (3, 'Good Weather'), (4, 'Say Hello'), (5, 'OLAP Database'), (6, 'World Champion'); +INSERT INTO tokenbf_tab VALUES (1, 'Well, Hello ClickHouse !'), (2, 'Well, Hello World !'), (3, 'Good Weather !'), (4, 'Say Hello !'), (5, 'Its An OLAP Database'), (6, 'True World Champion'); INSERT INTO ngrambf_tab VALUES (1, 'Hello ClickHouse'), (2, 'Hello World'), (3, 'Good Weather'), (4, 'Say Hello'), (5, 'OLAP Database'), (6, 'World Champion'); -SELECT * FROM tokenbf_tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id; +SELECT * FROM tokenbf_tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id; SELECT * FROM ngrambf_tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id; -- Read 2/6 granules -- Required string: 'Hello ' -- Alternatives: 'Hello ClickHouse', 'Hello World' +-- Surrounded by spaces for tokenbf SELECT * FROM ( EXPLAIN PLAN indexes=1 - SELECT * FROM tokenbf_tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id + SELECT * FROM tokenbf_tab WHERE match(str, ' Hello (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -46,7 +47,7 @@ SELECT * FROM ( EXPLAIN PLAN indexes=1 - SELECT * FROM tokenbf_tab WHERE match(str, 'Hello (ClickHouse|World)') ORDER BY id + SELECT * FROM tokenbf_tab WHERE match(str, ' Hello 
(ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -78,18 +79,19 @@ SETTINGS SELECT '---'; -SELECT * FROM tokenbf_tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id; +SELECT * FROM tokenbf_tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id; SELECT * FROM ngrambf_tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id; -- Read 3/6 granules -- Required string: - -- Alternatives: 'ClickHouse', 'World' +-- Surrounded by spaces for tokenbf SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tokenbf_tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id + SELECT * FROM tokenbf_tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -100,7 +102,7 @@ SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tokenbf_tab WHERE match(str, '.*(ClickHouse|World)') ORDER BY id + SELECT * FROM tokenbf_tab WHERE match(str, '.* (ClickHouse|World) ') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -131,18 +133,19 @@ SETTINGS SELECT '---'; -SELECT * FROM tokenbf_tab WHERE match(str, 'OLAP.*') ORDER BY id; +SELECT * FROM tokenbf_tab WHERE match(str, ' OLAP .*') ORDER BY id; SELECT * FROM ngrambf_tab WHERE match(str, 'OLAP.*') ORDER BY id; -- Read 1/6 granules -- Required string: 'OLAP' -- Alternatives: - +-- Surrounded by spaces for tokenbf SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tokenbf_tab WHERE match(str, 'OLAP (.*?)*') ORDER BY id + SELECT * FROM tokenbf_tab WHERE match(str, ' OLAP (.*?)*') ORDER BY id ) WHERE explain LIKE '%Granules: %' @@ -152,7 +155,7 @@ SELECT * FROM ( EXPLAIN PLAN indexes = 1 - SELECT * FROM tokenbf_tab WHERE match(str, 'OLAP (.*?)*') ORDER BY id + SELECT * FROM tokenbf_tab WHERE match(str, ' OLAP (.*?)*') ORDER BY id ) WHERE explain LIKE '%Granules: %' diff --git a/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.reference b/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.reference new file mode 100644 index 00000000000..4fb6812cb4f --- /dev/null +++ b/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.reference @@ -0,0 +1,83 @@ +-------- Bloom filter -------- + +-- No skip for prefix +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for prefix with complete token +Parts: 1/1 +Parts: 0/1 + +-- No skip for suffix +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for suffix with complete token +Parts: 1/1 +Parts: 0/1 + +-- No skip for substring +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for substring with complete token +Parts: 1/1 +Parts: 0/1 + +-- No skip for multiple substrings +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for multiple substrings with complete tokens +Parts: 1/1 +Parts: 0/1 + +-- No skip for multiple non-existsing substrings, only one with complete token +Parts: 1/1 +Parts: 1/1 + +-------- GIN filter -------- + +-- No skip for prefix +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for prefix with complete token +Parts: 1/1 +Parts: 0/1 + +-- No skip for suffix +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for suffix with complete token +Parts: 1/1 +Parts: 0/1 + +-- No skip for substring +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for substring with complete token +Parts: 1/1 +Parts: 0/1 + +-- No skip for multiple substrings +Parts: 1/1 +Parts: 1/1 +1 Service is not ready + +-- Skip for multiple substrings with complete tokens +Parts: 1/1 +Parts: 0/1 + +-- No skip for multiple non-existsing substrings, only 
one with complete token +Parts: 1/1 +Parts: 1/1 diff --git a/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql b/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql new file mode 100644 index 00000000000..a0cb8a35169 --- /dev/null +++ b/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql @@ -0,0 +1,227 @@ +SELECT '-------- Bloom filter --------'; +SELECT ''; +DROP TABLE IF EXISTS 03165_token_bf; + +CREATE TABLE 03165_token_bf +( + id Int64, + message String, + INDEX idx_message message TYPE tokenbf_v1(32768, 3, 2) GRANULARITY 1 +) +ENGINE = MergeTree +ORDER BY id; + +INSERT INTO 03165_token_bf VALUES(1, 'Service is not ready'); + +SELECT '-- No skip for prefix'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE startsWith(message, 'Serv') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE startsWith(message, 'Serv'); + +SELECT ''; +SELECT '-- Skip for prefix with complete token'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE startsWith(message, 'Serv i') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE startsWith(message, 'Serv i'); + +SELECT ''; +SELECT '-- No skip for suffix'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE endsWith(message, 'eady') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE endsWith(message, 'eady'); + +SELECT ''; +SELECT '-- Skip for suffix with complete token'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE endsWith(message, ' eady') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE endsWith(message, ' eady'); + +SELECT ''; +SELECT '-- No skip for substring'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE match(message, 'no') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE match(message, 'no'); + +SELECT ''; +SELECT '-- Skip for substring with complete token'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE match(message, ' xyz ') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE match(message, ' xyz '); + +SELECT ''; +SELECT '-- No skip for multiple substrings'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE multiSearchAny(message, ['ce', 'no']) +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE multiSearchAny(message, ['ce', 'no']); + +SELECT ''; +SELECT '-- Skip for multiple substrings with complete tokens'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE multiSearchAny(message, [' wx ', ' yz ']) +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE multiSearchAny(message, [' wx ', ' yz ']); + +SELECT ''; +SELECT '-- No skip for multiple non-existsing substrings, only one with complete token'; +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_bf WHERE multiSearchAny(message, [' wx ', 'yz']) +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_bf WHERE multiSearchAny(message, [' wx ', 'yz']); + +DROP TABLE IF EXISTS 03165_token_bf; + +SELECT ''; +SELECT '-------- GIN filter --------'; +SELECT ''; + +SET allow_experimental_inverted_index=1; +DROP TABLE IF EXISTS 03165_token_ft; +CREATE TABLE 03165_token_ft +( + id Int64, + message String, + 
INDEX idx_message message TYPE full_text() GRANULARITY 1 +) +ENGINE = MergeTree +ORDER BY id; + +INSERT INTO 03165_token_ft VALUES(1, 'Service is not ready'); + +SELECT '-- No skip for prefix'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE startsWith(message, 'Serv') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE startsWith(message, 'Serv'); + +SELECT ''; +SELECT '-- Skip for prefix with complete token'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE startsWith(message, 'Serv i') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE startsWith(message, 'Serv i'); + +SELECT ''; +SELECT '-- No skip for suffix'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE endsWith(message, 'eady') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE endsWith(message, 'eady'); + +SELECT ''; +SELECT '-- Skip for suffix with complete token'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE endsWith(message, ' eady') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE endsWith(message, ' eady'); + +SELECT ''; +SELECT '-- No skip for substring'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE match(message, 'no') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE match(message, 'no'); + +SELECT ''; +SELECT '-- Skip for substring with complete token'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE match(message, ' xyz ') +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE match(message, ' xyz '); + +SELECT ''; +SELECT '-- No skip for multiple substrings'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE multiSearchAny(message, ['ce', 'no']) +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE multiSearchAny(message, ['ce', 'no']); + +SELECT ''; +SELECT '-- Skip for multiple substrings with complete tokens'; + +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE multiSearchAny(message, [' wx ', ' yz ']) +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE multiSearchAny(message, [' wx ', ' yz ']); + +SELECT ''; +SELECT '-- No skip for multiple non-existsing substrings, only one with complete token'; +SELECT trim(explain) +FROM ( + EXPLAIN indexes = 1 SELECT * FROM 03165_token_ft WHERE multiSearchAny(message, [' wx ', 'yz']) +) +WHERE explain LIKE '%Parts:%'; + +SELECT * FROM 03165_token_ft WHERE multiSearchAny(message, [' wx ', 'yz']); From 55512d4a61e147ef5255bac2a1b75989fae05f4e Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 24 May 2024 18:07:49 +0000 Subject: [PATCH 051/133] Prevent recursive logging in blob_storage_log --- src/Core/Settings.h | 1 + src/IO/S3/BlobStorageLogWriter.cpp | 16 ++++++++++--- src/Interpreters/BlobStorageLog.cpp | 36 +++++++++++++++++++++++++++++ src/Interpreters/BlobStorageLog.h | 10 ++++++++ src/Interpreters/SystemLog.cpp | 25 ++++++++++++++------ src/Interpreters/SystemLog.h | 17 +++++++++----- 6 files changed, 89 insertions(+), 16 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index b8f5a8b5a75..18c39b79dde 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -891,6 +891,7 @@ class IColumn; M(Bool, geo_distance_returns_float64_on_float64_arguments, true, "If all 
four arguments to `geoDistance`, `greatCircleDistance`, `greatCircleAngle` functions are Float64, return Float64 and use double precision for internal calculations. In previous ClickHouse versions, the functions always returned Float32.", 0) \ M(Bool, allow_get_client_http_header, false, "Allow to use the function `getClientHTTPHeader` which lets to obtain a value of an the current HTTP request's header. It is not enabled by default for security reasons, because some headers, such as `Cookie`, could contain sensitive info. Note that the `X-ClickHouse-*` and `Authentication` headers are always restricted and cannot be obtained with this function.", 0) \ M(Bool, cast_string_to_dynamic_use_inference, false, "Use types inference during String to Dynamic conversion", 0) \ + M(Bool, enable_blob_storage_log, true, "Write information about blob storage operations to system.blob_storage_log table", 0) \ \ /** Experimental functions */ \ M(Bool, allow_experimental_materialized_postgresql_table, false, "Allows to use the MaterializedPostgreSQL table engine. Disabled by default, because this feature is experimental", 0) \ diff --git a/src/IO/S3/BlobStorageLogWriter.cpp b/src/IO/S3/BlobStorageLogWriter.cpp index aaf4aea5a8e..7252f33c8b3 100644 --- a/src/IO/S3/BlobStorageLogWriter.cpp +++ b/src/IO/S3/BlobStorageLogWriter.cpp @@ -23,6 +23,9 @@ void BlobStorageLogWriter::addEvent( if (!log) return; + if (log->shouldIgnorePath(local_path_.empty() ? local_path : local_path_)) + return; + if (!time_now.time_since_epoch().count()) time_now = std::chrono::system_clock::now(); @@ -54,15 +57,22 @@ void BlobStorageLogWriter::addEvent( BlobStorageLogWriterPtr BlobStorageLogWriter::create(const String & disk_name) { #ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD /// Keeper standalone build doesn't have a context - if (auto blob_storage_log = Context::getGlobalContextInstance()->getBlobStorageLog()) + const auto & global_context = Context::getGlobalContextInstance(); + bool enable_blob_storage_log = global_context->getSettingsRef().enable_blob_storage_log; + if (auto blob_storage_log = global_context->getBlobStorageLog()) { auto log_writer = std::make_shared(std::move(blob_storage_log)); log_writer->disk_name = disk_name; - if (CurrentThread::isInitialized() && CurrentThread::get().getQueryContext()) + const auto & query_context = CurrentThread::isInitialized() ? 
CurrentThread::get().getQueryContext() : nullptr; + if (query_context) + { log_writer->query_id = CurrentThread::getQueryId(); + enable_blob_storage_log = query_context->getSettingsRef().enable_blob_storage_log; + } - return log_writer; + if (enable_blob_storage_log) + return log_writer; } #endif return {}; diff --git a/src/Interpreters/BlobStorageLog.cpp b/src/Interpreters/BlobStorageLog.cpp index 0324ef8713c..c97895de95d 100644 --- a/src/Interpreters/BlobStorageLog.cpp +++ b/src/Interpreters/BlobStorageLog.cpp @@ -9,6 +9,8 @@ #include #include +#include +#include namespace DB { @@ -69,4 +71,38 @@ void BlobStorageLogElement::appendToBlock(MutableColumns & columns) const columns[i++]->insert(error_message); } +ContextMutablePtr BlobStorageLog::getQueryContext(const ContextPtr & context_) const +{ + /// Override setting in INSERT query context to disable logging blobs inserted to the table itself + auto result_context = Context::createCopy(context_); + result_context->makeQueryContext(); + result_context->setSetting("enable_blob_storage_log", false); + return result_context; +} + +static std::string_view normalizePath(std::string_view path) +{ + if (path.starts_with("./")) + path.remove_prefix(2); + if (path.ends_with("/")) + path.remove_suffix(1); + return path; +} + +void BlobStorageLog::prepareTable() +{ + SystemLog::prepareTable(); + if (auto merge_tree_table = std::dynamic_pointer_cast(getStorage())) + { + const auto & relative_data_path = merge_tree_table->getRelativeDataPath(); + prefix_to_ignore = normalizePath(relative_data_path); + } +} + +bool BlobStorageLog::shouldIgnorePath(const String & path) const +{ + /// Avoid logging info for data in `blob_storage_log` itself + return !prefix_to_ignore.empty() && normalizePath(path).starts_with(prefix_to_ignore); +} + } diff --git a/src/Interpreters/BlobStorageLog.h b/src/Interpreters/BlobStorageLog.h index 15e15be4f87..c4c50c7e55a 100644 --- a/src/Interpreters/BlobStorageLog.h +++ b/src/Interpreters/BlobStorageLog.h @@ -51,7 +51,17 @@ struct BlobStorageLogElement class BlobStorageLog : public SystemLog { +public: using SystemLog::SystemLog; + + /// We should not log events for table itself to avoid infinite recursion + bool shouldIgnorePath(const String & path) const; +protected: + void prepareTable() override; + ContextMutablePtr getQueryContext(const ContextPtr & context_) const override; + +private: + String prefix_to_ignore; }; } diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp index 3af8761ff8e..e3f8ad02f46 100644 --- a/src/Interpreters/SystemLog.cpp +++ b/src/Interpreters/SystemLog.cpp @@ -517,8 +517,7 @@ void SystemLog::flushImpl(const std::vector & to_flush, ASTPtr query_ptr(insert.release()); // we need query context to do inserts to target table with MV containing subqueries or joins - auto insert_context = Context::createCopy(context); - insert_context->makeQueryContext(); + auto insert_context = getQueryContext(getContext()); /// We always want to deliver the data to the original table regardless of the MVs insert_context->setSetting("materialized_views_ignore_errors", true); @@ -541,13 +540,18 @@ void SystemLog::flushImpl(const std::vector & to_flush, LOG_TRACE(log, "Flushed system log up to offset {}", to_flush_end); } +template +StoragePtr SystemLog::getStorage() const +{ + return DatabaseCatalog::instance().tryGetTable(table_id, getContext()); +} template void SystemLog::prepareTable() { String description = table_id.getNameForLogs(); - auto table = 
DatabaseCatalog::instance().tryGetTable(table_id, getContext()); + auto table = getStorage(); if (table) { if (old_create_query.empty()) @@ -595,11 +599,10 @@ void SystemLog::prepareTable() if (DatabaseCatalog::instance().getDatabase(table_id.database_name)->getUUID() == UUIDHelpers::Nil) merges_lock = table->getActionLock(ActionLocks::PartsMerge); - auto query_context = Context::createCopy(context); + auto query_context = getQueryContext(getContext()); /// As this operation is performed automatically we don't want it to fail because of user dependencies on log tables query_context->setSetting("check_table_dependencies", Field{false}); query_context->setSetting("check_referential_table_dependencies", Field{false}); - query_context->makeQueryContext(); InterpreterRenameQuery(rename, query_context).execute(); /// The required table will be created. @@ -614,8 +617,7 @@ void SystemLog::prepareTable() /// Create the table. LOG_DEBUG(log, "Creating new table {} for {}", description, LogElement::name()); - auto query_context = Context::createCopy(context); - query_context->makeQueryContext(); + auto query_context = getQueryContext(getContext()); auto create_query_ast = getCreateTableQuery(); InterpreterCreateQuery interpreter(create_query_ast, query_context); @@ -630,6 +632,15 @@ void SystemLog::prepareTable() is_prepared = true; } + +template +ContextMutablePtr SystemLog::getQueryContext(const ContextPtr & context_) const +{ + auto query_context = Context::createCopy(context_); + query_context->makeQueryContext(); + return query_context; +} + template ASTPtr SystemLog::getCreateTableQuery() { diff --git a/src/Interpreters/SystemLog.h b/src/Interpreters/SystemLog.h index e5b79585701..b38546b96da 100644 --- a/src/Interpreters/SystemLog.h +++ b/src/Interpreters/SystemLog.h @@ -139,6 +139,17 @@ protected: using ISystemLog::thread_mutex; using Base::queue; + StoragePtr getStorage() const; + + /** Creates new table if it does not exist. + * Renames old table if its structure is not suitable. + * This cannot be done in constructor to avoid deadlock while renaming a table under locked Context when SystemLog object is created. + */ + void prepareTable() override; + + /// Some tables can override settings for internal queries + virtual ContextMutablePtr getQueryContext(const ContextPtr & context_) const; + private: /* Saving thread data */ const StorageID table_id; @@ -147,12 +158,6 @@ private: String old_create_query; bool is_prepared = false; - /** Creates new table if it does not exist. - * Renames old table if its structure is not suitable. - * This cannot be done in constructor to avoid deadlock while renaming a table under locked Context when SystemLog object is created. - */ - void prepareTable() override; - void savingThreadFunction() override; /// flushImpl can be executed only in saving_thread. 
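The recursion guard in the patch above reduces to a normalized path-prefix test: prepareTable() captures the log table's own relative data path once, and addEvent() then drops any blob event whose local path falls under that prefix, so the blob writes produced by flushing system.blob_storage_log never generate further log entries. A minimal standalone sketch of that check (an illustration only, not ClickHouse code; it assumes '/'-separated relative paths and a made-up table path) could look like this:

    #include <cassert>
    #include <string>
    #include <string_view>

    /// Strip a leading "./" and a trailing "/" so that logically equal
    /// relative paths compare equal regardless of how they were spelled.
    static std::string_view normalizePath(std::string_view path)
    {
        if (path.starts_with("./"))
            path.remove_prefix(2);
        if (path.ends_with("/"))
            path.remove_suffix(1);
        return path;
    }

    /// True when `path` lies inside the log table's own data directory,
    /// i.e. logging this operation would make the log describe itself.
    static bool shouldIgnorePath(std::string_view prefix_to_ignore, std::string_view path)
    {
        return !prefix_to_ignore.empty() && normalizePath(path).starts_with(prefix_to_ignore);
    }

    int main()
    {
        /// "store/abc/blob_storage_log" is a hypothetical relative data path,
        /// standing in for whatever getRelativeDataPath() returns in prepareTable().
        const std::string prefix{normalizePath("./store/abc/blob_storage_log/")};
        assert(shouldIgnorePath(prefix, "store/abc/blob_storage_log/all_1_1_0/data.bin"));
        assert(!shouldIgnorePath(prefix, "store/abc/query_log/all_1_1_0/data.bin"));
        return 0;
    }

A plain prefix comparison keeps this per-request check cheap. A later commit in this series additionally protects prefix_to_ignore with a shared_mutex, since prepareTable() can run concurrently with writers, and the enable_blob_storage_log override in getQueryContext() handles the complementary case of the table's own flush INSERTs.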
From 03fa9c32ee9106d6389bf18716280616ec41f8cf Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 27 May 2024 10:48:22 +0000 Subject: [PATCH 052/133] Update BlobStorageLog and SystemLog to add settings for query --- src/IO/S3/BlobStorageLogWriter.cpp | 13 +++-------- src/Interpreters/BlobStorageLog.cpp | 11 +++++---- src/Interpreters/BlobStorageLog.h | 2 +- src/Interpreters/Context.cpp | 7 ++++++ src/Interpreters/SystemLog.cpp | 35 ++++++++++++++++++----------- src/Interpreters/SystemLog.h | 3 ++- 6 files changed, 40 insertions(+), 31 deletions(-) diff --git a/src/IO/S3/BlobStorageLogWriter.cpp b/src/IO/S3/BlobStorageLogWriter.cpp index 7252f33c8b3..c2f0cb86928 100644 --- a/src/IO/S3/BlobStorageLogWriter.cpp +++ b/src/IO/S3/BlobStorageLogWriter.cpp @@ -57,22 +57,15 @@ void BlobStorageLogWriter::addEvent( BlobStorageLogWriterPtr BlobStorageLogWriter::create(const String & disk_name) { #ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD /// Keeper standalone build doesn't have a context - const auto & global_context = Context::getGlobalContextInstance(); - bool enable_blob_storage_log = global_context->getSettingsRef().enable_blob_storage_log; - if (auto blob_storage_log = global_context->getBlobStorageLog()) + if (auto blob_storage_log = Context::getGlobalContextInstance()->getBlobStorageLog()) { auto log_writer = std::make_shared(std::move(blob_storage_log)); log_writer->disk_name = disk_name; - const auto & query_context = CurrentThread::isInitialized() ? CurrentThread::get().getQueryContext() : nullptr; - if (query_context) - { + if (CurrentThread::isInitialized() && CurrentThread::get().getQueryContext()) log_writer->query_id = CurrentThread::getQueryId(); - enable_blob_storage_log = query_context->getSettingsRef().enable_blob_storage_log; - } - if (enable_blob_storage_log) - return log_writer; + return log_writer; } #endif return {}; diff --git a/src/Interpreters/BlobStorageLog.cpp b/src/Interpreters/BlobStorageLog.cpp index c97895de95d..a7612be6f5e 100644 --- a/src/Interpreters/BlobStorageLog.cpp +++ b/src/Interpreters/BlobStorageLog.cpp @@ -71,13 +71,12 @@ void BlobStorageLogElement::appendToBlock(MutableColumns & columns) const columns[i++]->insert(error_message); } -ContextMutablePtr BlobStorageLog::getQueryContext(const ContextPtr & context_) const +void BlobStorageLog::addSettingsForQuery(ContextMutablePtr & mutable_context, IAST::QueryKind query_kind) const { - /// Override setting in INSERT query context to disable logging blobs inserted to the table itself - auto result_context = Context::createCopy(context_); - result_context->makeQueryContext(); - result_context->setSetting("enable_blob_storage_log", false); - return result_context; + SystemLog::addSettingsForQuery(mutable_context, query_kind); + + if (query_kind == IAST::QueryKind::Insert) + mutable_context->setSetting("enable_blob_storage_log", false); } static std::string_view normalizePath(std::string_view path) diff --git a/src/Interpreters/BlobStorageLog.h b/src/Interpreters/BlobStorageLog.h index c4c50c7e55a..80d1f363c20 100644 --- a/src/Interpreters/BlobStorageLog.h +++ b/src/Interpreters/BlobStorageLog.h @@ -58,7 +58,7 @@ public: bool shouldIgnorePath(const String & path) const; protected: void prepareTable() override; - ContextMutablePtr getQueryContext(const ContextPtr & context_) const override; + void addSettingsForQuery(ContextMutablePtr & mutable_context, IAST::QueryKind query_kind) const override; private: String prefix_to_ignore; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 
5c9ae4716b9..06b3adb328d 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -4103,6 +4103,13 @@ std::shared_ptr Context::getBackupLog() const std::shared_ptr Context::getBlobStorageLog() const { + bool enable_blob_storage_log = settings.enable_blob_storage_log; + if (hasQueryContext()) + enable_blob_storage_log = getQueryContext()->getSettingsRef().enable_blob_storage_log; + + if (!enable_blob_storage_log) + return {}; + SharedLockGuard lock(shared->mutex); if (!shared->system_logs) diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp index e3f8ad02f46..5e0ce2cb0de 100644 --- a/src/Interpreters/SystemLog.cpp +++ b/src/Interpreters/SystemLog.cpp @@ -517,9 +517,9 @@ void SystemLog::flushImpl(const std::vector & to_flush, ASTPtr query_ptr(insert.release()); // we need query context to do inserts to target table with MV containing subqueries or joins - auto insert_context = getQueryContext(getContext()); - /// We always want to deliver the data to the original table regardless of the MVs - insert_context->setSetting("materialized_views_ignore_errors", true); + auto insert_context = Context::createCopy(context); + insert_context->makeQueryContext(); + addSettingsForQuery(insert_context, IAST::QueryKind::Insert); InterpreterInsertQuery interpreter(query_ptr, insert_context); BlockIO io = interpreter.execute(); @@ -599,10 +599,10 @@ void SystemLog::prepareTable() if (DatabaseCatalog::instance().getDatabase(table_id.database_name)->getUUID() == UUIDHelpers::Nil) merges_lock = table->getActionLock(ActionLocks::PartsMerge); - auto query_context = getQueryContext(getContext()); - /// As this operation is performed automatically we don't want it to fail because of user dependencies on log tables - query_context->setSetting("check_table_dependencies", Field{false}); - query_context->setSetting("check_referential_table_dependencies", Field{false}); + auto query_context = Context::createCopy(context); + query_context->makeQueryContext(); + addSettingsForQuery(query_context, IAST::QueryKind::Rename); + InterpreterRenameQuery(rename, query_context).execute(); /// The required table will be created. @@ -617,7 +617,9 @@ void SystemLog::prepareTable() /// Create the table. 
LOG_DEBUG(log, "Creating new table {} for {}", description, LogElement::name()); - auto query_context = getQueryContext(getContext()); + auto query_context = Context::createCopy(context); + query_context->makeQueryContext(); + addSettingsForQuery(query_context, IAST::QueryKind::Create); auto create_query_ast = getCreateTableQuery(); InterpreterCreateQuery interpreter(create_query_ast, query_context); @@ -632,13 +634,20 @@ void SystemLog::prepareTable() is_prepared = true; } - template -ContextMutablePtr SystemLog::getQueryContext(const ContextPtr & context_) const +void SystemLog::addSettingsForQuery(ContextMutablePtr & mutable_context, IAST::QueryKind query_kind) const { - auto query_context = Context::createCopy(context_); - query_context->makeQueryContext(); - return query_context; + if (query_kind == IAST::QueryKind::Insert) + { + /// We always want to deliver the data to the original table regardless of the MVs + mutable_context->setSetting("materialized_views_ignore_errors", true); + } + else if (query_kind == IAST::QueryKind::Rename) + { + /// As this operation is performed automatically we don't want it to fail because of user dependencies on log tables + mutable_context->setSetting("check_table_dependencies", Field{false}); + mutable_context->setSetting("check_referential_table_dependencies", Field{false}); + } } template diff --git a/src/Interpreters/SystemLog.h b/src/Interpreters/SystemLog.h index b38546b96da..af635ca1bdb 100644 --- a/src/Interpreters/SystemLog.h +++ b/src/Interpreters/SystemLog.h @@ -2,6 +2,7 @@ #include #include +#include #include @@ -148,7 +149,7 @@ protected: void prepareTable() override; /// Some tables can override settings for internal queries - virtual ContextMutablePtr getQueryContext(const ContextPtr & context_) const; + virtual void addSettingsForQuery(ContextMutablePtr & mutable_context, IAST::QueryKind query_kind) const; private: /* Saving thread data */ From 57e7e46a2b57598ec4294349a30f622efcca7b8e Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 27 May 2024 14:45:39 +0000 Subject: [PATCH 053/133] Add enable_blob_storage_log to SettingsChangesHistory.h --- src/Core/SettingsChangesHistory.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 9352b22132f..e3b6cf40173 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -95,6 +95,7 @@ static std::map sett {"hdfs_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in HDFS table engine"}, {"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"}, {"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in S3 table engine"}, + {"enable_blob_storage_log", true, true, "Write information about blob storage operations to system.blob_storage_log table"}, }}, {"24.5", {{"allow_deprecated_error_prone_window_functions", true, false, "Allow usage of deprecated error prone window functions (neighbor, runningAccumulate, runningDifferenceStartingWithFirstValue, runningDifference)"}, {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. 
t1.y < t2.y."}, From 5d1b33612c7121e5d8d543e355f9311fa944e4a0 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 27 May 2024 16:35:28 +0000 Subject: [PATCH 054/133] Fix build BlobStorageLogWriter with CLICKHOUSE_KEEPER_STANDALONE_BUILD --- src/IO/S3/BlobStorageLogWriter.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/IO/S3/BlobStorageLogWriter.cpp b/src/IO/S3/BlobStorageLogWriter.cpp index c2f0cb86928..aa480932d7c 100644 --- a/src/IO/S3/BlobStorageLogWriter.cpp +++ b/src/IO/S3/BlobStorageLogWriter.cpp @@ -20,6 +20,9 @@ void BlobStorageLogWriter::addEvent( const Aws::S3::S3Error * error, BlobStorageLogElement::EvenTime time_now) { +/// Keeper standalone build doesn't build BlobStorageLog +/// But BlobStorageLogWriterPtr is used in IO, so we need to provide a stub implementation +#ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD if (!log) return; @@ -52,6 +55,7 @@ void BlobStorageLogWriter::addEvent( element.event_time = time_now; log->add(element); +#endif } BlobStorageLogWriterPtr BlobStorageLogWriter::create(const String & disk_name) From 6d9b2c8f5ab1010feda5215daaf9688e9a569462 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 28 May 2024 14:58:48 +0000 Subject: [PATCH 055/133] Move BlobStorageLogWriter to Interpreters --- src/Backups/BackupIO_S3.h | 2 +- src/Common/SystemLogBase.cpp | 2 +- src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 2 +- src/IO/S3/copyS3File.cpp | 2 +- src/IO/S3/copyS3File.h | 2 +- src/IO/WriteBufferFromS3.cpp | 2 +- src/IO/WriteBufferFromS3.h | 2 +- src/{IO/S3 => Interpreters}/BlobStorageLogWriter.cpp | 6 +----- src/{IO/S3 => Interpreters}/BlobStorageLogWriter.h | 0 src/Storages/S3Queue/StorageS3Queue.h | 2 +- 10 files changed, 9 insertions(+), 13 deletions(-) rename src/{IO/S3 => Interpreters}/BlobStorageLogWriter.cpp (89%) rename src/{IO/S3 => Interpreters}/BlobStorageLogWriter.h (100%) diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index f81eb975df3..db5217960f9 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Common/SystemLogBase.cpp b/src/Common/SystemLogBase.cpp index 15803db4929..950f4e40d62 100644 --- a/src/Common/SystemLogBase.cpp +++ b/src/Common/SystemLogBase.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index ae719f5cde4..056fed04a8a 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index d3968d883e8..4bddda70f10 100644 --- a/src/IO/S3/copyS3File.cpp +++ b/src/IO/S3/copyS3File.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/IO/S3/copyS3File.h b/src/IO/S3/copyS3File.h index 85b3870ddbf..f3bc5106857 100644 --- a/src/IO/S3/copyS3File.h +++ b/src/IO/S3/copyS3File.h @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index b796c029051..58a4ccc10eb 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index fbfec3588fa..529de309ae5 100644 --- 
a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/IO/S3/BlobStorageLogWriter.cpp b/src/Interpreters/BlobStorageLogWriter.cpp similarity index 89% rename from src/IO/S3/BlobStorageLogWriter.cpp rename to src/Interpreters/BlobStorageLogWriter.cpp index aa480932d7c..dcbbfb48a2d 100644 --- a/src/IO/S3/BlobStorageLogWriter.cpp +++ b/src/Interpreters/BlobStorageLogWriter.cpp @@ -1,4 +1,4 @@ -#include +#include #if USE_AWS_S3 @@ -20,9 +20,6 @@ void BlobStorageLogWriter::addEvent( const Aws::S3::S3Error * error, BlobStorageLogElement::EvenTime time_now) { -/// Keeper standalone build doesn't build BlobStorageLog -/// But BlobStorageLogWriterPtr is used in IO, so we need to provide a stub implementation -#ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD if (!log) return; @@ -55,7 +52,6 @@ void BlobStorageLogWriter::addEvent( element.event_time = time_now; log->add(element); -#endif } BlobStorageLogWriterPtr BlobStorageLogWriter::create(const String & disk_name) diff --git a/src/IO/S3/BlobStorageLogWriter.h b/src/Interpreters/BlobStorageLogWriter.h similarity index 100% rename from src/IO/S3/BlobStorageLogWriter.h rename to src/Interpreters/BlobStorageLogWriter.h diff --git a/src/Storages/S3Queue/StorageS3Queue.h b/src/Storages/S3Queue/StorageS3Queue.h index 83b7bc6667b..45c7dd2a100 100644 --- a/src/Storages/S3Queue/StorageS3Queue.h +++ b/src/Storages/S3Queue/StorageS3Queue.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include From 72e7a266e720466974be101d566a3770a4bc5010 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 28 May 2024 16:16:04 +0000 Subject: [PATCH 056/133] Revert "Move BlobStorageLogWriter to Interpreters" This reverts commit ca3d80102365e76d931be016638b1ca506dffb86. 
--- src/Backups/BackupIO_S3.h | 2 +- src/Common/SystemLogBase.cpp | 2 +- src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 2 +- src/{Interpreters => IO/S3}/BlobStorageLogWriter.cpp | 6 +++++- src/{Interpreters => IO/S3}/BlobStorageLogWriter.h | 0 src/IO/S3/copyS3File.cpp | 2 +- src/IO/S3/copyS3File.h | 2 +- src/IO/WriteBufferFromS3.cpp | 2 +- src/IO/WriteBufferFromS3.h | 2 +- src/Storages/S3Queue/StorageS3Queue.h | 2 +- 10 files changed, 13 insertions(+), 9 deletions(-) rename src/{Interpreters => IO/S3}/BlobStorageLogWriter.cpp (89%) rename src/{Interpreters => IO/S3}/BlobStorageLogWriter.h (100%) diff --git a/src/Backups/BackupIO_S3.h b/src/Backups/BackupIO_S3.h index db5217960f9..f81eb975df3 100644 --- a/src/Backups/BackupIO_S3.h +++ b/src/Backups/BackupIO_S3.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include namespace DB { diff --git a/src/Common/SystemLogBase.cpp b/src/Common/SystemLogBase.cpp index 950f4e40d62..15803db4929 100644 --- a/src/Common/SystemLogBase.cpp +++ b/src/Common/SystemLogBase.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 056fed04a8a..ae719f5cde4 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include diff --git a/src/Interpreters/BlobStorageLogWriter.cpp b/src/IO/S3/BlobStorageLogWriter.cpp similarity index 89% rename from src/Interpreters/BlobStorageLogWriter.cpp rename to src/IO/S3/BlobStorageLogWriter.cpp index dcbbfb48a2d..aa480932d7c 100644 --- a/src/Interpreters/BlobStorageLogWriter.cpp +++ b/src/IO/S3/BlobStorageLogWriter.cpp @@ -1,4 +1,4 @@ -#include +#include #if USE_AWS_S3 @@ -20,6 +20,9 @@ void BlobStorageLogWriter::addEvent( const Aws::S3::S3Error * error, BlobStorageLogElement::EvenTime time_now) { +/// Keeper standalone build doesn't build BlobStorageLog +/// But BlobStorageLogWriterPtr is used in IO, so we need to provide a stub implementation +#ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD if (!log) return; @@ -52,6 +55,7 @@ void BlobStorageLogWriter::addEvent( element.event_time = time_now; log->add(element); +#endif } BlobStorageLogWriterPtr BlobStorageLogWriter::create(const String & disk_name) diff --git a/src/Interpreters/BlobStorageLogWriter.h b/src/IO/S3/BlobStorageLogWriter.h similarity index 100% rename from src/Interpreters/BlobStorageLogWriter.h rename to src/IO/S3/BlobStorageLogWriter.h diff --git a/src/IO/S3/copyS3File.cpp b/src/IO/S3/copyS3File.cpp index 4bddda70f10..d3968d883e8 100644 --- a/src/IO/S3/copyS3File.cpp +++ b/src/IO/S3/copyS3File.cpp @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/IO/S3/copyS3File.h b/src/IO/S3/copyS3File.h index f3bc5106857..85b3870ddbf 100644 --- a/src/IO/S3/copyS3File.h +++ b/src/IO/S3/copyS3File.h @@ -6,7 +6,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 58a4ccc10eb..b796c029051 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 529de309ae5..fbfec3588fa 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -11,7 +11,7 @@ #include #include #include -#include 
+#include #include #include diff --git a/src/Storages/S3Queue/StorageS3Queue.h b/src/Storages/S3Queue/StorageS3Queue.h index 45c7dd2a100..83b7bc6667b 100644 --- a/src/Storages/S3Queue/StorageS3Queue.h +++ b/src/Storages/S3Queue/StorageS3Queue.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include From 37fa4f5dd60b35ec507e7073fe65bfbf8eb8c91e Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 29 May 2024 09:25:28 +0000 Subject: [PATCH 057/133] Revert "Fix build BlobStorageLogWriter with CLICKHOUSE_KEEPER_STANDALONE_BUILD" This reverts commit dfcc36ee2d02c036126007dcdc1ffc1946a3e9f2. --- src/IO/S3/BlobStorageLogWriter.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/IO/S3/BlobStorageLogWriter.cpp b/src/IO/S3/BlobStorageLogWriter.cpp index aa480932d7c..c2f0cb86928 100644 --- a/src/IO/S3/BlobStorageLogWriter.cpp +++ b/src/IO/S3/BlobStorageLogWriter.cpp @@ -20,9 +20,6 @@ void BlobStorageLogWriter::addEvent( const Aws::S3::S3Error * error, BlobStorageLogElement::EvenTime time_now) { -/// Keeper standalone build doesn't build BlobStorageLog -/// But BlobStorageLogWriterPtr is used in IO, so we need to provide a stub implementation -#ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD if (!log) return; @@ -55,7 +52,6 @@ void BlobStorageLogWriter::addEvent( element.event_time = time_now; log->add(element); -#endif } BlobStorageLogWriterPtr BlobStorageLogWriter::create(const String & disk_name) From 764199e63cc3483980ef9364073acd83eded006c Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 29 May 2024 09:25:55 +0000 Subject: [PATCH 058/133] fix build --- src/Interpreters/BlobStorageLog.cpp | 6 ------ src/Interpreters/BlobStorageLog.h | 6 +++++- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/Interpreters/BlobStorageLog.cpp b/src/Interpreters/BlobStorageLog.cpp index a7612be6f5e..923703a02c6 100644 --- a/src/Interpreters/BlobStorageLog.cpp +++ b/src/Interpreters/BlobStorageLog.cpp @@ -98,10 +98,4 @@ void BlobStorageLog::prepareTable() } } -bool BlobStorageLog::shouldIgnorePath(const String & path) const -{ - /// Avoid logging info for data in `blob_storage_log` itself - return !prefix_to_ignore.empty() && normalizePath(path).starts_with(prefix_to_ignore); -} - } diff --git a/src/Interpreters/BlobStorageLog.h b/src/Interpreters/BlobStorageLog.h index 80d1f363c20..aa9b377263f 100644 --- a/src/Interpreters/BlobStorageLog.h +++ b/src/Interpreters/BlobStorageLog.h @@ -55,7 +55,11 @@ public: using SystemLog::SystemLog; /// We should not log events for table itself to avoid infinite recursion - bool shouldIgnorePath(const String & path) const; + bool shouldIgnorePath(const String & path) const + { + return !prefix_to_ignore.empty() && path.starts_with(prefix_to_ignore); + } + protected: void prepareTable() override; void addSettingsForQuery(ContextMutablePtr & mutable_context, IAST::QueryKind query_kind) const override; From cbe99d56193aa47e01867173903a54b1ddb4c616 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 4 Jun 2024 11:10:37 +0000 Subject: [PATCH 059/133] fix race in BlobStorageLog::shouldIgnorePath --- src/Interpreters/BlobStorageLog.cpp | 1 + src/Interpreters/BlobStorageLog.h | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/src/Interpreters/BlobStorageLog.cpp b/src/Interpreters/BlobStorageLog.cpp index 923703a02c6..f20ac9165ac 100644 --- a/src/Interpreters/BlobStorageLog.cpp +++ b/src/Interpreters/BlobStorageLog.cpp @@ -93,6 +93,7 @@ void BlobStorageLog::prepareTable() SystemLog::prepareTable(); if (auto merge_tree_table = 
std::dynamic_pointer_cast(getStorage())) { + std::unique_lock lock{prepare_mutex}; const auto & relative_data_path = merge_tree_table->getRelativeDataPath(); prefix_to_ignore = normalizePath(relative_data_path); } diff --git a/src/Interpreters/BlobStorageLog.h b/src/Interpreters/BlobStorageLog.h index aa9b377263f..cf8f37299f7 100644 --- a/src/Interpreters/BlobStorageLog.h +++ b/src/Interpreters/BlobStorageLog.h @@ -1,11 +1,14 @@ #pragma once -#include -#include -#include -#include -#include #include +#include + +#include + +#include +#include +#include +#include namespace DB { @@ -57,6 +60,7 @@ public: /// We should not log events for table itself to avoid infinite recursion bool shouldIgnorePath(const String & path) const { + std::shared_lock lock{prepare_mutex}; return !prefix_to_ignore.empty() && path.starts_with(prefix_to_ignore); } @@ -65,6 +69,7 @@ protected: void addSettingsForQuery(ContextMutablePtr & mutable_context, IAST::QueryKind query_kind) const override; private: + mutable std::shared_mutex prepare_mutex; String prefix_to_ignore; }; From 8b9bb1d47309c2ca927b9d50026b7dcc9be7b164 Mon Sep 17 00:00:00 2001 From: Blargian Date: Tue, 4 Jun 2024 14:09:32 +0200 Subject: [PATCH 060/133] Fix incorrect width calculation --- src/Common/UTF8Helpers.cpp | 18 ++++++++---------- .../03142_skip_ANSI_in_UTF8_compute_width.sql | 7 ++----- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/src/Common/UTF8Helpers.cpp b/src/Common/UTF8Helpers.cpp index 34eba832113..006ec33c08b 100644 --- a/src/Common/UTF8Helpers.cpp +++ b/src/Common/UTF8Helpers.cpp @@ -116,6 +116,11 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l while (i + 15 < size) { + if (isEscapeSequence) + { + break; + } + __m128i bytes = _mm_loadu_si128(reinterpret_cast(&data[i])); const uint16_t non_regular_width_mask = _mm_movemask_epi8( @@ -132,15 +137,8 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l } else { - if (isEscapeSequence) - { - break; - } - else - { - i += 16; - width += 16; - } + i += 16; + width += 16; } } #endif @@ -149,7 +147,7 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l { auto isParameterByte = isCSIParameterByte(data[i]); auto isIntermediateByte = isCSIIntermediateByte(data[i]); - auto ignore_width = isEscapeSequence & (isParameterByte || isIntermediateByte); + auto ignore_width = isEscapeSequence && (isParameterByte || isIntermediateByte); if (ignore_width || (data[i] == '[' && isEscapeSequence)) { diff --git a/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql b/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql index f4b0bfe5888..812e7124526 100644 --- a/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql +++ b/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql @@ -1,6 +1,3 @@ SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x; -SELECT format('\x1b[38;2;{0};{1};{2}m█ test \x1b[0m', 255, 128, 128) AS x; -SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m test', 255, 128, 128) AS x; -SELECT format('test \x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x; -SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m test \x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x; -SELECT visibleWidth('0};{1};{2}m█'); \ No newline at end of file +SELECT 'Hello', format('\x1b[38;2;{0};{1};{2}m█\x1b[0m test \x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x +SELECT visibleWidth(format('\x1b[38;2;{0};{1};{2}m█\x1b[0m',255,128,128)); From 
54a9daa57007550fc253bd64dce3114331a211fd Mon Sep 17 00:00:00 2001 From: Blargian Date: Tue, 4 Jun 2024 14:15:14 +0200 Subject: [PATCH 061/133] Update reference file --- .../03142_skip_ANSI_in_UTF8_compute_width.reference | 12 +++++++++++- .../03142_skip_ANSI_in_UTF8_compute_width.sql | 6 +++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.reference b/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.reference index 864f62d3113..fa161970a3d 100644 --- a/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.reference +++ b/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.reference @@ -1,5 +1,15 @@ ┏━━━┓ ┃ x ┃ ┡━━━┩ -1. │ █ │ +1. │ █ │ └───┘ + ┏━━━━━━━━━┳━━━━━━━━━━┓ + ┃ 'Hello' ┃ x ┃ + ┡━━━━━━━━━╇━━━━━━━━━━┩ +1. │ Hello │ █ test █ │ + └─────────┴──────────┘ + ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ + ┃ visibleWidth(format('[38;2;{0};{1};{2}m█', 255, 128, 128)) ┃ + ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ +1. │ 22 │ + └─────────────────────────────────────────────────────┘ diff --git a/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql b/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql index 812e7124526..17608655ec5 100644 --- a/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql +++ b/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql @@ -1,3 +1,3 @@ -SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x; -SELECT 'Hello', format('\x1b[38;2;{0};{1};{2}m█\x1b[0m test \x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x -SELECT visibleWidth(format('\x1b[38;2;{0};{1};{2}m█\x1b[0m',255,128,128)); +SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x FORMAT Pretty; +SELECT 'Hello', format('\x1b[38;2;{0};{1};{2}m█\x1b[0m test \x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x FORMAT Pretty; +SELECT visibleWidth(format('\x1b[38;2;{0};{1};{2}m█\x1b[0m',255,128,128)) FORMAT Pretty; From 252b5f51c2f8e5a6f41d21245340fae9782445c1 Mon Sep 17 00:00:00 2001 From: Blargian Date: Tue, 4 Jun 2024 15:07:44 +0200 Subject: [PATCH 062/133] update test --- .../03142_skip_ANSI_in_UTF8_compute_width.reference | 5 ----- .../0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql | 1 - 2 files changed, 6 deletions(-) diff --git a/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.reference b/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.reference index fa161970a3d..6d375fd471a 100644 --- a/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.reference +++ b/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.reference @@ -8,8 +8,3 @@ ┡━━━━━━━━━╇━━━━━━━━━━┩ 1. │ Hello │ █ test █ │ └─────────┴──────────┘ - ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┓ - ┃ visibleWidth(format('[38;2;{0};{1};{2}m█', 255, 128, 128)) ┃ - ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┩ -1. 
│ 22 │ - └─────────────────────────────────────────────────────┘ diff --git a/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql b/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql index 17608655ec5..49f689a4cc5 100644 --- a/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql +++ b/tests/queries/0_stateless/03142_skip_ANSI_in_UTF8_compute_width.sql @@ -1,3 +1,2 @@ SELECT format('\x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x FORMAT Pretty; SELECT 'Hello', format('\x1b[38;2;{0};{1};{2}m█\x1b[0m test \x1b[38;2;{0};{1};{2}m█\x1b[0m', 255, 128, 128) AS x FORMAT Pretty; -SELECT visibleWidth(format('\x1b[38;2;{0};{1};{2}m█\x1b[0m',255,128,128)) FORMAT Pretty; From 1d77cda70b2db1041a89f7bf7537e96795084dae Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 4 Jun 2024 17:13:19 +0000 Subject: [PATCH 063/133] Fix distributed array join by nested --- src/Analyzer/ArrayJoinNode.cpp | 8 +++++++- ...6_analyzer_array_join_distributed.reference | 2 ++ .../03156_analyzer_array_join_distributed.sql | 18 ++++++++++++++++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/Analyzer/ArrayJoinNode.cpp b/src/Analyzer/ArrayJoinNode.cpp index 27d7229d46a..0cfb5d80b2a 100644 --- a/src/Analyzer/ArrayJoinNode.cpp +++ b/src/Analyzer/ArrayJoinNode.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -64,7 +65,12 @@ ASTPtr ArrayJoinNode::toASTImpl(const ConvertToASTOptions & options) const auto * column_node = array_join_expression->as(); if (column_node && column_node->getExpression()) - array_join_expression_ast = column_node->getExpression()->toAST(options); + { + if (const auto * function_node = column_node->getExpression()->as(); function_node && function_node->getFunctionName() == "nested") + array_join_expression_ast = array_join_expression->toAST(options); + else + array_join_expression_ast = column_node->getExpression()->toAST(options); + } else array_join_expression_ast = array_join_expression->toAST(options); diff --git a/tests/queries/0_stateless/03156_analyzer_array_join_distributed.reference b/tests/queries/0_stateless/03156_analyzer_array_join_distributed.reference index b5b2aec9c12..18830a293bd 100644 --- a/tests/queries/0_stateless/03156_analyzer_array_join_distributed.reference +++ b/tests/queries/0_stateless/03156_analyzer_array_join_distributed.reference @@ -10,3 +10,5 @@ Hello 1 Hello 1 Hello 2 Hello 2 +2020-01-01 a 2 +2020-01-01 b 4 diff --git a/tests/queries/0_stateless/03156_analyzer_array_join_distributed.sql b/tests/queries/0_stateless/03156_analyzer_array_join_distributed.sql index f605a369822..55f9877b2ac 100644 --- a/tests/queries/0_stateless/03156_analyzer_array_join_distributed.sql +++ b/tests/queries/0_stateless/03156_analyzer_array_join_distributed.sql @@ -8,3 +8,21 @@ SELECT s, arr, a FROM remote('127.0.0.{1,2}', currentDatabase(), arrays_test) AR SELECT s, arr FROM remote('127.0.0.2', currentDatabase(), arrays_test) ARRAY JOIN arr WHERE arr < 3 ORDER BY arr; SELECT s, arr FROM remote('127.0.0.{1,2}', currentDatabase(), arrays_test) ARRAY JOIN arr WHERE arr < 3 ORDER BY arr; + +create table hourly( + hour datetime, + `metric.names` Array(String), + `metric.values` Array(Int64) +) Engine=Memory +as select '2020-01-01', ['a', 'b'], [1,2]; + +SELECT + toDate(hour) AS day, + `metric.names`, + sum(`metric.values`) +FROM remote('127.0.0.{1,2}', currentDatabase(), hourly) +ARRAY JOIN metric +GROUP BY + day, + metric.names +ORDER BY metric.names; From 09c2151f3b0e2e19a1a1f77e27d3677e95b17fb0 
Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 5 Jun 2024 03:08:58 +0200 Subject: [PATCH 064/133] Fix style --- src/Common/UTF8Helpers.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/Common/UTF8Helpers.cpp b/src/Common/UTF8Helpers.cpp index 006ec33c08b..bfa860af98a 100644 --- a/src/Common/UTF8Helpers.cpp +++ b/src/Common/UTF8Helpers.cpp @@ -103,7 +103,7 @@ template size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t limit) noexcept { UTF8Decoder decoder; - int isEscapeSequence = false; + bool is_escape_sequence = false; size_t width = 0; size_t rollback = 0; for (size_t i = 0; i < size; ++i) @@ -116,10 +116,8 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l while (i + 15 < size) { - if (isEscapeSequence) - { + if (is_escape_sequence) break; - } __m128i bytes = _mm_loadu_si128(reinterpret_cast(&data[i])); @@ -145,17 +143,15 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l while (i < size && isPrintableASCII(data[i])) { - auto isParameterByte = isCSIParameterByte(data[i]); - auto isIntermediateByte = isCSIIntermediateByte(data[i]); - auto ignore_width = isEscapeSequence && (isParameterByte || isIntermediateByte); + bool ignore_width = is_escape_sequence && (isCSIParameterByte(data[i]) || isCSIIntermediateByte(data[i])); - if (ignore_width || (data[i] == '[' && isEscapeSequence)) + if (ignore_width || (data[i] == '[' && is_escape_sequence)) { /// don't count the width } - else if (isEscapeSequence && isCSIFinalByte(data[i])) + else if (is_escape_sequence && isCSIFinalByte(data[i])) { - isEscapeSequence = false; + is_escape_sequence = false; } else { From 8c94832c2041846e45ab50678f8702408c46ac97 Mon Sep 17 00:00:00 2001 From: lgbo-ustc Date: Tue, 4 Jun 2024 11:10:24 +0800 Subject: [PATCH 065/133] fixed #64513. 
mixed join condition with function 'in' --- src/Planner/PlannerJoinTree.cpp | 2 + ..._join_on_inequal_expression_fast.reference | 88 +++++++++++++++++++ ...006_join_on_inequal_expression_fast.sql.j2 | 1 + 3 files changed, 91 insertions(+) diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp index 1b2a55a50b0..83b6f4f2c26 100644 --- a/src/Planner/PlannerJoinTree.cpp +++ b/src/Planner/PlannerJoinTree.cpp @@ -1526,6 +1526,8 @@ JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_ left_join_tree_query_plan.actions_dags.emplace_back(std::move(join_clauses_and_actions.left_join_expressions_actions)); if (join_clauses_and_actions.right_join_expressions_actions) left_join_tree_query_plan.actions_dags.emplace_back(std::move(join_clauses_and_actions.right_join_expressions_actions)); + if (join_clauses_and_actions.mixed_join_expressions_actions) + left_join_tree_query_plan.actions_dags.push_back(join_clauses_and_actions.mixed_join_expressions_actions); auto mapping = std::move(left_join_tree_query_plan.query_node_to_plan_step_mapping); auto & r_mapping = right_join_tree_query_plan.query_node_to_plan_step_mapping; diff --git a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.reference b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.reference index 806596f8a63..0d225d7c98b 100644 --- a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.reference +++ b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.reference @@ -38,6 +38,17 @@ key4 f 2 3 4 0 0 \N SELECT t1.*, t2.* from t1 LEFT JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 LEFT JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key2 a2 1 1 1 0 0 \N +key4 f 2 3 4 key4 F 1 1 1 SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); key1 a 1 1 2 key1 A 1 2 1 key1 a 1 1 2 key1 B 2 1 2 @@ -67,6 +78,16 @@ key1 c 3 2 1 key1 D 4 1 6 SELECT t1.*, t2.* from t1 INNER JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 INNER JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key4 f 2 3 4 key4 F 1 1 1 SELECT t1.*, t2.* FROM t1 RIGHT JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); 0 0 \N key3 a3 1 1 1 key1 a 1 1 2 key1 A 1 2 1 @@ -102,6 +123,17 @@ key1 c 3 2 1 key1 D 4 1 6 SELECT t1.*, t2.* from t1 RIGHT JOIN t2 ON t1.key = 
t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 RIGHT JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 RIGHT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; + 0 0 \N key3 a3 1 1 1 +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key4 f 2 3 4 key4 F 1 1 1 SELECT t1.*, t2.* FROM t1 FULL JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); 0 0 \N key3 a3 1 1 1 key1 a 1 1 2 key1 A 1 2 1 @@ -146,6 +178,18 @@ key4 f 2 3 4 0 0 \N SELECT t1.*, t2.* from t1 FULL JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 FULL JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 FULL JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; + 0 0 \N key3 a3 1 1 1 +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key2 a2 1 1 1 0 0 \N +key4 f 2 3 4 key4 F 1 1 1 SET join_algorithm='grace_hash'; SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); key1 a 1 1 2 key1 A 1 2 1 @@ -185,6 +229,17 @@ key4 f 2 3 4 0 0 \N SELECT t1.*, t2.* from t1 LEFT JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 LEFT JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key2 a2 1 1 1 0 0 \N +key4 f 2 3 4 key4 F 1 1 1 SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); key1 a 1 1 2 key1 A 1 2 1 key1 a 1 1 2 key1 B 2 1 2 @@ -214,6 +269,16 @@ key1 c 3 2 1 key1 D 4 1 6 SELECT t1.*, t2.* from t1 INNER JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 INNER JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 
key1 C 3 4 5 +key4 f 2 3 4 key4 F 1 1 1 SELECT t1.*, t2.* FROM t1 RIGHT JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); 0 0 \N key3 a3 1 1 1 key1 a 1 1 2 key1 A 1 2 1 @@ -249,6 +314,17 @@ key1 c 3 2 1 key1 D 4 1 6 SELECT t1.*, t2.* from t1 RIGHT JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 RIGHT JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 RIGHT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; + 0 0 \N key3 a3 1 1 1 +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key4 f 2 3 4 key4 F 1 1 1 SELECT t1.*, t2.* FROM t1 FULL JOIN t2 ON (t1.a < t2.a OR lower(t1.attr) == lower(t2.attr)) AND t1.key = t2.key ORDER BY (t1.key, t1.attr, t2.key, t2.attr); 0 0 \N key3 a3 1 1 1 key1 a 1 1 2 key1 A 1 2 1 @@ -293,6 +369,18 @@ key4 f 2 3 4 0 0 \N SELECT t1.*, t2.* from t1 FULL JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 FULL JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 +SELECT t1.*, t2.* FROM t1 FULL JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; + 0 0 \N key3 a3 1 1 1 +key1 a 1 1 2 key1 C 3 4 5 +key1 b 2 3 2 key1 A 1 2 1 +key1 b 2 3 2 key1 B 2 1 2 +key1 b 2 3 2 key1 C 3 4 5 +key1 b 2 3 2 key1 D 4 1 6 +key1 c 3 2 1 key1 C 3 4 5 +key1 d 4 7 2 key1 C 3 4 5 +key1 e 5 5 5 key1 C 3 4 5 +key2 a2 1 1 1 0 0 \N +key4 f 2 3 4 key4 F 1 1 1 SET join_algorithm='hash'; SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.key = t2.key AND t1.a < t2.a OR t1.a = t2.a ORDER BY (t1.key, t1.attr, t2.key, t2.attr); key1 a 1 1 2 key1 A 1 2 1 diff --git a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 index d3aa74f5c38..7ca9bacb622 100644 --- a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 +++ b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 @@ -18,6 +18,7 @@ SELECT t1.*, t2.* from t1 {{ join_type }} JOIN t2 ON t1.key = t2.key and (t1.b + SELECT t1.*, t2.* from t1 {{ join_type }} JOIN t2 ON t1.key = t2.key and (t1.a < t2.a) ORDER BY (t1.key, t1.attr, t2.key, t2.attr); SELECT t1.*, t2.* from t1 {{ join_type }} JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 {{ join_type }} JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); +SELECT t1.*, t2.* FROM t1 {{ join_type }} JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; {% endfor -%} {% endfor -%} From 284d2b5e699a7598c6bad439f8c85150f81cde9b Mon Sep 17 00:00:00 2001 From: lgbo-ustc Date: Wed, 5 Jun 2024 09:43:48 +0800 Subject: [PATCH 066/133] update test --- ...006_join_on_inequal_expression_fast.reference | 16 ++++++++-------- 
.../03006_join_on_inequal_expression_fast.sql.j2 | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.reference b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.reference index 0d225d7c98b..46f24f73356 100644 --- a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.reference +++ b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.reference @@ -38,7 +38,7 @@ key4 f 2 3 4 0 0 \N SELECT t1.*, t2.* from t1 LEFT JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 LEFT JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 -SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; key1 a 1 1 2 key1 C 3 4 5 key1 b 2 3 2 key1 A 1 2 1 key1 b 2 3 2 key1 B 2 1 2 @@ -78,7 +78,7 @@ key1 c 3 2 1 key1 D 4 1 6 SELECT t1.*, t2.* from t1 INNER JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 INNER JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 -SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; key1 a 1 1 2 key1 C 3 4 5 key1 b 2 3 2 key1 A 1 2 1 key1 b 2 3 2 key1 B 2 1 2 @@ -123,7 +123,7 @@ key1 c 3 2 1 key1 D 4 1 6 SELECT t1.*, t2.* from t1 RIGHT JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 RIGHT JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 -SELECT t1.*, t2.* FROM t1 RIGHT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +SELECT t1.*, t2.* FROM t1 RIGHT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; 0 0 \N key3 a3 1 1 1 key1 a 1 1 2 key1 C 3 4 5 key1 b 2 3 2 key1 A 1 2 1 @@ -178,7 +178,7 @@ key4 f 2 3 4 0 0 \N SELECT t1.*, t2.* from t1 FULL JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 FULL JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 -SELECT t1.*, t2.* FROM t1 FULL JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +SELECT t1.*, t2.* FROM t1 FULL JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; 0 0 \N key3 a3 1 1 1 key1 a 1 1 2 key1 C 3 4 5 key1 b 2 3 2 key1 A 1 2 1 @@ -229,7 +229,7 @@ key4 f 2 3 4 0 0 \N SELECT t1.*, t2.* from t1 LEFT JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 LEFT JOIN (SELECT 1 AS 
a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 -SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +SELECT t1.*, t2.* FROM t1 LEFT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; key1 a 1 1 2 key1 C 3 4 5 key1 b 2 3 2 key1 A 1 2 1 key1 b 2 3 2 key1 B 2 1 2 @@ -269,7 +269,7 @@ key1 c 3 2 1 key1 D 4 1 6 SELECT t1.*, t2.* from t1 INNER JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 INNER JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 -SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +SELECT t1.*, t2.* FROM t1 INNER JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; key1 a 1 1 2 key1 C 3 4 5 key1 b 2 3 2 key1 A 1 2 1 key1 b 2 3 2 key1 B 2 1 2 @@ -314,7 +314,7 @@ key1 c 3 2 1 key1 D 4 1 6 SELECT t1.*, t2.* from t1 RIGHT JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 RIGHT JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 -SELECT t1.*, t2.* FROM t1 RIGHT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +SELECT t1.*, t2.* FROM t1 RIGHT JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; 0 0 \N key3 a3 1 1 1 key1 a 1 1 2 key1 C 3 4 5 key1 b 2 3 2 key1 A 1 2 1 @@ -369,7 +369,7 @@ key4 f 2 3 4 0 0 \N SELECT t1.*, t2.* from t1 FULL JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 FULL JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); 1 1 1 1 1 1 -SELECT t1.*, t2.* FROM t1 FULL JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +SELECT t1.*, t2.* FROM t1 FULL JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; 0 0 \N key3 a3 1 1 1 key1 a 1 1 2 key1 C 3 4 5 key1 b 2 3 2 key1 A 1 2 1 diff --git a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 index 7ca9bacb622..a363101ca69 100644 --- a/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 +++ b/tests/queries/0_stateless/03006_join_on_inequal_expression_fast.sql.j2 @@ -18,7 +18,7 @@ SELECT t1.*, t2.* from t1 {{ join_type }} JOIN t2 ON t1.key = t2.key and (t1.b + SELECT t1.*, t2.* from t1 {{ join_type }} JOIN t2 ON t1.key = t2.key and (t1.a < t2.a) ORDER BY (t1.key, t1.attr, t2.key, t2.attr); SELECT t1.*, t2.* from t1 {{ join_type }} JOIN t2 ON t1.key = t2.key and t1.c ORDER BY (t1.key, t1.attr, t2.key, t2.attr); -- { serverError INVALID_JOIN_ON_EXPRESSION } SELECT * FROM (SELECT 1 AS a, 1 AS b, 1 AS c) AS t1 {{ join_type }} JOIN (SELECT 1 AS a, 1 AS b, 1 AS c) AS t2 ON t1.a = t2.a AND (t1.b > 0 OR t2.b > 0); -SELECT t1.*, t2.* FROM t1 {{ join_type }} JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a 
FROM t1 WHERE a = 3))) ORDER BY t1.key, t2.key; +SELECT t1.*, t2.* FROM t1 {{ join_type }} JOIN t2 ON t1.key = t2.key AND (t1.a=2 OR (t2.a IN (SELECT a FROM t1 WHERE a = 3))) ORDER BY ALL; {% endfor -%} {% endfor -%} From a13bf252683670c5db4ce4eb62ab19008e463a52 Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 5 Jun 2024 10:26:56 +0200 Subject: [PATCH 067/133] Trigger CI From c25f8fa28c076df434462641a182dc99bdcb470a Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 5 Jun 2024 10:43:41 +0200 Subject: [PATCH 068/133] Fix --- src/Storages/S3Queue/S3QueueMetadata.cpp | 2 +- src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueMetadata.cpp b/src/Storages/S3Queue/S3QueueMetadata.cpp index f4c8c5c5ef2..9c77bb2d24c 100644 --- a/src/Storages/S3Queue/S3QueueMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueMetadata.cpp @@ -167,7 +167,7 @@ S3QueueMetadata::FileMetadataPtr S3QueueMetadata::getFileMetadata( S3QueueOrderedFileMetadata::BucketInfoPtr bucket_info) { auto file_status = local_file_statuses->get(path, /* create */true); - switch (settings.mode) + switch (settings.mode.value) { case S3QueueMode::ORDERED: return std::make_shared( diff --git a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp index d1298b8c4fa..bac87c95cc9 100644 --- a/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp +++ b/src/Storages/S3Queue/S3QueueOrderedFileMetadata.cpp @@ -371,7 +371,6 @@ void S3QueueOrderedFileMetadata::setProcessedImpl() }; const auto zk_client = getZooKeeper(); - const auto node_metadata_str = node_metadata.toString(); std::string failure_reason; while (true) From 5f3bc4271f6a0fe87a3cd2b9d1e694a88639ef2a Mon Sep 17 00:00:00 2001 From: Blargian Date: Wed, 5 Jun 2024 10:58:30 +0200 Subject: [PATCH 069/133] rename forgoten isEscapeSequence to is_escape_sequence --- src/Common/UTF8Helpers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/UTF8Helpers.cpp b/src/Common/UTF8Helpers.cpp index bfa860af98a..dd24cb20933 100644 --- a/src/Common/UTF8Helpers.cpp +++ b/src/Common/UTF8Helpers.cpp @@ -184,7 +184,7 @@ size_t computeWidthImpl(const UInt8 * data, size_t size, size_t prefix, size_t l // special treatment for '\t' and for ESC size_t next_width = width; if (decoder.codepoint == '\x1b') - isEscapeSequence = true; + is_escape_sequence = true; else if (decoder.codepoint == '\t') next_width += 8 - (prefix + width) % 8; else From 69d23f5e67a13b07b6b29e8c54c9f6e29f86fb9c Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 29 Dec 2023 15:02:11 +0100 Subject: [PATCH 070/133] Fix all problems in tests that had been found by flake8 Signed-off-by: Azat Khuzhin --- tests/integration/helpers/hdfs_api.py | 3 +-- tests/integration/test_backup_restore_new/test.py | 2 +- .../test_disallow_concurrency.py | 4 ++-- .../test_convert_ordinary.py | 2 +- .../test_backward_compatibility/test_functions.py | 2 +- .../integration/test_disk_over_web_server/test.py | 4 ++-- tests/integration/test_jbod_balancer/test.py | 2 +- tests/integration/test_jdbc_bridge/test.py | 8 ++++---- .../test_keeper_snapshot_small_distance/test.py | 2 +- tests/integration/test_keeper_snapshots/test.py | 1 - .../test_keeper_three_nodes_start/test.py | 1 - .../test_merge_tree_azure_blob_storage/test.py | 5 +---- .../test.py | 6 ++---- tests/integration/test_scheduler/test.py | 1 + tests/integration/test_storage_hudi/test.py | 2 +- tests/integration/test_storage_iceberg/test.py | 
2 +- tests/integration/test_storage_rabbitmq/test.py | 14 ++++++-------- tests/integration/test_ttl_move/test.py | 2 +- tests/integration/test_ttl_replicated/test.py | 2 +- 19 files changed, 28 insertions(+), 37 deletions(-) diff --git a/tests/integration/helpers/hdfs_api.py b/tests/integration/helpers/hdfs_api.py index 5739496cb50..4e4468fef77 100644 --- a/tests/integration/helpers/hdfs_api.py +++ b/tests/integration/helpers/hdfs_api.py @@ -110,10 +110,9 @@ class HDFSApi(object): logging.debug( "Stdout:\n{}\n".format(res.stdout.decode("utf-8")) ) - logging.debug("Env:\n{}\n".format(env)) raise Exception( "Command {} return non-zero code {}: {}".format( - args, res.returncode, res.stderr.decode("utf-8") + cmd, res.returncode, res.stderr.decode("utf-8") ) ) diff --git a/tests/integration/test_backup_restore_new/test.py b/tests/integration/test_backup_restore_new/test.py index ef9e536976b..68b8d29f42e 100644 --- a/tests/integration/test_backup_restore_new/test.py +++ b/tests/integration/test_backup_restore_new/test.py @@ -1474,7 +1474,7 @@ def test_backup_all(exclude_system_log_tables): restore_settings = [] if not exclude_system_log_tables: restore_settings.append("allow_non_empty_tables=true") - restore_command = f"RESTORE ALL FROM {backup_name} {'SETTINGS '+ ', '.join(restore_settings) if restore_settings else ''}" + restore_command = f"RESTORE ALL FROM {backup_name} {'SETTINGS ' + ', '.join(restore_settings) if restore_settings else ''}" session_id = new_session_id() instance.http_query( diff --git a/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py b/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py index c9f20333654..cd0f2032559 100644 --- a/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py +++ b/tests/integration/test_backup_restore_on_cluster/test_disallow_concurrency.py @@ -161,13 +161,13 @@ def wait_for_fail_restore(node, restore_id): elif status == "RESTORING": assert_eq_with_retry( node, - f"SELECT status FROM system.backups WHERE id = '{backup_id}'", + f"SELECT status FROM system.backups WHERE id = '{restore_id}'", "RESTORE_FAILED", sleep_time=2, retry_count=50, ) error = node.query( - f"SELECT error FROM system.backups WHERE id == '{backup_id}'" + f"SELECT error FROM system.backups WHERE id == '{restore_id}'" ).rstrip("\n") assert re.search( "Cannot restore the table default.tbl because it already contains some data", diff --git a/tests/integration/test_backward_compatibility/test_convert_ordinary.py b/tests/integration/test_backward_compatibility/test_convert_ordinary.py index b8db4e005a4..f5d0c066600 100644 --- a/tests/integration/test_backward_compatibility/test_convert_ordinary.py +++ b/tests/integration/test_backward_compatibility/test_convert_ordinary.py @@ -187,7 +187,7 @@ def check_convert_all_dbs_to_atomic(): # 6 tables, MVs contain 2 rows (inner tables does not match regexp) assert "8\t{}\n".format(8 * len("atomic")) == node.query( - "SELECT count(), sum(n) FROM atomic.merge".format(db) + "SELECT count(), sum(n) FROM atomic.merge" ) node.query("DETACH TABLE ordinary.detached PERMANENTLY") diff --git a/tests/integration/test_backward_compatibility/test_functions.py b/tests/integration/test_backward_compatibility/test_functions.py index 1cf5c3deb81..758dda655da 100644 --- a/tests/integration/test_backward_compatibility/test_functions.py +++ b/tests/integration/test_backward_compatibility/test_functions.py @@ -89,7 +89,7 @@ def test_aggregate_states(start_cluster): logging.info("Skipping 
%s", aggregate_function) skipped += 1 continue - logging.exception("Failed %s", function) + logging.exception("Failed %s", aggregate_function) failed += 1 continue diff --git a/tests/integration/test_disk_over_web_server/test.py b/tests/integration/test_disk_over_web_server/test.py index 9f43ab73fa3..f4ea7d54571 100644 --- a/tests/integration/test_disk_over_web_server/test.py +++ b/tests/integration/test_disk_over_web_server/test.py @@ -116,7 +116,7 @@ def test_usage(cluster, node_name): (id Int32) ENGINE = MergeTree() ORDER BY id SETTINGS storage_policy = 'web'; """.format( - i, uuids[i], i, i + i, uuids[i] ) ) @@ -338,7 +338,7 @@ def test_page_cache(cluster): (id Int32) ENGINE = MergeTree() ORDER BY id SETTINGS storage_policy = 'web'; """.format( - i, uuids[i], i, i + i, uuids[i] ) ) diff --git a/tests/integration/test_jbod_balancer/test.py b/tests/integration/test_jbod_balancer/test.py index 69ab83283ff..8635f5e612a 100644 --- a/tests/integration/test_jbod_balancer/test.py +++ b/tests/integration/test_jbod_balancer/test.py @@ -90,7 +90,7 @@ def wait_until_fully_merged(node, table): except: return - raise Exception(f"There are still merges on-going after {retry} assignments") + raise Exception(f"There are still merges on-going after {i} assignments") def test_jbod_balanced_merge(start_cluster): diff --git a/tests/integration/test_jdbc_bridge/test.py b/tests/integration/test_jdbc_bridge/test.py index c4a0a525df3..1efd868e4a7 100644 --- a/tests/integration/test_jdbc_bridge/test.py +++ b/tests/integration/test_jdbc_bridge/test.py @@ -91,7 +91,7 @@ def test_jdbc_insert(started_cluster): """ CREATE TABLE test.test_insert ENGINE = Memory AS SELECT * FROM test.ClickHouseTable; - SELECT * + SELECT * FROM jdbc('{0}?mutation', 'INSERT INTO test.test_insert VALUES({1}, ''{1}'', ''{1}'')'); """.format( datasource, records @@ -115,7 +115,7 @@ def test_jdbc_update(started_cluster): """ CREATE TABLE test.test_update ENGINE = Memory AS SELECT * FROM test.ClickHouseTable; - SELECT * + SELECT * FROM jdbc( '{}?mutation', 'SET mutations_sync = 1; ALTER TABLE test.test_update UPDATE Str=''{}'' WHERE Num = {} - 1;' @@ -145,7 +145,7 @@ def test_jdbc_delete(started_cluster): """ CREATE TABLE test.test_delete ENGINE = Memory AS SELECT * FROM test.ClickHouseTable; - SELECT * + SELECT * FROM jdbc( '{}?mutation', 'SET mutations_sync = 1; ALTER TABLE test.test_delete DELETE WHERE Num < {} - 1;' @@ -158,7 +158,7 @@ def test_jdbc_delete(started_cluster): expected = records - 1 actual = instance.query( "SELECT Str FROM jdbc('{}', 'SELECT * FROM test.test_delete')".format( - datasource, records + datasource ) ) assert int(actual) == expected, "expecting {} but got {}".format(expected, actual) diff --git a/tests/integration/test_keeper_snapshot_small_distance/test.py b/tests/integration/test_keeper_snapshot_small_distance/test.py index be8bf1bd245..612c5b3c65d 100644 --- a/tests/integration/test_keeper_snapshot_small_distance/test.py +++ b/tests/integration/test_keeper_snapshot_small_distance/test.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -##!/usr/bin/env python3 + import pytest from helpers.cluster import ClickHouseCluster import helpers.keeper_utils as keeper_utils diff --git a/tests/integration/test_keeper_snapshots/test.py b/tests/integration/test_keeper_snapshots/test.py index 6dfb2078559..951970dba23 100644 --- a/tests/integration/test_keeper_snapshots/test.py +++ b/tests/integration/test_keeper_snapshots/test.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -#!/usr/bin/env python3 import pytest from helpers.cluster 
import ClickHouseCluster import helpers.keeper_utils as keeper_utils diff --git a/tests/integration/test_keeper_three_nodes_start/test.py b/tests/integration/test_keeper_three_nodes_start/test.py index bc93a6089cb..6576d386fcb 100644 --- a/tests/integration/test_keeper_three_nodes_start/test.py +++ b/tests/integration/test_keeper_three_nodes_start/test.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 -#!/usr/bin/env python3 import pytest from helpers.cluster import ClickHouseCluster import random diff --git a/tests/integration/test_merge_tree_azure_blob_storage/test.py b/tests/integration/test_merge_tree_azure_blob_storage/test.py index 7f77627e793..45ae88f427e 100644 --- a/tests/integration/test_merge_tree_azure_blob_storage/test.py +++ b/tests/integration/test_merge_tree_azure_blob_storage/test.py @@ -537,10 +537,7 @@ def test_freeze_unfreeze(cluster): def test_apply_new_settings(cluster): node = cluster.instances[NODE_NAME] create_table(node, TABLE_NAME) - config_path = os.path.join( - SCRIPT_DIR, - "./_gen/disk_storage_conf.xml".format(cluster.instances_dir_name), - ) + config_path = os.path.join(SCRIPT_DIR, "./_gen/disk_storage_conf.xml") azure_query( node, f"INSERT INTO {TABLE_NAME} VALUES {generate_values('2020-01-03', 4096)}" diff --git a/tests/integration/test_postgresql_replica_database_engine_1/test.py b/tests/integration/test_postgresql_replica_database_engine_1/test.py index f04425d83d4..0e87cb0e690 100644 --- a/tests/integration/test_postgresql_replica_database_engine_1/test.py +++ b/tests/integration/test_postgresql_replica_database_engine_1/test.py @@ -179,9 +179,7 @@ def test_different_data_types(started_cluster): for i in range(10): col = random.choice(["a", "b", "c"]) cursor.execute("UPDATE test_data_types SET {} = {};".format(col, i)) - cursor.execute( - """UPDATE test_data_types SET i = '2020-12-12';""".format(col, i) - ) + cursor.execute("UPDATE test_data_types SET i = '2020-12-12';") check_tables_are_synchronized(instance, "test_data_types", "id") @@ -452,7 +450,7 @@ def test_many_concurrent_queries(started_cluster): # also change primary key value print("try update primary key {}".format(thread_id)) cursor.execute( - "UPDATE {table}_{} SET key=key%100000+100000*{} WHERE key%{}=0".format( + "UPDATE {} SET key=key%100000+100000*{} WHERE key%{}=0".format( table_name, i + 1, i + 1 ) ) diff --git a/tests/integration/test_scheduler/test.py b/tests/integration/test_scheduler/test.py index e6def99c076..8e37bd8d403 100644 --- a/tests/integration/test_scheduler/test.py +++ b/tests/integration/test_scheduler/test.py @@ -6,6 +6,7 @@ import time import threading import pytest +from helpers.client import QueryRuntimeException from helpers.cluster import ClickHouseCluster cluster = ClickHouseCluster(__file__) diff --git a/tests/integration/test_storage_hudi/test.py b/tests/integration/test_storage_hudi/test.py index 6fe7a193129..0c3fbfb3cda 100644 --- a/tests/integration/test_storage_hudi/test.py +++ b/tests/integration/test_storage_hudi/test.py @@ -4,7 +4,7 @@ import os import json import helpers.client -from helpers.cluster import ClickHouseCluster +from helpers.cluster import ClickHouseCluster, ClickHouseInstance from helpers.test_tools import TSV from helpers.s3_tools import prepare_s3_bucket, upload_directory, get_file_contents diff --git a/tests/integration/test_storage_iceberg/test.py b/tests/integration/test_storage_iceberg/test.py index d9dee0541b0..7762d17b96f 100644 --- a/tests/integration/test_storage_iceberg/test.py +++ b/tests/integration/test_storage_iceberg/test.py @@ 
-1,5 +1,5 @@ import helpers.client -from helpers.cluster import ClickHouseCluster +from helpers.cluster import ClickHouseCluster, ClickHouseInstance from helpers.test_tools import TSV import pyspark diff --git a/tests/integration/test_storage_rabbitmq/test.py b/tests/integration/test_storage_rabbitmq/test.py index 23a95d5dd71..3240039ee81 100644 --- a/tests/integration/test_storage_rabbitmq/test.py +++ b/tests/integration/test_storage_rabbitmq/test.py @@ -702,7 +702,7 @@ def test_rabbitmq_sharding_between_queues_publish(rabbitmq_cluster): assert ( int(result1) == messages_num * threads_num - ), "ClickHouse lost some messages: {}".format(result) + ), "ClickHouse lost some messages: {}".format(result1) assert int(result2) == 10 @@ -1516,7 +1516,7 @@ def test_rabbitmq_hash_exchange(rabbitmq_cluster): assert ( int(result1) == messages_num * threads_num - ), "ClickHouse lost some messages: {}".format(result) + ), "ClickHouse lost some messages: {}".format(result1) assert int(result2) == 4 * num_tables @@ -1966,7 +1966,7 @@ def test_rabbitmq_many_consumers_to_each_queue(rabbitmq_cluster): assert ( int(result1) == messages_num * threads_num - ), "ClickHouse lost some messages: {}".format(result) + ), "ClickHouse lost some messages: {}".format(result1) # 4 tables, 2 consumers for each table => 8 consumer tags assert int(result2) == 8 @@ -2427,9 +2427,7 @@ def test_rabbitmq_drop_table_properly(rabbitmq_cluster): time.sleep(30) try: - exists = channel.queue_declare( - callback, queue="rabbit_queue_drop", passive=True - ) + exists = channel.queue_declare(queue="rabbit_queue_drop", passive=True) except Exception as e: exists = False @@ -3364,7 +3362,7 @@ def test_rabbitmq_flush_by_block_size(rabbitmq_cluster): routing_key="", body=json.dumps({"key": 0, "value": 0}), ) - except e: + except Exception as e: logging.debug(f"Got error: {str(e)}") produce_thread = threading.Thread(target=produce) @@ -3442,7 +3440,7 @@ def test_rabbitmq_flush_by_time(rabbitmq_cluster): ) logging.debug("Produced a message") time.sleep(0.8) - except e: + except Exception as e: logging.debug(f"Got error: {str(e)}") produce_thread = threading.Thread(target=produce) diff --git a/tests/integration/test_ttl_move/test.py b/tests/integration/test_ttl_move/test.py index 94432b89ab6..3b79ea7916d 100644 --- a/tests/integration/test_ttl_move/test.py +++ b/tests/integration/test_ttl_move/test.py @@ -1850,7 +1850,7 @@ class TestCancelBackgroundMoving: config = inspect.cleandoc( f""" - { 256 * 1024 } + {256 * 1024} """ ) diff --git a/tests/integration/test_ttl_replicated/test.py b/tests/integration/test_ttl_replicated/test.py index f944adbea41..538322473ee 100644 --- a/tests/integration/test_ttl_replicated/test.py +++ b/tests/integration/test_ttl_replicated/test.py @@ -325,7 +325,7 @@ def optimize_with_retry(node, table_name, retry=20): settings={"optimize_throw_if_noop": "1"}, ) break - except e: + except: time.sleep(0.5) From a474816fc744088ae0c300971de5043a5c054c72 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 29 Dec 2023 16:18:03 +0100 Subject: [PATCH 071/133] Add missing botocore import into clickhouse_backupview.py Signed-off-by: Azat Khuzhin --- utils/backupview/clickhouse_backupview.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/backupview/clickhouse_backupview.py b/utils/backupview/clickhouse_backupview.py index 4ba1f391d02..d1331e2ab49 100755 --- a/utils/backupview/clickhouse_backupview.py +++ b/utils/backupview/clickhouse_backupview.py @@ -8,6 +8,7 @@ import shutil import zipfile # For reading backups from 
zip archives import boto3 # For reading backups from S3 +import botocore ## Examples: From b2535d7f508c189c9fcbf871c3b60ac722afdaf7 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 5 Jun 2024 09:50:39 +0200 Subject: [PATCH 072/133] Fix invalid escape sequence warnings Signed-off-by: Azat Khuzhin --- tests/integration/helpers/uclient.py | 4 ++-- tests/integration/test_prometheus_endpoint/test.py | 2 +- .../0_stateless/01056_window_view_proc_hop_watch.py | 6 +++--- .../01059_window_view_event_hop_watch_strict_asc.py | 8 ++++---- .../01062_window_view_event_hop_watch_asc.py | 6 +++--- .../01065_window_view_event_hop_watch_bounded.py | 4 ++-- .../0_stateless/01069_window_view_proc_tumble_watch.py | 8 ++++---- .../0_stateless/01070_window_view_watch_events.py | 6 +++--- .../0_stateless/01078_window_view_alter_query_watch.py | 10 +++++----- .../0_stateless/01082_window_view_watch_limit.py | 4 ++-- tests/queries/0_stateless/01921_test_progress_bar.py | 4 ++-- tests/queries/0_stateless/02473_infile_progress.py | 4 ++-- tests/queries/0_stateless/helpers/client.py | 4 ++-- tests/queries/0_stateless/helpers/shell.py | 2 +- 14 files changed, 36 insertions(+), 36 deletions(-) diff --git a/tests/integration/helpers/uclient.py b/tests/integration/helpers/uclient.py index 45c8b8f64e2..195eb52ffeb 100644 --- a/tests/integration/helpers/uclient.py +++ b/tests/integration/helpers/uclient.py @@ -8,7 +8,7 @@ sys.path.insert(0, os.path.join(CURDIR)) from . import uexpect -prompt = ":\) " +prompt = ":\\) " end_of_block = r".*\r\n.*\r\n" @@ -21,7 +21,7 @@ class client(object): self.client.eol("\r") self.client.logger(log, prefix=name) self.client.timeout(20) - self.client.expect("[#\$] ", timeout=2) + self.client.expect("[#\\$] ", timeout=2) self.client.send(command) def __enter__(self): diff --git a/tests/integration/test_prometheus_endpoint/test.py b/tests/integration/test_prometheus_endpoint/test.py index f140ebdfbe7..c1f04497b55 100644 --- a/tests/integration/test_prometheus_endpoint/test.py +++ b/tests/integration/test_prometheus_endpoint/test.py @@ -28,7 +28,7 @@ def parse_response_line(line): if line.startswith("#"): return {} - match = re.match("^([a-zA-Z_:][a-zA-Z0-9_:]+)(\{.*\})? -?(\d)", line) + match = re.match(r"^([a-zA-Z_:][a-zA-Z0-9_:]+)(\{.*\})? 
-?(\d)", line) assert match, line name, _, val = match.groups() return {name: int(val)} diff --git a/tests/queries/0_stateless/01056_window_view_proc_hop_watch.py b/tests/queries/0_stateless/01056_window_view_proc_hop_watch.py index 2db14fcdddf..e65650816ab 100755 --- a/tests/queries/0_stateless/01056_window_view_proc_hop_watch.py +++ b/tests/queries/0_stateless/01056_window_view_proc_hop_watch.py @@ -49,16 +49,16 @@ with client(name="client1>", log=log) as client1, client( client1.send("WATCH 01056_window_view_proc_hop_watch.wv") client1.expect("Query id" + end_of_block) - client1.expect("Progress: 0.00 rows.*\)") + client1.expect("Progress: 0.00 rows.*\\)") client2.send( "INSERT INTO 01056_window_view_proc_hop_watch.mt VALUES (1, now('US/Samoa') + 3)" ) client1.expect("1" + end_of_block) - client1.expect("Progress: 1.00 rows.*\)") + client1.expect("Progress: 1.00 rows.*\\)") # send Ctrl-C client1.send("\x03", eol="") - match = client1.expect("(%s)|([#\$] )" % prompt) + match = client1.expect("(%s)|([#\\$] )" % prompt) if match.groups()[1]: client1.send(client1.command) client1.expect(prompt) diff --git a/tests/queries/0_stateless/01059_window_view_event_hop_watch_strict_asc.py b/tests/queries/0_stateless/01059_window_view_event_hop_watch_strict_asc.py index 2323ee5c838..3dbb176b0dc 100755 --- a/tests/queries/0_stateless/01059_window_view_event_hop_watch_strict_asc.py +++ b/tests/queries/0_stateless/01059_window_view_event_hop_watch_strict_asc.py @@ -47,7 +47,7 @@ with client(name="client1>", log=log) as client1, client( client1.send("WATCH db_01059_event_hop_watch_strict_asc.wv") client1.expect("Query id" + end_of_block) - client1.expect("Progress: 0.00 rows.*\)") + client1.expect("Progress: 0.00 rows.*\\)") client2.send( "INSERT INTO db_01059_event_hop_watch_strict_asc.mt VALUES (1, toDateTime('1990/01/01 12:00:00', 'US/Samoa'));" ) @@ -57,7 +57,7 @@ with client(name="client1>", log=log) as client1, client( ) client2.expect("Ok.") client1.expect("1*1990-01-01 12:00:02" + end_of_block) - client1.expect("Progress: 1.00 rows.*\)") + client1.expect("Progress: 1.00 rows.*\\)") client2.send( "INSERT INTO db_01059_event_hop_watch_strict_asc.mt VALUES (1, toDateTime('1990/01/01 12:00:10', 'US/Samoa'));" @@ -65,11 +65,11 @@ with client(name="client1>", log=log) as client1, client( client2.expect("Ok.") client1.expect("1*1990-01-01 12:00:06" + end_of_block) client1.expect("1*1990-01-01 12:00:08" + end_of_block) - client1.expect("Progress: 3.00 rows.*\)") + client1.expect("Progress: 3.00 rows.*\\)") # send Ctrl-C client1.send("\x03", eol="") - match = client1.expect("(%s)|([#\$] )" % prompt) + match = client1.expect("(%s)|([#\\$] )" % prompt) if match.groups()[1]: client1.send(client1.command) client1.expect(prompt) diff --git a/tests/queries/0_stateless/01062_window_view_event_hop_watch_asc.py b/tests/queries/0_stateless/01062_window_view_event_hop_watch_asc.py index db9e8cef6c5..d6cc3ee1a88 100755 --- a/tests/queries/0_stateless/01062_window_view_event_hop_watch_asc.py +++ b/tests/queries/0_stateless/01062_window_view_event_hop_watch_asc.py @@ -49,7 +49,7 @@ with client(name="client1>", log=log) as client1, client( client1.send("WATCH 01062_window_view_event_hop_watch_asc.wv") client1.expect("Query id" + end_of_block) - client1.expect("Progress: 0.00 rows.*\)") + client1.expect("Progress: 0.00 rows.*\\)") client2.send( "INSERT INTO 01062_window_view_event_hop_watch_asc.mt VALUES (1, toDateTime('1990/01/01 12:00:00', 'US/Samoa'));" ) @@ -69,11 +69,11 @@ with client(name="client1>", log=log) as 
client1, client( client2.expect(prompt) client1.expect("1" + end_of_block) client1.expect("2" + end_of_block) - client1.expect("Progress: 3.00 rows.*\)") + client1.expect("Progress: 3.00 rows.*\\)") # send Ctrl-C client1.send("\x03", eol="") - match = client1.expect("(%s)|([#\$] )" % prompt) + match = client1.expect("(%s)|([#\\$] )" % prompt) if match.groups()[1]: client1.send(client1.command) client1.expect(prompt) diff --git a/tests/queries/0_stateless/01065_window_view_event_hop_watch_bounded.py b/tests/queries/0_stateless/01065_window_view_event_hop_watch_bounded.py index b8d5ff02d37..e5f9ab59f60 100755 --- a/tests/queries/0_stateless/01065_window_view_event_hop_watch_bounded.py +++ b/tests/queries/0_stateless/01065_window_view_event_hop_watch_bounded.py @@ -50,7 +50,7 @@ with client(name="client1>", log=log) as client1, client( client1.send("WATCH 01065_window_view_event_hop_watch_bounded.wv") client1.expect("Query id" + end_of_block) - client1.expect("Progress: 0.00 rows.*\)") + client1.expect("Progress: 0.00 rows.*\\)") client2.send( "INSERT INTO 01065_window_view_event_hop_watch_bounded.mt VALUES (1, '1990/01/01 12:00:00');" ) @@ -72,7 +72,7 @@ with client(name="client1>", log=log) as client1, client( # send Ctrl-C client1.send("\x03", eol="") - match = client1.expect("(%s)|([#\$] )" % prompt) + match = client1.expect("(%s)|([#\\$] )" % prompt) if match.groups()[1]: client1.send(client1.command) client1.expect(prompt) diff --git a/tests/queries/0_stateless/01069_window_view_proc_tumble_watch.py b/tests/queries/0_stateless/01069_window_view_proc_tumble_watch.py index 21c2e831afc..8c3a46992dc 100755 --- a/tests/queries/0_stateless/01069_window_view_proc_tumble_watch.py +++ b/tests/queries/0_stateless/01069_window_view_proc_tumble_watch.py @@ -49,23 +49,23 @@ with client(name="client1>", log=log) as client1, client( client1.send("WATCH 01069_window_view_proc_tumble_watch.wv") client1.expect("Query id" + end_of_block) - client1.expect("Progress: 0.00 rows.*\)") + client1.expect("Progress: 0.00 rows.*\\)") client2.send( "INSERT INTO 01069_window_view_proc_tumble_watch.mt VALUES (1, now('US/Samoa') + 3)" ) client2.expect("Ok.") client1.expect("1" + end_of_block) - client1.expect("Progress: 1.00 rows.*\)") + client1.expect("Progress: 1.00 rows.*\\)") client2.send( "INSERT INTO 01069_window_view_proc_tumble_watch.mt VALUES (1, now('US/Samoa') + 3)" ) client2.expect("Ok.") client1.expect("1" + end_of_block) - client1.expect("Progress: 2.00 rows.*\)") + client1.expect("Progress: 2.00 rows.*\\)") # send Ctrl-C client1.send("\x03", eol="") - match = client1.expect("(%s)|([#\$] )" % prompt) + match = client1.expect("(%s)|([#\\$] )" % prompt) if match.groups()[1]: client1.send(client1.command) client1.expect(prompt) diff --git a/tests/queries/0_stateless/01070_window_view_watch_events.py b/tests/queries/0_stateless/01070_window_view_watch_events.py index 1cf7678a014..172a82a29da 100755 --- a/tests/queries/0_stateless/01070_window_view_watch_events.py +++ b/tests/queries/0_stateless/01070_window_view_watch_events.py @@ -49,7 +49,7 @@ with client(name="client1>", log=log) as client1, client( client1.send("WATCH 01070_window_view_watch_events.wv EVENTS") client1.expect("Query id" + end_of_block) - client1.expect("Progress: 0.00 rows.*\)") + client1.expect("Progress: 0.00 rows.*\\)") client2.send( "INSERT INTO 01070_window_view_watch_events.mt VALUES (1, toDateTime('1990/01/01 12:00:00', 'US/Samoa'));" ) @@ -59,11 +59,11 @@ with client(name="client1>", log=log) as client1, client( ) 
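# --- Editorial aside, not part of the original patch -----------------------
# Every hunk in this patch makes the same mechanical change: "\)" becomes
# "\\)" and "[#\$] " becomes "[#\\$] " inside ordinary string literals.
# "\)" is an invalid escape sequence in Python: since 3.6 it emits a
# DeprecationWarning (a SyntaxWarning in newer versions), even though the
# backslash usually still reaches the regex engine intact. A minimal sketch
# of the two equivalent, warning-free spellings (the sample line is a
# hypothetical progress message, not real test output):
import re
_line = "Progress: 1.00 rows, 8.00 KB (123.45 rows/s.)"
assert re.search("Progress: 1\\.00 rows.*\\)", _line)  # doubled backslash
assert re.search(r"Progress: 1\.00 rows.*\)", _line)   # raw string, same regex
# ----------------------------------------------------------------------------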
client2.expect("Ok.") client1.expect("1990-01-01 12:00:05" + end_of_block) - client1.expect("Progress: 1.00 rows.*\)") + client1.expect("Progress: 1.00 rows.*\\)") # send Ctrl-C client1.send("\x03", eol="") - match = client1.expect("(%s)|([#\$] )" % prompt) + match = client1.expect("(%s)|([#\\$] )" % prompt) if match.groups()[1]: client1.send(client1.command) client1.expect(prompt) diff --git a/tests/queries/0_stateless/01078_window_view_alter_query_watch.py b/tests/queries/0_stateless/01078_window_view_alter_query_watch.py index 3f3dfe0cda8..05aeb1b4ccb 100755 --- a/tests/queries/0_stateless/01078_window_view_alter_query_watch.py +++ b/tests/queries/0_stateless/01078_window_view_alter_query_watch.py @@ -55,7 +55,7 @@ with client(name="client1>", log=log) as client1, client( client1.send("WATCH 01078_window_view_alter_query_watch.wv") client1.expect("Query id" + end_of_block) - client1.expect("Progress: 0.00 rows.*\)") + client1.expect("Progress: 0.00 rows.*\\)") client2.send( "INSERT INTO 01078_window_view_alter_query_watch.mt VALUES (1, toDateTime('1990/01/01 12:00:00', 'US/Samoa'));" ) @@ -65,7 +65,7 @@ with client(name="client1>", log=log) as client1, client( ) client2.expect("Ok.") client1.expect("1" + end_of_block) - client1.expect("Progress: 1.00 rows.*\)") + client1.expect("Progress: 1.00 rows.*\\)") client2.send( "ALTER TABLE 01078_window_view_alter_query_watch.wv MODIFY QUERY SELECT count(a) * 2 AS count, hopEnd(wid) AS w_end FROM 01078_window_view_alter_query_watch.mt GROUP BY hop(timestamp, INTERVAL '2' SECOND, INTERVAL '3' SECOND, 'US/Samoa') AS wid" ) @@ -75,7 +75,7 @@ with client(name="client1>", log=log) as client1, client( client1.expect(prompt) client3.send("WATCH 01078_window_view_alter_query_watch.wv") client3.expect("Query id" + end_of_block) - client3.expect("Progress: 0.00 rows.*\)") + client3.expect("Progress: 0.00 rows.*\\)") client2.send( "INSERT INTO 01078_window_view_alter_query_watch.mt VALUES (1, toDateTime('1990/01/01 12:00:06', 'US/Samoa'));" ) @@ -85,11 +85,11 @@ with client(name="client1>", log=log) as client1, client( ) client2.expect("Ok.") client3.expect("2" + end_of_block) - client3.expect("Progress: 1.00 rows.*\)") + client3.expect("Progress: 1.00 rows.*\\)") # send Ctrl-C client3.send("\x03", eol="") - match = client3.expect("(%s)|([#\$] )" % prompt) + match = client3.expect("(%s)|([#\\$] )" % prompt) if match.groups()[1]: client3.send(client3.command) client3.expect(prompt) diff --git a/tests/queries/0_stateless/01082_window_view_watch_limit.py b/tests/queries/0_stateless/01082_window_view_watch_limit.py index 9938ebcab98..5dcdfdb5020 100755 --- a/tests/queries/0_stateless/01082_window_view_watch_limit.py +++ b/tests/queries/0_stateless/01082_window_view_watch_limit.py @@ -49,7 +49,7 @@ with client(name="client1>", log=log) as client1, client( client1.send("WATCH 01082_window_view_watch_limit.wv LIMIT 1") client1.expect("Query id" + end_of_block) - client1.expect("Progress: 0.00 rows.*\)") + client1.expect("Progress: 0.00 rows.*\\)") client2.send( "INSERT INTO 01082_window_view_watch_limit.mt VALUES (1, '1990/01/01 12:00:00');" ) @@ -59,7 +59,7 @@ with client(name="client1>", log=log) as client1, client( ) client2.expect("Ok.") client1.expect("1" + end_of_block) - client1.expect("Progress: 1.00 rows.*\)") + client1.expect("Progress: 1.00 rows.*\\)") client1.expect("1 row" + end_of_block) client1.expect(prompt) diff --git a/tests/queries/0_stateless/01921_test_progress_bar.py b/tests/queries/0_stateless/01921_test_progress_bar.py index 
54c7ae59894..6406534a647 100755 --- a/tests/queries/0_stateless/01921_test_progress_bar.py +++ b/tests/queries/0_stateless/01921_test_progress_bar.py @@ -15,6 +15,6 @@ log = None with client(name="client1>", log=log) as client1: client1.expect(prompt) client1.send("SELECT number FROM numbers(1000) FORMAT Null") - client1.expect("Progress: 1\.00 thousand rows, 8\.00 KB .*" + end_of_block) - client1.expect("0 rows in set. Elapsed: [\\w]{1}\.[\\w]{3} sec.") + client1.expect("Progress: 1\\.00 thousand rows, 8\\.00 KB .*" + end_of_block) + client1.expect("0 rows in set. Elapsed: [\\w]{1}\\.[\\w]{3} sec.") client1.expect("Peak memory usage: .*B" + end_of_block) diff --git a/tests/queries/0_stateless/02473_infile_progress.py b/tests/queries/0_stateless/02473_infile_progress.py index 9941736107f..4165eeb6d31 100755 --- a/tests/queries/0_stateless/02473_infile_progress.py +++ b/tests/queries/0_stateless/02473_infile_progress.py @@ -32,12 +32,12 @@ with client( ) client1.expect(prompt) client1.send(f"INSERT INTO test.infile_progress FROM INFILE '{filename}'") - client1.expect("Progress: 5.00 rows, 10.00 B.*\)") + client1.expect("Progress: 5.00 rows, 10.00 B.*\\)") client1.expect(prompt) # send Ctrl-C client1.send("\x03", eol="") - match = client1.expect("(%s)|([#\$] )" % prompt) + match = client1.expect("(%s)|([#\\$] )" % prompt) if match.groups()[1]: client1.send(client1.command) client1.expect(prompt) diff --git a/tests/queries/0_stateless/helpers/client.py b/tests/queries/0_stateless/helpers/client.py index 5c8589dfca1..ac0896f2e93 100644 --- a/tests/queries/0_stateless/helpers/client.py +++ b/tests/queries/0_stateless/helpers/client.py @@ -8,7 +8,7 @@ sys.path.insert(0, os.path.join(CURDIR)) import uexpect -prompt = ":\) " +prompt = ":\\) " end_of_block = r".*\r\n.*\r\n" @@ -21,7 +21,7 @@ class client(object): self.client.eol("\r") self.client.logger(log, prefix=name) self.client.timeout(120) - self.client.expect("[#\$] ", timeout=60) + self.client.expect("[#\\$] ", timeout=60) self.client.send(command) def __enter__(self): diff --git a/tests/queries/0_stateless/helpers/shell.py b/tests/queries/0_stateless/helpers/shell.py index befb3dcd543..c3fff61ffc9 100644 --- a/tests/queries/0_stateless/helpers/shell.py +++ b/tests/queries/0_stateless/helpers/shell.py @@ -10,7 +10,7 @@ import uexpect class shell(object): - def __init__(self, command=None, name="", log=None, prompt="[#\$] "): + def __init__(self, command=None, name="", log=None, prompt="[#\\$] "): if command is None: command = ["/bin/bash", "--noediting"] self.prompt = prompt From 11905682a9facddcde8296309e97dedee5479afb Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 29 Dec 2023 14:51:24 +0100 Subject: [PATCH 073/133] Check python code with flake8 Recently assert-on-tuple had been introduced in tests [1], let's prevent this. 
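(Editorial note, not part of the original commit message: the "assert-on-tuple" mistake referenced above is easy to illustrate. Applying assert to a parenthesized condition-plus-message pair asserts a non-empty tuple, which is always truthy, so the check can never fail. The snippet below is a hypothetical sketch, not code taken from the ClickHouse tests.)

rows_read = 5
# Bug: this asserts a 2-tuple, which is truthy no matter what the condition says.
assert (rows_read == 3, "unexpected row count")
# Intended form: bare condition first, message as the second operand of assert.
assert rows_read == 5, "unexpected row count"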
[1]: https://github.com/ClickHouse/ClickHouse/pull/56367#discussion_r1437098533 v2: pin flake8 to 4.0.1 (instead of originally 6.1) due to other dependencies, hope that it will find such errors Signed-off-by: Azat Khuzhin --- docker/test/style/Dockerfile | 1 + docker/test/style/run.sh | 2 + docs/en/development/continuous-integration.md | 3 + utils/check-style/check-flake8 | 55 +++++++++++++++++++ .../check-style/process_style_check_result.py | 1 + 5 files changed, 62 insertions(+) create mode 100755 utils/check-style/check-flake8 diff --git a/docker/test/style/Dockerfile b/docker/test/style/Dockerfile index cb29185f068..91768c8328d 100644 --- a/docker/test/style/Dockerfile +++ b/docker/test/style/Dockerfile @@ -30,6 +30,7 @@ RUN pip3 install \ mypy==1.8.0 \ pylint==3.1.0 \ python-magic==0.4.24 \ + flake8==4.0.1 \ requests \ thefuzz \ types-requests \ diff --git a/docker/test/style/run.sh b/docker/test/style/run.sh index cc6cb292b66..64803191532 100755 --- a/docker/test/style/run.sh +++ b/docker/test/style/run.sh @@ -9,6 +9,8 @@ echo "Check style" | ts ./check-style -n |& tee /test_output/style_output.txt echo "Check python formatting with black" | ts ./check-black -n |& tee /test_output/black_output.txt +echo "Check python with flake8" | ts +./check-flake8 |& tee /test_output/flake8_output.txt echo "Check python type hinting with mypy" | ts ./check-mypy -n |& tee /test_output/mypy_output.txt echo "Check typos" | ts diff --git a/docs/en/development/continuous-integration.md b/docs/en/development/continuous-integration.md index c348eb5ca07..c283cfbf4c2 100644 --- a/docs/en/development/continuous-integration.md +++ b/docs/en/development/continuous-integration.md @@ -91,6 +91,9 @@ cd ./utils/check-style # Check python type hinting with mypy ./check-mypy +# Check python with flake8 +./check-flake8 + # Check code with codespell ./check-typos diff --git a/utils/check-style/check-flake8 b/utils/check-style/check-flake8 new file mode 100755 index 00000000000..58dd8a99d40 --- /dev/null +++ b/utils/check-style/check-flake8 @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +function join_by() { local IFS="$1"; shift; echo "$*"; } + +set -e + +# We check only our code, that's why we skip contrib +GIT_ROOT=$(git rev-parse --show-cdup) +GIT_ROOT=${GIT_ROOT:-./} + +# Find all *.py, *.python files and executable files without extension +# that are determined as python scripts by 'file' util +# in the repo except the contrib directory. +find_cmd=( + find "$GIT_ROOT" -type f -not -path "${GIT_ROOT}contrib/*" + \( + \( + -name '*.py' -or -name "*.python" -or + \( + -executable -not -name "*.*" -exec sh -c 'file {} | grep -q "Python script"' \; + \) + \) + # We skip modules generated by the protocol buffer compiler from *.proto files. 
+ -and -not -name '*_pb2.py' -and -not -name '*_pb2_grpc.py' + \) -print0 +) + +ignores=( + E101 # Indentation contains mixed spaces and tabs + E203 # Whitespace before ':' + E226 # missing whitespace around arithmetic operator + E266 # Too many leading '#' for block comment + E401 # Multiple imports on one line + E402 # Module level import not at top of file + E501 # line too long + E711 # Comparison to None should be 'cond is None:' + E712 # Comparison to true should be 'if cond is true:' or 'if cond:' + E713 # Test for membership should be 'not in' + E714 # Test for object identity should be 'is not' + E722 # Do not use bare except, specify exception instead + E731 # Do not assign a lambda expression, use a def + E741 # Do not use variables named 'I', 'O', or 'l' + F401 # Module imported but unused + F403 # 'from module import *' used; unable to detect undefined names + F405 # Name may be undefined, or defined from star imports: module + F522 # .format(...) unused named arguments + F541 # f-string without any placeholders + F811 # redefinition of unused name from line N + F841 # local variable name is assigned to but never used + W191 # Indentation contains tabs + W291 # Trailing whitespace + W293 # Blank line contains whitespace + W503 # Line break occurred before a binary operator +) +"${find_cmd[@]}" | xargs -0 flake8 --ignore "$(join_by , "${ignores[@]}")" diff --git a/utils/check-style/process_style_check_result.py b/utils/check-style/process_style_check_result.py index e603084732d..2c349114a59 100755 --- a/utils/check-style/process_style_check_result.py +++ b/utils/check-style/process_style_check_result.py @@ -18,6 +18,7 @@ def process_result(result_folder): "style", "pylint", "black", + "flake8", "mypy", "typos", "whitespaces", From 59784a4cf2eb337016bcc960898d3f50e0d87d65 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 5 Jun 2024 15:04:15 +0200 Subject: [PATCH 074/133] Upload blob_storage_log from stateless tests --- docker/test/stateless/run.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 4d2c2e6f466..f94621ba092 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -285,7 +285,7 @@ stop_logs_replication # Try to get logs while server is running failed_to_save_logs=0 -for table in query_log zookeeper_log trace_log transactions_info_log metric_log +for table in query_log zookeeper_log trace_log transactions_info_log metric_log blob_storage_log do err=$(clickhouse-client -q "select * from system.$table into outfile '/test_output/$table.tsv.gz' format TSVWithNamesAndTypes") echo "$err" @@ -339,7 +339,7 @@ if [ $failed_to_save_logs -ne 0 ]; then # directly # - even though ci auto-compress some files (but not *.tsv) it does this only # for files >64MB, we want this files to be compressed explicitly - for table in query_log zookeeper_log trace_log transactions_info_log metric_log + for table in query_log zookeeper_log trace_log transactions_info_log metric_log blob_storage_log do clickhouse-local "$data_path_config" --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||: if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then From 1c346d5c2efbcfece843fd4a37557725cd8529d9 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 5 Jun 2024 15:04:48 +0200 Subject: [PATCH 075/133] Bump From 5e9a41bd9f0642513a11a67cd0cb3a21e5697775 Mon Sep 17 00:00:00 2001 
From: Anton Popov Date: Wed, 5 Jun 2024 13:10:37 +0000 Subject: [PATCH 076/133] fix untacked memory in MemoryTrackerSwitcher --- src/Common/MemoryTrackerSwitcher.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Common/MemoryTrackerSwitcher.h b/src/Common/MemoryTrackerSwitcher.h index 3c99fd12353..796b5295a83 100644 --- a/src/Common/MemoryTrackerSwitcher.h +++ b/src/Common/MemoryTrackerSwitcher.h @@ -15,6 +15,7 @@ struct MemoryTrackerSwitcher return; auto * thread_tracker = CurrentThread::getMemoryTracker(); + prev_untracked_memory = current_thread->untracked_memory; prev_memory_tracker_parent = thread_tracker->getParent(); @@ -31,8 +32,10 @@ struct MemoryTrackerSwitcher CurrentThread::flushUntrackedMemory(); auto * thread_tracker = CurrentThread::getMemoryTracker(); - current_thread->untracked_memory = prev_untracked_memory; + /// It is important to set untracked memory after the call of + /// 'setParent' because it may flush untracked memory to the wrong parent. thread_tracker->setParent(prev_memory_tracker_parent); + current_thread->untracked_memory = prev_untracked_memory; } private: From 2a30c77346dec01ecec931ac79f08948f451be40 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 5 Jun 2024 15:21:51 +0200 Subject: [PATCH 077/133] Fix compatibility --- src/Storages/S3Queue/S3QueueSettings.h | 2 +- src/Storages/S3Queue/StorageS3Queue.cpp | 18 ++++++++---------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/src/Storages/S3Queue/S3QueueSettings.h b/src/Storages/S3Queue/S3QueueSettings.h index c486a7fbb5d..4a92d99c411 100644 --- a/src/Storages/S3Queue/S3QueueSettings.h +++ b/src/Storages/S3Queue/S3QueueSettings.h @@ -13,7 +13,7 @@ class ASTStorage; #define S3QUEUE_RELATED_SETTINGS(M, ALIAS) \ M(S3QueueMode, \ mode, \ - S3QueueMode::UNORDERED, \ + S3QueueMode::ORDERED, \ "With unordered mode, the set of all already processed files is tracked with persistent nodes in ZooKepeer." \ "With ordered mode, only the max name of the successfully consumed file stored.", \ 0) \ diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index 0844d0a479e..afb75a21b21 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -71,8 +71,14 @@ namespace return zkutil::extractZooKeeperPath(result_zk_path, true); } - void checkAndAdjustSettings(S3QueueSettings & s3queue_settings, const Settings & settings) + void checkAndAdjustSettings(S3QueueSettings & s3queue_settings, const Settings & settings, bool is_attach) { + if (!is_attach && !s3queue_settings.mode.changed) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `mode` (Unordered/Ordered) is not specified, but is required."); + } + /// In case !is_attach, we leave Ordered mode as default for compatibility. + if (!s3queue_settings.s3queue_processing_threads_num) { throw Exception(ErrorCodes::BAD_ARGUMENTS, "Setting `s3queue_processing_threads_num` cannot be set to zero"); @@ -125,15 +131,7 @@ StorageS3Queue::StorageS3Queue( throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "S3Queue url must either end with '/' or contain globs"); } - if (mode == LoadingStrictnessLevel::CREATE - && !context_->getSettingsRef().s3queue_allow_experimental_sharded_mode - && s3queue_settings->mode == S3QueueMode::ORDERED - && (s3queue_settings->s3queue_buckets > 1 || s3queue_settings->s3queue_processing_threads_num > 1)) - { - throw Exception(ErrorCodes::QUERY_NOT_ALLOWED, "S3Queue sharded mode is not allowed. 
To enable use `s3queue_allow_experimental_sharded_mode`"); - } - - checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef()); + checkAndAdjustSettings(*s3queue_settings, context_->getSettingsRef(), mode > LoadingStrictnessLevel::CREATE); object_storage = configuration->createObjectStorage(context_, /* is_readonly */true); FormatFactory::instance().checkFormatName(configuration->format); From 8bdd291049bf2a2988b7e8f33c4f353744f0b0fc Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 5 Jun 2024 13:27:07 +0000 Subject: [PATCH 078/133] Simplify handling of old 'inverted' indexes --- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- .../MergeTree/MergeTreeIndexFullText.cpp | 2 ++ .../02346_fulltext_index_old_name.sql | 30 +++++++------------ 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index f2e03ca41bd..a89b507fda7 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -754,7 +754,7 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Experimental full-text index feature is not enabled (the setting 'allow_experimental_full_text_index')"); /// ---- /// Temporary check during a transition period. Please remove at the end of 2024. - if (index_desc.type == INVERTED_INDEX_NAME && settings.allow_experimental_inverted_index) /// The funny condition is not a mistake, see 02346_fulltext_index_old_name.sql + if (index_desc.type == INVERTED_INDEX_NAME && !settings.allow_experimental_inverted_index) throw Exception(ErrorCodes::ILLEGAL_INDEX, "Please use index type 'full_text' instead of 'inverted'"); /// ---- if (index_desc.type == "annoy" && !settings.allow_experimental_annoy_index) diff --git a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp index af9ee710f88..451971cff98 100644 --- a/src/Storages/MergeTree/MergeTreeIndexFullText.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexFullText.cpp @@ -742,6 +742,7 @@ bool MergeTreeConditionFullText::tryPrepareSetGinFilter( MergeTreeIndexGranulePtr MergeTreeIndexFullText::createIndexGranule() const { + /// ------ /// Index type 'inverted' was renamed to 'full_text' in May 2024. /// Tables with old indexes can be loaded during a transition period. We still want let users know that they should drop existing /// indexes and re-create them. Function `createIndexGranule` is called whenever the index is used by queries. Reject the query if we @@ -749,6 +750,7 @@ MergeTreeIndexGranulePtr MergeTreeIndexFullText::createIndexGranule() const /// TODO: remove this at the end of 2024. if (index.type == INVERTED_INDEX_NAME) throw Exception(ErrorCodes::ILLEGAL_INDEX, "Indexes of type 'inverted' are no longer supported. Please drop and recreate the index as type 'full-text'"); + /// ------ return std::make_shared(index.name, index.column_names.size(), params); } diff --git a/tests/queries/0_stateless/02346_fulltext_index_old_name.sql b/tests/queries/0_stateless/02346_fulltext_index_old_name.sql index bc641caf237..4e52e689211 100644 --- a/tests/queries/0_stateless/02346_fulltext_index_old_name.sql +++ b/tests/queries/0_stateless/02346_fulltext_index_old_name.sql @@ -1,22 +1,16 @@ +-- Index type 'inverted' was renamed to 'full_text' in April 2024. +-- Such indexes are experimental. Test what happens when ClickHouse encounters tables with the old index type. 
+ DROP TABLE IF EXISTS tab; --- Index type 'inverted' was renamed to 'full_text' in April 2024. --- Such indexes are experimental. Nevertheless test what happens when ClickHouse encounters tables with the old index type. +-- It must be possible to load old tables with 'inverted'-type indexes +-- In stateless tests, we cannot use old persistences. Emulate "loading an old index" by creating it (internally, similar code executes). --- Create a full text index with the old type --- This was how it was done in the old days. These days this throws an exception. -SET allow_experimental_inverted_index = 1; -CREATE TABLE tab(k UInt64, s String, INDEX idx(s) TYPE inverted(2)) ENGINE = MergeTree() ORDER BY k; -- { serverError ILLEGAL_INDEX }; - --- There are unfortunately side effects of this behavior. In particular, if ClickHouse's automatic table load during --- startup finds a table with 'inverted'-type indexes created by an older version, it immediately halts as it thinks --- the persistence is corrupt. Similarly (but less severely), tables with 'inverted' index cannot be attached. --- A backdoor avoids this. Just set allow_experimental_inverted_index = 0 (which is the default). --- --- Note that the backdoor will exist only temporarily during a transition period. It will be removed in future. Its only purpose is --- to simplify the migrationn of experimental inverted indexes to experimental full-text indexes instead of simply breaking existing --- tables. +-- Creation only works with the (old) setting enabled. SET allow_experimental_inverted_index = 0; +CREATE TABLE tab(k UInt64, s String, INDEX idx(s) TYPE inverted(2)) ENGINE = MergeTree() ORDER BY k; -- { serverError ILLEGAL_INDEX } + +SET allow_experimental_inverted_index = 1; CREATE TABLE tab(k UInt64, s String, INDEX idx(s) TYPE inverted(2)) ENGINE = MergeTree() ORDER BY k; INSERT INTO tab VALUES (1, 'ab') (2, 'bc'); @@ -24,14 +18,12 @@ INSERT INTO tab VALUES (1, 'ab') (2, 'bc'); DETACH TABLE tab; ATTACH TABLE tab; --- No, the backdoor does not make 'inverted' indexes non-experimental. --- On the one hand, the backdoor is undocumented, on the other hand, SELECTs that use such indexes now throw an exception, --- making 'inverted' indexes useless. +-- To encourage users to migrate to the new index type, we now throw an exception when the index is used by queries. SELECT * from tab WHERE s = 'bc'; -- { serverError ILLEGAL_INDEX } -- The exception recommends to drop the index and create a 'full_text' index instead. Let's try. 
ALTER TABLE tab DROP INDEX idx; -SET allow_experimental_full_text_index = 1; -- note that this is a different setting +SET allow_experimental_full_text_index = 1; -- the new setting ALTER TABLE tab ADD INDEX idx(s) TYPE full_text(2); SELECT * from tab WHERE s = 'bc'; From 98b780569792240979cde999afd69dff7a12781d Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 5 Jun 2024 16:19:17 +0200 Subject: [PATCH 079/133] Fix unused field --- src/Storages/S3Queue/S3QueueMetadata.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/S3Queue/S3QueueMetadata.h b/src/Storages/S3Queue/S3QueueMetadata.h index ef4a9808c68..25d01fb52b9 100644 --- a/src/Storages/S3Queue/S3QueueMetadata.h +++ b/src/Storages/S3Queue/S3QueueMetadata.h @@ -82,7 +82,6 @@ private: const fs::path zookeeper_path; const size_t buckets_num; - bool initialized = false; LoggerPtr log; std::atomic_bool shutdown_called = false; From e29185f05e285083f4348d8fd60dc76244589b6a Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 5 Jun 2024 17:02:09 +0200 Subject: [PATCH 080/133] Replace Markdown with YAML --- .github/ISSUE_TEMPLATE/10_question.md | 20 -------------------- .github/ISSUE_TEMPLATE/10_question.yaml | 21 +++++++++++++++++++++ 2 files changed, 21 insertions(+), 20 deletions(-) delete mode 100644 .github/ISSUE_TEMPLATE/10_question.md create mode 100644 .github/ISSUE_TEMPLATE/10_question.yaml diff --git a/.github/ISSUE_TEMPLATE/10_question.md b/.github/ISSUE_TEMPLATE/10_question.md deleted file mode 100644 index 08a05a844e0..00000000000 --- a/.github/ISSUE_TEMPLATE/10_question.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -name: Question -about: Ask a question about ClickHouse -title: '' -labels: question -assignees: '' - ---- - -> Make sure to check documentation https://clickhouse.com/docs/en/ first. If the question is concise and probably has a short answer, asking it in [community Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-1gh9ds7f4-PgDhJAaF8ad5RbWBAAjzFg) is probably the fastest way to find the answer. For more complicated questions, consider asking them on StackOverflow with "clickhouse" tag https://stackoverflow.com/questions/tagged/clickhouse - -> If you still prefer GitHub issues, remove all this text and ask your question here. - -**Company or project name** - -Put your company name or project description here - -**Question** - -Your question diff --git a/.github/ISSUE_TEMPLATE/10_question.yaml b/.github/ISSUE_TEMPLATE/10_question.yaml new file mode 100644 index 00000000000..68904348796 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/10_question.yaml @@ -0,0 +1,21 @@ +name: Question +description: Ask a question about ClickHouse +title: "" +labels: ["question"] +body: + - type: markdown + attributes: + value: | + > Make sure to check documentation https://clickhouse.com/docs/en/ first. If the question is concise and probably has a short answer, asking it in [community Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-1gh9ds7f4-PgDhJAaF8ad5RbWBAAjzFg) is probably the fastest way to find the answer. For more complicated questions, consider asking them on StackOverflow with "clickhouse" tag https://stackoverflow.com/questions/tagged/clickhouse + - type: textarea + attributes: + label: Company or project name + description: Put your company name or project description here. + validations: + required: false + - type: textarea + attributes: + label: Question + description: Please put your question here. 
+    validations:
+      required: false

From 4b27d38fe6de18de312aa178cfd4c44004860e35 Mon Sep 17 00:00:00 2001
From: Nikita Mikhaylov
Date: Wed, 5 Jun 2024 17:13:30 +0200
Subject: [PATCH 081/133] Update 10_question.yaml

---
 .github/ISSUE_TEMPLATE/10_question.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/ISSUE_TEMPLATE/10_question.yaml b/.github/ISSUE_TEMPLATE/10_question.yaml
index 68904348796..1e5ddd4de0f 100644
--- a/.github/ISSUE_TEMPLATE/10_question.yaml
+++ b/.github/ISSUE_TEMPLATE/10_question.yaml
@@ -1,6 +1,5 @@
 name: Question
 description: Ask a question about ClickHouse
-title: ""
 labels: ["question"]
 body:
   - type: markdown

From 3cd699fc806320a10693e248a6c5ac218ab1578d Mon Sep 17 00:00:00 2001
From: xogoodnow
Date: Wed, 5 Jun 2024 18:48:45 +0330
Subject: [PATCH 082/133] Added listen_try and listen_reuse_port parameters

---
 .../settings.md | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md
index a5fe74fd0c6..b7e1a9aa4d7 100644
--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@@ -1353,6 +1353,26 @@ Examples:
 127.0.0.1
 ```
 
+## listen_try {#listen_try}
+
+Server does not exit if IPv6 or IPv4 networks are unavailable while trying to listen.
+
+Examples:
+
+``` xml
+0
+```
+
+## listen_reuse_port {#listen_reuse_port}
+
+Allow multiple servers to listen on the same address:port. Enabling this setting is not recommended.
+
+Examples:
+
+``` xml
+0
+```
+
 ## listen_backlog {#listen_backlog}
 
 Backlog (queue size of pending connections) of the listen socket.

From 5aec5ea8ef1744ad106710afe56ac8d3d0c0b069 Mon Sep 17 00:00:00 2001
From: xogoodnow
Date: Wed, 5 Jun 2024 19:15:00 +0330
Subject: [PATCH 083/133] Added mlock_executable parameter

---
 .../server-configuration-parameters/settings.md | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md
index a5fe74fd0c6..74424c946cb 100644
--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@@ -1206,6 +1206,16 @@ Expired time for HSTS in seconds. The default value is 0 means clickhouse disabl
 600000
 ```
 
+## mlock_executable {#mlock_executable}
+
+Perform mlockall after startup to lower the latency of the first queries and to prevent the clickhouse executable from being paged out under high IO load. Enabling this option is recommended, but it will increase startup time by up to a few seconds.
+Keep in mind that this parameter does not work without the "CAP_IPC_LOCK" capability.
+**Example**
+
+``` xml
+false
+```
+
 ## include_from {#include_from}
 
 The path to the file with substitutions. Both XML and YAML formats are supported.
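For reference, the server settings documented in the last two patches all live at the top level of the server configuration. A minimal config.xml sketch combining them; the values shown are illustrative, not recommendations:

``` xml
<clickhouse>
    <!-- Do not abort startup if only one of the IPv4/IPv6 stacks is available -->
    <listen_try>1</listen_try>

    <!-- Keep disabled unless several server processes must share one address:port -->
    <listen_reuse_port>0</listen_reuse_port>

    <!-- Lock the binary in memory; requires the CAP_IPC_LOCK capability -->
    <mlock_executable>true</mlock_executable>
</clickhouse>
```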
From 9324784ceb5dcb836a3c9dd2ebaa78b23956ad01 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 5 Jun 2024 17:54:44 +0200 Subject: [PATCH 084/133] Update settings.md --- .../en/operations/server-configuration-parameters/settings.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index b7e1a9aa4d7..0569f94e59d 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1355,7 +1355,7 @@ Examples: ## listen_try {#listen_try} -Server does not exit if IPv6 or IPv4 networks are unavailable while trying to listen. +The server will not exit if IPv6 or IPv4 networks are unavailable while trying to listen. Examples: @@ -1365,7 +1365,7 @@ Examples: ## listen_reuse_port {#listen_reuse_port} -Allow multiple servers to listen on the same address:port. Enabling this setting is not recommended. +Allow multiple servers to listen on the same address:port. Requests will be routed to a random server by the operating system. Enabling this setting is not recommended. Examples: From b2144b45d0b7a64f098540f8b23d2b8b6749856e Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 5 Jun 2024 18:00:03 +0200 Subject: [PATCH 085/133] Update 10_question.yaml --- .github/ISSUE_TEMPLATE/10_question.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/10_question.yaml b/.github/ISSUE_TEMPLATE/10_question.yaml index 1e5ddd4de0f..a651392ca0d 100644 --- a/.github/ISSUE_TEMPLATE/10_question.yaml +++ b/.github/ISSUE_TEMPLATE/10_question.yaml @@ -17,4 +17,4 @@ body: label: Question description: Please put your question here. 
validations: - required: false + required: true From 5ec3699a197ce843ba1d019d6e3ea594d46ac8c5 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 5 Jun 2024 18:36:13 +0200 Subject: [PATCH 086/133] Fix test --- .../integration/test_mask_sensitive_info/test.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_mask_sensitive_info/test.py b/tests/integration/test_mask_sensitive_info/test.py index 251da7e4e09..d2562f3966a 100644 --- a/tests/integration/test_mask_sensitive_info/test.py +++ b/tests/integration/test_mask_sensitive_info/test.py @@ -195,10 +195,10 @@ def test_create_table(): f"DeltaLake('http://minio1:9001/root/data/test11.csv.gz', 'minio', '{password}')", "DNS_ERROR", ), - f"S3Queue('http://minio1:9001/root/data/', 'CSV')", - f"S3Queue('http://minio1:9001/root/data/', 'CSV', 'gzip')", - f"S3Queue('http://minio1:9001/root/data/', 'minio', '{password}', 'CSV')", - f"S3Queue('http://minio1:9001/root/data/', 'minio', '{password}', 'CSV', 'gzip')", + f"S3Queue('http://minio1:9001/root/data/', 'CSV') settings mode = 'ordered'", + f"S3Queue('http://minio1:9001/root/data/', 'CSV', 'gzip') settings mode = 'ordered'", + f"S3Queue('http://minio1:9001/root/data/', 'minio', '{password}', 'CSV') settings mode = 'ordered'", + f"S3Queue('http://minio1:9001/root/data/', 'minio', '{password}', 'CSV', 'gzip') settings mode = 'ordered'", ] def make_test_case(i): @@ -258,10 +258,10 @@ def test_create_table(): "CREATE TABLE table14 (x int) ENGINE = S3('http://minio1:9001/root/data/test9.csv.gz', 'NOSIGN', 'CSV', 'gzip')", "CREATE TABLE table15 (`x` int) ENGINE = S3('http://minio1:9001/root/data/test10.csv.gz', 'minio', '[HIDDEN]')", "CREATE TABLE table16 (`x` int) ENGINE = DeltaLake('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", - "CREATE TABLE table17 (x int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'CSV')", - "CREATE TABLE table18 (x int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'CSV', 'gzip')", - "CREATE TABLE table19 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV')", - "CREATE TABLE table20 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV', 'gzip')", + "CREATE TABLE table17 (x int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'CSV') settings mode = 'ordered'", + "CREATE TABLE table18 (x int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'CSV', 'gzip') settings mode = 'ordered'", + "CREATE TABLE table19 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV') settings mode = 'ordered'", + "CREATE TABLE table20 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV', 'gzip') settings mode = 'ordered'", ], must_not_contain=[password], ) From 66a2962ccef3f64f3c51178955d2839739d3d882 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 4 Apr 2024 20:26:33 +0200 Subject: [PATCH 087/133] Add reason into "Part {} is broken and need manual correction" message Signed-off-by: Azat Khuzhin --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 4c8f1240cf5..143394b1171 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -737,7 +737,11 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks { /// Don't scare people with 
broken part error
         if (!isRetryableException(std::current_exception()))
-            LOG_ERROR(storage.log, "Part {} is broken and need manual correction", getDataPartStorage().getFullPath());
+        {
+            auto message = getCurrentExceptionMessage(true);
+            LOG_ERROR(storage.log, "Part {} is broken and need manual correction. Reason: {}",
+                getDataPartStorage().getFullPath(), message);
+        }
 
         // There could be conditions that data part to be loaded is broken, but some of meta infos are already written
         // into meta data before exception, need to clean them all.

From 78088ce59a9562e3c805ab54147c68c888228615 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Wed, 27 Mar 2024 11:51:59 +0100
Subject: [PATCH 088/133] Reduce lock contention for MergeTree tables (by renaming parts without holding lock)

Under heavy load (or under a lighter load with fsync_part_directory=1), the
time that renameTo() holds DataPartsLock grows, and this affects almost every
operation on the table. On one of the production clusters I saw ~60 seconds
with fsync_part_directory=1.

Move the renameTo() call out of the critical section.

v2: instead of using DataPartsLock.lock.lock()/unlock(), move the renameTo()
into MergeTreeData::Transaction::commit()

Signed-off-by: Azat Khuzhin
---
 src/Storages/MergeTree/MergeTreeData.cpp | 14 +++++++++-----
 src/Storages/MergeTree/MergeTreeData.h   |  4 +++-
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index cd706dab9ae..1042dca4bd0 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -3906,12 +3906,9 @@ void MergeTreeData::preparePartForCommit(MutableDataPartPtr & part, Transaction
         return !may_be_cleaned_up || temporary_parts.contains(dir_name);
     }());
 
-    if (need_rename)
-        part->renameTo(part->name, true);
-
     LOG_TEST(log, "preparePartForCommit: inserting {} into data_parts_indexes", part->getNameWithState());
     data_parts_indexes.insert(part);
-    out_transaction.addPart(part);
+    out_transaction.addPart(part, need_rename);
 }
 
 bool MergeTreeData::addTempPart(
@@ -6617,9 +6614,11 @@ TransactionID MergeTreeData::Transaction::getTID() const
     return Tx::PrehistoricTID;
 }
 
-void MergeTreeData::Transaction::addPart(MutableDataPartPtr & part)
+void MergeTreeData::Transaction::addPart(MutableDataPartPtr & part, bool need_rename)
 {
     precommitted_parts.insert(part);
+    if (need_rename)
+        precommitted_parts_need_rename.insert(part);
 }
 
 void MergeTreeData::Transaction::rollback(DataPartsLock * lock)
@@ -6665,7 +6664,9 @@ void MergeTreeData::Transaction::rollback(DataPartsLock * lock)
 
 void MergeTreeData::Transaction::clear()
 {
+    chassert(precommitted_parts.size() >= precommitted_parts_need_rename.size());
     precommitted_parts.clear();
+    precommitted_parts_need_rename.clear();
 }
 
 MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(DataPartsLock * acquired_parts_lock)
@@ -6682,6 +6683,9 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(DataPartsLock
             if (part->getDataPartStorage().hasActiveTransaction())
                 part->getDataPartStorage().commitTransaction();
 
+        for (const auto & part_need_rename : precommitted_parts_need_rename)
+            part_need_rename->renameTo(part_need_rename->name, true);
+
         if (txn)
         {
             for (const auto & part : precommitted_parts)
diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h
index c6f736a4afd..6abdebbe98d 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ 
-255,7 +255,7 @@ public: DataPartsVector commit(DataPartsLock * acquired_parts_lock = nullptr); - void addPart(MutableDataPartPtr & part); + void addPart(MutableDataPartPtr & part, bool need_rename); void rollback(DataPartsLock * lock = nullptr); @@ -286,7 +286,9 @@ public: MergeTreeData & data; MergeTreeTransaction * txn; + MutableDataParts precommitted_parts; + MutableDataParts precommitted_parts_need_rename; MutableDataParts locked_parts; }; From 6c3db34aaeb0d45a573daf341900dc9ae1b0cb50 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 27 Mar 2024 12:38:24 +0100 Subject: [PATCH 089/133] Remove unused locked_parts from MergeTreeData::Transaction Signed-off-by: Azat Khuzhin --- src/Storages/MergeTree/MergeTreeData.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 6abdebbe98d..e4009107093 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -289,8 +289,6 @@ public: MutableDataParts precommitted_parts; MutableDataParts precommitted_parts_need_rename; - MutableDataParts locked_parts; - }; using TransactionUniquePtr = std::unique_ptr; From ee546fa00a72a29f8b91f3cfff77caa37fd598c5 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 28 Mar 2024 09:04:36 +0100 Subject: [PATCH 090/133] Fix replacing parts with empty Signed-off-by: Azat Khuzhin --- src/Storages/MergeTree/MergeTreeData.cpp | 10 ++++++++-- src/Storages/MergeTree/MergeTreeData.h | 2 ++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 1042dca4bd0..5ea9f012e8d 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4244,6 +4244,7 @@ MergeTreeData::PartsToRemoveFromZooKeeper MergeTreeData::removePartsInRangeFromW MergeTreeData::Transaction transaction(*this, NO_TRANSACTION_RAW); renameTempPartAndAdd(new_data_part, transaction, lock); /// All covered parts must be already removed + transaction.renameParts(); /// It will add the empty part to the set of Outdated parts without making it Active (exactly what we need) transaction.rollback(&lock); new_data_part->remove_time.store(0, std::memory_order_relaxed); @@ -6669,6 +6670,12 @@ void MergeTreeData::Transaction::clear() precommitted_parts_need_rename.clear(); } +void MergeTreeData::Transaction::renameParts() +{ + for (const auto & part_need_rename : precommitted_parts_need_rename) + part_need_rename->renameTo(part_need_rename->name, true); +} + MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(DataPartsLock * acquired_parts_lock) { DataPartsVector total_covered_parts; @@ -6683,8 +6690,7 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(DataPartsLock if (part->getDataPartStorage().hasActiveTransaction()) part->getDataPartStorage().commitTransaction(); - for (const auto & part_need_rename : precommitted_parts_need_rename) - part_need_rename->renameTo(part_need_rename->name, true); + renameParts(); if (txn) { diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index e4009107093..29818a24331 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -255,6 +255,8 @@ public: DataPartsVector commit(DataPartsLock * acquired_parts_lock = nullptr); + void renameParts(); + void addPart(MutableDataPartPtr & part, bool need_rename); void rollback(DataPartsLock * lock = nullptr); From 
b41d08a2b618ec98de0a02862fa1e5463e01f364 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Thu, 28 Mar 2024 21:07:06 +0100
Subject: [PATCH 091/133] Use renameParts() explicitly to avoid leaving parts
 in detached

Since there is an assertion that does not allow removing detached parts
during cleanup, which sounds good in general but breaks this new code.

Signed-off-by: Azat Khuzhin
---
 src/Storages/MergeTree/MergeTreeData.cpp           | 1 +
 src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 5ea9f012e8d..a323266b0a8 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -6674,6 +6674,7 @@ void MergeTreeData::Transaction::renameParts()
 {
     for (const auto & part_need_rename : precommitted_parts_need_rename)
         part_need_rename->renameTo(part_need_rename->name, true);
+    precommitted_parts_need_rename.clear();
 }
 
 MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(DataPartsLock * acquired_parts_lock)
diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
index 4b4f4c33e7d..215239ff401 100644
--- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
@@ -903,6 +903,9 @@ std::pair, bool> ReplicatedMergeTreeSinkImpl::
         throw;
     }
 
+    /// Rename parts before committing to ZooKeeper without holding DataPartsLock.
+    transaction.renameParts();
+
     ThreadFuzzer::maybeInjectSleep();
     fiu_do_on(FailPoints::replicated_merge_tree_commit_zk_fail_after_op, { zookeeper->forceFailureAfterOperation(); });
 

From ca2c720d0ecf22c57a4ed7e5405d3f146348b884 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Fri, 29 Mar 2024 17:48:12 +0100
Subject: [PATCH 092/133] Avoid race between cleanup thread and
 renameMergedTemporaryPart()

The problem was that with this patch set renameMergedTemporaryPart() is
called without temporary_directory_lock held (in MergeTask), since the lock
is reset just before calling renameMergedTemporaryPart(), and this can be
seen in logs:

2024.03.29 19:56:42.126919 [ 1341 ] {ea7a3fd2-cf47-4ec7-91a5-51c69fba1b95::-8_0_138_2_2} test_btnct5cr.alter_table_0 (ea7a3fd2-cf47-4ec7-91a5-51c69fba1b95) (MergerMutator): Merged 50 parts: [-8_0_0_0_2, -8_138_138_0] -> -8_0_138_2_2
2024.03.29 19:56:42.127034 [ 1341 ] {ea7a3fd2-cf47-4ec7-91a5-51c69fba1b95::-8_0_138_2_2} test_btnct5cr.alter_table_0 (ea7a3fd2-cf47-4ec7-91a5-51c69fba1b95): Committing part -8_0_138_2_2 to zookeeper
2024.03.29 19:56:42.128462 [ 884 ] {} test_btnct5cr.alter_table_0 (ea7a3fd2-cf47-4ec7-91a5-51c69fba1b95): Removing temporary directory /var/lib/clickhouse/store/ea7/ea7a3fd2-cf47-4ec7-91a5-51c69fba1b95/tmp_merge_-8_0_138_2_2/
2024.03.29 19:56:42.128647 [ 1341 ] {ea7a3fd2-cf47-4ec7-91a5-51c69fba1b95::-8_0_138_2_2} test_btnct5cr.alter_table_0 (ea7a3fd2-cf47-4ec7-91a5-51c69fba1b95): Part -8_0_138_2_2 committed to zookeeper
...
2024.03.29 19:56:54.586084 [ 57841 ] {bf240267-0620-4294-afc1-479c58e6be89} executeQuery: std::exception. Code: 1001, type: std::__1::__fs::filesystem::filesystem_error, e.what() = filesystem error: in file_size: No such file or directory ["/var/lib/clickhouse/store/ea7/ea7a3fd2-cf47-4ec7-91a5-51c69fba1b95/-8_0_138_2_2/data.cmrk3"]

This should fix failures of 00993_system_parts_race_condition_drop_zookeeper
in [1].
[1]: https://s3.amazonaws.com/clickhouse-test-reports/61973/f6f826c85dd5b7bb8db16286fd10dcf441a440f7/stateless_tests__coverage__[4_6].html Though now it looks hackish... Signed-off-by: Azat Khuzhin --- src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 2d49e1df19b..b3fd6c3edb1 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -750,6 +750,9 @@ MergeTreeData::DataPartPtr MergeTreeDataMergerMutator::renameMergedTemporaryPart /// Rename new part, add to the set and remove original parts. auto replaced_parts = data.renameTempPartAndReplace(new_data_part, out_transaction); + /// Explicitly rename part while still holding the lock for tmp folder to avoid cleanup + out_transaction.renameParts(); + /// Let's check that all original parts have been deleted and only them. if (replaced_parts.size() != parts.size()) { From 3675c27fe9e64c7f30cc84c1418afdde5817ff23 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 4 Apr 2024 20:50:30 +0200 Subject: [PATCH 093/133] Require explicit rename of parts in transaction Signed-off-by: Azat Khuzhin --- src/Storages/MergeTree/MergeTreeData.cpp | 34 +++++++++++++------ src/Storages/MergeTree/MergeTreeData.h | 16 ++++++--- .../MergeTree/MergeTreeDataMergerMutator.cpp | 2 +- src/Storages/MergeTree/MergeTreeSink.cpp | 3 +- .../MergeTree/MutateFromLogEntryTask.cpp | 3 +- .../MergeTree/MutatePlainMergeTreeTask.cpp | 3 +- .../MergeTree/ReplicatedMergeTreeSink.cpp | 2 +- src/Storages/StorageMergeTree.cpp | 14 ++++---- src/Storages/StorageReplicatedMergeTree.cpp | 20 ++++++----- 9 files changed, 61 insertions(+), 36 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index a323266b0a8..e18d2a57a6d 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3894,7 +3894,7 @@ void MergeTreeData::checkPartDynamicColumns(MutableDataPartPtr & part, DataParts } } -void MergeTreeData::preparePartForCommit(MutableDataPartPtr & part, Transaction & out_transaction, bool need_rename) +void MergeTreeData::preparePartForCommit(MutableDataPartPtr & part, Transaction & out_transaction, bool need_rename, bool rename_in_transaction) { part->is_temp = false; part->setState(DataPartState::PreActive); @@ -3906,9 +3906,15 @@ void MergeTreeData::preparePartForCommit(MutableDataPartPtr & part, Transaction return !may_be_cleaned_up || temporary_parts.contains(dir_name); }()); + if (need_rename && !rename_in_transaction) + part->renameTo(part->name, true); + LOG_TEST(log, "preparePartForCommit: inserting {} into data_parts_indexes", part->getNameWithState()); data_parts_indexes.insert(part); - out_transaction.addPart(part, need_rename); + if (rename_in_transaction) + out_transaction.addPart(part, need_rename); + else + out_transaction.addPart(part, /* need_rename= */ false); } bool MergeTreeData::addTempPart( @@ -3957,7 +3963,8 @@ bool MergeTreeData::renameTempPartAndReplaceImpl( MutableDataPartPtr & part, Transaction & out_transaction, DataPartsLock & lock, - DataPartsVector * out_covered_parts) + DataPartsVector * out_covered_parts, + bool rename_in_transaction) { LOG_TRACE(log, "Renaming temporary part {} to {} with tid {}.", part->getDataPartStorage().getPartDirectory(), part->name, out_transaction.getTID()); @@ -3996,7 +4003,7 @@ bool 
MergeTreeData::renameTempPartAndReplaceImpl( /// All checks are passed. Now we can rename the part on disk. /// So, we maintain invariant: if a non-temporary part in filesystem then it is in data_parts - preparePartForCommit(part, out_transaction, /* need_rename */ true); + preparePartForCommit(part, out_transaction, /* need_rename= */ true, rename_in_transaction); if (out_covered_parts) { @@ -4011,29 +4018,31 @@ bool MergeTreeData::renameTempPartAndReplaceUnlocked( MutableDataPartPtr & part, Transaction & out_transaction, DataPartsLock & lock, - DataPartsVector * out_covered_parts) + bool rename_in_transaction) { - return renameTempPartAndReplaceImpl(part, out_transaction, lock, out_covered_parts); + return renameTempPartAndReplaceImpl(part, out_transaction, lock, /*out_covered_parts=*/ nullptr, rename_in_transaction); } MergeTreeData::DataPartsVector MergeTreeData::renameTempPartAndReplace( MutableDataPartPtr & part, - Transaction & out_transaction) + Transaction & out_transaction, + bool rename_in_transaction) { auto part_lock = lockParts(); DataPartsVector covered_parts; - renameTempPartAndReplaceImpl(part, out_transaction, part_lock, &covered_parts); + renameTempPartAndReplaceImpl(part, out_transaction, part_lock, &covered_parts, rename_in_transaction); return covered_parts; } bool MergeTreeData::renameTempPartAndAdd( MutableDataPartPtr & part, Transaction & out_transaction, - DataPartsLock & lock) + DataPartsLock & lock, + bool rename_in_transaction) { DataPartsVector covered_parts; - if (!renameTempPartAndReplaceImpl(part, out_transaction, lock, &covered_parts)) + if (!renameTempPartAndReplaceImpl(part, out_transaction, lock, &covered_parts, rename_in_transaction)) return false; if (!covered_parts.empty()) @@ -4242,7 +4251,7 @@ MergeTreeData::PartsToRemoveFromZooKeeper MergeTreeData::removePartsInRangeFromW auto [new_data_part, tmp_dir_holder] = createEmptyPart(empty_info, partition, empty_part_name, NO_TRANSACTION_PTR); MergeTreeData::Transaction transaction(*this, NO_TRANSACTION_RAW); - renameTempPartAndAdd(new_data_part, transaction, lock); /// All covered parts must be already removed + renameTempPartAndAdd(new_data_part, transaction, lock, /*rename_in_transaction=*/ true); /// All covered parts must be already removed transaction.renameParts(); /// It will add the empty part to the set of Outdated parts without making it Active (exactly what we need) @@ -6683,6 +6692,9 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(DataPartsLock if (!isEmpty()) { + if (!precommitted_parts_need_rename.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Parts not renamed"); + auto settings = data.getSettings(); auto parts_lock = acquired_parts_lock ? DataPartsLock() : data.lockParts(); auto * owing_parts_lock = acquired_parts_lock ? acquired_parts_lock : &parts_lock; diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 29818a24331..d9c53863a4f 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -590,20 +590,22 @@ public: bool renameTempPartAndAdd( MutableDataPartPtr & part, Transaction & transaction, - DataPartsLock & lock); + DataPartsLock & lock, + bool rename_in_transaction); /// The same as renameTempPartAndAdd but the block range of the part can contain existing parts. /// Returns all parts covered by the added part (in ascending order). 
DataPartsVector renameTempPartAndReplace(
         MutableDataPartPtr & part,
-        Transaction & out_transaction);
+        Transaction & out_transaction,
+        bool rename_in_transaction);
 
     /// Unlocked version of previous one. Useful when added multiple parts with a single lock.
     bool renameTempPartAndReplaceUnlocked(
         MutableDataPartPtr & part,
         Transaction & out_transaction,
         DataPartsLock & lock,
-        DataPartsVector * out_covered_parts = nullptr);
+        bool rename_in_transaction);
 
     /// Remove parts from working set immediately (without wait for background
     /// process). Transfer part state to temporary. Have very limited usage only
@@ -1604,7 +1606,10 @@ private:
 
     /// Preparing itself to be committed in memory: fill some fields inside part, add it to data_parts_indexes
     /// in precommitted state and to transaction
-    void preparePartForCommit(MutableDataPartPtr & part, Transaction & out_transaction, bool need_rename);
+    ///
+    /// @param need_rename - rename the part
+    /// @param rename_in_transaction - if set, the rename is done later, as part of the transaction (without holding DataPartsLock); otherwise it is done immediately, in place (for callers where deferring makes no sense).
+    void preparePartForCommit(MutableDataPartPtr & part, Transaction & out_transaction, bool need_rename, bool rename_in_transaction = false);
 
     /// Low-level method for preparing parts for commit (in-memory).
     /// FIXME Merge MergeTreeTransaction and Transaction
@@ -1612,7 +1617,8 @@ private:
         MutableDataPartPtr & part,
         Transaction & out_transaction,
         DataPartsLock & lock,
-        DataPartsVector * out_covered_parts);
+        DataPartsVector * out_covered_parts,
+        bool rename_in_transaction);
 
     /// RAII Wrapper for atomic work with currently moving parts
     /// Acquire them in constructor and remove them in destructor
diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
index b3fd6c3edb1..791bcbc3275 100644
--- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
@@ -748,7 +748,7 @@ MergeTreeData::DataPartPtr MergeTreeDataMergerMutator::renameMergedTemporaryPart
                         "but transactions were enabled for this table");
 
     /// Rename new part, add to the set and remove original parts.
- auto replaced_parts = data.renameTempPartAndReplace(new_data_part, out_transaction); + auto replaced_parts = data.renameTempPartAndReplace(new_data_part, out_transaction, /*rename_in_transaction=*/ true); /// Explicitly rename part while still holding the lock for tmp folder to avoid cleanup out_transaction.renameParts(); diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index b7dede3cb00..dd28c04fef7 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -186,7 +186,8 @@ void MergeTreeSink::finishDelayedChunk() } } - added = storage.renameTempPartAndAdd(part, transaction, lock); + /// FIXME + added = storage.renameTempPartAndAdd(part, transaction, lock, /*rename_in_transaction=*/ false); transaction.commit(&lock); } diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp index 8d40658bb2c..5c59d5c1b47 100644 --- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp @@ -236,10 +236,11 @@ bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrit if (data_part_storage.hasActiveTransaction()) data_part_storage.precommitTransaction(); - storage.renameTempPartAndReplace(new_part, *transaction_ptr); + storage.renameTempPartAndReplace(new_part, *transaction_ptr, /*rename_in_transaction=*/ true); try { + transaction_ptr->renameParts(); storage.checkPartChecksumsAndCommit(*transaction_ptr, new_part, mutate_task->getHardlinkedFiles()); } catch (const Exception & e) diff --git a/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp b/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp index 2fd02708421..8a0d5c444bd 100644 --- a/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp +++ b/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp @@ -97,7 +97,8 @@ bool MutatePlainMergeTreeTask::executeStep() MergeTreeData::Transaction transaction(storage, merge_mutate_entry->txn.get()); /// FIXME Transactions: it's too optimistic, better to lock parts before starting transaction - storage.renameTempPartAndReplace(new_part, transaction); + storage.renameTempPartAndReplace(new_part, transaction, /*rename_in_transaction=*/ true); + transaction.renameParts(); transaction.commit(); storage.updateMutationEntriesErrors(future_part, true, ""); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 215239ff401..50142185f79 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -888,7 +888,7 @@ std::pair, bool> ReplicatedMergeTreeSinkImpl:: try { auto lock = storage.lockParts(); - storage.renameTempPartAndAdd(part, transaction, lock); + storage.renameTempPartAndAdd(part, transaction, lock, /*rename_in_transaction=*/ false); } catch (const Exception & e) { diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 27a76f4f21d..a85bc936031 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1788,7 +1788,7 @@ void StorageMergeTree::renameAndCommitEmptyParts(MutableDataPartsVector & new_pa for (auto & part: new_parts) { - DataPartsVector covered_parts_by_one_part = renameTempPartAndReplace(part, transaction); + DataPartsVector covered_parts_by_one_part = renameTempPartAndReplace(part, transaction, /*rename_in_transaction=*/ true); if (covered_parts_by_one_part.size() > 1) throw 
Exception(ErrorCodes::LOGICAL_ERROR, @@ -1798,10 +1798,10 @@ void StorageMergeTree::renameAndCommitEmptyParts(MutableDataPartsVector & new_pa std::move(covered_parts_by_one_part.begin(), covered_parts_by_one_part.end(), std::back_inserter(covered_parts)); } - LOG_INFO(log, "Remove {} parts by covering them with empty {} parts. With txn {}.", covered_parts.size(), new_parts.size(), transaction.getTID()); + transaction.renameParts(); transaction.commit(); /// Remove covered parts without waiting for old_parts_lifetime seconds. @@ -2064,7 +2064,7 @@ PartitionCommandsResultInfo StorageMergeTree::attachPartition( { auto lock = lockParts(); fillNewPartNameAndResetLevel(loaded_parts[i], lock); - renameTempPartAndAdd(loaded_parts[i], transaction, lock); + renameTempPartAndAdd(loaded_parts[i], transaction, lock, /*rename_in_transaction=*/ false); transaction.commit(&lock); } @@ -2180,8 +2180,9 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con for (auto part : dst_parts) { fillNewPartName(part, data_parts_lock); - renameTempPartAndReplaceUnlocked(part, transaction, data_parts_lock); + renameTempPartAndReplaceUnlocked(part, transaction, data_parts_lock, /*rename_in_transaction=*/ true); } + transaction.renameParts(); /// Populate transaction transaction.commit(&data_parts_lock); @@ -2284,10 +2285,9 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const for (auto & part : dst_parts) { dest_table_storage->fillNewPartName(part, dest_data_parts_lock); - dest_table_storage->renameTempPartAndReplaceUnlocked(part, transaction, dest_data_parts_lock); + dest_table_storage->renameTempPartAndReplaceUnlocked(part, transaction, dest_data_parts_lock, /*rename_in_transaction=*/ false); } - removePartsFromWorkingSet(local_context->getCurrentTransaction().get(), src_parts, true, src_data_parts_lock); transaction.commit(&src_data_parts_lock); } @@ -2447,7 +2447,7 @@ void StorageMergeTree::attachRestoredParts(MutableDataPartsVector && parts) { auto lock = lockParts(); fillNewPartName(part, lock); - renameTempPartAndAdd(part, transaction, lock); + renameTempPartAndAdd(part, transaction, lock, /*rename_in_transaction=*/ false); transaction.commit(&lock); } } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index e18e66d7af9..9ebca78d87a 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -2093,7 +2093,8 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry) Transaction transaction(*this, NO_TRANSACTION_RAW); part->version.setCreationTID(Tx::PrehistoricTID, nullptr); - renameTempPartAndReplace(part, transaction); + renameTempPartAndReplace(part, transaction, /*rename_in_transaction=*/ true); + transaction.renameParts(); checkPartChecksumsAndCommit(transaction, part); writePartLog(PartLogElement::Type::NEW_PART, {}, 0 /** log entry is fake so we don't measure the time */, @@ -2882,11 +2883,11 @@ bool StorageReplicatedMergeTree::executeReplaceRange(LogEntry & entry) Coordination::Requests ops; for (PartDescriptionPtr & part_desc : final_parts) { - renameTempPartAndReplace(part_desc->res_part, transaction); + renameTempPartAndReplace(part_desc->res_part, transaction, /*rename_in_transaction=*/ true); getCommitPartOps(ops, part_desc->res_part); - - lockSharedData(*part_desc->res_part, /* replace_existing_lock */ true, part_desc->hardlinked_files); + lockSharedData(*part_desc->res_part, /*replace_existing_lock=*/ true, 
part_desc->hardlinked_files); } + transaction.renameParts(); if (!ops.empty()) @@ -4958,7 +4959,8 @@ bool StorageReplicatedMergeTree::fetchPart( if (!to_detached) { Transaction transaction(*this, NO_TRANSACTION_RAW); - renameTempPartAndReplace(part, transaction); + renameTempPartAndReplace(part, transaction, /*rename_in_transaction=*/ true); + transaction.renameParts(); chassert(!part_to_clone || !is_zero_copy_part(part)); replaced_parts = checkPartChecksumsAndCommit(transaction, part, /*hardlinked_files*/ {}, /*replace_zero_copy_lock*/ true); @@ -8202,8 +8204,9 @@ void StorageReplicatedMergeTree::replacePartitionFrom( { auto data_parts_lock = lockParts(); for (auto & part : dst_parts) - renameTempPartAndReplaceUnlocked(part, transaction, data_parts_lock); + renameTempPartAndReplaceUnlocked(part, transaction, data_parts_lock, /*rename_in_transaction=*/ true); } + transaction.renameParts(); for (const auto & dst_part : dst_parts) lockSharedData(*dst_part, false, /*hardlinked_files*/ {}); @@ -8478,7 +8481,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta auto dest_data_parts_lock = dest_table_storage->lockParts(); for (auto & part : dst_parts) - dest_table_storage->renameTempPartAndReplaceUnlocked(part, transaction, dest_data_parts_lock); + dest_table_storage->renameTempPartAndReplaceUnlocked(part, transaction, dest_data_parts_lock, /*rename_in_transaction=*/ false); for (const auto & dst_part : dst_parts) dest_table_storage->lockSharedData(*dst_part, false, /*hardlinked_files*/ {}); @@ -10111,7 +10114,8 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP try { MergeTreeData::Transaction transaction(*this, NO_TRANSACTION_RAW); - auto replaced_parts = renameTempPartAndReplace(new_data_part, transaction); + auto replaced_parts = renameTempPartAndReplace(new_data_part, transaction, /*rename_in_transaction=*/ true); + transaction.renameParts(); if (!replaced_parts.empty()) { From 6f522c1d619c5ce67cc4d6758409addfaee3618d Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 5 Apr 2024 10:24:30 +0200 Subject: [PATCH 094/133] Do not remove detached parts in Transaction::rollback Signed-off-by: Azat Khuzhin --- .../MergeTree/DataPartStorageOnDiskBase.cpp | 16 ++++++-- .../MergeTree/DataPartStorageOnDiskBase.h | 1 + src/Storages/MergeTree/IDataPartStorage.h | 11 +++--- src/Storages/MergeTree/MergeTreeData.cpp | 39 ++++++++++++++++--- src/Storages/MergeTree/MergeTreeData.h | 2 +- 5 files changed, 55 insertions(+), 14 deletions(-) diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index 378a1944396..120e0a6f426 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -59,6 +59,16 @@ std::string DataPartStorageOnDiskBase::getRelativePath() const return fs::path(root_path) / part_dir / ""; } +std::string DataPartStorageOnDiskBase::getParentDirectory() const +{ + /// Cut last "/" if it exists (it shouldn't). Otherwise fs::path behave differently. + fs::path part_dir_without_slash = part_dir.ends_with("/") ? 
part_dir.substr(0, part_dir.size() - 1) : part_dir; + + if (part_dir_without_slash.has_parent_path()) + return part_dir_without_slash.parent_path(); + return ""; +} + std::optional DataPartStorageOnDiskBase::getRelativePathForPrefix(LoggerPtr log, const String & prefix, bool detached, bool broken) const { assert(!broken || detached); @@ -674,9 +684,9 @@ void DataPartStorageOnDiskBase::remove( if (!has_delete_prefix) { - if (part_dir_without_slash.has_parent_path()) + auto parent_path = getParentDirectory(); + if (!parent_path.empty()) { - auto parent_path = part_dir_without_slash.parent_path(); if (parent_path == MergeTreeData::DETACHED_DIR_NAME) throw Exception( ErrorCodes::LOGICAL_ERROR, @@ -684,7 +694,7 @@ void DataPartStorageOnDiskBase::remove( part_dir, root_path); - part_dir_without_slash = parent_path / ("delete_tmp_" + std::string{part_dir_without_slash.filename()}); + part_dir_without_slash = fs::path(parent_path) / ("delete_tmp_" + std::string{part_dir_without_slash.filename()}); } else { diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h index 81353d4e20b..44b2454e256 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h @@ -20,6 +20,7 @@ public: std::string getRelativePath() const override; std::string getPartDirectory() const override; std::string getFullRootPath() const override; + std::string getParentDirectory() const override; Poco::Timestamp getLastModified() const override; UInt64 calculateTotalSizeOnDisk() const override; diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index f6320a7e1e4..9342d6ca0ea 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -96,11 +96,12 @@ public: virtual MergeTreeDataPartStorageType getType() const = 0; /// Methods to get path components of a data part. - virtual std::string getFullPath() const = 0; /// '/var/lib/clickhouse/data/database/table/moving/all_1_5_1' - virtual std::string getRelativePath() const = 0; /// 'database/table/moving/all_1_5_1' - virtual std::string getPartDirectory() const = 0; /// 'all_1_5_1' - virtual std::string getFullRootPath() const = 0; /// '/var/lib/clickhouse/data/database/table/moving' - /// Can add it if needed /// 'database/table/moving' + virtual std::string getFullPath() const = 0; /// '/var/lib/clickhouse/data/database/table/moving/all_1_5_1' + virtual std::string getRelativePath() const = 0; /// 'database/table/moving/all_1_5_1' + virtual std::string getPartDirectory() const = 0; /// 'all_1_5_1' + virtual std::string getFullRootPath() const = 0; /// '/var/lib/clickhouse/data/database/table/moving' + virtual std::string getParentDirectory() const = 0; /// '' (or 'detached' for 'detached/all_1_5_1') + /// Can add it if needed /// 'database/table/moving' /// virtual std::string getRelativeRootPath() const = 0; /// Get a storage for projection. 
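The comment in the new getParentDirectory() helper about fs::path behaving differently refers to a real std::filesystem subtlety: with a trailing slash, parent_path() only strips the empty final component, so it returns the directory itself rather than its parent. A standalone illustration (not part of the patch):

``` cpp
#include <filesystem>
#include <iostream>

int main()
{
    namespace fs = std::filesystem;

    // "store/all_1_5_1/" ends with an empty filename, so parent_path()
    // returns the directory itself instead of its parent.
    std::cout << fs::path("store/all_1_5_1/").parent_path() << '\n'; // "store/all_1_5_1"

    // Without the trailing slash we get the actual parent directory.
    std::cout << fs::path("store/all_1_5_1").parent_path() << '\n';  // "store"
}
```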
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index e18d2a57a6d..522c9f8dd82 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -4083,9 +4083,9 @@ void MergeTreeData::removePartsFromWorkingSet(MergeTreeTransaction * txn, const
     resetObjectColumnsFromActiveParts(acquired_lock);
 }
 
-void MergeTreeData::removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove)
+void MergeTreeData::removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove, DataPartsLock * acquired_lock)
 {
-    auto lock = lockParts();
+    auto lock = (acquired_lock) ? DataPartsLock() : lockParts();
 
     for (const auto & part : remove)
     {
@@ -6635,16 +6635,41 @@ void MergeTreeData::Transaction::rollback(DataPartsLock * lock)
 {
     if (!isEmpty())
     {
+        for (const auto & part : precommitted_parts)
+            part->version.creation_csn.store(Tx::RolledBackCSN);
+
+        /// Remove detached parts from working set.
+        ///
+        /// It is possible to have detached parts here only when the rename (in
+        /// commit()) of detached parts was broken (i.e. during ATTACH),
+        /// i.e. the part itself is broken.
+        DataPartsVector detached_precommitted_parts;
+        for (auto it = precommitted_parts.begin(); it != precommitted_parts.end();)
+        {
+            const auto & part = *it;
+            if (part->getDataPartStorage().getParentDirectory() == DETACHED_DIR_NAME)
+            {
+                detached_precommitted_parts.push_back(part);
+                it = precommitted_parts.erase(it);
+            }
+            else
+                ++it;
+        }
+
         WriteBufferFromOwnString buf;
         buf << "Removing parts:";
         for (const auto & part : precommitted_parts)
             buf << " " << part->getDataPartStorage().getPartDirectory();
         buf << ".";
+        if (!detached_precommitted_parts.empty())
+        {
+            buf << " Rolling back parts state to temporary and removing from working set:";
+            for (const auto & part : detached_precommitted_parts)
+                buf << " " << part->getDataPartStorage().getPartDirectory();
+            buf << ".";
+        }
+
         LOG_DEBUG(data.log, "Undoing transaction {}. {}", getTID(), buf.str());
 
-        for (const auto & part : precommitted_parts)
-            part->version.creation_csn.store(Tx::RolledBackCSN);
-
         /// It would be much better with TSA...
         auto our_lock = (lock) ? DataPartsLock() : data.lockParts();
 
@@ -6663,6 +6688,10 @@ void MergeTreeData::Transaction::rollback(DataPartsLock * lock)
         }
         else
         {
+            data.removePartsFromWorkingSetImmediatelyAndSetTemporaryState(
+                detached_precommitted_parts,
+                &our_lock);
+
             data.removePartsFromWorkingSet(txn,
                     DataPartsVector(precommitted_parts.begin(), precommitted_parts.end()),
                     /* clear_without_timeout = */ true, &our_lock);
diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h
index d9c53863a4f..7881062b724 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ -610,7 +610,7 @@ public:
     /// Remove parts from working set immediately (without wait for background
     /// process). Transfer part state to temporary. Have very limited usage only
-    void removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove);
+    void removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove, DataPartsLock * acquired_lock = nullptr);
 
     /// Removes parts from the working set parts.
     /// Parts in add must already be in data_parts with PreActive, Active, or Outdated states.
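Taken together, the patches above implement one pattern: Transaction::addPart() only records which parts still need renaming, renameParts() performs the (possibly fsync-heavy) renames while no parts lock is held, and commit() does just fast bookkeeping under the lock. A standalone sketch of that pattern, using simplified stand-in types rather than ClickHouse's actual classes:

``` cpp
#include <iostream>
#include <mutex>
#include <set>
#include <string>

// Stand-in for IMergeTreeDataPart: renaming may be slow (e.g. with fsync).
struct Part
{
    std::string name;
    void renameTo(const std::string & new_name) { std::cout << "renamed " << new_name << '\n'; }
};

class Transaction
{
public:
    // Parts added with need_rename=true are only remembered here;
    // the actual rename is deferred to renameParts().
    void addPart(Part * part, bool need_rename)
    {
        precommitted_parts.insert(part);
        if (need_rename)
            precommitted_parts_need_rename.insert(part);
    }

    // Expensive work, done while no parts lock is held. The pending set is
    // cleared, so commit() can call this again safely.
    void renameParts()
    {
        for (auto * part : precommitted_parts_need_rename)
            part->renameTo(part->name);
        precommitted_parts_need_rename.clear();
    }

    void commit(std::mutex & parts_mutex)
    {
        renameParts(); // no-op if the caller already renamed explicitly

        std::lock_guard lock(parts_mutex); // short critical section
        // ... publish precommitted_parts to the working set ...
        precommitted_parts.clear();
    }

private:
    std::set<Part *> precommitted_parts;
    std::set<Part *> precommitted_parts_need_rename;
};

int main()
{
    std::mutex parts_mutex;
    Part part{"all_1_1_0"};

    Transaction txn;
    txn.addPart(&part, /*need_rename=*/ true);
    txn.renameParts();       // explicit early rename, e.g. while a tmp-directory lock is still held
    txn.commit(parts_mutex); // fast: the rename has already happened
}
```

The rollback handling in the last patch follows directly from this design: once renames are deferred, a rolled-back part may still sit in its original directory (for ATTACH, under detached/), so it has to be taken out of the working set instead of tripping the cleanup assertions.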
From 6cfd5b2165970a65a551117fe58e4b9d22237b8c Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Thu, 11 Apr 2024 14:10:13 +0200
Subject: [PATCH 095/133] Fix possible assertion when size of
 precommitted_parts <= precommitted_parts_need_rename

CI found [1]:

Logical error: 'precommitted_parts.size() >= precommitted_parts_need_rename.size()'

[1]: https://s3.amazonaws.com/clickhouse-test-reports/61973/5c1e6a3e956917bdbb7eaa467934e5b75f17a923/stateless_tests__tsan__s3_storage__[5_5].html

The problem is that after precommitted_parts is cleaned of detached parts it
may become smaller than precommitted_parts_need_rename, so to avoid this,
let's just copy it to a new container.

Signed-off-by: Azat Khuzhin
---
 src/Storages/MergeTree/MergeTreeData.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 522c9f8dd82..ace28e058d4 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -6638,19 +6638,21 @@ void MergeTreeData::Transaction::rollback(DataPartsLock * lock)
         for (const auto & part : precommitted_parts)
             part->version.creation_csn.store(Tx::RolledBackCSN);
 
+        auto non_detached_precommitted_parts = precommitted_parts;
+
         /// Remove detached parts from working set.
         ///
         /// It is possible to have detached parts here only when the rename (in
         /// commit()) of detached parts was broken (i.e. during ATTACH),
         /// i.e. the part itself is broken.
         DataPartsVector detached_precommitted_parts;
-        for (auto it = precommitted_parts.begin(); it != precommitted_parts.end();)
+        for (auto it = non_detached_precommitted_parts.begin(); it != non_detached_precommitted_parts.end();)
         {
             const auto & part = *it;
             if (part->getDataPartStorage().getParentDirectory() == DETACHED_DIR_NAME)
             {
                 detached_precommitted_parts.push_back(part);
-                it = precommitted_parts.erase(it);
+                it = non_detached_precommitted_parts.erase(it);
             }
             else
                 ++it;
@@ -6658,7 +6660,7 @@ void MergeTreeData::Transaction::rollback(DataPartsLock * lock)
 
         WriteBufferFromOwnString buf;
         buf << "Removing parts:";
-        for (const auto & part : precommitted_parts)
+        for (const auto & part : non_detached_precommitted_parts)
             buf << " " << part->getDataPartStorage().getPartDirectory();
         buf << ".";
         if (!detached_precommitted_parts.empty())
@@ -6679,7 +6681,7 @@ void MergeTreeData::Transaction::rollback(DataPartsLock * lock)
         if (!data.all_data_dropped)
         {
             Strings part_names;
-            for (const auto & part : precommitted_parts)
+            for (const auto & part : non_detached_precommitted_parts)
                 part_names.emplace_back(part->name);
             throw Exception(ErrorCodes::LOGICAL_ERROR, "There are some PreActive parts ({}) to rollback, "
                             "but data parts set is empty and table {} was not dropped. 
It's a bug", @@ -6693,7 +6695,7 @@ void MergeTreeData::Transaction::rollback(DataPartsLock * lock) &our_lock); data.removePartsFromWorkingSet(txn, - DataPartsVector(precommitted_parts.begin(), precommitted_parts.end()), + DataPartsVector(non_detached_precommitted_parts.begin(), non_detached_precommitted_parts.end()), /* clear_without_timeout = */ true, &our_lock); } } From bfb2cc3793980f1d0c74bf91f359708c6271e3e3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 5 Jun 2024 19:42:11 +0200 Subject: [PATCH 096/133] Update 02271_fix_column_matcher_and_column_transformer.sql --- .../02271_fix_column_matcher_and_column_transformer.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql index 0437e944e6a..f8faa3e653b 100644 --- a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql +++ b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql @@ -72,6 +72,6 @@ set allow_suspicious_low_cardinality_types=1; CREATE TABLE github_events__fuzz_0 (`file_time` Int64, `event_type` Enum8('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, 'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15, 'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19, 'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22), `actor_login` LowCardinality(String), `repo_name` LowCardinality(Nullable(String)), `created_at` DateTime, `updated_at` DateTime, `action` Array(Enum8('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 'unassigned' = 9, 'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19, 'merged' = 20)), `comment_id` UInt64, `body` String, `path` LowCardinality(String), `position` Int32, `line` Int32, `ref` String, `ref_type` Enum8('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3, 'unknown' = 4), `creator_user_login` Int16, `number` UInt32, `title` String, `labels` Array(Array(LowCardinality(String))), `state` Enum8('none' = 0, 'open' = 1, 'closed' = 2), `locked` UInt8, `assignee` Array(LowCardinality(String)), `assignees` Array(LowCardinality(String)), `comments` UInt32, `author_association` Array(Enum8('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5)), `closed_at` UUID, `merged_at` DateTime, `merge_commit_sha` Nullable(String), `requested_reviewers` Array(LowCardinality(Int64)), `requested_teams` Array(String), `head_ref` String, `head_sha` String, `base_ref` String, `base_sha` String, `merged` Nullable(UInt8), `mergeable` Nullable(UInt8), `rebaseable` LowCardinality(UInt8), `mergeable_state` Array(Enum8('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4)), `merged_by` LowCardinality(String), `review_comments` UInt32, `maintainer_can_modify` Nullable(UInt8), `commits` UInt32, `additions` Nullable(UInt32), `deletions` UInt32, `changed_files` UInt32, `diff_hunk` Nullable(String), `original_position` UInt32, `commit_id` String, `original_commit_id` String, `push_size` 
UInt32, `push_distinct_size` UInt32, `member_login` LowCardinality(String), `release_tag_name` LowCardinality(String), `release_name` String, `review_state` Int16) ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at) settings allow_nullable_key=1; -EXPLAIN PIPELINE header = true, compact = true WITH top_repos AS (SELECT repo_name FROM github_events__fuzz_0 WHERE (event_type = 'WatchEvent') AND (toDate(created_at) = (today() - 1)) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events__fuzz_0 WHERE (event_type = 'WatchEvent') AND (toMonday(created_at) = toMonday(today() - toIntervalWeek(1))) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events__fuzz_0 PREWHERE (event_type = 'WatchEvent') AND (toStartOfMonth(created_at) = (toStartOfMonth(today()) - toIntervalMonth(1))) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events WHERE (event_type = 'WatchEvent') AND (toYear(created_at) = (toYear(today()) - 1)) GROUP BY repo_name ORDER BY count() DESC LIMIT 100), last_day AS (SELECT repo_name, count() AS count_last_day, rowNumberInAllBlocks() + 1 AS position_last_day FROM github_events WHERE (repo_name IN (SELECT repo_name FROM top_repos)) AND (toDate(created_at) = (today() - 1)) GROUP BY repo_name ORDER BY count_last_day DESC), last_week AS (SELECT repo_name, count() AS count_last_week, rowNumberInAllBlocks() + 1 AS position_last_week FROM github_events WHERE (repo_name IN (SELECT repo_name FROM top_repos)) AND (toMonday(created_at) = (toMonday(today()) - toIntervalWeek(2))) GROUP BY repo_name ORDER BY count_last_week DESC), last_month AS (SELECT repo_name, count() AS count_last_month, rowNumberInAllBlocks() + 1 AS position_last_month FROM github_events__fuzz_0 WHERE ('deleted' = 4) AND in(repo_name) AND (toStartOfMonth(created_at) = (toStartOfMonth(today()) - toIntervalMonth(1))) GROUP BY repo_name ORDER BY count_last_month DESC) SELECT d.repo_name, COLUMNS(count) FROM last_day AS d INNER JOIN last_week AS w ON d.repo_name = w.repo_name INNER JOIN last_month AS m ON d.repo_name = m.repo_name format Null; -- { serverError TYPE_MISMATCH } +EXPLAIN PIPELINE header = true, compact = true WITH top_repos AS (SELECT repo_name FROM github_events__fuzz_0 WHERE (event_type = 'WatchEvent') AND (toDate(created_at) = (today() - 1)) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events__fuzz_0 WHERE (event_type = 'WatchEvent') AND (toMonday(created_at) = toMonday(today() - toIntervalWeek(1))) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events__fuzz_0 PREWHERE (event_type = 'WatchEvent') AND (toStartOfMonth(created_at) = (toStartOfMonth(today()) - toIntervalMonth(1))) GROUP BY repo_name ORDER BY count() DESC LIMIT 100 UNION DISTINCT SELECT repo_name FROM github_events WHERE (event_type = 'WatchEvent') AND (toYear(created_at) = (toYear(today()) - 1)) GROUP BY repo_name ORDER BY count() DESC LIMIT 100), last_day AS (SELECT repo_name, count() AS count_last_day, rowNumberInAllBlocks() + 1 AS position_last_day FROM github_events WHERE (repo_name IN (SELECT repo_name FROM top_repos)) AND (toDate(created_at) = (today() - 1)) GROUP BY repo_name ORDER BY count_last_day DESC), last_week AS (SELECT repo_name, count() AS count_last_week, rowNumberInAllBlocks() + 1 AS position_last_week FROM github_events WHERE (repo_name IN (SELECT repo_name FROM top_repos)) AND 
(toMonday(created_at) = (toMonday(today()) - toIntervalWeek(2))) GROUP BY repo_name ORDER BY count_last_week DESC), last_month AS (SELECT repo_name, count() AS count_last_month, rowNumberInAllBlocks() + 1 AS position_last_month FROM github_events__fuzz_0 WHERE ('deleted' = 4) AND in(repo_name) AND (toStartOfMonth(created_at) = (toStartOfMonth(today()) - toIntervalMonth(1))) GROUP BY repo_name ORDER BY count_last_month DESC) SELECT d.repo_name, COLUMNS(count) FROM last_day AS d INNER JOIN last_week AS w ON d.repo_name = w.repo_name INNER JOIN last_month AS m ON d.repo_name = m.repo_name format Null; -- { serverError INVALID_SETTING_VALUE } DROP TABLE github_events; From eb8520758a6ed83ea1ab63e2a1c0b4164e160693 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 5 Jun 2024 20:51:07 +0200 Subject: [PATCH 097/133] Done --- src/Core/SettingsChangesHistory.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 40b3f5a7bfa..8c09afef7c6 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -97,6 +97,8 @@ static std::map sett {"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"}, {"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in S3 table engine"}, {"enable_blob_storage_log", true, true, "Write information about blob storage operations to system.blob_storage_log table"}, + {"allow_statistics_optimize", false, false, "The setting was renamed. The previous name is `allow_statistic_optimize`."}, + {"allow_experimental_statistics", false, false, "The setting was renamed. The previous name is `allow_experimental_statistic`."} }}, {"24.5", {{"allow_deprecated_error_prone_window_functions", true, false, "Allow usage of deprecated error prone window functions (neighbor, runningAccumulate, runningDifferenceStartingWithFirstValue, runningDifference)"}, {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y."}, From 106c1529ed3af167be986d85aa7eba98f40bf23a Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 5 Jun 2024 21:14:26 +0200 Subject: [PATCH 098/133] Introduce an alias --- src/Core/Settings.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index ff72995b2b7..27ce54c03a7 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -160,8 +160,8 @@ class IColumn; M(Bool, enable_multiple_prewhere_read_steps, true, "Move more conditions from WHERE to PREWHERE and do reads from disk and filtering in multiple steps if there are multiple conditions combined with AND", 0) \ M(Bool, move_primary_key_columns_to_end_of_prewhere, true, "Move PREWHERE conditions containing primary key columns to the end of AND chain. 
It is likely that these conditions are taken into account during primary key analysis and thus will not contribute a lot to PREWHERE filtering.", 0) \ \ - M(Bool, allow_statistics_optimize, false, "Allows using statistics to optimize queries", 0) \ - M(Bool, allow_experimental_statistics, false, "Allows using statistics", 0) \ + M(Bool, allow_statistics_optimize, false, "Allows using statistics to optimize queries", 0) ALIAS(allow_statistic_optimize) \ + M(Bool, allow_experimental_statistics, false, "Allows using statistics", 0) ALIAS(allow_experimental_statistic) \ \ M(UInt64, alter_sync, 1, "Wait for actions to manipulate the partitions. 0 - do not wait, 1 - wait for execution only of itself, 2 - wait for everyone.", 0) ALIAS(replication_alter_partitions_sync) \ M(Int64, replication_wait_for_inactive_replica_timeout, 120, "Wait for inactive replica to execute ALTER/OPTIMIZE. Time in seconds, 0 - do not wait, negative - wait for unlimited time.", 0) \ From f42452d51e5c711875ffc1cc91982be6b6f1578a Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 5 Jun 2024 21:57:23 +0200 Subject: [PATCH 099/133] Add settings to changes history --- src/Core/SettingsChangesHistory.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 8c09afef7c6..f7423754fc2 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -97,6 +97,8 @@ static std::map sett {"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"}, {"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in S3 table engine"}, {"enable_blob_storage_log", true, true, "Write information about blob storage operations to system.blob_storage_log table"}, + {"allow_statistic_optimize", false, false, "Old setting which popped up here being renamed."}, + {"allow_experimental_statistic", false, false, "Old setting which popped up here being renamed."}, {"allow_statistics_optimize", false, false, "The setting was renamed. The previous name is `allow_statistic_optimize`."}, {"allow_experimental_statistics", false, false, "The setting was renamed. The previous name is `allow_experimental_statistic`."} }}, From e7b7c3aebefcdb2f5eb1d765a201193d75671e0a Mon Sep 17 00:00:00 2001 From: pufit Date: Wed, 5 Jun 2024 16:00:08 -0400 Subject: [PATCH 100/133] Update query before replication --- src/Interpreters/Access/InterpreterGrantQuery.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Interpreters/Access/InterpreterGrantQuery.cpp b/src/Interpreters/Access/InterpreterGrantQuery.cpp index b75c0bfb1c7..6a46ac9c330 100644 --- a/src/Interpreters/Access/InterpreterGrantQuery.cpp +++ b/src/Interpreters/Access/InterpreterGrantQuery.cpp @@ -442,6 +442,7 @@ BlockIO InterpreterGrantQuery::execute() String current_database = getContext()->getCurrentDatabase(); elements_to_grant.replaceEmptyDatabase(current_database); elements_to_revoke.replaceEmptyDatabase(current_database); + query.access_rights_elements.replaceEmptyDatabase(current_database); /// Executing on cluster. 
if (!query.cluster.empty()) From 0d50dd302b05f51ba476db27fe11f9c65b1f2deb Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 5 Jun 2024 22:25:27 +0200 Subject: [PATCH 101/133] Bump From e35d8c29ea64339c3278d7fa339815fd569dd0b9 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 5 Jun 2024 22:47:05 +0200 Subject: [PATCH 102/133] Update 10_question.yaml --- .github/ISSUE_TEMPLATE/10_question.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/10_question.yaml b/.github/ISSUE_TEMPLATE/10_question.yaml index a651392ca0d..39d4c27807a 100644 --- a/.github/ISSUE_TEMPLATE/10_question.yaml +++ b/.github/ISSUE_TEMPLATE/10_question.yaml @@ -5,7 +5,7 @@ body: - type: markdown attributes: value: | - > Make sure to check documentation https://clickhouse.com/docs/en/ first. If the question is concise and probably has a short answer, asking it in [community Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-1gh9ds7f4-PgDhJAaF8ad5RbWBAAjzFg) is probably the fastest way to find the answer. For more complicated questions, consider asking them on StackOverflow with "clickhouse" tag https://stackoverflow.com/questions/tagged/clickhouse + > Make sure to check documentation https://clickhouse.com/docs/en/ first. If the question is concise and probably has a short answer, asking it in [community Slack](https://join.slack.com/t/clickhousedb/shared_invite/zt-1gh9ds7f4-PgDhJAaF8ad5RbWBAAjzFg) is probably the fastest way to find the answer. For more complicated questions, consider asking them on StackOverflow with "clickhouse" tag https://stackoverflow.com/questions/tagged/clickhouse - type: textarea attributes: label: Company or project name From 8863736459a62decd423175aa04a11eff6576c81 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 5 Jun 2024 22:50:47 +0200 Subject: [PATCH 103/133] Fix style --- src/Storages/System/StorageSystemDashboards.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Storages/System/StorageSystemDashboards.cpp b/src/Storages/System/StorageSystemDashboards.cpp index 57f84e09857..5faa37d951e 100644 --- a/src/Storages/System/StorageSystemDashboards.cpp +++ b/src/Storages/System/StorageSystemDashboards.cpp @@ -218,12 +218,12 @@ ORDER BY t WITH FILL STEP {rounding:UInt32} { "dashboard", "Overview" }, { "title", "Concurrent network connections" }, { "query", trim(R"EOQ( -SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(TCP_Connections), max(MySQL_Connections), max(HTTP_Connections) -FROM ( -SELECT event_time, sum(CurrentMetric_TCPConnection) AS TCP_Connections, sum(CurrentMetric_MySQLConnection) AS MySQL_Connections, sum(CurrentMetric_HTTPConnection) AS HTTP_Connections +SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, + sum(CurrentMetric_TCPConnection) AS TCP_Connections, + sum(CurrentMetric_MySQLConnection) AS MySQL_Connections, + sum(CurrentMetric_HTTPConnection) AS HTTP_Connections FROM merge('system', '^metric_log') WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} -GROUP BY event_time) GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} )EOQ") } @@ -367,7 +367,7 @@ ORDER BY t WITH FILL STEP {rounding:UInt32} { { "dashboard", "Cloud overview" }, { "title", "Concurrent network connections" }, - { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(TCP_Connections), max(MySQL_Connections), max(HTTP_Connections) FROM ( 
SELECT event_time, sum(CurrentMetric_TCPConnection) AS TCP_Connections, sum(CurrentMetric_MySQLConnection) AS MySQL_Connections, sum(CurrentMetric_HTTPConnection) AS HTTP_Connections FROM clusterAllReplicas(default, merge('system', '^metric_log')) WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY event_time) GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } + { "query", "SELECT toStartOfInterval(event_time, INTERVAL {rounding:UInt32} SECOND)::INT AS t, max(TCP_Connections), max(MySQL_Connections), max(HTTP_Connections) FROM (SELECT event_time, sum(CurrentMetric_TCPConnection) AS TCP_Connections, sum(CurrentMetric_MySQLConnection) AS MySQL_Connections, sum(CurrentMetric_HTTPConnection) AS HTTP_Connections FROM clusterAllReplicas(default, merge('system', '^metric_log')) WHERE event_date >= toDate(now() - {seconds:UInt32}) AND event_time >= now() - {seconds:UInt32} GROUP BY event_time) GROUP BY t ORDER BY t WITH FILL STEP {rounding:UInt32} SETTINGS skip_unavailable_shards = 1" } } }; From 6ff11f54d0802253ac801c5b59d87283eddd6eac Mon Sep 17 00:00:00 2001 From: xogoodnow Date: Thu, 6 Jun 2024 02:10:19 +0330 Subject: [PATCH 104/133] added mlock and mlockall --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index c35e860a5d7..7ae6fc15c4f 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -557,6 +557,8 @@ MinHash MinIO MinMax MindsDB +mlock +mlockall Mongodb Monotonicity MsgPack From c55e79f283dbdf5def18aeec42046aedf85cfd5b Mon Sep 17 00:00:00 2001 From: xogoodnow Date: Thu, 6 Jun 2024 02:19:15 +0330 Subject: [PATCH 105/133] Added words at the right place --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 7ae6fc15c4f..49f43615c7e 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -557,8 +557,6 @@ MinHash MinIO MinMax MindsDB -mlock -mlockall Mongodb Monotonicity MsgPack @@ -2002,6 +2000,8 @@ minmax mins misconfiguration mispredictions +mlock +mlockall mmap mmapped modularization From 33cd9c274c64452de55b1c01c80f031f625da8ae Mon Sep 17 00:00:00 2001 From: Peignon Melvyn Date: Thu, 6 Jun 2024 03:03:19 +0200 Subject: [PATCH 106/133] Update settings.md --- docs/en/operations/settings/settings.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 8045ebf4238..ffaf53085c4 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3170,6 +3170,18 @@ Possible values: Default value: `0`. +## lightweight_deletes_sync {#lightweight_deletes_sync} + +The same as 'mutation_sync', but controls only execution of lightweight deletes. + +Possible values: + +- 0 - Mutations execute asynchronously. +- 1 - The query waits for the lightweight deletes to complete on the current server. +- 2 - The query waits for the lightweight deletes to complete on all replicas (if they exist). + +Default value: `2`. 
+ **See Also** - [Synchronicity of ALTER Queries](../../sql-reference/statements/alter/index.md#synchronicity-of-alter-queries) From f9c243064f886c0d0260d43787f1a630d911aa74 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 6 Jun 2024 04:20:20 +0200 Subject: [PATCH 107/133] A tiny fix for fancy quotes --- src/Parsers/Lexer.cpp | 3 --- .../0_stateless/03167_fancy_quotes_off_by_one.reference | 1 + tests/queries/0_stateless/03167_fancy_quotes_off_by_one.sql | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/03167_fancy_quotes_off_by_one.reference create mode 100644 tests/queries/0_stateless/03167_fancy_quotes_off_by_one.sql diff --git a/src/Parsers/Lexer.cpp b/src/Parsers/Lexer.cpp index 5f2bd50524c..b4601389696 100644 --- a/src/Parsers/Lexer.cpp +++ b/src/Parsers/Lexer.cpp @@ -59,9 +59,6 @@ Token quotedStringWithUnicodeQuotes(const char *& pos, const char * const token_ pos = find_first_symbols<'\xE2'>(pos, end); if (pos + 2 >= end) return Token(error_token, token_begin, end); - /// Empty identifiers are not allowed, while empty strings are. - if (success_token == TokenType::QuotedIdentifier && pos + 3 >= end) - return Token(error_token, token_begin, end); if (pos[0] == '\xE2' && pos[1] == '\x80' && pos[2] == expected_end_byte) { diff --git a/tests/queries/0_stateless/03167_fancy_quotes_off_by_one.reference b/tests/queries/0_stateless/03167_fancy_quotes_off_by_one.reference new file mode 100644 index 00000000000..9daeafb9864 --- /dev/null +++ b/tests/queries/0_stateless/03167_fancy_quotes_off_by_one.reference @@ -0,0 +1 @@ +test diff --git a/tests/queries/0_stateless/03167_fancy_quotes_off_by_one.sql b/tests/queries/0_stateless/03167_fancy_quotes_off_by_one.sql new file mode 100644 index 00000000000..6f563d8f2a1 --- /dev/null +++ b/tests/queries/0_stateless/03167_fancy_quotes_off_by_one.sql @@ -0,0 +1 @@ +SELECT ‘test’ AS “column” \ No newline at end of file From 1156233ea8b84854b2fd6a01042390fb9ac2009e Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Wed, 5 Jun 2024 16:40:55 +0200 Subject: [PATCH 108/133] Use a proper way to download packages from releases --- docker/test/upgrade/run.sh | 3 +- tests/ci/build_download_helper.py | 14 +++- tests/ci/download_release_packages.py | 77 +++++--------------- tests/ci/get_previous_release_tag.py | 100 +++++++++++++------------- 4 files changed, 84 insertions(+), 110 deletions(-) diff --git a/docker/test/upgrade/run.sh b/docker/test/upgrade/run.sh index 1f2cc9903b2..a4c4c75e5b3 100644 --- a/docker/test/upgrade/run.sh +++ b/docker/test/upgrade/run.sh @@ -25,7 +25,8 @@ azurite-blob --blobHost 0.0.0.0 --blobPort 10000 --debug /azurite_log & ./setup_minio.sh stateless # to have a proper environment echo "Get previous release tag" -previous_release_tag=$(dpkg --info package_folder/clickhouse-client*.deb | grep "Version: " | awk '{print $2}' | cut -f1 -d'+' | get_previous_release_tag) +# shellcheck disable=SC2016 +previous_release_tag=$(dpkg-deb --showformat='${Version}' --show package_folder/clickhouse-client*.deb | get_previous_release_tag) echo $previous_release_tag echo "Clone previous release repository" diff --git a/tests/ci/build_download_helper.py b/tests/ci/build_download_helper.py index 0f6c8e5aa8a..9a131084763 100644 --- a/tests/ci/build_download_helper.py +++ b/tests/ci/build_download_helper.py @@ -10,9 +10,21 @@ from typing import Any, Callable, List, Optional, Union import requests -import get_robot_token as grt # we need an updated ROBOT_TOKEN from ci_config import CI_CONFIG +try: + # A work around for scripts using this downloading module without required deps + import get_robot_token as grt # we need an updated ROBOT_TOKEN +except ImportError: + + class grt: # type: ignore + ROBOT_TOKEN = None + + @staticmethod + def get_best_robot_token() -> str: + return "" + + DOWNLOAD_RETRIES_COUNT = 5 diff --git a/tests/ci/download_release_packages.py b/tests/ci/download_release_packages.py index 1ba4ff8ff2e..8f3a2190ae8 100755 --- a/tests/ci/download_release_packages.py +++ b/tests/ci/download_release_packages.py @@ -1,79 +1,38 @@ #!/usr/bin/env python3 import logging -import os +from pathlib import Path -import requests -from requests.adapters import HTTPAdapter # type: ignore -from urllib3.util.retry import Retry # type: ignore - -from get_previous_release_tag import ReleaseInfo, get_previous_release - -CLICKHOUSE_TAGS_URL = "https://api.github.com/repos/ClickHouse/ClickHouse/tags" - -DOWNLOAD_PREFIX = ( - "https://github.com/ClickHouse/ClickHouse/releases/download/v{version}-{type}/" +from build_download_helper import DownloadException, download_build_with_progress +from get_previous_release_tag import ( + ReleaseInfo, + get_previous_release, + get_release_by_tag, ) -CLICKHOUSE_COMMON_STATIC_PACKAGE_NAME = "clickhouse-common-static_{version}_amd64.deb" -CLICKHOUSE_COMMON_STATIC_DBG_PACKAGE_NAME = ( - "clickhouse-common-static-dbg_{version}_amd64.deb" -) -CLICKHOUSE_CLIENT_PACKAGE_NAME = "clickhouse-client_{version}_amd64.deb" -CLICKHOUSE_LIBRARY_BRIDGE_PACKAGE_NAME = "clickhouse-library-bridge_{version}_amd64.deb" -CLICKHOUSE_ODBC_BRIDGE_PACKAGE_NAME = "clickhouse-odbc-bridge_{version}_amd64.deb" -CLICKHOUSE_SERVER_PACKAGE_NAME = "clickhouse-server_{version}_amd64.deb" -PACKAGES_DIR = "previous_release_package_folder/" -VERSION_PATTERN = r"((?:\d+\.)?(?:\d+\.)?(?:\d+\.)?\d+-[a-zA-Z]*)" +PACKAGES_DIR = Path("previous_release_package_folder") -def download_package(url, out_path, retries=10, backoff_factor=0.3): - session = requests.Session() - retry = Retry( - total=retries, - 
read=retries, - connect=retries, - backoff_factor=backoff_factor, - status_forcelist=[500, 502, 503, 504], - ) - adapter = HTTPAdapter(max_retries=retry) - session.mount("http://", adapter) - session.mount("https://", adapter) - response = session.get(url) - response.raise_for_status() - print(f"Download {url} to {out_path}") - with open(out_path, "wb") as fd: - fd.write(response.content) - - -def download_packages(release, dest_path=PACKAGES_DIR): - if not os.path.exists(dest_path): - os.makedirs(dest_path) +def download_packages(release: ReleaseInfo, dest_path: Path = PACKAGES_DIR) -> None: + dest_path.mkdir(parents=True, exist_ok=True) logging.info("Will download %s", release) - def get_dest_path(pkg_name): - return os.path.join(dest_path, pkg_name) - - for pkg in ( - CLICKHOUSE_COMMON_STATIC_PACKAGE_NAME, - CLICKHOUSE_COMMON_STATIC_DBG_PACKAGE_NAME, - CLICKHOUSE_CLIENT_PACKAGE_NAME, - CLICKHOUSE_LIBRARY_BRIDGE_PACKAGE_NAME, - CLICKHOUSE_ODBC_BRIDGE_PACKAGE_NAME, - CLICKHOUSE_SERVER_PACKAGE_NAME, - ): - url = (DOWNLOAD_PREFIX + pkg).format(version=release.version, type=release.type) - pkg_name = get_dest_path(pkg.format(version=release.version)) - download_package(url, pkg_name) + for pkg, url in release.assets.items(): + if not pkg.endswith("_amd64.deb") or "-dbg_" in pkg: + continue + pkg_name = dest_path / pkg + download_build_with_progress(url, pkg_name) -def download_last_release(dest_path): +def download_last_release(dest_path: Path) -> None: current_release = get_previous_release(None) + if current_release is None: + raise DownloadException("The current release is not found") download_packages(current_release, dest_path=dest_path) if __name__ == "__main__": logging.basicConfig(level=logging.INFO) - release = ReleaseInfo(input()) + release = get_release_by_tag(input()) download_packages(release) diff --git a/tests/ci/get_previous_release_tag.py b/tests/ci/get_previous_release_tag.py index bc0cb975ef5..2b4d09aa326 100755 --- a/tests/ci/get_previous_release_tag.py +++ b/tests/ci/get_previous_release_tag.py @@ -2,47 +2,37 @@ import logging import re -from typing import List, Optional, Tuple +from typing import Dict, List, Optional, Tuple -import requests - -CLICKHOUSE_TAGS_URL = "https://api.github.com/repos/ClickHouse/ClickHouse/tags" -CLICKHOUSE_PACKAGE_URL = ( - "https://github.com/ClickHouse/ClickHouse/releases/download/" - "v{version}-{type}/clickhouse-common-static_{version}_amd64.deb" +from build_download_helper import get_gh_api +from git_helper import TAG_REGEXP +from version_helper import ( + ClickHouseVersion, + get_version_from_string, + get_version_from_tag, ) -VERSION_PATTERN = r"(v(?:\d+\.)?(?:\d+\.)?(?:\d+\.)?\d+-[a-zA-Z]*)" + +CLICKHOUSE_TAGS_URL = "https://api.github.com/repos/ClickHouse/ClickHouse/releases" +PACKAGE_REGEXP = r"\Aclickhouse-common-static_.+[.]deb" logger = logging.getLogger(__name__) -class Version: - def __init__(self, version: str): - self.version = version - - def __lt__(self, other: "Version") -> bool: - return list(map(int, self.version.split("."))) < list( - map(int, other.version.split(".")) - ) - - def __str__(self): - return self.version - - class ReleaseInfo: - def __init__(self, release_tag: str): - self.version = Version(release_tag[1:].split("-")[0]) - self.type = release_tag[1:].split("-")[1] + def __init__(self, release_tag: str, assets: Dict[str, str]): + self.version = get_version_from_tag(release_tag) + self.type = self.version.description + self.assets = assets def __str__(self): - return f"v{self.version}-{self.type}" + return 
self.version.describe def __repr__(self): - return f"ReleaseInfo: {self.version}-{self.type}" + return f"ReleaseInfo: {self.version.describe}" def find_previous_release( - server_version: Optional[Version], releases: List[ReleaseInfo] + server_version: Optional[ClickHouseVersion], releases: List[ReleaseInfo] ) -> Tuple[bool, Optional[ReleaseInfo]]: releases.sort(key=lambda x: x.version, reverse=True) @@ -54,15 +44,7 @@ def find_previous_release( # Check if the artifact exists on GitHub. # It can be not true for a short period of time # after creating a tag for a new release before uploading the packages. - if ( - requests.head( - CLICKHOUSE_PACKAGE_URL.format( - version=release.version, type=release.type - ), - timeout=10, - ).status_code - != 404 - ): + if any(re.match(PACKAGE_REGEXP, name) for name in release.assets.keys()): return True, release logger.debug( @@ -74,12 +56,14 @@ def find_previous_release( return False, None -def get_previous_release(server_version: Optional[Version]) -> Optional[ReleaseInfo]: +def get_previous_release( + server_version: Optional[ClickHouseVersion], +) -> Optional[ReleaseInfo]: page = 1 found = False while not found: - response = requests.get( - CLICKHOUSE_TAGS_URL, {"page": page, "per_page": 100}, timeout=10 + response = get_gh_api( + CLICKHOUSE_TAGS_URL, params={"page": page, "per_page": 100}, timeout=10 ) if not response.ok: logger.error( @@ -87,24 +71,42 @@ def get_previous_release(server_version: Optional[Version]) -> Optional[ReleaseI ) response.raise_for_status() - releases_str = set(re.findall(VERSION_PATTERN, response.text)) - if len(releases_str) == 0: - raise ValueError( - "Cannot find previous release for " - + str(server_version) - + " server version" - ) + releases = response.json() - releases = [ReleaseInfo(release) for release in releases_str] - found, previous_release = find_previous_release(server_version, releases) + release_infos = [] # type: List[ReleaseInfo] + for r in releases: + if re.match(TAG_REGEXP, r["tag_name"]): + assets = { + a["name"]: a["browser_download_url"] + for a in r["assets"] + if a["state"] == "uploaded" + } + release_infos.append(ReleaseInfo(r["tag_name"], assets)) + found, previous_release = find_previous_release(server_version, release_infos) page += 1 return previous_release +def get_release_by_tag(tag: str) -> ReleaseInfo: + response = get_gh_api(f"{CLICKHOUSE_TAGS_URL}/tags/{tag}", timeout=10) + release = response.json() + assets = { + a["name"]: a["browser_download_url"] + for a in release["assets"] + if a["state"] == "uploaded" + } + return ReleaseInfo(release["tag_name"], assets) + + def main(): logging.basicConfig(level=logging.INFO) - server_version = Version(input()) + version_string = input() + version_string = version_string.split("+", maxsplit=1)[0] + try: + server_version = get_version_from_string(version_string) + except ValueError: + server_version = get_version_from_tag(version_string) print(get_previous_release(server_version)) From 4eba3105273a09a9d1fe724c987018d87659a1d2 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Thu, 6 Jun 2024 00:57:00 +0200 Subject: [PATCH 109/133] Simplify set_auth_header --- tests/ci/build_download_helper.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tests/ci/build_download_helper.py b/tests/ci/build_download_helper.py index 9a131084763..ce5b46a664e 100644 --- a/tests/ci/build_download_helper.py +++ b/tests/ci/build_download_helper.py @@ -75,15 +75,10 @@ def get_gh_api( """ def set_auth_header(): - if "headers" in kwargs: - if "Authorization" not in kwargs["headers"]: - kwargs["headers"][ - "Authorization" - ] = f"Bearer {grt.get_best_robot_token()}" - else: - kwargs["headers"] = { - "Authorization": f"Bearer {grt.get_best_robot_token()}" - } + headers = kwargs.get("headers", {}) + if "Authorization" not in headers: + headers["Authorization"] = f"Bearer {grt.get_best_robot_token()}" + kwargs["headers"] = headers if grt.ROBOT_TOKEN is not None: set_auth_header() From fd930971301edfc6f5f199744354ab4f5005beb7 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Wed, 29 May 2024 04:10:38 +0000 Subject: [PATCH 110/133] Fix writing ORC statistics for unsigned types --- contrib/orc | 2 +- .../Impl/NativeORCBlockInputFormat.cpp | 7 +++- .../Formats/Impl/ORCBlockOutputFormat.cpp | 12 +++--- .../0_stateless/02892_orc_filter_pushdown.sql | 2 +- .../03164_orc_signedness.reference | 41 +++++++++++++++++++ .../0_stateless/03164_orc_signedness.sql | 40 ++++++++++++++++++ 6 files changed, 96 insertions(+), 8 deletions(-) create mode 100644 tests/queries/0_stateless/03164_orc_signedness.reference create mode 100644 tests/queries/0_stateless/03164_orc_signedness.sql diff --git a/contrib/orc b/contrib/orc index e24f2c2a3ca..947cebaf943 160000 --- a/contrib/orc +++ b/contrib/orc @@ -1 +1 @@ -Subproject commit e24f2c2a3ca0769c96704ab20ad6f512a83ea2ad +Subproject commit 947cebaf9432d708253ac08dc3012daa6b4ede6f diff --git a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp index 0b55f633c6a..dcd5a531b05 100644 --- a/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/NativeORCBlockInputFormat.cpp @@ -269,7 +269,12 @@ convertFieldToORCLiteral(const orc::Type & orc_type, const Field & field, DataTy case orc::SHORT: case orc::INT: case orc::LONG: { - /// May throw exception + /// May throw exception. + /// + /// In particular, it'll throw if we request the column as unsigned, like this: + /// SELECT * FROM file('t.orc', ORC, 'x UInt8') WHERE x > 10 + /// We have to reject this, otherwise it would miss values > 127 (because + /// they're treated as negative by ORC). auto val = field.get(); return orc::Literal(val); } diff --git a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp index 1e36c100667..6f543a05fba 100644 --- a/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ORCBlockOutputFormat.cpp @@ -315,18 +315,20 @@ void ORCBlockOutputFormat::writeColumn( if (null_bytemap) orc_column.hasNulls = true; + /// ORC doesn't have unsigned types, so cast everything to signed and sign-extend to Int64 to + /// make the ORC library calculate min and max correctly. switch (type->getTypeId()) { case TypeIndex::Enum8: [[fallthrough]]; case TypeIndex::Int8: { /// Note: Explicit cast to avoid clang-tidy error: 'signed char' to 'long' conversion; consider casting to 'unsigned char' first. 
- writeNumbers(orc_column, column, null_bytemap, [](const Int8 & value){ return static_cast(value); }); + writeNumbers(orc_column, column, null_bytemap, [](const Int8 & value){ return Int64(Int8(value)); }); break; } case TypeIndex::UInt8: { - writeNumbers(orc_column, column, null_bytemap, [](const UInt8 & value){ return value; }); + writeNumbers(orc_column, column, null_bytemap, [](const UInt8 & value){ return Int64(Int8(value)); }); break; } case TypeIndex::Enum16: [[fallthrough]]; @@ -338,7 +340,7 @@ void ORCBlockOutputFormat::writeColumn( case TypeIndex::Date: [[fallthrough]]; case TypeIndex::UInt16: { - writeNumbers(orc_column, column, null_bytemap, [](const UInt16 & value){ return value; }); + writeNumbers(orc_column, column, null_bytemap, [](const UInt16 & value){ return Int64(Int16(value)); }); break; } case TypeIndex::Date32: [[fallthrough]]; @@ -349,12 +351,12 @@ void ORCBlockOutputFormat::writeColumn( } case TypeIndex::UInt32: { - writeNumbers(orc_column, column, null_bytemap, [](const UInt32 & value){ return value; }); + writeNumbers(orc_column, column, null_bytemap, [](const UInt32 & value){ return Int64(Int32(value)); }); break; } case TypeIndex::IPv4: { - writeNumbers(orc_column, column, null_bytemap, [](const IPv4 & value){ return value.toUnderType(); }); + writeNumbers(orc_column, column, null_bytemap, [](const IPv4 & value){ return Int64(Int32(value.toUnderType())); }); break; } case TypeIndex::Int64: diff --git a/tests/queries/0_stateless/02892_orc_filter_pushdown.sql b/tests/queries/0_stateless/02892_orc_filter_pushdown.sql index f9aa7696ac6..f1d1ba12570 100644 --- a/tests/queries/0_stateless/02892_orc_filter_pushdown.sql +++ b/tests/queries/0_stateless/02892_orc_filter_pushdown.sql @@ -1,4 +1,4 @@ --- Tags: no-fasttest, no-parallel, no-cpu-aarch64 +-- Tags: no-fasttest, no-parallel set output_format_orc_string_as_string = 1; set output_format_orc_row_index_stride = 100; diff --git a/tests/queries/0_stateless/03164_orc_signedness.reference b/tests/queries/0_stateless/03164_orc_signedness.reference new file mode 100644 index 00000000000..3ee822a94c1 --- /dev/null +++ b/tests/queries/0_stateless/03164_orc_signedness.reference @@ -0,0 +1,41 @@ +-- { echoOn } +select x from file('i8.orc') where indexHint(x = -128); +-128 +select x from file('i8.orc') where indexHint(x = 128); +select x from file('u8.orc') where indexHint(x = -128); +-128 +select x from file('u8.orc') where indexHint(x = 128); +select x from file('i16.orc') where indexHint(x = -32768); +-32768 +select x from file('i16.orc') where indexHint(x = 32768); +select x from file('u16.orc') where indexHint(x = -32768); +-32768 +select x from file('u16.orc') where indexHint(x = 32768); +select x from file('i32.orc') where indexHint(x = -2147483648); +-2147483648 +select x from file('i32.orc') where indexHint(x = 2147483648); +select x from file('u32.orc') where indexHint(x = -2147483648); +-2147483648 +select x from file('u32.orc') where indexHint(x = 2147483648); +select x from file('i64.orc') where indexHint(x = -9223372036854775808); +-9223372036854775808 +select x from file('i64.orc') where indexHint(x = 9223372036854775808); +-9223372036854775808 +select x from file('u64.orc') where indexHint(x = -9223372036854775808); +-9223372036854775808 +select x from file('u64.orc') where indexHint(x = 9223372036854775808); +-9223372036854775808 +select x from file('u8.orc', ORC, 'x UInt8') where indexHint(x > 10); +128 +select x from file('u8.orc', ORC, 'x UInt64') where indexHint(x > 10); +18446744073709551488 +select x 
from file('u16.orc', ORC, 'x UInt16') where indexHint(x > 10); +32768 +select x from file('u16.orc', ORC, 'x UInt64') where indexHint(x > 10); +18446744073709518848 +select x from file('u32.orc', ORC, 'x UInt32') where indexHint(x > 10); +2147483648 +select x from file('u32.orc', ORC, 'x UInt64') where indexHint(x > 10); +18446744071562067968 +select x from file('u64.orc', ORC, 'x UInt64') where indexHint(x > 10); +9223372036854775808 diff --git a/tests/queries/0_stateless/03164_orc_signedness.sql b/tests/queries/0_stateless/03164_orc_signedness.sql new file mode 100644 index 00000000000..ced99c7dca7 --- /dev/null +++ b/tests/queries/0_stateless/03164_orc_signedness.sql @@ -0,0 +1,40 @@ +set input_format_orc_filter_push_down = 1; +set engine_file_truncate_on_insert = 1; + +insert into function file('i8.orc') select materialize(-128)::Int8 as x; +insert into function file('u8.orc') select materialize(128)::UInt8 as x; +insert into function file('i16.orc') select materialize(-32768)::Int16 as x; +insert into function file('u16.orc') select materialize(32768)::UInt16 as x; +insert into function file('i32.orc') select materialize(-2147483648)::Int32 as x; +insert into function file('u32.orc') select materialize(2147483648)::UInt32 as x; +insert into function file('i64.orc') select materialize(-9223372036854775808)::Int64 as x; +insert into function file('u64.orc') select materialize(9223372036854775808)::UInt64 as x; + +-- { echoOn } +select x from file('i8.orc') where indexHint(x = -128); +select x from file('i8.orc') where indexHint(x = 128); +select x from file('u8.orc') where indexHint(x = -128); +select x from file('u8.orc') where indexHint(x = 128); + +select x from file('i16.orc') where indexHint(x = -32768); +select x from file('i16.orc') where indexHint(x = 32768); +select x from file('u16.orc') where indexHint(x = -32768); +select x from file('u16.orc') where indexHint(x = 32768); + +select x from file('i32.orc') where indexHint(x = -2147483648); +select x from file('i32.orc') where indexHint(x = 2147483648); +select x from file('u32.orc') where indexHint(x = -2147483648); +select x from file('u32.orc') where indexHint(x = 2147483648); + +select x from file('i64.orc') where indexHint(x = -9223372036854775808); +select x from file('i64.orc') where indexHint(x = 9223372036854775808); +select x from file('u64.orc') where indexHint(x = -9223372036854775808); +select x from file('u64.orc') where indexHint(x = 9223372036854775808); + +select x from file('u8.orc', ORC, 'x UInt8') where indexHint(x > 10); +select x from file('u8.orc', ORC, 'x UInt64') where indexHint(x > 10); +select x from file('u16.orc', ORC, 'x UInt16') where indexHint(x > 10); +select x from file('u16.orc', ORC, 'x UInt64') where indexHint(x > 10); +select x from file('u32.orc', ORC, 'x UInt32') where indexHint(x > 10); +select x from file('u32.orc', ORC, 'x UInt64') where indexHint(x > 10); +select x from file('u64.orc', ORC, 'x UInt64') where indexHint(x > 10); From b300af350349b5bbefaa4036eed3d7c5d5a102d8 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Wed, 29 May 2024 05:00:03 +0000 Subject: [PATCH 111/133] no-fasttest --- tests/queries/0_stateless/03164_orc_signedness.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/03164_orc_signedness.sql b/tests/queries/0_stateless/03164_orc_signedness.sql index ced99c7dca7..ae2d0428ca5 100644 --- a/tests/queries/0_stateless/03164_orc_signedness.sql +++ b/tests/queries/0_stateless/03164_orc_signedness.sql @@ -1,3 +1,5 @@ +-- Tags: 
no-fasttest, no-parallel + set input_format_orc_filter_push_down = 1; set engine_file_truncate_on_insert = 1; From 40a3708c8f139c28f72e10f916c45a21ad235e28 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 31 May 2024 19:55:13 +0000 Subject: [PATCH 112/133] Fix test --- .../02892_orc_filter_pushdown.reference | 46 +++++++++---------- .../0_stateless/02892_orc_filter_pushdown.sql | 34 ++++++-------- 2 files changed, 35 insertions(+), 45 deletions(-) diff --git a/tests/queries/0_stateless/02892_orc_filter_pushdown.reference b/tests/queries/0_stateless/02892_orc_filter_pushdown.reference index 9059b403a34..e6c2e9b2b57 100644 --- a/tests/queries/0_stateless/02892_orc_filter_pushdown.reference +++ b/tests/queries/0_stateless/02892_orc_filter_pushdown.reference @@ -1,8 +1,4 @@ number Nullable(Int64) -u8 Nullable(Int8) -u16 Nullable(Int16) -u32 Nullable(Int32) -u64 Nullable(Int64) i8 Nullable(Int8) i16 Nullable(Int16) i32 Nullable(Int32) @@ -22,34 +18,34 @@ d64 Nullable(Decimal(18, 10)) d128 Nullable(Decimal(38, 20)) -- Go over all types individually -- { echoOn } -select count(), sum(number) from file('02892.orc') where indexHint(u8 in (10, 15, 250)); -800 4229600 -select count(1), min(u8), max(u8) from file('02892.orc') where u8 in (10, 15, 250); -66 10 15 +select count(), sum(number) from file('02892.orc') where indexHint(i8 in (10, 15, -6)); +1100 5744450 +select count(1), min(i8), max(i8) from file('02892.orc') where i8 in (10, 15, -6); +99 -6 15 select count(), sum(number) from file('02892.orc') where indexHint(i8 between -3 and 2); 1000 4999500 select count(1), min(i8), max(i8) from file('02892.orc') where i8 between -3 and 2; 208 -3 2 -select count(), sum(number) from file('02892.orc') where indexHint(u16 between 4000 and 61000 or u16 == 42); -1800 6479100 -select count(1), min(u16), max(u16) from file('02892.orc') where u16 between 4000 and 61000 or u16 == 42; +select count(), sum(number) from file('02892.orc') where indexHint(i16 between 4000 and 61000 or i16 == 42); +1200 1099400 +select count(1), min(i16), max(i16) from file('02892.orc') where i16 between 4000 and 61000 or i16 == 42; 1002 42 5000 select count(), sum(number) from file('02892.orc') where indexHint(i16 between -150 and 250); 500 2474750 select count(1), min(i16), max(i16) from file('02892.orc') where i16 between -150 and 250; 401 -150 250 -select count(), sum(number) from file('02892.orc') where indexHint(u32 in (42, 4294966296)); -200 999900 -select count(1), min(u32), max(u32) from file('02892.orc') where u32 in (42, 4294966296); -1 42 42 +select count(), sum(number) from file('02892.orc') where indexHint(i32 in (42, -1000)); +200 1099900 +select count(1), min(i32), max(i32) from file('02892.orc') where i32 in (42, -1000); +2 -1000 42 select count(), sum(number) from file('02892.orc') where indexHint(i32 between -150 and 250); 500 2474750 select count(1), min(i32), max(i32) from file('02892.orc') where i32 between -150 and 250; 401 -150 250 -select count(), sum(number) from file('02892.orc') where indexHint(u64 in (42, 18446744073709550616)); -100 494950 -select count(1), min(u64), max(u64) from file('02892.orc') where u64 in (42, 18446744073709550616); -1 42 42 +select count(), sum(number) from file('02892.orc') where indexHint(i64 in (42, -1000)); +200 1099900 +select count(1), min(i64), max(i64) from file('02892.orc') where i64 in (42, -1000); +2 -1000 42 select count(), sum(number) from file('02892.orc') where indexHint(i64 between -150 and 250); 500 2474750 select count(1), min(i64), max(i64) from 
file('02892.orc') where i64 between -150 and 250; @@ -111,21 +107,21 @@ select count(), sum(number) from file('02892.orc') where indexHint(0); 0 \N select count(), min(number), max(number) from file('02892.orc') where indexHint(0); 0 \N \N -select count(), sum(number) from file('02892.orc') where indexHint(s like '99%' or u64 == 2000); +select count(), sum(number) from file('02892.orc') where indexHint(s like '99%' or i64 == 2000); 300 1204850 -select count(), min(s), max(s) from file('02892.orc') where (s like '99%' or u64 == 2000); +select count(), min(s), max(s) from file('02892.orc') where (s like '99%' or i64 == 2000); 12 2000 999 select count(), sum(number) from file('02892.orc') where indexHint(s like 'z%'); 0 \N select count(), min(s), max(s) from file('02892.orc') where (s like 'z%'); 0 \N \N -select count(), sum(number) from file('02892.orc') where indexHint(u8 == 10 or 1 == 1); +select count(), sum(number) from file('02892.orc') where indexHint(i8 == 10 or 1 == 1); 10000 49995000 -select count(), min(u8), max(u8) from file('02892.orc') where (u8 == 10 or 1 == 1); +select count(), min(i8), max(i8) from file('02892.orc') where (i8 == 10 or 1 == 1); 10000 -128 127 -select count(), sum(number) from file('02892.orc') where indexHint(u8 < 0); +select count(), sum(number) from file('02892.orc') where indexHint(i8 < 0); 5300 26042350 -select count(), min(u8), max(u8) from file('02892.orc') where (u8 < 0); +select count(), min(i8), max(i8) from file('02892.orc') where (i8 < 0); 5001 -128 -1 -- { echoOn } select count(), sum(number) from file('02892.orc') where indexHint(sometimes_null is NULL); diff --git a/tests/queries/0_stateless/02892_orc_filter_pushdown.sql b/tests/queries/0_stateless/02892_orc_filter_pushdown.sql index f1d1ba12570..e3736de6a17 100644 --- a/tests/queries/0_stateless/02892_orc_filter_pushdown.sql +++ b/tests/queries/0_stateless/02892_orc_filter_pushdown.sql @@ -16,15 +16,9 @@ SET session_timezone = 'UTC'; -- Try all the types. insert into function file('02892.orc') - -- Use negative numbers to test sign extension for signed types and lack of sign extension for - -- unsigned types. 
with 5000 - number as n select number, - intDiv(n, 11)::UInt8 as u8, - n::UInt16 u16, - n::UInt32 as u32, - n::UInt64 as u64, intDiv(n, 11)::Int8 as i8, n::Int16 i16, n::Int32 as i32, @@ -50,26 +44,26 @@ desc file('02892.orc'); -- Go over all types individually -- { echoOn } -select count(), sum(number) from file('02892.orc') where indexHint(u8 in (10, 15, 250)); -select count(1), min(u8), max(u8) from file('02892.orc') where u8 in (10, 15, 250); +select count(), sum(number) from file('02892.orc') where indexHint(i8 in (10, 15, -6)); +select count(1), min(i8), max(i8) from file('02892.orc') where i8 in (10, 15, -6); select count(), sum(number) from file('02892.orc') where indexHint(i8 between -3 and 2); select count(1), min(i8), max(i8) from file('02892.orc') where i8 between -3 and 2; -select count(), sum(number) from file('02892.orc') where indexHint(u16 between 4000 and 61000 or u16 == 42); -select count(1), min(u16), max(u16) from file('02892.orc') where u16 between 4000 and 61000 or u16 == 42; +select count(), sum(number) from file('02892.orc') where indexHint(i16 between 4000 and 61000 or i16 == 42); +select count(1), min(i16), max(i16) from file('02892.orc') where i16 between 4000 and 61000 or i16 == 42; select count(), sum(number) from file('02892.orc') where indexHint(i16 between -150 and 250); select count(1), min(i16), max(i16) from file('02892.orc') where i16 between -150 and 250; -select count(), sum(number) from file('02892.orc') where indexHint(u32 in (42, 4294966296)); -select count(1), min(u32), max(u32) from file('02892.orc') where u32 in (42, 4294966296); +select count(), sum(number) from file('02892.orc') where indexHint(i32 in (42, -1000)); +select count(1), min(i32), max(i32) from file('02892.orc') where i32 in (42, -1000); select count(), sum(number) from file('02892.orc') where indexHint(i32 between -150 and 250); select count(1), min(i32), max(i32) from file('02892.orc') where i32 between -150 and 250; -select count(), sum(number) from file('02892.orc') where indexHint(u64 in (42, 18446744073709550616)); -select count(1), min(u64), max(u64) from file('02892.orc') where u64 in (42, 18446744073709550616); +select count(), sum(number) from file('02892.orc') where indexHint(i64 in (42, -1000)); +select count(1), min(i64), max(i64) from file('02892.orc') where i64 in (42, -1000); select count(), sum(number) from file('02892.orc') where indexHint(i64 between -150 and 250); select count(1), min(i64), max(i64) from file('02892.orc') where i64 between -150 and 250; @@ -117,17 +111,17 @@ select count(1), min(d128), max(128) from file('02892.orc') where (d128 between select count(), sum(number) from file('02892.orc') where indexHint(0); select count(), min(number), max(number) from file('02892.orc') where indexHint(0); -select count(), sum(number) from file('02892.orc') where indexHint(s like '99%' or u64 == 2000); -select count(), min(s), max(s) from file('02892.orc') where (s like '99%' or u64 == 2000); +select count(), sum(number) from file('02892.orc') where indexHint(s like '99%' or i64 == 2000); +select count(), min(s), max(s) from file('02892.orc') where (s like '99%' or i64 == 2000); select count(), sum(number) from file('02892.orc') where indexHint(s like 'z%'); select count(), min(s), max(s) from file('02892.orc') where (s like 'z%'); -select count(), sum(number) from file('02892.orc') where indexHint(u8 == 10 or 1 == 1); -select count(), min(u8), max(u8) from file('02892.orc') where (u8 == 10 or 1 == 1); +select count(), sum(number) from file('02892.orc') where 
indexHint(i8 == 10 or 1 == 1); +select count(), min(i8), max(i8) from file('02892.orc') where (i8 == 10 or 1 == 1); -select count(), sum(number) from file('02892.orc') where indexHint(u8 < 0); -select count(), min(u8), max(u8) from file('02892.orc') where (u8 < 0); +select count(), sum(number) from file('02892.orc') where indexHint(i8 < 0); +select count(), min(i8), max(i8) from file('02892.orc') where (i8 < 0); -- { echoOff } -- Nullable and LowCardinality. From a4b55985fafabf28c2eded19eca56e75367e8f15 Mon Sep 17 00:00:00 2001 From: "Mikhail f. Shiryaev" Date: Thu, 6 Jun 2024 07:53:41 +0200 Subject: [PATCH 113/133] Fix issue with overwriting a lack of documentation failure --- tests/ci/run_check.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 9d9d1433073..064c1563913 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -23,7 +23,7 @@ from lambda_shared_package.lambda_shared.pr import ( check_pr_description, ) from pr_info import PRInfo -from report import FAILURE, PENDING, SUCCESS +from report import FAILURE, PENDING, SUCCESS, StatusType TRUSTED_ORG_IDS = { 54801242, # clickhouse @@ -93,6 +93,7 @@ def main(): description = format_description(description) gh = Github(get_best_robot_token(), per_page=100) commit = get_commit(gh, pr_info.sha) + status = SUCCESS # type: StatusType description_error, category = check_pr_description(pr_info.body, GITHUB_REPOSITORY) pr_labels_to_add = [] @@ -132,6 +133,7 @@ def main(): if pr_labels_to_remove: remove_labels(gh, pr_info, pr_labels_to_remove) + # 1. Next three IFs are in a correct order. First - fatal error if description_error: print( "::error ::Cannot run, PR description does not match the template: " @@ -146,9 +148,10 @@ def main(): f"{GITHUB_SERVER_URL}/{GITHUB_REPOSITORY}/" "blob/master/.github/PULL_REQUEST_TEMPLATE.md?plain=1" ) + status = FAILURE post_commit_status( commit, - FAILURE, + status, url, format_description(description_error), PR_CHECK, @@ -156,6 +159,7 @@ def main(): ) sys.exit(1) + # 2. Then we check if the documentation is not created to fail the Mergeable check if ( Labels.PR_FEATURE in pr_info.labels and not pr_info.has_changes_in_documentation() @@ -164,20 +168,15 @@ def main(): f"The '{Labels.PR_FEATURE}' in the labels, " "but there's no changed documentation" ) - post_commit_status( - commit, - FAILURE, - "", - f"expect adding docs for {Labels.PR_FEATURE}", - PR_CHECK, - pr_info, - ) - # allow the workflow to continue + status = FAILURE + description = f"expect adding docs for {Labels.PR_FEATURE}" + # 3. But we allow the workflow to continue + # 4. And post only a single commit status on a failure if not can_run: post_commit_status( commit, - FAILURE, + status, "", description, PR_CHECK, @@ -186,11 +185,12 @@ def main(): print("::notice ::Cannot run") sys.exit(1) + # The status for continue can be posted only one time, not more. post_commit_status( commit, - SUCCESS, + status, "", - "ok", + description, PR_CHECK, pr_info, ) From 61582a9bee43d75ab24eb357cc30045dd03b6b36 Mon Sep 17 00:00:00 2001 From: "Mikhail f. 
Shiryaev" Date: Thu, 6 Jun 2024 07:54:38 +0200 Subject: [PATCH 114/133] Use logging where no special strings are required --- tests/ci/run_check.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/tests/ci/run_check.py b/tests/ci/run_check.py index 064c1563913..131cbeef786 100644 --- a/tests/ci/run_check.py +++ b/tests/ci/run_check.py @@ -58,7 +58,7 @@ def pr_is_by_trusted_user(pr_user_login, pr_user_orgs): # Returns can_run, description def should_run_ci_for_pr(pr_info: PRInfo) -> Tuple[bool, str]: # Consider the labels and whether the user is trusted. - print("Got labels", pr_info.labels) + logging.info("Got labels: %s", pr_info.labels) if OK_SKIP_LABELS.intersection(pr_info.labels): return True, "Don't try new checks for release/backports/cherry-picks" @@ -66,9 +66,10 @@ def should_run_ci_for_pr(pr_info: PRInfo) -> Tuple[bool, str]: if Labels.CAN_BE_TESTED not in pr_info.labels and not pr_is_by_trusted_user( pr_info.user_login, pr_info.user_orgs ): - print( - f"PRs by untrusted users need the '{Labels.CAN_BE_TESTED}' label - " - "please contact a member of the core team" + logging.info( + "PRs by untrusted users need the '%s' label - " + "please contact a member of the core team", + Labels.CAN_BE_TESTED, ) return False, "Needs 'can be tested' label" @@ -126,7 +127,9 @@ def main(): f"::notice :: Add backport labels [{backport_labels}] for a given PR category" ) - print(f"Change labels: add {pr_labels_to_add}, remove {pr_labels_to_remove}") + logging.info( + "Change labels: add %s, remove %s", pr_labels_to_add, pr_labels_to_remove + ) if pr_labels_to_add: post_labels(gh, pr_info, pr_labels_to_add) @@ -165,7 +168,7 @@ def main(): and not pr_info.has_changes_in_documentation() ): print( - f"The '{Labels.PR_FEATURE}' in the labels, " + f"::error ::The '{Labels.PR_FEATURE}' in the labels, " "but there's no changed documentation" ) status = FAILURE @@ -182,7 +185,7 @@ def main(): PR_CHECK, pr_info, ) - print("::notice ::Cannot run") + print("::error ::Cannot run") sys.exit(1) # The status for continue can be posted only one time, not more. 
From f9ac18d74a80fe35e24baa8f896be7e891280888 Mon Sep 17 00:00:00 2001 From: serxa Date: Thu, 6 Jun 2024 08:16:11 +0000 Subject: [PATCH 115/133] better description for history of a setting changes --- src/Core/SettingsChangesHistory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 8b157517263..b47b3a02466 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -95,7 +95,7 @@ static std::map sett {"hdfs_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in HDFS table engine"}, {"azure_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in AzureBlobStorage table engine"}, {"s3_ignore_file_doesnt_exist", false, false, "Allow to return 0 rows when the requested files don't exist instead of throwing an exception in S3 table engine"}, - {"min_untracked_memory", 4_MiB, 4_KiB, "A new setting."}, + {"min_untracked_memory", 4_MiB, 4_KiB, "A new setting to enable more accurate memory tracking."}, }}, {"24.5", {{"allow_deprecated_error_prone_window_functions", true, false, "Allow usage of deprecated error prone window functions (neighbor, runningAccumulate, runningDifferenceStartingWithFirstValue, runningDifference)"}, {"allow_experimental_join_condition", false, false, "Support join with inequal conditions which involve columns from both left and right table. e.g. t1.y < t2.y."}, From 45fda3fd3990b8047290af1b226d857cb47608ed Mon Sep 17 00:00:00 2001 From: serxa Date: Thu, 6 Jun 2024 08:20:52 +0000 Subject: [PATCH 116/133] use Mi suffix to make things obvious --- tests/integration/test_failed_async_inserts/test.py | 2 +- tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_failed_async_inserts/test.py b/tests/integration/test_failed_async_inserts/test.py index 2bb56b250ea..e7e504e565f 100644 --- a/tests/integration/test_failed_async_inserts/test.py +++ b/tests/integration/test_failed_async_inserts/test.py @@ -45,7 +45,7 @@ def test_failed_async_inserts(started_cluster): ignore_error=True, ) - select_query = "SELECT value FROM system.events WHERE event == 'FailedAsyncInsertQuery' SETTINGS min_untracked_memory = 4194304" + select_query = "SELECT value FROM system.events WHERE event == 'FailedAsyncInsertQuery' SETTINGS min_untracked_memory = '4Mi'" assert node.query(select_query) == "4\n" diff --git a/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql b/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql index 92ef928bc2f..de84846c1d7 100644 --- a/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql +++ b/tests/queries/0_stateless/01017_uniqCombined_memory_usage.sql @@ -8,7 +8,7 @@ -- hence max_memory_usage for 100 rows = (96<<10)*100 = 9830400 SET use_uncompressed_cache = 0; -SET min_untracked_memory = 4194304; -- 4MiB +SET min_untracked_memory = '4Mi'; -- HashTable for UInt32 (used until (1<<13) elements), hence 8192 elements SELECT 'UInt32'; From b9edf204d9bf3b37072f3f2c6051fcc7fd286cfa Mon Sep 17 00:00:00 2001 From: serxa Date: Thu, 6 Jun 2024 08:51:22 +0000 Subject: [PATCH 117/133] better --- tests/integration/test_settings_constraints_distributed/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/integration/test_settings_constraints_distributed/test.py b/tests/integration/test_settings_constraints_distributed/test.py index a1f44af1069..51541721a29 100644 --- a/tests/integration/test_settings_constraints_distributed/test.py +++ b/tests/integration/test_settings_constraints_distributed/test.py @@ -137,7 +137,7 @@ def test_select_clamps_settings(): assert ( distributed.query( - query, settings={"max_memory_usage": 1, "min_untracked_memory": 4194304} + query, settings={"max_memory_usage": 1, "min_untracked_memory": 4 * 1024 * 1024} ) == "node1\tmax_memory_usage\t11111111\n" "node1\treadonly\t0\n" From 74897790aa146ff814817912c600734c70990895 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 6 Jun 2024 09:00:35 +0000 Subject: [PATCH 118/133] Automatic style fix --- .../integration/test_settings_constraints_distributed/test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_settings_constraints_distributed/test.py b/tests/integration/test_settings_constraints_distributed/test.py index 51541721a29..d29b66b43bb 100644 --- a/tests/integration/test_settings_constraints_distributed/test.py +++ b/tests/integration/test_settings_constraints_distributed/test.py @@ -137,7 +137,8 @@ def test_select_clamps_settings(): assert ( distributed.query( - query, settings={"max_memory_usage": 1, "min_untracked_memory": 4 * 1024 * 1024} + query, + settings={"max_memory_usage": 1, "min_untracked_memory": 4 * 1024 * 1024}, ) == "node1\tmax_memory_usage\t11111111\n" "node1\treadonly\t0\n" From 4cc9cecd7ce16f359c9787a84f451839b4c1acd6 Mon Sep 17 00:00:00 2001 From: Christoph Wurm Date: Thu, 6 Jun 2024 10:14:38 +0100 Subject: [PATCH 119/133] Document REPLACE PARTITION as atomic --- docs/en/sql-reference/statements/alter/partition.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index ce5cecf6fd6..0ed1e523669 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -139,7 +139,7 @@ For the query to run successfully, the following conditions must be met: ALTER TABLE table2 [ON CLUSTER cluster] REPLACE PARTITION partition_expr FROM table1 ``` -This query copies the data partition from the `table1` to `table2` and replaces existing partition in the `table2`. +This query copies the data partition from `table1` to `table2` and replaces the existing partition in `table2`. The operation is atomic. 
Note that: From 7900fe583699cc88f2f34fd26b4adadb275e2025 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 6 Jun 2024 11:36:57 +0200 Subject: [PATCH 120/133] Revert "Reduce lock contention for MergeTree tables (by renaming parts without holding lock)" --- .../MergeTree/DataPartStorageOnDiskBase.cpp | 16 +--- .../MergeTree/DataPartStorageOnDiskBase.h | 1 - src/Storages/MergeTree/IDataPartStorage.h | 11 +-- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 6 +- src/Storages/MergeTree/MergeTreeData.cpp | 96 ++++--------------- src/Storages/MergeTree/MergeTreeData.h | 26 ++--- .../MergeTree/MergeTreeDataMergerMutator.cpp | 5 +- src/Storages/MergeTree/MergeTreeSink.cpp | 3 +- .../MergeTree/MutateFromLogEntryTask.cpp | 3 +- .../MergeTree/MutatePlainMergeTreeTask.cpp | 3 +- .../MergeTree/ReplicatedMergeTreeSink.cpp | 5 +- src/Storages/StorageMergeTree.cpp | 14 +-- src/Storages/StorageReplicatedMergeTree.cpp | 20 ++-- 13 files changed, 59 insertions(+), 150 deletions(-) diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index 120e0a6f426..378a1944396 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -59,16 +59,6 @@ std::string DataPartStorageOnDiskBase::getRelativePath() const return fs::path(root_path) / part_dir / ""; } -std::string DataPartStorageOnDiskBase::getParentDirectory() const -{ - /// Cut last "/" if it exists (it shouldn't). Otherwise fs::path behave differently. - fs::path part_dir_without_slash = part_dir.ends_with("/") ? part_dir.substr(0, part_dir.size() - 1) : part_dir; - - if (part_dir_without_slash.has_parent_path()) - return part_dir_without_slash.parent_path(); - return ""; -} - std::optional DataPartStorageOnDiskBase::getRelativePathForPrefix(LoggerPtr log, const String & prefix, bool detached, bool broken) const { assert(!broken || detached); @@ -684,9 +674,9 @@ void DataPartStorageOnDiskBase::remove( if (!has_delete_prefix) { - auto parent_path = getParentDirectory(); - if (!parent_path.empty()) + if (part_dir_without_slash.has_parent_path()) { + auto parent_path = part_dir_without_slash.parent_path(); if (parent_path == MergeTreeData::DETACHED_DIR_NAME) throw Exception( ErrorCodes::LOGICAL_ERROR, @@ -694,7 +684,7 @@ void DataPartStorageOnDiskBase::remove( part_dir, root_path); - part_dir_without_slash = fs::path(parent_path) / ("delete_tmp_" + std::string{part_dir_without_slash.filename()}); + part_dir_without_slash = parent_path / ("delete_tmp_" + std::string{part_dir_without_slash.filename()}); } else { diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h index 44b2454e256..81353d4e20b 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h @@ -20,7 +20,6 @@ public: std::string getRelativePath() const override; std::string getPartDirectory() const override; std::string getFullRootPath() const override; - std::string getParentDirectory() const override; Poco::Timestamp getLastModified() const override; UInt64 calculateTotalSizeOnDisk() const override; diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 9342d6ca0ea..f6320a7e1e4 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -96,12 +96,11 @@ public: virtual MergeTreeDataPartStorageType getType() const = 0; /// Methods to get path components 
of a data part. - virtual std::string getFullPath() const = 0; /// '/var/lib/clickhouse/data/database/table/moving/all_1_5_1' - virtual std::string getRelativePath() const = 0; /// 'database/table/moving/all_1_5_1' - virtual std::string getPartDirectory() const = 0; /// 'all_1_5_1' - virtual std::string getFullRootPath() const = 0; /// '/var/lib/clickhouse/data/database/table/moving' - virtual std::string getParentDirectory() const = 0; /// '' (or 'detached' for 'detached/all_1_5_1') - /// Can add it if needed /// 'database/table/moving' + virtual std::string getFullPath() const = 0; /// '/var/lib/clickhouse/data/database/table/moving/all_1_5_1' + virtual std::string getRelativePath() const = 0; /// 'database/table/moving/all_1_5_1' + virtual std::string getPartDirectory() const = 0; /// 'all_1_5_1' + virtual std::string getFullRootPath() const = 0; /// '/var/lib/clickhouse/data/database/table/moving' + /// Can add it if needed /// 'database/table/moving' /// virtual std::string getRelativeRootPath() const = 0; /// Get a storage for projection. diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 143394b1171..4c8f1240cf5 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -737,11 +737,7 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks { /// Don't scare people with broken part error if (!isRetryableException(std::current_exception())) - { - auto message = getCurrentExceptionMessage(true); - LOG_ERROR(storage.log, "Part {} is broken and need manual correction. Reason: {}", - getDataPartStorage().getFullPath(), message); - } + LOG_ERROR(storage.log, "Part {} is broken and need manual correction", getDataPartStorage().getFullPath()); // There could be conditions that data part to be loaded is broken, but some of meta infos are already written // into meta data before exception, need to clean them all. 
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index ace28e058d4..cd706dab9ae 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3894,7 +3894,7 @@ void MergeTreeData::checkPartDynamicColumns(MutableDataPartPtr & part, DataParts } } -void MergeTreeData::preparePartForCommit(MutableDataPartPtr & part, Transaction & out_transaction, bool need_rename, bool rename_in_transaction) +void MergeTreeData::preparePartForCommit(MutableDataPartPtr & part, Transaction & out_transaction, bool need_rename) { part->is_temp = false; part->setState(DataPartState::PreActive); @@ -3906,15 +3906,12 @@ void MergeTreeData::preparePartForCommit(MutableDataPartPtr & part, Transaction return !may_be_cleaned_up || temporary_parts.contains(dir_name); }()); - if (need_rename && !rename_in_transaction) + if (need_rename) part->renameTo(part->name, true); LOG_TEST(log, "preparePartForCommit: inserting {} into data_parts_indexes", part->getNameWithState()); data_parts_indexes.insert(part); - if (rename_in_transaction) - out_transaction.addPart(part, need_rename); - else - out_transaction.addPart(part, /* need_rename= */ false); + out_transaction.addPart(part); } bool MergeTreeData::addTempPart( @@ -3963,8 +3960,7 @@ bool MergeTreeData::renameTempPartAndReplaceImpl( MutableDataPartPtr & part, Transaction & out_transaction, DataPartsLock & lock, - DataPartsVector * out_covered_parts, - bool rename_in_transaction) + DataPartsVector * out_covered_parts) { LOG_TRACE(log, "Renaming temporary part {} to {} with tid {}.", part->getDataPartStorage().getPartDirectory(), part->name, out_transaction.getTID()); @@ -4003,7 +3999,7 @@ bool MergeTreeData::renameTempPartAndReplaceImpl( /// All checks are passed. Now we can rename the part on disk. 
/// So, we maintain invariant: if a non-temporary part in filesystem then it is in data_parts - preparePartForCommit(part, out_transaction, /* need_rename= */ true, rename_in_transaction); + preparePartForCommit(part, out_transaction, /* need_rename */ true); if (out_covered_parts) { @@ -4018,31 +4014,29 @@ bool MergeTreeData::renameTempPartAndReplaceUnlocked( MutableDataPartPtr & part, Transaction & out_transaction, DataPartsLock & lock, - bool rename_in_transaction) + DataPartsVector * out_covered_parts) { - return renameTempPartAndReplaceImpl(part, out_transaction, lock, /*out_covered_parts=*/ nullptr, rename_in_transaction); + return renameTempPartAndReplaceImpl(part, out_transaction, lock, out_covered_parts); } MergeTreeData::DataPartsVector MergeTreeData::renameTempPartAndReplace( MutableDataPartPtr & part, - Transaction & out_transaction, - bool rename_in_transaction) + Transaction & out_transaction) { auto part_lock = lockParts(); DataPartsVector covered_parts; - renameTempPartAndReplaceImpl(part, out_transaction, part_lock, &covered_parts, rename_in_transaction); + renameTempPartAndReplaceImpl(part, out_transaction, part_lock, &covered_parts); return covered_parts; } bool MergeTreeData::renameTempPartAndAdd( MutableDataPartPtr & part, Transaction & out_transaction, - DataPartsLock & lock, - bool rename_in_transaction) + DataPartsLock & lock) { DataPartsVector covered_parts; - if (!renameTempPartAndReplaceImpl(part, out_transaction, lock, &covered_parts, rename_in_transaction)) + if (!renameTempPartAndReplaceImpl(part, out_transaction, lock, &covered_parts)) return false; if (!covered_parts.empty()) @@ -4083,9 +4077,9 @@ void MergeTreeData::removePartsFromWorkingSet(MergeTreeTransaction * txn, const resetObjectColumnsFromActiveParts(acquired_lock); } -void MergeTreeData::removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove, DataPartsLock * acquired_lock) +void MergeTreeData::removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove) { - auto lock = (acquired_lock) ? 
DataPartsLock() : lockParts(); + auto lock = lockParts(); for (const auto & part : remove) { @@ -4251,9 +4245,8 @@ MergeTreeData::PartsToRemoveFromZooKeeper MergeTreeData::removePartsInRangeFromW auto [new_data_part, tmp_dir_holder] = createEmptyPart(empty_info, partition, empty_part_name, NO_TRANSACTION_PTR); MergeTreeData::Transaction transaction(*this, NO_TRANSACTION_RAW); - renameTempPartAndAdd(new_data_part, transaction, lock, /*rename_in_transaction=*/ true); /// All covered parts must be already removed + renameTempPartAndAdd(new_data_part, transaction, lock); /// All covered parts must be already removed - transaction.renameParts(); /// It will add the empty part to the set of Outdated parts without making it Active (exactly what we need) transaction.rollback(&lock); new_data_part->remove_time.store(0, std::memory_order_relaxed); @@ -6624,54 +6617,25 @@ TransactionID MergeTreeData::Transaction::getTID() const return Tx::PrehistoricTID; } -void MergeTreeData::Transaction::addPart(MutableDataPartPtr & part, bool need_rename) +void MergeTreeData::Transaction::addPart(MutableDataPartPtr & part) { precommitted_parts.insert(part); - if (need_rename) - precommitted_parts_need_rename.insert(part); } void MergeTreeData::Transaction::rollback(DataPartsLock * lock) { if (!isEmpty()) { - for (const auto & part : precommitted_parts) - part->version.creation_csn.store(Tx::RolledBackCSN); - - auto non_detached_precommitted_parts = precommitted_parts; - - /// Remove detached parts from working set. - /// - /// It is possible to have detached parts here, only when rename (in - /// commit()) of detached parts had been broken (i.e. during ATTACH), - /// i.e. the part itself is broken. - DataPartsVector detached_precommitted_parts; - for (auto it = non_detached_precommitted_parts.begin(); it != non_detached_precommitted_parts.end();) - { - const auto & part = *it; - if (part->getDataPartStorage().getParentDirectory() == DETACHED_DIR_NAME) - { - detached_precommitted_parts.push_back(part); - it = non_detached_precommitted_parts.erase(it); - } - else - ++it; - } - WriteBufferFromOwnString buf; buf << "Removing parts:"; - for (const auto & part : non_detached_precommitted_parts) + for (const auto & part : precommitted_parts) buf << " " << part->getDataPartStorage().getPartDirectory(); buf << "."; - if (!detached_precommitted_parts.empty()) - { - buf << " Rollbacking parts state to temporary and removing from working set:"; - for (const auto & part : detached_precommitted_parts) - buf << " " << part->getDataPartStorage().getPartDirectory(); - buf << "."; - } LOG_DEBUG(data.log, "Undoing transaction {}. {}", getTID(), buf.str()); + for (const auto & part : precommitted_parts) + part->version.creation_csn.store(Tx::RolledBackCSN); + /// It would be much better with TSA... auto our_lock = (lock) ? DataPartsLock() : data.lockParts(); @@ -6681,7 +6645,7 @@ void MergeTreeData::Transaction::rollback(DataPartsLock * lock) if (!data.all_data_dropped) { Strings part_names; - for (const auto & part : non_detached_precommitted_parts) + for (const auto & part : precommitted_parts) part_names.emplace_back(part->name); throw Exception(ErrorCodes::LOGICAL_ERROR, "There are some PreActive parts ({}) to rollback, " "but data parts set is empty and table {} was not dropped. 
It's a bug", @@ -6690,12 +6654,8 @@ void MergeTreeData::Transaction::rollback(DataPartsLock * lock) } else { - data.removePartsFromWorkingSetImmediatelyAndSetTemporaryState( - detached_precommitted_parts, - &our_lock); - data.removePartsFromWorkingSet(txn, - DataPartsVector(non_detached_precommitted_parts.begin(), non_detached_precommitted_parts.end()), + DataPartsVector(precommitted_parts.begin(), precommitted_parts.end()), /* clear_without_timeout = */ true, &our_lock); } } @@ -6705,16 +6665,7 @@ void MergeTreeData::Transaction::rollback(DataPartsLock * lock) void MergeTreeData::Transaction::clear() { - chassert(precommitted_parts.size() >= precommitted_parts_need_rename.size()); precommitted_parts.clear(); - precommitted_parts_need_rename.clear(); -} - -void MergeTreeData::Transaction::renameParts() -{ - for (const auto & part_need_rename : precommitted_parts_need_rename) - part_need_rename->renameTo(part_need_rename->name, true); - precommitted_parts_need_rename.clear(); } MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(DataPartsLock * acquired_parts_lock) @@ -6723,9 +6674,6 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(DataPartsLock if (!isEmpty()) { - if (!precommitted_parts_need_rename.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Parts not renamed"); - auto settings = data.getSettings(); auto parts_lock = acquired_parts_lock ? DataPartsLock() : data.lockParts(); auto * owing_parts_lock = acquired_parts_lock ? acquired_parts_lock : &parts_lock; @@ -6734,8 +6682,6 @@ MergeTreeData::DataPartsVector MergeTreeData::Transaction::commit(DataPartsLock if (part->getDataPartStorage().hasActiveTransaction()) part->getDataPartStorage().commitTransaction(); - renameParts(); - if (txn) { for (const auto & part : precommitted_parts) diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 7881062b724..c6f736a4afd 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -255,9 +255,7 @@ public: DataPartsVector commit(DataPartsLock * acquired_parts_lock = nullptr); - void renameParts(); - - void addPart(MutableDataPartPtr & part, bool need_rename); + void addPart(MutableDataPartPtr & part); void rollback(DataPartsLock * lock = nullptr); @@ -288,9 +286,9 @@ public: MergeTreeData & data; MergeTreeTransaction * txn; - MutableDataParts precommitted_parts; - MutableDataParts precommitted_parts_need_rename; + MutableDataParts locked_parts; + }; using TransactionUniquePtr = std::unique_ptr; @@ -590,27 +588,25 @@ public: bool renameTempPartAndAdd( MutableDataPartPtr & part, Transaction & transaction, - DataPartsLock & lock, - bool rename_in_transaction); + DataPartsLock & lock); /// The same as renameTempPartAndAdd but the block range of the part can contain existing parts. /// Returns all parts covered by the added part (in ascending order). DataPartsVector renameTempPartAndReplace( MutableDataPartPtr & part, - Transaction & out_transaction, - bool rename_in_transaction); + Transaction & out_transaction); /// Unlocked version of previous one. Useful when added multiple parts with a single lock. bool renameTempPartAndReplaceUnlocked( MutableDataPartPtr & part, Transaction & out_transaction, DataPartsLock & lock, - bool rename_in_transaction); + DataPartsVector * out_covered_parts = nullptr); /// Remove parts from working set immediately (without wait for background /// process). Transfer part state to temporary. 
Have very limited usage only /// for new parts which aren't already present in table. - void removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove, DataPartsLock * acquired_lock = nullptr); + void removePartsFromWorkingSetImmediatelyAndSetTemporaryState(const DataPartsVector & remove); /// Removes parts from the working set parts. /// Parts in add must already be in data_parts with PreActive, Active, or Outdated states. @@ -1606,10 +1602,7 @@ private: /// Preparing itself to be committed in memory: fill some fields inside part, add it to data_parts_indexes /// in precommitted state and to transaction - /// - /// @param need_rename - rename the part - /// @param rename_in_transaction - if set, the rename will be done as part of transaction (without holding DataPartsLock), otherwise inplace (when it does not make sense). - void preparePartForCommit(MutableDataPartPtr & part, Transaction & out_transaction, bool need_rename, bool rename_in_transaction = false); + void preparePartForCommit(MutableDataPartPtr & part, Transaction & out_transaction, bool need_rename); /// Low-level method for preparing parts for commit (in-memory). /// FIXME Merge MergeTreeTransaction and Transaction @@ -1617,8 +1610,7 @@ private: MutableDataPartPtr & part, Transaction & out_transaction, DataPartsLock & lock, - DataPartsVector * out_covered_parts, - bool rename_in_transaction); + DataPartsVector * out_covered_parts); /// RAII Wrapper for atomic work with currently moving parts /// Acquire them in constructor and remove them in destructor diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 791bcbc3275..2d49e1df19b 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -748,10 +748,7 @@ MergeTreeData::DataPartPtr MergeTreeDataMergerMutator::renameMergedTemporaryPart "but transactions were enabled for this table"); /// Rename new part, add to the set and remove original parts. - auto replaced_parts = data.renameTempPartAndReplace(new_data_part, out_transaction, /*rename_in_transaction=*/ true); - - /// Explicitly rename part while still holding the lock for tmp folder to avoid cleanup - out_transaction.renameParts(); + auto replaced_parts = data.renameTempPartAndReplace(new_data_part, out_transaction); /// Let's check that all original parts have been deleted and only them. 
if (replaced_parts.size() != parts.size()) diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index dd28c04fef7..b7dede3cb00 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -186,8 +186,7 @@ void MergeTreeSink::finishDelayedChunk() } } - /// FIXME - added = storage.renameTempPartAndAdd(part, transaction, lock, /*rename_in_transaction=*/ false); + added = storage.renameTempPartAndAdd(part, transaction, lock); transaction.commit(&lock); } diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp index 5c59d5c1b47..8d40658bb2c 100644 --- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp @@ -236,11 +236,10 @@ bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrit if (data_part_storage.hasActiveTransaction()) data_part_storage.precommitTransaction(); - storage.renameTempPartAndReplace(new_part, *transaction_ptr, /*rename_in_transaction=*/ true); + storage.renameTempPartAndReplace(new_part, *transaction_ptr); try { - transaction_ptr->renameParts(); storage.checkPartChecksumsAndCommit(*transaction_ptr, new_part, mutate_task->getHardlinkedFiles()); } catch (const Exception & e) diff --git a/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp b/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp index 8a0d5c444bd..2fd02708421 100644 --- a/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp +++ b/src/Storages/MergeTree/MutatePlainMergeTreeTask.cpp @@ -97,8 +97,7 @@ bool MutatePlainMergeTreeTask::executeStep() MergeTreeData::Transaction transaction(storage, merge_mutate_entry->txn.get()); /// FIXME Transactions: it's too optimistic, better to lock parts before starting transaction - storage.renameTempPartAndReplace(new_part, transaction, /*rename_in_transaction=*/ true); - transaction.renameParts(); + storage.renameTempPartAndReplace(new_part, transaction); transaction.commit(); storage.updateMutationEntriesErrors(future_part, true, ""); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 50142185f79..4b4f4c33e7d 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -888,7 +888,7 @@ std::pair, bool> ReplicatedMergeTreeSinkImpl:: try { auto lock = storage.lockParts(); - storage.renameTempPartAndAdd(part, transaction, lock, /*rename_in_transaction=*/ false); + storage.renameTempPartAndAdd(part, transaction, lock); } catch (const Exception & e) { @@ -903,9 +903,6 @@ std::pair, bool> ReplicatedMergeTreeSinkImpl:: throw; } - /// Rename parts before committing to ZooKeeper without holding DataPartsLock. 
- transaction.renameParts(); - ThreadFuzzer::maybeInjectSleep(); fiu_do_on(FailPoints::replicated_merge_tree_commit_zk_fail_after_op, { zookeeper->forceFailureAfterOperation(); }); diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index a85bc936031..27a76f4f21d 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1788,7 +1788,7 @@ void StorageMergeTree::renameAndCommitEmptyParts(MutableDataPartsVector & new_pa for (auto & part: new_parts) { - DataPartsVector covered_parts_by_one_part = renameTempPartAndReplace(part, transaction, /*rename_in_transaction=*/ true); + DataPartsVector covered_parts_by_one_part = renameTempPartAndReplace(part, transaction); if (covered_parts_by_one_part.size() > 1) throw Exception(ErrorCodes::LOGICAL_ERROR, @@ -1798,10 +1798,10 @@ void StorageMergeTree::renameAndCommitEmptyParts(MutableDataPartsVector & new_pa std::move(covered_parts_by_one_part.begin(), covered_parts_by_one_part.end(), std::back_inserter(covered_parts)); } + LOG_INFO(log, "Remove {} parts by covering them with empty {} parts. With txn {}.", covered_parts.size(), new_parts.size(), transaction.getTID()); - transaction.renameParts(); transaction.commit(); /// Remove covered parts without waiting for old_parts_lifetime seconds. @@ -2064,7 +2064,7 @@ PartitionCommandsResultInfo StorageMergeTree::attachPartition( { auto lock = lockParts(); fillNewPartNameAndResetLevel(loaded_parts[i], lock); - renameTempPartAndAdd(loaded_parts[i], transaction, lock, /*rename_in_transaction=*/ false); + renameTempPartAndAdd(loaded_parts[i], transaction, lock); transaction.commit(&lock); } @@ -2180,9 +2180,8 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con for (auto part : dst_parts) { fillNewPartName(part, data_parts_lock); - renameTempPartAndReplaceUnlocked(part, transaction, data_parts_lock, /*rename_in_transaction=*/ true); + renameTempPartAndReplaceUnlocked(part, transaction, data_parts_lock); } - transaction.renameParts(); /// Populate transaction transaction.commit(&data_parts_lock); @@ -2285,9 +2284,10 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const for (auto & part : dst_parts) { dest_table_storage->fillNewPartName(part, dest_data_parts_lock); - dest_table_storage->renameTempPartAndReplaceUnlocked(part, transaction, dest_data_parts_lock, /*rename_in_transaction=*/ false); + dest_table_storage->renameTempPartAndReplaceUnlocked(part, transaction, dest_data_parts_lock); } + removePartsFromWorkingSet(local_context->getCurrentTransaction().get(), src_parts, true, src_data_parts_lock); transaction.commit(&src_data_parts_lock); } @@ -2447,7 +2447,7 @@ void StorageMergeTree::attachRestoredParts(MutableDataPartsVector && parts) { auto lock = lockParts(); fillNewPartName(part, lock); - renameTempPartAndAdd(part, transaction, lock, /*rename_in_transaction=*/ false); + renameTempPartAndAdd(part, transaction, lock); transaction.commit(&lock); } } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 9ebca78d87a..e18e66d7af9 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -2093,8 +2093,7 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry) Transaction transaction(*this, NO_TRANSACTION_RAW); part->version.setCreationTID(Tx::PrehistoricTID, nullptr); - renameTempPartAndReplace(part, transaction, /*rename_in_transaction=*/ true); - 
transaction.renameParts(); + renameTempPartAndReplace(part, transaction); checkPartChecksumsAndCommit(transaction, part); writePartLog(PartLogElement::Type::NEW_PART, {}, 0 /** log entry is fake so we don't measure the time */, @@ -2883,11 +2882,11 @@ bool StorageReplicatedMergeTree::executeReplaceRange(LogEntry & entry) Coordination::Requests ops; for (PartDescriptionPtr & part_desc : final_parts) { - renameTempPartAndReplace(part_desc->res_part, transaction, /*rename_in_transaction=*/ true); + renameTempPartAndReplace(part_desc->res_part, transaction); getCommitPartOps(ops, part_desc->res_part); - lockSharedData(*part_desc->res_part, /*replace_existing_lock=*/ true, part_desc->hardlinked_files); + + lockSharedData(*part_desc->res_part, /* replace_existing_lock */ true, part_desc->hardlinked_files); } - transaction.renameParts(); if (!ops.empty()) @@ -4959,8 +4958,7 @@ bool StorageReplicatedMergeTree::fetchPart( if (!to_detached) { Transaction transaction(*this, NO_TRANSACTION_RAW); - renameTempPartAndReplace(part, transaction, /*rename_in_transaction=*/ true); - transaction.renameParts(); + renameTempPartAndReplace(part, transaction); chassert(!part_to_clone || !is_zero_copy_part(part)); replaced_parts = checkPartChecksumsAndCommit(transaction, part, /*hardlinked_files*/ {}, /*replace_zero_copy_lock*/ true); @@ -8204,9 +8202,8 @@ void StorageReplicatedMergeTree::replacePartitionFrom( { auto data_parts_lock = lockParts(); for (auto & part : dst_parts) - renameTempPartAndReplaceUnlocked(part, transaction, data_parts_lock, /*rename_in_transaction=*/ true); + renameTempPartAndReplaceUnlocked(part, transaction, data_parts_lock); } - transaction.renameParts(); for (const auto & dst_part : dst_parts) lockSharedData(*dst_part, false, /*hardlinked_files*/ {}); @@ -8481,7 +8478,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta auto dest_data_parts_lock = dest_table_storage->lockParts(); for (auto & part : dst_parts) - dest_table_storage->renameTempPartAndReplaceUnlocked(part, transaction, dest_data_parts_lock, /*rename_in_transaction=*/ false); + dest_table_storage->renameTempPartAndReplaceUnlocked(part, transaction, dest_data_parts_lock); for (const auto & dst_part : dst_parts) dest_table_storage->lockSharedData(*dst_part, false, /*hardlinked_files*/ {}); @@ -10114,8 +10111,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP try { MergeTreeData::Transaction transaction(*this, NO_TRANSACTION_RAW); - auto replaced_parts = renameTempPartAndReplace(new_data_part, transaction, /*rename_in_transaction=*/ true); - transaction.renameParts(); + auto replaced_parts = renameTempPartAndReplace(new_data_part, transaction); if (!replaced_parts.empty()) { From a7e4875e4ceba1887225fa35de408cc630d6226b Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 6 Jun 2024 11:45:41 +0200 Subject: [PATCH 121/133] Fix test_mask_sensitive_info/test.py::test_create_table CI: https://s3.amazonaws.com/clickhouse-test-reports/64887/5d01a2f7c8b41244ecad2cbf2c7f5ed7e6113cc9/integration_tests__asan__old_analyzer__[2_6].html Signed-off-by: Azat Khuzhin --- tests/integration/test_mask_sensitive_info/test.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_mask_sensitive_info/test.py b/tests/integration/test_mask_sensitive_info/test.py index d2562f3966a..38cbf8c1aed 100644 --- a/tests/integration/test_mask_sensitive_info/test.py +++ b/tests/integration/test_mask_sensitive_info/test.py @@ -260,8 +260,9 @@ def 
test_create_table(): "CREATE TABLE table16 (`x` int) ENGINE = DeltaLake('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')", "CREATE TABLE table17 (x int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'CSV') settings mode = 'ordered'", "CREATE TABLE table18 (x int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'CSV', 'gzip') settings mode = 'ordered'", - "CREATE TABLE table19 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV') settings mode = 'ordered'", - "CREATE TABLE table20 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV', 'gzip') settings mode = 'ordered'", + # due to sensitive data substitution the query will be normalized, so not "settings" but "SETTINGS" + "CREATE TABLE table19 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV') SETTINGS mode = 'ordered'", + "CREATE TABLE table20 (`x` int) ENGINE = S3Queue('http://minio1:9001/root/data/', 'minio', '[HIDDEN]', 'CSV', 'gzip') SETTINGS mode = 'ordered'", ], must_not_contain=[password], ) From ba40f7a754c038152d66b0627ebe208029856f4a Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 6 Jun 2024 13:20:31 +0200 Subject: [PATCH 122/133] Update 03165_string_functions_with_token_text_indexes.sql --- .../03165_string_functions_with_token_text_indexes.sql | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql b/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql index a0cb8a35169..fee30af0245 100644 --- a/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql +++ b/tests/queries/0_stateless/03165_string_functions_with_token_text_indexes.sql @@ -2,6 +2,8 @@ SELECT '-------- Bloom filter --------'; SELECT ''; DROP TABLE IF EXISTS 03165_token_bf; +SET allow_experimental_full_text_index=1; + CREATE TABLE 03165_token_bf ( id Int64, From 2a5f9c941c331d06fc487bbb1dda423d6b2b370c Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 6 Jun 2024 13:47:13 +0200 Subject: [PATCH 123/133] document to/fromUnixTimestampXYZ functions --- .../functions/type-conversion-functions.md | 197 +++++++++++++++--- 1 file changed, 169 insertions(+), 28 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 5dd1d5ceebe..60bdab22a58 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -2423,11 +2423,7 @@ Result: ## toUnixTimestamp64Milli -## toUnixTimestamp64Micro -## toUnixTimestamp64Nano -Converts a `DateTime64` to a `Int64` value with fixed sub-second precision. Input value is scaled up or down appropriately depending on it precision. +Converts a `DateTime64` to a `Int64` value with fixed millisecond precision. The input value is scaled up or down appropriately depending on its precision. :::note The output value is a timestamp in UTC, not in the timezone of `DateTime64`. ::: **Syntax** ```sql toUnixTimestamp64Milli(value) -toUnixTimestamp64Micro(value) -toUnixTimestamp64Nano(value) ``` **Arguments** - `value` — DateTime64 value with any precision. +- `value` — DateTime64 value with any precision. [DateTime64](../data-types/datetime64.md). **Returned value** -- `value` converted to the `Int64` data type.
+- `value` converted to the `Int64` data type. [Int64](../data-types/int-uint.md). -**Examples** +**Example** Query: ```sql -WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 +WITH toDateTime64('2009-02-13 23:31:31.011', 3, 'UTC') AS dt64 SELECT toUnixTimestamp64Milli(dt64); ``` @@ -2462,14 +2456,77 @@ Result: ```response ┌─toUnixTimestamp64Milli(dt64)─┐ -│ 1568650812345 │ +│ 1234567891011 │ └──────────────────────────────┘ ``` +## toUnixTimestamp64Micro + +Converts a `DateTime64` to a `Int64` value with fixed microsecond precision. The input value is scaled up or down appropriately depending on its precision. + +:::note +The output value is a timestamp in UTC, not in the timezone of `DateTime64`. +::: + +**Syntax** + +```sql +toUnixTimestamp64Micro(value) +``` + +**Arguments** + +- `value` — DateTime64 value with any precision. [DateTime64](../data-types/datetime64.md). + +**Returned value** + +- `value` converted to the `Int64` data type. [Int64](../data-types/int-uint.md). + +**Example** + Query: -``` sql -WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 +```sql +WITH toDateTime64('1970-01-15 06:56:07.891011', 6, 'UTC') AS dt64 +SELECT toUnixTimestamp64Micro(dt64); +``` + +Result: + +```response +┌─toUnixTimestamp64Micro(dt64)─┐ +│ 1234567891011 │ +└──────────────────────────────┘ +``` + +## toUnixTimestamp64Nano + +Converts a `DateTime64` to a `Int64` value with fixed nano precision. The input value is scaled up or down appropriately depending on its precision. + +:::note +The output value is a timestamp in UTC, not in the timezone of `DateTime64`. +::: + +**Syntax** + +```sql +toUnixTimestamp64Nano(value) +``` + +**Arguments** + +- `value` — DateTime64 value with any precision. [DateTime64](../data-types/datetime64.md). + +**Returned value** + +- `value` converted to the `Int64` data type. [Int64](../data-types/int-uint.md). + +**Example** + +Query: + +```sql +WITH toDateTime64('1970-01-01 00:20:34.567891011', 9, 'UTC') AS dt64 SELECT toUnixTimestamp64Nano(dt64); ``` @@ -2477,34 +2534,32 @@ Result: ```response ┌─toUnixTimestamp64Nano(dt64)─┐ -│ 1568650812345678000 │ +│ 1234567891011 │ └─────────────────────────────┘ ``` ## fromUnixTimestamp64Milli -## fromUnixTimestamp64Micro +Converts an `Int64` to a `DateTime64` value with fixed millisecond precision and optional timezone. The input value is scaled up or down appropriately depending on its precision. -## fromUnixTimestamp64Nano - -Converts an `Int64` to a `DateTime64` value with fixed sub-second precision and optional timezone. Input value is scaled up or down appropriately depending on it’s precision. Please note that input value is treated as UTC timestamp, not timestamp at given (or implicit) timezone. +:::note +Please note that input value is treated as a UTC timestamp, not timestamp at the given (or implicit) timezone. +::: **Syntax** ``` sql fromUnixTimestamp64Milli(value[, timezone]) -fromUnixTimestamp64Micro(value[, timezone]) -fromUnixTimestamp64Nano(value[, timezone]) ``` **Arguments** -- `value` — `Int64` value with any precision. -- `timezone` — `String` (optional) timezone name of the result. +- `value` — value with any precision. [Int64](../data-types/int-uint.md). +- `timezone` — (optional) timezone name of the result. [String](../data-types/string.md). **Returned value** -- `value` converted to the `DateTime64` data type. +- `value` converted to DateTime64 with precision `3`. [DateTime64](../data-types/datetime64.md). 
**Example** @@ -2512,15 +2567,101 @@ Query: ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC'); +SELECT + fromUnixTimestamp64Milli(i64, 'UTC') AS x, + toTypeName(x); ``` Result: ```response -┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ -│ 2009-02-13 23:31:31.011 │ -└──────────────────────────────────────┘ +┌───────────────────────x─┬─toTypeName(x)────────┐ +│ 2009-02-13 23:31:31.011 │ DateTime64(3, 'UTC') │ +└─────────────────────────┴──────────────────────┘ +``` + +## fromUnixTimestamp64Micro + +Converts an `Int64` to a `DateTime64` value with fixed microsecond precision and optional timezone. The input value is scaled up or down appropriately depending on its precision. + +:::note +Please note that input value is treated as a UTC timestamp, not timestamp at the given (or implicit) timezone. +::: + +**Syntax** + +``` sql +fromUnixTimestamp64Micro(value[, timezone]) +``` + +**Arguments** + +- `value` — value with any precision. [Int64](../data-types/int-uint.md). +- `timezone` — (optional) timezone name of the result. [String](../data-types/string.md). + +**Returned value** + +- `value` converted to DateTime64 with precision `6`. [DateTime64](../data-types/datetime64.md). + +**Example** + +Query: + +``` sql +WITH CAST(1234567891011, 'Int64') AS i64 +SELECT + fromUnixTimestamp64Micro(i64, 'UTC') AS x, + toTypeName(x); +``` + +Result: + +```response +┌──────────────────────────x─┬─toTypeName(x)────────┐ +│ 1970-01-15 06:56:07.891011 │ DateTime64(6, 'UTC') │ +└────────────────────────────┴──────────────────────┘ +``` + +## fromUnixTimestamp64Nano + +Converts an `Int64` to a `DateTime64` value with fixed nanosecond precision and optional timezone. The input value is scaled up or down appropriately depending on its precision. + +:::note +Please note that input value is treated as a UTC timestamp, not timestamp at the given (or implicit) timezone. +::: + +**Syntax** + +``` sql +fromUnixTimestamp64Nano(value[, timezone]) +``` + +**Arguments** + +- `value` — value with any precision. [Int64](../data-types/int-uint.md). +- `timezone` — (optional) timezone name of the result. [String](../data-types/string.md). + +**Returned value** + +- `value` converted to DateTime64 with precision `9`. [DateTime64](../data-types/datetime64.md). + +**Example** + +Query: + +``` sql +WITH CAST(1234567891011, 'Int64') AS i64 +SELECT + fromUnixTimestamp64Nano(i64, 'UTC') AS x, + toTypeName(x); +``` + +Result: + +```response +┌─────────────────────────────x─┬─toTypeName(x)────────┐ +│ 1970-01-01 00:20:34.567891011 │ DateTime64(9, 'UTC') │ +└───────────────────────────────┴──────────────────────┘ ``` ## formatRow From f05a6577232e7061d13fe5888fb3da07cbfdfe39 Mon Sep 17 00:00:00 2001 From: serxa Date: Thu, 6 Jun 2024 12:56:34 +0000 Subject: [PATCH 124/133] add docs --- docs/en/operations/settings/settings.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index ffaf53085c4..ada922cb037 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3172,7 +3172,7 @@ Default value: `0`. ## lightweight_deletes_sync {#lightweight_deletes_sync} -The same as 'mutation_sync', but controls only execution of lightweight deletes. +The same as 'mutation_sync', but controls only execution of lightweight deletes. Possible values: @@ -4616,6 +4616,16 @@ Read more about [memory overcommit](memory-overcommit.md). Default value: `1GiB`. 
+## max_untracked_memory {#max_untracked_memory} +Small allocations and deallocations are grouped in thread local variable and tracked or profiled only when amount (in absolute value) becomes larger than specified value. If the value is higher than 'memory_profiler_step' it will be effectively lowered to 'memory_profiler_step'. + +Default value: `4MiB`. + +## min_untracked_memory {#min_untracked_memory} +Lower bound for untracked memory limit which is applied to threads with low memory consumption. Untracked memory limit equals thread memory usage devided by 16 and clamped between `min_untracked_memory` and `max_untracked_memory` for every thread. It guarantees that total untracked memory does not exceed 10% of current memory footprint even with a lot of small threads. To disable dynamic limit for untracked memory set value `4MiB`. + +Default value: `4KiB`. + ## Schema Inference settings See [schema inference](../../interfaces/schema-inference.md#schema-inference-modes) documentation for more details. From 03458a516afa0f81623fd1e11fa7586d89fa7aab Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 6 Jun 2024 15:06:16 +0200 Subject: [PATCH 125/133] Fix typo --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 60bdab22a58..2ec51d43c59 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -2501,7 +2501,7 @@ Result: ## toUnixTimestamp64Nano -Converts a `DateTime64` to a `Int64` value with fixed nano precision. The input value is scaled up or down appropriately depending on its precision. +Converts a `DateTime64` to a `Int64` value with fixed nanosecond precision. The input value is scaled up or down appropriately depending on its precision. :::note The output value is a timestamp in UTC, not in the timezone of `DateTime64`. From afc63af264c1ae2cd523485d833912f0dd5090ff Mon Sep 17 00:00:00 2001 From: Nikita Fomichev Date: Thu, 6 Jun 2024 15:22:31 +0200 Subject: [PATCH 126/133] Update `largestTriangleThreeBuckets` doc --- .../reference/largestTriangleThreeBuckets.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/largestTriangleThreeBuckets.md b/docs/en/sql-reference/aggregate-functions/reference/largestTriangleThreeBuckets.md index 06443994dd9..4f73aadb8da 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/largestTriangleThreeBuckets.md +++ b/docs/en/sql-reference/aggregate-functions/reference/largestTriangleThreeBuckets.md @@ -24,6 +24,8 @@ Alias: `lttb`. - `x` — x coordinate. [Integer](../../../sql-reference/data-types/int-uint.md) , [Float](../../../sql-reference/data-types/float.md) , [Decimal](../../../sql-reference/data-types/decimal.md) , [Date](../../../sql-reference/data-types/date.md), [Date32](../../../sql-reference/data-types/date32.md), [DateTime](../../../sql-reference/data-types/datetime.md), [DateTime64](../../../sql-reference/data-types/datetime64.md). - `y` — y coordinate. 
[Integer](../../../sql-reference/data-types/int-uint.md) , [Float](../../../sql-reference/data-types/float.md) , [Decimal](../../../sql-reference/data-types/decimal.md) , [Date](../../../sql-reference/data-types/date.md), [Date32](../../../sql-reference/data-types/date32.md), [DateTime](../../../sql-reference/data-types/datetime.md), [DateTime64](../../../sql-reference/data-types/datetime64.md). +NaNs are ignored in the provided series, meaning that any NaN values will be excluded from the analysis. This ensures that the function operates only on valid numerical data. + **Parameters** - `n` — number of points in the resulting series. [UInt64](../../../sql-reference/data-types/int-uint.md). @@ -61,7 +63,7 @@ Result: ``` text ┌────────largestTriangleThreeBuckets(4)(x, y)───────────┐ -│ [(1,10),(3,15),(5,40),(10,70)] │ +│ [(1,10),(3,15),(9,55),(10,70)] │ └───────────────────────────────────────────────────────┘ ``` From 05592fb5ba97dd86a744b146d15e9a1cb0422357 Mon Sep 17 00:00:00 2001 From: Konstantin Morozov Date: Thu, 6 Jun 2024 14:27:23 +0000 Subject: [PATCH 127/133] additional log for cleanupDetachedTables --- src/Databases/DatabaseAtomic.cpp | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 8edc5b737a6..ccab72cfbae 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -1,20 +1,21 @@ +#include #include +#include #include #include -#include +#include #include #include -#include +#include +#include +#include +#include #include +#include +#include "Common/logger_useful.h" #include #include #include -#include -#include -#include -#include -#include -#include namespace fs = std::filesystem; @@ -393,6 +394,7 @@ DatabaseAtomic::DetachedTables DatabaseAtomic::cleanupDetachedTables() { DetachedTables not_in_use; auto it = detached_tables.begin(); + LOG_DEBUG(log, "There are {} detached tables. 
Start searching for unused tables.", detached_tables.size()); while (it != detached_tables.end()) { if (it->second.unique()) @@ -403,6 +405,7 @@ DatabaseAtomic::DetachedTables DatabaseAtomic::cleanupDetachedTables() else ++it; } + LOG_DEBUG(log, "Found {} unused tables in detached tables.", not_in_use.size()); /// It should be destroyed in caller with released database mutex return not_in_use; } From dd9b15daf5accedc0e850e4d12b2ebc88b24bd86 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 6 Jun 2024 23:43:14 +0800 Subject: [PATCH 128/133] Fix tupleConcat of two empty tuples --- src/Functions/tupleConcat.cpp | 5 ++++- tests/queries/0_stateless/03167_empty_tuple_concat.reference | 1 + tests/queries/0_stateless/03167_empty_tuple_concat.sql | 1 + 3 files changed, 6 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03167_empty_tuple_concat.reference create mode 100644 tests/queries/0_stateless/03167_empty_tuple_concat.sql diff --git a/src/Functions/tupleConcat.cpp b/src/Functions/tupleConcat.cpp index c48e4d61463..c9cdae10bcf 100644 --- a/src/Functions/tupleConcat.cpp +++ b/src/Functions/tupleConcat.cpp @@ -61,7 +61,7 @@ public: return std::make_shared<DataTypeTuple>(tuple_arg_types); } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override { const size_t num_arguments = arguments.size(); Columns columns; @@ -92,6 +92,9 @@ public: columns.push_back(inner_col); } + if (columns.empty()) + return ColumnTuple::create(input_rows_count); + return ColumnTuple::create(columns); } }; diff --git a/tests/queries/0_stateless/03167_empty_tuple_concat.reference b/tests/queries/0_stateless/03167_empty_tuple_concat.reference new file mode 100644 index 00000000000..6a452c185a8 --- /dev/null +++ b/tests/queries/0_stateless/03167_empty_tuple_concat.reference @@ -0,0 +1 @@ +() diff --git a/tests/queries/0_stateless/03167_empty_tuple_concat.sql b/tests/queries/0_stateless/03167_empty_tuple_concat.sql new file mode 100644 index 00000000000..f6fce86f332 --- /dev/null +++ b/tests/queries/0_stateless/03167_empty_tuple_concat.sql @@ -0,0 +1 @@ +SELECT ()||(); From 2c193a793d197ddf459a0ba0461d5ae908c4db89 Mon Sep 17 00:00:00 2001 From: serxa Date: Thu, 6 Jun 2024 16:17:36 +0000 Subject: [PATCH 129/133] typo --- docs/en/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index ada922cb037..b3e9da816ab 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -4622,7 +4622,7 @@ Small allocations and deallocations are grouped in thread local variable and tra Default value: `4MiB`. ## min_untracked_memory {#min_untracked_memory} -Lower bound for untracked memory limit which is applied to threads with low memory consumption.
Untracked memory limit equals thread memory usage divided by 16 and clamped between `min_untracked_memory` and `max_untracked_memory` for every thread. It guarantees that total untracked memory does not exceed 10% of current memory footprint even with a lot of small threads. To disable dynamic limit for untracked memory set value `4MiB`. Default value: `4KiB`. From eb72c12b31560dad49caff2e532472e8920f38d5 Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 7 Jun 2024 00:51:37 +0200 Subject: [PATCH 130/133] CI: Minor fixes in ci scripts --- tests/ci/ci_settings.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/ci/ci_settings.py b/tests/ci/ci_settings.py index 62e7826dac5..7807cc7ac10 100644 --- a/tests/ci/ci_settings.py +++ b/tests/ci/ci_settings.py @@ -211,12 +211,15 @@ class CiSettings: ): res[job] = job_config + add_parents = [] for job in list(res): parent_jobs = CI_CONFIG.get_job_parents(job) for parent_job in parent_jobs: if parent_job not in res: + add_parents.append(parent_job) print(f"Job [{job}] requires [{parent_job}] - add") - res[parent_job] = job_configs[parent_job] + for job in add_parents: + res[job] = job_configs[job] for job, job_config in res.items(): batches = [] From 0deb862c93824146cf9012f95fa247e459c3683d Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 7 Jun 2024 01:22:47 +0200 Subject: [PATCH 131/133] Re-enable Fast test in MQ --- tests/ci/ci.py | 6 +++++- tests/ci/ci_settings.py | 6 ++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/ci/ci.py b/tests/ci/ci.py index 55a18a2f335..ec6e84dea8c 100644 --- a/tests/ci/ci.py +++ b/tests/ci/ci.py @@ -442,7 +442,11 @@ def _configure_jobs( # filter jobs in accordance with ci settings job_configs = ci_settings.apply( - job_configs, pr_info.is_release, is_pr=pr_info.is_pr, labels=pr_info.labels + job_configs, + pr_info.is_release, + is_pr=pr_info.is_pr, + is_mq=pr_info.is_merge_queue, + labels=pr_info.labels, ) # check jobs in ci cache diff --git a/tests/ci/ci_settings.py b/tests/ci/ci_settings.py index 7807cc7ac10..83d4ddb4211 100644 --- a/tests/ci/ci_settings.py +++ b/tests/ci/ci_settings.py @@ -134,6 +134,7 @@ class CiSettings: job_config: JobConfig, is_release: bool, is_pr: bool, + is_mq: bool, labels: Iterable[str], ) -> bool: # type: ignore #too-many-return-statements if self.do_not_test: @@ -189,7 +190,7 @@ class CiSettings: if job_config.release_only and not is_release: return False - elif job_config.pr_only and not is_pr: + elif job_config.pr_only and not is_pr and not is_mq: return False return not to_deny @@ -199,6 +200,7 @@ class CiSettings: job_configs: Dict[str, JobConfig], is_release: bool, is_pr: bool, + is_mq: bool, labels: Iterable[str], ) -> Dict[str, JobConfig]: """ @@ -207,7 +209,7 @@ class CiSettings: res = {} for job, job_config in job_configs.items(): if self._check_if_selected( - job, job_config, is_release=is_release, is_pr=is_pr, labels=labels + job, job_config, is_release=is_release, is_pr=is_pr, is_mq=is_mq, labels=labels ): res[job] = job_config From 8f26f77505a08197f236f6e2cc069cd4111d71ec Mon Sep 17 00:00:00 2001 From: Max K Date: Fri, 7 Jun 2024 01:35:39 +0200 Subject: [PATCH 132/133] fix unit test --- tests/ci/test_ci_options.py | 58 +++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/tests/ci/test_ci_options.py b/tests/ci/test_ci_options.py index c00cd0b9216..e6fa0389649 100644 --- a/tests/ci/test_ci_options.py +++ b/tests/ci/test_ci_options.py @@ -167,19 +167,19 @@ class 
From eb72c12b31560dad49caff2e532472e8920f38d5 Mon Sep 17 00:00:00 2001
From: Max K
Date: Fri, 7 Jun 2024 00:51:37 +0200
Subject: [PATCH 130/133] CI: Minor fixes in ci scripts

---
 tests/ci/ci_settings.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tests/ci/ci_settings.py b/tests/ci/ci_settings.py
index 62e7826dac5..7807cc7ac10 100644
--- a/tests/ci/ci_settings.py
+++ b/tests/ci/ci_settings.py
@@ -211,12 +211,15 @@ class CiSettings:
             ):
                 res[job] = job_config
 
+        add_parents = []
         for job in list(res):
             parent_jobs = CI_CONFIG.get_job_parents(job)
             for parent_job in parent_jobs:
                 if parent_job not in res:
+                    add_parents.append(parent_job)
                     print(f"Job [{job}] requires [{parent_job}] - add")
-                    res[parent_job] = job_configs[parent_job]
+        for job in add_parents:
+            res[job] = job_configs[job]
 
         for job, job_config in res.items():
             batches = []

From 0deb862c93824146cf9012f95fa247e459c3683d Mon Sep 17 00:00:00 2001
From: Max K
Date: Fri, 7 Jun 2024 01:22:47 +0200
Subject: [PATCH 131/133] Re-enable Fast test in MQ

---
 tests/ci/ci.py          | 6 +++++-
 tests/ci/ci_settings.py | 6 ++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/tests/ci/ci.py b/tests/ci/ci.py
index 55a18a2f335..ec6e84dea8c 100644
--- a/tests/ci/ci.py
+++ b/tests/ci/ci.py
@@ -442,7 +442,11 @@ def _configure_jobs(
 
     # filter jobs in accordance with ci settings
     job_configs = ci_settings.apply(
-        job_configs, pr_info.is_release, is_pr=pr_info.is_pr, labels=pr_info.labels
+        job_configs,
+        pr_info.is_release,
+        is_pr=pr_info.is_pr,
+        is_mq=pr_info.is_merge_queue,
+        labels=pr_info.labels,
     )
 
     # check jobs in ci cache
diff --git a/tests/ci/ci_settings.py b/tests/ci/ci_settings.py
index 7807cc7ac10..83d4ddb4211 100644
--- a/tests/ci/ci_settings.py
+++ b/tests/ci/ci_settings.py
@@ -134,6 +134,7 @@ class CiSettings:
         job_config: JobConfig,
         is_release: bool,
         is_pr: bool,
+        is_mq: bool,
         labels: Iterable[str],
     ) -> bool:  # type: ignore #too-many-return-statements
         if self.do_not_test:
@@ -189,7 +189,7 @@ class CiSettings:
 
         if job_config.release_only and not is_release:
             return False
-        elif job_config.pr_only and not is_pr:
+        elif job_config.pr_only and not is_pr and not is_mq:
             return False
 
         return not to_deny
@@ -199,6 +200,7 @@ class CiSettings:
         job_configs: Dict[str, JobConfig],
         is_release: bool,
         is_pr: bool,
+        is_mq: bool,
         labels: Iterable[str],
     ) -> Dict[str, JobConfig]:
         """
@@ -207,7 +209,7 @@ class CiSettings:
         res = {}
         for job, job_config in job_configs.items():
             if self._check_if_selected(
-                job, job_config, is_release=is_release, is_pr=is_pr, labels=labels
+                job, job_config, is_release=is_release, is_pr=is_pr, is_mq=is_mq, labels=labels
             ):
                 res[job] = job_config

From 8f26f77505a08197f236f6e2cc069cd4111d71ec Mon Sep 17 00:00:00 2001
From: Max K
Date: Fri, 7 Jun 2024 01:35:39 +0200
Subject: [PATCH 132/133] fix unit test

---
 tests/ci/test_ci_options.py | 58 +++++++++++++++++++++++++++----------
 1 file changed, 42 insertions(+), 16 deletions(-)

diff --git a/tests/ci/test_ci_options.py b/tests/ci/test_ci_options.py
index c00cd0b9216..e6fa0389649 100644
--- a/tests/ci/test_ci_options.py
+++ b/tests/ci/test_ci_options.py
@@ -167,19 +167,19 @@ class TestCIOptions(unittest.TestCase):
         )
         jobs_configs = {job: JobConfig() for job in _TEST_JOB_LIST}
-        jobs_configs[
-            "fuzzers"
-        ].run_by_label = (
+        jobs_configs["fuzzers"].run_by_label = (
             "TEST_LABEL"  # check "fuzzers" appears in the result due to the label
         )
-        jobs_configs[
-            "Integration tests (asan)"
-        ].release_only = (
+        jobs_configs["Integration tests (asan)"].release_only = (
             True  # still must be included as it's set with include keywords
         )
         filtered_jobs = list(
             ci_options.apply(
-                jobs_configs, is_release=False, is_pr=True, labels=["TEST_LABEL"]
+                jobs_configs,
+                is_release=False,
+                is_pr=True,
+                is_mq=False,
+                labels=["TEST_LABEL"],
             )
         )
         self.assertCountEqual(
@@ -212,7 +212,9 @@ class TestCIOptions(unittest.TestCase):
         jobs_configs["fuzzers"].run_by_label = "TEST_LABEL"
         # no settings are set
         filtered_jobs = list(
-            CiSettings().apply(jobs_configs, is_release=False, is_pr=True, labels=[])
+            CiSettings().apply(
+                jobs_configs, is_release=False, is_pr=False, is_mq=True, labels=[]
+            )
         )
         self.assertCountEqual(
             filtered_jobs,
             [
                 "Fast test",
             ],
         )
-        filtered_jobs = list(
-            CiSettings().apply(jobs_configs, is_release=True, is_pr=False, labels=[])
+        filtered_jobs = list(
+            CiSettings().apply(
+                jobs_configs, is_release=False, is_pr=True, is_mq=False, labels=[]
+            )
+        )
+        self.assertCountEqual(
+            filtered_jobs,
+            [
+                "Fast test",
+            ],
+        )
+        filtered_jobs = list(
+            CiSettings().apply(
+                jobs_configs, is_release=True, is_pr=False, is_mq=False, labels=[]
+            )
         )
         self.assertCountEqual(
             filtered_jobs,
@@ -240,7 +254,11 @@ class TestCIOptions(unittest.TestCase):
         # no settings are set
         filtered_jobs = list(
             ci_settings.apply(
-                jobs_configs, is_release=False, is_pr=True, labels=["TEST_LABEL"]
+                jobs_configs,
+                is_release=False,
+                is_pr=True,
+                is_mq=False,
+                labels=["TEST_LABEL"],
             )
         )
         self.assertCountEqual(
@@ -253,7 +271,11 @@ class TestCIOptions(unittest.TestCase):
         ci_settings.include_keywords = ["Fast"]
         filtered_jobs = list(
             ci_settings.apply(
-                jobs_configs, is_release=True, is_pr=False, labels=["TEST_LABEL"]
+                jobs_configs,
+                is_release=True,
+                is_pr=False,
+                is_mq=False,
+                labels=["TEST_LABEL"],
             )
         )
         self.assertCountEqual(
@@ -271,13 +293,17 @@ class TestCIOptions(unittest.TestCase):
         self.assertCountEqual(ci_options.include_keywords, ["analyzer"])
         self.assertIsNone(ci_options.exclude_keywords)
         jobs_configs = {job: JobConfig() for job in _TEST_JOB_LIST}
-        jobs_configs[
-            "fuzzers"
-        ].run_by_label = "TEST_LABEL"  # check "fuzzers" does not appears in the result
+        jobs_configs["fuzzers"].run_by_label = (
+            "TEST_LABEL"  # check "fuzzers" does not appear in the result
+        )
         jobs_configs["Integration tests (asan)"].release_only = True
         filtered_jobs = list(
             ci_options.apply(
-                jobs_configs, is_release=False, is_pr=True, labels=["TEST_LABEL"]
+                jobs_configs,
+                is_release=False,
+                is_pr=True,
+                is_mq=False,
+                labels=["TEST_LABEL"],
            )
         )
         self.assertCountEqual(
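
Functionally, patches 130-132 thread an `is_mq` flag through `CiSettings.apply` and `_check_if_selected`, so `pr_only` jobs such as Fast test are no longer filtered out in the merge queue. A condensed, hedged sketch of just that gating branch (simplified; the real `_check_if_selected` performs many more checks):

    from dataclasses import dataclass

    @dataclass
    class Cfg:
        release_only: bool = False
        pr_only: bool = False

    def selected(cfg: Cfg, *, is_release: bool, is_pr: bool, is_mq: bool) -> bool:
        if cfg.release_only and not is_release:
            return False
        # pr_only jobs now run for PRs *and* for the merge queue (the is_mq change)
        if cfg.pr_only and not is_pr and not is_mq:
            return False
        return True

    fast_test = Cfg(pr_only=True)
    assert selected(fast_test, is_release=False, is_pr=False, is_mq=True)       # re-enabled in MQ
    assert not selected(fast_test, is_release=True, is_pr=False, is_mq=False)   # still skipped on releases
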
From 367d41e7f042137d4a25b55c740ba3835b5d5435 Mon Sep 17 00:00:00 2001
From: robot-clickhouse
Date: Thu, 6 Jun 2024 23:41:25 +0000
Subject: [PATCH 133/133] Automatic style fix

---
 tests/ci/ci_settings.py     |  7 ++++++-
 tests/ci/test_ci_options.py | 14 +++++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/tests/ci/ci_settings.py b/tests/ci/ci_settings.py
index 83d4ddb4211..f25344c7701 100644
--- a/tests/ci/ci_settings.py
+++ b/tests/ci/ci_settings.py
@@ -209,7 +209,12 @@ class CiSettings:
         res = {}
         for job, job_config in job_configs.items():
             if self._check_if_selected(
-                job, job_config, is_release=is_release, is_pr=is_pr, is_mq=is_mq, labels=labels
+                job,
+                job_config,
+                is_release=is_release,
+                is_pr=is_pr,
+                is_mq=is_mq,
+                labels=labels,
             ):
                 res[job] = job_config

diff --git a/tests/ci/test_ci_options.py b/tests/ci/test_ci_options.py
index e6fa0389649..60888932803 100644
--- a/tests/ci/test_ci_options.py
+++ b/tests/ci/test_ci_options.py
@@ -167,10 +167,14 @@ class TestCIOptions(unittest.TestCase):
         )
         jobs_configs = {job: JobConfig() for job in _TEST_JOB_LIST}
-        jobs_configs["fuzzers"].run_by_label = (
+        jobs_configs[
+            "fuzzers"
+        ].run_by_label = (
             "TEST_LABEL"  # check "fuzzers" appears in the result due to the label
         )
-        jobs_configs["Integration tests (asan)"].release_only = (
+        jobs_configs[
+            "Integration tests (asan)"
+        ].release_only = (
             True  # still must be included as it's set with include keywords
         )
         filtered_jobs = list(
@@ -293,9 +297,9 @@ class TestCIOptions(unittest.TestCase):
         self.assertCountEqual(ci_options.include_keywords, ["analyzer"])
         self.assertIsNone(ci_options.exclude_keywords)
         jobs_configs = {job: JobConfig() for job in _TEST_JOB_LIST}
-        jobs_configs["fuzzers"].run_by_label = (
-            "TEST_LABEL"  # check "fuzzers" does not appear in the result
-        )
+        jobs_configs[
+            "fuzzers"
+        ].run_by_label = "TEST_LABEL"  # check "fuzzers" does not appear in the result
         jobs_configs["Integration tests (asan)"].release_only = True
         filtered_jobs = list(
             ci_options.apply(