From 8b8ef41407b864b3babe53e3709c922a77916c17 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Fri, 1 Dec 2023 01:26:44 +0000 Subject: [PATCH] Documentation --- .../system-tables/view_refreshes.md | 40 ++++++++ .../en/sql-reference/statements/alter/view.md | 42 ++++---- .../sql-reference/statements/create/view.md | 98 ++++++++++++++++++- docs/en/sql-reference/statements/system.md | 64 +++++++++++- src/Storages/MaterializedView/RefreshTask.cpp | 1 + 5 files changed, 222 insertions(+), 23 deletions(-) create mode 100644 docs/en/operations/system-tables/view_refreshes.md diff --git a/docs/en/operations/system-tables/view_refreshes.md b/docs/en/operations/system-tables/view_refreshes.md new file mode 100644 index 00000000000..ebcc24d741b --- /dev/null +++ b/docs/en/operations/system-tables/view_refreshes.md @@ -0,0 +1,40 @@ +--- +slug: /en/operations/system-tables/view_refreshes +--- +# view_refreshes + +Information about [Refreshable Materialized Views](../../sql-reference/statements/create/view.md#refreshable-materialized-view). Contains all refreshable materialized views, regardless of whether there's a refresh in progress or not. + + +Columns: + +- `database` ([String](../../sql-reference/data-types/string.md)) — The name of the database the table is in. +- `view` ([String](../../sql-reference/data-types/string.md)) — Table name. +- `status` ([String](../../sql-reference/data-types/string.md)) — Current state of the refresh. +- `last_refresh_result` ([String](../../sql-reference/data-types/string.md)) — Outcome of the latest refresh attempt. +- `last_refresh_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Time of the last successful refresh. `NULL` if no refresh succeeded since server startup or table creation. +- `next_refresh_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Time at which the next refresh is scheduled to start. +- `remaining_dependencies` ([Array(String)](../../sql-reference/data-types/array.md)) — If the view has [refresh dependencies](../../sql-reference/statements/create/view.md#refresh-dependencies), this array contains the subset of those dependencies that are not satisfied for the current refresh yet. If `status = 'WaitingForDependencies'`, a refresh is ready to start as soon as these dependencies are fulfilled. +- `exception` ([String](../../sql-reference/data-types/string.md)) — if `last_refresh_result = 'Exception'`, i.e. the last refresh attempt failed, this column contains the corresponding error message and stack trace. +- `progress` ([Float64](../../sql-reference/data-types/float.md)) — Progress of the current refresh, between 0 and 1. +- `read_rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Number of rows read by the current refresh so far. +- `total_rows` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Estimated total number of rows that need to be read by the current refresh. + +(There are additional columns related to current refresh progress, but they are currently unreliable.) + +**Example** + +```sql +SELECT + database, + view, + status, + last_refresh_result, + last_refresh_time, + next_refresh_time +FROM system.view_refreshes + +┌─database─┬─view───────────────────────┬─status────┬─last_refresh_result─┬───last_refresh_time─┬───next_refresh_time─┐ +│ default │ hello_documentation_reader │ Scheduled │ Finished │ 2023-12-01 01:24:00 │ 2023-12-01 01:25:00 │ +└──────────┴────────────────────────────┴───────────┴─────────────────────┴─────────────────────┴─────────────────────┘ +``` diff --git a/docs/en/sql-reference/statements/alter/view.md b/docs/en/sql-reference/statements/alter/view.md index 5c5bf0355f6..517e64e3e5b 100644 --- a/docs/en/sql-reference/statements/alter/view.md +++ b/docs/en/sql-reference/statements/alter/view.md @@ -6,28 +6,28 @@ sidebar_label: VIEW # ALTER TABLE … MODIFY QUERY Statement -You can modify `SELECT` query that was specified when a [materialized view](../create/view.md#materialized) was created with the `ALTER TABLE … MODIFY QUERY` statement without interrupting ingestion process. +You can modify `SELECT` query that was specified when a [materialized view](../create/view.md#materialized) was created with the `ALTER TABLE … MODIFY QUERY` statement without interrupting ingestion process. -The `allow_experimental_alter_materialized_view_structure` setting must be enabled. +The `allow_experimental_alter_materialized_view_structure` setting must be enabled. This command is created to change materialized view created with `TO [db.]name` clause. It does not change the structure of the underling storage table and it does not change the columns' definition of the materialized view, because of this the application of this command is very limited for materialized views are created without `TO [db.]name` clause. **Example with TO table** ```sql -CREATE TABLE events (ts DateTime, event_type String) +CREATE TABLE events (ts DateTime, event_type String) ENGINE = MergeTree ORDER BY (event_type, ts); -CREATE TABLE events_by_day (ts DateTime, event_type String, events_cnt UInt64) +CREATE TABLE events_by_day (ts DateTime, event_type String, events_cnt UInt64) ENGINE = SummingMergeTree ORDER BY (event_type, ts); -CREATE MATERIALIZED VIEW mv TO events_by_day AS +CREATE MATERIALIZED VIEW mv TO events_by_day AS SELECT toStartOfDay(ts) ts, event_type, count() events_cnt FROM events -GROUP BY ts, event_type; +GROUP BY ts, event_type; -INSERT INTO events -SELECT Date '2020-01-01' + interval number * 900 second, +INSERT INTO events +SELECT Date '2020-01-01' + interval number * 900 second, ['imp', 'click'][number%2+1] FROM numbers(100); @@ -43,23 +43,23 @@ ORDER BY ts, event_type; │ 2020-01-02 00:00:00 │ imp │ 2 │ └─────────────────────┴────────────┴─────────────────┘ --- Let's add the new measurment `cost` +-- Let's add the new measurment `cost` -- and the new dimension `browser`. -ALTER TABLE events +ALTER TABLE events ADD COLUMN browser String, ADD COLUMN cost Float64; -- Column do not have to match in a materialized view and TO -- (destination table), so the next alter does not break insertion. -ALTER TABLE events_by_day +ALTER TABLE events_by_day ADD COLUMN cost Float64, ADD COLUMN browser String after event_type, MODIFY ORDER BY (event_type, ts, browser); -INSERT INTO events -SELECT Date '2020-01-02' + interval number * 900 second, +INSERT INTO events +SELECT Date '2020-01-02' + interval number * 900 second, ['imp', 'click'][number%2+1], ['firefox', 'safary', 'chrome'][number%3+1], 10/(number+1)%33 @@ -82,16 +82,16 @@ ORDER BY ts, event_type; └─────────────────────┴────────────┴─────────┴────────────┴──────┘ SET allow_experimental_alter_materialized_view_structure=1; - -ALTER TABLE mv MODIFY QUERY + +ALTER TABLE mv MODIFY QUERY SELECT toStartOfDay(ts) ts, event_type, browser, count() events_cnt, sum(cost) cost FROM events GROUP BY ts, event_type, browser; -INSERT INTO events -SELECT Date '2020-01-03' + interval number * 900 second, +INSERT INTO events +SELECT Date '2020-01-03' + interval number * 900 second, ['imp', 'click'][number%2+1], ['firefox', 'safary', 'chrome'][number%3+1], 10/(number+1)%33 @@ -138,7 +138,7 @@ PRIMARY KEY (event_type, ts) ORDER BY (event_type, ts, browser) SETTINGS index_granularity = 8192 --- !!! The columns' definition is unchanged but it does not matter, we are not quering +-- !!! The columns' definition is unchanged but it does not matter, we are not quering -- MATERIALIZED VIEW, we are quering TO (storage) table. -- SELECT section is updated. @@ -169,7 +169,7 @@ The application is very limited because you can only change the `SELECT` section ```sql CREATE TABLE src_table (`a` UInt32) ENGINE = MergeTree ORDER BY a; -CREATE MATERIALIZED VIEW mv (`a` UInt32) ENGINE = MergeTree ORDER BY a AS SELECT a FROM src_table; +CREATE MATERIALIZED VIEW mv (`a` UInt32) ENGINE = MergeTree ORDER BY a AS SELECT a FROM src_table; INSERT INTO src_table (a) VALUES (1), (2); SELECT * FROM mv; ``` @@ -199,3 +199,7 @@ SELECT * FROM mv; ## ALTER LIVE VIEW Statement `ALTER LIVE VIEW ... REFRESH` statement refreshes a [Live view](../create/view.md#live-view). See [Force Live View Refresh](../create/view.md#live-view-alter-refresh). + +## ALTER TABLE … MODIFY REFRESH Statement + +`ALTER TABLE ... MODIFY REFRESH` statement changes refresh parameters of a [Refreshable Materialized View](../create/view.md#refreshable-materialized-view). See [Changing Refresh Parameters](../create/view.md#changing-refresh-parameters). diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 56828745048..22bda2f5ebe 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -37,6 +37,7 @@ SELECT a, b, c FROM (SELECT ...) ``` ## Parameterized View + Parametrized views are similar to normal views, but can be created with parameters which are not resolved immediately. These views can be used with table functions, which specify the name of the view as function name and the parameter values as its arguments. ``` sql @@ -66,7 +67,7 @@ When creating a materialized view with `TO [db].[table]`, you can't also use `PO A materialized view is implemented as follows: when inserting data to the table specified in `SELECT`, part of the inserted data is converted by this `SELECT` query, and the result is inserted in the view. -:::note +:::note Materialized views in ClickHouse use **column names** instead of column order during insertion into destination table. If some column names are not present in the `SELECT` query result, ClickHouse uses a default value, even if the column is not [Nullable](../../data-types/nullable.md). A safe practice would be to add aliases for every column when using Materialized views. Materialized views in ClickHouse are implemented more like insert triggers. If there’s some aggregation in the view query, it’s applied only to the batch of freshly inserted data. Any changes to existing data of source table (like update, delete, drop partition, etc.) does not change the materialized view. @@ -96,9 +97,102 @@ This feature is deprecated and will be removed in the future. For your convenience, the old documentation is located [here](https://pastila.nl/?00f32652/fdf07272a7b54bda7e13b919264e449f.md) +## Refreshable Materialized View {#refreshable-materialized-view} + +```sql +CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name +REFRESH EVERY|AFTER interval [OFFSET interval] +RANDOMIZE FOR interval +DEPENDS ON [db.]name [, [db.]name [, ...]] +[TO[db.]name] [(columns)] [ENGINE = engine] +AS SELECT ... +``` +where `interval` is a sequence of simple intervals: +```sql +number SECOND|MINUTE|HOUR|DAY|WEEK|MONTH|YEAR +``` + +Periodically runs the corresponding query and stores its result in a table, atomically replacing the table's previous contents. + +Differences from regular non-refreshable materialized views: + * No insert trigger. I.e. when new data is inserted into the table specified in SELECT, it's *not* automatically pushed to the refreshable materialized view. The periodic refresh runs the entire query and replaces the entire table. + * No restrictions on the SELECT query. Table functions (e.g. `url()`), views, UNION, JOIN, are all allowed. + +### Refresh Schedule + +Example refresh schedules: +```sql +REFRESH EVERY 1 DAY -- every day, at midnight +REFRESH EVERY 1 MONTH -- on 1st day of every month, at midnight +REFRESH EVERY 1 MONTH OFFSET 5 DAY 2 HOUR -- on 6th day of every month, at 2:00 am +REFRESH EVERY 2 WEEK OFFSET 5 DAY 15 HOUR 10 MINUTE -- every other Saturday, at 3:10 pm +REFRESH EVERY 30 MINUTE -- at 00:00, 00:30, 01:00, 01:30, etc +REFRESH AFTER 30 MINUTE -- 30 minutes after the previous refresh completes, no alignment with time of day +-- REFRESH AFTER 1 HOUR OFFSET 1 MINUTE -- syntax errror, OFFSET is not allowed with AFTER +``` + +`RANDOMIZE FOR` randomly adjusts the time of each refresh, e.g.: +```sql +REFRESH EVERY 1 DAY OFFSET 2 HOUR RANDOMIZE FOR 1 HOUR -- every day at random time between 01:30 and 02:30 +``` + +At most one refresh may be running at a time, for a given view. E.g. if a view with `REFRESH EVERY 1 MINUTE` takes 2 minutes to refresh, it'll just be refreshing every 2 minutes. If it then becomes faster and starts refreshing in 10 seconds, it'll go back to refreshing every minute. (In particular, it won't refresh every 10 seconds to catch up with a backlog of missed refreshes - there's no such backlog.) + +### Dependencies {#refresh-dependencies} + +`DEPENDS ON` synchronizes refreshes of different tables. By way of example, suppose there's a chain of two refreshable materialized views: +```sql +CREATE MATERIALIZED VIEW source REFRESH EVERY 1 DAY AS SELECT * FROM url(...) +CREATE MATERIALIZED VIEW destination REFRESH EVERY 1 DAY AS SELECT ... FROM source +``` +Without `DEPENDS ON`, both views will start a refresh at midnight, and `destination` typically will see yesterday's data in `source`. If we add dependency: +``` +CREATE MATERIALIZED VIEW destination REFRESH EVERY 1 DAY DEPENDS ON source AS SELECT ... FROM source +``` +then `destination`'s refresh will start only after `source`'s refresh finished for that day, so `destination` will be based on fresh data. + +Alternatively, the same result can be achieved with: +``` +CREATE MATERIALIZED VIEW destination REFRESH AFTER 1 HOUR DEPENDS ON source AS SELECT ... FROM source +``` +where `1 HOUR` can be any duration less than `source`'s refresh period. The dependent table won't be refreshed more frequently than any of its dependencies. This is a valid way to set up a chain of refreshable views without specifying the real refresh period more than once. + +A few more examples: + * `REFRESH EVERY 1 DAY OFFSET 10 MINUTE` (`destination`) depends on `REFRESH EVERY 1 DAY` (`source`)
+ If `source` refresh takes more than 10 minutes, `destination` will wait for it. + * `REFRESH EVERY 1 DAY OFFSET 1 HOUR` depends on `REFRESH EVERY 1 DAY OFFSET 23 HOUR`
+ Similar to the above, even though the corresponding refreshes happen on different calendar days. + `destination`'s refresh on day X+1 will wait for `source`'s refresh on day X (if it takes more than 2 hours). + * `REFRESH EVERY 2 HOUR` depends on `REFRESH EVERY 1 HOUR`
+ The 2 HOUR refresh happens after the 1 HOUR refresh for every other hour, e.g. after the midnight + refresh, then after the 2am refresh, etc. + * `REFRESH EVERY 1 MINUTE` depends on `REFRESH EVERY 2 HOUR`
+ `REFRESH AFTER 1 MINUTE` depends on `REFRESH EVERY 2 HOUR`
+ `REFRESH AFTER 1 MINUTE` depends on `REFRESH AFTER 2 HOUR`
+ `destination` is refreshed once after every `source` refresh, i.e. every 2 hours. The `1 MINUTE` is effectively ignored. + * `REFRESH AFTER 1 HOUR` depends on `REFRESH AFTER 1 HOUR`
+ Currently this is not recommended. + +### Changing Refresh Parameters {#changing-refresh-parameters} + +To change refresh parameters: +``` +ALTER TABLE [db.]name MODIFY REFRESH EVERY|AFTER ... [RANDOMIZE FOR ...] [DEPENDS ON ...] +``` + +:::note +This replaces refresh schedule *and* dependencies. If the table had a `DEPENDS ON`, doing a `MODIFY REFRESH` without `DEPENDS ON` will remove the dependencies. +::: + +### Other operations + +The status of all refreshable materialized views is available in table [`system.view_refreshes`](../../../operations/system-tables/view_refreshes.md). In particular, it contains refresh progress (if running), last and next refresh time, exception message if a refresh failed. + +To manually stop, start, pause, resume, or cancel refreshes use [`SYSTEM REFRESH|STOP|START|PAUSE|RESUME|CANCEL VIEW`](../system.md#refreshable-materialized-views). + ## Window View [Experimental] -:::info +:::info This is an experimental feature that may change in backwards-incompatible ways in the future releases. Enable usage of window views and `WATCH` query using [allow_experimental_window_view](../../../operations/settings/settings.md#allow-experimental-window-view) setting. Input the command `set allow_experimental_window_view = 1`. ::: diff --git a/docs/en/sql-reference/statements/system.md b/docs/en/sql-reference/statements/system.md index 695801983b7..1321dbdd269 100644 --- a/docs/en/sql-reference/statements/system.md +++ b/docs/en/sql-reference/statements/system.md @@ -449,7 +449,7 @@ SYSTEM SYNC FILE CACHE [ON CLUSTER cluster_name] ``` -### SYSTEM STOP LISTEN +## SYSTEM STOP LISTEN Closes the socket and gracefully terminates the existing connections to the server on the specified port with the specified protocol. @@ -464,7 +464,7 @@ SYSTEM STOP LISTEN [ON CLUSTER cluster_name] [QUERIES ALL | QUERIES DEFAULT | QU - If `QUERIES DEFAULT [EXCEPT .. [,..]]` modifier is specified, all default protocols are stopped, unless specified with `EXCEPT` clause. - If `QUERIES CUSTOM [EXCEPT .. [,..]]` modifier is specified, all custom protocols are stopped, unless specified with `EXCEPT` clause. -### SYSTEM START LISTEN +## SYSTEM START LISTEN Allows new connections to be established on the specified protocols. @@ -473,3 +473,63 @@ However, if the server on the specified port and protocol was not stopped using ```sql SYSTEM START LISTEN [ON CLUSTER cluster_name] [QUERIES ALL | QUERIES DEFAULT | QUERIES CUSTOM | TCP | TCP WITH PROXY | TCP SECURE | HTTP | HTTPS | MYSQL | GRPC | POSTGRESQL | PROMETHEUS | CUSTOM 'protocol'] ``` + +## Managing Refreshable Materialized Views {#refreshable-materialized-views} + +Commands to control background tasks performed by [Refreshable Materialized Views](../../sql-reference/statements/create/view.md#refreshable-materialized-view) + +Keep an eye on [`system.view_refreshes`](../../operations/system-tables/view_refreshes.md) while using them. + +### SYSTEM REFRESH VIEW + +Trigger an immediate out-of-schedule refresh of a given view. + +```sql +SYSTEM REFRESH VIEW [db.]name +``` + +### SYSTEM STOP VIEW, SYSTEM STOP VIEWS + +Disable periodic refreshing of the given view or all refreshable views. If a refresh is in progress, cancel it too. + +```sql +SYSTEM STOP VIEW [db.]name +``` +```sql +SYSTEM STOP VIEWS +``` + +### SYSTEM START VIEW, SYSTEM START VIEWS + +Enable periodic refreshing for the given view or all refreshable views. No immediate refresh is triggered. + +```sql +SYSTEM START VIEW [db.]name +``` +```sql +SYSTEM START VIEWS +``` + +### SYSTEM PAUSE VIEW + +If there's a refresh in progress for the given view, interrupt and pause it. Otherwise do nothing. The state of the refresh is preserved, and it can be resumed without losing progress. + +```sql +SYSTEM PAUSE VIEW [db.]name +``` + +### SYSTEM RESUME VIEW + +If there's a paused refresh for the given view, resume it. Otherwise do nothing. + +```sql +SYSTEM RESUME VIEW [db.]name +``` + +### SYSTEM CANCEL VIEW + +If there's a refresh in progress for the given view, interrupt and cancel it. If there's a paused refresh, discard its state. Otherwise do nothing. + +```sql +SYSTEM CANCEL VIEW [db.]name +``` diff --git a/src/Storages/MaterializedView/RefreshTask.cpp b/src/Storages/MaterializedView/RefreshTask.cpp index 8b41af6c551..7ed9597ab8f 100644 --- a/src/Storages/MaterializedView/RefreshTask.cpp +++ b/src/Storages/MaterializedView/RefreshTask.cpp @@ -406,6 +406,7 @@ void RefreshTask::initializeRefreshUnlocked(std::shared_ptrpipeline.setProgressCallback([this](const Progress & prog) { + /// TODO: Investigate why most fields are not populated. Change columns in system.view_refreshes as needed, update documentation (docs/en/operations/system-tables/view_refreshes.md). progress.incrementPiecewiseAtomically(prog); });