Merge branch 'master' into docs_add_distributed_ddl

This commit is contained in:
mergify[bot] 2021-09-18 21:45:46 +00:00 committed by GitHub
commit 700c1c4ce0
121 changed files with 5948 additions and 2942 deletions

View File

@ -145,6 +145,19 @@ namespace common
return __builtin_mul_overflow(x, y, &res);
}
template <typename T, typename U, typename R>
inline bool mulOverflow(T x, U y, R & res)
{
// Not a built-in type, but a wide integer.
if constexpr (is_big_int_v<T> || is_big_int_v<R> || is_big_int_v<U>)
{
res = mulIgnoreOverflow<R>(x, y);
return false;
}
else
return __builtin_mul_overflow(x, y, &res);
}
template <>
inline bool mulOverflow(int x, int y, int & res)
{

View File

@ -77,7 +77,9 @@ void Query::executeImpl()
case CR_SERVER_LOST:
throw ConnectionLost(errorMessage(mysql_driver), err_no);
default:
throw BadQuery(errorMessage(mysql_driver), err_no);
/// Add the query to the exception message, since it may differ from the user's input query.
/// (You can also create a query with an error on purpose to see what query ClickHouse generated.)
throw BadQuery(errorMessage(mysql_driver) + " (query: " + query_string + ")", err_no);
}
}
}

View File

@ -51,8 +51,8 @@ if (CCACHE_FOUND AND NOT COMPILER_MATCHES_CCACHE)
message(STATUS "ccache is 4.2+ no quirks for SOURCE_DATE_EPOCH required")
elseif (CCACHE_VERSION VERSION_GREATER_EQUAL "4.0")
message(STATUS "Ignore SOURCE_DATE_EPOCH for ccache")
set_property (GLOBAL PROPERTY RULE_LAUNCH_COMPILE "env -u SOURCE_DATE_EPOCH ${CCACHE_FOUND}")
set_property (GLOBAL PROPERTY RULE_LAUNCH_LINK "env -u SOURCE_DATE_EPOCH ${CCACHE_FOUND}")
set_property (GLOBAL PROPERTY RULE_LAUNCH_COMPILE "env -u SOURCE_DATE_EPOCH")
set_property (GLOBAL PROPERTY RULE_LAUNCH_LINK "env -u SOURCE_DATE_EPOCH")
endif()
else ()
message(${RECONFIGURE_MESSAGE_LEVEL} "Not using ${CCACHE_FOUND} ${CCACHE_VERSION} bug: https://bugzilla.samba.org/show_bug.cgi?id=8118")

View File

@ -23,7 +23,6 @@ The supported formats are:
| [CustomSeparated](#format-customseparated) | ✔ | ✔ |
| [Values](#data-format-values) | ✔ | ✔ |
| [Vertical](#vertical) | ✗ | ✔ |
| [VerticalRaw](#verticalraw) | ✗ | ✔ |
| [JSON](#json) | ✗ | ✔ |
| [JSONAsString](#jsonasstring) | ✔ | ✗ |
| [JSONStrings](#jsonstrings) | ✗ | ✔ |
@ -944,10 +943,6 @@ test: string with 'quotes' and with some special
This format is only appropriate for outputting a query result, but not for parsing (retrieving data to insert in a table).
## VerticalRaw {#verticalraw}
Similar to [Vertical](#vertical), but with escaping disabled. This format is only suitable for outputting query results, not for parsing (receiving data and inserting it in the table).
## XML {#xml}
XML format is suitable only for output, not for parsing. Example:
@ -1579,4 +1574,4 @@ Writing to a file ".msgpk":
$ clickhouse-client --query="CREATE TABLE msgpack (array Array(UInt8)) ENGINE = Memory;"
$ clickhouse-client --query="INSERT INTO msgpack VALUES ([0, 1, 2, 3, 42, 253, 254, 255]), ([255, 254, 253, 42, 3, 2, 1, 0])";
$ clickhouse-client --query="SELECT * FROM msgpack FORMAT MsgPack" > tmp_msgpack.msgpk;
```
```

View File

@ -113,6 +113,22 @@ Features:
[MindsDB](https://mindsdb.com/) is an open-source AI layer for databases, including ClickHouse, that allows you to effortlessly develop, train, and deploy state-of-the-art machine learning models. MindsDB Studio (GUI) allows you to train new models from the database, interpret predictions made by a model, identify potential data biases, and evaluate and visualize model accuracy using the Explainable AI function, so you can adapt and tune your machine learning models faster.
### DBM {#dbm}
[DBM](https://dbm.incubator.edurt.io/) is a visual management tool for ClickHouse!
Features:
- Supports query history (pagination, clear all, etc.)
- Supports running selected SQL clauses
- Supports terminating queries
- Supports table management (metadata, delete, preview)
- Supports database management (delete, create)
- Supports custom queries
- Supports management of multiple data sources (connection test, monitoring)
- Supports monitoring (processors, connections, queries)
- Supports data migration
## Commercial {#commercial}
### DataGrip {#datagrip}
@ -190,20 +206,4 @@ SeekTable is [free](https://www.seektable.com/help/cloud-pricing) for personal/i
[Chadmin](https://github.com/bun4uk/chadmin) is a simple UI where you can visualize the queries currently running on your ClickHouse cluster, see info about them, and kill them if needed.
### DBM {#dbm}
[DBM](https://dbm.incubator.edurt.io/) is a visual management tool for ClickHouse!
Features:
- Supports query history (pagination, clear all, etc.)
- Supports running selected SQL clauses
- Supports terminating queries
- Supports table management (metadata, delete, preview)
- Supports database management (delete, create)
- Supports custom queries
- Supports management of multiple data sources (connection test, monitoring)
- Supports monitoring (processors, connections, queries)
- Supports data migration
[Original article](https://clickhouse.tech/docs/en/interfaces/third-party/gui/) <!--hide-->

View File

@ -229,6 +229,42 @@ Result:
└───────────────────────────────────────┘
```
## h3ToGeoBoundary {#h3togeoboundary}
Returns an array of pairs `(lat, lon)` that corresponds to the boundary of the provided H3 index.
**Syntax**
``` sql
h3ToGeoBoundary(h3Index)
```
**Arguments**
- `h3Index` — H3 Index. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
**Returned values**
- Array of pairs `(lat, lon)`.
Type: [Array](../../../sql-reference/data-types/array.md)([Float64](../../../sql-reference/data-types/float.md), [Float64](../../../sql-reference/data-types/float.md)).
**Example**
Query:
``` sql
SELECT h3ToGeoBoundary(599686042433355775) AS coordinates;
```
Result:
``` text
┌─h3ToGeoBoundary(599686042433355775)────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│ [(37.2713558667319,-121.91508032705622),(37.353926450852256,-121.8622232890249),(37.42834118609435,-121.92354999630156),(37.42012867767779,-122.03773496427027),(37.33755608435299,-122.090428929044),(37.26319797461824,-122.02910130919001)] │
└────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┘
```
## h3kRing {#h3kring}
Lists all the [H3](#h3index) hexagons within radius `k` of the given hexagon, in random order.
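For instance (a brief sketch; the H3 index value is illustrative):
``` sql
-- List the neighbors within distance 1 of the given H3 cell.
SELECT arrayJoin(h3kRing(644325529233966508, 1)) AS h3index;
```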

View File

@ -5,7 +5,7 @@ toc_title: ALTER
## ALTER {#query_language_queries_alter}
Most `ALTER` queries modify table settings or data:
Most `ALTER TABLE` queries modify table settings or data:
- [COLUMN](../../../sql-reference/statements/alter/column.md)
- [PARTITION](../../../sql-reference/statements/alter/partition.md)
@ -17,9 +17,14 @@ Most `ALTER` queries modify table settings or data:
- [TTL](../../../sql-reference/statements/alter/ttl.md)
!!! note "Note"
Most `ALTER` queries are supported only for [\*MergeTree](../../../engines/table-engines/mergetree-family/index.md) tables, as well as [Merge](../../../engines/table-engines/special/merge.md) and [Distributed](../../../engines/table-engines/special/distributed.md).
Most `ALTER TABLE` queries are supported only for [\*MergeTree](../../../engines/table-engines/mergetree-family/index.md) tables, as well as [Merge](../../../engines/table-engines/special/merge.md) and [Distributed](../../../engines/table-engines/special/distributed.md).
While these `ALTER` settings modify entities related to role-based access control:
These `ALTER` statements manipulate views:
- [ALTER TABLE ... MODIFY QUERY](../../../sql-reference/statements/alter/view.md) — Modifies a [Materialized view](../create/view.md#materialized) structure.
- [ALTER LIVE VIEW](../../../sql-reference/statements/alter/view.md#alter-live-view) — Refreshes a [Live view](../create/view.md#live-view).
These `ALTER` statements modify entities related to role-based access control:
- [USER](../../../sql-reference/statements/alter/user.md)
- [ROLE](../../../sql-reference/statements/alter/role.md)

View File

@ -0,0 +1,44 @@
---
toc_priority: 50
toc_title: VIEW
---
# ALTER TABLE … MODIFY QUERY Statement {#alter-modify-query}
You can modify the `SELECT` query that was specified when a [materialized view](../create/view.md#materialized) was created with the `ALTER TABLE … MODIFY QUERY` statement. Use it when the materialized view was created without the `TO [db.]name` clause. The `allow_experimental_alter_materialized_view_structure` setting must be enabled.
If a materialized view uses the `TO [db.]name` construction, you must [DETACH](../detach.md) the view, run an [ALTER TABLE](index.md) query on the target table, and then [ATTACH](../attach.md) the previously detached view, as sketched below.
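A minimal sketch of that workaround (the names `mv_to` and `dst` are illustrative):
```sql
-- The view mv_to was created with TO dst, so the target table is altered directly.
DETACH TABLE mv_to;
ALTER TABLE dst MODIFY COLUMN a UInt64;
ATTACH TABLE mv_to;
```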
**Example**
```sql
CREATE TABLE src_table (`a` UInt32) ENGINE = MergeTree ORDER BY a;
CREATE MATERIALIZED VIEW mv (`a` UInt32) ENGINE = MergeTree ORDER BY a AS SELECT a FROM src_table;
INSERT INTO src_table (a) VALUES (1), (2);
SELECT * FROM mv;
```
```text
┌─a─┐
│ 1 │
│ 2 │
└───┘
```
```sql
ALTER TABLE mv MODIFY QUERY SELECT a * 2 as a FROM src_table;
INSERT INTO src_table (a) VALUES (3), (4);
SELECT * FROM mv;
```
```text
┌─a─┐
│ 6 │
│ 8 │
└───┘
┌─a─┐
│ 1 │
│ 2 │
└───┘
```
## ALTER LIVE VIEW Statement {#alter-live-view}
The `ALTER LIVE VIEW ... REFRESH` statement refreshes a [Live view](../create/view.md#live-view). See [Force Live View Refresh](../create/view.md#live-view-alter-refresh).
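For example (assuming a live view named `lv` already exists):
```sql
ALTER LIVE VIEW lv REFRESH;
```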

View File

@ -5,9 +5,9 @@ toc_title: VIEW
# CREATE VIEW {#create-view}
Creates a new view. There are two types of views: normal and materialized.
Creates a new view. Views can be [normal](#normal), [materialized](#materialized) and [live](#live-view) (the latter is an experimental feature).
## Normal {#normal}
## Normal View {#normal}
Syntax:
@ -35,7 +35,7 @@ This query is fully equivalent to using the subquery:
SELECT a, b, c FROM (SELECT ...)
```
## Materialized {#materialized}
## Materialized View {#materialized}
``` sql
CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ...
@ -59,7 +59,7 @@ If you specify `POPULATE`, the existing table data is inserted in the view when
A `SELECT` query can contain `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`… Note that the corresponding conversions are performed independently on each block of inserted data. For example, if `GROUP BY` is set, data is aggregated during insertion, but only within a single packet of inserted data. The data won't be further aggregated. The exception is when using an `ENGINE` that independently performs data aggregation, such as `SummingMergeTree`.
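As a sketch of the `SummingMergeTree` exception mentioned above (the source table `events` and its columns are illustrative):
``` sql
-- Per-block GROUP BY produces partial sums; the SummingMergeTree target
-- engine collapses them further during background merges.
CREATE MATERIALIZED VIEW daily_totals
ENGINE = SummingMergeTree ORDER BY day
AS SELECT toDate(ts) AS day, sum(amount) AS total
FROM events GROUP BY day;
```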
The execution of [ALTER](../../../sql-reference/statements/alter/index.md) queries on materialized views has limitations, so they might be inconvenient. If the materialized view uses the construction `TO [db.]name`, you can `DETACH` the view, run `ALTER` for the target table, and then `ATTACH` the previously detached (`DETACH`) view.
The execution of [ALTER](../../../sql-reference/statements/alter/view.md) queries on materialized views has limitations, so they might be inconvenient. If the materialized view uses the construction `TO [db.]name`, you can `DETACH` the view, run `ALTER` for the target table, and then `ATTACH` the previously detached (`DETACH`) view.
Note that a materialized view is influenced by the [optimize_on_insert](../../../operations/settings/settings.md#optimize-on-insert) setting. The data is merged before the insertion into a view.
@ -67,7 +67,7 @@ Views look the same as normal tables. For example, they are listed in the result
To delete a view, use [DROP VIEW](../../../sql-reference/statements/drop.md#drop-view), although `DROP TABLE` works for views as well.
## Live View (Experimental) {#live-view}
## Live View [Experimental] {#live-view}
!!! important "Important"
This is an experimental feature that may change in backwards-incompatible ways in the future releases.
@ -93,7 +93,7 @@ Live views work similarly to how a query in a distributed table works. But inste
See [WITH REFRESH](#live-view-with-refresh) to force periodic updates of a live view that in some cases can be used as a workaround.
### Monitoring Changes {#live-view-monitoring}
### Monitoring Live View Changes {#live-view-monitoring}
You can monitor changes in the `LIVE VIEW` query result using [WATCH](../../../sql-reference/statements/watch.md) query.
@ -118,12 +118,11 @@ WATCH lv;
│ 1 │ 1 │
└────────┴──────────┘
┌─sum(x)─┬─_version─┐
│      2 │        2 │
│      3 │        2 │
└────────┴──────────┘
┌─sum(x)─┬─_version─┐
│ 6 │ 3 │
└────────┴──────────┘
...
```
```sql
@ -154,7 +153,6 @@ WATCH lv EVENTS;
┌─version─┐
│ 3 │
└─────────┘
...
```
You can execute [SELECT](../../../sql-reference/statements/select/index.md) query on a live view in the same way as for any regular view or a table. If the query result is cached it will return the result immediately without running the stored query on the underlying tables.
@ -163,7 +161,7 @@ You can execute [SELECT](../../../sql-reference/statements/select/index.md) quer
SELECT * FROM [db.]live_view WHERE ...
```
### Force Refresh {#live-view-alter-refresh}
### Force Live View Refresh {#live-view-alter-refresh}
You can force live view refresh using the `ALTER LIVE VIEW [db.]table_name REFRESH` statement.
@ -235,7 +233,7 @@ WATCH lv
Code: 60. DB::Exception: Received from localhost:9000. DB::Exception: Table default.lv does not exist..
```
### Usage {#live-view-usage}
### Live View Usage {#live-view-usage}
Most common uses of live view tables include:
@ -244,4 +242,5 @@ Most common uses of live view tables include:
- Watching for table changes and triggering a follow-up select queries.
- Watching metrics from system tables using periodic refresh.
[Original article](https://clickhouse.tech/docs/en/sql-reference/statements/create/view/) <!--hide-->
**See Also**
- [ALTER LIVE VIEW](../alter/view.md#alter-live-view)

View File

@ -8,7 +8,7 @@ toc_title: SHOW
## SHOW CREATE TABLE {#show-create-table}
``` sql
SHOW CREATE [TEMPORARY] [TABLE|DICTIONARY] [db.]table [INTO OUTFILE filename] [FORMAT format]
SHOW CREATE [TEMPORARY] [TABLE|DICTIONARY|VIEW] [db.]table|view [INTO OUTFILE filename] [FORMAT format]
```
Returns a single `String`-type `statement` column, which contains a single value: the `CREATE` query used for creating the specified object.
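For instance, with the newly documented `VIEW` keyword (the view name is illustrative):
``` sql
SHOW CREATE VIEW my_view;
```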

View File

@ -26,7 +26,6 @@ the results of a `SELECT`, and to perform an `INSERT` into a file-backed table
| [CustomSeparated](#format-customseparated) | ✔ | ✔ |
| [Values](#data-format-values) | ✔ | ✔ |
| [Vertical](#vertical) | ✗ | ✔ |
| [VerticalRaw](#verticalraw) | ✗ | ✔ |
| [JSON](#json) | ✗ | ✔ |
| [JSONCompact](#jsoncompact) | ✗ | ✔ |
| [JSONEachRow](#jsoneachrow) | ✔ | ✔ |
@ -819,10 +818,6 @@ test: string with 'quotes' and with some special
This format is only appropriate for outputting a query result, not for parsing (retrieving data to insert into a table).
## VerticalRaw {#verticalraw}
Similar to [Vertical](#vertical), but with escaping disabled. This format is only suitable for outputting query results, not for parsing (receiving data and inserting it into the table).
## XML {#xml}
The XML format is suitable only for output, not for parsing. Example:

View File

@ -22,7 +22,6 @@ ClickHouse can accept (`INSERT`) and return (`SELECT`
| [CustomSeparated](#format-customseparated) | ✔ | ✔ |
| [Values](#data-format-values) | ✔ | ✔ |
| [Vertical](#vertical) | ✗ | ✔ |
| [VerticalRaw](#verticalraw) | ✗ | ✔ |
| [JSON](#json) | ✗ | ✔ |
| [JSONAsString](#jsonasstring) | ✔ | ✗ |
| [JSONStrings](#jsonstrings) | ✗ | ✔ |
@ -916,10 +915,6 @@ test: string with 'quotes' and with some special
This format is only suitable for outputting a query result, not for parsing (receiving data to insert into a table).
## VerticalRaw {#verticalraw}
Similar to [Vertical](#vertical), but with escaping disabled. This format is only suitable for outputting a query result, not for parsing (receiving data to insert into a table).
## XML {#xml}
The XML format is suitable only for data output, not for parsing. Example:
@ -1493,4 +1488,4 @@ ClickHouse supports writing to and reading from files
$ clickhouse-client --query="CREATE TABLE msgpack (array Array(UInt8)) ENGINE = Memory;"
$ clickhouse-client --query="INSERT INTO msgpack VALUES ([0, 1, 2, 3, 42, 253, 254, 255]), ([255, 254, 253, 42, 3, 2, 1, 0])";
$ clickhouse-client --query="SELECT * FROM msgpack FORMAT MsgPack" > tmp_msgpack.msgpk;
```
```

View File

@ -50,7 +50,7 @@ AS state FROM train_data;
After the state is saved in a table, it can be used multiple times for prediction, or merged with other states to create new, improved models.
``` sql
```sql
WITH (SELECT state FROM your_model) AS model SELECT
evalMLMethod(model, param1, param2) FROM test_data
```
@ -65,9 +65,9 @@ evalMLMethod(model, param1, param2) FROM test_data
<!-- -->
``` sql
SELECT state1 + state2 FROM your_models
```
```sql
SELECT state1 + state2 FROM your_models
```
where the `your_models` table contains both models. The query will return a new `AggregateFunctionState` object.
@ -75,9 +75,9 @@ evalMLMethod(model, param1, param2) FROM test_data
<!-- -->
``` sql
SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data
```
```sql
SELECT stochasticLinearRegression(0.01)(target, param1, param2) FROM train_data
```
Such a query builds a model and returns its weights: the coefficients corresponding to the model's parameters, and the bias. So in the example above, the query returns a column with three values.

View File

@ -14,7 +14,7 @@ ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN
The query specifies a comma-separated list of one or more actions.
Each action is an operation on a column.
Most `ALTER` queries modify table settings or data:
Most `ALTER TABLE` queries modify table settings or data:
- [COLUMN](../../../sql-reference/statements/alter/column.md)
- [PARTITION](../../../sql-reference/statements/alter/partition.md)
@ -26,7 +26,12 @@ ALTER TABLE [db].name [ON CLUSTER cluster] ADD|DROP|CLEAR|COMMENT|MODIFY COLUMN
- [TTL](../../../sql-reference/statements/alter/ttl.md)
!!! note "Note"
The `ALTER` query is supported only for `*MergeTree` tables, as well as `Merge` and `Distributed`. The query has several variants.
The `ALTER TABLE` query is supported only for `*MergeTree` tables, as well as `Merge` and `Distributed`. The query has several variants.
The following `ALTER` queries manage views:
- [ALTER TABLE ... MODIFY QUERY](../../../sql-reference/statements/alter/view.md) — modifies the structure of a [Materialized view](../create/view.md#materialized).
- [ALTER LIVE VIEW](../../../sql-reference/statements/alter/view.md#alter-live-view) — refreshes a [Live view](../create/view.md#live-view).
The following `ALTER` queries modify entities related to role-based access control:

View File

@ -0,0 +1,44 @@
---
toc_priority: 50
toc_title: VIEW
---
# ALTER TABLE … MODIFY QUERY Statement {#alter-modify-query}
You can modify the `SELECT` query that was specified when a [materialized view](../create/view.md#materialized) was created, using the `ALTER TABLE … MODIFY QUERY` statement. Use it if the materialized view was created without the `TO [db.]name` clause. The `allow_experimental_alter_materialized_view_structure` setting must be enabled.
If the `TO [db.]name` construction was used when the materialized view was created, then to change it you must detach the view with [DETACH](../detach.md), alter the target table with [ALTER TABLE](index.md), and then attach the previously detached view with [ATTACH](../attach.md).
**Example**
```sql
CREATE TABLE src_table (`a` UInt32) ENGINE = MergeTree ORDER BY a;
CREATE MATERIALIZED VIEW mv (`a` UInt32) ENGINE = MergeTree ORDER BY a AS SELECT a FROM src_table;
INSERT INTO src_table (a) VALUES (1), (2);
SELECT * FROM mv;
```
```text
┌─a─┐
│ 1 │
│ 2 │
└───┘
```
```sql
ALTER TABLE mv MODIFY QUERY SELECT a * 2 as a FROM src_table;
INSERT INTO src_table (a) VALUES (3), (4);
SELECT * FROM mv;
```
```text
┌─a─┐
│ 6 │
│ 8 │
└───┘
┌─a─┐
│ 1 │
│ 2 │
└───┘
```
## ALTER LIVE VIEW Statement {#alter-live-view}
The `ALTER LIVE VIEW ... REFRESH` statement refreshes a [Live view](../create/view.md#live-view). See [Force Live View Refresh](../create/view.md#live-view-alter-refresh).

View File

@ -5,7 +5,7 @@ toc_title: "View"
# CREATE VIEW {#create-view}
Creates a view. There are two types of views: normal and materialized (MATERIALIZED).
Creates a view. Views can be [normal](#normal), [materialized](#materialized) (MATERIALIZED), or [LIVE](#live-view).
## Normal Views {#normal}
@ -54,7 +54,7 @@ CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]na
A `SELECT` query can contain `DISTINCT`, `GROUP BY`, `ORDER BY`, `LIMIT`… Keep in mind that the corresponding transformations are performed independently on each block of inserted data. For example, if `GROUP BY` is present, data is aggregated during insertion, but only within a single batch of inserted data; the data will not be aggregated further. The exception is when using an ENGINE that performs data aggregation on its own, such as `SummingMergeTree`.
The execution of `ALTER` queries on materialized views is not fully implemented, so they can be inconvenient to use. If the materialized view uses the `TO [db.]name` construction, you can `DETACH` the view, run `ALTER` for the target table, and then `ATTACH` the previously detached (`DETACH`) view.
The execution of [ALTER](../../../sql-reference/statements/alter/view.md) queries on materialized views has its own specifics, so these queries can be inconvenient to use. If the materialized view uses the `TO [db.]name` construction, you can `DETACH` the view, run `ALTER` for the target table, and then `ATTACH` the previously detached (`DETACH`) view.
Note that the operation of a materialized view is affected by the [optimize_on_insert](../../../operations/settings/settings.md#optimize-on-insert) setting. The data is merged before being inserted into the view.
@ -62,7 +62,7 @@ CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]na
To delete a view, use [DROP VIEW](../../../sql-reference/statements/drop.md#drop-view). However, `DROP TABLE` also works for views.
## LIVE Views {#live-view}
## LIVE Views [Experimental] {#live-view}
!!! important "Important"
`LIVE VIEW` views are an experimental feature. Using them may entail loss of compatibility in future versions.
@ -86,7 +86,7 @@ LIVE views work on the same principle
In cases when a `LIVE VIEW` is not updated automatically, use [WITH REFRESH](#live-view-with-refresh) to update it forcibly with a specified periodicity.
### Monitoring Changes {#live-view-monitoring}
### Monitoring LIVE View Changes {#live-view-monitoring}
To monitor changes of a LIVE view, use the [WATCH](../../../sql-reference/statements/watch.md) query.
@ -108,12 +108,11 @@ WATCH lv;
│ 1 │ 1 │
└────────┴──────────┘
┌─sum(x)─┬─_version─┐
│      2 │        2 │
│      3 │        2 │
└────────┴──────────┘
┌─sum(x)─┬─_version─┐
│ 6 │ 3 │
└────────┴──────────┘
...
```
```sql
@ -148,7 +147,7 @@ WATCH lv EVENTS;
SELECT * FROM [db.]live_view WHERE ...
```
### Forced Refresh {#live-view-alter-refresh}
### Forced LIVE View Refresh {#live-view-alter-refresh}
To force a LIVE view to refresh, use the `ALTER LIVE VIEW [db.]table_name REFRESH` query.
@ -220,9 +219,9 @@ WATCH lv;
Code: 60. DB::Exception: Received from localhost:9000. DB::Exception: Table default.lv doesn't exist..
```
### Usage {#live-view-usage}
### LIVE View Usage {#live-view-usage}
The most common use cases for `LIVE VIEW`:
The most common use cases for LIVE views:
- Receiving push notifications about data changes without extra periodic queries.
- Caching the results of frequently used queries to retrieve them without delay.

View File

@ -8,10 +8,10 @@ toc_title: SHOW
## SHOW CREATE TABLE {#show-create-table}
``` sql
SHOW CREATE [TEMPORARY] [TABLE|DICTIONARY] [db.]table [INTO OUTFILE filename] [FORMAT format]
SHOW CREATE [TEMPORARY] [TABLE|DICTIONARY|VIEW] [db.]table|view [INTO OUTFILE filename] [FORMAT format]
```
Returns a single `String`-type column named `statement`, containing a single value: the `CREATE TABLE` query used to create the specified object.
Returns a single `String`-type column named `statement`, containing a single value: the `CREATE` query used to create the specified object.
## SHOW DATABASES {#show-databases}

View File

@ -12,7 +12,7 @@ htmlmin==0.1.12
idna==2.10
Jinja2>=2.11.3
jinja2-highlight==0.6.1
jsmin==2.2.2
jsmin==3.0.0
livereload==2.6.2
Markdown==3.3.2
MarkupSafe==1.1.1

View File

@ -6,13 +6,13 @@ toc_title: Cloud
# ClickHouse Cloud Service Providers {#clickhouse-cloud-service-providers}
!!! info "Note"
If you have launched a public cloud with a managed ClickHouse service, feel free to [open a pull request](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/commercial/cloud.md) adding it to the list below.
## Yandex Cloud {#yandex-cloud}
[Yandex Managed Service for ClickHouse](https://cloud.yandex.com/services/managed-clickhouse?utm_source=referrals&utm_medium=clickhouseofficialsite&utm_campaign=link3) provides the following key features:
- Fully managed ZooKeeper service for [ClickHouse replication](../engines/table-engines/mergetree-family/replication.md)
- A choice of multiple storage types
- Replicas in different availability zones
- Encryption and isolation
@ -20,34 +20,43 @@ toc_title: Cloud
## Altinity.Cloud {#altinity.cloud}
[Altinity.Cloud](https://altinity.com/cloud-database/) is a fully managed ClickHouse-as-a-Service for the Amazon public cloud
- Fast deployment of ClickHouse clusters on Amazon resources
- Easy scale-out/scale-in as well as vertical scaling of nodes
- Isolated tenants with public endpoints or VPC peering
- Configurable storage types and volume configurations
- Cross-AZ scaling for performance and high availability
- Built-in monitoring and SQL query editor
## Alibaba Cloud {#alibaba-cloud}
[Alibaba Cloud Managed Service for ClickHouse](https://www.alibabacloud.com/zh/product/clickhouse) provides the following key features:
- A highly reliable cloud disk storage engine based on the Alibaba Apsara distributed system
- Capacity expansion on demand, without manual data migration
- Support for single-node, single-replica, multi-node, and multi-replica architectures, and for hot/cold data tiering
- Support for access allow-lists, one-click recovery, multi-layer network security protection, and cloud disk encryption
- Seamless integration with cloud log systems, databases, and data application tools
- A built-in monitoring and database management platform
- Professional technical support and service from database experts
## SberCloud {#sbercloud}
[SberCloud.Advanced](https://sbercloud.ru/en/advanced) provides [MapReduce Service (MRS)](https://docs.sbercloud.ru/mrs/ug/topics/ug__clickhouse.html), a reliable, secure, and easy-to-use enterprise platform for storing, processing, and analyzing big data. MRS allows you to quickly create and manage ClickHouse clusters.
- A ClickHouse instance consists of three ZooKeeper nodes and multiple ClickHouse nodes. The Dedicated Replica mode is used to ensure high reliability with dual data copies.
- MRS provides smooth elastic scaling to quickly meet business growth when cluster storage or CPU resources are insufficient. When you expand the capacity of ClickHouse nodes in a cluster, MRS provides a one-click data balancing tool and lets you choose when to balance. You can determine the balancing mode and timing based on business characteristics to ensure service availability and smooth scaling.
- MRS uses Elastic Load Balancing to ensure a highly available deployment architecture, automatically distributing user traffic across multiple backend nodes, extending service capacity to external systems, and improving fault tolerance. With the ELB round-robin mechanism, data is written to local tables and read from distributed tables on different nodes, ensuring high availability of data reads/writes and application access.
## Tencent Cloud {#tencent-cloud}
[Tencent Cloud Managed Service for ClickHouse](https://cloud.tencent.com/product/cdwch) provides the following key features:
- Easy to deploy and manage on Tencent Cloud
- Highly scalable and available
- Integrated monitoring and alerting services
- High security with per-cluster VPC isolation
- On-demand pricing with no upfront costs or long-term commitments
{## [Original article](https://clickhouse.tech/docs/en/commercial/cloud/) ##}

View File

@ -6,7 +6,7 @@ toc_title: Introduction
# ClickHouse Commercial Services {#clickhouse-commercial-services}
This section is a directory of commercial service providers specializing in ClickHouse. They are independent companies, not necessarily affiliated with Yandex.
Service categories:
@ -14,4 +14,4 @@ toc_title: Introduction
- [Support](../commercial/support.md)
!!! note "For service providers"
If you happen to represent one of them, feel free to open a pull request adding your company to the respective section (or even adding a new section if the service does not fit the existing categories). The easiest way to open a pull request about the documentation is to click the "pencil" edit button in the top-right corner. If your service is available in some local market, make sure to mention it in the localized documentation page as well (or at least point it out in the pull request description).

View File

@ -6,16 +6,20 @@ toc_title: Support
# ClickHouse Commercial Support Service Providers {#clickhouse-commercial-support-service-providers}
!!! info "Note"
If you have launched a ClickHouse commercial support service, feel free to [open a pull request](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/commercial/support.md) adding it to the list below.
## Yandex.Cloud
Worldwide ClickHouse support from the ClickHouse authors. Supports on-premise and cloud deployments. Ask for details at clickhouse-support@yandex-team.com
## Altinity {#altinity}
Altinity has offered enterprise ClickHouse support and services since 2017. Altinity customers range from Fortune 100 enterprises to startups. Visit [www.altinity.com](https://www.altinity.com/) for more information.
## Mafiree {#mafiree}
[Service description](http://mafiree.com/clickhouse-analytics-services.php)
## MinervaDB {#minervadb}
[Service description](https://minervadb.com/index.php/clickhouse-consulting-and-support-by-minervadb/)

View File

@ -0,0 +1,150 @@
# How to add a test query to ClickHouse CI
ClickHouse has hundreds (or even thousands) of features. Every commit is checked by a complex set of tests containing several thousand test cases.
The core functionality is well tested, but corner cases and different feature combinations can be uncovered with ClickHouse CI.
Most of the bugs/regressions we see happen in the `gray area` where test coverage is poor.
We are very interested in covering most of the possible scenarios and feature combinations used in real life with tests.
## Why add tests
Why/when should a test case be added to the ClickHouse code:
1) you use some complicated scenarios / feature combinations / you have some corner case which is probably not widely used
2) you see that certain behavior changed between versions without notice in the changelog
3) you just want to help improve ClickHouse quality and ensure the features you use will not be broken in future releases
4) once a test is added/accepted, you can be sure the corner case you check will never be broken accidentally
5) you will become part of a great open-source community
6) your name will appear in the `system.contributors` table!
7) you will make the world a bit better.
### Steps to do
#### Prerequisites
I assume you run some Linux machine (you can use docker / virtual machines on other OSes), have any modern browser / internet connection, and some basic Linux and SQL skills.
No highly specialized knowledge is needed (so you don't need to know C++ or how ClickHouse CI works).
#### Preparation
1) [create a GitHub account](https://github.com/join) (if you don't have one yet)
2) [set up git](https://docs.github.com/en/free-pro-team@latest/github/getting-started-with-github/set-up-git)
```bash
# for Ubuntu
sudo apt-get update
sudo apt-get install git
git config --global user.name "John Doe" # fill with your name
git config --global user.email "email@example.com" # fill with your email
```
3) [fork the ClickHouse project](https://docs.github.com/en/free-pro-team@latest/github/getting-started-with-github/fork-a-repo) - open [https://github.com/ClickHouse/ClickHouse](https://github.com/ClickHouse/ClickHouse) and press the fork button in the top right corner:
![fork repo](https://github-images.s3.amazonaws.com/help/bootcamp/Bootcamp-Fork.png)
4) clone your fork to some folder on your PC, for example, `~/workspace/ClickHouse`
```
mkdir ~/workspace && cd ~/workspace
git clone https://github.com/<your GitHub username>/ClickHouse
cd ClickHouse
git remote add upstream https://github.com/ClickHouse/ClickHouse
```
#### New branch for the test
1) create a new branch from the latest clickhouse master
```
cd ~/workspace/ClickHouse
git fetch upstream
git checkout -b name_for_a_branch_with_my_test upstream/master
```
#### Install and run clickhouse
1) install `clickhouse-server` (follow the [official docs](https://clickhouse.tech/docs/en/getting-started/install/))
2) install the test configuration (it will use a Zookeeper mock implementation and adjust some settings)
```
cd ~/workspace/ClickHouse/tests/config
sudo ./install.sh
```
3) run clickhouse-server
```
sudo systemctl restart clickhouse-server
```
#### Creating the test file
1) find the number for your test - find the file with the biggest number in `tests/queries/0_stateless/`
```sh
$ cd ~/workspace/ClickHouse
$ ls tests/queries/0_stateless/[0-9]*.reference | tail -n 1
tests/queries/0_stateless/01520_client_print_query_id.reference
```
Currently, the last number for a test is `01520`, so my test will have the number `01521`
2) create an SQL file with the next number and a name for the feature you are testing
```sh
touch tests/queries/0_stateless/01521_dummy_test.sql
```
3) edit the SQL file with your favorite editor (see hints on creating tests below)
```sh
vim tests/queries/0_stateless/01521_dummy_test.sql
```
4) run the test, and put its result into the reference file:
```
clickhouse-client -nmT < tests/queries/0_stateless/01521_dummy_test.sql | tee tests/queries/0_stateless/01521_dummy_test.reference
```
5) make sure everything is correct; if the test output is incorrect (for example, due to some bug), adjust the reference file with a text editor.
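For illustration, a minimal test file might look like the sketch below (the table name and the checked behavior are invented for this example; error code 47 is `UNKNOWN_IDENTIFIER`):
```sql
-- Clean up first, in case a previous run left dirty state.
DROP TABLE IF EXISTS test_01521;
CREATE TABLE test_01521 (x UInt64) ENGINE = MergeTree ORDER BY x;
INSERT INTO test_01521 SELECT number FROM numbers(5);
-- The output of this SELECT goes into the .reference file.
SELECT sum(x) FROM test_01521;
-- Asserting that a query fails with a specific error code.
SELECT y FROM test_01521; -- { serverError 47 }
DROP TABLE test_01521;
```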
#### How to create a good test
- the test should be
    - minimal - create only tables related to the tested functionality, remove unrelated columns and parts of the query
    - fast - should not take longer than a few seconds (better: subseconds)
    - correct - fails when the feature does not work
    - deterministic
    - isolated / stateless
        - don't rely on some environment things
        - don't rely on timing when possible
- try to cover corner cases (zeros / Nulls / empty sets / throwing exceptions)
- to test that a query returns an error, you can put a special comment after the query: `-- { serverError 60 }` or `-- { clientError 20 }`
- don't switch databases (unless necessary)
- you can create several table replicas on the same node if needed
- you can use one of the test cluster definitions when needed (see system.clusters)
- use `number` / `numbers_mt` / `zeros` / `zeros_mt` and similar for queries / to initialize data when applicable
- clean up the created objects after the test and before the test (DROP IF EXISTS) - in case there is some dirty state
- prefer synchronous modes of operation (mutations, merges)
- use other SQL files in the `0_stateless` folder as examples
- make sure the feature / feature combination you want to test is not yet covered by existing tests
#### Test naming rules
It's important to name tests correctly, so one can turn some test subsets off in the clickhouse-test invocation.
| Tester flag | What should be in the test name | When the flag should be added |
|---|---|---|
| `--[no-]zookeeper` | "zookeeper" or "replica" | the test uses tables from the ReplicatedMergeTree family |
| `--[no-]shard` | "shard" or "distributed" or "global" | the test uses connections to 127.0.0.2 or similar |
| `--[no-]long` | "long" or "deadlock" or "race" | the test runs longer than 60 seconds |
#### Commit / push / create a PR
1) commit & push your changes
```sh
cd ~/workspace/ClickHouse
git add tests/queries/0_stateless/01521_dummy_test.sql
git add tests/queries/0_stateless/01521_dummy_test.reference
git commit # use some nice commit message when possible
git push origin HEAD
```
2) use the link shown during the push to create a PR to master
3) adjust the PR title and description, keep `Build/Testing/Packaging Improvement` in `Changelog category (leave one)`, and fill the rest of the fields if needed.

View File

@ -23,7 +23,6 @@ ClickHouse can accept and return data in various formats. The supported input formats
| [CustomSeparated](#format-customseparated) | ✔ | ✔ |
| [Values](#data-format-values) | ✔ | ✔ |
| [Vertical](#vertical) | ✗ | ✔ |
| [VerticalRaw](#verticalraw) | ✗ | ✔ |
| [JSON](#json) | ✗ | ✔ |
| [JSONAsString](#jsonasstring) | ✔ | ✗ |
| [JSONStrings](#jsonstrings) | ✗ | ✔ |
@ -951,31 +950,6 @@ SELECT * FROM t_null FORMAT Vertical
This format is only suitable for outputting query results, but not for parsing (inserting data into a table).
## VerticalRaw {#verticalraw}
`Vertical` 格式不同点在于,行是不会被转义的。
这种格式仅仅适用于输出,但不适用于解析输入(将数据插入到表中)。
示例:
:) SHOW CREATE TABLE geonames FORMAT VerticalRaw;
Row 1:
──────
statement: CREATE TABLE default.geonames ( geonameid UInt32, date Date DEFAULT CAST('2017-12-08' AS Date)) ENGINE = MergeTree(date, geonameid, 8192)
:) SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT VerticalRaw;
Row 1:
──────
test: string with 'quotes' and with some special
characters
Compare with the Vertical format:
:) SELECT 'string with \'quotes\' and \t with some special \n characters' AS test FORMAT Vertical;
Row 1:
──────
test: string with \'quotes\' and \t with some special \n characters
## XML {#xml}
This format is only suitable for outputting query results, but not for parsing input. Example:

View File

@ -72,6 +72,22 @@ the ClickHouse web interface [Tabix](https://github.com/tabixio/tabix).
[clickhouse-flamegraph](https://github.com/Slach/clickhouse-flamegraph) is a specialized tool to visualize `system.trace_log` as a [flamegraph](http://www.brendangregg.com/flamegraphs.html).
### DBM {#dbm}
[DBM](https://dbm.incubator.edurt.io/) is a visual management tool for ClickHouse!
Features:
- Supports query history (pagination, clear all, etc.)
- Supports running selected SQL clauses (multiple windows, etc.)
- Supports terminating queries
- Supports table management
- Supports database management
- Supports custom queries
- Supports management of multiple data sources (connection test, monitoring)
- Supports monitoring (processes, connections, queries)
- Supports data migration
## Commercial {#shang-ye}
### Holistics {#holistics-software}
@ -99,20 +115,4 @@ the ClickHouse web interface [Tabix](https://github.com/tabixio/tabix).
- Refactoring.
- Search and navigation.
### DBM {#dbm}
[DBM](https://dbm.incubator.edurt.io/) is a visual management tool for ClickHouse!
Features:
- Supports query history (pagination, clear all, etc.)
- Supports running selected SQL clauses (multiple windows, etc.)
- Supports terminating queries
- Supports table management
- Supports database management
- Supports custom queries
- Supports management of multiple data sources (connection test, monitoring)
- Supports monitoring (processes, connections, queries)
- Supports data migration
[Original article](https://clickhouse.tech/docs/zh/interfaces/third-party/gui/) <!--hide-->

View File

@ -1274,13 +1274,14 @@ TaskStatus ClusterCopier::processPartitionPieceTaskImpl(
auto get_select_query = [&] (const DatabaseAndTableName & from_table, const String & fields, bool enable_splitting, String limit = "")
{
String query;
query += "WITH " + task_partition.name + " AS partition_key ";
query += "SELECT " + fields + " FROM " + getQuotedTable(from_table);
if (enable_splitting && experimental_use_sample_offset)
query += " SAMPLE 1/" + toString(number_of_splits) + " OFFSET " + toString(current_piece_number) + "/" + toString(number_of_splits);
/// TODO: Bad, it is better to rewrite with ASTLiteral(partition_key_field)
query += " WHERE (" + queryToString(task_table.engine_push_partition_key_ast) + " = (" + task_partition.name + " AS partition_key))";
query += " WHERE (" + queryToString(task_table.engine_push_partition_key_ast) + " = partition_key)";
if (enable_splitting && !experimental_use_sample_offset)
query += " AND ( cityHash64(" + primary_key_comma_separated + ") %" + toString(number_of_splits) + " = " + toString(current_piece_number) + " )";
@ -1851,9 +1852,9 @@ bool ClusterCopier::checkShardHasPartition(const ConnectionTimeouts & timeouts,
TaskTable & task_table = task_shard.task_table;
WriteBufferFromOwnString ss;
ss << "WITH " + partition_quoted_name + " AS partition_key ";
ss << "SELECT 1 FROM " << getQuotedTable(task_shard.table_read_shard);
ss << " WHERE (" << queryToString(task_table.engine_push_partition_key_ast);
ss << " = (" + partition_quoted_name << " AS partition_key))";
ss << " WHERE (" << queryToString(task_table.engine_push_partition_key_ast) << " = partition_key)";
if (!task_table.where_condition_str.empty())
ss << " AND (" << task_table.where_condition_str << ")";
ss << " LIMIT 1";
@ -1882,13 +1883,15 @@ bool ClusterCopier::checkPresentPartitionPiecesOnCurrentShard(const ConnectionTi
UNUSED(primary_key_comma_separated);
std::string query = "SELECT 1 FROM " + getQuotedTable(task_shard.table_read_shard);
std::string query;
query += "WITH " + partition_quoted_name + " AS partition_key ";
query += "SELECT 1 FROM " + getQuotedTable(task_shard.table_read_shard);
if (experimental_use_sample_offset)
query += " SAMPLE 1/" + toString(number_of_splits) + " OFFSET " + toString(current_piece_number) + "/" + toString(number_of_splits);
query += " WHERE (" + queryToString(task_table.engine_push_partition_key_ast)
+ " = (" + partition_quoted_name + " AS partition_key))";
query += " WHERE (" + queryToString(task_table.engine_push_partition_key_ast) + " = partition_key)";
if (!experimental_use_sample_offset)
query += " AND (cityHash64(" + primary_key_comma_separated + ") % "

View File

@ -162,18 +162,18 @@ public:
struct BlockerInThread
{
private:
BlockerInThread(const BlockerInThread &) = delete;
BlockerInThread & operator=(const BlockerInThread &) = delete;
static thread_local uint64_t counter;
static thread_local VariableContext level;
VariableContext previous_level;
public:
/// level_ - block in level and above
BlockerInThread(VariableContext level_ = VariableContext::User);
explicit BlockerInThread(VariableContext level_ = VariableContext::User);
~BlockerInThread();
BlockerInThread(const BlockerInThread &) = delete;
BlockerInThread & operator=(const BlockerInThread &) = delete;
static bool isBlocked(VariableContext current_level)
{
return counter > 0 && current_level >= level;
@ -195,9 +195,6 @@ public:
struct LockExceptionInThread
{
private:
LockExceptionInThread(const LockExceptionInThread &) = delete;
LockExceptionInThread & operator=(const LockExceptionInThread &) = delete;
static thread_local uint64_t counter;
static thread_local VariableContext level;
static thread_local bool block_fault_injections;
@ -207,9 +204,12 @@ public:
public:
/// level_ - block in level and above
/// block_fault_injections_ - block in fault injection too
LockExceptionInThread(VariableContext level_ = VariableContext::User, bool block_fault_injections_ = true);
explicit LockExceptionInThread(VariableContext level_ = VariableContext::User, bool block_fault_injections_ = true);
~LockExceptionInThread();
LockExceptionInThread(const LockExceptionInThread &) = delete;
LockExceptionInThread & operator=(const LockExceptionInThread &) = delete;
static bool isBlocked(VariableContext current_level, bool fault_injection)
{
return counter > 0 && current_level >= level && (!fault_injection || block_fault_injections);

View File

@ -4,6 +4,7 @@
#include <common/types.h>
#include <atomic>
#include <memory>
inline UInt64 clock_gettime_ns(clockid_t clock_type = CLOCK_MONOTONIC)
@ -22,7 +23,7 @@ public:
/** CLOCK_MONOTONIC works relatively efficient (~15 million calls/sec) and doesn't lead to syscall.
* Pass CLOCK_MONOTONIC_COARSE, if you need better performance with acceptable cost of several milliseconds of inaccuracy.
*/
Stopwatch(clockid_t clock_type_ = CLOCK_MONOTONIC) : clock_type(clock_type_) { start(); }
explicit Stopwatch(clockid_t clock_type_ = CLOCK_MONOTONIC) : clock_type(clock_type_) { start(); }
void start() { start_ns = nanoseconds(); is_running = true; }
void stop() { stop_ns = nanoseconds(); is_running = false; }
@ -43,11 +44,13 @@ private:
UInt64 nanoseconds() const { return clock_gettime_ns(clock_type); }
};
using StopwatchUniquePtr = std::unique_ptr<Stopwatch>;
class AtomicStopwatch
{
public:
AtomicStopwatch(clockid_t clock_type_ = CLOCK_MONOTONIC) : clock_type(clock_type_) { restart(); }
explicit AtomicStopwatch(clockid_t clock_type_ = CLOCK_MONOTONIC) : clock_type(clock_type_) { restart(); }
void restart() { start_ns = nanoseconds(); }
UInt64 elapsed() const { return nanoseconds() - start_ns; }
@ -78,11 +81,11 @@ public:
{
AtomicStopwatch * parent = nullptr;
Lock() {}
Lock() = default;
operator bool() const { return parent != nullptr; }
explicit operator bool() const { return parent != nullptr; }
Lock(AtomicStopwatch * parent_) : parent(parent_) {}
explicit Lock(AtomicStopwatch * parent_) : parent(parent_) {}
Lock(Lock &&) = default;

View File

@ -3,6 +3,7 @@
#include <Common/getNumberOfPhysicalCPUCores.h>
#include <cassert>
#include <iostream>
#include <type_traits>
#include <Poco/Util/Application.h>

View File

@ -339,6 +339,9 @@ Coordination::ZooKeeperRequestPtr deserializeCheckVersionTxn(ReadBuffer & in)
Coordination::read(result->path, in);
Coordination::read(result->version, in);
result->restored_from_zookeeper_log = true;
/// The log stores version + 1 (the version the node should end up with, not the one from the request).
result->version -= 1;
return result;
}

View File

@ -387,6 +387,12 @@ inline bool isUInt8(const T & data_type)
return WhichDataType(data_type).isUInt8();
}
template <typename T>
inline bool isUInt64(const T & data_type)
{
return WhichDataType(data_type).isUInt64();
}
template <typename T>
inline bool isUnsignedInteger(const T & data_type)
{

View File

@ -17,6 +17,7 @@ class IDisk;
using DiskPtr = std::shared_ptr<IDisk>;
using Disks = std::vector<DiskPtr>;
class IReservation;
using ReservationSharedPtr = std::shared_ptr<IReservation>;
using ReservationPtr = std::unique_ptr<IReservation>;
using Reservations = std::vector<ReservationPtr>;

View File

@ -1077,6 +1077,11 @@ struct BitmapAndnotImpl
}
};
struct NameBitmapAnd
{
static constexpr auto name = "bitmapAnd";
};
template <template <typename> class Impl, typename Name>
class FunctionBitmap : public IFunction
{
@ -1176,10 +1181,16 @@ private:
const AggregateDataPtr data_ptr_0 = is_column_const[0] ? container0[0] : container0[i];
const AggregateDataPtr data_ptr_1 = is_column_const[1] ? container1[0] : container1[i];
col_to->insertFrom(data_ptr_0);
// bitmapAnd(RoaringBitmap, SmallSet) is slower than bitmapAnd(SmallSet, RoaringBitmap), so we can swap the two arguments to gain speed.
auto * bm_1 = reinterpret_cast<AggregateFunctionGroupBitmapData<T> *>(data_ptr_0);
auto * bm_2 = reinterpret_cast<AggregateFunctionGroupBitmapData<T> *>(data_ptr_1);
// Check the name of the operation (bitmapAnd) and whether this is the situation mentioned above.
auto need_exchange = (name == NameBitmapAnd::name) && bm_1->rbs.isLarge() && bm_2->rbs.isSmall();
col_to->insertFrom(need_exchange ? data_ptr_1 : data_ptr_0);
AggregateFunctionGroupBitmapData<T> & bitmap_data_1 = *reinterpret_cast<AggregateFunctionGroupBitmapData<T> *>(col_to->getData()[i]);
const AggregateFunctionGroupBitmapData<T> & bitmap_data_2
= *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(data_ptr_1);
= *reinterpret_cast<const AggregateFunctionGroupBitmapData<T> *>(need_exchange ? data_ptr_0 : data_ptr_1);
Impl<T>::apply(bitmap_data_1, bitmap_data_2);
}
return col_to;
@ -1237,10 +1248,6 @@ using FunctionBitmapAndnotCardinality = FunctionBitmapCardinality<BitmapAndnotCa
using FunctionBitmapHasAll = FunctionBitmapCardinality<BitmapHasAllImpl, NameBitmapHasAll, UInt8>;
using FunctionBitmapHasAny = FunctionBitmapCardinality<BitmapHasAnyImpl, NameBitmapHasAny, UInt8>;
struct NameBitmapAnd
{
static constexpr auto name = "bitmapAnd";
};
struct NameBitmapOr
{
static constexpr auto name = "bitmapOr";

View File

@ -0,0 +1,96 @@
#if !defined(ARCADIA_BUILD)
# include "config_functions.h"
#endif
#if USE_H3
#include <Columns/ColumnArray.h>
#include <Columns/ColumnTuple.h>
#include <Columns/ColumnsNumber.h>
#include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypesNumber.h>
#include <h3api.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int INCORRECT_DATA;
}
class FunctionH3ToGeoBoundary : public IFunction
{
public:
static constexpr auto name = "h3ToGeoBoundary";
String getName() const override { return name; }
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionH3ToGeoBoundary>(); }
size_t getNumberOfArguments() const override { return 1; }
bool useDefaultImplementationForConstants() const override { return true; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
const auto & arg = arguments[0];
if (!isUInt64(arg))
{
throw Exception(
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
"Illegal type {} of argument {} of function {}. Must be UInt64",
arg->getName(), 1, getName());
}
return std::make_shared<DataTypeArray>(
std::make_shared<DataTypeTuple>(
DataTypes{std::make_shared<DataTypeFloat64>(), std::make_shared<DataTypeFloat64>()}));
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
{
const auto * col_hindex = arguments[0].column.get();
auto latitude = ColumnFloat64::create();
auto longitude = ColumnFloat64::create();
auto offsets = DataTypeNumber<IColumn::Offset>().createColumn();
offsets->reserve(input_rows_count);
IColumn::Offset current_offset = 0;
for (size_t row = 0; row < input_rows_count; ++row)
{
H3Index h3index = col_hindex->getUInt(row);
CellBoundary boundary{};
auto err = cellToBoundary(h3index, &boundary);
if (err)
throw Exception(ErrorCodes::INCORRECT_DATA, "Incorrect H3 index: {}, error: {}", h3index, err);
for (int vert = 0; vert < boundary.numVerts; ++vert)
{
latitude->insert(radsToDegs(boundary.verts[vert].lat));
longitude->insert(radsToDegs(boundary.verts[vert].lng));
}
current_offset += boundary.numVerts;
offsets->insert(current_offset);
}
return ColumnArray::create(
ColumnTuple::create(Columns{std::move(latitude), std::move(longitude)}),
std::move(offsets));
}
};
void registerFunctionH3ToGeoBoundary(FunctionFactory & factory)
{
factory.registerFunction<FunctionH3ToGeoBoundary>();
}
}
#endif

View File

@ -29,6 +29,7 @@ void registerFunctionSvg(FunctionFactory & factory);
#if USE_H3
void registerFunctionGeoToH3(FunctionFactory &);
void registerFunctionH3ToGeo(FunctionFactory &);
void registerFunctionH3ToGeoBoundary(FunctionFactory &);
void registerFunctionH3EdgeAngle(FunctionFactory &);
void registerFunctionH3EdgeLengthM(FunctionFactory &);
void registerFunctionH3GetResolution(FunctionFactory &);
@ -81,6 +82,7 @@ void registerFunctionsGeo(FunctionFactory & factory)
#if USE_H3
registerFunctionGeoToH3(factory);
registerFunctionH3ToGeo(factory);
registerFunctionH3ToGeoBoundary(factory);
registerFunctionH3EdgeAngle(factory);
registerFunctionH3EdgeLengthM(factory);
registerFunctionH3GetResolution(factory);

View File

@ -331,12 +331,24 @@ ReturnType readIntTextImpl(T & x, ReadBuffer & buf)
if (buf.count() - initial_pos + 1 >= std::numeric_limits<T>::max_digits10)
{
T signed_res = res;
if (common::mulOverflow<T>(signed_res, 10, signed_res)
|| common::addOverflow<T>(signed_res, (*buf.position() - '0'), signed_res))
return ReturnType(false);
if (negative)
{
T signed_res = -res;
if (common::mulOverflow<T>(signed_res, 10, signed_res) ||
common::subOverflow<T>(signed_res, (*buf.position() - '0'), signed_res))
return ReturnType(false);
res = signed_res;
res = -static_cast<UnsignedT>(signed_res);
}
else
{
T signed_res = res;
if (common::mulOverflow<T>(signed_res, 10, signed_res) ||
common::addOverflow<T>(signed_res, (*buf.position() - '0'), signed_res))
return ReturnType(false);
res = signed_res;
}
break;
}
}
@ -366,7 +378,7 @@ end:
{
if constexpr (check_overflow == ReadIntTextCheckOverflow::CHECK_OVERFLOW)
{
if (common::mulOverflow<T>(x, -1, x))
if (common::mulOverflow<UnsignedT, Int8, T>(res, -1, x))
return ReturnType(false);
}
else
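As context for the hunk above, a sketch of the corner case it handles: the most negative Int64 cannot be produced by negating an accumulated positive total, which is why the final negation now goes through the unsigned type.
```sql
-- 9223372036854775808 (2^63) does not fit in Int64, so '-9223372036854775808'
-- cannot be parsed by reading the digits as a positive value and negating;
-- the code above accumulates through the unsigned type instead.
SELECT toInt64('-9223372036854775808') AS min_int64;
```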

View File

@ -285,6 +285,7 @@ private:
/// A flag, used to distinguish between user query and internal query to a database engine (MaterializePostgreSQL).
bool is_internal_query = false;
public:
// Top-level OpenTelemetry trace context for the query. Makes sense only for a query context.
OpenTelemetryTraceContext query_trace_context;

View File

@ -91,12 +91,7 @@ void optimizeGroupBy(ASTSelectQuery * select_query, const NameSet & source_colum
const FunctionFactory & function_factory = FunctionFactory::instance();
if (!select_query->groupBy())
{
// If there is a HAVING clause without GROUP BY, make sure we have some aggregation happen.
if (select_query->having())
appendUnusedGroupByColumn(select_query, source_columns);
return;
}
const auto is_literal = [] (const ASTPtr & ast) -> bool
{

View File

@ -103,26 +103,34 @@ bool ParserParenthesisExpression::parseImpl(Pos & pos, ASTPtr & node, Expected &
const auto & expr_list = contents_node->as<ASTExpressionList &>();
/// empty expression in parentheses is not allowed
/// Empty expression in parentheses is not allowed.
if (expr_list.children.empty())
{
expected.add(pos, "non-empty parenthesized list of expressions");
return false;
}
/// Special case for one-element tuple.
if (expr_list.children.size() == 1 && is_elem)
{
node = expr_list.children.front();
}
else
{
auto function_node = std::make_shared<ASTFunction>();
function_node->name = "tuple";
function_node->arguments = contents_node;
function_node->children.push_back(contents_node);
node = function_node;
auto * ast_literal = expr_list.children.front()->as<ASTLiteral>();
/// But only if its argument is not a tuple,
/// since otherwise it would perform an incorrect transformation:
///
/// (foo,bar) IN (('foo','bar')) -> (foo,bar) IN ('foo','bar')
if (!(ast_literal && ast_literal->value.getType() == Field::Types::Tuple))
{
node = expr_list.children.front();
return true;
}
}
auto function_node = std::make_shared<ASTFunction>();
function_node->name = "tuple";
function_node->arguments = contents_node;
function_node->children.push_back(contents_node);
node = function_node;
return true;
}

View File

@ -20,6 +20,7 @@
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeTuple.h>
#include <DataTypes/DataTypeMap.h>
#include <DataTypes/DataTypeLowCardinality.h>
namespace DB
{
@ -48,8 +49,10 @@ void ORCOutputStream::write(const void* buf, size_t length)
}
ORCBlockOutputFormat::ORCBlockOutputFormat(WriteBuffer & out_, const Block & header_, const FormatSettings & format_settings_)
: IOutputFormat(header_, out_), format_settings{format_settings_}, output_stream(out_), data_types(header_.getDataTypes())
: IOutputFormat(header_, out_), format_settings{format_settings_}, output_stream(out_)
{
for (const auto & type : header_.getDataTypes())
data_types.push_back(recursiveRemoveLowCardinality(type));
}
ORC_UNIQUE_PTR<orc::Type> ORCBlockOutputFormat::getORCType(const DataTypePtr & type, const std::string & column_name)
@ -482,10 +485,12 @@ void ORCBlockOutputFormat::consume(Chunk chunk)
/// The size of the batch must be no less than total amount of array elements.
ORC_UNIQUE_PTR<orc::ColumnVectorBatch> batch = writer->createRowBatch(getMaxColumnSize(chunk));
orc::StructVectorBatch & root = dynamic_cast<orc::StructVectorBatch &>(*batch);
auto columns = chunk.detachColumns();
for (auto & column : columns)
column = recursiveRemoveLowCardinality(column);
for (size_t i = 0; i != columns_num; ++i)
{
writeColumn(*root.fields[i], *chunk.getColumns()[i], data_types[i], nullptr);
}
writeColumn(*root.fields[i], *columns[i], data_types[i], nullptr);
root.numElements = rows_num;
writer->add(*batch);
}
@ -505,7 +510,7 @@ void ORCBlockOutputFormat::prepareWriter()
options.setCompression(orc::CompressionKind::CompressionKind_NONE);
size_t columns_count = header.columns();
for (size_t i = 0; i != columns_count; ++i)
schema->addStructField(header.safeGetByPosition(i).name, getORCType(data_types[i], header.safeGetByPosition(i).name));
schema->addStructField(header.safeGetByPosition(i).name, getORCType(recursiveRemoveLowCardinality(data_types[i]), header.safeGetByPosition(i).name));
writer = orc::createWriter(*schema, &output_stream, options);
}

View File

@ -26,6 +26,8 @@ public:
BackgroundProcessListEntry(const BackgroundProcessListEntry &) = delete;
BackgroundProcessListEntry & operator=(const BackgroundProcessListEntry &) = delete;
BackgroundProcessListEntry(BackgroundProcessListEntry &&) = default;
BackgroundProcessListEntry(BackgroundProcessList<ListElement, Info> & list_, const typename container_t::iterator it_, const CurrentMetrics::Metric & metric)
: list(list_), it{it_}, metric_increment{metric}
{
@ -58,7 +60,7 @@ protected:
CurrentMetrics::Metric metric;
BackgroundProcessList(const CurrentMetrics::Metric & metric_)
explicit BackgroundProcessList(const CurrentMetrics::Metric & metric_)
: metric(metric_)
{}
public:
@ -85,7 +87,7 @@ public:
virtual void onEntryCreate(const Entry & /* entry */) {}
virtual void onEntryDestroy(const Entry & /* entry */) {}
virtual inline ~BackgroundProcessList() {}
virtual inline ~BackgroundProcessList() = default;
};
}

View File

@ -0,0 +1,49 @@
#pragma once
#include "Storages/MergeTree/IMergeTreeDataPart.h"
namespace DB
{
/* Allow to compute more accurate progress statistics */
class ColumnSizeEstimator
{
using ColumnToSize = MergeTreeDataPartInMemory::ColumnToSize;
ColumnToSize map;
public:
/// Stores approximate size of columns in bytes
/// Exact values are not required since they are used for relative values estimation (progress).
size_t sum_total = 0;
size_t sum_index_columns = 0;
size_t sum_ordinary_columns = 0;
ColumnSizeEstimator(ColumnToSize && map_, const Names & key_columns, const Names & ordinary_columns)
: map(std::move(map_))
{
for (const auto & name : key_columns)
if (!map.count(name)) map[name] = 0;
for (const auto & name : ordinary_columns)
if (!map.count(name)) map[name] = 0;
for (const auto & name : key_columns)
sum_index_columns += map.at(name);
for (const auto & name : ordinary_columns)
sum_ordinary_columns += map.at(name);
sum_total = std::max(static_cast<decltype(sum_index_columns)>(1), sum_index_columns + sum_ordinary_columns);
}
Float64 columnWeight(const String & column) const
{
return static_cast<Float64>(map.at(column)) / sum_total;
}
Float64 keyColumnsWeight() const
{
return static_cast<Float64>(sum_index_columns) / sum_total;
}
};
}

View File

@ -580,9 +580,8 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory(
new_projection_part->is_temp = false;
new_projection_part->setColumns(block.getNamesAndTypesList());
MergeTreePartition partition{};
IMergeTreeDataPart::MinMaxIndex minmax_idx{};
new_projection_part->partition = std::move(partition);
new_projection_part->minmax_idx = std::move(minmax_idx);
new_projection_part->minmax_idx = std::make_shared<IMergeTreeDataPart::MinMaxIndex>();
MergedBlockOutputStream part_out(
new_projection_part,
@ -608,7 +607,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToMemory(
new_data_part->uuid = part_uuid;
new_data_part->is_temp = true;
new_data_part->setColumns(block.getNamesAndTypesList());
new_data_part->minmax_idx.update(block, data.getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
new_data_part->minmax_idx->update(block, data.getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
new_data_part->partition.create(metadata_snapshot, block, 0, context);
MergedBlockOutputStream part_out(

View File

@ -0,0 +1,92 @@
#include "Storages/MergeTree/FutureMergedMutatedPart.h"
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
void FutureMergedMutatedPart::assign(MergeTreeData::DataPartsVector parts_)
{
if (parts_.empty())
return;
size_t sum_rows = 0;
size_t sum_bytes_uncompressed = 0;
MergeTreeDataPartType future_part_type = MergeTreeDataPartType::UNKNOWN;
for (const auto & part : parts_)
{
sum_rows += part->rows_count;
sum_bytes_uncompressed += part->getTotalColumnsSize().data_uncompressed;
future_part_type = std::min(future_part_type, part->getType());
}
auto chosen_type = parts_.front()->storage.choosePartTypeOnDisk(sum_bytes_uncompressed, sum_rows);
future_part_type = std::min(future_part_type, chosen_type);
assign(std::move(parts_), future_part_type);
}
void FutureMergedMutatedPart::assign(MergeTreeData::DataPartsVector parts_, MergeTreeDataPartType future_part_type)
{
if (parts_.empty())
return;
for (const MergeTreeData::DataPartPtr & part : parts_)
{
const MergeTreeData::DataPartPtr & first_part = parts_.front();
if (part->partition.value != first_part->partition.value)
throw Exception(
"Attempting to merge parts " + first_part->name + " and " + part->name + " that are in different partitions",
ErrorCodes::LOGICAL_ERROR);
}
parts = std::move(parts_);
UInt32 max_level = 0;
Int64 max_mutation = 0;
for (const auto & part : parts)
{
max_level = std::max(max_level, part->info.level);
max_mutation = std::max(max_mutation, part->info.mutation);
}
type = future_part_type;
part_info.partition_id = parts.front()->info.partition_id;
part_info.min_block = parts.front()->info.min_block;
part_info.max_block = parts.back()->info.max_block;
part_info.level = max_level + 1;
part_info.mutation = max_mutation;
if (parts.front()->storage.format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING)
{
DayNum min_date = DayNum(std::numeric_limits<UInt16>::max());
DayNum max_date = DayNum(std::numeric_limits<UInt16>::min());
for (const auto & part : parts)
{
/// NOTE: getting min and max dates from part names (instead of part data) because we want
/// the merged part name to be determined only by the source part names.
/// It is simpler this way when the real min and max dates for the block range can change
/// (e.g. after an ALTER DELETE command).
DayNum part_min_date;
DayNum part_max_date;
MergeTreePartInfo::parseMinMaxDatesFromPartName(part->name, part_min_date, part_max_date);
min_date = std::min(min_date, part_min_date);
max_date = std::max(max_date, part_max_date);
}
name = part_info.getPartNameV0(min_date, max_date);
}
else
name = part_info.getPartName();
}
void FutureMergedMutatedPart::updatePath(const MergeTreeData & storage, const IReservation * reservation)
{
path = storage.getFullPathOnDisk(reservation->getDisk()) + name + "/";
}
}
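A standalone sketch, using a hypothetical simplified PartInfo struct rather than the real MergeTreePartInfo, of how the merged part's block range, level and mutation are derived in assign() above:

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

struct PartInfo { int64_t min_block; int64_t max_block; uint32_t level; int64_t mutation; };

int main()
{
    // Three hypothetical source parts, sorted by block range.
    std::vector<PartInfo> parts{{1, 3, 1, 0}, {4, 4, 0, 5}, {5, 9, 2, 5}};

    uint32_t max_level = 0;
    int64_t max_mutation = 0;
    for (const auto & part : parts)
    {
        max_level = std::max(max_level, part.level);
        max_mutation = std::max(max_mutation, part.mutation);
    }

    // The merged part covers the whole range and sits one level above its sources.
    PartInfo merged{parts.front().min_block, parts.back().max_block, max_level + 1, max_mutation};
    std::cout << merged.min_block << '_' << merged.max_block << '_'
              << merged.level << '_' << merged.mutation << '\n'; // prints 1_9_3_5
}
```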

View File

@ -0,0 +1,49 @@
#pragma once
#include "common/types.h"
#include "Storages/MergeTree/MergeTreeData.h"
#include "Storages/MergeTree/MergeTreeDataPartType.h"
#include "Storages/MergeTree/MergeTreePartInfo.h"
#include "Storages/MergeTree/MergeType.h"
#include "Storages/MergeTree/IMergeTreeDataPart.h"
namespace DB
{
class MergeTreeData;
/// Auxiliary struct holding metainformation for the future merged or mutated part.
struct FutureMergedMutatedPart
{
String name;
UUID uuid = UUIDHelpers::Nil;
String path;
MergeTreeDataPartType type;
MergeTreePartInfo part_info;
MergeTreeData::DataPartsVector parts;
MergeType merge_type = MergeType::REGULAR;
const MergeTreePartition & getPartition() const { return parts.front()->partition; }
FutureMergedMutatedPart() = default;
explicit FutureMergedMutatedPart(MergeTreeData::DataPartsVector parts_)
{
assign(std::move(parts_));
}
FutureMergedMutatedPart(MergeTreeData::DataPartsVector parts_, MergeTreeDataPartType future_part_type)
{
assign(std::move(parts_), future_part_type);
}
void assign(MergeTreeData::DataPartsVector parts_);
void assign(MergeTreeData::DataPartsVector parts_, MergeTreeDataPartType future_part_type);
void updatePath(const MergeTreeData & storage, const IReservation * reservation);
};
using FutureMergedMutatedPartPtr = std::shared_ptr<FutureMergedMutatedPart>;
}

View File

@ -23,6 +23,7 @@ namespace DB
class IExecutableTask
{
public:
using TaskResultCallback = std::function<void(bool)>;
virtual bool executeStep() = 0;
virtual void onCompleted() = 0;
virtual StorageID getStorageID() = 0;
@ -62,7 +63,7 @@ public:
private:
bool res = false;
std::function<bool()> job_to_execute;
std::function<void(bool)> job_result_callback;
IExecutableTask::TaskResultCallback job_result_callback;
StorageID id;
};
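A minimal sketch of this contract (names are illustrative, the real interface has more members): executeStep() returns true while there is more work, so an executor can interleave many tasks, and a TaskResultCallback-style callback reports the outcome when a task retires:

```cpp
#include <functional>
#include <iostream>

struct CountdownTask
{
    int steps_left;
    std::function<void(bool)> on_completed; // plays the role of TaskResultCallback

    bool executeStep()
    {
        std::cout << "one unit of work, remaining: " << --steps_left << '\n';
        if (steps_left > 0)
            return true;      // true => schedule me again
        on_completed(true);   // done, report success
        return false;
    }
};

int main()
{
    CountdownTask task{3, [](bool ok) { std::cout << "finished, ok=" << ok << '\n'; }};
    while (task.executeStep()) {} // a trivial single-task executor loop
}
```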

View File

@ -68,6 +68,7 @@ void IMergeTreeDataPart::MinMaxIndex::load(const MergeTreeData & data, const Dis
auto minmax_column_names = data.getMinMaxColumnsNames(partition_key);
auto minmax_column_types = data.getMinMaxColumnsTypes(partition_key);
size_t minmax_idx_size = minmax_column_types.size();
hyperrectangle.reserve(minmax_idx_size);
for (size_t i = 0; i < minmax_idx_size; ++i)
{
@ -287,6 +288,8 @@ IMergeTreeDataPart::IMergeTreeDataPart(
state = State::Committed;
incrementStateMetric(state);
incrementTypeMetric(part_type);
minmax_idx = std::make_shared<MinMaxIndex>();
}
IMergeTreeDataPart::IMergeTreeDataPart(
@ -310,6 +313,8 @@ IMergeTreeDataPart::IMergeTreeDataPart(
state = State::Committed;
incrementStateMetric(state);
incrementTypeMetric(part_type);
minmax_idx = std::make_shared<MinMaxIndex>();
}
IMergeTreeDataPart::~IMergeTreeDataPart()
@ -360,9 +365,9 @@ IMergeTreeDataPart::State IMergeTreeDataPart::getState() const
std::pair<DayNum, DayNum> IMergeTreeDataPart::getMinMaxDate() const
{
if (storage.minmax_idx_date_column_pos != -1 && minmax_idx.initialized)
if (storage.minmax_idx_date_column_pos != -1 && minmax_idx->initialized)
{
const auto & hyperrectangle = minmax_idx.hyperrectangle[storage.minmax_idx_date_column_pos];
const auto & hyperrectangle = minmax_idx->hyperrectangle[storage.minmax_idx_date_column_pos];
return {DayNum(hyperrectangle.left.get<UInt64>()), DayNum(hyperrectangle.right.get<UInt64>())};
}
else
@ -371,9 +376,9 @@ std::pair<DayNum, DayNum> IMergeTreeDataPart::getMinMaxDate() const
std::pair<time_t, time_t> IMergeTreeDataPart::getMinMaxTime() const
{
if (storage.minmax_idx_time_column_pos != -1 && minmax_idx.initialized)
if (storage.minmax_idx_time_column_pos != -1 && minmax_idx->initialized)
{
const auto & hyperrectangle = minmax_idx.hyperrectangle[storage.minmax_idx_time_column_pos];
const auto & hyperrectangle = minmax_idx->hyperrectangle[storage.minmax_idx_time_column_pos];
/// The case of DateTime
if (hyperrectangle.left.getType() == Field::Types::UInt64)
@ -786,7 +791,7 @@ void IMergeTreeDataPart::loadPartitionAndMinMaxIndex()
const auto & date_lut = DateLUT::instance();
partition = MergeTreePartition(date_lut.toNumYYYYMM(min_date));
minmax_idx = MinMaxIndex(min_date, max_date);
minmax_idx = std::make_shared<MinMaxIndex>(min_date, max_date);
}
else
{
@ -798,9 +803,9 @@ void IMergeTreeDataPart::loadPartitionAndMinMaxIndex()
{
if (parent_part)
// projection parts don't have a minmax_idx of their own, so it's always considered initialized
minmax_idx.initialized = true;
minmax_idx->initialized = true;
else
minmax_idx.load(storage, volume->getDisk(), path);
minmax_idx->load(storage, volume->getDisk(), path);
}
if (parent_part)
return;

View File

@ -288,7 +288,9 @@ public:
void merge(const MinMaxIndex & other);
};
MinMaxIndex minmax_idx;
using MinMaxIndexPtr = std::shared_ptr<MinMaxIndex>;
MinMaxIndexPtr minmax_idx;
Checksums checksums;

View File

@ -18,7 +18,7 @@ public:
using WrittenOffsetColumns = std::set<std::string>;
const MergeTreeIndexGranularity & getIndexGranularity()
const MergeTreeIndexGranularity & getIndexGranularity() const
{
return writer->getIndexGranularity();
}
@ -45,4 +45,6 @@ protected:
IMergeTreeDataPart::MergeTreeWriterPtr writer;
};
using IMergedBlockOutputStreamPtr = std::shared_ptr<IMergedBlockOutputStream>;
}

View File

@ -0,0 +1,277 @@
#include <Storages/MergeTree/MergeFromLogEntryTask.h>
#include <common/logger_useful.h>
#include <Common/ProfileEvents.h>
#include <Storages/StorageReplicatedMergeTree.h>
namespace ProfileEvents
{
extern const Event DataAfterMergeDiffersFromReplica;
extern const Event ReplicatedPartMerges;
}
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_DATA_PART_NAME;
extern const int LOGICAL_ERROR;
}
std::pair<bool, ReplicatedMergeMutateTaskBase::PartLogWriter> MergeFromLogEntryTask::prepare()
{
LOG_TRACE(log, "Executing log entry to merge parts {} to {}",
fmt::join(entry.source_parts, ", "), entry.new_part_name);
const auto storage_settings_ptr = storage.getSettings();
if (storage_settings_ptr->always_fetch_merged_part)
{
LOG_INFO(log, "Will fetch part {} because setting 'always_fetch_merged_part' is true", entry.new_part_name);
return {false, {}};
}
if (entry.merge_type == MergeType::TTL_RECOMPRESS &&
(time(nullptr) - entry.create_time) <= storage_settings_ptr->try_fetch_recompressed_part_timeout.totalSeconds() &&
entry.source_replica != storage.replica_name)
{
LOG_INFO(log, "Will try to fetch part {} until '{}' because this part assigned to recompression merge. "
"Source replica {} will try to merge this part first", entry.new_part_name,
DateLUT::instance().timeToString(entry.create_time + storage_settings_ptr->try_fetch_recompressed_part_timeout.totalSeconds()), entry.source_replica);
return {false, {}};
}
/// In some use cases merging can be more expensive than fetching
/// and it may be better to spread merge tasks across the replicas
/// instead of doing exactly the same merge cluster-wise
std::optional<String> replica_to_execute_merge;
bool replica_to_execute_merge_picked = false;
if (storage.merge_strategy_picker.shouldMergeOnSingleReplica(entry))
{
replica_to_execute_merge = storage.merge_strategy_picker.pickReplicaToExecuteMerge(entry);
replica_to_execute_merge_picked = true;
if (replica_to_execute_merge)
{
LOG_DEBUG(log,
"Prefer fetching part {} from replica {} due to execute_merges_on_single_replica_time_threshold",
entry.new_part_name, replica_to_execute_merge.value());
return {false, {}};
}
}
for (const String & source_part_name : entry.source_parts)
{
MergeTreeData::DataPartPtr source_part_or_covering = storage.getActiveContainingPart(source_part_name);
if (!source_part_or_covering)
{
/// We do not have one of source parts locally, try to take some already merged part from someone.
LOG_DEBUG(log, "Don't have all parts for merge {}; will try to fetch it instead", entry.new_part_name);
return {false, {}};
}
if (source_part_or_covering->name != source_part_name)
{
/// We do not have source part locally, but we have some covering part. Possible options:
/// 1. We already have merged part (source_part_or_covering->name == new_part_name)
/// 2. We have some larger merged part which covers new_part_name (and therefore it covers source_part_name too)
/// 3. We have two intersecting parts, both cover source_part_name. It's a logical error.
/// TODO Why 1 and 2 can happen? Do we need more assertions here or somewhere else?
constexpr const char * message = "Part {} is covered by {} but should be merged into {}. This shouldn't happen often.";
LOG_WARNING(log, message, source_part_name, source_part_or_covering->name, entry.new_part_name);
if (!source_part_or_covering->info.contains(MergeTreePartInfo::fromPartName(entry.new_part_name, storage.format_version)))
throw Exception(ErrorCodes::LOGICAL_ERROR, message, source_part_name, source_part_or_covering->name, entry.new_part_name);
return {false, {}};
}
parts.push_back(source_part_or_covering);
}
/// All source parts are found locally, we can execute merge
if (entry.create_time + storage_settings_ptr->prefer_fetch_merged_part_time_threshold.totalSeconds() <= time(nullptr))
{
/// If the entry is old enough and big enough, and the merged part already exists on some replica,
/// then prefer fetching the merged part from that replica.
size_t sum_parts_bytes_on_disk = 0;
for (const auto & item : parts)
sum_parts_bytes_on_disk += item->getBytesOnDisk();
if (sum_parts_bytes_on_disk >= storage_settings_ptr->prefer_fetch_merged_part_size_threshold)
{
String replica = storage.findReplicaHavingPart(entry.new_part_name, true); /// NOTE excessive ZK requests for same data later, may remove.
if (!replica.empty())
{
LOG_DEBUG(log, "Prefer to fetch {} from replica {}", entry.new_part_name, replica);
return {false, {}};
}
}
}
/// Start the main work
size_t estimated_space_for_merge = MergeTreeDataMergerMutator::estimateNeededDiskSpace(parts);
/// Can throw an exception while reserving space.
IMergeTreeDataPart::TTLInfos ttl_infos;
size_t max_volume_index = 0;
for (auto & part_ptr : parts)
{
ttl_infos.update(part_ptr->ttl_infos);
max_volume_index = std::max(max_volume_index, storage.getStoragePolicy()->getVolumeIndexByDisk(part_ptr->volume->getDisk()));
}
/// It will live until the whole task is destroyed
table_lock_holder = storage.lockForShare(RWLockImpl::NO_QUERY, storage_settings_ptr->lock_acquire_timeout_for_background_operations);
StorageMetadataPtr metadata_snapshot = storage.getInMemoryMetadataPtr();
auto future_merged_part = std::make_shared<FutureMergedMutatedPart>(parts, entry.new_part_type);
if (future_merged_part->name != entry.new_part_name)
{
throw Exception("Future merged part name " + backQuote(future_merged_part->name) + " differs from part name in log entry: "
+ backQuote(entry.new_part_name), ErrorCodes::BAD_DATA_PART_NAME);
}
std::optional<CurrentlySubmergingEmergingTagger> tagger;
ReservationSharedPtr reserved_space = storage.balancedReservation(
metadata_snapshot,
estimated_space_for_merge,
max_volume_index,
future_merged_part->name,
future_merged_part->part_info,
future_merged_part->parts,
&tagger,
&ttl_infos);
if (!reserved_space)
reserved_space = storage.reserveSpacePreferringTTLRules(
metadata_snapshot, estimated_space_for_merge, ttl_infos, time(nullptr), max_volume_index);
future_merged_part->uuid = entry.new_part_uuid;
future_merged_part->updatePath(storage, reserved_space.get());
future_merged_part->merge_type = entry.merge_type;
if (storage_settings_ptr->allow_remote_fs_zero_copy_replication)
{
if (auto disk = reserved_space->getDisk(); disk->getType() == DB::DiskType::S3)
{
if (storage.merge_strategy_picker.shouldMergeOnSingleReplicaShared(entry))
{
if (!replica_to_execute_merge_picked)
replica_to_execute_merge = storage.merge_strategy_picker.pickReplicaToExecuteMerge(entry);
if (replica_to_execute_merge)
{
LOG_DEBUG(log,
"Prefer fetching part {} from replica {} due s3_execute_merges_on_single_replica_time_threshold",
entry.new_part_name, replica_to_execute_merge.value());
return {false, {}};
}
}
}
}
/// Account TTL merge
if (isTTLMergeType(future_merged_part->merge_type))
storage.getContext()->getMergeList().bookMergeWithTTL();
auto table_id = storage.getStorageID();
/// Add merge to list
merge_mutate_entry = storage.getContext()->getMergeList().insert(storage.getStorageID(), future_merged_part);
transaction_ptr = std::make_unique<MergeTreeData::Transaction>(storage);
stopwatch_ptr = std::make_unique<Stopwatch>();
merge_task = storage.merger_mutator.mergePartsToTemporaryPart(
future_merged_part,
metadata_snapshot,
merge_mutate_entry.get(),
table_lock_holder,
entry.create_time,
storage.getContext(),
reserved_space,
entry.deduplicate,
entry.deduplicate_by_columns,
storage.merging_params);
return {true, [this, stopwatch = *stopwatch_ptr] (const ExecutionStatus & execution_status)
{
storage.writePartLog(
PartLogElement::MERGE_PARTS, execution_status, stopwatch.elapsed(),
entry.new_part_name, part, parts, merge_mutate_entry.get());
}};
}
bool MergeFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWriter write_part_log)
{
part = merge_task->getFuture().get();
/// Task is not needed
merge_task.reset();
storage.merger_mutator.renameMergedTemporaryPart(part, parts, transaction_ptr.get());
try
{
storage.checkPartChecksumsAndCommit(*transaction_ptr, part);
}
catch (const Exception & e)
{
if (MergeTreeDataPartChecksums::isBadChecksumsErrorCode(e.code()))
{
transaction_ptr->rollback();
ProfileEvents::increment(ProfileEvents::DataAfterMergeDiffersFromReplica);
LOG_ERROR(log,
"{}. Data after merge is not byte-identical to data on another replicas. There could be several"
" reasons: 1. Using newer version of compression library after server update. 2. Using another"
" compression method. 3. Non-deterministic compression algorithm (highly unlikely). 4."
" Non-deterministic merge algorithm due to logical error in code. 5. Data corruption in memory due"
" to bug in code. 6. Data corruption in memory due to hardware issue. 7. Manual modification of"
" source data after server startup. 8. Manual modification of checksums stored in ZooKeeper. 9."
" Part format related settings like 'enable_mixed_granularity_parts' are different on different"
" replicas. We will download merged part from replica to force byte-identical result.",
getCurrentExceptionMessage(false));
write_part_log(ExecutionStatus::fromCurrentException());
if (storage.getSettings()->detach_not_byte_identical_parts)
storage.forgetPartAndMoveToDetached(std::move(part), "merge-not-byte-identical");
else
storage.tryRemovePartImmediately(std::move(part));
/// No need to delete the part from ZK because we can be sure that the commit transaction
/// didn't go through.
return false;
}
throw;
}
/** Removing old parts from ZK and from the disk is delayed - see ReplicatedMergeTreeCleanupThread, clearOldParts.
*/
/** With `ZSESSIONEXPIRED` or `ZOPERATIONTIMEOUT`, we can inadvertently roll back local changes to the parts.
* This is not a problem, because in this case the merge will remain in the queue, and we will try again.
*/
storage.merge_selecting_task->schedule();
ProfileEvents::increment(ProfileEvents::ReplicatedPartMerges);
write_part_log({});
return true;
}
}

View File

@ -0,0 +1,49 @@
#pragma once
#include <memory>
#include <utility>
#include <Storages/MergeTree/IExecutableTask.h>
#include <Storages/MergeTree/MergeTask.h>
#include <Storages/MergeTree/ReplicatedMergeTreeQueue.h>
#include <Storages/MergeTree/ReplicatedMergeTreeLogEntry.h>
#include <Storages/MergeTree/ReplicatedMergeMutateTaskBase.h>
namespace DB
{
class MergeFromLogEntryTask : public shared_ptr_helper<MergeFromLogEntryTask>, public ReplicatedMergeMutateTaskBase
{
public:
template <class Callback>
MergeFromLogEntryTask(ReplicatedMergeTreeQueue::SelectedEntryPtr selected_entry_, StorageReplicatedMergeTree & storage_, Callback && task_result_callback_)
: ReplicatedMergeMutateTaskBase(&Poco::Logger::get("MergeFromLogEntryTask"), storage_, selected_entry_, task_result_callback_) {}
protected:
/// Both return false if we can't execute the merge.
std::pair<bool, ReplicatedMergeMutateTaskBase::PartLogWriter> prepare() override;
bool finalize(ReplicatedMergeMutateTaskBase::PartLogWriter write_part_log) override;
bool executeInnerTask() override
{
return merge_task->execute();
}
private:
TableLockHolder table_lock_holder{nullptr};
MergeTreeData::DataPartsVector parts;
MergeTreeData::TransactionUniquePtr transaction_ptr{nullptr};
StopwatchUniquePtr stopwatch_ptr{nullptr};
MergeTreeData::MutableDataPartPtr part;
MergeTaskPtr merge_task;
};
using MergeFromLogEntryTaskPtr = std::shared_ptr<MergeFromLogEntryTask>;
}

View File

@ -1,5 +1,6 @@
#include <Storages/MergeTree/MergeList.h>
#include <Storages/MergeTree/MergeTreeDataMergerMutator.h>
#include <Storages/MergeTree/FutureMergedMutatedPart.h>
#include <Common/CurrentMetrics.h>
#include <common/getThreadId.h>
#include <Common/CurrentThread.h>
@ -8,34 +9,10 @@
namespace DB
{
MergeListElement::MergeListElement(const StorageID & table_id_, const FutureMergedMutatedPart & future_part)
: table_id{table_id_}
, partition_id{future_part.part_info.partition_id}
, result_part_name{future_part.name}
, result_part_path{future_part.path}
, result_part_info{future_part.part_info}
, num_parts{future_part.parts.size()}
, thread_id{getThreadId()}
, merge_type{future_part.merge_type}
, merge_algorithm{MergeAlgorithm::Undecided}
MemoryTrackerThreadSwitcher::MemoryTrackerThreadSwitcher(MemoryTracker * memory_tracker_ptr)
{
for (const auto & source_part : future_part.parts)
{
source_part_names.emplace_back(source_part->name);
source_part_paths.emplace_back(source_part->getFullPath());
total_size_bytes_compressed += source_part->getBytesOnDisk();
total_size_marks += source_part->getMarksCount();
total_rows_count += source_part->index_granularity.getTotalRows();
}
if (!future_part.parts.empty())
{
source_data_version = future_part.parts[0]->info.getDataVersion();
is_mutation = (result_part_info.getDataVersion() != source_data_version);
}
/// Each merge is executed into separate background processing pool thread
// Each merge is executed in a separate background processing pool thread
background_thread_memory_tracker = CurrentThread::getMemoryTracker();
if (background_thread_memory_tracker)
{
@ -52,7 +29,44 @@ MergeListElement::MergeListElement(const StorageID & table_id_, const FutureMerg
}
background_thread_memory_tracker_prev_parent = background_thread_memory_tracker->getParent();
background_thread_memory_tracker->setParent(&memory_tracker);
background_thread_memory_tracker->setParent(memory_tracker_ptr);
}
}
MemoryTrackerThreadSwitcher::~MemoryTrackerThreadSwitcher()
{
// Unplug memory_tracker from current background processing pool thread
if (background_thread_memory_tracker)
background_thread_memory_tracker->setParent(background_thread_memory_tracker_prev_parent);
}
MergeListElement::MergeListElement(const StorageID & table_id_, FutureMergedMutatedPartPtr future_part)
: table_id{table_id_}
, partition_id{future_part->part_info.partition_id}
, result_part_name{future_part->name}
, result_part_path{future_part->path}
, result_part_info{future_part->part_info}
, num_parts{future_part->parts.size()}
, thread_id{getThreadId()}
, merge_type{future_part->merge_type}
, merge_algorithm{MergeAlgorithm::Undecided}
{
for (const auto & source_part : future_part->parts)
{
source_part_names.emplace_back(source_part->name);
source_part_paths.emplace_back(source_part->getFullPath());
total_size_bytes_compressed += source_part->getBytesOnDisk();
total_size_marks += source_part->getMarksCount();
total_rows_count += source_part->index_granularity.getTotalRows();
}
if (!future_part->parts.empty())
{
source_data_version = future_part->parts[0]->info.getDataVersion();
is_mutation = (result_part_info.getDataVersion() != source_data_version);
}
}
@ -90,11 +104,7 @@ MergeInfo MergeListElement::getInfo() const
return res;
}
MergeListElement::~MergeListElement()
{
/// Unplug memory_tracker from current background processing pool thread
if (background_thread_memory_tracker)
background_thread_memory_tracker->setParent(background_thread_memory_tracker_prev_parent);
}
MergeListElement::~MergeListElement() = default;
}

View File

@ -53,6 +53,23 @@ struct MergeInfo
};
struct FutureMergedMutatedPart;
using FutureMergedMutatedPartPtr = std::shared_ptr<FutureMergedMutatedPart>;
/**
* Since merge is executed with multiple threads, this class
* switches the parent MemoryTracker to account for all the memory used.
*/
class MemoryTrackerThreadSwitcher : boost::noncopyable
{
public:
explicit MemoryTrackerThreadSwitcher(MemoryTracker * memory_tracker_ptr);
~MemoryTrackerThreadSwitcher();
private:
MemoryTracker * background_thread_memory_tracker;
MemoryTracker * background_thread_memory_tracker_prev_parent = nullptr;
};
using MemoryTrackerThreadSwitcherPtr = std::unique_ptr<MemoryTrackerThreadSwitcher>;
struct MergeListElement : boost::noncopyable
{
@ -87,15 +104,13 @@ struct MergeListElement : boost::noncopyable
std::atomic<UInt64> columns_written{};
MemoryTracker memory_tracker{VariableContext::Process};
MemoryTracker * background_thread_memory_tracker;
MemoryTracker * background_thread_memory_tracker_prev_parent = nullptr;
UInt64 thread_id;
MergeType merge_type;
/// Detected after merge already started
std::atomic<MergeAlgorithm> merge_algorithm;
MergeListElement(const StorageID & table_id_, const FutureMergedMutatedPart & future_part);
MergeListElement(const StorageID & table_id_, FutureMergedMutatedPartPtr future_part);
MergeInfo getInfo() const;
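MemoryTrackerThreadSwitcher is a plain RAII re-parenting guard; a standalone sketch with a hypothetical Tracker type (the real MemoryTracker has a much richer interface):

```cpp
#include <cassert>

struct Tracker { Tracker * parent = nullptr; };

thread_local Tracker thread_tracker;

class ParentSwitcher
{
public:
    explicit ParentSwitcher(Tracker * merge_tracker)
        : prev_parent(thread_tracker.parent)
    {
        thread_tracker.parent = merge_tracker; // account this thread's memory to the merge
    }
    ~ParentSwitcher() { thread_tracker.parent = prev_parent; } // restore on scope exit

private:
    Tracker * prev_parent;
};

int main()
{
    Tracker merge_tracker;
    {
        ParentSwitcher switcher(&merge_tracker);
        assert(thread_tracker.parent == &merge_tracker);
    }
    assert(thread_tracker.parent == nullptr); // previous parent restored
}
```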

View File

@ -0,0 +1,51 @@
#pragma once
#include <Storages/MergeTree/FutureMergedMutatedPart.h>
#include <Storages/MutationCommands.h>
namespace DB
{
class StorageMergeTree;
struct StorageInMemoryMetadata;
using StorageMetadataPtr = std::shared_ptr<const StorageInMemoryMetadata>;
struct FutureMergedMutatedPart;
using FutureMergedMutatedPartPtr = std::shared_ptr<FutureMergedMutatedPart>;
struct CurrentlyMergingPartsTagger
{
FutureMergedMutatedPartPtr future_part;
ReservationSharedPtr reserved_space;
StorageMergeTree & storage;
// Optional tagger to maintain volatile parts for the JBOD balancer
std::optional<CurrentlySubmergingEmergingTagger> tagger;
CurrentlyMergingPartsTagger(
FutureMergedMutatedPartPtr future_part_,
size_t total_size,
StorageMergeTree & storage_,
const StorageMetadataPtr & metadata_snapshot,
bool is_mutation);
~CurrentlyMergingPartsTagger();
};
using CurrentlyMergingPartsTaggerPtr = std::unique_ptr<CurrentlyMergingPartsTagger>;
struct MergeMutateSelectedEntry
{
FutureMergedMutatedPartPtr future_part;
CurrentlyMergingPartsTaggerPtr tagger;
MutationCommandsConstPtr commands;
MergeMutateSelectedEntry(FutureMergedMutatedPartPtr future_part_, CurrentlyMergingPartsTaggerPtr tagger_, MutationCommandsConstPtr commands_)
: future_part(future_part_)
, tagger(std::move(tagger_))
, commands(commands_)
{}
};
using MergeMutateSelectedEntryPtr = std::shared_ptr<MergeMutateSelectedEntry>;
}

View File

@ -0,0 +1,115 @@
#include <Storages/MergeTree/MergePlainMergeTreeTask.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/StorageMergeTree.h>
#include <Storages/MergeTree/MergeTreeDataMergerMutator.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
StorageID MergePlainMergeTreeTask::getStorageID()
{
return storage.getStorageID();
}
void MergePlainMergeTreeTask::onCompleted()
{
bool delay = state == State::SUCCESS;
task_result_callback(delay);
}
bool MergePlainMergeTreeTask::executeStep()
{
/// Make our memory tracker a parent of the current thread's memory tracker
MemoryTrackerThreadSwitcherPtr switcher;
if (merge_list_entry)
switcher = std::make_unique<MemoryTrackerThreadSwitcher>(&(*merge_list_entry)->memory_tracker);
switch (state)
{
case State::NEED_PREPARE :
{
prepare();
state = State::NEED_EXECUTE;
return true;
}
case State::NEED_EXECUTE :
{
try
{
if (merge_task->execute())
return true;
state = State::NEED_FINISH;
return true;
}
catch (...)
{
write_part_log(ExecutionStatus::fromCurrentException());
throw;
}
}
case State::NEED_FINISH :
{
finish();
state = State::SUCCESS;
return false;
}
case State::SUCCESS:
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "Task with state SUCCESS mustn't be executed again");
}
}
return false;
}
void MergePlainMergeTreeTask::prepare()
{
future_part = merge_mutate_entry->future_part;
stopwatch_ptr = std::make_unique<Stopwatch>();
merge_list_entry = storage.getContext()->getMergeList().insert(storage.getStorageID(), future_part);
write_part_log = [this] (const ExecutionStatus & execution_status)
{
storage.writePartLog(
PartLogElement::MERGE_PARTS,
execution_status,
stopwatch_ptr->elapsed(),
future_part->name,
new_part,
future_part->parts,
merge_list_entry.get());
};
merge_task = storage.merger_mutator.mergePartsToTemporaryPart(
future_part,
metadata_snapshot,
merge_list_entry.get(),
table_lock_holder,
time(nullptr),
storage.getContext(),
merge_mutate_entry->tagger->reserved_space,
deduplicate,
deduplicate_by_columns,
storage.merging_params);
}
void MergePlainMergeTreeTask::finish()
{
new_part = merge_task->getFuture().get();
storage.merger_mutator.renameMergedTemporaryPart(new_part, future_part->parts, nullptr);
write_part_log({});
}
}
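A standalone sketch of the NEED_PREPARE → NEED_EXECUTE → NEED_FINISH → SUCCESS state machine driven through executeStep(), with the per-block merge work faked by a counter:

```cpp
#include <iostream>

enum class State { NEED_PREPARE, NEED_EXECUTE, NEED_FINISH, SUCCESS };

struct PlainTask
{
    State state = State::NEED_PREPARE;
    int blocks = 2; // stand-in for the real per-block merge work

    bool executeStep()
    {
        switch (state)
        {
            case State::NEED_PREPARE:
                std::cout << "prepare\n";
                state = State::NEED_EXECUTE;
                return true;
            case State::NEED_EXECUTE:
                if (blocks-- > 0) { std::cout << "merge one block\n"; return true; }
                state = State::NEED_FINISH;
                return true;
            case State::NEED_FINISH:
                std::cout << "finish\n";
                state = State::SUCCESS;
                return false; // done, don't reschedule
            case State::SUCCESS:
                return false;
        }
        return false;
    }
};

int main() { PlainTask task; while (task.executeStep()) {} }
```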

View File

@ -0,0 +1,88 @@
#pragma once
#include <Storages/MergeTree/IExecutableTask.h>
#include <Storages/MergeTree/MergeTask.h>
#include <Storages/MutationCommands.h>
#include <Storages/MergeTree/MergeMutateSelectedEntry.h>
namespace DB
{
class StorageMergeTree;
class MergePlainMergeTreeTask : public IExecutableTask
{
public:
template <class Callback>
MergePlainMergeTreeTask(
StorageMergeTree & storage_,
StorageMetadataPtr metadata_snapshot_,
bool deduplicate_,
Names deduplicate_by_columns_,
MergeMutateSelectedEntryPtr merge_mutate_entry_,
TableLockHolder table_lock_holder_,
Callback && task_result_callback_)
: storage(storage_)
, metadata_snapshot(metadata_snapshot_)
, deduplicate(deduplicate_)
, deduplicate_by_columns(deduplicate_by_columns_)
, merge_mutate_entry(merge_mutate_entry_)
, table_lock_holder(table_lock_holder_)
, task_result_callback(task_result_callback_) {}
bool executeStep() override;
void onCompleted() override;
StorageID getStorageID() override;
private:
void prepare();
void finish();
enum class State
{
NEED_PREPARE,
NEED_EXECUTE,
NEED_FINISH,
SUCCESS
};
State state{State::NEED_PREPARE};
StorageMergeTree & storage;
StorageMetadataPtr metadata_snapshot;
bool deduplicate;
Names deduplicate_by_columns;
std::shared_ptr<MergeMutateSelectedEntry> merge_mutate_entry{nullptr};
TableLockHolder table_lock_holder;
FutureMergedMutatedPartPtr future_part{nullptr};
MergeTreeData::MutableDataPartPtr new_part;
std::unique_ptr<Stopwatch> stopwatch_ptr{nullptr};
using MergeListEntryPtr = std::unique_ptr<MergeListEntry>;
MergeListEntryPtr merge_list_entry;
std::function<void(const ExecutionStatus &)> write_part_log;
IExecutableTask::TaskResultCallback task_result_callback;
MergeTaskPtr merge_task{nullptr};
};
using MergePlainMergeTreeTaskPtr = std::shared_ptr<MergePlainMergeTreeTask>;
[[ maybe_unused ]] static void executeHere(MergePlainMergeTreeTaskPtr task)
{
while (task->executeStep()) {}
}
}

View File

@ -0,0 +1,94 @@
#pragma once
#include "common/types.h"
#include "Common/ProfileEvents.h"
#include "IO/Progress.h"
#include "Storages/MergeTree/MergeList.h"
namespace ProfileEvents
{
extern const Event MergesTimeMilliseconds;
extern const Event MergedUncompressedBytes;
extern const Event MergedRows;
extern const Event Merge;
}
namespace DB
{
/** Progress callback.
* What it should update:
* - approximate progress
* - the number of rows read
* - various metrics
* - the time elapsed for the current merge.
*/
/// Auxiliary struct that stores, for each merge stage, its current progress.
/// A stage is: the horizontal stage + a stage for each gathered column (if we are doing a
/// Vertical merge) or a mutation of a single part. During a single stage all rows are read.
struct MergeStageProgress
{
explicit MergeStageProgress(Float64 weight_)
: is_first(true) , weight(weight_) {}
MergeStageProgress(Float64 initial_progress_, Float64 weight_)
: initial_progress(initial_progress_), is_first(false), weight(weight_) {}
Float64 initial_progress = 0.0;
bool is_first;
Float64 weight;
UInt64 total_rows = 0;
UInt64 rows_read = 0;
};
class MergeProgressCallback
{
public:
MergeProgressCallback(
MergeList::Entry & merge_entry_, UInt64 & watch_prev_elapsed_, MergeStageProgress & stage_)
: merge_entry(merge_entry_)
, watch_prev_elapsed(watch_prev_elapsed_)
, stage(stage_)
{
updateWatch();
}
MergeList::Entry & merge_entry;
UInt64 & watch_prev_elapsed;
MergeStageProgress & stage;
void updateWatch()
{
UInt64 watch_curr_elapsed = merge_entry->watch.elapsed();
ProfileEvents::increment(ProfileEvents::MergesTimeMilliseconds, (watch_curr_elapsed - watch_prev_elapsed) / 1000000);
watch_prev_elapsed = watch_curr_elapsed;
}
void operator() (const Progress & value)
{
ProfileEvents::increment(ProfileEvents::MergedUncompressedBytes, value.read_bytes);
if (stage.is_first)
{
ProfileEvents::increment(ProfileEvents::MergedRows, value.read_rows);
ProfileEvents::increment(ProfileEvents::Merge);
}
updateWatch();
merge_entry->bytes_read_uncompressed += value.read_bytes;
if (stage.is_first)
merge_entry->rows_read += value.read_rows;
stage.total_rows += value.total_rows_to_read;
stage.rows_read += value.read_rows;
if (stage.total_rows > 0)
{
merge_entry->progress.store(
stage.initial_progress + stage.weight * stage.rows_read / stage.total_rows,
std::memory_order_relaxed);
}
}
};
}
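A worked example of the formula at the end of operator(), progress = initial_progress + weight * rows_read / total_rows, with hypothetical numbers:

```cpp
#include <iostream>

int main()
{
    // Suppose the horizontal stage (weight 0.4) is done and we are gathering
    // a column whose weight is 0.6, with half of its rows read so far.
    const double initial_progress = 0.4;
    const double weight = 0.6;
    const double rows_read = 500, total_rows = 1000;

    std::cout << initial_progress + weight * rows_read / total_rows << '\n'; // 0.7
}
```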

View File

@ -0,0 +1,857 @@
#include "Storages/MergeTree/MergeTask.h"
#include <memory>
#include <fmt/format.h>
#include <common/logger_useful.h>
#include "Common/ActionBlocker.h"
#include "Storages/MergeTree/MergeTreeData.h"
#include "Storages/MergeTree/IMergeTreeDataPart.h"
#include "Storages/MergeTree/MergeTreeSequentialSource.h"
#include "Storages/MergeTree/FutureMergedMutatedPart.h"
#include "Processors/Transforms/ExpressionTransform.h"
#include "Processors/Merges/MergingSortedTransform.h"
#include "Processors/Merges/CollapsingSortedTransform.h"
#include "Processors/Merges/SummingSortedTransform.h"
#include "Processors/Merges/ReplacingSortedTransform.h"
#include "Processors/Merges/GraphiteRollupSortedTransform.h"
#include "Processors/Merges/AggregatingSortedTransform.h"
#include "Processors/Merges/VersionedCollapsingTransform.h"
#include "Processors/Executors/PipelineExecutingBlockInputStream.h"
#include "DataStreams/DistinctSortedBlockInputStream.h"
#include "DataStreams/TTLBlockInputStream.h"
#include <DataStreams/TTLCalcInputStream.h>
#include <DataStreams/ExpressionBlockInputStream.h>
#include <DataStreams/MaterializingBlockInputStream.h>
#include <DataStreams/DistinctSortedBlockInputStream.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ABORTED;
extern const int DIRECTORY_ALREADY_EXISTS;
extern const int LOGICAL_ERROR;
}
/// PK columns are sorted and merged; ordinary columns are gathered using info from the merge step
static void extractMergingAndGatheringColumns(
const NamesAndTypesList & storage_columns,
const ExpressionActionsPtr & sorting_key_expr,
const IndicesDescription & indexes,
const MergeTreeData::MergingParams & merging_params,
NamesAndTypesList & gathering_columns, Names & gathering_column_names,
NamesAndTypesList & merging_columns, Names & merging_column_names)
{
Names sort_key_columns_vec = sorting_key_expr->getRequiredColumns();
std::set<String> key_columns(sort_key_columns_vec.cbegin(), sort_key_columns_vec.cend());
for (const auto & index : indexes)
{
Names index_columns_vec = index.expression->getRequiredColumns();
std::copy(index_columns_vec.cbegin(), index_columns_vec.cend(),
std::inserter(key_columns, key_columns.end()));
}
/// Force sign column for Collapsing mode
if (merging_params.mode == MergeTreeData::MergingParams::Collapsing)
key_columns.emplace(merging_params.sign_column);
/// Force version column for Replacing mode
if (merging_params.mode == MergeTreeData::MergingParams::Replacing)
key_columns.emplace(merging_params.version_column);
/// Force sign column for VersionedCollapsing mode. Version is already in primary key.
if (merging_params.mode == MergeTreeData::MergingParams::VersionedCollapsing)
key_columns.emplace(merging_params.sign_column);
/// Force merging at least one column in case of an empty key
if (key_columns.empty())
key_columns.emplace(storage_columns.front().name);
/// TODO: also force "summing" and "aggregating" columns to make Horizontal merge only for such columns
for (const auto & column : storage_columns)
{
if (key_columns.count(column.name))
{
merging_columns.emplace_back(column);
merging_column_names.emplace_back(column.name);
}
else
{
gathering_columns.emplace_back(column);
gathering_column_names.emplace_back(column.name);
}
}
}
bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare()
{
const String local_tmp_prefix = global_ctx->parent_part ? ctx->prefix : "tmp_merge_";
if (global_ctx->merges_blocker->isCancelled())
throw Exception("Cancelled merging parts", ErrorCodes::ABORTED);
/// We don't want to perform a merge assigned with TTL as a normal merge, so we
/// throw an exception
if (isTTLMergeType(global_ctx->future_part->merge_type) && global_ctx->ttl_merges_blocker->isCancelled())
throw Exception("Cancelled merging parts with TTL", ErrorCodes::ABORTED);
LOG_DEBUG(ctx->log, "Merging {} parts: from {} to {} into {}",
global_ctx->future_part->parts.size(),
global_ctx->future_part->parts.front()->name,
global_ctx->future_part->parts.back()->name,
global_ctx->future_part->type.toString());
if (global_ctx->deduplicate)
{
if (global_ctx->deduplicate_by_columns.empty())
LOG_DEBUG(ctx->log, "DEDUPLICATE BY all columns");
else
LOG_DEBUG(ctx->log, "DEDUPLICATE BY ('{}')", fmt::join(global_ctx->deduplicate_by_columns, "', '"));
}
ctx->disk = global_ctx->space_reservation->getDisk();
auto local_new_part_tmp_path = global_ctx->data->relative_data_path + local_tmp_prefix + global_ctx->future_part->name + (global_ctx->parent_part ? ".proj" : "") + "/";
if (ctx->disk->exists(local_new_part_tmp_path))
throw Exception("Directory " + fullPath(ctx->disk, local_new_part_tmp_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS);
global_ctx->all_column_names = global_ctx->metadata_snapshot->getColumns().getNamesOfPhysical();
global_ctx->storage_columns = global_ctx->metadata_snapshot->getColumns().getAllPhysical();
extractMergingAndGatheringColumns(
global_ctx->storage_columns,
global_ctx->metadata_snapshot->getSortingKey().expression,
global_ctx->metadata_snapshot->getSecondaryIndices(),
ctx->merging_params,
global_ctx->gathering_columns,
global_ctx->gathering_column_names,
global_ctx->merging_columns,
global_ctx->merging_column_names);
auto local_single_disk_volume = std::make_shared<SingleDiskVolume>("volume_" + global_ctx->future_part->name, ctx->disk, 0);
global_ctx->new_data_part = global_ctx->data->createPart(
global_ctx->future_part->name,
global_ctx->future_part->type,
global_ctx->future_part->part_info,
local_single_disk_volume,
local_tmp_prefix + global_ctx->future_part->name + (global_ctx->parent_part ? ".proj" : ""),
global_ctx->parent_part.get());
global_ctx->new_data_part->uuid = global_ctx->future_part->uuid;
global_ctx->new_data_part->setColumns(global_ctx->storage_columns);
global_ctx->new_data_part->partition.assign(global_ctx->future_part->getPartition());
global_ctx->new_data_part->is_temp = global_ctx->parent_part == nullptr;
ctx->need_remove_expired_values = false;
ctx->force_ttl = false;
for (const auto & part : global_ctx->future_part->parts)
{
global_ctx->new_data_part->ttl_infos.update(part->ttl_infos);
if (global_ctx->metadata_snapshot->hasAnyTTL() && !part->checkAllTTLCalculated(global_ctx->metadata_snapshot))
{
LOG_INFO(ctx->log, "Some TTL values were not calculated for part {}. Will calculate them forcefully during merge.", part->name);
ctx->need_remove_expired_values = true;
ctx->force_ttl = true;
}
}
const auto & local_part_min_ttl = global_ctx->new_data_part->ttl_infos.part_min_ttl;
if (local_part_min_ttl && local_part_min_ttl <= global_ctx->time_of_merge)
ctx->need_remove_expired_values = true;
if (ctx->need_remove_expired_values && global_ctx->ttl_merges_blocker->isCancelled())
{
LOG_INFO(ctx->log, "Part {} has values with expired TTL, but merges with TTL are cancelled.", global_ctx->new_data_part->name);
ctx->need_remove_expired_values = false;
}
ctx->sum_input_rows_upper_bound = (*global_ctx->merge_entry)->total_rows_count;
ctx->sum_compressed_bytes_upper_bound = (*global_ctx->merge_entry)->total_size_bytes_compressed;
global_ctx->chosen_merge_algorithm = chooseMergeAlgorithm();
(*global_ctx->merge_entry)->merge_algorithm.store(global_ctx->chosen_merge_algorithm, std::memory_order_relaxed);
LOG_DEBUG(ctx->log, "Selected MergeAlgorithm: {}", toString(global_ctx->chosen_merge_algorithm));
/// Note: this is done before creating input streams, because otherwise data.data_parts_mutex
/// (which is locked in data.getTotalActiveSizeInBytes() and, in shared mode, when input
/// streams are created) would be taken in the reverse order relative to inserting new data.
/// This annoys TSan even though one lock is taken in shared mode and thus
/// deadlock is impossible.
ctx->compression_codec = global_ctx->data->getCompressionCodecForPart(
(*global_ctx->merge_entry)->total_size_bytes_compressed, global_ctx->new_data_part->ttl_infos, global_ctx->time_of_merge);
ctx->tmp_disk = global_ctx->context->getTemporaryVolume()->getDisk();
switch (global_ctx->chosen_merge_algorithm)
{
case MergeAlgorithm::Horizontal :
{
global_ctx->merging_columns = global_ctx->storage_columns;
global_ctx->merging_column_names = global_ctx->all_column_names;
global_ctx->gathering_columns.clear();
global_ctx->gathering_column_names.clear();
break;
}
case MergeAlgorithm::Vertical :
{
ctx->rows_sources_file = createTemporaryFile(ctx->tmp_disk->getPath());
ctx->rows_sources_uncompressed_write_buf = ctx->tmp_disk->writeFile(fileName(ctx->rows_sources_file->path()));
ctx->rows_sources_write_buf = std::make_unique<CompressedWriteBuffer>(*ctx->rows_sources_uncompressed_write_buf);
MergeTreeDataPartInMemory::ColumnToSize local_merged_column_to_size;
for (const MergeTreeData::DataPartPtr & part : global_ctx->future_part->parts)
part->accumulateColumnSizes(local_merged_column_to_size);
ctx->column_sizes = ColumnSizeEstimator(
std::move(local_merged_column_to_size),
global_ctx->merging_column_names,
global_ctx->gathering_column_names);
if (global_ctx->data->getSettings()->fsync_part_directory)
global_ctx->sync_guard = ctx->disk->getDirectorySyncGuard(local_new_part_tmp_path);
break;
}
default :
throw Exception("Merge algorithm must be chosen", ErrorCodes::LOGICAL_ERROR);
}
/// If merge is vertical we cannot calculate it
ctx->blocks_are_granules_size = (global_ctx->chosen_merge_algorithm == MergeAlgorithm::Vertical);
/// Merged stream will be created and available as merged_stream variable
createMergedStream();
global_ctx->to = std::make_shared<MergedBlockOutputStream>(
global_ctx->new_data_part,
global_ctx->metadata_snapshot,
global_ctx->merging_columns,
MergeTreeIndexFactory::instance().getMany(global_ctx->metadata_snapshot->getSecondaryIndices()),
ctx->compression_codec,
ctx->blocks_are_granules_size);
global_ctx->merged_stream->readPrefix();
/// TODO: const
const_cast<MergedBlockOutputStream&>(*global_ctx->to).writePrefix();
global_ctx->rows_written = 0;
ctx->initial_reservation = global_ctx->space_reservation ? global_ctx->space_reservation->getSize() : 0;
ctx->is_cancelled = [merges_blocker = global_ctx->merges_blocker,
ttl_merges_blocker = global_ctx->ttl_merges_blocker,
need_remove = ctx->need_remove_expired_values]() -> bool
{
return merges_blocker->isCancelled() || (need_remove && ttl_merges_blocker->isCancelled());
};
/// This is the end of preparation. Execution will be per block.
return false;
}
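/// Each stage owns a runtime context; when a stage finishes, the fields the next
/// stage still needs are moved into a fresh context object and handed over below.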
MergeTask::StageRuntimeContextPtr MergeTask::ExecuteAndFinalizeHorizontalPart::getContextForNextStage()
{
auto new_ctx = std::make_shared<VerticalMergeRuntimeContext>();
new_ctx->rows_sources_write_buf = std::move(ctx->rows_sources_write_buf);
new_ctx->rows_sources_uncompressed_write_buf = std::move(ctx->rows_sources_uncompressed_write_buf);
new_ctx->rows_sources_file = std::move(ctx->rows_sources_file);
new_ctx->column_sizes = std::move(ctx->column_sizes);
new_ctx->compression_codec = std::move(ctx->compression_codec);
new_ctx->tmp_disk = std::move(ctx->tmp_disk);
new_ctx->it_name_and_type = std::move(ctx->it_name_and_type);
new_ctx->column_num_for_vertical_merge = std::move(ctx->column_num_for_vertical_merge);
new_ctx->read_with_direct_io = std::move(ctx->read_with_direct_io);
new_ctx->need_sync = std::move(ctx->need_sync);
ctx.reset();
return new_ctx;
}
MergeTask::StageRuntimeContextPtr MergeTask::VerticalMergeStage::getContextForNextStage()
{
auto new_ctx = std::make_shared<MergeProjectionsRuntimeContext>();
new_ctx->need_sync = std::move(ctx->need_sync);
ctx.reset();
return new_ctx;
}
bool MergeTask::ExecuteAndFinalizeHorizontalPart::execute()
{
assert(subtasks_iterator != subtasks.end());
if ((*subtasks_iterator)())
return true;
/// Move to the next subtask in an array of subtasks
++subtasks_iterator;
return subtasks_iterator != subtasks.end();
}
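/// One call to executeImpl() processes one block: read it from the merged stream,
/// write it to the new part, update the progress counters and shrink the space
/// reservation accordingly. Returning true means there are more blocks to process.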
bool MergeTask::ExecuteAndFinalizeHorizontalPart::executeImpl()
{
Block block;
if (!ctx->is_cancelled() && (block = global_ctx->merged_stream->read()))
{
global_ctx->rows_written += block.rows();
const_cast<MergedBlockOutputStream &>(*global_ctx->to).write(block);
(*global_ctx->merge_entry)->rows_written = global_ctx->merged_stream->getProfileInfo().rows;
(*global_ctx->merge_entry)->bytes_written_uncompressed = global_ctx->merged_stream->getProfileInfo().bytes;
/// Reservation updates are not performed yet; during the merge this may lead to higher free space requirements
if (global_ctx->space_reservation && ctx->sum_input_rows_upper_bound)
{
/// The same progress from merge_entry could be used for both algorithms (it should be more accurate),
/// but for now we use the inaccurate row-based estimation in the Horizontal case for backward compatibility
Float64 progress = (global_ctx->chosen_merge_algorithm == MergeAlgorithm::Horizontal)
? std::min(1., 1. * global_ctx->rows_written / ctx->sum_input_rows_upper_bound)
: std::min(1., (*global_ctx->merge_entry)->progress.load(std::memory_order_relaxed));
global_ctx->space_reservation->update(static_cast<size_t>((1. - progress) * ctx->initial_reservation));
}
/// Need to execute again
return true;
}
global_ctx->merged_stream->readSuffix();
global_ctx->merged_stream.reset();
if (global_ctx->merges_blocker->isCancelled())
throw Exception("Cancelled merging parts", ErrorCodes::ABORTED);
if (ctx->need_remove_expired_values && global_ctx->ttl_merges_blocker->isCancelled())
throw Exception("Cancelled merging parts with expired TTL", ErrorCodes::ABORTED);
const auto data_settings = global_ctx->data->getSettings();
const size_t sum_compressed_bytes_upper_bound = (*global_ctx->merge_entry)->total_size_bytes_compressed;
ctx->need_sync = needSyncPart(ctx->sum_input_rows_upper_bound, sum_compressed_bytes_upper_bound, *data_settings);
return false;
}
bool MergeTask::VerticalMergeStage::prepareVerticalMergeForAllColumns() const
{
/// No need to execute this part if it is a horizontal merge.
if (global_ctx->chosen_merge_algorithm != MergeAlgorithm::Vertical)
return false;
size_t sum_input_rows_exact = (*global_ctx->merge_entry)->rows_read;
(*global_ctx->merge_entry)->columns_written = global_ctx->merging_column_names.size();
(*global_ctx->merge_entry)->progress.store(ctx->column_sizes->keyColumnsWeight(), std::memory_order_relaxed);
ctx->column_part_streams = BlockInputStreams(global_ctx->future_part->parts.size());
ctx->rows_sources_write_buf->next();
ctx->rows_sources_uncompressed_write_buf->next();
/// Ensure the data has been written to disk.
ctx->rows_sources_uncompressed_write_buf->finalize();
size_t rows_sources_count = ctx->rows_sources_write_buf->count();
/// In the special case when there is only one source part and no rows were skipped, we may have
/// skipped writing the rows_sources file. Otherwise rows_sources_count must be equal to the total
/// number of input rows.
if ((rows_sources_count > 0 || global_ctx->future_part->parts.size() > 1) && sum_input_rows_exact != rows_sources_count)
throw Exception("Number of rows in source parts (" + toString(sum_input_rows_exact)
+ ") differs from number of bytes written to rows_sources file (" + toString(rows_sources_count)
+ "). It is a bug.", ErrorCodes::LOGICAL_ERROR);
ctx->rows_sources_read_buf = std::make_unique<CompressedReadBufferFromFile>(ctx->tmp_disk->readFile(fileName(ctx->rows_sources_file->path())));
/// For the outer loop over gathered columns
global_ctx->gathering_column_names_size = global_ctx->gathering_column_names.size();
ctx->column_num_for_vertical_merge = 0;
ctx->it_name_and_type = global_ctx->gathering_columns.cbegin();
return false;
}
void MergeTask::VerticalMergeStage::prepareVerticalMergeForOneColumn() const
{
const String & column_name = ctx->it_name_and_type->name;
Names column_names{column_name};
ctx->progress_before = (*global_ctx->merge_entry)->progress.load(std::memory_order_relaxed);
global_ctx->column_progress = std::make_unique<MergeStageProgress>(ctx->progress_before, ctx->column_sizes->columnWeight(column_name));
for (size_t part_num = 0; part_num < global_ctx->future_part->parts.size(); ++part_num)
{
auto column_part_source = std::make_shared<MergeTreeSequentialSource>(
*global_ctx->data, global_ctx->metadata_snapshot, global_ctx->future_part->parts[part_num], column_names, ctx->read_with_direct_io, true);
/// Dereference unique_ptr
column_part_source->setProgressCallback(
MergeProgressCallback(*global_ctx->merge_entry, global_ctx->watch_prev_elapsed, *global_ctx->column_progress));
QueryPipeline column_part_pipeline;
column_part_pipeline.init(Pipe(std::move(column_part_source)));
column_part_pipeline.setMaxThreads(1);
ctx->column_part_streams[part_num] =
std::make_shared<PipelineExecutingBlockInputStream>(std::move(column_part_pipeline));
}
ctx->rows_sources_read_buf->seek(0, 0);
ctx->column_gathered_stream = std::make_unique<ColumnGathererStream>(column_name, ctx->column_part_streams, *ctx->rows_sources_read_buf);
ctx->column_to = std::make_unique<MergedColumnOnlyOutputStream>(
global_ctx->new_data_part,
global_ctx->metadata_snapshot,
ctx->column_gathered_stream->getHeader(),
ctx->compression_codec,
/// we don't need to recalc indices here
/// because all of them were already recalculated and written
/// as the key part of the vertical merge
std::vector<MergeTreeIndexPtr>{},
&global_ctx->written_offset_columns,
global_ctx->to->getIndexGranularity());
ctx->column_elems_written = 0;
ctx->column_to->writePrefix();
}
bool MergeTask::VerticalMergeStage::executeVerticalMergeForOneColumn() const
{
Block block;
if (!global_ctx->merges_blocker->isCancelled() && (block = ctx->column_gathered_stream->read()))
{
ctx->column_elems_written += block.rows();
ctx->column_to->write(block);
/// Need to execute again
return true;
}
return false;
}
void MergeTask::VerticalMergeStage::finalizeVerticalMergeForOneColumn() const
{
const String & column_name = ctx->it_name_and_type->name;
if (global_ctx->merges_blocker->isCancelled())
throw Exception("Cancelled merging parts", ErrorCodes::ABORTED);
ctx->column_gathered_stream->readSuffix();
auto changed_checksums = ctx->column_to->writeSuffixAndGetChecksums(global_ctx->new_data_part, global_ctx->checksums_gathered_columns, ctx->need_sync);
global_ctx->checksums_gathered_columns.add(std::move(changed_checksums));
if (global_ctx->rows_written != ctx->column_elems_written)
{
throw Exception("Written " + toString(ctx->column_elems_written) + " elements of column " + column_name +
", but " + toString(global_ctx->rows_written) + " rows of PK columns", ErrorCodes::LOGICAL_ERROR);
}
/// NOTE: 'progress' is modified by a single thread, but it may be concurrently read from MergeListElement::getInfo() (StorageSystemMerges).
(*global_ctx->merge_entry)->columns_written += 1;
(*global_ctx->merge_entry)->bytes_written_uncompressed += ctx->column_gathered_stream->getProfileInfo().bytes;
(*global_ctx->merge_entry)->progress.store(ctx->progress_before + ctx->column_sizes->columnWeight(column_name), std::memory_order_relaxed);
/// This is the outer loop increment.
++ctx->column_num_for_vertical_merge;
++ctx->it_name_and_type;
}
bool MergeTask::VerticalMergeStage::finalizeVerticalMergeForAllColumns() const
{
/// No need to execute this part if it is a horizontal merge.
if (global_ctx->chosen_merge_algorithm != MergeAlgorithm::Vertical)
return false;
return false;
}
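/// Projections are merged as nested MergeTask objects, one per projection that is
/// present in every source part; the parent part's minmax index is merged here as well.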
bool MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() const
{
for (const auto & part : global_ctx->future_part->parts)
global_ctx->new_data_part->minmax_idx->merge(*part->minmax_idx);
/// Print overall profiling info. NOTE: it may duplicate previous messages
{
double elapsed_seconds = (*global_ctx->merge_entry)->watch.elapsedSeconds();
LOG_DEBUG(ctx->log,
"Merge sorted {} rows, containing {} columns ({} merged, {} gathered) in {} sec., {} rows/sec., {}/sec.",
(*global_ctx->merge_entry)->rows_read,
global_ctx->all_column_names.size(),
global_ctx->merging_column_names.size(),
global_ctx->gathering_column_names.size(),
elapsed_seconds,
(*global_ctx->merge_entry)->rows_read / elapsed_seconds,
ReadableSize((*global_ctx->merge_entry)->bytes_read_uncompressed / elapsed_seconds));
}
const auto & projections = global_ctx->metadata_snapshot->getProjections();
// tasks_for_projections.reserve(projections.size());
for (const auto & projection : projections)
{
MergeTreeData::DataPartsVector projection_parts;
for (const auto & part : global_ctx->future_part->parts)
{
auto it = part->getProjectionParts().find(projection.name);
if (it != part->getProjectionParts().end())
projection_parts.push_back(it->second);
}
if (projection_parts.size() < global_ctx->future_part->parts.size())
{
LOG_DEBUG(ctx->log, "Projection {} is not merged because some parts don't have it", projection.name);
continue;
}
LOG_DEBUG(
ctx->log,
"Selected {} projection_parts from {} to {}",
projection_parts.size(),
projection_parts.front()->name,
projection_parts.back()->name);
auto projection_future_part = std::make_shared<FutureMergedMutatedPart>();
projection_future_part->assign(std::move(projection_parts));
projection_future_part->name = projection.name;
projection_future_part->path = global_ctx->future_part->path + "/" + projection.name + ".proj/";
projection_future_part->part_info = {"all", 0, 0, 0};
MergeTreeData::MergingParams projection_merging_params;
projection_merging_params.mode = MergeTreeData::MergingParams::Ordinary;
if (projection.type == ProjectionDescription::Type::Aggregate)
projection_merging_params.mode = MergeTreeData::MergingParams::Aggregating;
// TODO Should we use a new merge_entry for projection?
ctx->tasks_for_projections.emplace_back(std::make_shared<MergeTask>(
projection_future_part,
projection.metadata,
global_ctx->merge_entry,
global_ctx->time_of_merge,
global_ctx->context,
global_ctx->space_reservation,
global_ctx->deduplicate,
global_ctx->deduplicate_by_columns,
projection_merging_params,
global_ctx->new_data_part,
"", // empty string for projection
global_ctx->data,
global_ctx->merges_blocker,
global_ctx->ttl_merges_blocker));
}
/// We will iterate through projections and execute them
ctx->projections_iterator = ctx->tasks_for_projections.begin();
return false;
}
bool MergeTask::MergeProjectionsStage::executeProjections() const
{
if (ctx->projections_iterator == ctx->tasks_for_projections.end())
return false;
if ((*ctx->projections_iterator)->execute())
return true;
++ctx->projections_iterator;
return true;
}
bool MergeTask::MergeProjectionsStage::finalizeProjectionsAndWholeMerge() const
{
const auto & projections = global_ctx->metadata_snapshot->getProjections();
size_t iter = 0;
for (const auto & projection : projections)
{
auto future = ctx->tasks_for_projections[iter]->getFuture();
++iter;
global_ctx->new_data_part->addProjectionPart(projection.name, future.get());
}
if (global_ctx->chosen_merge_algorithm != MergeAlgorithm::Vertical)
global_ctx->to->writeSuffixAndFinalizePart(global_ctx->new_data_part, ctx->need_sync);
else
global_ctx->to->writeSuffixAndFinalizePart(global_ctx->new_data_part, ctx->need_sync, &global_ctx->storage_columns, &global_ctx->checksums_gathered_columns);
global_ctx->promise.set_value(global_ctx->new_data_part);
return false;
}
bool MergeTask::VerticalMergeStage::execute()
{
assert(subtasks_iterator != subtasks.end());
if ((*subtasks_iterator)())
return true;
/// Move to the next subtask in an array of subtasks
++subtasks_iterator;
return subtasks_iterator != subtasks.end();
}
bool MergeTask::MergeProjectionsStage::execute()
{
assert(subtasks_iterator != subtasks.end());
if ((*subtasks_iterator)())
return true;
/// Move to the next subtask in an array of subtasks
++subtasks_iterator;
return subtasks_iterator != subtasks.end();
}
bool MergeTask::VerticalMergeStage::executeVerticalMergeForAllColumns() const
{
/// No need to execute this part if it is a horizontal merge.
if (global_ctx->chosen_merge_algorithm != MergeAlgorithm::Vertical)
return false;
/// This is the outer loop condition
if (ctx->column_num_for_vertical_merge >= global_ctx->gathering_column_names_size)
return false;
switch (ctx->vertical_merge_one_column_state)
{
case VerticalMergeRuntimeContext::State::NEED_PREPARE:
{
prepareVerticalMergeForOneColumn();
ctx->vertical_merge_one_column_state = VerticalMergeRuntimeContext::State::NEED_EXECUTE;
return true;
}
case VerticalMergeRuntimeContext::State::NEED_EXECUTE:
{
if (executeVerticalMergeForOneColumn())
return true;
ctx->vertical_merge_one_column_state = VerticalMergeRuntimeContext::State::NEED_FINISH;
return true;
}
case VerticalMergeRuntimeContext::State::NEED_FINISH:
{
finalizeVerticalMergeForOneColumn();
ctx->vertical_merge_one_column_state = VerticalMergeRuntimeContext::State::NEED_PREPARE;
return true;
}
}
return false;
}
bool MergeTask::execute()
{
assert(stages_iterator != stages.end());
if ((*stages_iterator)->execute())
return true;
/// Stage is finished, need to initialize the context for the next stage
auto next_stage_context = (*stages_iterator)->getContextForNextStage();
/// Move to the next stage in an array of stages
++stages_iterator;
if (stages_iterator == stages.end())
return false;
(*stages_iterator)->setRuntimeContext(std::move(next_stage_context), global_ctx);
return true;
}
void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream()
{
/** Read from all parts, merge and write into a new one.
* In passing, we calculate expression for sorting.
*/
Pipes pipes;
global_ctx->watch_prev_elapsed = 0;
/// We count the total amount of bytes in the parts
/// and use direct_io + aio if there are more than min_merge_bytes_to_use_direct_io bytes
ctx->read_with_direct_io = false;
const auto data_settings = global_ctx->data->getSettings();
if (data_settings->min_merge_bytes_to_use_direct_io != 0)
{
size_t total_size = 0;
for (const auto & part : global_ctx->future_part->parts)
{
total_size += part->getBytesOnDisk();
if (total_size >= data_settings->min_merge_bytes_to_use_direct_io)
{
LOG_DEBUG(ctx->log, "Will merge parts reading files in O_DIRECT");
ctx->read_with_direct_io = true;
break;
}
}
}
/// Using unique_ptr, because MergeStageProgress has no default constructor
global_ctx->horizontal_stage_progress = std::make_unique<MergeStageProgress>(
ctx->column_sizes ? ctx->column_sizes->keyColumnsWeight() : 1.0);
for (const auto & part : global_ctx->future_part->parts)
{
auto input = std::make_unique<MergeTreeSequentialSource>(
*global_ctx->data, global_ctx->metadata_snapshot, part, global_ctx->merging_column_names, ctx->read_with_direct_io, true);
/// Dereference unique_ptr and pass horizontal_stage_progress by reference
input->setProgressCallback(
MergeProgressCallback(*global_ctx->merge_entry, global_ctx->watch_prev_elapsed, *global_ctx->horizontal_stage_progress));
Pipe pipe(std::move(input));
if (global_ctx->metadata_snapshot->hasSortingKey())
{
pipe.addSimpleTransform([this](const Block & header)
{
return std::make_shared<ExpressionTransform>(header, global_ctx->metadata_snapshot->getSortingKey().expression);
});
}
pipes.emplace_back(std::move(pipe));
}
Names sort_columns = global_ctx->metadata_snapshot->getSortingKeyColumns();
SortDescription sort_description;
size_t sort_columns_size = sort_columns.size();
sort_description.reserve(sort_columns_size);
Names partition_key_columns = global_ctx->metadata_snapshot->getPartitionKey().column_names;
Block header = pipes.at(0).getHeader();
for (size_t i = 0; i < sort_columns_size; ++i)
sort_description.emplace_back(header.getPositionByName(sort_columns[i]), 1, 1);
/// The order of the streams is important: when the key is matched, the elements go in the order of the source stream number.
/// In the merged part, the lines with the same key must be in ascending order of the identifier of the original part,
/// which corresponds to the insertion order.
ProcessorPtr merged_transform;
/// If merge is vertical we cannot calculate it
ctx->blocks_are_granules_size = (global_ctx->chosen_merge_algorithm == MergeAlgorithm::Vertical);
UInt64 merge_block_size = data_settings->merge_max_block_size;
switch (ctx->merging_params.mode)
{
case MergeTreeData::MergingParams::Ordinary:
merged_transform = std::make_shared<MergingSortedTransform>(
header, pipes.size(), sort_description, merge_block_size, 0, false, ctx->rows_sources_write_buf.get(), true, ctx->blocks_are_granules_size);
break;
case MergeTreeData::MergingParams::Collapsing:
merged_transform = std::make_shared<CollapsingSortedTransform>(
header, pipes.size(), sort_description, ctx->merging_params.sign_column, false,
merge_block_size, ctx->rows_sources_write_buf.get(), ctx->blocks_are_granules_size);
break;
case MergeTreeData::MergingParams::Summing:
merged_transform = std::make_shared<SummingSortedTransform>(
header, pipes.size(), sort_description, ctx->merging_params.columns_to_sum, partition_key_columns, merge_block_size);
break;
case MergeTreeData::MergingParams::Aggregating:
merged_transform = std::make_shared<AggregatingSortedTransform>(header, pipes.size(), sort_description, merge_block_size);
break;
case MergeTreeData::MergingParams::Replacing:
merged_transform = std::make_shared<ReplacingSortedTransform>(
header, pipes.size(), sort_description, ctx->merging_params.version_column,
merge_block_size, ctx->rows_sources_write_buf.get(), ctx->blocks_are_granules_size);
break;
case MergeTreeData::MergingParams::Graphite:
merged_transform = std::make_shared<GraphiteRollupSortedTransform>(
header, pipes.size(), sort_description, merge_block_size,
ctx->merging_params.graphite_params, global_ctx->time_of_merge);
break;
case MergeTreeData::MergingParams::VersionedCollapsing:
merged_transform = std::make_shared<VersionedCollapsingTransform>(
header, pipes.size(), sort_description, ctx->merging_params.sign_column,
merge_block_size, ctx->rows_sources_write_buf.get(), ctx->blocks_are_granules_size);
break;
}
QueryPipeline pipeline;
pipeline.init(Pipe::unitePipes(std::move(pipes)));
pipeline.addTransform(std::move(merged_transform));
pipeline.setMaxThreads(1);
global_ctx->merged_stream = std::make_shared<PipelineExecutingBlockInputStream>(std::move(pipeline));
if (global_ctx->deduplicate)
global_ctx->merged_stream = std::make_shared<DistinctSortedBlockInputStream>(
global_ctx->merged_stream, sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns);
if (ctx->need_remove_expired_values)
global_ctx->merged_stream = std::make_shared<TTLBlockInputStream>(
global_ctx->merged_stream, *global_ctx->data, global_ctx->metadata_snapshot, global_ctx->new_data_part, global_ctx->time_of_merge, ctx->force_ttl);
if (global_ctx->metadata_snapshot->hasSecondaryIndices())
{
const auto & indices = global_ctx->metadata_snapshot->getSecondaryIndices();
global_ctx->merged_stream = std::make_shared<ExpressionBlockInputStream>(
global_ctx->merged_stream, indices.getSingleExpressionForIndices(global_ctx->metadata_snapshot->getColumns(), global_ctx->data->getContext()));
global_ctx->merged_stream = std::make_shared<MaterializingBlockInputStream>(global_ctx->merged_stream);
}
}
MergeAlgorithm MergeTask::ExecuteAndFinalizeHorizontalPart::chooseMergeAlgorithm() const
{
const size_t sum_rows_upper_bound = (*global_ctx->merge_entry)->total_rows_count;
const auto data_settings = global_ctx->data->getSettings();
if (global_ctx->deduplicate)
return MergeAlgorithm::Horizontal;
if (data_settings->enable_vertical_merge_algorithm == 0)
return MergeAlgorithm::Horizontal;
if (ctx->need_remove_expired_values)
return MergeAlgorithm::Horizontal;
for (const auto & part : global_ctx->future_part->parts)
if (!part->supportsVerticalMerge())
return MergeAlgorithm::Horizontal;
bool is_supported_storage =
ctx->merging_params.mode == MergeTreeData::MergingParams::Ordinary ||
ctx->merging_params.mode == MergeTreeData::MergingParams::Collapsing ||
ctx->merging_params.mode == MergeTreeData::MergingParams::Replacing ||
ctx->merging_params.mode == MergeTreeData::MergingParams::VersionedCollapsing;
bool enough_ordinary_cols = global_ctx->gathering_columns.size() >= data_settings->vertical_merge_algorithm_min_columns_to_activate;
bool enough_total_rows = sum_rows_upper_bound >= data_settings->vertical_merge_algorithm_min_rows_to_activate;
bool no_parts_overflow = global_ctx->future_part->parts.size() <= RowSourcePart::MAX_PARTS;
auto merge_alg = (is_supported_storage && enough_total_rows && enough_ordinary_cols && no_parts_overflow) ?
MergeAlgorithm::Vertical : MergeAlgorithm::Horizontal;
return merge_alg;
}
}

View File

@ -0,0 +1,367 @@
#pragma once
#include <Storages/MergeTree/IExecutableTask.h>
#include <Storages/MergeTree/MergeProgress.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/IMergedBlockOutputStream.h>
#include <Storages/MergeTree/MergedBlockOutputStream.h>
#include <Storages/MergeTree/FutureMergedMutatedPart.h>
#include <Storages/MergeTree/ColumnSizeEstimator.h>
#include <Storages/MergeTree/MergedColumnOnlyOutputStream.h>
#include <DataStreams/ColumnGathererStream.h>
#include <Compression/CompressedReadBufferFromFile.h>
#include <memory>
#include <list>
namespace DB
{
class MergeTask;
using MergeTaskPtr = std::shared_ptr<MergeTask>;
/**
* Overview of the merge algorithm
*
* Each merge is executed sequentially block by block.
* The main idea is to make a merge not a subroutine which is executed
* in a thread pool and may occupy a thread for a period of time,
* but to make a merge a coroutine which can suspend the execution
* in some points and then resume the execution from this point.
*
* A perfect point where to suspend the execution is after the work over a block is finished.
* The task itself will be executed via BackgroundJobExecutor.
*
* The interface of the task is simple.
* The main method is `execute()` which will return true, if the task wants to be executed again and false otherwise.
*
* With this kind of task we can give a merge a priority.
* A priority is simple - the lower the size of the merge, the higher priority.
* So, if ClickHouse wants to merge some really big parts into a bigger part,
* then it will be executed for a long time, because the result of the merge is not really needed immediately.
* It is better to merge small parts as soon as possible.
*/
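/// A minimal, self-contained sketch of the resumable-task pattern described above.
/// Everything in this example is illustrative: ToyMergeTask and the helpers below are
/// invented for the sketch and are not part of this codebase. It only demonstrates the
/// execute()-until-false contract and the "smaller merge first" priority idea.
struct ToyMergeTask
{
    int blocks_left = 0;

    /// Does one unit of work; returns true if the task wants to be executed again.
    bool execute()
    {
        if (blocks_left == 0)
            return false;        /// nothing left to do, do not reschedule
        --blocks_left;           /// "merge" one block, then yield to the executor
        return blocks_left > 0;
    }
};

/// The lower the size of the merge, the higher its priority.
inline bool higherPriority(const ToyMergeTask & lhs, const ToyMergeTask & rhs)
{
    return lhs.blocks_left < rhs.blocks_left;
}

/// How a caller would drive such a task; a real executor interleaves many of them.
inline void runToCompletion(ToyMergeTask & task)
{
    while (task.execute()) {}
}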
class MergeTask
{
public:
MergeTask(
FutureMergedMutatedPartPtr future_part_,
StorageMetadataPtr metadata_snapshot_,
MergeList::Entry * merge_entry_,
time_t time_of_merge_,
ContextPtr context_,
ReservationSharedPtr space_reservation_,
bool deduplicate_,
Names deduplicate_by_columns_,
MergeTreeData::MergingParams merging_params_,
MergeTreeDataPartPtr parent_part_,
String prefix_,
MergeTreeData * data_,
ActionBlocker * merges_blocker_,
ActionBlocker * ttl_merges_blocker_)
{
global_ctx = std::make_shared<GlobalRuntimeContext>();
global_ctx->future_part = std::move(future_part_);
global_ctx->metadata_snapshot = std::move(metadata_snapshot_);
global_ctx->merge_entry = std::move(merge_entry_);
global_ctx->time_of_merge = std::move(time_of_merge_);
global_ctx->context = std::move(context_);
global_ctx->space_reservation = std::move(space_reservation_);
global_ctx->deduplicate = std::move(deduplicate_);
global_ctx->deduplicate_by_columns = std::move(deduplicate_by_columns_);
global_ctx->parent_part = std::move(parent_part_);
global_ctx->data = std::move(data_);
global_ctx->merges_blocker = std::move(merges_blocker_);
global_ctx->ttl_merges_blocker = std::move(ttl_merges_blocker_);
auto prepare_stage_ctx = std::make_shared<ExecuteAndFinalizeHorizontalPartRuntimeContext>();
prepare_stage_ctx->prefix = std::move(prefix_);
prepare_stage_ctx->merging_params = std::move(merging_params_);
(*stages.begin())->setRuntimeContext(std::move(prepare_stage_ctx), global_ctx);
}
std::future<MergeTreeData::MutableDataPartPtr> getFuture()
{
return global_ctx->promise.get_future();
}
bool execute();
private:
struct IStage;
using StagePtr = std::shared_ptr<IStage>;
struct IStageRuntimeContext {};
using StageRuntimeContextPtr = std::shared_ptr<IStageRuntimeContext>;
struct IStage
{
virtual void setRuntimeContext(StageRuntimeContextPtr local, StageRuntimeContextPtr global) = 0;
virtual StageRuntimeContextPtr getContextForNextStage() = 0;
virtual bool execute() = 0;
virtual ~IStage() = default;
};
/// By default this context is uninitialized, but some variables have to be set after construction,
/// and some variables are used during execution.
/// Proper initialization is the responsibility of the author.
struct GlobalRuntimeContext : public IStageRuntimeContext //-V730
{
MergeList::Entry * merge_entry{nullptr};
MergeTreeData * data{nullptr};
ActionBlocker * merges_blocker{nullptr};
ActionBlocker * ttl_merges_blocker{nullptr};
StorageMetadataPtr metadata_snapshot{nullptr};
FutureMergedMutatedPartPtr future_part{nullptr};
MergeTreeDataPartPtr parent_part{nullptr};
ContextPtr context{nullptr};
time_t time_of_merge{0};
ReservationSharedPtr space_reservation{nullptr};
bool deduplicate{false};
Names deduplicate_by_columns{};
NamesAndTypesList gathering_columns{};
NamesAndTypesList merging_columns{};
Names gathering_column_names{};
Names merging_column_names{};
NamesAndTypesList storage_columns{};
Names all_column_names{};
MergeTreeData::DataPart::Checksums checksums_gathered_columns{};
MergeAlgorithm chosen_merge_algorithm{MergeAlgorithm::Undecided};
size_t gathering_column_names_size{0};
std::unique_ptr<MergeStageProgress> horizontal_stage_progress{nullptr};
std::unique_ptr<MergeStageProgress> column_progress{nullptr};
std::shared_ptr<MergedBlockOutputStream> to{nullptr};
BlockInputStreamPtr merged_stream{nullptr};
SyncGuardPtr sync_guard{nullptr};
MergeTreeData::MutableDataPartPtr new_data_part{nullptr};
size_t rows_written{0};
UInt64 watch_prev_elapsed{0};
std::promise<MergeTreeData::MutableDataPartPtr> promise{};
IMergedBlockOutputStream::WrittenOffsetColumns written_offset_columns{};
};
using GlobalRuntimeContextPtr = std::shared_ptr<GlobalRuntimeContext>;
/// By default this context is uninitialized, but some variables have to be set after construction,
/// and some variables are used during execution.
/// Proper initialization is the responsibility of the author.
struct ExecuteAndFinalizeHorizontalPartRuntimeContext : public IStageRuntimeContext //-V730
{
/// Dependencies
String prefix;
MergeTreeData::MergingParams merging_params{};
DiskPtr tmp_disk{nullptr};
DiskPtr disk{nullptr};
bool need_remove_expired_values{false};
bool force_ttl{false};
CompressionCodecPtr compression_codec{nullptr};
size_t sum_input_rows_upper_bound{0};
std::unique_ptr<TemporaryFile> rows_sources_file{nullptr};
std::unique_ptr<WriteBufferFromFileBase> rows_sources_uncompressed_write_buf{nullptr};
std::unique_ptr<WriteBuffer> rows_sources_write_buf{nullptr};
std::optional<ColumnSizeEstimator> column_sizes{};
size_t initial_reservation{0};
bool read_with_direct_io{false};
std::function<bool()> is_cancelled{};
/// Local variables for this stage
size_t sum_compressed_bytes_upper_bound{0};
bool blocks_are_granules_size{false};
Poco::Logger * log{&Poco::Logger::get("MergeTask::PrepareStage")};
/// Dependencies for next stages
std::list<DB::NameAndTypePair>::const_iterator it_name_and_type;
size_t column_num_for_vertical_merge{0};
bool need_sync{false};
};
using ExecuteAndFinalizeHorizontalPartRuntimeContextPtr = std::shared_ptr<ExecuteAndFinalizeHorizontalPartRuntimeContext>;
struct ExecuteAndFinalizeHorizontalPart : public IStage
{
bool execute() override;
bool prepare();
bool executeImpl();
using ExecuteAndFinalizeHorizontalPartSubtasks = std::array<std::function<bool()>, 2>;
ExecuteAndFinalizeHorizontalPartSubtasks subtasks
{
[this] () { return prepare(); },
[this] () { return executeImpl(); }
};
ExecuteAndFinalizeHorizontalPartSubtasks::iterator subtasks_iterator = subtasks.begin();
MergeAlgorithm chooseMergeAlgorithm() const;
void createMergedStream();
void setRuntimeContext(StageRuntimeContextPtr local, StageRuntimeContextPtr global) override
{
ctx = static_pointer_cast<ExecuteAndFinalizeHorizontalPartRuntimeContext>(local);
global_ctx = static_pointer_cast<GlobalRuntimeContext>(global);
}
StageRuntimeContextPtr getContextForNextStage() override;
ExecuteAndFinalizeHorizontalPartRuntimeContextPtr ctx;
GlobalRuntimeContextPtr global_ctx;
};
/// By default this context is uninitialized, but some variables have to be set after construction,
/// and some variables are used during execution.
/// Proper initialization is the responsibility of the author.
struct VerticalMergeRuntimeContext : public IStageRuntimeContext //-V730
{
/// Begin dependencies from previous stage
std::unique_ptr<WriteBuffer> rows_sources_write_buf{nullptr};
std::unique_ptr<WriteBufferFromFileBase> rows_sources_uncompressed_write_buf{nullptr};
std::unique_ptr<TemporaryFile> rows_sources_file;
std::optional<ColumnSizeEstimator> column_sizes;
CompressionCodecPtr compression_codec;
DiskPtr tmp_disk{nullptr};
std::list<DB::NameAndTypePair>::const_iterator it_name_and_type;
size_t column_num_for_vertical_merge{0};
bool read_with_direct_io{false};
bool need_sync{false};
/// End dependencies from previous stages
enum class State
{
NEED_PREPARE,
NEED_EXECUTE,
NEED_FINISH
};
State vertical_merge_one_column_state{State::NEED_PREPARE};
Float64 progress_before = 0;
std::unique_ptr<MergedColumnOnlyOutputStream> column_to{nullptr};
size_t column_elems_written{0};
BlockInputStreams column_part_streams;
std::unique_ptr<ColumnGathererStream> column_gathered_stream;
std::unique_ptr<CompressedReadBufferFromFile> rows_sources_read_buf{nullptr};
};
using VerticalMergeRuntimeContextPtr = std::shared_ptr<VerticalMergeRuntimeContext>;
struct VerticalMergeStage : public IStage
{
bool execute() override;
void setRuntimeContext(StageRuntimeContextPtr local, StageRuntimeContextPtr global) override
{
ctx = static_pointer_cast<VerticalMergeRuntimeContext>(local);
global_ctx = static_pointer_cast<GlobalRuntimeContext>(global);
}
StageRuntimeContextPtr getContextForNextStage() override;
bool prepareVerticalMergeForAllColumns() const;
bool executeVerticalMergeForAllColumns() const;
bool finalizeVerticalMergeForAllColumns() const;
using VerticalMergeStageSubtasks = std::array<std::function<bool()>, 3>;
VerticalMergeStageSubtasks subtasks
{
[this] () { return prepareVerticalMergeForAllColumns(); },
[this] () { return executeVerticalMergeForAllColumns(); },
[this] () { return finalizeVerticalMergeForAllColumns(); }
};
VerticalMergeStageSubtasks::iterator subtasks_iterator = subtasks.begin();
void prepareVerticalMergeForOneColumn() const;
bool executeVerticalMergeForOneColumn() const;
void finalizeVerticalMergeForOneColumn() const;
VerticalMergeRuntimeContextPtr ctx;
GlobalRuntimeContextPtr global_ctx;
};
/// By default this context is uninitialized, but some variables have to be set after construction,
/// and some variables are used during execution.
/// Proper initialization is the responsibility of the author.
struct MergeProjectionsRuntimeContext : public IStageRuntimeContext //-V730
{
/// Only one dependency
bool need_sync{false};
using MergeTasks = std::deque<MergeTaskPtr>;
MergeTasks tasks_for_projections;
MergeTasks::iterator projections_iterator;
Poco::Logger * log{&Poco::Logger::get("MergeTask::MergeProjectionsStage")};
};
using MergeProjectionsRuntimeContextPtr = std::shared_ptr<MergeProjectionsRuntimeContext>;
struct MergeProjectionsStage : public IStage
{
bool execute() override;
void setRuntimeContext(StageRuntimeContextPtr local, StageRuntimeContextPtr global) override
{
ctx = static_pointer_cast<MergeProjectionsRuntimeContext>(local);
global_ctx = static_pointer_cast<GlobalRuntimeContext>(global);
}
StageRuntimeContextPtr getContextForNextStage() override { return nullptr; }
bool mergeMinMaxIndexAndPrepareProjections() const;
bool executeProjections() const;
bool finalizeProjectionsAndWholeMerge() const;
using MergeProjectionsStageSubtasks = std::array<std::function<bool()>, 3>;
MergeProjectionsStageSubtasks subtasks
{
[this] () { return mergeMinMaxIndexAndPrepareProjections(); },
[this] () { return executeProjections(); },
[this] () { return finalizeProjectionsAndWholeMerge(); }
};
MergeProjectionsStageSubtasks::iterator subtasks_iterator = subtasks.begin();
MergeProjectionsRuntimeContextPtr ctx;
GlobalRuntimeContextPtr global_ctx;
};
GlobalRuntimeContextPtr global_ctx;
using Stages = std::array<StagePtr, 3>;
Stages stages
{
std::make_shared<ExecuteAndFinalizeHorizontalPart>(),
std::make_shared<VerticalMergeStage>(),
std::make_shared<MergeProjectionsStage>()
};
Stages::iterator stages_iterator = stages.begin();
};
/// FIXME
[[ maybe_unused]] static MergeTreeData::MutableDataPartPtr executeHere(MergeTaskPtr task)
{
while (task->execute()) {}
return task->getFuture().get();
}
}

View File

@ -111,6 +111,9 @@ void MergeTreeBackgroundExecutor::routine(TaskRuntimeDataPtr item)
if (item->is_currently_deleting)
{
erase_from_active();
/// This is important to order the destructors correctly.
item->task.reset();
return;
}
@ -125,21 +128,28 @@ void MergeTreeBackgroundExecutor::routine(TaskRuntimeDataPtr item)
std::lock_guard guard(mutex);
erase_from_active();
has_tasks.notify_one();
}
try
{
ALLOW_ALLOCATIONS_IN_SCOPE;
/// In a situation of a lack of memory this method can throw an exception,
/// because it may interact somehow with BackgroundSchedulePool, which may allocate memory
/// But it is rather safe, because we have try...catch block here, and another one in ThreadPool.
item->task->onCompleted();
try
{
ALLOW_ALLOCATIONS_IN_SCOPE;
/// In a situation of a lack of memory this method can throw an exception,
/// because it may interact somehow with BackgroundSchedulePool, which may allocate memory
/// But it is rather safe, because we have try...catch block here, and another one in ThreadPool.
item->task->onCompleted();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
/// We have to call reset() under a lock, otherwise a race is possible.
/// Imagine that the task has finally completed (the last execution returned false):
/// we removed the task from both queues but still hold a pointer to it.
/// The thread that shuts down the storage will scan the queues looking for tasks to wait for, but will find nothing.
/// So the destructor of the task and the destructor of the storage would be executed concurrently.
item->task.reset();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
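/// A standalone sketch of the race the comments above describe; all names below are
/// invented for illustration and are not the executor's real code. If the worker
/// released its last reference outside the lock, a shutdown thread could scan the
/// queues, find no task to wait for, and start destroying the storage while the
/// task destructor is still running. Resetting under the same mutex serializes them.
#include <memory>
#include <mutex>

struct ToyTask { /* owns resources that must not outlive the storage */ };

struct ToyExecutor
{
    std::mutex mutex;                   /// guards the queues and the task slots
    std::shared_ptr<ToyTask> task;      /// the slot a shutdown thread would scan

    void finishTask()
    {
        std::lock_guard guard(mutex);   /// shutdown cannot scan while we hold this
        task.reset();                   /// task destructor runs inside the critical section
    }
};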

View File

@ -4248,10 +4248,10 @@ Block MergeTreeData::getMinMaxCountProjectionBlock(
const auto & part = parts[part_idx];
if (!part->minmax_idx.initialized)
if (!part->minmax_idx->initialized)
throw Exception("Found a non-empty part with uninitialized minmax_idx. It's a bug", ErrorCodes::LOGICAL_ERROR);
size_t minmax_idx_size = part->minmax_idx.hyperrectangle.size();
size_t minmax_idx_size = part->minmax_idx->hyperrectangle.size();
if (2 * minmax_idx_size + 1 != minmax_count_columns.size())
throw Exception(
ErrorCodes::LOGICAL_ERROR,
@ -4266,7 +4266,7 @@ Block MergeTreeData::getMinMaxCountProjectionBlock(
size_t max_pos = i * 2 + 1;
auto & min_column = assert_cast<ColumnAggregateFunction &>(*minmax_count_columns[min_pos]);
auto & max_column = assert_cast<ColumnAggregateFunction &>(*minmax_count_columns[max_pos]);
const auto & range = part->minmax_idx.hyperrectangle[i];
const auto & range = part->minmax_idx->hyperrectangle[i];
insert(min_column, range.left);
insert(max_column, range.right);
}

View File

@ -273,6 +273,8 @@ public:
void clear() { precommitted_parts.clear(); }
};
using TransactionUniquePtr = std::unique_ptr<Transaction>;
using PathWithDisk = std::pair<String, DiskPtr>;
struct PartsTemporaryRename : private boost::noncopyable
@ -859,6 +861,7 @@ protected:
friend struct ReplicatedMergeTreeTableMetadata;
friend class StorageReplicatedMergeTree;
friend class MergeTreeDataWriter;
friend class MergeTask;
bool require_part_metadata;
@ -1145,4 +1148,12 @@ struct CurrentlySubmergingEmergingTagger
~CurrentlySubmergingEmergingTagger();
};
/// TODO: move it somewhere
[[ maybe_unused ]] static bool needSyncPart(size_t input_rows, size_t input_bytes, const MergeTreeSettings & settings)
{
return ((settings.min_rows_to_fsync_after_merge && input_rows >= settings.min_rows_to_fsync_after_merge)
|| (settings.min_compressed_bytes_to_fsync_after_merge && input_bytes >= settings.min_compressed_bytes_to_fsync_after_merge));
}
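/// Hedged usage sketch: the call site below is an assumption for illustration,
/// not code from this diff. A merge that has just finished writing would consult
/// the two fsync thresholds and pass the answer to part finalization, e.g.:
///
///     bool need_sync = needSyncPart(rows_written, bytes_written, *data->getSettings());
///     to->writeSuffixAndFinalizePart(new_data_part, need_sync);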
}

File diff suppressed because it is too large

View File

@ -1,13 +1,16 @@
#pragma once
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MutationCommands.h>
#include <atomic>
#include <functional>
#include <Common/ActionBlocker.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MutationCommands.h>
#include <Storages/MergeTree/TTLMergeSelector.h>
#include <Storages/MergeTree/MergeAlgorithm.h>
#include <Storages/MergeTree/MergeType.h>
#include <Storages/MergeTree/MergeTask.h>
#include <Storages/MergeTree/MutateTask.h>
#include <Storages/MergeTree/IMergedBlockOutputStream.h>
@ -30,38 +33,6 @@ enum class ExecuteTTLType
RECALCULATE = 2,
};
/// Auxiliary struct holding metainformation for the future merged or mutated part.
struct FutureMergedMutatedPart
{
String name;
UUID uuid = UUIDHelpers::Nil;
String path;
MergeTreeDataPartType type;
MergeTreePartInfo part_info;
MergeTreeData::DataPartsVector parts;
MergeType merge_type = MergeType::REGULAR;
const MergeTreePartition & getPartition() const { return parts.front()->partition; }
FutureMergedMutatedPart() = default;
explicit FutureMergedMutatedPart(MergeTreeData::DataPartsVector parts_)
{
assign(std::move(parts_));
}
FutureMergedMutatedPart(MergeTreeData::DataPartsVector parts_, MergeTreeDataPartType future_part_type)
{
assign(std::move(parts_), future_part_type);
}
void assign(MergeTreeData::DataPartsVector parts_);
void assign(MergeTreeData::DataPartsVector parts_, MergeTreeDataPartType future_part_type);
void updatePath(const MergeTreeData & storage, const ReservationPtr & reservation);
};
/** Can select parts for background processes and do them.
* Currently helps with merges, mutations and moves
*/
@ -95,7 +66,7 @@ public:
* - A part that already merges with something in one place, you can not start to merge into something else in another place.
*/
SelectPartsDecision selectPartsToMerge(
FutureMergedMutatedPart & future_part,
FutureMergedMutatedPartPtr future_part,
bool aggressive,
size_t max_total_size_to_merge,
const AllowedMergingPredicate & can_merge,
@ -108,7 +79,7 @@ public:
* and without expired TTL won't be merged with itself.
*/
SelectPartsDecision selectAllPartsToMergeWithinPartition(
FutureMergedMutatedPart & future_part,
FutureMergedMutatedPartPtr future_part,
UInt64 & available_disk_space,
const AllowedMergingPredicate & can_merge,
const String & partition_id,
@ -117,24 +88,21 @@ public:
String * out_disable_reason = nullptr,
bool optimize_skip_merged_partitions = false);
/** Merge the parts.
/** Creates a task to merge parts.
* If `reservation != nullptr`, the size of the reserved space is reduced from time to time,
* approximately in proportion to the amount of data already written.
*
* Creates and returns a temporary part.
* To end the merge, call the function renameMergedTemporaryPart.
*
* time_of_merge - the time when the merge was assigned.
* Important when using ReplicatedGraphiteMergeTree to provide the same merge on replicas.
*/
MergeTreeData::MutableDataPartPtr mergePartsToTemporaryPart(
const FutureMergedMutatedPart & future_part,
MergeTaskPtr mergePartsToTemporaryPart(
FutureMergedMutatedPartPtr future_part,
const StorageMetadataPtr & metadata_snapshot,
MergeListEntry & merge_entry,
TableLockHolder & table_lock_holder,
MergeListEntry * merge_entry,
TableLockHolder table_lock_holder,
time_t time_of_merge,
ContextPtr context,
const ReservationPtr & space_reservation,
ReservationSharedPtr space_reservation,
bool deduplicate,
const Names & deduplicate_by_columns,
const MergeTreeData::MergingParams & merging_params,
@ -142,14 +110,14 @@ public:
const String & prefix = "");
/// Mutate a single data part with the specified commands. Will create and return a temporary part.
MergeTreeData::MutableDataPartPtr mutatePartToTemporaryPart(
const FutureMergedMutatedPart & future_part,
const StorageMetadataPtr & metadata_snapshot,
const MutationCommands & commands,
MergeListEntry & merge_entry,
MutateTaskPtr mutatePartToTemporaryPart(
FutureMergedMutatedPartPtr future_part,
StorageMetadataPtr metadata_snapshot,
MutationCommandsConstPtr commands,
MergeListEntry * merge_entry,
time_t time_of_mutation,
ContextPtr context,
const ReservationPtr & space_reservation,
ReservationSharedPtr space_reservation,
TableLockHolder & table_lock_holder);
MergeTreeData::DataPartPtr renameMergedTemporaryPart(
@ -166,6 +134,8 @@ private:
*/
MergeTreeData::DataPartsVector selectAllPartsFromPartition(const String & partition_id);
friend class MutateTask;
/** Split mutation commands into two parts:
* First part should be executed by mutations interpreter.
* Other is just simple drop/renames, so they can be executed without interpreter.
@ -176,21 +146,6 @@ private:
MutationCommands & for_interpreter,
MutationCommands & for_file_renames);
/// Apply commands to source_part i.e. remove and rename some columns in
/// source_part and return set of files, that have to be removed or renamed
/// from filesystem and in-memory checksums. Ordered result is important,
/// because we can apply renames that affects each other: x -> z, y -> x.
static NameToNameVector collectFilesForRenames(MergeTreeData::DataPartPtr source_part, const MutationCommands & commands_for_removes, const String & mrk_extension);
/// Files, that we don't need to remove and don't need to hardlink, for example columns.txt and checksums.txt.
/// Because we will generate new versions of them after we perform mutation.
static NameSet collectFilesToSkip(
const MergeTreeDataPartPtr & source_part,
const Block & updated_header,
const std::set<MergeTreeIndexPtr> & indices_to_recalc,
const String & mrk_extension,
const std::set<ProjectionDescriptionRawPtr> & projections_to_recalc);
/// Get the columns list of the resulting part in the same order as storage_columns.
static NamesAndTypesList getColumnsForNewDataPart(
MergeTreeData::DataPartPtr source_part,
@ -198,87 +153,8 @@ private:
NamesAndTypesList storage_columns,
const MutationCommands & commands_for_removes);
/// Get skip indices, that should exists in the resulting data part.
static MergeTreeIndices getIndicesForNewDataPart(
const IndicesDescription & all_indices,
const MutationCommands & commands_for_removes);
static std::vector<ProjectionDescriptionRawPtr> getProjectionsForNewDataPart(
const ProjectionsDescription & all_projections,
const MutationCommands & commands_for_removes);
static ExecuteTTLType shouldExecuteTTL(const StorageMetadataPtr & metadata_snapshot, const ColumnDependencies & dependencies);
/// Return set of indices which should be recalculated during mutation also
/// wraps input stream into additional expression stream
static std::set<MergeTreeIndexPtr> getIndicesToRecalculate(
BlockInputStreamPtr & input_stream,
const NameSet & updated_columns,
const StorageMetadataPtr & metadata_snapshot,
ContextPtr context,
const NameSet & materialized_indices,
const MergeTreeData::DataPartPtr & source_part);
static std::set<ProjectionDescriptionRawPtr> getProjectionsToRecalculate(
const NameSet & updated_columns,
const StorageMetadataPtr & metadata_snapshot,
const NameSet & materialized_projections,
const MergeTreeData::DataPartPtr & source_part);
void writeWithProjections(
MergeTreeData::MutableDataPartPtr new_data_part,
const StorageMetadataPtr & metadata_snapshot,
const std::vector<ProjectionDescriptionRawPtr> & projections_to_build,
BlockInputStreamPtr mutating_stream,
IMergedBlockOutputStream & out,
time_t time_of_mutation,
MergeListEntry & merge_entry,
const ReservationPtr & space_reservation,
TableLockHolder & holder,
ContextPtr context,
IMergeTreeDataPart::MinMaxIndex * minmax_idx = nullptr);
/// Override all columns of new part using mutating_stream
void mutateAllPartColumns(
MergeTreeData::MutableDataPartPtr new_data_part,
const StorageMetadataPtr & metadata_snapshot,
const MergeTreeIndices & skip_indices,
const std::vector<ProjectionDescriptionRawPtr> & projections_to_build,
BlockInputStreamPtr mutating_stream,
time_t time_of_mutation,
const CompressionCodecPtr & compression_codec,
MergeListEntry & merge_entry,
ExecuteTTLType execute_ttl_type,
bool need_sync,
const ReservationPtr & space_reservation,
TableLockHolder & holder,
ContextPtr context);
/// Mutate some columns of source part with mutation_stream
void mutateSomePartColumns(
const MergeTreeDataPartPtr & source_part,
const StorageMetadataPtr & metadata_snapshot,
const std::set<MergeTreeIndexPtr> & indices_to_recalc,
const std::set<ProjectionDescriptionRawPtr> & projections_to_recalc,
const Block & mutation_header,
MergeTreeData::MutableDataPartPtr new_data_part,
BlockInputStreamPtr mutating_stream,
time_t time_of_mutation,
const CompressionCodecPtr & compression_codec,
MergeListEntry & merge_entry,
ExecuteTTLType execute_ttl_type,
bool need_sync,
const ReservationPtr & space_reservation,
TableLockHolder & holder,
ContextPtr context);
/// Initialize and write to disk new part fields like checksums, columns,
/// etc.
static void finalizeMutatedPart(
const MergeTreeDataPartPtr & source_part,
MergeTreeData::MutableDataPartPtr new_data_part,
ExecuteTTLType execute_ttl_type,
const CompressionCodecPtr & codec);
static ExecuteTTLType shouldExecuteTTL(
const StorageMetadataPtr & metadata_snapshot, const ColumnDependencies & dependencies);
public:
/** Is used to cancel all merges and mutations. On cancel() call all currently running actions will throw exception soon.
@ -297,9 +173,6 @@ private:
bool need_remove_expired_values,
const MergeTreeData::MergingParams & merging_params) const;
bool checkOperationIsNotCanceled(const MergeListEntry & merge_entry) const;
private:
MergeTreeData & data;
const size_t background_pool_size;

View File

@ -1531,7 +1531,7 @@ void MergeTreeDataSelectExecutor::selectPartsToRead(
counters.num_initial_selected_granules += num_granules;
if (minmax_idx_condition && !minmax_idx_condition->checkInHyperrectangle(
part->minmax_idx.hyperrectangle, minmax_columns_types).can_be_true)
part->minmax_idx->hyperrectangle, minmax_columns_types).can_be_true)
continue;
counters.num_parts_after_minmax += 1;
@ -1601,7 +1601,7 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
counters.num_initial_selected_granules += num_granules;
if (minmax_idx_condition
&& !minmax_idx_condition->checkInHyperrectangle(part->minmax_idx.hyperrectangle, minmax_columns_types)
&& !minmax_idx_condition->checkInHyperrectangle(part->minmax_idx->hyperrectangle, minmax_columns_types)
.can_be_true)
continue;

View File

@ -277,8 +277,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart(
/// This will generate unique name in scope of current server process.
Int64 temp_index = data.insert_increment.get();
IMergeTreeDataPart::MinMaxIndex minmax_idx;
minmax_idx.update(block, data.getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
auto minmax_idx = std::make_shared<IMergeTreeDataPart::MinMaxIndex>();
minmax_idx->update(block, data.getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
MergeTreePartition partition(std::move(block_with_partition.partition));
@ -286,8 +286,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart(
String part_name;
if (data.format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING)
{
DayNum min_date(minmax_idx.hyperrectangle[data.minmax_idx_date_column_pos].left.get<UInt64>());
DayNum max_date(minmax_idx.hyperrectangle[data.minmax_idx_date_column_pos].right.get<UInt64>());
DayNum min_date(minmax_idx->hyperrectangle[data.minmax_idx_date_column_pos].left.get<UInt64>());
DayNum max_date(minmax_idx->hyperrectangle[data.minmax_idx_date_column_pos].right.get<UInt64>());
const auto & date_lut = DateLUT::instance();

View File

@ -197,7 +197,7 @@ MergeTreeData::MutableDataPartsVector MergeTreeWriteAheadLog::restore(const Stor
{},
CompressionCodecFactory::instance().get("NONE", {}));
part->minmax_idx.update(block, storage.getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
part->minmax_idx->update(block, storage.getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
part->partition.create(metadata_snapshot, block, 0, context);
if (metadata_snapshot->hasSortingKey())
metadata_snapshot->getSortingKey().expression->execute(block);

View File

@ -7,7 +7,7 @@ namespace DB
/// Type of Merge. Used to control the number of different merges during merge
/// assignment. Also allows applying special logic during the merge process
/// (mergePartsToTemporaryPart). Stored in FutureMergedMutatedPart and
/// Stored in FutureMergedMutatedPart and
/// ReplicatedMergeTreeLogEntry.
///
/// Order is important, don't try to change it.

View File

@ -135,8 +135,8 @@ void MergedBlockOutputStream::finalizePartOnDisk(
if (storage.format_version >= MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING || isCompactPart(new_part))
{
new_part->partition.store(storage, volume->getDisk(), part_path, checksums);
if (new_part->minmax_idx.initialized)
new_part->minmax_idx.store(storage, volume->getDisk(), part_path, checksums);
if (new_part->minmax_idx->initialized)
new_part->minmax_idx->store(storage, volume->getDisk(), part_path, checksums);
else if (rows_count)
throw Exception("MinMax index was not initialized for new non-empty part " + new_part->name
+ ". It is a bug.", ErrorCodes::LOGICAL_ERROR);

View File

@ -60,4 +60,6 @@ private:
CompressionCodecPtr default_codec;
};
using MergedBlockOutputStreamPtr = std::shared_ptr<MergedBlockOutputStream>;
}

View File

@ -33,5 +33,7 @@ private:
Block header;
};
using MergedColumnOnlyOutputStreamPtr = std::shared_ptr<MergedColumnOnlyOutputStream>;
}

View File

@ -0,0 +1,138 @@
#include <Storages/MergeTree/MutateFromLogEntryTask.h>
#include <common/logger_useful.h>
#include <Common/ProfileEvents.h>
#include <Storages/StorageReplicatedMergeTree.h>
namespace ProfileEvents
{
extern const Event DataAfterMutationDiffersFromReplica;
extern const Event ReplicatedPartMutations;
}
namespace DB
{
std::pair<bool, ReplicatedMergeMutateTaskBase::PartLogWriter> MutateFromLogEntryTask::prepare()
{
const String & source_part_name = entry.source_parts.at(0);
const auto storage_settings_ptr = storage.getSettings();
LOG_TRACE(log, "Executing log entry to mutate part {} to {}", source_part_name, entry.new_part_name);
MergeTreeData::DataPartPtr source_part = storage.getActiveContainingPart(source_part_name);
if (!source_part)
{
LOG_DEBUG(log, "Source part {} for {} is not ready; will try to fetch it instead", source_part_name, entry.new_part_name);
return {false, {}};
}
if (source_part->name != source_part_name)
{
LOG_WARNING(log, "Part " + source_part_name + " is covered by " + source_part->name
+ " but should be mutated to " + entry.new_part_name + ". "
+ "Possibly the mutation of this part is not needed and will be skipped. This shouldn't happen often.");
return {false, {}};
}
/// TODO - some better heuristic?
size_t estimated_space_for_result = MergeTreeDataMergerMutator::estimateNeededDiskSpace({source_part});
if (entry.create_time + storage_settings_ptr->prefer_fetch_merged_part_time_threshold.totalSeconds() <= time(nullptr)
&& estimated_space_for_result >= storage_settings_ptr->prefer_fetch_merged_part_size_threshold)
{
/// If the entry is old enough and large enough, and some replica has the desired part,
/// then prefer fetching it from that replica.
String replica = storage.findReplicaHavingPart(entry.new_part_name, true); /// NOTE excessive ZK requests for same data later, may remove.
if (!replica.empty())
{
LOG_DEBUG(log, "Prefer to fetch {} from replica {}", entry.new_part_name, replica);
return {false, {}};
}
}
new_part_info = MergeTreePartInfo::fromPartName(entry.new_part_name, storage.format_version);
commands = MutationCommands::create(storage.queue.getMutationCommands(source_part, new_part_info.mutation));
/// Once we mutate a part, we must reserve space on the same disk, because mutations can create hardlinks.
/// Can throw an exception.
reserved_space = storage.reserveSpace(estimated_space_for_result, source_part->volume);
table_lock_holder = storage.lockForShare(
RWLockImpl::NO_QUERY, storage_settings_ptr->lock_acquire_timeout_for_background_operations);
StorageMetadataPtr metadata_snapshot = storage.getInMemoryMetadataPtr();
transaction_ptr = std::make_unique<MergeTreeData::Transaction>(storage);
future_mutated_part = std::make_shared<FutureMergedMutatedPart>();
future_mutated_part->name = entry.new_part_name;
future_mutated_part->uuid = entry.new_part_uuid;
future_mutated_part->parts.push_back(source_part);
future_mutated_part->part_info = new_part_info;
future_mutated_part->updatePath(storage, reserved_space.get());
future_mutated_part->type = source_part->getType();
merge_mutate_entry = storage.getContext()->getMergeList().insert(storage.getStorageID(), future_mutated_part);
stopwatch_ptr = std::make_unique<Stopwatch>();
mutate_task = storage.merger_mutator.mutatePartToTemporaryPart(
future_mutated_part, metadata_snapshot, commands, merge_mutate_entry.get(),
entry.create_time, storage.getContext(), reserved_space, table_lock_holder);
return {true, [this] (const ExecutionStatus & execution_status)
{
storage.writePartLog(
PartLogElement::MUTATE_PART, execution_status, stopwatch_ptr->elapsed(),
entry.new_part_name, new_part, future_mutated_part->parts, merge_mutate_entry.get());
}};
}
bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWriter write_part_log)
{
new_part = mutate_task->getFuture().get();
storage.renameTempPartAndReplace(new_part, nullptr, transaction_ptr.get());
try
{
storage.checkPartChecksumsAndCommit(*transaction_ptr, new_part);
}
catch (const Exception & e)
{
if (MergeTreeDataPartChecksums::isBadChecksumsErrorCode(e.code()))
{
transaction_ptr->rollback();
ProfileEvents::increment(ProfileEvents::DataAfterMutationDiffersFromReplica);
LOG_ERROR(log, "{}. Data after mutation is not byte-identical to data on another replicas. We will download merged part from replica to force byte-identical result.", getCurrentExceptionMessage(false));
write_part_log(ExecutionStatus::fromCurrentException());
if (storage.getSettings()->detach_not_byte_identical_parts)
storage.forgetPartAndMoveToDetached(std::move(new_part), "mutate-not-byte-identical");
else
storage.tryRemovePartImmediately(std::move(new_part));
/// No need to delete the part from ZK because we can be sure that the commit transaction
/// didn't go through.
return false;
}
throw;
}
/** With `ZSESSIONEXPIRED` or `ZOPERATIONTIMEOUT`, we can inadvertently roll back local changes to the parts.
* This is not a problem, because in this case the entry will remain in the queue, and we will try again.
*/
storage.merge_selecting_task->schedule();
ProfileEvents::increment(ProfileEvents::ReplicatedPartMutations);
write_part_log({});
return true;
}
}

View File

@ -0,0 +1,49 @@
#pragma once
#include <common/shared_ptr_helper.h>
#include <Storages/MergeTree/IExecutableTask.h>
#include <Storages/MergeTree/MutateTask.h>
#include <Storages/MergeTree/ReplicatedMergeMutateTaskBase.h>
#include <Storages/MergeTree/ReplicatedMergeTreeQueue.h>
#include <Storages/MergeTree/ReplicatedMergeTreeLogEntry.h>
namespace DB
{
class MutateFromLogEntryTask : public shared_ptr_helper<MutateFromLogEntryTask>, public ReplicatedMergeMutateTaskBase
{
public:
template <typename Callback>
MutateFromLogEntryTask(
ReplicatedMergeTreeQueue::SelectedEntryPtr selected_entry_,
StorageReplicatedMergeTree & storage_,
Callback && task_result_callback_)
: ReplicatedMergeMutateTaskBase(&Poco::Logger::get("MutateFromLogEntryTask"), storage_, selected_entry_, task_result_callback_) {}
private:
std::pair<bool, ReplicatedMergeMutateTaskBase::PartLogWriter> prepare() override;
bool finalize(ReplicatedMergeMutateTaskBase::PartLogWriter write_part_log) override;
bool executeInnerTask() override
{
return mutate_task->execute();
}
TableLockHolder table_lock_holder{nullptr};
ReservationSharedPtr reserved_space{nullptr};
MergeTreePartInfo new_part_info;
MutationCommandsConstPtr commands;
MergeTreeData::TransactionUniquePtr transaction_ptr{nullptr};
StopwatchUniquePtr stopwatch_ptr{nullptr};
MergeTreeData::MutableDataPartPtr new_part{nullptr};
FutureMergedMutatedPartPtr future_mutated_part{nullptr};
MutateTaskPtr mutate_task;
};
}

View File

@ -0,0 +1,106 @@
#include <Storages/MergeTree/MutatePlainMergeTreeTask.h>
#include <Storages/StorageMergeTree.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
StorageID MutatePlainMergeTreeTask::getStorageID()
{
return storage.getStorageID();
}
void MutatePlainMergeTreeTask::onCompleted()
{
bool delay = state == State::SUCCESS;
task_result_callback(delay);
}
void MutatePlainMergeTreeTask::prepare()
{
future_part = merge_mutate_entry->future_part;
merge_list_entry = storage.getContext()->getMergeList().insert(storage.getStorageID(), future_part);
stopwatch = std::make_unique<Stopwatch>();
write_part_log = [this] (const ExecutionStatus & execution_status)
{
storage.writePartLog(
PartLogElement::MUTATE_PART,
execution_status,
stopwatch->elapsed(),
future_part->name,
new_part,
future_part->parts,
merge_list_entry.get());
};
mutate_task = storage.merger_mutator.mutatePartToTemporaryPart(
future_part, metadata_snapshot, merge_mutate_entry->commands, merge_list_entry.get(),
time(nullptr), storage.getContext(), merge_mutate_entry->tagger->reserved_space, table_lock_holder);
}
bool MutatePlainMergeTreeTask::executeStep()
{
/// Make our memory tracker a parent of the current thread's memory tracker
MemoryTrackerThreadSwitcherPtr switcher;
if (merge_list_entry)
switcher = std::make_unique<MemoryTrackerThreadSwitcher>(&(*merge_list_entry)->memory_tracker);
switch (state)
{
case State::NEED_PREPARE :
{
prepare();
state = State::NEED_EXECUTE;
return true;
}
case State::NEED_EXECUTE :
{
try
{
if (mutate_task->execute())
return true;
new_part = mutate_task->getFuture().get();
storage.renameTempPartAndReplace(new_part);
storage.updateMutationEntriesErrors(future_part, true, "");
write_part_log({});
state = State::NEED_FINISH;
return true;
}
catch (...)
{
storage.updateMutationEntriesErrors(future_part, false, getCurrentExceptionMessage(false));
write_part_log(ExecutionStatus::fromCurrentException());
return false;
}
}
case State::NEED_FINISH :
{
// Nothing to do
state = State::SUCCESS;
return false;
}
case State::SUCCESS:
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "Task with state SUCCESS mustn't be executed again");
}
}
return false;
}
}

View File

@ -0,0 +1,81 @@
#pragma once
#include <functional>
#include <Core/Names.h>
#include <Storages/MergeTree/IExecutableTask.h>
#include <Storages/MergeTree/MutateTask.h>
#include <Storages/MutationCommands.h>
#include <Storages/MergeTree/MergeMutateSelectedEntry.h>
namespace DB
{
struct StorageInMemoryMetadata;
using StorageMetadataPtr = std::shared_ptr<const StorageInMemoryMetadata>;
struct FutureMergedMutatedPart;
using FutureMergedMutatedPartPtr = std::shared_ptr<FutureMergedMutatedPart>;
class StorageMergeTree;
class MutatePlainMergeTreeTask : public IExecutableTask
{
public:
template <class Callback>
MutatePlainMergeTreeTask(
StorageMergeTree & storage_,
StorageMetadataPtr metadata_snapshot_,
MergeMutateSelectedEntryPtr merge_mutate_entry_,
TableLockHolder & table_lock_holder_,
Callback && task_result_callback_)
: storage(storage_)
, metadata_snapshot(metadata_snapshot_)
, merge_mutate_entry(merge_mutate_entry_)
, table_lock_holder(table_lock_holder_)
, task_result_callback(task_result_callback_) {}
bool executeStep() override;
void onCompleted() override;
StorageID getStorageID() override;
private:
void prepare();
enum class State
{
NEED_PREPARE,
NEED_EXECUTE,
NEED_FINISH,
SUCCESS
};
State state{State::NEED_PREPARE};
StorageMergeTree & storage;
StorageMetadataPtr metadata_snapshot;
MergeMutateSelectedEntryPtr merge_mutate_entry{nullptr};
TableLockHolder & table_lock_holder;
FutureMergedMutatedPartPtr future_part{nullptr};
std::unique_ptr<Stopwatch> stopwatch;
MergeTreeData::MutableDataPartPtr new_part;
using MergeListEntryPtr = std::unique_ptr<MergeListEntry>;
MergeListEntryPtr merge_list_entry;
std::function<void(const ExecutionStatus & execution_status)> write_part_log;
IExecutableTask::TaskResultCallback task_result_callback;
MutateTaskPtr mutate_task;
};
}

File diff suppressed because it is too large

View File

@ -0,0 +1,72 @@
#pragma once
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/MergeProgress.h>
#include <Storages/MergeTree/FutureMergedMutatedPart.h>
#include <Storages/MergeTree/IMergedBlockOutputStream.h>
#include <Storages/MutationCommands.h>
#include <Interpreters/MutationsInterpreter.h>
namespace DB
{
class MutateTask;
using MutateTaskPtr = std::shared_ptr<MutateTask>;
class MergeTreeDataMergerMutator;
struct MutationContext;
class MutateTask
{
public:
MutateTask(
FutureMergedMutatedPartPtr future_part_,
StorageMetadataPtr metadata_snapshot_,
MutationCommandsConstPtr commands_,
MergeListEntry * mutate_entry_,
time_t time_of_mutation_,
ContextPtr context_,
ReservationSharedPtr space_reservation_,
TableLockHolder & table_lock_holder_,
MergeTreeData & data_,
MergeTreeDataMergerMutator & mutator_,
ActionBlocker & merges_blocker_);
bool execute();
std::future<MergeTreeData::MutableDataPartPtr> getFuture()
{
return promise.get_future();
}
private:
bool prepare();
enum class State
{
NEED_PREPARE,
NEED_EXECUTE
};
State state{State::NEED_PREPARE};
std::promise<MergeTreeData::MutableDataPartPtr> promise;
std::shared_ptr<MutationContext> ctx;
ExecutableTaskPtr task;
};
[[ maybe_unused]] static MergeTreeData::MutableDataPartPtr executeHere(MutateTaskPtr task)
{
while (task->execute()) {}
return task->getFuture().get();
}
}

View File

@ -0,0 +1,241 @@
#include <Storages/MergeTree/ReplicatedMergeMutateTaskBase.h>
#include <Storages/StorageReplicatedMergeTree.h>
#include <Storages/MergeTree/ReplicatedMergeTreeQueue.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NO_REPLICA_HAS_PART;
extern const int LOGICAL_ERROR;
extern const int ABORTED;
extern const int PART_IS_TEMPORARILY_LOCKED;
}
StorageID ReplicatedMergeMutateTaskBase::getStorageID()
{
return storage.getStorageID();
}
void ReplicatedMergeMutateTaskBase::onCompleted()
{
bool successfully_executed = state == State::SUCCESS;
task_result_callback(successfully_executed);
}
bool ReplicatedMergeMutateTaskBase::executeStep()
{
std::exception_ptr saved_exception;
try
{
/// We don't have any backoff for failed entries;
/// we just count the number of tries for each of them.
try
{
return executeImpl();
}
catch (const Exception & e)
{
if (e.code() == ErrorCodes::NO_REPLICA_HAS_PART)
{
/// If no one has the right part, probably not all replicas are working; we will not write to the log at Error level.
LOG_INFO(log, e.displayText());
}
else if (e.code() == ErrorCodes::ABORTED)
{
/// Interrupted merge or downloading a part is not an error.
LOG_INFO(log, e.message());
}
else if (e.code() == ErrorCodes::PART_IS_TEMPORARILY_LOCKED)
{
/// Part cannot be added temporarily
LOG_INFO(log, e.displayText());
storage.cleanup_thread.wakeup();
}
else
tryLogCurrentException(log, __PRETTY_FUNCTION__);
/** This exception will be written to the queue element, and it can be looked up using `system.replication_queue` table.
* The thread that performs this action will sleep a few seconds after the exception.
* See `queue.processEntry` function.
*/
throw;
}
catch (...)
{
tryLogCurrentException(log, __PRETTY_FUNCTION__);
throw;
}
}
catch (...)
{
saved_exception = std::current_exception();
}
if (saved_exception)
{
std::lock_guard lock(storage.queue.state_mutex);
auto & log_entry = selected_entry->log_entry;
log_entry->exception = saved_exception;
if (log_entry->type == ReplicatedMergeTreeLogEntryData::MUTATE_PART)
{
/// Record the exception in the system.mutations table.
Int64 result_data_version = MergeTreePartInfo::fromPartName(log_entry->new_part_name, storage.queue.format_version)
.getDataVersion();
auto source_part_info = MergeTreePartInfo::fromPartName(
log_entry->source_parts.at(0), storage.queue.format_version);
auto in_partition = storage.queue.mutations_by_partition.find(source_part_info.partition_id);
if (in_partition != storage.queue.mutations_by_partition.end())
{
auto mutations_begin_it = in_partition->second.upper_bound(source_part_info.getDataVersion());
auto mutations_end_it = in_partition->second.upper_bound(result_data_version);
for (auto it = mutations_begin_it; it != mutations_end_it; ++it)
{
ReplicatedMergeTreeQueue::MutationStatus & status = *it->second;
status.latest_failed_part = log_entry->source_parts.at(0);
status.latest_failed_part_info = source_part_info;
status.latest_fail_time = time(nullptr);
status.latest_fail_reason = getExceptionMessage(saved_exception, false);
}
}
}
}
return false;
}
bool ReplicatedMergeMutateTaskBase::executeImpl()
{
MemoryTrackerThreadSwitcherPtr switcher;
if (merge_mutate_entry)
switcher = std::make_unique<MemoryTrackerThreadSwitcher>(&(*merge_mutate_entry)->memory_tracker);
auto remove_processed_entry = [&] () -> bool
{
try
{
storage.queue.removeProcessedEntry(storage.getZooKeeper(), selected_entry->log_entry);
state = State::SUCCESS;
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
return false;
};
auto execute_fetch = [&] () -> bool
{
if (storage.executeFetch(entry))
return remove_processed_entry();
return false;
};
switch (state)
{
case State::NEED_PREPARE :
{
{
auto res = checkExistingPart();
/// Depending on the condition, there may be no need to execute a merge
if (res == CheckExistingPartResult::PART_EXISTS)
return remove_processed_entry();
}
bool res = false;
std::tie(res, part_log_writer) = prepare();
/// Avoid rescheduling; execute the fetch here, in the same thread.
if (!res)
return execute_fetch();
state = State::NEED_EXECUTE_INNER_MERGE;
return true;
}
case State::NEED_EXECUTE_INNER_MERGE :
{
try
{
if (!executeInnerTask())
{
state = State::NEED_FINALIZE;
return true;
}
}
catch (...)
{
if (part_log_writer)
part_log_writer(ExecutionStatus::fromCurrentException());
throw;
}
return true;
}
case State::NEED_FINALIZE :
{
try
{
if (!finalize(part_log_writer))
return execute_fetch();
}
catch (...)
{
if (part_log_writer)
part_log_writer(ExecutionStatus::fromCurrentException());
throw;
}
return remove_processed_entry();
}
case State::SUCCESS :
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "Do not call execute on previously succeeded task");
}
}
return false;
}
ReplicatedMergeMutateTaskBase::CheckExistingPartResult ReplicatedMergeMutateTaskBase::checkExistingPart()
{
/// If we already have this part or a part covering it, we do not need to do anything.
/// The part may still be in the PreCommitted -> Committed transition, so we first search
/// among PreCommitted parts to definitely find the desired part if it exists.
MergeTreeData::DataPartPtr existing_part = storage.getPartIfExists(entry.new_part_name, {MergeTreeDataPartState::PreCommitted});
if (!existing_part)
existing_part = storage.getActiveContainingPart(entry.new_part_name);
/// Even if the part is local, it (in exceptional cases) may not be in ZooKeeper. Let's check that it is there.
if (existing_part && storage.getZooKeeper()->exists(fs::path(storage.replica_path) / "parts" / existing_part->name))
{
LOG_DEBUG(log, "Skipping action for part {} because part {} already exists.", entry.new_part_name, existing_part->name);
/// We will exit from all the execution process
return CheckExistingPartResult::PART_EXISTS;
}
return CheckExistingPartResult::OK;
}
}

View File

@ -0,0 +1,80 @@
#pragma once
#include <common/logger_useful.h>
#include <Storages/MergeTree/IExecutableTask.h>
#include <Storages/MergeTree/ReplicatedMergeTreeQueue.h>
namespace DB
{
class StorageReplicatedMergeTree;
/**
* This is used as a base of MergeFromLogEntryTask and MutateFromLogEntryTask
*/
class ReplicatedMergeMutateTaskBase : public IExecutableTask
{
public:
template <class Callback>
ReplicatedMergeMutateTaskBase(
Poco::Logger * log_,
StorageReplicatedMergeTree & storage_,
ReplicatedMergeTreeQueue::SelectedEntryPtr & selected_entry_,
Callback && task_result_callback_)
: selected_entry(selected_entry_)
, entry(*selected_entry->log_entry)
, log(log_)
, storage(storage_)
/// This is needed to ask the assignee to assign a new merge/mutate operation.
/// It takes a bool argument; true means that the current task was executed successfully.
, task_result_callback(task_result_callback_) {}
~ReplicatedMergeMutateTaskBase() override = default;
void onCompleted() override;
StorageID getStorageID() override;
bool executeStep() override;
protected:
using PartLogWriter = std::function<void(const ExecutionStatus &)>;
virtual std::pair<bool, PartLogWriter> prepare() = 0;
virtual bool finalize(ReplicatedMergeMutateTaskBase::PartLogWriter write_part_log) = 0;
/// Will execute a part of inner MergeTask or MutateTask
virtual bool executeInnerTask() = 0;
/// It is important not to execute the same mutation in parallel.
/// selected_entry is a RAII class, so its lifetime must be the same as that of the whole task.
ReplicatedMergeTreeQueue::SelectedEntryPtr selected_entry;
ReplicatedMergeTreeLogEntry & entry;
MergeList::EntryPtr merge_mutate_entry{nullptr};
Poco::Logger * log;
StorageReplicatedMergeTree & storage;
private:
enum class CheckExistingPartResult
{
PART_EXISTS,
OK
};
CheckExistingPartResult checkExistingPart();
bool executeImpl();
enum class State
{
NEED_PREPARE,
NEED_EXECUTE_INNER_MERGE,
NEED_FINALIZE,
SUCCESS
};
PartLogWriter part_log_writer{};
State state{State::NEED_PREPARE};
IExecutableTask::TaskResultCallback task_result_callback;
};
}

View File

@ -58,7 +58,6 @@ public:
/// returns the name of the replica that should do the merge
/// when it is not the current replica
/// used in shouldExecuteLogEntry and in tryExecuteMerge
std::optional<String> pickReplicaToExecuteMerge(const ReplicatedMergeTreeLogEntryData & entry);
/// checks (in zookeeper) if the picked replica finished the merge

View File

@ -111,6 +111,18 @@ ReplicatedMergeTreePartCheckThread::MissingPartSearchResult ReplicatedMergeTreeP
bool found_part_with_the_same_max_block = false;
Strings replicas = zookeeper->getChildren(storage.zookeeper_path + "/replicas");
/// Move our replica to the end of replicas
for (auto it = replicas.begin(); it != replicas.end(); ++it)
{
String replica_path = storage.zookeeper_path + "/replicas/" + *it;
if (replica_path == storage.replica_path)
{
std::iter_swap(it, replicas.rbegin());
break;
}
}
/// Check all replicas; ours must be the last one checked
for (const String & replica : replicas)
{
String replica_path = storage.zookeeper_path + "/replicas/" + replica;
@ -146,7 +158,7 @@ ReplicatedMergeTreePartCheckThread::MissingPartSearchResult ReplicatedMergeTreeP
if (found_part_with_the_same_min_block && found_part_with_the_same_max_block)
{
/// FIXME It may never appear
LOG_WARNING(log, "Found parts with the same min block and with the same max block as the missing part {}. Hoping that it will eventually appear as a result of a merge.", part_name);
LOG_WARNING(log, "Found parts with the same min block and with the same max block as the missing part {} on replica {}. Hoping that it will eventually appear as a result of a merge.", part_name, replica);
return MissingPartSearchResult::FoundAndDontNeedFetch;
}
}
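The move-to-end loop above relies on std::iter_swap working across a forward and a reverse iterator; a self-contained illustration of the same idiom:

```cpp
#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> replicas{"replica1", "replica2", "replica3"};
    const std::string our_replica = "replica2";

    // Swap our replica with the last element so it is checked last.
    for (auto it = replicas.begin(); it != replicas.end(); ++it)
    {
        if (*it == our_replica)
        {
            std::iter_swap(it, replicas.rbegin());
            break;
        }
    }

    assert(replicas.back() == our_replica);
}
```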

View File

@ -55,14 +55,31 @@ void ReplicatedMergeTreeQueue::clear()
mutation_pointer.clear();
}
void ReplicatedMergeTreeQueue::initialize(const MergeTreeData::DataParts & parts)
void ReplicatedMergeTreeQueue::initialize(zkutil::ZooKeeperPtr zookeeper)
{
std::lock_guard lock(state_mutex);
for (const auto & part : parts)
LOG_TRACE(log, "Initializing parts in queue");
/// Get current parts state from zookeeper
Strings parts = zookeeper->getChildren(replica_path + "/parts");
for (const auto & part_name : parts)
{
current_parts.add(part->name, nullptr);
virtual_parts.add(part->name, nullptr);
LOG_TEST(log, "Adding part {} to current and virtual parts", part_name);
current_parts.add(part_name, nullptr);
virtual_parts.add(part_name, nullptr);
}
/// Drop parts can negatively affect virtual parts. So when we load parts
/// from zookeeper we can break the invariant with virtual parts. To fix this
/// we handle it here.
for (const LogEntryPtr & entry : queue)
{
if (entry->isDropPart(format_version))
virtual_parts.removePartAndCoveredParts(*entry->getDropRange(format_version));
}
LOG_TRACE(log, "Queue initialized");
}
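To make the comment about drop parts concrete, here is a simplified model of the invariant `virtual_parts` = `current_parts` + `queue` and of the pruning step that restores it after loading parts from ZooKeeper (types and names are illustrative):

```cpp
#include <set>
#include <string>
#include <vector>

// A queued DROP PART means the dropped part can never exist again, so it
// must not stay in virtual_parts after they were seeded from ZooKeeper.
void pruneDroppedParts(std::set<std::string> & virtual_parts,
                       const std::vector<std::string> & queued_drop_parts)
{
    for (const auto & dropped : queued_drop_parts)
        virtual_parts.erase(dropped);  // removePartAndCoveredParts() in the real queue
}
```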
bool ReplicatedMergeTreeQueue::isVirtualPart(const MergeTreeData::DataPartPtr & data_part) const
@ -163,7 +180,11 @@ void ReplicatedMergeTreeQueue::insertUnlocked(
const LogEntryPtr & entry, std::optional<time_t> & min_unprocessed_insert_time_changed,
std::lock_guard<std::mutex> & state_lock)
{
for (const String & virtual_part_name : entry->getVirtualPartNames(format_version))
auto entry_virtual_parts = entry->getVirtualPartNames(format_version);
LOG_TEST(log, "Insert entry {} to queue with type {} with virtual parts [{}]", entry->znode_name, entry->typeToString(), fmt::join(entry_virtual_parts, ", "));
for (const String & virtual_part_name : entry_virtual_parts)
{
virtual_parts.add(virtual_part_name, nullptr);
/// Don't add drop range parts to mutations
@ -227,6 +248,11 @@ void ReplicatedMergeTreeQueue::updateStateOnQueueEntryRemoval(
std::optional<time_t> & max_processed_insert_time_changed,
std::unique_lock<std::mutex> & state_lock)
{
auto entry_virtual_parts = entry->getVirtualPartNames(format_version);
LOG_TEST(log, "Removing {} entry {} from queue with type {} with virtual parts [{}]",
is_successful ? "successful" : "unsuccessful",
entry->znode_name, entry->typeToString(), fmt::join(entry_virtual_parts, ", "));
/// Update insert times.
if (entry->type == LogEntry::GET_PART || entry->type == LogEntry::ATTACH_PART)
{
@ -254,6 +280,7 @@ void ReplicatedMergeTreeQueue::updateStateOnQueueEntryRemoval(
{
if (!entry->actual_new_part_name.empty())
{
LOG_TEST(log, "Entry {} has actual new part name {}, removing it from mutations", entry->znode_name, entry->actual_new_part_name);
/// We don't add the bigger fetched part to current_parts because we
/// have an invariant `virtual_parts` = `current_parts` + `queue`.
///
@ -264,7 +291,9 @@ void ReplicatedMergeTreeQueue::updateStateOnQueueEntryRemoval(
removeCoveredPartsFromMutations(entry->actual_new_part_name, /*remove_part = */ false, /*remove_covered_parts = */ true);
}
for (const String & virtual_part_name : entry->getVirtualPartNames(format_version))
LOG_TEST(log, "Adding parts [{}] to current parts", fmt::join(entry_virtual_parts, ", "));
for (const String & virtual_part_name : entry_virtual_parts)
{
current_parts.add(virtual_part_name, nullptr);
@ -275,14 +304,21 @@ void ReplicatedMergeTreeQueue::updateStateOnQueueEntryRemoval(
if (auto drop_range_part_name = entry->getDropRange(format_version))
{
MergeTreePartInfo drop_range_info = MergeTreePartInfo::fromPartName(*drop_range_part_name, format_version);
/// DROP PART doesn't have virtual parts, so remove all covered parts
/// from current parts.
if (entry->isDropPart(format_version))
{
LOG_TEST(log, "Removing drop part from current and virtual parts {}", *drop_range_part_name);
current_parts.removePartAndCoveredParts(*drop_range_part_name);
}
else
{
LOG_TEST(log, "Removing drop range from current and virtual parts {}", *drop_range_part_name);
current_parts.remove(*drop_range_part_name);
}
virtual_parts.remove(*drop_range_part_name);
@ -307,7 +343,9 @@ void ReplicatedMergeTreeQueue::updateStateOnQueueEntryRemoval(
drop_ranges.removeDropRange(entry);
}
for (const String & virtual_part_name : entry->getVirtualPartNames(format_version))
LOG_TEST(log, "Removing unsuccessful entry {} virtual parts [{}]", entry->znode_name, fmt::join(entry_virtual_parts, ", "));
for (const String & virtual_part_name : entry_virtual_parts)
{
/// This part will never appear, so remove it from virtual parts
virtual_parts.remove(virtual_part_name);
@ -324,6 +362,9 @@ void ReplicatedMergeTreeQueue::updateStateOnQueueEntryRemoval(
void ReplicatedMergeTreeQueue::removeCoveredPartsFromMutations(const String & part_name, bool remove_part, bool remove_covered_parts)
{
auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version);
LOG_TEST(log, "Removing part {} from mutations (remove_part: {}, remove_covered_parts: {})", part_name, remove_part, remove_covered_parts);
auto in_partition = mutations_by_partition.find(part_info.partition_id);
if (in_partition == mutations_by_partition.end())
return;
@ -361,11 +402,17 @@ void ReplicatedMergeTreeQueue::removeCoveredPartsFromMutations(const String & pa
void ReplicatedMergeTreeQueue::addPartToMutations(const String & part_name)
{
LOG_TEST(log, "Adding part {} to mutations", part_name);
auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version);
/// Do not add special virtual parts to parts_to_do
if (part_info.isFakeDropRangePart())
{
LOG_TEST(log, "Part {} is fake drop range part, will not add it to mutations", part_name);
return;
}
auto in_partition = mutations_by_partition.find(part_info.partition_id);
if (in_partition == mutations_by_partition.end())
@ -1444,33 +1491,7 @@ bool ReplicatedMergeTreeQueue::processEntry(
if (saved_exception)
{
std::lock_guard lock(state_mutex);
entry->exception = saved_exception;
if (entry->type == ReplicatedMergeTreeLogEntryData::MUTATE_PART)
{
/// Record the exception in the system.mutations table.
Int64 result_data_version = MergeTreePartInfo::fromPartName(entry->new_part_name, format_version)
.getDataVersion();
auto source_part_info = MergeTreePartInfo::fromPartName(
entry->source_parts.at(0), format_version);
auto in_partition = mutations_by_partition.find(source_part_info.partition_id);
if (in_partition != mutations_by_partition.end())
{
auto mutations_begin_it = in_partition->second.upper_bound(source_part_info.getDataVersion());
auto mutations_end_it = in_partition->second.upper_bound(result_data_version);
for (auto it = mutations_begin_it; it != mutations_end_it; ++it)
{
MutationStatus & status = *it->second;
status.latest_failed_part = entry->source_parts.at(0);
status.latest_failed_part_info = source_part_info;
status.latest_fail_time = time(nullptr);
status.latest_fail_reason = getExceptionMessage(saved_exception, false);
}
}
}
return false;
}
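The block removed above selected the affected mutations with two std::map::upper_bound calls: every mutation whose version v satisfies source_version < v <= result_version gets its latest_fail_* fields updated. A standalone sketch of that range selection (simplified types):

```cpp
#include <cstdint>
#include <map>
#include <string>
#include <vector>

// Collect mutation versions v with source_version < v <= result_version.
std::vector<int64_t> affectedMutationVersions(
    const std::map<int64_t, std::string> & mutations_by_version,
    int64_t source_version,
    int64_t result_version)
{
    std::vector<int64_t> affected;
    auto begin = mutations_by_version.upper_bound(source_version);  // first v > source
    auto end = mutations_by_version.upper_bound(result_version);    // first v > result
    for (auto it = begin; it != end; ++it)
        affected.push_back(it->first);
    return affected;
}
```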

View File

@ -31,6 +31,8 @@ class ReplicatedMergeTreeQueue
private:
friend class CurrentlyExecuting;
friend class ReplicatedMergeTreeMergePredicate;
friend class MergeFromLogEntryTask;
friend class ReplicatedMergeMutateTaskBase;
using LogEntry = ReplicatedMergeTreeLogEntry;
using LogEntryPtr = LogEntry::Ptr;
@ -277,8 +279,8 @@ public:
/// Clears queue state
void clear();
/// Put a set of (already existing) parts in virtual_parts.
void initialize(const MergeTreeData::DataParts & parts);
/// Get the set of parts from zookeeper
void initialize(zkutil::ZooKeeperPtr zookeeper);
/** Inserts an action to the end of the queue.
* To restore broken parts during operation.

View File

@ -174,6 +174,8 @@ bool ReplicatedMergeTreeRestartingThread::tryStartup()
try
{
storage.queue.initialize(zookeeper);
storage.queue.load(zookeeper);
/// pullLogsToQueue() after we mark replica 'is_active' (and after we repair if it was lost);

View File

@ -1,14 +1,16 @@
#pragma once
#include <optional>
#include <vector>
#include <memory>
#include <unordered_map>
#include <common/shared_ptr_helper.h>
#include <Parsers/ASTAlterQuery.h>
#include <Storages/IStorage_fwd.h>
#include <DataTypes/IDataType.h>
#include <Core/Names.h>
#include <optional>
#include <unordered_map>
namespace DB
{
@ -68,7 +70,7 @@ struct MutationCommand
};
/// Multiple mutation commands, possible from different ALTER queries
class MutationCommands : public std::vector<MutationCommand>
class MutationCommands : public shared_ptr_helper<MutationCommands>, public std::vector<MutationCommand>
{
public:
std::shared_ptr<ASTExpressionList> ast() const;
@ -77,4 +79,6 @@ public:
void readText(ReadBuffer & in);
};
using MutationCommandsConstPtr = std::shared_ptr<MutationCommands>;
}
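Mixing in shared_ptr_helper gives MutationCommands a static create() used as `MutationCommands::create()` elsewhere in this commit; below is a minimal stand-in for the helper that shows the pattern (a sketch, not the contents of common/shared_ptr_helper.h):

```cpp
#include <memory>
#include <utility>
#include <vector>

// Simplified shared_ptr_helper: T::create(args...) forwards to make_shared.
template <typename T>
struct shared_ptr_helper_sketch
{
    template <typename... Args>
    static std::shared_ptr<T> create(Args &&... args)
    {
        return std::make_shared<T>(std::forward<Args>(args)...);
    }
};

struct CommandsSketch : shared_ptr_helper_sketch<CommandsSketch>, std::vector<int>
{
};

int main()
{
    auto commands = CommandsSketch::create();  // replaces explicit make_shared calls
    commands->push_back(42);
}
```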

View File

@ -0,0 +1,125 @@
#include "RabbitMQConnection.h"
#include <common/logger_useful.h>
#include <IO/WriteHelpers.h>
namespace DB
{
static const auto CONNECT_SLEEP = 200;
static const auto RETRIES_MAX = 20;
RabbitMQConnection::RabbitMQConnection(const RabbitMQConfiguration & configuration_, Poco::Logger * log_)
: configuration(configuration_)
, log(log_)
, event_handler(loop.getLoop(), log)
{
}
String RabbitMQConnection::connectionInfoForLog() const
{
return configuration.host + ':' + toString(configuration.port);
}
bool RabbitMQConnection::isConnected()
{
std::lock_guard lock(mutex);
return isConnectedImpl();
}
bool RabbitMQConnection::connect()
{
std::lock_guard lock(mutex);
connectImpl();
return isConnectedImpl();
}
bool RabbitMQConnection::reconnect()
{
std::lock_guard lock(mutex);
if (isConnectedImpl())
return true;
disconnectImpl();
/// This will force immediate closure if not yet closed
if (!connection->closed())
connection->close(true);
LOG_DEBUG(log, "Trying to restore connection to {}", connectionInfoForLog());
connectImpl();
return isConnectedImpl();
}
ChannelPtr RabbitMQConnection::createChannel()
{
std::lock_guard lock(mutex);
return std::make_unique<AMQP::TcpChannel>(connection.get());
}
void RabbitMQConnection::disconnect(bool immediately)
{
std::lock_guard lock(mutex);
disconnectImpl(immediately);
}
void RabbitMQConnection::heartbeat()
{
std::lock_guard lock(mutex);
connection->heartbeat();
}
bool RabbitMQConnection::closed()
{
std::lock_guard lock(mutex);
return connection->closed();
}
bool RabbitMQConnection::isConnectedImpl() const
{
return event_handler.connectionRunning() && connection->usable();
}
void RabbitMQConnection::connectImpl()
{
if (configuration.connection_string.empty())
{
LOG_DEBUG(log, "Connecting to: {}:{} (user: {})", configuration.host, configuration.port, configuration.username);
AMQP::Login login(configuration.username, configuration.password);
AMQP::Address address(configuration.host, configuration.port, login, configuration.vhost, configuration.secure);
connection = std::make_unique<AMQP::TcpConnection>(&event_handler, address);
}
else
{
AMQP::Address address(configuration.connection_string);
connection = std::make_unique<AMQP::TcpConnection>(&event_handler, address);
}
auto cnt_retries = 0;
while (true)
{
event_handler.iterateLoop();
if (connection->ready() || cnt_retries++ == RETRIES_MAX)
break;
std::this_thread::sleep_for(std::chrono::milliseconds(CONNECT_SLEEP));
}
}
void RabbitMQConnection::disconnectImpl(bool immediately)
{
connection->close(immediately);
/** Connection is not closed immediately (first, all pending operations are completed, and then
  * an AMQP closing-handshake is performed). But a new connection cannot be opened until the
  * previous one is properly closed.
  */
size_t cnt_retries = 0;
while (!connection->closed() && cnt_retries++ != RETRIES_MAX)
event_handler.iterateLoop();
}
}
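Both connectImpl() and disconnectImpl() use the same bounded polling idiom: pump the event loop, re-check the condition, and give up after RETRIES_MAX iterations instead of blocking forever. A generic, self-contained version of that idiom (not tied to AMQP-CPP):

```cpp
#include <chrono>
#include <cstddef>
#include <functional>
#include <thread>

static const auto POLL_SLEEP = std::chrono::milliseconds(200);  // mirrors CONNECT_SLEEP
static const std::size_t POLL_RETRIES_MAX = 20;                 // mirrors RETRIES_MAX

// Pump `iterate` until `done` holds or the retry budget is exhausted.
bool waitWithRetries(const std::function<void()> & iterate,
                     const std::function<bool()> & done)
{
    std::size_t cnt_retries = 0;
    while (!done() && cnt_retries++ != POLL_RETRIES_MAX)
    {
        iterate();                                // one turn of the event loop
        std::this_thread::sleep_for(POLL_SLEEP);
    }
    return done();
}
```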

View File

@ -0,0 +1,65 @@
#pragma once
#include <Storages/RabbitMQ/UVLoop.h>
#include <Storages/RabbitMQ/RabbitMQHandler.h>
namespace DB
{
struct RabbitMQConfiguration
{
String host;
UInt16 port;
String username;
String password;
String vhost;
bool secure;
String connection_string;
};
class RabbitMQConnection
{
public:
RabbitMQConnection(const RabbitMQConfiguration & configuration_, Poco::Logger * log_);
bool isConnected();
bool connect();
bool reconnect();
void disconnect(bool immediately = false);
void heartbeat();
bool closed();
ChannelPtr createChannel();
/// RabbitMQHandler is thread safe. Any public method can be called concurrently.
RabbitMQHandler & getHandler() { return event_handler; }
String connectionInfoForLog() const;
private:
bool isConnectedImpl() const;
void connectImpl();
void disconnectImpl(bool immediately = false);
RabbitMQConfiguration configuration;
Poco::Logger * log;
UVLoop loop;
RabbitMQHandler event_handler;
std::unique_ptr<AMQP::TcpConnection> connection;
std::mutex mutex;
};
using RabbitMQConnectionPtr = std::unique_ptr<RabbitMQConnection>;
}
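A hypothetical caller of the class declared above, exercising the intended lifecycle; this is a usage sketch written against the declared API, not code from the repository:

```cpp
#include <Storages/RabbitMQ/RabbitMQConnection.h>

using namespace DB;

// Illustrative lifecycle: connect, create a channel, recover, disconnect.
void rabbitConnectionExample(Poco::Logger * log)
{
    RabbitMQConfiguration configuration{
        .host = "localhost", .port = 5672,
        .username = "guest", .password = "guest",
        .vhost = "/", .secure = false, .connection_string = ""};

    RabbitMQConnection connection(configuration, log);
    if (!connection.connect())
        return;  // caller decides whether to throw or schedule a retry

    ChannelPtr channel = connection.createChannel();  // one channel per consumer/producer
    // ... declare exchanges and queues on *channel ...

    if (!connection.isConnected())
        connection.reconnect();  // all public methods lock internally

    connection.disconnect();     // graceful close; disconnect(true) forces it
}
```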

View File

@ -17,6 +17,7 @@ namespace Loop
static const UInt8 STOP = 2;
}
using ChannelPtr = std::unique_ptr<AMQP::TcpChannel>;
class RabbitMQHandler : public AMQP::LibUvHandler
{
@ -40,8 +41,8 @@ public:
void stopLoop();
bool connectionRunning() { return connection_running.load(); }
bool loopRunning() { return loop_running.load(); }
bool connectionRunning() const { return connection_running.load(); }
bool loopRunning() const { return loop_running.load(); }
void updateLoopState(UInt8 state) { loop_state.store(state); }
UInt8 getLoopState() { return loop_state.load(); }
@ -55,4 +56,6 @@ private:
std::mutex startup_mutex;
};
using RabbitMQHandlerPtr = std::shared_ptr<RabbitMQHandler>;
}

View File

@ -16,7 +16,7 @@ namespace DB
ReadBufferFromRabbitMQConsumer::ReadBufferFromRabbitMQConsumer(
ChannelPtr consumer_channel_,
HandlerPtr event_handler_,
RabbitMQHandler & event_handler_,
std::vector<String> & queues_,
size_t channel_id_base_,
const String & channel_base_,
@ -35,8 +35,6 @@ ReadBufferFromRabbitMQConsumer::ReadBufferFromRabbitMQConsumer(
, stopped(stopped_)
, received(queue_size_)
{
if (consumer_channel)
setupChannel();
}
@ -122,6 +120,12 @@ void ReadBufferFromRabbitMQConsumer::updateAckTracker(AckTracker record_info)
void ReadBufferFromRabbitMQConsumer::setupChannel()
{
if (!consumer_channel)
return;
/// We mark initialized only once.
initialized = true;
wait_subscription.store(true);
consumer_channel->onReady([&]()
@ -159,7 +163,7 @@ bool ReadBufferFromRabbitMQConsumer::needChannelUpdate()
void ReadBufferFromRabbitMQConsumer::iterateEventLoop()
{
event_handler->iterateLoop();
event_handler.iterateLoop();
}

View File

@ -15,16 +15,13 @@ namespace Poco
namespace DB
{
using ChannelPtr = std::shared_ptr<AMQP::TcpChannel>;
using HandlerPtr = std::shared_ptr<RabbitMQHandler>;
class ReadBufferFromRabbitMQConsumer : public ReadBuffer
{
public:
ReadBufferFromRabbitMQConsumer(
ChannelPtr consumer_channel_,
HandlerPtr event_handler_,
RabbitMQHandler & event_handler_,
std::vector<String> & queues_,
size_t channel_id_base_,
const String & channel_base_,
@ -78,6 +75,12 @@ public:
auto getMessageID() const { return current.message_id; }
auto getTimestamp() const { return current.timestamp; }
void initialize()
{
if (!initialized)
setupChannel();
}
private:
bool nextImpl() override;
@ -85,7 +88,7 @@ private:
void iterateEventLoop();
ChannelPtr consumer_channel;
HandlerPtr event_handler;
RabbitMQHandler & event_handler; /// Used concurrently, but is thread safe.
std::vector<String> queues;
const String channel_base;
const size_t channel_id_base;
@ -102,6 +105,9 @@ private:
AckTracker last_inserted_record_info;
UInt64 prev_tag = 0, channel_id_counter = 0;
/// Has the initial setup (deferred from the constructor) been performed?
bool initialized = false;
};
}
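The initialize()/initialized pair is a plain one-shot lazy-initialization guard; a self-contained model of the same pattern:

```cpp
// One-shot setup: initialize() may be called many times (e.g. from both
// startup() and initializeBuffers()), but the channel is set up only once.
struct LazyChannelSketch
{
    void initialize()
    {
        if (!initialized)
            setupChannel();
    }

private:
    void setupChannel()
    {
        initialized = true;  // mark first, as the real buffer does
        // ... install onReady/onError callbacks, subscribe to queues ...
    }

    bool initialized = false;
};
```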

View File

@ -37,8 +37,6 @@
namespace DB
{
static const auto CONNECT_SLEEP = 200;
static const auto RETRIES_MAX = 20;
static const uint32_t QUEUE_SIZE = 100000;
static const auto MAX_FAILED_READ_ATTEMPTS = 10;
static const auto RESCHEDULE_MS = 500;
@ -73,7 +71,8 @@ StorageRabbitMQ::StorageRabbitMQ(
const StorageID & table_id_,
ContextPtr context_,
const ColumnsDescription & columns_,
std::unique_ptr<RabbitMQSettings> rabbitmq_settings_)
std::unique_ptr<RabbitMQSettings> rabbitmq_settings_,
bool is_attach_)
: IStorage(table_id_)
, WithContext(context_->getGlobalContext())
, rabbitmq_settings(std::move(rabbitmq_settings_))
@ -91,23 +90,26 @@ StorageRabbitMQ::StorageRabbitMQ(
, use_user_setup(rabbitmq_settings->rabbitmq_queue_consume.value)
, hash_exchange(num_consumers > 1 || num_queues > 1)
, log(&Poco::Logger::get("StorageRabbitMQ (" + table_id_.table_name + ")"))
, address(getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_host_port))
, parsed_address(parseAddress(address, 5672))
, login_password(std::make_pair(
getContext()->getConfigRef().getString("rabbitmq.username"),
getContext()->getConfigRef().getString("rabbitmq.password")))
, vhost(getContext()->getConfigRef().getString("rabbitmq.vhost", getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_vhost)))
, connection_string(getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_address))
, secure(rabbitmq_settings->rabbitmq_secure.value)
, semaphore(0, num_consumers)
, unique_strbase(getRandomName())
, queue_size(std::max(QUEUE_SIZE, static_cast<uint32_t>(getMaxBlockSize())))
, milliseconds_to_wait(RESCHEDULE_MS)
, is_attach(is_attach_)
{
event_handler = std::make_shared<RabbitMQHandler>(loop.getLoop(), log);
if (secure)
auto parsed_address = parseAddress(getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_host_port), 5672);
configuration =
{
.host = parsed_address.first,
.port = parsed_address.second,
.username = getContext()->getConfigRef().getString("rabbitmq.username"),
.password = getContext()->getConfigRef().getString("rabbitmq.password"),
.vhost = getContext()->getConfigRef().getString("rabbitmq.vhost", getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_vhost)),
.secure = rabbitmq_settings->rabbitmq_secure.value,
.connection_string = getContext()->getMacros()->expand(rabbitmq_settings->rabbitmq_address)
};
if (configuration.secure)
SSL_library_init();
restoreConnection(false);
StorageInMemoryMetadata storage_metadata;
storage_metadata.setColumns(columns_);
@ -116,17 +118,6 @@ StorageRabbitMQ::StorageRabbitMQ(
rabbitmq_context = addSettings(getContext());
rabbitmq_context->makeQueryContext();
/// One looping task for all consumers as they share the same connection == the same handler == the same event loop
event_handler->updateLoopState(Loop::STOP);
looping_task = getContext()->getMessageBrokerSchedulePool().createTask("RabbitMQLoopingTask", [this]{ loopingFunc(); });
looping_task->deactivate();
streaming_task = getContext()->getMessageBrokerSchedulePool().createTask("RabbitMQStreamingTask", [this]{ streamingToViewsFunc(); });
streaming_task->deactivate();
connection_task = getContext()->getMessageBrokerSchedulePool().createTask("RabbitMQConnectionTask", [this]{ connectionFunc(); });
connection_task->deactivate();
if (queue_base.empty())
{
/* Make sure that local exchange name is unique for each table and is not the same as client's exchange name. It also needs to
@ -151,6 +142,31 @@ StorageRabbitMQ::StorageRabbitMQ(
}
bridge_exchange = sharding_exchange + "_bridge";
try
{
connection = std::make_unique<RabbitMQConnection>(configuration, log);
if (connection->connect())
initRabbitMQ();
else if (!is_attach)
throw Exception(ErrorCodes::CANNOT_CONNECT_RABBITMQ, "Cannot connect to {}", connection->connectionInfoForLog());
}
catch (...)
{
tryLogCurrentException(log);
if (!is_attach)
throw;
}
/// One looping task for all consumers as they share the same connection == the same handler == the same event loop
looping_task = getContext()->getMessageBrokerSchedulePool().createTask("RabbitMQLoopingTask", [this]{ loopingFunc(); });
looping_task->deactivate();
streaming_task = getContext()->getMessageBrokerSchedulePool().createTask("RabbitMQStreamingTask", [this]{ streamingToViewsFunc(); });
streaming_task->deactivate();
connection_task = getContext()->getMessageBrokerSchedulePool().createTask("RabbitMQConnectionTask", [this]{ connectionFunc(); });
connection_task->deactivate();
}
@ -222,14 +238,19 @@ ContextMutablePtr StorageRabbitMQ::addSettings(ContextPtr local_context) const
void StorageRabbitMQ::loopingFunc()
{
if (event_handler->connectionRunning())
event_handler->startLoop();
if (!rabbit_is_ready)
return;
if (connection->isConnected())
connection->getHandler().startLoop();
}
void StorageRabbitMQ::connectionFunc()
{
if (restoreConnection(true))
if (rabbit_is_ready)
return;
if (connection->reconnect())
initRabbitMQ();
else
connection_task->scheduleAfter(RESCHEDULE_MS);
@ -242,7 +263,9 @@ void StorageRabbitMQ::connectionFunc()
void StorageRabbitMQ::deactivateTask(BackgroundSchedulePool::TaskHolder & task, bool wait, bool stop_loop)
{
if (stop_loop)
event_handler->updateLoopState(Loop::STOP);
{
connection->getHandler().updateLoopState(Loop::STOP);
}
std::unique_lock<std::mutex> lock(task_mutex, std::defer_lock);
if (lock.try_lock())
@ -268,7 +291,7 @@ size_t StorageRabbitMQ::getMaxBlockSize() const
void StorageRabbitMQ::initRabbitMQ()
{
if (stream_cancelled)
if (stream_cancelled || rabbit_is_ready)
return;
if (use_user_setup)
@ -278,19 +301,28 @@ void StorageRabbitMQ::initRabbitMQ()
return;
}
AMQP::TcpChannel rabbit_channel(connection.get());
try
{
auto rabbit_channel = connection->createChannel();
/// Main exchange -> Bridge exchange -> ( Sharding exchange ) -> Queues -> Consumers
initExchange(rabbit_channel);
bindExchange(rabbit_channel);
initExchange(*rabbit_channel);
bindExchange(*rabbit_channel);
for (const auto i : collections::range(0, num_queues))
bindQueue(i + 1, rabbit_channel);
for (const auto i : collections::range(0, num_queues))
bindQueue(i + 1, *rabbit_channel);
LOG_TRACE(log, "RabbitMQ setup completed");
rabbit_is_ready = true;
rabbit_channel.close();
LOG_TRACE(log, "RabbitMQ setup completed");
rabbit_is_ready = true;
rabbit_channel->close();
}
catch (...)
{
tryLogCurrentException(log);
if (!is_attach)
throw;
}
}
@ -380,7 +412,7 @@ void StorageRabbitMQ::bindExchange(AMQP::TcpChannel & rabbit_channel)
}
rabbit_channel.bindExchange(exchange_name, bridge_exchange, routing_keys[0], bind_headers)
.onSuccess([&]() { event_handler->stopLoop(); })
.onSuccess([&]() { connection->getHandler().stopLoop(); })
.onError([&](const char * message)
{
throw Exception(
@ -392,7 +424,7 @@ void StorageRabbitMQ::bindExchange(AMQP::TcpChannel & rabbit_channel)
else if (exchange_type == AMQP::ExchangeType::fanout || exchange_type == AMQP::ExchangeType::consistent_hash)
{
rabbit_channel.bindExchange(exchange_name, bridge_exchange, routing_keys[0])
.onSuccess([&]() { event_handler->stopLoop(); })
.onSuccess([&]() { connection->getHandler().stopLoop(); })
.onError([&](const char * message)
{
throw Exception(
@ -410,7 +442,7 @@ void StorageRabbitMQ::bindExchange(AMQP::TcpChannel & rabbit_channel)
{
++bound_keys;
if (bound_keys == routing_keys.size())
event_handler->stopLoop();
connection->getHandler().stopLoop();
})
.onError([&](const char * message)
{
@ -422,7 +454,7 @@ void StorageRabbitMQ::bindExchange(AMQP::TcpChannel & rabbit_channel)
}
}
event_handler->startBlockingLoop();
connection->getHandler().startBlockingLoop();
}
@ -441,7 +473,7 @@ void StorageRabbitMQ::bindQueue(size_t queue_id, AMQP::TcpChannel & rabbit_chann
* fanout exchange it can be arbitrary
*/
rabbit_channel.bindQueue(consumer_exchange, queue_name, std::to_string(queue_id))
.onSuccess([&] { event_handler->stopLoop(); })
.onSuccess([&] { connection->getHandler().stopLoop(); })
.onError([&](const char * message)
{
throw Exception(
@ -507,57 +539,22 @@ void StorageRabbitMQ::bindQueue(size_t queue_id, AMQP::TcpChannel & rabbit_chann
/// AMQP::autodelete setting is not allowed, because in case of server restart there will be no consumers
/// and deleting queues should not take place.
rabbit_channel.declareQueue(queue_name, AMQP::durable, queue_settings).onSuccess(success_callback).onError(error_callback);
event_handler->startBlockingLoop();
}
bool StorageRabbitMQ::restoreConnection(bool reconnecting)
{
size_t cnt_retries = 0;
if (reconnecting)
{
connection->close(); /// Connection might be unusable, but not closed
/* Connection is not closed immediately (first, all pending operations are completed, and then
 * an AMQP closing-handshake is performed). But a new connection cannot be opened until the
 * previous one is properly closed.
 */
while (!connection->closed() && cnt_retries++ != RETRIES_MAX)
event_handler->iterateLoop();
/// This will force immediate closure if not yet closed
if (!connection->closed())
connection->close(true);
LOG_TRACE(log, "Trying to restore connection to " + address);
}
auto amqp_address = connection_string.empty() ? AMQP::Address(parsed_address.first, parsed_address.second,
AMQP::Login(login_password.first, login_password.second), vhost, secure)
: AMQP::Address(connection_string);
connection = std::make_unique<AMQP::TcpConnection>(event_handler.get(), amqp_address);
cnt_retries = 0;
while (!connection->ready() && !stream_cancelled && cnt_retries++ != RETRIES_MAX)
{
event_handler->iterateLoop();
std::this_thread::sleep_for(std::chrono::milliseconds(CONNECT_SLEEP));
}
return event_handler->connectionRunning();
connection->getHandler().startBlockingLoop();
}
bool StorageRabbitMQ::updateChannel(ChannelPtr & channel)
{
if (event_handler->connectionRunning())
try
{
channel = std::make_shared<AMQP::TcpChannel>(connection.get());
return true;
channel = connection->createChannel();
return channel->usable();
}
catch (...)
{
tryLogCurrentException(log);
return false;
}
channel = nullptr;
return false;
}
@ -576,11 +573,11 @@ void StorageRabbitMQ::unbindExchange()
std::call_once(flag, [&]()
{
streaming_task->deactivate();
event_handler->updateLoopState(Loop::STOP);
connection->getHandler().updateLoopState(Loop::STOP);
looping_task->deactivate();
AMQP::TcpChannel rabbit_channel(connection.get());
rabbit_channel.removeExchange(bridge_exchange)
auto rabbit_channel = connection->createChannel();
rabbit_channel->removeExchange(bridge_exchange)
.onSuccess([&]()
{
exchange_removed.store(true);
@ -592,9 +589,9 @@ void StorageRabbitMQ::unbindExchange()
while (!exchange_removed.load())
{
event_handler->iterateLoop();
connection->getHandler().iterateLoop();
}
rabbit_channel.close();
rabbit_channel->close();
});
}
@ -618,13 +615,16 @@ Pipe StorageRabbitMQ::read(
auto modified_context = addSettings(local_context);
auto block_size = getMaxBlockSize();
if (!event_handler->connectionRunning())
if (!connection->isConnected())
{
if (event_handler->loopRunning())
if (connection->getHandler().loopRunning())
deactivateTask(looping_task, false, true);
restoreConnection(true);
if (!connection->reconnect())
throw Exception(ErrorCodes::CANNOT_CONNECT_RABBITMQ, "No connection to {}", connection->connectionInfoForLog());
}
initializeBuffers();
Pipes pipes;
pipes.reserve(num_created_consumers);
@ -638,7 +638,7 @@ Pipe StorageRabbitMQ::read(
pipes.emplace_back(std::make_shared<SourceFromInputStream>(converting_stream));
}
if (!event_handler->loopRunning() && event_handler->connectionRunning())
if (!connection->getHandler().loopRunning() && connection->isConnected())
looping_task->activateAndSchedule();
LOG_DEBUG(log, "Starting reading {} streams", pipes.size());
@ -656,16 +656,35 @@ SinkToStoragePtr StorageRabbitMQ::write(const ASTPtr &, const StorageMetadataPtr
void StorageRabbitMQ::startup()
{
if (event_handler->connectionRunning())
initRabbitMQ();
else
connection_task->activateAndSchedule();
if (!rabbit_is_ready)
{
if (connection->isConnected())
{
try
{
initRabbitMQ();
}
catch (...)
{
tryLogCurrentException(log);
if (!is_attach)
throw;
}
}
else
{
connection_task->activateAndSchedule();
}
}
for (size_t i = 0; i < num_consumers; ++i)
{
try
{
pushReadBuffer(createReadBuffer());
auto buffer = createReadBuffer();
if (rabbit_is_ready)
buffer->initialize();
pushReadBuffer(std::move(buffer));
++num_created_consumers;
}
catch (const AMQP::Exception & e)
@ -675,7 +694,7 @@ void StorageRabbitMQ::startup()
}
}
event_handler->updateLoopState(Loop::RUN);
connection->getHandler().updateLoopState(Loop::RUN);
streaming_task->activateAndSchedule();
}
@ -693,25 +712,28 @@ void StorageRabbitMQ::shutdown()
deactivateTask(streaming_task, true, false);
deactivateTask(looping_task, true, true);
if (drop_table)
/// Just a paranoid try/catch; it is not actually needed.
try
{
for (auto & buffer : buffers)
buffer->closeChannel();
cleanupRabbitMQ();
if (drop_table)
{
for (auto & buffer : buffers)
buffer->closeChannel();
cleanupRabbitMQ();
}
/// It is important to close the connection here - before removing consumer buffers, because
/// it will finish and clean up callbacks, which might use those buffers' data.
connection->disconnect();
for (size_t i = 0; i < num_created_consumers; ++i)
popReadBuffer();
}
catch (...)
{
tryLogCurrentException(log);
}
/// It is important to close the connection here - before removing consumer buffers, because
/// it will finish and clean up callbacks, which might use those buffers' data.
connection->close();
/// Connection is not closed immediately - it requires the loop to shut it down properly and to
/// finish all callbacks.
size_t cnt_retries = 0;
while (!connection->closed() && cnt_retries++ != RETRIES_MAX)
event_handler->iterateLoop();
for (size_t i = 0; i < num_created_consumers; ++i)
popReadBuffer();
}
@ -722,7 +744,8 @@ void StorageRabbitMQ::cleanupRabbitMQ() const
if (use_user_setup)
return;
if (!event_handler->connectionRunning())
connection->heartbeat();
if (!connection->isConnected())
{
String queue_names;
for (const auto & queue : queues)
@ -738,27 +761,27 @@ void StorageRabbitMQ::cleanupRabbitMQ() const
return;
}
AMQP::TcpChannel rabbit_channel(connection.get());
auto rabbit_channel = connection->createChannel();
for (const auto & queue : queues)
{
/// AMQP::ifunused is needed, because it is possible to share queues between multiple tables and dropping
/// one of them should not affect the others.
/// AMQP::ifempty is not used on purpose.
rabbit_channel.removeQueue(queue, AMQP::ifunused)
rabbit_channel->removeQueue(queue, AMQP::ifunused)
.onSuccess([&](uint32_t num_messages)
{
LOG_TRACE(log, "Successfully deleted queue {}, messages contained {}", queue, num_messages);
event_handler->stopLoop();
connection->getHandler().stopLoop();
})
.onError([&](const char * message)
{
LOG_ERROR(log, "Failed to delete queue {}. Error message: {}", queue, message);
event_handler->stopLoop();
connection->getHandler().stopLoop();
});
}
event_handler->startBlockingLoop();
rabbit_channel.close();
connection->getHandler().startBlockingLoop();
rabbit_channel->close();
/// Also there is no need to clean up exchanges as they were created with the AMQP::autodelete option. Once queues
/// are removed, exchanges will also be cleaned.
@ -801,12 +824,9 @@ ConsumerBufferPtr StorageRabbitMQ::popReadBuffer(std::chrono::milliseconds timeo
ConsumerBufferPtr StorageRabbitMQ::createReadBuffer()
{
ChannelPtr consumer_channel;
if (event_handler->connectionRunning())
consumer_channel = std::make_shared<AMQP::TcpChannel>(connection.get());
ChannelPtr consumer_channel = connection->createChannel();
return std::make_shared<ReadBufferFromRabbitMQConsumer>(
consumer_channel, event_handler, queues, ++consumer_id,
std::move(consumer_channel), connection->getHandler(), queues, ++consumer_id,
unique_strbase, log, row_delimiter, queue_size, stream_cancelled);
}
@ -814,7 +834,7 @@ ConsumerBufferPtr StorageRabbitMQ::createReadBuffer()
ProducerBufferPtr StorageRabbitMQ::createWriteBuffer()
{
return std::make_shared<WriteBufferToRabbitMQProducer>(
parsed_address, getContext(), login_password, vhost, routing_keys, exchange_name, exchange_type,
configuration, getContext(), routing_keys, exchange_name, exchange_type,
producer_id.fetch_add(1), persistent, wait_confirm, log,
row_delimiter ? std::optional<char>{row_delimiter} : std::nullopt, 1, 1024);
}
@ -848,10 +868,24 @@ bool StorageRabbitMQ::checkDependencies(const StorageID & table_id)
}
void StorageRabbitMQ::initializeBuffers()
{
assert(rabbit_is_ready);
if (!initialized)
{
for (const auto & buffer : buffers)
buffer->initialize();
initialized = true;
}
}
void StorageRabbitMQ::streamingToViewsFunc()
{
if (rabbit_is_ready && (event_handler->connectionRunning() || restoreConnection(true)))
if (rabbit_is_ready && (connection->isConnected() || connection->reconnect()))
{
initializeBuffers();
try
{
auto table_id = getStorageID();
@ -876,7 +910,7 @@ void StorageRabbitMQ::streamingToViewsFunc()
/// Reschedule with backoff.
if (milliseconds_to_wait < BACKOFF_TRESHOLD)
milliseconds_to_wait *= 2;
event_handler->updateLoopState(Loop::STOP);
connection->getHandler().updateLoopState(Loop::STOP);
break;
}
else
@ -888,7 +922,7 @@ void StorageRabbitMQ::streamingToViewsFunc()
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
if (duration.count() > MAX_THREAD_WORK_DURATION_MS)
{
event_handler->updateLoopState(Loop::STOP);
connection->getHandler().updateLoopState(Loop::STOP);
LOG_TRACE(log, "Reschedule streaming. Thread work duration limit exceeded.");
break;
}
@ -958,9 +992,9 @@ bool StorageRabbitMQ::streamToViews()
std::atomic<bool> stub = {false};
if (!event_handler->loopRunning())
if (!connection->getHandler().loopRunning())
{
event_handler->updateLoopState(Loop::RUN);
connection->getHandler().updateLoopState(Loop::RUN);
looping_task->activateAndSchedule();
}
@ -972,13 +1006,14 @@ bool StorageRabbitMQ::streamToViews()
deactivateTask(looping_task, false, true);
size_t queue_empty = 0;
if (!event_handler->connectionRunning())
if (!connection->isConnected())
{
if (stream_cancelled)
return true;
if (restoreConnection(true))
if (connection->reconnect())
{
LOG_DEBUG(log, "Connection restored, updating channels");
for (auto & stream : streams)
stream->as<RabbitMQBlockInputStream>()->updateChannel();
}
@ -1007,7 +1042,10 @@ bool StorageRabbitMQ::streamToViews()
buffer->updateAckTracker();
if (updateChannel(buffer->getChannel()))
{
LOG_TRACE(log, "Connection is active, but channel update is needed");
buffer->setupChannel();
}
}
}
@ -1026,12 +1064,12 @@ bool StorageRabbitMQ::streamToViews()
if (!stream->as<RabbitMQBlockInputStream>()->sendAck())
{
/// Iterate loop to activate error callbacks if they happened
event_handler->iterateLoop();
if (!event_handler->connectionRunning())
connection->getHandler().iterateLoop();
if (!connection->isConnected())
break;
}
event_handler->iterateLoop();
connection->getHandler().iterateLoop();
}
}
@ -1044,7 +1082,7 @@ bool StorageRabbitMQ::streamToViews()
}
else
{
event_handler->updateLoopState(Loop::RUN);
connection->getHandler().updateLoopState(Loop::RUN);
looping_task->activateAndSchedule();
}
@ -1072,7 +1110,7 @@ void registerStorageRabbitMQ(StorageFactory & factory)
if (!rabbitmq_settings->rabbitmq_format.changed)
throw Exception("You must specify `rabbitmq_format` setting", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
return StorageRabbitMQ::create(args.table_id, args.getContext(), args.columns, std::move(rabbitmq_settings));
return StorageRabbitMQ::create(args.table_id, args.getContext(), args.columns, std::move(rabbitmq_settings), args.attach);
};
factory.registerStorage("RabbitMQ", creator_fn, StorageFactory::StorageFeatures{ .supports_settings = true, });
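The new is_attach argument (args.attach) softens startup failures: on ATTACH, for example after a server restart, a dead broker is logged and retried in the background instead of failing the query, while CREATE still throws. A standalone sketch of that policy (hypothetical names):

```cpp
#include <iostream>
#include <stdexcept>
#include <string>

// On CREATE a connection failure must fail the query; on ATTACH the table
// should still load and keep retrying in the background.
void handleStartupFailure(bool is_attach, const std::string & connection_info)
{
    if (!is_attach)
        throw std::runtime_error("Cannot connect to " + connection_info);

    // tryLogCurrentException(log) in the real code; the connection task reschedules.
    std::cerr << "Cannot connect to " << connection_info << ", will retry in background\n";
}
```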

View File

@ -7,9 +7,8 @@
#include <mutex>
#include <atomic>
#include <Storages/RabbitMQ/Buffer_fwd.h>
#include <Storages/RabbitMQ/RabbitMQHandler.h>
#include <Storages/RabbitMQ/RabbitMQSettings.h>
#include <Storages/RabbitMQ/UVLoop.h>
#include <Storages/RabbitMQ/RabbitMQConnection.h>
#include <Common/thread_local_rng.h>
#include <amqpcpp/libuv.h>
#include <uv.h>
@ -19,8 +18,6 @@
namespace DB
{
using ChannelPtr = std::shared_ptr<AMQP::TcpChannel>;
class StorageRabbitMQ final: public shared_ptr_helper<StorageRabbitMQ>, public IStorage, WithContext
{
friend struct shared_ptr_helper<StorageRabbitMQ>;
@ -76,7 +73,8 @@ protected:
const StorageID & table_id_,
ContextPtr context_,
const ColumnsDescription & columns_,
std::unique_ptr<RabbitMQSettings> rabbitmq_settings_);
std::unique_ptr<RabbitMQSettings> rabbitmq_settings_,
bool is_attach_);
private:
ContextMutablePtr rabbitmq_context;
@ -103,16 +101,9 @@ private:
bool hash_exchange;
Poco::Logger * log;
String address;
std::pair<String, UInt16> parsed_address;
std::pair<String, String> login_password;
String vhost;
String connection_string;
bool secure;
UVLoop loop;
std::shared_ptr<RabbitMQHandler> event_handler;
std::unique_ptr<AMQP::TcpConnection> connection; /// Connection for all consumers
RabbitMQConnectionPtr connection; /// Connection for all consumers
RabbitMQConfiguration configuration;
size_t num_created_consumers = 0;
Poco::Semaphore semaphore;
@ -143,8 +134,11 @@ private:
std::atomic<bool> stream_cancelled{false};
size_t read_attempts = 0;
mutable bool drop_table = false;
bool is_attach;
ConsumerBufferPtr createReadBuffer();
void initializeBuffers();
bool initialized = false;
/// Functions working in the background
void streamingToViewsFunc();
@ -166,7 +160,6 @@ private:
void bindExchange(AMQP::TcpChannel & rabbit_channel);
void bindQueue(size_t queue_id, AMQP::TcpChannel & rabbit_channel);
bool restoreConnection(bool reconnecting);
bool streamToViews();
bool checkDependencies(const StorageID & table_id);

View File

@ -15,8 +15,6 @@
namespace DB
{
static const auto CONNECT_SLEEP = 200;
static const auto RETRIES_MAX = 20;
static const auto BATCH = 1000;
static const auto RETURNED_LIMIT = 50000;
@ -26,10 +24,8 @@ namespace ErrorCodes
}
WriteBufferToRabbitMQProducer::WriteBufferToRabbitMQProducer(
std::pair<String, UInt16> & parsed_address_,
const RabbitMQConfiguration & configuration_,
ContextPtr global_context,
const std::pair<String, String> & login_password_,
const String & vhost_,
const Names & routing_keys_,
const String & exchange_name_,
const AMQP::ExchangeType exchange_type_,
@ -41,9 +37,7 @@ WriteBufferToRabbitMQProducer::WriteBufferToRabbitMQProducer(
size_t rows_per_message,
size_t chunk_size_)
: WriteBuffer(nullptr, 0)
, parsed_address(parsed_address_)
, login_password(login_password_)
, vhost(vhost_)
, connection(configuration_, log_)
, routing_keys(routing_keys_)
, exchange_name(exchange_name_)
, exchange_type(exchange_type_)
@ -57,20 +51,10 @@ WriteBufferToRabbitMQProducer::WriteBufferToRabbitMQProducer(
, max_rows(rows_per_message)
, chunk_size(chunk_size_)
{
event_handler = std::make_unique<RabbitMQHandler>(loop.getLoop(), log);
if (setupConnection(false))
{
if (connection.connect())
setupChannel();
}
else
{
if (!connection->closed())
connection->close(true);
throw Exception("Cannot connect to RabbitMQ host: " + parsed_address.first + ", port: " + std::to_string(parsed_address.second),
ErrorCodes::CANNOT_CONNECT_RABBITMQ);
}
throw Exception(ErrorCodes::CANNOT_CONNECT_RABBITMQ, "Cannot connect to RabbitMQ {}", connection.connectionInfoForLog());
writing_task = global_context->getSchedulePool().createTask("RabbitMQWritingTask", [this]{ writingFunc(); });
writing_task->deactivate();
@ -92,15 +76,7 @@ WriteBufferToRabbitMQProducer::WriteBufferToRabbitMQProducer(
WriteBufferToRabbitMQProducer::~WriteBufferToRabbitMQProducer()
{
writing_task->deactivate();
connection->close();
size_t cnt_retries = 0;
while (!connection->closed() && cnt_retries++ != RETRIES_MAX)
{
event_handler->iterateLoop();
std::this_thread::sleep_for(std::chrono::milliseconds(CONNECT_SLEEP));
}
connection.disconnect();
assert(rows == 0);
}
@ -131,42 +107,9 @@ void WriteBufferToRabbitMQProducer::countRow()
}
bool WriteBufferToRabbitMQProducer::setupConnection(bool reconnecting)
{
size_t cnt_retries = 0;
if (reconnecting)
{
connection->close();
while (!connection->closed() && ++cnt_retries != RETRIES_MAX)
event_handler->iterateLoop();
if (!connection->closed())
connection->close(true);
LOG_TRACE(log, "Trying to set up connection");
}
connection = std::make_unique<AMQP::TcpConnection>(event_handler.get(),
AMQP::Address(
parsed_address.first, parsed_address.second,
AMQP::Login(login_password.first, login_password.second), vhost));
cnt_retries = 0;
while (!connection->ready() && ++cnt_retries != RETRIES_MAX)
{
event_handler->iterateLoop();
std::this_thread::sleep_for(std::chrono::milliseconds(CONNECT_SLEEP));
}
return event_handler->connectionRunning();
}
void WriteBufferToRabbitMQProducer::setupChannel()
{
producer_channel = std::make_unique<AMQP::TcpChannel>(connection.get());
producer_channel = connection.createChannel();
producer_channel->onError([&](const char * message)
{
@ -322,8 +265,11 @@ void WriteBufferToRabbitMQProducer::writingFunc()
if (wait_num.load() && delivery_record.empty() && payloads.empty() && returned.empty())
wait_all = false;
else if ((!producer_channel->usable() && event_handler->connectionRunning()) || (!event_handler->connectionRunning() && setupConnection(true)))
setupChannel();
else if (!producer_channel->usable())
{
if (connection.reconnect())
setupChannel();
}
}
LOG_DEBUG(log, "Producer on channel {} completed", channel_id);
@ -355,7 +301,7 @@ void WriteBufferToRabbitMQProducer::reinitializeChunks()
void WriteBufferToRabbitMQProducer::iterateEventLoop()
{
event_handler->iterateLoop();
connection.getHandler().iterateLoop();
}
}

View File

@ -6,8 +6,7 @@
#include <mutex>
#include <atomic>
#include <amqpcpp.h>
#include <Storages/RabbitMQ/RabbitMQHandler.h>
#include <Storages/RabbitMQ/UVLoop.h>
#include <Storages/RabbitMQ/RabbitMQConnection.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Core/BackgroundSchedulePool.h>
#include <Core/Names.h>
@ -19,14 +18,12 @@ class WriteBufferToRabbitMQProducer : public WriteBuffer
{
public:
WriteBufferToRabbitMQProducer(
std::pair<String, UInt16> & parsed_address_,
const RabbitMQConfiguration & configuration_,
ContextPtr global_context,
const std::pair<String, String> & login_password_,
const String & vhost_,
const Names & routing_keys_,
const String & exchange_name_,
const AMQP::ExchangeType exchange_type_,
const size_t channel_id_,
const size_t channel_id_base_,
const bool persistent_,
std::atomic<bool> & wait_confirm_,
Poco::Logger * log_,
@ -48,14 +45,12 @@ private:
void iterateEventLoop();
void writingFunc();
bool setupConnection(bool reconnecting);
void setupChannel();
void removeRecord(UInt64 received_delivery_tag, bool multiple, bool republish);
void publish(ConcurrentBoundedQueue<std::pair<UInt64, String>> & message, bool republishing);
std::pair<String, UInt16> parsed_address;
const std::pair<String, String> login_password;
const String vhost;
RabbitMQConnection connection;
const Names routing_keys;
const String exchange_name;
AMQP::ExchangeType exchange_type;
@ -70,9 +65,6 @@ private:
AMQP::Table key_arguments;
BackgroundSchedulePool::TaskHolder writing_task;
UVLoop loop;
std::unique_ptr<RabbitMQHandler> event_handler;
std::unique_ptr<AMQP::TcpConnection> connection;
std::unique_ptr<AMQP::TcpChannel> producer_channel;
bool producer_ready = false;

View File

@ -1,5 +1,7 @@
#include "StorageMergeTree.h"
#include <optional>
#include <Databases/IDatabase.h>
#include <Common/escapeForFileName.h>
#include <Common/typeid_cast.h>
@ -21,13 +23,13 @@
#include <Storages/PartitionCommands.h>
#include <Storages/MergeTree/MergeTreeSink.h>
#include <Storages/MergeTree/MergeTreeDataPartInMemory.h>
#include <Storages/MergeTree/MergePlainMergeTreeTask.h>
#include <Storages/MergeTree/PartitionPruner.h>
#include <Storages/MergeTree/MergeList.h>
#include <Storages/MergeTree/checkDataPart.h>
#include <Processors/Pipe.h>
#include <Processors/QueryPlan/BuildQueryPipelineSettings.h>
#include <Processors/QueryPlan/Optimizations/QueryPlanOptimizationSettings.h>
#include <optional>
namespace CurrentMetrics
{
@ -37,6 +39,7 @@ namespace CurrentMetrics
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
@ -327,8 +330,8 @@ void StorageMergeTree::alter(
/// While it exists, marks parts as 'currently_merging_mutating_parts' and reserves free space on the filesystem.
StorageMergeTree::CurrentlyMergingPartsTagger::CurrentlyMergingPartsTagger(
FutureMergedMutatedPart & future_part_,
CurrentlyMergingPartsTagger::CurrentlyMergingPartsTagger(
FutureMergedMutatedPartPtr future_part_,
size_t total_size,
StorageMergeTree & storage_,
const StorageMetadataPtr & metadata_snapshot,
@ -339,12 +342,12 @@ StorageMergeTree::CurrentlyMergingPartsTagger::CurrentlyMergingPartsTagger(
/// If we mutate a part, then we should reserve space on the same disk, because mutations can possibly create hardlinks
if (is_mutation)
reserved_space = storage.tryReserveSpace(total_size, future_part_.parts[0]->volume);
reserved_space = storage.tryReserveSpace(total_size, future_part->parts[0]->volume);
else
{
IMergeTreeDataPart::TTLInfos ttl_infos;
size_t max_volume_index = 0;
for (auto & part_ptr : future_part_.parts)
for (auto & part_ptr : future_part->parts)
{
ttl_infos.update(part_ptr->ttl_infos);
max_volume_index = std::max(max_volume_index, storage.getStoragePolicy()->getVolumeIndexByDisk(part_ptr->volume->getDisk()));
@ -354,9 +357,9 @@ StorageMergeTree::CurrentlyMergingPartsTagger::CurrentlyMergingPartsTagger(
metadata_snapshot,
total_size,
max_volume_index,
future_part.name,
future_part.part_info,
future_part.parts,
future_part->name,
future_part->part_info,
future_part->parts,
&tagger,
&ttl_infos);
@ -368,26 +371,26 @@ StorageMergeTree::CurrentlyMergingPartsTagger::CurrentlyMergingPartsTagger(
if (!reserved_space)
{
if (is_mutation)
throw Exception("Not enough space for mutating part '" + future_part.parts[0]->name + "'", ErrorCodes::NOT_ENOUGH_SPACE);
throw Exception("Not enough space for mutating part '" + future_part->parts[0]->name + "'", ErrorCodes::NOT_ENOUGH_SPACE);
else
throw Exception("Not enough space for merging parts", ErrorCodes::NOT_ENOUGH_SPACE);
}
future_part_.updatePath(storage, reserved_space);
future_part->updatePath(storage, reserved_space.get());
for (const auto & part : future_part.parts)
for (const auto & part : future_part->parts)
{
if (storage.currently_merging_mutating_parts.count(part))
throw Exception("Tagging already tagged part " + part->name + ". This is a bug.", ErrorCodes::LOGICAL_ERROR);
}
storage.currently_merging_mutating_parts.insert(future_part.parts.begin(), future_part.parts.end());
storage.currently_merging_mutating_parts.insert(future_part->parts.begin(), future_part->parts.end());
}
StorageMergeTree::CurrentlyMergingPartsTagger::~CurrentlyMergingPartsTagger()
CurrentlyMergingPartsTagger::~CurrentlyMergingPartsTagger()
{
std::lock_guard lock(storage.currently_processing_in_background_mutex);
for (const auto & part : future_part.parts)
for (const auto & part : future_part->parts)
{
if (!storage.currently_merging_mutating_parts.count(part))
std::terminate();
@ -420,12 +423,12 @@ Int64 StorageMergeTree::startMutation(const MutationCommands & commands, String
}
void StorageMergeTree::updateMutationEntriesErrors(FutureMergedMutatedPart result_part, bool is_successful, const String & exception_message)
void StorageMergeTree::updateMutationEntriesErrors(FutureMergedMutatedPartPtr result_part, bool is_successful, const String & exception_message)
{
/// Update the information about failed parts in the system.mutations table.
Int64 sources_data_version = result_part.parts.at(0)->info.getDataVersion();
Int64 result_data_version = result_part.part_info.getDataVersion();
Int64 sources_data_version = result_part->parts.at(0)->info.getDataVersion();
Int64 result_data_version = result_part->part_info.getDataVersion();
if (sources_data_version != result_data_version)
{
std::lock_guard lock(currently_processing_in_background_mutex);
@ -437,7 +440,7 @@ void StorageMergeTree::updateMutationEntriesErrors(FutureMergedMutatedPart resul
MergeTreeMutationEntry & entry = it->second;
if (is_successful)
{
if (!entry.latest_failed_part.empty() && result_part.part_info.contains(entry.latest_failed_part_info))
if (!entry.latest_failed_part.empty() && result_part->part_info.contains(entry.latest_failed_part_info))
{
entry.latest_failed_part.clear();
entry.latest_failed_part_info = MergeTreePartInfo();
@ -447,8 +450,8 @@ void StorageMergeTree::updateMutationEntriesErrors(FutureMergedMutatedPart resul
}
else
{
entry.latest_failed_part = result_part.parts.at(0)->name;
entry.latest_failed_part_info = result_part.parts.at(0)->info;
entry.latest_failed_part = result_part->parts.at(0)->name;
entry.latest_failed_part_info = result_part->parts.at(0)->info;
entry.latest_fail_time = time(nullptr);
entry.latest_fail_reason = exception_message;
}
@ -682,7 +685,7 @@ void StorageMergeTree::loadMutations()
increment.value = std::max(Int64(increment.value.load()), current_mutations_by_version.rbegin()->first);
}
std::shared_ptr<StorageMergeTree::MergeMutateSelectedEntry> StorageMergeTree::selectPartsToMerge(
std::shared_ptr<MergeMutateSelectedEntry> StorageMergeTree::selectPartsToMerge(
const StorageMetadataPtr & metadata_snapshot,
bool aggressive,
const String & partition_id,
@ -695,10 +698,10 @@ std::shared_ptr<StorageMergeTree::MergeMutateSelectedEntry> StorageMergeTree::se
{
auto data_settings = getSettings();
FutureMergedMutatedPart future_part;
auto future_part = std::make_shared<FutureMergedMutatedPart>();
if (storage_settings.get()->assign_part_uuids)
future_part.uuid = UUIDHelpers::generateV4();
future_part->uuid = UUIDHelpers::generateV4();
/// You must call the destructor with `currently_processing_in_background_mutex` unlocked.
CurrentlyMergingPartsTaggerPtr merging_tagger;
@ -788,11 +791,11 @@ std::shared_ptr<StorageMergeTree::MergeMutateSelectedEntry> StorageMergeTree::se
}
/// Account TTL merge here to avoid exceeding the max_number_of_merges_with_ttl_in_pool limit
if (isTTLMergeType(future_part.merge_type))
if (isTTLMergeType(future_part->merge_type))
getContext()->getMergeList().bookMergeWithTTL();
merging_tagger = std::make_unique<CurrentlyMergingPartsTagger>(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace(future_part.parts), *this, metadata_snapshot, false);
return std::make_shared<MergeMutateSelectedEntry>(future_part, std::move(merging_tagger), MutationCommands{});
merging_tagger = std::make_unique<CurrentlyMergingPartsTagger>(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace(future_part->parts), *this, metadata_snapshot, false);
return std::make_shared<MergeMutateSelectedEntry>(future_part, std::move(merging_tagger), MutationCommands::create());
}
bool StorageMergeTree::merge(
@ -835,76 +838,33 @@ bool StorageMergeTree::merge(
if (!merge_mutate_entry)
return false;
return mergeSelectedParts(metadata_snapshot, deduplicate, deduplicate_by_columns, *merge_mutate_entry, table_lock_holder);
}
bool StorageMergeTree::mergeSelectedParts(
const StorageMetadataPtr & metadata_snapshot,
bool deduplicate,
const Names & deduplicate_by_columns,
MergeMutateSelectedEntry & merge_mutate_entry,
TableLockHolder & table_lock_holder)
{
auto & future_part = merge_mutate_entry.future_part;
Stopwatch stopwatch;
MutableDataPartPtr new_part;
/// Copying a vector of columns `deduplicate_by_columns`.
auto task = std::make_shared<MergePlainMergeTreeTask>(
*this, metadata_snapshot, deduplicate, deduplicate_by_columns, merge_mutate_entry, table_lock_holder, [](bool){});
auto merge_list_entry = getContext()->getMergeList().insert(getStorageID(), future_part);
auto write_part_log = [&] (const ExecutionStatus & execution_status)
{
writePartLog(
PartLogElement::MERGE_PARTS,
execution_status,
stopwatch.elapsed(),
future_part.name,
new_part,
future_part.parts,
merge_list_entry.get());
};
try
{
new_part = merger_mutator.mergePartsToTemporaryPart(
future_part,
metadata_snapshot,
*(merge_list_entry),
table_lock_holder,
time(nullptr),
getContext(),
merge_mutate_entry.tagger->reserved_space,
deduplicate,
deduplicate_by_columns,
merging_params);
merger_mutator.renameMergedTemporaryPart(new_part, future_part.parts, nullptr);
write_part_log({});
}
catch (...)
{
write_part_log(ExecutionStatus::fromCurrentException());
throw;
}
executeHere(task);
return true;
}
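executeHere(task) runs a background-pool task synchronously on the calling thread. Under the IExecutableTask contract used in this commit (executeStep() returns true while more steps remain), a plausible driver looks like the sketch below; the repository's actual helper may differ:

```cpp
#include <memory>

struct ExecutableTaskSketch
{
    virtual ~ExecutableTaskSketch() = default;
    virtual bool executeStep() = 0;  // true: call again; false: finished
};

// Hypothetical synchronous driver: drain every step on the current thread.
void executeHereSketch(const std::shared_ptr<ExecutableTaskSketch> & task)
{
    while (task->executeStep())
        ;
}
```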
bool StorageMergeTree::partIsAssignedToBackgroundOperation(const DataPartPtr & part) const
{
std::lock_guard background_processing_lock(currently_processing_in_background_mutex);
return currently_merging_mutating_parts.count(part);
}
std::shared_ptr<StorageMergeTree::MergeMutateSelectedEntry> StorageMergeTree::selectPartsToMutate(
std::shared_ptr<MergeMutateSelectedEntry> StorageMergeTree::selectPartsToMutate(
const StorageMetadataPtr & metadata_snapshot, String * /* disable_reason */, TableLockHolder & /* table_lock_holder */)
{
size_t max_ast_elements = getContext()->getSettingsRef().max_expanded_ast_elements;
FutureMergedMutatedPart future_part;
auto future_part = std::make_shared<FutureMergedMutatedPart>();
if (storage_settings.get()->assign_part_uuids)
future_part.uuid = UUIDHelpers::generateV4();
future_part->uuid = UUIDHelpers::generateV4();
MutationCommands commands;
auto commands = MutationCommands::create();
CurrentlyMergingPartsTaggerPtr tagger;
@ -983,18 +943,18 @@ std::shared_ptr<StorageMergeTree::MergeMutateSelectedEntry> StorageMergeTree::se
break;
current_ast_elements += commands_size;
commands.insert(commands.end(), it->second.commands.begin(), it->second.commands.end());
commands->insert(commands->end(), it->second.commands.begin(), it->second.commands.end());
}
if (!commands.empty())
if (!commands->empty())
{
auto new_part_info = part->info;
new_part_info.mutation = current_mutations_by_version.rbegin()->first;
future_part.parts.push_back(part);
future_part.part_info = new_part_info;
future_part.name = part->getNewName(new_part_info);
future_part.type = part->getType();
future_part->parts.push_back(part);
future_part->part_info = new_part_info;
future_part->name = part->getNewName(new_part_info);
future_part->type = part->getType();
tagger = std::make_unique<CurrentlyMergingPartsTagger>(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace({part}), *this, metadata_snapshot, true);
return std::make_shared<MergeMutateSelectedEntry>(future_part, std::move(tagger), commands);
@ -1018,18 +978,20 @@ bool StorageMergeTree::mutateSelectedPart(const StorageMetadataPtr & metadata_sn
PartLogElement::MUTATE_PART,
execution_status,
stopwatch.elapsed(),
future_part.name,
future_part->name,
new_part,
future_part.parts,
future_part->parts,
merge_list_entry.get());
};
try
{
new_part = merger_mutator.mutatePartToTemporaryPart(
future_part, metadata_snapshot, merge_mutate_entry.commands, *(merge_list_entry),
auto task = merger_mutator.mutatePartToTemporaryPart(
future_part, metadata_snapshot, merge_mutate_entry.commands, merge_list_entry.get(),
time(nullptr), getContext(), merge_mutate_entry.tagger->reserved_space, table_lock_holder);
new_part = executeHere(task);
renameTempPartAndReplace(new_part);
updateMutationEntriesErrors(future_part, true, "");
@ -1057,7 +1019,7 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign
auto share_lock = lockForShare(RWLockImpl::NO_QUERY, getSettings()->lock_acquire_timeout_for_background_operations);
bool has_mutations;
bool has_mutations = false;
{
std::unique_lock lock(currently_processing_in_background_mutex);
if (merger_mutator.merges_blocker.isCancelled())
@ -1065,36 +1027,32 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign
merge_entry = selectPartsToMerge(metadata_snapshot, false, {}, false, nullptr, share_lock, lock);
if (!merge_entry)
{
mutate_entry = selectPartsToMutate(metadata_snapshot, nullptr, share_lock);
has_mutations = !current_mutations_by_version.empty();
}
if (!mutate_entry && has_mutations)
{
/// Notify in case of errors
std::lock_guard lock(mutation_wait_mutex);
mutation_wait_event.notify_all();
has_mutations = !current_mutations_by_version.empty();
}
}
if (merge_entry)
{
assignee.scheduleMergeMutateTask(ExecutableLambdaAdapter::create(
[this, metadata_snapshot, merge_entry, share_lock] () mutable
{
return mergeSelectedParts(metadata_snapshot, false, {}, *merge_entry, share_lock);
}, common_assignee_trigger, getStorageID()));
auto task = std::make_shared<MergePlainMergeTreeTask>(*this, metadata_snapshot, false, Names{}, merge_entry, share_lock, common_assignee_trigger);
assignee.scheduleMergeMutateTask(task);
return true;
}
if (mutate_entry)
{
assignee.scheduleMergeMutateTask(ExecutableLambdaAdapter::create(
[this, metadata_snapshot, merge_entry, mutate_entry, share_lock] () mutable
{
return mutateSelectedPart(metadata_snapshot, *mutate_entry, share_lock);
}, common_assignee_trigger, getStorageID()));
auto task = std::make_shared<MutatePlainMergeTreeTask>(*this, metadata_snapshot, mutate_entry, share_lock, common_assignee_trigger);
assignee.scheduleMergeMutateTask(task);
return true;
}
if (has_mutations)
{
/// Notify in case of errors if no mutation was successfully selected.
/// Otherwise, notification will occur after any of the mutations completes.
std::lock_guard lock(mutation_wait_mutex);
mutation_wait_event.notify_all();
}
bool scheduled = false;
if (time_after_previous_cleanup_temporary_directories.compareAndRestartDeferred(getContext()->getSettingsRef().merge_tree_clear_old_temporary_directories_interval_seconds))
{
@ -1106,7 +1064,7 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign
}, common_assignee_trigger, getStorageID()));
scheduled = true;
}
if (time_after_previous_cleanup_parts.compareAndRestartDeferred(getContext()->getSettingsRef().merge_tree_clear_old_parts_interval_seconds))
if (auto lock = time_after_previous_cleanup_parts.compareAndRestartDeferred(getContext()->getSettingsRef().merge_tree_clear_old_parts_interval_seconds))
{
assignee.scheduleMergeMutateTask(ExecutableLambdaAdapter::create(
[this, share_lock] ()

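// Context: the cleanup condition above now binds the result of
// compareAndRestartDeferred to a named lock, so the deferred restart stays
// pending for the whole cleanup block. A self-contained sketch of that guard
// pattern, assuming (an inference, not the real Stopwatch API) that the
// returned object is truthy when the interval has elapsed and restarts the
// timer when destroyed:
#include <chrono>
#include <iostream>

class DeferredTimerSketch
{
    std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();

public:
    struct Guard
    {
        DeferredTimerSketch * parent = nullptr;
        explicit operator bool() const { return parent != nullptr; }
        ~Guard()
        {
            if (parent)
                parent->start = std::chrono::steady_clock::now(); // the deferred restart
        }
    };

    Guard compareAndRestartDeferred(double seconds)
    {
        const std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - start;
        return elapsed.count() >= seconds ? Guard{this} : Guard{};
    }
};

int main()
{
    DeferredTimerSketch timer;
    if (auto lock = timer.compareAndRestartDeferred(0.0))
        std::cout << "interval elapsed; the restart happens when `lock` is destroyed\n";
}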
View File

@ -13,6 +13,9 @@
#include <Storages/MergeTree/MergeTreeMutationEntry.h>
#include <Storages/MergeTree/MergeTreeMutationStatus.h>
#include <Storages/MergeTree/MergeTreeDeduplicationLog.h>
#include <Storages/MergeTree/FutureMergedMutatedPart.h>
#include <Storages/MergeTree/MergePlainMergeTreeTask.h>
#include <Storages/MergeTree/MutatePlainMergeTreeTask.h>
#include <Disks/StoragePolicy.h>
#include <Common/SimpleIncrement.h>
@ -134,6 +137,7 @@ private:
std::atomic<bool> shutdown_called {false};
private:
void loadMutations();
/// Load and initialize deduplication logs. Even if deduplication setting
@ -158,38 +162,8 @@ private:
/// Wait until mutation with version will finish mutation for all parts
void waitForMutation(Int64 version, const String & file_name);
struct CurrentlyMergingPartsTagger
{
FutureMergedMutatedPart future_part;
ReservationPtr reserved_space;
StorageMergeTree & storage;
// Optional tagger to maintain volatile parts for the JBOD balancer
std::optional<CurrentlySubmergingEmergingTagger> tagger;
CurrentlyMergingPartsTagger(
FutureMergedMutatedPart & future_part_,
size_t total_size,
StorageMergeTree & storage_,
const StorageMetadataPtr & metadata_snapshot,
bool is_mutation);
~CurrentlyMergingPartsTagger();
};
using CurrentlyMergingPartsTaggerPtr = std::unique_ptr<CurrentlyMergingPartsTagger>;
friend struct CurrentlyMergingPartsTagger;
struct MergeMutateSelectedEntry
{
FutureMergedMutatedPart future_part;
CurrentlyMergingPartsTaggerPtr tagger;
MutationCommands commands;
MergeMutateSelectedEntry(const FutureMergedMutatedPart & future_part_, CurrentlyMergingPartsTaggerPtr && tagger_, const MutationCommands & commands_)
: future_part(future_part_)
, tagger(std::move(tagger_))
, commands(commands_)
{}
};
std::shared_ptr<MergeMutateSelectedEntry> selectPartsToMerge(
const StorageMetadataPtr & metadata_snapshot,
@ -202,7 +176,6 @@ private:
bool optimize_skip_merged_partitions = false,
SelectPartsDecision * select_decision_out = nullptr);
bool mergeSelectedParts(const StorageMetadataPtr & metadata_snapshot, bool deduplicate, const Names & deduplicate_by_columns, MergeMutateSelectedEntry & entry, TableLockHolder & table_lock_holder);
std::shared_ptr<MergeMutateSelectedEntry> selectPartsToMutate(const StorageMetadataPtr & metadata_snapshot, String * disable_reason, TableLockHolder & table_lock_holder);
bool mutateSelectedPart(const StorageMetadataPtr & metadata_snapshot, MergeMutateSelectedEntry & entry, TableLockHolder & table_lock_holder);
@ -226,7 +199,7 @@ private:
/// Update mutation entries after part mutation execution. May reset old
/// errors if mutation was successful. Otherwise update last_failed* fields
/// in mutation entries.
void updateMutationEntriesErrors(FutureMergedMutatedPart result_part, bool is_successful, const String & exception_message);
void updateMutationEntriesErrors(FutureMergedMutatedPartPtr result_part, bool is_successful, const String & exception_message);
/// Return empty optional if mutation was killed. Otherwise return partially
/// filled mutation status with information about error (latest_fail*) and
@ -243,6 +216,8 @@ private:
friend class MergeTreeProjectionBlockOutputStream;
friend class MergeTreeSink;
friend class MergeTreeData;
friend class MergePlainMergeTreeTask;
friend class MutatePlainMergeTreeTask;
protected:

View File

@ -28,6 +28,8 @@
#include <Storages/MergeTree/ReplicatedMergeTreeAddress.h>
#include <Storages/MergeTree/ReplicatedMergeTreeQuorumAddedParts.h>
#include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>
#include <Storages/MergeTree/MergeFromLogEntryTask.h>
#include <Storages/MergeTree/MutateFromLogEntryTask.h>
#include <Storages/VirtualColumnUtils.h>
#include <Storages/MergeTree/MergeTreeReaderCompact.h>
@ -126,7 +128,6 @@ namespace ErrorCodes
extern const int PARTITION_DOESNT_EXIST;
extern const int UNFINISHED;
extern const int RECEIVED_ERROR_TOO_MANY_REQUESTS;
extern const int BAD_DATA_PART_NAME;
extern const int PART_IS_TEMPORARILY_LOCKED;
extern const int CANNOT_ASSIGN_OPTIMIZE;
extern const int KEEPER_EXCEPTION;
@ -1222,34 +1223,24 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)
Coordination::Requests ops;
String has_replica = findReplicaHavingPart(part_name, true);
if (!has_replica.empty())
LOG_ERROR(log, "Removing locally missing part from ZooKeeper and queueing a fetch: {}", part_name);
time_t part_create_time = 0;
Coordination::ExistsResponse exists_resp = exists_futures[i].get();
if (exists_resp.error == Coordination::Error::ZOK)
{
LOG_ERROR(log, "Removing locally missing part from ZooKeeper and queueing a fetch: {}", part_name);
time_t part_create_time = 0;
Coordination::ExistsResponse exists_resp = exists_futures[i].get();
if (exists_resp.error == Coordination::Error::ZOK)
{
part_create_time = exists_resp.stat.ctime / 1000;
removePartFromZooKeeper(part_name, ops, exists_resp.stat.numChildren > 0);
}
LogEntry log_entry;
log_entry.type = LogEntry::GET_PART;
log_entry.source_replica = "";
log_entry.new_part_name = part_name;
log_entry.create_time = part_create_time;
/// We assume that this occurs before the queue is loaded (queue.initialize).
ops.emplace_back(zkutil::makeCreateRequest(
fs::path(replica_path) / "queue/queue-", log_entry.toString(), zkutil::CreateMode::PersistentSequential));
enqueue_futures.emplace_back(zookeeper->asyncMulti(ops));
}
else
{
LOG_ERROR(log, "No active replica found having part {}", part_name);
enqueuePartForCheck(part_name);
part_create_time = exists_resp.stat.ctime / 1000;
removePartFromZooKeeper(part_name, ops, exists_resp.stat.numChildren > 0);
}
LogEntry log_entry;
log_entry.type = LogEntry::GET_PART;
log_entry.source_replica = "";
log_entry.new_part_name = part_name;
log_entry.create_time = part_create_time;
/// We assume that this occurs before the queue is loaded (queue.initialize).
ops.emplace_back(zkutil::makeCreateRequest(
fs::path(replica_path) / "queue/queue-", log_entry.toString(), zkutil::CreateMode::PersistentSequential));
enqueue_futures.emplace_back(zookeeper->asyncMulti(ops));
}
for (auto & future : enqueue_futures)
@ -1556,7 +1547,7 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry)
return true; /// NOTE Deletion from `virtual_parts` is not done, but it is only necessary for merge.
}
bool do_fetch = false;
// bool do_fetch = false;
switch (entry.type)
{
@ -1564,17 +1555,12 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry)
/// We surely don't have this part locally as we've checked it before, so download it.
[[fallthrough]];
case LogEntry::GET_PART:
do_fetch = true;
break;
return executeFetch(entry);
// do_fetch = true;
case LogEntry::MERGE_PARTS:
/// Sometimes it's better to fetch the merged part instead of merging,
/// e.g. when we don't have all the source parts.
do_fetch = !tryExecuteMerge(entry);
break;
throw Exception(ErrorCodes::LOGICAL_ERROR, "Merge has to be executed by another function");
case LogEntry::MUTATE_PART:
/// Sometimes it's better to fetch the mutated part instead of mutating it.
do_fetch = !tryExecutePartMutation(entry);
break;
throw Exception(ErrorCodes::LOGICAL_ERROR, "Mutation has to be executed by another function");
case LogEntry::ALTER_METADATA:
return executeMetadataAlter(entry);
case LogEntry::SYNC_PINNED_PART_UUIDS:
@ -1587,380 +1573,7 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected log entry type: {}", static_cast<int>(entry.type));
}
if (do_fetch)
return executeFetch(entry);
return true;
}
bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry)
{
LOG_TRACE(log, "Executing log entry to merge parts {} to {}",
fmt::join(entry.source_parts, ", "), entry.new_part_name);
const auto storage_settings_ptr = getSettings();
if (storage_settings_ptr->always_fetch_merged_part)
{
LOG_INFO(log, "Will fetch part {} because setting 'always_fetch_merged_part' is true", entry.new_part_name);
return false;
}
if (entry.merge_type == MergeType::TTL_RECOMPRESS &&
(time(nullptr) - entry.create_time) <= storage_settings_ptr->try_fetch_recompressed_part_timeout.totalSeconds() &&
entry.source_replica != replica_name)
{
LOG_INFO(log, "Will try to fetch part {} until '{}' because this part is assigned to a recompression merge. "
"Source replica {} will try to merge this part first", entry.new_part_name,
DateLUT::instance().timeToString(entry.create_time + storage_settings_ptr->try_fetch_recompressed_part_timeout.totalSeconds()), entry.source_replica);
return false;
}
/// In some use cases merging can be more expensive than fetching
/// and it may be better to spread merge tasks across the replicas
/// instead of doing exactly the same merge cluster-wise
std::optional<String> replica_to_execute_merge;
bool replica_to_execute_merge_picked = false;
if (merge_strategy_picker.shouldMergeOnSingleReplica(entry))
{
replica_to_execute_merge = merge_strategy_picker.pickReplicaToExecuteMerge(entry);
replica_to_execute_merge_picked = true;
if (replica_to_execute_merge)
{
LOG_DEBUG(log,
"Prefer fetching part {} from replica {} due to execute_merges_on_single_replica_time_threshold",
entry.new_part_name, replica_to_execute_merge.value());
return false;
}
}
DataPartsVector parts;
for (const String & source_part_name : entry.source_parts)
{
DataPartPtr source_part_or_covering = getActiveContainingPart(source_part_name);
if (!source_part_or_covering)
{
/// We do not have one of the source parts locally, so try to take an already merged part from another replica.
LOG_DEBUG(log, "Don't have all parts for merge {}; will try to fetch it instead", entry.new_part_name);
return false;
}
if (source_part_or_covering->name != source_part_name)
{
/// We do not have source part locally, but we have some covering part. Possible options:
/// 1. We already have merged part (source_part_or_covering->name == new_part_name)
/// 2. We have some larger merged part which covers new_part_name (and therefore it covers source_part_name too)
/// 3. We have two intersecting parts, both cover source_part_name. It's logical error.
/// TODO Why 1 and 2 can happen? Do we need more assertions here or somewhere else?
constexpr const char * message = "Part {} is covered by {} but should be merged into {}. This shouldn't happen often.";
LOG_WARNING(log, message, source_part_name, source_part_or_covering->name, entry.new_part_name);
if (!source_part_or_covering->info.contains(MergeTreePartInfo::fromPartName(entry.new_part_name, format_version)))
throw Exception(ErrorCodes::LOGICAL_ERROR, message, source_part_name, source_part_or_covering->name, entry.new_part_name);
return false;
}
parts.push_back(source_part_or_covering);
}
/// All source parts are found locally, we can execute merge
if (entry.create_time + storage_settings_ptr->prefer_fetch_merged_part_time_threshold.totalSeconds() <= time(nullptr))
{
/// If the entry is old enough, has enough size, and the part exists on some replica,
/// then prefer fetching the merged part from a replica.
size_t sum_parts_bytes_on_disk = 0;
for (const auto & part : parts)
sum_parts_bytes_on_disk += part->getBytesOnDisk();
if (sum_parts_bytes_on_disk >= storage_settings_ptr->prefer_fetch_merged_part_size_threshold)
{
String replica = findReplicaHavingPart(entry.new_part_name, true); /// NOTE excessive ZK requests for same data later, may remove.
if (!replica.empty())
{
LOG_DEBUG(log, "Prefer to fetch {} from replica {}", entry.new_part_name, replica);
return false;
}
}
}
/// Start the main work
size_t estimated_space_for_merge = MergeTreeDataMergerMutator::estimateNeededDiskSpace(parts);
/// Can throw an exception while reserving space.
IMergeTreeDataPart::TTLInfos ttl_infos;
size_t max_volume_index = 0;
for (auto & part_ptr : parts)
{
ttl_infos.update(part_ptr->ttl_infos);
max_volume_index = std::max(max_volume_index, getStoragePolicy()->getVolumeIndexByDisk(part_ptr->volume->getDisk()));
}
auto table_lock = lockForShare(RWLockImpl::NO_QUERY, storage_settings_ptr->lock_acquire_timeout_for_background_operations);
StorageMetadataPtr metadata_snapshot = getInMemoryMetadataPtr();
FutureMergedMutatedPart future_merged_part(parts, entry.new_part_type);
if (future_merged_part.name != entry.new_part_name)
{
throw Exception("Future merged part name " + backQuote(future_merged_part.name) + " differs from part name in log entry: "
+ backQuote(entry.new_part_name), ErrorCodes::BAD_DATA_PART_NAME);
}
std::optional<CurrentlySubmergingEmergingTagger> tagger;
ReservationPtr reserved_space = balancedReservation(
metadata_snapshot,
estimated_space_for_merge,
max_volume_index,
future_merged_part.name,
future_merged_part.part_info,
future_merged_part.parts,
&tagger,
&ttl_infos);
if (!reserved_space)
reserved_space
= reserveSpacePreferringTTLRules(metadata_snapshot, estimated_space_for_merge, ttl_infos, time(nullptr), max_volume_index);
future_merged_part.uuid = entry.new_part_uuid;
future_merged_part.updatePath(*this, reserved_space);
future_merged_part.merge_type = entry.merge_type;
if (reserved_space->getDisk()->supportZeroCopyReplication()
&& storage_settings_ptr->allow_remote_fs_zero_copy_replication
&& merge_strategy_picker.shouldMergeOnSingleReplicaShared(entry))
{
if (!replica_to_execute_merge_picked)
replica_to_execute_merge = merge_strategy_picker.pickReplicaToExecuteMerge(entry);
if (replica_to_execute_merge)
{
LOG_DEBUG(log,
"Prefer fetching part {} from replica {} due to remote_fs_execute_merges_on_single_replica_time_threshold",
entry.new_part_name, replica_to_execute_merge.value());
return false;
}
}
/// Account TTL merge
if (isTTLMergeType(future_merged_part.merge_type))
getContext()->getMergeList().bookMergeWithTTL();
auto table_id = getStorageID();
/// Add merge to list
MergeList::EntryPtr merge_entry = getContext()->getMergeList().insert(getStorageID(), future_merged_part);
Transaction transaction(*this);
MutableDataPartPtr part;
Stopwatch stopwatch;
auto write_part_log = [&] (const ExecutionStatus & execution_status)
{
writePartLog(
PartLogElement::MERGE_PARTS, execution_status, stopwatch.elapsed(),
entry.new_part_name, part, parts, merge_entry.get());
};
try
{
part = merger_mutator.mergePartsToTemporaryPart(
future_merged_part,
metadata_snapshot,
*merge_entry,
table_lock,
entry.create_time,
getContext(),
reserved_space,
entry.deduplicate,
entry.deduplicate_by_columns,
merging_params);
merger_mutator.renameMergedTemporaryPart(part, parts, &transaction);
try
{
checkPartChecksumsAndCommit(transaction, part);
}
catch (const Exception & e)
{
if (MergeTreeDataPartChecksums::isBadChecksumsErrorCode(e.code()))
{
transaction.rollback();
ProfileEvents::increment(ProfileEvents::DataAfterMergeDiffersFromReplica);
LOG_ERROR(log,
"{}. Data after merge is not byte-identical to data on other replicas. There could be several"
" reasons: 1. Using newer version of compression library after server update. 2. Using another"
" compression method. 3. Non-deterministic compression algorithm (highly unlikely). 4."
" Non-deterministic merge algorithm due to logical error in code. 5. Data corruption in memory due"
" to bug in code. 6. Data corruption in memory due to hardware issue. 7. Manual modification of"
" source data after server startup. 8. Manual modification of checksums stored in ZooKeeper. 9."
" Part format related settings like 'enable_mixed_granularity_parts' are different on different"
" replicas. We will download the merged part from a replica to force a byte-identical result.",
getCurrentExceptionMessage(false));
write_part_log(ExecutionStatus::fromCurrentException());
if (storage_settings_ptr->detach_not_byte_identical_parts)
forgetPartAndMoveToDetached(std::move(part), "merge-not-byte-identical");
else
tryRemovePartImmediately(std::move(part));
/// No need to delete the part from ZK because we can be sure that the commit transaction
/// didn't go through.
return false;
}
throw;
}
/** Removing old parts from ZK and from the disk is delayed - see ReplicatedMergeTreeCleanupThread, clearOldParts.
*/
/** With `ZSESSIONEXPIRED` or `ZOPERATIONTIMEOUT`, we can inadvertently roll back local changes to the parts.
* This is not a problem, because in this case the merge will remain in the queue, and we will try again.
*/
merge_selecting_task->schedule();
ProfileEvents::increment(ProfileEvents::ReplicatedPartMerges);
write_part_log({});
return true;
}
catch (...)
{
write_part_log(ExecutionStatus::fromCurrentException());
throw;
}
}
bool StorageReplicatedMergeTree::tryExecutePartMutation(const StorageReplicatedMergeTree::LogEntry & entry)
{
const String & source_part_name = entry.source_parts.at(0);
const auto storage_settings_ptr = getSettings();
LOG_TRACE(log, "Executing log entry to mutate part {} to {}", source_part_name, entry.new_part_name);
DataPartPtr source_part = getActiveContainingPart(source_part_name);
if (!source_part)
{
LOG_DEBUG(log, "Source part {} for {} is not ready; will try to fetch it instead", source_part_name, entry.new_part_name);
return false;
}
if (source_part->name != source_part_name)
{
LOG_WARNING(log, "Part " + source_part_name + " is covered by " + source_part->name
+ " but should be mutated to " + entry.new_part_name + ". "
+ "Possibly the mutation of this part is not needed and will be skipped. This shouldn't happen often.");
return false;
}
/// TODO - some better heuristic?
size_t estimated_space_for_result = MergeTreeDataMergerMutator::estimateNeededDiskSpace({source_part});
if (entry.create_time + storage_settings_ptr->prefer_fetch_merged_part_time_threshold.totalSeconds() <= time(nullptr)
&& estimated_space_for_result >= storage_settings_ptr->prefer_fetch_merged_part_size_threshold)
{
/// If the entry is old enough, has enough size, and some replica has the desired part,
/// then prefer fetching from that replica.
String replica = findReplicaHavingPart(entry.new_part_name, true); /// NOTE excessive ZK requests for same data later, may remove.
if (!replica.empty())
{
LOG_DEBUG(log, "Prefer to fetch {} from replica {}", entry.new_part_name, replica);
return false;
}
}
MergeTreePartInfo new_part_info = MergeTreePartInfo::fromPartName(
entry.new_part_name, format_version);
MutationCommands commands = queue.getMutationCommands(source_part, new_part_info.mutation);
/// Once we mutate a part, we must reserve space on the same disk, because mutations can possibly create hardlinks.
/// Can throw an exception.
ReservationPtr reserved_space = reserveSpace(estimated_space_for_result, source_part->volume);
auto table_lock = lockForShare(
RWLockImpl::NO_QUERY, storage_settings_ptr->lock_acquire_timeout_for_background_operations);
StorageMetadataPtr metadata_snapshot = getInMemoryMetadataPtr();
MutableDataPartPtr new_part;
Transaction transaction(*this);
FutureMergedMutatedPart future_mutated_part;
future_mutated_part.name = entry.new_part_name;
future_mutated_part.uuid = entry.new_part_uuid;
future_mutated_part.parts.push_back(source_part);
future_mutated_part.part_info = new_part_info;
future_mutated_part.updatePath(*this, reserved_space);
future_mutated_part.type = source_part->getType();
MergeList::EntryPtr merge_entry = getContext()->getMergeList().insert(getStorageID(), future_mutated_part);
Stopwatch stopwatch;
auto write_part_log = [&] (const ExecutionStatus & execution_status)
{
writePartLog(
PartLogElement::MUTATE_PART, execution_status, stopwatch.elapsed(),
entry.new_part_name, new_part, future_mutated_part.parts, merge_entry.get());
};
try
{
new_part = merger_mutator.mutatePartToTemporaryPart(
future_mutated_part, metadata_snapshot, commands, *merge_entry,
entry.create_time, getContext(), reserved_space, table_lock);
renameTempPartAndReplace(new_part, nullptr, &transaction);
try
{
checkPartChecksumsAndCommit(transaction, new_part);
}
catch (const Exception & e)
{
if (MergeTreeDataPartChecksums::isBadChecksumsErrorCode(e.code()))
{
transaction.rollback();
ProfileEvents::increment(ProfileEvents::DataAfterMutationDiffersFromReplica);
LOG_ERROR(log, "{}. Data after mutation is not byte-identical to data on other replicas. We will download the merged part from a replica to force a byte-identical result.", getCurrentExceptionMessage(false));
write_part_log(ExecutionStatus::fromCurrentException());
if (storage_settings_ptr->detach_not_byte_identical_parts)
forgetPartAndMoveToDetached(std::move(new_part), "mutate-not-byte-identical");
else
tryRemovePartImmediately(std::move(new_part));
/// No need to delete the part from ZK because we can be sure that the commit transaction
/// didn't go through.
return false;
}
throw;
}
/** With `ZSESSIONEXPIRED` or `ZOPERATIONTIMEOUT`, we can inadvertently roll back local changes to the parts.
* This is not a problem, because in this case the entry will remain in the queue, and we will try again.
*/
merge_selecting_task->schedule();
ProfileEvents::increment(ProfileEvents::ReplicatedPartMutations);
write_part_log({});
return true;
}
catch (...)
{
write_part_log(ExecutionStatus::fromCurrentException());
throw;
}
// return true;
}
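// Context: both functions above are deleted in favor of MergeFromLogEntryTask
// and MutateFromLogEntryTask (created in scheduleDataProcessingJob further
// below). The structural shift is from one blocking call per log entry to
// resumable tasks that a background executor drives in steps. A toy model of
// that execution style (illustrative names, not the real task interface):
#include <iostream>
#include <memory>
#include <queue>

struct StepTaskSketch
{
    int steps_left;

    bool executeStep()
    {
        std::cout << "did one step, " << --steps_left << " left\n";
        return steps_left > 0; // true => schedule me again
    }
};

int main()
{
    std::queue<std::shared_ptr<StepTaskSketch>> pool;
    pool.push(std::make_shared<StepTaskSketch>(StepTaskSketch{3}));
    while (!pool.empty())
    {
        auto task = pool.front();
        pool.pop();
        if (task->executeStep())
            pool.push(task); // cooperative rescheduling instead of blocking a pool thread
    }
}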
@ -3135,9 +2748,9 @@ ReplicatedMergeTreeQueue::SelectedEntryPtr StorageReplicatedMergeTree::selectQue
return selected;
}
bool StorageReplicatedMergeTree::processQueueEntry(ReplicatedMergeTreeQueue::SelectedEntryPtr selected_entry)
{
LogEntryPtr & entry = selected_entry->log_entry;
return queue.processEntry([this]{ return getZooKeeper(); }, entry, [&](LogEntryPtr & entry_to_process)
{
@ -3192,8 +2805,10 @@ bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobsAssigne
if (!selected_entry)
return false;
auto job_type = selected_entry->log_entry->type;
/// Depending on the entry type, execute in the small fetches pool or in the big merge_mutate pool
if (selected_entry->log_entry->type == LogEntry::GET_PART)
if (job_type == LogEntry::GET_PART)
{
assignee.scheduleFetchTask(ExecutableLambdaAdapter::create(
[this, selected_entry] () mutable
@ -3202,6 +2817,18 @@ bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobsAssigne
}, common_assignee_trigger, getStorageID()));
return true;
}
else if (job_type == LogEntry::MERGE_PARTS)
{
auto task = MergeFromLogEntryTask::create(selected_entry, *this, common_assignee_trigger);
assignee.scheduleMergeMutateTask(task);
return true;
}
else if (job_type == LogEntry::MUTATE_PART)
{
auto task = MutateFromLogEntryTask::create(selected_entry, *this, common_assignee_trigger);
assignee.scheduleMergeMutateTask(task);
return true;
}
else
{
assignee.scheduleMergeMutateTask(ExecutableLambdaAdapter::create(
@ -3288,24 +2915,24 @@ void StorageReplicatedMergeTree::mergeSelectingTask()
bool merge_with_ttl_allowed = merges_and_mutations_queued.merges_with_ttl < storage_settings_ptr->max_replicated_merges_with_ttl_in_queue &&
getTotalMergesWithTTLInMergeList() < storage_settings_ptr->max_number_of_merges_with_ttl_in_pool;
FutureMergedMutatedPart future_merged_part;
auto future_merged_part = std::make_shared<FutureMergedMutatedPart>();
if (storage_settings.get()->assign_part_uuids)
future_merged_part.uuid = UUIDHelpers::generateV4();
future_merged_part->uuid = UUIDHelpers::generateV4();
if (max_source_parts_size_for_merge > 0 &&
merger_mutator.selectPartsToMerge(future_merged_part, false, max_source_parts_size_for_merge, merge_pred, merge_with_ttl_allowed, nullptr) == SelectPartsDecision::SELECTED)
{
create_result = createLogEntryToMergeParts(
zookeeper,
future_merged_part.parts,
future_merged_part.name,
future_merged_part.uuid,
future_merged_part.type,
future_merged_part->parts,
future_merged_part->name,
future_merged_part->uuid,
future_merged_part->type,
deduplicate,
deduplicate_by_columns,
nullptr,
merge_pred.getVersion(),
future_merged_part.merge_type);
future_merged_part->merge_type);
}
/// If there are many mutations in the queue, it may happen that we cannot enqueue enough merges to merge all new parts
else if (max_source_part_size_for_mutation > 0 && queue.countMutations() > 0
@ -3324,7 +2951,7 @@ void StorageReplicatedMergeTree::mergeSelectingTask()
create_result = createLogEntryToMutatePart(
*part,
future_merged_part.uuid,
future_merged_part->uuid,
desired_mutation_version->first,
desired_mutation_version->second,
merge_pred.getVersion());
@ -4299,8 +3926,6 @@ void StorageReplicatedMergeTree::startup()
try
{
queue.initialize(getDataParts());
InterserverIOEndpointPtr data_parts_exchange_ptr = std::make_shared<DataPartsExchange::Service>(*this);
[[maybe_unused]] auto prev_ptr = std::atomic_exchange(&data_parts_exchange_endpoint, data_parts_exchange_ptr);
assert(prev_ptr == nullptr);
@ -4609,10 +4234,10 @@ bool StorageReplicatedMergeTree::optimize(
std::lock_guard merge_selecting_lock(merge_selecting_mutex);
ReplicatedMergeTreeMergePredicate can_merge = queue.getMergePredicate(zookeeper);
FutureMergedMutatedPart future_merged_part;
auto future_merged_part = std::make_shared<FutureMergedMutatedPart>();
if (storage_settings.get()->assign_part_uuids)
future_merged_part.uuid = UUIDHelpers::generateV4();
future_merged_part->uuid = UUIDHelpers::generateV4();
SelectPartsDecision select_decision = merger_mutator.selectAllPartsToMergeWithinPartition(
future_merged_part, disk_space, can_merge, partition_id, true, metadata_snapshot, nullptr, query_context->getSettingsRef().optimize_skip_merged_partitions);
@ -4622,10 +4247,10 @@ bool StorageReplicatedMergeTree::optimize(
ReplicatedMergeTreeLogEntryData merge_entry;
CreateMergeEntryResult create_result = createLogEntryToMergeParts(
zookeeper, future_merged_part.parts,
future_merged_part.name, future_merged_part.uuid, future_merged_part.type,
zookeeper, future_merged_part->parts,
future_merged_part->name, future_merged_part->uuid, future_merged_part->type,
deduplicate, deduplicate_by_columns,
&merge_entry, can_merge.getVersion(), future_merged_part.merge_type);
&merge_entry, can_merge.getVersion(), future_merged_part->merge_type);
if (create_result == CreateMergeEntryResult::MissingPart)
return handle_noop("Can't create merge queue node in ZooKeeper, because some parts are missing");
@ -4649,9 +4274,9 @@ bool StorageReplicatedMergeTree::optimize(
std::lock_guard merge_selecting_lock(merge_selecting_mutex);
ReplicatedMergeTreeMergePredicate can_merge = queue.getMergePredicate(zookeeper);
FutureMergedMutatedPart future_merged_part;
auto future_merged_part = std::make_shared<FutureMergedMutatedPart>();
if (storage_settings.get()->assign_part_uuids)
future_merged_part.uuid = UUIDHelpers::generateV4();
future_merged_part->uuid = UUIDHelpers::generateV4();
String disable_reason;
SelectPartsDecision select_decision = SelectPartsDecision::CANNOT_SELECT;
@ -4684,10 +4309,10 @@ bool StorageReplicatedMergeTree::optimize(
ReplicatedMergeTreeLogEntryData merge_entry;
CreateMergeEntryResult create_result = createLogEntryToMergeParts(
zookeeper, future_merged_part.parts,
future_merged_part.name, future_merged_part.uuid, future_merged_part.type,
zookeeper, future_merged_part->parts,
future_merged_part->name, future_merged_part->uuid, future_merged_part->type,
deduplicate, deduplicate_by_columns,
&merge_entry, can_merge.getVersion(), future_merged_part.merge_type);
&merge_entry, can_merge.getVersion(), future_merged_part->merge_type);
if (create_result == CreateMergeEntryResult::MissingPart)
return handle_noop("Can't create merge queue node in ZooKeeper, because some parts are missing");
@ -5957,7 +5582,7 @@ void StorageReplicatedMergeTree::mutate(const MutationCommands & commands, Conte
///
/// Leader replica records its decisions to the replication log (/log directory in ZK) in the form of
/// MUTATE_PART entries and all replicas then execute them in the background pool
/// (see tryExecutePartMutation() function). When a replica encounters a MUTATE_PART command, it is
/// (see MutateTask class). When a replica encounters a MUTATE_PART command, it is
/// guaranteed that the corresponding mutation entry is already loaded (when we pull entries from
/// replication log into the replica queue, we also load mutation entries). Note that just as with merges
/// the replica can decide not to do the mutation locally and fetch the mutated part from another replica
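// Context for the comment above: a MUTATE_PART entry names a source part and a
// target mutation version, and a part needs a mutation exactly when its data
// version is below that mutation's version. A toy illustration of that
// selection rule (plain integers here; in ClickHouse the versions come from
// block numbers allocated in ZooKeeper):
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main()
{
    std::map<int64_t, std::string> mutations{{6, "DELETE WHERE x = 0"}};
    std::vector<std::pair<std::string, int64_t>> parts{
        {"all_1_1_0", 0},   // never mutated
        {"all_2_2_0_6", 6}, // already at version 6, nothing to do
    };
    for (const auto & [name, data_version] : parts)
        for (const auto & [version, command] : mutations)
            if (data_version < version)
                std::cout << name << " -> mutate to version " << version << " (" << command << ")\n";
}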
@ -6041,7 +5666,18 @@ void StorageReplicatedMergeTree::waitMutation(const String & znode_name, size_t
auto zookeeper = getZooKeeper();
Strings replicas;
if (mutations_sync == 2) /// wait for all replicas
{
replicas = zookeeper->getChildren(fs::path(zookeeper_path) / "replicas");
/// This replica should be first, to ensure that the mutation will be loaded into memory
for (auto it = replicas.begin(); it != replicas.end(); ++it)
{
if (*it == replica_name)
{
std::iter_swap(it, replicas.rbegin());
break;
}
}
}
else if (mutations_sync == 1) /// just wait for ourself
replicas.push_back(replica_name);
@ -7459,8 +7095,8 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
ReservationPtr reservation = reserveSpacePreferringTTLRules(metadata_snapshot, 0, move_ttl_infos, time(nullptr), 0, true);
VolumePtr volume = getStoragePolicy()->getVolume(0);
IMergeTreeDataPart::MinMaxIndex minmax_idx;
minmax_idx.update(block, getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
auto minmax_idx = std::make_shared<IMergeTreeDataPart::MinMaxIndex>();
minmax_idx->update(block, getMinMaxColumnsNames(metadata_snapshot->getPartitionKey()));
auto new_data_part = createPart(
lost_part_name,

View File

@ -21,6 +21,8 @@
#include <Storages/MergeTree/ReplicatedMergeTreeAddress.h>
#include <Storages/MergeTree/LeaderElection.h>
#include <Storages/MergeTree/PartMovesBetweenShardsOrchestrator.h>
#include <Storages/MergeTree/FutureMergedMutatedPart.h>
#include <Storages/MergeTree/MergeFromLogEntryTask.h>
#include <DataTypes/DataTypesNumber.h>
#include <Interpreters/Cluster.h>
#include <Interpreters/PartLog.h>
@ -282,6 +284,9 @@ private:
friend class ReplicatedMergeTreeQueue;
friend class PartMovesBetweenShardsOrchestrator;
friend class MergeTreeData;
friend class MergeFromLogEntryTask;
friend class MutateFromLogEntryTask;
friend class ReplicatedMergeMutateTaskBase;
using MergeStrategyPicker = ReplicatedMergeTreeMergeStrategyPicker;
using LogEntry = ReplicatedMergeTreeLogEntry;
@ -470,19 +475,11 @@ private:
void executeDropRange(const LogEntry & entry);
/// Do the merge or recommend to make the fetch instead of the merge
bool tryExecuteMerge(const LogEntry & entry);
/// Execute alter of table metadata. Set replica/metadata and replica/columns
/// nodes in zookeeper and also changes in memory metadata.
/// New metadata and columns values stored in entry.
bool executeMetadataAlter(const LogEntry & entry);
/// Execute MUTATE_PART entry. Part name and mutation commands
/// stored in entry. This function relies on MergerMutator class.
bool tryExecutePartMutation(const LogEntry & entry);
/// Fetch part from other replica (inserted or merged/mutated)
/// NOTE: Attention! First of all tries to find covering part on other replica
/// and set it into entry.actual_new_part_name. After that tries to fetch this new covering part.
@ -512,6 +509,9 @@ private:
ReplicatedMergeTreeQueue::SelectedEntryPtr selectQueueEntry();
MergeFromLogEntryTaskPtr getTaskToProcessMergeQueueEntry(ReplicatedMergeTreeQueue::SelectedEntryPtr entry);
bool processQueueEntry(ReplicatedMergeTreeQueue::SelectedEntryPtr entry);
/// Postcondition:
@ -605,7 +605,6 @@ private:
std::unordered_set<String> currently_fetching_parts;
std::mutex currently_fetching_parts_mutex;
/// With the quorum being tracked, add a replica to the quorum for the part.
void updateQuorum(const String & part_name, bool is_parallel);

View File

@ -109,7 +109,7 @@ static void check(
std::string transformed_query = transformQueryForExternalDatabase(
query_info, state.getColumns(), IdentifierQuotingStyle::DoubleQuotes, "test", "table", state.context);
EXPECT_EQ(transformed_query, expected);
EXPECT_EQ(transformed_query, expected) << query;
}
@ -128,6 +128,18 @@ TEST(TransformQueryForExternalDatabase, InWithSingleElement)
R"(SELECT "column" FROM "test"."table" WHERE "column" NOT IN ('hello', 'world'))");
}
TEST(TransformQueryForExternalDatabase, InWithMultipleColumns)
{
const State & state = State::instance();
check(state, 1,
"SELECT column FROM test.table WHERE (1,1) IN ((1,1))",
R"(SELECT "column" FROM "test"."table" WHERE 1)");
check(state, 1,
"SELECT field, value FROM test.table WHERE (field, value) IN (('foo', 'bar'))",
R"(SELECT "field", "value" FROM "test"."table" WHERE ("field", "value") IN (('foo', 'bar')))");
}
TEST(TransformQueryForExternalDatabase, InWithTable)
{
const State & state = State::instance();

View File

@ -105,9 +105,9 @@ void dropAliases(ASTPtr & node)
}
bool isCompatible(const IAST & node)
bool isCompatible(IAST & node)
{
if (const auto * function = node.as<ASTFunction>())
if (auto * function = node.as<ASTFunction>())
{
if (function->parameters) /// Parametric aggregate functions
return false;
@ -135,8 +135,14 @@ bool isCompatible(const IAST & node)
/// A tuple with zero or one elements is represented by a function tuple(x) and is not compatible,
/// but a normal tuple with more than one element is represented as a parenthesized expression (x, y) and is perfectly compatible.
if (name == "tuple" && function->arguments->children.size() <= 1)
return false;
/// So to support tuples with zero or one elements, we can clear the function name to get (x) instead of tuple(x)
if (name == "tuple")
{
if (function->arguments->children.size() <= 1)
{
function->name.clear();
}
}
/// If the right hand side of IN is a table identifier (example: x IN table), then it's not compatible.
if ((name == "in" || name == "notIn")

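// Context: the fix above relies on how an ASTFunction is printed -- with an
// empty name, "name(args)" degrades to just "(args)", so a one-element tuple
// survives as ('foo') instead of tuple('foo'), which external SQL dialects
// reject. The new InWithMultipleColumns test earlier in this diff exercises
// the related IN cases. A standalone illustration of the formatting effect
// (an illustration of the intent, not the actual AST printing code):
#include <iostream>
#include <string>
#include <vector>

std::string renderFunction(const std::string & name, const std::vector<std::string> & args)
{
    std::string out = name + "(";
    for (size_t i = 0; i < args.size(); ++i)
        out += (i ? ", " : "") + args[i];
    return out + ")";
}

int main()
{
    std::cout << renderFunction("tuple", {"'foo'"}) << '\n'; // tuple('foo') -- incompatible
    std::cout << renderFunction("", {"'foo'"}) << '\n';      // ('foo')      -- compatible
}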
View File

@ -33,6 +33,7 @@ SRCS(
MergeTree/DataPartsExchange.cpp
MergeTree/DropPartsRanges.cpp
MergeTree/EphemeralLockInZooKeeper.cpp
MergeTree/FutureMergedMutatedPart.cpp
MergeTree/IMergeTreeDataPart.cpp
MergeTree/IMergeTreeDataPartWriter.cpp
MergeTree/IMergeTreeReader.cpp
@ -40,7 +41,10 @@ SRCS(
MergeTree/KeyCondition.cpp
MergeTree/LevelMergeSelector.cpp
MergeTree/MergeAlgorithm.cpp
MergeTree/MergeFromLogEntryTask.cpp
MergeTree/MergeList.cpp
MergeTree/MergePlainMergeTreeTask.cpp
MergeTree/MergeTask.cpp
MergeTree/MergeTreeBackgroundExecutor.cpp
MergeTree/MergeTreeBaseSelectProcessor.cpp
MergeTree/MergeTreeBlockReadUtils.cpp
@ -95,10 +99,14 @@ SRCS(
MergeTree/MergeType.cpp
MergeTree/MergedBlockOutputStream.cpp
MergeTree/MergedColumnOnlyOutputStream.cpp
MergeTree/MutateFromLogEntryTask.cpp
MergeTree/MutatePlainMergeTreeTask.cpp
MergeTree/MutateTask.cpp
MergeTree/PartMovesBetweenShardsOrchestrator.cpp
MergeTree/PartitionPruner.cpp
MergeTree/PinnedPartUUIDs.cpp
MergeTree/ReplicatedFetchList.cpp
MergeTree/ReplicatedMergeMutateTaskBase.cpp
MergeTree/ReplicatedMergeTreeAddress.cpp
MergeTree/ReplicatedMergeTreeAltersSequence.cpp
MergeTree/ReplicatedMergeTreeCleanupThread.cpp

Some files were not shown because too many files have changed in this diff.