Merge branch 'master' into 46229-repl-clickhouse-keeper

This commit is contained in:
Nikita Mikhaylov 2023-05-31 02:29:08 +02:00 committed by GitHub
commit 31829f7cfc
254 changed files with 4845 additions and 1553 deletions

2
contrib/aws vendored

@ -1 +1 @@
Subproject commit ecccfc026a42b30023289410a67024d561f4bf3e
Subproject commit ca02358dcc7ce3ab733dd4cbcc32734eecfa4ee3

2
contrib/aws-c-auth vendored

@ -1 +1 @@
Subproject commit 30df6c407e2df43bd244e2c34c9b4a4b87372bfb
Subproject commit 97133a2b5dbca1ccdf88cd6f44f39d0531d27d12

@ -1 +1 @@
Subproject commit 324fd1d973ccb25c813aa747bf1759cfde5121c5
Subproject commit 45dcb2849c891dba2100b270b4676765c92949ff

@ -1 +1 @@
Subproject commit 39bfa94a14b7126bf0c1330286ef8db452d87e66
Subproject commit 2f9b60c42f90840ec11822acda3d8cdfa97a773d

2
contrib/aws-c-http vendored

@ -1 +1 @@
Subproject commit 2c5a2a7d5556600b9782ffa6c9d7e09964df1abc
Subproject commit dd34461987947672444d0bc872c5a733dfdb9711

2
contrib/aws-c-io vendored

@ -1 +1 @@
Subproject commit 5d32c453560d0823df521a686bf7fbacde7f9be3
Subproject commit d58ed4f272b1cb4f89ac9196526ceebe5f2b0d89

2
contrib/aws-c-mqtt vendored

@ -1 +1 @@
Subproject commit 882c689561a3db1466330ccfe3b63637e0a575d3
Subproject commit 33c3455cec82b16feb940e12006cefd7b3ef4194

2
contrib/aws-c-s3 vendored

@ -1 +1 @@
Subproject commit a41255ece72a7c887bba7f9d998ca3e14f4c8a1b
Subproject commit d7bfe602d6925948f1fff95784e3613cca6a3900

@ -1 +1 @@
Subproject commit 25bf5cf225f977c3accc6a05a0a7a181ef2a4a30
Subproject commit 208a701fa01e99c7c8cc3dcebc8317da71362972

@ -1 +1 @@
Subproject commit 48e7c0e01479232f225c8044d76c84e74192889d
Subproject commit ad53be196a25bbefa3700a01187fdce573a7d2d0

View File

@ -52,8 +52,8 @@ endif()
# Directories.
SET(AWS_SDK_DIR "${ClickHouse_SOURCE_DIR}/contrib/aws")
SET(AWS_SDK_CORE_DIR "${AWS_SDK_DIR}/aws-cpp-sdk-core")
SET(AWS_SDK_S3_DIR "${AWS_SDK_DIR}/aws-cpp-sdk-s3")
SET(AWS_SDK_CORE_DIR "${AWS_SDK_DIR}/src/aws-cpp-sdk-core")
SET(AWS_SDK_S3_DIR "${AWS_SDK_DIR}/generated/src/aws-cpp-sdk-s3")
SET(AWS_AUTH_DIR "${ClickHouse_SOURCE_DIR}/contrib/aws-c-auth")
SET(AWS_CAL_DIR "${ClickHouse_SOURCE_DIR}/contrib/aws-c-cal")

2
contrib/aws-crt-cpp vendored

@ -1 +1 @@
Subproject commit ec0bea288f451d884c0d80d534bc5c66241c39a4
Subproject commit 8a301b7e842f1daed478090c869207300972379f

2
contrib/aws-s2n-tls vendored

@ -1 +1 @@
Subproject commit 0f1ba9e5c4a67cb3898de0c0b4f911d4194dc8de
Subproject commit 71f4794b7580cf780eb4aca77d69eded5d3c7bb4

2
contrib/boost vendored

@ -1 +1 @@
Subproject commit 8fe7b3326ef482ee6ecdf5a4f698f2b8c2780f98
Subproject commit aec12eea7fc762721ae16943d1361340c66c9c17

View File

@ -25,6 +25,9 @@ message(STATUS "Intel QPL version: ${QPL_VERSION}")
# Generate 8 library targets: middle_layer_lib, isal, isal_asm, qplcore_px, qplcore_avx512, qplcore_sw_dispatcher, core_iaa, middle_layer_lib.
# Output ch_contrib::qpl by linking with 8 library targets.
# The qpl submodule comes with its own version of isal. It contains code which does not exist in upstream isal. It would be nice to link
# only upstream isal (ch_contrib::isal) but at this point we can't.
include("${QPL_PROJECT_DIR}/cmake/CompileOptions.cmake")
# check nasm compiler

View File

@ -5,8 +5,8 @@ echo "Using sparse checkout for aws"
FILES_TO_CHECKOUT=$(git rev-parse --git-dir)/info/sparse-checkout
echo '/*' > $FILES_TO_CHECKOUT
echo '!/*/*' >> $FILES_TO_CHECKOUT
echo '/aws-cpp-sdk-core/*' >> $FILES_TO_CHECKOUT
echo '/aws-cpp-sdk-s3/*' >> $FILES_TO_CHECKOUT
echo '/src/aws-cpp-sdk-core/*' >> $FILES_TO_CHECKOUT
echo '/generated/src/aws-cpp-sdk-s3/*' >> $FILES_TO_CHECKOUT
git config core.sparsecheckout true
git checkout $1

View File

@ -131,14 +131,17 @@ CREATE TABLE table_with_asterisk (name String, value UInt32)
The following settings can be set before query execution or placed into configuration file.
- `s3_max_single_part_upload_size` — The maximum size of object to upload using singlepart upload to S3. Default value is `64Mb`.
- `s3_min_upload_part_size` — The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `512Mb`.
- `s3_max_single_part_upload_size` — The maximum size of object to upload using singlepart upload to S3. Default value is `32Mb`.
- `s3_min_upload_part_size` — The minimum size of part to upload during multipart upload to [S3 Multipart upload](https://docs.aws.amazon.com/AmazonS3/latest/dev/uploadobjusingmpu.html). Default value is `16Mb`.
- `s3_max_redirects` — Max number of S3 redirects hops allowed. Default value is `10`.
- `s3_single_read_retries` — The maximum number of attempts during single read. Default value is `4`.
- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited).
- `s3_max_put_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_put_rps`.
- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited).
- `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_get_rps`.
- `s3_upload_part_size_multiply_factor` - Multiply `s3_min_upload_part_size` by this factor each time `s3_multiply_parts_count_threshold` parts were uploaded from a single write to S3. Default value is `2`.
- `s3_upload_part_size_multiply_parts_count_threshold` - Each time this number of parts was uploaded to S3, `s3_min_upload_part_size` is multiplied by `s3_upload_part_size_multiply_factor`. Default value is `500`.
- `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object. The value `0` means unlimited. Default value is `20`. Each in-flight part holds a buffer of size `s3_min_upload_part_size` for the first `s3_upload_part_size_multiply_parts_count_threshold` parts, and a larger one when the file is big enough (see `s3_upload_part_size_multiply_factor`). With default settings, one uploaded file consumes at most `320Mb` while it is smaller than `8G` (see the worked check below); consumption is greater for larger files.
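A worked check of the `320Mb` figure, using the defaults quoted above (the part size has not yet been multiplied while fewer than `500` parts, i.e. about $500 \times 16\,\mathrm{Mb} \approx 8\,\mathrm{G}$, have been uploaded):

$$
20 \text{ in-flight parts} \times 16\,\mathrm{Mb} \text{ per buffer} = 320\,\mathrm{Mb}.
$$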
Security consideration: if a malicious user can specify arbitrary S3 URLs, `s3_max_redirects` must be set to zero to avoid [SSRF](https://en.wikipedia.org/wiki/Server-side_request_forgery) attacks; alternatively, `remote_host_filter` must be specified in the server configuration.

View File

@ -1219,11 +1219,12 @@ Authentication parameters (the disk will try all available methods **and** Manag
* `account_name` and `account_key` - For authentication using Shared Key.
Limit parameters (mainly for internal usage):
* `max_single_part_upload_size` - Limits the size of a single block upload to Blob Storage.
* `s3_max_single_part_upload_size` - Limits the size of a single block upload to Blob Storage.
* `min_bytes_for_seek` - Limits the size of a seekable region.
* `max_single_read_retries` - Limits the number of attempts to read a chunk of data from Blob Storage.
* `max_single_download_retries` - Limits the number of attempts to download a readable buffer from Blob Storage.
* `thread_pool_size` - Limits the number of threads with which `IDiskRemote` is instantiated.
* `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object.
Other parameters:
* `metadata_path` - Path on local FS to store metadata files for Blob Storage. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.

View File

@ -258,4 +258,4 @@ Since [remote](../../../sql-reference/table-functions/remote.md) and [cluster](.
- [Virtual columns](../../../engines/table-engines/index.md#table_engines-virtual_columns) description
- [background_distributed_schedule_pool_size](../../../operations/settings/settings.md#background_distributed_schedule_pool_size) setting
- [shardNum()](../../../sql-reference/functions/other-functions.md#shard-num) and [shardCount()](../../../sql-reference/functions/other-functions.md#shard-count) functions
- [shardNum()](../../../sql-reference/functions/other-functions.md#shardnum) and [shardCount()](../../../sql-reference/functions/other-functions.md#shardcount) functions

View File

@ -37,7 +37,7 @@ The data is in CSV files but uses a semi-colon for the delimiter. The rows look
│ 13199 │ BMP180 │ 6664 │ 52.514 │ 13.44 │ 2019-06-01T00:00:07 │ 101855.54 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 19.74 │
│ 12753 │ BMP180 │ 6440 │ 44.616 │ 2.032 │ 2019-06-01T00:00:07 │ 99475 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 17 │
│ 16956 │ BMP180 │ 8594 │ 52.052 │ 8.354 │ 2019-06-01T00:00:08 │ 101322 │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ 17.2 │
└───────────┴─────────────┴──────────┴────────┴───────┴─────────────────────┴──────────┴──────────┴───────────────────┴─────────────┘
└───────────┴─────────────┴──────────┴────────┴───────┴─────────────────────┴──────────┴──────────┴───────────────────┴─────────────┘
```
2. We will use the following `MergeTree` table to store the data in ClickHouse:

View File

@ -167,9 +167,9 @@ user = 'myuser',
password = 'mypass',
host = '127.0.0.1',
port = 3306,
database = 'test'
connection_pool_size = 8
on_duplicate_clause = 1
database = 'test',
connection_pool_size = 8,
on_duplicate_clause = 1,
replace_query = 1
```

View File

@ -917,9 +917,9 @@ We recommend using this option in macOS since the `getrlimit()` function returns
Restriction on deleting tables.
If the size of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `max_table_size_to_drop` (in bytes), you can't delete it using a DROP query.
If the size of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `max_table_size_to_drop` (in bytes), you can't delete it using a [DROP](../../sql-reference/statements/drop.md) query or [TRUNCATE](../../sql-reference/statements/truncate.md) query.
If you still need to delete the table without restarting the ClickHouse server, create the `<clickhouse-path>/flags/force_drop_table` file and run the DROP query.
This setting does not require a restart of the ClickHouse server to apply. Another way to disable the restriction is to create the `<clickhouse-path>/flags/force_drop_table` file.
Default value: 50 GB.
@ -931,6 +931,28 @@ The value 0 means that you can delete all tables without any restrictions.
<max_table_size_to_drop>0</max_table_size_to_drop>
```
## max_partition_size_to_drop {#max-partition-size-to-drop}
Restriction on dropping partitions.
If the size of a [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) table exceeds `max_partition_size_to_drop` (in bytes), you can't drop a partition using a [DROP PARTITION](../../sql-reference/statements/alter/partition.md#drop-partitionpart) query.
This setting does not require a restart of the ClickHouse server to apply. Another way to disable the restriction is to create the `<clickhouse-path>/flags/force_drop_table` file.
Default value: 50 GB.
The value 0 means that you can drop partitions without any restrictions.
:::note
This limitation does not restrict `DROP TABLE` and `TRUNCATE TABLE`, see [max_table_size_to_drop](#max-table-size-to-drop).
:::
**Example**
``` xml
<max_partition_size_to_drop>0</max_partition_size_to_drop>
```
## max_thread_pool_size {#max-thread-pool-size}
ClickHouse uses threads from the Global Thread pool to process queries. If there is no idle thread to process a query, then a new thread is created in the pool. `max_thread_pool_size` limits the maximum number of threads in the pool.
@ -1319,12 +1341,14 @@ Queries are logged in the [system.part_log](../../operations/system-tables/part_
Use the following parameters to configure logging:
- `database` — Name of the database.
- `table` — Name of the system table.
- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
- `storage_policy` — Name of storage policy to use for the table (optional)
- `database` - Name of the database.
- `table` - Name of the system table.
- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined.
- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table.
- `storage_policy` - Name of storage policy to use for the table (optional).
- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional).
**Example**
@ -1395,12 +1419,14 @@ Queries are logged in the [system.query_log](../../operations/system-tables/quer
Use the following parameters to configure logging:
- `database` — Name of the database.
- `table` — Name of the system table the queries will be logged in.
- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
- `storage_policy` — Name of storage policy to use for the table (optional)
- `database` - Name of the database.
- `table` - Name of the system table the queries will be logged in.
- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined.
- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table.
- `storage_policy` - Name of storage policy to use for the table (optional).
- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional).
If the table does not exist, ClickHouse will create it. If the structure of the query log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically.
@ -1451,12 +1477,14 @@ Queries are logged in the [system.query_thread_log](../../operations/system-tabl
Use the following parameters to configure logging:
- `database` — Name of the database.
- `table` — Name of the system table the queries will be logged in.
- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
- `storage_policy` — Name of storage policy to use for the table (optional)
- `database` - Name of the database.
- `table` - Name of the system table the queries will be logged in.
- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined.
- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table.
- `storage_policy` - Name of storage policy to use for the table (optional).
- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional).
If the table does not exist, ClickHouse will create it. If the structure of the query thread log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically.
@ -1479,12 +1507,14 @@ Queries are logged in the [system.query_views_log](../../operations/system-table
Use the following parameters to configure logging:
- `database` — Name of the database.
- `table` — Name of the system table the queries will be logged in.
- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
- `storage_policy` — Name of storage policy to use for the table (optional)
- `database` - Name of the database.
- `table` - Name of the system table the queries will be logged in.
- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined.
- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table.
- `storage_policy` - Name of storage policy to use for the table (optional).
- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional).
If the table does not exist, ClickHouse will create it. If the structure of the query views log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically.
@ -1505,13 +1535,15 @@ Settings for the [text_log](../../operations/system-tables/text_log.md#system_ta
Parameters:
- `level` — Maximum Message Level (by default `Trace`) which will be stored in a table.
- `database` — Database name.
- `table` — Table name.
- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
- `storage_policy` — Name of storage policy to use for the table (optional)
- `level` - Maximum Message Level (by default `Trace`) which will be stored in a table.
- `database` - Database name.
- `table` - Table name.
- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` or `order_by` defined.
- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table.
- `storage_policy` - Name of storage policy to use for the table (optional).
- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional).
**Example**
```xml
@ -1534,12 +1566,14 @@ Settings for the [trace_log](../../operations/system-tables/trace_log.md#system_
Parameters:
- `database` — Database for storing a table.
- `table` — Table name.
- `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/index.md) for a system table. Can't be used if `partition_by` defined.
- `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
- `storage_policy` — Name of storage policy to use for the table (optional)
- `database` - Database for storing a table.
- `table` - Table name.
- `partition_by` - [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
- `order_by` - [Custom sorting key](../../engines/table-engines/mergetree-family/mergetree.md#order_by) for a system table. Can't be used if `engine` defined.
- `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/index.md) for a system table. Can't be used if `partition_by` or `order_by` defined.
- `flush_interval_milliseconds` - Interval for flushing data from the buffer in memory to the table.
- `storage_policy` - Name of storage policy to use for the table (optional).
- `settings` - [Additional parameters](../../engines/table-engines/mergetree-family/mergetree#settings) that control the behavior of the MergeTree (optional).
The default server configuration file `config.xml` contains the following settings section:

View File

@ -577,7 +577,7 @@ Default value: 20
**Usage**
The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings#background_pool_size) * [background_pool_size](/docs/en/operations/server-configuration-parameters/settings#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception.
The value of the `number_of_free_entries_in_pool_to_execute_mutation` setting should be less than the value of the [background_pool_size](/docs/en/operations/server-configuration-parameters/settings.md/#background_pool_size) * [background_merges_mutations_concurrency_ratio](/docs/en/operations/server-configuration-parameters/settings.md/#background_merges_mutations_concurrency_ratio). Otherwise, ClickHouse throws an exception.
## max_part_loading_threads {#max-part-loading-threads}

View File

@ -1187,6 +1187,36 @@ Disable limit on kafka_num_consumers that depends on the number of available CPU
Default value: false.
## postgresql_connection_pool_size {#postgresql-connection-pool-size}
Connection pool size for PostgreSQL table engine and database engine.
Default value: 16
## postgresql_connection_pool_wait_timeout {#postgresql-connection-pool-wait-timeout}
Connection pool push/pop timeout on empty pool for PostgreSQL table engine and database engine. By default it will block on an empty pool.
Default value: 5000
## postgresql_connection_pool_auto_close_connection {#postgresql-connection-pool-auto-close-connection}
Close connection before returning connection to the pool.
Default value: true.
## odbc_bridge_connection_pool_size {#odbc-bridge-connection-pool-size}
Connection pool size for each connection settings string in ODBC bridge.
Default value: 16
## odbc_bridge_use_connection_pooling {#odbc-bridge-use-connection-pooling}
Use connection pooling in ODBC bridge. If set to false, a new connection is created every time.
Default value: true
## use_uncompressed_cache {#setting-use_uncompressed_cache}
Whether to use a cache of uncompressed blocks. Accepts 0 or 1. By default, 0 (disabled).
@ -3563,7 +3593,7 @@ SETTINGS index_granularity = 8192 │
## external_table_functions_use_nulls {#external-table-functions-use-nulls}
Defines how [mysql](../../sql-reference/table-functions/mysql.md), [postgresql](../../sql-reference/table-functions/postgresql.md) and [odbc](../../sql-reference/table-functions/odbc.md)] table functions use Nullable columns.
Defines how [mysql](../../sql-reference/table-functions/mysql.md), [postgresql](../../sql-reference/table-functions/postgresql.md) and [odbc](../../sql-reference/table-functions/odbc.md) table functions use Nullable columns.
Possible values:

View File

@ -0,0 +1,27 @@
---
slug: /en/operations/system-tables/build_options
---
# build_options
Contains information about the ClickHouse server's build options.
Columns:
- `name` (String) — Name of the build option, e.g. `USE_ODBC`
- `value` (String) — Value of the build option, e.g. `1`
**Example**
``` sql
SELECT * FROM system.build_options LIMIT 5
```
``` text
┌─name─────────────┬─value─┐
│ USE_BROTLI │ 1 │
│ USE_BZIP2 │ 1 │
│ USE_CAPNP │ 1 │
│ USE_CASSANDRA │ 1 │
│ USE_DATASKETCHES │ 1 │
└──────────────────┴───────┘
```

View File

@ -29,7 +29,7 @@ select first_value(b) from test_data
### example2
The NULL value is ignored.
```sql
select first_value(b) ignore nulls sfrom test_data
select first_value(b) ignore nulls from test_data
```
```text

View File

@ -2234,7 +2234,7 @@ Result:
## Regular Expression Tree Dictionary {#regexp-tree-dictionary}
Regular expression tree dictionaries are a special type of dictionary which represent the mapping from key to attributes using a tree of regular expressions. There are some use cases, e.g. parsing of (user agent)[https://en.wikipedia.org/wiki/User_agent] strings, which can be expressed elegantly with regexp tree dictionaries.
Regular expression tree dictionaries are a special type of dictionary which represent the mapping from key to attributes using a tree of regular expressions. There are some use cases, e.g. parsing of [user agent](https://en.wikipedia.org/wiki/User_agent) strings, which can be expressed elegantly with regexp tree dictionaries.
### Use Regular Expression Tree Dictionary in ClickHouse Open-Source
@ -2280,7 +2280,7 @@ This config consists of a list of regular expression tree nodes. Each node has t
- The value of an attribute may contain **back references**, referring to capture groups of the matched regular expression. In the example, the value of attribute `version` in the first node consists of a back-reference `\1` to capture group `(\d+[\.\d]*)` in the regular expression. Back-reference numbers range from 1 to 9 and are written as `$1` or `\1` (for number 1). The back reference is replaced by the matched capture group during query execution.
- **child nodes**: a list of children of a regexp tree node, each of which has its own attributes and (potentially) child nodes. String matching proceeds in a depth-first fashion. If a string matches a regexp node, the dictionary checks if it also matches the nodes' child nodes. If that is the case, the attributes of the deepest matching node are assigned. Attributes of a child node overwrite equally named attributes of parent nodes. The name of child nodes in YAML files can be arbitrary, e.g. `versions` in the above example.
Regexp tree dictionaries only allow access using functions `dictGet`, `dictGetOrDefault` and `dictGetOrNull`.
Regexp tree dictionaries only allow access using the functions `dictGet` and `dictGetOrDefault`.
Example:

View File

@ -323,11 +323,11 @@ Alias: `REPEAT`
**Arguments**
- `s` — The string to repeat. [String](../../sql-reference/data-types/string.md).
- `n` — The number of times to repeat the string. [UInt or Int](../../sql-reference/data-types/int-uint.md).
- `n` — The number of times to repeat the string. [UInt* or Int*](../../sql-reference/data-types/int-uint.md).
**Returned value**
The single string containing string `s` repeated `n` times. If `n` \< 1, the function returns empty string.
A string containing string `s` repeated `n` times. If `n` <= 0, the function returns the empty string.
Type: `String`.
@ -345,6 +345,44 @@ Result:
└────────────────────────────────┘
```
## space
Concatenates the space character (` `) with itself as many times as specified.
**Syntax**
``` sql
space(n)
```
Alias: `SPACE`.
**Arguments**
- `n` — The number of times to repeat the space. [UInt* or Int*](../../sql-reference/data-types/int-uint.md).
**Returned value**
A string containing the space character ` ` repeated `n` times. If `n` <= 0, the function returns the empty string.
Type: `String`.
**Example**
Query:
``` sql
SELECT space(3);
```
Result:
``` text
┌─space(3)─────┐
│ │
└──────────────┘
```
## reverse
Reverses the sequence of bytes in a string.

View File

@ -544,10 +544,10 @@ Result:
└─────┴──────────┴───────┘
```
##Filling grouped by sorting prefix
## Filling grouped by sorting prefix
It can be useful to fill rows which have the same values in particular columns independently; a good example is filling missing values in time series.
Assume there is the following time series table
Assume there is the following time series table:
``` sql
CREATE TABLE timeseries
(
@ -567,7 +567,7 @@ SELECT * FROM timeseries;
└───────────┴─────────────────────────┴───────┘
```
And we'd like to fill missing values for each sensor independently with a 1 second interval.
The way to achieve it is to use `sensor_id` column as sorting prefix for filling column `timestamp`
The way to achieve it is to use `sensor_id` column as sorting prefix for filling column `timestamp`:
```
SELECT *
FROM timeseries
@ -589,7 +589,7 @@ INTERPOLATE ( value AS 9999 )
│ 432 │ 2021-12-01 00:00:05.000 │ 5 │
└───────────┴─────────────────────────┴───────┘
```
Here, the `value` column was interpolated with `9999` just to make filled rows more noticeable
Here, the `value` column was interpolated with `9999` just to make filled rows more noticeable.
This behavior is controlled by the setting `use_with_fill_by_sorting_prefix` (enabled by default).
## Related content

View File

@ -34,7 +34,7 @@ For the `SAMPLE` clause the following syntax is supported:
| `SAMPLE k OFFSET m` | Here `k` and `m` are the numbers from 0 to 1. The query is executed on a sample of `k` fraction of the data. The data used for the sample is offset by `m` fraction. [Read more](#select-sample-offset) |
## SAMPLE K
## SAMPLE K {#select-sample-k}
Here `k` is the number from 0 to 1 (both fractional and decimal notations are supported). For example, `SAMPLE 1/2` or `SAMPLE 0.5`.
@ -54,7 +54,7 @@ ORDER BY PageViews DESC LIMIT 1000
In this example, the query is executed on a sample from 0.1 (10%) of data. Values of aggregate functions are not corrected automatically, so to get an approximate result, the value `count()` is manually multiplied by 10.
## SAMPLE N
## SAMPLE N {#select-sample-n}
Here `n` is a sufficiently large integer. For example, `SAMPLE 10000000`.
@ -90,7 +90,7 @@ FROM visits
SAMPLE 10000000
```
## SAMPLE K OFFSET M
## SAMPLE K OFFSET M {#select-sample-offset}
Here `k` and `m` are numbers from 0 to 1. Examples are shown below.

View File

@ -1137,6 +1137,16 @@
<ttl>event_date + INTERVAL 30 DAY DELETE</ttl>
-->
<!--
ORDER BY expr: https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/mergetree#order_by
Example:
event_date, event_time
event_date, type, query_id
event_date, event_time, initial_query_id
<order_by>event_date, event_time, initial_query_id</order_by>
-->
<!-- Instead of partition_by, you can provide full engine expression (starting with ENGINE = ) with parameters,
Example: <engine>ENGINE = MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024</engine>
-->

View File

@ -152,6 +152,13 @@ public:
nested_func->merge(place, rhs, arena);
}
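/// Forwarding both isAbleToParallelizeMerge() and the ThreadPool overload of
/// merge() preserves the nested aggregate function's parallel merge support
/// in this combinator.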
bool isAbleToParallelizeMerge() const override { return nested_func->isAbleToParallelizeMerge(); }
void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, ThreadPool & thread_pool, Arena * arena) const override
{
nested_func->merge(place, rhs, thread_pool, arena);
}
void mergeBatch(
size_t row_begin,
size_t row_end,

View File

@ -59,16 +59,31 @@ UInt64 BackupEntryFromImmutableFile::getSize() const
UInt128 BackupEntryFromImmutableFile::getChecksum() const
{
{
std::lock_guard lock{size_and_checksum_mutex};
if (checksum_adjusted)
return *checksum;
if (checksum)
{
if (copy_encrypted)
checksum = combineChecksums(*checksum, disk->getEncryptedFileIV(file_path));
checksum_adjusted = true;
return *checksum;
}
}
auto calculated_checksum = BackupEntryWithChecksumCalculation<IBackupEntry>::getChecksum();
{
std::lock_guard lock{size_and_checksum_mutex};
if (!checksum_adjusted)
{
if (!checksum)
checksum = BackupEntryWithChecksumCalculation<IBackupEntry>::getChecksum();
else if (copy_encrypted)
checksum = combineChecksums(*checksum, disk->getEncryptedFileIV(file_path));
checksum = calculated_checksum;
checksum_adjusted = true;
}
return *checksum;
}
}
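The restructuring above follows a compute-outside-the-lock caching pattern: `getChecksum()` now takes the mutex only to inspect or publish the cached value, and performs the expensive hashing in between, unlocked, so concurrent callers are not serialized behind I/O. A minimal standalone sketch of the same pattern, with purely illustrative names (not the ClickHouse API):

```cpp
#include <cstdint>
#include <mutex>
#include <optional>

/// Illustrative cache: hold the mutex only to read or publish the cached
/// value; run the expensive computation without holding the lock.
class CachedChecksum
{
public:
    uint64_t get()
    {
        {
            std::lock_guard lock{mutex};
            if (cached)
                return *cached;   // Fast path: already computed and published.
        }

        /// Slow path: compute without the lock, so concurrent callers are not
        /// blocked behind a potentially I/O-bound hash.
        uint64_t computed = computeExpensiveChecksum();

        {
            std::lock_guard lock{mutex};
            if (!cached)
                cached = computed;   // Re-check: another thread may have won the race.
            return *cached;          // Either our result or the earlier winner's.
        }
    }

private:
    static uint64_t computeExpensiveChecksum() { return 42; /* stand-in for hashing file contents */ }

    std::mutex mutex;
    std::optional<uint64_t> cached;
};
```

The price is that two racing threads may both compute the value; the loser's result is simply discarded under the lock, which is the same tradeoff the diff accepts.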
std::optional<UInt128> BackupEntryFromImmutableFile::getPartialChecksum(size_t prefix_length) const

View File

@ -44,7 +44,7 @@ private:
const DataSourceDescription data_source_description;
const bool copy_encrypted;
mutable std::optional<UInt64> file_size;
mutable std::optional<UInt64> checksum;
mutable std::optional<UInt128> checksum;
mutable bool file_size_adjusted = false;
mutable bool checksum_adjusted = false;
mutable std::mutex size_and_checksum_mutex;

View File

@ -8,15 +8,32 @@ namespace DB
template <typename Base>
UInt128 BackupEntryWithChecksumCalculation<Base>::getChecksum() const
{
{
std::lock_guard lock{checksum_calculation_mutex};
if (calculated_checksum)
return *calculated_checksum;
}
size_t size = this->getSize();
{
std::lock_guard lock{checksum_calculation_mutex};
if (!calculated_checksum)
{
auto read_buffer = this->getReadBuffer(ReadSettings{}.adjustBufferSize(this->getSize()));
if (size == 0)
{
calculated_checksum = 0;
}
else
{
auto read_buffer = this->getReadBuffer(ReadSettings{}.adjustBufferSize(size));
HashingReadBuffer hashing_read_buffer(*read_buffer);
hashing_read_buffer.ignoreAll();
calculated_checksum = hashing_read_buffer.getHash();
}
}
return *calculated_checksum;
}
}
template <typename Base>

View File

@ -0,0 +1,350 @@
#include <gtest/gtest.h>
#include <Backups/BackupEntryFromAppendOnlyFile.h>
#include <Backups/BackupEntryFromImmutableFile.h>
#include <Backups/BackupEntryFromSmallFile.h>
#include <Disks/IDisk.h>
#include <Disks/DiskLocal.h>
#include <Disks/DiskEncrypted.h>
#include <IO/FileEncryptionCommon.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Poco/TemporaryFile.h>
using namespace DB;
class BackupEntriesTest : public ::testing::Test
{
protected:
void SetUp() override
{
/// Make local disk.
temp_dir = std::make_unique<Poco::TemporaryFile>();
temp_dir->createDirectories();
local_disk = std::make_shared<DiskLocal>("local_disk", temp_dir->path() + "/", 0);
/// Make encrypted disk.
auto settings = std::make_unique<DiskEncryptedSettings>();
settings->wrapped_disk = local_disk;
settings->current_algorithm = FileEncryption::Algorithm::AES_128_CTR;
settings->keys[0] = "1234567890123456";
settings->current_key_id = 0;
settings->disk_path = "encrypted/";
encrypted_disk = std::make_shared<DiskEncrypted>("encrypted_disk", std::move(settings), true);
}
void TearDown() override
{
encrypted_disk.reset();
local_disk.reset();
}
static void writeFile(DiskPtr disk, const String & filepath)
{
auto buf = disk->writeFile(filepath, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, {});
writeString(std::string_view{"Some text"}, *buf);
buf->finalize();
}
static void writeEmptyFile(DiskPtr disk, const String & filepath)
{
auto buf = disk->writeFile(filepath, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Rewrite, {});
buf->finalize();
}
static void appendFile(DiskPtr disk, const String & filepath)
{
auto buf = disk->writeFile(filepath, DBMS_DEFAULT_BUFFER_SIZE, WriteMode::Append, {});
writeString(std::string_view{"Appended"}, *buf);
buf->finalize();
}
static String getChecksum(const BackupEntryPtr & backup_entry)
{
return getHexUIntUppercase(backup_entry->getChecksum());
}
static const constexpr std::string_view NO_CHECKSUM = "no checksum";
static String getPartialChecksum(const BackupEntryPtr & backup_entry, size_t prefix_length)
{
auto partial_checksum = backup_entry->getPartialChecksum(prefix_length);
if (!partial_checksum)
return String{NO_CHECKSUM};
return getHexUIntUppercase(*partial_checksum);
}
static String readAll(const BackupEntryPtr & backup_entry)
{
auto in = backup_entry->getReadBuffer({});
String str;
readStringUntilEOF(str, *in);
return str;
}
std::unique_ptr<Poco::TemporaryFile> temp_dir;
std::shared_ptr<DiskLocal> local_disk;
std::shared_ptr<DiskEncrypted> encrypted_disk;
};
static const constexpr std::string_view ZERO_CHECKSUM = "00000000000000000000000000000000";
static const constexpr std::string_view SOME_TEXT_CHECKSUM = "28B5529750AC210952FFD366774363ED";
static const constexpr std::string_view S_CHECKSUM = "C27395C39AFB5557BFE47661CC9EB86C";
static const constexpr std::string_view SOME_TEX_CHECKSUM = "D00D9BE8D87919A165F14EDD31088A0E";
static const constexpr std::string_view SOME_TEXT_APPENDED_CHECKSUM = "5A1F10F638DC7A226231F3FD927D1726";
static const constexpr std::string_view PRECALCULATED_CHECKSUM = "1122334455667788AABBCCDDAABBCCDD";
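/// The UInt128 below is the binary form of PRECALCULATED_CHECKSUM above:
/// high 64 bits 0x1122334455667788, low 64 bits 0xAABBCCDDAABBCCDD.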
static const constexpr UInt128 PRECALCULATED_CHECKSUM_UINT128 = (UInt128(0x1122334455667788) << 64) | 0xAABBCCDDAABBCCDD;
static const size_t PRECALCULATED_SIZE = 123;
TEST_F(BackupEntriesTest, BackupEntryFromImmutableFile)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromImmutableFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
writeEmptyFile(local_disk, "empty.txt");
auto empty_entry = std::make_shared<BackupEntryFromImmutableFile>(local_disk, "empty.txt");
EXPECT_EQ(empty_entry->getSize(), 0);
EXPECT_EQ(getChecksum(empty_entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1000), ZERO_CHECKSUM);
EXPECT_EQ(readAll(empty_entry), "");
auto precalculated_entry = std::make_shared<BackupEntryFromImmutableFile>(local_disk, "a.txt", false, PRECALCULATED_SIZE, PRECALCULATED_CHECKSUM_UINT128);
EXPECT_EQ(precalculated_entry->getSize(), PRECALCULATED_SIZE);
EXPECT_EQ(getChecksum(precalculated_entry), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE - 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1000), PRECALCULATED_CHECKSUM);
EXPECT_EQ(readAll(precalculated_entry), "Some text");
}
TEST_F(BackupEntriesTest, BackupEntryFromAppendOnlyFile)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
appendFile(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
auto appended_entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(appended_entry->getSize(), 17);
EXPECT_EQ(getChecksum(appended_entry), SOME_TEXT_APPENDED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 22), SOME_TEXT_APPENDED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(appended_entry, 1000), SOME_TEXT_APPENDED_CHECKSUM);
EXPECT_EQ(readAll(appended_entry), "Some textAppended");
writeEmptyFile(local_disk, "empty_appended.txt");
auto empty_entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "empty_appended.txt");
EXPECT_EQ(empty_entry->getSize(), 0);
EXPECT_EQ(getChecksum(empty_entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1000), ZERO_CHECKSUM);
EXPECT_EQ(readAll(empty_entry), "");
appendFile(local_disk, "empty_appended.txt");
EXPECT_EQ(empty_entry->getSize(), 0);
EXPECT_EQ(getChecksum(empty_entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(empty_entry, 1000), ZERO_CHECKSUM);
EXPECT_EQ(readAll(empty_entry), "");
}
TEST_F(BackupEntriesTest, PartialChecksumBeforeFullChecksum)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
entry = std::make_shared<BackupEntryFromAppendOnlyFile>(local_disk, "a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
}
TEST_F(BackupEntriesTest, BackupEntryFromSmallFile)
{
writeFile(local_disk, "a.txt");
auto entry = std::make_shared<BackupEntryFromSmallFile>(local_disk, "a.txt");
local_disk->removeFile("a.txt");
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), S_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), SOME_TEX_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
}
TEST_F(BackupEntriesTest, DecryptedEntriesFromEncryptedDisk)
{
{
writeFile(encrypted_disk, "a.txt");
std::pair<BackupEntryPtr, bool /* partial_checksum_allowed */> test_cases[]
= {{std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt"), false},
{std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "a.txt"), true},
{std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "a.txt"), true}};
for (const auto & [entry, partial_checksum_allowed] : test_cases)
{
EXPECT_EQ(entry->getSize(), 9);
EXPECT_EQ(getChecksum(entry), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), partial_checksum_allowed ? S_CHECKSUM : NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 8), partial_checksum_allowed ? SOME_TEX_CHECKSUM : NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 9), SOME_TEXT_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1000), SOME_TEXT_CHECKSUM);
EXPECT_EQ(readAll(entry), "Some text");
}
}
{
writeEmptyFile(encrypted_disk, "empty.txt");
BackupEntryPtr entries[]
= {std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "empty.txt"),
std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "empty.txt"),
std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "empty.txt")};
for (const auto & entry : entries)
{
EXPECT_EQ(entry->getSize(), 0);
EXPECT_EQ(getChecksum(entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(readAll(entry), "");
}
}
{
auto precalculated_entry = std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt", false, PRECALCULATED_SIZE, PRECALCULATED_CHECKSUM_UINT128);
EXPECT_EQ(precalculated_entry->getSize(), PRECALCULATED_SIZE);
EXPECT_EQ(getChecksum(precalculated_entry), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE), PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1000), PRECALCULATED_CHECKSUM);
EXPECT_EQ(readAll(precalculated_entry), "Some text");
}
}
TEST_F(BackupEntriesTest, EncryptedEntriesFromEncryptedDisk)
{
{
writeFile(encrypted_disk, "a.txt");
BackupEntryPtr entries[]
= {std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true)};
auto encrypted_checksum = getChecksum(entries[0]);
EXPECT_NE(encrypted_checksum, NO_CHECKSUM);
EXPECT_NE(encrypted_checksum, ZERO_CHECKSUM);
EXPECT_NE(encrypted_checksum, SOME_TEXT_CHECKSUM);
auto partial_checksum = getPartialChecksum(entries[1], 9);
EXPECT_NE(partial_checksum, NO_CHECKSUM);
EXPECT_NE(partial_checksum, ZERO_CHECKSUM);
EXPECT_NE(partial_checksum, SOME_TEXT_CHECKSUM);
EXPECT_NE(partial_checksum, encrypted_checksum);
auto encrypted_data = readAll(entries[0]);
EXPECT_EQ(encrypted_data.size(), 9 + FileEncryption::Header::kSize);
for (const auto & entry : entries)
{
EXPECT_EQ(entry->getSize(), 9 + FileEncryption::Header::kSize);
EXPECT_EQ(getChecksum(entry), encrypted_checksum);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
auto encrypted_checksum_9 = getPartialChecksum(entry, 9);
EXPECT_TRUE(encrypted_checksum_9 == NO_CHECKSUM || encrypted_checksum_9 == partial_checksum);
EXPECT_EQ(getPartialChecksum(entry, 9 + FileEncryption::Header::kSize), encrypted_checksum);
EXPECT_EQ(getPartialChecksum(entry, 1000), encrypted_checksum);
EXPECT_EQ(readAll(entry), encrypted_data);
}
}
{
writeEmptyFile(encrypted_disk, "empty.txt");
BackupEntryPtr entries[]
= {std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "empty.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromAppendOnlyFile>(encrypted_disk, "empty.txt", /* copy_encrypted= */ true),
std::make_shared<BackupEntryFromSmallFile>(encrypted_disk, "empty.txt", /* copy_encrypted= */ true)};
for (const auto & entry : entries)
{
EXPECT_EQ(entry->getSize(), 0);
EXPECT_EQ(getChecksum(entry), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(entry, 1), ZERO_CHECKSUM);
EXPECT_EQ(readAll(entry), "");
}
}
{
auto precalculated_entry = std::make_shared<BackupEntryFromImmutableFile>(encrypted_disk, "a.txt", /* copy_encrypted= */ true, PRECALCULATED_SIZE, PRECALCULATED_CHECKSUM_UINT128);
EXPECT_EQ(precalculated_entry->getSize(), PRECALCULATED_SIZE + FileEncryption::Header::kSize);
auto encrypted_checksum = getChecksum(precalculated_entry);
EXPECT_NE(encrypted_checksum, NO_CHECKSUM);
EXPECT_NE(encrypted_checksum, ZERO_CHECKSUM);
EXPECT_NE(encrypted_checksum, SOME_TEXT_CHECKSUM);
EXPECT_NE(encrypted_checksum, PRECALCULATED_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 0), ZERO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE), NO_CHECKSUM);
EXPECT_EQ(getPartialChecksum(precalculated_entry, PRECALCULATED_SIZE + FileEncryption::Header::kSize), encrypted_checksum);
EXPECT_EQ(getPartialChecksum(precalculated_entry, 1000), encrypted_checksum);
auto encrypted_data = readAll(precalculated_entry);
EXPECT_EQ(encrypted_data.size(), 9 + FileEncryption::Header::kSize);
}
}

View File

@ -121,7 +121,7 @@ ConnectionEstablisherAsync::ConnectionEstablisherAsync(
epoll.add(timeout_descriptor.getDescriptor());
}
void ConnectionEstablisherAsync::Task::run(AsyncCallback async_callback, ResumeCallback)
void ConnectionEstablisherAsync::Task::run(AsyncCallback async_callback, SuspendCallback)
{
connection_establisher_async.reset();
connection_establisher_async.connection_establisher.setAsyncCallback(async_callback);

View File

@ -91,7 +91,7 @@ private:
ConnectionEstablisherAsync & connection_establisher_async;
void run(AsyncCallback async_callback, ResumeCallback suspend_callback) override;
void run(AsyncCallback async_callback, SuspendCallback suspend_callback) override;
};
void cancelAfter() override;

View File

@ -57,7 +57,7 @@ bool PacketReceiver::checkTimeout()
return true;
}
void PacketReceiver::Task::run(AsyncCallback async_callback, ResumeCallback suspend_callback)
void PacketReceiver::Task::run(AsyncCallback async_callback, SuspendCallback suspend_callback)
{
while (true)
{

View File

@ -57,7 +57,7 @@ private:
PacketReceiver & receiver;
void run(AsyncCallback async_callback, ResumeCallback suspend_callback) override;
void run(AsyncCallback async_callback, SuspendCallback suspend_callback) override;
};
/// When epoll file descriptor is ready, check if it's an expired timeout.

View File

@ -6,6 +6,7 @@
#include <Common/noexcept_scope.h>
#include <Common/setThreadName.h>
#include <Common/logger_useful.h>
#include <Common/ThreadPool.h>
namespace DB
{
@ -41,9 +42,14 @@ std::exception_ptr LoadJob::exception() const
return load_exception;
}
ssize_t LoadJob::priority() const
size_t LoadJob::executionPool() const
{
return load_priority;
return execution_pool_id;
}
size_t LoadJob::pool() const
{
return pool_id;
}
void LoadJob::wait() const
@ -112,8 +118,9 @@ void LoadJob::enqueued()
enqueue_time = std::chrono::system_clock::now();
}
void LoadJob::execute(const LoadJobPtr & self)
void LoadJob::execute(size_t pool, const LoadJobPtr & self)
{
execution_pool_id = pool;
start_time = std::chrono::system_clock::now();
func(self);
}
@ -148,22 +155,35 @@ void LoadTask::remove()
{
loader.remove(jobs);
jobs.clear();
goal_jobs.clear();
}
}
void LoadTask::detach()
{
jobs.clear();
goal_jobs.clear();
}
AsyncLoader::AsyncLoader(Metric metric_threads, Metric metric_active_threads, size_t max_threads_, bool log_failures_, bool log_progress_)
AsyncLoader::AsyncLoader(std::vector<PoolInitializer> pool_initializers, bool log_failures_, bool log_progress_)
: log_failures(log_failures_)
, log_progress(log_progress_)
, log(&Poco::Logger::get("AsyncLoader"))
, max_threads(max_threads_)
, pool(metric_threads, metric_active_threads, max_threads)
{
pools.reserve(pool_initializers.size());
for (auto && init : pool_initializers)
pools.push_back({
.name = init.name,
.priority = init.priority,
.thread_pool = std::make_unique<ThreadPool>(
init.metric_threads,
init.metric_active_threads,
init.max_threads,
/* max_free_threads = */ 0,
init.max_threads),
.max_threads = init.max_threads
});
}
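Based on the initializer fields visible in this constructor (`name`, `priority`, `metric_threads`, `metric_active_threads`, `max_threads`), building a loader with several pools presumably looks like the sketch below. The metric identifiers, priority values, and field order are assumptions for illustration, not taken from the ClickHouse source:

```cpp
// Hypothetical setup: a high-priority foreground pool and a low-priority
// background pool. Field names follow the diff above; everything else
// (metric constants, Priority semantics, field order) is assumed.
AsyncLoader loader(
    {
        {
            .name = "ForegroundLoad",
            .priority = Priority{0},                                  // assumed: lower value = runs first
            .metric_threads = CurrentMetrics::ForegroundLoadThreads,  // hypothetical metric
            .metric_active_threads = CurrentMetrics::ForegroundLoadThreadsActive,
            .max_threads = 16,
        },
        {
            .name = "BackgroundLoad",
            .priority = Priority{1},
            .metric_threads = CurrentMetrics::BackgroundLoadThreads,
            .metric_active_threads = CurrentMetrics::BackgroundLoadThreadsActive,
            .max_threads = 4,
        },
    },
    /* log_failures = */ true,
    /* log_progress = */ false);
```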
AsyncLoader::~AsyncLoader()
@ -175,13 +195,20 @@ void AsyncLoader::start()
{
std::unique_lock lock{mutex};
is_running = true;
for (size_t i = 0; workers < max_threads && i < ready_queue.size(); i++)
spawn(lock);
updateCurrentPriorityAndSpawn(lock);
}
void AsyncLoader::wait()
{
pool.wait();
// Because a job can create new jobs in other pools, we have to recheck in a loop
std::unique_lock lock{mutex};
while (!scheduled_jobs.empty())
{
lock.unlock();
for (auto & p : pools)
p.thread_pool->wait();
lock.lock();
}
}
void AsyncLoader::stop()
@ -191,7 +218,7 @@ void AsyncLoader::stop()
is_running = false;
// NOTE: there is no need to notify because workers never wait
}
pool.wait();
wait();
}
void AsyncLoader::schedule(LoadTask & task)
@ -229,9 +256,9 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
old_jobs = finished_jobs.size();
}
// Make set of jobs to schedule:
// Pass 1. Make set of jobs to schedule:
// 1) exclude already scheduled or finished jobs
// 2) include pending dependencies, that are not yet scheduled
// 2) include assigned job dependencies (that are not yet scheduled)
LoadJobSet jobs;
for (const auto & job : input_jobs)
gatherNotScheduled(job, jobs, lock);
@ -242,17 +269,18 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
// We do not want any exception to be thrown after this point, because the following code is not exception-safe
DENY_ALLOCATIONS_IN_SCOPE;
// Schedule all incoming jobs
// Pass 2. Schedule all incoming jobs
for (const auto & job : jobs)
{
chassert(job->pool() < pools.size());
NOEXCEPT_SCOPE({
ALLOW_ALLOCATIONS_IN_SCOPE;
scheduled_jobs.emplace(job, Info{.initial_priority = job->load_priority, .priority = job->load_priority});
scheduled_jobs.try_emplace(job);
job->scheduled();
});
}
// Process dependencies on scheduled pending jobs
// Pass 3. Process dependencies on scheduled jobs, priority inheritance
for (const auto & job : jobs)
{
Info & info = scheduled_jobs.find(job)->second;
@ -267,17 +295,18 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
});
info.dependencies_left++;
// Priority inheritance: prioritize deps to have at least given `priority` to avoid priority inversion
prioritize(dep, info.priority, lock);
// Priority inheritance: prioritize deps to have at least given `pool.priority` to avoid priority inversion
prioritize(dep, job->pool_id, lock);
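// (Without this hoisting, a job in a higher-priority pool could stall behind a
// dependency scheduled in a lower-priority pool - the classic priority inversion.)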
}
}
// Enqueue non-blocked jobs (w/o dependencies) to ready queue
if (!info.is_blocked())
if (!info.isBlocked())
enqueue(info, job, lock);
}
// Process dependencies on other jobs. It is done in a separate pass to facilitate propagation of cancel signals (if any).
// Pass 4: Process dependencies on other jobs.
// It is done in a separate pass to facilitate cancelling due to already failed dependencies.
for (const auto & job : jobs)
{
if (auto info = scheduled_jobs.find(job); info != scheduled_jobs.end())
@ -285,12 +314,12 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
for (const auto & dep : job->dependencies)
{
if (scheduled_jobs.contains(dep))
continue; // Skip dependencies on scheduled pending jobs (already processed)
continue; // Skip dependencies on scheduled jobs (already processed in pass 3)
LoadStatus dep_status = dep->status();
if (dep_status == LoadStatus::OK)
continue; // Dependency on already successfully finished job -- it's okay.
// Dependency on not scheduled pending job -- it's bad.
// Dependency on assigned job -- it's bad.
// Probably, there is an error in `jobs` set, `gatherNotScheduled()` should have fixed it.
chassert(dep_status != LoadStatus::PENDING);
@ -305,7 +334,7 @@ void AsyncLoader::scheduleImpl(const LoadJobSet & input_jobs)
job->name,
getExceptionMessage(dep->exception(), /* with_stacktrace = */ false)));
});
finish(lock, job, LoadStatus::CANCELED, e);
finish(job, LoadStatus::CANCELED, e, lock);
break; // This job is now finished, stop its dependencies processing
}
}
@ -327,13 +356,14 @@ void AsyncLoader::gatherNotScheduled(const LoadJobPtr & job, LoadJobSet & jobs,
}
}
void AsyncLoader::prioritize(const LoadJobPtr & job, ssize_t new_priority)
void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool)
{
if (!job)
return;
chassert(new_pool < pools.size());
DENY_ALLOCATIONS_IN_SCOPE;
std::unique_lock lock{mutex};
prioritize(job, new_priority, lock);
prioritize(job, new_pool, lock);
}
void AsyncLoader::remove(const LoadJobSet & jobs)
@ -347,14 +377,14 @@ void AsyncLoader::remove(const LoadJobSet & jobs)
{
if (auto info = scheduled_jobs.find(job); info != scheduled_jobs.end())
{
if (info->second.is_executing())
if (info->second.isExecuting())
continue; // Skip executing jobs on the first pass
std::exception_ptr e;
NOEXCEPT_SCOPE({
ALLOW_ALLOCATIONS_IN_SCOPE;
e = std::make_exception_ptr(Exception(ErrorCodes::ASYNC_LOAD_CANCELED, "Load job '{}' canceled", job->name));
});
finish(lock, job, LoadStatus::CANCELED, e);
finish(job, LoadStatus::CANCELED, e, lock);
}
}
// On the second pass wait for executing jobs to finish
@ -363,7 +393,7 @@ void AsyncLoader::remove(const LoadJobSet & jobs)
if (auto info = scheduled_jobs.find(job); info != scheduled_jobs.end())
{
// Job is currently executing
chassert(info->second.is_executing());
chassert(info->second.isExecuting());
lock.unlock();
job->waitNoThrow(); // Wait for job to finish
lock.lock();
@ -379,25 +409,36 @@ void AsyncLoader::remove(const LoadJobSet & jobs)
}
}
void AsyncLoader::setMaxThreads(size_t value)
void AsyncLoader::setMaxThreads(size_t pool, size_t value)
{
std::unique_lock lock{mutex};
pool.setMaxThreads(value);
pool.setMaxFreeThreads(value);
pool.setQueueSize(value);
max_threads = value;
auto & p = pools[pool];
p.thread_pool->setMaxThreads(value);
p.thread_pool->setQueueSize(value); // Keep the queue size equal to the max threads count to avoid blocking during spawning
p.max_threads = value;
if (!is_running)
return;
for (size_t i = 0; workers < max_threads && i < ready_queue.size(); i++)
spawn(lock);
for (size_t i = 0; canSpawnWorker(p, lock) && i < p.ready_queue.size(); i++)
spawn(p, lock);
}
size_t AsyncLoader::getMaxThreads() const
size_t AsyncLoader::getMaxThreads(size_t pool) const
{
std::unique_lock lock{mutex};
return max_threads;
return pools[pool].max_threads;
}
const String & AsyncLoader::getPoolName(size_t pool) const
{
return pools[pool].name; // NOTE: lock is not needed because `name` is const and `pools` are immutable
}
Priority AsyncLoader::getPoolPriority(size_t pool) const
{
return pools[pool].priority; // NOTE: lock is not needed because `priority` is const and `pools` are immutable
}
size_t AsyncLoader::getScheduledJobCount() const
{
std::unique_lock lock{mutex};
@ -412,11 +453,10 @@ std::vector<AsyncLoader::JobState> AsyncLoader::getJobStates() const
states.emplace(job->name, JobState{
.job = job,
.dependencies_left = info.dependencies_left,
.is_executing = info.is_executing(),
.is_blocked = info.is_blocked(),
.is_ready = info.is_ready(),
.initial_priority = info.initial_priority,
.ready_seqno = last_ready_seqno
.ready_seqno = info.ready_seqno,
.is_blocked = info.isBlocked(),
.is_ready = info.isReady(),
.is_executing = info.isExecuting()
});
for (const auto & job : finished_jobs)
states.emplace(job->name, JobState{.job = job});
@ -462,21 +502,21 @@ String AsyncLoader::checkCycleImpl(const LoadJobPtr & job, LoadJobSet & left, Lo
return {};
}
void AsyncLoader::finish(std::unique_lock<std::mutex> & lock, const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job)
void AsyncLoader::finish(const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job, std::unique_lock<std::mutex> & lock)
{
chassert(scheduled_jobs.contains(job)); // Job was pending
if (status == LoadStatus::OK)
{
// Notify waiters
job->ok();
// Update dependent jobs and enqueue if ready
chassert(scheduled_jobs.contains(job)); // Job was pending
for (const auto & dep : scheduled_jobs[job].dependent_jobs)
{
chassert(scheduled_jobs.contains(dep)); // All depended jobs must be pending
Info & dep_info = scheduled_jobs[dep];
dep_info.dependencies_left--;
if (!dep_info.is_blocked())
if (!dep_info.isBlocked())
enqueue(dep_info, dep, lock);
}
}
@ -488,11 +528,10 @@ void AsyncLoader::finish(std::unique_lock<std::mutex> & lock, const LoadJobPtr &
else if (status == LoadStatus::CANCELED)
job->canceled(exception_from_job);
chassert(scheduled_jobs.contains(job)); // Job was pending
Info & info = scheduled_jobs[job];
if (info.is_ready())
if (info.isReady())
{
ready_queue.erase(info.key());
pools[job->pool_id].ready_queue.erase(info.ready_seqno);
info.ready_seqno = 0;
}
@ -512,7 +551,7 @@ void AsyncLoader::finish(std::unique_lock<std::mutex> & lock, const LoadJobPtr &
dep->name,
getExceptionMessage(exception_from_job, /* with_stacktrace = */ false)));
});
finish(lock, dep, LoadStatus::CANCELED, e);
finish(dep, LoadStatus::CANCELED, e, lock);
}
// Clean dependency graph edges pointing to canceled jobs
@ -531,87 +570,130 @@ void AsyncLoader::finish(std::unique_lock<std::mutex> & lock, const LoadJobPtr &
});
}
void AsyncLoader::prioritize(const LoadJobPtr & job, ssize_t new_priority, std::unique_lock<std::mutex> & lock)
void AsyncLoader::prioritize(const LoadJobPtr & job, size_t new_pool_id, std::unique_lock<std::mutex> & lock)
{
if (auto info = scheduled_jobs.find(job); info != scheduled_jobs.end())
{
if (info->second.priority >= new_priority)
return; // Never lower priority
Pool & old_pool = pools[job->pool_id];
Pool & new_pool = pools[new_pool_id];
if (old_pool.priority <= new_pool.priority)
return; // Never lower priority or change pool leaving the same priority
// Update priority and push job forward through ready queue if needed
if (info->second.ready_seqno)
ready_queue.erase(info->second.key());
info->second.priority = new_priority;
job->load_priority.store(new_priority); // Set user-facing priority (may affect executing jobs)
if (info->second.ready_seqno)
UInt64 ready_seqno = info->second.ready_seqno;
// Requeue job into the new pool queue without allocations
if (ready_seqno)
{
NOEXCEPT_SCOPE({
ALLOW_ALLOCATIONS_IN_SCOPE;
ready_queue.emplace(info->second.key(), job);
});
new_pool.ready_queue.insert(old_pool.ready_queue.extract(ready_seqno));
if (canSpawnWorker(new_pool, lock))
spawn(new_pool, lock);
}
// Set user-facing pool (may affect executing jobs)
job->pool_id.store(new_pool_id);
// Recurse into dependencies
for (const auto & dep : job->dependencies)
prioritize(dep, new_priority, lock);
prioritize(dep, new_pool_id, lock);
}
}
void AsyncLoader::enqueue(Info & info, const LoadJobPtr & job, std::unique_lock<std::mutex> & lock)
{
chassert(!info.is_blocked());
chassert(!info.isBlocked());
chassert(info.ready_seqno == 0);
info.ready_seqno = ++last_ready_seqno;
Pool & pool = pools[job->pool_id];
NOEXCEPT_SCOPE({
ALLOW_ALLOCATIONS_IN_SCOPE;
ready_queue.emplace(info.key(), job);
pool.ready_queue.emplace(info.ready_seqno, job);
});
job->enqueued();
if (is_running && workers < max_threads)
spawn(lock);
if (canSpawnWorker(pool, lock))
spawn(pool, lock);
}
void AsyncLoader::spawn(std::unique_lock<std::mutex> &)
bool AsyncLoader::canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> &)
{
workers++;
return is_running
&& !pool.ready_queue.empty()
&& pool.workers < pool.max_threads
&& (!current_priority || *current_priority >= pool.priority);
}
bool AsyncLoader::canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &)
{
return is_running
&& !pool.ready_queue.empty()
&& pool.workers <= pool.max_threads
&& (!current_priority || *current_priority >= pool.priority);
}
void AsyncLoader::updateCurrentPriorityAndSpawn(std::unique_lock<std::mutex> & lock)
{
// Find current priority.
// NOTE: We assume low number of pools, so O(N) scans are fine.
std::optional<Priority> priority;
for (Pool & pool : pools)
{
if (pool.isActive() && (!priority || *priority > pool.priority))
priority = pool.priority;
}
current_priority = priority;
// Spawn workers in all pools with current priority
for (Pool & pool : pools)
{
for (size_t i = 0; canSpawnWorker(pool, lock) && i < pool.ready_queue.size(); i++)
spawn(pool, lock);
}
}
void AsyncLoader::spawn(Pool & pool, std::unique_lock<std::mutex> &)
{
pool.workers++;
current_priority = pool.priority; // canSpawnWorker() ensures this would not decrease current_priority
NOEXCEPT_SCOPE({
ALLOW_ALLOCATIONS_IN_SCOPE;
pool.scheduleOrThrowOnError([this] { worker(); });
pool.thread_pool->scheduleOrThrowOnError([this, &pool] { worker(pool); });
});
}
void AsyncLoader::worker()
void AsyncLoader::worker(Pool & pool)
{
DENY_ALLOCATIONS_IN_SCOPE;
size_t pool_id = &pool - &*pools.begin();
LoadJobPtr job;
std::exception_ptr exception_from_job;
while (true)
{
// This is inside the loop to also reset previous thread names set inside the jobs
setThreadName("AsyncLoader");
setThreadName(pool.name.c_str());
{
std::unique_lock lock{mutex};
// Handle just executed job
if (exception_from_job)
finish(lock, job, LoadStatus::FAILED, exception_from_job);
finish(job, LoadStatus::FAILED, exception_from_job, lock);
else if (job)
finish(lock, job, LoadStatus::OK);
finish(job, LoadStatus::OK, {}, lock);
if (!is_running || ready_queue.empty() || workers > max_threads)
if (!canWorkerLive(pool, lock))
{
workers--;
if (--pool.workers == 0)
updateCurrentPriorityAndSpawn(lock); // It will spawn lower priority workers if needed
return;
}
// Take next job to be executed from the ready queue
auto it = ready_queue.begin();
auto it = pool.ready_queue.begin();
job = it->second;
ready_queue.erase(it);
pool.ready_queue.erase(it);
scheduled_jobs.find(job)->second.ready_seqno = 0; // This job is no longer in the ready queue
}
@ -619,7 +701,7 @@ void AsyncLoader::worker()
try
{
job->execute(job);
job->execute(pool_id, job);
exception_from_job = {};
}
catch (...)


@ -11,8 +11,9 @@
#include <boost/noncopyable.hpp>
#include <base/types.h>
#include <Common/CurrentMetrics.h>
#include <Common/Priority.h>
#include <Common/Stopwatch.h>
#include <Common/ThreadPool.h>
#include <Common/ThreadPool_fwd.h>
namespace Poco { class Logger; }
@ -46,22 +47,28 @@ class LoadJob : private boost::noncopyable
{
public:
template <class Func, class LoadJobSetType>
LoadJob(LoadJobSetType && dependencies_, String name_, Func && func_, ssize_t priority_ = 0)
LoadJob(LoadJobSetType && dependencies_, String name_, size_t pool_id_, Func && func_)
: dependencies(std::forward<LoadJobSetType>(dependencies_))
, name(std::move(name_))
, pool_id(pool_id_)
, func(std::forward<Func>(func_))
, load_priority(priority_)
{}
// Current job status.
LoadStatus status() const;
std::exception_ptr exception() const;
// Returns current value of a priority of the job. May differ from initial priority.
ssize_t priority() const;
// Returns the pool in which the job is executing (or was executed). May differ from the initial and the current pool.
// The value is only valid (and constant) after execution has started.
size_t executionPool() const;
// Returns the current pool of the job. May differ from the initial and the execution pool.
// This value is intended for creating new jobs during this job's execution.
// The value may be changed by `prioritize()` during job execution.
size_t pool() const;
// Sync wait for a pending job to be finished: OK, FAILED or CANCELED status.
// Throws if job is FAILED or CANCELED. Returns or throws immediately on non-pending job.
// Throws if job is FAILED or CANCELED. Returns or throws immediately if called on non-pending job.
void wait() const;
// Wait for a job to reach any non PENDING status.
@ -90,10 +97,11 @@ private:
void scheduled();
void enqueued();
void execute(const LoadJobPtr & self);
void execute(size_t pool, const LoadJobPtr & self);
std::atomic<size_t> execution_pool_id;
std::atomic<size_t> pool_id;
std::function<void(const LoadJobPtr & self)> func;
std::atomic<ssize_t> load_priority;
mutable std::mutex mutex;
mutable std::condition_variable finished;
@ -115,25 +123,25 @@ struct EmptyJobFunc
template <class Func = EmptyJobFunc>
LoadJobPtr makeLoadJob(LoadJobSet && dependencies, String name, Func && func = EmptyJobFunc())
{
return std::make_shared<LoadJob>(std::move(dependencies), std::move(name), std::forward<Func>(func));
return std::make_shared<LoadJob>(std::move(dependencies), std::move(name), 0, std::forward<Func>(func));
}
template <class Func = EmptyJobFunc>
LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, String name, Func && func = EmptyJobFunc())
{
return std::make_shared<LoadJob>(dependencies, std::move(name), std::forward<Func>(func));
return std::make_shared<LoadJob>(dependencies, std::move(name), 0, std::forward<Func>(func));
}
template <class Func = EmptyJobFunc>
LoadJobPtr makeLoadJob(LoadJobSet && dependencies, ssize_t priority, String name, Func && func = EmptyJobFunc())
LoadJobPtr makeLoadJob(LoadJobSet && dependencies, size_t pool_id, String name, Func && func = EmptyJobFunc())
{
return std::make_shared<LoadJob>(std::move(dependencies), std::move(name), std::forward<Func>(func), priority);
return std::make_shared<LoadJob>(std::move(dependencies), std::move(name), pool_id, std::forward<Func>(func));
}
template <class Func = EmptyJobFunc>
LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, ssize_t priority, String name, Func && func = EmptyJobFunc())
LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, size_t pool_id, String name, Func && func = EmptyJobFunc())
{
return std::make_shared<LoadJob>(dependencies, std::move(name), std::forward<Func>(func), priority);
return std::make_shared<LoadJob>(dependencies, std::move(name), pool_id, std::forward<Func>(func));
}
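// A hedged usage sketch of the overloads above (job names are illustrative): a job constructed without
// an explicit `pool_id` goes to pool 0; the `pool_id` overloads pin it to another pool.
//
//     auto quick = makeLoadJob({}, "quick", [] (const LoadJobPtr &) {}); // pool 0 by default
//     auto slow = makeLoadJob({ quick }, /* pool_id = */ 1, "slow", [] (const LoadJobPtr &) {});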
// Represents a logically connected set of LoadJobs required to achieve some goals (final LoadJob in the set).
@ -185,7 +193,7 @@ inline void scheduleLoad(const LoadTaskPtrs & tasks)
}
template <class... Args>
inline void scheduleLoad(Args && ... args)
inline void scheduleLoadAll(Args && ... args)
{
(scheduleLoad(std::forward<Args>(args)), ...);
}
@ -208,16 +216,16 @@ inline void waitLoad(const LoadTaskPtrs & tasks)
}
template <class... Args>
inline void waitLoad(Args && ... args)
inline void waitLoadAll(Args && ... args)
{
(waitLoad(std::forward<Args>(args)), ...);
}
template <class... Args>
inline void scheduleAndWaitLoad(Args && ... args)
inline void scheduleAndWaitLoadAll(Args && ... args)
{
scheduleLoad(std::forward<Args>(args)...);
waitLoad(std::forward<Args>(args)...);
scheduleLoadAll(std::forward<Args>(args)...);
waitLoadAll(std::forward<Args>(args)...);
}
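// For example (a sketch; the task names are hypothetical), the "All" helpers fold over their
// arguments, so a vector of tasks can be scheduled and awaited in one call:
//
//     LoadTaskPtrs tasks{table_task, dictionary_task};
//     scheduleAndWaitLoadAll(tasks); // scheduleLoadAll(tasks), then waitLoadAll(tasks)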
inline LoadJobSet getGoals(const LoadTaskPtrs & tasks)
@ -228,6 +236,14 @@ inline LoadJobSet getGoals(const LoadTaskPtrs & tasks)
return result;
}
inline LoadJobSet getGoalsOr(const LoadTaskPtrs & tasks, const LoadJobSet & alternative)
{
LoadJobSet result;
for (const auto & task : tasks)
result.insert(task->goals().begin(), task->goals().end());
return result.empty() ? alternative : result;
}
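// A hedged usage sketch (names are hypothetical): when a stage may produce no tasks at all,
// getGoalsOr() provides a fallback dependency set instead of an empty one:
//
//     LoadJobSet deps = getGoalsOr(startup_tasks, { init_job }); // depend on init_job if there are no startup tasks
//     auto job = makeLoadJob(std::move(deps), "after_startup", job_func);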
inline LoadJobSet joinJobs(const LoadJobSet & jobs1, const LoadJobSet & jobs2)
{
LoadJobSet result;
@ -251,100 +267,118 @@ inline LoadTaskPtrs joinTasks(const LoadTaskPtrs & tasks1, const LoadTaskPtrs &
return result;
}
// `AsyncLoader` is a scheduler for DAG of `LoadJob`s. It tracks dependencies and priorities of jobs.
// `AsyncLoader` is a scheduler for DAG of `LoadJob`s. It tracks job dependencies and priorities.
// Basic usage example:
// // Start async_loader with two thread pools (0=fg, 1=bg):
// AsyncLoader async_loader({
// {"FgPool", CurrentMetrics::AsyncLoaderThreads, CurrentMetrics::AsyncLoaderThreadsActive, .max_threads = 2, .priority{0}},
// {"BgPool", CurrentMetrics::AsyncLoaderThreads, CurrentMetrics::AsyncLoaderThreadsActive, .max_threads = 1, .priority{1}}
// });
//
// // Create and schedule a task consisting of three jobs. Job1 has no dependencies and is run first.
// // Job2 and job3 depend on job1 and are run only after job1 completion.
// auto job_func = [&] (const LoadJobPtr & self) {
// LOG_TRACE(log, "Executing load job '{}' with priority '{}'", self->name, self->priority());
// LOG_TRACE(log, "Executing load job '{}' in pool '{}'", self->name, async_loader->getPoolName(self->pool()));
// };
// auto job1 = makeLoadJob({}, "job1", job_func);
// auto job2 = makeLoadJob({ job1 }, "job2", job_func);
// auto job3 = makeLoadJob({ job1 }, "job3", job_func);
// auto job1 = makeLoadJob({}, "job1", /* pool_id = */ 1, job_func);
// auto job2 = makeLoadJob({ job1 }, "job2", /* pool_id = */ 1, job_func);
// auto job3 = makeLoadJob({ job1 }, "job3", /* pool_id = */ 1, job_func);
// auto task = makeLoadTask(async_loader, { job1, job2, job3 });
// task.schedule();
// Here we have created and scheduled a task consisting of three jobs. Job1 has no dependencies and is run first.
// Job2 and job3 depend on job1 and are run only after job1 completion. Another thread may prioritize a job and wait for it:
// async_loader->prioritize(job3, /* priority = */ 1); // higher priority jobs are run first, default priority is zero.
// job3->wait(); // blocks until job completion or cancellation and rethrow an exception (if any)
//
// AsyncLoader tracks state of all scheduled jobs. Job lifecycle is the following:
// 1) Job is constructed with PENDING status and initial priority. The job is placed into a task.
// 2) The task is scheduled with all its jobs and their dependencies. A scheduled job may be ready (i.e. have all its dependencies finished) or blocked.
// 3a) When all dependencies are successfully executed, the job became ready. A ready job is enqueued into the ready queue.
// // Another thread may prioritize a job by changing its pool and wait for it:
// async_loader->prioritize(job3, /* pool_id = */ 0); // Increase priority: 1 -> 0 (lower is better)
// job3->wait(); // Blocks until job completion or cancellation and rethrow an exception (if any)
//
// Every job has a pool associated with it. AsyncLoader starts every job in its thread pool.
// Each pool has a constant priority and a mutable maximum number of threads.
// Higher priority (lower `pool.priority` value) jobs are run first.
// No job with lower priority is started while there is at least one higher priority job ready or running.
//
// Job priority can be elevated (but never lowered):
// (a) either if it has a dependent job with higher priority:
//     in this case the priority and the pool of the dependent job are inherited during the `schedule()` call;
// (b) or if the job was explicitly prioritized by a `prioritize(job, higher_priority_pool)` call:
//     this also leads to priority inheritance for all the dependencies.
// The value stored in the load job `pool_id` field is atomic and can be changed even during job execution.
// The job is, of course, not moved from its initial thread pool, but it should use `self->pool()` for
// all new jobs it creates to avoid priority inversion. To obtain the pool in which the job is being
// executed, call `self->executionPool()` instead.
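// A minimal sketch of pool-aware job creation from inside a running job (assuming a `job_func`
// like in the example above):
//
// auto parent_func = [&] (const LoadJobPtr & self) {
//     // Use self->pool() rather than a hard-coded pool id, so nested jobs follow prioritization:
//     auto nested = makeLoadJob({}, self->pool(), "nested", job_func);
// };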
//
// === IMPLEMENTATION DETAILS ===
// All possible states and statuses of a job:
// .---------- scheduled ----------.
// ctor --> assigned --> blocked --> ready --> executing --> finished ------> removed --> dtor
// STATUS: '------------------ PENDING -----------------' '-- OK|FAILED|CANCELED --'
//
// AsyncLoader tracks state of all scheduled and finished jobs. Job lifecycle is the following:
// 1) A job is constructed with PENDING status and assigned to a pool. The job is placed into a task.
// 2) The task is scheduled with all its jobs and their dependencies. A scheduled job may be ready, blocked (and later executing).
// 3a) When all dependencies are successfully finished, the job became ready. A ready job is enqueued into the ready queue of its pool.
// 3b) If at least one of the job dependencies fails or is canceled, then this job is canceled (along with all its dependent jobs).
// On cancellation an ASYNC_LOAD_CANCELED exception is generated and saved inside LoadJob object. The job status is changed to CANCELED.
// Exception is rethrown by any existing or new `wait()` call. The job is moved to the set of the finished jobs.
// 4) The scheduled pending ready job starts execution by a worker. The job is dequeued. Callback `job_func` is called.
// Status of an executing job is PENDING. And it is still considered as a scheduled job by AsyncLoader.
// Note that `job_func` of a CANCELED job is never executed.
// 4) The ready job starts execution by a worker. The job is dequeued. Callback `job_func` is called.
// Status of an executing job is PENDING. Note that `job_func` of a CANCELED job is never executed.
// 5a) On successful execution the job status is changed to OK and all existing and new `wait()` calls finish w/o exceptions.
// 5b) Any exception thrown out of `job_func` is wrapped into an ASYNC_LOAD_FAILED exception and saved inside LoadJob.
// The job status is changed to FAILED. All the dependent jobs are canceled. The exception is rethrown from all existing and new `wait()` calls.
// 6) The job is no longer considered as scheduled and is instead moved to the finished jobs set. This is just for introspection of the finished jobs.
// 7) The task containing this job is destructed or `remove()` is explicitly called. The job is removed from the finished job set.
// 8) The job is destructed.
//
// Every job has a priority associated with it. AsyncLoader runs higher priority (greater `priority` value) jobs first. Job priority can be elevated
// (a) if either it has a dependent job with higher priority (in this case priority of a dependent job is inherited);
// (b) or job was explicitly prioritized by `prioritize(job, higher_priority)` call (this also leads to a priority inheritance for all the dependencies).
// Note that to avoid priority inversion `job_func` should use `self->priority()` to schedule new jobs in AsyncLoader or any other pool.
// Value stored in load job priority field is atomic and can be increased even during job execution.
//
// When a task is scheduled it can contain dependencies on previously scheduled jobs. These jobs can have any status. If job A being scheduled depends on
// another job B that is not yet scheduled, then job B will also be scheduled (even if the task does not contain it).
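// For example, a waiter observes the terminal status through `wait()` (a sketch; the error codes are
// the ones used in this file):
//
// try
// {
//     job->wait(); // returns on OK; rethrows ASYNC_LOAD_FAILED or ASYNC_LOAD_CANCELED otherwise
// }
// catch (...)
// {
//     // All jobs dependent on a FAILED/CANCELED job have been canceled as described above.
// }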
class AsyncLoader : private boost::noncopyable
{
private:
// Key of a pending job in the ready queue.
struct ReadyKey
// Thread pool for job execution.
// Pools control the following aspects of job execution:
// 1) Concurrency: the number of concurrently executing jobs in a pool is limited by `max_threads`.
// 2) Priority: as long as there is an executing worker with higher priority, workers with lower priorities are not started
//    (although they may finish the last job they started before the higher-priority jobs appeared).
struct Pool
{
ssize_t priority; // Ascending order
ssize_t initial_priority; // Ascending order
UInt64 ready_seqno; // Descending order
const String name;
const Priority priority;
std::unique_ptr<ThreadPool> thread_pool; // NOTE: we avoid using a `ThreadPool` queue to be able to move jobs between pools.
std::map<UInt64, LoadJobPtr> ready_queue; // FIFO queue of jobs to be executed in this pool. Map is used for faster erasing. Key is `ready_seqno`
size_t max_threads; // Max number of workers to be spawned
size_t workers = 0; // Number of currently executing workers
bool operator<(const ReadyKey & rhs) const
{
if (priority > rhs.priority)
return true;
if (priority < rhs.priority)
return false;
if (initial_priority > rhs.initial_priority)
return true;
if (initial_priority < rhs.initial_priority)
return false;
return ready_seqno < rhs.ready_seqno;
}
bool isActive() const { return workers > 0 || !ready_queue.empty(); }
};
// Scheduling information for a pending job.
struct Info
{
ssize_t initial_priority = 0; // Initial priority passed into schedule().
ssize_t priority = 0; // Elevated priority, due to priority inheritance or prioritize().
size_t dependencies_left = 0; // Current number of dependencies on pending jobs.
UInt64 ready_seqno = 0; // Zero means that job is not in ready queue.
LoadJobSet dependent_jobs; // Set of jobs dependent on this job.
// Three independent states of a non-finished job.
bool is_blocked() const { return dependencies_left > 0; }
bool is_ready() const { return dependencies_left == 0 && ready_seqno > 0; }
bool is_executing() const { return dependencies_left == 0 && ready_seqno == 0; }
// Get key of a ready job
ReadyKey key() const
{
return {.priority = priority, .initial_priority = initial_priority, .ready_seqno = ready_seqno};
}
// Three independent states of a scheduled job.
bool isBlocked() const { return dependencies_left > 0; }
bool isReady() const { return dependencies_left == 0 && ready_seqno > 0; }
bool isExecuting() const { return dependencies_left == 0 && ready_seqno == 0; }
};
public:
using Metric = CurrentMetrics::Metric;
AsyncLoader(Metric metric_threads, Metric metric_active_threads, size_t max_threads_, bool log_failures_, bool log_progress_);
// Helper struct for AsyncLoader construction
struct PoolInitializer
{
String name;
Metric metric_threads;
Metric metric_active_threads;
size_t max_threads;
Priority priority;
};
AsyncLoader(std::vector<PoolInitializer> pool_initializers, bool log_failures_, bool log_progress_);
// Stops AsyncLoader before destruction
// WARNING: all tasks instances should be destructed before associated AsyncLoader.
~AsyncLoader();
// Start workers to execute scheduled load jobs.
// Start workers to execute scheduled load jobs. Note that AsyncLoader is constructed as already started.
void start();
// Wait for all load jobs to finish, including all new jobs. So, first, take care to stop adding new jobs.
@ -356,28 +390,32 @@ public:
// - or canceled using ~Task() or remove() later.
void stop();
// Schedule all jobs of given `task` and their dependencies (if any, not scheduled yet).
// Higher priority jobs (with greater `job->priority()` value) are executed earlier.
// All dependencies of a scheduled job inherit its priority if it is higher. This way higher priority job
// never wait for (blocked by) lower priority jobs. No priority inversion is possible.
// Schedule all jobs of given `task` and their dependencies (even if they are not in task).
// All dependencies of a scheduled job inherit its pool if it has higher priority. This way higher priority job
// never waits for (blocked by) lower priority jobs. No priority inversion is possible.
// Idempotent: multiple schedule() calls for the same job are no-op.
// Note that `task` destructor ensures that all its jobs are finished (OK, FAILED or CANCELED)
// and are removed from AsyncLoader, so it is thread-safe to destroy them.
void schedule(LoadTask & task);
void schedule(const LoadTaskPtr & task);
// Schedule all tasks atomically, to ensure that only the highest-priority jobs among all tasks are run first.
void schedule(const std::vector<LoadTaskPtr> & tasks);
void schedule(const LoadTaskPtrs & tasks);
// Increase priority of a job and all its dependencies recursively.
void prioritize(const LoadJobPtr & job, ssize_t new_priority);
// Jobs from higher (than `new_pool`) priority pools are not changed.
void prioritize(const LoadJobPtr & job, size_t new_pool);
// Remove finished jobs, cancel scheduled jobs, wait for executing jobs to finish and remove them.
void remove(const LoadJobSet & jobs);
// Increase or decrease maximum number of simultaneously executing jobs.
void setMaxThreads(size_t value);
// Increase or decrease maximum number of simultaneously executing jobs in `pool`.
void setMaxThreads(size_t pool, size_t value);
size_t getMaxThreads(size_t pool) const;
const String & getPoolName(size_t pool) const;
Priority getPoolPriority(size_t pool) const;
size_t getMaxThreads() const;
size_t getScheduledJobCount() const;
// Helper class for introspection
@ -385,11 +423,10 @@ public:
{
LoadJobPtr job;
size_t dependencies_left = 0;
bool is_executing = false;
UInt64 ready_seqno = 0;
bool is_blocked = false;
bool is_ready = false;
std::optional<ssize_t> initial_priority;
std::optional<UInt64> ready_seqno;
bool is_executing = false;
};
// For introspection and debug only, see `system.async_loader` table
@ -398,42 +435,32 @@ public:
private:
void checkCycle(const LoadJobSet & jobs, std::unique_lock<std::mutex> & lock);
String checkCycleImpl(const LoadJobPtr & job, LoadJobSet & left, LoadJobSet & visited, std::unique_lock<std::mutex> & lock);
void finish(std::unique_lock<std::mutex> & lock, const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job = {});
void finish(const LoadJobPtr & job, LoadStatus status, std::exception_ptr exception_from_job, std::unique_lock<std::mutex> & lock);
void scheduleImpl(const LoadJobSet & input_jobs);
void gatherNotScheduled(const LoadJobPtr & job, LoadJobSet & jobs, std::unique_lock<std::mutex> & lock);
void prioritize(const LoadJobPtr & job, ssize_t new_priority, std::unique_lock<std::mutex> & lock);
void prioritize(const LoadJobPtr & job, size_t new_pool_id, std::unique_lock<std::mutex> & lock);
void enqueue(Info & info, const LoadJobPtr & job, std::unique_lock<std::mutex> & lock);
void spawn(std::unique_lock<std::mutex> &);
void worker();
bool canSpawnWorker(Pool & pool, std::unique_lock<std::mutex> &);
bool canWorkerLive(Pool & pool, std::unique_lock<std::mutex> &);
void updateCurrentPriorityAndSpawn(std::unique_lock<std::mutex> &);
void spawn(Pool & pool, std::unique_lock<std::mutex> &);
void worker(Pool & pool);
// Logging
const bool log_failures; // Worker should log all exceptions caught from job functions.
const bool log_progress; // Periodically log total progress
Poco::Logger * log;
std::chrono::system_clock::time_point busy_period_start_time;
AtomicStopwatch stopwatch;
size_t old_jobs = 0; // Number of jobs that were finished in previous busy period (for correct progress indication)
mutable std::mutex mutex; // Guards all the fields below.
bool is_running = false;
// Full set of scheduled pending jobs along with scheduling info.
std::unordered_map<LoadJobPtr, Info> scheduled_jobs;
// Subset of scheduled pending non-blocked jobs (waiting for a worker to be executed).
// Represent a queue of jobs in order of decreasing priority and FIFO for jobs with equal priorities.
std::map<ReadyKey, LoadJobPtr> ready_queue;
// Set of finished jobs (for introspection only, until jobs are removed).
LoadJobSet finished_jobs;
// Increasing counter for `ReadyKey` assignment (to preserve FIFO order of the jobs with equal priorities).
UInt64 last_ready_seqno = 0;
// For executing jobs. Note that we avoid using an internal queue of the pool to be able to prioritize jobs.
size_t max_threads;
size_t workers = 0;
ThreadPool pool;
bool is_running = true;
std::optional<Priority> current_priority; // highest priority among active pools
UInt64 last_ready_seqno = 0; // Increasing counter for ready queue keys.
std::unordered_map<LoadJobPtr, Info> scheduled_jobs; // Full set of scheduled pending jobs along with scheduling info.
std::vector<Pool> pools; // Thread pools for job execution and ready queues
LoadJobSet finished_jobs; // Set of finished jobs (for introspection only, until jobs are removed).
AtomicStopwatch stopwatch; // For progress indication
size_t old_jobs = 0; // Number of jobs that were finished in previous busy period (for correct progress indication)
std::chrono::system_clock::time_point busy_period_start_time;
};
}


@ -3,18 +3,11 @@
namespace DB
{
thread_local FiberInfo current_fiber_info;
AsyncTaskExecutor::AsyncTaskExecutor(std::unique_ptr<AsyncTask> task_) : task(std::move(task_))
{
createFiber();
}
FiberInfo AsyncTaskExecutor::getCurrentFiberInfo()
{
return current_fiber_info;
}
void AsyncTaskExecutor::resume()
{
if (routine_is_finished)
@ -38,10 +31,7 @@ void AsyncTaskExecutor::resume()
void AsyncTaskExecutor::resumeUnlocked()
{
auto parent_fiber_info = current_fiber_info;
current_fiber_info = FiberInfo{&fiber, &parent_fiber_info};
fiber = std::move(fiber).resume();
current_fiber_info = parent_fiber_info;
fiber.resume();
}
void AsyncTaskExecutor::cancel()
@ -69,30 +59,19 @@ struct AsyncTaskExecutor::Routine
struct AsyncCallback
{
AsyncTaskExecutor & executor;
Fiber & fiber;
SuspendCallback suspend_callback;
void operator()(int fd, Poco::Timespan timeout, AsyncEventTimeoutType type, const std::string & desc, uint32_t events)
{
executor.processAsyncEvent(fd, timeout, type, desc, events);
fiber = std::move(fiber).resume();
suspend_callback();
executor.clearAsyncEvent();
}
};
struct ResumeCallback
void operator()(SuspendCallback suspend_callback)
{
Fiber & fiber;
void operator()()
{
fiber = std::move(fiber).resume();
}
};
Fiber operator()(Fiber && sink)
{
auto async_callback = AsyncCallback{executor, sink};
auto suspend_callback = ResumeCallback{sink};
auto async_callback = AsyncCallback{executor, suspend_callback};
try
{
executor.task->run(async_callback, suspend_callback);
@ -110,18 +89,17 @@ struct AsyncTaskExecutor::Routine
}
executor.routine_is_finished = true;
return std::move(sink);
}
};
void AsyncTaskExecutor::createFiber()
{
fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this});
fiber = Fiber(fiber_stack, Routine{*this});
}
void AsyncTaskExecutor::destroyFiber()
{
boost::context::fiber to_destroy = std::move(fiber);
Fiber to_destroy = std::move(fiber);
}
String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description)


@ -22,7 +22,7 @@ enum class AsyncEventTimeoutType
};
using AsyncCallback = std::function<void(int, Poco::Timespan, AsyncEventTimeoutType, const std::string &, uint32_t)>;
using ResumeCallback = std::function<void()>;
using SuspendCallback = std::function<void()>;
struct FiberInfo
{
@ -38,7 +38,7 @@ struct FiberInfo
struct AsyncTask
{
public:
virtual void run(AsyncCallback async_callback, ResumeCallback suspend_callback) = 0;
virtual void run(AsyncCallback async_callback, SuspendCallback suspend_callback) = 0;
virtual ~AsyncTask() = default;
};
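/// A hedged sketch of a concrete task (the type and its members are hypothetical): `run()` is executed
/// inside a fiber and suspends itself through the passed callbacks until the executor resumes it.
///
/// struct ReadTask : public AsyncTask
/// {
///     void run(AsyncCallback async_callback, SuspendCallback suspend_callback) override
///     {
///         async_callback(fd, timeout, timeout_type, description, events); // report an fd to poll; this suspends the fiber
///         suspend_callback(); // or suspend unconditionally without registering an event
///     }
///     int fd = -1;
/// };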
@ -80,7 +80,6 @@ public:
};
#endif
static FiberInfo getCurrentFiberInfo();
protected:
/// Method that is called in resume() before actual fiber resuming.
/// If it returns false, resume() will return immediately without actual fiber resuming.
@ -124,48 +123,6 @@ private:
std::unique_ptr<AsyncTask> task;
};
/// Simple implementation for fiber local variable.
template <typename T>
struct FiberLocal
{
public:
FiberLocal()
{
/// Initialize main instance for this thread. Instances for fibers will inherit it,
/// (it's needed because main instance could be changed before creating fibers
/// and changes should be visible in fibers).
data[nullptr] = T();
}
T & operator*()
{
return get();
}
T * operator->()
{
return &get();
}
private:
T & get()
{
return getInstanceForFiber(AsyncTaskExecutor::getCurrentFiberInfo());
}
T & getInstanceForFiber(FiberInfo info)
{
auto it = data.find(info.fiber);
/// If it's the first request, we need to initialize instance for the fiber
/// using instance from parent fiber or main thread that created fiber.
if (it == data.end())
it = data.insert({info.fiber, getInstanceForFiber(*info.parent_fiber_info)}).first;
return it->second;
}
std::unordered_map<const Fiber *, T> data;
};
String getSocketTimeoutExceededMessageByTimeoutType(AsyncEventTimeoutType type, Poco::Timespan timeout, const String & socket_description);
}


@ -1041,18 +1041,16 @@ void AsynchronousMetrics::update(TimePoint update_time)
// It doesn't read the EOL itself.
++cpuinfo->position();
if (s.rfind("processor", 0) == 0)
static constexpr std::string_view PROCESSOR = "processor";
if (s.starts_with(PROCESSOR))
{
/// s390x example: processor 0: version = FF, identification = 039C88, machine = 3906
/// non s390x example: processor : 0
if (auto colon = s.find_first_of(':'))
{
#ifdef __s390x__
core_id = std::stoi(s.substr(10)); /// 10: length of "processor" plus 1
#else
core_id = std::stoi(s.substr(colon + 2));
#endif
}
auto core_id_start = std::ssize(PROCESSOR);
while (core_id_start < std::ssize(s) && !std::isdigit(s[core_id_start]))
++core_id_start;
core_id = std::stoi(s.substr(core_id_start));
}
else if (s.rfind("cpu MHz", 0) == 0)
{


@ -3,5 +3,147 @@
/// BOOST_USE_ASAN, BOOST_USE_TSAN and BOOST_USE_UCONTEXT should be correctly defined for sanitizers.
#include <base/defines.h>
#include <boost/context/fiber.hpp>
#include <map>
/// Class wrapper for boost::context::fiber.
/// It tracks the currently executing fiber for the thread and
/// supports storing fiber-specific data
/// that is destroyed in the fiber destructor.
class Fiber
{
private:
using Impl = boost::context::fiber;
using FiberPtr = Fiber *;
template <typename T> friend class FiberLocal;
public:
template< typename StackAlloc, typename Fn>
Fiber(StackAlloc && salloc, Fn && fn) : impl(std::allocator_arg_t(), std::forward<StackAlloc>(salloc), RoutineImpl(std::forward<Fn>(fn)))
{
}
Fiber() = default;
Fiber(Fiber && other) = default;
Fiber & operator=(Fiber && other) = default;
Fiber(const Fiber &) = delete;
Fiber & operator =(const Fiber &) = delete;
explicit operator bool() const
{
return impl.operator bool();
}
void resume()
{
/// Update information about current executing fiber.
FiberPtr & current_fiber = getCurrentFiber();
FiberPtr parent_fiber = current_fiber;
current_fiber = this;
impl = std::move(impl).resume();
/// Restore parent fiber.
current_fiber = parent_fiber;
}
private:
template <typename Fn>
struct RoutineImpl
{
struct SuspendCallback
{
Impl & impl;
void operator()()
{
impl = std::move(impl).resume();
}
};
explicit RoutineImpl(Fn && fn_) : fn(std::move(fn_))
{
}
Impl operator()(Impl && sink)
{
SuspendCallback suspend_callback{sink};
fn(suspend_callback);
return std::move(sink);
}
Fn fn;
};
static FiberPtr & getCurrentFiber()
{
thread_local static FiberPtr current_fiber;
return current_fiber;
}
/// Special wrapper to store data in a unique_ptr.
struct DataWrapper
{
virtual ~DataWrapper() = default;
};
using DataPtr = std::unique_ptr<DataWrapper>;
/// Get reference to fiber-specific data by key
/// (the pointer to the structure that uses this data).
DataPtr & getLocalData(void * key)
{
return local_data[key];
}
Impl && release()
{
return std::move(impl);
}
Impl impl;
std::map<void *, DataPtr> local_data;
};
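/// A minimal usage sketch (assuming a boost::context stack allocator such as
/// boost::context::fixedsize_stack): the routine receives a suspend callback and the owner
/// drives it with resume().
///
/// boost::context::fixedsize_stack stack;
/// Fiber fiber(stack, [] (auto suspend)
/// {
///     /* part 1 */
///     suspend(); // return control to the caller of resume()
///     /* part 2 */
/// });
/// fiber.resume(); // runs part 1
/// fiber.resume(); // runs part 2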
/// Implementation of a fiber-local variable.
/// If we are in a fiber, it returns the fiber-local data,
/// otherwise it returns its single field.
/// Fiber-local data is destroyed in the Fiber destructor.
/// The implementation is similar to boost::fiber::fiber_specific_ptr
/// (we cannot use it because we don't use the boost::fiber API).
template <typename T>
class FiberLocal
{
public:
T & operator*()
{
return get();
}
T * operator->()
{
return &get();
}
private:
struct DataWrapperImpl : public Fiber::DataWrapper
{
T impl;
};
T & get()
{
Fiber * current_fiber = Fiber::getCurrentFiber();
if (!current_fiber)
return main_instance;
Fiber::DataPtr & ptr = current_fiber->getLocalData(this);
/// Initialize instance on first request.
if (!ptr)
ptr = std::make_unique<DataWrapperImpl>();
return dynamic_cast<DataWrapperImpl *>(ptr.get())->impl;
}
T main_instance;
};
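/// A hedged usage sketch: a FiberLocal is declared like a global or thread-local variable, and each
/// fiber sees its own default-initialized copy on first access (outside fibers, `main_instance` is used):
///
/// FiberLocal<int> fiber_counter;
/// void count() { ++*fiber_counter; } // increments the instance of the current fiber, if any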
using Fiber = boost::context::fiber;


@ -15,9 +15,8 @@ namespace DB
namespace OpenTelemetry
{
///// This code can be executed inside several fibers in one thread,
///// we should use fiber local tracing context.
thread_local FiberLocal<TracingContextOnThread> current_fiber_trace_context;
/// This code can be executed inside fibers, so we should use a fiber-local tracing context.
thread_local FiberLocal<TracingContextOnThread> current_trace_context;
bool Span::addAttribute(std::string_view name, UInt64 value) noexcept
{
@ -109,7 +108,7 @@ bool Span::addAttributeImpl(std::string_view name, std::string_view value) noexc
SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
{
if (!current_fiber_trace_context->isTraceEnabled())
if (!current_trace_context->isTraceEnabled())
{
return;
}
@ -117,8 +116,8 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
/// Use try-catch to make sure the ctor is exception safe.
try
{
this->trace_id = current_fiber_trace_context->trace_id;
this->parent_span_id = current_fiber_trace_context->span_id;
this->trace_id = current_trace_context->trace_id;
this->parent_span_id = current_trace_context->span_id;
this->span_id = thread_local_rng(); // create a new id for this span
this->operation_name = _operation_name;
this->kind = _kind;
@ -137,7 +136,7 @@ SpanHolder::SpanHolder(std::string_view _operation_name, SpanKind _kind)
}
/// Set current span as parent of other spans created later on this thread.
current_fiber_trace_context->span_id = this->span_id;
current_trace_context->span_id = this->span_id;
}
void SpanHolder::finish() noexcept
@ -146,12 +145,12 @@ void SpanHolder::finish() noexcept
return;
// First of all, restore old value of current span.
assert(current_fiber_trace_context->span_id == span_id);
current_fiber_trace_context->span_id = parent_span_id;
assert(current_trace_context->span_id == span_id);
current_trace_context->span_id = parent_span_id;
try
{
auto log = current_fiber_trace_context->span_log.lock();
auto log = current_trace_context->span_log.lock();
/// The log might be disabled, check it before use
if (log)
@ -274,7 +273,7 @@ void TracingContext::serialize(WriteBuffer & buf) const
const TracingContextOnThread & CurrentContext()
{
return *current_fiber_trace_context;
return *current_trace_context;
}
void TracingContextOnThread::reset() noexcept
@ -296,7 +295,7 @@ TracingContextHolder::TracingContextHolder(
/// If any exception is raised during the construction, the tracing is not enabled on current thread.
try
{
if (current_fiber_trace_context->isTraceEnabled())
if (current_trace_context->isTraceEnabled())
{
///
/// This is not the normal case,
@ -309,15 +308,15 @@ TracingContextHolder::TracingContextHolder(
/// So this branch ensures this class can be instantiated multiple times on one same thread safely.
///
this->is_context_owner = false;
this->root_span.trace_id = current_fiber_trace_context->trace_id;
this->root_span.parent_span_id = current_fiber_trace_context->span_id;
this->root_span.trace_id = current_trace_context->trace_id;
this->root_span.parent_span_id = current_trace_context->span_id;
this->root_span.span_id = thread_local_rng();
this->root_span.operation_name = _operation_name;
this->root_span.start_time_us
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
/// Set the root span as parent of other spans created on current thread
current_fiber_trace_context->span_id = this->root_span.span_id;
current_trace_context->span_id = this->root_span.span_id;
return;
}
@ -361,10 +360,10 @@ TracingContextHolder::TracingContextHolder(
}
/// Set up trace context on current thread only when the root span is successfully initialized.
*current_fiber_trace_context = _parent_trace_context;
current_fiber_trace_context->span_id = this->root_span.span_id;
current_fiber_trace_context->trace_flags = TRACE_FLAG_SAMPLED;
current_fiber_trace_context->span_log = _span_log;
*current_trace_context = _parent_trace_context;
current_trace_context->span_id = this->root_span.span_id;
current_trace_context->trace_flags = TRACE_FLAG_SAMPLED;
current_trace_context->span_log = _span_log;
}
TracingContextHolder::~TracingContextHolder()
@ -376,7 +375,7 @@ TracingContextHolder::~TracingContextHolder()
try
{
auto shared_span_log = current_fiber_trace_context->span_log.lock();
auto shared_span_log = current_trace_context->span_log.lock();
if (shared_span_log)
{
try
@ -407,11 +406,11 @@ TracingContextHolder::~TracingContextHolder()
if (this->is_context_owner)
{
/// Clear the context on current thread
current_fiber_trace_context->reset();
current_trace_context->reset();
}
else
{
current_fiber_trace_context->span_id = this->root_span.parent_span_id;
current_trace_context->span_id = this->root_span.parent_span_id;
}
}

11
src/Common/Priority.h Normal file

@ -0,0 +1,11 @@
#pragma once
#include <base/types.h>
/// Common type for priority values.
/// A separate type (rather than `Int64`) is used just to avoid implicit conversion errors and to default-initialize the value.
struct Priority
{
Int64 value = 0; /// Note that lower value means higher priority.
constexpr operator Int64() const { return value; } /// NOLINT
};
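/// For example, comparisons go through the implicit conversion to Int64, so the higher priority
/// compares as the smaller value:
///
/// static_assert(Priority{-1} < Priority{0});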


@ -8,6 +8,9 @@
M(Query, "Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries.") \
M(SelectQuery, "Same as Query, but only for SELECT queries.") \
M(InsertQuery, "Same as Query, but only for INSERT queries.") \
M(QueriesWithSubqueries, "Count queries with all subqueries") \
M(SelectQueriesWithSubqueries, "Count SELECT queries with all subqueries") \
M(InsertQueriesWithSubqueries, "Count INSERT queries with all subqueries") \
M(AsyncInsertQuery, "Same as InsertQuery, but only for asynchronous INSERT queries.") \
M(AsyncInsertBytes, "Data size in bytes of asynchronous INSERT queries.") \
M(AsyncInsertRows, "Number of rows inserted by asynchronous INSERT queries.") \
@ -366,7 +369,7 @@ The server successfully detected this situation and will download merged part fr
M(WriteBufferFromS3Microseconds, "Time spent on writing to S3.") \
M(WriteBufferFromS3Bytes, "Bytes written to S3.") \
M(WriteBufferFromS3RequestsErrors, "Number of exceptions while writing to S3.") \
\
M(WriteBufferFromS3WaitInflightLimitMicroseconds, "Time spent on waiting while some of the current requests are done when its number reached the limit defined by s3_max_inflight_parts_for_one_file.") \
M(QueryMemoryLimitExceeded, "Number of times when memory limit exceeded for query.") \
\
M(CachedReadBufferReadFromSourceMicroseconds, "Time reading from filesystem cache source (from remote filesystem, etc)") \


@ -92,7 +92,7 @@ public:
String getName() const override { return LogElement::name(); }
static const char * getDefaultOrderBy() { return "(event_date, event_time)"; }
static const char * getDefaultOrderBy() { return "event_date, event_time"; }
protected:
Poco::Logger * log;


@ -123,7 +123,7 @@ void ThreadPoolImpl<Thread>::setQueueSize(size_t value)
template <typename Thread>
template <typename ReturnType>
ReturnType ThreadPoolImpl<Thread>::scheduleImpl(Job job, ssize_t priority, std::optional<uint64_t> wait_microseconds, bool propagate_opentelemetry_tracing_context)
ReturnType ThreadPoolImpl<Thread>::scheduleImpl(Job job, Priority priority, std::optional<uint64_t> wait_microseconds, bool propagate_opentelemetry_tracing_context)
{
auto on_error = [&](const std::string & reason)
{
@ -231,19 +231,19 @@ void ThreadPoolImpl<Thread>::startNewThreadsNoLock()
}
template <typename Thread>
void ThreadPoolImpl<Thread>::scheduleOrThrowOnError(Job job, ssize_t priority)
void ThreadPoolImpl<Thread>::scheduleOrThrowOnError(Job job, Priority priority)
{
scheduleImpl<void>(std::move(job), priority, std::nullopt);
}
template <typename Thread>
bool ThreadPoolImpl<Thread>::trySchedule(Job job, ssize_t priority, uint64_t wait_microseconds) noexcept
bool ThreadPoolImpl<Thread>::trySchedule(Job job, Priority priority, uint64_t wait_microseconds) noexcept
{
return scheduleImpl<bool>(std::move(job), priority, wait_microseconds);
}
template <typename Thread>
void ThreadPoolImpl<Thread>::scheduleOrThrow(Job job, ssize_t priority, uint64_t wait_microseconds, bool propagate_opentelemetry_tracing_context)
void ThreadPoolImpl<Thread>::scheduleOrThrow(Job job, Priority priority, uint64_t wait_microseconds, bool propagate_opentelemetry_tracing_context)
{
scheduleImpl<void>(std::move(job), priority, wait_microseconds, propagate_opentelemetry_tracing_context);
}


@ -18,6 +18,7 @@
#include <Common/OpenTelemetryTraceContext.h>
#include <Common/CurrentMetrics.h>
#include <Common/ThreadPool_fwd.h>
#include <Common/Priority.h>
#include <base/scope_guard.h>
/** Very simple thread pool similar to boost::threadpool.
@ -59,17 +60,17 @@ public:
/// If any thread was throw an exception, first exception will be rethrown from this method,
/// and exception will be cleared.
/// Also throws an exception if cannot create thread.
/// Priority: greater is higher.
/// Priority: lower is higher.
/// NOTE: Probably you should call wait() if an exception was thrown. If some previously scheduled jobs are using some objects
/// located on the stack of the current thread, the stack must not be unwound until all jobs are finished. However,
/// if ThreadPool is a local object, it will wait for all scheduled jobs in its own destructor.
void scheduleOrThrowOnError(Job job, ssize_t priority = 0);
void scheduleOrThrowOnError(Job job, Priority priority = {});
/// Similar to scheduleOrThrowOnError(...). Wait for specified amount of time and schedule a job or return false.
bool trySchedule(Job job, ssize_t priority = 0, uint64_t wait_microseconds = 0) noexcept;
bool trySchedule(Job job, Priority priority = {}, uint64_t wait_microseconds = 0) noexcept;
/// Similar to scheduleOrThrowOnError(...). Wait for specified amount of time and schedule a job or throw an exception.
void scheduleOrThrow(Job job, ssize_t priority = 0, uint64_t wait_microseconds = 0, bool propagate_opentelemetry_tracing_context = true);
void scheduleOrThrow(Job job, Priority priority = {}, uint64_t wait_microseconds = 0, bool propagate_opentelemetry_tracing_context = true);
/// Wait for all currently active jobs to be done.
/// You may call schedule and wait many times in arbitrary order.
@ -123,15 +124,15 @@ private:
struct JobWithPriority
{
Job job;
ssize_t priority;
Priority priority;
DB::OpenTelemetry::TracingContextOnThread thread_trace_context;
JobWithPriority(Job job_, ssize_t priority_, const DB::OpenTelemetry::TracingContextOnThread& thread_trace_context_)
JobWithPriority(Job job_, Priority priority_, const DB::OpenTelemetry::TracingContextOnThread & thread_trace_context_)
: job(job_), priority(priority_), thread_trace_context(thread_trace_context_) {}
bool operator< (const JobWithPriority & rhs) const
bool operator<(const JobWithPriority & rhs) const
{
return priority < rhs.priority;
return priority > rhs.priority; // Reversed for `priority_queue` max-heap to yield minimum value (i.e. highest priority) first
}
};
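// A hedged illustration of the reversed comparator (a standalone sketch): std::priority_queue is a
// max-heap over operator<, so inverting the comparison makes the numerically smallest `priority`
// (i.e. the highest priority) surface first:
//
//     std::priority_queue<JobWithPriority> queue;
//     // ... push jobs with Priority{1}, Priority{0}, Priority{2} ...
//     queue.top(); // yields the job with Priority{0}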
@ -141,7 +142,7 @@ private:
std::stack<OnDestroyCallback> on_destroy_callbacks;
template <typename ReturnType>
ReturnType scheduleImpl(Job job, ssize_t priority, std::optional<uint64_t> wait_microseconds, bool propagate_opentelemetry_tracing_context = true);
ReturnType scheduleImpl(Job job, Priority priority, std::optional<uint64_t> wait_microseconds, bool propagate_opentelemetry_tracing_context = true);
void worker(typename std::list<Thread>::iterator thread_it);
@ -227,7 +228,7 @@ public:
DB::ThreadStatus thread_status;
std::apply(function, arguments);
},
0, // default priority
{}, // default priority
0, // default wait_microseconds
propagate_opentelemetry_context
);


@ -30,6 +30,11 @@ namespace DB::ErrorCodes
extern const int ASYNC_LOAD_CANCELED;
}
struct Initializer {
size_t max_threads = 1;
Priority priority;
};
struct AsyncLoaderTest
{
AsyncLoader loader;
@ -37,10 +42,34 @@ struct AsyncLoaderTest
std::mutex rng_mutex;
pcg64 rng{randomSeed()};
explicit AsyncLoaderTest(std::vector<Initializer> initializers)
: loader(getPoolInitializers(initializers), /* log_failures = */ false, /* log_progress = */ false)
{
loader.stop(); // All tests call `start()` manually to better control ordering
}
explicit AsyncLoaderTest(size_t max_threads = 1)
: loader(CurrentMetrics::TablesLoaderThreads, CurrentMetrics::TablesLoaderThreadsActive, max_threads, /* log_failures = */ false, /* log_progress = */ false)
: AsyncLoaderTest({{.max_threads = max_threads}})
{}
std::vector<AsyncLoader::PoolInitializer> getPoolInitializers(std::vector<Initializer> initializers)
{
std::vector<AsyncLoader::PoolInitializer> result;
size_t pool_id = 0;
for (auto & desc : initializers)
{
result.push_back({
.name = fmt::format("Pool{}", pool_id),
.metric_threads = CurrentMetrics::TablesLoaderThreads,
.metric_active_threads = CurrentMetrics::TablesLoaderThreadsActive,
.max_threads = desc.max_threads,
.priority = desc.priority
});
pool_id++;
}
return result;
}
template <typename T>
T randomInt(T from, T to)
{
@ -114,16 +143,19 @@ struct AsyncLoaderTest
TEST(AsyncLoader, Smoke)
{
AsyncLoaderTest t(2);
AsyncLoaderTest t({
{.max_threads = 2, .priority = Priority{0}},
{.max_threads = 2, .priority = Priority{1}},
});
static constexpr ssize_t low_priority = -1;
static constexpr size_t low_priority_pool = 1;
std::atomic<size_t> jobs_done{0};
std::atomic<size_t> low_priority_jobs_done{0};
auto job_func = [&] (const LoadJobPtr & self) {
jobs_done++;
if (self->priority() == low_priority)
if (self->pool() == low_priority_pool)
low_priority_jobs_done++;
};
@ -135,7 +167,7 @@ TEST(AsyncLoader, Smoke)
auto job3 = makeLoadJob({ job2 }, "job3", job_func);
auto job4 = makeLoadJob({ job2 }, "job4", job_func);
auto task2 = t.schedule({ job3, job4 });
auto job5 = makeLoadJob({ job3, job4 }, low_priority, "job5", job_func);
auto job5 = makeLoadJob({ job3, job4 }, low_priority_pool, "job5", job_func);
task2->merge(t.schedule({ job5 }));
std::thread waiter_thread([=] { job5->wait(); });
@ -387,6 +419,8 @@ TEST(AsyncLoader, CancelExecutingTask)
}
}
// This test is disabled due to `MemorySanitizer: use-of-uninitialized-value` issue in `collectSymbolsFromProgramHeaders` function
// More details: https://github.com/ClickHouse/ClickHouse/pull/48923#issuecomment-1545415482
TEST(AsyncLoader, DISABLED_JobFailure)
{
AsyncLoaderTest t;
@ -536,7 +570,7 @@ TEST(AsyncLoader, TestOverload)
AsyncLoaderTest t(3);
t.loader.start();
size_t max_threads = t.loader.getMaxThreads();
size_t max_threads = t.loader.getMaxThreads(/* pool = */ 0);
std::atomic<int> executing{0};
for (int concurrency = 4; concurrency <= 8; concurrency++)
@ -562,15 +596,35 @@ TEST(AsyncLoader, TestOverload)
TEST(AsyncLoader, StaticPriorities)
{
AsyncLoaderTest t(1);
AsyncLoaderTest t({
{.max_threads = 1, .priority{0}},
{.max_threads = 1, .priority{-1}},
{.max_threads = 1, .priority{-2}},
{.max_threads = 1, .priority{-3}},
{.max_threads = 1, .priority{-4}},
{.max_threads = 1, .priority{-5}},
{.max_threads = 1, .priority{-6}},
{.max_threads = 1, .priority{-7}},
{.max_threads = 1, .priority{-8}},
{.max_threads = 1, .priority{-9}},
});
std::string schedule;
auto job_func = [&] (const LoadJobPtr & self)
{
schedule += fmt::format("{}{}", self->name, self->priority());
schedule += fmt::format("{}{}", self->name, self->pool());
};
// Job DAG with priorities. After priority inheritance from H9, jobs D9 and E9 can be
// executed in undefined order (Tested further in DynamicPriorities)
// A0(9) -+-> B3
// |
// `-> C4
// |
// `-> D1(9) -.
// | +-> F0(9) --> G0(9) --> H9
// `-> E2(9) -'
std::vector<LoadJobPtr> jobs;
jobs.push_back(makeLoadJob({}, 0, "A", job_func)); // 0
jobs.push_back(makeLoadJob({ jobs[0] }, 3, "B", job_func)); // 1
@ -584,25 +638,113 @@ TEST(AsyncLoader, StaticPriorities)
t.loader.start();
t.loader.wait();
ASSERT_TRUE(schedule == "A9E9D9F9G9H9C4B3" || schedule == "A9D9E9F9G9H9C4B3");
}
ASSERT_EQ(schedule, "A9E9D9F9G9H9C4B3");
TEST(AsyncLoader, SimplePrioritization)
{
AsyncLoaderTest t({
{.max_threads = 1, .priority{0}},
{.max_threads = 1, .priority{-1}},
{.max_threads = 1, .priority{-2}},
});
t.loader.start();
std::atomic<int> executed{0}; // Number of previously executed jobs (to test execution order)
LoadJobPtr job_to_prioritize;
auto job_func_A_booster = [&] (const LoadJobPtr &)
{
ASSERT_EQ(executed++, 0);
t.loader.prioritize(job_to_prioritize, 2);
};
auto job_func_B_tester = [&] (const LoadJobPtr &)
{
ASSERT_EQ(executed++, 2);
};
auto job_func_C_boosted = [&] (const LoadJobPtr &)
{
ASSERT_EQ(executed++, 1);
};
std::vector<LoadJobPtr> jobs;
jobs.push_back(makeLoadJob({}, 1, "A", job_func_A_booster)); // 0
jobs.push_back(makeLoadJob({jobs[0]}, 1, "B", job_func_B_tester)); // 1
jobs.push_back(makeLoadJob({}, 0, "C", job_func_C_boosted)); // 2
auto task = makeLoadTask(t.loader, { jobs.begin(), jobs.end() });
job_to_prioritize = jobs[2]; // C
scheduleAndWaitLoadAll(task);
}
TEST(AsyncLoader, DynamicPriorities)
{
AsyncLoaderTest t(1);
AsyncLoaderTest t({
{.max_threads = 1, .priority{0}},
{.max_threads = 1, .priority{-1}},
{.max_threads = 1, .priority{-2}},
{.max_threads = 1, .priority{-3}},
{.max_threads = 1, .priority{-4}},
{.max_threads = 1, .priority{-5}},
{.max_threads = 1, .priority{-6}},
{.max_threads = 1, .priority{-7}},
{.max_threads = 1, .priority{-8}},
{.max_threads = 1, .priority{-9}},
});
for (bool prioritize : {false, true})
{
// Although all pools have max_threads=1, workers from different pools can run simultaneously just after `prioritize()` call
std::barrier sync(2);
bool wait_sync = prioritize;
std::mutex schedule_mutex;
std::string schedule;
LoadJobPtr job_to_prioritize;
// The order of execution of jobs D and E after prioritization is undefined, because it depends on `ready_seqno`
// (which depends on the initial `schedule()` order, which in turn depends on `std::unordered_map` order).
// So we have to obtain `ready_seqno` to be sure.
UInt64 ready_seqno_D = 0;
UInt64 ready_seqno_E = 0;
auto job_func = [&] (const LoadJobPtr & self)
{
{
std::unique_lock lock{schedule_mutex};
schedule += fmt::format("{}{}", self->name, self->executionPool());
}
if (prioritize && self->name == "C")
t.loader.prioritize(job_to_prioritize, 9); // dynamic prioritization
schedule += fmt::format("{}{}", self->name, self->priority());
{
for (const auto & state : t.loader.getJobStates())
{
if (state.job->name == "D")
ready_seqno_D = state.ready_seqno;
if (state.job->name == "E")
ready_seqno_E = state.ready_seqno;
}
// Jobs D and E should be enqueued at this point
ASSERT_LT(0, ready_seqno_D);
ASSERT_LT(0, ready_seqno_E);
// Dynamic prioritization G0 -> G9
// Note that it will spawn a concurrent worker in a higher-priority pool
t.loader.prioritize(job_to_prioritize, 9);
sync.arrive_and_wait(); // (A) wait for the higher-priority worker (B) to show that they can run concurrently
}
if (wait_sync && (self->name == "D" || self->name == "E"))
{
wait_sync = false;
sync.arrive_and_wait(); // (B)
}
};
// Job DAG with initial priorities. During execution of C4, the priority of job G0 is raised to G9, postponing execution of job B3.
@ -624,14 +766,19 @@ TEST(AsyncLoader, DynamicPriorities)
jobs.push_back(makeLoadJob({ jobs[6] }, 0, "H", job_func)); // 7
auto task = t.schedule({ jobs.begin(), jobs.end() });
job_to_prioritize = jobs[6];
job_to_prioritize = jobs[6]; // G
t.loader.start();
t.loader.wait();
t.loader.stop();
if (prioritize)
{
if (ready_seqno_D < ready_seqno_E)
ASSERT_EQ(schedule, "A4C4D9E9F9G9B3H0");
else
ASSERT_EQ(schedule, "A4C4E9D9F9G9B3H0");
}
else
ASSERT_EQ(schedule, "A4C4B3E2D1F0G0H0");
}
@ -742,8 +889,64 @@ TEST(AsyncLoader, SetMaxThreads)
syncs[idx]->arrive_and_wait(); // (A)
sync_index++;
if (sync_index < syncs.size())
t.loader.setMaxThreads(max_threads_values[sync_index]);
t.loader.setMaxThreads(/* pool = */ 0, max_threads_values[sync_index]);
syncs[idx]->arrive_and_wait(); // (B) this sync point is required to allow `executing` value to go back down to zero after we change number of workers
}
t.loader.wait();
}
TEST(AsyncLoader, DynamicPools)
{
const size_t max_threads[] { 2, 10 };
const int jobs_in_chain = 16;
AsyncLoaderTest t({
{.max_threads = max_threads[0], .priority{0}},
{.max_threads = max_threads[1], .priority{-1}},
});
t.loader.start();
std::atomic<size_t> executing[2] { 0, 0 }; // Number of currently executing jobs per pool
for (int concurrency = 1; concurrency <= 12; concurrency++)
{
std::atomic<bool> boosted{false}; // Visible concurrency was increased
std::atomic<int> left{concurrency * jobs_in_chain / 2}; // Number of jobs to start before `prioritize()` call
LoadJobSet jobs_to_prioritize;
auto job_func = [&] (const LoadJobPtr & self)
{
auto pool_id = self->executionPool();
executing[pool_id]++;
if (executing[pool_id] > max_threads[0])
boosted = true;
ASSERT_LE(executing[pool_id], max_threads[pool_id]);
// Dynamic prioritization
if (--left == 0)
{
for (const auto & job : jobs_to_prioritize)
t.loader.prioritize(job, 1);
}
t.randomSleepUs(100, 200, 100);
ASSERT_LE(executing[pool_id], max_threads[pool_id]);
executing[pool_id]--;
};
std::vector<LoadTaskPtr> tasks;
tasks.reserve(concurrency);
for (int i = 0; i < concurrency; i++)
tasks.push_back(makeLoadTask(t.loader, t.chainJobSet(jobs_in_chain, job_func)));
jobs_to_prioritize = getGoals(tasks); // All jobs
scheduleAndWaitLoadAll(tasks);
ASSERT_EQ(executing[0], 0);
ASSERT_EQ(executing[1], 0);
ASSERT_EQ(boosted, concurrency > 2);
boosted = false;
}
}
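
These tests exercise the reworked pool-based API: each pool pairs a thread limit with a priority, jobs record which pool they run in, and `prioritize()` moves a job into a more urgent pool. A minimal standalone sketch of the pool-ordering idea, using hypothetical toy types rather than the real `AsyncLoader` interface:

#include <algorithm>
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

// Toy model: each pool has a priority value (smaller = more urgent, as in
// the tests above), and prioritize() moves a job into a more urgent pool.
struct Job
{
    std::string name;
    size_t pool = 0; // index into the pool table
};

int main()
{
    // Pool priorities: pool 2 is the most urgent (priority -2).
    const std::vector<int> pool_priority{0, -1, -2};

    std::vector<Job> jobs{{"A", 0}, {"B", 0}, {"C", 0}};

    // prioritize(): move job "C" into pool 2; a real scheduler would also
    // propagate the boost to C's not-yet-finished dependencies.
    for (auto & job : jobs)
        if (job.name == "C")
            job.pool = 2;

    // Execute in pool-priority order (stable within a pool).
    std::stable_sort(jobs.begin(), jobs.end(), [&](const Job & l, const Job & r)
    {
        return pool_priority[l.pool] < pool_priority[r.pool];
    });

    std::string schedule;
    for (const auto & job : jobs)
        schedule += job.name;
    assert(schedule == "CAB"); // the boosted job runs first
    std::cout << schedule << '\n';
}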
View File
@ -28,7 +28,7 @@ void CachedCompressedReadBuffer::initInput()
}
void CachedCompressedReadBuffer::prefetch(int64_t priority)
void CachedCompressedReadBuffer::prefetch(Priority priority)
{
initInput();
file_in->prefetch(priority);
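
This and the following prefetch signatures swap a raw `int64_t` for a dedicated `Priority` type. Judging from the usage later in this diff (`Priority{read_settings.priority.value + priority.value}`), it behaves like a strong typedef over an integer; a sketch of such a wrapper, an assumption rather than the repository's actual definition:

#include <cstdint>

/// Hypothetical strong-typedef sketch: wrapping the raw integer stops an
/// unrelated int64_t (a size, an offset, ...) from being passed where a
/// priority is expected, while the value stays explicitly reachable.
struct Priority
{
    int64_t value = 0; /// smaller value = higher priority, as in the tests above
};

/// Mirrors the combining pattern visible in AsynchronousBoundedReadBuffer below.
inline Priority combine(Priority base, Priority boost)
{
    return Priority{base.value + boost.value};
}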
View File
@ -36,7 +36,7 @@ private:
bool nextImpl() override;
void prefetch(int64_t priority) override;
void prefetch(Priority priority) override;
/// Passed into file_in.
ReadBufferFromFileBase::ProfileCallback profile_callback;
View File
@ -51,7 +51,7 @@ CompressedReadBufferFromFile::CompressedReadBufferFromFile(std::unique_ptr<ReadB
}
void CompressedReadBufferFromFile::prefetch(int64_t priority)
void CompressedReadBufferFromFile::prefetch(Priority priority)
{
file_in.prefetch(priority);
}
View File
@ -43,7 +43,7 @@ private:
bool nextImpl() override;
void prefetch(int64_t priority) override;
void prefetch(Priority priority) override;
public:
explicit CompressedReadBufferFromFile(std::unique_ptr<ReadBufferFromFileBase> buf, bool allow_different_codecs_ = false);
View File
@ -47,7 +47,8 @@ struct Settings;
M(Bool, compress_snapshots_with_zstd_format, true, "Write compressed snapshots in ZSTD format (instead of custom LZ4)", 0) \
M(UInt64, configuration_change_tries_count, 20, "How many times we will try to apply configuration change (add/remove server) to the cluster", 0) \
M(UInt64, max_log_file_size, 50 * 1024 * 1024, "Max size of the Raft log file. If possible, each created log file will preallocate this amount of bytes on disk. Set to 0 to disable the limit", 0) \
M(UInt64, log_file_overallocate_size, 50 * 1024 * 1024, "If max_log_file_size is not set to 0, this value will be added to it for preallocating bytes on disk. If a log record is larger than this value, it could lead to uncaught out-of-space issues so a larger value is preferred", 0)
M(UInt64, log_file_overallocate_size, 50 * 1024 * 1024, "If max_log_file_size is not set to 0, this value will be added to it for preallocating bytes on disk. If a log record is larger than this value, it could lead to uncaught out-of-space issues so a larger value is preferred", 0) \
M(UInt64, min_request_size_for_cache, 50 * 1024, "Minimal size of the request to cache the deserialization result. Caching can have negative effect on latency for smaller requests, set to 0 to disable", 0)
DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)
View File
@ -471,17 +471,6 @@ void KeeperServer::shutdown()
namespace
{
// Serialize the request with all the necessary information for the leader
// we don't know ZXID and digest yet so we don't serialize it
nuraft::ptr<nuraft::buffer> getZooKeeperRequestMessage(const KeeperStorage::RequestForSession & request_for_session)
{
DB::WriteBufferFromNuraftBuffer write_buf;
DB::writeIntBinary(request_for_session.session_id, write_buf);
request_for_session.request->write(write_buf);
DB::writeIntBinary(request_for_session.time, write_buf);
return write_buf.getBuffer();
}
// Serialize the request for the log entry
nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestForSession & request_for_session)
{
@ -489,12 +478,11 @@ nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestFor
DB::writeIntBinary(request_for_session.session_id, write_buf);
request_for_session.request->write(write_buf);
DB::writeIntBinary(request_for_session.time, write_buf);
DB::writeIntBinary(request_for_session.zxid, write_buf);
assert(request_for_session.digest);
DB::writeIntBinary(request_for_session.digest->version, write_buf);
if (request_for_session.digest->version != KeeperStorage::DigestVersion::NO_DIGEST)
DB::writeIntBinary(request_for_session.digest->value, write_buf);
/// we fill with dummy values to eliminate an unnecessary copy later on when we write the correct values
DB::writeIntBinary(static_cast<int64_t>(0), write_buf); /// zxid
DB::writeIntBinary(KeeperStorage::DigestVersion::NO_DIGEST, write_buf); /// digest version or NO_DIGEST flag
DB::writeIntBinary(static_cast<uint64_t>(0), write_buf); /// digest value
/// if new fields are added, update KeeperStateMachine::ZooKeeperLogSerializationVersion along with parseRequest function and PreAppendLog callback handler
return write_buf.getBuffer();
}
@ -512,9 +500,7 @@ RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForS
{
std::vector<nuraft::ptr<nuraft::buffer>> entries;
for (const auto & request_for_session : requests_for_sessions)
{
entries.push_back(getZooKeeperRequestMessage(request_for_session));
}
entries.push_back(getZooKeeperLogEntry(request_for_session));
std::lock_guard lock{server_write_mutex};
if (is_recovering)
@ -635,14 +621,50 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
assert(entry->get_val_type() == nuraft::app_log);
auto next_zxid = state_machine->getNextZxid();
auto & entry_buf = entry->get_buf();
auto request_for_session = state_machine->parseRequest(entry_buf);
request_for_session.zxid = next_zxid;
if (!state_machine->preprocess(request_for_session))
auto entry_buf = entry->get_buf_ptr();
KeeperStateMachine::ZooKeeperLogSerializationVersion serialization_version;
auto request_for_session = state_machine->parseRequest(*entry_buf, /*final=*/false, &serialization_version);
request_for_session->zxid = next_zxid;
if (!state_machine->preprocess(*request_for_session))
return nuraft::cb_func::ReturnCode::ReturnNull;
request_for_session.digest = state_machine->getNodesDigest();
entry = nuraft::cs_new<nuraft::log_entry>(entry->get_term(), getZooKeeperLogEntry(request_for_session), entry->get_val_type());
request_for_session->digest = state_machine->getNodesDigest();
/// older versions of Keeper can send logs that are missing some fields
size_t bytes_missing = 0;
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
bytes_missing += sizeof(request_for_session->time);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_ZXID_DIGEST)
bytes_missing += sizeof(request_for_session->zxid) + sizeof(request_for_session->digest->version) + sizeof(request_for_session->digest->value);
if (bytes_missing != 0)
{
auto new_buffer = nuraft::buffer::alloc(entry_buf->size() + bytes_missing);
memcpy(new_buffer->data_begin(), entry_buf->data_begin(), entry_buf->size());
entry_buf = std::move(new_buffer);
entry = nuraft::cs_new<nuraft::log_entry>(entry->get_term(), entry_buf, entry->get_val_type());
}
size_t write_buffer_header_size
= sizeof(request_for_session->zxid) + sizeof(request_for_session->digest->version) + sizeof(request_for_session->digest->value);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
write_buffer_header_size += sizeof(request_for_session->time);
auto * buffer_start = reinterpret_cast<BufferBase::Position>(entry_buf->data_begin() + entry_buf->size() - write_buffer_header_size);
WriteBuffer write_buf(buffer_start, write_buffer_header_size);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
writeIntBinary(request_for_session->time, write_buf);
writeIntBinary(request_for_session->zxid, write_buf);
writeIntBinary(request_for_session->digest->version, write_buf);
if (request_for_session->digest->version != KeeperStorage::NO_DIGEST)
writeIntBinary(request_for_session->digest->value, write_buf);
break;
}
case nuraft::cb_func::AppendLogFailed:
@ -654,8 +676,8 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
assert(entry->get_val_type() == nuraft::app_log);
auto & entry_buf = entry->get_buf();
auto request_for_session = state_machine->parseRequest(entry_buf);
state_machine->rollbackRequest(request_for_session, true);
auto request_for_session = state_machine->parseRequest(entry_buf, true);
state_machine->rollbackRequest(*request_for_session, true);
break;
}
default:
View File
@ -1,16 +1,16 @@
#include <cerrno>
#include <base/errnoToString.h>
#include <base/defines.h>
#include <future>
#include <Coordination/KeeperSnapshotManager.h>
#include <Coordination/KeeperStateMachine.h>
#include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <IO/ReadHelpers.h>
#include <base/defines.h>
#include <base/errnoToString.h>
#include <sys/mman.h>
#include <Common/ProfileEvents.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Common/ProfileEvents.h>
#include <Common/logger_useful.h>
#include "Coordination/KeeperStorage.h"
@ -60,6 +60,7 @@ KeeperStateMachine::KeeperStateMachine(
coordination_settings->dead_session_check_period_ms.totalMilliseconds())
, responses_queue(responses_queue_)
, snapshots_queue(snapshots_queue_)
, min_request_size_to_cache(coordination_settings_->min_request_size_for_cache)
, last_committed_idx(0)
, log(&Poco::Logger::get("KeeperStateMachine"))
, superdigest(superdigest_)
@ -149,19 +150,19 @@ void assertDigest(
nuraft::ptr<nuraft::buffer> KeeperStateMachine::pre_commit(uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
if (!request_for_session.zxid)
request_for_session.zxid = log_idx;
auto request_for_session = parseRequest(data, /*final=*/false);
if (!request_for_session->zxid)
request_for_session->zxid = log_idx;
preprocess(request_for_session);
preprocess(*request_for_session);
return nullptr;
}
KeeperStorage::RequestForSession KeeperStateMachine::parseRequest(nuraft::buffer & data)
std::shared_ptr<KeeperStorage::RequestForSession> KeeperStateMachine::parseRequest(nuraft::buffer & data, bool final, ZooKeeperLogSerializationVersion * serialization_version)
{
ReadBufferFromNuraftBuffer buffer(data);
KeeperStorage::RequestForSession request_for_session;
readIntBinary(request_for_session.session_id, buffer);
auto request_for_session = std::make_shared<KeeperStorage::RequestForSession>();
readIntBinary(request_for_session->session_id, buffer);
int32_t length;
Coordination::read(length, buffer);
@ -169,29 +170,81 @@ KeeperStorage::RequestForSession KeeperStateMachine::parseRequest(nuraft::buffer
int32_t xid;
Coordination::read(xid, buffer);
static constexpr std::array non_cacheable_xids{
Coordination::WATCH_XID,
Coordination::PING_XID,
Coordination::AUTH_XID,
Coordination::CLOSE_XID,
};
const bool should_cache
= min_request_size_to_cache != 0 && request_for_session->session_id != -1 && data.size() >= min_request_size_to_cache
&& std::all_of(
non_cacheable_xids.begin(), non_cacheable_xids.end(), [&](const auto non_cacheable_xid) { return xid != non_cacheable_xid; });
if (should_cache)
{
std::lock_guard lock(request_cache_mutex);
if (auto xid_to_request_it = parsed_request_cache.find(request_for_session->session_id);
xid_to_request_it != parsed_request_cache.end())
{
auto & xid_to_request = xid_to_request_it->second;
if (auto request_it = xid_to_request.find(xid); request_it != xid_to_request.end())
{
if (final)
{
auto request = std::move(request_it->second);
xid_to_request.erase(request_it);
return request;
}
else
return request_it->second;
}
}
}
Coordination::OpNum opnum;
Coordination::read(opnum, buffer);
request_for_session.request = Coordination::ZooKeeperRequestFactory::instance().get(opnum);
request_for_session.request->xid = xid;
request_for_session.request->readImpl(buffer);
request_for_session->request = Coordination::ZooKeeperRequestFactory::instance().get(opnum);
request_for_session->request->xid = xid;
request_for_session->request->readImpl(buffer);
if (!buffer.eof())
readIntBinary(request_for_session.time, buffer);
else /// backward compatibility
request_for_session.time
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
if (!buffer.eof())
readIntBinary(request_for_session.zxid, buffer);
using enum ZooKeeperLogSerializationVersion;
ZooKeeperLogSerializationVersion version = INITIAL;
if (!buffer.eof())
{
request_for_session.digest.emplace();
readIntBinary(request_for_session.digest->version, buffer);
if (request_for_session.digest->version != KeeperStorage::DigestVersion::NO_DIGEST)
readIntBinary(request_for_session.digest->value, buffer);
version = WITH_TIME;
readIntBinary(request_for_session->time, buffer);
}
else
request_for_session->time
= std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
if (!buffer.eof())
{
version = WITH_ZXID_DIGEST;
readIntBinary(request_for_session->zxid, buffer);
chassert(!buffer.eof());
request_for_session->digest.emplace();
readIntBinary(request_for_session->digest->version, buffer);
if (request_for_session->digest->version != KeeperStorage::DigestVersion::NO_DIGEST || !buffer.eof())
readIntBinary(request_for_session->digest->value, buffer);
}
if (serialization_version)
*serialization_version = version;
if (should_cache && !final)
{
std::lock_guard lock(request_cache_mutex);
parsed_request_cache[request_for_session->session_id].emplace(xid, request_for_session);
}
return request_for_session;
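
parseRequest above distinguishes serialization versions purely by whether the buffer is exhausted; the log-entry layout it implies (reconstructed from the reads above, a summary rather than authoritative documentation):

/// Log entry layout as read by parseRequest(), per serialization version:
///
///   INITIAL:          session_id, length, xid, opnum, request body
///   WITH_TIME:        everything above, then time
///   WITH_ZXID_DIGEST: everything above, then zxid, digest version,
///                     digest value (present unless version == NO_DIGEST)
///
/// Each version only appends fields, so eof() checks are enough to tell the
/// versions apart, and PreAppendLog can upgrade an old entry by allocating
/// `bytes_missing` extra bytes at the tail and rewriting the trailing
/// fields in place.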
@ -231,15 +284,15 @@ bool KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req
nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
if (!request_for_session.zxid)
request_for_session.zxid = log_idx;
auto request_for_session = parseRequest(data, true);
if (!request_for_session->zxid)
request_for_session->zxid = log_idx;
/// Special processing of session_id request
if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID)
if (request_for_session->request->getOpNum() == Coordination::OpNum::SessionID)
{
const Coordination::ZooKeeperSessionIDRequest & session_id_request
= dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session.request);
= dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session->request);
int64_t session_id;
std::shared_ptr<Coordination::ZooKeeperSessionIDResponse> response = std::make_shared<Coordination::ZooKeeperSessionIDResponse>();
response->internal_id = session_id_request.internal_id;
@ -261,25 +314,34 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
}
else
{
if (request_for_session->request->getOpNum() == Coordination::OpNum::Close)
{
std::lock_guard lock(request_cache_mutex);
parsed_request_cache.erase(request_for_session->session_id);
}
std::lock_guard lock(storage_and_responses_lock);
KeeperStorage::ResponsesForSessions responses_for_sessions = storage->processRequest(
request_for_session.request, request_for_session.session_id, request_for_session.zxid);
KeeperStorage::ResponsesForSessions responses_for_sessions
= storage->processRequest(request_for_session->request, request_for_session->session_id, request_for_session->zxid);
for (auto & response_for_session : responses_for_sessions)
if (!responses_queue.push(response_for_session))
{
ProfileEvents::increment(ProfileEvents::KeeperCommitsFailed);
LOG_WARNING(log, "Failed to push response with session id {} to the queue, probably because of shutdown", response_for_session.session_id);
LOG_WARNING(
log,
"Failed to push response with session id {} to the queue, probably because of shutdown",
response_for_session.session_id);
}
if (keeper_context->digest_enabled && request_for_session.digest)
assertDigest(*request_for_session.digest, storage->getNodesDigest(true), *request_for_session.request, true);
if (keeper_context->digest_enabled && request_for_session->digest)
assertDigest(*request_for_session->digest, storage->getNodesDigest(true), *request_for_session->request, true);
}
ProfileEvents::increment(ProfileEvents::KeeperCommits);
last_committed_idx = log_idx;
if (commit_callback)
commit_callback(request_for_session);
commit_callback(*request_for_session);
return nullptr;
}
@ -330,14 +392,14 @@ void KeeperStateMachine::commit_config(const uint64_t /* log_idx */, nuraft::ptr
void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data);
auto request_for_session = parseRequest(data, true);
// If we received a log from an older node, use the log_idx as the zxid
// log_idx will always be larger or equal to the zxid so we can safely do this
// (log_idx is increased for all logs, while zxid is only increased for requests)
if (!request_for_session.zxid)
request_for_session.zxid = log_idx;
if (!request_for_session->zxid)
request_for_session->zxid = log_idx;
rollbackRequest(request_for_session, false);
rollbackRequest(*request_for_session, false);
}
void KeeperStateMachine::rollbackRequest(const KeeperStorage::RequestForSession & request_for_session, bool allow_missing)
@ -541,11 +603,7 @@ void KeeperStateMachine::processReadRequest(const KeeperStorage::RequestForSessi
/// Pure local request, just process it with storage
std::lock_guard lock(storage_and_responses_lock);
auto responses = storage->processRequest(
request_for_session.request,
request_for_session.session_id,
std::nullopt,
true /*check_acl*/,
true /*is_local*/);
request_for_session.request, request_for_session.session_id, std::nullopt, true /*check_acl*/, true /*is_local*/);
for (const auto & response : responses)
if (!responses_queue.push(response))
LOG_WARNING(log, "Failed to push response with session id {} to the queue, probably because of shutdown", response.session_id);
View File
@ -36,7 +36,22 @@ public:
/// Read state from the latest snapshot
void init();
static KeeperStorage::RequestForSession parseRequest(nuraft::buffer & data);
enum ZooKeeperLogSerializationVersion
{
INITIAL = 0,
WITH_TIME = 1,
WITH_ZXID_DIGEST = 2,
};
/// The lifetime of a parsed request is:
/// [preprocess/PreAppendLog -> commit]
/// [preprocess/PreAppendLog -> rollback]
/// On commit and rollback we can remove the parsed request to keep memory usage at a minimum.
/// The request cache is also cleaned on session close in case something strange happened.
///
/// final - whether this is the final time we will fetch the request, so we can safely remove it from the cache
/// serialization_version - information about which fields were parsed from the buffer, so we can modify the buffer accordingly
std::shared_ptr<KeeperStorage::RequestForSession> parseRequest(nuraft::buffer & data, bool final, ZooKeeperLogSerializationVersion * serialization_version = nullptr);
bool preprocess(const KeeperStorage::RequestForSession & request_for_session);
@ -138,6 +153,13 @@ private:
/// for request.
mutable std::mutex storage_and_responses_lock;
std::unordered_map<int64_t, std::unordered_map<Coordination::XID, std::shared_ptr<KeeperStorage::RequestForSession>>> parsed_request_cache;
uint64_t min_request_size_to_cache{0};
/// We only need to protect access to the map itself;
/// requests can be modified from anywhere without a lock because a single request
/// can be processed by only one thread at any point in time
std::mutex request_cache_mutex;
/// Last committed Raft log number.
std::atomic<uint64_t> last_committed_idx;
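
The comments above pin down the cache discipline: insert on parse, hand out and erase on the final fetch, wipe on session close. A toy sketch of that discipline under the same locking rule, with hypothetical simplified types rather than the real KeeperStateMachine members:

#include <cstdint>
#include <memory>
#include <mutex>
#include <unordered_map>

struct Request { /* parsed ZooKeeper request */ };
using RequestPtr = std::shared_ptr<Request>;

class RequestCache
{
public:
    // parse time: remember the parsed request for this (session, xid)
    void put(int64_t session_id, int32_t xid, RequestPtr request)
    {
        std::lock_guard lock(mutex);
        cache[session_id].emplace(xid, std::move(request));
    }

    // commit/rollback ("final" fetch): hand the request out and drop it
    RequestPtr take(int64_t session_id, int32_t xid)
    {
        std::lock_guard lock(mutex);
        auto session_it = cache.find(session_id);
        if (session_it == cache.end())
            return nullptr;
        auto request_it = session_it->second.find(xid);
        if (request_it == session_it->second.end())
            return nullptr;
        auto request = std::move(request_it->second);
        session_it->second.erase(request_it);
        return request;
    }

    // session close: drop everything the session left behind
    void dropSession(int64_t session_id)
    {
        std::lock_guard lock(mutex);
        cache.erase(session_id);
    }

private:
    // Only the map itself needs the lock; a single request is never
    // processed by two threads at once (same reasoning as above).
    std::mutex mutex;
    std::unordered_map<int64_t, std::unordered_map<int32_t, RequestPtr>> cache;
};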
View File
@ -110,7 +110,7 @@ public:
struct RequestForSession
{
int64_t session_id;
int64_t time;
int64_t time{0};
Coordination::ZooKeeperRequestPtr request;
int64_t zxid{0};
std::optional<Digest> digest;
View File
@ -1,5 +1,4 @@
#include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <Common/logger_useful.h>
namespace DB
{
View File
@ -63,7 +63,7 @@ namespace DB
\
M(Bool, disable_internal_dns_cache, false, "Disable internal DNS caching at all.", 0) \
M(Int32, dns_cache_update_period, 15, "Internal DNS cache update period in seconds.", 0) \
M(UInt32, dns_max_consecutive_failures, 1024, "Max connection failures before dropping host from ClickHouse DNS cache.", 0) \
M(UInt32, dns_max_consecutive_failures, 1024, "Max DNS resolution failures for a hostname before dropping the hostname from the ClickHouse DNS cache.", 0) \
\
M(UInt64, max_table_size_to_drop, 50000000000lu, "If the size of a table is greater than this value (in bytes), then the table cannot be dropped with any DROP query.", 0) \
M(UInt64, max_partition_size_to_drop, 50000000000lu, "Same as max_table_size_to_drop, but for the partitions.", 0) \
View File
@ -78,6 +78,7 @@ class IColumn;
M(UInt64, s3_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to S3.", 0) \
M(UInt64, s3_upload_part_size_multiply_factor, 2, "Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3.", 0) \
M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to S3 s3_min_upload_part_size multiplied by s3_upload_part_size_multiply_factor.", 0) \
M(UInt64, s3_max_inflight_parts_for_one_file, 20, "The maximum number of concurrently loaded parts in a multipart upload request. 0 means unlimited.", 0) \
M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \
M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \
M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \
@ -93,6 +94,7 @@ class IColumn;
M(Bool, s3_check_objects_after_upload, false, "Check each uploaded object to s3 with head request to be sure that upload was successful", 0) \
M(Bool, s3_allow_parallel_part_upload, true, "Use multiple threads for s3 multipart upload. It may lead to slightly higher memory usage", 0) \
M(Bool, s3_throw_on_zero_files_match, false, "Throw an error, when ListObjects request cannot match any files", 0) \
M(UInt64, s3_retry_attempts, 10, "Setting for Aws::Client::RetryStrategy, Aws::Client does retries itself, 0 means no retries", 0) \
M(Bool, enable_s3_requests_logging, false, "Enable very explicit logging of S3 requests. Makes sense for debug only.", 0) \
M(UInt64, hdfs_replication, 0, "The actual number of replications can be specified when the hdfs file is created.", 0) \
M(Bool, hdfs_truncate_on_insert, false, "Enables or disables truncate before insert in s3 engine tables", 0) \
View File
@ -138,19 +138,6 @@ namespace
}
}
String getCurrentKey(const String & path, const DiskEncryptedSettings & settings)
{
auto it = settings.keys.find(settings.current_key_id);
if (it == settings.keys.end())
throw Exception(
ErrorCodes::DATA_ENCRYPTION_ERROR,
"Not found a key with the current ID {} required to cipher file {}",
settings.current_key_id,
quoteString(path));
return it->second;
}
String getKey(const String & path, const FileEncryption::Header & header, const DiskEncryptedSettings & settings)
{
auto it = settings.keys.find(header.key_id);
@ -203,18 +190,19 @@ private:
};
DiskEncrypted::DiskEncrypted(
const String & name_, const Poco::Util::AbstractConfiguration & config_, const String & config_prefix_, const DisksMap & map_)
: DiskEncrypted(name_, parseDiskEncryptedSettings(name_, config_, config_prefix_, map_))
const String & name_, const Poco::Util::AbstractConfiguration & config_, const String & config_prefix_, const DisksMap & map_, bool use_fake_transaction_)
: DiskEncrypted(name_, parseDiskEncryptedSettings(name_, config_, config_prefix_, map_), use_fake_transaction_)
{
}
DiskEncrypted::DiskEncrypted(const String & name_, std::unique_ptr<const DiskEncryptedSettings> settings_)
DiskEncrypted::DiskEncrypted(const String & name_, std::unique_ptr<const DiskEncryptedSettings> settings_, bool use_fake_transaction_)
: IDisk(name_)
, delegate(settings_->wrapped_disk)
, encrypted_name(name_)
, disk_path(settings_->disk_path)
, disk_absolute_path(settings_->wrapped_disk->getPath() + settings_->disk_path)
, current_settings(std::move(settings_))
, use_fake_transaction(use_fake_transaction_)
{
delegate->createDirectories(disk_path);
}
@ -309,38 +297,6 @@ std::unique_ptr<ReadBufferFromFileBase> DiskEncrypted::readFile(
return std::make_unique<ReadBufferFromEncryptedFile>(settings.local_fs_buffer_size, std::move(buffer), key, header);
}
std::unique_ptr<WriteBufferFromFileBase> DiskEncrypted::writeFile(const String & path, size_t buf_size, WriteMode mode, const WriteSettings &)
{
auto wrapped_path = wrappedPath(path);
FileEncryption::Header header;
String key;
UInt64 old_file_size = 0;
auto settings = current_settings.get();
if (mode == WriteMode::Append && exists(path))
{
old_file_size = getFileSize(path);
if (old_file_size)
{
/// Append mode: we continue to use the same header.
auto read_buffer = delegate->readFile(wrapped_path, ReadSettings().adjustBufferSize(FileEncryption::Header::kSize));
header = readHeader(*read_buffer);
key = getKey(path, header, *settings);
}
}
if (!old_file_size)
{
/// Rewrite mode: we generate a new header.
key = getCurrentKey(path, *settings);
header.algorithm = settings->current_algorithm;
header.key_id = settings->current_key_id;
header.key_hash = calculateKeyHash(key);
header.init_vector = InitVector::random();
}
auto buffer = delegate->writeFile(wrapped_path, buf_size, mode);
return std::make_unique<WriteBufferFromEncryptedFile>(buf_size, std::move(buffer), key, header, old_file_size);
}
size_t DiskEncrypted::getFileSize(const String & path) const
{
auto wrapped_path = wrappedPath(path);
@ -416,7 +372,7 @@ void registerDiskEncrypted(DiskFactory & factory, bool global_skip_access_check)
const DisksMap & map) -> DiskPtr
{
bool skip_access_check = global_skip_access_check || config.getBool(config_prefix + ".skip_access_check", false);
DiskPtr disk = std::make_shared<DiskEncrypted>(name, config, config_prefix, map);
DiskPtr disk = std::make_shared<DiskEncrypted>(name, config, config_prefix, map, config.getBool(config_prefix + ".use_fake_transaction", true));
disk->startup(context, skip_access_check);
return disk;
};
View File
@ -6,22 +6,14 @@
#include <Disks/IDisk.h>
#include <Common/MultiVersion.h>
#include <Disks/FakeDiskTransaction.h>
#include <Disks/DiskEncryptedTransaction.h>
namespace DB
{
class ReadBufferFromFileBase;
class WriteBufferFromFileBase;
namespace FileEncryption { enum class Algorithm; }
struct DiskEncryptedSettings
{
DiskPtr wrapped_disk;
String disk_path;
std::unordered_map<UInt64, String> keys;
UInt64 current_key_id;
FileEncryption::Algorithm current_algorithm;
};
/// Encrypted disk ciphers all written files on the fly and writes the encrypted files to an underlying (normal) disk.
/// And when we read files from an encrypted disk it deciphers them automatically,
@ -29,8 +21,8 @@ struct DiskEncryptedSettings
class DiskEncrypted : public IDisk
{
public:
DiskEncrypted(const String & name_, const Poco::Util::AbstractConfiguration & config_, const String & config_prefix_, const DisksMap & map_);
DiskEncrypted(const String & name_, std::unique_ptr<const DiskEncryptedSettings> settings_);
DiskEncrypted(const String & name_, const Poco::Util::AbstractConfiguration & config_, const String & config_prefix_, const DisksMap & map_, bool use_fake_transaction_);
DiskEncrypted(const String & name_, std::unique_ptr<const DiskEncryptedSettings> settings_, bool use_fake_transaction_);
const String & getName() const override { return encrypted_name; }
const String & getPath() const override { return disk_absolute_path; }
@ -59,28 +51,30 @@ public:
void createDirectory(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->createDirectory(wrapped_path);
auto tx = createEncryptedTransaction();
tx->createDirectory(path);
tx->commit();
}
void createDirectories(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->createDirectories(wrapped_path);
auto tx = createEncryptedTransaction();
tx->createDirectories(path);
tx->commit();
}
void clearDirectory(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->clearDirectory(wrapped_path);
auto tx = createEncryptedTransaction();
tx->clearDirectory(path);
tx->commit();
}
void moveDirectory(const String & from_path, const String & to_path) override
{
auto wrapped_from_path = wrappedPath(from_path);
auto wrapped_to_path = wrappedPath(to_path);
delegate->moveDirectory(wrapped_from_path, wrapped_to_path);
auto tx = createEncryptedTransaction();
tx->moveDirectory(from_path, to_path);
tx->commit();
}
DirectoryIteratorPtr iterateDirectory(const String & path) const override
@ -91,22 +85,23 @@ public:
void createFile(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->createFile(wrapped_path);
auto tx = createEncryptedTransaction();
tx->createFile(path);
tx->commit();
}
void moveFile(const String & from_path, const String & to_path) override
{
auto wrapped_from_path = wrappedPath(from_path);
auto wrapped_to_path = wrappedPath(to_path);
delegate->moveFile(wrapped_from_path, wrapped_to_path);
auto tx = createEncryptedTransaction();
tx->moveFile(from_path, to_path);
tx->commit();
}
void replaceFile(const String & from_path, const String & to_path) override
{
auto wrapped_from_path = wrappedPath(from_path);
auto wrapped_to_path = wrappedPath(to_path);
delegate->replaceFile(wrapped_from_path, wrapped_to_path);
auto tx = createEncryptedTransaction();
tx->replaceFile(from_path, to_path);
tx->commit();
}
void listFiles(const String & path, std::vector<String> & file_names) const override
@ -129,61 +124,67 @@ public:
const String & path,
size_t buf_size,
WriteMode mode,
const WriteSettings & settings) override;
const WriteSettings & settings) override
{
auto tx = createEncryptedTransaction();
auto result = tx->writeFile(path, buf_size, mode, settings);
return result;
}
void removeFile(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeFile(wrapped_path);
auto tx = createEncryptedTransaction();
tx->removeFile(path);
tx->commit();
}
void removeFileIfExists(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeFileIfExists(wrapped_path);
auto tx = createEncryptedTransaction();
tx->removeFileIfExists(path);
tx->commit();
}
void removeDirectory(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeDirectory(wrapped_path);
auto tx = createEncryptedTransaction();
tx->removeDirectory(path);
tx->commit();
}
void removeRecursive(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeRecursive(wrapped_path);
auto tx = createEncryptedTransaction();
tx->removeRecursive(path);
tx->commit();
}
void removeSharedFile(const String & path, bool flag) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeSharedFile(wrapped_path, flag);
auto tx = createEncryptedTransaction();
tx->removeSharedFile(path, flag);
tx->commit();
}
void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeSharedRecursive(wrapped_path, keep_all_batch_data, file_names_remove_metadata_only);
auto tx = createEncryptedTransaction();
tx->removeSharedRecursive(path, keep_all_batch_data, file_names_remove_metadata_only);
tx->commit();
}
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override
{
for (const auto & file : files)
{
auto wrapped_path = wrappedPath(file.path);
bool keep = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename());
if (file.if_exists)
delegate->removeSharedFileIfExists(wrapped_path, keep);
else
delegate->removeSharedFile(wrapped_path, keep);
}
auto tx = createEncryptedTransaction();
tx->removeSharedFiles(files, keep_all_batch_data, file_names_remove_metadata_only);
tx->commit();
}
void removeSharedFileIfExists(const String & path, bool flag) override
{
auto wrapped_path = wrappedPath(path);
delegate->removeSharedFileIfExists(wrapped_path, flag);
auto tx = createEncryptedTransaction();
tx->removeSharedFileIfExists(path, flag);
tx->commit();
}
Strings getBlobPath(const String & path) const override
@ -194,8 +195,9 @@ public:
void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override
{
auto wrapped_path = wrappedPath(path);
delegate->writeFileUsingBlobWritingFunction(wrapped_path, mode, std::move(write_blob_function));
auto tx = createEncryptedTransaction();
tx->writeFileUsingBlobWritingFunction(path, mode, std::move(write_blob_function));
tx->commit();
}
std::unique_ptr<ReadBufferFromFileBase> readEncryptedFile(const String & path, const ReadSettings & settings) const override
@ -210,8 +212,9 @@ public:
WriteMode mode,
const WriteSettings & settings) const override
{
auto wrapped_path = wrappedPath(path);
return delegate->writeFile(wrapped_path, buf_size, mode, settings);
auto tx = createEncryptedTransaction();
auto buf = tx->writeEncryptedFile(path, buf_size, mode, settings);
return buf;
}
size_t getEncryptedFileSize(const String & path) const override
@ -228,8 +231,9 @@ public:
void setLastModified(const String & path, const Poco::Timestamp & timestamp) override
{
auto wrapped_path = wrappedPath(path);
delegate->setLastModified(wrapped_path, timestamp);
auto tx = createEncryptedTransaction();
tx->setLastModified(path, timestamp);
tx->commit();
}
Poco::Timestamp getLastModified(const String & path) const override
@ -246,15 +250,16 @@ public:
void setReadOnly(const String & path) override
{
auto wrapped_path = wrappedPath(path);
delegate->setReadOnly(wrapped_path);
auto tx = createEncryptedTransaction();
tx->setReadOnly(path);
tx->commit();
}
void createHardLink(const String & src_path, const String & dst_path) override
{
auto wrapped_src_path = wrappedPath(src_path);
auto wrapped_dst_path = wrappedPath(dst_path);
delegate->createHardLink(wrapped_src_path, wrapped_dst_path);
auto tx = createEncryptedTransaction();
tx->createHardLink(src_path, dst_path);
tx->commit();
}
void truncateFile(const String & path, size_t size) override;
@ -289,12 +294,23 @@ public:
SyncGuardPtr getDirectorySyncGuard(const String & path) const override;
std::shared_ptr<DiskEncryptedTransaction> createEncryptedTransaction() const
{
auto delegate_transaction = delegate->createTransaction();
return std::make_shared<DiskEncryptedTransaction>(delegate_transaction, disk_path, *current_settings.get(), delegate.get());
}
DiskTransactionPtr createTransaction() override
{
/// Need to override explicitly because this disk changes
/// a lot of "delegate" methods.
if (use_fake_transaction)
{
return std::make_shared<FakeDiskTransaction>(*this);
}
else
{
return createEncryptedTransaction();
}
}
UInt64 getTotalSpace() const override
{
@ -331,10 +347,7 @@ public:
private:
String wrappedPath(const String & path) const
{
// if path starts_with disk_path -> got already wrapped path
if (!disk_path.empty() && path.starts_with(disk_path))
return path;
return disk_path + path;
return DiskEncryptedTransaction::wrappedPath(disk_path, path);
}
DiskPtr delegate;
@ -342,6 +355,7 @@ private:
const String disk_path;
const String disk_absolute_path;
MultiVersion<DiskEncryptedSettings> current_settings;
bool use_fake_transaction;
};
}
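
Every mutating IDisk method above now follows the same three steps: create an encrypted transaction, replay the call on it, commit. A minimal standalone sketch of that delegation pattern, with illustrative stand-in types rather than the real ClickHouse interfaces:

#include <iostream>
#include <string>

// Stand-in minimal types: every mutating disk call becomes
// "create transaction, replay call, commit".
struct FakeTransaction
{
    std::string disk_path; // prefix added by the encrypted layer

    void removeFile(const std::string & path)
    {
        std::cout << "queued: remove " << disk_path + path << '\n';
    }
    void commit() { std::cout << "commit\n"; }
};

struct EncryptedDisk
{
    std::string disk_path = "encrypted/";

    FakeTransaction createEncryptedTransaction() const { return FakeTransaction{disk_path}; }

    void removeFile(const std::string & path)
    {
        auto tx = createEncryptedTransaction();
        tx.removeFile(path); // the path is wrapped with disk_path inside the tx
        tx.commit();
    }
};

int main()
{
    EncryptedDisk disk;
    disk.removeFile("data.bin"); // queued: remove encrypted/data.bin, then commit
}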
View File
@ -0,0 +1,120 @@
#include <Disks/DiskEncryptedTransaction.h>
#if USE_SSL
#include <IO/FileEncryptionCommon.h>
#include <Common/Exception.h>
#include <boost/algorithm/hex.hpp>
#include <IO/ReadBufferFromEncryptedFile.h>
#include <IO/ReadBufferFromFileDecorator.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromEncryptedFile.h>
#include <Common/quoteString.h>
namespace DB
{
namespace ErrorCodes
{
extern const int DATA_ENCRYPTION_ERROR;
}
namespace
{
FileEncryption::Header readHeader(ReadBufferFromFileBase & read_buffer)
{
try
{
FileEncryption::Header header;
header.read(read_buffer);
return header;
}
catch (Exception & e)
{
e.addMessage("While reading the header of encrypted file " + quoteString(read_buffer.getFileName()));
throw;
}
}
String getCurrentKey(const String & path, const DiskEncryptedSettings & settings)
{
auto it = settings.keys.find(settings.current_key_id);
if (it == settings.keys.end())
throw Exception(
ErrorCodes::DATA_ENCRYPTION_ERROR,
"Not found a key with the current ID {} required to cipher file {}",
settings.current_key_id,
quoteString(path));
return it->second;
}
String getKey(const String & path, const FileEncryption::Header & header, const DiskEncryptedSettings & settings)
{
auto it = settings.keys.find(header.key_id);
if (it == settings.keys.end())
throw Exception(
ErrorCodes::DATA_ENCRYPTION_ERROR,
"Not found a key with ID {} required to decipher file {}",
header.key_id,
quoteString(path));
String key = it->second;
if (FileEncryption::calculateKeyHash(key) != header.key_hash)
throw Exception(
ErrorCodes::DATA_ENCRYPTION_ERROR, "Wrong key with ID {}, could not decipher file {}", header.key_id, quoteString(path));
return key;
}
}
void DiskEncryptedTransaction::copyFile(const std::string & from_file_path, const std::string & to_file_path)
{
auto wrapped_from_path = wrappedPath(from_file_path);
auto wrapped_to_path = wrappedPath(to_file_path);
delegate_transaction->copyFile(wrapped_from_path, wrapped_to_path);
}
std::unique_ptr<WriteBufferFromFileBase> DiskEncryptedTransaction::writeFile( // NOLINT
const std::string & path,
size_t buf_size,
WriteMode mode,
const WriteSettings & settings,
bool autocommit)
{
auto wrapped_path = wrappedPath(path);
FileEncryption::Header header;
String key;
UInt64 old_file_size = 0;
if (mode == WriteMode::Append && delegate_disk->exists(wrapped_path))
{
size_t size = delegate_disk->getFileSize(wrapped_path);
old_file_size = size > FileEncryption::Header::kSize ? (size - FileEncryption::Header::kSize) : 0;
if (old_file_size)
{
/// Append mode: we continue to use the same header.
auto read_buffer = delegate_disk->readFile(wrapped_path, ReadSettings().adjustBufferSize(FileEncryption::Header::kSize));
header = readHeader(*read_buffer);
key = getKey(path, header, current_settings);
}
}
if (!old_file_size)
{
/// Rewrite mode: we generate a new header.
key = getCurrentKey(path, current_settings);
header.algorithm = current_settings.current_algorithm;
header.key_id = current_settings.current_key_id;
header.key_hash = FileEncryption::calculateKeyHash(key);
header.init_vector = FileEncryption::InitVector::random();
}
auto buffer = delegate_transaction->writeFile(wrapped_path, buf_size, mode, settings, autocommit);
return std::make_unique<WriteBufferFromEncryptedFile>(buf_size, std::move(buffer), key, header, old_file_size);
}
}
#endif
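
The append branch above works because an encrypted file starts with a fixed-size header followed by the ciphertext; appending must reuse that header, while a rewrite generates a fresh one. A schematic of the layout as implied by this code (field list taken from the header assignments above):

/// Encrypted file layout implied by writeFile() above:
///
///   [ FileEncryption::Header | ciphertext ... ]
///     ^ fixed Header::kSize bytes: algorithm, key_id, key_hash, init_vector
///
/// Append: the existing header is read back, the same key is looked up, and
/// the stream continues at old_file_size = file_size - Header::kSize
/// plaintext bytes.
/// Rewrite: the current key is chosen and a fresh random init vector is
/// generated, producing a new header.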
View File
@ -0,0 +1,259 @@
#pragma once
#include "config.h"
#if USE_SSL
#include <Disks/IDiskTransaction.h>
#include <Disks/IDisk.h>
#include <IO/ReadBufferFromFile.h>
#include <IO/WriteBufferFromFile.h>
namespace DB
{
namespace FileEncryption { enum class Algorithm; }
struct DiskEncryptedSettings
{
DiskPtr wrapped_disk;
String disk_path;
std::unordered_map<UInt64, String> keys;
UInt64 current_key_id;
FileEncryption::Algorithm current_algorithm;
};
class DiskEncryptedTransaction : public IDiskTransaction
{
public:
static String wrappedPath(const String disk_path, const String & path)
{
// if path starts with disk_path, it is already a wrapped path
if (!disk_path.empty() && path.starts_with(disk_path))
return path;
return disk_path + path;
}
DiskEncryptedTransaction(DiskTransactionPtr delegate_transaction_, const std::string & disk_path_, DiskEncryptedSettings current_settings_, IDisk * delegate_disk_)
: delegate_transaction(delegate_transaction_)
, disk_path(disk_path_)
, current_settings(current_settings_)
, delegate_disk(delegate_disk_)
{}
/// Tries to commit all accumulated operations simultaneously.
/// If something fails rollback and throw exception.
void commit() override // NOLINT
{
delegate_transaction->commit();
}
void undo() override
{
delegate_transaction->undo();
}
~DiskEncryptedTransaction() override = default;
/// Create directory.
void createDirectory(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->createDirectory(wrapped_path);
}
/// Create directory and all parent directories if necessary.
void createDirectories(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->createDirectories(wrapped_path);
}
/// Remove all files from the directory. Directories are not removed.
void clearDirectory(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->clearDirectory(wrapped_path);
}
/// Move directory from `from_path` to `to_path`.
void moveDirectory(const std::string & from_path, const std::string & to_path) override
{
auto wrapped_from_path = wrappedPath(from_path);
auto wrapped_to_path = wrappedPath(to_path);
delegate_transaction->moveDirectory(wrapped_from_path, wrapped_to_path);
}
void moveFile(const std::string & from_path, const std::string & to_path) override
{
auto wrapped_from_path = wrappedPath(from_path);
auto wrapped_to_path = wrappedPath(to_path);
delegate_transaction->moveFile(wrapped_from_path, wrapped_to_path);
}
void createFile(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->createFile(wrapped_path);
}
/// Move the file from `from_path` to `to_path`.
/// If a file with `to_path` path already exists, it will be replaced.
void replaceFile(const std::string & from_path, const std::string & to_path) override
{
auto wrapped_from_path = wrappedPath(from_path);
auto wrapped_to_path = wrappedPath(to_path);
delegate_transaction->replaceFile(wrapped_from_path, wrapped_to_path);
}
/// Only copying of individual files is supported for now. The disk interface supports copying to another disk,
/// but it's impossible to implement correctly in transactions because the other disk can
/// use a different metadata storage.
/// TODO: maybe remove it altogether, we don't want copies
void copyFile(const std::string & from_file_path, const std::string & to_file_path) override;
/// Open the file for write and return WriteBufferFromFileBase object.
std::unique_ptr<WriteBufferFromFileBase> writeFile( /// NOLINT
const std::string & path,
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
WriteMode mode = WriteMode::Rewrite,
const WriteSettings & settings = {},
bool autocommit = true) override;
/// Remove file. Throws exception if file doesn't exists or it's a directory.
void removeFile(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeFile(wrapped_path);
}
/// Remove file if it exists.
void removeFileIfExists(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeFileIfExists(wrapped_path);
}
/// Remove directory. Throws exception if it's not a directory or if directory is not empty.
void removeDirectory(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeDirectory(wrapped_path);
}
/// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists.
void removeRecursive(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeRecursive(wrapped_path);
}
/// Remove file. Throws exception if file doesn't exists or if directory is not empty.
/// Differs from removeFile for S3/HDFS disks
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3
void removeSharedFile(const std::string & path, bool keep_shared_data) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeSharedFile(wrapped_path, keep_shared_data);
}
/// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists.
/// Differs from removeRecursive for S3/HDFS disks
/// Second bool param is a flag to remove (false) or keep (true) shared data on S3.
/// Third param determines which files cannot be removed even if second is true.
void removeSharedRecursive(const std::string & path, bool keep_all_shared_data, const NameSet & file_names_remove_metadata_only) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeSharedRecursive(wrapped_path, keep_all_shared_data, file_names_remove_metadata_only);
}
/// Remove file or directory if it exists.
/// Differs from removeFileIfExists for S3/HDFS disks
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3
void removeSharedFileIfExists(const std::string & path, bool keep_shared_data) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->removeSharedFileIfExists(wrapped_path, keep_shared_data);
}
/// Batch request to remove multiple files.
/// May be much faster for blob storage.
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3.
/// Third param determines which files cannot be removed even if second is true.
void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override
{
for (const auto & file : files)
{
auto wrapped_path = wrappedPath(file.path);
bool keep = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename());
if (file.if_exists)
delegate_transaction->removeSharedFileIfExists(wrapped_path, keep);
else
delegate_transaction->removeSharedFile(wrapped_path, keep);
}
}
/// Set last modified time to file or directory at `path`.
void setLastModified(const std::string & path, const Poco::Timestamp & timestamp) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->setLastModified(wrapped_path, timestamp);
}
/// Just chmod.
void chmod(const String & path, mode_t mode) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->chmod(wrapped_path, mode);
}
/// Set file at `path` as read-only.
void setReadOnly(const std::string & path) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->setReadOnly(wrapped_path);
}
/// Create hardlink from `src_path` to `dst_path`.
void createHardLink(const std::string & src_path, const std::string & dst_path) override
{
auto wrapped_src_path = wrappedPath(src_path);
auto wrapped_dst_path = wrappedPath(dst_path);
delegate_transaction->createHardLink(wrapped_src_path, wrapped_dst_path);
}
void writeFileUsingBlobWritingFunction(const String & path, WriteMode mode, WriteBlobFunction && write_blob_function) override
{
auto wrapped_path = wrappedPath(path);
delegate_transaction->writeFileUsingBlobWritingFunction(wrapped_path, mode, std::move(write_blob_function));
}
std::unique_ptr<WriteBufferFromFileBase> writeEncryptedFile(
const String & path,
size_t buf_size,
WriteMode mode,
const WriteSettings & settings) const
{
auto wrapped_path = wrappedPath(path);
return delegate_transaction->writeFile(wrapped_path, buf_size, mode, settings);
}
private:
String wrappedPath(const String & path) const
{
return wrappedPath(disk_path, path);
}
DiskTransactionPtr delegate_transaction;
std::string disk_path;
DiskEncryptedSettings current_settings;
IDisk * delegate_disk;
};
}
#endif
View File
@ -188,12 +188,12 @@ try
try
{
file->write(payload.data(), payload.size());
file->finalize();
}
catch (...)
{
/// Log current exception, because finalize() can throw a different exception.
tryLogCurrentException(__PRETTY_FUNCTION__);
file->finalize();
throw;
}
}
View File
@ -83,19 +83,19 @@ bool AsynchronousBoundedReadBuffer::hasPendingDataToRead()
}
std::future<IAsynchronousReader::Result>
AsynchronousBoundedReadBuffer::asyncReadInto(char * data, size_t size, int64_t priority)
AsynchronousBoundedReadBuffer::asyncReadInto(char * data, size_t size, Priority priority)
{
IAsynchronousReader::Request request;
request.descriptor = std::make_shared<RemoteFSFileDescriptor>(*impl, async_read_counters);
request.buf = data;
request.size = size;
request.offset = file_offset_of_buffer_end;
request.priority = read_settings.priority + priority;
request.priority = Priority{read_settings.priority.value + priority.value};
request.ignore = bytes_to_ignore;
return reader.submit(request);
}
void AsynchronousBoundedReadBuffer::prefetch(int64_t priority)
void AsynchronousBoundedReadBuffer::prefetch(Priority priority)
{
if (prefetch_future.valid())
return;
View File
@ -39,7 +39,7 @@ public:
off_t seek(off_t offset_, int whence) override;
void prefetch(int64_t priority) override;
void prefetch(Priority priority) override;
void setReadUntilPosition(size_t position) override; /// [..., position).
@ -72,7 +72,7 @@ private:
struct LastPrefetchInfo
{
UInt64 submit_time = 0;
size_t priority = 0;
Priority priority;
};
LastPrefetchInfo last_prefetch_info;
@ -87,7 +87,7 @@ private:
int64_t size,
const std::unique_ptr<Stopwatch> & execution_watch);
std::future<IAsynchronousReader::Result> asyncReadInto(char * data, size_t size, int64_t priority);
std::future<IAsynchronousReader::Result> asyncReadInto(char * data, size_t size, Priority priority);
void resetPrefetch(FilesystemPrefetchState state);
View File
@ -146,7 +146,8 @@ std::unique_ptr<S3::Client> getClient(
S3::ServerSideEncryptionKMSConfig sse_kms_config = S3::getSSEKMSConfig(config_prefix, config);
client_configuration.retryStrategy
= std::make_shared<Aws::Client::DefaultRetryStrategy>(config.getUInt(config_prefix + ".retry_attempts", 10));
= std::make_shared<Aws::Client::DefaultRetryStrategy>(
config.getUInt64(config_prefix + ".retry_attempts", settings.request_settings.retry_attempts));
return S3::ClientFactory::instance().create(
client_configuration,
View File
@ -40,7 +40,7 @@ protected:
settings->keys[0] = key;
settings->current_key_id = 0;
settings->disk_path = path;
encrypted_disk = std::make_shared<DiskEncrypted>("encrypted_disk", std::move(settings));
encrypted_disk = std::make_shared<DiskEncrypted>("encrypted_disk", std::move(settings), true);
}
String getFileNames()
View File
@ -1230,8 +1230,11 @@ public:
/// The case when arguments are the same (tautological comparison). Return constant.
/// NOTE: Nullable types are a special case.
/// (BTW, this function uses the default implementation for Nullable, so Nullable types cannot appear here. Check just in case.)
/// NOTE: We consider NaN comparison to be implementation-specific (and in our implementation NaNs are sometimes equal and sometimes not).
if (left_type->equals(*right_type) && !left_type->isNullable() && !isTuple(left_type) && col_left_untyped == col_right_untyped)
if (left_type->equals(*right_type) &&
!left_type->isNullable() &&
!isTuple(left_type) &&
!WhichDataType(left_type).isFloat() &&
col_left_untyped == col_right_untyped)
{
ColumnPtr result_column;
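
The added `!WhichDataType(left_type).isFloat()` check exists because the tautological-comparison shortcut is unsound for floating-point columns: NaN is not equal to itself, so `x = x` must not be constant-folded to 1. A self-contained illustration of the underlying IEEE 754 behavior:

#include <cassert>
#include <cmath>

int main()
{
    double x = std::nan("");
    // Tautological comparisons are not tautologies for floats:
    assert(!(x == x)); // NaN != NaN
    assert(x != x);
    // so a column containing NaN cannot have "col = col" folded to a constant 1.
}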
View File
@ -13,7 +13,6 @@ namespace DB
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int TOO_LARGE_STRING_SIZE;
}
@ -25,18 +24,16 @@ struct RepeatImpl
/// Safety threshold against DoS.
static inline void checkRepeatTime(UInt64 repeat_time)
{
static constexpr UInt64 max_repeat_times = 1000000;
static constexpr UInt64 max_repeat_times = 1'000'000;
if (repeat_time > max_repeat_times)
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too many times to repeat ({}), maximum is: {}",
std::to_string(repeat_time), std::to_string(max_repeat_times));
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too many times to repeat ({}), maximum is: {}", repeat_time, max_repeat_times);
}
static inline void checkStringSize(UInt64 size)
{
static constexpr UInt64 max_string_size = 1 << 30;
if (size > max_string_size)
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large string size ({}) in function repeat, maximum is: {}",
size, max_string_size);
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too large string size ({}) in function repeat, maximum is: {}", size, max_string_size);
}
template <typename T>
@ -186,36 +183,37 @@ public:
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
if (!isString(arguments[0]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
arguments[0]->getName(), getName());
if (!isInteger(arguments[1]))
throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}",
arguments[1]->getName(), getName());
return arguments[0];
FunctionArgumentDescriptors args{
{"s", &isString<IDataType>, nullptr, "String"},
{"n", &isInteger<IDataType>, nullptr, "Integer"},
};
validateFunctionArgumentTypes(*this, arguments, args);
return std::make_shared<DataTypeString>();
}
bool useDefaultImplementationForConstants() const override { return true; }
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t) const override
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
{
const auto & strcolumn = arguments[0].column;
const auto & numcolumn = arguments[1].column;
const auto & col_str = arguments[0].column;
const auto & col_num = arguments[1].column;
ColumnPtr res;
if (const ColumnString * col = checkAndGetColumn<ColumnString>(strcolumn.get()))
if (const ColumnString * col = checkAndGetColumn<ColumnString>(col_str.get()))
{
if (const ColumnConst * scale_column_num = checkAndGetColumn<ColumnConst>(numcolumn.get()))
if (const ColumnConst * col_num_const = checkAndGetColumn<ColumnConst>(col_num.get()))
{
auto col_res = ColumnString::create();
castType(arguments[1].type.get(), [&](const auto & type)
{
using DataType = std::decay_t<decltype(type)>;
using T = typename DataType::FieldType;
T repeat_time = scale_column_num->getValue<T>();
RepeatImpl::vectorStrConstRepeat(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), repeat_time);
T times = col_num_const->getValue<T>();
RepeatImpl::vectorStrConstRepeat(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), times);
return true;
});
return col_res;
@ -224,9 +222,9 @@ public:
{
using DataType = std::decay_t<decltype(type)>;
using T = typename DataType::FieldType;
const ColumnVector<T> * colnum = checkAndGetColumn<ColumnVector<T>>(numcolumn.get());
const ColumnVector<T> * column = checkAndGetColumn<ColumnVector<T>>(col_num.get());
auto col_res = ColumnString::create();
RepeatImpl::vectorStrVectorRepeat(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), colnum->getData());
RepeatImpl::vectorStrVectorRepeat(col->getChars(), col->getOffsets(), col_res->getChars(), col_res->getOffsets(), column->getData());
res = std::move(col_res);
return true;
}))
@ -234,7 +232,7 @@ public:
return res;
}
}
else if (const ColumnConst * col_const = checkAndGetColumn<ColumnConst>(strcolumn.get()))
else if (const ColumnConst * col_const = checkAndGetColumn<ColumnConst>(col_str.get()))
{
/// Note that the const-const case is handled by useDefaultImplementationForConstants.
@ -244,9 +242,9 @@ public:
{
using DataType = std::decay_t<decltype(type)>;
using T = typename DataType::FieldType;
const ColumnVector<T> * colnum = checkAndGetColumn<ColumnVector<T>>(numcolumn.get());
const ColumnVector<T> * column = checkAndGetColumn<ColumnVector<T>>(col_num.get());
auto col_res = ColumnString::create();
RepeatImpl::constStrVectorRepeat(copy_str, col_res->getChars(), col_res->getOffsets(), colnum->getData());
RepeatImpl::constStrVectorRepeat(copy_str, col_res->getChars(), col_res->getOffsets(), column->getData());
res = std::move(col_res);
return true;
}))

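A self-contained sketch (hypothetical and simplified, using plain std::string instead of ColumnString) of the DoS guards in RepeatImpl above: both the repeat count and the resulting string size are bounded before any allocation happens, with the same limits as in the patch.

#include <cstdint>
#include <stdexcept>
#include <string>

static constexpr uint64_t max_repeat_times = 1'000'000;
static constexpr uint64_t max_string_size = 1 << 30; /// 1 GiB

std::string safe_repeat(const std::string & s, uint64_t n)
{
    if (n > max_repeat_times)
        throw std::runtime_error("Too many times to repeat");
    if (s.size() * n > max_string_size)
        throw std::runtime_error("Too large string size");
    std::string res;
    res.reserve(s.size() * n);
    for (uint64_t i = 0; i < n; ++i)
        res += s; /// pattern copy, as RepeatImpl does with memcpy
    return res;
}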
179
src/Functions/space.cpp Normal file

@ -0,0 +1,179 @@
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
#include <DataTypes/DataTypesNumber.h>
#include <DataTypes/DataTypeString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <cstring>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_COLUMN;
extern const int TOO_LARGE_STRING_SIZE;
}
namespace
{
/// Produces a string of n whitespaces. space() could also be pushed down to repeat(), but we chose a standalone
/// implementation because we can use memset(), whereas repeat() has to do memcpy().
class FunctionSpace : public IFunction
{
private:
static constexpr auto space = ' ';
/// Safety threshold against DoS.
static inline void checkRepeatTime(size_t repeat_time)
{
static constexpr auto max_repeat_times = 1'000'000uz;
if (repeat_time > max_repeat_times)
throw Exception(ErrorCodes::TOO_LARGE_STRING_SIZE, "Too many times to repeat ({}), maximum is: {}", repeat_time, max_repeat_times);
}
public:
static constexpr auto name = "space";
static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionSpace>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 1; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
{
FunctionArgumentDescriptors args{
{"n", &isInteger<IDataType>, nullptr, "Integer"}
};
validateFunctionArgumentTypes(*this, arguments, args);
return std::make_shared<DataTypeString>();
}
template <typename DataType>
bool executeConstant(ColumnPtr col_times, ColumnString::Offsets & res_offsets, ColumnString::Chars & res_chars) const
{
const ColumnConst * col_times_const = checkAndGetColumn<ColumnConst>(col_times.get());
const ColumnPtr & col_times_const_internal = col_times_const->getDataColumnPtr();
if (!checkAndGetColumn<typename DataType::ColumnType>(col_times_const_internal.get()))
return false;
using T = typename DataType::FieldType;
T times = col_times_const->getValue<T>();
if (times < 1)
times = 0;
checkRepeatTime(times);
res_offsets.resize(col_times->size());
res_chars.resize(col_times->size() * (times + 1));
size_t pos = 0;
for (size_t i = 0; i < col_times->size(); ++i)
{
memset(res_chars.begin() + pos, space, times);
pos += times;
*(res_chars.begin() + pos) = '\0';
pos += 1;
res_offsets[i] = pos;
}
return true;
}
template <typename DataType>
bool executeVector(ColumnPtr col_times_, ColumnString::Offsets & res_offsets, ColumnString::Chars & res_chars) const
{
auto * col_times = checkAndGetColumn<typename DataType::ColumnType>(col_times_.get());
if (!col_times)
return false;
res_offsets.resize(col_times->size());
res_chars.resize(col_times->size() * 10); /// heuristic: ~10 bytes per row; grown below if a row needs more
const PaddedPODArray<typename DataType::FieldType> & times_data = col_times->getData();
size_t pos = 0;
for (size_t i = 0; i < col_times->size(); ++i)
{
typename DataType::FieldType times = times_data[i];
if (times < 1)
times = 0;
checkRepeatTime(times);
if (pos + times + 1 > res_chars.size())
res_chars.resize(std::max(2 * res_chars.size(), static_cast<size_t>(pos + times + 1)));
memset(res_chars.begin() + pos, space, times);
pos += times;
*(res_chars.begin() + pos) = '\0';
pos += 1;
res_offsets[i] = pos;
}
res_chars.resize(pos);
return true;
}
ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override
{
const auto & col_num = arguments[0].column;
auto col_res = ColumnString::create();
ColumnString::Offsets & res_offsets = col_res->getOffsets();
ColumnString::Chars & res_chars = col_res->getChars();
if (const ColumnConst * col_num_const = checkAndGetColumn<ColumnConst>(col_num.get()))
{
if ((executeConstant<DataTypeUInt8>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeUInt16>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeUInt32>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeUInt64>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeInt8>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeInt16>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeInt32>(col_num, res_offsets, res_chars))
|| (executeConstant<DataTypeInt64>(col_num, res_offsets, res_chars)))
return col_res;
}
else
{
if ((executeVector<DataTypeUInt8>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeUInt16>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeUInt32>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeUInt64>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeInt8>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeInt16>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeInt32>(col_num, res_offsets, res_chars))
|| (executeVector<DataTypeInt64>(col_num, res_offsets, res_chars)))
return col_res;
}
throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}", arguments[0].column->getName(), getName());
}
};
}
REGISTER_FUNCTION(Space)
{
factory.registerFunction<FunctionSpace>({}, FunctionFactory::CaseInsensitive);
}
}

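A minimal sketch (hypothetical, using std::string rather than ColumnString) of the design choice stated in the comment at the top of space.cpp: space() fills with a single byte, so memset() suffices, whereas repeat() has to copy a multi-byte pattern with memcpy().

#include <cstddef>
#include <cstring>
#include <string>

std::string space_like(size_t n)
{
    std::string s(n, '\0');
    memset(s.data(), ' ', n); /// single-byte fill, as FunctionSpace does
    return s;
}

std::string repeat_like(const std::string & pattern, size_t n)
{
    std::string s(pattern.size() * n, '\0');
    for (size_t i = 0; i < n; ++i)
        memcpy(s.data() + i * pattern.size(), pattern.data(), pattern.size()); /// pattern copy, as repeat() does
    return s;
}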

@ -26,7 +26,7 @@ namespace ErrorCodes
AsynchronousReadBufferFromFile::AsynchronousReadBufferFromFile(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
const std::string & file_name_,
size_t buf_size,
int flags,
@ -60,7 +60,7 @@ AsynchronousReadBufferFromFile::AsynchronousReadBufferFromFile(
AsynchronousReadBufferFromFile::AsynchronousReadBufferFromFile(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
int & fd_,
const std::string & original_file_name,
size_t buf_size,


@ -17,7 +17,7 @@ protected:
public:
explicit AsynchronousReadBufferFromFile(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
const std::string & file_name_,
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
int flags = -1,
@ -28,7 +28,7 @@ public:
/// Use pre-opened file descriptor.
explicit AsynchronousReadBufferFromFile(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
int & fd, /// Will be set to -1 if constructor didn't throw and ownership of file descriptor is passed to the object.
const std::string & original_file_name = {},
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
@ -58,7 +58,7 @@ private:
public:
AsynchronousReadBufferFromFileWithDescriptorsCache(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
const std::string & file_name_,
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
int flags = -1,


@ -40,14 +40,14 @@ std::string AsynchronousReadBufferFromFileDescriptor::getFileName() const
}
std::future<IAsynchronousReader::Result> AsynchronousReadBufferFromFileDescriptor::asyncReadInto(char * data, size_t size, int64_t priority)
std::future<IAsynchronousReader::Result> AsynchronousReadBufferFromFileDescriptor::asyncReadInto(char * data, size_t size, Priority priority)
{
IAsynchronousReader::Request request;
request.descriptor = std::make_shared<IAsynchronousReader::LocalFileDescriptor>(fd);
request.buf = data;
request.size = size;
request.offset = file_offset_of_buffer_end;
request.priority = base_priority + priority;
request.priority = Priority{base_priority.value + priority.value};
request.ignore = bytes_to_ignore;
bytes_to_ignore = 0;
@ -61,7 +61,7 @@ std::future<IAsynchronousReader::Result> AsynchronousReadBufferFromFileDescripto
}
void AsynchronousReadBufferFromFileDescriptor::prefetch(int64_t priority)
void AsynchronousReadBufferFromFileDescriptor::prefetch(Priority priority)
{
if (prefetch_future.valid())
return;
@ -151,7 +151,7 @@ void AsynchronousReadBufferFromFileDescriptor::finalize()
AsynchronousReadBufferFromFileDescriptor::AsynchronousReadBufferFromFileDescriptor(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
int fd_,
size_t buf_size,
char * existing_memory,


@ -4,6 +4,7 @@
#include <IO/AsynchronousReader.h>
#include <Interpreters/Context.h>
#include <Common/Throttler_fwd.h>
#include <Common/Priority.h>
#include <optional>
#include <unistd.h>
@ -18,7 +19,7 @@ class AsynchronousReadBufferFromFileDescriptor : public ReadBufferFromFileBase
{
protected:
IAsynchronousReader & reader;
int64_t base_priority;
Priority base_priority;
Memory<> prefetch_buffer;
std::future<IAsynchronousReader::Result> prefetch_future;
@ -39,7 +40,7 @@ protected:
public:
AsynchronousReadBufferFromFileDescriptor(
IAsynchronousReader & reader_,
Int32 priority_,
Priority priority_,
int fd_,
size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE,
char * existing_memory = nullptr,
@ -49,7 +50,7 @@ public:
~AsynchronousReadBufferFromFileDescriptor() override;
void prefetch(int64_t priority) override;
void prefetch(Priority priority) override;
int getFD() const
{
@ -70,7 +71,7 @@ public:
size_t getFileSize() override;
private:
std::future<IAsynchronousReader::Result> asyncReadInto(char * data, size_t size, int64_t priority);
std::future<IAsynchronousReader::Result> asyncReadInto(char * data, size_t size, Priority priority);
};
}


@ -6,6 +6,7 @@
#include <future>
#include <boost/noncopyable.hpp>
#include <Common/Stopwatch.h>
#include <Common/Priority.h>
namespace DB
@ -47,7 +48,7 @@ public:
size_t offset = 0;
size_t size = 0;
char * buf = nullptr;
int64_t priority = 0;
Priority priority;
size_t ignore = 0;
};


@ -19,7 +19,7 @@ public:
const ReadBuffer & getWrappedReadBuffer() const { return *in; }
ReadBuffer & getWrappedReadBuffer() { return *in; }
void prefetch(int64_t priority) override { in->prefetch(priority); }
void prefetch(Priority priority) override { in->prefetch(priority); }
protected:
std::unique_ptr<ReadBuffer> in;


@ -2,6 +2,7 @@
#include <Common/ErrorCodes.h>
#include <Common/Exception.h>
#include <Common/Priority.h>
#include <IO/ResourceRequest.h>
#include <Poco/Util/AbstractConfiguration.h>
@ -37,7 +38,7 @@ inline const Poco::Util::AbstractConfiguration & emptyConfig()
struct SchedulerNodeInfo
{
double weight = 1.0; /// Weight of this node among its siblings
Int64 priority = 0; /// Priority of this node among it's siblings (higher value means higher priority)
Priority priority; /// Priority of this node among its siblings (lower value means higher priority)
/// Arbitrary data accessed/stored by parent
union {
@ -65,7 +66,7 @@ struct SchedulerNodeInfo
void setPriority(Int64 value)
{
priority = value;
priority.value = value;
}
};


@ -87,7 +87,7 @@ bool ParallelReadBuffer::addReaderToPool()
auto worker = read_workers.emplace_back(std::make_shared<ReadWorker>(std::move(reader), range_start, size));
++active_working_reader;
schedule([this, my_worker = std::move(worker)]() mutable { readerThreadFunction(std::move(my_worker)); }, 0);
schedule([this, my_worker = std::move(worker)]() mutable { readerThreadFunction(std::move(my_worker)); }, Priority{});
return true;
}


@ -20,7 +20,7 @@ public:
~PeekableReadBuffer() override;
void prefetch(int64_t priority) override { sub_buf->prefetch(priority); }
void prefetch(Priority priority) override { sub_buf->prefetch(priority); }
/// Sets checkpoint at current position
ALWAYS_INLINE inline void setCheckpoint()


@ -6,6 +6,7 @@
#include <memory>
#include <Common/Exception.h>
#include <Common/Priority.h>
#include <IO/BufferBase.h>
#include <IO/AsynchronousReader.h>
@ -20,7 +21,7 @@ namespace ErrorCodes
extern const int NOT_IMPLEMENTED;
}
static constexpr auto DEFAULT_PREFETCH_PRIORITY = 0;
static constexpr auto DEFAULT_PREFETCH_PRIORITY = Priority{0};
/** A simple abstract class for buffered data reading (char sequences) from somewhere.
* Unlike std::istream, it provides access to the internal buffer,
@ -208,10 +209,10 @@ public:
/** Do something to allow faster subsequent call to 'nextImpl' if possible.
* It's used for asynchronous readers with double-buffering.
* `priority` is the Threadpool priority, with which the prefetch task will be schedules.
* Smaller is more priority.
* `priority` is the `ThreadPool` priority, with which the prefetch task will be scheduled.
* Lower value means higher priority.
*/
virtual void prefetch(int64_t /* priority */) {}
virtual void prefetch(Priority) {}
/**
* Set upper bound for read range [..., position).

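A hypothetical reconstruction (an assumption for illustration; the real Common/Priority.h may differ) of the strong typedef these hunks migrate to: an Int64 wrapper where a lower value means a higher priority, so raw integers cannot be passed as priorities by accident.

#include <compare>
#include <cstdint>

struct Priority
{
    int64_t value = 0;
    auto operator<=>(const Priority &) const = default;
};

int main()
{
    Priority base{1};
    Priority request{2};
    /// Combining a base priority with a per-request one, as asyncReadInto() does:
    Priority effective{base.value + request.value};
    return effective < base ? 1 : 0; /// a lower value would mean a higher priority
}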

@ -124,7 +124,7 @@ bool ReadBufferFromFileDescriptor::nextImpl()
}
void ReadBufferFromFileDescriptor::prefetch(int64_t)
void ReadBufferFromFileDescriptor::prefetch(Priority)
{
#if defined(POSIX_FADV_WILLNEED)
/// For direct IO, loading data into page cache is pointless.


@ -25,7 +25,7 @@ protected:
ThrottlerPtr throttler;
bool nextImpl() override;
void prefetch(int64_t priority) override;
void prefetch(Priority priority) override;
/// Name or some description of file.
std::string getFileName() const override;


@ -12,7 +12,7 @@ off_t ReadBufferFromMemory::seek(off_t offset, int whence)
{
if (whence == SEEK_SET)
{
if (offset >= 0 && internal_buffer.begin() + offset < internal_buffer.end())
if (offset >= 0 && internal_buffer.begin() + offset <= internal_buffer.end())
{
pos = internal_buffer.begin() + offset;
working_buffer = internal_buffer; /// We need to restore `working_buffer` in case the position was at EOF before this seek().
@ -25,7 +25,7 @@ off_t ReadBufferFromMemory::seek(off_t offset, int whence)
else if (whence == SEEK_CUR)
{
Position new_pos = pos + offset;
if (new_pos >= internal_buffer.begin() && new_pos < internal_buffer.end())
if (new_pos >= internal_buffer.begin() && new_pos <= internal_buffer.end())
{
pos = new_pos;
working_buffer = internal_buffer; /// We need to restore `working_buffer` in case the position was at EOF before this seek().

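A small standalone illustration (hypothetical) of the boundary fix above: a seek target equal to the buffer size is a valid end-of-buffer position, just as lseek() allows seeking to the end of a file, so the range check must be inclusive (<=) rather than strict (<).

#include <cassert>
#include <cstddef>

bool seek_target_in_range(std::ptrdiff_t offset, std::size_t size)
{
    return offset >= 0 && static_cast<std::size_t>(offset) <= size; /// <= admits seeking exactly to EOF
}

int main()
{
    assert(seek_target_in_range(4, 4));  /// seek to EOF: allowed after the fix
    assert(!seek_target_in_range(5, 4)); /// past EOF: still rejected
}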

@ -5,6 +5,7 @@
#include <Core/Defines.h>
#include <Interpreters/Cache/FileCache_fwd.h>
#include <Common/Throttler_fwd.h>
#include <Common/Priority.h>
#include <IO/ResourceLink.h>
namespace DB
@ -84,8 +85,8 @@ struct ReadSettings
size_t mmap_threshold = 0;
MMappedFileCache * mmap_cache = nullptr;
/// For 'pread_threadpool'/'io_uring' method. Lower is more priority.
size_t priority = 0;
/// For 'pread_threadpool'/'io_uring' method. Lower value means higher priority.
Priority priority;
bool load_marks_asynchronously = true;


@ -26,12 +26,12 @@ class PriorityPolicy : public ISchedulerNode
struct Item
{
ISchedulerNode * child = nullptr;
Int64 priority = 0; // higher value means higher priority
Priority priority; // lower value means higher priority
/// For max-heap by priority
bool operator<(const Item& rhs) const noexcept
{
return priority < rhs.priority;
return priority > rhs.priority; // Reversed for heap top to yield highest priority (lowest value) child first
}
};

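A self-contained check (hypothetical, using std::priority_queue instead of the scheduler's own heap) of the reversed comparator above: with operator< flipped, the max-heap surfaces the child with the lowest priority value, that is, the highest priority, first.

#include <cassert>
#include <queue>

struct Item
{
    long long priority;
    bool operator<(const Item & rhs) const { return priority > rhs.priority; } /// reversed, as in PriorityPolicy
};

int main()
{
    std::priority_queue<Item> heap; /// max-heap with respect to operator<
    heap.push({3});
    heap.push({1});
    heap.push({2});
    assert(heap.top().priority == 1); /// lowest value (= highest priority) comes out first
}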

@ -22,9 +22,9 @@ TEST(IOResourcePriorityPolicy, Priorities)
ResourceTest t;
t.add<PriorityPolicy>("/");
t.add<FifoQueue>("/A", "<priority>1</priority>");
t.add<FifoQueue>("/A", "<priority>3</priority>");
t.add<FifoQueue>("/B", "<priority>2</priority>");
t.add<FifoQueue>("/C", "<priority>3</priority>");
t.add<FifoQueue>("/C", "<priority>1</priority>");
t.enqueue("/A", {10, 10, 10});
t.enqueue("/B", {10, 10, 10});
@ -56,9 +56,9 @@ TEST(IOResourcePriorityPolicy, Activation)
ResourceTest t;
t.add<PriorityPolicy>("/");
t.add<FifoQueue>("/A", "<priority>1</priority>");
t.add<FifoQueue>("/A", "<priority>3</priority>");
t.add<FifoQueue>("/B", "<priority>2</priority>");
t.add<FifoQueue>("/C", "<priority>3</priority>");
t.add<FifoQueue>("/C", "<priority>1</priority>");
t.enqueue("/A", {10, 10, 10, 10, 10, 10});
t.enqueue("/B", {10});


@ -49,7 +49,7 @@ TEST(IOResourceStaticResourceManager, Prioritization)
{
// Lock is not required here because this is called during request execution and we have max_requests = 1
if (last_priority)
EXPECT_TRUE(priority <= *last_priority); // Should be true if every queue arrived at the same time at busy period start
EXPECT_TRUE(priority >= *last_priority); // Should be true if every queue arrived at the same time at busy period start
last_priority = priority;
};
@ -63,8 +63,8 @@ TEST(IOResourceStaticResourceManager, Prioritization)
<res1>
<node path="/"> <type>inflight_limit</type><max_requests>1</max_requests></node>
<node path="/prio"> <type>priority</type></node>
<node path="/prio/A"> <priority>-1</priority></node>
<node path="/prio/B"> <priority>1</priority></node>
<node path="/prio/A"> <priority>1</priority></node>
<node path="/prio/B"> <priority>-1</priority></node>
<node path="/prio/C"> </node>
<node path="/prio/D"> </node>
<node path="/prio/leader"></node>


@ -361,7 +361,7 @@ namespace
task->exception = std::current_exception();
}
task_finish_notify();
}, 0);
}, Priority{});
}
catch (...)
{


@ -17,7 +17,7 @@ public:
off_t seek(off_t off, int whence) override;
void prefetch(int64_t priority) override { impl->prefetch(priority); }
void prefetch(Priority priority) override { impl->prefetch(priority); }
private:
UInt64 min_bytes_for_seek; /// Minimum positive seek offset which shall be executed using seek operation.


@ -92,8 +92,11 @@ WriteBufferFromS3::WriteBufferFromS3(
, write_settings(write_settings_)
, client_ptr(std::move(client_ptr_))
, object_metadata(std::move(object_metadata_))
, buffer_allocation_policy(ChooseBufferPolicy(request_settings_.getUploadSettings()))
, task_tracker(std::make_unique<WriteBufferFromS3::TaskTracker>(std::move(schedule_)))
, buffer_allocation_policy(ChooseBufferPolicy(upload_settings))
, task_tracker(
std::make_unique<WriteBufferFromS3::TaskTracker>(
std::move(schedule_),
upload_settings.max_inflight_parts_for_one_file))
{
LOG_TRACE(log, "Create WriteBufferFromS3, {}", getLogDetails());
@ -109,8 +112,11 @@ void WriteBufferFromS3::nextImpl()
ErrorCodes::LOGICAL_ERROR,
"Cannot write to prefinalized buffer for S3, the file could have been created with PutObjectRequest");
/// Make sense to call to before adding new async task to check if there is an exception
task_tracker->waitReady();
/// It makes sense to call waitIfAny before adding a new async task, to check if an exception has occurred.
/// The faster the exception is propagated, the less time is spent on cancellation.
/// Although `task_tracker->add()` collects task statuses and propagates their exceptions,
/// this call is necessary for the case when there is no in-flight limitation, and therefore `task_tracker->add()` doesn't wait for anything.
task_tracker->waitIfAny();
hidePartialData();
@ -134,7 +140,8 @@ void WriteBufferFromS3::preFinalize()
LOG_TRACE(log, "preFinalize WriteBufferFromS3. {}", getLogDetails());
task_tracker->waitReady();
/// This function should not be run again if an exception has occurred
is_prefinalized = true;
hidePartialData();
@ -166,8 +173,6 @@ void WriteBufferFromS3::preFinalize()
{
writeMultipartUpload();
}
is_prefinalized = true;
}
void WriteBufferFromS3::finalizeImpl()
@ -212,8 +217,8 @@ String WriteBufferFromS3::getLogDetails() const
multipart_upload_details = fmt::format(", upload id {}, upload has finished {}"
, multipart_upload_id, multipart_upload_finished);
return fmt::format("Details: bucket {}, key {}, total size {}, count {}, hidden_size {}, offset {}, with pool: {}, finalized {}{}",
bucket, key, total_size, count(), hidden_size, offset(), task_tracker->isAsync(), finalized, multipart_upload_details);
return fmt::format("Details: bucket {}, key {}, total size {}, count {}, hidden_size {}, offset {}, with pool: {}, prefinalized {}, finalized {}{}",
bucket, key, total_size, count(), hidden_size, offset(), task_tracker->isAsync(), is_prefinalized, finalized, multipart_upload_details);
}
void WriteBufferFromS3::tryToAbortMultipartUpload()
@ -234,7 +239,7 @@ WriteBufferFromS3::~WriteBufferFromS3()
{
LOG_TRACE(log, "Close WriteBufferFromS3. {}.", getLogDetails());
// That descructor could be call with finalized=false in case of exceptions
// That destructor could be called with finalized=false in case of exceptions
if (!finalized)
{
LOG_ERROR(log, "WriteBufferFromS3 is not finalized in destructor. It could be if an exception occurs. File is not written to S3. {}.", getLogDetails());


@ -4,12 +4,18 @@
#include <IO/WriteBufferFromS3TaskTracker.h>
namespace ProfileEvents
{
extern const Event WriteBufferFromS3WaitInflightLimitMicroseconds;
}
namespace DB
{
WriteBufferFromS3::TaskTracker::TaskTracker(ThreadPoolCallbackRunner<void> scheduler_)
WriteBufferFromS3::TaskTracker::TaskTracker(ThreadPoolCallbackRunner<void> scheduler_, size_t max_tasks_inflight_)
: is_async(bool(scheduler_))
, scheduler(scheduler_ ? std::move(scheduler_) : syncRunner())
, max_tasks_inflight(max_tasks_inflight_)
{}
WriteBufferFromS3::TaskTracker::~TaskTracker()
@ -28,103 +34,152 @@ ThreadPoolCallbackRunner<void> WriteBufferFromS3::TaskTracker::syncRunner()
};
}
void WriteBufferFromS3::TaskTracker::waitReady()
{
LOG_TEST(log, "waitReady, in queue {}", futures.size());
/// Exceptions are propagated
auto it = futures.begin();
while (it != futures.end())
{
chassert(it->valid());
if (it->wait_for(std::chrono::seconds(0)) != std::future_status::ready)
{
++it;
continue;
}
try
{
it->get();
} catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
throw;
}
it = futures.erase(it);
}
LOG_TEST(log, "waitReady ended, in queue {}", futures.size());
}
void WriteBufferFromS3::TaskTracker::waitAll()
{
LOG_TEST(log, "waitAll, in queue {}", futures.size());
/// Exceptions are propagated
for (auto & future : futures)
{
try
{
future.get();
} catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
throw;
}
}
futures.clear();
std::lock_guard lock(mutex);
finished_futures.clear();
}
void WriteBufferFromS3::TaskTracker::safeWaitAll()
{
LOG_TEST(log, "safeWaitAll, wait in queue {}", futures.size());
/// Exceptions are not propagated
for (auto & future : futures)
{
LOG_TEST(log, "safeWaitAll, wait future");
if (future.valid())
future.wait();
}
LOG_TEST(log, "safeWaitAll, get in queue {}", futures.size());
for (auto & future : futures)
{
if (future.valid())
{
try
{
/// Exceptions are not propagated
future.get();
} catch (...)
{
/// But at least they are printed
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
}
futures.clear();
LOG_TEST(log, "safeWaitAll ended, get in queue {}", futures.size());
std::lock_guard lock(mutex);
finished_futures.clear();
}
void WriteBufferFromS3::TaskTracker::waitIfAny()
{
LOG_TEST(log, "waitIfAny, in queue {}", futures.size());
if (futures.empty())
return;
Stopwatch watch;
{
std::lock_guard lock(mutex);
for (auto & it : finished_futures)
{
/// Actually that call might block this thread until the future is finally set.
/// However, it won't block us for long: a task is about to finish when its pointer appears in `finished_futures`.
it->get();
/// In case of an exception in `it->get()`,
/// it is not necessary to remove `it` from the list `futures`:
/// `TaskTracker` has to be destroyed after any exception occurs; for this, `safeWaitAll` is called.
/// `safeWaitAll` handles invalid futures in the list `futures`.
futures.erase(it);
}
finished_futures.clear();
}
watch.stop();
ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds());
LOG_TEST(log, "waitIfAny ended, in queue {}", futures.size());
}
void WriteBufferFromS3::TaskTracker::add(Callback && func)
{
LOG_TEST(log, "add, in queue {}", futures.size());
/// All this fuss is about two things. This is the most critical place of TaskTracker.
/// The first is that insertion into the list `futures` must not fail.
/// To guarantee that, the element is allocated at the end of the list `futures` in advance.
/// The second is that the notification about task completion must not fail.
/// To guarantee that, the list element which will be inserted into the list `finished_futures`
/// is allocated in advance as another one-element list, `pre_allocated_finished`.
auto future = scheduler(std::move(func), 0);
auto exit_scope = scope_guard(
[&future]()
/// preallocation for the first issue
futures.emplace_back();
auto future_placeholder = std::prev(futures.end());
/// preallocation for the second issue
FinishedList pre_allocated_finished {future_placeholder};
Callback func_with_notification = [&, func=std::move(func), pre_allocated_finished=std::move(pre_allocated_finished)] () mutable
{
future.wait();
SCOPE_EXIT({
DENY_ALLOCATIONS_IN_SCOPE;
std::lock_guard lock(mutex);
finished_futures.splice(finished_futures.end(), pre_allocated_finished);
has_finished.notify_one();
});
func();
};
/// this move is nothrow
*future_placeholder = scheduler(std::move(func_with_notification), Priority{});
LOG_TEST(log, "add ended, in queue {}, limit {}", futures.size(), max_tasks_inflight);
waitTilInflightShrink();
}
void WriteBufferFromS3::TaskTracker::waitTilInflightShrink()
{
if (!max_tasks_inflight)
return;
LOG_TEST(log, "waitTilInflightShrink, in queue {}", futures.size());
Stopwatch watch;
/// An alternative approach is to wait until at least futures.size() - max_tasks_inflight elements are finished.
/// However, the faster a finished task is collected, the faster CH checks if there is an exception.
/// The faster an exception is propagated, the less time is spent on cancellation.
while (futures.size() >= max_tasks_inflight)
{
std::unique_lock lock(mutex);
has_finished.wait(lock, [this] () TSA_REQUIRES(mutex) { return !finished_futures.empty(); });
for (auto & it : finished_futures)
{
SCOPE_EXIT({
/// According to basic exception safety, TaskTracker has to be destroyed after an exception.
/// If that were guaranteed, this SCOPE_EXIT would be superfluous.
/// However, WriteBufferWithFinalizeCallback and WriteBufferFromFileDecorator do call finalize in the d-tor.
/// TaskTracker has to cope with this until finalizing in the d-tor is addressed in #50274.
futures.erase(it);
});
it->get();
}
);
futures.push_back(std::move(future));
finished_futures.clear();
}
exit_scope.release();
LOG_TEST(log, "add ended, in queue {}", futures.size());
watch.stop();
ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds());
LOG_TEST(log, "waitTilInflightShrink ended, in queue {}", futures.size());
}
bool WriteBufferFromS3::TaskTracker::isAsync() const


@ -6,36 +6,61 @@
#include "WriteBufferFromS3.h"
#include <list>
namespace DB
{
/// That class is used only in WriteBufferFromS3 for now.
/// Therefore it is declared as a part of WriteBufferFromS3.
/// TaskTracker takes a Callback which is run by scheduler in some external shared ThreadPool.
/// TaskTracker brings the methods waitReady, waitAll/safeWaitAll
/// TaskTracker brings the methods waitIfAny, waitAll/safeWaitAll
/// to help with coordination of the running tasks.
/// Basic exception safety is provided. If an exception occurs, the object has to be destroyed.
/// No thread safety is provided. Use this object with no concurrency.
class WriteBufferFromS3::TaskTracker
{
public:
using Callback = std::function<void()>;
explicit TaskTracker(ThreadPoolCallbackRunner<void> scheduler_);
TaskTracker(ThreadPoolCallbackRunner<void> scheduler_, size_t max_tasks_inflight_);
~TaskTracker();
static ThreadPoolCallbackRunner<void> syncRunner();
bool isAsync() const;
void waitReady();
/// waitIfAny collects statuses from already finished tasks.
/// There could be no finished tasks yet, in which case waitIfAny does nothing useful.
/// The first exception is thrown if any task has failed.
void waitIfAny();
/// waitAll waits for all the tasks until they finish and collects their statuses.
void waitAll();
/// safeWaitAll does the same as waitAll but mutes the exceptions
void safeWaitAll();
void add(Callback && func);
private:
bool is_async;
/// waitTilInflightShrink waits until the number of in-flight tasks shrinks below the limit `max_tasks_inflight`.
void waitTilInflightShrink() TSA_NO_THREAD_SAFETY_ANALYSIS;
const bool is_async;
ThreadPoolCallbackRunner<void> scheduler;
std::list<std::future<void>> futures;
const size_t max_tasks_inflight;
using FutureList = std::list<std::future<void>>;
FutureList futures;
Poco::Logger * log = &Poco::Logger::get("TaskTracker");
std::mutex mutex;
std::condition_variable has_finished TSA_GUARDED_BY(mutex);
using FinishedList = std::list<FutureList::iterator>;
FinishedList finished_futures TSA_GUARDED_BY(mutex);
};
}

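A stripped-down sketch (hypothetical: plain std::thread in place of ThreadPoolCallbackRunner, and without the exception bookkeeping or the preallocated list nodes) of the in-flight limiting pattern TaskTracker implements: each task signals its completion under a mutex, and add() blocks on a condition variable while the limit is reached.

#include <condition_variable>
#include <cstddef>
#include <functional>
#include <mutex>
#include <thread>
#include <vector>

class InflightLimiter
{
public:
    explicit InflightLimiter(std::size_t max_inflight_) : max_inflight(max_inflight_) {}

    void add(std::function<void()> func)
    {
        std::unique_lock lock(mutex);
        if (max_inflight) /// 0 means no limit, as with max_tasks_inflight
            has_finished.wait(lock, [this] { return inflight < max_inflight; }); /// block while at the limit
        ++inflight;
        workers.emplace_back([this, f = std::move(func)]
        {
            f();
            std::lock_guard guard(mutex);
            --inflight;
            has_finished.notify_one(); /// wake one blocked add(), like waitTilInflightShrink
        });
    }

    ~InflightLimiter()
    {
        for (auto & worker : workers)
            worker.join();
    }

private:
    std::mutex mutex;
    std::condition_variable has_finished;
    std::size_t inflight = 0;
    const std::size_t max_inflight;
    std::vector<std::thread> workers;
};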

@ -2041,7 +2041,7 @@ Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena
*/
if (data.hasNullKeyData())
{
has_null_key_data = Method::one_key_nullable_optimization;
has_null_key_data = true;
out_cols->key_columns[0]->insertDefault();
insertAggregatesIntoColumns(data.getNullKeyData(), out_cols->final_aggregate_columns, arena);
data.hasNullKeyData() = false;
@ -2076,6 +2076,7 @@ Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena
res.emplace_back(insertResultsIntoColumns<use_compiled_functions>(places, std::move(out_cols.value()), arena, has_null_key_data));
places.clear();
out_cols.reset();
has_null_key_data = false;
}
}
});

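A tiny illustration (hypothetical, outside the Aggregator) of the bug class fixed above: a per-block flag declared outside the loop must be reset after each output block is emitted, otherwise the null-key row is attributed to every subsequent block as well.

#include <cassert>
#include <vector>

int main()
{
    bool has_null_key_data = false;
    std::vector<int> null_rows_per_block;
    for (int block = 0; block < 2; ++block)
    {
        if (block == 0)
            has_null_key_data = true; /// only the first block actually contains the null key
        null_rows_per_block.push_back(has_null_key_data ? 1 : 0);
        has_null_key_data = false; /// the fix: reset before the next block
    }
    assert((null_rows_per_block == std::vector<int>{1, 0}));
}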
Some files were not shown because too many files have changed in this diff.