Merge branch 'master' into tests/improve-hung-check

This commit is contained in:
Alexander Tokmakov 2023-01-17 20:09:18 +01:00
commit 2cd0ba7fff
164 changed files with 4252 additions and 1230 deletions

View File

@ -683,3 +683,4 @@ jobs:
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 finish_check.py
python3 merge_pr.py

View File

@ -169,3 +169,4 @@ jobs:
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 finish_check.py
python3 merge_pr.py --check-approved

View File

@ -4388,3 +4388,4 @@ jobs:
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 finish_check.py
python3 merge_pr.py --check-approved

contrib/poco vendored

@ -1 +1 @@
Subproject commit 799234226187c0ae0b8c90f23465b25ed7956e56
Subproject commit 0ab9bba7ccad3c8dacce04a35cb3b78218547ab4

View File

@ -5,6 +5,7 @@ set -x
# core.COMM.PID-TID
sysctl kernel.core_pattern='core.%e.%p-%P'
dmesg --clear ||:
set -e
set -u
@ -368,6 +369,7 @@ if [ -f core.zst ]; then
fi
rg --text -F '<Fatal>' server.log > fatal.log ||:
dmesg -T > dmesg.log ||:
zstd --threads=0 server.log
@ -396,6 +398,7 @@ p.links a { padding: 5px; margin: 3px; background: #FFF; line-height: 2; white-s
<a href="fuzzer.log">fuzzer.log</a>
<a href="server.log.zst">server.log.zst</a>
<a href="main.log">main.log</a>
<a href="dmesg.log">dmesg.log</a>
${CORE_LINK}
</p>
<table>

View File

@ -128,6 +128,7 @@ function run_tests()
if [[ "${HIGH_LEVEL_COVERAGE}" = "YES" ]]; then
ADDITIONAL_OPTIONS+=('--report-coverage')
ADDITIONAL_OPTIONS+=('--report-logs-stats')
fi
set +e

View File

@ -289,6 +289,7 @@ if __name__ == "__main__":
"--database=system",
"--hung-check",
"--stress",
"--report-logs-stats",
"00001_select_1",
]
)

View File

@ -136,3 +136,7 @@ DESCRIBE TABLE test_database.test_table;
│ data │ Nullable(String) │
└────────┴───────────────────┘
```
## Related content
- Blog: [ClickHouse and PostgreSQL - a match made in data heaven - part 1](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres)

View File

@ -175,3 +175,6 @@ CREATE TABLE pg_table_schema_with_dots (a UInt32)
- [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md)
- [Using PostgreSQL as a dictionary source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql)
## Related content
- Blog: [ClickHouse and PostgreSQL - a match made in data heaven - part 1](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres)

View File

@ -1203,12 +1203,14 @@ SELECT * FROM json_each_row_nested
- [input_format_json_read_bools_as_numbers](/docs/en/operations/settings/settings.md/#input_format_json_read_bools_as_numbers) - allow to parse bools as numbers in JSON input formats. Default value - `true`.
- [input_format_json_read_numbers_as_strings](/docs/en/operations/settings/settings.md/#input_format_json_read_numbers_as_strings) - allow to parse numbers as strings in JSON input formats. Default value - `false`.
- [input_format_json_read_objects_as_strings](/docs/en/operations/settings/settings.md/#input_format_json_read_objects_as_strings) - allow to parse JSON objects as strings in JSON input formats. Default value - `false`.
- [input_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#input_format_json_named_tuples_as_objects) - parse named tuple columns as JSON objects. Default value - `true`.
- [input_format_json_defaults_for_missing_elements_in_named_tuple](/docs/en/operations/settings/settings.md/#input_format_json_defaults_for_missing_elements_in_named_tuple) - insert default values for missing elements in JSON object while parsing named tuple. Default value - `true`.
- [output_format_json_quote_64bit_integers](/docs/en/operations/settings/settings.md/#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`.
- [output_format_json_quote_64bit_floats](/docs/en/operations/settings/settings.md/#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`.
- [output_format_json_quote_denormals](/docs/en/operations/settings/settings.md/#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`.
- [output_format_json_quote_decimals](/docs/en/operations/settings/settings.md/#output_format_json_quote_decimals) - controls quoting of decimals in JSON output format. Default value - `false`.
- [output_format_json_escape_forward_slashes](/docs/en/operations/settings/settings.md/#output_format_json_escape_forward_slashes) - controls escaping forward slashes for string outputs in JSON output format. Default value - `true`.
- [output_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#output_format_json_named_tuples_as_objects) - serialize named tuple columns as JSON objects. Default value - `false`.
- [output_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#output_format_json_named_tuples_as_objects) - serialize named tuple columns as JSON objects. Default value - `true`.
- [output_format_json_array_of_rows](/docs/en/operations/settings/settings.md/#output_format_json_array_of_rows) - output a JSON array of all rows in JSONEachRow(Compact) format. Default value - `false`.
- [output_format_json_validate_utf8](/docs/en/operations/settings/settings.md/#output_format_json_validate_utf8) - enables validation of UTF-8 sequences in JSON output formats (note that it doesn't impact formats JSON/JSONCompact/JSONColumnsWithMetadata, they always validate utf8). Default value - `false`.
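With the default for `output_format_json_named_tuples_as_objects` now flipped to `true`, named tuple columns serialize as nested JSON objects out of the box. A minimal sketch (the expected output shape is indicative, assuming the default setting):
```sql
-- A named tuple is rendered as a nested JSON object rather than an array,
-- since output_format_json_named_tuples_as_objects defaults to true.
SELECT CAST((1, 'Hello'), 'Tuple(x UInt32, s String)') AS t
FORMAT JSONEachRow;
-- expected shape: {"t":{"x":1,"s":"Hello"}}
```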

View File

@ -266,7 +266,7 @@ Default value: 0.
Limits the size in bytes of the hash table used when joining tables.
This settings applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md).
This setting applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md).
If the query contains joins, ClickHouse checks this setting for every intermediate result.
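A minimal usage sketch (the byte limit below is an arbitrary illustrative value):
```sql
-- Cap the hash table used for JOINs at ~1 GiB for this session; the behaviour
-- on reaching the limit is governed by the join_overflow_mode setting.
SET max_bytes_in_join = 1073741824;
```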

View File

@ -402,40 +402,62 @@ Default value: `ALL`.
## join_algorithm {#settings-join_algorithm}
Specifies [JOIN](../../sql-reference/statements/select/join.md) algorithm.
Specifies which [JOIN](../../sql-reference/statements/select/join.md) algorithm is used.
Several algorithms can be specified, and an available one will be chosen for a particular query based on the kind/strictness and the table engine.
Possible values:
- `default``hash` or `direct`, if possible (same as `direct,hash`)
### `default`
- `hash` — [Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. The most generic implementation that supports all combinations of kind and strictness and multiple join keys that are combined with `OR` in the `JOIN ON` section.
This is the equivalent of `hash` or `direct`, if possible (same as `direct,hash`).
- `parallel_hash` - a variation of `hash` join that splits the data into buckets and builds several hashtables instead of one concurrently to speed up this process.
### `grace_hash`
[Grace hash join](https://en.wikipedia.org/wiki/Hash_join#Grace_hash_join) is used. Grace hash is an algorithm option that provides performant complex joins while limiting memory use.
The first phase of a grace join reads the right table and splits it into N buckets depending on the hash value of key columns (initially, N is `grace_hash_join_initial_buckets`). This is done in a way that ensures each bucket can be processed independently. Rows from the first bucket are added to an in-memory hash table while the others are saved to disk. If the hash table grows beyond the memory limit (e.g., as set by [`max_bytes_in_join`](/docs/en/operations/settings/query-complexity.md/#settings-max_bytes_in_join)), the number of buckets is increased and each row is reassigned to a bucket. Any rows which don't belong to the current bucket are flushed and reassigned.
### `hash`
[Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. The most generic implementation that supports all combinations of kind and strictness and multiple join keys that are combined with `OR` in the `JOIN ON` section.
### `parallel_hash`
A variation of the `hash` join that splits the data into buckets and concurrently builds several hash tables instead of one, to speed up this process.
When using the `hash` algorithm, the right part of `JOIN` is uploaded into RAM.
- `partial_merge` — a variation of the [sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join), where only the right table is fully sorted.
### `partial_merge`
A variation of the [sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join), where only the right table is fully sorted.
The `RIGHT JOIN` and `FULL JOIN` are supported only with `ALL` strictness (`SEMI`, `ANTI`, `ANY`, and `ASOF` are not supported).
When using `partial_merge` algorithm, ClickHouse sorts the data and dumps it to the disk. The `partial_merge` algorithm in ClickHouse differs slightly from the classic realization. First, ClickHouse sorts the right table by joining keys in blocks and creates a min-max index for sorted blocks. Then it sorts parts of the left table by `join key` and joins them over the right table. The min-max index is also used to skip unneeded right table blocks.
When using the `partial_merge` algorithm, ClickHouse sorts the data and dumps it to the disk. The `partial_merge` algorithm in ClickHouse differs slightly from the classic realization. First, ClickHouse sorts the right table by joining keys in blocks and creates a min-max index for sorted blocks. Then it sorts parts of the left table by the `join key` and joins them over the right table. The min-max index is also used to skip unneeded right table blocks.
- `direct` - can be applied when the right storage supports key-value requests.
### `direct`
This algorithm can be applied when the storage for the right table supports key-value requests.
The `direct` algorithm performs a lookup in the right table using rows from the left table as keys. It's supported only by special storage such as [Dictionary](../../engines/table-engines/special/dictionary.md/#dictionary) or [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) and only the `LEFT` and `INNER` JOINs.
- `auto` — try `hash` join and switch on the fly to another algorithm if the memory limit is violated.
### `auto`
- `full_sorting_merge` — [Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) with full sorting joined tables before joining.
When set to `auto`, the `hash` join is tried first, and the engine switches on the fly to another algorithm if the memory limit is violated.
- `prefer_partial_merge` — ClickHouse always tries to use `partial_merge` join if possible, otherwise, it uses `hash`. *Deprecated*, same as `partial_merge,hash`.
### `full_sorting_merge`
[Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) with full sorting of the joined tables before joining.
### `prefer_partial_merge`
ClickHouse always tries to use `partial_merge` join if possible, otherwise, it uses `hash`. *Deprecated*, same as `partial_merge,hash`.
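Since several algorithms can be listed, a short usage sketch of the forms described above:
```sql
-- Equivalent to the `default` value: use a direct lookup when the right-hand
-- storage supports it, otherwise fall back to a regular hash join.
SET join_algorithm = 'direct,hash';

-- Opt into the memory-bounded grace hash join; the initial bucket count is
-- taken from grace_hash_join_initial_buckets.
SET join_algorithm = 'grace_hash';
```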
## join_any_take_last_row {#settings-join_any_take_last_row}
Changes behaviour of join operations with `ANY` strictness.
Changes the behaviour of join operations with `ANY` strictness.
:::warning
This setting applies only for `JOIN` operations with [Join](../../engines/table-engines/special/join.md) engine tables.
@ -498,7 +520,7 @@ Default value: 65536.
Limits the number of files allowed for parallel sorting in MergeJoin operations when they are executed on disk.
The bigger the value of the setting, the more RAM used and the less disk I/O needed.
The bigger the value of the setting, the more RAM is used and the less disk I/O is needed.
Possible values:
@ -514,12 +536,12 @@ Enables legacy ClickHouse server behaviour in `ANY INNER|LEFT JOIN` operations.
Use this setting only for backward compatibility if your use cases depend on legacy `JOIN` behaviour.
:::
When the legacy behaviour enabled:
When the legacy behaviour is enabled:
- Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are not equal because ClickHouse uses the logic with many-to-one left-to-right table keys mapping.
- Results of `ANY INNER JOIN` operations contain all rows from the left table like the `SEMI LEFT JOIN` operations do.
When the legacy behaviour disabled:
When the legacy behaviour is disabled:
- Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are equal because ClickHouse uses the logic which provides one-to-many keys mapping in `ANY RIGHT JOIN` operations.
- Results of `ANY INNER JOIN` operations contain one row per key from both the left and right tables.
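A sketch for observing the difference, assuming the setting in question is `any_join_distinct_right_table_keys` (the setting name is not visible in this hunk); table names and data are illustrative:
```sql
CREATE TABLE t1 (k UInt32, v String) ENGINE = Memory;
CREATE TABLE t2 (k UInt32, v String) ENGINE = Memory;
INSERT INTO t1 VALUES (1, 'a'), (1, 'b');
INSERT INTO t2 VALUES (1, 'x');

-- With the legacy behaviour disabled (the default) these two queries return
-- equal results; with it enabled the row sets can differ, as described above.
SELECT * FROM t1 ANY LEFT JOIN t2 USING (k);
SELECT * FROM t2 ANY RIGHT JOIN t1 USING (k);
```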
@ -572,7 +594,7 @@ Default value: `163840`.
## merge_tree_min_rows_for_concurrent_read_for_remote_filesystem {#merge-tree-min-rows-for-concurrent-read-for-remote-filesystem}
The minimum number of lines to read from one file before [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem.
The minimum number of lines to read from one file before the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from a remote filesystem.
Possible values:
@ -706,7 +728,7 @@ log_queries=1
## log_queries_min_query_duration_ms {#settings-log-queries-min-query-duration-ms}
If enabled (non-zero), queries faster then the value of this setting will not be logged (you can think about this as a `long_query_time` for [MySQL Slow Query Log](https://dev.mysql.com/doc/refman/5.7/en/slow-query-log.html)), and this basically means that you will not find them in the following tables:
If enabled (non-zero), queries faster than the value of this setting will not be logged (you can think about this as a `long_query_time` for [MySQL Slow Query Log](https://dev.mysql.com/doc/refman/5.7/en/slow-query-log.html)), and this basically means that you will not find them in the following tables:
- `system.query_log`
- `system.query_thread_log`
@ -741,7 +763,7 @@ log_queries_min_type='EXCEPTION_WHILE_PROCESSING'
Setting up query threads logging.
Query threads log into [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting have effect only when [log_queries](#settings-log-queries) is true. Queries threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_thread_log) server configuration parameter.
Query threads log into the [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting has effect only when [log_queries](#settings-log-queries) is true. Query threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_thread_log) server configuration parameter.
Possible values:
@ -760,7 +782,7 @@ log_query_threads=1
Setting up query views logging.
When a query run by ClickHouse with this setup on has associated views (materialized or live views), they are logged in the [query_views_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_views_log) server configuration parameter.
When a query run by ClickHouse with this setting enabled has associated views (materialized or live views), they are logged in the [query_views_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_views_log) server configuration parameter.
Example:
@ -787,7 +809,7 @@ It can be used to improve the readability of server logs. Additionally, it helps
Possible values:
- Any string no longer than [max_query_size](#settings-max_query_size). If length is exceeded, the server throws an exception.
- Any string no longer than [max_query_size](#settings-max_query_size). If `max_query_size` is exceeded, the server throws an exception.
Default value: empty string.
@ -821,11 +843,11 @@ The setting also does not have a purpose when using INSERT SELECT, since data is
Default value: 1,048,576.
The default is slightly more than `max_block_size`. The reason for this is because certain table engines (`*MergeTree`) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, `*MergeTree` tables sort data during insertion, and a large enough block size allow sorting more data in RAM.
The default is slightly more than `max_block_size`. The reason for this is that certain table engines (`*MergeTree`) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, `*MergeTree` tables sort data during insertion, and a large enough block size allows sorting more data in RAM.
## min_insert_block_size_rows {#min-insert-block-size-rows}
Sets the minimum number of rows in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones.
Sets the minimum number of rows in the block that can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones.
Possible values:
@ -891,7 +913,7 @@ Higher values will lead to higher memory usage.
## max_compress_block_size {#max-compress-block-size}
The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). Specifying smaller block size generally leads to slightly reduced compression ratio, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced.
The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). Specifying a smaller block size generally leads to slightly reduced compression ratio, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced.
:::warning
This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse.
@ -935,7 +957,7 @@ Default value: 1000.
## interactive_delay {#interactive-delay}
The interval in microseconds for checking whether request execution has been cancelled and sending the progress.
The interval in microseconds for checking whether request execution has been canceled and sending the progress.
Default value: 100,000 (checks for cancelling and sends the progress ten times per second).
@ -4122,7 +4144,20 @@ Enabled by default.
Serialize named tuple columns as JSON objects.
Disabled by default.
Enabled by default.
### input_format_json_named_tuples_as_objects {#input_format_json_named_tuples_as_objects}
Parse named tuple columns as JSON objects.
Enabled by default.
### input_format_json_defaults_for_missing_elements_in_named_tuple {#input_format_json_defaults_for_missing_elements_in_named_tuple}
Insert default values for missing elements in JSON object while parsing named tuple.
This setting works only when setting `input_format_json_named_tuples_as_objects` is enabled.
Enabled by default.
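A minimal sketch of the two input settings working together (table and column names are illustrative):
```sql
CREATE TABLE tuples_demo (t Tuple(x UInt32, s String)) ENGINE = Memory;

-- input_format_json_named_tuples_as_objects (enabled by default) lets the
-- object below be parsed into the named tuple; the missing element `s` is
-- filled with its default value because
-- input_format_json_defaults_for_missing_elements_in_named_tuple is enabled.
INSERT INTO tuples_demo FORMAT JSONEachRow {"t": {"x": 1}}

SELECT * FROM tuples_demo; -- expected: (1,'')
```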
### output_format_json_array_of_rows {#output_format_json_array_of_rows}

View File

@ -120,5 +120,6 @@ Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec.
## Related Content
- [Extracting, converting, and querying data in local files using clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local)
- [Getting Data Into ClickHouse - Part 1](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1)
- [Exploring massive, real-world data sets: 100+ Years of Weather Records in ClickHouse](https://clickhouse.com/blog/real-world-data-noaa-climate-data)

View File

@ -57,6 +57,7 @@ ClickHouse-specific aggregate functions:
- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md)
- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md)
- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md)
- [uniqTheta](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md)
- [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md)
- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md)
- [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md)
@ -77,4 +78,6 @@ ClickHouse-specific aggregate functions:
- [contingency](./contingency.md)
- [cramersV](./cramersv.md)
- [cramersVBiasCorrected](./cramersvbiascorrected.md)
- [theilsU](./theilsu.md)
- [theilsU](./theilsu.md)
- [maxIntersections](./maxintersections.md)
- [maxIntersectionsPosition](./maxintersectionsposition.md)

View File

@ -0,0 +1,64 @@
---
slug: /en/sql-reference/aggregate-functions/reference/maxintersections
sidebar_position: 360
title: maxIntersections
---
# maxIntersections
Aggregate function that calculates the maximum number of times that a group of intervals intersects each other (if all the intervals intersect at least once).
The syntax is:
```sql
maxIntersections(start_column, end_column)
```
**Arguments**
- `start_column` - the numeric column that represents the start of each interval. If `start_column` is `NULL` or 0 then the interval will be skipped.
- `end_column` - the numeric column that represents the end of each interval. If `end_column` is `NULL` or 0 then the interval will be skipped.
**Returned value**
Returns the maximum number of intersected intervals.
**Example**
```sql
CREATE TABLE my_events (
start UInt32,
end UInt32
)
Engine = MergeTree
ORDER BY tuple();
INSERT INTO my_events VALUES
(1, 3),
(1, 6),
(2, 5),
(3, 7);
```
The intervals look like the following:
```response
1 - 3
1 - - - - 6
2 - - 5
3 - - - 7
```
Three of these intervals have a common value (the value is `4`; the specific value is not important, as we are measuring the count of the intersections). The intervals `(1,3)` and `(3,7)` share an endpoint but are not considered intersecting by the `maxIntersections` function.
```sql
SELECT maxIntersections(start, end) FROM my_events;
```
Response:
```response
3
```
If you have multiple occurrences of the maximum interval, you can use the [`maxIntersectionsPosition` function](./maxintersectionsposition.md) to locate the number and location of those occurrences.

View File

@ -0,0 +1,64 @@
---
slug: /en/sql-reference/aggregate-functions/reference/maxintersectionsposition
sidebar_position: 361
title: maxIntersectionsPosition
---
# maxIntersectionsPosition
Aggregate function that calculates the positions of the occurrences of the [`maxIntersections` function](./maxintersections.md).
The syntax is:
```sql
maxIntersectionsPosition(start_column, end_column)
```
**Arguments**
- `start_column` - the numeric column that represents the start of each interval. If `start_column` is `NULL` or 0 then the interval will be skipped.
- `end_column` - the numeric column that represents the end of each interval. If `end_column` is `NULL` or 0 then the interval will be skipped.
**Returned value**
Returns the start positions of the maximum number of intersected intervals.
**Example**
```sql
CREATE TABLE my_events (
start UInt32,
end UInt32
)
Engine = MergeTree
ORDER BY tuple();
INSERT INTO my_events VALUES
(1, 3),
(1, 6),
(2, 5),
(3, 7);
```
The intervals look like the following:
```response
1 - 3
1 - - - - 6
2 - - 5
3 - - - 7
```
Notice that three of these intervals have the value 4 in common, and that group of intersecting intervals starts with the 2nd interval:
```sql
SELECT maxIntersectionsPosition(start, end) FROM my_events;
```
Response:
```response
2
```
In other words, the `(1,6)` row is the start of the 3 intervals that intersect, and 3 is the maximum number of intervals that intersect.

View File

@ -0,0 +1,68 @@
---
slug: /en/sql-reference/aggregate-functions/reference/quantileInterpolatedWeighted
sidebar_position: 203
---
# quantileInterpolatedWeighted
Computes [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using linear interpolation, taking into account the weight of each element.
To get the interpolated value, all the passed values are combined into an array, which is then sorted by the corresponding weights. Quantile interpolation is then performed using the [weighted percentile method](https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method): a cumulative distribution is built based on the weights, and then linear interpolation is performed over the weights and the values to compute the quantiles.
When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function.
**Syntax**
``` sql
quantileInterpolatedWeighted(level)(expr, weight)
```
Alias: `medianInterpolatedWeighted`.
**Arguments**
- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
- `weight` — Column with weights of sequence members. Weight is a number of value occurrences.
**Returned value**
- Quantile of the specified level.
Type:
- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input.
- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type.
- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type.
**Example**
Input table:
``` text
┌─n─┬─val─┐
│ 0 │ 3 │
│ 1 │ 2 │
│ 2 │ 1 │
│ 5 │ 4 │
└───┴─────┘
```
Query:
``` sql
SELECT quantileInterpolatedWeighted(n, val) FROM t
```
Result:
``` text
┌─quantileInterpolatedWeighted(n, val)─┐
│ 1 │
└──────────────────────────────────────┘
```
**See Also**
- [median](../../../sql-reference/aggregate-functions/reference/median.md#median)
- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)

View File

@ -9,7 +9,7 @@ sidebar_position: 201
Syntax: `quantiles(level1, level2, …)(x)`
All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`, `quantilesBFloat16`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values.
All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantileInterpolatedWeighted`, `quantilesTDigest`, `quantilesBFloat16`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values.
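A minimal sketch of the one-pass form:
```sql
-- All three levels are computed in a single pass over the data
-- and returned as an array.
SELECT quantiles(0.25, 0.5, 0.75)(number) FROM numbers(100);
```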
## quantilesExactExclusive

View File

@ -6,6 +6,10 @@ sidebar_label: JSON
# JSON
:::warning
This feature is experimental and is not production ready. If you need to work with JSON documents, consider using [this guide](/docs/en/guides/developer/working-with-json/json-load-data.md) instead.
:::
Stores JavaScript Object Notation (JSON) documents in a single column.
`JSON` is an alias for `Object('json')`.
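A minimal sketch, assuming the experimental type is gated behind the `allow_experimental_object_type` setting:
```sql
SET allow_experimental_object_type = 1;

-- `JSON` is an alias for Object('json'): the whole document lives in one
-- column, and inferred subcolumns are addressable with dot syntax.
CREATE TABLE json_demo (obj JSON) ENGINE = Memory;
INSERT INTO json_demo VALUES ('{"user": {"id": 42, "name": "test"}}');
SELECT obj.user.id FROM json_demo;
```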

View File

@ -121,7 +121,7 @@ Accepts an empty array and returns a one-element array that is equal to the defa
## range(end), range(\[start, \] end \[, step\])
Returns an array of `UInt` numbers from `start` to `end - 1` by `step`.
Returns an array of numbers from `start` to `end - 1` by `step`. The supported types are [UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64](../data-types/int-uint.md).
**Syntax**
``` sql
@ -130,31 +130,30 @@ range([start, ] end [, step])
**Arguments**
- `start` — The first element of the array. Optional, required if `step` is used. Default value: 0. [UInt](../data-types/int-uint.md)
- `end` — The number before which the array is constructed. Required. [UInt](../data-types/int-uint.md)
- `step` — Determines the incremental step between each element in the array. Optional. Default value: 1. [UInt](../data-types/int-uint.md)
- `start` — The first element of the array. Optional, required if `step` is used. Default value: 0.
- `end` — The number before which the array is constructed. Required.
- `step` — Determines the incremental step between each element in the array. Optional. Default value: 1.
**Returned value**
- Array of `UInt` numbers from `start` to `end - 1` by `step`.
- Array of numbers from `start` to `end - 1` by `step`.
**Implementation details**
- All arguments must be positive values: `start`, `end`, `step` are `UInt` data types, as well as elements of the returned array.
- All arguments `start`, `end`, and `step` must be one of the following data types: `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64`. The type of the elements of the returned array is a supertype of all the argument types.
- An exception is thrown if query results in arrays with a total length of more than number of elements specified by the [function_range_max_elements_in_block](../../operations/settings/settings.md#settings-function_range_max_elements_in_block) setting.
**Examples**
Query:
``` sql
SELECT range(5), range(1, 5), range(1, 5, 2);
SELECT range(5), range(1, 5), range(1, 5, 2), range(-1, 5, 2);
```
Result:
```txt
┌─range(5)────┬─range(1, 5)─┬─range(1, 5, 2)─┐
│ [0,1,2,3,4] │ [1,2,3,4] │ [1,3] │
└─────────────┴─────────────┴────────────────┘
┌─range(5)────┬─range(1, 5)─┬─range(1, 5, 2)─┬─range(-1, 5, 2)─┐
│ [0,1,2,3,4] │ [1,2,3,4]   │ [1,3]          │ [-1,1,3]        │
└─────────────┴─────────────┴────────────────┴─────────────────┘
```
## array(x1, …), operator \[x1, …\]

View File

@ -39,3 +39,16 @@ SELECT * FROM generateRandom('a Array(Int8), d Decimal32(4), c Tuple(DateTime64(
│ [68] │ -67417.0770 │ ('2080-03-12 14:17:31.269','110425e5-413f-10a6-05ba-fa6b3e929f15') │
└──────────┴──────────────┴────────────────────────────────────────────────────────────────────┘
```
```sql
CREATE TABLE random (a Array(Int8), d Decimal32(4), c Tuple(DateTime64(3), UUID)) engine=Memory;
INSERT INTO random SELECT * FROM generateRandom() LIMIT 2;
SELECT * FROM random;
```
```text
┌─a────────────────────────────┬────────────d─┬─c──────────────────────────────────────────────────────────────────┐
│ [] │ 68091.8197 │ ('2037-10-02 12:44:23.368','039ecab7-81c2-45ee-208c-844e5c6c5652') │
│ [8,-83,0,-22,65,9,-30,28,64] │ -186233.4909 │ ('2062-01-11 00:06:04.124','69563ea1-5ad1-f870-16d8-67061da0df25') │
└──────────────────────────────┴──────────────┴────────────────────────────────────────────────────────────────────┘
```

View File

@ -117,7 +117,7 @@ SELECT notEmpty([1,2]);
## range(end), range(\[start, \] end \[, step\]) {#range}
Returns an array of `UInt` numbers from `start` to `end - 1` with `step` as the increment step.
Returns an array of integer numbers from `start` to `end - 1` with `step` as the increment step. The supported types are [`UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64`](../data-types/int-uint.md).
**Syntax**
``` sql
@ -126,31 +126,30 @@ range([start, ] end [, step])
**Arguments**
- `start` — The first element of the array. Optional, required if `step` is used. Default value: 0. Type: [UInt](../data-types/int-uint.md).
- `end` — The number before which the array is constructed, not included. Required. Type: [UInt](../data-types/int-uint.md).
- `step` — Determines the incremental step between each element in the array. Optional. Default value: 1. Type: [UInt](../data-types/int-uint.md).
- `start` — The first element of the array. Optional, required if `step` is used. Default value: 0.
- `end` — The number before which the array is constructed, not included. Required.
- `step` — Determines the incremental step between each element in the array. Optional. Default value: 1.
**Returned value**
- Array of `UInt` numbers from `start` to `end - 1` with `step` as the increment step.
- Array of numbers from `start` to `end - 1` with `step` as the increment step.
**Implementation details**
- All arguments must be positive values: `start`, `end`, `step` are all of type `UInt`, as are the elements of the resulting array.
- All arguments `start`, `end`, `step` must be one of the following types: [`UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64`](../data-types/int-uint.md). The element type of the resulting array is the smallest supertype of all the argument types and must also be one of these types.
- An exception is thrown if the query results in arrays with a total length exceeding the number of elements specified by the [function_range_max_elements_in_block](../../operations/settings/settings.md#settings-function_range_max_elements_in_block) setting.
**Examples**
Query:
``` sql
SELECT range(5), range(1, 5), range(1, 5, 2);
SELECT range(5), range(1, 5), range(1, 5, 2), range(-1, 5, 2);
```
Result:
```txt
┌─range(5)────┬─range(1, 5)─┬─range(1, 5, 2)─┐
│ [0,1,2,3,4] │ [1,2,3,4] │ [1,3] │
└─────────────┴─────────────┴────────────────┘
┌─range(5)────┬─range(1, 5)─┬─range(1, 5, 2)─┬─range(-1, 5, 2)─┐
│ [0,1,2,3,4] │ [1,2,3,4]   │ [1,3]          │ [-1,1,3]        │
└─────────────┴─────────────┴────────────────┴─────────────────┘
```
## array(x1, …), operator \[x1, …\] {#arrayx1-operator-x1}

View File

@ -140,6 +140,7 @@ namespace CurrentMetrics
namespace ProfileEvents
{
extern const Event MainConfigLoads;
extern const Event ServerStartupMilliseconds;
}
namespace fs = std::filesystem;
@ -652,6 +653,8 @@ static void sanityChecks(Server & server)
int Server::main(const std::vector<std::string> & /*args*/)
try
{
Stopwatch startup_watch;
Poco::Logger * log = &logger();
UseSSL use_ssl;
@ -1822,6 +1825,9 @@ try
LOG_INFO(log, "Ready for connections.");
}
startup_watch.stop();
ProfileEvents::increment(ProfileEvents::ServerStartupMilliseconds, startup_watch.elapsedMilliseconds());
try
{
global_context->startClusterDiscovery();

View File

@ -167,6 +167,7 @@ enum class AccessType
M(SYSTEM_SYNC_REPLICA, "SYNC REPLICA", TABLE, SYSTEM) \
M(SYSTEM_RESTART_REPLICA, "RESTART REPLICA", TABLE, SYSTEM) \
M(SYSTEM_RESTORE_REPLICA, "RESTORE REPLICA", TABLE, SYSTEM) \
M(SYSTEM_WAIT_LOADING_PARTS, "WAIT LOADING PARTS", TABLE, SYSTEM) \
M(SYSTEM_SYNC_DATABASE_REPLICA, "SYNC DATABASE REPLICA", DATABASE, SYSTEM) \
M(SYSTEM_SYNC_TRANSACTION_LOG, "SYNC TRANSACTION LOG", GLOBAL, SYSTEM) \
M(SYSTEM_FLUSH_DISTRIBUTED, "FLUSH DISTRIBUTED", TABLE, SYSTEM_FLUSH) \

View File

@ -53,7 +53,7 @@ TEST(AccessRights, Union)
"SHOW ROW POLICIES, SYSTEM MERGES, SYSTEM TTL MERGES, SYSTEM FETCHES, "
"SYSTEM MOVES, SYSTEM SENDS, SYSTEM REPLICATION QUEUES, "
"SYSTEM DROP REPLICA, SYSTEM SYNC REPLICA, SYSTEM RESTART REPLICA, "
"SYSTEM RESTORE REPLICA, SYSTEM SYNC DATABASE REPLICA, SYSTEM FLUSH DISTRIBUTED, dictGet ON db1.*");
"SYSTEM RESTORE REPLICA, SYSTEM WAIT LOADING PARTS, SYSTEM SYNC DATABASE REPLICA, SYSTEM FLUSH DISTRIBUTED, dictGet ON db1.*");
}

View File

@ -207,7 +207,7 @@ private:
{
// Fuse points if their text representations differ only in last digit
auto min_diff = 10 * (points[left].mean + points[right].mean) * std::numeric_limits<Mean>::epsilon();
if (points[left].mean + min_diff >= points[right].mean)
if (points[left].mean + std::fabs(min_diff) >= points[right].mean)
{
points[left] = points[left] + points[right];
}

View File

@ -232,6 +232,9 @@ struct NameQuantilesExactInclusive { static constexpr auto name = "quantilesExac
struct NameQuantileExactWeighted { static constexpr auto name = "quantileExactWeighted"; };
struct NameQuantilesExactWeighted { static constexpr auto name = "quantilesExactWeighted"; };
struct NameQuantileInterpolatedWeighted { static constexpr auto name = "quantileInterpolatedWeighted"; };
struct NameQuantilesInterpolatedWeighted { static constexpr auto name = "quantilesInterpolatedWeighted"; };
struct NameQuantileTiming { static constexpr auto name = "quantileTiming"; };
struct NameQuantileTimingWeighted { static constexpr auto name = "quantileTimingWeighted"; };
struct NameQuantilesTiming { static constexpr auto name = "quantilesTiming"; };

View File

@ -0,0 +1,70 @@
#include <AggregateFunctions/AggregateFunctionQuantile.h>
#include <AggregateFunctions/QuantileInterpolatedWeighted.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Helpers.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <Core/Field.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
namespace
{
template <typename Value, bool _> using FuncQuantileInterpolatedWeighted = AggregateFunctionQuantile<Value, QuantileInterpolatedWeighted<Value>, NameQuantileInterpolatedWeighted, true, void, false>;
template <typename Value, bool _> using FuncQuantilesInterpolatedWeighted = AggregateFunctionQuantile<Value, QuantileInterpolatedWeighted<Value>, NameQuantilesInterpolatedWeighted, true, void, true>;
template <template <typename, bool> class Function>
AggregateFunctionPtr createAggregateFunctionQuantile(
const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *)
{
/// Second argument type check doesn't depend on the type of the first one.
Function<void, true>::assertSecondArg(argument_types);
const DataTypePtr & argument_type = argument_types[0];
WhichDataType which(argument_type);
#define DISPATCH(TYPE) \
if (which.idx == TypeIndex::TYPE) return std::make_shared<Function<TYPE, true>>(argument_types, params);
FOR_BASIC_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
if (which.idx == TypeIndex::Date) return std::make_shared<Function<DataTypeDate::FieldType, false>>(argument_types, params);
if (which.idx == TypeIndex::DateTime) return std::make_shared<Function<DataTypeDateTime::FieldType, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal32) return std::make_shared<Function<Decimal32, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal64) return std::make_shared<Function<Decimal64, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal128) return std::make_shared<Function<Decimal128, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal256) return std::make_shared<Function<Decimal256, false>>(argument_types, params);
if (which.idx == TypeIndex::DateTime64) return std::make_shared<Function<DateTime64, false>>(argument_types, params);
if (which.idx == TypeIndex::Int128) return std::make_shared<Function<Int128, true>>(argument_types, params);
if (which.idx == TypeIndex::UInt128) return std::make_shared<Function<UInt128, true>>(argument_types, params);
if (which.idx == TypeIndex::Int256) return std::make_shared<Function<Int256, true>>(argument_types, params);
if (which.idx == TypeIndex::UInt256) return std::make_shared<Function<UInt256, true>>(argument_types, params);
throw Exception("Illegal type " + argument_type->getName() + " of argument for aggregate function " + name,
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
}
void registerAggregateFunctionsQuantileInterpolatedWeighted(AggregateFunctionFactory & factory)
{
/// For aggregate functions returning array we cannot return NULL on empty set.
AggregateFunctionProperties properties = { .returns_default_when_only_null = true };
factory.registerFunction(NameQuantileInterpolatedWeighted::name, createAggregateFunctionQuantile<FuncQuantileInterpolatedWeighted>);
factory.registerFunction(NameQuantilesInterpolatedWeighted::name, { createAggregateFunctionQuantile<FuncQuantilesInterpolatedWeighted>, properties });
/// 'median' is an alias for 'quantile'
factory.registerAlias("medianInterpolatedWeighted", NameQuantileInterpolatedWeighted::name);
}
}

View File

@ -0,0 +1,308 @@
#pragma once
#include <base/sort.h>
#include <Common/HashTable/HashMap.h>
#include <Common/NaNUtils.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/** Approximates Quantile by:
* - sorting input values and weights
* - building a cumulative distribution based on weights
* - performing linear interpolation between the weights and values
*
*/
template <typename Value>
struct QuantileInterpolatedWeighted
{
struct Int128Hash
{
size_t operator()(Int128 x) const
{
return CityHash_v1_0_2::Hash128to64({x >> 64, x & 0xffffffffffffffffll});
}
};
using Weight = UInt64;
using UnderlyingType = NativeType<Value>;
using Hasher = std::conditional_t<std::is_same_v<Value, Decimal128>, Int128Hash, HashCRC32<UnderlyingType>>;
/// When creating, the hash table must be small.
using Map = HashMapWithStackMemory<UnderlyingType, Weight, Hasher, 4>;
Map map;
void add(const Value & x)
{
/// We must skip NaNs as they are not compatible with comparison sorting.
if (!isNaN(x))
++map[x];
}
void add(const Value & x, Weight weight)
{
if (!isNaN(x))
map[x] += weight;
}
void merge(const QuantileInterpolatedWeighted & rhs)
{
for (const auto & pair : rhs.map)
map[pair.getKey()] += pair.getMapped();
}
void serialize(WriteBuffer & buf) const
{
map.write(buf);
}
void deserialize(ReadBuffer & buf)
{
typename Map::Reader reader(buf);
while (reader.next())
{
const auto & pair = reader.get();
map[pair.first] = pair.second;
}
}
Value get(Float64 level) const
{
return getImpl<Value>(level);
}
void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result) const
{
getManyImpl<Value>(levels, indices, size, result);
}
/// The same, but in the case of an empty state, NaN is returned.
Float64 getFloat(Float64) const
{
throw Exception("Method getFloat is not implemented for QuantileInterpolatedWeighted", ErrorCodes::NOT_IMPLEMENTED);
}
void getManyFloat(const Float64 *, const size_t *, size_t, Float64 *) const
{
throw Exception("Method getManyFloat is not implemented for QuantileInterpolatedWeighted", ErrorCodes::NOT_IMPLEMENTED);
}
private:
using Pair = typename std::pair<UnderlyingType, Float64>;
/// Get the value of the `level` quantile. The level must be between 0 and 1.
template <typename T>
T getImpl(Float64 level) const
{
size_t size = map.size();
if (0 == size)
return std::numeric_limits<Value>::quiet_NaN();
/// Maintain a vector of pair of values and weights for easier sorting and for building
/// a cumulative distribution using the provided weights.
std::vector<Pair> value_weight_pairs;
value_weight_pairs.reserve(size);
/// Note: weight provided must be a 64-bit integer
/// Float64 is used as accumulator here to get approximate results.
/// But weight used in the internal array is stored as Float64 as we
/// do some quantile estimation operation which involves division and
/// require Float64 level of precision.
Float64 sum_weight = 0;
for (const auto & pair : map)
{
sum_weight += pair.getMapped();
auto value = pair.getKey();
auto weight = pair.getMapped();
value_weight_pairs.push_back({value, weight});
}
::sort(value_weight_pairs.begin(), value_weight_pairs.end(), [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 accumulated = 0;
/// vector for populating and storing the cumulative sum using the provided weights.
/// example: [0,1,2,3,4,5] -> [0,1,3,6,10,15]
std::vector<Float64> weights_cum_sum;
weights_cum_sum.reserve(size);
for (size_t idx = 0; idx < size; ++idx)
{
accumulated += value_weight_pairs[idx].second;
weights_cum_sum.push_back(accumulated);
}
/// The following estimation of quantile is general and the idea is:
/// https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method
/// calculates a simple cumulative distribution based on weights
if (sum_weight != 0)
{
for (size_t idx = 0; idx < size; ++idx)
value_weight_pairs[idx].second = (weights_cum_sum[idx] - 0.5 * value_weight_pairs[idx].second) / sum_weight;
}
/// perform linear interpolation
size_t idx = 0;
if (size >= 2)
{
if (level >= value_weight_pairs[size - 2].second)
{
idx = size - 2;
}
else
{
size_t start = 0, end = size - 1;
while (start <= end)
{
size_t mid = start + (end - start) / 2;
if (mid > size)
break;
if (level > value_weight_pairs[mid + 1].second)
start = mid + 1;
else
{
idx = mid;
end = mid - 1;
}
}
}
}
size_t l = idx;
size_t u = idx + 1 < size ? idx + 1 : idx;
Float64 xl = value_weight_pairs[l].second, xr = value_weight_pairs[u].second;
UnderlyingType yl = value_weight_pairs[l].first, yr = value_weight_pairs[u].first;
if (level < xl)
yr = yl;
if (level > xr)
yl = yr;
return static_cast<T>(interpolate(level, xl, xr, yl, yr));
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of index levels such that the corresponding elements will go in ascending order.
template <typename T>
void getManyImpl(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const
{
size_t size = map.size();
if (0 == size)
{
for (size_t i = 0; i < num_levels; ++i)
result[i] = Value();
return;
}
std::vector<Pair> value_weight_pairs;
value_weight_pairs.reserve(size);
Float64 sum_weight = 0;
for (const auto & pair : map)
{
sum_weight += pair.getMapped();
auto value = pair.getKey();
auto weight = pair.getMapped();
value_weight_pairs.push_back({value, weight});
}
::sort(value_weight_pairs.begin(), value_weight_pairs.end(), [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 accumulated = 0;
/// vector for populating and storing the cumulative sum using the provided weights.
/// example: [0,1,2,3,4,5] -> [0,1,3,6,10,15]
std::vector<Float64> weights_cum_sum;
weights_cum_sum.reserve(size);
for (size_t idx = 0; idx < size; ++idx)
{
accumulated += value_weight_pairs[idx].second;
weights_cum_sum.emplace_back(accumulated);
}
/// The following estimation of quantile is general and the idea is:
/// https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method
/// calculates a simple cumulative distribution based on weights
if (sum_weight != 0)
{
for (size_t idx = 0; idx < size; ++idx)
value_weight_pairs[idx].second = (weights_cum_sum[idx] - 0.5 * value_weight_pairs[idx].second) / sum_weight;
}
for (size_t level_index = 0; level_index < num_levels; ++level_index)
{
/// perform linear interpolation for every level
auto level = levels[indices[level_index]];
size_t idx = 0;
if (size >= 2)
{
if (level >= value_weight_pairs[size - 2].second)
{
idx = size - 2;
}
else
{
size_t start = 0, end = size - 1;
while (start <= end)
{
size_t mid = start + (end - start) / 2;
if (mid > size)
break;
if (level > value_weight_pairs[mid + 1].second)
start = mid + 1;
else
{
idx = mid;
end = mid - 1;
}
}
}
}
size_t l = idx;
size_t u = idx + 1 < size ? idx + 1 : idx;
Float64 xl = value_weight_pairs[l].second, xr = value_weight_pairs[u].second;
UnderlyingType yl = value_weight_pairs[l].first, yr = value_weight_pairs[u].first;
if (level < xl)
yr = yl;
if (level > xr)
yl = yr;
result[indices[level_index]] = static_cast<T>(interpolate(level, xl, xr, yl, yr));
}
}
/// This ignores overflows or NaN's that might arise during add, sub and mul operations and doesn't aim to provide exact
/// results since the `quantileInterpolatedWeighted` function itself relies mainly on approximation.
UnderlyingType NO_SANITIZE_UNDEFINED interpolate(Float64 level, Float64 xl, Float64 xr, UnderlyingType yl, UnderlyingType yr) const
{
UnderlyingType dy = yr - yl;
Float64 dx = xr - xl;
dx = dx == 0 ? 1 : dx; /// to handle NaN behavior that might arise during integer division below.
/// yl + (dy / dx) * (level - xl)
return static_cast<UnderlyingType>(yl + (dy / dx) * (level - xl));
}
};
}

View File

@ -21,6 +21,7 @@ void registerAggregateFunctionsQuantile(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileDeterministic(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileExact(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileExactWeighted(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileInterpolatedWeighted(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileExactLow(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileExactHigh(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileExactInclusive(AggregateFunctionFactory &);
@ -106,6 +107,7 @@ void registerAggregateFunctions()
registerAggregateFunctionsQuantileDeterministic(factory);
registerAggregateFunctionsQuantileExact(factory);
registerAggregateFunctionsQuantileExactWeighted(factory);
registerAggregateFunctionsQuantileInterpolatedWeighted(factory);
registerAggregateFunctionsQuantileExactLow(factory);
registerAggregateFunctionsQuantileExactHigh(factory);
registerAggregateFunctionsQuantileExactInclusive(factory);

View File

@ -11,6 +11,7 @@
#include <Parsers/ASTQualifiedAsterisk.h>
#include <Parsers/ASTColumnsMatcher.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTColumnsTransformers.h>
namespace DB
{
@ -206,19 +207,43 @@ QueryTreeNodePtr MatcherNode::cloneImpl() const
ASTPtr MatcherNode::toASTImpl() const
{
ASTPtr result;
ASTPtr transformers;
if (!children.empty())
{
transformers = std::make_shared<ASTColumnsTransformerList>();
for (const auto & child : children)
transformers->children.push_back(child->toAST());
}
if (matcher_type == MatcherNodeType::ASTERISK)
{
if (qualified_identifier.empty())
{
result = std::make_shared<ASTAsterisk>();
auto asterisk = std::make_shared<ASTAsterisk>();
if (transformers)
{
asterisk->transformers = std::move(transformers);
asterisk->children.push_back(asterisk->transformers);
}
result = asterisk;
}
else
{
auto qualified_asterisk = std::make_shared<ASTQualifiedAsterisk>();
auto identifier_parts = qualified_identifier.getParts();
qualified_asterisk->children.push_back(std::make_shared<ASTIdentifier>(std::move(identifier_parts)));
qualified_asterisk->qualifier = std::make_shared<ASTIdentifier>(std::move(identifier_parts));
qualified_asterisk->children.push_back(qualified_asterisk->qualifier);
if (transformers)
{
qualified_asterisk->transformers = std::move(transformers);
qualified_asterisk->children.push_back(qualified_asterisk->transformers);
}
result = qualified_asterisk;
}
@ -229,6 +254,13 @@ ASTPtr MatcherNode::toASTImpl() const
{
auto regexp_matcher = std::make_shared<ASTColumnsRegexpMatcher>();
regexp_matcher->setPattern(columns_matcher->pattern());
if (transformers)
{
regexp_matcher->transformers = std::move(transformers);
regexp_matcher->children.push_back(regexp_matcher->transformers);
}
result = regexp_matcher;
}
else
@ -237,7 +269,14 @@ ASTPtr MatcherNode::toASTImpl() const
regexp_matcher->setPattern(columns_matcher->pattern());
auto identifier_parts = qualified_identifier.getParts();
regexp_matcher->children.push_back(std::make_shared<ASTIdentifier>(std::move(identifier_parts)));
regexp_matcher->qualifier = std::make_shared<ASTIdentifier>(std::move(identifier_parts));
regexp_matcher->children.push_back(regexp_matcher->qualifier);
if (transformers)
{
regexp_matcher->transformers = std::move(transformers);
regexp_matcher->children.push_back(regexp_matcher->transformers);
}
result = regexp_matcher;
}
@ -257,23 +296,36 @@ ASTPtr MatcherNode::toASTImpl() const
{
auto columns_list_matcher = std::make_shared<ASTColumnsListMatcher>();
columns_list_matcher->column_list = std::move(column_list);
columns_list_matcher->children.push_back(columns_list_matcher->column_list);
if (transformers)
{
columns_list_matcher->transformers = std::move(transformers);
columns_list_matcher->children.push_back(columns_list_matcher->transformers);
}
result = columns_list_matcher;
}
else
{
auto columns_list_matcher = std::make_shared<ASTQualifiedColumnsListMatcher>();
columns_list_matcher->column_list = std::move(column_list);
auto identifier_parts = qualified_identifier.getParts();
columns_list_matcher->children.push_back(std::make_shared<ASTIdentifier>(std::move(identifier_parts)));
columns_list_matcher->qualifier = std::make_shared<ASTIdentifier>(std::move(identifier_parts));
columns_list_matcher->column_list = std::move(column_list);
columns_list_matcher->children.push_back(columns_list_matcher->qualifier);
columns_list_matcher->children.push_back(columns_list_matcher->column_list);
if (transformers)
{
columns_list_matcher->transformers = std::move(transformers);
columns_list_matcher->children.push_back(columns_list_matcher->transformers);
}
result = columns_list_matcher;
}
}
for (const auto & child : children)
result->children.push_back(child->toAST());
return result;
}

View File

@ -73,7 +73,7 @@ public:
if (!inner_function_node)
return;
auto & inner_function_arguments_nodes = inner_function_node->getArguments().getNodes();
const auto & inner_function_arguments_nodes = inner_function_node->getArguments().getNodes();
if (inner_function_arguments_nodes.size() != 2)
return;
@ -119,13 +119,15 @@ public:
{
lower_function_name = function_name_if_constant_is_negative;
}
resolveAggregateFunctionNode(*aggregate_function_node, inner_function_arguments_nodes[1], lower_function_name);
auto inner_function = aggregate_function_arguments_nodes[0];
auto inner_function_right_argument = std::move(inner_function_arguments_nodes[1]);
aggregate_function_arguments_nodes = {inner_function_right_argument};
inner_function_arguments_nodes[1] = node;
node = std::move(inner_function);
auto inner_function_clone = inner_function_node->clone();
auto & inner_function_clone_arguments = inner_function_clone->as<FunctionNode &>().getArguments();
auto & inner_function_clone_arguments_nodes = inner_function_clone_arguments.getNodes();
auto inner_function_clone_right_argument = inner_function_clone_arguments_nodes[1];
aggregate_function_arguments_nodes = {inner_function_clone_right_argument};
resolveAggregateFunctionNode(*aggregate_function_node, inner_function_clone_right_argument, lower_function_name);
inner_function_clone_arguments_nodes[1] = node;
node = std::move(inner_function_clone);
}
else if (right_argument_constant_node)
{
@ -136,18 +138,20 @@ public:
{
lower_function_name = function_name_if_constant_is_negative;
}
resolveAggregateFunctionNode(*aggregate_function_node, inner_function_arguments_nodes[0], function_name_if_constant_is_negative);
auto inner_function = aggregate_function_arguments_nodes[0];
auto inner_function_left_argument = std::move(inner_function_arguments_nodes[0]);
aggregate_function_arguments_nodes = {inner_function_left_argument};
inner_function_arguments_nodes[0] = node;
node = std::move(inner_function);
auto inner_function_clone = inner_function_node->clone();
auto & inner_function_clone_arguments = inner_function_clone->as<FunctionNode &>().getArguments();
auto & inner_function_clone_arguments_nodes = inner_function_clone_arguments.getNodes();
auto inner_function_clone_left_argument = inner_function_clone_arguments_nodes[0];
aggregate_function_arguments_nodes = {inner_function_clone_left_argument};
resolveAggregateFunctionNode(*aggregate_function_node, inner_function_clone_left_argument, lower_function_name);
inner_function_clone_arguments_nodes[0] = node;
node = std::move(inner_function_clone);
}
}
private:
static inline void resolveAggregateFunctionNode(FunctionNode & function_node, QueryTreeNodePtr & argument, const String & aggregate_function_name)
static inline void resolveAggregateFunctionNode(FunctionNode & function_node, const QueryTreeNodePtr & argument, const String & aggregate_function_name)
{
auto function_aggregate_function = function_node.getAggregateFunction();

View File

@ -0,0 +1,124 @@
#include <Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h>
#include <Analyzer/ColumnNode.h>
#include <Analyzer/FunctionNode.h>
#include <Analyzer/HashUtils.h>
#include <Analyzer/InDepthQueryTreeVisitor.h>
#include <Analyzer/QueryNode.h>
#include <Analyzer/SortNode.h>
#include <Functions/IFunction.h>
namespace DB
{
namespace
{
class OptimizeRedundantFunctionsInOrderByVisitor : public InDepthQueryTreeVisitor<OptimizeRedundantFunctionsInOrderByVisitor>
{
public:
static bool needChildVisit(QueryTreeNodePtr & node, QueryTreeNodePtr & /*parent*/)
{
if (node->as<FunctionNode>())
return false;
return true;
}
void visitImpl(QueryTreeNodePtr & node)
{
auto * query = node->as<QueryNode>();
if (!query)
return;
if (!query->hasOrderBy())
return;
auto & order_by = query->getOrderBy();
for (auto & elem : order_by.getNodes())
{
auto * order_by_elem = elem->as<SortNode>();
if (order_by_elem->withFill())
return;
}
QueryTreeNodes new_order_by_nodes;
new_order_by_nodes.reserve(order_by.getNodes().size());
for (auto & elem : order_by.getNodes())
{
auto & order_by_expr = elem->as<SortNode>()->getExpression();
switch (order_by_expr->getNodeType())
{
case QueryTreeNodeType::FUNCTION:
{
if (isRedundantExpression(order_by_expr))
continue;
break;
}
case QueryTreeNodeType::COLUMN:
{
existing_keys.insert(order_by_expr);
break;
}
default:
break;
}
new_order_by_nodes.push_back(elem);
}
existing_keys.clear();
if (new_order_by_nodes.size() < order_by.getNodes().size())
order_by.getNodes() = std::move(new_order_by_nodes);
}
private:
QueryTreeNodePtrWithHashSet existing_keys;
bool isRedundantExpression(QueryTreeNodePtr function)
{
QueryTreeNodes nodes_to_process{ function };
while (!nodes_to_process.empty())
{
auto node = nodes_to_process.back();
nodes_to_process.pop_back();
// TODO: handle constants here
switch (node->getNodeType())
{
case QueryTreeNodeType::FUNCTION:
{
auto * function_node = node->as<FunctionNode>();
const auto & function_arguments = function_node->getArguments().getNodes();
if (function_arguments.empty())
return false;
const auto & function_base = function_node->getFunction();
if (!function_base || !function_base->isDeterministicInScopeOfQuery())
return false;
// Process arguments in order
for (auto it = function_arguments.rbegin(); it != function_arguments.rend(); ++it)
nodes_to_process.push_back(*it);
break;
}
case QueryTreeNodeType::COLUMN:
{
if (!existing_keys.contains(node))
return false;
break;
}
default:
return false;
}
}
return true;
}
};
}
void OptimizeRedundantFunctionsInOrderByPass::run(QueryTreeNodePtr query_tree_node, ContextPtr /*context*/)
{
OptimizeRedundantFunctionsInOrderByVisitor().visit(query_tree_node);
}
}

View File

@ -0,0 +1,23 @@
#pragma once
#include <Analyzer/IQueryTreePass.h>
namespace DB
{
/** If ORDER BY has argument x followed by f(x), transform it to ORDER BY x.
* Optimize ORDER BY x, y, f(x), g(x, y), f(h(x)), t(f(x), g(x)) into ORDER BY x, y
* in case f(), g(), h(), t() are deterministic (in scope of query).
* Don't optimize ORDER BY f(x), g(x), x even if f(x) is a bijection for x or g(x).
*/
class OptimizeRedundantFunctionsInOrderByPass final : public IQueryTreePass
{
public:
String getName() override { return "OptimizeRedundantFunctionsInOrderBy"; }
String getDescription() override { return "If ORDER BY has argument x followed by f(x), transform it to ORDER BY x."; }
void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override;
};
}

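To see the algorithm of this pass in isolation: it walks the ORDER BY keys left to right, remembers plain columns, and drops any deterministic function expression whose column leaves were all seen earlier. A self-contained toy (not ClickHouse code; constants and non-deterministic functions are out of scope here, as in the TODO above):

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Expr
{
    std::string name;           // column name, or function name
    std::vector<Expr> args;     // empty for a plain column
    bool deterministic;         // assumed known per function
};

// True if every column leaf of `e` is already an ORDER BY key.
bool isRedundant(const Expr & e, const std::set<std::string> & keys)
{
    if (e.args.empty())
        return keys.count(e.name) > 0;
    if (!e.deterministic)
        return false;
    for (const auto & arg : e.args)
        if (!isRedundant(arg, keys))
            return false;
    return true;
}

int main()
{
    // ORDER BY x, y, f(x), g(x, y)  ->  ORDER BY x, y
    std::vector<Expr> order_by = {
        {"x", {}, true}, {"y", {}, true},
        {"f", {{"x", {}, true}}, true},
        {"g", {{"x", {}, true}, {"y", {}, true}}, true},
    };
    std::set<std::string> keys;
    std::vector<Expr> kept;
    for (const auto & e : order_by)
    {
        if (!e.args.empty() && isRedundant(e, keys))
            continue;
        if (e.args.empty())
            keys.insert(e.name);
        kept.push_back(e);
    }
    for (const auto & e : kept)
        std::cout << e.name << ' ';  // prints: x y
    std::cout << '\n';
}
```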
View File

@ -77,11 +77,11 @@ public:
if (!nested_function || nested_function->getFunctionName() != "if")
return;
auto & nested_if_function_arguments_nodes = nested_function->getArguments().getNodes();
const auto & nested_if_function_arguments_nodes = nested_function->getArguments().getNodes();
if (nested_if_function_arguments_nodes.size() != 3)
return;
auto & cond_argument = nested_if_function_arguments_nodes[0];
const auto & cond_argument = nested_if_function_arguments_nodes[0];
const auto * if_true_condition_constant_node = nested_if_function_arguments_nodes[1]->as<ConstantNode>();
const auto * if_false_condition_constant_node = nested_if_function_arguments_nodes[2]->as<ConstantNode>();
@ -101,7 +101,7 @@ public:
/// Rewrite `sum(if(cond, 1, 0))` into `countIf(cond)`.
if (if_true_condition_value == 1 && if_false_condition_value == 0)
{
function_node_arguments_nodes[0] = std::move(nested_if_function_arguments_nodes[0]);
function_node_arguments_nodes[0] = nested_if_function_arguments_nodes[0];
function_node_arguments_nodes.resize(1);
resolveAsCountIfAggregateFunction(*function_node, function_node_arguments_nodes[0]->getResultType());
@ -120,7 +120,7 @@ public:
auto not_function = std::make_shared<FunctionNode>("not");
auto & not_function_arguments = not_function->getArguments().getNodes();
not_function_arguments.push_back(std::move(nested_if_function_arguments_nodes[0]));
not_function_arguments.push_back(nested_if_function_arguments_nodes[0]);
not_function->resolveAsFunction(FunctionFactory::instance().get("not", context)->build(not_function->getArgumentColumns()));

View File

@ -111,7 +111,7 @@ private:
QueryTreeNodePtr buildJoinTree(const ASTPtr & tables_in_select_query, const ContextPtr & context) const;
ColumnTransformersNodes buildColumnTransformers(const ASTPtr & matcher_expression, size_t start_child_index, const ContextPtr & context) const;
ColumnTransformersNodes buildColumnTransformers(const ASTPtr & matcher_expression, const ContextPtr & context) const;
ASTPtr query;
QueryTreeNodePtr query_tree_node;
@ -439,13 +439,13 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
}
else if (const auto * asterisk = expression->as<ASTAsterisk>())
{
auto column_transformers = buildColumnTransformers(expression, 0 /*start_child_index*/, context);
auto column_transformers = buildColumnTransformers(asterisk->transformers, context);
result = std::make_shared<MatcherNode>(std::move(column_transformers));
}
else if (const auto * qualified_asterisk = expression->as<ASTQualifiedAsterisk>())
{
auto & qualified_identifier = qualified_asterisk->children.at(0)->as<ASTTableIdentifier &>();
auto column_transformers = buildColumnTransformers(expression, 1 /*start_child_index*/, context);
auto & qualified_identifier = qualified_asterisk->qualifier->as<ASTIdentifier &>();
auto column_transformers = buildColumnTransformers(qualified_asterisk->transformers, context);
result = std::make_shared<MatcherNode>(Identifier(qualified_identifier.name_parts), std::move(column_transformers));
}
else if (const auto * ast_literal = expression->as<ASTLiteral>())
@ -543,7 +543,7 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
}
else if (const auto * columns_regexp_matcher = expression->as<ASTColumnsRegexpMatcher>())
{
auto column_transformers = buildColumnTransformers(expression, 0 /*start_child_index*/, context);
auto column_transformers = buildColumnTransformers(columns_regexp_matcher->transformers, context);
result = std::make_shared<MatcherNode>(columns_regexp_matcher->getMatcher(), std::move(column_transformers));
}
else if (const auto * columns_list_matcher = expression->as<ASTColumnsListMatcher>())
@ -557,18 +557,18 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
column_list_identifiers.emplace_back(Identifier{column_list_identifier.name_parts});
}
auto column_transformers = buildColumnTransformers(expression, 0 /*start_child_index*/, context);
auto column_transformers = buildColumnTransformers(columns_list_matcher->transformers, context);
result = std::make_shared<MatcherNode>(std::move(column_list_identifiers), std::move(column_transformers));
}
else if (const auto * qualified_columns_regexp_matcher = expression->as<ASTQualifiedColumnsRegexpMatcher>())
{
auto & qualified_identifier = qualified_columns_regexp_matcher->children.at(0)->as<ASTTableIdentifier &>();
auto column_transformers = buildColumnTransformers(expression, 1 /*start_child_index*/, context);
auto & qualified_identifier = qualified_columns_regexp_matcher->qualifier->as<ASTIdentifier &>();
auto column_transformers = buildColumnTransformers(qualified_columns_regexp_matcher->transformers, context);
result = std::make_shared<MatcherNode>(Identifier(qualified_identifier.name_parts), qualified_columns_regexp_matcher->getMatcher(), std::move(column_transformers));
}
else if (const auto * qualified_columns_list_matcher = expression->as<ASTQualifiedColumnsListMatcher>())
{
auto & qualified_identifier = qualified_columns_list_matcher->children.at(0)->as<ASTTableIdentifier &>();
auto & qualified_identifier = qualified_columns_list_matcher->qualifier->as<ASTIdentifier &>();
Identifiers column_list_identifiers;
column_list_identifiers.reserve(qualified_columns_list_matcher->column_list->children.size());
@ -579,7 +579,7 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
column_list_identifiers.emplace_back(Identifier{column_list_identifier.name_parts});
}
auto column_transformers = buildColumnTransformers(expression, 1 /*start_child_index*/, context);
auto column_transformers = buildColumnTransformers(qualified_columns_list_matcher->transformers, context);
result = std::make_shared<MatcherNode>(Identifier(qualified_identifier.name_parts), std::move(column_list_identifiers), std::move(column_transformers));
}
else
@ -833,15 +833,15 @@ QueryTreeNodePtr QueryTreeBuilder::buildJoinTree(const ASTPtr & tables_in_select
}
ColumnTransformersNodes QueryTreeBuilder::buildColumnTransformers(const ASTPtr & matcher_expression, size_t start_child_index, const ContextPtr & context) const
ColumnTransformersNodes QueryTreeBuilder::buildColumnTransformers(const ASTPtr & matcher_expression, const ContextPtr & context) const
{
ColumnTransformersNodes column_transformers;
size_t children_size = matcher_expression->children.size();
for (; start_child_index < children_size; ++start_child_index)
if (!matcher_expression)
return column_transformers;
for (const auto & child : matcher_expression->children)
{
const auto & child = matcher_expression->children[start_child_index];
if (auto * apply_transformer = child->as<ASTColumnsApplyTransformer>())
{
if (apply_transformer->lambda)

View File

@ -15,6 +15,7 @@
#include <Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.h>
#include <Analyzer/Passes/FuseFunctionsPass.h>
#include <Analyzer/Passes/IfTransformStringsToEnumPass.h>
#include <Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h>
#include <IO/WriteHelpers.h>
#include <IO/Operators.h>
@ -91,7 +92,6 @@ public:
* TODO: Support setting optimize_move_functions_out_of_any.
* TODO: Support setting optimize_aggregators_of_group_by_keys.
* TODO: Support setting optimize_duplicate_order_by_and_distinct.
* TODO: Support setting optimize_redundant_functions_in_order_by.
* TODO: Support setting optimize_monotonous_functions_in_order_by.
* TODO: Support settings.optimize_or_like_chain.
* TODO: Add optimizations based on function semantics. Example: SELECT * FROM test_table WHERE id != id. (id is not nullable column).
@ -203,6 +203,9 @@ void addQueryTreePasses(QueryTreePassManager & manager)
if (settings.optimize_if_chain_to_multiif)
manager.addPass(std::make_unique<IfChainToMultiIfPass>());
if (settings.optimize_redundant_functions_in_order_by)
manager.addPass(std::make_unique<OptimizeRedundantFunctionsInOrderByPass>());
manager.addPass(std::make_unique<OrderByTupleEliminationPass>());
manager.addPass(std::make_unique<OrderByLimitByDuplicateEliminationPass>());

View File

@ -156,10 +156,9 @@ void BackupWriterS3::copyObjectImpl(
const String & src_key,
const String & dst_bucket,
const String & dst_key,
const Aws::S3::Model::HeadObjectResult & head,
size_t size,
const std::optional<ObjectAttributes> & metadata) const
{
size_t size = head.GetContentLength();
LOG_TRACE(log, "Copying {} bytes using single-operation copy", size);
Aws::S3::Model::CopyObjectRequest request;
@ -177,7 +176,7 @@ void BackupWriterS3::copyObjectImpl(
if (!outcome.IsSuccess() && (outcome.GetError().GetExceptionName() == "EntityTooLarge"
|| outcome.GetError().GetExceptionName() == "InvalidRequest"))
{ // Can't come here with MinIO, MinIO allows single part upload for large objects.
copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, head, metadata);
copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, size, metadata);
return;
}
@ -191,10 +190,9 @@ void BackupWriterS3::copyObjectMultipartImpl(
const String & src_key,
const String & dst_bucket,
const String & dst_key,
const Aws::S3::Model::HeadObjectResult & head,
size_t size,
const std::optional<ObjectAttributes> & metadata) const
{
size_t size = head.GetContentLength();
LOG_TRACE(log, "Copying {} bytes using multipart upload copy", size);
String multipart_upload_id;
@ -309,16 +307,16 @@ void BackupWriterS3::copyFileNative(DiskPtr from_disk, const String & file_name_
std::string source_bucket = object_storage->getObjectsNamespace();
auto file_path = fs::path(s3_uri.key) / file_name_to;
auto head = S3::headObject(*client, source_bucket, objects[0].absolute_path).GetResult();
if (static_cast<size_t>(head.GetContentLength()) < request_settings.getUploadSettings().max_single_operation_copy_size)
auto size = S3::getObjectSize(*client, source_bucket, objects[0].absolute_path);
if (size < request_settings.getUploadSettings().max_single_operation_copy_size)
{
copyObjectImpl(
source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, head);
source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, size);
}
else
{
copyObjectMultipartImpl(
source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, head);
source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, size);
}
}
}

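The `copyFileNative` dispatch above now keys off a plain object size instead of a `HeadObjectResult`. A standalone sketch of the choice (the 5 GiB constant is the AWS single-operation CopyObject limit, used here purely for illustration; the real code reads the threshold from request settings):

```cpp
#include <cstddef>
#include <iostream>

constexpr size_t max_single_operation_copy_size = 5ULL * 1024 * 1024 * 1024;

// Sketch only: small objects go through one CopyObject call, large ones through
// a multipart upload copy.
const char * chooseCopyStrategy(size_t object_size)
{
    return object_size < max_single_operation_copy_size
        ? "single-operation copy"
        : "multipart upload copy";
}

int main()
{
    std::cout << chooseCopyStrategy(1024) << '\n';                       // single-operation copy
    std::cout << chooseCopyStrategy(6ULL * 1024 * 1024 * 1024) << '\n';  // multipart upload copy
}
```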
View File

@ -67,7 +67,7 @@ private:
const String & src_key,
const String & dst_bucket,
const String & dst_key,
const Aws::S3::Model::HeadObjectResult & head,
size_t size,
const std::optional<ObjectAttributes> & metadata = std::nullopt) const;
void copyObjectMultipartImpl(
@ -75,7 +75,7 @@ private:
const String & src_key,
const String & dst_bucket,
const String & dst_key,
const Aws::S3::Model::HeadObjectResult & head,
size_t size,
const std::optional<ObjectAttributes> & metadata = std::nullopt) const;
void removeFilesBatch(const Strings & file_names);

View File

@ -309,6 +309,8 @@ The server successfully detected this situation and will download merged part fr
M(S3CopyObject, "Number of S3 API CopyObject calls.") \
M(S3ListObjects, "Number of S3 API ListObjects calls.") \
M(S3HeadObject, "Number of S3 API HeadObject calls.") \
M(S3GetObjectAttributes, "Number of S3 API GetObjectAttributes calls.") \
M(S3GetObjectMetadata, "Number of S3 API GetObject calls for getting metadata.") \
M(S3CreateMultipartUpload, "Number of S3 API CreateMultipartUpload calls.") \
M(S3UploadPartCopy, "Number of S3 API UploadPartCopy calls.") \
M(S3UploadPart, "Number of S3 API UploadPart calls.") \
@ -321,6 +323,8 @@ The server successfully detected this situation and will download merged part fr
M(DiskS3CopyObject, "Number of DiskS3 API CopyObject calls.") \
M(DiskS3ListObjects, "Number of DiskS3 API ListObjects calls.") \
M(DiskS3HeadObject, "Number of DiskS3 API HeadObject calls.") \
M(DiskS3GetObjectAttributes, "Number of DiskS3 API GetObjectAttributes calls.") \
M(DiskS3GetObjectMetadata, "Number of DiskS3 API GetObject calls for getting metadata.") \
M(DiskS3CreateMultipartUpload, "Number of DiskS3 API CreateMultipartUpload calls.") \
M(DiskS3UploadPartCopy, "Number of DiskS3 API UploadPartCopy calls.") \
M(DiskS3UploadPart, "Number of DiskS3 API UploadPart calls.") \
@ -449,7 +453,8 @@ The server successfully detected this situation and will download merged part fr
M(OverflowBreak, "Number of times, data processing was cancelled by query complexity limitation with setting '*_overflow_mode' = 'break' and the result is incomplete.") \
M(OverflowThrow, "Number of times, data processing was cancelled by query complexity limitation with setting '*_overflow_mode' = 'throw' and exception was thrown.") \
M(OverflowAny, "Number of times approximate GROUP BY was in effect: when aggregation was performed only on top of first 'max_rows_to_group_by' unique keys and other keys were ignored due to 'group_by_overflow_mode' = 'any'.") \
\
M(ServerStartupMilliseconds, "Time elapsed from starting server to listening to sockets in milliseconds")\
namespace ProfileEvents
{

View File

@ -7,6 +7,29 @@
#include <Poco/Message.h>
#include <Common/CurrentThread.h>
/// This wrapper is useful to save a formatted message into a String before sending it to a logger
class LogToStrImpl
{
String & out_str;
Poco::Logger * logger;
bool propagate_to_actual_log = true;
public:
LogToStrImpl(String & out_str_, Poco::Logger * logger_) : out_str(out_str_) , logger(logger_) {}
LogToStrImpl & operator -> () { return *this; }
bool is(Poco::Message::Priority priority) { propagate_to_actual_log &= logger->is(priority); return true; }
LogToStrImpl * getChannel() {return this; }
const String & name() const { return logger->name(); }
void log(const Poco::Message & message)
{
out_str = message.getText();
if (!propagate_to_actual_log)
return;
if (auto * channel = logger->getChannel())
channel->log(message);
}
};
#define LogToStr(x, y) std::make_unique<LogToStrImpl>(x, y)
namespace
{
@ -17,8 +40,37 @@ namespace
[[maybe_unused]] const ::Poco::Logger * getLogger(const ::Poco::Logger * logger) { return logger; };
[[maybe_unused]] const ::Poco::Logger * getLogger(const std::atomic<::Poco::Logger *> & logger) { return logger.load(); };
[[maybe_unused]] std::unique_ptr<LogToStrImpl> getLogger(std::unique_ptr<LogToStrImpl> && logger) { return logger; };
template<typename T> struct is_fmt_runtime : std::false_type {};
template<typename T> struct is_fmt_runtime<fmt::basic_runtime<T>> : std::true_type {};
/// Usually we use LOG_*(...) macros with either string literals or fmt::runtime(whatever) as a format string.
/// This function is useful to get a string_view to a static format string passed to LOG_* macro.
template <typename T> constexpr std::string_view tryGetStaticFormatString(T && x)
{
if constexpr (is_fmt_runtime<T>::value)
{
/// It definitely was fmt::runtime(something).
/// We are not sure about a lifetime of the string, so return empty view.
/// Also it can be an arbitrary string, not a formatting pattern.
/// So returning empty pattern will not pollute the set of patterns.
return std::string_view();
}
else
{
/// Most likely it was a string literal.
/// Unfortunately, there's no good way to check if something is a string literal.
/// But fmtlib requires a format string to be compile-time constant unless fmt::runtime is used.
static_assert(std::is_nothrow_convertible<T, const char * const>::value);
static_assert(!std::is_pointer<T>::value);
return std::string_view(x);
}
}
}
#define LOG_IMPL_FIRST_ARG(X, ...) X
/// Logs a message to a specified logger with that level.
/// If more than one argument is provided,
/// the first argument is interpreted as template with {}-substitutions
@ -30,7 +82,7 @@ namespace
auto _logger = ::getLogger(logger); \
const bool _is_clients_log = (DB::CurrentThread::getGroup() != nullptr) && \
(DB::CurrentThread::getGroup()->client_logs_level >= (priority)); \
if (_logger->is((PRIORITY)) || _is_clients_log) \
if (_is_clients_log || _logger->is((PRIORITY))) \
{ \
std::string formatted_message = numArgs(__VA_ARGS__) > 1 ? fmt::format(__VA_ARGS__) : firstArg(__VA_ARGS__); \
if (auto _channel = _logger->getChannel()) \
@ -40,7 +92,7 @@ namespace
file_function += "; "; \
file_function += __PRETTY_FUNCTION__; \
Poco::Message poco_message(_logger->name(), formatted_message, \
(PRIORITY), file_function.c_str(), __LINE__); \
(PRIORITY), file_function.c_str(), __LINE__, tryGetStaticFormatString(LOG_IMPL_FIRST_ARG(__VA_ARGS__))); \
_channel->log(poco_message); \
} \
} \

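A hypothetical usage fragment for the `LogToStrImpl` wrapper above (assuming the surrounding ClickHouse logging headers): the message is always formatted and captured into the string, and only propagated to the real channel when the level is enabled. The extra `Poco::Message` constructor argument added in the hunk above carries the static format string, apparently so that consumers can group log messages by pattern.

```cpp
#include <Common/logger_useful.h>

void example(Poco::Logger * log)
{
    String captured;
    /// LogToStr stands in for the logger; `captured` receives the formatted text
    /// even if WARNING is disabled for `log`.
    LOG_WARNING(LogToStr(captured, log), "Loaded {} parts", 42);
    /// captured == "Loaded 42 parts"
}
```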
View File

@ -773,6 +773,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \
M(Bool, input_format_json_read_numbers_as_strings, false, "Allow to parse numbers as strings in JSON input formats", 0) \
M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \
M(Bool, input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \

View File

@ -80,7 +80,8 @@ namespace SettingsChangesHistory
/// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
{
{"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"}}},
{"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"},
{"input_format_json_defaults_for_missing_elements_in_named_tuple", false, true, "Allow missing elements in JSON objects while reading named tuples by default"}}},
{"22.12", {{"max_size_to_preallocate_for_aggregation", 10'000'000, 100'000'000, "This optimizes performance"},
{"query_plan_aggregation_in_order", 0, 1, "Enable some refactoring around query plan"},
{"format_binary_max_string_size", 0, 1_GiB, "Prevent allocating large amount of memory"}}},

View File

@ -16,6 +16,7 @@ namespace ErrorCodes
{
extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH;
extern const int NOT_FOUND_COLUMN_IN_BLOCK;
extern const int INCORRECT_DATA;
}
@ -154,7 +155,7 @@ void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, co
void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
if (settings.json.named_tuples_as_objects
if (settings.json.write_named_tuples_as_objects
&& have_explicit_names)
{
writeChar('{', ostr);
@ -185,7 +186,7 @@ void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_nu
void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
if (settings.json.named_tuples_as_objects
if (settings.json.read_named_tuples_as_objects
&& have_explicit_names)
{
skipWhitespaceIfAny(istr);
@ -194,12 +195,15 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr
addElementSafe(elems.size(), column, [&]
{
// Require all elements but in arbitrary order.
for (size_t i = 0; i < elems.size(); ++i)
std::vector<UInt8> seen_elements(elems.size(), 0);
size_t i = 0;
while (!istr.eof() && *istr.position() != '}')
{
if (i == elems.size())
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. Expected no more than {}", elems.size());
if (i > 0)
{
skipWhitespaceIfAny(istr);
assertChar(',', istr);
skipWhitespaceIfAny(istr);
}
@ -211,12 +215,35 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr
skipWhitespaceIfAny(istr);
const size_t element_pos = getPositionByName(name);
seen_elements[element_pos] = 1;
auto & element_column = extractElementColumn(column, element_pos);
elems[element_pos]->deserializeTextJSON(element_column, istr, settings);
skipWhitespaceIfAny(istr);
++i;
}
skipWhitespaceIfAny(istr);
assertChar('}', istr);
/// Check if we have missing elements.
if (i != elems.size())
{
for (size_t element_pos = 0; element_pos != seen_elements.size(); ++element_pos)
{
if (seen_elements[element_pos])
continue;
if (!settings.json.defaults_for_missing_elements_in_named_tuple)
throw Exception(
ErrorCodes::INCORRECT_DATA,
"JSON object doesn't contain tuple element {}. If you want to insert defaults in case of missing elements, "
"enable setting input_format_json_defaults_for_missing_elements_in_named_tuple",
elems[element_pos]->getElementName());
auto & element_column = extractElementColumn(column, element_pos);
element_column.insertDefault();
}
}
});
}
else

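A standalone sketch (not ClickHouse code) of the bookkeeping above: record which tuple elements appear in the JSON object, then default-fill or reject the missing ones depending on the new setting.

```cpp
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> tuple_elements = {"a", "b", "c"};
    std::map<std::string, std::string> json_object = {{"a", "1"}, {"c", "3"}};
    bool defaults_for_missing = true;  // input_format_json_defaults_for_missing_elements_in_named_tuple

    for (const auto & element : tuple_elements)
    {
        auto it = json_object.find(element);
        if (it != json_object.end())
            std::cout << element << " = " << it->second << '\n';
        else if (defaults_for_missing)
            std::cout << element << " = <default>\n";  // element_column.insertDefault()
        else
            throw std::runtime_error("JSON object doesn't contain tuple element " + element);
    }
}
```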
View File

@ -2,6 +2,7 @@
#include <Dictionaries/getDictionaryConfigurationFromAST.h>
#include <Interpreters/Cluster.h>
#include <Interpreters/Context.h>
#include <Interpreters/misc.h>
#include <Interpreters/InDepthNodeVisitor.h>
#include <Interpreters/evaluateConstantExpression.h>
#include <Interpreters/getClusterName.h>
@ -175,7 +176,7 @@ namespace
/// Finds dependencies of a function.
void visitFunction(const ASTFunction & function)
{
if (function.name == "joinGet" || function.name == "dictHas" || function.name == "dictIsIn" || function.name.starts_with("dictGet"))
if (functionIsJoinGet(function.name) || functionIsDictGet(function.name))
{
/// dictGet('dict_name', attr_names, id_expr)
/// dictHas('dict_name', id_expr)

View File

@ -1,6 +1,7 @@
#include <Databases/DDLLoadingDependencyVisitor.h>
#include <Dictionaries/getDictionaryConfigurationFromAST.h>
#include <Interpreters/Context.h>
#include <Interpreters/misc.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
@ -52,23 +53,41 @@ bool DDLMatcherBase::needChildVisit(const ASTPtr & node, const ASTPtr & child)
return true;
}
ssize_t DDLMatcherBase::getPositionOfTableNameArgument(const ASTFunction & function)
ssize_t DDLMatcherBase::getPositionOfTableNameArgumentToEvaluate(const ASTFunction & function)
{
if (function.name == "joinGet" ||
function.name == "dictHas" ||
function.name == "dictIsIn" ||
function.name.starts_with("dictGet"))
if (functionIsJoinGet(function.name) || functionIsDictGet(function.name))
return 0;
if (Poco::toLower(function.name) == "in")
return -1;
}
ssize_t DDLMatcherBase::getPositionOfTableNameArgumentToVisit(const ASTFunction & function)
{
ssize_t maybe_res = getPositionOfTableNameArgumentToEvaluate(function);
if (0 <= maybe_res)
return maybe_res;
if (functionIsInOrGlobalInOperator(function.name))
{
if (function.children.empty())
return -1;
const auto * args = function.children[0]->as<ASTExpressionList>();
if (!args || args->children.size() != 2)
return -1;
if (args->children[1]->as<ASTFunction>())
return -1;
return 1;
}
return -1;
}
void DDLLoadingDependencyVisitor::visit(const ASTFunction & function, Data & data)
{
ssize_t table_name_arg_idx = getPositionOfTableNameArgument(function);
ssize_t table_name_arg_idx = getPositionOfTableNameArgumentToVisit(function);
if (table_name_arg_idx < 0)
return;
extractTableNameFromArgument(function, data, table_name_arg_idx);

View File
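Isolating the rule encoded above (a sketch; the exact name sets live in `Interpreters/misc.h`): `dictGet*`-style functions and `joinGet` carry a table name in argument 0, while the IN family carries one in argument 1 unless the right-hand side is itself a function, since an expression or subquery there is not a table dependency.

```cpp
#include <cstddef>
#include <string>

// Sketch only; returns -1 when there is no table-name argument to visit.
int tableNameArgumentPosition(const std::string & func, size_t num_args, bool rhs_is_function)
{
    if (func.rfind("dictGet", 0) == 0 || func == "dictHas" || func == "joinGet")
        return 0;
    if ((func == "in" || func == "notIn" || func == "globalIn" || func == "globalNotIn")
        && num_args == 2 && !rhs_is_function)
        return 1;
    return -1;
}
```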

@ -23,7 +23,8 @@ class DDLMatcherBase
{
public:
static bool needChildVisit(const ASTPtr & node, const ASTPtr & child);
static ssize_t getPositionOfTableNameArgument(const ASTFunction & function);
static ssize_t getPositionOfTableNameArgumentToVisit(const ASTFunction & function);
static ssize_t getPositionOfTableNameArgumentToEvaluate(const ASTFunction & function);
};
/// Visits ASTCreateQuery and extracts the names of all tables which should be loaded before a specified table.

View File

@ -23,7 +23,7 @@ void NormalizeAndEvaluateConstants::visit(const ASTFunction & function, Data & d
{
/// Replace expressions like "dictGet(currentDatabase() || '.dict', 'value', toUInt32(1))"
/// with "dictGet('db_name.dict', 'value', toUInt32(1))"
ssize_t table_name_arg_idx = getPositionOfTableNameArgument(function);
ssize_t table_name_arg_idx = getPositionOfTableNameArgumentToEvaluate(function);
if (table_name_arg_idx < 0)
return;

View File

@ -171,8 +171,9 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
if (!hasPendingDataToRead())
return false;
size_t size, offset;
chassert(file_offset_of_buffer_end <= impl->getFileSize());
size_t size, offset;
if (prefetch_future.valid())
{
ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::AsynchronousRemoteReadWaitMicroseconds);
@ -210,8 +211,8 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
/// In case of multiple files for the same file in clickhouse (i.e. log family)
/// file_offset_of_buffer_end will not match getImplementationBufferOffset()
/// so we use [impl->getImplementationBufferOffset(), impl->getFileSize()]
assert(file_offset_of_buffer_end >= impl->getImplementationBufferOffset());
assert(file_offset_of_buffer_end <= impl->getFileSize());
chassert(file_offset_of_buffer_end >= impl->getImplementationBufferOffset());
chassert(file_offset_of_buffer_end <= impl->getFileSize());
return bytes_read;
}
@ -277,6 +278,15 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence)
/// First reset the buffer so the next read will fetch new data to the buffer.
resetWorkingBuffer();
if (read_until_position && new_pos > *read_until_position)
{
ProfileEvents::increment(ProfileEvents::RemoteFSSeeksWithReset);
impl->reset();
file_offset_of_buffer_end = new_pos = *read_until_position; /// read_until_position is a non-included boundary.
return new_pos;
}
/**
* Lazy ignore. Save number of bytes to ignore and ignore it either for prefetch buffer or current buffer.
* Note: we read in range [file_offset_of_buffer_end, read_until_position).

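The new branch above pins an out-of-range seek at the boundary. Isolated as a sketch:

```cpp
#include <cstdint>
#include <optional>

// Sketch only: read_until_position is an exclusive boundary, so a seek past it
// resets the implementation buffer and returns the boundary itself.
int64_t clampSeekPosition(int64_t new_pos, std::optional<int64_t> read_until_position)
{
    if (read_until_position && new_pos > *read_until_position)
        return *read_until_position;
    return new_pos;
}
```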
View File

@ -256,7 +256,7 @@ size_t ReadBufferFromRemoteFSGather::getFileSize() const
String ReadBufferFromRemoteFSGather::getInfoForLog()
{
if (!current_buf)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get info: buffer not initialized");
return "";
return current_buf->getInfoForLog();
}

View File

@ -125,14 +125,19 @@ std::string S3ObjectStorage::generateBlobNameForPath(const std::string & /* path
getRandomASCIIString(key_name_total_size - key_name_prefix_size));
}
Aws::S3::Model::HeadObjectOutcome S3ObjectStorage::requestObjectHeadData(const std::string & bucket_from, const std::string & key) const
size_t S3ObjectStorage::getObjectSize(const std::string & bucket_from, const std::string & key) const
{
return S3::headObject(*client.get(), bucket_from, key, "", true);
return S3::getObjectSize(*client.get(), bucket_from, key, {}, /* for_disk_s3= */ true);
}
bool S3ObjectStorage::exists(const StoredObject & object) const
{
return S3::objectExists(*client.get(), bucket, object.absolute_path, "", true);
return S3::objectExists(*client.get(), bucket, object.absolute_path, {}, /* for_disk_s3= */ true);
}
void S3ObjectStorage::checkObjectExists(const std::string & bucket_from, const std::string & key, std::string_view description) const
{
return S3::checkObjectExists(*client.get(), bucket_from, key, {}, /* for_disk_s3= */ true, description);
}
std::unique_ptr<ReadBufferFromFileBase> S3ObjectStorage::readObjects( /// NOLINT
@ -409,13 +414,10 @@ ObjectMetadata S3ObjectStorage::getObjectMetadata(const std::string & path) cons
{
ObjectMetadata result;
auto object_head = requestObjectHeadData(bucket, path);
throwIfError(object_head);
auto & object_head_result = object_head.GetResult();
result.size_bytes = object_head_result.GetContentLength();
result.last_modified = object_head_result.GetLastModified().Millis();
result.attributes = object_head_result.GetMetadata();
auto object_info = S3::getObjectInfo(*client.get(), bucket, path, {}, /* for_disk_s3= */ true);
result.size_bytes = object_info.size;
result.last_modified = object_info.last_modification_time;
result.attributes = S3::getObjectMetadata(*client.get(), bucket, path, {}, /* for_disk_s3= */ true);
return result;
}
@ -442,7 +444,7 @@ void S3ObjectStorage::copyObjectImpl(
const String & src_key,
const String & dst_bucket,
const String & dst_key,
std::optional<Aws::S3::Model::HeadObjectResult> head,
size_t size,
std::optional<ObjectAttributes> metadata) const
{
auto client_ptr = client.get();
@ -464,7 +466,7 @@ void S3ObjectStorage::copyObjectImpl(
if (!outcome.IsSuccess() && (outcome.GetError().GetExceptionName() == "EntityTooLarge"
|| outcome.GetError().GetExceptionName() == "InvalidRequest"))
{ // Can't come here with MinIO, MinIO allows single part upload for large objects.
copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, head, metadata);
copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, size, metadata);
return;
}
@ -472,12 +474,7 @@ void S3ObjectStorage::copyObjectImpl(
auto settings_ptr = s3_settings.get();
if (settings_ptr->request_settings.check_objects_after_upload)
{
auto object_head = requestObjectHeadData(dst_bucket, dst_key);
if (!object_head.IsSuccess())
throw Exception(ErrorCodes::S3_ERROR, "Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", dst_key, dst_bucket);
}
checkObjectExists(dst_bucket, dst_key, "Immediately after upload");
}
void S3ObjectStorage::copyObjectMultipartImpl(
@ -485,15 +482,11 @@ void S3ObjectStorage::copyObjectMultipartImpl(
const String & src_key,
const String & dst_bucket,
const String & dst_key,
std::optional<Aws::S3::Model::HeadObjectResult> head,
size_t size,
std::optional<ObjectAttributes> metadata) const
{
if (!head)
head = requestObjectHeadData(src_bucket, src_key).GetResult();
auto settings_ptr = s3_settings.get();
auto client_ptr = client.get();
size_t size = head->GetContentLength();
String multipart_upload_id;
@ -569,29 +562,24 @@ void S3ObjectStorage::copyObjectMultipartImpl(
}
if (settings_ptr->request_settings.check_objects_after_upload)
{
auto object_head = requestObjectHeadData(dst_bucket, dst_key);
if (!object_head.IsSuccess())
throw Exception(ErrorCodes::S3_ERROR, "Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", dst_key, dst_bucket);
}
checkObjectExists(dst_bucket, dst_key, "Immediately after upload");
}
void S3ObjectStorage::copyObject( // NOLINT
const StoredObject & object_from, const StoredObject & object_to, std::optional<ObjectAttributes> object_to_attributes)
{
auto head = requestObjectHeadData(bucket, object_from.absolute_path).GetResult();
auto size = getObjectSize(bucket, object_from.absolute_path);
static constexpr int64_t multipart_upload_threashold = 5UL * 1024 * 1024 * 1024;
if (head.GetContentLength() >= multipart_upload_threashold)
if (size >= multipart_upload_threashold)
{
copyObjectMultipartImpl(
bucket, object_from.absolute_path, bucket, object_to.absolute_path, head, object_to_attributes);
bucket, object_from.absolute_path, bucket, object_to.absolute_path, size, object_to_attributes);
}
else
{
copyObjectImpl(
bucket, object_from.absolute_path, bucket, object_to.absolute_path, head, object_to_attributes);
bucket, object_from.absolute_path, bucket, object_to.absolute_path, size, object_to_attributes);
}
}

View File

@ -172,7 +172,7 @@ private:
const String & src_key,
const String & dst_bucket,
const String & dst_key,
std::optional<Aws::S3::Model::HeadObjectResult> head = std::nullopt,
size_t size,
std::optional<ObjectAttributes> metadata = std::nullopt) const;
void copyObjectMultipartImpl(
@ -180,13 +180,14 @@ private:
const String & src_key,
const String & dst_bucket,
const String & dst_key,
std::optional<Aws::S3::Model::HeadObjectResult> head = std::nullopt,
size_t size,
std::optional<ObjectAttributes> metadata = std::nullopt) const;
void removeObjectImpl(const StoredObject & object, bool if_exists);
void removeObjectsImpl(const StoredObjects & objects, bool if_exists);
Aws::S3::Model::HeadObjectOutcome requestObjectHeadData(const std::string & bucket_from, const std::string & key) const;
size_t getObjectSize(const std::string & bucket_from, const std::string & key) const;
void checkObjectExists(const std::string & bucket_from, const std::string & key, std::string_view description) const;
std::string bucket;

View File

@ -90,7 +90,9 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio;
format_settings.json.array_of_rows = settings.output_format_json_array_of_rows;
format_settings.json.escape_forward_slashes = settings.output_format_json_escape_forward_slashes;
format_settings.json.named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
format_settings.json.write_named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
format_settings.json.read_named_tuples_as_objects = settings.input_format_json_named_tuples_as_objects;
format_settings.json.defaults_for_missing_elements_in_named_tuple = settings.input_format_json_defaults_for_missing_elements_in_named_tuple;
format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
format_settings.json.quote_64bit_floats = settings.output_format_json_quote_64bit_floats;
format_settings.json.quote_denormals = settings.output_format_json_quote_denormals;

View File

@ -153,7 +153,9 @@ struct FormatSettings
bool quote_denormals = true;
bool quote_decimals = false;
bool escape_forward_slashes = true;
bool named_tuples_as_objects = false;
bool read_named_tuples_as_objects = false;
bool write_named_tuples_as_objects = false;
bool defaults_for_missing_elements_in_named_tuple = false;
bool serialize_as_strings = false;
bool read_bools_as_numbers = true;
bool read_numbers_as_strings = true;

View File

@ -118,6 +118,16 @@ struct MatchImpl
if (haystack_offsets.empty())
return;
/// Shortcut for the silly but practical case that the pattern matches everything/nothing independently of the haystack:
/// - col [not] [i]like '%' / '%%'
/// - match(col, '.*')
if ((is_like && (needle == "%" or needle == "%%")) || (!is_like && (needle == ".*" || needle == ".*?")))
{
for (auto & x : res)
x = !negate;
return;
}
/// Special case that the [I]LIKE expression reduces to finding a substring in a string
String strstr_pattern;
if (is_like && impl::likePatternIsSubstring(needle, strstr_pattern))
@ -267,6 +277,16 @@ struct MatchImpl
if (haystack.empty())
return;
/// Shortcut for the silly but practical case that the pattern matches everything/nothing independently of the haystack:
/// - col [not] [i]like '%' / '%%'
/// - match(col, '.*')
if ((is_like && (needle == "%" or needle == "%%")) || (!is_like && (needle == ".*" || needle == ".*?")))
{
for (auto & x : res)
x = !negate;
return;
}
/// Special case that the [I]LIKE expression reduces to finding a substring in a string
String strstr_pattern;
if (is_like && impl::likePatternIsSubstring(needle, strstr_pattern))

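The shortcut above can be isolated as follows (a sketch): once a needle is known to match every haystack, the result column is filled with `!negate` in one pass and no per-row matching happens.

```cpp
#include <string>

// Sketch only: LIKE needles "%" and "%%" and regexp needles ".*" and ".*?"
// match any string, independently of the haystack.
bool patternMatchesEverything(bool is_like, const std::string & needle)
{
    if (is_like)
        return needle == "%" || needle == "%%";
    return needle == ".*" || needle == ".*?";
}
```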
View File

@ -250,7 +250,7 @@ size_t ReadBufferFromS3::getFileSize()
if (file_size)
return *file_size;
auto object_size = S3::getObjectSize(*client_ptr, bucket, key, version_id, true, read_settings.for_object_storage);
auto object_size = S3::getObjectSize(*client_ptr, bucket, key, version_id, /* for_disk_s3= */ read_settings.for_object_storage);
file_size = object_size;
return *file_size;

View File

@ -27,6 +27,8 @@
# include <aws/core/utils/UUID.h>
# include <aws/core/http/HttpClientFactory.h>
# include <aws/s3/S3Client.h>
# include <aws/s3/model/GetObjectAttributesRequest.h>
# include <aws/s3/model/GetObjectRequest.h>
# include <aws/s3/model/HeadObjectRequest.h>
# include <IO/S3/PocoHTTPClientFactory.h>
@ -40,7 +42,11 @@
namespace ProfileEvents
{
extern const Event S3GetObjectAttributes;
extern const Event S3GetObjectMetadata;
extern const Event S3HeadObject;
extern const Event DiskS3GetObjectAttributes;
extern const Event DiskS3GetObjectMetadata;
extern const Event DiskS3HeadObject;
}
@ -699,6 +705,92 @@ public:
}
};
/// Extracts the endpoint from a constructed S3 client.
String getEndpoint(const Aws::S3::S3Client & client)
{
const auto * endpoint_provider = dynamic_cast<const Aws::S3::Endpoint::S3DefaultEpProviderBase *>(const_cast<Aws::S3::S3Client &>(client).accessEndpointProvider().get());
if (!endpoint_provider)
return {};
String endpoint;
endpoint_provider->GetBuiltInParameters().GetParameter("Endpoint").GetString(endpoint);
return endpoint;
}
/// Performs a request to get the size and last modification time of an object.
/// The function performs either HeadObject or GetObjectAttributes request depending on the endpoint.
std::pair<std::optional<DB::S3::ObjectInfo>, Aws::S3::S3Error> tryGetObjectInfo(
const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3)
{
auto endpoint = getEndpoint(client);
bool use_get_object_attributes_request = (endpoint.find(".amazonaws.com") != String::npos);
if (use_get_object_attributes_request)
{
/// It's better not to use `HeadObject` requests for AWS S3 because they don't work well with the global region.
/// Details: a `HeadObject` request never returns a response body (even on error). However,
/// if a request is sent without specifying a region in the endpoint (for example "https://test.s3.amazonaws.com/mydata.csv"
/// instead of "https://test.s3-us-west-2.amazonaws.com/mydata.csv"), the response body is one of the main ways
/// to determine the correct region and retry the request with it.
/// For any other request type (`GetObject`, `ListObjects`, etc.) AWS SDK does that because they have response bodies,
/// but for `HeadObject` there is no response body so this way doesn't work. That's why we use `GetObjectAttributes` request instead.
/// See https://github.com/aws/aws-sdk-cpp/issues/1558 and also the function S3ErrorMarshaller::ExtractRegion() for more information.
ProfileEvents::increment(ProfileEvents::S3GetObjectAttributes);
if (for_disk_s3)
ProfileEvents::increment(ProfileEvents::DiskS3GetObjectAttributes);
Aws::S3::Model::GetObjectAttributesRequest req;
req.SetBucket(bucket);
req.SetKey(key);
if (!version_id.empty())
req.SetVersionId(version_id);
req.SetObjectAttributes({Aws::S3::Model::ObjectAttributes::ObjectSize});
auto outcome = client.GetObjectAttributes(req);
if (outcome.IsSuccess())
{
const auto & result = outcome.GetResult();
DB::S3::ObjectInfo object_info;
object_info.size = static_cast<size_t>(result.GetObjectSize());
object_info.last_modification_time = result.GetLastModified().Millis() / 1000;
return {object_info, {}};
}
return {std::nullopt, outcome.GetError()};
}
else
{
/// By default we use `HeadObject` requests.
/// We cannot just use `GetObjectAttributes` requests always because some S3 providers (e.g. Minio)
/// don't support `GetObjectAttributes` requests.
ProfileEvents::increment(ProfileEvents::S3HeadObject);
if (for_disk_s3)
ProfileEvents::increment(ProfileEvents::DiskS3HeadObject);
Aws::S3::Model::HeadObjectRequest req;
req.SetBucket(bucket);
req.SetKey(key);
if (!version_id.empty())
req.SetVersionId(version_id);
auto outcome = client.HeadObject(req);
if (outcome.IsSuccess())
{
const auto & result = outcome.GetResult();
DB::S3::ObjectInfo object_info;
object_info.size = static_cast<size_t>(result.GetContentLength());
object_info.last_modification_time = result.GetLastModified().Millis() / 1000;
return {object_info, {}};
}
return {std::nullopt, outcome.GetError()};
}
}
}
@ -894,54 +986,33 @@ namespace S3
return error == Aws::S3::S3Errors::RESOURCE_NOT_FOUND || error == Aws::S3::S3Errors::NO_SUCH_KEY;
}
Aws::S3::Model::HeadObjectOutcome headObject(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3)
ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, bool throw_on_error)
{
ProfileEvents::increment(ProfileEvents::S3HeadObject);
if (for_disk_s3)
ProfileEvents::increment(ProfileEvents::DiskS3HeadObject);
Aws::S3::Model::HeadObjectRequest req;
req.SetBucket(bucket);
req.SetKey(key);
if (!version_id.empty())
req.SetVersionId(version_id);
return client.HeadObject(req);
}
S3::ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool throw_on_error, bool for_disk_s3)
{
auto outcome = headObject(client, bucket, key, version_id, for_disk_s3);
if (outcome.IsSuccess())
auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, for_disk_s3);
if (object_info)
{
auto read_result = outcome.GetResultWithOwnership();
return {.size = static_cast<size_t>(read_result.GetContentLength()), .last_modification_time = read_result.GetLastModified().Millis() / 1000};
return *object_info;
}
else if (throw_on_error)
{
const auto & error = outcome.GetError();
throw DB::Exception(ErrorCodes::S3_ERROR,
"Failed to HEAD object: {}. HTTP response code: {}",
"Failed to get object attributes: {}. HTTP response code: {}",
error.GetMessage(), static_cast<size_t>(error.GetResponseCode()));
}
return {};
}
size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool throw_on_error, bool for_disk_s3)
size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, bool throw_on_error)
{
return getObjectInfo(client, bucket, key, version_id, throw_on_error, for_disk_s3).size;
return getObjectInfo(client, bucket, key, version_id, for_disk_s3, throw_on_error).size;
}
bool objectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3)
{
auto outcome = headObject(client, bucket, key, version_id, for_disk_s3);
if (outcome.IsSuccess())
auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, for_disk_s3);
if (object_info)
return true;
const auto & error = outcome.GetError();
if (isNotFoundError(error.GetErrorType()))
return false;
@ -949,6 +1020,48 @@ namespace S3
"Failed to check existence of key {} in bucket {}: {}",
key, bucket, error.GetMessage());
}
void checkObjectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, std::string_view description)
{
auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, for_disk_s3);
if (object_info)
return;
throw S3Exception(error.GetErrorType(), "{}Object {} in bucket {} suddenly disappeared: {}",
(description.empty() ? "" : (String(description) + ": ")), key, bucket, error.GetMessage());
}
std::map<String, String> getObjectMetadata(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, bool throw_on_error)
{
ProfileEvents::increment(ProfileEvents::S3GetObjectMetadata);
if (for_disk_s3)
ProfileEvents::increment(ProfileEvents::DiskS3GetObjectMetadata);
/// We must not use the `HeadObject` request, see the comment about `HeadObjectRequest` in S3Common.h.
Aws::S3::Model::GetObjectRequest req;
req.SetBucket(bucket);
req.SetKey(key);
/// Only the first byte will be read.
/// We don't need that first byte, but the range should be set, otherwise the entire object will be read.
req.SetRange("bytes=0-0");
if (!version_id.empty())
req.SetVersionId(version_id);
auto outcome = client.GetObject(req);
if (outcome.IsSuccess())
return outcome.GetResult().GetMetadata();
if (!throw_on_error)
return {};
const auto & error = outcome.GetError();
throw S3Exception(error.GetErrorType(),
"Failed to get metadata of key {} in bucket {}: {}",
key, bucket, error.GetMessage());
}
}
}

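The endpoint test inside `tryGetObjectInfo` above reduces to one predicate (a sketch): AWS endpoints get `GetObjectAttributes`, because `HeadObject` has no response body and thus breaks automatic region correction, while other providers such as MinIO, which may lack `GetObjectAttributes`, keep using `HeadObject`.

```cpp
#include <string>

bool shouldUseGetObjectAttributes(const std::string & endpoint)
{
    return endpoint.find(".amazonaws.com") != std::string::npos;
}
```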
View File

@ -11,15 +11,15 @@
#if USE_AWS_S3
#include <base/types.h>
#include <aws/core/Aws.h>
#include <aws/core/client/ClientConfiguration.h>
#include <aws/s3/S3Client.h>
#include <aws/s3/S3Errors.h>
#include <Poco/URI.h>
#include <Common/Exception.h>
#include <Common/Throttler_fwd.h>
#include <Poco/URI.h>
#include <aws/core/Aws.h>
#include <aws/s3/S3Errors.h>
namespace Aws::S3 { class S3Client; }
namespace DB
{
@ -121,22 +121,29 @@ struct URI
static void validateBucket(const String & bucket, const Poco::URI & uri);
};
/// WARNING: Don't use `HeadObjectRequest`! Use the functions below instead.
/// For explanation see the comment about `HeadObject` request in the function tryGetObjectInfo().
struct ObjectInfo
{
size_t size = 0;
time_t last_modification_time = 0;
};
bool isNotFoundError(Aws::S3::S3Errors error);
ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, bool throw_on_error = true);
Aws::S3::Model::HeadObjectOutcome headObject(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false);
S3::ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool throw_on_error, bool for_disk_s3);
size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool throw_on_error, bool for_disk_s3);
size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, bool throw_on_error = true);
bool objectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false);
/// Throws an exception if a specified object doesn't exist. `description` is used as a part of the error message.
void checkObjectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, std::string_view description = {});
bool isNotFoundError(Aws::S3::S3Errors error);
/// Returns the object's metadata.
std::map<String, String> getObjectMetadata(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, bool throw_on_error = true);
}
#endif

View File

@ -182,12 +182,8 @@ void WriteBufferFromS3::finalizeImpl()
if (check_objects_after_upload)
{
LOG_TRACE(log, "Checking object {} exists after upload", key);
auto response = S3::headObject(*client_ptr, bucket, key, "", write_settings.for_object_storage);
if (!response.IsSuccess())
throw S3Exception(fmt::format("Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket), response.GetError().GetErrorType());
else
LOG_TRACE(log, "Object {} exists after upload", key);
S3::checkObjectExists(*client_ptr, bucket, key, {}, /* for_disk_s3= */ write_settings.for_object_storage, "Immediately after upload");
LOG_TRACE(log, "Object {} exists after upload", key);
}
}

View File

@ -28,13 +28,29 @@ DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTTableIdentifier &
database = current_database;
}
DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTIdentifier & identifier, const String & current_database)
{
alias = identifier.tryGetAlias();
if (identifier.name_parts.size() == 2)
std::tie(database, table) = std::tie(identifier.name_parts[0], identifier.name_parts[1]);
else if (identifier.name_parts.size() == 1)
table = identifier.name_parts[0];
else
throw Exception("Logical error: invalid identifier", ErrorCodes::LOGICAL_ERROR);
if (database.empty())
database = current_database;
}
DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTPtr & node, const String & current_database)
{
const auto * identifier = node->as<ASTTableIdentifier>();
if (!identifier)
throw Exception("Logical error: table identifier expected", ErrorCodes::LOGICAL_ERROR);
*this = DatabaseAndTableWithAlias(*identifier, current_database);
if (const auto * table_identifier = node->as<ASTTableIdentifier>())
*this = DatabaseAndTableWithAlias(*table_identifier, current_database);
else if (const auto * identifier = node->as<ASTIdentifier>())
*this = DatabaseAndTableWithAlias(*identifier, current_database);
else
throw Exception("Logical error: identifier or table identifier expected", ErrorCodes::LOGICAL_ERROR);
}
DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTTableExpression & table_expression, const String & current_database)

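A standalone sketch of the new splitting rule above: two name parts mean `database.table`, one part is a bare table resolved against the current database, and anything else is rejected.

```cpp
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

std::pair<std::string, std::string> splitTableIdentifier(
    const std::vector<std::string> & name_parts, const std::string & current_database)
{
    if (name_parts.size() == 2)
        return {name_parts[0], name_parts[1]};
    if (name_parts.size() == 1)
        return {current_database, name_parts[0]};
    throw std::invalid_argument("invalid identifier");
}
```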
View File

@ -14,6 +14,7 @@ namespace DB
{
class ASTSelectQuery;
class ASTIdentifier;
class ASTTableIdentifier;
struct ASTTableExpression;
@ -28,6 +29,7 @@ struct DatabaseAndTableWithAlias
DatabaseAndTableWithAlias() = default;
explicit DatabaseAndTableWithAlias(const ASTPtr & identifier_node, const String & current_database = "");
explicit DatabaseAndTableWithAlias(const ASTIdentifier & identifier, const String & current_database = "");
explicit DatabaseAndTableWithAlias(const ASTTableIdentifier & identifier, const String & current_database = "");
explicit DatabaseAndTableWithAlias(const ASTTableExpression & table_expression, const String & current_database = "");

View File

@ -25,6 +25,7 @@ static const std::unordered_map<String, String> quantile_fuse_name_mapping = {
{NameQuantileExactInclusive::name, NameQuantilesExactInclusive::name},
{NameQuantileExactLow::name, NameQuantilesExactLow::name},
{NameQuantileExactWeighted::name, NameQuantilesExactWeighted::name},
{NameQuantileInterpolatedWeighted::name, NameQuantilesInterpolatedWeighted::name},
{NameQuantileTDigest::name, NameQuantilesTDigest::name},
{NameQuantileTDigestWeighted::name, NameQuantilesTDigestWeighted::name},
{NameQuantileTiming::name, NameQuantilesTiming::name},
@ -61,9 +62,11 @@ void GatherFunctionQuantileData::FuseQuantileAggregatesData::addFuncNode(ASTPtr
const auto & arguments = func->arguments->children;
bool need_two_args = func->name == NameQuantileDeterministic::name || func->name == NameQuantileExactWeighted::name
|| func->name == NameQuantileTimingWeighted::name || func->name == NameQuantileTDigestWeighted::name
|| func->name == NameQuantileBFloat16Weighted::name;
|| func->name == NameQuantileInterpolatedWeighted::name || func->name == NameQuantileTimingWeighted::name
|| func->name == NameQuantileTDigestWeighted::name || func->name == NameQuantileBFloat16Weighted::name;
if (arguments.size() != (need_two_args ? 2 : 1))
return;

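A toy sketch (not ClickHouse code) of the fusion this mapping serves: identical `quantile*` calls over the same column collapse into a single `quantiles*` call that computes all levels in one pass. Weighted variants (the `need_two_args` set above) also take a weight argument, which the toy omits; the `quantile*` to `quantiles*` renaming is an explicit table in the real code.

```cpp
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main()
{
    struct Call { std::string func, column; double level; };
    std::vector<Call> calls = {
        {"quantileTiming", "t", 0.5},
        {"quantileTiming", "t", 0.9},
        {"quantileExact", "x", 0.99},
    };

    // Group by (function, column); each group becomes one fused call.
    std::map<std::pair<std::string, std::string>, std::vector<double>> groups;
    for (const auto & c : calls)
        groups[{c.func, c.column}].push_back(c.level);

    for (const auto & [key, levels] : groups)
    {
        // "quantileTiming" -> "quantilesTiming" (naive renaming, for illustration)
        std::string fused = "quantiles" + key.first.substr(std::string("quantile").size());
        std::cout << fused << '(';
        for (size_t i = 0; i < levels.size(); ++i)
            std::cout << (i ? ", " : "") << levels[i];
        std::cout << ")(" << key.second << ")\n";
    }
}
```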
View File

@ -288,6 +288,20 @@ struct ExplainSettings : public Settings
}
};
struct QuerySyntaxSettings
{
bool oneline = false;
constexpr static char name[] = "SYNTAX";
std::unordered_map<std::string, std::reference_wrapper<bool>> boolean_settings =
{
{"oneline", oneline},
};
std::unordered_map<std::string, std::reference_wrapper<Int64>> integer_settings;
};
template <typename Settings>
ExplainSettings<Settings> checkAndGetSettings(const ASTPtr & ast_settings)
{
@ -362,13 +376,12 @@ QueryPipeline InterpreterExplainQuery::executeImpl()
}
case ASTExplainQuery::AnalyzedSyntax:
{
if (ast.getSettings())
throw Exception("Settings are not supported for EXPLAIN SYNTAX query.", ErrorCodes::UNKNOWN_SETTING);
auto settings = checkAndGetSettings<QuerySyntaxSettings>(ast.getSettings());
ExplainAnalyzedSyntaxVisitor::Data data(getContext());
ExplainAnalyzedSyntaxVisitor(data).visit(query);
ast.getExplainedQuery()->format(IAST::FormatSettings(buf, false));
ast.getExplainedQuery()->format(IAST::FormatSettings(buf, settings.oneline));
break;
}
case ASTExplainQuery::QueryTree:

View File

@ -487,7 +487,7 @@ BlockIO InterpreterSystemQuery::execute()
dropDatabaseReplica(query);
break;
case Type::SYNC_REPLICA:
syncReplica(query);
syncReplica();
break;
case Type::SYNC_DATABASE_REPLICA:
syncReplicatedDatabase(query);
@ -507,6 +507,9 @@ BlockIO InterpreterSystemQuery::execute()
case Type::RESTORE_REPLICA:
restoreReplica();
break;
case Type::WAIT_LOADING_PARTS:
waitLoadingParts();
break;
case Type::RESTART_DISK:
restartDisk(query.disk);
case Type::FLUSH_LOGS:
@ -852,7 +855,7 @@ void InterpreterSystemQuery::dropDatabaseReplica(ASTSystemQuery & query)
throw Exception("Invalid query", ErrorCodes::LOGICAL_ERROR);
}
void InterpreterSystemQuery::syncReplica(ASTSystemQuery &)
void InterpreterSystemQuery::syncReplica()
{
getContext()->checkAccess(AccessType::SYSTEM_SYNC_REPLICA, table_id);
StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext());
@ -872,6 +875,23 @@ void InterpreterSystemQuery::syncReplica(ASTSystemQuery &)
throw Exception(ErrorCodes::BAD_ARGUMENTS, table_is_not_replicated.data(), table_id.getNameForLogs());
}
void InterpreterSystemQuery::waitLoadingParts()
{
getContext()->checkAccess(AccessType::SYSTEM_WAIT_LOADING_PARTS, table_id);
StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext());
if (auto * merge_tree = dynamic_cast<MergeTreeData *>(table.get()))
{
LOG_TRACE(log, "Waiting for loading of parts of table {}", table_id.getFullTableName());
merge_tree->waitForOutdatedPartsToBeLoaded();
LOG_TRACE(log, "Finished waiting for loading of parts of table {}", table_id.getFullTableName());
}
else
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Command WAIT LOADING PARTS is supported only for MergeTree table, but got: {}", table->getName());
}
}
void InterpreterSystemQuery::syncReplicatedDatabase(ASTSystemQuery & query)
{
@ -1071,6 +1091,11 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster()
required_access.emplace_back(AccessType::SYSTEM_RESTART_REPLICA);
break;
}
case Type::WAIT_LOADING_PARTS:
{
required_access.emplace_back(AccessType::SYSTEM_WAIT_LOADING_PARTS, query.getDatabase(), query.getTable());
break;
}
case Type::SYNC_DATABASE_REPLICA:
{
required_access.emplace_back(AccessType::SYSTEM_SYNC_DATABASE_REPLICA, query.getDatabase());

View File

@ -56,7 +56,8 @@ private:
void restartReplica(const StorageID & replica, ContextMutablePtr system_context);
void restartReplicas(ContextMutablePtr system_context);
void syncReplica(ASTSystemQuery & query);
void syncReplica();
void waitLoadingParts();
void syncReplicatedDatabase(ASTSystemQuery & query);

View File

@ -49,7 +49,8 @@ ASTPtr makeSubqueryTemplate()
ASTPtr makeSubqueryQualifiedAsterisk()
{
auto asterisk = std::make_shared<ASTQualifiedAsterisk>();
asterisk->children.emplace_back(std::make_shared<ASTTableIdentifier>("--.s"));
asterisk->qualifier = std::make_shared<ASTIdentifier>("--.s");
asterisk->children.push_back(asterisk->qualifier);
return asterisk;
}
@ -153,24 +154,34 @@ private:
for (auto & table_name : data.tables_order)
data.addTableColumns(table_name, columns);
for (const auto & transformer : asterisk->children)
IASTColumnsTransformer::transform(transformer, columns);
if (asterisk->transformers)
{
for (const auto & transformer : asterisk->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else if (const auto * qualified_asterisk = child->as<ASTQualifiedAsterisk>())
{
has_asterisks = true;
auto & identifier = child->children[0]->as<ASTTableIdentifier &>();
if (!qualified_asterisk->qualifier)
throw Exception("Logical error: qualified asterisk must have a qualifier", ErrorCodes::LOGICAL_ERROR);
auto & identifier = qualified_asterisk->qualifier->as<ASTIdentifier &>();
data.addTableColumns(identifier.name(), columns);
// QualifiedAsterisk's transformers start to appear at child 1
for (const auto * it = qualified_asterisk->children.begin() + 1; it != qualified_asterisk->children.end(); ++it)
if (qualified_asterisk->transformers)
{
if (it->get()->as<ASTColumnsApplyTransformer>() || it->get()->as<ASTColumnsExceptTransformer>() || it->get()->as<ASTColumnsReplaceTransformer>())
IASTColumnsTransformer::transform(*it, columns);
else
throw Exception("Logical error: qualified asterisk must only have children of IASTColumnsTransformer type", ErrorCodes::LOGICAL_ERROR);
for (const auto & transformer : qualified_asterisk->transformers->children)
{
if (transformer->as<ASTColumnsApplyTransformer>() ||
transformer->as<ASTColumnsExceptTransformer>() ||
transformer->as<ASTColumnsReplaceTransformer>())
IASTColumnsTransformer::transform(transformer, columns);
else
throw Exception("Logical error: qualified asterisk must only have children of IASTColumnsTransformer type", ErrorCodes::LOGICAL_ERROR);
}
}
}
else if (const auto * columns_list_matcher = child->as<ASTColumnsListMatcher>())
@ -180,8 +191,11 @@ private:
for (const auto & ident : columns_list_matcher->column_list->children)
columns.emplace_back(ident->clone());
for (const auto & transformer : columns_list_matcher->children)
IASTColumnsTransformer::transform(transformer, columns);
if (columns_list_matcher->transformers)
{
for (const auto & transformer : columns_list_matcher->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else if (const auto * columns_regexp_matcher = child->as<ASTColumnsRegexpMatcher>())
{
@ -193,8 +207,11 @@ private:
columns,
[&](const String & column_name) { return columns_regexp_matcher->isColumnMatching(column_name); });
for (const auto & transformer : columns_regexp_matcher->children)
IASTColumnsTransformer::transform(transformer, columns);
if (columns_regexp_matcher->transformers)
{
for (const auto & transformer : columns_regexp_matcher->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else
data.new_select_expression_list->children.push_back(child);
@ -425,6 +442,7 @@ private:
{
if (data.expression_list->children.empty())
data.expression_list->children.emplace_back(std::make_shared<ASTAsterisk>());
select.setExpression(ASTSelectQuery::Expression::SELECT, std::move(data.expression_list));
}
data.done = true;
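
This visitor change is the consumer side of the AST refactoring that runs through this whole commit: column transformers no longer float in `children` after some fixed index, but live in a dedicated optional `transformers` child. A minimal sketch of the resulting access idiom (simplified, hypothetical node type):

```cpp
// Transformers live in an optional dedicated child list instead of being
// mixed into `children`, so visitors guard on the pointer before iterating.
#include <memory>
#include <vector>

struct SimpleNode
{
    std::shared_ptr<SimpleNode> transformers;          // may be null
    std::vector<std::shared_ptr<SimpleNode>> children; // all children, incl. transformers
};

void applyTransformers(const SimpleNode & node)
{
    if (node.transformers)
    {
        for (const auto & transformer : node.transformers->children)
        {
            // e.g. IASTColumnsTransformer::transform(transformer, columns);
            (void)transformer;
        }
    }
}
```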

View File

@ -154,7 +154,7 @@ private:
static void visit(const ASTQualifiedAsterisk & node, const ASTPtr &, Data & data)
{
auto & identifier = node.children[0]->as<ASTTableIdentifier &>();
auto & identifier = node.qualifier->as<ASTIdentifier &>();
bool rewritten = false;
for (const auto & table : data)
{

View File

@ -303,7 +303,6 @@ bool MergeTreeTransaction::rollback() noexcept
part->version.unlockRemovalTID(tid, TransactionInfoContext{part->storage.getStorageID(), part->name});
}
assert([&]()
{
std::lock_guard lock{mutex};

View File

@ -49,7 +49,9 @@ NamesAndTypesList TextLogElement::getNamesAndTypes()
{"revision", std::make_shared<DataTypeUInt32>()},
{"source_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
{"source_line", std::make_shared<DataTypeUInt64>()}
{"source_line", std::make_shared<DataTypeUInt64>()},
{"message_format_string", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
};
}
@ -74,6 +76,8 @@ void TextLogElement::appendToBlock(MutableColumns & columns) const
columns[i++]->insert(source_file);
columns[i++]->insert(source_line);
columns[i++]->insert(message_format_string);
}
TextLog::TextLog(ContextPtr context_, const String & database_name_,

View File

@ -28,6 +28,8 @@ struct TextLogElement
String source_file;
UInt64 source_line{};
std::string_view message_format_string;
static std::string name() { return "TextLog"; }
static NamesAndTypesList getNamesAndTypes();
static NamesAndAliases getNamesAndAliases() { return {}; }
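
The new `message_format_string` column stores the raw, pre-substitution format string of each log line, so entries can be grouped by message template regardless of the interpolated values (for example, grouping `system.text_log` rows by `message_format_string` to find the noisiest templates). A self-contained sketch of the capture idea, assuming fmt-style logging; illustrative only, since ClickHouse actually obtains the string via the patched `Poco::Message::getFormatString()`, as shown in the OwnSplitChannel change below:

```cpp
#include <string>
#include <string_view>
#include <utility>
#include <fmt/format.h>

struct LogRecord
{
    std::string rendered;           // e.g. "Query 42 took 1.5 sec"
    std::string_view format_string; // e.g. "Query {} took {} sec"
};

template <typename... Args>
LogRecord makeRecord(fmt::format_string<Args...> fmt_str, Args &&... args)
{
    auto view = fmt_str.get(); // the compile-time template itself
    // Keeping a view is safe here: format strings are string literals
    // with static storage duration.
    return {fmt::format(fmt_str, std::forward<Args>(args)...),
            std::string_view(view.data(), view.size())};
}
```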

View File

@ -156,21 +156,19 @@ void TranslateQualifiedNamesMatcher::visit(ASTFunction & node, const ASTPtr &, D
func_arguments->children.clear();
}
void TranslateQualifiedNamesMatcher::visit(const ASTQualifiedAsterisk &, const ASTPtr & ast, Data & data)
void TranslateQualifiedNamesMatcher::visit(const ASTQualifiedAsterisk & node, const ASTPtr &, Data & data)
{
if (ast->children.empty())
throw Exception("Logical error: qualified asterisk must have children", ErrorCodes::LOGICAL_ERROR);
auto & ident = ast->children[0];
if (!node.qualifier)
throw Exception("Logical error: qualified asterisk must have a qualifier", ErrorCodes::LOGICAL_ERROR);
/// @note it could contain a table alias as the table name.
DatabaseAndTableWithAlias db_and_table(ident);
DatabaseAndTableWithAlias db_and_table(node.qualifier);
for (const auto & known_table : data.tables)
if (db_and_table.satisfies(known_table.table, true))
return;
throw Exception("Unknown qualified identifier: " + ident->getAliasOrColumnName(), ErrorCodes::UNKNOWN_IDENTIFIER);
throw Exception("Unknown qualified identifier: " + node.qualifier->getAliasOrColumnName(), ErrorCodes::UNKNOWN_IDENTIFIER);
}
void TranslateQualifiedNamesMatcher::visit(ASTTableJoin & join, const ASTPtr & , Data & data)
@ -266,16 +264,22 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
first_table = false;
}
for (const auto & transformer : asterisk->children)
IASTColumnsTransformer::transform(transformer, columns);
if (asterisk->transformers)
{
for (const auto & transformer : asterisk->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else if (auto * asterisk_column_list = child->as<ASTColumnsListMatcher>())
{
for (const auto & ident : asterisk_column_list->column_list->children)
columns.emplace_back(ident->clone());
for (const auto & transformer : asterisk_column_list->children)
IASTColumnsTransformer::transform(transformer, columns);
if (asterisk_column_list->transformers)
{
for (const auto & transformer : asterisk_column_list->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else if (const auto * asterisk_regexp_pattern = child->as<ASTColumnsRegexpMatcher>())
{
@ -292,12 +296,15 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
first_table = false;
}
for (const auto & transformer : asterisk_regexp_pattern->children)
IASTColumnsTransformer::transform(transformer, columns);
if (asterisk_regexp_pattern->transformers)
{
for (const auto & transformer : asterisk_regexp_pattern->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else if (const auto * qualified_asterisk = child->as<ASTQualifiedAsterisk>())
{
DatabaseAndTableWithAlias ident_db_and_name(qualified_asterisk->children[0]);
DatabaseAndTableWithAlias ident_db_and_name(qualified_asterisk->qualifier);
for (const auto & table : tables_with_columns)
{
@ -309,10 +316,10 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
}
}
// QualifiedAsterisk's transformers start to appear at child 1
for (const auto * it = qualified_asterisk->children.begin() + 1; it != qualified_asterisk->children.end(); ++it)
if (qualified_asterisk->transformers)
{
IASTColumnsTransformer::transform(*it, columns);
for (const auto & transformer : qualified_asterisk->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else

View File

@ -133,6 +133,8 @@ void OwnSplitChannel::logSplit(const Poco::Message & msg)
elem.source_file = msg.getSourceFile();
elem.source_line = msg.getSourceLine();
elem.message_format_string = msg.getFormatString();
std::shared_ptr<TextLog> text_log_locked{};
{
std::lock_guard<std::mutex> lock(text_log_mutex);

View File

@ -8,21 +8,37 @@ namespace DB
ASTPtr ASTAsterisk::clone() const
{
auto clone = std::make_shared<ASTAsterisk>(*this);
clone->cloneChildren();
if (expression) { clone->expression = expression->clone(); clone->children.push_back(clone->expression); }
if (transformers) { clone->transformers = transformers->clone(); clone->children.push_back(clone->transformers); }
return clone;
}
void ASTAsterisk::appendColumnName(WriteBuffer & ostr) const { ostr.write('*'); }
void ASTAsterisk::appendColumnName(WriteBuffer & ostr) const
{
if (expression)
{
expression->appendColumnName(ostr);
writeCString(".", ostr);
}
ostr.write('*');
}
void ASTAsterisk::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
if (expression)
{
expression->formatImpl(settings, state, frame);
settings.ostr << ".";
}
settings.ostr << "*";
/// Format column transformers
for (const auto & child : children)
if (transformers)
{
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
transformers->formatImpl(settings, state, frame);
}
}

View File

@ -16,6 +16,8 @@ public:
ASTPtr clone() const override;
void appendColumnName(WriteBuffer & ostr) const override;
ASTPtr expression;
ASTPtr transformers;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
};
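
Two things are worth noting in this header. First, `expression` is what lets `*` carry a qualifier parsed from the new dot-operator path (see the ExpressionListParsers.cpp changes below). Second, `clone()` must both deep-copy each member pointer and re-append it to `children`, or generic tree walkers would silently skip the copies. A condensed model of that idiom (hypothetical `Node` type):

```cpp
#include <memory>
#include <vector>

struct Node
{
    std::shared_ptr<Node> expression;   // optional qualifier
    std::shared_ptr<Node> transformers; // optional transformer list
    std::vector<std::shared_ptr<Node>> children;

    std::shared_ptr<Node> clone() const
    {
        auto copy = std::make_shared<Node>();
        // Deep-copy each member AND register it in `children`, so that
        // visitors iterating `children` see the cloned subtrees.
        if (expression)
        {
            copy->expression = expression->clone();
            copy->children.push_back(copy->expression);
        }
        if (transformers)
        {
            copy->transformers = transformers->clone();
            copy->children.push_back(copy->transformers);
        }
        return copy;
    }
};
```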

View File

@ -18,12 +18,20 @@ namespace ErrorCodes
ASTPtr ASTColumnsRegexpMatcher::clone() const
{
auto clone = std::make_shared<ASTColumnsRegexpMatcher>(*this);
clone->cloneChildren();
if (expression) { clone->expression = expression->clone(); clone->children.push_back(clone->expression); }
if (transformers) { clone->transformers = transformers->clone(); clone->children.push_back(clone->transformers); }
return clone;
}
void ASTColumnsRegexpMatcher::appendColumnName(WriteBuffer & ostr) const
{
if (expression)
{
expression->appendColumnName(ostr);
writeCString(".", ostr);
}
writeCString("COLUMNS(", ostr);
writeQuotedString(original_pattern, ostr);
writeChar(')', ostr);
@ -38,15 +46,21 @@ void ASTColumnsRegexpMatcher::updateTreeHashImpl(SipHash & hash_state) const
void ASTColumnsRegexpMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << "COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
settings.ostr << (settings.hilite ? hilite_keyword : "");
if (expression)
{
expression->formatImpl(settings, state, frame);
settings.ostr << ".";
}
settings.ostr << "COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
settings.ostr << quoteString(original_pattern);
settings.ostr << ")";
/// Format column transformers
for (const auto & child : children)
if (transformers)
{
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
transformers->formatImpl(settings, state, frame);
}
}
@ -60,6 +74,11 @@ void ASTColumnsRegexpMatcher::setPattern(String pattern)
DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
}
const String & ASTColumnsRegexpMatcher::getPattern() const
{
return original_pattern;
}
const std::shared_ptr<re2::RE2> & ASTColumnsRegexpMatcher::getMatcher() const
{
return column_matcher;
@ -73,19 +92,23 @@ bool ASTColumnsRegexpMatcher::isColumnMatching(const String & column_name) const
ASTPtr ASTColumnsListMatcher::clone() const
{
auto clone = std::make_shared<ASTColumnsListMatcher>(*this);
clone->column_list = column_list->clone();
clone->cloneChildren();
return clone;
}
void ASTColumnsListMatcher::updateTreeHashImpl(SipHash & hash_state) const
{
column_list->updateTreeHash(hash_state);
IAST::updateTreeHashImpl(hash_state);
if (expression) { clone->expression = expression->clone(); clone->children.push_back(clone->expression); }
if (transformers) { clone->transformers = transformers->clone(); clone->children.push_back(clone->transformers); }
clone->column_list = column_list->clone();
clone->children.push_back(clone->column_list);
return clone;
}
void ASTColumnsListMatcher::appendColumnName(WriteBuffer & ostr) const
{
if (expression)
{
expression->appendColumnName(ostr);
writeCString(".", ostr);
}
writeCString("COLUMNS(", ostr);
for (auto * it = column_list->children.begin(); it != column_list->children.end(); ++it)
{
@ -99,7 +122,15 @@ void ASTColumnsListMatcher::appendColumnName(WriteBuffer & ostr) const
void ASTColumnsListMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << "COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
settings.ostr << (settings.hilite ? hilite_keyword : "");
if (expression)
{
expression->formatImpl(settings, state, frame);
settings.ostr << ".";
}
settings.ostr << "COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
for (ASTs::const_iterator it = column_list->children.begin(); it != column_list->children.end(); ++it)
{
@ -111,33 +142,39 @@ void ASTColumnsListMatcher::formatImpl(const FormatSettings & settings, FormatSt
}
settings.ostr << ")";
/// Format column transformers
for (const auto & child : children)
if (transformers)
{
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
transformers->formatImpl(settings, state, frame);
}
}
ASTPtr ASTQualifiedColumnsRegexpMatcher::clone() const
{
auto clone = std::make_shared<ASTQualifiedColumnsRegexpMatcher>(*this);
clone->cloneChildren();
if (transformers) { clone->transformers = transformers->clone(); clone->children.push_back(clone->transformers); }
clone->qualifier = qualifier->clone();
clone->children.push_back(clone->qualifier);
return clone;
}
void ASTQualifiedColumnsRegexpMatcher::appendColumnName(WriteBuffer & ostr) const
{
const auto & qualifier = children.at(0);
qualifier->appendColumnName(ostr);
writeCString(".COLUMNS(", ostr);
writeQuotedString(original_pattern, ostr);
writeChar(')', ostr);
}
void ASTQualifiedColumnsRegexpMatcher::setPattern(String pattern)
void ASTQualifiedColumnsRegexpMatcher::setPattern(String pattern, bool set_matcher)
{
original_pattern = std::move(pattern);
if (!set_matcher)
return;
column_matcher = std::make_shared<RE2>(original_pattern, RE2::Quiet);
if (!column_matcher->ok())
throw DB::Exception(
@ -166,35 +203,35 @@ void ASTQualifiedColumnsRegexpMatcher::formatImpl(const FormatSettings & setting
{
settings.ostr << (settings.hilite ? hilite_keyword : "");
const auto & qualifier = children.at(0);
qualifier->formatImpl(settings, state, frame);
settings.ostr << ".COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
settings.ostr << quoteString(original_pattern);
settings.ostr << ")";
/// Format column transformers
size_t children_size = children.size();
for (size_t i = 1; i < children_size; ++i)
if (transformers)
{
const auto & child = children[i];
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
transformers->formatImpl(settings, state, frame);
}
}
ASTPtr ASTQualifiedColumnsListMatcher::clone() const
{
auto clone = std::make_shared<ASTQualifiedColumnsListMatcher>(*this);
if (transformers) { clone->transformers = transformers->clone(); clone->children.push_back(clone->transformers); }
clone->qualifier = qualifier->clone();
clone->column_list = column_list->clone();
clone->cloneChildren();
clone->children.push_back(clone->qualifier);
clone->children.push_back(clone->column_list);
return clone;
}
void ASTQualifiedColumnsListMatcher::appendColumnName(WriteBuffer & ostr) const
{
const auto & qualifier = children.at(0);
qualifier->appendColumnName(ostr);
writeCString(".COLUMNS(", ostr);
@ -208,19 +245,10 @@ void ASTQualifiedColumnsListMatcher::appendColumnName(WriteBuffer & ostr) const
writeChar(')', ostr);
}
void ASTQualifiedColumnsListMatcher::updateTreeHashImpl(SipHash & hash_state) const
{
column_list->updateTreeHash(hash_state);
IAST::updateTreeHashImpl(hash_state);
}
void ASTQualifiedColumnsListMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "");
const auto & qualifier = children.at(0);
qualifier->formatImpl(settings, state, frame);
settings.ostr << ".COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
for (ASTs::const_iterator it = column_list->children.begin(); it != column_list->children.end(); ++it)
@ -232,14 +260,9 @@ void ASTQualifiedColumnsListMatcher::formatImpl(const FormatSettings & settings,
}
settings.ostr << ")";
/// Format column transformers
size_t children_size = children.size();
for (size_t i = 1; i < children_size; ++i)
if (transformers)
{
const auto & child = children[i];
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
transformers->formatImpl(settings, state, frame);
}
}
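
The two-step `setPattern(pattern, /*set_matcher=*/false)` plus `setMatcher(...)` API exists so that ParserQualifiedColumnsMatcher (below) can promote an unqualified matcher into a qualified one without compiling the regular expression a second time. A minimal sketch of the sharing pattern:

```cpp
// Reuse an already-compiled RE2 via shared_ptr instead of recompiling.
#include <memory>
#include <string>
#include <re2/re2.h>

struct Matcher
{
    std::string pattern;
    std::shared_ptr<re2::RE2> compiled;

    void setPattern(std::string p, bool compile = true)
    {
        pattern = std::move(p);
        if (compile)
            compiled = std::make_shared<re2::RE2>(pattern, re2::RE2::Quiet);
    }

    void setMatcher(std::shared_ptr<re2::RE2> m) { compiled = std::move(m); }
};

// Usage: turn an unqualified matcher into a qualified one cheaply.
//   qualified.setPattern(unqualified.pattern, /*compile=*/false);
//   qualified.setMatcher(unqualified.compiled);
```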

View File

@ -24,10 +24,13 @@ public:
void appendColumnName(WriteBuffer & ostr) const override;
void setPattern(String pattern);
const String & getPattern() const;
const std::shared_ptr<re2::RE2> & getMatcher() const;
bool isColumnMatching(const String & column_name) const;
void updateTreeHashImpl(SipHash & hash_state) const override;
ASTPtr expression;
ASTPtr transformers;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
@ -43,9 +46,10 @@ public:
String getID(char) const override { return "ColumnsListMatcher"; }
ASTPtr clone() const override;
void appendColumnName(WriteBuffer & ostr) const override;
void updateTreeHashImpl(SipHash & hash_state) const override;
ASTPtr expression;
ASTPtr column_list;
ASTPtr transformers;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
};
@ -59,10 +63,12 @@ public:
void appendColumnName(WriteBuffer & ostr) const override;
const std::shared_ptr<re2::RE2> & getMatcher() const;
void setPattern(String pattern);
void setPattern(String pattern, bool set_matcher = true);
void setMatcher(std::shared_ptr<re2::RE2> matcher);
void updateTreeHashImpl(SipHash & hash_state) const override;
ASTPtr qualifier;
ASTPtr transformers;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
@ -78,9 +84,10 @@ public:
String getID(char) const override { return "QualifiedColumnsListMatcher"; }
ASTPtr clone() const override;
void appendColumnName(WriteBuffer & ostr) const override;
void updateTreeHashImpl(SipHash & hash_state) const override;
ASTPtr qualifier;
ASTPtr column_list;
ASTPtr transformers;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
};

View File

@ -19,6 +19,15 @@ namespace ErrorCodes
extern const int CANNOT_COMPILE_REGEXP;
}
void ASTColumnsTransformerList::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
for (const auto & child : children)
{
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
}
}
void IASTColumnsTransformer::transform(const ASTPtr & transformer, ASTs & nodes)
{
if (const auto * apply = transformer->as<ASTColumnsApplyTransformer>())

View File

@ -9,6 +9,23 @@ namespace re2
namespace DB
{
/// A list of column transformers
class ASTColumnsTransformerList : public IAST
{
public:
String getID(char) const override { return "ColumnsTransformerList"; }
ASTPtr clone() const override
{
auto clone = std::make_shared<ASTColumnsTransformerList>(*this);
clone->cloneChildren();
return clone;
}
protected:
void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override;
};
class IASTColumnsTransformer : public IAST
{
public:

View File

@ -6,6 +6,10 @@
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
/// AST, EXPLAIN or other query with meaning of explanation query instead of execution
class ASTExplainQuery : public ASTQueryWithOutput
@ -23,6 +27,45 @@ public:
CurrentTransaction, /// 'EXPLAIN CURRENT TRANSACTION'
};
static String toString(ExplainKind kind)
{
switch (kind)
{
case ParsedAST: return "EXPLAIN AST";
case AnalyzedSyntax: return "EXPLAIN SYNTAX";
case QueryTree: return "EXPLAIN QUERY TREE";
case QueryPlan: return "EXPLAIN";
case QueryPipeline: return "EXPLAIN PIPELINE";
case QueryEstimates: return "EXPLAIN ESTIMATE";
case TableOverride: return "EXPLAIN TABLE OVERRIDE";
case CurrentTransaction: return "EXPLAIN CURRENT TRANSACTION";
}
UNREACHABLE();
}
static ExplainKind fromString(const String & str)
{
if (str == "EXPLAIN AST")
return ParsedAST;
if (str == "EXPLAIN SYNTAX")
return AnalyzedSyntax;
if (str == "EXPLAIN QUERY TREE")
return QueryTree;
if (str == "EXPLAIN" || str == "EXPLAIN PLAN")
return QueryPlan;
if (str == "EXPLAIN PIPELINE")
return QueryPipeline;
if (str == "EXPLAIN ESTIMATE")
return QueryEstimates;
if (str == "EXPLAIN TABLE OVERRIDE")
return TableOverride;
if (str == "EXPLAIN CURRENT TRANSACTION")
return CurrentTransaction;
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown explain kind '{}'", str);
}
explicit ASTExplainQuery(ExplainKind kind_) : kind(kind_) {}
String getID(char delim) const override { return "Explain" + (delim + toString(kind)); }
@ -103,23 +146,6 @@ private:
/// Used by EXPLAIN TABLE OVERRIDE
ASTPtr table_function;
ASTPtr table_override;
static String toString(ExplainKind kind)
{
switch (kind)
{
case ParsedAST: return "EXPLAIN AST";
case AnalyzedSyntax: return "EXPLAIN SYNTAX";
case QueryTree: return "EXPLAIN QUERY TREE";
case QueryPlan: return "EXPLAIN";
case QueryPipeline: return "EXPLAIN PIPELINE";
case QueryEstimates: return "EXPLAIN ESTIMATE";
case TableOverride: return "EXPLAIN TABLE OVERRIDE";
case CurrentTransaction: return "EXPLAIN CURRENT TRANSACTION";
}
UNREACHABLE();
}
};
}
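
Moving `toString()` into the public section and adding its inverse `fromString()` turns the pair into a small serialization bridge: ParserSubquery (changed below) stringifies the kind into the first literal argument of `viewExplain(...)`, e.g. `'EXPLAIN PIPELINE'`, and the table function later recovers the kind with `fromString()`. The round trip `fromString(toString(kind)) == kind` therefore has to hold for every `ExplainKind`, which is why both functions enumerate exactly the same strings.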

View File

@ -7,22 +7,18 @@ namespace DB
void ASTQualifiedAsterisk::appendColumnName(WriteBuffer & ostr) const
{
const auto & qualifier = children.at(0);
qualifier->appendColumnName(ostr);
writeCString(".*", ostr);
}
void ASTQualifiedAsterisk::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
const auto & qualifier = children.at(0);
qualifier->formatImpl(settings, state, frame);
settings.ostr << ".*";
/// Format column transformers
for (ASTs::const_iterator it = children.begin() + 1; it != children.end(); ++it)
if (transformers)
{
settings.ostr << ' ';
(*it)->formatImpl(settings, state, frame);
transformers->formatImpl(settings, state, frame);
}
}

View File

@ -17,11 +17,18 @@ public:
ASTPtr clone() const override
{
auto clone = std::make_shared<ASTQualifiedAsterisk>(*this);
clone->cloneChildren();
if (transformers) { clone->transformers = transformers->clone(); clone->children.push_back(clone->transformers); }
clone->qualifier = qualifier->clone();
clone->children.push_back(clone->qualifier);
return clone;
}
void appendColumnName(WriteBuffer & ostr) const override;
ASTPtr qualifier;
ASTPtr transformers;
protected:
void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override;
};

View File

@ -166,6 +166,7 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState &,
else if ( type == Type::RESTART_REPLICA
|| type == Type::RESTORE_REPLICA
|| type == Type::SYNC_REPLICA
|| type == Type::WAIT_LOADING_PARTS
|| type == Type::FLUSH_DISTRIBUTED
|| type == Type::RELOAD_DICTIONARY
|| type == Type::RELOAD_MODEL

View File

@ -35,6 +35,7 @@ public:
RESTART_REPLICAS,
RESTART_REPLICA,
RESTORE_REPLICA,
WAIT_LOADING_PARTS,
DROP_REPLICA,
DROP_DATABASE_REPLICA,
SYNC_REPLICA,

View File

@ -28,6 +28,8 @@
#include <Parsers/ASTWindowDefinition.h>
#include <Parsers/ASTAssignment.h>
#include <Parsers/ASTColumnsMatcher.h>
#include <Parsers/ASTExplainQuery.h>
#include <Parsers/ASTSetQuery.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/ExpressionListParsers.h>
@ -116,8 +118,40 @@ bool ParserSubquery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
}
else if (ASTPtr explain_node; explain.parse(pos, explain_node, expected))
{
/// Replace SELECT * FROM (EXPLAIN SELECT ...) with SELECT * FROM viewExplain(EXPLAIN SELECT ...)
result_node = buildSelectFromTableFunction(makeASTFunction("viewExplain", explain_node));
const auto & explain_query = explain_node->as<const ASTExplainQuery &>();
if (explain_query.getTableFunction() || explain_query.getTableOverride())
throw Exception("EXPLAIN in a subquery cannot have a table function or table override", ErrorCodes::BAD_ARGUMENTS);
/// Replace subquery `(EXPLAIN <kind> <explain_settings> SELECT ...)`
/// with `(SELECT * FROM viewExplain("<kind>", "<explain_settings>", SELECT ...))`
String kind_str = ASTExplainQuery::toString(explain_query.getKind());
String settings_str;
if (ASTPtr settings_ast = explain_query.getSettings())
{
if (!settings_ast->as<ASTSetQuery>())
throw Exception("EXPLAIN settings must be a SET query", ErrorCodes::BAD_ARGUMENTS);
settings_str = queryToString(settings_ast);
}
const ASTPtr & explained_ast = explain_query.getExplainedQuery();
if (explained_ast)
{
auto view_explain = makeASTFunction("viewExplain",
std::make_shared<ASTLiteral>(kind_str),
std::make_shared<ASTLiteral>(settings_str),
explained_ast);
result_node = buildSelectFromTableFunction(view_explain);
}
else
{
auto view_explain = makeASTFunction("viewExplain",
std::make_shared<ASTLiteral>(kind_str),
std::make_shared<ASTLiteral>(settings_str));
result_node = buildSelectFromTableFunction(view_explain);
}
}
else
{
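
Stated on a query, the net effect of this parser change is: a subquery such as `(EXPLAIN PIPELINE SELECT 1)` is rewritten during parsing into `(SELECT * FROM viewExplain('EXPLAIN PIPELINE', '', SELECT 1))`, so EXPLAIN output becomes an ordinary table source that can be filtered, joined, or inserted elsewhere. The second literal carries the serialized SETTINGS clause when one is present, and an empty string otherwise.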
@ -1623,13 +1657,21 @@ bool ParserAsterisk::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
{
++pos;
auto asterisk = std::make_shared<ASTAsterisk>();
auto transformers = std::make_shared<ASTColumnsTransformerList>();
ParserColumnsTransformers transformers_p(allowed_transformers);
ASTPtr transformer;
while (transformers_p.parse(pos, transformer, expected))
{
asterisk->children.push_back(transformer);
transformers->children.push_back(transformer);
}
node = asterisk;
if (!transformers->children.empty())
{
asterisk->transformers = std::move(transformers);
asterisk->children.push_back(asterisk->transformers);
}
node = std::move(asterisk);
return true;
}
return false;
@ -1638,7 +1680,7 @@ bool ParserAsterisk::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
bool ParserQualifiedAsterisk::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
{
if (!ParserCompoundIdentifier(true, true).parse(pos, node, expected))
if (!ParserCompoundIdentifier(false, true).parse(pos, node, expected))
return false;
if (pos->type != TokenType::Dot)
@ -1650,13 +1692,23 @@ bool ParserQualifiedAsterisk::parseImpl(Pos & pos, ASTPtr & node, Expected & exp
++pos;
auto res = std::make_shared<ASTQualifiedAsterisk>();
res->children.push_back(node);
auto transformers = std::make_shared<ASTColumnsTransformerList>();
ParserColumnsTransformers transformers_p;
ASTPtr transformer;
while (transformers_p.parse(pos, transformer, expected))
{
res->children.push_back(transformer);
transformers->children.push_back(transformer);
}
res->qualifier = std::move(node);
res->children.push_back(res->qualifier);
if (!transformers->children.empty())
{
res->transformers = std::move(transformers);
res->children.push_back(res->transformers);
}
node = std::move(res);
return true;
}
@ -1680,28 +1732,44 @@ static bool parseColumnsMatcherBody(IParser::Pos & pos, ASTPtr & node, Expected
return false;
++pos;
auto transformers = std::make_shared<ASTColumnsTransformerList>();
ParserColumnsTransformers transformers_p(allowed_transformers);
ASTPtr transformer;
while (transformers_p.parse(pos, transformer, expected))
{
transformers->children.push_back(transformer);
}
ASTPtr res;
if (column_list)
{
auto list_matcher = std::make_shared<ASTColumnsListMatcher>();
list_matcher->column_list = column_list;
res = list_matcher;
list_matcher->column_list = std::move(column_list);
list_matcher->children.push_back(list_matcher->column_list);
if (!transformers->children.empty())
{
list_matcher->transformers = std::move(transformers);
list_matcher->children.push_back(list_matcher->transformers);
}
node = std::move(list_matcher);
}
else
{
auto regexp_matcher = std::make_shared<ASTColumnsRegexpMatcher>();
regexp_matcher->setPattern(regex_node->as<ASTLiteral &>().value.get<String>());
res = regexp_matcher;
if (!transformers->children.empty())
{
regexp_matcher->transformers = std::move(transformers);
regexp_matcher->children.push_back(regexp_matcher->transformers);
}
node = std::move(regexp_matcher);
}
ParserColumnsTransformers transformers_p(allowed_transformers);
ASTPtr transformer;
while (transformers_p.parse(pos, transformer, expected))
{
res->children.push_back(transformer);
}
node = std::move(res);
return true;
}
@ -1717,29 +1785,19 @@ bool ParserColumnsMatcher::parseImpl(Pos & pos, ASTPtr & node, Expected & expect
bool ParserQualifiedColumnsMatcher::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
{
if (!ParserCompoundIdentifier(true, true).parse(pos, node, expected))
if (!ParserCompoundIdentifier(false, true).parse(pos, node, expected))
return false;
auto identifier_node = node;
const auto & identifier_node_typed = identifier_node->as<ASTTableIdentifier &>();
auto & identifier_node_typed = identifier_node->as<ASTIdentifier &>();
auto & name_parts = identifier_node_typed.name_parts;
/// ParserCompoundIdentifier parses identifier.COLUMNS
if (identifier_node_typed.name_parts.size() == 1 || identifier_node_typed.name_parts.back() != "COLUMNS")
if (name_parts.size() == 1 || name_parts.back() != "COLUMNS")
return false;
/// TODO: ASTTableIdentifier can contain only 2 parts
if (identifier_node_typed.name_parts.size() == 2)
{
auto table_name = identifier_node_typed.name_parts[0];
identifier_node = std::make_shared<ASTTableIdentifier>(table_name);
}
else
{
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Expected identifier to contain no more than 2 parts. Actual {}",
identifier_node_typed.full_name);
}
name_parts.pop_back();
identifier_node = std::make_shared<ASTIdentifier>(std::move(name_parts), false, std::move(node->children));
if (!parseColumnsMatcherBody(pos, node, expected, allowed_transformers))
return false;
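
This rework also lifts the old two-part restriction: instead of special-casing ASTTableIdentifier (which can hold at most `database.table`), the parser now simply drops the trailing `COLUMNS` part from an ordinary compound identifier. A standalone sketch of that split:

```cpp
// `db.tbl.COLUMNS` arrives as name parts {"db", "tbl", "COLUMNS"};
// dropping the trailing "COLUMNS" leaves the real qualifier.
#include <string>
#include <vector>

std::vector<std::string> splitQualifier(std::vector<std::string> name_parts)
{
    if (name_parts.size() <= 1 || name_parts.back() != "COLUMNS")
        return {}; // not a qualified COLUMNS matcher
    name_parts.pop_back();
    return name_parts; // e.g. {"db", "tbl"} — any depth is now allowed
}
```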
@ -1747,28 +1805,36 @@ bool ParserQualifiedColumnsMatcher::parseImpl(Pos & pos, ASTPtr & node, Expected
if (auto * columns_list_matcher = node->as<ASTColumnsListMatcher>())
{
auto result = std::make_shared<ASTQualifiedColumnsListMatcher>();
result->qualifier = std::move(identifier_node);
result->column_list = std::move(columns_list_matcher->column_list);
result->children.reserve(columns_list_matcher->children.size() + 1);
result->children.push_back(std::move(identifier_node));
result->children.push_back(result->qualifier);
result->children.push_back(result->column_list);
for (auto && child : columns_list_matcher->children)
result->children.push_back(std::move(child));
if (columns_list_matcher->transformers)
{
result->transformers = std::move(columns_list_matcher->transformers);
result->children.push_back(result->transformers);
}
node = result;
node = std::move(result);
}
else if (auto * column_regexp_matcher = node->as<ASTColumnsRegexpMatcher>())
{
auto result = std::make_shared<ASTQualifiedColumnsRegexpMatcher>();
result->setPattern(column_regexp_matcher->getPattern(), false);
result->setMatcher(column_regexp_matcher->getMatcher());
result->children.reserve(column_regexp_matcher->children.size() + 1);
result->children.push_back(std::move(identifier_node));
result->qualifier = std::move(identifier_node);
result->children.push_back(result->qualifier);
for (auto && child : column_regexp_matcher->children)
result->children.push_back(std::move(child));
if (column_regexp_matcher->transformers)
{
result->transformers = std::move(column_regexp_matcher->transformers);
result->children.push_back(result->transformers);
}
node = result;
node = std::move(result);
}
else
{

View File

@ -4,6 +4,7 @@
#include <Parsers/ParserSetQuery.h>
#include <Parsers/ASTAsterisk.h>
#include <Parsers/ASTColumnsMatcher.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTFunctionWithKeyValueArguments.h>
@ -2194,7 +2195,7 @@ struct ParserExpressionImpl
using Layers = std::vector<std::unique_ptr<Layer>>;
Action tryParseOperand(Layers & layers, IParser::Pos & pos, Expected & expected);
static Action tryParseOperator(Layers & layers, IParser::Pos & pos, Expected & expected);
Action tryParseOperator(Layers & layers, IParser::Pos & pos, Expected & expected);
};
@ -2523,8 +2524,6 @@ Action ParserExpressionImpl::tryParseOperand(Layers & layers, IParser::Pos & pos
Action ParserExpressionImpl::tryParseOperator(Layers & layers, IParser::Pos & pos, Expected & expected)
{
ASTPtr tmp;
/// ParserExpression can be called in this part of the query:
/// ALTER TABLE partition_all2 CLEAR INDEX [ p ] IN PARTITION ALL
///
@ -2544,17 +2543,17 @@ Action ParserExpressionImpl::tryParseOperator(Layers & layers, IParser::Pos & po
if (cur_op == operators_table.end())
{
ASTPtr alias;
ParserAlias alias_parser(layers.back()->allow_alias_without_as_keyword);
auto old_pos = pos;
if (layers.back()->allow_alias &&
!layers.back()->parsed_alias &&
alias_parser.parse(pos, tmp, expected) &&
layers.back()->insertAlias(tmp))
alias_parser.parse(pos, alias, expected) &&
layers.back()->insertAlias(alias))
{
layers.back()->parsed_alias = true;
return Action::OPERATOR;
}
pos = old_pos;
return Action::NONE;
}
@ -2618,33 +2617,57 @@ Action ParserExpressionImpl::tryParseOperator(Layers & layers, IParser::Pos & po
layers.back()->pushOperand(function);
}
/// Dot (the TupleElement operator) can be the beginning of a .* or .COLUMNS expression
if (op.type == OperatorType::TupleElement)
{
ASTPtr tmp;
if (asterisk_parser.parse(pos, tmp, expected) ||
columns_matcher_parser.parse(pos, tmp, expected))
{
if (auto * asterisk = tmp->as<ASTAsterisk>())
{
if (!layers.back()->popOperand(asterisk->expression))
return Action::NONE;
}
else if (auto * columns_list_matcher = tmp->as<ASTColumnsListMatcher>())
{
if (!layers.back()->popOperand(columns_list_matcher->expression))
return Action::NONE;
}
else if (auto * columns_regexp_matcher = tmp->as<ASTColumnsRegexpMatcher>())
{
if (!layers.back()->popOperand(columns_regexp_matcher->expression))
return Action::NONE;
}
layers.back()->pushOperand(std::move(tmp));
return Action::OPERATOR;
}
}
layers.back()->pushOperator(op);
if (op.type == OperatorType::ArrayElement)
layers.push_back(std::make_unique<ArrayElementLayer>());
Action next = Action::OPERAND;
/// isNull & isNotNull are postfix unary operators
if (op.type == OperatorType::IsNull)
next = Action::OPERATOR;
if (op.type == OperatorType::StartBetween || op.type == OperatorType::StartNotBetween)
layers.back()->between_counter++;
return Action::OPERATOR;
if (op.type == OperatorType::Cast)
{
next = Action::OPERATOR;
ASTPtr type_ast;
if (!ParserDataType().parse(pos, type_ast, expected))
return Action::NONE;
layers.back()->pushOperand(std::make_shared<ASTLiteral>(queryToString(type_ast)));
return Action::OPERATOR;
}
return next;
if (op.type == OperatorType::ArrayElement)
layers.push_back(std::make_unique<ArrayElementLayer>());
if (op.type == OperatorType::StartBetween || op.type == OperatorType::StartNotBetween)
layers.back()->between_counter++;
return Action::OPERAND;
}
}
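
The interesting part of this rewrite is the new TupleElement branch: when the parser sees `expr .` followed by `*` or `COLUMNS(...)`, it no longer builds a `tupleElement()` call; it pops the already-parsed left operand off the current layer's operand stack and stores it as the matcher's `expression`. A minimal model of that stack manipulation (simplified, hypothetical types):

```cpp
// On seeing `expr . *`, pop `expr` from the operand stack and attach it
// to the asterisk node instead of building a tupleElement call.
#include <memory>
#include <vector>

struct Ast { std::shared_ptr<Ast> expression; };

struct Layer
{
    std::vector<std::shared_ptr<Ast>> operands;

    bool popOperand(std::shared_ptr<Ast> & out)
    {
        if (operands.empty())
            return false;
        out = std::move(operands.back());
        operands.pop_back();
        return true;
    }
};

bool attachQualifier(Layer & layer, Ast & asterisk)
{
    // Mirrors `layers.back()->popOperand(asterisk->expression)` in the diff.
    return layer.popOperand(asterisk.expression);
}
```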

View File

@ -253,6 +253,7 @@ bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected &
case Type::RESTART_REPLICA:
case Type::SYNC_REPLICA:
case Type::WAIT_LOADING_PARTS:
{
if (!parseQueryWithOnCluster(res, pos, expected))
return false;

View File

@ -196,7 +196,7 @@ void PipelineExecutor::executeSingleThread(size_t thread_num)
#ifndef NDEBUG
auto & context = tasks.getThreadContext(thread_num);
LOG_TRACE(log,
LOG_TEST(log,
"Thread finished. Total time: {} sec. Execution time: {} sec. Processing time: {} sec. Wait time: {} sec.",
context.total_time_ns / 1e9,
context.execution_time_ns / 1e9,

View File

@ -304,7 +304,9 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node
};
}
if (null_as_default)
/// If the Union is ['Null', Nested-Type] and the Nested-Type cannot be placed inside
/// Nullable, we will get the Nested-Type itself instead of a Nullable type.
if (null_as_default || !target.isNullable())
{
auto nested_deserialize = this->createDeserializeFn(root_node->leafAt(non_null_union_index), target_type);
return [non_null_union_index, nested_deserialize](IColumn & column, avro::Decoder & decoder)
@ -1001,7 +1003,7 @@ DataTypePtr AvroSchemaReader::avroNodeToDataType(avro::NodePtr node)
case avro::Type::AVRO_STRING:
return std::make_shared<DataTypeString>();
case avro::Type::AVRO_BYTES:
return std::make_shared<DataTypeFloat32>();
return std::make_shared<DataTypeString>();
case avro::Type::AVRO_ENUM:
{
if (node->names() < 128)

View File

@ -402,14 +402,10 @@ void TCPHandler::runImpl()
{
auto callback = [this]()
{
{
std::lock_guard task_callback_lock(task_callback_mutex);
std::scoped_lock lock(task_callback_mutex, fatal_error_mutex);
if (isQueryCancelled())
return true;
}
std::lock_guard lock(fatal_error_mutex);
if (isQueryCancelled())
return true;
sendProgress();
sendSelectProfileEvents();
@ -424,6 +420,9 @@ void TCPHandler::runImpl()
}
state.io.onFinish();
std::lock_guard lock(task_callback_mutex);
/// Send final progress after calling onFinish(), since it will update the progress.
///
/// NOTE: we cannot send Progress for regular INSERT (with VALUES)
@ -446,8 +445,11 @@ void TCPHandler::runImpl()
if (state.is_connection_closed)
break;
sendLogs();
sendEndOfStream();
{
std::lock_guard lock(task_callback_mutex);
sendLogs();
sendEndOfStream();
}
/// QueryState should be cleared before QueryScope, since otherwise
/// the MemoryTracker will be wrong for possible deallocations.
@ -760,6 +762,9 @@ void TCPHandler::processOrdinaryQueryWithProcessors()
}
}
/// Defer locking to cover a part of the scope below and everything after it
std::unique_lock progress_lock(task_callback_mutex, std::defer_lock);
{
PullingAsyncPipelineExecutor executor(pipeline);
CurrentMetrics::Increment query_thread_metric_increment{CurrentMetrics::QueryThread};
@ -796,6 +801,11 @@ void TCPHandler::processOrdinaryQueryWithProcessors()
}
}
/// This lock wasn't acquired before; we call .lock() here so that everything
/// below this line is covered, together with the sendProgress() calls
/// outside the scope above
progress_lock.lock();
/** If data has run out, we will send the profiling data and total values to
* the last zero block to be able to use
* this information in the suffix output of stream.
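
Two standard-library locking idioms carry this fix and are worth seeing in isolation: `std::scoped_lock` acquires several mutexes atomically with deadlock-avoiding ordering (replacing the previous back-to-back lock_guards), and a `std::unique_lock` constructed with `std::defer_lock` starts unlocked, so the critical section can begin mid-scope via an explicit `.lock()` yet still end with the scope. A self-contained sketch:

```cpp
#include <mutex>

std::mutex task_callback_mutex;
std::mutex fatal_error_mutex;

void lockingIdioms()
{
    {
        // scoped_lock takes both mutexes atomically, in a deadlock-free
        // order, instead of two sequential lock_guards.
        std::scoped_lock lock(task_callback_mutex, fatal_error_mutex);
    }

    // defer_lock constructs the lock unlocked; .lock() is called later so
    // the critical section can begin mid-scope yet still end with the scope.
    std::unique_lock deferred(task_callback_mutex, std::defer_lock);
    deferred.lock();
} // `deferred` unlocks here
```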

View File

@ -60,6 +60,7 @@ namespace ErrorCodes
extern const int TOO_MANY_PARTITIONS;
extern const int DISTRIBUTED_TOO_MANY_PENDING_BYTES;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int LOGICAL_ERROR;
}
@ -365,18 +366,22 @@ StorageDistributedDirectoryMonitor::StorageDistributedDirectoryMonitor(
const std::string & relative_path_,
ConnectionPoolPtr pool_,
ActionBlocker & monitor_blocker_,
BackgroundSchedulePool & bg_pool)
BackgroundSchedulePool & bg_pool,
bool initialize_from_disk)
: storage(storage_)
, pool(std::move(pool_))
, disk(disk_)
, relative_path(relative_path_)
, path(fs::path(disk->getPath()) / relative_path / "")
, broken_relative_path(fs::path(relative_path) / "broken")
, broken_path(fs::path(path) / "broken" / "")
, should_batch_inserts(storage.getDistributedSettingsRef().monitor_batch_inserts)
, split_batch_on_failure(storage.getDistributedSettingsRef().monitor_split_batch_on_failure)
, dir_fsync(storage.getDistributedSettingsRef().fsync_directories)
, min_batched_block_size_rows(storage.getContext()->getSettingsRef().min_insert_block_size_rows)
, min_batched_block_size_bytes(storage.getContext()->getSettingsRef().min_insert_block_size_bytes)
, current_batch_file_path(path + "current_batch.txt")
, pending_files(std::numeric_limits<size_t>::max())
, default_sleep_time(storage.getDistributedSettingsRef().monitor_sleep_time_ms.totalMilliseconds())
, sleep_time(default_sleep_time)
, max_sleep_time(storage.getDistributedSettingsRef().monitor_max_sleep_time_ms.totalMilliseconds())
@ -385,6 +390,11 @@ StorageDistributedDirectoryMonitor::StorageDistributedDirectoryMonitor(
, metric_pending_files(CurrentMetrics::DistributedFilesToInsert, 0)
, metric_broken_files(CurrentMetrics::BrokenDistributedFilesToInsert, 0)
{
fs::create_directory(broken_path);
if (initialize_from_disk)
initializeFilesFromDisk();
task_handle = bg_pool.createTask(getLoggerName() + "/Bg", [this]{ run(); });
task_handle->activateAndSchedule();
}
@ -392,35 +402,29 @@ StorageDistributedDirectoryMonitor::StorageDistributedDirectoryMonitor(
StorageDistributedDirectoryMonitor::~StorageDistributedDirectoryMonitor()
{
if (!quit)
if (!pending_files.isFinished())
{
quit = true;
pending_files.clearAndFinish();
task_handle->deactivate();
}
}
void StorageDistributedDirectoryMonitor::flushAllData()
{
if (quit)
if (pending_files.isFinished())
return;
std::lock_guard lock{mutex};
const auto & files = getFiles();
if (!files.empty())
{
processFiles(files);
/// Update counters.
getFiles();
}
if (!hasPendingFiles())
return;
processFiles();
}
void StorageDistributedDirectoryMonitor::shutdownAndDropAllData()
{
if (!quit)
if (!pending_files.isFinished())
{
quit = true;
pending_files.clearAndFinish();
task_handle->deactivate();
}
@ -434,19 +438,21 @@ void StorageDistributedDirectoryMonitor::run()
std::lock_guard lock{mutex};
bool do_sleep = false;
while (!quit)
while (!pending_files.isFinished())
{
do_sleep = true;
const auto & files = getFiles();
if (files.empty())
if (!hasPendingFiles())
break;
if (!monitor_blocker.isCancelled())
{
try
{
do_sleep = !processFiles(files);
processFiles();
/// No errors while processing existing files.
/// Let's see maybe there are more files to process.
do_sleep = false;
std::lock_guard status_lock(status_mutex);
status.last_exception = std::exception_ptr{};
@ -470,9 +476,7 @@ void StorageDistributedDirectoryMonitor::run()
}
}
else
{
LOG_DEBUG(log, "Skipping send data over distributed table.");
}
const auto now = std::chrono::system_clock::now();
if (now - last_decrease_time > decrease_error_count_period)
@ -487,10 +491,7 @@ void StorageDistributedDirectoryMonitor::run()
break;
}
/// Update counters.
getFiles();
if (!quit && do_sleep)
if (!pending_files.isFinished() && do_sleep)
task_handle->scheduleAfter(sleep_time.count());
}
@ -568,41 +569,83 @@ ConnectionPoolPtr StorageDistributedDirectoryMonitor::createPool(const std::stri
settings.distributed_replica_error_cap);
}
std::map<UInt64, std::string> StorageDistributedDirectoryMonitor::getFiles()
bool StorageDistributedDirectoryMonitor::hasPendingFiles() const
{
std::map<UInt64, std::string> files;
return fs::exists(current_batch_file_path) || !current_batch_file.empty() || !pending_files.empty();
}
void StorageDistributedDirectoryMonitor::initializeFilesFromDisk()
{
/// NOTE: This method does not require holding status_mutex; hence, no TSA
/// annotations in the header file.
fs::directory_iterator end;
for (fs::directory_iterator it{path}; it != end; ++it)
/// Initialize pending files
{
const auto & file_path_str = it->path();
if (!it->is_directory() && startsWith(fs::path(file_path_str).extension(), ".bin"))
size_t bytes_count = 0;
for (fs::directory_iterator it{path}; it != end; ++it)
{
files[parse<UInt64>(fs::path(file_path_str).stem())] = file_path_str;
const auto & file_path = it->path();
const auto & base_name = file_path.stem().string();
if (!it->is_directory() && startsWith(fs::path(file_path).extension(), ".bin") && parse<UInt64>(base_name))
{
const std::string & file_path_str = file_path.string();
if (!pending_files.push(file_path_str))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot add pending file");
bytes_count += fs::file_size(file_path);
}
else if (base_name != "tmp" && base_name != "broken")
{
/// It is OK to log current_batch.txt here too (useful for debugging).
LOG_WARNING(log, "Unexpected file {} in {}", file_path.string(), path);
}
}
LOG_TRACE(log, "Files set to {}", pending_files.size());
LOG_TRACE(log, "Bytes set to {}", bytes_count);
metric_pending_files.changeTo(pending_files.size());
status.files_count = pending_files.size();
status.bytes_count = bytes_count;
}
return files;
/// Initialize broken files
{
size_t broken_bytes_count = 0;
size_t broken_files = 0;
for (fs::directory_iterator it{broken_path}; it != end; ++it)
{
const auto & file_path = it->path();
if (!it->is_directory() && startsWith(fs::path(file_path).extension(), ".bin") && parse<UInt64>(file_path.stem()))
{
broken_bytes_count += fs::file_size(file_path);
++broken_files;
}
else
LOG_WARNING(log, "Unexpected file {} in {}", file_path.string(), broken_path);
}
LOG_TRACE(log, "Broken files set to {}", broken_files);
LOG_TRACE(log, "Broken bytes set to {}", broken_bytes_count);
metric_broken_files.changeTo(broken_files);
status.broken_files_count = broken_files;
status.broken_bytes_count = broken_bytes_count;
}
}
bool StorageDistributedDirectoryMonitor::processFiles(const std::map<UInt64, std::string> & files)
void StorageDistributedDirectoryMonitor::processFiles()
{
if (should_batch_inserts)
{
processFilesWithBatching(files);
}
processFilesWithBatching();
else
{
for (const auto & file : files)
{
if (quit)
return true;
/// Process unprocessed file.
if (!current_batch_file.empty())
processFile(current_batch_file);
processFile(file.second);
}
while (pending_files.tryPop(current_batch_file))
processFile(current_batch_file);
}
return true;
}
void StorageDistributedDirectoryMonitor::processFile(const std::string & file_path)
@ -649,7 +692,11 @@ void StorageDistributedDirectoryMonitor::processFile(const std::string & file_pa
thread_trace_context->root_span.addAttribute(std::current_exception());
e.addMessage(fmt::format("While sending {}", file_path));
maybeMarkAsBroken(file_path, e);
if (isFileBrokenErrorCode(e.code(), e.isRemoteException()))
{
markAsBroken(file_path);
current_batch_file.clear();
}
throw;
}
catch (...)
@ -662,6 +709,7 @@ void StorageDistributedDirectoryMonitor::processFile(const std::string & file_pa
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path);
markAsSend(file_path);
current_batch_file.clear();
LOG_TRACE(log, "Finished processing `{}` (took {} ms)", file_path, watch.elapsedMilliseconds());
}
@ -701,23 +749,19 @@ struct StorageDistributedDirectoryMonitor::BatchHeader
struct StorageDistributedDirectoryMonitor::Batch
{
std::vector<UInt64> file_indices;
size_t total_rows = 0;
size_t total_bytes = 0;
bool recovered = false;
StorageDistributedDirectoryMonitor & parent;
const std::map<UInt64, String> & file_index_to_path;
std::vector<std::string> files;
bool split_batch_on_failure = true;
bool fsync = false;
bool dir_fsync = false;
Batch(
StorageDistributedDirectoryMonitor & parent_,
const std::map<UInt64, String> & file_index_to_path_)
explicit Batch(StorageDistributedDirectoryMonitor & parent_)
: parent(parent_)
, file_index_to_path(file_index_to_path_)
, split_batch_on_failure(parent.split_batch_on_failure)
, fsync(parent.storage.getDistributedSettingsRef().fsync_after_insert)
, dir_fsync(parent.dir_fsync)
@ -732,7 +776,7 @@ struct StorageDistributedDirectoryMonitor::Batch
void send()
{
if (file_indices.empty())
if (files.empty())
return;
CurrentMetrics::Increment metric_increment{CurrentMetrics::DistributedSend};
@ -775,7 +819,7 @@ struct StorageDistributedDirectoryMonitor::Batch
}
catch (const Exception & e)
{
if (split_batch_on_failure && file_indices.size() > 1 && isSplittableErrorCode(e.code(), e.isRemoteException()))
if (split_batch_on_failure && files.size() > 1 && isSplittableErrorCode(e.code(), e.isRemoteException()))
{
tryLogCurrentException(parent.log, "Trying to split batch due to");
sendSeparateFiles();
@ -795,44 +839,28 @@ struct StorageDistributedDirectoryMonitor::Batch
}
else
{
std::vector<std::string> files;
for (const auto && file_info : file_index_to_path | boost::adaptors::indexed())
{
if (file_info.index() > 8)
{
files.push_back("...");
break;
}
files.push_back(file_info.value().second);
}
e.addMessage(fmt::format("While sending batch, nums: {}, files: {}", file_index_to_path.size(), fmt::join(files, "\n")));
e.addMessage(fmt::format("While sending a batch of {} files, files: {}", files.size(), fmt::join(files, "\n")));
throw;
}
}
if (!batch_broken)
{
LOG_TRACE(parent.log, "Sent a batch of {} files (took {} ms).", file_indices.size(), watch.elapsedMilliseconds());
LOG_TRACE(parent.log, "Sent a batch of {} files (took {} ms).", files.size(), watch.elapsedMilliseconds());
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, parent.disk, parent.relative_path);
for (UInt64 file_index : file_indices)
parent.markAsSend(file_index_to_path.at(file_index));
for (const auto & file : files)
parent.markAsSend(file);
}
else if (!batch_marked_as_broken)
{
LOG_ERROR(parent.log, "Marking a batch of {} files as broken.", file_indices.size());
LOG_ERROR(parent.log, "Marking a batch of {} files as broken, files: {}", files.size(), fmt::join(files, "\n"));
for (UInt64 file_idx : file_indices)
{
auto file_path = file_index_to_path.find(file_idx);
if (file_path != file_index_to_path.end())
parent.markAsBroken(file_path->second);
}
for (const auto & file : files)
parent.markAsBroken(file);
}
file_indices.clear();
files.clear();
total_rows = 0;
total_bytes = 0;
recovered = false;
@ -842,8 +870,11 @@ struct StorageDistributedDirectoryMonitor::Batch
void writeText(WriteBuffer & out)
{
for (UInt64 file_idx : file_indices)
out << file_idx << '\n';
for (const auto & file : files)
{
UInt64 file_index = parse<UInt64>(fs::path(file).stem());
out << file_index << '\n';
}
}
void readText(ReadBuffer & in)
@ -852,8 +883,9 @@ struct StorageDistributedDirectoryMonitor::Batch
{
UInt64 idx;
in >> idx >> "\n";
file_indices.push_back(idx);
files.push_back(fmt::format("{}/{}.bin", parent.path, idx));
}
recovered = true;
}
@ -865,14 +897,9 @@ private:
IConnectionPool::Entry connection;
for (UInt64 file_idx : file_indices)
for (const auto & file : files)
{
auto file_path = file_index_to_path.find(file_idx);
if (file_path == file_index_to_path.end())
throw Exception(ErrorCodes::DISTRIBUTED_BROKEN_BATCH_INFO,
"Failed to send batch: file with index {} is absent", file_idx);
ReadBufferFromFile in(file_path->second);
ReadBufferFromFile in(file);
const auto & distributed_header = readDistributedHeader(in, parent.log);
OpenTelemetry::TracingContextHolder thread_trace_context(__PRETTY_FUNCTION__,
@ -886,7 +913,7 @@ private:
compression_expected = connection->getCompression() == Protocol::Compression::Enable;
LOG_DEBUG(parent.log, "Sending a batch of {} files to {} ({} rows, {} bytes).",
file_indices.size(),
files.size(),
connection->getDescription(),
formatReadableQuantity(total_rows),
formatReadableSizeWithBinarySuffix(total_bytes));
@ -907,19 +934,11 @@ private:
{
size_t broken_files = 0;
for (UInt64 file_idx : file_indices)
for (const auto & file : files)
{
auto file_path = file_index_to_path.find(file_idx);
if (file_path == file_index_to_path.end())
{
LOG_ERROR(parent.log, "Failed to send one file from batch: file with index {} is absent", file_idx);
++broken_files;
continue;
}
try
{
ReadBufferFromFile in(file_path->second);
ReadBufferFromFile in(file);
const auto & distributed_header = readDistributedHeader(in, parent.log);
// this function is called in a separated thread, so we set up the trace context from the file
@ -941,9 +960,11 @@ private:
}
catch (Exception & e)
{
e.addMessage(fmt::format("While sending {}", file_path->second));
parent.maybeMarkAsBroken(file_path->second, e);
++broken_files;
if (isFileBrokenErrorCode(e.code(), e.isRemoteException()))
{
parent.markAsBroken(file);
++broken_files;
}
}
}
@ -1023,13 +1044,18 @@ std::shared_ptr<ISource> StorageDistributedDirectoryMonitor::createSourceFromFil
return std::make_shared<DirectoryMonitorSource>(file_name);
}
bool StorageDistributedDirectoryMonitor::addAndSchedule(size_t file_size, size_t ms)
bool StorageDistributedDirectoryMonitor::addAndSchedule(const std::string & file_path, size_t file_size, size_t ms)
{
if (quit)
/// NOTE: It is better not to throw in this case, since the file is already
/// on disk (see DistributedSink), and it will be processed next time.
if (pending_files.isFinished())
return false;
if (!pending_files.push(file_path))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot add pending file");
{
std::lock_guard status_lock(status_mutex);
std::lock_guard lock(status_mutex);
metric_pending_files.add();
status.bytes_count += file_size;
++status.files_count;
@ -1045,33 +1071,25 @@ StorageDistributedDirectoryMonitor::Status StorageDistributedDirectoryMonitor::g
return current_status;
}
void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map<UInt64, std::string> & files)
void StorageDistributedDirectoryMonitor::processFilesWithBatching()
{
std::unordered_set<UInt64> file_indices_to_skip;
/// Possibly, we failed to send a batch on the previous iteration. Try to send exactly the same batch.
if (fs::exists(current_batch_file_path))
{
/// Possibly, we failed to send a batch on the previous iteration. Try to send exactly the same batch.
Batch batch(*this, files);
Batch batch(*this);
ReadBufferFromFile in{current_batch_file_path};
batch.readText(in);
file_indices_to_skip.insert(batch.file_indices.begin(), batch.file_indices.end());
batch.send();
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path);
fs::remove(current_batch_file_path);
}
std::unordered_map<BatchHeader, Batch, BatchHeader::Hash> header_to_batch;
for (const auto & file : files)
std::string file_path;
while (pending_files.tryPop(file_path))
{
if (quit)
return;
UInt64 file_idx = file.first;
const String & file_path = file.second;
if (file_indices_to_skip.contains(file_idx))
continue;
size_t total_rows = 0;
size_t total_bytes = 0;
Block header;
@ -1110,8 +1128,9 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
}
catch (const Exception & e)
{
if (maybeMarkAsBroken(file_path, e))
if (isFileBrokenErrorCode(e.code(), e.isRemoteException()))
{
markAsBroken(file_path);
tryLogCurrentException(log, "File is marked broken due to");
continue;
}
@ -1125,9 +1144,9 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
std::move(distributed_header.client_info),
std::move(header)
);
Batch & batch = header_to_batch.try_emplace(batch_header, *this, files).first->second;
Batch & batch = header_to_batch.try_emplace(batch_header, *this).first->second;
batch.file_indices.push_back(file_idx);
batch.files.push_back(file_path);
batch.total_rows += total_rows;
batch.total_bytes += total_bytes;
@ -1155,16 +1174,10 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
void StorageDistributedDirectoryMonitor::markAsBroken(const std::string & file_path)
{
const auto last_path_separator_pos = file_path.rfind('/');
const auto & base_path = file_path.substr(0, last_path_separator_pos + 1);
const auto & file_name = file_path.substr(last_path_separator_pos + 1);
const String & broken_path = fs::path(base_path) / "broken/";
const String & broken_file_path = fs::path(broken_path) / file_name;
fs::create_directory(broken_path);
const String & broken_file_path = fs::path(broken_path) / fs::path(file_path).filename();
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path);
auto broken_dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, fs::path(relative_path) / "broken/");
auto broken_dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, broken_relative_path);
{
std::lock_guard status_lock(status_mutex);
@ -1198,21 +1211,9 @@ void StorageDistributedDirectoryMonitor::markAsSend(const std::string & file_pat
fs::remove(file_path);
}
bool StorageDistributedDirectoryMonitor::maybeMarkAsBroken(const std::string & file_path, const Exception & e)
{
/// Mark file as broken if necessary.
if (isFileBrokenErrorCode(e.code(), e.isRemoteException()))
{
markAsBroken(file_path);
return true;
}
else
return false;
}
std::string StorageDistributedDirectoryMonitor::getLoggerName() const
{
return storage.getStorageID().getFullTableName() + ".DirectoryMonitor";
return storage.getStorageID().getFullTableName() + ".DirectoryMonitor." + disk->getName();
}
void StorageDistributedDirectoryMonitor::updatePath(const std::string & new_relative_path)
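
After this rewrite the directory is scanned exactly once, in initializeFilesFromDisk() at startup; afterwards new files enter through addAndSchedule() from DistributedSink, so the hot path never touches the filesystem to discover work. A standalone approximation of the startup scan using only std::filesystem (the real code additionally feeds CurrentMetrics and the pending_files queue):

```cpp
#include <cctype>
#include <cstdint>
#include <filesystem>
#include <string>
#include <vector>

namespace fs = std::filesystem;

static bool isAllDigits(const std::string & s)
{
    if (s.empty())
        return false;
    for (unsigned char c : s)
        if (!std::isdigit(c))
            return false;
    return true;
}

// Collects "<number>.bin" files and their total size, mimicking the
// startup scan; unexpected entries would be logged in the real code.
std::vector<std::string> collectPendingFiles(const fs::path & dir, std::uintmax_t & bytes)
{
    std::vector<std::string> pending;
    bytes = 0;
    for (const auto & entry : fs::directory_iterator(dir))
    {
        if (!entry.is_directory()
            && entry.path().extension() == ".bin"
            && isAllDigits(entry.path().stem().string()))
        {
            pending.push_back(entry.path().string());
            bytes += entry.file_size();
        }
    }
    return pending;
}
```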

View File

@ -1,6 +1,7 @@
#pragma once
#include <Core/BackgroundSchedulePool.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Client/ConnectionPool.h>
#include <atomic>
@ -38,7 +39,8 @@ public:
const std::string & relative_path_,
ConnectionPoolPtr pool_,
ActionBlocker & monitor_blocker_,
BackgroundSchedulePool & bg_pool);
BackgroundSchedulePool & bg_pool,
bool initialize_from_disk);
~StorageDistributedDirectoryMonitor();
@ -53,7 +55,7 @@ public:
static std::shared_ptr<ISource> createSourceFromFile(const String & file_name);
/// For scheduling via DistributedSink.
bool addAndSchedule(size_t file_size, size_t ms);
bool addAndSchedule(const std::string & file_path, size_t file_size, size_t ms);
struct InternalStatus
{
@ -78,14 +80,15 @@ public:
private:
void run();
std::map<UInt64, std::string> getFiles();
bool processFiles(const std::map<UInt64, std::string> & files);
bool hasPendingFiles() const;
void initializeFilesFromDisk();
void processFiles();
void processFile(const std::string & file_path);
void processFilesWithBatching(const std::map<UInt64, std::string> & files);
void processFilesWithBatching();
void markAsBroken(const std::string & file_path);
void markAsSend(const std::string & file_path);
bool maybeMarkAsBroken(const std::string & file_path, const Exception & e);
std::string getLoggerName() const;
@ -95,25 +98,33 @@ private:
DiskPtr disk;
std::string relative_path;
std::string path;
std::string broken_relative_path;
std::string broken_path;
const bool should_batch_inserts = false;
const bool split_batch_on_failure = true;
const bool dir_fsync = false;
const size_t min_batched_block_size_rows = 0;
const size_t min_batched_block_size_bytes = 0;
String current_batch_file_path;
/// This is pending data (due to some error) for should_batch_inserts==true
std::string current_batch_file_path;
/// This is pending data (due to some error) for should_batch_inserts==false
std::string current_batch_file;
struct BatchHeader;
struct Batch;
std::mutex status_mutex;
InternalStatus status;
ConcurrentBoundedQueue<std::string> pending_files;
const std::chrono::milliseconds default_sleep_time;
std::chrono::milliseconds sleep_time;
const std::chrono::milliseconds max_sleep_time;
std::chrono::time_point<std::chrono::system_clock> last_decrease_time {std::chrono::system_clock::now()};
std::atomic<bool> quit {false};
std::mutex mutex;
Poco::Logger * log;
ActionBlocker & monitor_blocker;

View File

@ -724,6 +724,9 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const
return guard;
};
std::vector<std::string> bin_files;
bin_files.reserve(dir_names.size());
auto it = dir_names.begin();
/// On the first iteration, write the block to a temporary directory for subsequent
/// hardlinking to ensure the inode is not freed until we're done.
@ -802,8 +805,8 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const
}
// Create hardlink here to reuse increment number
const std::string block_file_path(fs::path(path) / file_name);
createHardLink(first_file_tmp_path, block_file_path);
bin_files.push_back(fs::path(path) / file_name);
createHardLink(first_file_tmp_path, bin_files.back());
auto dir_sync_guard = make_directory_sync_guard(*it);
}
++it;
@ -814,8 +817,8 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const
const std::string path(fs::path(disk_path) / (data_path + *it));
fs::create_directory(path);
const std::string block_file_path(fs::path(path) / (toString(storage.file_names_increment.get()) + ".bin"));
createHardLink(first_file_tmp_path, block_file_path);
bin_files.push_back(fs::path(path) / (toString(storage.file_names_increment.get()) + ".bin"));
createHardLink(first_file_tmp_path, bin_files.back());
auto dir_sync_guard = make_directory_sync_guard(*it);
}
@ -826,10 +829,13 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const
/// Notify
auto sleep_ms = context->getSettingsRef().distributed_directory_monitor_sleep_time_ms;
for (const auto & dir_name : dir_names)
for (size_t i = 0; i < dir_names.size(); ++i)
{
const auto & dir_name = dir_names[i];
const auto & bin_file = bin_files[i];
auto & directory_monitor = storage.requireDirectoryMonitor(disk, dir_name, /* startup= */ false);
directory_monitor.addAndSchedule(file_size, sleep_ms.totalMilliseconds());
directory_monitor.addAndSchedule(bin_file, file_size, sleep_ms.totalMilliseconds());
}
}
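`writeToShard` now records every created hardlink in `bin_files`, so each directory monitor can later be told exactly which file to schedule. The hardlink trick itself is unchanged: one temporary file, one inode, one link per shard directory. A minimal sketch with `std::filesystem` and hypothetical paths:

```cpp
#include <filesystem>
#include <fstream>
#include <string>
#include <vector>

namespace fs = std::filesystem;

int main()
{
    // Write the block once into a temporary file ...
    const fs::path tmp_path = "tmp/1.bin";
    fs::create_directories(tmp_path.parent_path());
    std::ofstream(tmp_path) << "block data";

    // ... then hardlink it into each per-shard directory. All links share
    // one inode, so the data hits the disk exactly once and stays alive
    // until the last link is removed.
    std::vector<std::string> bin_files;
    for (const char * dir_name : {"shard1", "shard2"})
    {
        fs::create_directories(dir_name);
        bin_files.push_back((fs::path(dir_name) / "1.bin").string());
        fs::create_hard_link(tmp_path, bin_files.back());
    }

    // The temporary file can now be removed; each monitor is later told
    // exactly which bin_files entry to schedule.
    fs::remove(tmp_path);
}
```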

View File

@ -651,7 +651,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchSelectedPart(
}
#endif
LOG_WARNING(log, fmt::runtime(e.message() + " Will retry fetching part without zero-copy."));
LOG_WARNING(log, "Will retry fetching part without zero-copy: {}", e.message());
/// It's important to release session from HTTP pool. Otherwise it's possible to get deadlock
/// on http pool.
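The logging change here swaps a `fmt::runtime(...)` call on a concatenated message for a literal format string with the message as an argument, which fmt can check at compile time. A small sketch of the two styles:

```cpp
#include <fmt/format.h>
#include <string>

int main()
{
    const std::string e_message = "Zero-copy fetch failed";

    // Before: the concatenated message is itself the format string, so it
    // must be wrapped in fmt::runtime, and any stray '{' in the exception
    // text would throw at run time.
    fmt::print(fmt::runtime(e_message + ". Will retry fetching part without zero-copy.\n"));

    // After: a literal format string that fmt checks at compile time; the
    // message is an ordinary argument and needs no escaping.
    fmt::print("Will retry fetching part without zero-copy: {}\n", e_message);
}
```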

View File

@ -109,10 +109,11 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare()
/// 2. We have some larger merged part which covers new_part_name (and therefore it covers source_part_name too)
/// 3. We have two intersecting parts, both of which cover source_part_name. It's a logical error.
/// TODO: Why can 1 and 2 happen? Do we need more assertions here or somewhere else?
constexpr const char * message = "Part {} is covered by {} but should be merged into {}. This shouldn't happen often.";
LOG_WARNING(log, fmt::runtime(message), source_part_name, source_part_or_covering->name, entry.new_part_name);
constexpr auto fmt_string = "Part {} is covered by {} but should be merged into {}. This shouldn't happen often.";
String message;
LOG_WARNING(LogToStr(message, log), fmt_string, source_part_name, source_part_or_covering->name, entry.new_part_name);
if (!source_part_or_covering->info.contains(MergeTreePartInfo::fromPartName(entry.new_part_name, storage.format_version)))
throw Exception(ErrorCodes::LOGICAL_ERROR, message, source_part_name, source_part_or_covering->name, entry.new_part_name);
throw Exception(ErrorCodes::LOGICAL_ERROR, message);
return PrepareResult{
.prepared_successfully = false,

File diff suppressed because it is too large

View File

@ -1052,6 +1052,8 @@ public:
/// Returns an object that protects temporary directory from cleanup
scope_guard getTemporaryPartDirectoryHolder(const String & part_dir_name) const;
void waitForOutdatedPartsToBeLoaded() const;
protected:
friend class IMergeTreeDataPart;
friend class MergeTreeDataMergerMutator;
@ -1068,7 +1070,6 @@ protected:
/// under lockForShare if rename is possible.
String relative_data_path;
/// Current column sizes in compressed and uncompressed form.
ColumnSizeByName column_sizes;
@ -1330,6 +1331,88 @@ protected:
void resetObjectColumnsFromActiveParts(const DataPartsLock & lock);
void updateObjectColumns(const DataPartPtr & part, const DataPartsLock & lock);
/** A structure that explicitly represents a "merge tree" of parts
 * which is implicitly presented by min-max block numbers and levels of parts.
 * The children of a node are parts that are covered by the parent part.
 * This tree provides the order of loading of parts.
 *
 * We start traversal from the top level and load the parts that
 * correspond to the nodes. If a part is loaded successfully, then we
 * stop traversal at this node. Otherwise the part is broken, so we
 * traverse its children and try to load the covered parts, which will
 * replace the broken covering part. Unloaded nodes represent outdated parts
 * and they are pushed to a background task and loaded asynchronously.
 */
class PartLoadingTree
{
public:
struct Node
{
Node(const MergeTreePartInfo & info_, const String & name_, const DiskPtr & disk_)
: info(info_), name(name_), disk(disk_)
{
}
const MergeTreePartInfo info;
const String name;
const DiskPtr disk;
bool is_loaded = false;
std::map<MergeTreePartInfo, std::shared_ptr<Node>> children;
};
struct PartLoadingInfo
{
PartLoadingInfo(const MergeTreePartInfo & info_, const String & name_, const DiskPtr & disk_)
: info(info_), name(name_), disk(disk_)
{
}
/// Store name explicitly because it cannot be easily
/// retrieved from info in tables with old syntax.
MergeTreePartInfo info;
String name;
DiskPtr disk;
};
using NodePtr = std::shared_ptr<Node>;
using PartLoadingInfos = std::vector<PartLoadingInfo>;
/// Builds a tree from the list of part infos.
static PartLoadingTree build(PartLoadingInfos nodes);
/// Traverses the tree and calls @func on each node.
/// If recursive is false, traverses only the top level.
template <typename Func>
void traverse(bool recursive, Func && func);
private:
/// NOTE: Parts should be added in descending order of their levels,
/// because rearranging the tree to a new root is not supported.
void add(const MergeTreePartInfo & info, const String & name, const DiskPtr & disk);
std::unordered_map<String, NodePtr> root_by_partition;
};
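To make the loading order concrete, here is a rough sketch of such a containment tree and its traversal, with a deliberately simplified `PartInfo` (block range plus level; the real `MergeTreePartInfo` also tracks partition id and mutation):

```cpp
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <tuple>

// Hypothetical simplified part info: a block-number range plus a level.
// A part covers another if its range contains the other's range.
struct PartInfo
{
    int min_block = 0;
    int max_block = 0;
    int level = 0;

    bool operator<(const PartInfo & other) const
    {
        return std::tie(min_block, max_block, level)
             < std::tie(other.min_block, other.max_block, other.level);
    }
};

struct Node
{
    PartInfo info;
    std::string name;
    std::map<PartInfo, std::shared_ptr<Node>> children; // covered parts
};

// Traverse from the top: if a covering part loads successfully, the parts
// it covers are skipped; otherwise descend and try to load them instead.
void traverse(const std::shared_ptr<Node> & node, const std::function<bool(const Node &)> & try_load)
{
    if (try_load(*node))
        return;
    for (const auto & entry : node->children)
        traverse(entry.second, try_load);
}

int main()
{
    auto root = std::make_shared<Node>(Node{{1, 3, 2}, "all_1_3_2", {}});
    root->children[{1, 1, 0}] = std::make_shared<Node>(Node{{1, 1, 0}, "all_1_1_0", {}});
    root->children[{2, 3, 1}] = std::make_shared<Node>(Node{{2, 3, 1}, "all_2_3_1", {}});

    // Pretend the covering part is broken: its children get loaded instead.
    traverse(root, [](const Node & node) { return node.name != "all_1_3_2"; });
}
```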
using PartLoadingTreeNodes = std::vector<PartLoadingTree::NodePtr>;
struct LoadPartResult
{
bool is_broken = false;
std::optional<size_t> size_of_part;
MutableDataPartPtr part;
};
mutable std::mutex outdated_data_parts_mutex;
mutable std::condition_variable outdated_data_parts_cv;
BackgroundSchedulePool::TaskHolder outdated_data_parts_loading_task;
PartLoadingTreeNodes outdated_unloaded_data_parts TSA_GUARDED_BY(outdated_data_parts_mutex);
bool outdated_data_parts_loading_canceled TSA_GUARDED_BY(outdated_data_parts_mutex) = false;
void loadOutdatedDataParts(bool is_async);
void startOutdatedDataPartsLoadingTask();
void stopOutdatedDataPartsLoadingTask();
static void incrementInsertedPartsProfileEvent(MergeTreeDataPartType type);
static void incrementMergedPartsProfileEvent(MergeTreeDataPartType type);
@ -1408,18 +1491,20 @@ private:
/// Returns default settings for storage with possible changes from global config.
virtual std::unique_ptr<MergeTreeSettings> getDefaultSettings() const = 0;
void loadDataPartsFromDisk(
MutableDataPartsVector & broken_parts_to_detach,
MutableDataPartsVector & duplicate_parts_to_remove,
LoadPartResult loadDataPart(
const MergeTreePartInfo & part_info,
const String & part_name,
const DiskPtr & part_disk_ptr,
MergeTreeDataPartState to_state,
std::mutex & part_loading_mutex);
std::vector<LoadPartResult> loadDataPartsFromDisk(
ThreadPool & pool,
size_t num_parts,
std::queue<std::vector<std::pair<String, DiskPtr>>> & parts_queue,
bool skip_sanity_checks,
std::queue<PartLoadingTreeNodes> & parts_queue,
const MergeTreeSettingsPtr & settings);
void loadDataPartsFromWAL(
MutableDataPartsVector & duplicate_parts_to_remove,
MutableDataPartsVector & parts_from_wal);
void loadDataPartsFromWAL(MutableDataPartsVector & parts_from_wal);
/// Create zero-copy exclusive lock for part and disk. Useful for coordination of
/// distributed operations which can lead to data duplication. Implemented only in ReplicatedMergeTree.
@ -1430,7 +1515,7 @@ private:
/// Otherwise, in non-parallel case will break and return.
void clearPartsFromFilesystemImpl(const DataPartsVector & parts, NameSet * part_names_succeed);
static MutableDataPartPtr preparePartForRemoval(const DataPartPtr & part);
static MutableDataPartPtr asMutableDeletingPart(const DataPartPtr & part);
mutable TemporaryParts temporary_parts;
};

View File

@ -193,7 +193,8 @@ BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts(
if (!metadata_snapshot->hasPartitionKey()) /// Table is not partitioned.
{
result.emplace_back(Block(block), Row{});
result[0].offsets = chunk_offsets;
if (chunk_offsets != nullptr)
result[0].offsets = std::move(chunk_offsets->offsets);
return result;
}
@ -230,7 +231,7 @@ BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts(
/// do not interfere with possible calculated primary key columns of the same name.
result.emplace_back(Block(block), get_partition(0));
if (!chunk_offsets_with_partition.empty())
result[0].offsets = chunk_offsets_with_partition[0];
result[0].offsets = std::move(chunk_offsets_with_partition[0]->offsets);
return result;
}
@ -245,7 +246,7 @@ BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts(
}
for (size_t i = 0; i < chunk_offsets_with_partition.size(); ++i)
result[i].offsets = chunk_offsets_with_partition[i];
result[i].offsets = std::move(chunk_offsets_with_partition[i]->offsets);
return result;
}
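The writer now moves the offsets vector out of the shared `ChunkOffsets` object instead of storing the shared pointer, so each `BlockWithPartition` owns its offsets outright. A tiny sketch of that ownership transfer, with simplified stand-in types:

```cpp
#include <cassert>
#include <cstddef>
#include <memory>
#include <utility>
#include <vector>

// Hypothetical simplified stand-ins for ChunkOffsets and BlockWithPartition.
struct ChunkOffsets { std::vector<size_t> offsets; };
struct BlockWithPartition { std::vector<size_t> offsets; };

int main()
{
    auto chunk_offsets = std::make_shared<ChunkOffsets>();
    chunk_offsets->offsets = {0, 100, 250};

    BlockWithPartition part;
    // Move the vector out of the shared object: the block becomes the sole
    // owner of the offsets; no element copies, no shared_ptr kept alive.
    part.offsets = std::move(chunk_offsets->offsets);

    assert(part.offsets.size() == 3);
    // chunk_offsets->offsets is now valid but unspecified (typically empty)
    // and must not be relied on for its old contents.
}
```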

View File

@ -22,15 +22,15 @@ struct BlockWithPartition
{
Block block;
Row partition;
ChunkOffsetsPtr offsets;
std::vector<size_t> offsets;
BlockWithPartition(Block && block_, Row && partition_)
: block(block_), partition(std::move(partition_))
{
}
BlockWithPartition(Block && block_, Row && partition_, ChunkOffsetsPtr chunk_offsets_)
: block(block_), partition(std::move(partition_)), offsets(chunk_offsets_)
BlockWithPartition(Block && block_, Row && partition_, std::vector<size_t> && offsets_)
: block(block_), partition(std::move(partition_)), offsets(std::move(offsets_))
{
}
};

View File

@ -373,9 +373,9 @@ CheckResult ReplicatedMergeTreePartCheckThread::checkPart(const String & part_na
throw;
tryLogCurrentException(log, __PRETTY_FUNCTION__);
String message = "Part " + part_name + " looks broken. Removing it and will try to fetch.";
LOG_ERROR(log, fmt::runtime(message));
constexpr auto fmt_string = "Part {} looks broken. Removing it and will try to fetch.";
String message = fmt::format(fmt_string, part_name);
LOG_ERROR(log, fmt_string, part_name);
/// Delete part locally.
storage.outdateBrokenPartAndCloneToDetached(part, "broken");
@ -392,9 +392,9 @@ CheckResult ReplicatedMergeTreePartCheckThread::checkPart(const String & part_na
/// Probably, someone just wrote down the part, and has not yet added to ZK.
/// Therefore, delete only if the part is old (not very reliable).
ProfileEvents::increment(ProfileEvents::ReplicatedPartChecksFailed);
String message = "Unexpected part " + part_name + " in filesystem. Removing.";
LOG_ERROR(log, fmt::runtime(message));
constexpr auto fmt_string = "Unexpected part {} in filesystem. Removing.";
String message = fmt::format(fmt_string, part_name);
LOG_ERROR(log, fmt_string, part_name);
storage.outdateBrokenPartAndCloneToDetached(part, "unexpected");
return {part_name, false, message};
}
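This hunk, and the `ReplicatedMergeTreeQueue` ones below, follow the same pattern: format the message once, write it to the log, and keep the text for the return value or exception. ClickHouse wraps this in `LogToStr`; the sketch below is a hypothetical reimplementation of the idea (`Logger` and `logErrorToStr` are stand-ins, not the real API):

```cpp
#include <fmt/format.h>
#include <iostream>
#include <string>
#include <utility>

// Hypothetical stand-in for a logger; the real code uses Poco::Logger.
struct Logger
{
    void error(const std::string & msg) { std::cerr << "<Error> " << msg << '\n'; }
};

// Format once, keep the text in `out`, and log it: the caller can then
// return or rethrow the very same message without formatting it twice.
template <typename... Args>
void logErrorToStr(std::string & out, Logger & log, fmt::format_string<Args...> fmt_string, Args &&... args)
{
    out = fmt::format(fmt_string, std::forward<Args>(args)...);
    log.error(out);
}

int main()
{
    Logger log;
    std::string message;
    constexpr auto fmt_string = "Part {} looks broken. Removing it and will try to fetch.";
    logErrorToStr(message, log, fmt_string, "all_1_1_0");
    // `message` is now available for the CheckResult / exception as well.
}
```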

View File

@ -1191,12 +1191,10 @@ bool ReplicatedMergeTreeQueue::isCoveredByFuturePartsImpl(const LogEntry & entry
if (entry_for_same_part_it != future_parts.end())
{
const LogEntry & another_entry = *entry_for_same_part_it->second;
out_reason = fmt::format(
"Not executing log entry {} of type {} for part {} "
"because another log entry {} of type {} for the same part ({}) is being processed.",
entry.znode_name, entry.type, entry.new_part_name,
another_entry.znode_name, another_entry.type, another_entry.new_part_name);
LOG_INFO(log, fmt::runtime(out_reason));
constexpr auto fmt_string = "Not executing log entry {} of type {} for part {} "
"because another log entry {} of type {} for the same part ({}) is being processed.";
LOG_INFO(LogToStr(out_reason, log), fmt_string, entry.znode_name, entry.type, entry.new_part_name,
another_entry.znode_name, another_entry.type, another_entry.new_part_name);
return true;
/** When the corresponding action is completed, then `isNotCoveredByFuturePart` will succeed the next time,
@ -1238,11 +1236,9 @@ bool ReplicatedMergeTreeQueue::isCoveredByFuturePartsImpl(const LogEntry & entry
{
if (entry.znode_name < future_part_elem.second->znode_name)
{
out_reason = fmt::format(
"Not executing log entry {} for part {} "
"because it is not disjoint with part {} that is currently executing and another entry {} is newer.",
entry.znode_name, new_part_name, future_part_elem.first, future_part_elem.second->znode_name);
LOG_TRACE(log, fmt::runtime(out_reason));
constexpr auto fmt_string = "Not executing log entry {} for part {} "
"because it is not disjoint with part {} that is currently executing and another entry {} is newer.";
LOG_TRACE(LogToStr(out_reason, log), fmt_string, entry.znode_name, new_part_name, future_part_elem.first, future_part_elem.second->znode_name);
return true;
}
@ -1250,11 +1246,9 @@ bool ReplicatedMergeTreeQueue::isCoveredByFuturePartsImpl(const LogEntry & entry
continue;
}
out_reason = fmt::format(
"Not executing log entry {} for part {} "
"because it is not disjoint with part {} that is currently executing.",
entry.znode_name, new_part_name, future_part_elem.first);
LOG_TRACE(log, fmt::runtime(out_reason));
constexpr auto fmt_string = "Not executing log entry {} for part {} "
"because it is not disjoint with part {} that is currently executing.";
LOG_TEST(LogToStr(out_reason, log), fmt_string, entry.znode_name, new_part_name, future_part_elem.first);
return true;
}
@ -1337,11 +1331,9 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
{
if (future_parts.contains(name))
{
out_postpone_reason = fmt::format(
"Not executing log entry {} of type {} for part {} "
"because part {} is not ready yet (log entry for that part is being processed).",
entry.znode_name, entry.typeToString(), entry.new_part_name, name);
LOG_TRACE(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} of type {} for part {} "
"because part {} is not ready yet (log entry for that part is being processed).";
LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.typeToString(), entry.new_part_name, name);
return false;
}
@ -1357,10 +1349,8 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
if (merger_mutator.merges_blocker.isCancelled())
{
out_postpone_reason = fmt::format(
"Not executing log entry {} of type {} for part {} because merges and mutations are cancelled now.",
entry.znode_name, entry.typeToString(), entry.new_part_name);
LOG_DEBUG(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} of type {} for part {} because merges and mutations are cancelled now.";
LOG_DEBUG(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.typeToString(), entry.new_part_name);
return false;
}
@ -1375,8 +1365,8 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
if (!disks.empty() && only_s3_storage && storage.checkZeroCopyLockExists(entry.new_part_name, disks[0]))
{
out_postpone_reason = "Not executing merge/mutation for the part " + entry.new_part_name
+ ", waiting other replica to execute it and will fetch after.";
constexpr auto fmt_string = "Not executing merge/mutation for the part {}, waiting other replica to execute it and will fetch after.";
out_postpone_reason = fmt::format(fmt_string, entry.new_part_name);
return false;
}
}
@ -1387,9 +1377,8 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
if (replica_to_execute_merge && !merge_strategy_picker.isMergeFinishedByReplica(replica_to_execute_merge.value(), entry))
{
String reason = "Not executing merge for the part " + entry.new_part_name
+ ", waiting for " + replica_to_execute_merge.value() + " to execute merge.";
out_postpone_reason = reason;
constexpr auto fmt_string = "Not executing merge for the part {}, waiting for {} to execute merge.";
out_postpone_reason = fmt::format(fmt_string, entry.new_part_name, replica_to_execute_merge.value());
return false;
}
}
@ -1411,20 +1400,16 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
{
if (merger_mutator.ttl_merges_blocker.isCancelled())
{
out_postpone_reason = fmt::format(
"Not executing log entry {} for part {} because merges with TTL are cancelled now.",
entry.znode_name, entry.new_part_name);
LOG_DEBUG(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} for part {} because merges with TTL are cancelled now.";
LOG_DEBUG(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.new_part_name);
return false;
}
size_t total_merges_with_ttl = data.getTotalMergesWithTTLInMergeList();
if (total_merges_with_ttl >= data_settings->max_number_of_merges_with_ttl_in_pool)
{
out_postpone_reason = fmt::format(
"Not executing log entry {} for part {} because {} merges with TTL already executing, maximum {}.",
entry.znode_name, entry.new_part_name, total_merges_with_ttl,
data_settings->max_number_of_merges_with_ttl_in_pool);
LOG_DEBUG(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} for part {} because {} merges with TTL already executing, maximum {}.";
LOG_DEBUG(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.new_part_name, total_merges_with_ttl,
data_settings->max_number_of_merges_with_ttl_in_pool);
return false;
}
}
@ -1432,12 +1417,10 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
if (!ignore_max_size && sum_parts_size_in_bytes > max_source_parts_size)
{
out_postpone_reason = fmt::format("Not executing log entry {} of type {} for part {}"
" because source parts size ({}) is greater than the current maximum ({}).",
entry.znode_name, entry.typeToString(), entry.new_part_name,
ReadableSize(sum_parts_size_in_bytes), ReadableSize(max_source_parts_size));
LOG_DEBUG(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} of type {} for part {}"
" because source parts size ({}) is greater than the current maximum ({}).";
LOG_DEBUG(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.typeToString(), entry.new_part_name,
ReadableSize(sum_parts_size_in_bytes), ReadableSize(max_source_parts_size));
return false;
}
@ -1450,10 +1433,8 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
if (!alter_sequence.canExecuteMetaAlter(entry.alter_version, state_lock))
{
int head_alter = alter_sequence.getHeadAlterVersion(state_lock);
out_postpone_reason = fmt::format(
"Cannot execute alter metadata {} with version {} because another alter {} must be executed before",
entry.znode_name, entry.alter_version, head_alter);
LOG_TRACE(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Cannot execute alter metadata {} with version {} because another alter {} must be executed before";
LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.alter_version, head_alter);
return false;
}
}
@ -1466,17 +1447,13 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
int head_alter = alter_sequence.getHeadAlterVersion(state_lock);
if (head_alter == entry.alter_version)
{
out_postpone_reason = fmt::format(
"Cannot execute alter data {} with version {} because metadata still not altered",
entry.znode_name, entry.alter_version);
LOG_TRACE(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Cannot execute alter data {} with version {} because metadata still not altered";
LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.alter_version);
}
else
{
out_postpone_reason = fmt::format(
"Cannot execute alter data {} with version {} because another alter {} must be executed before",
entry.znode_name, entry.alter_version, head_alter);
LOG_TRACE(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Cannot execute alter data {} with version {} because another alter {} must be executed before";
LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.alter_version, head_alter);
}
return false;
@ -1498,14 +1475,12 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
{
if (drop_range_info.isDisjoint(info))
continue;
out_postpone_reason = fmt::format(
"Not executing log entry {} of type {} for part {} "
"because another DROP_RANGE or REPLACE_RANGE entry with not disjoint range {} is currently executing.",
entry.znode_name,
entry.typeToString(),
entry.new_part_name,
info.getPartNameForLogs());
LOG_TRACE(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} of type {} for part {} "
"because another DROP_RANGE or REPLACE_RANGE entry with not disjoint range {} is currently executing.";
LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name,
entry.typeToString(),
entry.new_part_name,
info.getPartNameForLogs());
return false;
}
}
@ -1531,11 +1506,10 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
auto new_part_info = MergeTreePartInfo::fromPartName(new_part_name, format_version);
if (!new_part_info.isDisjoint(drop_part_info))
{
out_postpone_reason = fmt::format(
"Not executing log entry {} of type {} for part {} "
"because it probably depends on {} (REPLACE_RANGE).",
entry.znode_name, entry.typeToString(), entry.new_part_name, replace_entry->znode_name);
LOG_TRACE(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} of type {} for part {} "
"because it probably depends on {} (REPLACE_RANGE).";
LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.typeToString(),
entry.new_part_name, replace_entry->znode_name);
return false;
}
}

Some files were not shown because too many files have changed in this diff