Merge branch 'master' into tests/improve-hung-check

This commit is contained in:
Alexander Tokmakov 2023-01-17 20:09:18 +01:00
commit 2cd0ba7fff
164 changed files with 4252 additions and 1230 deletions

View File

@ -683,3 +683,4 @@ jobs:
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 finish_check.py
python3 merge_pr.py

View File

@ -169,3 +169,4 @@ jobs:
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 finish_check.py
python3 merge_pr.py --check-approved

View File

@ -4388,3 +4388,4 @@ jobs:
run: |
cd "$GITHUB_WORKSPACE/tests/ci"
python3 finish_check.py
python3 merge_pr.py --check-approved

contrib/poco vendored

@ -1 +1 @@
Subproject commit 799234226187c0ae0b8c90f23465b25ed7956e56
Subproject commit 0ab9bba7ccad3c8dacce04a35cb3b78218547ab4

View File

@ -5,6 +5,7 @@ set -x
# core.COMM.PID-TID
sysctl kernel.core_pattern='core.%e.%p-%P'
dmesg --clear ||:
set -e
set -u
@ -368,6 +369,7 @@ if [ -f core.zst ]; then
fi
rg --text -F '<Fatal>' server.log > fatal.log ||:
dmesg -T > dmesg.log ||:
zstd --threads=0 server.log
@ -396,6 +398,7 @@ p.links a { padding: 5px; margin: 3px; background: #FFF; line-height: 2; white-s
<a href="fuzzer.log">fuzzer.log</a>
<a href="server.log.zst">server.log.zst</a>
<a href="main.log">main.log</a>
<a href="dmesg.log">dmesg.log</a>
${CORE_LINK}
</p>
<table>

View File

@ -128,6 +128,7 @@ function run_tests()
if [[ "${HIGH_LEVEL_COVERAGE}" = "YES" ]]; then
ADDITIONAL_OPTIONS+=('--report-coverage')
ADDITIONAL_OPTIONS+=('--report-logs-stats')
fi
set +e

View File

@ -289,6 +289,7 @@ if __name__ == "__main__":
"--database=system",
"--hung-check",
"--stress",
"--report-logs-stats",
"00001_select_1",
]
)

View File

@ -136,3 +136,7 @@ DESCRIBE TABLE test_database.test_table;
│ data │ Nullable(String) │
└────────┴───────────────────┘
```
## Related content
- Blog: [ClickHouse and PostgreSQL - a match made in data heaven - part 1](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres)

View File

@ -175,3 +175,6 @@ CREATE TABLE pg_table_schema_with_dots (a UInt32)
- [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md)
- [Using PostgreSQL as a dictionary source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql)
## Related content
- Blog: [ClickHouse and PostgreSQL - a match made in data heaven - part 1](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres)

View File

@ -1203,12 +1203,14 @@ SELECT * FROM json_each_row_nested
- [input_format_json_read_bools_as_numbers](/docs/en/operations/settings/settings.md/#input_format_json_read_bools_as_numbers) - allow to parse bools as numbers in JSON input formats. Default value - `true`.
- [input_format_json_read_numbers_as_strings](/docs/en/operations/settings/settings.md/#input_format_json_read_numbers_as_strings) - allow to parse numbers as strings in JSON input formats. Default value - `false`.
- [input_format_json_read_objects_as_strings](/docs/en/operations/settings/settings.md/#input_format_json_read_objects_as_strings) - allow to parse JSON objects as strings in JSON input formats. Default value - `false`.
- [input_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#input_format_json_named_tuples_as_objects) - parse named tuple columns as JSON objects. Default value - `true`.
- [input_format_json_defaults_for_missing_elements_in_named_tuple](/docs/en/operations/settings/settings.md/#input_format_json_defaults_for_missing_elements_in_named_tuple) - insert default values for missing elements in JSON object while parsing named tuple. Default value - `true`.
- [output_format_json_quote_64bit_integers](/docs/en/operations/settings/settings.md/#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`.
- [output_format_json_quote_64bit_floats](/docs/en/operations/settings/settings.md/#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`.
- [output_format_json_quote_denormals](/docs/en/operations/settings/settings.md/#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`.
- [output_format_json_quote_decimals](/docs/en/operations/settings/settings.md/#output_format_json_quote_decimals) - controls quoting of decimals in JSON output format. Default value - `false`.
- [output_format_json_escape_forward_slashes](/docs/en/operations/settings/settings.md/#output_format_json_escape_forward_slashes) - controls escaping forward slashes for string outputs in JSON output format. Default value - `true`.
- [output_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#output_format_json_named_tuples_as_objects) - serialize named tuple columns as JSON objects. Default value - `false`.
- [output_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#output_format_json_named_tuples_as_objects) - serialize named tuple columns as JSON objects. Default value - `true`.
- [output_format_json_array_of_rows](/docs/en/operations/settings/settings.md/#output_format_json_array_of_rows) - output a JSON array of all rows in JSONEachRow(Compact) format. Default value - `false`.
- [output_format_json_validate_utf8](/docs/en/operations/settings/settings.md/#output_format_json_validate_utf8) - enables validation of UTF-8 sequences in JSON output formats (note that it doesn't impact formats JSON/JSONCompact/JSONColumnsWithMetadata, they always validate utf8). Default value - `false`.
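With the default for `output_format_json_named_tuples_as_objects` now flipped to `true`, named tuple columns serialize as nested JSON objects out of the box. A minimal sketch (the expected output shape is indicative, assuming the default setting):
```sql
-- A named tuple is rendered as a nested JSON object rather than an array,
-- since output_format_json_named_tuples_as_objects defaults to true.
SELECT CAST((1, 'Hello'), 'Tuple(x UInt32, s String)') AS t
FORMAT JSONEachRow;
-- expected shape: {"t":{"x":1,"s":"Hello"}}
```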

View File

@ -266,7 +266,7 @@ Default value: 0.
Limits the size in bytes of the hash table used when joining tables.
This settings applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md).
This setting applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md).
If the query contains joins, ClickHouse checks this setting for every intermediate result.
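A minimal usage sketch (the byte limit below is an arbitrary illustrative value):
```sql
-- Cap the hash table used for JOINs at ~1 GiB for this session; the behaviour
-- on reaching the limit is governed by the join_overflow_mode setting.
SET max_bytes_in_join = 1073741824;
```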

View File

@ -402,40 +402,62 @@ Default value: `ALL`.
## join_algorithm {#settings-join_algorithm}
Specifies [JOIN](../../sql-reference/statements/select/join.md) algorithm.
Specifies which [JOIN](../../sql-reference/statements/select/join.md) algorithm is used.
Several algorithms can be specified, and an available one will be chosen for a particular query based on the kind/strictness and the table engine.
Possible values:
- `default``hash` or `direct`, if possible (same as `direct,hash`)
### `default`
- `hash` — [Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. The most generic implementation that supports all combinations of kind and strictness and multiple join keys that are combined with `OR` in the `JOIN ON` section.
This is the equivalent of `hash` or `direct`, if possible (same as `direct,hash`).
- `parallel_hash` - a variation of `hash` join that splits the data into buckets and builds several hashtables instead of one concurrently to speed up this process.
### `grace_hash`
[Grace hash join](https://en.wikipedia.org/wiki/Hash_join#Grace_hash_join) is used. Grace hash is an algorithm option that provides performant complex joins while limiting memory use.
The first phase of a grace join reads the right table and splits it into N buckets depending on the hash value of key columns (initially, N is `grace_hash_join_initial_buckets`). This is done in a way that ensures each bucket can be processed independently. Rows from the first bucket are added to an in-memory hash table while the others are saved to disk. If the hash table grows beyond the memory limit (e.g., as set by [`max_bytes_in_join`](/docs/en/operations/settings/query-complexity.md/#settings-max_bytes_in_join)), the number of buckets is increased and each row is reassigned to a bucket. Any rows which don't belong to the current bucket are flushed and reassigned.
### `hash`
[Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. The most generic implementation that supports all combinations of kind and strictness and multiple join keys that are combined with `OR` in the `JOIN ON` section.
### `parallel_hash`
A variation of the `hash` join that splits the data into buckets and concurrently builds several hash tables instead of one, to speed up this process.
When using the `hash` algorithm, the right part of `JOIN` is uploaded into RAM.
- `partial_merge` — a variation of the [sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join), where only the right table is fully sorted.
### `partial_merge`
A variation of the [sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join), where only the right table is fully sorted.
The `RIGHT JOIN` and `FULL JOIN` are supported only with `ALL` strictness (`SEMI`, `ANTI`, `ANY`, and `ASOF` are not supported).
When using `partial_merge` algorithm, ClickHouse sorts the data and dumps it to the disk. The `partial_merge` algorithm in ClickHouse differs slightly from the classic realization. First, ClickHouse sorts the right table by joining keys in blocks and creates a min-max index for sorted blocks. Then it sorts parts of the left table by `join key` and joins them over the right table. The min-max index is also used to skip unneeded right table blocks.
When using the `partial_merge` algorithm, ClickHouse sorts the data and dumps it to the disk. The `partial_merge` algorithm in ClickHouse differs slightly from the classic realization. First, ClickHouse sorts the right table by joining keys in blocks and creates a min-max index for sorted blocks. Then it sorts parts of the left table by the `join key` and joins them over the right table. The min-max index is also used to skip unneeded right table blocks.
- `direct` - can be applied when the right storage supports key-value requests.
### `direct`
This algorithm can be applied when the storage for the right table supports key-value requests.
The `direct` algorithm performs a lookup in the right table using rows from the left table as keys. It's supported only by special storage such as [Dictionary](../../engines/table-engines/special/dictionary.md/#dictionary) or [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) and only the `LEFT` and `INNER` JOINs.
- `auto` — try `hash` join and switch on the fly to another algorithm if the memory limit is violated.
### `auto`
- `full_sorting_merge` — [Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) with full sorting joined tables before joining.
When set to `auto`, the `hash` join is tried first, and the engine switches on the fly to another algorithm if the memory limit is violated.
- `prefer_partial_merge` — ClickHouse always tries to use `partial_merge` join if possible, otherwise, it uses `hash`. *Deprecated*, same as `partial_merge,hash`.
### `full_sorting_merge`
[Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) with full sorting of the joined tables before joining.
### `prefer_partial_merge`
ClickHouse always tries to use `partial_merge` join if possible, otherwise, it uses `hash`. *Deprecated*, same as `partial_merge,hash`.
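Since several algorithms can be listed, a short usage sketch of the forms described above:
```sql
-- Equivalent to the `default` value: use a direct lookup when the right-hand
-- storage supports it, otherwise fall back to a regular hash join.
SET join_algorithm = 'direct,hash';

-- Opt into the memory-bounded grace hash join; the initial bucket count is
-- taken from grace_hash_join_initial_buckets.
SET join_algorithm = 'grace_hash';
```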
## join_any_take_last_row {#settings-join_any_take_last_row}
Changes behaviour of join operations with `ANY` strictness.
Changes the behaviour of join operations with `ANY` strictness.
:::warning
This setting applies only for `JOIN` operations with [Join](../../engines/table-engines/special/join.md) engine tables.
@ -498,7 +520,7 @@ Default value: 65536.
Limits the number of files allowed for parallel sorting in MergeJoin operations when they are executed on disk.
The bigger the value of the setting, the more RAM used and the less disk I/O needed.
The bigger the value of the setting, the more RAM is used and the less disk I/O is needed.
Possible values:
@ -514,12 +536,12 @@ Enables legacy ClickHouse server behaviour in `ANY INNER|LEFT JOIN` operations.
Use this setting only for backward compatibility if your use cases depend on legacy `JOIN` behaviour.
:::
When the legacy behaviour enabled:
When the legacy behaviour is enabled:
- Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are not equal because ClickHouse uses the logic with many-to-one left-to-right table keys mapping.
- Results of `ANY INNER JOIN` operations contain all rows from the left table like the `SEMI LEFT JOIN` operations do.
When the legacy behaviour disabled:
When the legacy behaviour is disabled:
- Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are equal because ClickHouse uses the logic which provides one-to-many keys mapping in `ANY RIGHT JOIN` operations.
- Results of `ANY INNER JOIN` operations contain one row per key from both the left and right tables.
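A sketch for observing the difference, assuming the setting in question is `any_join_distinct_right_table_keys` (the setting name is not visible in this hunk); table names and data are illustrative:
```sql
CREATE TABLE t1 (k UInt32, v String) ENGINE = Memory;
CREATE TABLE t2 (k UInt32, v String) ENGINE = Memory;
INSERT INTO t1 VALUES (1, 'a'), (1, 'b');
INSERT INTO t2 VALUES (1, 'x');

-- With the legacy behaviour disabled (the default) these two queries return
-- equal results; with it enabled the row sets can differ, as described above.
SELECT * FROM t1 ANY LEFT JOIN t2 USING (k);
SELECT * FROM t2 ANY RIGHT JOIN t1 USING (k);
```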
@ -572,7 +594,7 @@ Default value: `163840`.
## merge_tree_min_rows_for_concurrent_read_for_remote_filesystem {#merge-tree-min-rows-for-concurrent-read-for-remote-filesystem}
The minimum number of lines to read from one file before [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem.
The minimum number of lines to read from one file before the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from a remote filesystem.
Possible values:
@ -706,7 +728,7 @@ log_queries=1
## log_queries_min_query_duration_ms {#settings-log-queries-min-query-duration-ms}
If enabled (non-zero), queries faster then the value of this setting will not be logged (you can think about this as a `long_query_time` for [MySQL Slow Query Log](https://dev.mysql.com/doc/refman/5.7/en/slow-query-log.html)), and this basically means that you will not find them in the following tables:
If enabled (non-zero), queries faster than the value of this setting will not be logged (you can think about this as a `long_query_time` for [MySQL Slow Query Log](https://dev.mysql.com/doc/refman/5.7/en/slow-query-log.html)), and this basically means that you will not find them in the following tables:
- `system.query_log`
- `system.query_thread_log`
@ -741,7 +763,7 @@ log_queries_min_type='EXCEPTION_WHILE_PROCESSING'
Setting up query threads logging.
Query threads log into [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting have effect only when [log_queries](#settings-log-queries) is true. Queries threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_thread_log) server configuration parameter.
Query threads log into the [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting has effect only when [log_queries](#settings-log-queries) is true. Query threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_thread_log) server configuration parameter.
Possible values:
@ -760,7 +782,7 @@ log_query_threads=1
Setting up query views logging.
When a query run by ClickHouse with this setup on has associated views (materialized or live views), they are logged in the [query_views_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_views_log) server configuration parameter.
When a query run by ClickHouse with this setting enabled has associated views (materialized or live views), they are logged in the [query_views_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_views_log) server configuration parameter.
Example:
@ -787,7 +809,7 @@ It can be used to improve the readability of server logs. Additionally, it helps
Possible values:
- Any string no longer than [max_query_size](#settings-max_query_size). If length is exceeded, the server throws an exception.
- Any string no longer than [max_query_size](#settings-max_query_size). If `max_query_size` is exceeded, the server throws an exception.
Default value: empty string.
@ -821,11 +843,11 @@ The setting also does not have a purpose when using INSERT SELECT, since data is
Default value: 1,048,576.
The default is slightly more than `max_block_size`. The reason for this is because certain table engines (`*MergeTree`) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, `*MergeTree` tables sort data during insertion, and a large enough block size allow sorting more data in RAM.
The default is slightly more than `max_block_size`. The reason for this is that certain table engines (`*MergeTree`) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, `*MergeTree` tables sort data during insertion, and a large enough block size allows sorting more data in RAM.
## min_insert_block_size_rows {#min-insert-block-size-rows}
Sets the minimum number of rows in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones.
Sets the minimum number of rows in the block that can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones.
Possible values:
@ -891,7 +913,7 @@ Higher values will lead to higher memory usage.
## max_compress_block_size {#max-compress-block-size}
The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). Specifying smaller block size generally leads to slightly reduced compression ratio, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced.
The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). Specifying a smaller block size generally leads to slightly reduced compression ratio, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced.
:::warning
This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse.
@ -935,7 +957,7 @@ Default value: 1000.
## interactive_delay {#interactive-delay}
The interval in microseconds for checking whether request execution has been cancelled and sending the progress.
The interval in microseconds for checking whether request execution has been canceled and sending the progress.
Default value: 100,000 (checks for cancelling and sends the progress ten times per second).
@ -4122,7 +4144,20 @@ Enabled by default.
Serialize named tuple columns as JSON objects.
Disabled by default.
Enabled by default.
### input_format_json_named_tuples_as_objects {#input_format_json_named_tuples_as_objects}
Parse named tuple columns as JSON objects.
Enabled by default.
### input_format_json_defaults_for_missing_elements_in_named_tuple {#input_format_json_defaults_for_missing_elements_in_named_tuple}
Insert default values for missing elements in JSON object while parsing named tuple.
This setting works only when setting `input_format_json_named_tuples_as_objects` is enabled.
Enabled by default.
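A minimal sketch of the two input settings working together (table and column names are illustrative):
```sql
CREATE TABLE tuples_demo (t Tuple(x UInt32, s String)) ENGINE = Memory;

-- input_format_json_named_tuples_as_objects (enabled by default) lets the
-- object below be parsed into the named tuple; the missing element `s` is
-- filled with its default value because
-- input_format_json_defaults_for_missing_elements_in_named_tuple is enabled.
INSERT INTO tuples_demo FORMAT JSONEachRow {"t": {"x": 1}}

SELECT * FROM tuples_demo; -- expected: (1,'')
```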
### output_format_json_array_of_rows {#output_format_json_array_of_rows}

View File

@ -120,5 +120,6 @@ Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec.
## Related Content
- [Extracting, converting, and querying data in local files using clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local)
- [Getting Data Into ClickHouse - Part 1](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1)
- [Exploring massive, real-world data sets: 100+ Years of Weather Records in ClickHouse](https://clickhouse.com/blog/real-world-data-noaa-climate-data)

View File

@ -57,6 +57,7 @@ ClickHouse-specific aggregate functions:
- [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md)
- [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md)
- [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md)
- [uniqTheta](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md)
- [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md)
- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md)
- [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md)
@ -77,4 +78,6 @@ ClickHouse-specific aggregate functions:
- [contingency](./contingency.md)
- [cramersV](./cramersv.md)
- [cramersVBiasCorrected](./cramersvbiascorrected.md)
- [theilsU](./theilsu.md)
- [theilsU](./theilsu.md)
- [maxIntersections](./maxintersections.md)
- [maxIntersectionsPosition](./maxintersectionsposition.md)

View File

@ -0,0 +1,64 @@
---
slug: /en/sql-reference/aggregate-functions/reference/maxintersections
sidebar_position: 360
title: maxIntersections
---
# maxIntersections
Aggregate function that calculates the maximum number of times that a group of intervals intersects each other (if all the intervals intersect at least once).
The syntax is:
```sql
maxIntersections(start_column, end_column)
```
**Arguments**
- `start_column` - the numeric column that represents the start of each interval. If `start_column` is `NULL` or 0 then the interval will be skipped.
- `end_column` - the numeric column that represents the end of each interval. If `end_column` is `NULL` or 0 then the interval will be skipped.
**Returned value**
Returns the maximum number of intersected intervals.
**Example**
```sql
CREATE TABLE my_events (
start UInt32,
end UInt32
)
Engine = MergeTree
ORDER BY tuple();
INSERT INTO my_events VALUES
(1, 3),
(1, 6),
(2, 5),
(3, 7);
```
The intervals look like the following:
```response
1 - 3
1 - - - - 6
2 - - 5
3 - - - 7
```
Three of these intervals have a common value (the value is `4`; the specific value is not important, as we are measuring the count of the intersections). The intervals `(1,3)` and `(3,7)` share an endpoint but are not considered intersecting by the `maxIntersections` function.
```sql
SELECT maxIntersections(start, end) FROM my_events;
```
Response:
```response
3
```
If you have multiple occurrences of the maximum interval, you can use the [`maxIntersectionsPosition` function](./maxintersectionsposition.md) to locate the number and location of those occurrences.

View File

@ -0,0 +1,64 @@
---
slug: /en/sql-reference/aggregate-functions/reference/maxintersectionsposition
sidebar_position: 361
title: maxIntersectionsPosition
---
# maxIntersectionsPosition
Aggregate function that calculates the positions of the occurrences of the [`maxIntersections` function](./maxintersections.md).
The syntax is:
```sql
maxIntersectionsPosition(start_column, end_column)
```
**Arguments**
- `start_column` - the numeric column that represents the start of each interval. If `start_column` is `NULL` or 0 then the interval will be skipped.
- `end_column` - the numeric column that represents the end of each interval. If `end_column` is `NULL` or 0 then the interval will be skipped.
**Returned value**
Returns the start positions of the maximum number of intersected intervals.
**Example**
```sql
CREATE TABLE my_events (
start UInt32,
end UInt32
)
Engine = MergeTree
ORDER BY tuple();
INSERT INTO my_events VALUES
(1, 3),
(1, 6),
(2, 5),
(3, 7);
```
The intervals look like the following:
```response
1 - 3
1 - - - - 6
2 - - 5
3 - - - 7
```
Notice that three of these intervals have the value 4 in common, and that group of intersecting intervals starts with the 2nd interval:
```sql
SELECT maxIntersectionsPosition(start, end) FROM my_events;
```
Response:
```response
2
```
In other words, the `(1,6)` row is the start of the 3 intervals that intersect, and 3 is the maximum number of intervals that intersect.

View File

@ -0,0 +1,68 @@
---
slug: /en/sql-reference/aggregate-functions/reference/quantileInterpolatedWeighted
sidebar_position: 203
---
# quantileInterpolatedWeighted
Computes [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using linear interpolation, taking into account the weight of each element.
To get the interpolated value, all the passed values are combined into an array, which is then sorted by the corresponding weights. Quantile interpolation is then performed using the [weighted percentile method](https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method): a cumulative distribution is built based on the weights, and then linear interpolation is performed over the weights and the values to compute the quantiles.
When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function.
**Syntax**
``` sql
quantileInterpolatedWeighted(level)(expr, weight)
```
Alias: `medianInterpolatedWeighted`.
**Arguments**
- `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
- `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
- `weight` — Column with weights of sequence members. Weight is a number of value occurrences.
**Returned value**
- Quantile of the specified level.
Type:
- [Float64](../../../sql-reference/data-types/float.md) for numeric data type input.
- [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type.
- [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type.
**Example**
Input table:
``` text
┌─n─┬─val─┐
│ 0 │ 3 │
│ 1 │ 2 │
│ 2 │ 1 │
│ 5 │ 4 │
└───┴─────┘
```
Query:
``` sql
SELECT quantileInterpolatedWeighted(n, val) FROM t
```
Result:
``` text
┌─quantileInterpolatedWeighted(n, val)─┐
│ 1 │
└──────────────────────────────────────┘
```
**See Also**
- [median](../../../sql-reference/aggregate-functions/reference/median.md#median)
- [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)

View File

@ -9,7 +9,7 @@ sidebar_position: 201
Syntax: `quantiles(level1, level2, …)(x)`
All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`, `quantilesBFloat16`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values.
All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantileInterpolatedWeighted`, `quantilesTDigest`, `quantilesBFloat16`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values.
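A minimal sketch of the one-pass form:
```sql
-- All three levels are computed in a single pass over the data
-- and returned as an array.
SELECT quantiles(0.25, 0.5, 0.75)(number) FROM numbers(100);
```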
## quantilesExactExclusive

View File

@ -6,6 +6,10 @@ sidebar_label: JSON
# JSON
:::warning
This feature is experimental and is not production ready. If you need to work with JSON documents, consider using [this guide](/docs/en/guides/developer/working-with-json/json-load-data.md) instead.
:::
Stores JavaScript Object Notation (JSON) documents in a single column.
`JSON` is an alias for `Object('json')`.
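A minimal sketch, assuming the experimental type is gated behind the `allow_experimental_object_type` setting:
```sql
SET allow_experimental_object_type = 1;

-- `JSON` is an alias for Object('json'): the whole document lives in one
-- column, and inferred subcolumns are addressable with dot syntax.
CREATE TABLE json_demo (obj JSON) ENGINE = Memory;
INSERT INTO json_demo VALUES ('{"user": {"id": 42, "name": "test"}}');
SELECT obj.user.id FROM json_demo;
```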

View File

@ -121,7 +121,7 @@ Accepts an empty array and returns a one-element array that is equal to the defa
## range(end), range(\[start, \] end \[, step\])
Returns an array of `UInt` numbers from `start` to `end - 1` by `step`.
Returns an array of numbers from `start` to `end - 1` by `step`. The supported types are [UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64](../data-types/int-uint.md).
**Syntax**
``` sql
@ -130,31 +130,30 @@ range([start, ] end [, step])
**Arguments**
- `start` — The first element of the array. Optional, required if `step` is used. Default value: 0. [UInt](../data-types/int-uint.md)
- `end` — The number before which the array is constructed. Required. [UInt](../data-types/int-uint.md)
- `step` — Determines the incremental step between each element in the array. Optional. Default value: 1. [UInt](../data-types/int-uint.md)
- `start` — The first element of the array. Optional, required if `step` is used. Default value: 0.
- `end` — The number before which the array is constructed. Required.
- `step` — Determines the incremental step between each element in the array. Optional. Default value: 1.
**Returned value**
- Array of `UInt` numbers from `start` to `end - 1` by `step`.
- Array of numbers from `start` to `end - 1` by `step`.
**Implementation details**
- All arguments must be positive values: `start`, `end`, `step` are `UInt` data types, as well as elements of the returned array.
- All arguments `start`, `end`, and `step` must be one of the following data types: `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64`. The type of the elements of the returned array is a supertype of all the argument types.
- An exception is thrown if query results in arrays with a total length of more than number of elements specified by the [function_range_max_elements_in_block](../../operations/settings/settings.md#settings-function_range_max_elements_in_block) setting.
**Examples**
Query:
``` sql
SELECT range(5), range(1, 5), range(1, 5, 2);
SELECT range(5), range(1, 5), range(1, 5, 2), range(-1, 5, 2);
```
Result:
```txt
┌─range(5)────┬─range(1, 5)─┬─range(1, 5, 2)─┐
│ [0,1,2,3,4] │ [1,2,3,4] │ [1,3] │
└─────────────┴─────────────┴────────────────┘
┌─range(5)────┬─range(1, 5)─┬─range(1, 5, 2)─┬─range(-1, 5, 2)─┐
│ [0,1,2,3,4] │ [1,2,3,4]   │ [1,3]          │ [-1,1,3]        │
└─────────────┴─────────────┴────────────────┴─────────────────┘
```
## array(x1, …), operator \[x1, …\]

View File

@ -39,3 +39,16 @@ SELECT * FROM generateRandom('a Array(Int8), d Decimal32(4), c Tuple(DateTime64(
│ [68] │ -67417.0770 │ ('2080-03-12 14:17:31.269','110425e5-413f-10a6-05ba-fa6b3e929f15') │
└──────────┴──────────────┴────────────────────────────────────────────────────────────────────┘
```
```sql
CREATE TABLE random (a Array(Int8), d Decimal32(4), c Tuple(DateTime64(3), UUID)) engine=Memory;
INSERT INTO random SELECT * FROM generateRandom() LIMIT 2;
SELECT * FROM random;
```
```text
┌─a────────────────────────────┬────────────d─┬─c──────────────────────────────────────────────────────────────────┐
│ [] │ 68091.8197 │ ('2037-10-02 12:44:23.368','039ecab7-81c2-45ee-208c-844e5c6c5652') │
│ [8,-83,0,-22,65,9,-30,28,64] │ -186233.4909 │ ('2062-01-11 00:06:04.124','69563ea1-5ad1-f870-16d8-67061da0df25') │
└──────────────────────────────┴──────────────┴────────────────────────────────────────────────────────────────────┘
```

View File

@ -117,7 +117,7 @@ SELECT notEmpty([1,2]);
## range(end), range(\[start, \] end \[, step\]) {#range}
Returns an array of `UInt` numbers from `start` to `end - 1` with `step` as the increment step.
Returns an array of integer numbers from `start` to `end - 1` with `step` as the increment step. The supported types are [`UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64`](../data-types/int-uint.md).
**Syntax**
``` sql
@ -126,31 +126,30 @@ range([start, ] end [, step])
**Arguments**
- `start` — The first element of the array. Optional, required if `step` is used. Default value: 0. Type: [UInt](../data-types/int-uint.md).
- `end` — The number before which the array is constructed, not included. Required. Type: [UInt](../data-types/int-uint.md).
- `step` — Determines the incremental step between each element in the array. Optional. Default value: 1. Type: [UInt](../data-types/int-uint.md).
- `start` — The first element of the array. Optional, required if `step` is used. Default value: 0.
- `end` — The number before which the array is constructed, not included. Required.
- `step` — Determines the incremental step between each element in the array. Optional. Default value: 1.
**Returned value**
- Array of `UInt` numbers from `start` to `end - 1` with `step` as the increment step.
- Array of numbers from `start` to `end - 1` with `step` as the increment step.
**Implementation details**
- All arguments must be positive values: `start`, `end`, `step` are all of type `UInt`, as are the elements of the resulting array.
- All arguments `start`, `end`, `step` must be one of the following types: [`UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64`](../data-types/int-uint.md). The element type of the resulting array is the smallest supertype of all the argument types and must also be one of these types.
- An exception is thrown if the query results in arrays with a total length exceeding the number of elements specified by the [function_range_max_elements_in_block](../../operations/settings/settings.md#settings-function_range_max_elements_in_block) setting.
**Examples**
Query:
``` sql
SELECT range(5), range(1, 5), range(1, 5, 2);
SELECT range(5), range(1, 5), range(1, 5, 2), range(-1, 5, 2);
```
Result:
```txt
┌─range(5)────┬─range(1, 5)─┬─range(1, 5, 2)─┐
│ [0,1,2,3,4] │ [1,2,3,4] │ [1,3] │
└─────────────┴─────────────┴────────────────┘
┌─range(5)────┬─range(1, 5)─┬─range(1, 5, 2)─┬─range(-1, 5, 2)─┐
│ [0,1,2,3,4] │ [1,2,3,4]   │ [1,3]          │ [-1,1,3]        │
└─────────────┴─────────────┴────────────────┴─────────────────┘
```
## array(x1, …), operator \[x1, …\] {#arrayx1-operator-x1}

View File

@ -140,6 +140,7 @@ namespace CurrentMetrics
namespace ProfileEvents
{
extern const Event MainConfigLoads;
extern const Event ServerStartupMilliseconds;
}
namespace fs = std::filesystem;
@ -652,6 +653,8 @@ static void sanityChecks(Server & server)
int Server::main(const std::vector<std::string> & /*args*/)
try
{
Stopwatch startup_watch;
Poco::Logger * log = &logger();
UseSSL use_ssl;
@ -1822,6 +1825,9 @@ try
LOG_INFO(log, "Ready for connections.");
}
startup_watch.stop();
ProfileEvents::increment(ProfileEvents::ServerStartupMilliseconds, startup_watch.elapsedMilliseconds());
try
{
global_context->startClusterDiscovery();

View File

@ -167,6 +167,7 @@ enum class AccessType
M(SYSTEM_SYNC_REPLICA, "SYNC REPLICA", TABLE, SYSTEM) \
M(SYSTEM_RESTART_REPLICA, "RESTART REPLICA", TABLE, SYSTEM) \
M(SYSTEM_RESTORE_REPLICA, "RESTORE REPLICA", TABLE, SYSTEM) \
M(SYSTEM_WAIT_LOADING_PARTS, "WAIT LOADING PARTS", TABLE, SYSTEM) \
M(SYSTEM_SYNC_DATABASE_REPLICA, "SYNC DATABASE REPLICA", DATABASE, SYSTEM) \
M(SYSTEM_SYNC_TRANSACTION_LOG, "SYNC TRANSACTION LOG", GLOBAL, SYSTEM) \
M(SYSTEM_FLUSH_DISTRIBUTED, "FLUSH DISTRIBUTED", TABLE, SYSTEM_FLUSH) \

View File

@ -53,7 +53,7 @@ TEST(AccessRights, Union)
"SHOW ROW POLICIES, SYSTEM MERGES, SYSTEM TTL MERGES, SYSTEM FETCHES, "
"SYSTEM MOVES, SYSTEM SENDS, SYSTEM REPLICATION QUEUES, "
"SYSTEM DROP REPLICA, SYSTEM SYNC REPLICA, SYSTEM RESTART REPLICA, "
"SYSTEM RESTORE REPLICA, SYSTEM SYNC DATABASE REPLICA, SYSTEM FLUSH DISTRIBUTED, dictGet ON db1.*");
"SYSTEM RESTORE REPLICA, SYSTEM WAIT LOADING PARTS, SYSTEM SYNC DATABASE REPLICA, SYSTEM FLUSH DISTRIBUTED, dictGet ON db1.*");
}

View File

@ -207,7 +207,7 @@ private:
{
// Fuse points if their text representations differ only in last digit
auto min_diff = 10 * (points[left].mean + points[right].mean) * std::numeric_limits<Mean>::epsilon();
if (points[left].mean + min_diff >= points[right].mean)
if (points[left].mean + std::fabs(min_diff) >= points[right].mean)
{
points[left] = points[left] + points[right];
}

View File

@ -232,6 +232,9 @@ struct NameQuantilesExactInclusive { static constexpr auto name = "quantilesExac
struct NameQuantileExactWeighted { static constexpr auto name = "quantileExactWeighted"; };
struct NameQuantilesExactWeighted { static constexpr auto name = "quantilesExactWeighted"; };
struct NameQuantileInterpolatedWeighted { static constexpr auto name = "quantileInterpolatedWeighted"; };
struct NameQuantilesInterpolatedWeighted { static constexpr auto name = "quantilesInterpolatedWeighted"; };
struct NameQuantileTiming { static constexpr auto name = "quantileTiming"; };
struct NameQuantileTimingWeighted { static constexpr auto name = "quantileTimingWeighted"; };
struct NameQuantilesTiming { static constexpr auto name = "quantilesTiming"; };

View File

@ -0,0 +1,70 @@
#include <AggregateFunctions/AggregateFunctionQuantile.h>
#include <AggregateFunctions/QuantileInterpolatedWeighted.h>
#include <AggregateFunctions/AggregateFunctionFactory.h>
#include <AggregateFunctions/Helpers.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeDateTime.h>
#include <Core/Field.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
}
namespace
{
template <typename Value, bool _> using FuncQuantileInterpolatedWeighted = AggregateFunctionQuantile<Value, QuantileInterpolatedWeighted<Value>, NameQuantileInterpolatedWeighted, true, void, false>;
template <typename Value, bool _> using FuncQuantilesInterpolatedWeighted = AggregateFunctionQuantile<Value, QuantileInterpolatedWeighted<Value>, NameQuantilesInterpolatedWeighted, true, void, true>;
template <template <typename, bool> class Function>
AggregateFunctionPtr createAggregateFunctionQuantile(
const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *)
{
/// Second argument type check doesn't depend on the type of the first one.
Function<void, true>::assertSecondArg(argument_types);
const DataTypePtr & argument_type = argument_types[0];
WhichDataType which(argument_type);
#define DISPATCH(TYPE) \
if (which.idx == TypeIndex::TYPE) return std::make_shared<Function<TYPE, true>>(argument_types, params);
FOR_BASIC_NUMERIC_TYPES(DISPATCH)
#undef DISPATCH
if (which.idx == TypeIndex::Date) return std::make_shared<Function<DataTypeDate::FieldType, false>>(argument_types, params);
if (which.idx == TypeIndex::DateTime) return std::make_shared<Function<DataTypeDateTime::FieldType, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal32) return std::make_shared<Function<Decimal32, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal64) return std::make_shared<Function<Decimal64, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal128) return std::make_shared<Function<Decimal128, false>>(argument_types, params);
if (which.idx == TypeIndex::Decimal256) return std::make_shared<Function<Decimal256, false>>(argument_types, params);
if (which.idx == TypeIndex::DateTime64) return std::make_shared<Function<DateTime64, false>>(argument_types, params);
if (which.idx == TypeIndex::Int128) return std::make_shared<Function<Int128, true>>(argument_types, params);
if (which.idx == TypeIndex::UInt128) return std::make_shared<Function<UInt128, true>>(argument_types, params);
if (which.idx == TypeIndex::Int256) return std::make_shared<Function<Int256, true>>(argument_types, params);
if (which.idx == TypeIndex::UInt256) return std::make_shared<Function<UInt256, true>>(argument_types, params);
throw Exception("Illegal type " + argument_type->getName() + " of argument for aggregate function " + name,
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
}
}
void registerAggregateFunctionsQuantileInterpolatedWeighted(AggregateFunctionFactory & factory)
{
/// For aggregate functions returning array we cannot return NULL on empty set.
AggregateFunctionProperties properties = { .returns_default_when_only_null = true };
factory.registerFunction(NameQuantileInterpolatedWeighted::name, createAggregateFunctionQuantile<FuncQuantileInterpolatedWeighted>);
factory.registerFunction(NameQuantilesInterpolatedWeighted::name, { createAggregateFunctionQuantile<FuncQuantilesInterpolatedWeighted>, properties });
/// 'median' is an alias for 'quantile'
factory.registerAlias("medianInterpolatedWeighted", NameQuantileInterpolatedWeighted::name);
}
}

View File

@ -0,0 +1,308 @@
#pragma once
#include <base/sort.h>
#include <Common/HashTable/HashMap.h>
#include <Common/NaNUtils.h>
namespace DB
{
struct Settings;
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
/** Approximates Quantile by:
* - sorting input values and weights
* - building a cumulative distribution based on weights
* - performing linear interpolation between the weights and values
*
*/
template <typename Value>
struct QuantileInterpolatedWeighted
{
struct Int128Hash
{
size_t operator()(Int128 x) const
{
return CityHash_v1_0_2::Hash128to64({x >> 64, x & 0xffffffffffffffffll});
}
};
using Weight = UInt64;
using UnderlyingType = NativeType<Value>;
using Hasher = std::conditional_t<std::is_same_v<Value, Decimal128>, Int128Hash, HashCRC32<UnderlyingType>>;
/// When creating, the hash table must be small.
using Map = HashMapWithStackMemory<UnderlyingType, Weight, Hasher, 4>;
Map map;
void add(const Value & x)
{
/// We must skip NaNs as they are not compatible with comparison sorting.
if (!isNaN(x))
++map[x];
}
void add(const Value & x, Weight weight)
{
if (!isNaN(x))
map[x] += weight;
}
void merge(const QuantileInterpolatedWeighted & rhs)
{
for (const auto & pair : rhs.map)
map[pair.getKey()] += pair.getMapped();
}
void serialize(WriteBuffer & buf) const
{
map.write(buf);
}
void deserialize(ReadBuffer & buf)
{
typename Map::Reader reader(buf);
while (reader.next())
{
const auto & pair = reader.get();
map[pair.first] = pair.second;
}
}
Value get(Float64 level) const
{
return getImpl<Value>(level);
}
void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result) const
{
getManyImpl<Value>(levels, indices, size, result);
}
/// The same, but in the case of an empty state, NaN is returned.
Float64 getFloat(Float64) const
{
throw Exception("Method getFloat is not implemented for QuantileInterpolatedWeighted", ErrorCodes::NOT_IMPLEMENTED);
}
void getManyFloat(const Float64 *, const size_t *, size_t, Float64 *) const
{
throw Exception("Method getManyFloat is not implemented for QuantileInterpolatedWeighted", ErrorCodes::NOT_IMPLEMENTED);
}
private:
using Pair = typename std::pair<UnderlyingType, Float64>;
/// Get the value of the `level` quantile. The level must be between 0 and 1.
template <typename T>
T getImpl(Float64 level) const
{
size_t size = map.size();
if (0 == size)
return std::numeric_limits<Value>::quiet_NaN();
/// Maintain a vector of pair of values and weights for easier sorting and for building
/// a cumulative distribution using the provided weights.
std::vector<Pair> value_weight_pairs;
value_weight_pairs.reserve(size);
/// Note: weight provided must be a 64-bit integer
/// Float64 is used as accumulator here to get approximate results.
/// But weight used in the internal array is stored as Float64 as we
/// do some quantile estimation operation which involves division and
/// require Float64 level of precision.
Float64 sum_weight = 0;
for (const auto & pair : map)
{
sum_weight += pair.getMapped();
auto value = pair.getKey();
auto weight = pair.getMapped();
value_weight_pairs.push_back({value, weight});
}
::sort(value_weight_pairs.begin(), value_weight_pairs.end(), [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 accumulated = 0;
/// vector for populating and storing the cumulative sum using the provided weights.
/// example: [0,1,2,3,4,5] -> [0,1,3,6,10,15]
std::vector<Float64> weights_cum_sum;
weights_cum_sum.reserve(size);
for (size_t idx = 0; idx < size; ++idx)
{
accumulated += value_weight_pairs[idx].second;
weights_cum_sum.push_back(accumulated);
}
/// The following estimation of quantile is general and the idea is:
/// https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method
/// calculates a simple cumulative distribution based on weights
if (sum_weight != 0)
{
for (size_t idx = 0; idx < size; ++idx)
value_weight_pairs[idx].second = (weights_cum_sum[idx] - 0.5 * value_weight_pairs[idx].second) / sum_weight;
}
/// perform linear interpolation
size_t idx = 0;
if (size >= 2)
{
if (level >= value_weight_pairs[size - 2].second)
{
idx = size - 2;
}
else
{
size_t start = 0, end = size - 1;
while (start <= end)
{
size_t mid = start + (end - start) / 2;
if (mid > size)
break;
if (level > value_weight_pairs[mid + 1].second)
start = mid + 1;
else
{
idx = mid;
end = mid - 1;
}
}
}
}
size_t l = idx;
size_t u = idx + 1 < size ? idx + 1 : idx;
Float64 xl = value_weight_pairs[l].second, xr = value_weight_pairs[u].second;
UnderlyingType yl = value_weight_pairs[l].first, yr = value_weight_pairs[u].first;
if (level < xl)
yr = yl;
if (level > xr)
yl = yr;
return static_cast<T>(interpolate(level, xl, xr, yl, yr));
}
/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
/// indices - an array of index levels such that the corresponding elements will go in ascending order.
template <typename T>
void getManyImpl(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const
{
size_t size = map.size();
if (0 == size)
{
for (size_t i = 0; i < num_levels; ++i)
result[i] = Value();
return;
}
std::vector<Pair> value_weight_pairs;
value_weight_pairs.reserve(size);
Float64 sum_weight = 0;
for (const auto & pair : map)
{
sum_weight += pair.getMapped();
auto value = pair.getKey();
auto weight = pair.getMapped();
value_weight_pairs.push_back({value, weight});
}
::sort(value_weight_pairs.begin(), value_weight_pairs.end(), [](const Pair & a, const Pair & b) { return a.first < b.first; });
Float64 accumulated = 0;
/// vector for populating and storing the cumulative sum using the provided weights.
/// example: [0,1,2,3,4,5] -> [0,1,3,6,10,15]
std::vector<Float64> weights_cum_sum;
weights_cum_sum.reserve(size);
for (size_t idx = 0; idx < size; ++idx)
{
accumulated += value_weight_pairs[idx].second;
weights_cum_sum.emplace_back(accumulated);
}
/// The following estimation of quantile is general and the idea is:
/// https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method
/// calculates a simple cumulative distribution based on weights
if (sum_weight != 0)
{
for (size_t idx = 0; idx < size; ++idx)
value_weight_pairs[idx].second = (weights_cum_sum[idx] - 0.5 * value_weight_pairs[idx].second) / sum_weight;
}
for (size_t level_index = 0; level_index < num_levels; ++level_index)
{
/// perform linear interpolation for every level
auto level = levels[indices[level_index]];
size_t idx = 0;
if (size >= 2)
{
if (level >= value_weight_pairs[size - 2].second)
{
idx = size - 2;
}
else
{
size_t start = 0, end = size - 1;
while (start <= end)
{
size_t mid = start + (end - start) / 2;
if (mid > size)
break;
if (level > value_weight_pairs[mid + 1].second)
start = mid + 1;
else
{
idx = mid;
end = mid - 1;
}
}
}
}
size_t l = idx;
size_t u = idx + 1 < size ? idx + 1 : idx;
Float64 xl = value_weight_pairs[l].second, xr = value_weight_pairs[u].second;
UnderlyingType yl = value_weight_pairs[l].first, yr = value_weight_pairs[u].first;
if (level < xl)
yr = yl;
if (level > xr)
yl = yr;
result[indices[level_index]] = static_cast<T>(interpolate(level, xl, xr, yl, yr));
}
}
/// This ignores overflows or NaN's that might arise during add, sub and mul operations and doesn't aim to provide exact
/// results since the `quantileInterpolatedWeighted` function itself relies mainly on approximation.
UnderlyingType NO_SANITIZE_UNDEFINED interpolate(Float64 level, Float64 xl, Float64 xr, UnderlyingType yl, UnderlyingType yr) const
{
UnderlyingType dy = yr - yl;
Float64 dx = xr - xl;
dx = dx == 0 ? 1 : dx; /// to handle NaN behavior that might arise during integer division below.
/// yl + (dy / dx) * (level - xl)
return static_cast<UnderlyingType>(yl + (dy / dx) * (level - xl));
}
};
}

View File

@ -21,6 +21,7 @@ void registerAggregateFunctionsQuantile(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileDeterministic(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileExact(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileExactWeighted(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileInterpolatedWeighted(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileExactLow(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileExactHigh(AggregateFunctionFactory &);
void registerAggregateFunctionsQuantileExactInclusive(AggregateFunctionFactory &);
@ -106,6 +107,7 @@ void registerAggregateFunctions()
registerAggregateFunctionsQuantileDeterministic(factory);
registerAggregateFunctionsQuantileExact(factory);
registerAggregateFunctionsQuantileExactWeighted(factory);
registerAggregateFunctionsQuantileInterpolatedWeighted(factory);
registerAggregateFunctionsQuantileExactLow(factory);
registerAggregateFunctionsQuantileExactHigh(factory);
registerAggregateFunctionsQuantileExactInclusive(factory);

View File

@ -11,6 +11,7 @@
#include <Parsers/ASTQualifiedAsterisk.h>
#include <Parsers/ASTColumnsMatcher.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTColumnsTransformers.h>
namespace DB
{
@ -206,19 +207,43 @@ QueryTreeNodePtr MatcherNode::cloneImpl() const
ASTPtr MatcherNode::toASTImpl() const
{
ASTPtr result;
ASTPtr transformers;
if (!children.empty())
{
transformers = std::make_shared<ASTColumnsTransformerList>();
for (const auto & child : children)
transformers->children.push_back(child->toAST());
}
if (matcher_type == MatcherNodeType::ASTERISK)
{
if (qualified_identifier.empty())
{
result = std::make_shared<ASTAsterisk>();
auto asterisk = std::make_shared<ASTAsterisk>();
if (transformers)
{
asterisk->transformers = std::move(transformers);
asterisk->children.push_back(asterisk->transformers);
}
result = asterisk;
}
else
{
auto qualified_asterisk = std::make_shared<ASTQualifiedAsterisk>();
auto identifier_parts = qualified_identifier.getParts();
qualified_asterisk->children.push_back(std::make_shared<ASTIdentifier>(std::move(identifier_parts)));
qualified_asterisk->qualifier = std::make_shared<ASTIdentifier>(std::move(identifier_parts));
qualified_asterisk->children.push_back(qualified_asterisk->qualifier);
if (transformers)
{
qualified_asterisk->transformers = std::move(transformers);
qualified_asterisk->children.push_back(qualified_asterisk->transformers);
}
result = qualified_asterisk;
}
@ -229,6 +254,13 @@ ASTPtr MatcherNode::toASTImpl() const
{
auto regexp_matcher = std::make_shared<ASTColumnsRegexpMatcher>();
regexp_matcher->setPattern(columns_matcher->pattern());
if (transformers)
{
regexp_matcher->transformers = std::move(transformers);
regexp_matcher->children.push_back(regexp_matcher->transformers);
}
result = regexp_matcher;
}
else
@ -237,7 +269,14 @@ ASTPtr MatcherNode::toASTImpl() const
regexp_matcher->setPattern(columns_matcher->pattern());
auto identifier_parts = qualified_identifier.getParts();
regexp_matcher->children.push_back(std::make_shared<ASTIdentifier>(std::move(identifier_parts)));
regexp_matcher->qualifier = std::make_shared<ASTIdentifier>(std::move(identifier_parts));
regexp_matcher->children.push_back(regexp_matcher->qualifier);
if (transformers)
{
regexp_matcher->transformers = std::move(transformers);
regexp_matcher->children.push_back(regexp_matcher->transformers);
}
result = regexp_matcher;
}
@ -257,23 +296,36 @@ ASTPtr MatcherNode::toASTImpl() const
{
auto columns_list_matcher = std::make_shared<ASTColumnsListMatcher>();
columns_list_matcher->column_list = std::move(column_list);
columns_list_matcher->children.push_back(columns_list_matcher->column_list);
if (transformers)
{
columns_list_matcher->transformers = std::move(transformers);
columns_list_matcher->children.push_back(columns_list_matcher->transformers);
}
result = columns_list_matcher;
}
else
{
auto columns_list_matcher = std::make_shared<ASTQualifiedColumnsListMatcher>();
columns_list_matcher->column_list = std::move(column_list);
auto identifier_parts = qualified_identifier.getParts();
columns_list_matcher->children.push_back(std::make_shared<ASTIdentifier>(std::move(identifier_parts)));
columns_list_matcher->qualifier = std::make_shared<ASTIdentifier>(std::move(identifier_parts));
columns_list_matcher->column_list = std::move(column_list);
columns_list_matcher->children.push_back(columns_list_matcher->qualifier);
columns_list_matcher->children.push_back(columns_list_matcher->column_list);
if (transformers)
{
columns_list_matcher->transformers = std::move(transformers);
columns_list_matcher->children.push_back(columns_list_matcher->transformers);
}
result = columns_list_matcher;
}
}
for (const auto & child : children)
result->children.push_back(child->toAST());
return result;
}

View File

@ -73,7 +73,7 @@ public:
if (!inner_function_node)
return;
auto & inner_function_arguments_nodes = inner_function_node->getArguments().getNodes();
const auto & inner_function_arguments_nodes = inner_function_node->getArguments().getNodes();
if (inner_function_arguments_nodes.size() != 2)
return;
@ -119,13 +119,15 @@ public:
{
lower_function_name = function_name_if_constant_is_negative;
}
resolveAggregateFunctionNode(*aggregate_function_node, inner_function_arguments_nodes[1], lower_function_name);
auto inner_function = aggregate_function_arguments_nodes[0];
auto inner_function_right_argument = std::move(inner_function_arguments_nodes[1]);
aggregate_function_arguments_nodes = {inner_function_right_argument};
inner_function_arguments_nodes[1] = node;
node = std::move(inner_function);
auto inner_function_clone = inner_function_node->clone();
auto & inner_function_clone_arguments = inner_function_clone->as<FunctionNode &>().getArguments();
auto & inner_function_clone_arguments_nodes = inner_function_clone_arguments.getNodes();
auto inner_function_clone_right_argument = inner_function_clone_arguments_nodes[1];
aggregate_function_arguments_nodes = {inner_function_clone_right_argument};
resolveAggregateFunctionNode(*aggregate_function_node, inner_function_clone_right_argument, lower_function_name);
inner_function_clone_arguments_nodes[1] = node;
node = std::move(inner_function_clone);
}
else if (right_argument_constant_node)
{
@ -136,18 +138,20 @@ public:
{
lower_function_name = function_name_if_constant_is_negative;
}
resolveAggregateFunctionNode(*aggregate_function_node, inner_function_arguments_nodes[0], function_name_if_constant_is_negative);
auto inner_function = aggregate_function_arguments_nodes[0];
auto inner_function_left_argument = std::move(inner_function_arguments_nodes[0]);
aggregate_function_arguments_nodes = {inner_function_left_argument};
inner_function_arguments_nodes[0] = node;
node = std::move(inner_function);
auto inner_function_clone = inner_function_node->clone();
auto & inner_function_clone_arguments = inner_function_clone->as<FunctionNode &>().getArguments();
auto & inner_function_clone_arguments_nodes = inner_function_clone_arguments.getNodes();
auto inner_function_clone_left_argument = inner_function_clone_arguments_nodes[0];
aggregate_function_arguments_nodes = {inner_function_clone_left_argument};
resolveAggregateFunctionNode(*aggregate_function_node, inner_function_clone_left_argument, lower_function_name);
inner_function_clone_arguments_nodes[0] = node;
node = std::move(inner_function_clone);
}
}
private:
static inline void resolveAggregateFunctionNode(FunctionNode & function_node, QueryTreeNodePtr & argument, const String & aggregate_function_name)
static inline void resolveAggregateFunctionNode(FunctionNode & function_node, const QueryTreeNodePtr & argument, const String & aggregate_function_name)
{
auto function_aggregate_function = function_node.getAggregateFunction();

View File

@ -0,0 +1,124 @@
#include <Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h>
#include <Analyzer/ColumnNode.h>
#include <Analyzer/FunctionNode.h>
#include <Analyzer/HashUtils.h>
#include <Analyzer/InDepthQueryTreeVisitor.h>
#include <Analyzer/QueryNode.h>
#include <Analyzer/SortNode.h>
#include <Functions/IFunction.h>
namespace DB
{
namespace
{
class OptimizeRedundantFunctionsInOrderByVisitor : public InDepthQueryTreeVisitor<OptimizeRedundantFunctionsInOrderByVisitor>
{
public:
static bool needChildVisit(QueryTreeNodePtr & node, QueryTreeNodePtr & /*parent*/)
{
if (node->as<FunctionNode>())
return false;
return true;
}
void visitImpl(QueryTreeNodePtr & node)
{
auto * query = node->as<QueryNode>();
if (!query)
return;
if (!query->hasOrderBy())
return;
auto & order_by = query->getOrderBy();
for (auto & elem : order_by.getNodes())
{
auto * order_by_elem = elem->as<SortNode>();
if (order_by_elem->withFill())
return;
}
QueryTreeNodes new_order_by_nodes;
new_order_by_nodes.reserve(order_by.getNodes().size());
for (auto & elem : order_by.getNodes())
{
auto & order_by_expr = elem->as<SortNode>()->getExpression();
switch (order_by_expr->getNodeType())
{
case QueryTreeNodeType::FUNCTION:
{
if (isRedundantExpression(order_by_expr))
continue;
break;
}
case QueryTreeNodeType::COLUMN:
{
existing_keys.insert(order_by_expr);
break;
}
default:
break;
}
new_order_by_nodes.push_back(elem);
}
existing_keys.clear();
if (new_order_by_nodes.size() < order_by.getNodes().size())
order_by.getNodes() = std::move(new_order_by_nodes);
}
private:
QueryTreeNodePtrWithHashSet existing_keys;
bool isRedundantExpression(QueryTreeNodePtr function)
{
QueryTreeNodes nodes_to_process{ function };
while (!nodes_to_process.empty())
{
auto node = nodes_to_process.back();
nodes_to_process.pop_back();
// TODO: handle constants here
switch (node->getNodeType())
{
case QueryTreeNodeType::FUNCTION:
{
auto * function_node = node->as<FunctionNode>();
const auto & function_arguments = function_node->getArguments().getNodes();
if (function_arguments.empty())
return false;
const auto & function_base = function_node->getFunction();
if (!function_base || !function_base->isDeterministicInScopeOfQuery())
return false;
// Process arguments in order
for (auto it = function_arguments.rbegin(); it != function_arguments.rend(); ++it)
nodes_to_process.push_back(*it);
break;
}
case QueryTreeNodeType::COLUMN:
{
if (!existing_keys.contains(node))
return false;
break;
}
default:
return false;
}
}
return true;
}
};
}
void OptimizeRedundantFunctionsInOrderByPass::run(QueryTreeNodePtr query_tree_node, ContextPtr /*context*/)
{
OptimizeRedundantFunctionsInOrderByVisitor().visit(query_tree_node);
}
}

View File

@ -0,0 +1,23 @@
#pragma once
#include <Analyzer/IQueryTreePass.h>
namespace DB
{
/** If ORDER BY has argument x followed by f(x), transform it to ORDER BY x.
* Optimize ORDER BY x, y, f(x), g(x, y), f(h(x)), t(f(x), g(x)) into ORDER BY x, y
* in case f(), g(), h(), t() are deterministic (in scope of query).
* Don't optimize ORDER BY f(x), g(x), x even if f(x) is a bijection for x or g(x).
*/
class OptimizeRedundantFunctionsInOrderByPass final : public IQueryTreePass
{
public:
String getName() override { return "OptimizeRedundantFunctionsInOrderBy"; }
String getDescription() override { return "If ORDER BY has argument x followed by f(x), transform it to ORDER BY x."; }
void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override;
};
}

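To see the algorithm of this pass in isolation: it walks the ORDER BY keys left to right, remembers plain columns, and drops any deterministic function expression whose column leaves were all seen earlier. A self-contained toy (not ClickHouse code; constants and non-deterministic functions are out of scope here, as in the TODO above):

```cpp
#include <iostream>
#include <set>
#include <string>
#include <vector>

struct Expr
{
    std::string name;           // column name, or function name
    std::vector<Expr> args;     // empty for a plain column
    bool deterministic;         // assumed known per function
};

// True if every column leaf of `e` is already an ORDER BY key.
bool isRedundant(const Expr & e, const std::set<std::string> & keys)
{
    if (e.args.empty())
        return keys.count(e.name) > 0;
    if (!e.deterministic)
        return false;
    for (const auto & arg : e.args)
        if (!isRedundant(arg, keys))
            return false;
    return true;
}

int main()
{
    // ORDER BY x, y, f(x), g(x, y)  ->  ORDER BY x, y
    std::vector<Expr> order_by = {
        {"x", {}, true}, {"y", {}, true},
        {"f", {{"x", {}, true}}, true},
        {"g", {{"x", {}, true}, {"y", {}, true}}, true},
    };
    std::set<std::string> keys;
    std::vector<Expr> kept;
    for (const auto & e : order_by)
    {
        if (!e.args.empty() && isRedundant(e, keys))
            continue;
        if (e.args.empty())
            keys.insert(e.name);
        kept.push_back(e);
    }
    for (const auto & e : kept)
        std::cout << e.name << ' ';  // prints: x y
    std::cout << '\n';
}
```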
View File

@ -77,11 +77,11 @@ public:
if (!nested_function || nested_function->getFunctionName() != "if")
return;
auto & nested_if_function_arguments_nodes = nested_function->getArguments().getNodes();
const auto & nested_if_function_arguments_nodes = nested_function->getArguments().getNodes();
if (nested_if_function_arguments_nodes.size() != 3)
return;
auto & cond_argument = nested_if_function_arguments_nodes[0];
const auto & cond_argument = nested_if_function_arguments_nodes[0];
const auto * if_true_condition_constant_node = nested_if_function_arguments_nodes[1]->as<ConstantNode>();
const auto * if_false_condition_constant_node = nested_if_function_arguments_nodes[2]->as<ConstantNode>();
@ -101,7 +101,7 @@ public:
/// Rewrite `sum(if(cond, 1, 0))` into `countIf(cond)`.
if (if_true_condition_value == 1 && if_false_condition_value == 0)
{
function_node_arguments_nodes[0] = std::move(nested_if_function_arguments_nodes[0]);
function_node_arguments_nodes[0] = nested_if_function_arguments_nodes[0];
function_node_arguments_nodes.resize(1);
resolveAsCountIfAggregateFunction(*function_node, function_node_arguments_nodes[0]->getResultType());
@ -120,7 +120,7 @@ public:
auto not_function = std::make_shared<FunctionNode>("not");
auto & not_function_arguments = not_function->getArguments().getNodes();
not_function_arguments.push_back(std::move(nested_if_function_arguments_nodes[0]));
not_function_arguments.push_back(nested_if_function_arguments_nodes[0]);
not_function->resolveAsFunction(FunctionFactory::instance().get("not", context)->build(not_function->getArgumentColumns()));

View File

@ -111,7 +111,7 @@ private:
QueryTreeNodePtr buildJoinTree(const ASTPtr & tables_in_select_query, const ContextPtr & context) const;
ColumnTransformersNodes buildColumnTransformers(const ASTPtr & matcher_expression, size_t start_child_index, const ContextPtr & context) const;
ColumnTransformersNodes buildColumnTransformers(const ASTPtr & matcher_expression, const ContextPtr & context) const;
ASTPtr query;
QueryTreeNodePtr query_tree_node;
@ -439,13 +439,13 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
}
else if (const auto * asterisk = expression->as<ASTAsterisk>())
{
auto column_transformers = buildColumnTransformers(expression, 0 /*start_child_index*/, context);
auto column_transformers = buildColumnTransformers(asterisk->transformers, context);
result = std::make_shared<MatcherNode>(std::move(column_transformers));
}
else if (const auto * qualified_asterisk = expression->as<ASTQualifiedAsterisk>())
{
auto & qualified_identifier = qualified_asterisk->children.at(0)->as<ASTTableIdentifier &>();
auto column_transformers = buildColumnTransformers(expression, 1 /*start_child_index*/, context);
auto & qualified_identifier = qualified_asterisk->qualifier->as<ASTIdentifier &>();
auto column_transformers = buildColumnTransformers(qualified_asterisk->transformers, context);
result = std::make_shared<MatcherNode>(Identifier(qualified_identifier.name_parts), std::move(column_transformers));
}
else if (const auto * ast_literal = expression->as<ASTLiteral>())
@ -543,7 +543,7 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
}
else if (const auto * columns_regexp_matcher = expression->as<ASTColumnsRegexpMatcher>())
{
auto column_transformers = buildColumnTransformers(expression, 0 /*start_child_index*/, context);
auto column_transformers = buildColumnTransformers(columns_regexp_matcher->transformers, context);
result = std::make_shared<MatcherNode>(columns_regexp_matcher->getMatcher(), std::move(column_transformers));
}
else if (const auto * columns_list_matcher = expression->as<ASTColumnsListMatcher>())
@ -557,18 +557,18 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
column_list_identifiers.emplace_back(Identifier{column_list_identifier.name_parts});
}
auto column_transformers = buildColumnTransformers(expression, 0 /*start_child_index*/, context);
auto column_transformers = buildColumnTransformers(columns_list_matcher->transformers, context);
result = std::make_shared<MatcherNode>(std::move(column_list_identifiers), std::move(column_transformers));
}
else if (const auto * qualified_columns_regexp_matcher = expression->as<ASTQualifiedColumnsRegexpMatcher>())
{
auto & qualified_identifier = qualified_columns_regexp_matcher->children.at(0)->as<ASTTableIdentifier &>();
auto column_transformers = buildColumnTransformers(expression, 1 /*start_child_index*/, context);
auto & qualified_identifier = qualified_columns_regexp_matcher->qualifier->as<ASTIdentifier &>();
auto column_transformers = buildColumnTransformers(qualified_columns_regexp_matcher->transformers, context);
result = std::make_shared<MatcherNode>(Identifier(qualified_identifier.name_parts), qualified_columns_regexp_matcher->getMatcher(), std::move(column_transformers));
}
else if (const auto * qualified_columns_list_matcher = expression->as<ASTQualifiedColumnsListMatcher>())
{
auto & qualified_identifier = qualified_columns_list_matcher->children.at(0)->as<ASTTableIdentifier &>();
auto & qualified_identifier = qualified_columns_list_matcher->qualifier->as<ASTIdentifier &>();
Identifiers column_list_identifiers;
column_list_identifiers.reserve(qualified_columns_list_matcher->column_list->children.size());
@ -579,7 +579,7 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
column_list_identifiers.emplace_back(Identifier{column_list_identifier.name_parts});
}
auto column_transformers = buildColumnTransformers(expression, 1 /*start_child_index*/, context);
auto column_transformers = buildColumnTransformers(qualified_columns_list_matcher->transformers, context);
result = std::make_shared<MatcherNode>(Identifier(qualified_identifier.name_parts), std::move(column_list_identifiers), std::move(column_transformers));
}
else
@ -833,15 +833,15 @@ QueryTreeNodePtr QueryTreeBuilder::buildJoinTree(const ASTPtr & tables_in_select
}
ColumnTransformersNodes QueryTreeBuilder::buildColumnTransformers(const ASTPtr & matcher_expression, size_t start_child_index, const ContextPtr & context) const
ColumnTransformersNodes QueryTreeBuilder::buildColumnTransformers(const ASTPtr & matcher_expression, const ContextPtr & context) const
{
ColumnTransformersNodes column_transformers;
size_t children_size = matcher_expression->children.size();
for (; start_child_index < children_size; ++start_child_index)
if (!matcher_expression)
return column_transformers;
for (const auto & child : matcher_expression->children)
{
const auto & child = matcher_expression->children[start_child_index];
if (auto * apply_transformer = child->as<ASTColumnsApplyTransformer>())
{
if (apply_transformer->lambda)

View File

@ -15,6 +15,7 @@
#include <Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.h>
#include <Analyzer/Passes/FuseFunctionsPass.h>
#include <Analyzer/Passes/IfTransformStringsToEnumPass.h>
#include <Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h>
#include <IO/WriteHelpers.h>
#include <IO/Operators.h>
@ -91,7 +92,6 @@ public:
* TODO: Support setting optimize_move_functions_out_of_any.
* TODO: Support setting optimize_aggregators_of_group_by_keys.
* TODO: Support setting optimize_duplicate_order_by_and_distinct.
* TODO: Support setting optimize_redundant_functions_in_order_by.
* TODO: Support setting optimize_monotonous_functions_in_order_by.
* TODO: Support settings.optimize_or_like_chain.
* TODO: Add optimizations based on function semantics. Example: SELECT * FROM test_table WHERE id != id. (id is not nullable column).
@ -203,6 +203,9 @@ void addQueryTreePasses(QueryTreePassManager & manager)
if (settings.optimize_if_chain_to_multiif)
manager.addPass(std::make_unique<IfChainToMultiIfPass>());
if (settings.optimize_redundant_functions_in_order_by)
manager.addPass(std::make_unique<OptimizeRedundantFunctionsInOrderByPass>());
manager.addPass(std::make_unique<OrderByTupleEliminationPass>());
manager.addPass(std::make_unique<OrderByLimitByDuplicateEliminationPass>());

View File

@ -156,10 +156,9 @@ void BackupWriterS3::copyObjectImpl(
const String & src_key,
const String & dst_bucket,
const String & dst_key,
const Aws::S3::Model::HeadObjectResult & head,
size_t size,
const std::optional<ObjectAttributes> & metadata) const
{
size_t size = head.GetContentLength();
LOG_TRACE(log, "Copying {} bytes using single-operation copy", size);
Aws::S3::Model::CopyObjectRequest request;
@ -177,7 +176,7 @@ void BackupWriterS3::copyObjectImpl(
if (!outcome.IsSuccess() && (outcome.GetError().GetExceptionName() == "EntityTooLarge"
|| outcome.GetError().GetExceptionName() == "InvalidRequest"))
{ // Can't come here with MinIO, MinIO allows single part upload for large objects.
copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, head, metadata);
copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, size, metadata);
return;
}
@ -191,10 +190,9 @@ void BackupWriterS3::copyObjectMultipartImpl(
const String & src_key,
const String & dst_bucket,
const String & dst_key,
const Aws::S3::Model::HeadObjectResult & head,
size_t size,
const std::optional<ObjectAttributes> & metadata) const
{
size_t size = head.GetContentLength();
LOG_TRACE(log, "Copying {} bytes using multipart upload copy", size);
String multipart_upload_id;
@ -309,16 +307,16 @@ void BackupWriterS3::copyFileNative(DiskPtr from_disk, const String & file_name_
std::string source_bucket = object_storage->getObjectsNamespace();
auto file_path = fs::path(s3_uri.key) / file_name_to;
auto head = S3::headObject(*client, source_bucket, objects[0].absolute_path).GetResult();
if (static_cast<size_t>(head.GetContentLength()) < request_settings.getUploadSettings().max_single_operation_copy_size)
auto size = S3::getObjectSize(*client, source_bucket, objects[0].absolute_path);
if (size < request_settings.getUploadSettings().max_single_operation_copy_size)
{
copyObjectImpl(
source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, head);
source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, size);
}
else
{
copyObjectMultipartImpl(
source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, head);
source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, size);
}
}
}

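The `copyFileNative` dispatch above now keys off a plain object size instead of a `HeadObjectResult`. A standalone sketch of the choice (the 5 GiB constant is the AWS single-operation CopyObject limit, used here purely for illustration; the real code reads the threshold from request settings):

```cpp
#include <cstddef>
#include <iostream>

constexpr size_t max_single_operation_copy_size = 5ULL * 1024 * 1024 * 1024;

// Sketch only: small objects go through one CopyObject call, large ones through
// a multipart upload copy.
const char * chooseCopyStrategy(size_t object_size)
{
    return object_size < max_single_operation_copy_size
        ? "single-operation copy"
        : "multipart upload copy";
}

int main()
{
    std::cout << chooseCopyStrategy(1024) << '\n';                       // single-operation copy
    std::cout << chooseCopyStrategy(6ULL * 1024 * 1024 * 1024) << '\n';  // multipart upload copy
}
```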
View File

@ -67,7 +67,7 @@ private:
const String & src_key,
const String & dst_bucket,
const String & dst_key,
const Aws::S3::Model::HeadObjectResult & head,
size_t size,
const std::optional<ObjectAttributes> & metadata = std::nullopt) const;
void copyObjectMultipartImpl(
@ -75,7 +75,7 @@ private:
const String & src_key,
const String & dst_bucket,
const String & dst_key,
const Aws::S3::Model::HeadObjectResult & head,
size_t size,
const std::optional<ObjectAttributes> & metadata = std::nullopt) const;
void removeFilesBatch(const Strings & file_names);

View File

@ -309,6 +309,8 @@ The server successfully detected this situation and will download merged part fr
M(S3CopyObject, "Number of S3 API CopyObject calls.") \
M(S3ListObjects, "Number of S3 API ListObjects calls.") \
M(S3HeadObject, "Number of S3 API HeadObject calls.") \
M(S3GetObjectAttributes, "Number of S3 API GetObjectAttributes calls.") \
M(S3GetObjectMetadata, "Number of S3 API GetObject calls for getting metadata.") \
M(S3CreateMultipartUpload, "Number of S3 API CreateMultipartUpload calls.") \
M(S3UploadPartCopy, "Number of S3 API UploadPartCopy calls.") \
M(S3UploadPart, "Number of S3 API UploadPart calls.") \
@ -321,6 +323,8 @@ The server successfully detected this situation and will download merged part fr
M(DiskS3CopyObject, "Number of DiskS3 API CopyObject calls.") \
M(DiskS3ListObjects, "Number of DiskS3 API ListObjects calls.") \
M(DiskS3HeadObject, "Number of DiskS3 API HeadObject calls.") \
M(DiskS3GetObjectAttributes, "Number of DiskS3 API GetObjectAttributes calls.") \
M(DiskS3GetObjectMetadata, "Number of DiskS3 API GetObject calls for getting metadata.") \
M(DiskS3CreateMultipartUpload, "Number of DiskS3 API CreateMultipartUpload calls.") \
M(DiskS3UploadPartCopy, "Number of DiskS3 API UploadPartCopy calls.") \
M(DiskS3UploadPart, "Number of DiskS3 API UploadPart calls.") \
@ -449,7 +453,8 @@ The server successfully detected this situation and will download merged part fr
M(OverflowBreak, "Number of times, data processing was cancelled by query complexity limitation with setting '*_overflow_mode' = 'break' and the result is incomplete.") \
M(OverflowThrow, "Number of times, data processing was cancelled by query complexity limitation with setting '*_overflow_mode' = 'throw' and exception was thrown.") \
M(OverflowAny, "Number of times approximate GROUP BY was in effect: when aggregation was performed only on top of first 'max_rows_to_group_by' unique keys and other keys were ignored due to 'group_by_overflow_mode' = 'any'.") \
\
M(ServerStartupMilliseconds, "Time elapsed from starting server to listening to sockets in milliseconds")\
namespace ProfileEvents
{

View File

@ -7,6 +7,29 @@
#include <Poco/Message.h>
#include <Common/CurrentThread.h>
/// This wrapper is useful to save a formatted message into a String before sending it to a logger
class LogToStrImpl
{
String & out_str;
Poco::Logger * logger;
bool propagate_to_actual_log = true;
public:
LogToStrImpl(String & out_str_, Poco::Logger * logger_) : out_str(out_str_) , logger(logger_) {}
LogToStrImpl & operator -> () { return *this; }
bool is(Poco::Message::Priority priority) { propagate_to_actual_log &= logger->is(priority); return true; }
LogToStrImpl * getChannel() {return this; }
const String & name() const { return logger->name(); }
void log(const Poco::Message & message)
{
out_str = message.getText();
if (!propagate_to_actual_log)
return;
if (auto * channel = logger->getChannel())
channel->log(message);
}
};
#define LogToStr(x, y) std::make_unique<LogToStrImpl>(x, y)
namespace
{
@ -17,8 +40,37 @@ namespace
[[maybe_unused]] const ::Poco::Logger * getLogger(const ::Poco::Logger * logger) { return logger; };
[[maybe_unused]] const ::Poco::Logger * getLogger(const std::atomic<::Poco::Logger *> & logger) { return logger.load(); };
[[maybe_unused]] std::unique_ptr<LogToStrImpl> getLogger(std::unique_ptr<LogToStrImpl> && logger) { return logger; };
template<typename T> struct is_fmt_runtime : std::false_type {};
template<typename T> struct is_fmt_runtime<fmt::basic_runtime<T>> : std::true_type {};
/// Usually we use LOG_*(...) macros with either string literals or fmt::runtime(whatever) as a format string.
/// This function is useful to get a string_view to a static format string passed to LOG_* macro.
template <typename T> constexpr std::string_view tryGetStaticFormatString(T && x)
{
if constexpr (is_fmt_runtime<T>::value)
{
/// It definitely was fmt::runtime(something).
/// We are not sure about a lifetime of the string, so return empty view.
/// Also it can be an arbitrary string, not a formatting pattern.
/// So returning empty pattern will not pollute the set of patterns.
return std::string_view();
}
else
{
/// Most likely it was a string literal.
/// Unfortunately, there's no good way to check if something is a string literal.
/// But fmtlib requires a format string to be compile-time constant unless fmt::runtime is used.
static_assert(std::is_nothrow_convertible<T, const char * const>::value);
static_assert(!std::is_pointer<T>::value);
return std::string_view(x);
}
}
}
#define LOG_IMPL_FIRST_ARG(X, ...) X
/// Logs a message to a specified logger with that level.
/// If more than one argument is provided,
/// the first argument is interpreted as template with {}-substitutions
@ -30,7 +82,7 @@ namespace
auto _logger = ::getLogger(logger); \
const bool _is_clients_log = (DB::CurrentThread::getGroup() != nullptr) && \
(DB::CurrentThread::getGroup()->client_logs_level >= (priority)); \
if (_logger->is((PRIORITY)) || _is_clients_log) \
if (_is_clients_log || _logger->is((PRIORITY))) \
{ \
std::string formatted_message = numArgs(__VA_ARGS__) > 1 ? fmt::format(__VA_ARGS__) : firstArg(__VA_ARGS__); \
if (auto _channel = _logger->getChannel()) \
@ -40,7 +92,7 @@ namespace
file_function += "; "; \
file_function += __PRETTY_FUNCTION__; \
Poco::Message poco_message(_logger->name(), formatted_message, \
(PRIORITY), file_function.c_str(), __LINE__); \
(PRIORITY), file_function.c_str(), __LINE__, tryGetStaticFormatString(LOG_IMPL_FIRST_ARG(__VA_ARGS__))); \
_channel->log(poco_message); \
} \
} \

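A hypothetical usage fragment for the `LogToStrImpl` wrapper above (assuming the surrounding ClickHouse logging headers): the message is always formatted and captured into the string, and only propagated to the real channel when the level is enabled. The extra `Poco::Message` constructor argument added in the hunk above carries the static format string, apparently so that consumers can group log messages by pattern.

```cpp
#include <Common/logger_useful.h>

void example(Poco::Logger * log)
{
    String captured;
    /// LogToStr stands in for the logger; `captured` receives the formatted text
    /// even if WARNING is disabled for `log`.
    LOG_WARNING(LogToStr(captured, log), "Loaded {} parts", 42);
    /// captured == "Loaded 42 parts"
}
```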
View File

@ -773,6 +773,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \
M(Bool, input_format_json_read_numbers_as_strings, false, "Allow to parse numbers as strings in JSON input formats", 0) \
M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \
M(Bool, input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \
M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \

View File

@ -80,7 +80,8 @@ namespace SettingsChangesHistory
/// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
{
{"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"}}},
{"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"},
{"input_format_json_defaults_for_missing_elements_in_named_tuple", false, true, "Allow missing elements in JSON objects while reading named tuples by default"}}},
{"22.12", {{"max_size_to_preallocate_for_aggregation", 10'000'000, 100'000'000, "This optimizes performance"},
{"query_plan_aggregation_in_order", 0, 1, "Enable some refactoring around query plan"},
{"format_binary_max_string_size", 0, 1_GiB, "Prevent allocating large amount of memory"}}},

View File

@ -16,6 +16,7 @@ namespace ErrorCodes
{
extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH;
extern const int NOT_FOUND_COLUMN_IN_BLOCK;
extern const int INCORRECT_DATA;
}
@ -154,7 +155,7 @@ void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, co
void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
{
if (settings.json.named_tuples_as_objects
if (settings.json.write_named_tuples_as_objects
&& have_explicit_names)
{
writeChar('{', ostr);
@ -185,7 +186,7 @@ void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_nu
void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
{
if (settings.json.named_tuples_as_objects
if (settings.json.read_named_tuples_as_objects
&& have_explicit_names)
{
skipWhitespaceIfAny(istr);
@ -194,12 +195,15 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr
addElementSafe(elems.size(), column, [&]
{
// Require all elements but in arbitrary order.
for (size_t i = 0; i < elems.size(); ++i)
std::vector<UInt8> seen_elements(elems.size(), 0);
size_t i = 0;
while (!istr.eof() && *istr.position() != '}')
{
if (i == elems.size())
throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. Expected no more than {}", elems.size());
if (i > 0)
{
skipWhitespaceIfAny(istr);
assertChar(',', istr);
skipWhitespaceIfAny(istr);
}
@ -211,12 +215,35 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr
skipWhitespaceIfAny(istr);
const size_t element_pos = getPositionByName(name);
seen_elements[element_pos] = 1;
auto & element_column = extractElementColumn(column, element_pos);
elems[element_pos]->deserializeTextJSON(element_column, istr, settings);
skipWhitespaceIfAny(istr);
++i;
}
skipWhitespaceIfAny(istr);
assertChar('}', istr);
/// Check if we have missing elements.
if (i != elems.size())
{
for (size_t element_pos = 0; element_pos != seen_elements.size(); ++element_pos)
{
if (seen_elements[element_pos])
continue;
if (!settings.json.defaults_for_missing_elements_in_named_tuple)
throw Exception(
ErrorCodes::INCORRECT_DATA,
"JSON object doesn't contain tuple element {}. If you want to insert defaults in case of missing elements, "
"enable setting input_format_json_defaults_for_missing_elements_in_named_tuple",
elems[element_pos]->getElementName());
auto & element_column = extractElementColumn(column, element_pos);
element_column.insertDefault();
}
}
});
}
else

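A standalone sketch (not ClickHouse code) of the bookkeeping above: record which tuple elements appear in the JSON object, then default-fill or reject the missing ones depending on the new setting.

```cpp
#include <iostream>
#include <map>
#include <stdexcept>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> tuple_elements = {"a", "b", "c"};
    std::map<std::string, std::string> json_object = {{"a", "1"}, {"c", "3"}};
    bool defaults_for_missing = true;  // input_format_json_defaults_for_missing_elements_in_named_tuple

    for (const auto & element : tuple_elements)
    {
        auto it = json_object.find(element);
        if (it != json_object.end())
            std::cout << element << " = " << it->second << '\n';
        else if (defaults_for_missing)
            std::cout << element << " = <default>\n";  // element_column.insertDefault()
        else
            throw std::runtime_error("JSON object doesn't contain tuple element " + element);
    }
}
```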
View File

@ -2,6 +2,7 @@
#include <Dictionaries/getDictionaryConfigurationFromAST.h>
#include <Interpreters/Cluster.h>
#include <Interpreters/Context.h>
#include <Interpreters/misc.h>
#include <Interpreters/InDepthNodeVisitor.h>
#include <Interpreters/evaluateConstantExpression.h>
#include <Interpreters/getClusterName.h>
@ -175,7 +176,7 @@ namespace
/// Finds dependencies of a function.
void visitFunction(const ASTFunction & function)
{
if (function.name == "joinGet" || function.name == "dictHas" || function.name == "dictIsIn" || function.name.starts_with("dictGet"))
if (functionIsJoinGet(function.name) || functionIsDictGet(function.name))
{
/// dictGet('dict_name', attr_names, id_expr)
/// dictHas('dict_name', id_expr)

View File

@ -1,6 +1,7 @@
#include <Databases/DDLLoadingDependencyVisitor.h>
#include <Dictionaries/getDictionaryConfigurationFromAST.h>
#include <Interpreters/Context.h>
#include <Interpreters/misc.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTIdentifier.h>
@ -52,23 +53,41 @@ bool DDLMatcherBase::needChildVisit(const ASTPtr & node, const ASTPtr & child)
return true;
}
ssize_t DDLMatcherBase::getPositionOfTableNameArgument(const ASTFunction & function)
ssize_t DDLMatcherBase::getPositionOfTableNameArgumentToEvaluate(const ASTFunction & function)
{
if (function.name == "joinGet" ||
function.name == "dictHas" ||
function.name == "dictIsIn" ||
function.name.starts_with("dictGet"))
if (functionIsJoinGet(function.name) || functionIsDictGet(function.name))
return 0;
if (Poco::toLower(function.name) == "in")
return -1;
}
ssize_t DDLMatcherBase::getPositionOfTableNameArgumentToVisit(const ASTFunction & function)
{
ssize_t maybe_res = getPositionOfTableNameArgumentToEvaluate(function);
if (0 <= maybe_res)
return maybe_res;
if (functionIsInOrGlobalInOperator(function.name))
{
if (function.children.empty())
return -1;
const auto * args = function.children[0]->as<ASTExpressionList>();
if (!args || args->children.size() != 2)
return -1;
if (args->children[1]->as<ASTFunction>())
return -1;
return 1;
}
return -1;
}
void DDLLoadingDependencyVisitor::visit(const ASTFunction & function, Data & data)
{
ssize_t table_name_arg_idx = getPositionOfTableNameArgument(function);
ssize_t table_name_arg_idx = getPositionOfTableNameArgumentToVisit(function);
if (table_name_arg_idx < 0)
return;
extractTableNameFromArgument(function, data, table_name_arg_idx);

View File
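Isolating the rule encoded above (a sketch; the exact name sets live in `Interpreters/misc.h`): `dictGet*`-style functions and `joinGet` carry a table name in argument 0, while the IN family carries one in argument 1 unless the right-hand side is itself a function, since an expression or subquery there is not a table dependency.

```cpp
#include <cstddef>
#include <string>

// Sketch only; returns -1 when there is no table-name argument to visit.
int tableNameArgumentPosition(const std::string & func, size_t num_args, bool rhs_is_function)
{
    if (func.rfind("dictGet", 0) == 0 || func == "dictHas" || func == "joinGet")
        return 0;
    if ((func == "in" || func == "notIn" || func == "globalIn" || func == "globalNotIn")
        && num_args == 2 && !rhs_is_function)
        return 1;
    return -1;
}
```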

@ -23,7 +23,8 @@ class DDLMatcherBase
{
public:
static bool needChildVisit(const ASTPtr & node, const ASTPtr & child);
static ssize_t getPositionOfTableNameArgument(const ASTFunction & function);
static ssize_t getPositionOfTableNameArgumentToVisit(const ASTFunction & function);
static ssize_t getPositionOfTableNameArgumentToEvaluate(const ASTFunction & function);
};
/// Visits ASTCreateQuery and extracts the names of all tables which should be loaded before a specified table.

View File

@ -23,7 +23,7 @@ void NormalizeAndEvaluateConstants::visit(const ASTFunction & function, Data & d
{
/// Replace expressions like "dictGet(currentDatabase() || '.dict', 'value', toUInt32(1))"
/// with "dictGet('db_name.dict', 'value', toUInt32(1))"
ssize_t table_name_arg_idx = getPositionOfTableNameArgument(function);
ssize_t table_name_arg_idx = getPositionOfTableNameArgumentToEvaluate(function);
if (table_name_arg_idx < 0)
return;

View File

@ -171,8 +171,9 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
if (!hasPendingDataToRead())
return false;
size_t size, offset;
chassert(file_offset_of_buffer_end <= impl->getFileSize());
size_t size, offset;
if (prefetch_future.valid())
{
ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::AsynchronousRemoteReadWaitMicroseconds);
@ -210,8 +211,8 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
/// In case of multiple files for the same file in clickhouse (i.e. log family)
/// file_offset_of_buffer_end will not match getImplementationBufferOffset()
/// so we use [impl->getImplementationBufferOffset(), impl->getFileSize()]
assert(file_offset_of_buffer_end >= impl->getImplementationBufferOffset());
assert(file_offset_of_buffer_end <= impl->getFileSize());
chassert(file_offset_of_buffer_end >= impl->getImplementationBufferOffset());
chassert(file_offset_of_buffer_end <= impl->getFileSize());
return bytes_read;
}
@ -277,6 +278,15 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence)
/// First reset the buffer so the next read will fetch new data to the buffer.
resetWorkingBuffer();
if (read_until_position && new_pos > *read_until_position)
{
ProfileEvents::increment(ProfileEvents::RemoteFSSeeksWithReset);
impl->reset();
file_offset_of_buffer_end = new_pos = *read_until_position; /// read_until_position is a non-included boundary.
return new_pos;
}
/**
* Lazy ignore. Save number of bytes to ignore and ignore it either for prefetch buffer or current buffer.
* Note: we read in range [file_offset_of_buffer_end, read_until_position).

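The new branch above pins an out-of-range seek at the boundary. Isolated as a sketch:

```cpp
#include <cstdint>
#include <optional>

// Sketch only: read_until_position is an exclusive boundary, so a seek past it
// resets the implementation buffer and returns the boundary itself.
int64_t clampSeekPosition(int64_t new_pos, std::optional<int64_t> read_until_position)
{
    if (read_until_position && new_pos > *read_until_position)
        return *read_until_position;
    return new_pos;
}
```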
View File

@ -256,7 +256,7 @@ size_t ReadBufferFromRemoteFSGather::getFileSize() const
String ReadBufferFromRemoteFSGather::getInfoForLog()
{
if (!current_buf)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get info: buffer not initialized");
return "";
return current_buf->getInfoForLog();
}

View File

@ -125,14 +125,19 @@ std::string S3ObjectStorage::generateBlobNameForPath(const std::string & /* path
getRandomASCIIString(key_name_total_size - key_name_prefix_size));
}
Aws::S3::Model::HeadObjectOutcome S3ObjectStorage::requestObjectHeadData(const std::string & bucket_from, const std::string & key) const
size_t S3ObjectStorage::getObjectSize(const std::string & bucket_from, const std::string & key) const
{
return S3::headObject(*client.get(), bucket_from, key, "", true);
return S3::getObjectSize(*client.get(), bucket_from, key, {}, /* for_disk_s3= */ true);
}
bool S3ObjectStorage::exists(const StoredObject & object) const
{
return S3::objectExists(*client.get(), bucket, object.absolute_path, "", true);
return S3::objectExists(*client.get(), bucket, object.absolute_path, {}, /* for_disk_s3= */ true);
}
void S3ObjectStorage::checkObjectExists(const std::string & bucket_from, const std::string & key, std::string_view description) const
{
return S3::checkObjectExists(*client.get(), bucket_from, key, {}, /* for_disk_s3= */ true, description);
}
std::unique_ptr<ReadBufferFromFileBase> S3ObjectStorage::readObjects( /// NOLINT
@ -409,13 +414,10 @@ ObjectMetadata S3ObjectStorage::getObjectMetadata(const std::string & path) cons
{
ObjectMetadata result;
auto object_head = requestObjectHeadData(bucket, path);
throwIfError(object_head);
auto & object_head_result = object_head.GetResult();
result.size_bytes = object_head_result.GetContentLength();
result.last_modified = object_head_result.GetLastModified().Millis();
result.attributes = object_head_result.GetMetadata();
auto object_info = S3::getObjectInfo(*client.get(), bucket, path, {}, /* for_disk_s3= */ true);
result.size_bytes = object_info.size;
result.last_modified = object_info.last_modification_time;
result.attributes = S3::getObjectMetadata(*client.get(), bucket, path, {}, /* for_disk_s3= */ true);
return result;
}
@ -442,7 +444,7 @@ void S3ObjectStorage::copyObjectImpl(
const String & src_key,
const String & dst_bucket,
const String & dst_key,
std::optional<Aws::S3::Model::HeadObjectResult> head,
size_t size,
std::optional<ObjectAttributes> metadata) const
{
auto client_ptr = client.get();
@ -464,7 +466,7 @@ void S3ObjectStorage::copyObjectImpl(
if (!outcome.IsSuccess() && (outcome.GetError().GetExceptionName() == "EntityTooLarge"
|| outcome.GetError().GetExceptionName() == "InvalidRequest"))
{ // Can't come here with MinIO, MinIO allows single part upload for large objects.
copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, head, metadata);
copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, size, metadata);
return;
}
@ -472,12 +474,7 @@ void S3ObjectStorage::copyObjectImpl(
auto settings_ptr = s3_settings.get();
if (settings_ptr->request_settings.check_objects_after_upload)
{
auto object_head = requestObjectHeadData(dst_bucket, dst_key);
if (!object_head.IsSuccess())
throw Exception(ErrorCodes::S3_ERROR, "Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", dst_key, dst_bucket);
}
checkObjectExists(dst_bucket, dst_key, "Immediately after upload");
}
void S3ObjectStorage::copyObjectMultipartImpl(
@ -485,15 +482,11 @@ void S3ObjectStorage::copyObjectMultipartImpl(
const String & src_key,
const String & dst_bucket,
const String & dst_key,
std::optional<Aws::S3::Model::HeadObjectResult> head,
size_t size,
std::optional<ObjectAttributes> metadata) const
{
if (!head)
head = requestObjectHeadData(src_bucket, src_key).GetResult();
auto settings_ptr = s3_settings.get();
auto client_ptr = client.get();
size_t size = head->GetContentLength();
String multipart_upload_id;
@ -569,29 +562,24 @@ void S3ObjectStorage::copyObjectMultipartImpl(
}
if (settings_ptr->request_settings.check_objects_after_upload)
{
auto object_head = requestObjectHeadData(dst_bucket, dst_key);
if (!object_head.IsSuccess())
throw Exception(ErrorCodes::S3_ERROR, "Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", dst_key, dst_bucket);
}
checkObjectExists(dst_bucket, dst_key, "Immediately after upload");
}
void S3ObjectStorage::copyObject( // NOLINT
const StoredObject & object_from, const StoredObject & object_to, std::optional<ObjectAttributes> object_to_attributes)
{
auto head = requestObjectHeadData(bucket, object_from.absolute_path).GetResult();
auto size = getObjectSize(bucket, object_from.absolute_path);
static constexpr int64_t multipart_upload_threashold = 5UL * 1024 * 1024 * 1024;
if (head.GetContentLength() >= multipart_upload_threashold)
if (size >= multipart_upload_threashold)
{
copyObjectMultipartImpl(
bucket, object_from.absolute_path, bucket, object_to.absolute_path, head, object_to_attributes);
bucket, object_from.absolute_path, bucket, object_to.absolute_path, size, object_to_attributes);
}
else
{
copyObjectImpl(
bucket, object_from.absolute_path, bucket, object_to.absolute_path, head, object_to_attributes);
bucket, object_from.absolute_path, bucket, object_to.absolute_path, size, object_to_attributes);
}
}

View File

@ -172,7 +172,7 @@ private:
const String & src_key,
const String & dst_bucket,
const String & dst_key,
std::optional<Aws::S3::Model::HeadObjectResult> head = std::nullopt,
size_t size,
std::optional<ObjectAttributes> metadata = std::nullopt) const;
void copyObjectMultipartImpl(
@ -180,13 +180,14 @@ private:
const String & src_key,
const String & dst_bucket,
const String & dst_key,
std::optional<Aws::S3::Model::HeadObjectResult> head = std::nullopt,
size_t size,
std::optional<ObjectAttributes> metadata = std::nullopt) const;
void removeObjectImpl(const StoredObject & object, bool if_exists);
void removeObjectsImpl(const StoredObjects & objects, bool if_exists);
Aws::S3::Model::HeadObjectOutcome requestObjectHeadData(const std::string & bucket_from, const std::string & key) const;
size_t getObjectSize(const std::string & bucket_from, const std::string & key) const;
void checkObjectExists(const std::string & bucket_from, const std::string & key, std::string_view description) const;
std::string bucket;

View File

@ -90,7 +90,9 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio;
format_settings.json.array_of_rows = settings.output_format_json_array_of_rows;
format_settings.json.escape_forward_slashes = settings.output_format_json_escape_forward_slashes;
format_settings.json.named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
format_settings.json.write_named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
format_settings.json.read_named_tuples_as_objects = settings.input_format_json_named_tuples_as_objects;
format_settings.json.defaults_for_missing_elements_in_named_tuple = settings.input_format_json_defaults_for_missing_elements_in_named_tuple;
format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
format_settings.json.quote_64bit_floats = settings.output_format_json_quote_64bit_floats;
format_settings.json.quote_denormals = settings.output_format_json_quote_denormals;

View File

@ -153,7 +153,9 @@ struct FormatSettings
bool quote_denormals = true;
bool quote_decimals = false;
bool escape_forward_slashes = true;
bool named_tuples_as_objects = false;
bool read_named_tuples_as_objects = false;
bool write_named_tuples_as_objects = false;
bool defaults_for_missing_elements_in_named_tuple = false;
bool serialize_as_strings = false;
bool read_bools_as_numbers = true;
bool read_numbers_as_strings = true;

View File

@ -118,6 +118,16 @@ struct MatchImpl
if (haystack_offsets.empty())
return;
/// Shortcut for the silly but practical case that the pattern matches everything/nothing independently of the haystack:
/// - col [not] [i]like '%' / '%%'
/// - match(col, '.*')
if ((is_like && (needle == "%" or needle == "%%")) || (!is_like && (needle == ".*" || needle == ".*?")))
{
for (auto & x : res)
x = !negate;
return;
}
/// Special case that the [I]LIKE expression reduces to finding a substring in a string
String strstr_pattern;
if (is_like && impl::likePatternIsSubstring(needle, strstr_pattern))
@ -267,6 +277,16 @@ struct MatchImpl
if (haystack.empty())
return;
/// Shortcut for the silly but practical case that the pattern matches everything/nothing independently of the haystack:
/// - col [not] [i]like '%' / '%%'
/// - match(col, '.*')
if ((is_like && (needle == "%" or needle == "%%")) || (!is_like && (needle == ".*" || needle == ".*?")))
{
for (auto & x : res)
x = !negate;
return;
}
/// Special case that the [I]LIKE expression reduces to finding a substring in a string
String strstr_pattern;
if (is_like && impl::likePatternIsSubstring(needle, strstr_pattern))

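The shortcut above can be isolated as follows (a sketch): once a needle is known to match every haystack, the result column is filled with `!negate` in one pass and no per-row matching happens.

```cpp
#include <string>

// Sketch only: LIKE needles "%" and "%%" and regexp needles ".*" and ".*?"
// match any string, independently of the haystack.
bool patternMatchesEverything(bool is_like, const std::string & needle)
{
    if (is_like)
        return needle == "%" || needle == "%%";
    return needle == ".*" || needle == ".*?";
}
```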
View File

@ -250,7 +250,7 @@ size_t ReadBufferFromS3::getFileSize()
if (file_size)
return *file_size;
auto object_size = S3::getObjectSize(*client_ptr, bucket, key, version_id, true, read_settings.for_object_storage);
auto object_size = S3::getObjectSize(*client_ptr, bucket, key, version_id, /* for_disk_s3= */ read_settings.for_object_storage);
file_size = object_size;
return *file_size;

View File

@ -27,6 +27,8 @@
# include <aws/core/utils/UUID.h>
# include <aws/core/http/HttpClientFactory.h>
# include <aws/s3/S3Client.h>
# include <aws/s3/model/GetObjectAttributesRequest.h>
# include <aws/s3/model/GetObjectRequest.h>
# include <aws/s3/model/HeadObjectRequest.h>
# include <IO/S3/PocoHTTPClientFactory.h>
@ -40,7 +42,11 @@
namespace ProfileEvents
{
extern const Event S3GetObjectAttributes;
extern const Event S3GetObjectMetadata;
extern const Event S3HeadObject;
extern const Event DiskS3GetObjectAttributes;
extern const Event DiskS3GetObjectMetadata;
extern const Event DiskS3HeadObject;
}
@ -699,6 +705,92 @@ public:
}
};
/// Extracts the endpoint from a constructed S3 client.
String getEndpoint(const Aws::S3::S3Client & client)
{
const auto * endpoint_provider = dynamic_cast<const Aws::S3::Endpoint::S3DefaultEpProviderBase *>(const_cast<Aws::S3::S3Client &>(client).accessEndpointProvider().get());
if (!endpoint_provider)
return {};
String endpoint;
endpoint_provider->GetBuiltInParameters().GetParameter("Endpoint").GetString(endpoint);
return endpoint;
}
/// Performs a request to get the size and last modification time of an object.
/// The function performs either HeadObject or GetObjectAttributes request depending on the endpoint.
std::pair<std::optional<DB::S3::ObjectInfo>, Aws::S3::S3Error> tryGetObjectInfo(
const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3)
{
auto endpoint = getEndpoint(client);
bool use_get_object_attributes_request = (endpoint.find(".amazonaws.com") != String::npos);
if (use_get_object_attributes_request)
{
/// It's better not to use `HeadObject` requests for AWS S3 because they don't work well with the global region.
/// Details: a `HeadObject` request never returns a response body (even on error). However,
/// if a request is sent without specifying a region in the endpoint (for example "https://test.s3.amazonaws.com/mydata.csv"
/// instead of "https://test.s3-us-west-2.amazonaws.com/mydata.csv"), the response body is one of the main ways
/// to determine the correct region and retry the request with it.
/// For any other request type (`GetObject`, `ListObjects`, etc.) AWS SDK does that because they have response bodies,
/// but for `HeadObject` there is no response body so this way doesn't work. That's why we use `GetObjectAttributes` request instead.
/// See https://github.com/aws/aws-sdk-cpp/issues/1558 and also the function S3ErrorMarshaller::ExtractRegion() for more information.
ProfileEvents::increment(ProfileEvents::S3GetObjectAttributes);
if (for_disk_s3)
ProfileEvents::increment(ProfileEvents::DiskS3GetObjectAttributes);
Aws::S3::Model::GetObjectAttributesRequest req;
req.SetBucket(bucket);
req.SetKey(key);
if (!version_id.empty())
req.SetVersionId(version_id);
req.SetObjectAttributes({Aws::S3::Model::ObjectAttributes::ObjectSize});
auto outcome = client.GetObjectAttributes(req);
if (outcome.IsSuccess())
{
const auto & result = outcome.GetResult();
DB::S3::ObjectInfo object_info;
object_info.size = static_cast<size_t>(result.GetObjectSize());
object_info.last_modification_time = result.GetLastModified().Millis() / 1000;
return {object_info, {}};
}
return {std::nullopt, outcome.GetError()};
}
else
{
/// By default we use `HeadObject` requests.
/// We cannot just use `GetObjectAttributes` requests always because some S3 providers (e.g. Minio)
/// don't support `GetObjectAttributes` requests.
ProfileEvents::increment(ProfileEvents::S3HeadObject);
if (for_disk_s3)
ProfileEvents::increment(ProfileEvents::DiskS3HeadObject);
Aws::S3::Model::HeadObjectRequest req;
req.SetBucket(bucket);
req.SetKey(key);
if (!version_id.empty())
req.SetVersionId(version_id);
auto outcome = client.HeadObject(req);
if (outcome.IsSuccess())
{
const auto & result = outcome.GetResult();
DB::S3::ObjectInfo object_info;
object_info.size = static_cast<size_t>(result.GetContentLength());
object_info.last_modification_time = result.GetLastModified().Millis() / 1000;
return {object_info, {}};
}
return {std::nullopt, outcome.GetError()};
}
}
}
@ -894,54 +986,33 @@ namespace S3
return error == Aws::S3::S3Errors::RESOURCE_NOT_FOUND || error == Aws::S3::S3Errors::NO_SUCH_KEY;
}
Aws::S3::Model::HeadObjectOutcome headObject(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3)
ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, bool throw_on_error)
{
ProfileEvents::increment(ProfileEvents::S3HeadObject);
if (for_disk_s3)
ProfileEvents::increment(ProfileEvents::DiskS3HeadObject);
Aws::S3::Model::HeadObjectRequest req;
req.SetBucket(bucket);
req.SetKey(key);
if (!version_id.empty())
req.SetVersionId(version_id);
return client.HeadObject(req);
}
S3::ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool throw_on_error, bool for_disk_s3)
{
auto outcome = headObject(client, bucket, key, version_id, for_disk_s3);
if (outcome.IsSuccess())
auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, for_disk_s3);
if (object_info)
{
auto read_result = outcome.GetResultWithOwnership();
return {.size = static_cast<size_t>(read_result.GetContentLength()), .last_modification_time = read_result.GetLastModified().Millis() / 1000};
return *object_info;
}
else if (throw_on_error)
{
const auto & error = outcome.GetError();
throw DB::Exception(ErrorCodes::S3_ERROR,
"Failed to HEAD object: {}. HTTP response code: {}",
"Failed to get object attributes: {}. HTTP response code: {}",
error.GetMessage(), static_cast<size_t>(error.GetResponseCode()));
}
return {};
}
size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool throw_on_error, bool for_disk_s3)
size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, bool throw_on_error)
{
return getObjectInfo(client, bucket, key, version_id, throw_on_error, for_disk_s3).size;
return getObjectInfo(client, bucket, key, version_id, for_disk_s3, throw_on_error).size;
}
bool objectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3)
{
auto outcome = headObject(client, bucket, key, version_id, for_disk_s3);
if (outcome.IsSuccess())
auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, for_disk_s3);
if (object_info)
return true;
const auto & error = outcome.GetError();
if (isNotFoundError(error.GetErrorType()))
return false;
@ -949,6 +1020,48 @@ namespace S3
"Failed to check existence of key {} in bucket {}: {}",
key, bucket, error.GetMessage());
}
void checkObjectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, std::string_view description)
{
auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, for_disk_s3);
if (object_info)
return;
throw S3Exception(error.GetErrorType(), "{}Object {} in bucket {} suddenly disappeared: {}",
(description.empty() ? "" : (String(description) + ": ")), key, bucket, error.GetMessage());
}
std::map<String, String> getObjectMetadata(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, bool throw_on_error)
{
ProfileEvents::increment(ProfileEvents::S3GetObjectMetadata);
if (for_disk_s3)
ProfileEvents::increment(ProfileEvents::DiskS3GetObjectMetadata);
/// We must not use the `HeadObject` request, see the comment about `HeadObjectRequest` in S3Common.h.
Aws::S3::Model::GetObjectRequest req;
req.SetBucket(bucket);
req.SetKey(key);
/// Only the first byte will be read.
/// We don't need that first byte, but the range should be set, otherwise the entire object will be read.
req.SetRange("bytes=0-0");
if (!version_id.empty())
req.SetVersionId(version_id);
auto outcome = client.GetObject(req);
if (outcome.IsSuccess())
return outcome.GetResult().GetMetadata();
if (!throw_on_error)
return {};
const auto & error = outcome.GetError();
throw S3Exception(error.GetErrorType(),
"Failed to get metadata of key {} in bucket {}: {}",
key, bucket, error.GetMessage());
}
}
}

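The endpoint test inside `tryGetObjectInfo` above reduces to one predicate (a sketch): AWS endpoints get `GetObjectAttributes`, because `HeadObject` has no response body and thus breaks automatic region correction, while other providers such as MinIO, which may lack `GetObjectAttributes`, keep using `HeadObject`.

```cpp
#include <string>

bool shouldUseGetObjectAttributes(const std::string & endpoint)
{
    return endpoint.find(".amazonaws.com") != std::string::npos;
}
```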
View File

@ -11,15 +11,15 @@
#if USE_AWS_S3
#include <base/types.h>
#include <aws/core/Aws.h>
#include <aws/core/client/ClientConfiguration.h>
#include <aws/s3/S3Client.h>
#include <aws/s3/S3Errors.h>
#include <Poco/URI.h>
#include <Common/Exception.h>
#include <Common/Throttler_fwd.h>
#include <Poco/URI.h>
#include <aws/core/Aws.h>
#include <aws/s3/S3Errors.h>
namespace Aws::S3 { class S3Client; }
namespace DB
{
@ -121,22 +121,29 @@ struct URI
static void validateBucket(const String & bucket, const Poco::URI & uri);
};
/// WARNING: Don't use `HeadObjectRequest`! Use the functions below instead.
/// For explanation see the comment about `HeadObject` request in the function tryGetObjectInfo().
struct ObjectInfo
{
size_t size = 0;
time_t last_modification_time = 0;
};
bool isNotFoundError(Aws::S3::S3Errors error);
ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, bool throw_on_error = true);
Aws::S3::Model::HeadObjectOutcome headObject(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false);
S3::ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool throw_on_error, bool for_disk_s3);
size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool throw_on_error, bool for_disk_s3);
size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, bool throw_on_error = true);
bool objectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false);
/// Throws an exception if a specified object doesn't exist. `description` is used as a part of the error message.
void checkObjectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, std::string_view description = {});
bool isNotFoundError(Aws::S3::S3Errors error);
/// Returns the object's metadata.
std::map<String, String> getObjectMetadata(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, bool throw_on_error = true);
}
#endif

View File

@ -182,12 +182,8 @@ void WriteBufferFromS3::finalizeImpl()
if (check_objects_after_upload)
{
LOG_TRACE(log, "Checking object {} exists after upload", key);
auto response = S3::headObject(*client_ptr, bucket, key, "", write_settings.for_object_storage);
if (!response.IsSuccess())
throw S3Exception(fmt::format("Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket), response.GetError().GetErrorType());
else
LOG_TRACE(log, "Object {} exists after upload", key);
S3::checkObjectExists(*client_ptr, bucket, key, {}, /* for_disk_s3= */ write_settings.for_object_storage, "Immediately after upload");
LOG_TRACE(log, "Object {} exists after upload", key);
}
}

View File

@ -28,13 +28,29 @@ DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTTableIdentifier &
database = current_database;
}
DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTIdentifier & identifier, const String & current_database)
{
alias = identifier.tryGetAlias();
if (identifier.name_parts.size() == 2)
std::tie(database, table) = std::tie(identifier.name_parts[0], identifier.name_parts[1]);
else if (identifier.name_parts.size() == 1)
table = identifier.name_parts[0];
else
throw Exception("Logical error: invalid identifier", ErrorCodes::LOGICAL_ERROR);
if (database.empty())
database = current_database;
}
DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTPtr & node, const String & current_database)
{
const auto * identifier = node->as<ASTTableIdentifier>();
if (!identifier)
throw Exception("Logical error: table identifier expected", ErrorCodes::LOGICAL_ERROR);
*this = DatabaseAndTableWithAlias(*identifier, current_database);
if (const auto * table_identifier = node->as<ASTTableIdentifier>())
*this = DatabaseAndTableWithAlias(*table_identifier, current_database);
else if (const auto * identifier = node->as<ASTIdentifier>())
*this = DatabaseAndTableWithAlias(*identifier, current_database);
else
throw Exception("Logical error: identifier or table identifier expected", ErrorCodes::LOGICAL_ERROR);
}
DatabaseAndTableWithAlias::DatabaseAndTableWithAlias(const ASTTableExpression & table_expression, const String & current_database)

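A standalone sketch of the new splitting rule above: two name parts mean `database.table`, one part is a bare table resolved against the current database, and anything else is rejected.

```cpp
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>

std::pair<std::string, std::string> splitTableIdentifier(
    const std::vector<std::string> & name_parts, const std::string & current_database)
{
    if (name_parts.size() == 2)
        return {name_parts[0], name_parts[1]};
    if (name_parts.size() == 1)
        return {current_database, name_parts[0]};
    throw std::invalid_argument("invalid identifier");
}
```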
View File

@ -14,6 +14,7 @@ namespace DB
{
class ASTSelectQuery;
class ASTIdentifier;
class ASTTableIdentifier;
struct ASTTableExpression;
@ -28,6 +29,7 @@ struct DatabaseAndTableWithAlias
DatabaseAndTableWithAlias() = default;
explicit DatabaseAndTableWithAlias(const ASTPtr & identifier_node, const String & current_database = "");
explicit DatabaseAndTableWithAlias(const ASTIdentifier & identifier, const String & current_database = "");
explicit DatabaseAndTableWithAlias(const ASTTableIdentifier & identifier, const String & current_database = "");
explicit DatabaseAndTableWithAlias(const ASTTableExpression & table_expression, const String & current_database = "");

View File

@ -25,6 +25,7 @@ static const std::unordered_map<String, String> quantile_fuse_name_mapping = {
{NameQuantileExactInclusive::name, NameQuantilesExactInclusive::name},
{NameQuantileExactLow::name, NameQuantilesExactLow::name},
{NameQuantileExactWeighted::name, NameQuantilesExactWeighted::name},
{NameQuantileInterpolatedWeighted::name, NameQuantilesInterpolatedWeighted::name},
{NameQuantileTDigest::name, NameQuantilesTDigest::name},
{NameQuantileTDigestWeighted::name, NameQuantilesTDigestWeighted::name},
{NameQuantileTiming::name, NameQuantilesTiming::name},
@ -61,9 +62,11 @@ void GatherFunctionQuantileData::FuseQuantileAggregatesData::addFuncNode(ASTPtr
const auto & arguments = func->arguments->children;
bool need_two_args = func->name == NameQuantileDeterministic::name || func->name == NameQuantileExactWeighted::name
|| func->name == NameQuantileTimingWeighted::name || func->name == NameQuantileTDigestWeighted::name
|| func->name == NameQuantileBFloat16Weighted::name;
|| func->name == NameQuantileInterpolatedWeighted::name || func->name == NameQuantileTimingWeighted::name
|| func->name == NameQuantileTDigestWeighted::name || func->name == NameQuantileBFloat16Weighted::name;
if (arguments.size() != (need_two_args ? 2 : 1))
return;

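A toy sketch (not ClickHouse code) of the fusion this mapping serves: identical `quantile*` calls over the same column collapse into a single `quantiles*` call that computes all levels in one pass. Weighted variants (the `need_two_args` set above) also take a weight argument, which the toy omits; the `quantile*` to `quantiles*` renaming is an explicit table in the real code.

```cpp
#include <iostream>
#include <map>
#include <string>
#include <utility>
#include <vector>

int main()
{
    struct Call { std::string func, column; double level; };
    std::vector<Call> calls = {
        {"quantileTiming", "t", 0.5},
        {"quantileTiming", "t", 0.9},
        {"quantileExact", "x", 0.99},
    };

    // Group by (function, column); each group becomes one fused call.
    std::map<std::pair<std::string, std::string>, std::vector<double>> groups;
    for (const auto & c : calls)
        groups[{c.func, c.column}].push_back(c.level);

    for (const auto & [key, levels] : groups)
    {
        // "quantileTiming" -> "quantilesTiming" (naive renaming, for illustration)
        std::string fused = "quantiles" + key.first.substr(std::string("quantile").size());
        std::cout << fused << '(';
        for (size_t i = 0; i < levels.size(); ++i)
            std::cout << (i ? ", " : "") << levels[i];
        std::cout << ")(" << key.second << ")\n";
    }
}
```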
View File

@ -288,6 +288,20 @@ struct ExplainSettings : public Settings
}
};
struct QuerySyntaxSettings
{
bool oneline = false;
constexpr static char name[] = "SYNTAX";
std::unordered_map<std::string, std::reference_wrapper<bool>> boolean_settings =
{
{"oneline", oneline},
};
std::unordered_map<std::string, std::reference_wrapper<Int64>> integer_settings;
};
template <typename Settings>
ExplainSettings<Settings> checkAndGetSettings(const ASTPtr & ast_settings)
{
@ -362,13 +376,12 @@ QueryPipeline InterpreterExplainQuery::executeImpl()
}
case ASTExplainQuery::AnalyzedSyntax:
{
if (ast.getSettings())
throw Exception("Settings are not supported for EXPLAIN SYNTAX query.", ErrorCodes::UNKNOWN_SETTING);
auto settings = checkAndGetSettings<QuerySyntaxSettings>(ast.getSettings());
ExplainAnalyzedSyntaxVisitor::Data data(getContext());
ExplainAnalyzedSyntaxVisitor(data).visit(query);
ast.getExplainedQuery()->format(IAST::FormatSettings(buf, false));
ast.getExplainedQuery()->format(IAST::FormatSettings(buf, settings.oneline));
break;
}
case ASTExplainQuery::QueryTree:

View File

@ -487,7 +487,7 @@ BlockIO InterpreterSystemQuery::execute()
dropDatabaseReplica(query);
break;
case Type::SYNC_REPLICA:
syncReplica(query);
syncReplica();
break;
case Type::SYNC_DATABASE_REPLICA:
syncReplicatedDatabase(query);
@ -507,6 +507,9 @@ BlockIO InterpreterSystemQuery::execute()
case Type::RESTORE_REPLICA:
restoreReplica();
break;
case Type::WAIT_LOADING_PARTS:
waitLoadingParts();
break;
case Type::RESTART_DISK:
restartDisk(query.disk);
case Type::FLUSH_LOGS:
@ -852,7 +855,7 @@ void InterpreterSystemQuery::dropDatabaseReplica(ASTSystemQuery & query)
throw Exception("Invalid query", ErrorCodes::LOGICAL_ERROR);
}
void InterpreterSystemQuery::syncReplica(ASTSystemQuery &)
void InterpreterSystemQuery::syncReplica()
{
getContext()->checkAccess(AccessType::SYSTEM_SYNC_REPLICA, table_id);
StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext());
@ -872,6 +875,23 @@ void InterpreterSystemQuery::syncReplica(ASTSystemQuery &)
throw Exception(ErrorCodes::BAD_ARGUMENTS, table_is_not_replicated.data(), table_id.getNameForLogs());
}
void InterpreterSystemQuery::waitLoadingParts()
{
getContext()->checkAccess(AccessType::SYSTEM_WAIT_LOADING_PARTS, table_id);
StoragePtr table = DatabaseCatalog::instance().getTable(table_id, getContext());
if (auto * merge_tree = dynamic_cast<MergeTreeData *>(table.get()))
{
LOG_TRACE(log, "Waiting for loading of parts of table {}", table_id.getFullTableName());
merge_tree->waitForOutdatedPartsToBeLoaded();
LOG_TRACE(log, "Finished waiting for loading of parts of table {}", table_id.getFullTableName());
}
else
{
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Command WAIT LOADING PARTS is supported only for MergeTree table, but got: {}", table->getName());
}
}
void InterpreterSystemQuery::syncReplicatedDatabase(ASTSystemQuery & query)
{
@ -1071,6 +1091,11 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster()
required_access.emplace_back(AccessType::SYSTEM_RESTART_REPLICA);
break;
}
case Type::WAIT_LOADING_PARTS:
{
required_access.emplace_back(AccessType::SYSTEM_WAIT_LOADING_PARTS, query.getDatabase(), query.getTable());
break;
}
case Type::SYNC_DATABASE_REPLICA:
{
required_access.emplace_back(AccessType::SYSTEM_SYNC_DATABASE_REPLICA, query.getDatabase());

View File

@ -56,7 +56,8 @@ private:
void restartReplica(const StorageID & replica, ContextMutablePtr system_context);
void restartReplicas(ContextMutablePtr system_context);
void syncReplica(ASTSystemQuery & query);
void syncReplica();
void waitLoadingParts();
void syncReplicatedDatabase(ASTSystemQuery & query);

View File

@ -49,7 +49,8 @@ ASTPtr makeSubqueryTemplate()
ASTPtr makeSubqueryQualifiedAsterisk()
{
auto asterisk = std::make_shared<ASTQualifiedAsterisk>();
asterisk->children.emplace_back(std::make_shared<ASTTableIdentifier>("--.s"));
asterisk->qualifier = std::make_shared<ASTIdentifier>("--.s");
asterisk->children.push_back(asterisk->qualifier);
return asterisk;
}
@ -153,24 +154,34 @@ private:
for (auto & table_name : data.tables_order)
data.addTableColumns(table_name, columns);
for (const auto & transformer : asterisk->children)
IASTColumnsTransformer::transform(transformer, columns);
if (asterisk->transformers)
{
for (const auto & transformer : asterisk->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else if (const auto * qualified_asterisk = child->as<ASTQualifiedAsterisk>())
{
has_asterisks = true;
auto & identifier = child->children[0]->as<ASTTableIdentifier &>();
if (!qualified_asterisk->qualifier)
throw Exception("Logical error: qualified asterisk must have a qualifier", ErrorCodes::LOGICAL_ERROR);
auto & identifier = qualified_asterisk->qualifier->as<ASTIdentifier &>();
data.addTableColumns(identifier.name(), columns);
// QualifiedAsterisk's transformers start to appear at child 1
for (const auto * it = qualified_asterisk->children.begin() + 1; it != qualified_asterisk->children.end(); ++it)
if (qualified_asterisk->transformers)
{
if (it->get()->as<ASTColumnsApplyTransformer>() || it->get()->as<ASTColumnsExceptTransformer>() || it->get()->as<ASTColumnsReplaceTransformer>())
IASTColumnsTransformer::transform(*it, columns);
else
throw Exception("Logical error: qualified asterisk must only have children of IASTColumnsTransformer type", ErrorCodes::LOGICAL_ERROR);
for (const auto & transformer : qualified_asterisk->transformers->children)
{
if (transformer->as<ASTColumnsApplyTransformer>() ||
transformer->as<ASTColumnsExceptTransformer>() ||
transformer->as<ASTColumnsReplaceTransformer>())
IASTColumnsTransformer::transform(transformer, columns);
else
throw Exception("Logical error: qualified asterisk must only have children of IASTColumnsTransformer type", ErrorCodes::LOGICAL_ERROR);
}
}
}
else if (const auto * columns_list_matcher = child->as<ASTColumnsListMatcher>())
@ -180,8 +191,11 @@ private:
for (const auto & ident : columns_list_matcher->column_list->children)
columns.emplace_back(ident->clone());
for (const auto & transformer : columns_list_matcher->children)
IASTColumnsTransformer::transform(transformer, columns);
if (columns_list_matcher->transformers)
{
for (const auto & transformer : columns_list_matcher->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else if (const auto * columns_regexp_matcher = child->as<ASTColumnsRegexpMatcher>())
{
@ -193,8 +207,11 @@ private:
columns,
[&](const String & column_name) { return columns_regexp_matcher->isColumnMatching(column_name); });
for (const auto & transformer : columns_regexp_matcher->children)
IASTColumnsTransformer::transform(transformer, columns);
if (columns_regexp_matcher->transformers)
{
for (const auto & transformer : columns_regexp_matcher->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else
data.new_select_expression_list->children.push_back(child);
@ -425,6 +442,7 @@ private:
{
if (data.expression_list->children.empty())
data.expression_list->children.emplace_back(std::make_shared<ASTAsterisk>());
select.setExpression(ASTSelectQuery::Expression::SELECT, std::move(data.expression_list));
}
data.done = true;
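
This visitor change is the consumer side of the AST refactoring that runs through this whole commit: column transformers no longer float in `children` after some fixed index, but live in a dedicated optional `transformers` child. A minimal sketch of the resulting access idiom (simplified, hypothetical node type):

```cpp
// Transformers live in an optional dedicated child list instead of being
// mixed into `children`, so visitors guard on the pointer before iterating.
#include <memory>
#include <vector>

struct SimpleNode
{
    std::shared_ptr<SimpleNode> transformers;          // may be null
    std::vector<std::shared_ptr<SimpleNode>> children; // all children, incl. transformers
};

void applyTransformers(const SimpleNode & node)
{
    if (node.transformers)
    {
        for (const auto & transformer : node.transformers->children)
        {
            // e.g. IASTColumnsTransformer::transform(transformer, columns);
            (void)transformer;
        }
    }
}
```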

View File

@ -154,7 +154,7 @@ private:
static void visit(const ASTQualifiedAsterisk & node, const ASTPtr &, Data & data)
{
auto & identifier = node.children[0]->as<ASTTableIdentifier &>();
auto & identifier = node.qualifier->as<ASTIdentifier &>();
bool rewritten = false;
for (const auto & table : data)
{

View File

@ -303,7 +303,6 @@ bool MergeTreeTransaction::rollback() noexcept
part->version.unlockRemovalTID(tid, TransactionInfoContext{part->storage.getStorageID(), part->name});
}
assert([&]()
{
std::lock_guard lock{mutex};

View File

@ -49,7 +49,9 @@ NamesAndTypesList TextLogElement::getNamesAndTypes()
{"revision", std::make_shared<DataTypeUInt32>()},
{"source_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
{"source_line", std::make_shared<DataTypeUInt64>()}
{"source_line", std::make_shared<DataTypeUInt64>()},
{"message_format_string", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
};
}
@ -74,6 +76,8 @@ void TextLogElement::appendToBlock(MutableColumns & columns) const
columns[i++]->insert(source_file);
columns[i++]->insert(source_line);
columns[i++]->insert(message_format_string);
}
TextLog::TextLog(ContextPtr context_, const String & database_name_,

View File

@ -28,6 +28,8 @@ struct TextLogElement
String source_file;
UInt64 source_line{};
std::string_view message_format_string;
static std::string name() { return "TextLog"; }
static NamesAndTypesList getNamesAndTypes();
static NamesAndAliases getNamesAndAliases() { return {}; }
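
The new `message_format_string` column stores the raw, pre-substitution format string of each log line, so entries can be grouped by message template regardless of the interpolated values (for example, grouping `system.text_log` rows by `message_format_string` to find the noisiest templates). A self-contained sketch of the capture idea, assuming fmt-style logging; illustrative only, since ClickHouse actually obtains the string via the patched `Poco::Message::getFormatString()`, as shown in the OwnSplitChannel change below:

```cpp
#include <string>
#include <string_view>
#include <utility>
#include <fmt/format.h>

struct LogRecord
{
    std::string rendered;           // e.g. "Query 42 took 1.5 sec"
    std::string_view format_string; // e.g. "Query {} took {} sec"
};

template <typename... Args>
LogRecord makeRecord(fmt::format_string<Args...> fmt_str, Args &&... args)
{
    auto view = fmt_str.get(); // the compile-time template itself
    // Keeping a view is safe here: format strings are string literals
    // with static storage duration.
    return {fmt::format(fmt_str, std::forward<Args>(args)...),
            std::string_view(view.data(), view.size())};
}
```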

View File

@ -156,21 +156,19 @@ void TranslateQualifiedNamesMatcher::visit(ASTFunction & node, const ASTPtr &, D
func_arguments->children.clear();
}
void TranslateQualifiedNamesMatcher::visit(const ASTQualifiedAsterisk &, const ASTPtr & ast, Data & data)
void TranslateQualifiedNamesMatcher::visit(const ASTQualifiedAsterisk & node, const ASTPtr &, Data & data)
{
if (ast->children.empty())
throw Exception("Logical error: qualified asterisk must have children", ErrorCodes::LOGICAL_ERROR);
auto & ident = ast->children[0];
if (!node.qualifier)
throw Exception("Logical error: qualified asterisk must have a qualifier", ErrorCodes::LOGICAL_ERROR);
/// @note it could contain a table alias as the table name.
DatabaseAndTableWithAlias db_and_table(ident);
DatabaseAndTableWithAlias db_and_table(node.qualifier);
for (const auto & known_table : data.tables)
if (db_and_table.satisfies(known_table.table, true))
return;
throw Exception("Unknown qualified identifier: " + ident->getAliasOrColumnName(), ErrorCodes::UNKNOWN_IDENTIFIER);
throw Exception("Unknown qualified identifier: " + node.qualifier->getAliasOrColumnName(), ErrorCodes::UNKNOWN_IDENTIFIER);
}
void TranslateQualifiedNamesMatcher::visit(ASTTableJoin & join, const ASTPtr & , Data & data)
@ -266,16 +264,22 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
first_table = false;
}
for (const auto & transformer : asterisk->children)
IASTColumnsTransformer::transform(transformer, columns);
if (asterisk->transformers)
{
for (const auto & transformer : asterisk->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else if (auto * asterisk_column_list = child->as<ASTColumnsListMatcher>())
{
for (const auto & ident : asterisk_column_list->column_list->children)
columns.emplace_back(ident->clone());
for (const auto & transformer : asterisk_column_list->children)
IASTColumnsTransformer::transform(transformer, columns);
if (asterisk_column_list->transformers)
{
for (const auto & transformer : asterisk_column_list->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else if (const auto * asterisk_regexp_pattern = child->as<ASTColumnsRegexpMatcher>())
{
@ -292,12 +296,15 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
first_table = false;
}
for (const auto & transformer : asterisk_regexp_pattern->children)
IASTColumnsTransformer::transform(transformer, columns);
if (asterisk_regexp_pattern->transformers)
{
for (const auto & transformer : asterisk_regexp_pattern->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else if (const auto * qualified_asterisk = child->as<ASTQualifiedAsterisk>())
{
DatabaseAndTableWithAlias ident_db_and_name(qualified_asterisk->children[0]);
DatabaseAndTableWithAlias ident_db_and_name(qualified_asterisk->qualifier);
for (const auto & table : tables_with_columns)
{
@ -309,10 +316,10 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
}
}
// QualifiedAsterisk's transformers start to appear at child 1
for (const auto * it = qualified_asterisk->children.begin() + 1; it != qualified_asterisk->children.end(); ++it)
if (qualified_asterisk->transformers)
{
IASTColumnsTransformer::transform(*it, columns);
for (const auto & transformer : qualified_asterisk->transformers->children)
IASTColumnsTransformer::transform(transformer, columns);
}
}
else

View File

@ -133,6 +133,8 @@ void OwnSplitChannel::logSplit(const Poco::Message & msg)
elem.source_file = msg.getSourceFile();
elem.source_line = msg.getSourceLine();
elem.message_format_string = msg.getFormatString();
std::shared_ptr<TextLog> text_log_locked{};
{
std::lock_guard<std::mutex> lock(text_log_mutex);

View File

@ -8,21 +8,37 @@ namespace DB
ASTPtr ASTAsterisk::clone() const
{
auto clone = std::make_shared<ASTAsterisk>(*this);
clone->cloneChildren();
if (expression) { clone->expression = expression->clone(); clone->children.push_back(clone->expression); }
if (transformers) { clone->transformers = transformers->clone(); clone->children.push_back(clone->transformers); }
return clone;
}
void ASTAsterisk::appendColumnName(WriteBuffer & ostr) const { ostr.write('*'); }
void ASTAsterisk::appendColumnName(WriteBuffer & ostr) const
{
if (expression)
{
expression->appendColumnName(ostr);
writeCString(".", ostr);
}
ostr.write('*');
}
void ASTAsterisk::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
if (expression)
{
expression->formatImpl(settings, state, frame);
settings.ostr << ".";
}
settings.ostr << "*";
/// Format column transformers
for (const auto & child : children)
if (transformers)
{
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
transformers->formatImpl(settings, state, frame);
}
}

View File

@ -16,6 +16,8 @@ public:
ASTPtr clone() const override;
void appendColumnName(WriteBuffer & ostr) const override;
ASTPtr expression;
ASTPtr transformers;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
};
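
Two things are worth noting in this header. First, `expression` is what lets `*` carry a qualifier parsed from the new dot-operator path (see the ExpressionListParsers.cpp changes below). Second, `clone()` must both deep-copy each member pointer and re-append it to `children`, or generic tree walkers would silently skip the copies. A condensed model of that idiom (hypothetical `Node` type):

```cpp
#include <memory>
#include <vector>

struct Node
{
    std::shared_ptr<Node> expression;   // optional qualifier
    std::shared_ptr<Node> transformers; // optional transformer list
    std::vector<std::shared_ptr<Node>> children;

    std::shared_ptr<Node> clone() const
    {
        auto copy = std::make_shared<Node>();
        // Deep-copy each member AND register it in `children`, so that
        // visitors iterating `children` see the cloned subtrees.
        if (expression)
        {
            copy->expression = expression->clone();
            copy->children.push_back(copy->expression);
        }
        if (transformers)
        {
            copy->transformers = transformers->clone();
            copy->children.push_back(copy->transformers);
        }
        return copy;
    }
};
```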

View File

@ -18,12 +18,20 @@ namespace ErrorCodes
ASTPtr ASTColumnsRegexpMatcher::clone() const
{
auto clone = std::make_shared<ASTColumnsRegexpMatcher>(*this);
clone->cloneChildren();
if (expression) { clone->expression = expression->clone(); clone->children.push_back(clone->expression); }
if (transformers) { clone->transformers = transformers->clone(); clone->children.push_back(clone->transformers); }
return clone;
}
void ASTColumnsRegexpMatcher::appendColumnName(WriteBuffer & ostr) const
{
if (expression)
{
expression->appendColumnName(ostr);
writeCString(".", ostr);
}
writeCString("COLUMNS(", ostr);
writeQuotedString(original_pattern, ostr);
writeChar(')', ostr);
@ -38,15 +46,21 @@ void ASTColumnsRegexpMatcher::updateTreeHashImpl(SipHash & hash_state) const
void ASTColumnsRegexpMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << "COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
settings.ostr << (settings.hilite ? hilite_keyword : "");
if (expression)
{
expression->formatImpl(settings, state, frame);
settings.ostr << ".";
}
settings.ostr << "COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
settings.ostr << quoteString(original_pattern);
settings.ostr << ")";
/// Format column transformers
for (const auto & child : children)
if (transformers)
{
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
transformers->formatImpl(settings, state, frame);
}
}
@ -60,6 +74,11 @@ void ASTColumnsRegexpMatcher::setPattern(String pattern)
DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
}
const String & ASTColumnsRegexpMatcher::getPattern() const
{
return original_pattern;
}
const std::shared_ptr<re2::RE2> & ASTColumnsRegexpMatcher::getMatcher() const
{
return column_matcher;
@ -73,19 +92,23 @@ bool ASTColumnsRegexpMatcher::isColumnMatching(const String & column_name) const
ASTPtr ASTColumnsListMatcher::clone() const
{
auto clone = std::make_shared<ASTColumnsListMatcher>(*this);
clone->column_list = column_list->clone();
clone->cloneChildren();
return clone;
}
void ASTColumnsListMatcher::updateTreeHashImpl(SipHash & hash_state) const
{
column_list->updateTreeHash(hash_state);
IAST::updateTreeHashImpl(hash_state);
if (expression) { clone->expression = expression->clone(); clone->children.push_back(clone->expression); }
if (transformers) { clone->transformers = transformers->clone(); clone->children.push_back(clone->transformers); }
clone->column_list = column_list->clone();
clone->children.push_back(clone->column_list);
return clone;
}
void ASTColumnsListMatcher::appendColumnName(WriteBuffer & ostr) const
{
if (expression)
{
expression->appendColumnName(ostr);
writeCString(".", ostr);
}
writeCString("COLUMNS(", ostr);
for (auto * it = column_list->children.begin(); it != column_list->children.end(); ++it)
{
@ -99,7 +122,15 @@ void ASTColumnsListMatcher::appendColumnName(WriteBuffer & ostr) const
void ASTColumnsListMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << "COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
settings.ostr << (settings.hilite ? hilite_keyword : "");
if (expression)
{
expression->formatImpl(settings, state, frame);
settings.ostr << ".";
}
settings.ostr << "COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
for (ASTs::const_iterator it = column_list->children.begin(); it != column_list->children.end(); ++it)
{
@ -111,33 +142,39 @@ void ASTColumnsListMatcher::formatImpl(const FormatSettings & settings, FormatSt
}
settings.ostr << ")";
/// Format column transformers
for (const auto & child : children)
if (transformers)
{
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
transformers->formatImpl(settings, state, frame);
}
}
ASTPtr ASTQualifiedColumnsRegexpMatcher::clone() const
{
auto clone = std::make_shared<ASTQualifiedColumnsRegexpMatcher>(*this);
clone->cloneChildren();
if (transformers) { clone->transformers = transformers->clone(); clone->children.push_back(clone->transformers); }
clone->qualifier = qualifier->clone();
clone->children.push_back(clone->qualifier);
return clone;
}
void ASTQualifiedColumnsRegexpMatcher::appendColumnName(WriteBuffer & ostr) const
{
const auto & qualifier = children.at(0);
qualifier->appendColumnName(ostr);
writeCString(".COLUMNS(", ostr);
writeQuotedString(original_pattern, ostr);
writeChar(')', ostr);
}
void ASTQualifiedColumnsRegexpMatcher::setPattern(String pattern)
void ASTQualifiedColumnsRegexpMatcher::setPattern(String pattern, bool set_matcher)
{
original_pattern = std::move(pattern);
if (!set_matcher)
return;
column_matcher = std::make_shared<RE2>(original_pattern, RE2::Quiet);
if (!column_matcher->ok())
throw DB::Exception(
@ -166,35 +203,35 @@ void ASTQualifiedColumnsRegexpMatcher::formatImpl(const FormatSettings & setting
{
settings.ostr << (settings.hilite ? hilite_keyword : "");
const auto & qualifier = children.at(0);
qualifier->formatImpl(settings, state, frame);
settings.ostr << ".COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
settings.ostr << quoteString(original_pattern);
settings.ostr << ")";
/// Format column transformers
size_t children_size = children.size();
for (size_t i = 1; i < children_size; ++i)
if (transformers)
{
const auto & child = children[i];
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
transformers->formatImpl(settings, state, frame);
}
}
ASTPtr ASTQualifiedColumnsListMatcher::clone() const
{
auto clone = std::make_shared<ASTQualifiedColumnsListMatcher>(*this);
if (transformers) { clone->transformers = transformers->clone(); clone->children.push_back(clone->transformers); }
clone->qualifier = qualifier->clone();
clone->column_list = column_list->clone();
clone->cloneChildren();
clone->children.push_back(clone->qualifier);
clone->children.push_back(clone->column_list);
return clone;
}
void ASTQualifiedColumnsListMatcher::appendColumnName(WriteBuffer & ostr) const
{
const auto & qualifier = children.at(0);
qualifier->appendColumnName(ostr);
writeCString(".COLUMNS(", ostr);
@ -208,19 +245,10 @@ void ASTQualifiedColumnsListMatcher::appendColumnName(WriteBuffer & ostr) const
writeChar(')', ostr);
}
void ASTQualifiedColumnsListMatcher::updateTreeHashImpl(SipHash & hash_state) const
{
column_list->updateTreeHash(hash_state);
IAST::updateTreeHashImpl(hash_state);
}
void ASTQualifiedColumnsListMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "");
const auto & qualifier = children.at(0);
qualifier->formatImpl(settings, state, frame);
settings.ostr << ".COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
for (ASTs::const_iterator it = column_list->children.begin(); it != column_list->children.end(); ++it)
@ -232,14 +260,9 @@ void ASTQualifiedColumnsListMatcher::formatImpl(const FormatSettings & settings,
}
settings.ostr << ")";
/// Format column transformers
size_t children_size = children.size();
for (size_t i = 1; i < children_size; ++i)
if (transformers)
{
const auto & child = children[i];
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
transformers->formatImpl(settings, state, frame);
}
}
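
The two-step `setPattern(pattern, /*set_matcher=*/false)` plus `setMatcher(...)` API exists so that ParserQualifiedColumnsMatcher (below) can promote an unqualified matcher into a qualified one without compiling the regular expression a second time. A minimal sketch of the sharing pattern:

```cpp
// Reuse an already-compiled RE2 via shared_ptr instead of recompiling.
#include <memory>
#include <string>
#include <re2/re2.h>

struct Matcher
{
    std::string pattern;
    std::shared_ptr<re2::RE2> compiled;

    void setPattern(std::string p, bool compile = true)
    {
        pattern = std::move(p);
        if (compile)
            compiled = std::make_shared<re2::RE2>(pattern, re2::RE2::Quiet);
    }

    void setMatcher(std::shared_ptr<re2::RE2> m) { compiled = std::move(m); }
};

// Usage: turn an unqualified matcher into a qualified one cheaply.
//   qualified.setPattern(unqualified.pattern, /*compile=*/false);
//   qualified.setMatcher(unqualified.compiled);
```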

View File

@ -24,10 +24,13 @@ public:
void appendColumnName(WriteBuffer & ostr) const override;
void setPattern(String pattern);
const String & getPattern() const;
const std::shared_ptr<re2::RE2> & getMatcher() const;
bool isColumnMatching(const String & column_name) const;
void updateTreeHashImpl(SipHash & hash_state) const override;
ASTPtr expression;
ASTPtr transformers;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
@ -43,9 +46,10 @@ public:
String getID(char) const override { return "ColumnsListMatcher"; }
ASTPtr clone() const override;
void appendColumnName(WriteBuffer & ostr) const override;
void updateTreeHashImpl(SipHash & hash_state) const override;
ASTPtr expression;
ASTPtr column_list;
ASTPtr transformers;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
};
@ -59,10 +63,12 @@ public:
void appendColumnName(WriteBuffer & ostr) const override;
const std::shared_ptr<re2::RE2> & getMatcher() const;
void setPattern(String pattern);
void setPattern(String pattern, bool set_matcher = true);
void setMatcher(std::shared_ptr<re2::RE2> matcher);
void updateTreeHashImpl(SipHash & hash_state) const override;
ASTPtr qualifier;
ASTPtr transformers;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
@ -78,9 +84,10 @@ public:
String getID(char) const override { return "QualifiedColumnsListMatcher"; }
ASTPtr clone() const override;
void appendColumnName(WriteBuffer & ostr) const override;
void updateTreeHashImpl(SipHash & hash_state) const override;
ASTPtr qualifier;
ASTPtr column_list;
ASTPtr transformers;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
};

View File

@ -19,6 +19,15 @@ namespace ErrorCodes
extern const int CANNOT_COMPILE_REGEXP;
}
void ASTColumnsTransformerList::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
for (const auto & child : children)
{
settings.ostr << ' ';
child->formatImpl(settings, state, frame);
}
}
void IASTColumnsTransformer::transform(const ASTPtr & transformer, ASTs & nodes)
{
if (const auto * apply = transformer->as<ASTColumnsApplyTransformer>())

View File

@ -9,6 +9,23 @@ namespace re2
namespace DB
{
/// A list of column transformers
class ASTColumnsTransformerList : public IAST
{
public:
String getID(char) const override { return "ColumnsTransformerList"; }
ASTPtr clone() const override
{
auto clone = std::make_shared<ASTColumnsTransformerList>(*this);
clone->cloneChildren();
return clone;
}
protected:
void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override;
};
class IASTColumnsTransformer : public IAST
{
public:

View File

@ -6,6 +6,10 @@
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
/// AST, EXPLAIN or other query with meaning of explanation query instead of execution
class ASTExplainQuery : public ASTQueryWithOutput
@ -23,6 +27,45 @@ public:
CurrentTransaction, /// 'EXPLAIN CURRENT TRANSACTION'
};
static String toString(ExplainKind kind)
{
switch (kind)
{
case ParsedAST: return "EXPLAIN AST";
case AnalyzedSyntax: return "EXPLAIN SYNTAX";
case QueryTree: return "EXPLAIN QUERY TREE";
case QueryPlan: return "EXPLAIN";
case QueryPipeline: return "EXPLAIN PIPELINE";
case QueryEstimates: return "EXPLAIN ESTIMATE";
case TableOverride: return "EXPLAIN TABLE OVERRIDE";
case CurrentTransaction: return "EXPLAIN CURRENT TRANSACTION";
}
UNREACHABLE();
}
static ExplainKind fromString(const String & str)
{
if (str == "EXPLAIN AST")
return ParsedAST;
if (str == "EXPLAIN SYNTAX")
return AnalyzedSyntax;
if (str == "EXPLAIN QUERY TREE")
return QueryTree;
if (str == "EXPLAIN" || str == "EXPLAIN PLAN")
return QueryPlan;
if (str == "EXPLAIN PIPELINE")
return QueryPipeline;
if (str == "EXPLAIN ESTIMATE")
return QueryEstimates;
if (str == "EXPLAIN TABLE OVERRIDE")
return TableOverride;
if (str == "EXPLAIN CURRENT TRANSACTION")
return CurrentTransaction;
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown explain kind '{}'", str);
}
explicit ASTExplainQuery(ExplainKind kind_) : kind(kind_) {}
String getID(char delim) const override { return "Explain" + (delim + toString(kind)); }
@ -103,23 +146,6 @@ private:
/// Used by EXPLAIN TABLE OVERRIDE
ASTPtr table_function;
ASTPtr table_override;
static String toString(ExplainKind kind)
{
switch (kind)
{
case ParsedAST: return "EXPLAIN AST";
case AnalyzedSyntax: return "EXPLAIN SYNTAX";
case QueryTree: return "EXPLAIN QUERY TREE";
case QueryPlan: return "EXPLAIN";
case QueryPipeline: return "EXPLAIN PIPELINE";
case QueryEstimates: return "EXPLAIN ESTIMATE";
case TableOverride: return "EXPLAIN TABLE OVERRIDE";
case CurrentTransaction: return "EXPLAIN CURRENT TRANSACTION";
}
UNREACHABLE();
}
};
}
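
Moving `toString()` into the public section and adding its inverse `fromString()` turns the pair into a small serialization bridge: ParserSubquery (changed below) stringifies the kind into the first literal argument of `viewExplain(...)`, e.g. `'EXPLAIN PIPELINE'`, and the table function later recovers the kind with `fromString()`. The round trip `fromString(toString(kind)) == kind` therefore has to hold for every `ExplainKind`, which is why both functions enumerate exactly the same strings.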

View File

@ -7,22 +7,18 @@ namespace DB
void ASTQualifiedAsterisk::appendColumnName(WriteBuffer & ostr) const
{
const auto & qualifier = children.at(0);
qualifier->appendColumnName(ostr);
writeCString(".*", ostr);
}
void ASTQualifiedAsterisk::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
const auto & qualifier = children.at(0);
qualifier->formatImpl(settings, state, frame);
settings.ostr << ".*";
/// Format column transformers
for (ASTs::const_iterator it = children.begin() + 1; it != children.end(); ++it)
if (transformers)
{
settings.ostr << ' ';
(*it)->formatImpl(settings, state, frame);
transformers->formatImpl(settings, state, frame);
}
}

View File

@ -17,11 +17,18 @@ public:
ASTPtr clone() const override
{
auto clone = std::make_shared<ASTQualifiedAsterisk>(*this);
clone->cloneChildren();
if (transformers) { clone->transformers = transformers->clone(); clone->children.push_back(clone->transformers); }
clone->qualifier = qualifier->clone();
clone->children.push_back(clone->qualifier);
return clone;
}
void appendColumnName(WriteBuffer & ostr) const override;
ASTPtr qualifier;
ASTPtr transformers;
protected:
void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override;
};

View File

@ -166,6 +166,7 @@ void ASTSystemQuery::formatImpl(const FormatSettings & settings, FormatState &,
else if ( type == Type::RESTART_REPLICA
|| type == Type::RESTORE_REPLICA
|| type == Type::SYNC_REPLICA
|| type == Type::WAIT_LOADING_PARTS
|| type == Type::FLUSH_DISTRIBUTED
|| type == Type::RELOAD_DICTIONARY
|| type == Type::RELOAD_MODEL

View File

@ -35,6 +35,7 @@ public:
RESTART_REPLICAS,
RESTART_REPLICA,
RESTORE_REPLICA,
WAIT_LOADING_PARTS,
DROP_REPLICA,
DROP_DATABASE_REPLICA,
SYNC_REPLICA,

View File

@ -28,6 +28,8 @@
#include <Parsers/ASTWindowDefinition.h>
#include <Parsers/ASTAssignment.h>
#include <Parsers/ASTColumnsMatcher.h>
#include <Parsers/ASTExplainQuery.h>
#include <Parsers/ASTSetQuery.h>
#include <Parsers/ASTSelectQuery.h>
#include <Parsers/ASTTablesInSelectQuery.h>
#include <Parsers/ExpressionListParsers.h>
@ -116,8 +118,40 @@ bool ParserSubquery::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
}
else if (ASTPtr explain_node; explain.parse(pos, explain_node, expected))
{
/// Replace SELECT * FROM (EXPLAIN SELECT ...) with SELECT * FROM viewExplain(EXPLAIN SELECT ...)
result_node = buildSelectFromTableFunction(makeASTFunction("viewExplain", explain_node));
const auto & explain_query = explain_node->as<const ASTExplainQuery &>();
if (explain_query.getTableFunction() || explain_query.getTableOverride())
throw Exception("EXPLAIN in a subquery cannot have a table function or table override", ErrorCodes::BAD_ARGUMENTS);
/// Replace subquery `(EXPLAIN <kind> <explain_settings> SELECT ...)`
/// with `(SELECT * FROM viewExplain("<kind>", "<explain_settings>", SELECT ...))`
String kind_str = ASTExplainQuery::toString(explain_query.getKind());
String settings_str;
if (ASTPtr settings_ast = explain_query.getSettings())
{
if (!settings_ast->as<ASTSetQuery>())
throw Exception("EXPLAIN settings must be a SET query", ErrorCodes::BAD_ARGUMENTS);
settings_str = queryToString(settings_ast);
}
const ASTPtr & explained_ast = explain_query.getExplainedQuery();
if (explained_ast)
{
auto view_explain = makeASTFunction("viewExplain",
std::make_shared<ASTLiteral>(kind_str),
std::make_shared<ASTLiteral>(settings_str),
explained_ast);
result_node = buildSelectFromTableFunction(view_explain);
}
else
{
auto view_explain = makeASTFunction("viewExplain",
std::make_shared<ASTLiteral>(kind_str),
std::make_shared<ASTLiteral>(settings_str));
result_node = buildSelectFromTableFunction(view_explain);
}
}
else
{
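
Stated on a query, the net effect of this parser change is: a subquery such as `(EXPLAIN PIPELINE SELECT 1)` is rewritten during parsing into `(SELECT * FROM viewExplain('EXPLAIN PIPELINE', '', SELECT 1))`, so EXPLAIN output becomes an ordinary table source that can be filtered, joined, or inserted elsewhere. The second literal carries the serialized SETTINGS clause when one is present, and an empty string otherwise.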
@ -1623,13 +1657,21 @@ bool ParserAsterisk::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
{
++pos;
auto asterisk = std::make_shared<ASTAsterisk>();
auto transformers = std::make_shared<ASTColumnsTransformerList>();
ParserColumnsTransformers transformers_p(allowed_transformers);
ASTPtr transformer;
while (transformers_p.parse(pos, transformer, expected))
{
asterisk->children.push_back(transformer);
transformers->children.push_back(transformer);
}
node = asterisk;
if (!transformers->children.empty())
{
asterisk->transformers = std::move(transformers);
asterisk->children.push_back(asterisk->transformers);
}
node = std::move(asterisk);
return true;
}
return false;
@ -1638,7 +1680,7 @@ bool ParserAsterisk::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
bool ParserQualifiedAsterisk::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
{
if (!ParserCompoundIdentifier(true, true).parse(pos, node, expected))
if (!ParserCompoundIdentifier(false, true).parse(pos, node, expected))
return false;
if (pos->type != TokenType::Dot)
@ -1650,13 +1692,23 @@ bool ParserQualifiedAsterisk::parseImpl(Pos & pos, ASTPtr & node, Expected & exp
++pos;
auto res = std::make_shared<ASTQualifiedAsterisk>();
res->children.push_back(node);
auto transformers = std::make_shared<ASTColumnsTransformerList>();
ParserColumnsTransformers transformers_p;
ASTPtr transformer;
while (transformers_p.parse(pos, transformer, expected))
{
res->children.push_back(transformer);
transformers->children.push_back(transformer);
}
res->qualifier = std::move(node);
res->children.push_back(res->qualifier);
if (!transformers->children.empty())
{
res->transformers = std::move(transformers);
res->children.push_back(res->transformers);
}
node = std::move(res);
return true;
}
@ -1680,28 +1732,44 @@ static bool parseColumnsMatcherBody(IParser::Pos & pos, ASTPtr & node, Expected
return false;
++pos;
auto transformers = std::make_shared<ASTColumnsTransformerList>();
ParserColumnsTransformers transformers_p(allowed_transformers);
ASTPtr transformer;
while (transformers_p.parse(pos, transformer, expected))
{
transformers->children.push_back(transformer);
}
ASTPtr res;
if (column_list)
{
auto list_matcher = std::make_shared<ASTColumnsListMatcher>();
list_matcher->column_list = column_list;
res = list_matcher;
list_matcher->column_list = std::move(column_list);
list_matcher->children.push_back(list_matcher->column_list);
if (!transformers->children.empty())
{
list_matcher->transformers = std::move(transformers);
list_matcher->children.push_back(list_matcher->transformers);
}
node = std::move(list_matcher);
}
else
{
auto regexp_matcher = std::make_shared<ASTColumnsRegexpMatcher>();
regexp_matcher->setPattern(regex_node->as<ASTLiteral &>().value.get<String>());
res = regexp_matcher;
if (!transformers->children.empty())
{
regexp_matcher->transformers = std::move(transformers);
regexp_matcher->children.push_back(regexp_matcher->transformers);
}
node = std::move(regexp_matcher);
}
ParserColumnsTransformers transformers_p(allowed_transformers);
ASTPtr transformer;
while (transformers_p.parse(pos, transformer, expected))
{
res->children.push_back(transformer);
}
node = std::move(res);
return true;
}
@ -1717,29 +1785,19 @@ bool ParserColumnsMatcher::parseImpl(Pos & pos, ASTPtr & node, Expected & expect
bool ParserQualifiedColumnsMatcher::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
{
if (!ParserCompoundIdentifier(true, true).parse(pos, node, expected))
if (!ParserCompoundIdentifier(false, true).parse(pos, node, expected))
return false;
auto identifier_node = node;
const auto & identifier_node_typed = identifier_node->as<ASTTableIdentifier &>();
auto & identifier_node_typed = identifier_node->as<ASTIdentifier &>();
auto & name_parts = identifier_node_typed.name_parts;
/// ParserCompoundIdentifier parses identifier.COLUMNS
if (identifier_node_typed.name_parts.size() == 1 || identifier_node_typed.name_parts.back() != "COLUMNS")
if (name_parts.size() == 1 || name_parts.back() != "COLUMNS")
return false;
/// TODO: ASTTableIdentifier can contain only 2 parts
if (identifier_node_typed.name_parts.size() == 2)
{
auto table_name = identifier_node_typed.name_parts[0];
identifier_node = std::make_shared<ASTTableIdentifier>(table_name);
}
else
{
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Expected identifier to contain no more than 2 parts. Actual {}",
identifier_node_typed.full_name);
}
name_parts.pop_back();
identifier_node = std::make_shared<ASTIdentifier>(std::move(name_parts), false, std::move(node->children));
if (!parseColumnsMatcherBody(pos, node, expected, allowed_transformers))
return false;
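
This rework also lifts the old two-part restriction: instead of special-casing ASTTableIdentifier (which can hold at most `database.table`), the parser now simply drops the trailing `COLUMNS` part from an ordinary compound identifier. A standalone sketch of that split:

```cpp
// `db.tbl.COLUMNS` arrives as name parts {"db", "tbl", "COLUMNS"};
// dropping the trailing "COLUMNS" leaves the real qualifier.
#include <string>
#include <vector>

std::vector<std::string> splitQualifier(std::vector<std::string> name_parts)
{
    if (name_parts.size() <= 1 || name_parts.back() != "COLUMNS")
        return {}; // not a qualified COLUMNS matcher
    name_parts.pop_back();
    return name_parts; // e.g. {"db", "tbl"} — any depth is now allowed
}
```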
@ -1747,28 +1805,36 @@ bool ParserQualifiedColumnsMatcher::parseImpl(Pos & pos, ASTPtr & node, Expected
if (auto * columns_list_matcher = node->as<ASTColumnsListMatcher>())
{
auto result = std::make_shared<ASTQualifiedColumnsListMatcher>();
result->qualifier = std::move(identifier_node);
result->column_list = std::move(columns_list_matcher->column_list);
result->children.reserve(columns_list_matcher->children.size() + 1);
result->children.push_back(std::move(identifier_node));
result->children.push_back(result->qualifier);
result->children.push_back(result->column_list);
for (auto && child : columns_list_matcher->children)
result->children.push_back(std::move(child));
if (columns_list_matcher->transformers)
{
result->transformers = std::move(columns_list_matcher->transformers);
result->children.push_back(result->transformers);
}
node = result;
node = std::move(result);
}
else if (auto * column_regexp_matcher = node->as<ASTColumnsRegexpMatcher>())
{
auto result = std::make_shared<ASTQualifiedColumnsRegexpMatcher>();
result->setPattern(column_regexp_matcher->getPattern(), false);
result->setMatcher(column_regexp_matcher->getMatcher());
result->children.reserve(column_regexp_matcher->children.size() + 1);
result->children.push_back(std::move(identifier_node));
result->qualifier = std::move(identifier_node);
result->children.push_back(result->qualifier);
for (auto && child : column_regexp_matcher->children)
result->children.push_back(std::move(child));
if (column_regexp_matcher->transformers)
{
result->transformers = std::move(column_regexp_matcher->transformers);
result->children.push_back(result->transformers);
}
node = result;
node = std::move(result);
}
else
{

View File

@ -4,6 +4,7 @@
#include <Parsers/ParserSetQuery.h>
#include <Parsers/ASTAsterisk.h>
#include <Parsers/ASTColumnsMatcher.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTFunction.h>
#include <Parsers/ASTFunctionWithKeyValueArguments.h>
@ -2194,7 +2195,7 @@ struct ParserExpressionImpl
using Layers = std::vector<std::unique_ptr<Layer>>;
Action tryParseOperand(Layers & layers, IParser::Pos & pos, Expected & expected);
static Action tryParseOperator(Layers & layers, IParser::Pos & pos, Expected & expected);
Action tryParseOperator(Layers & layers, IParser::Pos & pos, Expected & expected);
};
@ -2523,8 +2524,6 @@ Action ParserExpressionImpl::tryParseOperand(Layers & layers, IParser::Pos & pos
Action ParserExpressionImpl::tryParseOperator(Layers & layers, IParser::Pos & pos, Expected & expected)
{
ASTPtr tmp;
/// ParserExpression can be called in this part of the query:
/// ALTER TABLE partition_all2 CLEAR INDEX [ p ] IN PARTITION ALL
///
@ -2544,17 +2543,17 @@ Action ParserExpressionImpl::tryParseOperator(Layers & layers, IParser::Pos & po
if (cur_op == operators_table.end())
{
ASTPtr alias;
ParserAlias alias_parser(layers.back()->allow_alias_without_as_keyword);
auto old_pos = pos;
if (layers.back()->allow_alias &&
!layers.back()->parsed_alias &&
alias_parser.parse(pos, tmp, expected) &&
layers.back()->insertAlias(tmp))
alias_parser.parse(pos, alias, expected) &&
layers.back()->insertAlias(alias))
{
layers.back()->parsed_alias = true;
return Action::OPERATOR;
}
pos = old_pos;
return Action::NONE;
}
@ -2618,33 +2617,57 @@ Action ParserExpressionImpl::tryParseOperator(Layers & layers, IParser::Pos & po
layers.back()->pushOperand(function);
}
/// Dot (the TupleElement operator) can be the beginning of a .* or .COLUMNS expression
if (op.type == OperatorType::TupleElement)
{
ASTPtr tmp;
if (asterisk_parser.parse(pos, tmp, expected) ||
columns_matcher_parser.parse(pos, tmp, expected))
{
if (auto * asterisk = tmp->as<ASTAsterisk>())
{
if (!layers.back()->popOperand(asterisk->expression))
return Action::NONE;
}
else if (auto * columns_list_matcher = tmp->as<ASTColumnsListMatcher>())
{
if (!layers.back()->popOperand(columns_list_matcher->expression))
return Action::NONE;
}
else if (auto * columns_regexp_matcher = tmp->as<ASTColumnsRegexpMatcher>())
{
if (!layers.back()->popOperand(columns_regexp_matcher->expression))
return Action::NONE;
}
layers.back()->pushOperand(std::move(tmp));
return Action::OPERATOR;
}
}
layers.back()->pushOperator(op);
if (op.type == OperatorType::ArrayElement)
layers.push_back(std::make_unique<ArrayElementLayer>());
Action next = Action::OPERAND;
/// isNull & isNotNull are postfix unary operators
if (op.type == OperatorType::IsNull)
next = Action::OPERATOR;
if (op.type == OperatorType::StartBetween || op.type == OperatorType::StartNotBetween)
layers.back()->between_counter++;
return Action::OPERATOR;
if (op.type == OperatorType::Cast)
{
next = Action::OPERATOR;
ASTPtr type_ast;
if (!ParserDataType().parse(pos, type_ast, expected))
return Action::NONE;
layers.back()->pushOperand(std::make_shared<ASTLiteral>(queryToString(type_ast)));
return Action::OPERATOR;
}
return next;
if (op.type == OperatorType::ArrayElement)
layers.push_back(std::make_unique<ArrayElementLayer>());
if (op.type == OperatorType::StartBetween || op.type == OperatorType::StartNotBetween)
layers.back()->between_counter++;
return Action::OPERAND;
}
}
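
The interesting part of this rewrite is the new TupleElement branch: when the parser sees `expr .` followed by `*` or `COLUMNS(...)`, it no longer builds a `tupleElement()` call; it pops the already-parsed left operand off the current layer's operand stack and stores it as the matcher's `expression`. A minimal model of that stack manipulation (simplified, hypothetical types):

```cpp
// On seeing `expr . *`, pop `expr` from the operand stack and attach it
// to the asterisk node instead of building a tupleElement call.
#include <memory>
#include <vector>

struct Ast { std::shared_ptr<Ast> expression; };

struct Layer
{
    std::vector<std::shared_ptr<Ast>> operands;

    bool popOperand(std::shared_ptr<Ast> & out)
    {
        if (operands.empty())
            return false;
        out = std::move(operands.back());
        operands.pop_back();
        return true;
    }
};

bool attachQualifier(Layer & layer, Ast & asterisk)
{
    // Mirrors `layers.back()->popOperand(asterisk->expression)` in the diff.
    return layer.popOperand(asterisk.expression);
}
```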

View File

@ -253,6 +253,7 @@ bool ParserSystemQuery::parseImpl(IParser::Pos & pos, ASTPtr & node, Expected &
case Type::RESTART_REPLICA:
case Type::SYNC_REPLICA:
case Type::WAIT_LOADING_PARTS:
{
if (!parseQueryWithOnCluster(res, pos, expected))
return false;

View File

@ -196,7 +196,7 @@ void PipelineExecutor::executeSingleThread(size_t thread_num)
#ifndef NDEBUG
auto & context = tasks.getThreadContext(thread_num);
LOG_TRACE(log,
LOG_TEST(log,
"Thread finished. Total time: {} sec. Execution time: {} sec. Processing time: {} sec. Wait time: {} sec.",
context.total_time_ns / 1e9,
context.execution_time_ns / 1e9,

View File

@ -304,7 +304,9 @@ AvroDeserializer::DeserializeFn AvroDeserializer::createDeserializeFn(avro::Node
};
}
if (null_as_default)
/// If the Union is ['Null', Nested-Type] and the Nested-Type cannot be placed inside
/// Nullable, we will get the Nested-Type itself instead of a Nullable type.
if (null_as_default || !target.isNullable())
{
auto nested_deserialize = this->createDeserializeFn(root_node->leafAt(non_null_union_index), target_type);
return [non_null_union_index, nested_deserialize](IColumn & column, avro::Decoder & decoder)
@ -1001,7 +1003,7 @@ DataTypePtr AvroSchemaReader::avroNodeToDataType(avro::NodePtr node)
case avro::Type::AVRO_STRING:
return std::make_shared<DataTypeString>();
case avro::Type::AVRO_BYTES:
return std::make_shared<DataTypeFloat32>();
return std::make_shared<DataTypeString>();
case avro::Type::AVRO_ENUM:
{
if (node->names() < 128)

View File

@ -402,14 +402,10 @@ void TCPHandler::runImpl()
{
auto callback = [this]()
{
{
std::lock_guard task_callback_lock(task_callback_mutex);
std::scoped_lock lock(task_callback_mutex, fatal_error_mutex);
if (isQueryCancelled())
return true;
}
std::lock_guard lock(fatal_error_mutex);
if (isQueryCancelled())
return true;
sendProgress();
sendSelectProfileEvents();
@ -424,6 +420,9 @@ void TCPHandler::runImpl()
}
state.io.onFinish();
std::lock_guard lock(task_callback_mutex);
/// Send final progress after calling onFinish(), since it will update the progress.
///
/// NOTE: we cannot send Progress for regular INSERT (with VALUES)
@ -446,8 +445,11 @@ void TCPHandler::runImpl()
if (state.is_connection_closed)
break;
sendLogs();
sendEndOfStream();
{
std::lock_guard lock(task_callback_mutex);
sendLogs();
sendEndOfStream();
}
/// QueryState should be cleared before QueryScope, since otherwise
/// the MemoryTracker will be wrong for possible deallocations.
@ -760,6 +762,9 @@ void TCPHandler::processOrdinaryQueryWithProcessors()
}
}
/// Defer locking to cover a part of the scope below and everything after it
std::unique_lock progress_lock(task_callback_mutex, std::defer_lock);
{
PullingAsyncPipelineExecutor executor(pipeline);
CurrentMetrics::Increment query_thread_metric_increment{CurrentMetrics::QueryThread};
@ -796,6 +801,11 @@ void TCPHandler::processOrdinaryQueryWithProcessors()
}
}
/// This lock wasn't acquired before; we call .lock() here so that everything
/// below this line is covered, together with the sendProgress() calls
/// outside the scope above
progress_lock.lock();
/** If data has run out, we will send the profiling data and total values to
* the last zero block to be able to use
* this information in the suffix output of stream.
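
Two standard-library locking idioms carry this fix and are worth seeing in isolation: `std::scoped_lock` acquires several mutexes atomically with deadlock-avoiding ordering (replacing the previous back-to-back lock_guards), and a `std::unique_lock` constructed with `std::defer_lock` starts unlocked, so the critical section can begin mid-scope via an explicit `.lock()` yet still end with the scope. A self-contained sketch:

```cpp
#include <mutex>

std::mutex task_callback_mutex;
std::mutex fatal_error_mutex;

void lockingIdioms()
{
    {
        // scoped_lock takes both mutexes atomically, in a deadlock-free
        // order, instead of two sequential lock_guards.
        std::scoped_lock lock(task_callback_mutex, fatal_error_mutex);
    }

    // defer_lock constructs the lock unlocked; .lock() is called later so
    // the critical section can begin mid-scope yet still end with the scope.
    std::unique_lock deferred(task_callback_mutex, std::defer_lock);
    deferred.lock();
} // `deferred` unlocks here
```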

View File

@ -60,6 +60,7 @@ namespace ErrorCodes
extern const int TOO_MANY_PARTITIONS;
extern const int DISTRIBUTED_TOO_MANY_PENDING_BYTES;
extern const int ARGUMENT_OUT_OF_BOUND;
extern const int LOGICAL_ERROR;
}
@ -365,18 +366,22 @@ StorageDistributedDirectoryMonitor::StorageDistributedDirectoryMonitor(
const std::string & relative_path_,
ConnectionPoolPtr pool_,
ActionBlocker & monitor_blocker_,
BackgroundSchedulePool & bg_pool)
BackgroundSchedulePool & bg_pool,
bool initialize_from_disk)
: storage(storage_)
, pool(std::move(pool_))
, disk(disk_)
, relative_path(relative_path_)
, path(fs::path(disk->getPath()) / relative_path / "")
, broken_relative_path(fs::path(relative_path) / "broken")
, broken_path(fs::path(path) / "broken" / "")
, should_batch_inserts(storage.getDistributedSettingsRef().monitor_batch_inserts)
, split_batch_on_failure(storage.getDistributedSettingsRef().monitor_split_batch_on_failure)
, dir_fsync(storage.getDistributedSettingsRef().fsync_directories)
, min_batched_block_size_rows(storage.getContext()->getSettingsRef().min_insert_block_size_rows)
, min_batched_block_size_bytes(storage.getContext()->getSettingsRef().min_insert_block_size_bytes)
, current_batch_file_path(path + "current_batch.txt")
, pending_files(std::numeric_limits<size_t>::max())
, default_sleep_time(storage.getDistributedSettingsRef().monitor_sleep_time_ms.totalMilliseconds())
, sleep_time(default_sleep_time)
, max_sleep_time(storage.getDistributedSettingsRef().monitor_max_sleep_time_ms.totalMilliseconds())
@ -385,6 +390,11 @@ StorageDistributedDirectoryMonitor::StorageDistributedDirectoryMonitor(
, metric_pending_files(CurrentMetrics::DistributedFilesToInsert, 0)
, metric_broken_files(CurrentMetrics::BrokenDistributedFilesToInsert, 0)
{
fs::create_directory(broken_path);
if (initialize_from_disk)
initializeFilesFromDisk();
task_handle = bg_pool.createTask(getLoggerName() + "/Bg", [this]{ run(); });
task_handle->activateAndSchedule();
}
@ -392,35 +402,29 @@ StorageDistributedDirectoryMonitor::StorageDistributedDirectoryMonitor(
StorageDistributedDirectoryMonitor::~StorageDistributedDirectoryMonitor()
{
if (!quit)
if (!pending_files.isFinished())
{
quit = true;
pending_files.clearAndFinish();
task_handle->deactivate();
}
}
void StorageDistributedDirectoryMonitor::flushAllData()
{
if (quit)
if (pending_files.isFinished())
return;
std::lock_guard lock{mutex};
const auto & files = getFiles();
if (!files.empty())
{
processFiles(files);
/// Update counters.
getFiles();
}
if (!hasPendingFiles())
return;
processFiles();
}
void StorageDistributedDirectoryMonitor::shutdownAndDropAllData()
{
if (!quit)
if (!pending_files.isFinished())
{
quit = true;
pending_files.clearAndFinish();
task_handle->deactivate();
}
@ -434,19 +438,21 @@ void StorageDistributedDirectoryMonitor::run()
std::lock_guard lock{mutex};
bool do_sleep = false;
while (!quit)
while (!pending_files.isFinished())
{
do_sleep = true;
const auto & files = getFiles();
if (files.empty())
if (!hasPendingFiles())
break;
if (!monitor_blocker.isCancelled())
{
try
{
do_sleep = !processFiles(files);
processFiles();
/// No errors while processing existing files.
/// Let's see maybe there are more files to process.
do_sleep = false;
std::lock_guard status_lock(status_mutex);
status.last_exception = std::exception_ptr{};
@ -470,9 +476,7 @@ void StorageDistributedDirectoryMonitor::run()
}
}
else
{
LOG_DEBUG(log, "Skipping send data over distributed table.");
}
const auto now = std::chrono::system_clock::now();
if (now - last_decrease_time > decrease_error_count_period)
@ -487,10 +491,7 @@ void StorageDistributedDirectoryMonitor::run()
break;
}
/// Update counters.
getFiles();
if (!quit && do_sleep)
if (!pending_files.isFinished() && do_sleep)
task_handle->scheduleAfter(sleep_time.count());
}
@ -568,41 +569,83 @@ ConnectionPoolPtr StorageDistributedDirectoryMonitor::createPool(const std::stri
settings.distributed_replica_error_cap);
}
std::map<UInt64, std::string> StorageDistributedDirectoryMonitor::getFiles()
bool StorageDistributedDirectoryMonitor::hasPendingFiles() const
{
std::map<UInt64, std::string> files;
return fs::exists(current_batch_file_path) || !current_batch_file.empty() || !pending_files.empty();
}
void StorageDistributedDirectoryMonitor::initializeFilesFromDisk()
{
/// NOTE: This method does not require holding status_mutex; hence, no TSA
/// annotations in the header file.
fs::directory_iterator end;
for (fs::directory_iterator it{path}; it != end; ++it)
/// Initialize pending files
{
const auto & file_path_str = it->path();
if (!it->is_directory() && startsWith(fs::path(file_path_str).extension(), ".bin"))
size_t bytes_count = 0;
for (fs::directory_iterator it{path}; it != end; ++it)
{
files[parse<UInt64>(fs::path(file_path_str).stem())] = file_path_str;
const auto & file_path = it->path();
const auto & base_name = file_path.stem().string();
if (!it->is_directory() && startsWith(fs::path(file_path).extension(), ".bin") && parse<UInt64>(base_name))
{
const std::string & file_path_str = file_path.string();
if (!pending_files.push(file_path_str))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot add pending file");
bytes_count += fs::file_size(file_path);
}
else if (base_name != "tmp" && base_name != "broken")
{
/// It is OK to log current_batch.txt here too (useful for debugging).
LOG_WARNING(log, "Unexpected file {} in {}", file_path.string(), path);
}
}
LOG_TRACE(log, "Files set to {}", pending_files.size());
LOG_TRACE(log, "Bytes set to {}", bytes_count);
metric_pending_files.changeTo(pending_files.size());
status.files_count = pending_files.size();
status.bytes_count = bytes_count;
}
return files;
/// Initialize broken files
{
size_t broken_bytes_count = 0;
size_t broken_files = 0;
for (fs::directory_iterator it{broken_path}; it != end; ++it)
{
const auto & file_path = it->path();
if (!it->is_directory() && startsWith(fs::path(file_path).extension(), ".bin") && parse<UInt64>(file_path.stem()))
{
broken_bytes_count += fs::file_size(file_path);
++broken_files;
}
else
LOG_WARNING(log, "Unexpected file {} in {}", file_path.string(), broken_path);
}
LOG_TRACE(log, "Broken files set to {}", broken_files);
LOG_TRACE(log, "Broken bytes set to {}", broken_bytes_count);
metric_broken_files.changeTo(broken_files);
status.broken_files_count = broken_files;
status.broken_bytes_count = broken_bytes_count;
}
}
bool StorageDistributedDirectoryMonitor::processFiles(const std::map<UInt64, std::string> & files)
void StorageDistributedDirectoryMonitor::processFiles()
{
if (should_batch_inserts)
{
processFilesWithBatching(files);
}
processFilesWithBatching();
else
{
for (const auto & file : files)
{
if (quit)
return true;
/// Process unprocessed file.
if (!current_batch_file.empty())
processFile(current_batch_file);
processFile(file.second);
}
while (pending_files.tryPop(current_batch_file))
processFile(current_batch_file);
}
return true;
}
void StorageDistributedDirectoryMonitor::processFile(const std::string & file_path)
@ -649,7 +692,11 @@ void StorageDistributedDirectoryMonitor::processFile(const std::string & file_pa
thread_trace_context->root_span.addAttribute(std::current_exception());
e.addMessage(fmt::format("While sending {}", file_path));
maybeMarkAsBroken(file_path, e);
if (isFileBrokenErrorCode(e.code(), e.isRemoteException()))
{
markAsBroken(file_path);
current_batch_file.clear();
}
throw;
}
catch (...)
@ -662,6 +709,7 @@ void StorageDistributedDirectoryMonitor::processFile(const std::string & file_pa
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path);
markAsSend(file_path);
current_batch_file.clear();
LOG_TRACE(log, "Finished processing `{}` (took {} ms)", file_path, watch.elapsedMilliseconds());
}
@ -701,23 +749,19 @@ struct StorageDistributedDirectoryMonitor::BatchHeader
struct StorageDistributedDirectoryMonitor::Batch
{
std::vector<UInt64> file_indices;
size_t total_rows = 0;
size_t total_bytes = 0;
bool recovered = false;
StorageDistributedDirectoryMonitor & parent;
const std::map<UInt64, String> & file_index_to_path;
std::vector<std::string> files;
bool split_batch_on_failure = true;
bool fsync = false;
bool dir_fsync = false;
Batch(
StorageDistributedDirectoryMonitor & parent_,
const std::map<UInt64, String> & file_index_to_path_)
explicit Batch(StorageDistributedDirectoryMonitor & parent_)
: parent(parent_)
, file_index_to_path(file_index_to_path_)
, split_batch_on_failure(parent.split_batch_on_failure)
, fsync(parent.storage.getDistributedSettingsRef().fsync_after_insert)
, dir_fsync(parent.dir_fsync)
@ -732,7 +776,7 @@ struct StorageDistributedDirectoryMonitor::Batch
void send()
{
if (file_indices.empty())
if (files.empty())
return;
CurrentMetrics::Increment metric_increment{CurrentMetrics::DistributedSend};
@ -775,7 +819,7 @@ struct StorageDistributedDirectoryMonitor::Batch
}
catch (const Exception & e)
{
if (split_batch_on_failure && file_indices.size() > 1 && isSplittableErrorCode(e.code(), e.isRemoteException()))
if (split_batch_on_failure && files.size() > 1 && isSplittableErrorCode(e.code(), e.isRemoteException()))
{
tryLogCurrentException(parent.log, "Trying to split batch due to");
sendSeparateFiles();
@ -795,44 +839,28 @@ struct StorageDistributedDirectoryMonitor::Batch
}
else
{
std::vector<std::string> files;
for (const auto && file_info : file_index_to_path | boost::adaptors::indexed())
{
if (file_info.index() > 8)
{
files.push_back("...");
break;
}
files.push_back(file_info.value().second);
}
e.addMessage(fmt::format("While sending batch, nums: {}, files: {}", file_index_to_path.size(), fmt::join(files, "\n")));
e.addMessage(fmt::format("While sending a batch of {} files, files: {}", files.size(), fmt::join(files, "\n")));
throw;
}
}
if (!batch_broken)
{
LOG_TRACE(parent.log, "Sent a batch of {} files (took {} ms).", file_indices.size(), watch.elapsedMilliseconds());
LOG_TRACE(parent.log, "Sent a batch of {} files (took {} ms).", files.size(), watch.elapsedMilliseconds());
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, parent.disk, parent.relative_path);
for (UInt64 file_index : file_indices)
parent.markAsSend(file_index_to_path.at(file_index));
for (const auto & file : files)
parent.markAsSend(file);
}
else if (!batch_marked_as_broken)
{
LOG_ERROR(parent.log, "Marking a batch of {} files as broken.", file_indices.size());
LOG_ERROR(parent.log, "Marking a batch of {} files as broken, files: {}", files.size(), fmt::join(files, "\n"));
for (UInt64 file_idx : file_indices)
{
auto file_path = file_index_to_path.find(file_idx);
if (file_path != file_index_to_path.end())
parent.markAsBroken(file_path->second);
}
for (const auto & file : files)
parent.markAsBroken(file);
}
file_indices.clear();
files.clear();
total_rows = 0;
total_bytes = 0;
recovered = false;
@ -842,8 +870,11 @@ struct StorageDistributedDirectoryMonitor::Batch
void writeText(WriteBuffer & out)
{
for (UInt64 file_idx : file_indices)
out << file_idx << '\n';
for (const auto & file : files)
{
UInt64 file_index = parse<UInt64>(fs::path(file).stem());
out << file_index << '\n';
}
}
void readText(ReadBuffer & in)
@ -852,8 +883,9 @@ struct StorageDistributedDirectoryMonitor::Batch
{
UInt64 idx;
in >> idx >> "\n";
file_indices.push_back(idx);
files.push_back(fmt::format("{}/{}.bin", parent.path, idx));
}
recovered = true;
}
@ -865,14 +897,9 @@ private:
IConnectionPool::Entry connection;
for (UInt64 file_idx : file_indices)
for (const auto & file : files)
{
auto file_path = file_index_to_path.find(file_idx);
if (file_path == file_index_to_path.end())
throw Exception(ErrorCodes::DISTRIBUTED_BROKEN_BATCH_INFO,
"Failed to send batch: file with index {} is absent", file_idx);
ReadBufferFromFile in(file_path->second);
ReadBufferFromFile in(file);
const auto & distributed_header = readDistributedHeader(in, parent.log);
OpenTelemetry::TracingContextHolder thread_trace_context(__PRETTY_FUNCTION__,
@ -886,7 +913,7 @@ private:
compression_expected = connection->getCompression() == Protocol::Compression::Enable;
LOG_DEBUG(parent.log, "Sending a batch of {} files to {} ({} rows, {} bytes).",
file_indices.size(),
files.size(),
connection->getDescription(),
formatReadableQuantity(total_rows),
formatReadableSizeWithBinarySuffix(total_bytes));
@ -907,19 +934,11 @@ private:
{
size_t broken_files = 0;
for (UInt64 file_idx : file_indices)
for (const auto & file : files)
{
auto file_path = file_index_to_path.find(file_idx);
if (file_path == file_index_to_path.end())
{
LOG_ERROR(parent.log, "Failed to send one file from batch: file with index {} is absent", file_idx);
++broken_files;
continue;
}
try
{
ReadBufferFromFile in(file_path->second);
ReadBufferFromFile in(file);
const auto & distributed_header = readDistributedHeader(in, parent.log);
// this function is called in a separated thread, so we set up the trace context from the file
@ -941,9 +960,11 @@ private:
}
catch (Exception & e)
{
e.addMessage(fmt::format("While sending {}", file_path->second));
parent.maybeMarkAsBroken(file_path->second, e);
++broken_files;
if (isFileBrokenErrorCode(e.code(), e.isRemoteException()))
{
parent.markAsBroken(file);
++broken_files;
}
}
}
@ -1023,13 +1044,18 @@ std::shared_ptr<ISource> StorageDistributedDirectoryMonitor::createSourceFromFil
return std::make_shared<DirectoryMonitorSource>(file_name);
}
bool StorageDistributedDirectoryMonitor::addAndSchedule(size_t file_size, size_t ms)
bool StorageDistributedDirectoryMonitor::addAndSchedule(const std::string & file_path, size_t file_size, size_t ms)
{
if (quit)
/// NOTE: It is better not to throw in this case, since the file is already
/// on disk (see DistributedSink), and it will be processed next time.
if (pending_files.isFinished())
return false;
if (!pending_files.push(file_path))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot add pending file");
{
std::lock_guard status_lock(status_mutex);
std::lock_guard lock(status_mutex);
metric_pending_files.add();
status.bytes_count += file_size;
++status.files_count;
@ -1045,33 +1071,25 @@ StorageDistributedDirectoryMonitor::Status StorageDistributedDirectoryMonitor::g
return current_status;
}
void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map<UInt64, std::string> & files)
void StorageDistributedDirectoryMonitor::processFilesWithBatching()
{
std::unordered_set<UInt64> file_indices_to_skip;
/// Possibly, we failed to send a batch on the previous iteration. Try to send exactly the same batch.
if (fs::exists(current_batch_file_path))
{
/// Possibly, we failed to send a batch on the previous iteration. Try to send exactly the same batch.
Batch batch(*this, files);
Batch batch(*this);
ReadBufferFromFile in{current_batch_file_path};
batch.readText(in);
file_indices_to_skip.insert(batch.file_indices.begin(), batch.file_indices.end());
batch.send();
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path);
fs::remove(current_batch_file_path);
}
std::unordered_map<BatchHeader, Batch, BatchHeader::Hash> header_to_batch;
for (const auto & file : files)
std::string file_path;
while (pending_files.tryPop(file_path))
{
if (quit)
return;
UInt64 file_idx = file.first;
const String & file_path = file.second;
if (file_indices_to_skip.contains(file_idx))
continue;
size_t total_rows = 0;
size_t total_bytes = 0;
Block header;
@ -1110,8 +1128,9 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
}
catch (const Exception & e)
{
if (maybeMarkAsBroken(file_path, e))
if (isFileBrokenErrorCode(e.code(), e.isRemoteException()))
{
markAsBroken(file_path);
tryLogCurrentException(log, "File is marked broken due to");
continue;
}
@ -1125,9 +1144,9 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
std::move(distributed_header.client_info),
std::move(header)
);
Batch & batch = header_to_batch.try_emplace(batch_header, *this, files).first->second;
Batch & batch = header_to_batch.try_emplace(batch_header, *this).first->second;
batch.file_indices.push_back(file_idx);
batch.files.push_back(file_path);
batch.total_rows += total_rows;
batch.total_bytes += total_bytes;
@ -1155,16 +1174,10 @@ void StorageDistributedDirectoryMonitor::processFilesWithBatching(const std::map
void StorageDistributedDirectoryMonitor::markAsBroken(const std::string & file_path)
{
const auto last_path_separator_pos = file_path.rfind('/');
const auto & base_path = file_path.substr(0, last_path_separator_pos + 1);
const auto & file_name = file_path.substr(last_path_separator_pos + 1);
const String & broken_path = fs::path(base_path) / "broken/";
const String & broken_file_path = fs::path(broken_path) / file_name;
fs::create_directory(broken_path);
const String & broken_file_path = fs::path(broken_path) / fs::path(file_path).filename();
auto dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, relative_path);
auto broken_dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, fs::path(relative_path) / "broken/");
auto broken_dir_sync_guard = getDirectorySyncGuard(dir_fsync, disk, broken_relative_path);
{
std::lock_guard status_lock(status_mutex);
@ -1198,21 +1211,9 @@ void StorageDistributedDirectoryMonitor::markAsSend(const std::string & file_pat
fs::remove(file_path);
}
bool StorageDistributedDirectoryMonitor::maybeMarkAsBroken(const std::string & file_path, const Exception & e)
{
/// Mark file as broken if necessary.
if (isFileBrokenErrorCode(e.code(), e.isRemoteException()))
{
markAsBroken(file_path);
return true;
}
else
return false;
}
std::string StorageDistributedDirectoryMonitor::getLoggerName() const
{
return storage.getStorageID().getFullTableName() + ".DirectoryMonitor";
return storage.getStorageID().getFullTableName() + ".DirectoryMonitor." + disk->getName();
}
void StorageDistributedDirectoryMonitor::updatePath(const std::string & new_relative_path)
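
After this rewrite the directory is scanned exactly once, in initializeFilesFromDisk() at startup; afterwards new files enter through addAndSchedule() from DistributedSink, so the hot path never touches the filesystem to discover work. A standalone approximation of the startup scan using only std::filesystem (the real code additionally feeds CurrentMetrics and the pending_files queue):

```cpp
#include <cctype>
#include <cstdint>
#include <filesystem>
#include <string>
#include <vector>

namespace fs = std::filesystem;

static bool isAllDigits(const std::string & s)
{
    if (s.empty())
        return false;
    for (unsigned char c : s)
        if (!std::isdigit(c))
            return false;
    return true;
}

// Collects "<number>.bin" files and their total size, mimicking the
// startup scan; unexpected entries would be logged in the real code.
std::vector<std::string> collectPendingFiles(const fs::path & dir, std::uintmax_t & bytes)
{
    std::vector<std::string> pending;
    bytes = 0;
    for (const auto & entry : fs::directory_iterator(dir))
    {
        if (!entry.is_directory()
            && entry.path().extension() == ".bin"
            && isAllDigits(entry.path().stem().string()))
        {
            pending.push_back(entry.path().string());
            bytes += entry.file_size();
        }
    }
    return pending;
}
```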

View File

@ -1,6 +1,7 @@
#pragma once
#include <Core/BackgroundSchedulePool.h>
#include <Common/ConcurrentBoundedQueue.h>
#include <Client/ConnectionPool.h>
#include <atomic>
@ -38,7 +39,8 @@ public:
const std::string & relative_path_,
ConnectionPoolPtr pool_,
ActionBlocker & monitor_blocker_,
BackgroundSchedulePool & bg_pool);
BackgroundSchedulePool & bg_pool,
bool initialize_from_disk);
~StorageDistributedDirectoryMonitor();
@ -53,7 +55,7 @@ public:
static std::shared_ptr<ISource> createSourceFromFile(const String & file_name);
/// For scheduling via DistributedSink.
bool addAndSchedule(size_t file_size, size_t ms);
bool addAndSchedule(const std::string & file_path, size_t file_size, size_t ms);
struct InternalStatus
{
@ -78,14 +80,15 @@ public:
private:
void run();
std::map<UInt64, std::string> getFiles();
bool processFiles(const std::map<UInt64, std::string> & files);
bool hasPendingFiles() const;
void initializeFilesFromDisk();
void processFiles();
void processFile(const std::string & file_path);
void processFilesWithBatching(const std::map<UInt64, std::string> & files);
void processFilesWithBatching();
void markAsBroken(const std::string & file_path);
void markAsSend(const std::string & file_path);
bool maybeMarkAsBroken(const std::string & file_path, const Exception & e);
std::string getLoggerName() const;
@ -95,25 +98,33 @@ private:
DiskPtr disk;
std::string relative_path;
std::string path;
std::string broken_relative_path;
std::string broken_path;
const bool should_batch_inserts = false;
const bool split_batch_on_failure = true;
const bool dir_fsync = false;
const size_t min_batched_block_size_rows = 0;
const size_t min_batched_block_size_bytes = 0;
String current_batch_file_path;
/// This is pending data (due to some error) for should_batch_inserts==true
std::string current_batch_file_path;
/// This is pending data (due to some error) for should_batch_inserts==false
std::string current_batch_file;
struct BatchHeader;
struct Batch;
std::mutex status_mutex;
InternalStatus status;
ConcurrentBoundedQueue<std::string> pending_files;
const std::chrono::milliseconds default_sleep_time;
std::chrono::milliseconds sleep_time;
const std::chrono::milliseconds max_sleep_time;
std::chrono::time_point<std::chrono::system_clock> last_decrease_time {std::chrono::system_clock::now()};
std::atomic<bool> quit {false};
std::mutex mutex;
Poco::Logger * log;
ActionBlocker & monitor_blocker;

View File

@ -724,6 +724,9 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const
return guard;
};
std::vector<std::string> bin_files;
bin_files.reserve(dir_names.size());
auto it = dir_names.begin();
/// On the first iteration, write the block to a temporary directory for subsequent
/// hardlinking to ensure the inode is not freed until we're done.
@ -802,8 +805,8 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const
}
// Create hardlink here to reuse increment number
const std::string block_file_path(fs::path(path) / file_name);
createHardLink(first_file_tmp_path, block_file_path);
bin_files.push_back(fs::path(path) / file_name);
createHardLink(first_file_tmp_path, bin_files.back());
auto dir_sync_guard = make_directory_sync_guard(*it);
}
++it;
@ -814,8 +817,8 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const
const std::string path(fs::path(disk_path) / (data_path + *it));
fs::create_directory(path);
const std::string block_file_path(fs::path(path) / (toString(storage.file_names_increment.get()) + ".bin"));
createHardLink(first_file_tmp_path, block_file_path);
bin_files.push_back(fs::path(path) / (toString(storage.file_names_increment.get()) + ".bin"));
createHardLink(first_file_tmp_path, bin_files.back());
auto dir_sync_guard = make_directory_sync_guard(*it);
}
@ -826,10 +829,13 @@ void DistributedSink::writeToShard(const Cluster::ShardInfo & shard_info, const
/// Notify
auto sleep_ms = context->getSettingsRef().distributed_directory_monitor_sleep_time_ms;
for (const auto & dir_name : dir_names)
for (size_t i = 0; i < dir_names.size(); ++i)
{
const auto & dir_name = dir_names[i];
const auto & bin_file = bin_files[i];
auto & directory_monitor = storage.requireDirectoryMonitor(disk, dir_name, /* startup= */ false);
directory_monitor.addAndSchedule(file_size, sleep_ms.totalMilliseconds());
directory_monitor.addAndSchedule(bin_file, file_size, sleep_ms.totalMilliseconds());
}
}
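`writeToShard` now records every created hardlink in `bin_files`, so each directory monitor can later be told exactly which file to schedule. The hardlink trick itself is unchanged: one temporary file, one inode, one link per shard directory. A minimal sketch with `std::filesystem` and hypothetical paths:

```cpp
#include <filesystem>
#include <fstream>
#include <string>
#include <vector>

namespace fs = std::filesystem;

int main()
{
    // Write the block once into a temporary file ...
    const fs::path tmp_path = "tmp/1.bin";
    fs::create_directories(tmp_path.parent_path());
    std::ofstream(tmp_path) << "block data";

    // ... then hardlink it into each per-shard directory. All links share
    // one inode, so the data hits the disk exactly once and stays alive
    // until the last link is removed.
    std::vector<std::string> bin_files;
    for (const char * dir_name : {"shard1", "shard2"})
    {
        fs::create_directories(dir_name);
        bin_files.push_back((fs::path(dir_name) / "1.bin").string());
        fs::create_hard_link(tmp_path, bin_files.back());
    }

    // The temporary file can now be removed; each monitor is later told
    // exactly which bin_files entry to schedule.
    fs::remove(tmp_path);
}
```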

View File

@ -651,7 +651,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchSelectedPart(
}
#endif
LOG_WARNING(log, fmt::runtime(e.message() + " Will retry fetching part without zero-copy."));
LOG_WARNING(log, "Will retry fetching part without zero-copy: {}", e.message());
/// It's important to release session from HTTP pool. Otherwise it's possible to get deadlock
/// on http pool.
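The logging change here swaps a `fmt::runtime(...)` call on a concatenated message for a literal format string with the message as an argument, which fmt can check at compile time. A small sketch of the two styles:

```cpp
#include <fmt/format.h>
#include <string>

int main()
{
    const std::string e_message = "Zero-copy fetch failed";

    // Before: the concatenated message is itself the format string, so it
    // must be wrapped in fmt::runtime, and any stray '{' in the exception
    // text would throw at run time.
    fmt::print(fmt::runtime(e_message + ". Will retry fetching part without zero-copy.\n"));

    // After: a literal format string that fmt checks at compile time; the
    // message is an ordinary argument and needs no escaping.
    fmt::print("Will retry fetching part without zero-copy: {}\n", e_message);
}
```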

View File

@ -109,10 +109,11 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare()
/// 2. We have some larger merged part which covers new_part_name (and therefore it covers source_part_name too)
/// 3. We have two intersecting parts, both of which cover source_part_name. It's a logical error.
/// TODO: Why can 1 and 2 happen? Do we need more assertions here or somewhere else?
constexpr const char * message = "Part {} is covered by {} but should be merged into {}. This shouldn't happen often.";
LOG_WARNING(log, fmt::runtime(message), source_part_name, source_part_or_covering->name, entry.new_part_name);
constexpr auto fmt_string = "Part {} is covered by {} but should be merged into {}. This shouldn't happen often.";
String message;
LOG_WARNING(LogToStr(message, log), fmt_string, source_part_name, source_part_or_covering->name, entry.new_part_name);
if (!source_part_or_covering->info.contains(MergeTreePartInfo::fromPartName(entry.new_part_name, storage.format_version)))
throw Exception(ErrorCodes::LOGICAL_ERROR, message, source_part_name, source_part_or_covering->name, entry.new_part_name);
throw Exception(ErrorCodes::LOGICAL_ERROR, message);
return PrepareResult{
.prepared_successfully = false,

File diff suppressed because it is too large

View File

@ -1052,6 +1052,8 @@ public:
/// Returns an object that protects temporary directory from cleanup
scope_guard getTemporaryPartDirectoryHolder(const String & part_dir_name) const;
void waitForOutdatedPartsToBeLoaded() const;
protected:
friend class IMergeTreeDataPart;
friend class MergeTreeDataMergerMutator;
@ -1068,7 +1070,6 @@ protected:
/// under lockForShare if rename is possible.
String relative_data_path;
/// Current column sizes in compressed and uncompressed form.
ColumnSizeByName column_sizes;
@ -1330,6 +1331,88 @@ protected:
void resetObjectColumnsFromActiveParts(const DataPartsLock & lock);
void updateObjectColumns(const DataPartPtr & part, const DataPartsLock & lock);
/** A structure that explicitly represents a "merge tree" of parts
 * which is implicitly presented by min-max block numbers and levels of parts.
 * The children of a node are parts that are covered by the parent part.
 * This tree provides the order of loading of parts.
 *
 * We start traversal from the top level and load the parts that
 * correspond to the nodes. If a part is loaded successfully, then we
 * stop traversal at this node. Otherwise the part is broken, so we
 * traverse its children and try to load the covered parts, which will
 * replace the broken covering part. Unloaded nodes represent outdated parts
 * and they are pushed to a background task and loaded asynchronously.
 */
class PartLoadingTree
{
public:
struct Node
{
Node(const MergeTreePartInfo & info_, const String & name_, const DiskPtr & disk_)
: info(info_), name(name_), disk(disk_)
{
}
const MergeTreePartInfo info;
const String name;
const DiskPtr disk;
bool is_loaded = false;
std::map<MergeTreePartInfo, std::shared_ptr<Node>> children;
};
struct PartLoadingInfo
{
PartLoadingInfo(const MergeTreePartInfo & info_, const String & name_, const DiskPtr & disk_)
: info(info_), name(name_), disk(disk_)
{
}
/// Store name explicitly because it cannot be easily
/// retrieved from info in tables with old syntax.
MergeTreePartInfo info;
String name;
DiskPtr disk;
};
using NodePtr = std::shared_ptr<Node>;
using PartLoadingInfos = std::vector<PartLoadingInfo>;
/// Builds a tree from the list of part infos.
static PartLoadingTree build(PartLoadingInfos nodes);
/// Traverses the tree and calls @func on each node.
/// If recursive is false, traverses only the top level.
template <typename Func>
void traverse(bool recursive, Func && func);
private:
/// NOTE: Parts should be added in descending order of their levels,
/// because rearranging the tree to a new root is not supported.
void add(const MergeTreePartInfo & info, const String & name, const DiskPtr & disk);
std::unordered_map<String, NodePtr> root_by_partition;
};
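To make the loading order concrete, here is a rough sketch of such a containment tree and its traversal, with a deliberately simplified `PartInfo` (block range plus level; the real `MergeTreePartInfo` also tracks partition id and mutation):

```cpp
#include <functional>
#include <map>
#include <memory>
#include <string>
#include <tuple>

// Hypothetical simplified part info: a block-number range plus a level.
// A part covers another if its range contains the other's range.
struct PartInfo
{
    int min_block = 0;
    int max_block = 0;
    int level = 0;

    bool operator<(const PartInfo & other) const
    {
        return std::tie(min_block, max_block, level)
             < std::tie(other.min_block, other.max_block, other.level);
    }
};

struct Node
{
    PartInfo info;
    std::string name;
    std::map<PartInfo, std::shared_ptr<Node>> children; // covered parts
};

// Traverse from the top: if a covering part loads successfully, the parts
// it covers are skipped; otherwise descend and try to load them instead.
void traverse(const std::shared_ptr<Node> & node, const std::function<bool(const Node &)> & try_load)
{
    if (try_load(*node))
        return;
    for (const auto & entry : node->children)
        traverse(entry.second, try_load);
}

int main()
{
    auto root = std::make_shared<Node>(Node{{1, 3, 2}, "all_1_3_2", {}});
    root->children[{1, 1, 0}] = std::make_shared<Node>(Node{{1, 1, 0}, "all_1_1_0", {}});
    root->children[{2, 3, 1}] = std::make_shared<Node>(Node{{2, 3, 1}, "all_2_3_1", {}});

    // Pretend the covering part is broken: its children get loaded instead.
    traverse(root, [](const Node & node) { return node.name != "all_1_3_2"; });
}
```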
using PartLoadingTreeNodes = std::vector<PartLoadingTree::NodePtr>;
struct LoadPartResult
{
bool is_broken = false;
std::optional<size_t> size_of_part;
MutableDataPartPtr part;
};
mutable std::mutex outdated_data_parts_mutex;
mutable std::condition_variable outdated_data_parts_cv;
BackgroundSchedulePool::TaskHolder outdated_data_parts_loading_task;
PartLoadingTreeNodes outdated_unloaded_data_parts TSA_GUARDED_BY(outdated_data_parts_mutex);
bool outdated_data_parts_loading_canceled TSA_GUARDED_BY(outdated_data_parts_mutex) = false;
void loadOutdatedDataParts(bool is_async);
void startOutdatedDataPartsLoadingTask();
void stopOutdatedDataPartsLoadingTask();
static void incrementInsertedPartsProfileEvent(MergeTreeDataPartType type);
static void incrementMergedPartsProfileEvent(MergeTreeDataPartType type);
@ -1408,18 +1491,20 @@ private:
/// Returns default settings for storage with possible changes from global config.
virtual std::unique_ptr<MergeTreeSettings> getDefaultSettings() const = 0;
void loadDataPartsFromDisk(
MutableDataPartsVector & broken_parts_to_detach,
MutableDataPartsVector & duplicate_parts_to_remove,
LoadPartResult loadDataPart(
const MergeTreePartInfo & part_info,
const String & part_name,
const DiskPtr & part_disk_ptr,
MergeTreeDataPartState to_state,
std::mutex & part_loading_mutex);
std::vector<LoadPartResult> loadDataPartsFromDisk(
ThreadPool & pool,
size_t num_parts,
std::queue<std::vector<std::pair<String, DiskPtr>>> & parts_queue,
bool skip_sanity_checks,
std::queue<PartLoadingTreeNodes> & parts_queue,
const MergeTreeSettingsPtr & settings);
void loadDataPartsFromWAL(
MutableDataPartsVector & duplicate_parts_to_remove,
MutableDataPartsVector & parts_from_wal);
void loadDataPartsFromWAL(MutableDataPartsVector & parts_from_wal);
/// Create zero-copy exclusive lock for part and disk. Useful for coordination of
/// distributed operations which can lead to data duplication. Implemented only in ReplicatedMergeTree.
@ -1430,7 +1515,7 @@ private:
/// Otherwise, in non-parallel case will break and return.
void clearPartsFromFilesystemImpl(const DataPartsVector & parts, NameSet * part_names_succeed);
static MutableDataPartPtr preparePartForRemoval(const DataPartPtr & part);
static MutableDataPartPtr asMutableDeletingPart(const DataPartPtr & part);
mutable TemporaryParts temporary_parts;
};

View File

@ -193,7 +193,8 @@ BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts(
if (!metadata_snapshot->hasPartitionKey()) /// Table is not partitioned.
{
result.emplace_back(Block(block), Row{});
result[0].offsets = chunk_offsets;
if (chunk_offsets != nullptr)
result[0].offsets = std::move(chunk_offsets->offsets);
return result;
}
@ -230,7 +231,7 @@ BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts(
/// do not interfere with possible calculated primary key columns of the same name.
result.emplace_back(Block(block), get_partition(0));
if (!chunk_offsets_with_partition.empty())
result[0].offsets = chunk_offsets_with_partition[0];
result[0].offsets = std::move(chunk_offsets_with_partition[0]->offsets);
return result;
}
@ -245,7 +246,7 @@ BlocksWithPartition MergeTreeDataWriter::splitBlockIntoParts(
}
for (size_t i = 0; i < chunk_offsets_with_partition.size(); ++i)
result[i].offsets = chunk_offsets_with_partition[i];
result[i].offsets = std::move(chunk_offsets_with_partition[i]->offsets);
return result;
}
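The writer now moves the offsets vector out of the shared `ChunkOffsets` object instead of storing the shared pointer, so each `BlockWithPartition` owns its offsets outright. A tiny sketch of that ownership transfer, with simplified stand-in types:

```cpp
#include <cassert>
#include <cstddef>
#include <memory>
#include <utility>
#include <vector>

// Hypothetical simplified stand-ins for ChunkOffsets and BlockWithPartition.
struct ChunkOffsets { std::vector<size_t> offsets; };
struct BlockWithPartition { std::vector<size_t> offsets; };

int main()
{
    auto chunk_offsets = std::make_shared<ChunkOffsets>();
    chunk_offsets->offsets = {0, 100, 250};

    BlockWithPartition part;
    // Move the vector out of the shared object: the block becomes the sole
    // owner of the offsets; no element copies, no shared_ptr kept alive.
    part.offsets = std::move(chunk_offsets->offsets);

    assert(part.offsets.size() == 3);
    // chunk_offsets->offsets is now valid but unspecified (typically empty)
    // and must not be relied on for its old contents.
}
```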

View File

@ -22,15 +22,15 @@ struct BlockWithPartition
{
Block block;
Row partition;
ChunkOffsetsPtr offsets;
std::vector<size_t> offsets;
BlockWithPartition(Block && block_, Row && partition_)
: block(block_), partition(std::move(partition_))
{
}
BlockWithPartition(Block && block_, Row && partition_, ChunkOffsetsPtr chunk_offsets_)
: block(block_), partition(std::move(partition_)), offsets(chunk_offsets_)
BlockWithPartition(Block && block_, Row && partition_, std::vector<size_t> && offsets_)
: block(block_), partition(std::move(partition_)), offsets(std::move(offsets_))
{
}
};

View File

@ -373,9 +373,9 @@ CheckResult ReplicatedMergeTreePartCheckThread::checkPart(const String & part_na
throw;
tryLogCurrentException(log, __PRETTY_FUNCTION__);
String message = "Part " + part_name + " looks broken. Removing it and will try to fetch.";
LOG_ERROR(log, fmt::runtime(message));
constexpr auto fmt_string = "Part {} looks broken. Removing it and will try to fetch.";
String message = fmt::format(fmt_string, part_name);
LOG_ERROR(log, fmt_string, part_name);
/// Delete part locally.
storage.outdateBrokenPartAndCloneToDetached(part, "broken");
@ -392,9 +392,9 @@ CheckResult ReplicatedMergeTreePartCheckThread::checkPart(const String & part_na
/// Probably, someone just wrote down the part, and has not yet added to ZK.
/// Therefore, delete only if the part is old (not very reliable).
ProfileEvents::increment(ProfileEvents::ReplicatedPartChecksFailed);
String message = "Unexpected part " + part_name + " in filesystem. Removing.";
LOG_ERROR(log, fmt::runtime(message));
constexpr auto fmt_string = "Unexpected part {} in filesystem. Removing.";
String message = fmt::format(fmt_string, part_name);
LOG_ERROR(log, fmt_string, part_name);
storage.outdateBrokenPartAndCloneToDetached(part, "unexpected");
return {part_name, false, message};
}
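This hunk, and the `ReplicatedMergeTreeQueue` ones below, follow the same pattern: format the message once, write it to the log, and keep the text for the return value or exception. ClickHouse wraps this in `LogToStr`; the sketch below is a hypothetical reimplementation of the idea (`Logger` and `logErrorToStr` are stand-ins, not the real API):

```cpp
#include <fmt/format.h>
#include <iostream>
#include <string>
#include <utility>

// Hypothetical stand-in for a logger; the real code uses Poco::Logger.
struct Logger
{
    void error(const std::string & msg) { std::cerr << "<Error> " << msg << '\n'; }
};

// Format once, keep the text in `out`, and log it: the caller can then
// return or rethrow the very same message without formatting it twice.
template <typename... Args>
void logErrorToStr(std::string & out, Logger & log, fmt::format_string<Args...> fmt_string, Args &&... args)
{
    out = fmt::format(fmt_string, std::forward<Args>(args)...);
    log.error(out);
}

int main()
{
    Logger log;
    std::string message;
    constexpr auto fmt_string = "Part {} looks broken. Removing it and will try to fetch.";
    logErrorToStr(message, log, fmt_string, "all_1_1_0");
    // `message` is now available for the CheckResult / exception as well.
}
```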

View File

@ -1191,12 +1191,10 @@ bool ReplicatedMergeTreeQueue::isCoveredByFuturePartsImpl(const LogEntry & entry
if (entry_for_same_part_it != future_parts.end())
{
const LogEntry & another_entry = *entry_for_same_part_it->second;
out_reason = fmt::format(
"Not executing log entry {} of type {} for part {} "
"because another log entry {} of type {} for the same part ({}) is being processed.",
entry.znode_name, entry.type, entry.new_part_name,
another_entry.znode_name, another_entry.type, another_entry.new_part_name);
LOG_INFO(log, fmt::runtime(out_reason));
constexpr auto fmt_string = "Not executing log entry {} of type {} for part {} "
"because another log entry {} of type {} for the same part ({}) is being processed.";
LOG_INFO(LogToStr(out_reason, log), fmt_string, entry.znode_name, entry.type, entry.new_part_name,
another_entry.znode_name, another_entry.type, another_entry.new_part_name);
return true;
/** When the corresponding action is completed, then `isNotCoveredByFuturePart` will succeed the next time,
@ -1238,11 +1236,9 @@ bool ReplicatedMergeTreeQueue::isCoveredByFuturePartsImpl(const LogEntry & entry
{
if (entry.znode_name < future_part_elem.second->znode_name)
{
out_reason = fmt::format(
"Not executing log entry {} for part {} "
"because it is not disjoint with part {} that is currently executing and another entry {} is newer.",
entry.znode_name, new_part_name, future_part_elem.first, future_part_elem.second->znode_name);
LOG_TRACE(log, fmt::runtime(out_reason));
constexpr auto fmt_string = "Not executing log entry {} for part {} "
"because it is not disjoint with part {} that is currently executing and another entry {} is newer.";
LOG_TRACE(LogToStr(out_reason, log), fmt_string, entry.znode_name, new_part_name, future_part_elem.first, future_part_elem.second->znode_name);
return true;
}
@ -1250,11 +1246,9 @@ bool ReplicatedMergeTreeQueue::isCoveredByFuturePartsImpl(const LogEntry & entry
continue;
}
out_reason = fmt::format(
"Not executing log entry {} for part {} "
"because it is not disjoint with part {} that is currently executing.",
entry.znode_name, new_part_name, future_part_elem.first);
LOG_TRACE(log, fmt::runtime(out_reason));
constexpr auto fmt_string = "Not executing log entry {} for part {} "
"because it is not disjoint with part {} that is currently executing.";
LOG_TEST(LogToStr(out_reason, log), fmt_string, entry.znode_name, new_part_name, future_part_elem.first);
return true;
}
@ -1337,11 +1331,9 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
{
if (future_parts.contains(name))
{
out_postpone_reason = fmt::format(
"Not executing log entry {} of type {} for part {} "
"because part {} is not ready yet (log entry for that part is being processed).",
entry.znode_name, entry.typeToString(), entry.new_part_name, name);
LOG_TRACE(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} of type {} for part {} "
"because part {} is not ready yet (log entry for that part is being processed).";
LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.typeToString(), entry.new_part_name, name);
return false;
}
@ -1357,10 +1349,8 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
if (merger_mutator.merges_blocker.isCancelled())
{
out_postpone_reason = fmt::format(
"Not executing log entry {} of type {} for part {} because merges and mutations are cancelled now.",
entry.znode_name, entry.typeToString(), entry.new_part_name);
LOG_DEBUG(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} of type {} for part {} because merges and mutations are cancelled now.";
LOG_DEBUG(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.typeToString(), entry.new_part_name);
return false;
}
@ -1375,8 +1365,8 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
if (!disks.empty() && only_s3_storage && storage.checkZeroCopyLockExists(entry.new_part_name, disks[0]))
{
out_postpone_reason = "Not executing merge/mutation for the part " + entry.new_part_name
+ ", waiting other replica to execute it and will fetch after.";
constexpr auto fmt_string = "Not executing merge/mutation for the part {}, waiting other replica to execute it and will fetch after.";
out_postpone_reason = fmt::format(fmt_string, entry.new_part_name);
return false;
}
}
@ -1387,9 +1377,8 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
if (replica_to_execute_merge && !merge_strategy_picker.isMergeFinishedByReplica(replica_to_execute_merge.value(), entry))
{
String reason = "Not executing merge for the part " + entry.new_part_name
+ ", waiting for " + replica_to_execute_merge.value() + " to execute merge.";
out_postpone_reason = reason;
constexpr auto fmt_string = "Not executing merge for the part {}, waiting for {} to execute merge.";
out_postpone_reason = fmt::format(fmt_string, entry.new_part_name, replica_to_execute_merge.value());
return false;
}
}
@ -1411,20 +1400,16 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
{
if (merger_mutator.ttl_merges_blocker.isCancelled())
{
out_postpone_reason = fmt::format(
"Not executing log entry {} for part {} because merges with TTL are cancelled now.",
entry.znode_name, entry.new_part_name);
LOG_DEBUG(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} for part {} because merges with TTL are cancelled now.";
LOG_DEBUG(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.new_part_name);
return false;
}
size_t total_merges_with_ttl = data.getTotalMergesWithTTLInMergeList();
if (total_merges_with_ttl >= data_settings->max_number_of_merges_with_ttl_in_pool)
{
out_postpone_reason = fmt::format(
"Not executing log entry {} for part {} because {} merges with TTL already executing, maximum {}.",
entry.znode_name, entry.new_part_name, total_merges_with_ttl,
data_settings->max_number_of_merges_with_ttl_in_pool);
LOG_DEBUG(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} for part {} because {} merges with TTL already executing, maximum {}.";
LOG_DEBUG(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.new_part_name, total_merges_with_ttl,
data_settings->max_number_of_merges_with_ttl_in_pool);
return false;
}
}
@ -1432,12 +1417,10 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
if (!ignore_max_size && sum_parts_size_in_bytes > max_source_parts_size)
{
out_postpone_reason = fmt::format("Not executing log entry {} of type {} for part {}"
" because source parts size ({}) is greater than the current maximum ({}).",
entry.znode_name, entry.typeToString(), entry.new_part_name,
ReadableSize(sum_parts_size_in_bytes), ReadableSize(max_source_parts_size));
LOG_DEBUG(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} of type {} for part {}"
" because source parts size ({}) is greater than the current maximum ({}).";
LOG_DEBUG(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.typeToString(), entry.new_part_name,
ReadableSize(sum_parts_size_in_bytes), ReadableSize(max_source_parts_size));
return false;
}
@ -1450,10 +1433,8 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
if (!alter_sequence.canExecuteMetaAlter(entry.alter_version, state_lock))
{
int head_alter = alter_sequence.getHeadAlterVersion(state_lock);
out_postpone_reason = fmt::format(
"Cannot execute alter metadata {} with version {} because another alter {} must be executed before",
entry.znode_name, entry.alter_version, head_alter);
LOG_TRACE(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Cannot execute alter metadata {} with version {} because another alter {} must be executed before";
LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.alter_version, head_alter);
return false;
}
}
@ -1466,17 +1447,13 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
int head_alter = alter_sequence.getHeadAlterVersion(state_lock);
if (head_alter == entry.alter_version)
{
out_postpone_reason = fmt::format(
"Cannot execute alter data {} with version {} because metadata still not altered",
entry.znode_name, entry.alter_version);
LOG_TRACE(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Cannot execute alter data {} with version {} because metadata still not altered";
LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.alter_version);
}
else
{
out_postpone_reason = fmt::format(
"Cannot execute alter data {} with version {} because another alter {} must be executed before",
entry.znode_name, entry.alter_version, head_alter);
LOG_TRACE(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Cannot execute alter data {} with version {} because another alter {} must be executed before";
LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.alter_version, head_alter);
}
return false;
@ -1498,14 +1475,12 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
{
if (drop_range_info.isDisjoint(info))
continue;
out_postpone_reason = fmt::format(
"Not executing log entry {} of type {} for part {} "
"because another DROP_RANGE or REPLACE_RANGE entry with not disjoint range {} is currently executing.",
entry.znode_name,
entry.typeToString(),
entry.new_part_name,
info.getPartNameForLogs());
LOG_TRACE(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} of type {} for part {} "
"because another DROP_RANGE or REPLACE_RANGE entry with not disjoint range {} is currently executing.";
LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name,
entry.typeToString(),
entry.new_part_name,
info.getPartNameForLogs());
return false;
}
}
@ -1531,11 +1506,10 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
auto new_part_info = MergeTreePartInfo::fromPartName(new_part_name, format_version);
if (!new_part_info.isDisjoint(drop_part_info))
{
out_postpone_reason = fmt::format(
"Not executing log entry {} of type {} for part {} "
"because it probably depends on {} (REPLACE_RANGE).",
entry.znode_name, entry.typeToString(), entry.new_part_name, replace_entry->znode_name);
LOG_TRACE(log, fmt::runtime(out_postpone_reason));
constexpr auto fmt_string = "Not executing log entry {} of type {} for part {} "
"because it probably depends on {} (REPLACE_RANGE).";
LOG_TRACE(LogToStr(out_postpone_reason, log), fmt_string, entry.znode_name, entry.typeToString(),
entry.new_part_name, replace_entry->znode_name);
return false;
}
}

Some files were not shown because too many files have changed in this diff