Merge branch 'master' into hanfei/gwp-asan

2024-11-22 15:42:02 +00:00 · 2023-01-20 11:03:12 +01:00 · 2023-01-20 11:03:12 +01:00 · 8b29f8406b
commit 8b29f8406b
parent 29b20775f7 0ad37ad286
291 changed files with 10836 additions and 1928 deletions
--- a/.github/workflows/backport_branches.yml
+++ b/.github/workflows/backport_branches.yml
@ -683,3 +683,4 @@ jobs:
        run: |
          cd "$GITHUB_WORKSPACE/tests/ci"
          python3 finish_check.py
+          python3 merge_pr.py
--- a/.github/workflows/debug.yml
+++ b/.github/workflows/debug.yml
@ -8,4 +8,4 @@ jobs:
  DebugInfo:
    runs-on: ubuntu-latest
    steps:
-    - uses: hmarr/debug-action@1201a20fc9d278ddddd5f0f46922d06513892491
+    - uses: hmarr/debug-action@a701ed95a46e6f2fb0df25e1a558c16356fae35a
--- a/.github/workflows/docs_check.yml
+++ b/.github/workflows/docs_check.yml
@ -169,3 +169,4 @@ jobs:
        run: |
          cd "$GITHUB_WORKSPACE/tests/ci"
          python3 finish_check.py
+          python3 merge_pr.py --check-approved
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@ -4388,3 +4388,4 @@ jobs:
        run: |
          cd "$GITHUB_WORKSPACE/tests/ci"
          python3 finish_check.py
+          python3 merge_pr.py --check-approved
--- a/.gitignore
+++ b/.gitignore
@ -154,6 +154,8 @@ website/package-lock.json
 /programs/server/data
 /programs/server/metadata
 /programs/server/store
+/programs/server/uuid
+/programs/server/coordination

 # temporary test files
 tests/queries/0_stateless/test_*
--- a/contrib/poco
+++ b/contrib/poco
@ -1 +1 @@
-Subproject commit 799234226187c0ae0b8c90f23465b25ed7956e56
+Subproject commit 0ab9bba7ccad3c8dacce04a35cb3b78218547ab4
--- a/docker/server/README.md
+++ b/docker/server/README.md
@ -58,7 +58,7 @@ echo 'SELECT version()' | curl 'http://localhost:18123/' --data-binary @-
 22.6.3.35
 ```

-or by allowing the container to use [host ports directly](https://docs.docker.com/network/host/) using `--network=host` (also allows archiving better network performance):
+or by allowing the container to use [host ports directly](https://docs.docker.com/network/host/) using `--network=host` (also allows achieving better network performance):

 ```bash
 docker run -d --network=host --name some-clickhouse-server --ulimit nofile=262144:262144 clickhouse/clickhouse-server
--- a/docker/test/fuzzer/run-fuzzer.sh
+++ b/docker/test/fuzzer/run-fuzzer.sh
@ -5,6 +5,7 @@ set -x

 # core.COMM.PID-TID
 sysctl kernel.core_pattern='core.%e.%p-%P'
+dmesg --clear ||:

 set -e
 set -u
@ -368,6 +369,7 @@ if [ -f core.zst ]; then
 fi

 rg --text -F '<Fatal>' server.log > fatal.log ||:
+dmesg -T > dmesg.log ||:

 zstd --threads=0 server.log

@ -396,6 +398,7 @@ p.links a { padding: 5px; margin: 3px; background: #FFF; line-height: 2; white-s
  <a href="fuzzer.log">fuzzer.log</a>
  <a href="server.log.zst">server.log.zst</a>
  <a href="main.log">main.log</a>
+  <a href="dmesg.log">dmesg.log</a>
  ${CORE_LINK}
 </p>
 <table>
--- a/docker/test/stateless/run.sh
+++ b/docker/test/stateless/run.sh
@ -128,6 +128,7 @@ function run_tests()

    if [[ "${HIGH_LEVEL_COVERAGE}" = "YES" ]]; then
        ADDITIONAL_OPTIONS+=('--report-coverage')
+        ADDITIONAL_OPTIONS+=('--report-logs-stats')
    fi

    set +e
--- a/docker/test/stress/stress
+++ b/docker/test/stress/stress
@ -289,6 +289,7 @@ if __name__ == "__main__":
                "--database=system",
                "--hung-check",
                "--stress",
+                "--report-logs-stats",
                "00001_select_1",
            ]
        )
--- a/docs/en/development/architecture.md
+++ b/docs/en/development/architecture.md
@ -182,6 +182,31 @@ No matter what pool is used for a job, at start `ThreadStatus` instance is creat

 If thread is related to query execution, then the most important thing attached to `ThreadStatus` is query context `ContextPtr`. Every query has its master thread in the server pool. Master thread does the attachment by holding an `ThreadStatus::QueryScope query_scope(query_context)` object. Master thread also creates a thread group represented with `ThreadGroupStatus` object. Every additional thread that is allocated during this query execution is attached to its thread group by `CurrentThread::attachTo(thread_group)` call. Thread groups are used to aggregate profile event counters and track memory consumption by all threads dedicated to a single task (see `MemoryTracker` and `ProfileEvents::Counters` classes for more information).

+## Concurrency control {#concurrency-control}
+A query that can be parallelized uses the `max_threads` setting to limit itself. The default value for this setting is selected in a way that allows a single query to utilize all CPU cores in the best way. But what if there are multiple concurrent queries and each of them uses the default `max_threads` setting value? Then the queries will share CPU resources. The OS will ensure fairness by constantly switching threads, which introduces some performance penalty. `ConcurrencyControl` helps to deal with this penalty and avoid allocating a lot of threads. The configuration setting `concurrent_threads_soft_limit_num` is used to limit how many concurrent threads can be allocated before applying some kind of CPU pressure.
+
+:::note
+`concurrent_threads_soft_limit_num` and `concurrent_threads_soft_limit_ratio_to_cores` are disabled (equal to 0) by default, so this feature must be enabled before use.
+:::
+
+The notion of a CPU `slot` is introduced. A slot is a unit of concurrency: to run a thread, a query has to acquire a slot in advance and release it when the thread stops. The number of slots is globally limited in a server. Multiple concurrent queries compete for CPU slots if the total demand exceeds the total number of slots. `ConcurrencyControl` is responsible for resolving this competition by doing CPU slot scheduling in a fair manner.
+
+Each slot can be seen as an independent state machine with the following states:
+ * `free`: slot is available to be allocated by any query.
+ * `granted`: slot is `allocated` by specific query, but not yet acquired by any thread.
+ * `acquired`: slot is `allocated` by specific query and acquired by a thread.
+
+Note that an `allocated` slot can be in two different states: `granted` and `acquired`. The former is a transitional state that should actually be short (from the instant when a slot is allocated to a query until the moment when the up-scaling procedure is run by any thread of that query).
+
+![state diagram](@site/docs/en/development/images/concurrency.png)
+
+API of `ConcurrencyControl` consists of the following functions:
+1. Create a resource allocation for a query: `auto slots = ConcurrencyControl::instance().allocate(1, max_threads);`. It will allocate at least 1 and at most `max_threads` slots. Note that the first slot is granted immediately, but the remaining slots may be granted later. Thus the limit is soft, because every query will obtain at least one thread.
+2. For every thread a slot has to be acquired from an allocation: `while (auto slot = slots->tryAcquire()) spawnThread([slot = std::move(slot)] { ... });`.
+3. Update the total amount of slots: `ConcurrencyControl::setMaxConcurrency(concurrent_threads_soft_limit_num)`. This can be done at runtime, without a server restart.
+
+This API allows queries to start with at least one thread (in presence of CPU pressure) and later scale up to `max_threads`.
+
 ## Distributed Query Execution {#distributed-query-execution}

 Servers in a cluster setup are mostly independent. You can create a `Distributed` table on one or all servers in a cluster. The `Distributed` table does not store data itself – it only provides a “view” to all local tables on multiple nodes of a cluster. When you SELECT from a `Distributed` table, it rewrites that query, chooses remote nodes according to load balancing settings, and sends the query to them. The `Distributed` table requests remote servers to process a query just up to a stage where intermediate results from different servers can be merged. Then it receives the intermediate results and merges them. The distributed table tries to distribute as much work as possible to remote servers and does not send much intermediate data over the network.
--- a/docs/en/development/build.md
+++ b/docs/en/development/build.md
@ -147,6 +147,14 @@ hash cmake

 ClickHouse is available in pre-built binaries and packages. Binaries are portable and can be run on any Linux flavour.

-Binaries are built for stable and LTS releases and also every commit to `master` for each pull request.
+The CI checks build the binaries on each commit to [ClickHouse](https://github.com/clickhouse/clickhouse/). To download them:
+
+1. Open the [commits list](https://github.com/ClickHouse/ClickHouse/commits/master)
+1. Choose a **Merge pull request** commit that includes the new feature, or was added after the new feature
+1. Click the status symbol (yellow dot, red x, green check) to open the CI check list
+1. Scroll through the list until you find **ClickHouse build check x/x artifact groups are OK**
+1. Click **Details**
+1. Find the type of package for your operating system that you need and download the files.
+
+![build artifact check](images/find-build-artifact.png)

-To find the freshest build from `master`, go to [commits page](https://github.com/ClickHouse/ClickHouse/commits/master), click on the first green check mark or red cross near commit, and click to the “Details” link right after “ClickHouse Build Check”.
--- a/docs/en/development/images/concurrency.png
+++ b/docs/en/development/images/concurrency.png
--- a/docs/en/development/images/find-build-artifact.png
+++ b/docs/en/development/images/find-build-artifact.png
--- a/docs/en/engines/database-engines/postgresql.md
+++ b/docs/en/engines/database-engines/postgresql.md
@ -136,3 +136,7 @@ DESCRIBE TABLE test_database.test_table;
 │ data   │ Nullable(String)  │
 └────────┴───────────────────┘
 ```
+
+## Related content
+
+- Blog: [ClickHouse and PostgreSQL - a match made in data heaven - part 1](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres)
--- a/docs/en/engines/table-engines/integrations/postgresql.md
+++ b/docs/en/engines/table-engines/integrations/postgresql.md
@ -175,3 +175,6 @@ CREATE TABLE pg_table_schema_with_dots (a UInt32)

 -   [The `postgresql` table function](../../../sql-reference/table-functions/postgresql.md)
 -   [Using PostgreSQL as a dictionary source](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-sources.md#dicts-external_dicts_dict_sources-postgresql)
+
+## Related content
+- Blog: [ClickHouse and PostgreSQL - a match made in data heaven - part 1](https://clickhouse.com/blog/migrating-data-between-clickhouse-postgres)
--- a/docs/en/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md
@ -390,40 +390,46 @@ SELECT count() FROM table WHERE u64 * i32 == 10 AND u64 * length(s) >= 1234

 ### Available Types of Indices {#available-types-of-indices}

-####   `minmax`
+#### MinMax

 Stores extremes of the specified expression (if the expression is `tuple`, then it stores extremes for each element of `tuple`), uses stored info for skipping blocks of data like the primary key.

-####   `set(max_rows)`
+Syntax: `minmax`
+
+#### Set

 Stores unique values of the specified expression (no more than `max_rows` rows, `max_rows=0` means “no limits”). Uses the values to check if the `WHERE` expression is not satisfiable on a block of data.

-####   `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)`
+Syntax: `set(max_rows)`

-Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) that contains all ngrams from a block of data. Works only with datatypes: [String](/docs/en/sql-reference/data-types/string.md), [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) and [Map](/docs/en/sql-reference/data-types/map.md). Can be used for optimization of `EQUALS`, `LIKE` and `IN` expressions.
+#### Bloom Filter
+
+Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) for the specified columns. An optional `false_positive` parameter with possible values between 0 and 1 specifies the probability of receiving a false positive response from the filter. Default value: 0.025. Supported data types: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`, `Array`, `LowCardinality`, `Nullable`, `UUID` and `Map`. For the `Map` data type, the client can specify if the index should be created for keys or values using [mapKeys](/docs/en/sql-reference/functions/tuple-map-functions.md/#mapkeys) or [mapValues](/docs/en/sql-reference/functions/tuple-map-functions.md/#mapvalues) function.
+
+Syntax: `bloom_filter([false_positive])`
+
+#### N-gram Bloom Filter
+
+Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) that contains all n-grams from a block of data. Only works with datatypes: [String](/docs/en/sql-reference/data-types/string.md), [FixedString](/docs/en/sql-reference/data-types/fixedstring.md) and [Map](/docs/en/sql-reference/data-types/map.md). Can be used for optimization of `EQUALS`, `LIKE` and `IN` expressions.
+
+Syntax: `ngrambf_v1(n, size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)`

 - `n` — ngram size,
 - `size_of_bloom_filter_in_bytes` — Bloom filter size in bytes (you can use large values here, for example, 256 or 512, because it can be compressed well).
 - `number_of_hash_functions` — The number of hash functions used in the Bloom filter.
 - `random_seed` — The seed for Bloom filter hash functions.

-####   `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)`
+#### Token Bloom Filter

 The same as `ngrambf_v1`, but stores tokens instead of ngrams. Tokens are sequences separated by non-alphanumeric characters.

-####   `bloom_filter([false_positive])` — Stores a [Bloom filter](https://en.wikipedia.org/wiki/Bloom_filter) for the specified columns.
+Syntax: `tokenbf_v1(size_of_bloom_filter_in_bytes, number_of_hash_functions, random_seed)`

-The optional `false_positive` parameter is the probability of receiving a false positive response from the filter. Possible values: (0, 1). Default value: 0.025.
+#### Special-purpose

-Supported data types: `Int*`, `UInt*`, `Float*`, `Enum`, `Date`, `DateTime`, `String`, `FixedString`, `Array`, `LowCardinality`, `Nullable`, `UUID`, `Map`.
+- An experimental index to support approximate nearest neighbor (ANN) search. See [here](annindexes.md) for details.

-For `Map` data type client can specify if index should be created for keys or values using [mapKeys](/docs/en/sql-reference/functions/tuple-map-functions.md/#mapkeys) or [mapValues](/docs/en/sql-reference/functions/tuple-map-functions.md/#mapvalues) function.
-
-There are also special-purpose and experimental indexes to support approximate nearest neighbor (ANN) queries. See [here](annindexes.md) for details.
-
-The following functions can use the filter: [equals](/docs/en/sql-reference/functions/comparison-functions.md), [notEquals](/docs/en/sql-reference/functions/comparison-functions.md), [in](/docs/en/sql-reference/functions/in-functions), [notIn](/docs/en/sql-reference/functions/in-functions), [has](/docs/en/sql-reference/functions/array-functions#hasarr-elem), [hasAny](/docs/en/sql-reference/functions/array-functions#hasany), [hasAll](/docs/en/sql-reference/functions/array-functions#hasall).
-
-Example of index creation for `Map` data type
+## Example of index creation for Map data type

 ```
 INDEX map_key_index mapKeys(map_column) TYPE bloom_filter GRANULARITY 1
@ -484,9 +490,6 @@ For example:
 :::


-## Approximate Nearest Neighbor Search Indexes [experimental] {#table_engines-ANNIndex}
-In addition to skip indices, there are also [Approximate Nearest Neighbor Search Indexes](/docs/en/engines/table-engines/mergetree-family/annindexes.md).
-
 ## Projections {#projections}
 Projections are like [materialized views](/docs/en/sql-reference/statements/create/view.md/#materialized) but defined in part-level. It provides consistency guarantees along with automatic usage in queries.

@ -885,6 +888,10 @@ User can assign new big parts to different disks of a [JBOD](https://en.wikipedi

 ## Using S3 for Data Storage {#table_engine-mergetree-s3}

+:::note
+Google Cloud Storage (GCS) is also supported using the type `s3`. See [GCS backed MergeTree](/docs/en/integrations/data-ingestion/s3/gcs-merge-tree.md).
+:::
+
 `MergeTree` family table engines can store data to [S3](https://aws.amazon.com/s3/) using a disk with type `s3`.

 Configuration markup:
@ -894,6 +901,7 @@ Configuration markup:
    <disks>
        <s3>
            <type>s3</type>
+            <support_batch_delete>true</support_batch_delete>
            <endpoint>https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/root-path/</endpoint>
            <access_key_id>your_access_key_id</access_key_id>
            <secret_access_key>your_secret_access_key</secret_access_key>
@ -927,6 +935,7 @@ Required parameters:
 Optional parameters:

 -   `region` — S3 region name.
+-   `support_batch_delete` — This controls the check to see if batch deletes are supported. Set this to `false` when using Google Cloud Storage (GCS): GCS does not support batch deletes, and disabling the check avoids error messages in the logs.
 -   `use_environment_credentials` — Reads AWS credentials from the Environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN if they exist. Default value is `false`.
 -   `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Default value is `false`.
 -   `proxy` — Proxy configuration for S3 endpoint. Each `uri` element inside `proxy` block should contain a proxy URL.
--- a/docs/en/getting-started/install.md
+++ b/docs/en/getting-started/install.md
@ -271,6 +271,9 @@ You’ll need to create data and metadata folders manually and `chown` them for

 On Gentoo, you can just use `emerge clickhouse` to install ClickHouse from sources.

+### From CI checks pre-built binaries
+ClickHouse binaries are built for each [commit](/docs/en/development/build.md#you-dont-have-to-build-clickhouse).
+
 ## Launch {#launch}

 To start the server as a daemon, run:
--- a/docs/en/interfaces/formats.md
+++ b/docs/en/interfaces/formats.md
@ -1203,12 +1203,14 @@ SELECT * FROM json_each_row_nested
 - [input_format_json_read_bools_as_numbers](/docs/en/operations/settings/settings.md/#input_format_json_read_bools_as_numbers) - allow to parse bools as numbers in JSON input formats. Default value - `true`.
 - [input_format_json_read_numbers_as_strings](/docs/en/operations/settings/settings.md/#input_format_json_read_numbers_as_strings) - allow to parse numbers as strings in JSON input formats. Default value - `false`.
 - [input_format_json_read_objects_as_strings](/docs/en/operations/settings/settings.md/#input_format_json_read_objects_as_strings) - allow to parse JSON objects as strings in JSON input formats. Default value - `false`.
+- [input_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#input_format_json_named_tuples_as_objects) - parse named tuple columns as JSON objects. Default value - `true`.
+- [input_format_json_defaults_for_missing_elements_in_named_tuple](/docs/en/operations/settings/settings.md/#input_format_json_defaults_for_missing_elements_in_named_tuple) - insert default values for missing elements in JSON object while parsing named tuple. Default value - `true`.
 - [output_format_json_quote_64bit_integers](/docs/en/operations/settings/settings.md/#output_format_json_quote_64bit_integers) - controls quoting of 64-bit integers in JSON output format. Default value - `true`.
 - [output_format_json_quote_64bit_floats](/docs/en/operations/settings/settings.md/#output_format_json_quote_64bit_floats) - controls quoting of 64-bit floats in JSON output format. Default value - `false`.
 - [output_format_json_quote_denormals](/docs/en/operations/settings/settings.md/#output_format_json_quote_denormals) - enables '+nan', '-nan', '+inf', '-inf' outputs in JSON output format. Default value - `false`.
 - [output_format_json_quote_decimals](/docs/en/operations/settings/settings.md/#output_format_json_quote_decimals) - controls quoting of decimals in JSON output format. Default value - `false`.
 - [output_format_json_escape_forward_slashes](/docs/en/operations/settings/settings.md/#output_format_json_escape_forward_slashes) - controls escaping forward slashes for string outputs in JSON output format. Default value - `true`.
- [output_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#output_format_json_named_tuples_as_objects) - serialize named tuple columns as JSON objects. Default value - `false`.
+- [output_format_json_named_tuples_as_objects](/docs/en/operations/settings/settings.md/#output_format_json_named_tuples_as_objects) - serialize named tuple columns as JSON objects. Default value - `true`.
 - [output_format_json_array_of_rows](/docs/en/operations/settings/settings.md/#output_format_json_array_of_rows) - output a JSON array of all rows in JSONEachRow(Compact) format. Default value - `false`.
 - [output_format_json_validate_utf8](/docs/en/operations/settings/settings.md/#output_format_json_validate_utf8) - enables validation of UTF-8 sequences in JSON output formats (note that it doesn't impact formats JSON/JSONCompact/JSONColumnsWithMetadata, they always validate utf8). Default value - `false`.

--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@ -757,6 +757,10 @@ Possible values:

 Default value: `0`.

+**See Also**
+
+-   [Concurrency Control](/docs/en/development/architecture.md#concurrency-control)
+
 ## concurrent_threads_soft_limit_ratio_to_cores {#concurrent_threads_soft_limit_ratio_to_cores}
 The maximum number of query processing threads as multiple of number of logical cores.
 More details: [concurrent_threads_soft_limit_num](#concurrent-threads-soft-limit-num).
@ -768,6 +772,12 @@ Possible values:

 Default value: `0`.

+**Example**
+
+``` xml
+<concurrent_threads_soft_limit_ratio_to_cores>3</concurrent_threads_soft_limit_ratio_to_cores>
+```
+
 ## max_concurrent_queries {#max-concurrent-queries}

 The maximum number of simultaneously processed queries.
@ -1181,6 +1191,7 @@ Use the following parameters to configure logging:
 -   `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
 -   `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
 -   `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table.
+-   `storage_policy` – Name of storage policy to use for the table (optional)

 **Example**

@ -1244,6 +1255,7 @@ Use the following parameters to configure logging:
 -   `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
 -   `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
 -   `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table.
+-   `storage_policy` – Name of storage policy to use for the table (optional)

 If the table does not exist, ClickHouse will create it. If the structure of the query log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically.

@ -1271,6 +1283,7 @@ Use the following parameters to configure logging:
 -   `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
 -   `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
 -   `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table.
+-   `storage_policy` – Name of storage policy to use for the table (optional)

 If the table does not exist, ClickHouse will create it. If the structure of the query thread log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically.

@ -1298,6 +1311,7 @@ Use the following parameters to configure logging:
 -   `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
 -   `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
 -   `flush_interval_milliseconds` – Interval for flushing data from the buffer in memory to the table.
+-   `storage_policy` – Name of storage policy to use for the table (optional)

 If the table does not exist, ClickHouse will create it. If the structure of the query views log changed when the ClickHouse server was updated, the table with the old structure is renamed, and a new table is created automatically.

@ -1324,6 +1338,7 @@ Parameters:
 -   `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
 -   `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-creating-a-table) for a system table. Can't be used if `partition_by` defined.
 -   `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
+-   `storage_policy` – Name of storage policy to use for the table (optional)

 **Example**
 ```xml
@ -1351,6 +1366,7 @@ Parameters:
 -   `partition_by` — [Custom partitioning key](../../engines/table-engines/mergetree-family/custom-partitioning-key.md) for a system table. Can't be used if `engine` defined.
 -   `engine` - [MergeTree Engine Definition](../../engines/table-engines/mergetree-family/index.md) for a system table. Can't be used if `partition_by` defined.
 -   `flush_interval_milliseconds` — Interval for flushing data from the buffer in memory to the table.
+-   `storage_policy` – Name of storage policy to use for the table (optional)

 The default server configuration file `config.xml` contains the following settings section:

--- a/docs/en/operations/settings/query-complexity.md
+++ b/docs/en/operations/settings/query-complexity.md
@ -266,7 +266,7 @@ Default value: 0.

 Limits the size in bytes of the hash table used when joining tables.

-This settings applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md).
+This setting applies to [SELECT … JOIN](../../sql-reference/statements/select/join.md#select-join) operations and [Join table engine](../../engines/table-engines/special/join.md).

 If the query contains joins, ClickHouse checks this setting for every intermediate result.

--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -402,40 +402,62 @@ Default value: `ALL`.

 ## join_algorithm {#settings-join_algorithm}

-Specifies [JOIN](../../sql-reference/statements/select/join.md) algorithm.
+Specifies which [JOIN](../../sql-reference/statements/select/join.md) algorithm is used.

 Several algorithms can be specified, and an available one would be chosen for a particular query based on kind/strictness and table engine.

 Possible values:

- `default` — `hash` or `direct`, if possible (same as `direct,hash`)
+### `default` 

- `hash` — [Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. The most generic implementation that supports all combinations of kind and strictness and multiple join keys that are combined with `OR` in the `JOIN ON` section.
+This is the equivalent of `hash` or `direct`, if possible (same as `direct,hash`)

- `parallel_hash` - a variation of `hash` join that splits the data into buckets and builds several hashtables instead of one concurrently to speed up this process.
+### `grace_hash` 
+
+[Grace hash join](https://en.wikipedia.org/wiki/Hash_join#Grace_hash_join) is used. Grace hash is an algorithm option that provides performant complex joins while limiting memory use.
+
+The first phase of a grace join reads the right table and splits it into N buckets depending on the hash value of key columns (initially, N is `grace_hash_join_initial_buckets`). This is done in a way to ensure that each bucket can be processed independently. Rows from the first bucket are added to an in-memory hash table while the others are saved to disk. If the hash table grows beyond the memory limit (e.g., as set by [`max_bytes_in_join`](/docs/en/operations/settings/query-complexity.md/#settings-max_bytes_in_join)), the number of buckets is increased and the assigned bucket for each row is recalculated. Any rows which don’t belong to the current bucket are flushed and reassigned.
+
+### `hash`
+
+[Hash join algorithm](https://en.wikipedia.org/wiki/Hash_join) is used. The most generic implementation that supports all combinations of kind and strictness and multiple join keys that are combined with `OR` in the `JOIN ON` section.
+
+### `parallel_hash` 
+
+A variation of `hash` join that splits the data into buckets and builds several hashtables instead of one concurrently to speed up this process.

 When using the `hash` algorithm, the right part of `JOIN` is uploaded into RAM.

- `partial_merge` — a variation of the [sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join), where only the right table is fully sorted.
+### `partial_merge` 
+
+A variation of the [sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join), where only the right table is fully sorted.

 The `RIGHT JOIN` and `FULL JOIN` are supported only with `ALL` strictness (`SEMI`, `ANTI`, `ANY`, and `ASOF` are not supported).

-When using `partial_merge` algorithm, ClickHouse sorts the data and dumps it to the disk. The `partial_merge` algorithm in ClickHouse differs slightly from the classic realization. First, ClickHouse sorts the right table by joining keys in blocks and creates a min-max index for sorted blocks. Then it sorts parts of the left table by `join key` and joins them over the right table. The min-max index is also used to skip unneeded right table blocks.
+When using the `partial_merge` algorithm, ClickHouse sorts the data and dumps it to the disk. The `partial_merge` algorithm in ClickHouse differs slightly from the classic realization. First, ClickHouse sorts the right table by joining keys in blocks and creates a min-max index for sorted blocks. Then it sorts parts of the left table by the `join key` and joins them over the right table. The min-max index is also used to skip unneeded right table blocks.

- `direct` - can be applied when the right storage supports key-value requests.
+### `direct` 
+
+This algorithm can be applied when the storage for the right table supports key-value requests.

 The `direct` algorithm performs a lookup in the right table using rows from the left table as keys. It's supported only by special storage such as [Dictionary](../../engines/table-engines/special/dictionary.md/#dictionary) or [EmbeddedRocksDB](../../engines/table-engines/integrations/embedded-rocksdb.md) and only the `LEFT` and `INNER` JOINs.

- `auto` — try `hash` join and switch on the fly to another algorithm if the memory limit is violated.
+### `auto` 

- `full_sorting_merge` — [Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) with full sorting joined tables before joining.
+When set to `auto`, `hash` join is tried first, and the algorithm is switched on the fly to another algorithm if the memory limit is violated.

- `prefer_partial_merge` — ClickHouse always tries to use `partial_merge` join if possible, otherwise, it uses `hash`. *Deprecated*, same as `partial_merge,hash`.
+### `full_sorting_merge` 
+
+[Sort-merge algorithm](https://en.wikipedia.org/wiki/Sort-merge_join) with full sorting of joined tables before joining.
+
+### `prefer_partial_merge` 
+
+ClickHouse always tries to use `partial_merge` join if possible, otherwise, it uses `hash`. *Deprecated*, same as `partial_merge,hash`.


 ## join_any_take_last_row {#settings-join_any_take_last_row}

-Changes behaviour of join operations with `ANY` strictness.
+Changes the behaviour of join operations with `ANY` strictness.

 :::warning
 This setting applies only for `JOIN` operations with [Join](../../engines/table-engines/special/join.md) engine tables.
@ -498,7 +520,7 @@ Default value: 65536.

 Limits the number of files allowed for parallel sorting in MergeJoin operations when they are executed on disk.

-The bigger the value of the setting, the more RAM used and the less disk I/O needed.
+The bigger the value of the setting, the more RAM is used and the less disk I/O is needed.

 Possible values:

@ -514,12 +536,12 @@ Enables legacy ClickHouse server behaviour in `ANY INNER|LEFT JOIN` operations.
 Use this setting only for backward compatibility if your use cases depend on legacy `JOIN` behaviour.
 :::

-When the legacy behaviour enabled:
+When the legacy behaviour is enabled:

 -   Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are not equal because ClickHouse uses the logic with many-to-one left-to-right table keys mapping.
 -   Results of `ANY INNER JOIN` operations contain all rows from the left table like the `SEMI LEFT JOIN` operations do.

-When the legacy behaviour disabled:
+When the legacy behaviour is disabled:

 -   Results of `t1 ANY LEFT JOIN t2` and `t2 ANY RIGHT JOIN t1` operations are equal because ClickHouse uses the logic which provides one-to-many keys mapping in `ANY RIGHT JOIN` operations.
 -   Results of `ANY INNER JOIN` operations contain one row per key from both the left and right tables.
@ -572,7 +594,7 @@ Default value: `163840`.

 ## merge_tree_min_rows_for_concurrent_read_for_remote_filesystem {#merge-tree-min-rows-for-concurrent-read-for-remote-filesystem}

-The minimum number of lines to read from one file before [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from remote filesystem.
+The minimum number of lines to read from one file before the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) engine can parallelize reading, when reading from a remote filesystem.

 Possible values:

@ -706,7 +728,7 @@ log_queries=1

 ## log_queries_min_query_duration_ms {#settings-log-queries-min-query-duration-ms}

-If enabled (non-zero), queries faster then the value of this setting will not be logged (you can think about this as a `long_query_time` for [MySQL Slow Query Log](https://dev.mysql.com/doc/refman/5.7/en/slow-query-log.html)), and this basically means that you will not find them in the following tables:
+If enabled (non-zero), queries faster than the value of this setting will not be logged (you can think about this as a `long_query_time` for [MySQL Slow Query Log](https://dev.mysql.com/doc/refman/5.7/en/slow-query-log.html)), and this basically means that you will not find them in the following tables:

 - `system.query_log`
 - `system.query_thread_log`
@ -741,7 +763,7 @@ log_queries_min_type='EXCEPTION_WHILE_PROCESSING'

 Setting up query threads logging.

-Query threads log into [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting have effect only when [log_queries](#settings-log-queries) is true. Queries’ threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_thread_log) server configuration parameter.
+Query threads log into the [system.query_thread_log](../../operations/system-tables/query_thread_log.md) table. This setting has effect only when [log_queries](#settings-log-queries) is true. Queries’ threads run by ClickHouse with this setup are logged according to the rules in the [query_thread_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_thread_log) server configuration parameter.

 Possible values:

@ -760,7 +782,7 @@ log_query_threads=1

 Setting up query views logging.

-When a query run by ClickHouse with this setup on has associated views (materialized or live views), they are logged in the [query_views_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_views_log) server configuration parameter.
+When a query run by ClickHouse with this setting enabled has associated views (materialized or live views), they are logged in the [query_views_log](../../operations/server-configuration-parameters/settings.md/#server_configuration_parameters-query_views_log) server configuration parameter.

 Example:

@ -787,7 +809,7 @@ It can be used to improve the readability of server logs. Additionally, it helps

 Possible values:

-   Any string no longer than [max_query_size](#settings-max_query_size). If length is exceeded, the server throws an exception.
+-   Any string no longer than [max_query_size](#settings-max_query_size). If the max_query_size is exceeded, the server throws an exception.

 Default value: empty string.

@ -821,11 +843,11 @@ The setting also does not have a purpose when using INSERT SELECT, since data is

 Default value: 1,048,576.

-The default is slightly more than `max_block_size`. The reason for this is because certain table engines (`*MergeTree`) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, `*MergeTree` tables sort data during insertion, and a large enough block size allow sorting more data in RAM.
+The default is slightly more than `max_block_size`. The reason for this is that certain table engines (`*MergeTree`) form a data part on the disk for each inserted block, which is a fairly large entity. Similarly, `*MergeTree` tables sort data during insertion, and a large enough block size allows sorting more data in RAM.

 ## min_insert_block_size_rows {#min-insert-block-size-rows}

-Sets the minimum number of rows in the block which can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones.
+Sets the minimum number of rows in the block that can be inserted into a table by an `INSERT` query. Smaller-sized blocks are squashed into bigger ones.

 Possible values:

@ -891,7 +913,7 @@ Higher values will lead to higher memory usage.

 ## max_compress_block_size {#max-compress-block-size}

-The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). Specifying smaller block size generally leads to slightly reduced compression ratio, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced.
+The maximum size of blocks of uncompressed data before compressing for writing to a table. By default, 1,048,576 (1 MiB). Specifying a smaller block size generally leads to a slightly reduced compression ratio, the compression and decompression speed increases slightly due to cache locality, and memory consumption is reduced.

 :::warning
 This is an expert-level setting, and you shouldn't change it if you're just getting started with ClickHouse.
@ -935,7 +957,7 @@ Default value: 1000.

 ## interactive_delay {#interactive-delay}

-The interval in microseconds for checking whether request execution has been cancelled and sending the progress.
+The interval in microseconds for checking whether request execution has been canceled and sending the progress.

 Default value: 100,000 (checks for cancelling and sends the progress ten times per second).

@ -4122,7 +4144,20 @@ Enabled by default.

 Serialize named tuple columns as JSON objects.

-Disabled by default.
+Enabled by default.
+
+### input_format_json_named_tuples_as_objects {#input_format_json_named_tuples_as_objects}
+
+Parse named tuple columns as JSON objects.
+
+Enabled by default.
+
+### input_format_json_defaults_for_missing_elements_in_named_tuple {#input_format_json_defaults_for_missing_elements_in_named_tuple}
+
+Insert default values for missing elements in a JSON object while parsing a named tuple.
+This setting works only when the `input_format_json_named_tuples_as_objects` setting is enabled.
+
+Enabled by default.

 ### output_format_json_array_of_rows {#output_format_json_array_of_rows}

--- a/docs/en/operations/utilities/clickhouse-local.md
+++ b/docs/en/operations/utilities/clickhouse-local.md
@ -120,5 +120,6 @@ Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec.

 ## Related Content

+- [Extracting, converting, and querying data in local files using clickhouse-local](https://clickhouse.com/blog/extracting-converting-querying-local-files-with-sql-clickhouse-local)
 - [Getting Data Into ClickHouse - Part 1](https://clickhouse.com/blog/getting-data-into-clickhouse-part-1)
 - [Exploring massive, real-world data sets: 100+ Years of Weather Records in ClickHouse](https://clickhouse.com/blog/real-world-data-noaa-climate-data)
--- a/docs/en/sql-reference/aggregate-functions/reference/index.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/index.md
@ -57,6 +57,7 @@ ClickHouse-specific aggregate functions:
 -   [uniqCombined](../../../sql-reference/aggregate-functions/reference/uniqcombined.md)
 -   [uniqCombined64](../../../sql-reference/aggregate-functions/reference/uniqcombined64.md)
 -   [uniqHLL12](../../../sql-reference/aggregate-functions/reference/uniqhll12.md)
+-   [uniqTheta](../../../sql-reference/aggregate-functions/reference/uniqthetasketch.md)
 -   [quantile](../../../sql-reference/aggregate-functions/reference/quantile.md)
 -   [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md)
 -   [quantileExact](../../../sql-reference/aggregate-functions/reference/quantileexact.md)
@ -77,4 +78,6 @@ ClickHouse-specific aggregate functions:
 -   [contingency](./contingency.md)
 -   [cramersV](./cramersv.md)
 -   [cramersVBiasCorrected](./cramersvbiascorrected.md)
-   [theilsU](./theilsu.md)
+-   [theilsU](./theilsu.md)
+-   [maxIntersections](./maxintersections.md)
+-   [maxIntersectionsPosition](./maxintersectionsposition.md)
--- a/docs/en/sql-reference/aggregate-functions/reference/maxintersections.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/maxintersections.md
@ -0,0 +1,64 @@
+---
+slug: /en/sql-reference/aggregate-functions/reference/maxintersections
+sidebar_position: 360
+title: maxIntersections
+---
+
+# maxIntersections
+
+Aggregate function that calculates the maximum number of times that a group of intervals intersects each other (if all the intervals intersect at least once).
+
+The syntax is:
+
+```sql
+maxIntersections(start_column, end_column)
+```
+
+**Arguments**
+
+- `start_column` – the numeric column that represents the start of each interval. If `start_column` is `NULL` or 0 then the interval will be skipped.
+
+- `end_column` - the numeric column that represents the end of each interval. If `end_column` is `NULL` or 0 then the interval will be skipped.
+
+**Returned value**
+
+Returns the maximum number of intersected intervals.
+
+**Example**
+
+```sql
+CREATE TABLE my_events (
+    start UInt32,
+    end UInt32
+)
+Engine = MergeTree
+ORDER BY tuple();
+
+INSERT INTO my_events VALUES
+   (1, 3),
+   (1, 6),
+   (2, 5),
+   (3, 7);
+```
+
+The intervals look like the following:
+
+```response
+1 - 3
+1 - - - - 6
+  2 - - 5
+    3 - - - 7
+```
+
+Three of these intervals have a common value (the value is `4`, but the value that is common is not important, we are measuring the count of the intersections). The intervals `(1,3)` and `(3,7)` share an endpoint but are not considered intersecting by the `maxIntersections` function.
+
+```sql
+SELECT maxIntersections(start, end) FROM my_events;
+```
+
+Response:
+```response
+3
+```
+
+If you have multiple occurrences of the maximum interval, you can use the [`maxIntersectionsPosition` function](./maxintersectionsposition.md) to locate the number and location of those occurrences.
--- a/docs/en/sql-reference/aggregate-functions/reference/maxintersectionsposition.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/maxintersectionsposition.md
@ -0,0 +1,64 @@
+---
+slug: /en/sql-reference/aggregate-functions/reference/maxintersectionsposition
+sidebar_position: 361
+title: maxIntersectionsPosition
+---
+
+# maxIntersectionsPosition
+
+Aggregate function that calculates the positions of the occurrences of the [`maxIntersections` function](./maxintersections.md).
+
+The syntax is:
+
+```sql
+maxIntersectionsPosition(start_column, end_column)
+```
+
+**Arguments**
+
+- `start_column` – the numeric column that represents the start of each interval. If `start_column` is `NULL` or 0 then the interval will be skipped.
+
+- `end_column` - the numeric column that represents the end of each interval. If `end_column` is `NULL` or 0 then the interval will be skipped.
+
+**Returned value**
+
+Returns the start positions of the maximum number of intersected intervals.
+
+**Example**
+
+```sql
+CREATE TABLE my_events (
+    start UInt32,
+    end UInt32
+)
+Engine = MergeTree
+ORDER BY tuple();
+
+INSERT INTO my_events VALUES
+   (1, 3),
+   (1, 6),
+   (2, 5),
+   (3, 7);
+```
+
+The intervals look like the following:
+
+```response
+1 - 3
+1 - - - - 6
+  2 - - 5
+    3 - - - 7
+```
+
+Notice that three of these intervals have the value 4 in common, and that starts with the 2nd interval:
+
+```sql
+SELECT maxIntersectionsPosition(start, end) FROM my_events;
+```
+
+Response:
+```response
+2
+```
+
+In other words, the `(1,6)` row is the start of the 3 intervals that intersect, and 3 is the maximum number of intervals that intersect.
--- a/docs/en/sql-reference/aggregate-functions/reference/quantileinterpolatedweighted.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantileinterpolatedweighted.md
@ -0,0 +1,68 @@
+---
+slug: /en/sql-reference/aggregate-functions/reference/quantileInterpolatedWeighted
+sidebar_position: 203
+---
+
+# quantileInterpolatedWeighted
+
+Computes [quantile](https://en.wikipedia.org/wiki/Quantile) of a numeric data sequence using linear interpolation, taking into account the weight of each element.
+
+To get the interpolated value, all the passed values are combined into an array, which is then sorted by their corresponding weights. Quantile interpolation is then performed using the [weighted percentile method](https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method) by building a cumulative distribution based on weights, and then a linear interpolation is performed using the weights and the values to compute the quantiles.
+
+When using multiple `quantile*` functions with different levels in a query, the internal states are not combined (that is, the query works less efficiently than it could). In this case, use the [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles) function.
+
+**Syntax**
+
+``` sql
+quantileInterpolatedWeighted(level)(expr, weight)
+```
+
+Alias: `medianInterpolatedWeighted`.
+
+**Arguments**
+
+-   `level` — Level of quantile. Optional parameter. Constant floating-point number from 0 to 1. We recommend using a `level` value in the range of `[0.01, 0.99]`. Default value: 0.5. At `level=0.5` the function calculates [median](https://en.wikipedia.org/wiki/Median).
+-   `expr` — Expression over the column values resulting in numeric [data types](../../../sql-reference/data-types/index.md#data_types), [Date](../../../sql-reference/data-types/date.md) or [DateTime](../../../sql-reference/data-types/datetime.md).
+-   `weight` — Column with weights of sequence members. Weight is a number of value occurrences.
+
+**Returned value**
+
+-   Quantile of the specified level.
+
+Type:
+
+-   [Float64](../../../sql-reference/data-types/float.md) for numeric data type input.
+-   [Date](../../../sql-reference/data-types/date.md) if input values have the `Date` type.
+-   [DateTime](../../../sql-reference/data-types/datetime.md) if input values have the `DateTime` type.
+
+**Example**
+
+Input table:
+
+``` text
+┌─n─┬─val─┐
+│ 0 │   3 │
+│ 1 │   2 │
+│ 2 │   1 │
+│ 5 │   4 │
+└───┴─────┘
+```
+
+Query:
+
+``` sql
+SELECT quantileInterpolatedWeighted(n, val) FROM t
+```
+
+Result:
+
+``` text
+┌─quantileInterpolatedWeighted(n, val)─┐
+│                                    1 │
+└──────────────────────────────────────┘
+```
+
+**See Also**
+
+-   [median](../../../sql-reference/aggregate-functions/reference/median.md#median)
+-   [quantiles](../../../sql-reference/aggregate-functions/reference/quantiles.md#quantiles)
--- a/docs/en/sql-reference/aggregate-functions/reference/quantiles.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/quantiles.md
@ -9,7 +9,7 @@ sidebar_position: 201

 Syntax: `quantiles(level1, level2, …)(x)`

-All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesTDigest`, `quantilesBFloat16`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values.
+All the quantile functions also have corresponding quantiles functions: `quantiles`, `quantilesDeterministic`, `quantilesTiming`, `quantilesTimingWeighted`, `quantilesExact`, `quantilesExactWeighted`, `quantilesInterpolatedWeighted`, `quantilesTDigest`, `quantilesBFloat16`. These functions calculate all the quantiles of the listed levels in one pass, and return an array of the resulting values.

 ## quantilesExactExclusive

--- a/docs/en/sql-reference/data-types/json.md
+++ b/docs/en/sql-reference/data-types/json.md
@ -6,6 +6,10 @@ sidebar_label: JSON

 # JSON

+:::warning
+This feature is experimental and is not production ready. If you need to work with JSON documents, consider using [this guide](/docs/en/guides/developer/working-with-json/json-load-data.md) instead.
+:::
+
 Stores JavaScript Object Notation (JSON) documents in a single column.

 `JSON` is an alias for `Object('json')`.
--- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
+++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md
@ -156,6 +156,33 @@ or
 LAYOUT(HASHED(PREALLOCATE 0))
 ```

+If `shards` is greater than 1 (the default is `1`), the dictionary will load data in parallel, which is useful if you have a huge number of elements in one dictionary.
+
+Configuration example:
+
+``` xml
+<layout>
+  <hashed>
+    <shards>10</shards>
+    <!-- Size of the backlog for blocks in parallel queue.
+
+         The bottleneck in parallel loading is rehashing, so to avoid
+         stalling while a thread is rehashing, you need to have some
+         backlog.
+
+         10000 is a good balance between memory and speed.
+         Even for 10e10 elements it can handle all the load without starvation. -->
+    <shard_load_queue_backlog>10000</shard_load_queue_backlog>
+  </hashed>
+</layout>
+```
+
+or
+
+``` sql
+LAYOUT(HASHED(SHARDS 10 [SHARD_LOAD_QUEUE_BACKLOG 10000]))
+```
+
 ### sparse_hashed

 Similar to `hashed`, but uses less memory in favor more CPU usage.
@ -178,6 +205,8 @@ or
 LAYOUT(SPARSE_HASHED([PREALLOCATE 0]))
 ```

+It is also possible to use `shards` for this type of dictionary, and again it is more important for `sparse_hashed` than for `hashed`, since `sparse_hashed` is slower.
+
 ### complex_key_hashed

 This type of storage is for use with composite [keys](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-structure.md). Similar to `hashed`.
@ -186,14 +215,18 @@ Configuration example:

 ``` xml
 <layout>
-  <complex_key_hashed />
+  <complex_key_hashed>
+    <preallocate>0</preallocate>
+    <shards>1</shards>
+    <!-- <shard_load_queue_backlog>10000</shard_load_queue_backlog> -->
+  </complex_key_hashed>
 </layout>
 ```

 or

 ``` sql
-LAYOUT(COMPLEX_KEY_HASHED())
+LAYOUT(COMPLEX_KEY_HASHED([PREALLOCATE 0] [SHARDS 1] [SHARD_LOAD_QUEUE_BACKLOG 10000]))
 ```

 ### complex_key_sparse_hashed
@ -204,14 +237,17 @@ Configuration example:

 ``` xml
 <layout>
-  <complex_key_sparse_hashed />
+  <complex_key_sparse_hashed>
+    <preallocate>0</preallocate>
+    <shards>1</shards>
+  </complex_key_sparse_hashed>
 </layout>
 ```

 or

 ``` sql
-LAYOUT(COMPLEX_KEY_SPARSE_HASHED())
+LAYOUT(COMPLEX_KEY_SPARSE_HASHED([PREALLOCATE 0] [SHARDS 1] [SHARD_LOAD_QUEUE_BACKLOG 10000]))
 ```

 ### hashed_array
--- a/docs/en/sql-reference/functions/array-functions.md
+++ b/docs/en/sql-reference/functions/array-functions.md
@ -121,7 +121,7 @@ Accepts an empty array and returns a one-element array that is equal to the defa

 ## range(end), range(\[start, \] end \[, step\])

-Returns an array of `UInt` numbers from `start` to `end - 1` by `step`.
+Returns an array of numbers from `start` to `end - 1` by `step`. The supported types are [UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64](../data-types/int-uint.md).

 **Syntax**
 ``` sql
@ -130,31 +130,30 @@ range([start, ] end [, step])

 **Arguments**

-   `start` — The first element of the array. Optional, required if `step` is used. Default value: 0. [UInt](../data-types/int-uint.md)
-   `end` — The number before which the array is constructed. Required. [UInt](../data-types/int-uint.md)
-   `step` — Determines the incremental step between each element in the array. Optional. Default value: 1. [UInt](../data-types/int-uint.md)
+-   `start` — The first element of the array. Optional, required if `step` is used. Default value: 0.
+-   `end` — The number before which the array is constructed. Required.
+-   `step` — Determines the incremental step between each element in the array. Optional. Default value: 1.

 **Returned value**

-   Array of `UInt` numbers from `start` to `end - 1` by `step`.
+-   Array of numbers from `start` to `end - 1` by `step`.

 **Implementation details**

-   All arguments must be positive values: `start`, `end`, `step` are `UInt` data types, as well as elements of the returned array.
+-   All arguments `start`, `end`, `step` must be of one of the following data types: `UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64`. The elements of the returned array have a type that is a supertype of the types of all the arguments.
 -   An exception is thrown if query results in arrays with a total length of more than number of elements specified by the [function_range_max_elements_in_block](../../operations/settings/settings.md#settings-function_range_max_elements_in_block) setting.

-
 **Examples**

 Query:
 ``` sql
-SELECT range(5), range(1, 5), range(1, 5, 2);
+SELECT range(5), range(1, 5), range(1, 5, 2), range(-1, 5, 2);
 ```
 Result:
 ```txt
-┌─range(5)────┬─range(1, 5)─┬─range(1, 5, 2)─┐
-│ [0,1,2,3,4] │ [1,2,3,4]   │ [1,3]          │
-└─────────────┴─────────────┴────────────────┘
+┌─range(5)────┬─range(1, 5)─┬─range(1, 5, 2)─┬─range(-1, 5, 2)─┐
+│ [0,1,2,3,4] │ [1,2,3,4]   │ [1,3]          │ [-1,1,3]        │
+└─────────────┴─────────────┴────────────────┴─────────────────┘
 ```

 ## array(x1, …), operator \[x1, …\]
--- a/docs/en/sql-reference/table-functions/generate.md
+++ b/docs/en/sql-reference/table-functions/generate.md
@ -39,3 +39,16 @@ SELECT * FROM generateRandom('a Array(Int8), d Decimal32(4), c Tuple(DateTime64(
 │ [68]     │  -67417.0770 │ ('2080-03-12 14:17:31.269','110425e5-413f-10a6-05ba-fa6b3e929f15') │
 └──────────┴──────────────┴────────────────────────────────────────────────────────────────────┘
 ```
+
+```sql
+CREATE TABLE random (a Array(Int8), d Decimal32(4), c Tuple(DateTime64(3), UUID)) engine=Memory;
+INSERT INTO random SELECT * FROM generateRandom() LIMIT 2;
+SELECT * FROM random;
+```
+
+```text
+┌─a────────────────────────────┬────────────d─┬─c──────────────────────────────────────────────────────────────────┐
+│ []                           │   68091.8197 │ ('2037-10-02 12:44:23.368','039ecab7-81c2-45ee-208c-844e5c6c5652') │
+│ [8,-83,0,-22,65,9,-30,28,64] │ -186233.4909 │ ('2062-01-11 00:06:04.124','69563ea1-5ad1-f870-16d8-67061da0df25') │
+└──────────────────────────────┴──────────────┴────────────────────────────────────────────────────────────────────┘
+```
--- a/docs/zh/sql-reference/functions/array-functions.md
+++ b/docs/zh/sql-reference/functions/array-functions.md
@ -117,7 +117,7 @@ SELECT notEmpty([1,2]);

 ## range(end), range(\[start, \] end \[, step\]) {#range}

-返回一个以`step`作为增量步长的从`start`到`end - 1`的`UInt`类型数字数组。
+返回一个以`step`作为增量步长的从`start`到`end - 1`的整型数字数组，支持类型包括[`UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64`](../data-types/int-uint.md)。

 **语法**
 ``` sql
@ -126,31 +126,30 @@ range([start, ] end [, step])

 **参数**

-   `start` — 数组的第一个元素。可选项，如果设置了`step`时同样需要`start`，默认值为：0，类型为[UInt](../data-types/int-uint.md)。
-   `end` — 计数到`end`结束，但不包括`end`，必填项，类型为[UInt](../data-types/int-uint.md)。
-   `step` — 确定数组中每个元素之间的增量步长。可选项，默认值为：1，类型为[UInt](../data-types/int-uint.md)。
+-   `start` — 数组的第一个元素。可选项，如果设置了`step`时同样需要`start`，默认值为：0。
+-   `end` — 计数到`end`结束，但不包括`end`，必填项。
+-   `step` — 确定数组中每个元素之间的增量步长。可选项，默认值为：1。

 **返回值**

-   以`step`作为增量步长的从`start`到`end - 1`的`UInt`类型数字数组。
+-   以`step`作为增量步长的从`start`到`end - 1`的数字数组。

 **注意事项**

-   所有参数必须是正值：`start`、`end`、`step`，类型均为`UInt`，结果数组的元素与此相同。
+-   所有参数`start`、`end`、`step`必须属于以下几种类型之一：[`UInt8`, `UInt16`, `UInt32`, `UInt64`, `Int8`, `Int16`, `Int32`, `Int64`](../data-types/int-uint.md)。结果数组的元素数据类型为所有入参类型的最小超类，也必须属于以上几种类型之一。
 -   如果查询结果的数组总长度超过[function_range_max_elements_in_block](../../operations/settings/settings.md#settings-function_range_max_elements_in_block)指定的元素数，将会抛出异常。

-
 **示例**

 查询语句:
 ``` sql
-SELECT range(5), range(1, 5), range(1, 5, 2);
+SELECT range(5), range(1, 5), range(1, 5, 2), range(-1, 5, 2);
 ```
 结果:
 ```txt
-┌─range(5)────┬─range(1, 5)─┬─range(1, 5, 2)─┐
-│ [0,1,2,3,4] │ [1,2,3,4]   │ [1,3]          │
-└─────────────┴─────────────┴────────────────┘
+┌─range(5)────┬─range(1, 5)─┬─range(1, 5, 2)─┬─range(-1, 5, 2)─┐
+│ [0,1,2,3,4] │ [1,2,3,4]   │ [1,3]          │ [-1,1,3]        │
+└─────────────┴─────────────┴────────────────┴─────────────────┘
 ```

 ## array(x1, …), operator \[x1, …\] {#arrayx1-operator-x1}
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -140,6 +140,7 @@ namespace CurrentMetrics
 namespace ProfileEvents
 {
    extern const Event MainConfigLoads;
+    extern const Event ServerStartupMilliseconds;
 }

 namespace fs = std::filesystem;
@ -652,6 +653,8 @@ static void sanityChecks(Server & server)
 int Server::main(const std::vector<std::string> & /*args*/)
 try
 {
+    Stopwatch startup_watch;
+
    Poco::Logger * log = &logger();

    UseSSL use_ssl;
@ -1822,6 +1825,9 @@ try
            LOG_INFO(log, "Ready for connections.");
        }

+        startup_watch.stop();
+        ProfileEvents::increment(ProfileEvents::ServerStartupMilliseconds, startup_watch.elapsedMilliseconds());
+
        try
        {
            global_context->startClusterDiscovery();
--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@ -1073,6 +1073,9 @@

        <!-- Interval of flushing data. -->
        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+
+        <!-- example of using a different storage policy for a system table -->
+        <!-- storage_policy>local_ssd</storage_policy -->
    </query_log>

    <!-- Trace log. Stores stack traces collected by query profilers.
--- a/src/Access/Common/AccessType.h
+++ b/src/Access/Common/AccessType.h
@ -167,6 +167,7 @@ enum class AccessType
    M(SYSTEM_SYNC_REPLICA, "SYNC REPLICA", TABLE, SYSTEM) \
    M(SYSTEM_RESTART_REPLICA, "RESTART REPLICA", TABLE, SYSTEM) \
    M(SYSTEM_RESTORE_REPLICA, "RESTORE REPLICA", TABLE, SYSTEM) \
+    M(SYSTEM_WAIT_LOADING_PARTS, "WAIT LOADING PARTS", TABLE, SYSTEM) \
    M(SYSTEM_SYNC_DATABASE_REPLICA, "SYNC DATABASE REPLICA", DATABASE, SYSTEM) \
    M(SYSTEM_SYNC_TRANSACTION_LOG, "SYNC TRANSACTION LOG", GLOBAL, SYSTEM) \
    M(SYSTEM_FLUSH_DISTRIBUTED, "FLUSH DISTRIBUTED", TABLE, SYSTEM_FLUSH) \
--- a/src/Access/tests/gtest_access_rights_ops.cpp
+++ b/src/Access/tests/gtest_access_rights_ops.cpp
@ -53,7 +53,7 @@ TEST(AccessRights, Union)
              "SHOW ROW POLICIES, SYSTEM MERGES, SYSTEM TTL MERGES, SYSTEM FETCHES, "
              "SYSTEM MOVES, SYSTEM SENDS, SYSTEM REPLICATION QUEUES, "
              "SYSTEM DROP REPLICA, SYSTEM SYNC REPLICA, SYSTEM RESTART REPLICA, "
-              "SYSTEM RESTORE REPLICA, SYSTEM SYNC DATABASE REPLICA, SYSTEM FLUSH DISTRIBUTED, dictGet ON db1.*");
+              "SYSTEM RESTORE REPLICA, SYSTEM WAIT LOADING PARTS, SYSTEM SYNC DATABASE REPLICA, SYSTEM FLUSH DISTRIBUTED, dictGet ON db1.*");
 }


--- a/src/AggregateFunctions/AggregateFunctionHistogram.h
+++ b/src/AggregateFunctions/AggregateFunctionHistogram.h
@ -207,7 +207,7 @@ private:
        {
            // Fuse points if their text representations differ only in last digit
            auto min_diff = 10 * (points[left].mean + points[right].mean) * std::numeric_limits<Mean>::epsilon();
-            if (points[left].mean + min_diff >= points[right].mean)
+            if (points[left].mean + std::fabs(min_diff) >= points[right].mean)
            {
                points[left] = points[left] + points[right];
            }
--- a/src/AggregateFunctions/AggregateFunctionQuantile.h
+++ b/src/AggregateFunctions/AggregateFunctionQuantile.h
@ -232,6 +232,9 @@ struct NameQuantilesExactInclusive { static constexpr auto name = "quantilesExac
 struct NameQuantileExactWeighted { static constexpr auto name = "quantileExactWeighted"; };
 struct NameQuantilesExactWeighted { static constexpr auto name = "quantilesExactWeighted"; };

+struct NameQuantileInterpolatedWeighted { static constexpr auto name = "quantileInterpolatedWeighted"; };
+struct NameQuantilesInterpolatedWeighted { static constexpr auto name = "quantilesInterpolatedWeighted"; };
+
 struct NameQuantileTiming { static constexpr auto name = "quantileTiming"; };
 struct NameQuantileTimingWeighted { static constexpr auto name = "quantileTimingWeighted"; };
 struct NameQuantilesTiming { static constexpr auto name = "quantilesTiming"; };
--- a/src/AggregateFunctions/AggregateFunctionQuantileInterpolatedWeighted.cpp
+++ b/src/AggregateFunctions/AggregateFunctionQuantileInterpolatedWeighted.cpp
@ -0,0 +1,70 @@
+#include <AggregateFunctions/AggregateFunctionQuantile.h>
+#include <AggregateFunctions/QuantileInterpolatedWeighted.h>
+#include <AggregateFunctions/AggregateFunctionFactory.h>
+#include <AggregateFunctions/Helpers.h>
+#include <DataTypes/DataTypeDate.h>
+#include <DataTypes/DataTypeDateTime.h>
+#include <Core/Field.h>
+
+
+namespace DB
+{
+struct Settings;
+
+namespace ErrorCodes
+{
+    extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+
+namespace
+{
+
+    template <typename Value, bool _> using FuncQuantileInterpolatedWeighted = AggregateFunctionQuantile<Value, QuantileInterpolatedWeighted<Value>, NameQuantileInterpolatedWeighted, true, void, false>;
+    template <typename Value, bool _> using FuncQuantilesInterpolatedWeighted = AggregateFunctionQuantile<Value, QuantileInterpolatedWeighted<Value>, NameQuantilesInterpolatedWeighted, true, void, true>;
+
+    template <template <typename, bool> class Function>
+    AggregateFunctionPtr createAggregateFunctionQuantile(
+        const std::string & name, const DataTypes & argument_types, const Array & params, const Settings *)
+    {
+        /// Validate the second (weight) argument up front: this check doesn't depend on the type of the first argument, so the <void, true> instantiation is enough.
+        Function<void, true>::assertSecondArg(argument_types);
+
+        const DataTypePtr & argument_type = argument_types[0];
+        WhichDataType which(argument_type);
+
+#define DISPATCH(TYPE) \
+    if (which.idx == TypeIndex::TYPE) return std::make_shared<Function<TYPE, true>>(argument_types, params);
+        FOR_BASIC_NUMERIC_TYPES(DISPATCH)
+#undef DISPATCH
+        if (which.idx == TypeIndex::Date) return std::make_shared<Function<DataTypeDate::FieldType, false>>(argument_types, params);
+        if (which.idx == TypeIndex::DateTime) return std::make_shared<Function<DataTypeDateTime::FieldType, false>>(argument_types, params);
+
+        if (which.idx == TypeIndex::Decimal32) return std::make_shared<Function<Decimal32, false>>(argument_types, params);
+        if (which.idx == TypeIndex::Decimal64) return std::make_shared<Function<Decimal64, false>>(argument_types, params);
+        if (which.idx == TypeIndex::Decimal128) return std::make_shared<Function<Decimal128, false>>(argument_types, params);
+        if (which.idx == TypeIndex::Decimal256) return std::make_shared<Function<Decimal256, false>>(argument_types, params);
+        if (which.idx == TypeIndex::DateTime64) return std::make_shared<Function<DateTime64, false>>(argument_types, params);
+
+        if (which.idx == TypeIndex::Int128) return std::make_shared<Function<Int128, true>>(argument_types, params);
+        if (which.idx == TypeIndex::UInt128) return std::make_shared<Function<UInt128, true>>(argument_types, params);
+        if (which.idx == TypeIndex::Int256) return std::make_shared<Function<Int256, true>>(argument_types, params);
+        if (which.idx == TypeIndex::UInt256) return std::make_shared<Function<UInt256, true>>(argument_types, params);
+
+        throw Exception("Illegal type " + argument_type->getName() + " of argument for aggregate function " + name,
+                        ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
+    }
+}
+
+void registerAggregateFunctionsQuantileInterpolatedWeighted(AggregateFunctionFactory & factory)
+{
+    /// The array-returning variant (quantilesInterpolatedWeighted) cannot return NULL on an empty set, so it is registered with returns_default_when_only_null.
+    AggregateFunctionProperties properties = { .returns_default_when_only_null = true };
+
+    factory.registerFunction(NameQuantileInterpolatedWeighted::name, createAggregateFunctionQuantile<FuncQuantileInterpolatedWeighted>);
+    factory.registerFunction(NameQuantilesInterpolatedWeighted::name, { createAggregateFunctionQuantile<FuncQuantilesInterpolatedWeighted>, properties });
+
+    /// 'medianInterpolatedWeighted' is an alias for 'quantileInterpolatedWeighted'.
+    factory.registerAlias("medianInterpolatedWeighted", NameQuantileInterpolatedWeighted::name);
+}
+
+}
--- a/src/AggregateFunctions/QuantileInterpolatedWeighted.h
+++ b/src/AggregateFunctions/QuantileInterpolatedWeighted.h
@ -0,0 +1,308 @@
+#pragma once
+
+#include <base/sort.h>
+
+#include <Common/HashTable/HashMap.h>
+#include <Common/NaNUtils.h>
+
+
+namespace DB
+{
+struct Settings;
+
+namespace ErrorCodes
+{
+    extern const int NOT_IMPLEMENTED;
+}
+
+/** Approximates a weighted quantile by:
+  * - sorting the distinct input values (each one carries the sum of its weights)
+  * - building a cumulative distribution based on the weights
+  * - performing linear interpolation between neighbouring points of that distribution
+  *
+  * The estimation follows the idea described in
+  * https://en.wikipedia.org/wiki/Percentile#The_weighted_percentile_method
+  */
+template <typename Value>
+struct QuantileInterpolatedWeighted
+{
+    struct Int128Hash
+    {
+        size_t operator()(Int128 x) const
+        {
+            return CityHash_v1_0_2::Hash128to64({x >> 64, x & 0xffffffffffffffffll});
+        }
+    };
+
+    using Weight = UInt64;
+    using UnderlyingType = NativeType<Value>;
+    using Hasher = std::conditional_t<std::is_same_v<Value, Decimal128>, Int128Hash, HashCRC32<UnderlyingType>>;
+
+    /// When creating, the hash table must be small.
+    using Map = HashMapWithStackMemory<UnderlyingType, Weight, Hasher, 4>;
+
+    /// Distinct value -> sum of the weights it was added with.
+    Map map;
+
+    /// Adds `x` with weight 1.
+    void add(const Value & x)
+    {
+        /// We must skip NaNs as they are not compatible with comparison sorting.
+        if (!isNaN(x))
+            ++map[x];
+    }
+
+    /// Adds `x` with the given weight. NaNs are skipped for the same reason as above.
+    void add(const Value & x, Weight weight)
+    {
+        if (!isNaN(x))
+            map[x] += weight;
+    }
+
+    /// Adds the weights accumulated in `rhs` to the weights of this state.
+    void merge(const QuantileInterpolatedWeighted & rhs)
+    {
+        for (const auto & pair : rhs.map)
+            map[pair.getKey()] += pair.getMapped();
+    }
+
+    void serialize(WriteBuffer & buf) const
+    {
+        map.write(buf);
+    }
+
+    /// Reads the entries from `buf`; the weight of a key that is already present is overwritten, not summed.
+    void deserialize(ReadBuffer & buf)
+    {
+        typename Map::Reader reader(buf);
+        while (reader.next())
+        {
+            const auto & pair = reader.get();
+            map[pair.first] = pair.second;
+        }
+    }
+
+    /// Get the value of the `level` quantile.
+    Value get(Float64 level) const
+    {
+        return getImpl<Value>(level);
+    }
+
+    /// Get the `size` values of `levels` quantiles and write them to `result`.
+    void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result) const
+    {
+        getManyImpl<Value>(levels, indices, size, result);
+    }
+
+    /// Floating-point results are not supported by this state: both methods below always throw NOT_IMPLEMENTED.
+    Float64 getFloat(Float64) const
+    {
+        throw Exception("Method getFloat is not implemented for QuantileInterpolatedWeighted", ErrorCodes::NOT_IMPLEMENTED);
+    }
+
+    void getManyFloat(const Float64 *, const size_t *, size_t, Float64 *) const
+    {
+        throw Exception("Method getManyFloat is not implemented for QuantileInterpolatedWeighted", ErrorCodes::NOT_IMPLEMENTED);
+    }
+
+private:
+    /// (value, weight) while the distribution is being built; (value, position in the cumulative distribution) afterwards.
+    using Pair = typename std::pair<UnderlyingType, Float64>;
+
+    /// Get the value of the `level` quantile. The level must be between 0 and 1.
+    template <typename T>
+    T getImpl(Float64 level) const
+    {
+        if (0 == map.size())
+            return std::numeric_limits<Value>::quiet_NaN();
+
+        const std::vector<Pair> distribution = buildDistribution();
+        return static_cast<T>(quantileAt(distribution, level));
+    }
+
+    /// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
+    /// indices - an array of index levels such that the corresponding elements will go in ascending order.
+    template <typename T>
+    void getManyImpl(const Float64 * levels, const size_t * indices, size_t num_levels, Value * result) const
+    {
+        if (0 == map.size())
+        {
+            for (size_t i = 0; i < num_levels; ++i)
+                result[i] = Value();
+            return;
+        }
+
+        /// The distribution does not depend on the level, so it is built once and shared by all levels.
+        const std::vector<Pair> distribution = buildDistribution();
+
+        for (size_t level_index = 0; level_index < num_levels; ++level_index)
+        {
+            const size_t result_index = indices[level_index];
+            result[result_index] = static_cast<T>(quantileAt(distribution, levels[result_index]));
+        }
+    }
+
+    /// Returns the distinct values sorted in ascending order, each paired with its position
+    /// (cumulative weight up to and including the value, minus half of its own weight) / (total weight).
+    /// If the total weight is zero the raw weights are left in place of the positions.
+    std::vector<Pair> buildDistribution() const
+    {
+        std::vector<Pair> value_weight_pairs;
+        value_weight_pairs.reserve(map.size());
+
+        /// Note: weights are stored as 64-bit integers, but they are accumulated in Float64:
+        /// the estimation below involves division, and approximate results are acceptable here.
+        Float64 sum_weight = 0;
+        for (const auto & pair : map)
+        {
+            sum_weight += pair.getMapped();
+            auto value = pair.getKey();
+            auto weight = pair.getMapped();
+            value_weight_pairs.push_back({value, weight});
+        }
+
+        ::sort(value_weight_pairs.begin(), value_weight_pairs.end(), [](const Pair & a, const Pair & b) { return a.first < b.first; });
+
+        /// Running sum of the weights in value order, example: [0,1,2,3,4,5] -> [0,1,3,6,10,15].
+        /// Every weight is replaced by its position as soon as the running sum has been updated,
+        /// so no separate array of cumulative sums is needed.
+        Float64 accumulated = 0;
+        for (auto & entry : value_weight_pairs)
+        {
+            const Float64 weight = entry.second;
+            accumulated += weight;
+
+            if (sum_weight != 0)
+                entry.second = (accumulated - 0.5 * weight) / sum_weight;
+        }
+
+        return value_weight_pairs;
+    }
+
+    /// Returns the index of the left point of the segment used to interpolate `level`:
+    /// `size - 2` if the level is not less than the position of the last but one point,
+    /// otherwise the first index whose right neighbour has a position not less than the level.
+    /// Returns 0 if there are fewer than two points.
+    static size_t findSegment(const std::vector<Pair> & distribution, Float64 level)
+    {
+        const size_t size = distribution.size();
+        if (size < 2)
+            return 0;
+
+        if (level >= distribution[size - 2].second)
+            return size - 2;
+
+        /// Binary search over [0, size - 2], so that `mid + 1` is always a valid index.
+        size_t idx = 0;
+        size_t start = 0;
+        size_t end = size - 2;
+        while (start <= end)
+        {
+            const size_t mid = start + (end - start) / 2;
+            if (level > distribution[mid + 1].second)
+            {
+                start = mid + 1;
+            }
+            else
+            {
+                idx = mid;
+
+                /// Nothing is left to the left of index 0; stop explicitly instead of letting `mid - 1` wrap around.
+                if (mid == 0)
+                    break;
+                end = mid - 1;
+            }
+        }
+
+        return idx;
+    }
+
+    /// Performs linear interpolation of `level` on the segment chosen by `findSegment`.
+    /// Levels lying outside of the segment get the value of the nearest end of the segment.
+    UnderlyingType quantileAt(const std::vector<Pair> & distribution, Float64 level) const
+    {
+        const size_t size = distribution.size();
+
+        const size_t l = findSegment(distribution, level);
+        const size_t u = l + 1 < size ? l + 1 : l;
+
+        Float64 xl = distribution[l].second;
+        Float64 xr = distribution[u].second;
+        UnderlyingType yl = distribution[l].first;
+        UnderlyingType yr = distribution[u].first;
+
+        if (level < xl)
+            yr = yl;
+        if (level > xr)
+            yl = yr;
+
+        return interpolate(level, xl, xr, yl, yr);
+    }
+
+    /// This ignores overflows or NaN's that might arise during add, sub and mul operations and doesn't aim to provide exact
+    /// results since the `quantileInterpolatedWeighted` function itself relies mainly on approximation.
+    UnderlyingType NO_SANITIZE_UNDEFINED interpolate(Float64 level, Float64 xl, Float64 xr, UnderlyingType yl, UnderlyingType yr) const
+    {
+        UnderlyingType dy = yr - yl;
+        Float64 dx = xr - xl;
+        dx = dx == 0 ? 1 : dx; /// both points have the same position: avoid division by zero below.
+
+        /// yl + (dy / dx) * (level - xl)
+        return static_cast<UnderlyingType>(yl + (dy / dx) * (level - xl));
+    }
+};
+
+}
--- a/src/AggregateFunctions/registerAggregateFunctions.cpp
+++ b/src/AggregateFunctions/registerAggregateFunctions.cpp
@ -21,6 +21,7 @@ void registerAggregateFunctionsQuantile(AggregateFunctionFactory &);
 void registerAggregateFunctionsQuantileDeterministic(AggregateFunctionFactory &);
 void registerAggregateFunctionsQuantileExact(AggregateFunctionFactory &);
 void registerAggregateFunctionsQuantileExactWeighted(AggregateFunctionFactory &);
+void registerAggregateFunctionsQuantileInterpolatedWeighted(AggregateFunctionFactory &);
 void registerAggregateFunctionsQuantileExactLow(AggregateFunctionFactory &);
 void registerAggregateFunctionsQuantileExactHigh(AggregateFunctionFactory &);
 void registerAggregateFunctionsQuantileExactInclusive(AggregateFunctionFactory &);
@ -106,6 +107,7 @@ void registerAggregateFunctions()
        registerAggregateFunctionsQuantileDeterministic(factory);
        registerAggregateFunctionsQuantileExact(factory);
        registerAggregateFunctionsQuantileExactWeighted(factory);
+        registerAggregateFunctionsQuantileInterpolatedWeighted(factory);
        registerAggregateFunctionsQuantileExactLow(factory);
        registerAggregateFunctionsQuantileExactHigh(factory);
        registerAggregateFunctionsQuantileExactInclusive(factory);
--- a/src/Analyzer/ConstantNode.cpp
+++ b/src/Analyzer/ConstantNode.cpp
@ -48,7 +48,7 @@ void ConstantNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state

    if (getSourceExpression())
    {
-        buffer << '\n' << std::string(indent + 2, ' ') << "EXPRESSION " << '\n';
+        buffer << '\n' << std::string(indent + 2, ' ') << "EXPRESSION" << '\n';
        getSourceExpression()->dumpTreeImpl(buffer, format_state, indent + 4);
    }
 }
--- a/src/Analyzer/FunctionNode.cpp
+++ b/src/Analyzer/FunctionNode.cpp
@ -2,6 +2,7 @@

 #include <Common/SipHash.h>
 #include <Common/FieldVisitorToString.h>
+#include <DataTypes/IDataType.h>
 #include <Analyzer/ConstantNode.h>

 #include <IO/WriteBufferFromString.h>
@ -31,6 +32,15 @@ FunctionNode::FunctionNode(String function_name_)
    children[arguments_child_index] = std::make_shared<ListNode>();
 }

+/// Returns the argument types of the resolved function; throws LOGICAL_ERROR if the node is not resolved yet.
+const DataTypes & FunctionNode::getArgumentTypes() const
+{
+    if (function)
+        return function->getArgumentTypes();
+
+    throw Exception(ErrorCodes::LOGICAL_ERROR, "Function {} is not resolved", function_name);
+}
+
 ColumnsWithTypeAndName FunctionNode::getArgumentColumns() const
 {
    const auto & arguments = getArguments().getNodes();
--- a/src/Analyzer/FunctionNode.h
+++ b/src/Analyzer/FunctionNode.h
@ -85,6 +85,7 @@ public:
    /// Get arguments node
    QueryTreeNodePtr & getArgumentsNode() { return children[arguments_child_index]; }

+    const DataTypes & getArgumentTypes() const;
    ColumnsWithTypeAndName getArgumentColumns() const;

    /// Returns true if function node has window, false otherwise
@ -144,6 +145,11 @@ public:
      */
    void resolveAsFunction(FunctionBasePtr function_value);

+    /// Resolve function node as the function that `resolver` builds for the current argument columns.
+    void resolveAsFunction(const FunctionOverloadResolverPtr & resolver)
+    {
+        const auto argument_columns = getArgumentColumns();
+        resolveAsFunction(resolver->build(argument_columns));
+    }
+
    /** Resolve function node as aggregate function.
      * It is important that function name is updated with resolved function name.
      * Main motivation for this is query tree optimizations.
--- a/src/Analyzer/InDepthQueryTreeVisitor.h
+++ b/src/Analyzer/InDepthQueryTreeVisitor.h
@ -1,8 +1,13 @@
 #pragma once

+#include <optional>
+#include <utility>
+#include <Common/SettingsChanges.h>
 #include <Common/Exception.h>
+#include <Core/Settings.h>

 #include <Analyzer/IQueryTreeNode.h>
+#include <Analyzer/QueryNode.h>


 namespace DB
--- a/src/Analyzer/ListNode.h
+++ b/src/Analyzer/ListNode.h
@ -16,6 +16,8 @@ using ListNodePtr = std::shared_ptr<ListNode>;
 class ListNode final : public IQueryTreeNode
 {
 public:
+    using iterator = QueryTreeNodes::iterator;
+
    /// Initialize list node with empty nodes
    ListNode();

@ -41,6 +43,9 @@ public:

    void dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state, size_t indent) const override;

+    /// Iterators over `children`.
+    iterator begin()
+    {
+        return children.begin();
+    }
+
+    iterator end()
+    {
+        return children.end();
+    }
+
 protected:
    bool isEqualImpl(const IQueryTreeNode & rhs) const override;

--- a/src/Analyzer/MatcherNode.cpp
+++ b/src/Analyzer/MatcherNode.cpp
@ -11,6 +11,7 @@
 #include <Parsers/ASTQualifiedAsterisk.h>
 #include <Parsers/ASTColumnsMatcher.h>
 #include <Parsers/ASTExpressionList.h>
+#include <Parsers/ASTColumnsTransformers.h>

 namespace DB
 {
@ -206,19 +207,43 @@ QueryTreeNodePtr MatcherNode::cloneImpl() const
 ASTPtr MatcherNode::toASTImpl() const
 {
    ASTPtr result;
+    ASTPtr transformers;
+
+    if (!children.empty())
+    {
+        transformers = std::make_shared<ASTColumnsTransformerList>();
+
+        for (const auto & child : children)
+            transformers->children.push_back(child->toAST());
+    }

    if (matcher_type == MatcherNodeType::ASTERISK)
    {
        if (qualified_identifier.empty())
        {
-            result = std::make_shared<ASTAsterisk>();
+            auto asterisk = std::make_shared<ASTAsterisk>();
+
+            if (transformers)
+            {
+                asterisk->transformers = std::move(transformers);
+                asterisk->children.push_back(asterisk->transformers);
+            }
+
+            result = asterisk;
        }
        else
        {
            auto qualified_asterisk = std::make_shared<ASTQualifiedAsterisk>();

            auto identifier_parts = qualified_identifier.getParts();
-            qualified_asterisk->children.push_back(std::make_shared<ASTIdentifier>(std::move(identifier_parts)));
+            qualified_asterisk->qualifier = std::make_shared<ASTIdentifier>(std::move(identifier_parts));
+            qualified_asterisk->children.push_back(qualified_asterisk->qualifier);
+
+            if (transformers)
+            {
+                qualified_asterisk->transformers = std::move(transformers);
+                qualified_asterisk->children.push_back(qualified_asterisk->transformers);
+            }

            result = qualified_asterisk;
        }
@ -229,6 +254,13 @@ ASTPtr MatcherNode::toASTImpl() const
        {
            auto regexp_matcher = std::make_shared<ASTColumnsRegexpMatcher>();
            regexp_matcher->setPattern(columns_matcher->pattern());
+
+            if (transformers)
+            {
+                regexp_matcher->transformers = std::move(transformers);
+                regexp_matcher->children.push_back(regexp_matcher->transformers);
+            }
+
            result = regexp_matcher;
        }
        else
@ -237,7 +269,14 @@ ASTPtr MatcherNode::toASTImpl() const
            regexp_matcher->setPattern(columns_matcher->pattern());

            auto identifier_parts = qualified_identifier.getParts();
-            regexp_matcher->children.push_back(std::make_shared<ASTIdentifier>(std::move(identifier_parts)));
+            regexp_matcher->qualifier = std::make_shared<ASTIdentifier>(std::move(identifier_parts));
+            regexp_matcher->children.push_back(regexp_matcher->qualifier);
+
+            if (transformers)
+            {
+                regexp_matcher->transformers = std::move(transformers);
+                regexp_matcher->children.push_back(regexp_matcher->transformers);
+            }

            result = regexp_matcher;
        }
@ -257,23 +296,36 @@ ASTPtr MatcherNode::toASTImpl() const
        {
            auto columns_list_matcher = std::make_shared<ASTColumnsListMatcher>();
            columns_list_matcher->column_list = std::move(column_list);
+            columns_list_matcher->children.push_back(columns_list_matcher->column_list);
+
+            if (transformers)
+            {
+                columns_list_matcher->transformers = std::move(transformers);
+                columns_list_matcher->children.push_back(columns_list_matcher->transformers);
+            }
+
            result = columns_list_matcher;
        }
        else
        {
            auto columns_list_matcher = std::make_shared<ASTQualifiedColumnsListMatcher>();
-            columns_list_matcher->column_list = std::move(column_list);

            auto identifier_parts = qualified_identifier.getParts();
-            columns_list_matcher->children.push_back(std::make_shared<ASTIdentifier>(std::move(identifier_parts)));
+            columns_list_matcher->qualifier = std::make_shared<ASTIdentifier>(std::move(identifier_parts));
+            columns_list_matcher->column_list = std::move(column_list);
+            columns_list_matcher->children.push_back(columns_list_matcher->qualifier);
+            columns_list_matcher->children.push_back(columns_list_matcher->column_list);
+
+            if (transformers)
+            {
+                columns_list_matcher->transformers = std::move(transformers);
+                columns_list_matcher->children.push_back(columns_list_matcher->transformers);
+            }

            result = columns_list_matcher;
        }
    }

-    for (const auto & child : children)
-        result->children.push_back(child->toAST());
-
    return result;
 }

--- a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp
+++ b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp
@ -3,6 +3,7 @@
 #include <AggregateFunctions/AggregateFunctionFactory.h>
 #include <AggregateFunctions/IAggregateFunction.h>

+#include <Functions/FunctionFactory.h>
 #include <Functions/IFunction.h>

 #include <Analyzer/InDepthQueryTreeVisitor.h>
@ -47,19 +48,23 @@ Field zeroField(const Field & value)
 class AggregateFunctionsArithmericOperationsVisitor : public InDepthQueryTreeVisitor<AggregateFunctionsArithmericOperationsVisitor>
 {
 public:
+    /// The context is stored in the visitor; it is passed to FunctionFactory when ordinary functions are re-resolved.
+    explicit AggregateFunctionsArithmericOperationsVisitor(ContextPtr context_)
+        : context(std::move(context_))
+    {
+    }
+
    /// Traverse tree bottom to top
    static bool shouldTraverseTopToBottom()
    {
        return false;
    }

-    static void visitImpl(QueryTreeNodePtr & node)
+    void visitImpl(QueryTreeNodePtr & node)
    {
        auto * aggregate_function_node = node->as<FunctionNode>();
        if (!aggregate_function_node || !aggregate_function_node->isAggregateFunction())
            return;

-        static std::unordered_map<std::string_view, std::unordered_set<std::string_view>> supported_functions
+        static std::unordered_map<std::string_view, std::unordered_set<std::string_view>> supported_aggregate_functions
            = {{"sum", {"multiply", "divide"}},
               {"min", {"multiply", "divide", "plus", "minus"}},
               {"max", {"multiply", "divide", "plus", "minus"}},
@ -69,85 +74,112 @@ public:
        if (aggregate_function_arguments_nodes.size() != 1)
            return;

-        auto * inner_function_node = aggregate_function_arguments_nodes[0]->as<FunctionNode>();
-        if (!inner_function_node)
+        const auto & arithmetic_function_node = aggregate_function_arguments_nodes[0];
+        auto * arithmetic_function_node_typed = arithmetic_function_node->as<FunctionNode>();
+        if (!arithmetic_function_node_typed)
            return;

-        auto & inner_function_arguments_nodes = inner_function_node->getArguments().getNodes();
-        if (inner_function_arguments_nodes.size() != 2)
+        const auto & arithmetic_function_arguments_nodes = arithmetic_function_node_typed->getArguments().getNodes();
+        if (arithmetic_function_arguments_nodes.size() != 2)
            return;

        /// Aggregate functions[sum|min|max|avg] is case-insensitive, so we use lower cases name
-        auto lower_function_name = Poco::toLower(aggregate_function_node->getFunctionName());
+        auto lower_aggregate_function_name = Poco::toLower(aggregate_function_node->getFunctionName());

-        auto supported_function_it = supported_functions.find(lower_function_name);
-        if (supported_function_it == supported_functions.end())
+        auto supported_aggregate_function_it = supported_aggregate_functions.find(lower_aggregate_function_name);
+        if (supported_aggregate_function_it == supported_aggregate_functions.end())
            return;

-        const auto & inner_function_name = inner_function_node->getFunctionName();
-
-        if (!supported_function_it->second.contains(inner_function_name))
+        const auto & arithmetic_function_name = arithmetic_function_node_typed->getFunctionName();
+        if (!supported_aggregate_function_it->second.contains(arithmetic_function_name))
            return;

-        const auto * left_argument_constant_node = inner_function_arguments_nodes[0]->as<ConstantNode>();
-        const auto * right_argument_constant_node = inner_function_arguments_nodes[1]->as<ConstantNode>();
+        const auto * left_argument_constant_node = arithmetic_function_arguments_nodes[0]->as<ConstantNode>();
+        const auto * right_argument_constant_node = arithmetic_function_arguments_nodes[1]->as<ConstantNode>();

        /** If we extract negative constant, aggregate function name must be updated.
          *
          * Example: SELECT min(-1 * id);
          * Result: SELECT -1 * max(id);
          */
-        std::string function_name_if_constant_is_negative;
-        if (inner_function_name == "multiply" || inner_function_name == "divide")
+        std::string aggregate_function_name_if_constant_is_negative;
+        if (arithmetic_function_name == "multiply" || arithmetic_function_name == "divide")
        {
-            if (lower_function_name == "min")
-                function_name_if_constant_is_negative = "max";
-            else if (lower_function_name == "max")
-                function_name_if_constant_is_negative = "min";
+            if (lower_aggregate_function_name == "min")
+                aggregate_function_name_if_constant_is_negative = "max";
+            else if (lower_aggregate_function_name == "max")
+                aggregate_function_name_if_constant_is_negative = "min";
        }

+        size_t arithmetic_function_argument_index = 0;
+
        if (left_argument_constant_node && !right_argument_constant_node)
        {
            /// Do not rewrite `sum(1/n)` with `sum(1) * div(1/n)` because of lose accuracy
-            if (inner_function_name == "divide")
+            if (arithmetic_function_name == "divide")
                return;

            /// Rewrite `aggregate_function(inner_function(constant, argument))` into `inner_function(constant, aggregate_function(argument))`
            const auto & left_argument_constant_value_literal = left_argument_constant_node->getValue();
-            if (!function_name_if_constant_is_negative.empty() &&
+            if (!aggregate_function_name_if_constant_is_negative.empty() &&
                left_argument_constant_value_literal < zeroField(left_argument_constant_value_literal))
            {
-                lower_function_name = function_name_if_constant_is_negative;
+                lower_aggregate_function_name = aggregate_function_name_if_constant_is_negative;
            }
-            resolveAggregateFunctionNode(*aggregate_function_node, inner_function_arguments_nodes[1], lower_function_name);

-            auto inner_function = aggregate_function_arguments_nodes[0];
-            auto inner_function_right_argument = std::move(inner_function_arguments_nodes[1]);
-            aggregate_function_arguments_nodes = {inner_function_right_argument};
-            inner_function_arguments_nodes[1] = node;
-            node = std::move(inner_function);
+            arithmetic_function_argument_index = 1;
        }
        else if (right_argument_constant_node)
        {
            /// Rewrite `aggregate_function(inner_function(argument, constant))` into `inner_function(aggregate_function(argument), constant)`
            const auto & right_argument_constant_value_literal = right_argument_constant_node->getValue();
-            if (!function_name_if_constant_is_negative.empty() &&
+            if (!aggregate_function_name_if_constant_is_negative.empty() &&
                right_argument_constant_value_literal < zeroField(right_argument_constant_value_literal))
            {
-                lower_function_name = function_name_if_constant_is_negative;
+                lower_aggregate_function_name = aggregate_function_name_if_constant_is_negative;
            }
-            resolveAggregateFunctionNode(*aggregate_function_node, inner_function_arguments_nodes[0], function_name_if_constant_is_negative);

-            auto inner_function = aggregate_function_arguments_nodes[0];
-            auto inner_function_left_argument = std::move(inner_function_arguments_nodes[0]);
-            aggregate_function_arguments_nodes = {inner_function_left_argument};
-            inner_function_arguments_nodes[0] = node;
-            node = std::move(inner_function);
+            arithmetic_function_argument_index = 0;
        }
+
+        auto optimized_function_node = cloneArithmeticFunctionAndWrapArgumentIntoAggregateFunction(arithmetic_function_node,
+            arithmetic_function_argument_index,
+            node,
+            lower_aggregate_function_name);
+        if (optimized_function_node->getResultType()->equals(*node->getResultType()))
+            node = std::move(optimized_function_node);
    }

 private:
-    static inline void resolveAggregateFunctionNode(FunctionNode & function_node, QueryTreeNodePtr & argument, const String & aggregate_function_name)
+    /** Builds `arithmetic(..., aggregate(argument), ...)` out of clones of both nodes, the inputs are not modified:
+      * the argument of the arithmetic clone at `arithmetic_function_argument_index` becomes the only argument
+      * of the aggregate clone (re-resolved under `result_aggregate_function_name`), then the aggregate clone
+      * takes the place of that argument and the arithmetic clone is re-resolved.
+      */
+    QueryTreeNodePtr cloneArithmeticFunctionAndWrapArgumentIntoAggregateFunction(
+        const QueryTreeNodePtr & arithmetic_function,
+        size_t arithmetic_function_argument_index,
+        const QueryTreeNodePtr & aggregate_function,
+        const std::string & result_aggregate_function_name)
+    {
+        auto arithmetic_clone = arithmetic_function->clone();
+        auto & arithmetic_clone_typed = arithmetic_clone->as<FunctionNode &>();
+        auto & arithmetic_clone_arguments = arithmetic_clone_typed.getArguments().getNodes();
+
+        /// Reference to the slot that will hold the aggregate function in the end.
+        auto & wrapped_argument = arithmetic_clone_arguments[arithmetic_function_argument_index];
+
+        auto aggregate_clone = aggregate_function->clone();
+        auto & aggregate_clone_typed = aggregate_clone->as<FunctionNode &>();
+        aggregate_clone_typed.getArguments().getNodes() = { wrapped_argument };
+        resolveAggregateFunctionNode(aggregate_clone_typed, wrapped_argument, result_aggregate_function_name);
+
+        /// The slot is overwritten only now: the aggregate clone already holds its own pointer to the argument.
+        wrapped_argument = std::move(aggregate_clone);
+        resolveOrdinaryFunctionNode(arithmetic_clone_typed, arithmetic_clone_typed.getFunctionName());
+
+        return arithmetic_clone;
+    }
+
+    /// Resolves `function_node` as the ordinary function `function_name` built for the node's current argument columns.
+    inline void resolveOrdinaryFunctionNode(FunctionNode & function_node, const String & function_name) const
+    {
+        const auto resolver = FunctionFactory::instance().get(function_name, context);
+        const auto argument_columns = function_node.getArgumentColumns();
+        function_node.resolveAsFunction(resolver->build(argument_columns));
+    }
+
+    static inline void resolveAggregateFunctionNode(FunctionNode & function_node, const QueryTreeNodePtr & argument, const String & aggregate_function_name)
    {
        auto function_aggregate_function = function_node.getAggregateFunction();

@ -159,13 +191,15 @@ private:

        function_node.resolveAsAggregateFunction(std::move(aggregate_function));
    }
+
+    ContextPtr context;
 };

 }

-void AggregateFunctionsArithmericOperationsPass::run(QueryTreeNodePtr query_tree_node, ContextPtr)
+void AggregateFunctionsArithmericOperationsPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
 {
-    AggregateFunctionsArithmericOperationsVisitor visitor;
+    AggregateFunctionsArithmericOperationsVisitor visitor(std::move(context));
    visitor.visit(query_tree_node);
 }

--- a/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp
+++ b/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp
@ -0,0 +1,134 @@
+#include <memory>
+#include <unordered_map>
+#include <vector>
+#include <Analyzer/Passes/ConvertOrLikeChainPass.h>
+#include <Analyzer/ConstantNode.h>
+#include <Analyzer/UnionNode.h>
+#include <Analyzer/FunctionNode.h>
+#include <Analyzer/HashUtils.h>
+#include <Analyzer/InDepthQueryTreeVisitor.h>
+#include <Core/Field.h>
+#include <DataTypes/DataTypesNumber.h>
+#include <Functions/FunctionFactory.h>
+#include <Functions/likePatternToRegexp.h>
+#include <Interpreters/Context.h>
+
+namespace DB
+{
+
+namespace
+{
+
+/// Rewrites `or` functions whose arguments contain `like` / `ilike` calls with a constant string pattern:
+/// all such calls sharing the same first argument are replaced by a single
+/// `multiMatchAny(first_argument, [regexp, ...])` call. Other arguments of `or` are kept as they are.
+class ConvertOrLikeChainVisitor : public InDepthQueryTreeVisitor<ConvertOrLikeChainVisitor>
+{
+    using FunctionNodes = std::vector<std::shared_ptr<FunctionNode>>;
+
+    /// Overload resolvers are obtained once in the constructor and reused for every rewritten node.
+    const FunctionOverloadResolverPtr match_function_resolver;
+    const FunctionOverloadResolverPtr or_function_resolver;
+public:
+
+    explicit ConvertOrLikeChainVisitor(ContextPtr context)
+        : InDepthQueryTreeVisitor<ConvertOrLikeChainVisitor>()
+        , match_function_resolver(FunctionFactory::instance().get("multiMatchAny", context))
+        , or_function_resolver(FunctionFactory::instance().get("or", context))
+    {}
+
+    /// When the parent is a query or union node with a context, its children are visited only if
+    /// `optimize_or_like_chain` and `allow_hyperscan` are enabled and both hyperscan regexp length
+    /// limits are 0. For any other parent the children are always visited.
+    static bool needChildVisit(VisitQueryTreeNodeType & parent, VisitQueryTreeNodeType &)
+    {
+        ContextPtr context;
+        if (auto * query = parent->as<QueryNode>())
+            context = query->getContext();
+        else if (auto * union_node = parent->as<UnionNode>())
+            context = union_node->getContext();
+        if (context)
+        {
+            const auto & settings = context->getSettingsRef();
+            return settings.optimize_or_like_chain
+                && settings.allow_hyperscan
+                && settings.max_hyperscan_regexp_length == 0
+                && settings.max_hyperscan_regexp_total_length == 0;
+        }
+        return true;
+    }
+
+    /// Processes a single node; only function nodes named `or` are rewritten.
+    void visitImpl(QueryTreeNodePtr & node)
+    {
+        auto * function_node = node->as<FunctionNode>();
+        if (!function_node || function_node->getFunctionName() != "or")
+            return;
+
+        /// New argument list of `or`: untouched arguments plus one multiMatchAny per distinct first argument.
+        QueryTreeNodes unique_elems;
+
+        QueryTreeNodePtrWithHashMap<Array> node_to_patterns;
+        FunctionNodes match_functions;
+        for (auto & arg : function_node->getArguments())
+        {
+            /// Optimistically keep the argument; it is popped again below if it turns out to be convertible.
+            unique_elems.push_back(arg);
+
+            auto * arg_func = arg->as<FunctionNode>();
+            if (!arg_func)
+                continue;
+
+            const bool is_like  = arg_func->getFunctionName() == "like";
+            const bool is_ilike = arg_func->getFunctionName() == "ilike";
+
+            /// Not {i}like -> keep the argument unchanged.
+            if (!is_like && !is_ilike)
+                continue;
+
+            const auto & like_arguments = arg_func->getArguments().getNodes();
+            if (like_arguments.size() != 2)
+                continue;
+
+            auto identifier = like_arguments[0];
+            auto * pattern = like_arguments[1]->as<ConstantNode>();
+            if (!pattern || !isString(pattern->getResultType()))
+                continue;
+
+            auto regexp = likePatternToRegexp(pattern->getValue().get<String>());
+            /// Case insensitive. Works with UTF-8 as well.
+            if (is_ilike)
+                regexp = "(?i)" + regexp;
+
+            unique_elems.pop_back();
+            auto it = node_to_patterns.find(identifier);
+            if (it == node_to_patterns.end())
+            {
+                it = node_to_patterns.insert({identifier, Array{}}).first;
+                /// The second argument will be added when all patterns are known.
+                auto match_function = std::make_shared<FunctionNode>("multiMatchAny");
+                match_function->getArguments().getNodes().push_back(identifier);
+
+                match_functions.push_back(match_function);
+                unique_elems.push_back(std::move(match_function));
+            }
+            it->second.push_back(std::move(regexp));
+        }
+
+        /// Nothing was converted: the argument list is unchanged, so leave the node and its
+        /// already resolved function untouched instead of rebuilding and re-resolving it.
+        if (match_functions.empty())
+            return;
+
+        /// Add all the patterns into the function arguments lists.
+        for (auto & match_function : match_functions)
+        {
+            auto & arguments = match_function->getArguments().getNodes();
+            auto & patterns = node_to_patterns.at(arguments[0]);
+            arguments.push_back(std::make_shared<ConstantNode>(Field{std::move(patterns)}));
+            match_function->resolveAsFunction(match_function_resolver);
+        }
+
+        /// OR must have at least two arguments.
+        if (unique_elems.size() == 1)
+            unique_elems.push_back(std::make_shared<ConstantNode>(false));
+
+        function_node->getArguments().getNodes() = std::move(unique_elems);
+        function_node->resolveAsFunction(or_function_resolver);
+    }
+};
+
+}
+
+/// Builds the visitor from the given context and applies it to the whole query tree.
+void ConvertOrLikeChainPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context)
+{
+    ConvertOrLikeChainVisitor or_like_chain_visitor(std::move(context));
+    or_like_chain_visitor.visit(query_tree_node);
+}
+
+}
--- a/src/Analyzer/Passes/ConvertOrLikeChainPass.h
+++ b/src/Analyzer/Passes/ConvertOrLikeChainPass.h
@ -0,0 +1,20 @@
+#pragma once
+
+#include <Analyzer/IQueryTreePass.h>
+
+namespace DB
+{
+
+/** Query tree pass that replaces {i}like conditions combined with "or" by multiMatchAny calls.
+  */
+class ConvertOrLikeChainPass final : public IQueryTreePass
+{
+public:
+    /// Name of the pass.
+    String getName() override
+    {
+        return "ConvertOrLikeChain";
+    }
+
+    /// Short description of what the pass does.
+    String getDescription() override
+    {
+        return "Replaces all the 'or's with {i}like to multiMatchAny";
+    }
+
+    /// Applies the pass to the given query tree.
+    void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override;
+};
+
+}
--- a/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp
+++ b/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp
@ -0,0 +1,113 @@
+#include <Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h>
+#include <Analyzer/FunctionNode.h>
+#include <Analyzer/HashUtils.h>
+#include <Analyzer/IQueryTreeNode.h>
+#include <Analyzer/InDepthQueryTreeVisitor.h>
+#include <Analyzer/QueryNode.h>
+#include <Functions/IFunction.h>
+
+#include <algorithm>
+#include <queue>
+
+namespace DB
+{
+
+namespace
+{
+
+/// Removes from GROUP BY (or from every grouping set) the keys that are functions
+/// computed only from other keys of the same list, e.g. GROUP BY x, f(x) -> GROUP BY x.
+/// Only functions that are deterministic in scope of query are considered: a key such as
+/// rand(x) is not determined by x and must stay in the list.
+class OptimizeGroupByFunctionKeysVisitor : public InDepthQueryTreeVisitor<OptimizeGroupByFunctionKeysVisitor>
+{
+public:
+    /// Children that are function nodes are not visited.
+    static bool needChildVisit(QueryTreeNodePtr & /*parent*/, QueryTreeNodePtr & child)
+    {
+        return !child->as<FunctionNode>();
+    }
+
+    /// Optimizes the GROUP BY list of a query node; all other nodes are ignored.
+    static void visitImpl(QueryTreeNodePtr & node)
+    {
+        auto * query = node->as<QueryNode>();
+        if (!query)
+            return;
+
+        if (!query->hasGroupBy())
+            return;
+
+        auto & group_by = query->getGroupBy().getNodes();
+        if (query->isGroupByWithGroupingSets())
+        {
+            /// With GROUPING SETS every element is itself a list of keys and is optimized independently.
+            for (auto & set : group_by)
+            {
+                auto & grouping_set = set->as<ListNode>()->getNodes();
+                optimizeGroupingSet(grouping_set);
+            }
+        }
+        else
+            optimizeGroupingSet(group_by);
+    }
+private:
+
+    /// True if the node is resolved as an ordinary function that is deterministic in scope of query.
+    static bool isDeterministicFunction(FunctionNode & function_node)
+    {
+        const auto & function_base = function_node.getFunction();
+        return function_base && function_base->isDeterministicInScopeOfQuery();
+    }
+
+    /// Returns true if `node` is a deterministic function whose value is fully determined by
+    /// other GROUP BY keys, i.e. every leaf reachable through its arguments is either a key
+    /// itself or sits below a sub-expression that is a key.
+    static bool canBeEliminated(QueryTreeNodePtr & node, const QueryTreeNodePtrWithHashSet & group_by_keys)
+    {
+        auto * function = node->as<FunctionNode>();
+        if (!function || function->getArguments().getNodes().empty())
+            return false;
+
+        /// A non-deterministic function of keys can produce different values for equal keys.
+        if (!isDeterministicFunction(*function))
+            return false;
+
+        auto & function_arguments = function->getArguments().getNodes();
+        QueryTreeNodes candidates(function_arguments.rbegin(), function_arguments.rend());
+
+        // Using DFS we traverse function tree and try to find if it uses other keys as function arguments.
+        // TODO: Also process CONSTANT here. We can simplify GROUP BY x, x + 1 to GROUP BY x.
+        while (!candidates.empty())
+        {
+            auto candidate = candidates.back();
+            candidates.pop_back();
+
+            bool found = group_by_keys.contains(candidate);
+
+            switch (candidate->getNodeType())
+            {
+                case QueryTreeNodeType::FUNCTION:
+                {
+                    auto * func = candidate->as<FunctionNode>();
+                    auto & arguments = func->getArguments().getNodes();
+                    if (arguments.empty())
+                        return false;
+
+                    /// A nested function that is a key itself needs no further inspection;
+                    /// otherwise it must be deterministic and all its arguments are checked too.
+                    if (!found)
+                    {
+                        if (!isDeterministicFunction(*func))
+                            return false;
+
+                        candidates.insert(candidates.end(), arguments.rbegin(), arguments.rend());
+                    }
+                    break;
+                }
+                case QueryTreeNodeType::COLUMN:
+                    if (!found)
+                        return false;
+                    break;
+                default:
+                    return false;
+            }
+        }
+        return true;
+    }
+
+    /// Rebuilds `grouping_set` without the keys that can be eliminated, preserving the order of the rest.
+    static void optimizeGroupingSet(QueryTreeNodes & grouping_set)
+    {
+        QueryTreeNodePtrWithHashSet group_by_keys(grouping_set.begin(), grouping_set.end());
+
+        QueryTreeNodes new_group_by_keys;
+        new_group_by_keys.reserve(grouping_set.size());
+        for (auto & group_by_elem : grouping_set)
+        {
+            if (!canBeEliminated(group_by_elem, group_by_keys))
+                new_group_by_keys.push_back(group_by_elem);
+        }
+
+        grouping_set = std::move(new_group_by_keys);
+    }
+};
+
+}
+
+/// Applies the GROUP BY keys visitor to the whole query tree; the context argument is not used.
+void OptimizeGroupByFunctionKeysPass::run(QueryTreeNodePtr query_tree_node, ContextPtr /*context*/)
+{
+    OptimizeGroupByFunctionKeysVisitor group_by_keys_visitor{};
+    group_by_keys_visitor.visit(query_tree_node);
+}
+
+}
--- a/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h
+++ b/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h
@ -0,0 +1,22 @@
+#pragma once
+
+#include <Analyzer/IQueryTreePass.h>
+
+namespace DB
+{
+
+/** Query tree pass that removes GROUP BY keys which are functions of other GROUP BY keys.
+  * Example: GROUP BY x, f(x)
+  * Result:  GROUP BY x
+  */
+class OptimizeGroupByFunctionKeysPass final : public IQueryTreePass
+{
+public:
+    /// Name of the pass.
+    String getName() override
+    {
+        return "OptimizeGroupByFunctionKeys";
+    }
+
+    /// Short description of what the pass does.
+    String getDescription() override
+    {
+        return "Eliminates functions of other keys in GROUP BY section.";
+    }
+
+    /// Applies the pass to the given query tree.
+    void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override;
+};
+
+}
--- a/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.cpp
+++ b/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.cpp
@ -0,0 +1,124 @@
+#include <Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h>
+#include <Analyzer/ColumnNode.h>
+#include <Analyzer/FunctionNode.h>
+#include <Analyzer/HashUtils.h>
+#include <Analyzer/InDepthQueryTreeVisitor.h>
+#include <Analyzer/QueryNode.h>
+#include <Analyzer/SortNode.h>
+#include <Functions/IFunction.h>
+
+namespace DB
+{
+
+namespace
+{
+
+/// Drops ORDER BY elements that are deterministic functions of columns
+/// which already appear earlier in the same ORDER BY list.
+class OptimizeRedundantFunctionsInOrderByVisitor : public InDepthQueryTreeVisitor<OptimizeRedundantFunctionsInOrderByVisitor>
+{
+public:
+    /// Children of a function node are not visited.
+    static bool needChildVisit(QueryTreeNodePtr & parent, QueryTreeNodePtr & /*child*/)
+    {
+        return !parent->as<FunctionNode>();
+    }
+
+    /// Rewrites the ORDER BY list of a query node; any other node is left alone.
+    void visitImpl(QueryTreeNodePtr & node)
+    {
+        auto * query_node = node->as<QueryNode>();
+        if (!query_node || !query_node->hasOrderBy())
+            return;
+
+        auto & order_by_nodes = query_node->getOrderBy().getNodes();
+
+        /// The whole ORDER BY is left untouched if any of its elements uses WITH FILL.
+        for (auto & sort_node : order_by_nodes)
+        {
+            if (sort_node->as<SortNode>()->withFill())
+                return;
+        }
+
+        QueryTreeNodes kept_nodes;
+        kept_nodes.reserve(order_by_nodes.size());
+
+        for (auto & sort_node : order_by_nodes)
+        {
+            auto & sort_expression = sort_node->as<SortNode>()->getExpression();
+            const auto expression_type = sort_expression->getNodeType();
+
+            /// A function of columns seen so far does not affect the order and is dropped.
+            if (expression_type == QueryTreeNodeType::FUNCTION && isRedundantExpression(sort_expression))
+                continue;
+
+            /// Only plain columns are remembered as keys for the elements that follow.
+            if (expression_type == QueryTreeNodeType::COLUMN)
+                existing_keys.insert(sort_expression);
+
+            kept_nodes.push_back(sort_node);
+        }
+        existing_keys.clear();
+
+        if (kept_nodes.size() < order_by_nodes.size())
+            order_by_nodes = std::move(kept_nodes);
+    }
+
+private:
+    /// Columns met so far in the ORDER BY list that is currently being processed.
+    QueryTreeNodePtrWithHashSet existing_keys;
+
+    /// Returns true if the expression is built only from functions that have arguments and are
+    /// deterministic in scope of query, and every column it reads is already in `existing_keys`.
+    bool isRedundantExpression(QueryTreeNodePtr function)
+    {
+        QueryTreeNodes pending{ function };
+        while (!pending.empty())
+        {
+            auto current = pending.back();
+            pending.pop_back();
+
+            // TODO: handle constants here
+            const auto current_type = current->getNodeType();
+
+            if (current_type == QueryTreeNodeType::COLUMN)
+            {
+                if (!existing_keys.contains(current))
+                    return false;
+                continue;
+            }
+
+            if (current_type != QueryTreeNodeType::FUNCTION)
+                return false;
+
+            auto * function_node = current->as<FunctionNode>();
+            const auto & function_arguments = function_node->getArguments().getNodes();
+            if (function_arguments.empty())
+                return false;
+
+            const auto & function_base = function_node->getFunction();
+            if (!function_base || !function_base->isDeterministicInScopeOfQuery())
+                return false;
+
+            /// Pushed in reverse so that the arguments are popped in their original order.
+            pending.insert(pending.end(), function_arguments.rbegin(), function_arguments.rend());
+        }
+        return true;
+    }
+};
+
+}
+
+/// Applies the ORDER BY visitor to the whole query tree; the context argument is not used.
+void OptimizeRedundantFunctionsInOrderByPass::run(QueryTreeNodePtr query_tree_node, ContextPtr /*context*/)
+{
+    OptimizeRedundantFunctionsInOrderByVisitor order_by_visitor{};
+    order_by_visitor.visit(query_tree_node);
+}
+
+}
--- a/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h
+++ b/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h
@ -0,0 +1,23 @@
+#pragma once
+
+#include <Analyzer/IQueryTreePass.h>
+
+namespace DB
+{
+
+/** Query tree pass: when ORDER BY contains x followed by f(x), only x is kept.
+  * For example ORDER BY x, y, f(x), g(x, y), f(h(x)), t(f(x), g(x)) becomes ORDER BY x, y,
+  * provided that f(), g(), h(), t() are deterministic (in scope of query).
+  * ORDER BY f(x), g(x), x is not optimized, even if f(x) is a bijection for x or g(x).
+  */
+class OptimizeRedundantFunctionsInOrderByPass final : public IQueryTreePass
+{
+public:
+    /// Name of the pass.
+    String getName() override
+    {
+        return "OptimizeRedundantFunctionsInOrderBy";
+    }
+
+    /// Short description of what the pass does.
+    String getDescription() override
+    {
+        return "If ORDER BY has argument x followed by f(x) transforms it to ORDER BY x.";
+    }
+
+    /// Applies the pass to the given query tree.
+    void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override;
+};
+
+}
--- a/src/Analyzer/Passes/QueryAnalysisPass.cpp
+++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp
@ -1,6 +1,7 @@
 #include <Analyzer/Passes/QueryAnalysisPass.h>

 #include <Common/NamePrompter.h>
+#include <Common/ProfileEvents.h>

 #include <IO/WriteBuffer.h>
 #include <IO/WriteHelpers.h>
@ -66,6 +67,14 @@
 #include <Analyzer/UnionNode.h>
 #include <Analyzer/InDepthQueryTreeVisitor.h>
 #include <Analyzer/QueryTreeBuilder.h>
+#include <Analyzer/IQueryTreeNode.h>
+#include <Analyzer/HashUtils.h>
+
+namespace ProfileEvents
+{
+    extern const Event ScalarSubqueriesGlobalCacheHit; /// Incremented in evaluateScalarSubqueryIfNeeded when the subquery node is found in the `scalars` map.
+    extern const Event ScalarSubqueriesCacheMiss; /// Incremented in evaluateScalarSubqueryIfNeeded when the node is not in `scalars` and the subquery is executed.
+}

 #include <Common/checkStackSize.h>

@ -1049,6 +1058,8 @@ private:

    static bool isTableExpressionNodeType(QueryTreeNodeType node_type);

+    static DataTypePtr getExpressionNodeResultTypeOrNull(const QueryTreeNodePtr & query_tree_node);
+
    static ProjectionName calculateFunctionProjectionName(const QueryTreeNodePtr & function_node,
        const ProjectionNames & parameters_projection_names,
        const ProjectionNames & arguments_projection_names);
@ -1097,7 +1108,7 @@ private:

    static QueryTreeNodePtr tryGetLambdaFromSQLUserDefinedFunctions(const std::string & function_name, ContextPtr context);

-    static void evaluateScalarSubqueryIfNeeded(QueryTreeNodePtr & query_tree_node, size_t subquery_depth, ContextPtr context);
+    void evaluateScalarSubqueryIfNeeded(QueryTreeNodePtr & query_tree_node, size_t subquery_depth, ContextPtr context);

    static void mergeWindowWithParentWindow(const QueryTreeNodePtr & window_node, const QueryTreeNodePtr & parent_window_node, IdentifierResolveScope & scope);

@ -1207,6 +1218,9 @@ private:
    /// Global resolve expression node to projection names map
    std::unordered_map<QueryTreeNodePtr, ProjectionNames> resolved_expressions;

+    /// Results of scalar sub queries
+    std::unordered_map<QueryTreeNodeConstRawPtrWithHash, std::shared_ptr<ConstantValue>> scalars;
+
 };

 /// Utility functions implementation
@ -1229,6 +1243,34 @@ bool QueryAnalyzer::isTableExpressionNodeType(QueryTreeNodeType node_type)
        node_type == QueryTreeNodeType::QUERY || node_type == QueryTreeNodeType::UNION;
 }

+/// Returns the result type of the node if it is a constant, a column, or a function node that is
+/// already resolved. For unresolved functions and all other node types returns nullptr.
+DataTypePtr QueryAnalyzer::getExpressionNodeResultTypeOrNull(const QueryTreeNodePtr & query_tree_node)
+{
+    const auto node_type = query_tree_node->getNodeType();
+
+    if (node_type == QueryTreeNodeType::CONSTANT || node_type == QueryTreeNodeType::COLUMN)
+        return query_tree_node->getResultType();
+
+    if (node_type == QueryTreeNodeType::FUNCTION)
+    {
+        /// The result type of a function node is requested only after the function is resolved.
+        auto & function_node = query_tree_node->as<FunctionNode &>();
+        if (function_node.isResolved())
+            return function_node.getResultType();
+    }
+
+    return nullptr;
+}
+
 ProjectionName QueryAnalyzer::calculateFunctionProjectionName(const QueryTreeNodePtr & function_node, const ProjectionNames & parameters_projection_names,
    const ProjectionNames & arguments_projection_names)
 {
@ -1534,12 +1576,12 @@ void QueryAnalyzer::collectScopeValidIdentifiersForTypoCorrection(
            auto expression_identifier = Identifier(name);
            valid_identifiers_result.insert(expression_identifier);

-            auto expression_node_type = expression->getNodeType();
+            auto result_type = getExpressionNodeResultTypeOrNull(expression);

-            if (identifier_is_compound && isExpressionNodeType(expression_node_type))
+            if (identifier_is_compound && result_type)
            {
                collectCompoundExpressionValidIdentifiersForTypoCorrection(unresolved_identifier,
-                    expression->getResultType(),
+                    result_type,
                    expression_identifier,
                    valid_identifiers_result);
            }
@ -1571,21 +1613,23 @@ void QueryAnalyzer::collectScopeValidIdentifiersForTypoCorrection(

    for (const auto & [argument_name, expression] : scope.expression_argument_name_to_node)
    {
+        assert(expression);
        auto expression_node_type = expression->getNodeType();

        if (allow_expression_identifiers && isExpressionNodeType(expression_node_type))
        {
            auto expression_identifier = Identifier(argument_name);
+            valid_identifiers_result.insert(expression_identifier);

-            if (identifier_is_compound)
+            auto result_type = getExpressionNodeResultTypeOrNull(expression);
+
+            if (identifier_is_compound && result_type)
            {
                collectCompoundExpressionValidIdentifiersForTypoCorrection(unresolved_identifier,
-                    expression->getResultType(),
+                    result_type,
                    expression_identifier,
                    valid_identifiers_result);
            }
-
-            valid_identifiers_result.insert(expression_identifier);
        }
        else if (identifier_is_short && allow_function_identifiers && isFunctionExpressionNodeType(expression_node_type))
        {
@ -1687,6 +1731,16 @@ void QueryAnalyzer::evaluateScalarSubqueryIfNeeded(QueryTreeNodePtr & node, size
            node->getNodeTypeName(),
            node->formatASTForErrorMessage());

+    auto scalars_iterator = scalars.find(node.get());
+    if (scalars_iterator != scalars.end())
+    {
+        ProfileEvents::increment(ProfileEvents::ScalarSubqueriesGlobalCacheHit);
+        node = std::make_shared<ConstantNode>(scalars_iterator->second, node);
+        return;
+    }
+
+    ProfileEvents::increment(ProfileEvents::ScalarSubqueriesCacheMiss);
+
    auto subquery_context = Context::createCopy(context);

    Settings subquery_settings = context->getSettings();
@ -1699,10 +1753,11 @@ void QueryAnalyzer::evaluateScalarSubqueryIfNeeded(QueryTreeNodePtr & node, size

    auto io = interpreter->execute();

-    Block block;
    PullingAsyncPipelineExecutor executor(io.pipeline);
    io.pipeline.setProgressCallback(context->getProgressCallback());

+    Block block;
+
    while (block.rows() == 0 && executor.pull(block))
    {
    }
@ -1743,7 +1798,6 @@ void QueryAnalyzer::evaluateScalarSubqueryIfNeeded(QueryTreeNodePtr & node, size
    block = materializeBlock(block);
    size_t columns = block.columns();

-    // Block scalar;
    Field scalar_value;
    DataTypePtr scalar_type;

@ -1770,6 +1824,7 @@ void QueryAnalyzer::evaluateScalarSubqueryIfNeeded(QueryTreeNodePtr & node, size
    }

    auto constant_value = std::make_shared<ConstantValue>(std::move(scalar_value), std::move(scalar_type));
+    scalars[node.get()] = constant_value;
    node = std::make_shared<ConstantNode>(std::move(constant_value), node);
 }

--- a/src/Analyzer/Passes/SumIfToCountIfPass.cpp
+++ b/src/Analyzer/Passes/SumIfToCountIfPass.cpp
@ -77,11 +77,11 @@ public:
        if (!nested_function || nested_function->getFunctionName() != "if")
            return;

-        auto & nested_if_function_arguments_nodes = nested_function->getArguments().getNodes();
+        const auto & nested_if_function_arguments_nodes = nested_function->getArguments().getNodes();
        if (nested_if_function_arguments_nodes.size() != 3)
            return;

-        auto & cond_argument = nested_if_function_arguments_nodes[0];
+        const auto & cond_argument = nested_if_function_arguments_nodes[0];
        const auto * if_true_condition_constant_node = nested_if_function_arguments_nodes[1]->as<ConstantNode>();
        const auto * if_false_condition_constant_node = nested_if_function_arguments_nodes[2]->as<ConstantNode>();

@ -101,7 +101,7 @@ public:
        /// Rewrite `sum(if(cond, 1, 0))` into `countIf(cond)`.
        if (if_true_condition_value == 1 && if_false_condition_value == 0)
        {
-            function_node_arguments_nodes[0] = std::move(nested_if_function_arguments_nodes[0]);
+            function_node_arguments_nodes[0] = nested_if_function_arguments_nodes[0];
            function_node_arguments_nodes.resize(1);

            resolveAsCountIfAggregateFunction(*function_node, function_node_arguments_nodes[0]->getResultType());
@ -120,7 +120,7 @@ public:
            auto not_function = std::make_shared<FunctionNode>("not");

            auto & not_function_arguments = not_function->getArguments().getNodes();
-            not_function_arguments.push_back(std::move(nested_if_function_arguments_nodes[0]));
+            not_function_arguments.push_back(nested_if_function_arguments_nodes[0]);

            not_function->resolveAsFunction(FunctionFactory::instance().get("not", context)->build(not_function->getArgumentColumns()));

--- a/src/Analyzer/QueryNode.cpp
+++ b/src/Analyzer/QueryNode.cpp
@ -17,6 +17,7 @@
 #include <Parsers/ASTSetQuery.h>

 #include <Analyzer/Utils.h>
+#include <fmt/core.h>

 namespace DB
 {
@ -179,6 +180,16 @@ void QueryNode::dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state, s
        buffer << '\n' << std::string(indent + 2, ' ') << "OFFSET\n";
        getOffset()->dumpTreeImpl(buffer, format_state, indent + 4);
    }
+
+    if (hasSettingsChanges())
+    {
+        buffer << '\n' << std::string(indent + 2, ' ') << "SETTINGS";
+        for (const auto & change : settings_changes)
+        {
+            buffer << fmt::format(" {}={}", change.name, toString(change.value));
+        }
+        buffer << '\n';
+    }
 }

 bool QueryNode::isEqualImpl(const IQueryTreeNode & rhs) const
--- a/src/Analyzer/QueryTreeBuilder.cpp
+++ b/src/Analyzer/QueryTreeBuilder.cpp
@ -111,7 +111,7 @@ private:

    QueryTreeNodePtr buildJoinTree(const ASTPtr & tables_in_select_query, const ContextPtr & context) const;

-    ColumnTransformersNodes buildColumnTransformers(const ASTPtr & matcher_expression, size_t start_child_index, const ContextPtr & context) const;
+    ColumnTransformersNodes buildColumnTransformers(const ASTPtr & matcher_expression, const ContextPtr & context) const;

    ASTPtr query;
    QueryTreeNodePtr query_tree_node;
@ -439,13 +439,13 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
    }
    else if (const auto * asterisk = expression->as<ASTAsterisk>())
    {
-        auto column_transformers = buildColumnTransformers(expression, 0 /*start_child_index*/, context);
+        auto column_transformers = buildColumnTransformers(asterisk->transformers, context);
        result = std::make_shared<MatcherNode>(std::move(column_transformers));
    }
    else if (const auto * qualified_asterisk = expression->as<ASTQualifiedAsterisk>())
    {
-        auto & qualified_identifier = qualified_asterisk->children.at(0)->as<ASTTableIdentifier &>();
-        auto column_transformers = buildColumnTransformers(expression, 1 /*start_child_index*/, context);
+        auto & qualified_identifier = qualified_asterisk->qualifier->as<ASTIdentifier &>();
+        auto column_transformers = buildColumnTransformers(qualified_asterisk->transformers, context);
        result = std::make_shared<MatcherNode>(Identifier(qualified_identifier.name_parts), std::move(column_transformers));
    }
    else if (const auto * ast_literal = expression->as<ASTLiteral>())
@ -543,7 +543,7 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
    }
    else if (const auto * columns_regexp_matcher = expression->as<ASTColumnsRegexpMatcher>())
    {
-        auto column_transformers = buildColumnTransformers(expression, 0 /*start_child_index*/, context);
+        auto column_transformers = buildColumnTransformers(columns_regexp_matcher->transformers, context);
        result = std::make_shared<MatcherNode>(columns_regexp_matcher->getMatcher(), std::move(column_transformers));
    }
    else if (const auto * columns_list_matcher = expression->as<ASTColumnsListMatcher>())
@ -557,18 +557,18 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
            column_list_identifiers.emplace_back(Identifier{column_list_identifier.name_parts});
        }

-        auto column_transformers = buildColumnTransformers(expression, 0 /*start_child_index*/, context);
+        auto column_transformers = buildColumnTransformers(columns_list_matcher->transformers, context);
        result = std::make_shared<MatcherNode>(std::move(column_list_identifiers), std::move(column_transformers));
    }
    else if (const auto * qualified_columns_regexp_matcher = expression->as<ASTQualifiedColumnsRegexpMatcher>())
    {
-        auto & qualified_identifier = qualified_columns_regexp_matcher->children.at(0)->as<ASTTableIdentifier &>();
-        auto column_transformers = buildColumnTransformers(expression, 1 /*start_child_index*/, context);
+        auto & qualified_identifier = qualified_columns_regexp_matcher->qualifier->as<ASTIdentifier &>();
+        auto column_transformers = buildColumnTransformers(qualified_columns_regexp_matcher->transformers, context);
        result = std::make_shared<MatcherNode>(Identifier(qualified_identifier.name_parts), qualified_columns_regexp_matcher->getMatcher(), std::move(column_transformers));
    }
    else if (const auto * qualified_columns_list_matcher = expression->as<ASTQualifiedColumnsListMatcher>())
    {
-        auto & qualified_identifier = qualified_columns_list_matcher->children.at(0)->as<ASTTableIdentifier &>();
+        auto & qualified_identifier = qualified_columns_list_matcher->qualifier->as<ASTIdentifier &>();

        Identifiers column_list_identifiers;
        column_list_identifiers.reserve(qualified_columns_list_matcher->column_list->children.size());
@ -579,7 +579,7 @@ QueryTreeNodePtr QueryTreeBuilder::buildExpression(const ASTPtr & expression, co
            column_list_identifiers.emplace_back(Identifier{column_list_identifier.name_parts});
        }

-        auto column_transformers = buildColumnTransformers(expression, 1 /*start_child_index*/, context);
+        auto column_transformers = buildColumnTransformers(qualified_columns_list_matcher->transformers, context);
        result = std::make_shared<MatcherNode>(Identifier(qualified_identifier.name_parts), std::move(column_list_identifiers), std::move(column_transformers));
    }
    else
@ -833,15 +833,15 @@ QueryTreeNodePtr QueryTreeBuilder::buildJoinTree(const ASTPtr & tables_in_select
 }


-ColumnTransformersNodes QueryTreeBuilder::buildColumnTransformers(const ASTPtr & matcher_expression, size_t start_child_index, const ContextPtr & context) const
+ColumnTransformersNodes QueryTreeBuilder::buildColumnTransformers(const ASTPtr & matcher_expression, const ContextPtr & context) const
 {
    ColumnTransformersNodes column_transformers;
-    size_t children_size = matcher_expression->children.size();

-    for (; start_child_index < children_size; ++start_child_index)
+    if (!matcher_expression)
+        return column_transformers;
+
+    for (const auto & child : matcher_expression->children)
    {
-        const auto & child = matcher_expression->children[start_child_index];
-
        if (auto * apply_transformer = child->as<ASTColumnsApplyTransformer>())
        {
            if (apply_transformer->lambda)
--- a/src/Analyzer/QueryTreePassManager.cpp
+++ b/src/Analyzer/QueryTreePassManager.cpp
@ -1,5 +1,19 @@
+#include <memory>
 #include <Analyzer/QueryTreePassManager.h>

+#include <Common/Exception.h>
+
+#include <IO/WriteHelpers.h>
+#include <IO/Operators.h>
+
+#include <DataTypes/IDataType.h>
+
+#include <Interpreters/Context.h>
+
+#include <Analyzer/ColumnNode.h>
+#include <Analyzer/FunctionNode.h>
+#include <Analyzer/InDepthQueryTreeVisitor.h>
+#include <Analyzer/Utils.h>
 #include <Analyzer/Passes/QueryAnalysisPass.h>
 #include <Analyzer/Passes/CountDistinctPass.h>
 #include <Analyzer/Passes/FunctionToSubcolumnsPass.h>
@ -14,16 +28,10 @@
 #include <Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.h>
 #include <Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.h>
 #include <Analyzer/Passes/FuseFunctionsPass.h>
+#include <Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h>
 #include <Analyzer/Passes/IfTransformStringsToEnumPass.h>
-
-#include <IO/WriteHelpers.h>
-#include <IO/Operators.h>
-
-#include <Interpreters/Context.h>
-#include <Analyzer/ColumnNode.h>
-#include <Analyzer/FunctionNode.h>
-#include <Analyzer/InDepthQueryTreeVisitor.h>
-#include <Common/Exception.h>
+#include <Analyzer/Passes/ConvertOrLikeChainPass.h>
+#include <Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h>

 namespace DB
 {
@ -44,24 +52,6 @@ namespace
  */
 class ValidationChecker : public InDepthQueryTreeVisitor<ValidationChecker>
 {
-    String pass_name;
-
-    void visitColumn(ColumnNode * column) const
-    {
-        if (column->getColumnSourceOrNull() == nullptr)
-            throw Exception(ErrorCodes::LOGICAL_ERROR,
-                "Column {} {} query tree node does not have valid source node after running {} pass",
-                column->getColumnName(), column->getColumnType(), pass_name);
-    }
-
-    void visitFunction(FunctionNode * function) const
-    {
-        if (!function->isResolved())
-            throw Exception(ErrorCodes::LOGICAL_ERROR,
-            "Function {} is not resolved after running {} pass",
-            function->toAST()->formatForErrorMessage(), pass_name);
-    }
-
 public:
    explicit ValidationChecker(String pass_name_)
        : pass_name(std::move(pass_name_))
@ -74,6 +64,57 @@ public:
        else if (auto * function = node->as<FunctionNode>())
            return visitFunction(function);
    }
+private:
+    void visitColumn(ColumnNode * column) const
+    {
+        if (column->getColumnSourceOrNull() == nullptr)
+            throw Exception(ErrorCodes::LOGICAL_ERROR,
+                "Column {} {} query tree node does not have valid source node after running {} pass",
+                column->getColumnName(), column->getColumnType(), pass_name);
+    }
+
+    void visitFunction(FunctionNode * function) const
+    {
+        if (!function->isResolved())
+            throw Exception(ErrorCodes::LOGICAL_ERROR,
+                "Function {} is not resolved after running {} pass",
+                function->toAST()->formatForErrorMessage(), pass_name);
+
+        if (isNameOfInFunction(function->getFunctionName()))
+            return;
+
+        const auto & expected_argument_types = function->getArgumentTypes();
+        size_t expected_argument_types_size = expected_argument_types.size();
+        auto actual_argument_columns = function->getArgumentColumns();
+
+        if (expected_argument_types_size != actual_argument_columns.size())
+            throw Exception(ErrorCodes::LOGICAL_ERROR,
+                "Function {} expects {} arguments but has {} after running {} pass",
+                function->toAST()->formatForErrorMessage(),
+                expected_argument_types_size,
+                actual_argument_columns.size(),
+                pass_name);
+
+        for (size_t i = 0; i < expected_argument_types_size; ++i)
+        {
+            // Skip lambdas
+            if (WhichDataType(expected_argument_types[i]).isFunction())
+                continue;
+
+            if (!expected_argument_types[i]->equals(*actual_argument_columns[i].type))
+            {
+                throw Exception(ErrorCodes::LOGICAL_ERROR,
+                    "Function {} expects {} argument to have {} type but receives {} after running {} pass",
+                    function->toAST()->formatForErrorMessage(),
+                    i + 1,
+                    expected_argument_types[i]->getName(),
+                    actual_argument_columns[i].type->getName(),
+                    pass_name);
+            }
+        }
+    }
+
+    String pass_name;
 };
 #endif

@ -87,11 +128,9 @@ public:
  * TODO: Support setting optimize_using_constraints.
  * TODO: Support setting optimize_substitute_columns.
  * TODO: Support GROUP BY injective function elimination.
-  * TODO: Support GROUP BY functions of other keys elimination.
  * TODO: Support setting optimize_move_functions_out_of_any.
  * TODO: Support setting optimize_aggregators_of_group_by_keys.
  * TODO: Support setting optimize_duplicate_order_by_and_distinct.
-  * TODO: Support setting optimize_redundant_functions_in_order_by.
  * TODO: Support setting optimize_monotonous_functions_in_order_by.
  * TODO: Support settings.optimize_or_like_chain.
  * TODO: Add optimizations based on function semantics. Example: SELECT * FROM test_table WHERE id != id. (id is not nullable column).
@ -195,6 +234,9 @@ void addQueryTreePasses(QueryTreePassManager & manager)
    if (settings.optimize_injective_functions_inside_uniq)
        manager.addPass(std::make_unique<UniqInjectiveFunctionsEliminationPass>());

+    if (settings.optimize_group_by_function_keys)
+        manager.addPass(std::make_unique<OptimizeGroupByFunctionKeysPass>());
+
    if (settings.optimize_multiif_to_if)
        manager.addPass(std::make_unique<MultiIfToIfPass>());

@ -203,6 +245,9 @@ void addQueryTreePasses(QueryTreePassManager & manager)
    if (settings.optimize_if_chain_to_multiif)
        manager.addPass(std::make_unique<IfChainToMultiIfPass>());

+    if (settings.optimize_redundant_functions_in_order_by)
+        manager.addPass(std::make_unique<OptimizeRedundantFunctionsInOrderByPass>());
+
    manager.addPass(std::make_unique<OrderByTupleEliminationPass>());
    manager.addPass(std::make_unique<OrderByLimitByDuplicateEliminationPass>());

@ -211,6 +256,8 @@ void addQueryTreePasses(QueryTreePassManager & manager)

    if (settings.optimize_if_transform_strings_to_enum)
        manager.addPass(std::make_unique<IfTransformStringsToEnumPass>());
+
+    manager.addPass(std::make_unique<ConvertOrLikeChainPass>());
 }

 }
--- a/src/Backups/BackupIO_S3.cpp
+++ b/src/Backups/BackupIO_S3.cpp
@ -156,10 +156,9 @@ void BackupWriterS3::copyObjectImpl(
    const String & src_key,
    const String & dst_bucket,
    const String & dst_key,
-    const Aws::S3::Model::HeadObjectResult & head,
+    size_t size,
    const std::optional<ObjectAttributes> & metadata) const
 {
-    size_t size = head.GetContentLength();
    LOG_TRACE(log, "Copying {} bytes using single-operation copy", size);

    Aws::S3::Model::CopyObjectRequest request;
@ -177,7 +176,7 @@ void BackupWriterS3::copyObjectImpl(
    if (!outcome.IsSuccess() && (outcome.GetError().GetExceptionName() == "EntityTooLarge"
            || outcome.GetError().GetExceptionName() == "InvalidRequest"))
    { // Can't come here with MinIO, MinIO allows single part upload for large objects.
-        copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, head, metadata);
+        copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, size, metadata);
        return;
    }

@ -191,10 +190,9 @@ void BackupWriterS3::copyObjectMultipartImpl(
    const String & src_key,
    const String & dst_bucket,
    const String & dst_key,
-    const Aws::S3::Model::HeadObjectResult & head,
+    size_t size,
    const std::optional<ObjectAttributes> & metadata) const
 {
-    size_t size = head.GetContentLength();
    LOG_TRACE(log, "Copying {} bytes using multipart upload copy", size);

    String multipart_upload_id;
@ -309,16 +307,16 @@ void BackupWriterS3::copyFileNative(DiskPtr from_disk, const String & file_name_
        std::string source_bucket = object_storage->getObjectsNamespace();
        auto file_path = fs::path(s3_uri.key) / file_name_to;

-        auto head = S3::headObject(*client, source_bucket, objects[0].absolute_path).GetResult();
-        if (static_cast<size_t>(head.GetContentLength()) < request_settings.getUploadSettings().max_single_operation_copy_size)
+        auto size = S3::getObjectSize(*client, source_bucket, objects[0].absolute_path);
+        if (size < request_settings.getUploadSettings().max_single_operation_copy_size)
        {
            copyObjectImpl(
-                source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, head);
+                source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, size);
        }
        else
        {
            copyObjectMultipartImpl(
-                source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, head);
+                source_bucket, objects[0].absolute_path, s3_uri.bucket, file_path, size);
        }
    }
 }
--- a/src/Backups/BackupIO_S3.h
+++ b/src/Backups/BackupIO_S3.h
@ -67,7 +67,7 @@ private:
        const String & src_key,
        const String & dst_bucket,
        const String & dst_key,
-        const Aws::S3::Model::HeadObjectResult & head,
+        size_t size,
        const std::optional<ObjectAttributes> & metadata = std::nullopt) const;

    void copyObjectMultipartImpl(
@ -75,7 +75,7 @@ private:
        const String & src_key,
        const String & dst_bucket,
        const String & dst_key,
-        const Aws::S3::Model::HeadObjectResult & head,
+        size_t size,
        const std::optional<ObjectAttributes> & metadata = std::nullopt) const;

    void removeFilesBatch(const Strings & file_names);
--- a/src/Backups/BackupSettings.cpp
+++ b/src/Backups/BackupSettings.cpp
@ -6,7 +6,7 @@
 #include <Parsers/ASTSetQuery.h>
 #include <Parsers/ASTLiteral.h>
 #include <IO/ReadHelpers.h>
-
+#include <Backups/SettingsFieldOptionalUUID.h>

 namespace DB
 {
@ -16,48 +16,6 @@ namespace ErrorCodes
    extern const int WRONG_BACKUP_SETTINGS;
 }

-
-namespace
-{
-    struct SettingFieldOptionalUUID
-    {
-        std::optional<UUID> value;
-
-        explicit SettingFieldOptionalUUID(const std::optional<UUID> & value_) : value(value_) {}
-
-        explicit SettingFieldOptionalUUID(const Field & field)
-        {
-            if (field.getType() == Field::Types::Null)
-            {
-                value = std::nullopt;
-                return;
-            }
-
-            if (field.getType() == Field::Types::String)
-            {
-                const String & str = field.get<const String &>();
-                if (str.empty())
-                {
-                    value = std::nullopt;
-                    return;
-                }
-
-                UUID id;
-                if (tryParse(id, str))
-                {
-                    value = id;
-                    return;
-                }
-            }
-
-            throw Exception(ErrorCodes::CANNOT_PARSE_BACKUP_SETTINGS, "Cannot parse uuid from {}", field);
-        }
-
-        explicit operator Field() const { return Field(value ? toString(*value) : ""); }
-    };
-}
-
-
 /// List of backup settings except base_backup_name and cluster_host_ids.
 #define LIST_OF_BACKUP_SETTINGS(M) \
    M(String, id) \
--- a/src/Backups/BackupsWorker.cpp
+++ b/src/Backups/BackupsWorker.cpp
@ -30,6 +30,7 @@ namespace ErrorCodes
 {
    extern const int BAD_ARGUMENTS;
    extern const int LOGICAL_ERROR;
+    extern const int CONCURRENT_ACCESS_NOT_SUPPORTED;
 }

 using OperationID = BackupsWorker::OperationID;
@ -121,10 +122,12 @@ namespace
 }


-BackupsWorker::BackupsWorker(size_t num_backup_threads, size_t num_restore_threads)
+BackupsWorker::BackupsWorker(size_t num_backup_threads, size_t num_restore_threads, bool allow_concurrent_backups_, bool allow_concurrent_restores_)
    : backups_thread_pool(num_backup_threads, /* max_free_threads = */ 0, num_backup_threads)
    , restores_thread_pool(num_restore_threads, /* max_free_threads = */ 0, num_restore_threads)
    , log(&Poco::Logger::get("BackupsWorker"))
+    , allow_concurrent_backups(allow_concurrent_backups_)
+    , allow_concurrent_restores(allow_concurrent_restores_)
 {
    /// We set max_free_threads = 0 because we don't want to keep any threads if there is no BACKUP or RESTORE query running right now.
 }
@ -157,6 +160,16 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
    else
        backup_id = toString(*backup_settings.backup_uuid);

+    /// Check if there are no concurrent backups
+    if (num_active_backups && !allow_concurrent_backups)
+    {
+        /// If it's an internal backup and we currently have 1 active backup, it could be the original query, validate using backup_uuid
+        if (!(num_active_backups == 1 && backup_settings.internal && getAllActiveBackupInfos().at(0).id == toString(*backup_settings.backup_uuid)))
+        {
+            throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Concurrent backups not supported, turn on setting 'allow_concurrent_backups'");
+        }
+    }
+
    std::shared_ptr<IBackupCoordination> backup_coordination;
    if (backup_settings.internal)
    {
@ -370,6 +383,9 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
    auto restore_query = std::static_pointer_cast<ASTBackupQuery>(query->clone());
    auto restore_settings = RestoreSettings::fromRestoreQuery(*restore_query);

+    if (!restore_settings.backup_uuid)
+        restore_settings.backup_uuid = UUIDHelpers::generateV4();
+
    /// `restore_id` will be used as a key to the `infos` map, so it should be unique.
    OperationID restore_id;
    if (restore_settings.internal)
@ -377,7 +393,17 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
    else if (!restore_settings.id.empty())
        restore_id = restore_settings.id;
    else
-        restore_id = toString(UUIDHelpers::generateV4());
+        restore_id = toString(*restore_settings.backup_uuid);
+
+    /// Check if there are no concurrent restores
+    if (num_active_restores && !allow_concurrent_restores)
+    {
+        /// If it's an internal restore and we currently have 1 active restore, it could be the original query, validate using backup_uuid
+        if (!(num_active_restores == 1 && restore_settings.internal && getAllActiveRestoreInfos().at(0).id == toString(*restore_settings.backup_uuid)))
+        {
+            throw Exception(ErrorCodes::CONCURRENT_ACCESS_NOT_SUPPORTED, "Concurrent restores not supported, turn on setting 'allow_concurrent_restores'");
+        }
+    }

    std::shared_ptr<IRestoreCoordination> restore_coordination;
    if (restore_settings.internal)
@ -471,6 +497,7 @@ void BackupsWorker::doRestore(
        backup_open_params.context = context;
        backup_open_params.backup_info = backup_info;
        backup_open_params.base_backup_info = restore_settings.base_backup_info;
+        backup_open_params.backup_uuid = restore_settings.backup_uuid;
        backup_open_params.password = restore_settings.password;
        BackupPtr backup = BackupFactory::instance().createBackup(backup_open_params);

@ -687,6 +714,30 @@ std::vector<BackupsWorker::Info> BackupsWorker::getAllInfos() const
    return res_infos;
 }

+/// Returns copies of the infos of all operations whose status is CREATING_BACKUP.
+/// The `infos` map is read while holding `infos_mutex`.
+std::vector<BackupsWorker::Info> BackupsWorker::getAllActiveBackupInfos() const
+{
+    std::lock_guard lock{infos_mutex};
+
+    std::vector<Info> active_backups;
+    for (const auto & candidate : infos | boost::adaptors::map_values)
+    {
+        if (candidate.status != BackupStatus::CREATING_BACKUP)
+            continue;
+
+        active_backups.push_back(candidate);
+    }
+    return active_backups;
+}
+
+/// Returns copies of the infos of all operations whose status is RESTORING.
+/// The `infos` map is read while holding `infos_mutex`.
+std::vector<BackupsWorker::Info> BackupsWorker::getAllActiveRestoreInfos() const
+{
+    std::lock_guard lock{infos_mutex};
+
+    std::vector<Info> active_restores;
+    for (const auto & candidate : infos | boost::adaptors::map_values)
+    {
+        if (candidate.status != BackupStatus::RESTORING)
+            continue;
+
+        active_restores.push_back(candidate);
+    }
+    return active_restores;
+}
+
 void BackupsWorker::shutdown()
 {
    bool has_active_backups_and_restores = (num_active_backups || num_active_restores);
--- a/src/Backups/BackupsWorker.h
+++ b/src/Backups/BackupsWorker.h
@ -23,7 +23,7 @@ class IRestoreCoordination;
 class BackupsWorker
 {
 public:
-    BackupsWorker(size_t num_backup_threads, size_t num_restore_threads);
+    BackupsWorker(size_t num_backup_threads, size_t num_restore_threads, bool allow_concurrent_backups_, bool allow_concurrent_restores_);

    /// Waits until all tasks have been completed.
    void shutdown();
@ -103,6 +103,8 @@ private:
    void setStatus(const OperationID & id, BackupStatus status, bool throw_if_error = true);
    void setStatusSafe(const String & id, BackupStatus status) { setStatus(id, status, false); }
    void setNumFilesAndSize(const OperationID & id, size_t num_files, UInt64 uncompressed_size, UInt64 compressed_size);
+    std::vector<Info> getAllActiveBackupInfos() const;
+    std::vector<Info> getAllActiveRestoreInfos() const;

    ThreadPool backups_thread_pool;
    ThreadPool restores_thread_pool;
@ -113,6 +115,8 @@ private:
    std::atomic<size_t> num_active_restores = 0;
    mutable std::mutex infos_mutex;
    Poco::Logger * log;
+    const bool allow_concurrent_backups;
+    const bool allow_concurrent_restores;
 };

 }
--- a/src/Backups/RestoreSettings.cpp
+++ b/src/Backups/RestoreSettings.cpp
@ -7,6 +7,7 @@
 #include <Parsers/ASTSetQuery.h>
 #include <boost/algorithm/string/predicate.hpp>
 #include <Common/FieldVisitorConvertToNumber.h>
+#include <Backups/SettingsFieldOptionalUUID.h>


 namespace DB
@ -162,7 +163,9 @@ namespace
    M(RestoreUDFCreationMode, create_function) \
    M(Bool, internal) \
    M(String, host_id) \
-    M(String, coordination_zk_path)
+    M(String, coordination_zk_path) \
+    M(OptionalUUID, backup_uuid)
+

 RestoreSettings RestoreSettings::fromRestoreQuery(const ASTBackupQuery & query)
 {
--- a/src/Backups/RestoreSettings.h
+++ b/src/Backups/RestoreSettings.h
@ -122,6 +122,11 @@ struct RestoreSettings
    /// Path in Zookeeper used to coordinate restoring process while executing by RESTORE ON CLUSTER.
    String coordination_zk_path;

+    /// Internal, should not be specified by user.
+    /// UUID of the backup. If it's not set it will be generated randomly.
+    /// This is used to validate internal restores when allow_concurrent_restores is turned off
+    std::optional<UUID> backup_uuid;
+
    static RestoreSettings fromRestoreQuery(const ASTBackupQuery & query);
    void copySettingsToQuery(ASTBackupQuery & query) const;
 };
--- a/src/Backups/SettingsFieldOptionalUUID.cpp
+++ b/src/Backups/SettingsFieldOptionalUUID.cpp
@ -0,0 +1,43 @@
+#include <Backups/SettingsFieldOptionalUUID.h>
+#include <Common/ErrorCodes.h>
+#include <Core/SettingsFields.h>
+#include <IO/ReadHelpers.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int CANNOT_PARSE_BACKUP_SETTINGS;
+}
+
+
+    /// Builds the setting from a Field.
+    /// A Null field or an empty string means "no UUID"; a non-empty string must parse as a UUID.
+    /// Throws CANNOT_PARSE_BACKUP_SETTINGS for any other field type or for an unparsable string.
+    SettingFieldOptionalUUID::SettingFieldOptionalUUID(const Field & field)
+    {
+        const auto field_type = field.getType();
+
+        if (field_type == Field::Types::Null)
+        {
+            value.reset();
+            return;
+        }
+
+        if (field_type == Field::Types::String)
+        {
+            const String & text = field.get<const String &>();
+            if (text.empty())
+            {
+                value.reset();
+                return;
+            }
+
+            UUID parsed_uuid;
+            if (tryParse(parsed_uuid, text))
+            {
+                value = parsed_uuid;
+                return;
+            }
+        }
+
+        /// Reached for unsupported field types and for strings that are not valid UUIDs.
+        throw Exception(ErrorCodes::CANNOT_PARSE_BACKUP_SETTINGS, "Cannot parse uuid from {}", field);
+    }
+
+}
--- a/src/Backups/SettingsFieldOptionalUUID.h
+++ b/src/Backups/SettingsFieldOptionalUUID.h
@ -0,0 +1,18 @@
+#pragma once
+
+#include <optional>
+#include <Core/SettingsFields.h>
+
+namespace DB
+{
+/// Setting field that holds an optional UUID.
+/// When converted to Field, a present value becomes its string form and an absent value becomes an empty string.
+struct SettingFieldOptionalUUID
+{
+    std::optional<UUID> value;
+
+    explicit SettingFieldOptionalUUID(const std::optional<UUID> & value_)
+        : value(value_)
+    {
+    }
+
+    /// Declared here only; the definition is not part of this header.
+    explicit SettingFieldOptionalUUID(const Field & field);
+
+    explicit operator Field() const
+    {
+        return Field(value ? toString(*value) : "");
+    }
+};
+}
--- a/src/Common/CurrentMetrics.cpp
+++ b/src/Common/CurrentMetrics.cpp
@ -100,6 +100,7 @@
    M(CacheDetachedFileSegments, "Number of existing detached cache file segments") \
    M(FilesystemCacheSize, "Filesystem cache size in bytes") \
    M(FilesystemCacheElements, "Filesystem cache elements (file segments)") \
+    M(AsyncInsertCacheSize, "Number of async insert hash id in cache") \
    M(S3Requests, "S3 requests") \
    M(KeeperAliveConnections, "Number of alive connections") \
    M(KeeperOutstandingRequets, "Number of outstanding requests") \
--- a/src/Common/FST.cpp
+++ b/src/Common/FST.cpp
@ -0,0 +1,480 @@
+#include "FST.h"
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <memory>
+#include <vector>
+#include <Common/Exception.h>
+#include <city.h>
+
+/// "paper" in the comments in this file refers to:
+/// [Direct Construction of Minimal Acyclic Subsequential Transducers] by Stoyan Mihov and Denis Maurel, University of Tours, France
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+};
+
+namespace FST
+{
+
+/// Writes this arc to `write_buffer` and returns the number of bytes written.
+/// Layout: a var-int holding (target state index << 1) + has_output, followed by
+/// a var-int with the output only when the output is non-zero.
+UInt64 Arc::serialize(WriteBuffer& write_buffer) const
+{
+    assert(target != nullptr);
+
+    const bool with_output = (output != 0);
+
+    /// The lowest bit of the header tells the reader whether an output var-int follows.
+    UInt64 header = ((target->state_index) << 1) + with_output;
+    writeVarUInt(header, write_buffer);
+    UInt64 total_bytes = getLengthOfVarUInt(header);
+
+    if (!with_output)
+        return total_bytes;
+
+    writeVarUInt(output, write_buffer);
+    total_bytes += getLengthOfVarUInt(output);
+    return total_bytes;
+}
+
+/// Two arcs are equal when they carry the same output and their targets have the same id.
+/// Both arcs must have a non-null target.
+bool operator==(const Arc & arc1, const Arc & arc2)
+{
+    assert(arc1.target != nullptr && arc2.target != nullptr);
+
+    if (arc1.output != arc2.output)
+        return false;
+
+    return arc1.target->id == arc2.target->id;
+}
+
+/// Marks `label` as present by setting the bit whose position is the label's unsigned byte value.
+void LabelsAsBitmap::addLabel(char label)
+{
+    const UInt8 bit_position = label;
+
+    UInt256 single_bit = 1;
+    single_bit <<= bit_position;
+    data |= single_bit;
+}
+
+/// Counts the set bits at positions 0..label (inclusive), where the position is the
+/// label's unsigned byte value. For a label that is present in the bitmap this is its
+/// 1-based rank among the stored labels.
+UInt64 LabelsAsBitmap::getIndex(char label) const
+{
+    const UInt8 bit_position = label;
+    const int last_word = bit_position / 64;
+    const UInt8 offset_in_word = bit_position % 64;
+
+    UInt64 rank = 0;
+
+    /// 64-bit words that lie entirely below the label's word contribute all of their set bits.
+    for (int word = 0; word < last_word; ++word)
+        rank += std::popcount(data.items[word]);
+
+    /// The label's own word contributes the bits up to and including the label's bit.
+    /// Offset 63 needs every bit; it is special-cased because a 64-bit shift by 64 is undefined.
+    const UInt64 mask = (offset_in_word == 63) ? ~0ULL : (1ULL << (offset_in_word + 1)) - 1;
+    rank += std::popcount(mask & data.items[last_word]);
+
+    return rank;
+}
+
+/// Writes the four 64-bit words of the bitmap as var-ints, lowest index first,
+/// and returns the total number of bytes they occupy.
+UInt64 LabelsAsBitmap::serialize(WriteBuffer& write_buffer)
+{
+    UInt64 written_bytes = 0;
+    for (size_t word = 0; word < 4; ++word)
+    {
+        writeVarUInt(data.items[word], write_buffer);
+        written_bytes += getLengthOfVarUInt(data.items[word]);
+    }
+    return written_bytes;
+}
+
+/// Tells whether the bit at the label's unsigned byte value is set.
+bool LabelsAsBitmap::hasLabel(char label) const
+{
+    const UInt8 bit_position = label;
+
+    UInt256 probe = 1;
+    probe <<= bit_position;
+
+    const bool is_set = (data & probe) != 0;
+    return is_set;
+}
+
+/// Returns a mutable pointer to the arc stored under `label`, or nullptr when there is none.
+/// The constness of the stored arc is cast away so that callers can adjust the arc in place.
+Arc* State::getArc(char label) const
+{
+    if (auto found = arcs.find(label); found != arcs.cend())
+        return const_cast<Arc *>(&found->second);
+
+    return nullptr;
+}
+
+/// Stores an arc with the given output and target under `label`,
+/// replacing any arc that was already stored for that label.
+void State::addArc(char label, Output output, StatePtr target)
+{
+    Arc & slot = arcs[label];
+    slot = Arc(output, target);
+}
+
+/// Resets the state: removes all arcs and zeroes the id, the state index and the flag.
+void State::clear()
+{
+    arcs.clear();
+
+    flag = 0;
+    state_index = 0;
+    id = 0;
+}
+
+/// Computes a CityHash64 over the state's arcs.
+/// For every arc the label byte, the raw bytes of the output and the raw bytes of the
+/// target's id are appended to a buffer, which is then hashed.
+UInt64 State::hash() const
+{
+    std::vector<char> buffer;
+    buffer.reserve(arcs.size() * (sizeof(Output) + sizeof(UInt64) + 1));
+
+    const auto append_bytes = [&buffer](const void * source, size_t size)
+    {
+        const auto * bytes = static_cast<const char *>(source);
+        buffer.insert(buffer.end(), bytes, bytes + size);
+    };
+
+    for (const auto & [label, arc] : arcs)
+    {
+        buffer.push_back(label);
+        append_bytes(&arc.output, sizeof(Output));
+        append_bytes(&arc.target->id, sizeof(UInt64));
+    }
+
+    return CityHash_v1_0_2::CityHash64(buffer.data(), buffer.size());
+}
+
+/// Two states are equal when they have the same set of labels and the arcs stored under
+/// each label compare equal.
+/// NOTE(review): only the arcs are compared here; the state's flag is not — confirm this is intended.
+bool operator== (const State & state1, const State & state2)
+{
+    if (state1.arcs.size() != state2.arcs.size())
+        return false;
+
+    for (const auto & [label, arc] : state1.arcs)
+    {
+        const auto counterpart = state2.arcs.find(label);
+        if (counterpart == state2.arcs.cend() || counterpart->second != arc)
+            return false;
+    }
+
+    return true;
+}
+
+/// Writes this state to `write_buffer` and returns the number of bytes written.
+/// Layout: one flag byte, then either
+///  - sequential encoding: one byte with the number of labels, the label bytes, the arcs; or
+///  - bitmap encoding: the label bitmap, then the arcs.
+/// NOTE(review): with the bitmap encoding the reader locates an arc by the rank of its label
+/// bit, so the arcs must be written in ascending unsigned label order — confirm that the
+/// container used for `arcs` iterates in that order.
+UInt64 State::serialize(WriteBuffer& write_buffer)
+{
+    /// The flag byte always comes first.
+    write_buffer.write(flag);
+    UInt64 written_bytes = 1;
+
+    if (getEncodingMethod() != EncodingMethod::Sequential)
+    {
+        /// Bitmap encoding: collect the labels into a bitmap and write it.
+        LabelsAsBitmap bmp;
+        for (auto & [label, arc] : arcs)
+            bmp.addLabel(label);
+        written_bytes += bmp.serialize(write_buffer);
+
+        /// The arcs follow in the iteration order of `arcs`.
+        for (auto & [label, arc] : arcs)
+            written_bytes += arc.serialize(write_buffer);
+
+        return written_bytes;
+    }
+
+    /// Sequential encoding: label count, then the labels themselves.
+    std::vector<char> labels;
+    labels.reserve(arcs.size());
+    for (auto & [label, arc] : arcs)
+        labels.push_back(label);
+
+    UInt8 label_size = labels.size();
+    write_buffer.write(label_size);
+    written_bytes += 1;
+
+    write_buffer.write(labels.data(), labels.size());
+    written_bytes += labels.size();
+
+    /// The arcs follow in the same order as the labels written above.
+    for (auto & [label, arc] : arcs)
+        written_bytes += arc.serialize(write_buffer);
+
+    return written_bytes;
+}
+
+/// Remembers the output buffer and gives every slot of `temp_states` its own freshly allocated State.
+FSTBuilder::FSTBuilder(WriteBuffer& write_buffer_) : write_buffer(write_buffer_)
+{
+    std::generate(std::begin(temp_states), std::end(temp_states), [] { return std::make_shared<State>(); });
+}
+
+/// See FindMinimized in the paper pseudo code l11-l21.
+/// Looks `state` up in `minimized_states` by its hash. When an equal state is registered, it is
+/// returned and `found` is set to true. Otherwise a copy of `state` is registered under the hash
+/// (replacing whatever was stored there), returned, and `found` is set to false.
+StatePtr FSTBuilder::findMinimized(const State & state, bool & found)
+{
+    found = false;
+    const auto state_hash = state.hash();
+
+    /// MEMBER: in the paper pseudo code l15
+    if (auto existing = minimized_states.find(state_hash);
+        existing != minimized_states.cend() && *existing->second == state)
+    {
+        found = true;
+        return existing->second;
+    }
+
+    /// COPY_STATE: in the paper pseudo code l17
+    StatePtr copied_state = std::make_shared<State>(state);
+
+    /// INSERT: in the paper pseudo code l18
+    minimized_states[state_hash] = copied_state;
+    return copied_state;
+}
+
+/// See the paper pseudo code l33-34.
+/// Returns the number of leading characters that `word1` and `word2` have in common.
+size_t FSTBuilder::getCommonPrefixLength(const String & word1, const String & word2)
+{
+    const size_t limit = std::min(word1.size(), word2.size());
+
+    size_t length = 0;
+    for (; length < limit; ++length)
+    {
+        if (word1[length] != word2[length])
+            break;
+    }
+    return length;
+}
+
+/// See the paper pseudo code l33-39 and l70-72(when down_to is 0).
+/// Walks the temporary states of `previous_word` from the last one down to index `down_to`.
+/// Each of them is replaced by its registered (minimized) equivalent, and every state that
+/// was not registered before is written to `write_buffer`.
+void FSTBuilder::minimizePreviousWordSuffix(Int64 down_to)
+{
+    for (Int64 i = static_cast<Int64>(previous_word.size()); i >= down_to; --i)
+    {
+        bool found = false;
+        auto minimized_state = findMinimized(*temp_states[i], found);
+
+        if (i != 0)
+        {
+            /// Re-point the arc of the preceding state at the minimized state,
+            /// keeping the output of the arc that was there (0 when there was none).
+            Output output = 0;
+            Arc* arc = temp_states[i - 1]->getArc(previous_word[i - 1]);
+            if (arc)
+                output = arc->output;
+
+            /// SET_TRANSITION
+            temp_states[i - 1]->addArc(previous_word[i - 1], output, minimized_state);
+        }
+        /// An id of 0 means "not assigned yet"; ids are handed out from next_id on first use.
+        if (minimized_state->id == 0)
+            minimized_state->id = next_id++;
+
+        if (i > 0 && temp_states[i - 1]->id == 0)
+            temp_states[i - 1]->id = next_id++;
+
+        if (!found)
+        {
+            /// A newly registered state is serialized right away. Its state_index is the running
+            /// total of bytes written by earlier serialize() calls, i.e. where this state starts.
+            minimized_state->state_index = previous_state_index;
+
+            previous_written_bytes = minimized_state->serialize(write_buffer);
+            previous_state_index += previous_written_bytes;
+        }
+    }
+}
+
+/// Adds `current_word` with the output `current_output` to the transducer under construction.
+///
+/// The word must be non-empty, at most MAX_TERM_LENGTH characters long and different from the
+/// previously added word; otherwise BAD_ARGUMENTS is thrown before any builder state is touched.
+void FSTBuilder::add(const std::string & current_word, Output current_output)
+{
+    /// We assume word size is no greater than MAX_TERM_LENGTH(256).
+    /// FSTs without word size limitation would be inefficient and easy to cause memory bloat
+    /// Note that when using "split" tokenizer, if a granule has tokens which are longer than
+    /// MAX_TERM_LENGTH, the granule cannot be dropped and will be fully-scanned. It doesn't affect "ngram" tokenizers.
+    /// Another limitation is that if the query string has tokens which exceed this length
+    /// it will fallback to default searching when using "split" tokenizers.
+    auto current_word_len = current_word.size();
+
+    if (current_word_len > MAX_TERM_LENGTH)
+        throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "Too long term ({}) passed to FST builder.", current_word_len);
+
+    /// The last step below looks up the arc labelled with the first character after the common
+    /// prefix. For an empty word, or a word equal to the previous one, there is no such character
+    /// and no such arc, so the lookup would yield nullptr and be dereferenced when asserts are
+    /// compiled out. Reject such input explicitly instead.
+    if (current_word.empty() || current_word == previous_word)
+        throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "FST builder expects a non-empty term that differs from the previously added term.");
+
+    size_t prefix_length_plus1 = getCommonPrefixLength(current_word, previous_word) + 1;
+
+    minimizePreviousWordSuffix(prefix_length_plus1);
+
+    /// Initialize the tail state, see paper pseudo code l39-43
+    for (size_t i = prefix_length_plus1; i <= current_word.size(); ++i)
+    {
+        /// CLEAR_STATE: l41
+        temp_states[i]->clear();
+
+        /// SET_TRANSITION: l42
+        temp_states[i - 1]->addArc(current_word[i - 1], 0, temp_states[i]);
+    }
+
+    /// The current word is different from the previous word (checked above)
+    /// See paper pseudo code l44-47
+    temp_states[current_word_len]->setFinal(true);
+
+    /// Adjust outputs on the arcs
+    /// See paper pseudo code l48-63
+    for (size_t i = 1; i <= prefix_length_plus1 - 1; ++i)
+    {
+        Arc * arc_ptr = temp_states[i - 1]->getArc(current_word[i - 1]);
+        assert(arc_ptr != nullptr);
+
+        /// Keep on the shared arc only the part of the output that both words have in common ...
+        Output common_prefix = std::min(arc_ptr->output, current_output);
+        Output word_suffix = arc_ptr->output - common_prefix;
+        arc_ptr->output = common_prefix;
+
+        /// ... and push the remainder down to the arcs leaving the next state
+        if (word_suffix != 0)
+        {
+            for (auto & [label, arc] : temp_states[i]->arcs)
+            {
+                arc.output += word_suffix;
+            }
+        }
+        /// Reduce current_output
+        current_output -= common_prefix;
+    }
+
+    /// Set last temp state's output
+    /// paper pseudo code l66-67 (CurrentWord != PreviousWord is guaranteed by the check above)
+    Arc * arc = temp_states[prefix_length_plus1 - 1]->getArc(current_word[prefix_length_plus1 - 1]);
+    assert(arc != nullptr);
+    arc->output = current_output;
+
+    previous_word = current_word;
+}
+
+/// Finishes the construction: flushes the states of the last added word and appends the trailer.
+/// The trailer is the start index of the last serialized state (the initial state) as a var-int,
+/// followed by one byte holding the length of that var-int. Returns the total number of bytes written.
+UInt64 FSTBuilder::build()
+{
+    minimizePreviousWordSuffix(0);
+
+    /// Step back from "end of the last written state" to "start of the last written state".
+    previous_state_index -= previous_written_bytes;
+    const auto initial_state_index = previous_state_index;
+
+    UInt8 length = getLengthOfVarUInt(initial_state_index);
+    writeVarUInt(initial_state_index, write_buffer);
+    write_buffer.write(length);
+
+    return initial_state_index + previous_written_bytes + length + 1;
+}
+
+/// Takes over the serialized transducer bytes passed in `data_`.
+FiniteStateTransducer::FiniteStateTransducer(std::vector<UInt8> data_)
+    : data(std::move(data_))
+{
+}
+
+/// Drops the serialized bytes held by this transducer.
+void FiniteStateTransducer::clear()
+{
+    data.clear();
+}
+
+/// Looks `term` up in the serialized transducer held in `data`.
+/// Returns {sum of the outputs of the arcs followed, whether the state reached after the last
+/// character is final}. Returns {0, false} as soon as a character has no matching arc.
+/// NOTE(review): `data.size() - 1` below wraps around when `data` is empty (e.g. after clear()) —
+/// confirm that callers never query an empty transducer.
+std::pair<UInt64, bool> FiniteStateTransducer::getOutput(const String & term)
+{
+    std::pair<UInt64, bool> result{ 0, false };
+
+    /// Read index of initial state
+    /// The last byte of `data` is the length of the var-int that precedes it.
+    ReadBufferFromMemory read_buffer(data.data(), data.size());
+    read_buffer.seek(data.size()-1, SEEK_SET);
+
+    UInt8 length{ 0 };
+    read_buffer.readStrict(reinterpret_cast<char&>(length));
+
+    /// FST contains no terms
+    if (length == 0)
+        return { 0, false };
+
+    /// The var-int right before the length byte holds the position of the initial state.
+    read_buffer.seek(data.size() - 1 - length, SEEK_SET);
+    UInt64 state_index{ 0 };
+    readVarUInt(state_index, read_buffer);
+
+    /// One iteration per character plus a last one that only checks the final flag.
+    for (size_t i = 0; i <= term.size(); ++i)
+    {
+        UInt64 arc_output{ 0 };
+
+        /// Read flag
+        State temp_state;
+
+        read_buffer.seek(state_index, SEEK_SET);
+        temp_state.readFlag(read_buffer);
+        if (i == term.size())
+        {
+            result.second = temp_state.isFinal();
+            break;
+        }
+
+        UInt8 label = term[i];
+        if (temp_state.getEncodingMethod() == State::EncodingMethod::Sequential)
+        {
+            /// Read number of labels
+            UInt8 label_num{ 0 };
+            read_buffer.readStrict(reinterpret_cast<char&>(label_num));
+
+            if (label_num == 0)
+                return { 0, false };
+
+            auto labels_position = read_buffer.getPosition();
+
+            /// Find the index of the label from "labels" bytes
+            auto begin_it{ data.begin() + labels_position };
+            auto end_it{ data.begin() + labels_position + label_num };
+
+            auto pos = std::find(begin_it, end_it, label);
+
+            if (pos == end_it)
+                return { 0, false };
+
+            /// Read the arc for the label
+            /// Arcs have variable length, so all arcs up to and including the wanted one are decoded;
+            /// state_index and arc_output end up holding the values of the last one read.
+            UInt64 arc_index = (pos - begin_it);
+            auto arcs_start_postion = labels_position + label_num;
+
+            read_buffer.seek(arcs_start_postion, SEEK_SET);
+            for (size_t j = 0; j <= arc_index; j++)
+            {
+                state_index = 0;
+                arc_output = 0;
+                readVarUInt(state_index, read_buffer);
+                if (state_index & 0x1) // output is followed
+                {
+                    readVarUInt(arc_output, read_buffer);
+                }
+                state_index >>= 1;
+            }
+        }
+        else
+        {
+            /// Bitmap encoding: four var-ints holding the label bitmap, then the arcs.
+            LabelsAsBitmap bmp;
+
+            readVarUInt(bmp.data.items[0], read_buffer);
+            readVarUInt(bmp.data.items[1], read_buffer);
+            readVarUInt(bmp.data.items[2], read_buffer);
+            readVarUInt(bmp.data.items[3], read_buffer);
+
+            if (!bmp.hasLabel(label))
+                return { 0, false };
+
+            /// Read the arc for the label
+            /// getIndex() counts the label's own bit as well, hence `j < arc_index` here.
+            size_t arc_index = bmp.getIndex(label);
+            for (size_t j = 0; j < arc_index; j++)
+            {
+                state_index = 0;
+                arc_output = 0;
+                readVarUInt(state_index, read_buffer);
+                if (state_index & 0x1) // output is followed
+                {
+                    readVarUInt(arc_output, read_buffer);
+                }
+                state_index >>= 1;
+            }
+        }
+        /// Accumulate the output value
+        result.first += arc_output;
+    }
+    return result;
+}
+}
+}
--- a/src/Common/FST.h
+++ b/src/Common/FST.h
@ -0,0 +1,182 @@
+#pragma once
+#include <array>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include <Core/Types.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteBuffer.h>
+#include <base/types.h>
+
+namespace DB
+{
+/// Finite State Transducer is an efficient way to represent term dictionary.
+/// It can be viewed as a map of <term, output> where output is an integer.
+/// Detailed explanation can be found in the following paper
+/// [Direct Construction of Minimal Acyclic Subsequential Transducers] by Stoyan Mihov and Denis Maurel, University of Tours, France
+namespace FST
+{
+using Output = UInt64;
+
+class State;
+using StatePtr = std::shared_ptr<State>;
+
+/// Arc represents a transition from one state to another.
+/// It includes the target state to which the arc points and the arc's output.
+struct Arc
+{
+    Arc() = default; /// Leaves output at 0 and target as an empty StatePtr.
+
+    explicit Arc(Output output_, const StatePtr & target_) : output{output_}, target{target_} { } /// Copies both arguments into the members.
+
+    /// 0 means the arc has no output
+    Output output = 0;
+
+    StatePtr target; /// State this arc leads to; held through a shared_ptr (see StatePtr).
+
+    UInt64 serialize(WriteBuffer & write_buffer) const; /// Writes this arc into write_buffer (defined outside this header). Presumably returns the number of bytes written - TODO confirm.
+};
+
+bool operator==(const Arc & arc1, const Arc & arc2);
+
+/// LabelsAsBitmap implements a 256-bit bitmap for all labels of a state. Each bit represents
+/// a label's presence and the index value of the bit represents the corresponding label.
+class LabelsAsBitmap
+{
+public:
+    void addLabel(char label); /// Marks `label` as present. Presumably sets the bit whose index equals the label value - TODO confirm against the definition.
+    bool hasLabel(char label) const; /// Tells whether `label` is present in the bitmap.
+
+    /// Computes the rank of `label`. NOTE(review): FiniteStateTransducer::getOutput reads exactly getIndex(label) arcs to reach the label's arc, so the rank looks 1-based - confirm.
+    UInt64 getIndex(char label) const;
+
+    UInt64 serialize(WriteBuffer& write_buffer); /// Writes the bitmap into write_buffer. Presumably returns the number of bytes written - TODO confirm.
+private:
+    friend class State;
+    friend class FiniteStateTransducer; /// getOutput() fills data.items[0..3] directly while reading a serialized state.
+    /// data holds a 256-bit bitmap for all labels of a state. Its 256 bits correspond to 256
+    /// possible label values.
+    UInt256 data{ 0 };
+};
+
+/// State implements the State in Finite State Transducer.
+/// Each state contains all its arcs and a flag indicating if it is final state.
+class State
+{
+public:
+    static constexpr size_t MAX_ARCS_IN_SEQUENTIAL_METHOD = 32; /// Presumably the arc-count limit up to which Sequential encoding is used - TODO confirm in serialize().
+    enum class EncodingMethod
+    {
+        /// Serialize arcs sequentially
+        Sequential = 0,
+
+        /// Serialize arcs by using bitmap
+        /// Note this is NOT enabled for now since it is experimental
+        Bitmap,
+    };
+    State() = default;
+
+    State(const State & state) = default;
+
+    UInt64 hash() const; /// Hash of the state. Presumably the key used in FSTBuilder::minimized_states - TODO confirm.
+
+    Arc * getArc(char label) const; /// Looks up the arc for `label`. Presumably returns nullptr when there is none - TODO confirm.
+
+    void addArc(char label, Output output, StatePtr target); /// Adds an arc for `label` with the given output and target state.
+
+    void clear(); /// Presumably resets the state so the object can be reused - TODO confirm against the definition.
+
+    UInt64 serialize(WriteBuffer & write_buffer); /// Writes the state into write_buffer. Presumably returns the number of bytes written - TODO confirm.
+
+    bool isFinal() const
+    {
+        return flag_values.is_final == 1; /// reads the 1-bit field of the flag
+    }
+    void setFinal(bool value)
+    {
+        flag_values.is_final = value;
+    }
+    EncodingMethod getEncodingMethod() const
+    {
+        return flag_values.encoding_method;
+    }
+    void readFlag(ReadBuffer & read_buffer)
+    {
+        read_buffer.readStrict(reinterpret_cast<char&>(flag)); /// Reads straight into the raw flag byte; the accessors above then see it through flag_values.
+    }
+
+    /// Transient ID of the state which is used for building FST. It won't be serialized
+    UInt64 id = 0;
+
+    /// State index which indicates location of state in FST
+    UInt64 state_index = 0;
+
+    /// Arcs which are started from state, the 'char' is the label on the arc
+    std::unordered_map<char, Arc> arcs;
+private:
+    struct FlagValues
+    {
+        unsigned int is_final : 1; /// 1 if the state is final (see isFinal())
+        EncodingMethod encoding_method : 3; /// how the arcs of this state are serialized
+    };
+
+    union /// Two views of the same storage: bit-fields for the accessors, a raw byte for readFlag(). NOTE(review): relies on union punning and bit-field layout - confirm for all targets.
+    {
+        FlagValues flag_values;
+        uint8_t flag = 0;
+    };
+};
+
+bool operator==(const State & state1, const State & state2);
+
+inline constexpr size_t MAX_TERM_LENGTH = 256;
+
+/// FSTBuilder is used to build Finite State Transducer by adding words incrementally.
+/// Note that all the words have to be added in sorted order in order to achieve minimized result.
+/// In the end, the caller should call build() to serialize minimized FST to WriteBuffer.
+class FSTBuilder
+{
+public:
+    explicit FSTBuilder(WriteBuffer & write_buffer_); /// The builder holds a WriteBuffer reference member, so the buffer presumably must outlive the builder.
+
+    void add(const std::string & word, Output output); /// Adds one <word, output> pair. The unit tests expect a DB::Exception when word is longer than MAX_TERM_LENGTH.
+    UInt64 build(); /// Finishes the FST and serializes it. The meaning of the returned value is not visible in this header - TODO confirm.
+private:
+    StatePtr findMinimized(const State & s, bool & found);
+    void minimizePreviousWordSuffix(Int64 down_to);
+    static size_t getCommonPrefixLength(const String & word1, const String & word2);
+
+    std::array<StatePtr, MAX_TERM_LENGTH + 1> temp_states; /// Presumably one working state per prefix length 0..MAX_TERM_LENGTH - TODO confirm.
+    String previous_word;
+    StatePtr initial_state;
+
+    /// map of (state_hash, StatePtr)
+    std::unordered_map<UInt64, StatePtr> minimized_states;
+
+    /// Next available ID of state
+    UInt64 next_id = 1;
+
+    WriteBuffer & write_buffer;
+    UInt64 previous_written_bytes = 0;
+    UInt64 previous_state_index = 0;
+};
+
+/// FiniteStateTransducer is constructed by using minimized FST blob (which is loaded from index storage).
+/// It is used to retrieve output by given term.
+class FiniteStateTransducer
+{
+public:
+    FiniteStateTransducer() = default;
+    explicit FiniteStateTransducer(std::vector<UInt8> data_); /// Takes the serialized FST bytes by value.
+    std::pair<UInt64, bool> getOutput(const String & term); /// Returns {0, false} as soon as a label of term has no matching arc; otherwise the first member accumulates the outputs of the arcs followed.
+    void clear();
+    std::vector<UInt8> & getData() { return data; } /// Mutable access to the underlying blob.
+
+private:
+    std::vector<UInt8> data; /// Serialized FST that getOutput() walks.
+};
+}
+}
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@ -10,6 +10,7 @@
    M(InsertQuery, "Same as Query, but only for INSERT queries.") \
    M(AsyncInsertQuery, "Same as InsertQuery, but only for asynchronous INSERT queries.") \
    M(AsyncInsertBytes, "Data size in bytes of asynchronous INSERT queries.") \
+    M(AsyncInsertCacheHits, "Number of times a duplicate hash id has been found in asynchronous INSERT hash id cache.") \
    M(FailedQuery, "Number of failed queries.") \
    M(FailedSelectQuery, "Same as FailedQuery, but only for SELECT queries.") \
    M(FailedInsertQuery, "Same as FailedQuery, but only for INSERT queries.") \
@ -309,6 +310,8 @@ The server successfully detected this situation and will download merged part fr
    M(S3CopyObject, "Number of S3 API CopyObject calls.") \
    M(S3ListObjects, "Number of S3 API ListObjects calls.") \
    M(S3HeadObject,  "Number of S3 API HeadObject calls.") \
+    M(S3GetObjectAttributes, "Number of S3 API GetObjectAttributes calls.") \
+    M(S3GetObjectMetadata, "Number of S3 API GetObject calls for getting metadata.") \
    M(S3CreateMultipartUpload, "Number of S3 API CreateMultipartUpload calls.") \
    M(S3UploadPartCopy, "Number of S3 API UploadPartCopy calls.") \
    M(S3UploadPart, "Number of S3 API UploadPart calls.") \
@ -321,6 +324,8 @@ The server successfully detected this situation and will download merged part fr
    M(DiskS3CopyObject, "Number of DiskS3 API CopyObject calls.") \
    M(DiskS3ListObjects, "Number of DiskS3 API ListObjects calls.") \
    M(DiskS3HeadObject,  "Number of DiskS3 API HeadObject calls.") \
+    M(DiskS3GetObjectAttributes, "Number of DiskS3 API GetObjectAttributes calls.") \
+    M(DiskS3GetObjectMetadata, "Number of DiskS3 API GetObject calls for getting metadata.") \
    M(DiskS3CreateMultipartUpload, "Number of DiskS3 API CreateMultipartUpload calls.") \
    M(DiskS3UploadPartCopy, "Number of DiskS3 API UploadPartCopy calls.") \
    M(DiskS3UploadPart, "Number of DiskS3 API UploadPart calls.") \
@ -449,7 +454,8 @@ The server successfully detected this situation and will download merged part fr
    M(OverflowBreak, "Number of times, data processing was cancelled by query complexity limitation with setting '*_overflow_mode' = 'break' and the result is incomplete.") \
    M(OverflowThrow, "Number of times, data processing was cancelled by query complexity limitation with setting '*_overflow_mode' = 'throw' and exception was thrown.") \
    M(OverflowAny, "Number of times approximate GROUP BY was in effect: when aggregation was performed only on top of first 'max_rows_to_group_by' unique keys and other keys were ignored due to 'group_by_overflow_mode' = 'any'.") \
-
+    \
+    M(ServerStartupMilliseconds, "Time elapsed from starting server to listening to sockets in milliseconds")\

 namespace ProfileEvents
 {
--- a/src/Common/logger_useful.h
+++ b/src/Common/logger_useful.h
@ -7,6 +7,29 @@
 #include <Poco/Message.h>
 #include <Common/CurrentThread.h>

+/// Wrapper that stores the text of a formatted message in a String and then passes the message on to the wrapped logger
+class LogToStrImpl
+{
+    String & out_str;
+    Poco::Logger * logger;
+    bool propagate_to_actual_log = true;
+public:
+    LogToStrImpl(String & out_str_, Poco::Logger * logger_) : out_str(out_str_), logger(logger_) {}
+    LogToStrImpl & operator -> () { return *this; }
+    /// Always answers true so the message gets formatted for out_str; a rejected priority only switches forwarding off.
+    bool is(Poco::Message::Priority priority) { if (!logger->is(priority)) propagate_to_actual_log = false; return true; }
+    LogToStrImpl * getChannel() { return this; }
+    const String & name() const { return logger->name(); }
+    void log(const Poco::Message & message)
+    {
+        out_str = message.getText(); /// captured even when forwarding is switched off
+        auto * channel = propagate_to_actual_log ? logger->getChannel() : nullptr;
+        if (channel)
+            channel->log(message);
+    }
+};
+
+#define LogToStr(x, y) std::make_unique<LogToStrImpl>(x, y)

 namespace
 {
@ -17,8 +40,37 @@ namespace

    [[maybe_unused]] const ::Poco::Logger * getLogger(const ::Poco::Logger * logger) { return logger; };
    [[maybe_unused]] const ::Poco::Logger * getLogger(const std::atomic<::Poco::Logger *> & logger) { return logger.load(); };
+    [[maybe_unused]] std::unique_ptr<LogToStrImpl> getLogger(std::unique_ptr<LogToStrImpl> && logger) { return logger; }; /// Overload for LogToStr(...): hands the wrapper through so the LOG_* macros can use it like a logger.
+
+    template<typename T> struct is_fmt_runtime : std::false_type {};
+    template<typename T> struct is_fmt_runtime<fmt::basic_runtime<T>> : std::true_type {};
+
+    /// Usually we use LOG_*(...) macros with either string literals or fmt::runtime(whatever) as a format string.
+    /// This function returns a string_view to a static format string passed to LOG_* macro, or an empty view for fmt::runtime.
+    template <typename T> constexpr std::string_view tryGetStaticFormatString(T && x)
+    {
+        if constexpr (is_fmt_runtime<std::remove_cvref_t<T>>::value) /// T is a forwarding reference: strip cv/ref so lvalue or const fmt::runtime objects match too
+        {
+            /// It definitely was fmt::runtime(something).
+            /// We are not sure about a lifetime of the string, so return empty view.
+            /// Also it can be arbitrary string, not a formatting pattern.
+            /// So returning empty pattern will not pollute the set of patterns.
+            return std::string_view();
+        }
+        else
+        {
+            /// Most likely it was a string literal.
+            /// Unfortunately, there's no good way to check if something is a string literal.
+            /// But fmtlib requires a format string to be compile-time constant unless fmt::runtime is used.
+            static_assert(std::is_nothrow_convertible<T, const char * const>::value);
+            static_assert(!std::is_pointer<T>::value);
+            return std::string_view(x);
+        }
+    }
 }

+#define LOG_IMPL_FIRST_ARG(X, ...) X
+
 /// Logs a message to a specified logger with that level.
 /// If more than one argument is provided,
 ///  the first argument is interpreted as template with {}-substitutions
@ -30,7 +82,7 @@ namespace
    auto _logger = ::getLogger(logger);                                           \
    const bool _is_clients_log = (DB::CurrentThread::getGroup() != nullptr) &&    \
        (DB::CurrentThread::getGroup()->client_logs_level >= (priority));         \
-    if (_logger->is((PRIORITY)) || _is_clients_log)                               \
+    if (_is_clients_log || _logger->is((PRIORITY)))                               \
    {                                                                             \
        std::string formatted_message = numArgs(__VA_ARGS__) > 1 ? fmt::format(__VA_ARGS__) : firstArg(__VA_ARGS__); \
        if (auto _channel = _logger->getChannel())                                \
@ -40,7 +92,7 @@ namespace
            file_function += "; ";                                                \
            file_function += __PRETTY_FUNCTION__;                                 \
            Poco::Message poco_message(_logger->name(), formatted_message,        \
-                                 (PRIORITY), file_function.c_str(), __LINE__);    \
+                (PRIORITY), file_function.c_str(), __LINE__, tryGetStaticFormatString(LOG_IMPL_FIRST_ARG(__VA_ARGS__)));    \
            _channel->log(poco_message);                                          \
        }                                                                         \
    }                                                                             \
--- a/src/Common/tests/gtest_fst.cpp
+++ b/src/Common/tests/gtest_fst.cpp
@ -0,0 +1,94 @@
+#include <string>
+#include <vector>
+
+#include <IO/WriteBufferFromVector.h>
+#include <Common/FST.h>
+#include <gtest/gtest.h>
+
+TEST(FST, SimpleTest)
+{
+    /// Terms that get indexed (listed in sorted order) with the outputs expected back from the FST
+    const std::vector<std::pair<std::string, DB::FST::Output>> present_terms
+    {
+        {"mop", 100},
+        {"moth", 91},
+        {"pop", 72},
+        {"star", 83},
+        {"stop", 54},
+        {"top", 55},
+    };
+
+    /// Prefixes and extensions of the indexed terms: none of them may be reported as found
+    const std::vector<std::pair<std::string, DB::FST::Output>> absent_terms
+    {
+        {"mo", 100},
+        {"moth1", 91},
+        {"po", 72},
+        {"star2", 83},
+        {"sto", 54},
+        {"top33", 55},
+    };
+
+    /// Serialize the FST into an in-memory blob
+    std::vector<UInt8> fst_blob;
+    DB::WriteBufferFromVector<std::vector<UInt8>> out(fst_blob);
+    DB::FST::FSTBuilder fst_builder(out);
+    for (const auto & [term, output] : present_terms)
+        fst_builder.add(term, output);
+    fst_builder.build();
+    out.finalize();
+
+    DB::FST::FiniteStateTransducer fst(fst_blob);
+    for (const auto & [term, expected_output] : present_terms)
+    {
+        const auto [actual_output, found] = fst.getOutput(term);
+        ASSERT_EQ(found, true);
+        ASSERT_EQ(actual_output, expected_output);
+    }
+
+    for (const auto & [term, ignored_output] : absent_terms)
+    {
+        const auto [ignored_result, found] = fst.getOutput(term);
+        ASSERT_EQ(found, false);
+    }
+}
+
+TEST(FST, TestForLongTerms)
+{
+    /// Terms just below and exactly at the length limit must be indexable
+    const std::string below_limit_term(DB::FST::MAX_TERM_LENGTH - 1, 'A');
+    const std::string at_limit_term(DB::FST::MAX_TERM_LENGTH, 'B');
+
+    const DB::FST::Output below_limit_output = 100;
+    const DB::FST::Output at_limit_output = 200;
+
+    std::vector<UInt8> fst_blob;
+    DB::WriteBufferFromVector<std::vector<UInt8>> out(fst_blob);
+    DB::FST::FSTBuilder fst_builder(out);
+
+    fst_builder.add(below_limit_term, below_limit_output);
+    fst_builder.add(at_limit_term, at_limit_output);
+
+    fst_builder.build();
+    out.finalize();
+
+    DB::FST::FiniteStateTransducer fst(fst_blob);
+
+    const auto [below_limit_result, below_limit_found] = fst.getOutput(below_limit_term);
+    ASSERT_EQ(below_limit_found, true);
+    ASSERT_EQ(below_limit_result, below_limit_output);
+
+    const auto [at_limit_result, at_limit_found] = fst.getOutput(at_limit_term);
+    ASSERT_EQ(at_limit_found, true);
+    ASSERT_EQ(at_limit_result, at_limit_output);
+
+    /// A term one character over the limit must be rejected with an exception
+    const std::string over_limit_term(DB::FST::MAX_TERM_LENGTH + 1, 'C');
+    const DB::FST::Output over_limit_output = 300;
+
+    std::vector<UInt8> rejected_blob;
+    DB::WriteBufferFromVector<std::vector<UInt8>> rejected_out(rejected_blob);
+    DB::FST::FSTBuilder rejecting_builder(rejected_out);
+
+    EXPECT_THROW(rejecting_builder.add(over_limit_term, over_limit_output), DB::Exception);
+}
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -182,6 +182,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(UInt64, merge_tree_max_rows_to_use_cache, (128 * 8192), "The maximum number of rows per request, to use the cache of uncompressed data. If the request is large, the cache is not used. (For large queries not to flush out the cache.)", 0) \
    M(UInt64, merge_tree_max_bytes_to_use_cache, (192 * 10 * 1024 * 1024), "The maximum number of bytes per request, to use the cache of uncompressed data. If the request is large, the cache is not used. (For large queries not to flush out the cache.)", 0) \
    M(Bool, do_not_merge_across_partitions_select_final, false, "Merge parts only in one partition in select final", 0) \
+    M(Bool, allow_experimental_inverted_index, false, "If it is set to true, allow to use experimental inverted index.", 0) \
    \
    M(UInt64, mysql_max_rows_to_insert, 65536, "The maximum number of rows in MySQL batch insertion of the MySQL storage engine", 0) \
    \
@ -773,6 +774,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(Bool, input_format_json_validate_types_from_metadata, true, "For JSON/JSONCompact/JSONColumnsWithMetadata input formats this controls whether format parser should check if data types from input metadata match data types of the corresponding columns from the table", 0) \
    M(Bool, input_format_json_read_numbers_as_strings, false, "Allow to parse numbers as strings in JSON input formats", 0) \
    M(Bool, input_format_json_read_objects_as_strings, true, "Allow to parse JSON objects as strings in JSON input formats", 0) \
+    M(Bool, input_format_json_named_tuples_as_objects, true, "Deserialize named tuple columns as JSON objects", 0) \
+    M(Bool, input_format_json_defaults_for_missing_elements_in_named_tuple, true, "Insert default value in named tuple element if it's missing in json object", 0) \
    M(Bool, input_format_try_infer_integers, true, "Try to infer integers instead of floats while schema inference in text formats", 0) \
    M(Bool, input_format_try_infer_dates, true, "Try to infer dates from string fields while schema inference in text formats", 0) \
    M(Bool, input_format_try_infer_datetimes, true, "Try to infer datetimes from string fields while schema inference in text formats", 0) \
--- a/src/Core/SettingsChangesHistory.h
+++ b/src/Core/SettingsChangesHistory.h
@ -80,7 +80,8 @@ namespace SettingsChangesHistory
 /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
 static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
 {
-    {"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"}}},
+    {"23.1", {{"input_format_json_read_objects_as_strings", 0, 1, "Enable reading nested json objects as strings while object type is experimental"},
+              {"input_format_json_defaults_for_missing_elements_in_named_tuple", false, true, "Allow missing elements in JSON objects while reading named tuples by default"}}},
    {"22.12", {{"max_size_to_preallocate_for_aggregation", 10'000'000, 100'000'000, "This optimizes performance"},
               {"query_plan_aggregation_in_order", 0, 1, "Enable some refactoring around query plan"},
               {"format_binary_max_string_size", 0, 1_GiB, "Prevent allocating large amount of memory"}}},
--- a/src/DataTypes/Serializations/SerializationTuple.cpp
+++ b/src/DataTypes/Serializations/SerializationTuple.cpp
@ -16,6 +16,7 @@ namespace ErrorCodes
 {
    extern const int SIZES_OF_COLUMNS_IN_TUPLE_DOESNT_MATCH;
    extern const int NOT_FOUND_COLUMN_IN_BLOCK;
+    extern const int INCORRECT_DATA;
 }


@ -154,7 +155,7 @@ void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, co

 void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
-    if (settings.json.named_tuples_as_objects
+    if (settings.json.write_named_tuples_as_objects
        && have_explicit_names)
    {
        writeChar('{', ostr);
@ -185,7 +186,7 @@ void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_nu

 void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const
 {
-    if (settings.json.named_tuples_as_objects
+    if (settings.json.read_named_tuples_as_objects
        && have_explicit_names)
    {
        skipWhitespaceIfAny(istr);
@ -194,12 +195,15 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr

        addElementSafe(elems.size(), column, [&]
        {
-            // Require all elements but in arbitrary order.
-            for (size_t i = 0; i < elems.size(); ++i)
+            std::vector<UInt8> seen_elements(elems.size(), 0);
+            size_t i = 0;
+            while (!istr.eof() && *istr.position() != '}')
            {
+                if (i == elems.size())
+                    throw Exception(ErrorCodes::INCORRECT_DATA, "Unexpected number of elements in named tuple. Expected no more than {}", elems.size());
+
                if (i > 0)
                {
-                    skipWhitespaceIfAny(istr);
                    assertChar(',', istr);
                    skipWhitespaceIfAny(istr);
                }
@ -211,12 +215,35 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr
                skipWhitespaceIfAny(istr);

                const size_t element_pos = getPositionByName(name);
+                seen_elements[element_pos] = 1;
                auto & element_column = extractElementColumn(column, element_pos);
                elems[element_pos]->deserializeTextJSON(element_column, istr, settings);
+
+                skipWhitespaceIfAny(istr);
+                ++i;
            }

-            skipWhitespaceIfAny(istr);
            assertChar('}', istr);
+
+            /// Check if we have missing elements.
+            if (i != elems.size())
+            {
+                for (size_t element_pos = 0; element_pos != seen_elements.size(); ++element_pos)
+                {
+                    if (seen_elements[element_pos])
+                        continue;
+
+                    if (!settings.json.defaults_for_missing_elements_in_named_tuple)
+                        throw Exception(
+                            ErrorCodes::INCORRECT_DATA,
+                            "JSON object doesn't contain tuple element {}. If you want to insert defaults in case of missing elements, "
+                            "enable setting input_format_json_defaults_for_missing_elements_in_named_tuple",
+                            elems[element_pos]->getElementName());
+
+                    auto & element_column = extractElementColumn(column, element_pos);
+                    element_column.insertDefault();
+                }
+            }
        });
    }
    else
--- a/src/DataTypes/Serializations/SerializationUUID.cpp
+++ b/src/DataTypes/Serializations/SerializationUUID.cpp
@ -1,9 +1,10 @@
-#include <DataTypes/Serializations/SerializationUUID.h>
 #include <Columns/ColumnsNumber.h>
+#include <DataTypes/Serializations/SerializationUUID.h>
 #include <Formats/ProtobufReader.h>
 #include <Formats/ProtobufWriter.h>
-#include <IO/WriteHelpers.h>
+#include <IO/ReadBufferFromString.h>
 #include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
 #include <Common/assert_cast.h>


@ -44,11 +45,44 @@ void SerializationUUID::serializeTextQuoted(const IColumn & column, size_t row_n

 void SerializationUUID::deserializeTextQuoted(IColumn & column, ReadBuffer & istr, const FormatSettings &) const
 {
-    UUID x;
-    assertChar('\'', istr);
-    readText(x, istr);
-    assertChar('\'', istr);
-    assert_cast<ColumnUUID &>(column).getData().push_back(x);    /// It's important to do this at the end - for exception safety.
+    UUID uuid;
+    bool fast = false; /// set once the UUID has been parsed directly from the buffer
+    if (istr.available() >= 38) /// 38 = opening quote + up to 36 UUID characters + closing quote
+    {
+        assertChar('\'', istr);
+        char * next_pos = find_first_symbols<'\\', '\''>(istr.position(), istr.buffer().end());
+        size_t len = next_pos - istr.position(); /// number of characters before the first backslash or quote
+        if ((len == 32) && (istr.position()[32] == '\'')) /// 32 characters followed by the closing quote: UUID without separators
+        {
+            parseUUIDWithoutSeparator(
+                reinterpret_cast<const UInt8 *>(istr.position()), std::reverse_iterator<UInt8 *>(reinterpret_cast<UInt8 *>(&uuid) + 16));
+            istr.ignore(33); /// skip the 32 characters and the closing quote
+            fast = true;
+        }
+        else if ((len == 36) && (istr.position()[36] == '\'')) /// 36 characters followed by the closing quote: handled by parseUUID
+        {
+            parseUUID(
+                reinterpret_cast<const UInt8 *>(istr.position()), std::reverse_iterator<UInt8 *>(reinterpret_cast<UInt8 *>(&uuid) + 16));
+            istr.ignore(37); /// skip the 36 characters and the closing quote
+            fast = true;
+        }
+        else
+        {
+            // It's ok to go back in the position because we haven't read from the buffer except the first char
+            // and we know there were at least 38 bytes available (so no new read has been triggered)
+            istr.position()--;
+        }
+    }
+
+    if (!fast) /// Slow path: unquote the whole literal into a temporary string and parse the UUID from it.
+    {
+        String quoted_chars;
+        readQuotedStringInto<false>(quoted_chars, istr);
+        ReadBufferFromString parsed_quoted_buffer(quoted_chars);
+        readText(uuid, parsed_quoted_buffer);
+    }
+
+    assert_cast<ColumnUUID &>(column).getData().push_back(std::move(uuid)); /// It's important to do this at the end - for exception safety.
 }

 void SerializationUUID::serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
--- a/src/Databases/DDLDependencyVisitor.cpp
+++ b/src/Databases/DDLDependencyVisitor.cpp
@ -2,6 +2,7 @@
 #include <Dictionaries/getDictionaryConfigurationFromAST.h>
 #include <Interpreters/Cluster.h>
 #include <Interpreters/Context.h>
+#include <Interpreters/misc.h>
 #include <Interpreters/InDepthNodeVisitor.h>
 #include <Interpreters/evaluateConstantExpression.h>
 #include <Interpreters/getClusterName.h>
@ -175,7 +176,7 @@ namespace
        /// Finds dependencies of a function.
        void visitFunction(const ASTFunction & function)
        {
-            if (function.name == "joinGet" || function.name == "dictHas" || function.name == "dictIsIn" || function.name.starts_with("dictGet"))
+            if (functionIsJoinGet(function.name) || functionIsDictGet(function.name))
            {
                /// dictGet('dict_name', attr_names, id_expr)
                /// dictHas('dict_name', id_expr)
--- a/src/Databases/DDLLoadingDependencyVisitor.cpp
+++ b/src/Databases/DDLLoadingDependencyVisitor.cpp
@ -1,6 +1,7 @@
 #include <Databases/DDLLoadingDependencyVisitor.h>
 #include <Dictionaries/getDictionaryConfigurationFromAST.h>
 #include <Interpreters/Context.h>
+#include <Interpreters/misc.h>
 #include <Parsers/ASTCreateQuery.h>
 #include <Parsers/ASTFunction.h>
 #include <Parsers/ASTIdentifier.h>
@ -52,23 +53,41 @@ bool DDLMatcherBase::needChildVisit(const ASTPtr & node, const ASTPtr & child)
    return true;
 }

-ssize_t DDLMatcherBase::getPositionOfTableNameArgument(const ASTFunction & function)
+ssize_t DDLMatcherBase::getPositionOfTableNameArgumentToEvaluate(const ASTFunction & function) /// Returns 0 when functionIsJoinGet or functionIsDictGet accepts the function name, -1 otherwise.
 {
-    if (function.name == "joinGet" ||
-        function.name == "dictHas" ||
-        function.name == "dictIsIn" ||
-        function.name.starts_with("dictGet"))
+    if (functionIsJoinGet(function.name) || functionIsDictGet(function.name))
         return 0;

-    if (Poco::toLower(function.name) == "in")
+    return -1;
+}
+
+ssize_t DDLMatcherBase::getPositionOfTableNameArgumentToVisit(const ASTFunction & function) /// Index of the argument to visit as a table name, or -1 if there is none.
+{
+    ssize_t maybe_res = getPositionOfTableNameArgumentToEvaluate(function); /// an argument that is evaluated is visited as well
+    if (0 <= maybe_res)
+        return maybe_res;
+
+    if (functionIsInOrGlobalInOperator(function.name))
+    {
+        if (function.children.empty())
+            return -1;
+
+        const auto * args = function.children[0]->as<ASTExpressionList>();
+        if (!args || args->children.size() != 2) /// only the two-argument form is considered
+            return -1;
+
+        if (args->children[1]->as<ASTFunction>()) /// the second argument is itself a function, so it is not treated as a table name
+            return -1;
+
         return 1;
+    }

     return -1;
 }

 void DDLLoadingDependencyVisitor::visit(const ASTFunction & function, Data & data)
 {
-    ssize_t table_name_arg_idx = getPositionOfTableNameArgument(function);
+    ssize_t table_name_arg_idx = getPositionOfTableNameArgumentToVisit(function);
    if (table_name_arg_idx < 0)
        return;
    extractTableNameFromArgument(function, data, table_name_arg_idx);
--- a/src/Databases/DDLLoadingDependencyVisitor.h
+++ b/src/Databases/DDLLoadingDependencyVisitor.h
@ -23,7 +23,8 @@ class DDLMatcherBase
 {
 public:
    static bool needChildVisit(const ASTPtr & node, const ASTPtr & child);
-    static ssize_t getPositionOfTableNameArgument(const ASTFunction & function);
+    static ssize_t getPositionOfTableNameArgumentToVisit(const ASTFunction & function);
+    static ssize_t getPositionOfTableNameArgumentToEvaluate(const ASTFunction & function);
 };

 /// Visits ASTCreateQuery and extracts the names of all tables which should be loaded before a specified table.
--- a/src/Databases/NormalizeAndEvaluateConstantsVisitor.cpp
+++ b/src/Databases/NormalizeAndEvaluateConstantsVisitor.cpp
@ -23,7 +23,7 @@ void NormalizeAndEvaluateConstants::visit(const ASTFunction & function, Data & d
 {
    /// Replace expressions like "dictGet(currentDatabase() || '.dict', 'value', toUInt32(1))"
    /// with "dictGet('db_name.dict', 'value', toUInt32(1))"
-    ssize_t table_name_arg_idx = getPositionOfTableNameArgument(function);
+    ssize_t table_name_arg_idx = getPositionOfTableNameArgumentToEvaluate(function);
    if (table_name_arg_idx < 0)
        return;

--- a/src/Dictionaries/HashedDictionary.cpp
+++ b/src/Dictionaries/HashedDictionary.cpp
@ -1,16 +1,25 @@
-#include "HashedDictionary.h"
+#include <numeric>
+#include <boost/noncopyable.hpp>

 #include <Common/ArenaUtils.h>
+#include <Common/ThreadPool.h>
+#include <Common/setThreadName.h>
+#include <Common/logger_useful.h>
+#include <Common/ConcurrentBoundedQueue.h>
+
 #include <Core/Defines.h>
-#include <DataTypes/DataTypesDecimal.h>
+
 #include <Columns/ColumnsNumber.h>
 #include <Columns/ColumnNullable.h>
+#include <DataTypes/DataTypesDecimal.h>
 #include <Functions/FunctionHelpers.h>

 #include <Dictionaries//DictionarySource.h>
 #include <Dictionaries/DictionaryFactory.h>
 #include <Dictionaries/HierarchyDictionariesUtils.h>
-#include <Common/logger_useful.h>
+
+#include "HashedDictionary.h"
+

 namespace
 {
@ -35,16 +44,154 @@ namespace ErrorCodes
    extern const int BAD_ARGUMENTS;
    extern const int DICTIONARY_IS_EMPTY;
    extern const int UNSUPPORTED_METHOD;
+    extern const int LOGICAL_ERROR;
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-HashedDictionary<dictionary_key_type, sparse>::HashedDictionary(
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded> class HashedDictionary;
+
+/// Parallel loader for sharded dictionaries: one worker thread and one bounded queue per shard.
+///
+/// Intended usage (see HashedDictionary::loadData()): construct the loader, feed every
+/// source block through addBlock(), then call finish() to wait for the workers.
+/// The constructor starts the workers; the destructor finishes the queues and joins the pool.
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+class ParallelDictionaryLoader : public boost::noncopyable
+{
+    using HashedDictionary = HashedDictionary<dictionary_key_type, sparse, sharded>;
+
+public:
+    /// Creates one queue per shard (capacity is configuration.shard_load_queue_backlog)
+    /// and schedules one threadWorker() per shard on the pool.
+    explicit ParallelDictionaryLoader(HashedDictionary & dictionary_)
+        : dictionary(dictionary_)
+        , shards(dictionary.configuration.shards)
+        , pool(shards)
+        , shards_queues(shards)
+    {
+        UInt64 backlog = dictionary.configuration.shard_load_queue_backlog;
+        LOG_TRACE(dictionary.log, "Will load the dictionary using {} threads (with {} backlog)", shards, backlog);
+
+        /// Identity mapping shard -> slot (0, 1, ..., shards - 1), used when building the selector.
+        shards_slots.resize(shards);
+        std::iota(shards_slots.begin(), shards_slots.end(), 0);
+
+        for (size_t shard = 0; shard < shards; ++shard)
+        {
+            shards_queues[shard].emplace(backlog);
+            pool.scheduleOrThrowOnError([this, shard, thread_group = CurrentThread::getGroup()]
+            {
+                if (thread_group)
+                    CurrentThread::attachToIfDetached(thread_group);
+                setThreadName("HashedDictLoad");
+
+                threadWorker(shard);
+            });
+        }
+    }
+
+    /// Splits `block` by shard and pushes every part into the queue of its shard.
+    /// Throws LOGICAL_ERROR if a queue rejects the push.
+    void addBlock(Block block)
+    {
+        IColumn::Selector selector = createShardSelector(block, shards_slots);
+        Blocks shards_blocks = splitBlock(selector, block);
+
+        for (size_t shard = 0; shard < shards; ++shard)
+        {
+            if (!shards_queues[shard]->push(std::move(shards_blocks[shard])))
+                throw Exception(ErrorCodes::LOGICAL_ERROR, "Could not push to shards queue #{}", shard);
+        }
+    }
+
+    /// Marks all queues as finished and waits until the workers have processed what is left in them.
+    void finish()
+    {
+        for (auto & queue : shards_queues)
+            queue->finish();
+
+        Stopwatch watch;
+        pool.wait();
+        UInt64 elapsed_ms = watch.elapsedMilliseconds();
+        LOG_TRACE(dictionary.log, "Processing the tail took {}ms", elapsed_ms);
+    }
+
+    /// Calls clearAndFinish() on every queue and joins the workers,
+    /// so that no worker outlives the loader (workers capture `this`).
+    ~ParallelDictionaryLoader()
+    {
+        for (auto & queue : shards_queues)
+            queue->clearAndFinish();
+        pool.wait();
+    }
+
+private:
+    HashedDictionary & dictionary;
+    const size_t shards;
+    ThreadPool pool;
+    std::vector<std::optional<ConcurrentBoundedQueue<Block>>> shards_queues;
+    std::vector<UInt64> shards_slots;
+    /// Arena for key extraction in createShardSelector() (i.e. on the thread calling addBlock()).
+    /// Workers use their own holder, see threadWorker().
+    DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
+
+    /// Body of a per-shard worker: pops blocks from the shard queue and inserts them into the dictionary
+    /// until pop() fails, then verifies that the queue was finished rather than broken.
+    void threadWorker(size_t shard)
+    {
+        Block block;
+        /// A separate holder per worker: the member holder belongs to the producer side.
+        DictionaryKeysArenaHolder<dictionary_key_type> worker_arena_holder;
+        auto & shard_queue = *shards_queues[shard];
+
+        while (shard_queue.pop(block))
+        {
+            Stopwatch watch;
+            dictionary.blockToAttributes(block, worker_arena_holder, shard);
+            UInt64 elapsed_ms = watch.elapsedMilliseconds();
+            if (elapsed_ms > 1'000)
+                LOG_TRACE(dictionary.log, "Block processing for shard #{} is slow {}ms (rows {}).", shard, elapsed_ms, block.rows());
+        }
+
+        if (!shard_queue.isFinished())
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Could not pull non finished shards queue #{}", shard);
+    }
+
+    /// Split block to shards smaller block, using 'selector'.
+    /// Returns `shards` blocks with the same structure as `block`; every column is scattered by `selector`.
+    Blocks splitBlock(const IColumn::Selector & selector, const Block & block)
+    {
+        Blocks out_blocks(shards);
+        for (size_t shard = 0; shard < shards; ++shard)
+            out_blocks[shard] = block.cloneEmpty();
+
+        size_t columns = block.columns();
+        for (size_t col = 0; col < columns; ++col)
+        {
+            MutableColumns splitted_columns = block.getByPosition(col).column->scatter(shards, selector);
+            for (size_t shard = 0; shard < shards; ++shard)
+                out_blocks[shard].getByPosition(col).column = std::move(splitted_columns[shard]);
+        }
+
+        return out_blocks;
+    }
+
+    /// Builds a selector with one entry per row of `block`: the slot of the shard
+    /// that dictionary.getShard() assigns to the row's key.
+    /// The key columns are the first dict_struct.getKeysSize() columns of the block.
+    IColumn::Selector createShardSelector(const Block & block, const std::vector<UInt64> & slots)
+    {
+        size_t num_rows = block.rows();
+        IColumn::Selector selector(num_rows);
+
+        size_t skip_keys_size_offset = dictionary.dict_struct.getKeysSize();
+        Columns key_columns;
+        key_columns.reserve(skip_keys_size_offset);
+        for (size_t i = 0; i < skip_keys_size_offset; ++i)
+            key_columns.emplace_back(block.safeGetByPosition(i).column);
+
+        DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns, arena_holder.getComplexKeyArena());
+        for (size_t i = 0; i < num_rows; ++i)
+        {
+            auto key = keys_extractor.extractCurrentKey();
+            size_t shard = dictionary.getShard(key);
+            selector[i] = slots[shard];
+            keys_extractor.rollbackCurrentKey();
+        }
+
+        return selector;
+    }
+};
+
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+HashedDictionary<dictionary_key_type, sparse, sharded>::HashedDictionary(
    const StorageID & dict_id_,
    const DictionaryStructure & dict_struct_,
    DictionarySourcePtr source_ptr_,
    const HashedDictionaryStorageConfiguration & configuration_,
    BlockPtr update_field_loaded_block_)
    : IDictionary(dict_id_)
+    , log(&Poco::Logger::get("HashedDictionary"))
    , dict_struct(dict_struct_)
    , source_ptr(std::move(source_ptr_))
    , configuration(configuration_)
@ -56,8 +203,74 @@ HashedDictionary<dictionary_key_type, sparse>::HashedDictionary(
    calculateBytesAllocated();
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getColumn(
+/// Destroys the hash tables of a sharded dictionary in parallel (one job per non-empty hash table).
+///
+/// For non-sharded dictionaries this does nothing and the members are destroyed sequentially as usual.
+/// Any exception from the parallel path is logged and swallowed; whatever was not cleared
+/// is then released by the regular member destructors.
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+HashedDictionary<dictionary_key_type, sparse, sharded>::~HashedDictionary()
+try
+{
+    /// Do a regular sequential destroy in case of non sharded dictionary
+    ///
+    /// Note, that even in non-sharded dictionaries you can have multiple hash
+    /// tables, since each attribute is stored in a separate hash table.
+    if constexpr (!sharded)
+        return;
+
+    size_t shards = std::max<size_t>(configuration.shards, 1);
+    size_t attributes_tables = std::max<size_t>(attributes.size(), 1 /* no_attributes_containers */);
+    ThreadPool pool(shards * attributes_tables);
+
+    size_t hash_tables_count = 0;
+    /// Schedules clearing of one hash table; empty tables are skipped since there is nothing to free.
+    auto schedule_destroy = [&hash_tables_count, &pool](auto & container)
+    {
+        if (container.empty())
+            return;
+
+        /// `container` is captured by reference: it is a member of *this and pool.wait() below
+        /// completes before the members are destroyed.
+        pool.scheduleOrThrowOnError([&container, thread_group = CurrentThread::getGroup()]
+        {
+            if (thread_group)
+                CurrentThread::attachToIfDetached(thread_group);
+            setThreadName("HashedDictDtor");
+
+            if constexpr (sparse)
+                container.clear();
+            else
+                container.clearAndShrink();
+        });
+
+        ++hash_tables_count;
+    };
+
+    if (attributes.empty())
+    {
+        for (size_t shard = 0; shard < shards; ++shard)
+        {
+            schedule_destroy(no_attributes_containers[shard]);
+        }
+    }
+    else
+    {
+        for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index)
+        {
+            getAttributeContainer(attribute_index, [&](auto & containers)
+            {
+                for (size_t shard = 0; shard < shards; ++shard)
+                {
+                    schedule_destroy(containers[shard]);
+                }
+            });
+        }
+    }
+
+    LOG_TRACE(log, "Destroying {} non empty hash tables (using {} threads)", hash_tables_count, pool.getMaxThreads());
+    pool.wait();
+    LOG_TRACE(log, "Hash tables destroyed");
+}
+catch (...)
+{
+    tryLogCurrentException("HashedDictionary", "Error while destroying dictionary in parallel, will do a sequential destroy.");
+
+    /// The explicit return is required: reaching the end of a handler of a destructor's
+    /// function-try-block rethrows the exception, and since destructors are noexcept
+    /// by default that would call std::terminate() instead of falling back to the sequential destroy.
+    return;
+}
+
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+ColumnPtr HashedDictionary<dictionary_key_type, sparse, sharded>::getColumn(
    const std::string & attribute_name,
    const DataTypePtr & result_type,
    const Columns & key_columns,
@ -163,8 +376,8 @@ ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getColumn(
    return result;
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse>::hasKeys(const Columns & key_columns, const DataTypes & key_types) const
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse, sharded>::hasKeys(const Columns & key_columns, const DataTypes & key_types) const
 {
    if (dictionary_key_type == DictionaryKeyType::Complex)
        dict_struct.validateKeyTypes(key_types);
@ -183,8 +396,9 @@ ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse>::hasKeys(const Co
    {
        for (size_t requested_key_index = 0; requested_key_index < keys_size; ++requested_key_index)
        {
-            auto requested_key = extractor.extractCurrentKey();
-            out[requested_key_index] = no_attributes_container.find(requested_key) != no_attributes_container.end();
+            auto key = extractor.extractCurrentKey();
+            const auto & container = no_attributes_containers[getShard(key)];
+            out[requested_key_index] = container.find(key) != container.end();
            keys_found += out[requested_key_index];
            extractor.rollbackCurrentKey();
        }
@ -197,18 +411,19 @@ ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse>::hasKeys(const Co
    const auto & attribute = attributes.front();
    bool is_attribute_nullable = attribute.is_nullable_set.has_value();

-    getAttributeContainer(0, [&](const auto & container)
+    getAttributeContainer(0, [&](const auto & containers)
    {
        for (size_t requested_key_index = 0; requested_key_index < keys_size; ++requested_key_index)
        {
-            auto requested_key = extractor.extractCurrentKey();
+            auto key = extractor.extractCurrentKey();
+            const auto & container = containers[getShard(key)];

-            out[requested_key_index] = container.find(requested_key) != container.end();
+            out[requested_key_index] = container.find(key) != container.end();

            keys_found += out[requested_key_index];

            if (is_attribute_nullable && !out[requested_key_index])
-                out[requested_key_index] = attribute.is_nullable_set->find(requested_key) != nullptr;
+                out[requested_key_index] = attribute.is_nullable_set->find(key) != nullptr;

            extractor.rollbackCurrentKey();
        }
@ -220,8 +435,8 @@ ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse>::hasKeys(const Co
    return result;
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getHierarchy(ColumnPtr key_column [[maybe_unused]], const DataTypePtr &) const
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+ColumnPtr HashedDictionary<dictionary_key_type, sparse, sharded>::getHierarchy(ColumnPtr key_column [[maybe_unused]], const DataTypePtr &) const
 {
    if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
    {
@ -238,14 +453,15 @@ ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getHierarchy(ColumnPtr
        if (!dictionary_attribute.null_value.isNull())
            null_value = dictionary_attribute.null_value.get<UInt64>();

-        const CollectionType<UInt64> & child_key_to_parent_key_map = std::get<CollectionType<UInt64>>(hierarchical_attribute.container);
+        const CollectionsHolder<UInt64> & child_key_to_parent_key_maps = std::get<CollectionsHolder<UInt64>>(hierarchical_attribute.containers);

        auto is_key_valid_func = [&](auto & hierarchy_key)
        {
            if (unlikely(hierarchical_attribute.is_nullable_set) && hierarchical_attribute.is_nullable_set->find(hierarchy_key))
                return true;

-            return child_key_to_parent_key_map.find(hierarchy_key) != child_key_to_parent_key_map.end();
+            const auto & map = child_key_to_parent_key_maps[getShard(hierarchy_key)];
+            return map.find(hierarchy_key) != map.end();
        };

        size_t keys_found = 0;
@ -254,9 +470,9 @@ ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getHierarchy(ColumnPtr
        {
            std::optional<UInt64> result;

-            auto it = child_key_to_parent_key_map.find(hierarchy_key);
-
-            if (it == child_key_to_parent_key_map.end())
+            const auto & map = child_key_to_parent_key_maps[getShard(hierarchy_key)];
+            auto it = map.find(hierarchy_key);
+            if (it == map.end())
                return result;

            UInt64 parent_key = getValueFromCell(it);
@ -282,8 +498,8 @@ ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getHierarchy(ColumnPtr
    }
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse>::isInHierarchy(
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse, sharded>::isInHierarchy(
    ColumnPtr key_column [[maybe_unused]],
    ColumnPtr in_key_column [[maybe_unused]],
    const DataTypePtr &) const
@ -309,14 +525,15 @@ ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse>::isInHierarchy(
        if (!dictionary_attribute.null_value.isNull())
            null_value = dictionary_attribute.null_value.get<UInt64>();

-        const CollectionType<UInt64> & child_key_to_parent_key_map = std::get<CollectionType<UInt64>>(hierarchical_attribute.container);
+        const CollectionsHolder<UInt64> & child_key_to_parent_key_maps = std::get<CollectionsHolder<UInt64>>(hierarchical_attribute.containers);

        auto is_key_valid_func = [&](auto & hierarchy_key)
        {
            if (unlikely(hierarchical_attribute.is_nullable_set) && hierarchical_attribute.is_nullable_set->find(hierarchy_key))
                return true;

-            return child_key_to_parent_key_map.find(hierarchy_key) != child_key_to_parent_key_map.end();
+            const auto & map = child_key_to_parent_key_maps[getShard(hierarchy_key)];
+            return map.find(hierarchy_key) != map.end();
        };

        size_t keys_found = 0;
@ -325,9 +542,9 @@ ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse>::isInHierarchy(
        {
            std::optional<UInt64> result;

-            auto it = child_key_to_parent_key_map.find(hierarchy_key);
-
-            if (it == child_key_to_parent_key_map.end())
+            const auto & map = child_key_to_parent_key_maps[getShard(hierarchy_key)];
+            auto it = map.find(hierarchy_key);
+            if (it == map.end())
                return result;

            UInt64 parent_key = getValueFromCell(it);
@ -351,8 +568,8 @@ ColumnUInt8::Ptr HashedDictionary<dictionary_key_type, sparse>::isInHierarchy(
        return nullptr;
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-DictionaryHierarchyParentToChildIndexPtr HashedDictionary<dictionary_key_type, sparse>::getHierarchicalIndex() const
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+DictionaryHierarchyParentToChildIndexPtr HashedDictionary<dictionary_key_type, sparse, sharded>::getHierarchicalIndex() const
 {
    if constexpr (dictionary_key_type == DictionaryKeyType::Simple)
    {
@ -361,13 +578,22 @@ DictionaryHierarchyParentToChildIndexPtr HashedDictionary<dictionary_key_type, s

        size_t hierarchical_attribute_index = *dict_struct.hierarchical_attribute_index;
        const auto & hierarchical_attribute = attributes[hierarchical_attribute_index];
-        const CollectionType<UInt64> & child_key_to_parent_key_map = std::get<CollectionType<UInt64>>(hierarchical_attribute.container);
+        const CollectionsHolder<UInt64> & child_key_to_parent_key_maps = std::get<CollectionsHolder<UInt64>>(hierarchical_attribute.containers);
+
+        size_t size = 0;
+        for (const auto & map : child_key_to_parent_key_maps)
+            size += map.size();

        HashMap<UInt64, PaddedPODArray<UInt64>> parent_to_child;
-        parent_to_child.reserve(child_key_to_parent_key_map.size());
+        parent_to_child.reserve(size);

-        for (const auto & [child_key, parent_key] : child_key_to_parent_key_map)
-            parent_to_child[parent_key].emplace_back(child_key);
+        for (const auto & map : child_key_to_parent_key_maps)
+        {
+            for (const auto & [child_key, parent_key] : map)
+            {
+                parent_to_child[parent_key].emplace_back(child_key);
+            }
+        }

        return std::make_shared<DictionaryHierarchicalParentToChildIndex>(parent_to_child);
    }
@ -377,8 +603,8 @@ DictionaryHierarchyParentToChildIndexPtr HashedDictionary<dictionary_key_type, s
    }
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getDescendants(
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+ColumnPtr HashedDictionary<dictionary_key_type, sparse, sharded>::getDescendants(
    ColumnPtr key_column [[maybe_unused]],
    const DataTypePtr &,
    size_t level [[maybe_unused]],
@ -403,8 +629,8 @@ ColumnPtr HashedDictionary<dictionary_key_type, sparse>::getDescendants(
    }
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-void HashedDictionary<dictionary_key_type, sparse>::createAttributes()
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+void HashedDictionary<dictionary_key_type, sparse, sharded>::createAttributes()
 {
    const auto size = dict_struct.attributes.size();
    attributes.reserve(size);
@ -418,16 +644,25 @@ void HashedDictionary<dictionary_key_type, sparse>::createAttributes()
            using ValueType = DictionaryValueType<AttributeType>;

            auto is_nullable_set = dictionary_attribute.is_nullable ? std::make_optional<NullableSet>() : std::optional<NullableSet>{};
-            Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), CollectionType<ValueType>()};
+            Attribute attribute{dictionary_attribute.underlying_type, std::move(is_nullable_set), CollectionsHolder<ValueType>(configuration.shards)};
            attributes.emplace_back(std::move(attribute));
        };

        callOnDictionaryAttributeType(dictionary_attribute.underlying_type, type_call);
    }
+
+    if (unlikely(attributes.size()) == 0)
+    {
+        no_attributes_containers.resize(configuration.shards);
+    }
+
+    string_arenas.resize(configuration.shards);
+    for (auto & arena : string_arenas)
+        arena = std::make_unique<Arena>();
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-void HashedDictionary<dictionary_key_type, sparse>::updateData()
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+void HashedDictionary<dictionary_key_type, sparse, sharded>::updateData()
 {
    /// NOTE: updateData() does not preallocation since it may increase memory usage.

@ -465,14 +700,16 @@ void HashedDictionary<dictionary_key_type, sparse>::updateData()
    if (update_field_loaded_block)
    {
        resize(update_field_loaded_block->rows());
-        blockToAttributes(*update_field_loaded_block.get());
+        DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
+        blockToAttributes(*update_field_loaded_block.get(), arena_holder, /* shard= */ 0);
    }
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-void HashedDictionary<dictionary_key_type, sparse>::blockToAttributes(const Block & block [[maybe_unused]])
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+void HashedDictionary<dictionary_key_type, sparse, sharded>::blockToAttributes(const Block & block, DictionaryKeysArenaHolder<dictionary_key_type> & arena_holder, UInt64 shard)
 {
    size_t skip_keys_size_offset = dict_struct.getKeysSize();
+    size_t new_element_count = 0;

    Columns key_columns;
    key_columns.reserve(skip_keys_size_offset);
@ -481,7 +718,6 @@ void HashedDictionary<dictionary_key_type, sparse>::blockToAttributes(const Bloc
    for (size_t i = 0; i < skip_keys_size_offset; ++i)
        key_columns.emplace_back(block.safeGetByPosition(i).column);

-    DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
    DictionaryKeysExtractor<dictionary_key_type> keys_extractor(key_columns, arena_holder.getComplexKeyArena());
    const size_t keys_size = keys_extractor.getKeysSize();

@ -496,12 +732,14 @@ void HashedDictionary<dictionary_key_type, sparse>::blockToAttributes(const Bloc
            auto key = keys_extractor.extractCurrentKey();

            if constexpr (std::is_same_v<KeyType, StringRef>)
-                key = copyStringInArena(string_arena, key);
+                key = copyStringInArena(*string_arenas[shard], key);

-            no_attributes_container.insert(key);
+            no_attributes_containers[shard].insert(key);
            keys_extractor.rollbackCurrentKey();
+            ++new_element_count;
        }

+        element_count += new_element_count;
        return;
    }

@ -511,14 +749,15 @@ void HashedDictionary<dictionary_key_type, sparse>::blockToAttributes(const Bloc
        auto & attribute = attributes[attribute_index];
        bool attribute_is_nullable = attribute.is_nullable_set.has_value();

-        getAttributeContainer(attribute_index, [&](auto & container)
+        getAttributeContainer(attribute_index, [&](auto & containers)
        {
-            using ContainerType = std::decay_t<decltype(container)>;
+            using ContainerType = std::decay_t<decltype(containers.front())>;
            using AttributeValueType = typename ContainerType::mapped_type;

            for (size_t key_index = 0; key_index < keys_size; ++key_index)
            {
                auto key = keys_extractor.extractCurrentKey();
+                auto & container = containers[shard];

                auto it = container.find(key);
                bool key_is_nullable_and_already_exists = attribute_is_nullable && attribute.is_nullable_set->find(key) != nullptr;
@ -530,7 +769,7 @@ void HashedDictionary<dictionary_key_type, sparse>::blockToAttributes(const Bloc
                }

                if constexpr (std::is_same_v<KeyType, StringRef>)
-                    key = copyStringInArena(string_arena, key);
+                    key = copyStringInArena(*string_arenas[shard], key);

                attribute_column.get(key_index, column_value_to_insert);

@ -544,7 +783,7 @@ void HashedDictionary<dictionary_key_type, sparse>::blockToAttributes(const Bloc
                if constexpr (std::is_same_v<AttributeValueType, StringRef>)
                {
                    String & value_to_insert = column_value_to_insert.get<String>();
-                    StringRef arena_value = copyStringInArena(string_arena, value_to_insert);
+                    StringRef arena_value = copyStringInArena(*string_arenas[shard], value_to_insert);
                    container.insert({key, arena_value});
                }
                else
@ -553,7 +792,7 @@ void HashedDictionary<dictionary_key_type, sparse>::blockToAttributes(const Bloc
                    container.insert({key, value_to_insert});
                }

-                ++element_count;
+                ++new_element_count;

                keys_extractor.rollbackCurrentKey();
            }
@ -561,51 +800,58 @@ void HashedDictionary<dictionary_key_type, sparse>::blockToAttributes(const Bloc
            keys_extractor.reset();
        });
    }
+
+    element_count += new_element_count;
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-void HashedDictionary<dictionary_key_type, sparse>::resize(size_t added_rows)
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+void HashedDictionary<dictionary_key_type, sparse, sharded>::resize(size_t added_rows)
 {
    if (unlikely(!added_rows))
        return;

+    /// In multi shards configuration it is pointless.
+    if constexpr (sharded)
+        return;
+
    size_t attributes_size = attributes.size();

    if (unlikely(attributes_size == 0))
    {
-        size_t reserve_size = added_rows + no_attributes_container.size();
+        size_t reserve_size = added_rows + no_attributes_containers.front().size();

        if constexpr (sparse)
-            no_attributes_container.resize(reserve_size);
+            no_attributes_containers.front().resize(reserve_size);
        else
-            no_attributes_container.reserve(reserve_size);
+            no_attributes_containers.front().reserve(reserve_size);

        return;
    }

    for (size_t attribute_index = 0; attribute_index < attributes_size; ++attribute_index)
    {
-        getAttributeContainer(attribute_index, [added_rows](auto & attribute_map)
+        getAttributeContainer(attribute_index, [added_rows](auto & containers)
        {
-            size_t reserve_size = added_rows + attribute_map.size();
+            auto & container = containers.front();
+            size_t reserve_size = added_rows + container.size();

            if constexpr (sparse)
-                attribute_map.resize(reserve_size);
+                container.resize(reserve_size);
            else
-                attribute_map.reserve(reserve_size);
+                container.reserve(reserve_size);
        });
    }
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
 template <typename AttributeType, bool is_nullable, typename ValueSetter, typename DefaultValueExtractor>
-void HashedDictionary<dictionary_key_type, sparse>::getItemsImpl(
+void HashedDictionary<dictionary_key_type, sparse, sharded>::getItemsImpl(
    const Attribute & attribute,
    DictionaryKeysExtractor<dictionary_key_type> & keys_extractor,
    ValueSetter && set_value [[maybe_unused]],
    DefaultValueExtractor & default_value_extractor) const
 {
-    const auto & attribute_container = std::get<CollectionType<AttributeType>>(attribute.container);
+    const auto & attribute_containers = std::get<CollectionsHolder<AttributeType>>(attribute.containers);
    const size_t keys_size = keys_extractor.getKeysSize();

    size_t keys_found = 0;
@ -614,9 +860,10 @@ void HashedDictionary<dictionary_key_type, sparse>::getItemsImpl(
    {
        auto key = keys_extractor.extractCurrentKey();

-        const auto it = attribute_container.find(key);
+        const auto & container = attribute_containers[getShard(key)];
+        const auto it = container.find(key);

-        if (it != attribute_container.end())
+        if (it != container.end())
        {
            set_value(key_index, getValueFromCell(it), false);
            ++keys_found;
@ -639,11 +886,15 @@ void HashedDictionary<dictionary_key_type, sparse>::getItemsImpl(
    found_count.fetch_add(keys_found, std::memory_order_relaxed);
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-void HashedDictionary<dictionary_key_type, sparse>::loadData()
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+void HashedDictionary<dictionary_key_type, sparse, sharded>::loadData()
 {
    if (!source_ptr->hasUpdateField())
    {
+        std::optional<ParallelDictionaryLoader<dictionary_key_type, sparse, sharded>> parallel_loader;
+        if constexpr (sharded)
+            parallel_loader.emplace(*this);
+
        std::atomic<size_t> new_size = 0;

        QueryPipeline pipeline;
@ -654,6 +905,8 @@ void HashedDictionary<dictionary_key_type, sparse>::loadData()

        PullingPipelineExecutor executor(pipeline);
        Block block;
+        DictionaryKeysArenaHolder<dictionary_key_type> arena_holder;
+
        while (executor.pull(block))
        {
            if (configuration.preallocate && new_size)
@ -661,7 +914,7 @@ void HashedDictionary<dictionary_key_type, sparse>::loadData()
                size_t current_new_size = new_size.exchange(0);
                if (current_new_size)
                {
-                    LOG_TRACE(&Poco::Logger::get("HashedDictionary"), "Preallocated {} elements", current_new_size);
+                    LOG_TRACE(log, "Preallocated {} elements", current_new_size);
                    resize(current_new_size);
                }
            }
@ -670,8 +923,14 @@ void HashedDictionary<dictionary_key_type, sparse>::loadData()
                resize(block.rows());
            }

-            blockToAttributes(block);
+            if (parallel_loader)
+                parallel_loader->addBlock(block);
+            else
+                blockToAttributes(block, arena_holder, /* shard= */ 0);
        }
+
+        if (parallel_loader)
+            parallel_loader->finish();
    }
    else
    {
@ -684,8 +943,8 @@ void HashedDictionary<dictionary_key_type, sparse>::loadData()
            getFullName());
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-void HashedDictionary<dictionary_key_type, sparse>::buildHierarchyParentToChildIndexIfNeeded()
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+void HashedDictionary<dictionary_key_type, sparse, sharded>::buildHierarchyParentToChildIndexIfNeeded()
 {
    if (!dict_struct.hierarchical_attribute_index)
        return;
@ -694,34 +953,37 @@ void HashedDictionary<dictionary_key_type, sparse>::buildHierarchyParentToChildI
        hierarchical_index = getHierarchicalIndex();
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-void HashedDictionary<dictionary_key_type, sparse>::calculateBytesAllocated()
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+void HashedDictionary<dictionary_key_type, sparse, sharded>::calculateBytesAllocated()
 {
    size_t attributes_size = attributes.size();
    bytes_allocated += attributes_size * sizeof(attributes.front());

    for (size_t i = 0; i < attributes_size; ++i)
    {
-        getAttributeContainer(i, [&](const auto & container)
+        getAttributeContainer(i, [&](const auto & containers)
        {
-            using ContainerType = std::decay_t<decltype(container)>;
-            using AttributeValueType = typename ContainerType::mapped_type;
-
-            bytes_allocated += sizeof(container);
-
-            if constexpr (sparse || std::is_same_v<AttributeValueType, Field>)
+            for (const auto & container : containers)
            {
-                /// bucket_count() - Returns table size, that includes empty and deleted
-                /// size()         - Returns table size, without empty and deleted
-                /// and since this is sparsehash, empty cells should not be significant,
-                /// and since items cannot be removed from the dictionary, deleted is also not important.
-                bytes_allocated += container.size() * (sizeof(KeyType) + sizeof(AttributeValueType));
-                bucket_count = container.bucket_count();
-            }
-            else
-            {
-                bytes_allocated += container.getBufferSizeInBytes();
-                bucket_count = container.getBufferSizeInCells();
+                using ContainerType = std::decay_t<decltype(container)>;
+                using AttributeValueType = typename ContainerType::mapped_type;
+
+                bytes_allocated += sizeof(container);
+
+                if constexpr (sparse || std::is_same_v<AttributeValueType, Field>)
+                {
+                    /// bucket_count() - Returns table size, that includes empty and deleted
+                    /// size()         - Returns table size, without empty and deleted
+                    /// and since this is sparsehash, empty cells should not be significant,
+                    /// and since items cannot be removed from the dictionary, deleted is also not important.
+                    bytes_allocated += container.size() * (sizeof(KeyType) + sizeof(AttributeValueType));
+                    bucket_count = container.bucket_count();
+                }
+                else
+                {
+                    bytes_allocated += container.getBufferSizeInBytes();
+                    bucket_count = container.getBufferSizeInCells();
+                }
            }
        });

@ -733,17 +995,20 @@ void HashedDictionary<dictionary_key_type, sparse>::calculateBytesAllocated()

    if (unlikely(attributes_size == 0))
    {
-        bytes_allocated += sizeof(no_attributes_container);
+        for (const auto & container : no_attributes_containers)
+        {
+            bytes_allocated += sizeof(container);

-        if constexpr (sparse)
-        {
-            bytes_allocated += no_attributes_container.size() * (sizeof(KeyType));
-            bucket_count = no_attributes_container.bucket_count();
-        }
-        else
-        {
-            bytes_allocated += no_attributes_container.getBufferSizeInBytes();
-            bucket_count = no_attributes_container.getBufferSizeInCells();
+            if constexpr (sparse)
+            {
+                bytes_allocated += container.size() * (sizeof(KeyType));
+                bucket_count = container.bucket_count();
+            }
+            else
+            {
+                bytes_allocated += container.getBufferSizeInBytes();
+                bucket_count = container.getBufferSizeInCells();
+            }
        }
    }

@ -756,48 +1021,55 @@ void HashedDictionary<dictionary_key_type, sparse>::calculateBytesAllocated()
        bytes_allocated += hierarchical_index_bytes_allocated;
    }

-    bytes_allocated += string_arena.size();
+    for (const auto & arena : string_arenas)
+        bytes_allocated += arena->size();
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
-Pipe HashedDictionary<dictionary_key_type, sparse>::read(const Names & column_names, size_t max_block_size, size_t num_streams) const
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+Pipe HashedDictionary<dictionary_key_type, sparse, sharded>::read(const Names & column_names, size_t max_block_size, size_t num_streams) const
 {
    PaddedPODArray<HashedDictionary::KeyType> keys;

+    /// NOTE: could read multiple shards in parallel
    if (!attributes.empty())
    {
        const auto & attribute = attributes.front();

-        getAttributeContainer(0, [&](auto & container)
+        getAttributeContainer(0, [&](auto & containers)
        {
-            keys.reserve(container.size());
-
-            for (const auto & [key, value] : container)
+            for (const auto & container : containers)
            {
-                (void)(value);
-                keys.emplace_back(key);
-            }
+                keys.reserve(container.size());

-            if (attribute.is_nullable_set)
-            {
-                const auto & is_nullable_set = *attribute.is_nullable_set;
-                keys.reserve(is_nullable_set.size());
+                for (const auto & [key, _] : container)
+                {
+                    keys.emplace_back(key);
+                }

-                for (auto & node : is_nullable_set)
-                    keys.emplace_back(node.getKey());
+                if (attribute.is_nullable_set)
+                {
+                    const auto & is_nullable_set = *attribute.is_nullable_set;
+                    keys.reserve(is_nullable_set.size());
+
+                    for (auto & node : is_nullable_set)
+                        keys.emplace_back(node.getKey());
+                }
            }
        });
    }
    else
    {
-        keys.reserve(no_attributes_container.size());
-
-        for (const auto & key : no_attributes_container)
+        for (const auto & container : no_attributes_containers)
        {
-            if constexpr (sparse)
-                keys.emplace_back(key);
-            else
-                keys.emplace_back(key.getKey());
+            keys.reserve(keys.size() + container.size());
+
+            for (const auto & key : container)
+            {
+                if constexpr (sparse)
+                    keys.emplace_back(key);
+                else
+                    keys.emplace_back(key.getKey());
+            }
        }
    }

@ -820,9 +1092,9 @@ Pipe HashedDictionary<dictionary_key_type, sparse>::read(const Names & column_na
    return result;
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
 template <typename GetContainerFunc>
-void HashedDictionary<dictionary_key_type, sparse>::getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func)
+void HashedDictionary<dictionary_key_type, sparse, sharded>::getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func)
 {
    assert(attribute_index < attributes.size());

@ -834,16 +1106,16 @@ void HashedDictionary<dictionary_key_type, sparse>::getAttributeContainer(size_t
        using AttributeType = typename Type::AttributeType;
        using ValueType = DictionaryValueType<AttributeType>;

-        auto & attribute_container = std::get<CollectionType<ValueType>>(attribute.container);
-        std::forward<GetContainerFunc>(get_container_func)(attribute_container);
+        auto & attribute_containers = std::get<CollectionsHolder<ValueType>>(attribute.containers);
+        std::forward<GetContainerFunc>(get_container_func)(attribute_containers);
    };

    callOnDictionaryAttributeType(attribute.type, type_call);
 }

-template <DictionaryKeyType dictionary_key_type, bool sparse>
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
 template <typename GetContainerFunc>
-void HashedDictionary<dictionary_key_type, sparse>::getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) const
+void HashedDictionary<dictionary_key_type, sparse, sharded>::getAttributeContainer(size_t attribute_index, GetContainerFunc && get_container_func) const
 {
    const_cast<std::decay_t<decltype(*this)> *>(this)->getAttributeContainer(attribute_index, [&](auto & attribute_container)
    {
@ -851,10 +1123,14 @@ void HashedDictionary<dictionary_key_type, sparse>::getAttributeContainer(size_t
    });
 }

-template class HashedDictionary<DictionaryKeyType::Simple, true>;
-template class HashedDictionary<DictionaryKeyType::Simple, false>;
-template class HashedDictionary<DictionaryKeyType::Complex, true>;
-template class HashedDictionary<DictionaryKeyType::Complex, false>;
+template class HashedDictionary<DictionaryKeyType::Simple, false, false>;
+template class HashedDictionary<DictionaryKeyType::Simple, false, true>;
+template class HashedDictionary<DictionaryKeyType::Simple, true, false>;
+template class HashedDictionary<DictionaryKeyType::Simple, true, true>;
+template class HashedDictionary<DictionaryKeyType::Complex, false, false>;
+template class HashedDictionary<DictionaryKeyType::Complex, false, true>;
+template class HashedDictionary<DictionaryKeyType::Complex, true, false>;
+template class HashedDictionary<DictionaryKeyType::Complex, true, true>;

 void registerDictionaryHashed(DictionaryFactory & factory)
 {
@ -883,32 +1159,76 @@ void registerDictionaryHashed(DictionaryFactory & factory)

        std::string dictionary_layout_name;

-        if (dictionary_key_type == DictionaryKeyType::Simple)
-            dictionary_layout_name = "hashed";
-        else
-            dictionary_layout_name = "complex_key_hashed";
-
-        if (sparse)
-            dictionary_layout_name = "sparse_" + dictionary_layout_name;
-
-        const std::string dictionary_layout_prefix = ".layout." + dictionary_layout_name;
-        const bool preallocate = config.getBool(config_prefix + dictionary_layout_prefix + ".preallocate", false);
-
-        HashedDictionaryStorageConfiguration configuration{preallocate, require_nonempty, dict_lifetime};
-
        if (dictionary_key_type == DictionaryKeyType::Simple)
        {
            if (sparse)
-                return std::make_unique<HashedDictionary<DictionaryKeyType::Simple, true>>(dict_id, dict_struct, std::move(source_ptr), configuration);
+                dictionary_layout_name = "sparse_hashed";
            else
-                return std::make_unique<HashedDictionary<DictionaryKeyType::Simple, false>>(dict_id, dict_struct, std::move(source_ptr), configuration);
+                dictionary_layout_name = "hashed";
        }
        else
        {
            if (sparse)
-                return std::make_unique<HashedDictionary<DictionaryKeyType::Complex, true>>(dict_id, dict_struct, std::move(source_ptr), configuration);
+                dictionary_layout_name = "complex_key_sparse_hashed";
            else
-                return std::make_unique<HashedDictionary<DictionaryKeyType::Complex, false>>(dict_id, dict_struct, std::move(source_ptr), configuration);
+                dictionary_layout_name = "complex_key_hashed";
+        }
+
+        const std::string dictionary_layout_prefix = ".layout." + dictionary_layout_name;
+        const bool preallocate = config.getBool(config_prefix + dictionary_layout_prefix + ".preallocate", false);
+
+        Int64 shards = config.getInt(config_prefix + dictionary_layout_prefix + ".shards", 1);
+        if (shards <= 0 || shards > 128)
+            throw Exception(ErrorCodes::BAD_ARGUMENTS,"{}: SHARDS parameter should be within [1, 128]", full_name);
+
+        Int64 shard_load_queue_backlog = config.getInt(config_prefix + dictionary_layout_prefix + ".shard_load_queue_backlog", 10000);
+        if (shard_load_queue_backlog <= 0)
+            throw Exception(ErrorCodes::BAD_ARGUMENTS,"{}: SHARD_LOAD_QUEUE_BACKLOG parameter should be greater then zero", full_name);
+
+        HashedDictionaryStorageConfiguration configuration{
+            preallocate,
+            static_cast<UInt64>(shards),
+            static_cast<UInt64>(shard_load_queue_backlog),
+            require_nonempty,
+            dict_lifetime,
+        };
+
+        if (source_ptr->hasUpdateField() && shards > 1)
+            throw Exception(ErrorCodes::BAD_ARGUMENTS,"{}: SHARDS parameter does not supports for updatable source (UPDATE_FIELD)", full_name);
+
+        if (dictionary_key_type == DictionaryKeyType::Simple)
+        {
+            if (sparse)
+            {
+                if (shards > 1)
+                    return std::make_unique<HashedDictionary<DictionaryKeyType::Simple, true, true>>(dict_id, dict_struct, std::move(source_ptr), configuration);
+                else
+                    return std::make_unique<HashedDictionary<DictionaryKeyType::Simple, true, false>>(dict_id, dict_struct, std::move(source_ptr), configuration);
+            }
+            else
+            {
+                if (shards > 1)
+                    return std::make_unique<HashedDictionary<DictionaryKeyType::Simple, false, true>>(dict_id, dict_struct, std::move(source_ptr), configuration);
+                else
+                    return std::make_unique<HashedDictionary<DictionaryKeyType::Simple, false, false>>(dict_id, dict_struct, std::move(source_ptr), configuration);
+            }
+        }
+        else
+        {
+            if (sparse)
+            {
+                if (shards > 1)
+                    return std::make_unique<HashedDictionary<DictionaryKeyType::Complex, true, true>>(dict_id, dict_struct, std::move(source_ptr), configuration);
+                else
+                    return std::make_unique<HashedDictionary<DictionaryKeyType::Complex, true, false>>(dict_id, dict_struct, std::move(source_ptr), configuration);
+            }
+            else
+            {
+                if (shards > 1)
+                    return std::make_unique<HashedDictionary<DictionaryKeyType::Complex, false, true>>(dict_id, dict_struct, std::move(source_ptr), configuration);
+                else
+                    return std::make_unique<HashedDictionary<DictionaryKeyType::Complex, false, false>>(dict_id, dict_struct, std::move(source_ptr), configuration);
+            }
        }
    };

--- a/src/Dictionaries/HashedDictionary.h
+++ b/src/Dictionaries/HashedDictionary.h
@ -27,13 +27,20 @@ namespace DB
 struct HashedDictionaryStorageConfiguration
 {
    const bool preallocate;
+    const UInt64 shards;
+    const UInt64 shard_load_queue_backlog;
    const bool require_nonempty;
    const DictionaryLifetime lifetime;
 };

-template <DictionaryKeyType dictionary_key_type, bool sparse>
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
+class ParallelDictionaryLoader;
+
+template <DictionaryKeyType dictionary_key_type, bool sparse, bool sharded>
 class HashedDictionary final : public IDictionary
 {
+    friend class ParallelDictionaryLoader<dictionary_key_type, sparse, sharded>;
+
 public:
    using KeyType = std::conditional_t<dictionary_key_type == DictionaryKeyType::Simple, UInt64, StringRef>;

@ -43,6 +50,7 @@ public:
        DictionarySourcePtr source_ptr_,
        const HashedDictionaryStorageConfiguration & configuration_,
        BlockPtr update_field_loaded_block_ = nullptr);
+    ~HashedDictionary() override;

    std::string getTypeName() const override
    {
@ -76,7 +84,12 @@ public:

    std::shared_ptr<const IExternalLoadable> clone() const override
    {
-        return std::make_shared<HashedDictionary<dictionary_key_type, sparse>>(getDictionaryID(), dict_struct, source_ptr->clone(), configuration, update_field_loaded_block);
+        return std::make_shared<HashedDictionary<dictionary_key_type, sparse, sharded>>(
+            getDictionaryID(),
+            dict_struct,
+            source_ptr->clone(),
+            configuration,
+            update_field_loaded_block);
    }

    DictionarySourcePtr getSource() const override { return source_ptr; }
@ -156,6 +169,9 @@ private:
    template <typename Value>
    using CollectionType = std::conditional_t<sparse, CollectionTypeSparse<Value>, CollectionTypeNonSparse<Value>>;

+    template <typename Value>
+    using CollectionsHolder = std::vector<CollectionType<Value>>;
+
    using NoAttributesCollectionType = std::conditional_t<sparse, NoAttributesCollectionTypeSparse, NoAttributesCollectionTypeNonSparse>;

    using NullableSet = HashSet<KeyType, DefaultHash<KeyType>>;
@ -166,36 +182,36 @@ private:
        std::optional<NullableSet> is_nullable_set;

        std::variant<
-            CollectionType<UInt8>,
-            CollectionType<UInt16>,
-            CollectionType<UInt32>,
-            CollectionType<UInt64>,
-            CollectionType<UInt128>,
-            CollectionType<UInt256>,
-            CollectionType<Int8>,
-            CollectionType<Int16>,
-            CollectionType<Int32>,
-            CollectionType<Int64>,
-            CollectionType<Int128>,
-            CollectionType<Int256>,
-            CollectionType<Decimal32>,
-            CollectionType<Decimal64>,
-            CollectionType<Decimal128>,
-            CollectionType<Decimal256>,
-            CollectionType<DateTime64>,
-            CollectionType<Float32>,
-            CollectionType<Float64>,
-            CollectionType<UUID>,
-            CollectionType<IPv4>,
-            CollectionType<IPv6>,
-            CollectionType<StringRef>,
-            CollectionType<Array>>
-            container;
+            CollectionsHolder<UInt8>,
+            CollectionsHolder<UInt16>,
+            CollectionsHolder<UInt32>,
+            CollectionsHolder<UInt64>,
+            CollectionsHolder<UInt128>,
+            CollectionsHolder<UInt256>,
+            CollectionsHolder<Int8>,
+            CollectionsHolder<Int16>,
+            CollectionsHolder<Int32>,
+            CollectionsHolder<Int64>,
+            CollectionsHolder<Int128>,
+            CollectionsHolder<Int256>,
+            CollectionsHolder<Decimal32>,
+            CollectionsHolder<Decimal64>,
+            CollectionsHolder<Decimal128>,
+            CollectionsHolder<Decimal256>,
+            CollectionsHolder<DateTime64>,
+            CollectionsHolder<Float32>,
+            CollectionsHolder<Float64>,
+            CollectionsHolder<UUID>,
+            CollectionsHolder<IPv4>,
+            CollectionsHolder<IPv6>,
+            CollectionsHolder<StringRef>,
+            CollectionsHolder<Array>>
+            containers;
    };

    void createAttributes();

-    void blockToAttributes(const Block & block);
+    void blockToAttributes(const Block & block, DictionaryKeysArenaHolder<dictionary_key_type> & arena_holder, UInt64 shard);

    void updateData();

@ -205,6 +221,22 @@ private:

    void calculateBytesAllocated();

+    UInt64 getShard(UInt64 key) const
+    {
+        if constexpr (!sharded)
+            return 0;
+        /// NOTE: the function used here should not match DefaultHash<>, since
+        /// the latter is used for the HashMap/sparse_hash_map.
+        return intHashCRC32(key) % configuration.shards;
+    }
+
+    UInt64 getShard(StringRef key) const
+    {
+        if constexpr (!sharded)
+            return 0;
+        return StringRefHash()(key) % configuration.shards;
+    }
+
    template <typename AttributeType, bool is_nullable, typename ValueSetter, typename DefaultValueExtractor>
    void getItemsImpl(
        const Attribute & attribute,
@ -220,6 +252,8 @@ private:

    void resize(size_t added_rows);

+    Poco::Logger * log;
+
    const DictionaryStructure dict_struct;
    const DictionarySourcePtr source_ptr;
    const HashedDictionaryStorageConfiguration configuration;
@ -228,21 +262,25 @@ private:

    size_t bytes_allocated = 0;
    size_t hierarchical_index_bytes_allocated = 0;
-    size_t element_count = 0;
+    std::atomic<size_t> element_count = 0;
    size_t bucket_count = 0;
    mutable std::atomic<size_t> query_count{0};
    mutable std::atomic<size_t> found_count{0};

    BlockPtr update_field_loaded_block;
-    Arena string_arena;
-    NoAttributesCollectionType no_attributes_container;
+    std::vector<std::unique_ptr<Arena>> string_arenas;
+    std::vector<NoAttributesCollectionType> no_attributes_containers;
    DictionaryHierarchicalParentToChildIndexPtr hierarchical_index;
 };

-extern template class HashedDictionary<DictionaryKeyType::Simple, false>;
-extern template class HashedDictionary<DictionaryKeyType::Simple, true>;
+extern template class HashedDictionary<DictionaryKeyType::Simple, /* sparse= */ false, /* sharded= */ false>;
+extern template class HashedDictionary<DictionaryKeyType::Simple, /* sparse= */ false, /* sharded= */ true>;
+extern template class HashedDictionary<DictionaryKeyType::Simple, /* sparse= */ true, /* sharded= */ false>;
+extern template class HashedDictionary<DictionaryKeyType::Simple, /* sparse= */ true, /* sharded= */ true>;

-extern template class HashedDictionary<DictionaryKeyType::Complex, false>;
-extern template class HashedDictionary<DictionaryKeyType::Complex, true>;
+extern template class HashedDictionary<DictionaryKeyType::Complex, /* sparse= */ false, /* sharded= */ false>;
+extern template class HashedDictionary<DictionaryKeyType::Complex, /* sparse= */ false, /* sharded= */ true>;
+extern template class HashedDictionary<DictionaryKeyType::Complex, /* sparse= */ true, /* sharded= */ false>;
+extern template class HashedDictionary<DictionaryKeyType::Complex, /* sparse= */ true, /* sharded= */ true>;

 }
--- a/src/Dictionaries/RegExpTreeDictionary.cpp
+++ b/src/Dictionaries/RegExpTreeDictionary.cpp
@ -243,13 +243,23 @@ void RegExpTreeDictionary::loadData()
            initRegexNodes(block);
        }
        initGraph();
+        if (regexps.empty())
+            throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "There are no available regular expression. Please check your config");
        #if USE_VECTORSCAN
-        std::vector<std::string_view> regexps_views(regexps.begin(), regexps.end());
-        hyperscan_regex = MultiRegexps::getOrSet<true, false>(regexps_views, std::nullopt);
-        /// TODO: fallback when exceptions occur.
-        hyperscan_regex->get();
+        try
+        {
+            std::vector<std::string_view> regexps_views(regexps.begin(), regexps.end());
+            hyperscan_regex = MultiRegexps::getOrSet<true, false>(regexps_views, std::nullopt);
+            hyperscan_regex->get();
+        }
+        catch (Exception & e)
+        {
+            /// Some compile errors are thrown as LOGICAL ERROR and would cause a crash, e.g. an empty expression or expressions that are too large.
+            /// We catch the error here and rethrow it as INCORRECT_DICTIONARY_DEFINITION.
+            /// TODO: fall back to another engine, like re2, when exceptions occur.
+            throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Error occurs when compiling regular expressions, reason: {}", e.message());
+        }
        #endif
-
    }
    else
    {
--- a/src/Disks/DiskEncrypted.cpp
+++ b/src/Disks/DiskEncrypted.cpp
@ -289,6 +289,12 @@ std::unique_ptr<ReadBufferFromFileBase> DiskEncrypted::readFile(
    std::optional<size_t> read_hint,
    std::optional<size_t> file_size) const
 {
+    if (read_hint && *read_hint > 0)
+        read_hint = *read_hint + FileEncryption::Header::kSize;
+
+    if (file_size && *file_size > 0)
+        file_size = *file_size + FileEncryption::Header::kSize;
+
    auto wrapped_path = wrappedPath(path);
    auto buffer = delegate->readFile(wrapped_path, settings, read_hint, file_size);
    if (buffer->eof())
--- a/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp
+++ b/src/Disks/IO/AsynchronousReadIndirectBufferFromRemoteFS.cpp
@ -171,8 +171,9 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
    if (!hasPendingDataToRead())
        return false;

-    size_t size, offset;
+    chassert(file_offset_of_buffer_end <= impl->getFileSize());

+    size_t size, offset;
    if (prefetch_future.valid())
    {
        ProfileEventTimeIncrement<Microseconds> watch(ProfileEvents::AsynchronousRemoteReadWaitMicroseconds);
@ -210,8 +211,8 @@ bool AsynchronousReadIndirectBufferFromRemoteFS::nextImpl()
    /// In case of multiple files for the same file in clickhouse (i.e. log family)
    /// file_offset_of_buffer_end will not match getImplementationBufferOffset()
    /// so we use [impl->getImplementationBufferOffset(), impl->getFileSize()]
-    assert(file_offset_of_buffer_end >= impl->getImplementationBufferOffset());
-    assert(file_offset_of_buffer_end <= impl->getFileSize());
+    chassert(file_offset_of_buffer_end >= impl->getImplementationBufferOffset());
+    chassert(file_offset_of_buffer_end <= impl->getFileSize());

    return bytes_read;
 }
@ -277,6 +278,15 @@ off_t AsynchronousReadIndirectBufferFromRemoteFS::seek(off_t offset, int whence)
    /// First reset the buffer so the next read will fetch new data to the buffer.
    resetWorkingBuffer();

+    if (read_until_position && new_pos > *read_until_position)
+    {
+        ProfileEvents::increment(ProfileEvents::RemoteFSSeeksWithReset);
+        impl->reset();
+
+        file_offset_of_buffer_end = new_pos = *read_until_position; /// read_until_position is a non-included boundary.
+        return new_pos;
+    }
+
    /**
    * Lazy ignore. Save number of bytes to ignore and ignore it either for prefetch buffer or current buffer.
    * Note: we read in range [file_offset_of_buffer_end, read_until_position).
--- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp
+++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp
@ -256,7 +256,7 @@ size_t ReadBufferFromRemoteFSGather::getFileSize() const
 String ReadBufferFromRemoteFSGather::getInfoForLog()
 {
    if (!current_buf)
-        throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot get info: buffer not initialized");
+        return "";

    return current_buf->getInfoForLog();
 }
--- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
+++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp
@ -125,14 +125,19 @@ std::string S3ObjectStorage::generateBlobNameForPath(const std::string & /* path
        getRandomASCIIString(key_name_total_size - key_name_prefix_size));
 }

-Aws::S3::Model::HeadObjectOutcome S3ObjectStorage::requestObjectHeadData(const std::string & bucket_from, const std::string & key) const
+size_t S3ObjectStorage::getObjectSize(const std::string & bucket_from, const std::string & key) const
 {
-    return S3::headObject(*client.get(), bucket_from, key, "", true);
+    return S3::getObjectSize(*client.get(), bucket_from, key, {}, /* for_disk_s3= */ true);
 }

 bool S3ObjectStorage::exists(const StoredObject & object) const
 {
-    return S3::objectExists(*client.get(), bucket, object.absolute_path, "", true);
+    return S3::objectExists(*client.get(), bucket, object.absolute_path, {}, /* for_disk_s3= */ true);
+}
+
+void S3ObjectStorage::checkObjectExists(const std::string & bucket_from, const std::string & key, std::string_view description) const
+{
+    return S3::checkObjectExists(*client.get(), bucket_from, key, {}, /* for_disk_s3= */ true, description);
 }

 std::unique_ptr<ReadBufferFromFileBase> S3ObjectStorage::readObjects( /// NOLINT
@ -409,13 +414,10 @@ ObjectMetadata S3ObjectStorage::getObjectMetadata(const std::string & path) cons
 {
    ObjectMetadata result;

-    auto object_head = requestObjectHeadData(bucket, path);
-    throwIfError(object_head);
-
-    auto & object_head_result = object_head.GetResult();
-    result.size_bytes = object_head_result.GetContentLength();
-    result.last_modified = object_head_result.GetLastModified().Millis();
-    result.attributes = object_head_result.GetMetadata();
+    auto object_info = S3::getObjectInfo(*client.get(), bucket, path, {}, /* for_disk_s3= */ true);
+    result.size_bytes = object_info.size;
+    result.last_modified = object_info.last_modification_time;
+    result.attributes = S3::getObjectMetadata(*client.get(), bucket, path, {}, /* for_disk_s3= */ true);

    return result;
 }
@ -442,7 +444,7 @@ void S3ObjectStorage::copyObjectImpl(
    const String & src_key,
    const String & dst_bucket,
    const String & dst_key,
-    std::optional<Aws::S3::Model::HeadObjectResult> head,
+    size_t size,
    std::optional<ObjectAttributes> metadata) const
 {
    auto client_ptr = client.get();
@ -464,7 +466,7 @@ void S3ObjectStorage::copyObjectImpl(
    if (!outcome.IsSuccess() && (outcome.GetError().GetExceptionName() == "EntityTooLarge"
            || outcome.GetError().GetExceptionName() == "InvalidRequest"))
    { // Can't come here with MinIO, MinIO allows single part upload for large objects.
-        copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, head, metadata);
+        copyObjectMultipartImpl(src_bucket, src_key, dst_bucket, dst_key, size, metadata);
        return;
    }

@ -472,12 +474,7 @@ void S3ObjectStorage::copyObjectImpl(

    auto settings_ptr = s3_settings.get();
    if (settings_ptr->request_settings.check_objects_after_upload)
-    {
-        auto object_head = requestObjectHeadData(dst_bucket, dst_key);
-        if (!object_head.IsSuccess())
-            throw Exception(ErrorCodes::S3_ERROR, "Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", dst_key, dst_bucket);
-    }
-
+        checkObjectExists(dst_bucket, dst_key, "Immediately after upload");
 }

 void S3ObjectStorage::copyObjectMultipartImpl(
@ -485,15 +482,11 @@ void S3ObjectStorage::copyObjectMultipartImpl(
    const String & src_key,
    const String & dst_bucket,
    const String & dst_key,
-    std::optional<Aws::S3::Model::HeadObjectResult> head,
+    size_t size,
    std::optional<ObjectAttributes> metadata) const
 {
-    if (!head)
-        head = requestObjectHeadData(src_bucket, src_key).GetResult();
-
    auto settings_ptr = s3_settings.get();
    auto client_ptr = client.get();
-    size_t size = head->GetContentLength();

    String multipart_upload_id;

@ -569,29 +562,24 @@ void S3ObjectStorage::copyObjectMultipartImpl(
    }

    if (settings_ptr->request_settings.check_objects_after_upload)
-    {
-        auto object_head = requestObjectHeadData(dst_bucket, dst_key);
-        if (!object_head.IsSuccess())
-            throw Exception(ErrorCodes::S3_ERROR, "Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", dst_key, dst_bucket);
-    }
-
+        checkObjectExists(dst_bucket, dst_key, "Immediately after upload");
 }

 void S3ObjectStorage::copyObject( // NOLINT
    const StoredObject & object_from, const StoredObject & object_to, std::optional<ObjectAttributes> object_to_attributes)
 {
-    auto head = requestObjectHeadData(bucket, object_from.absolute_path).GetResult();
+    auto size = getObjectSize(bucket, object_from.absolute_path);
    static constexpr int64_t multipart_upload_threashold = 5UL * 1024 * 1024 * 1024;

-    if (head.GetContentLength() >= multipart_upload_threashold)
+    if (size >= multipart_upload_threashold)
    {
        copyObjectMultipartImpl(
-            bucket, object_from.absolute_path, bucket, object_to.absolute_path, head, object_to_attributes);
+            bucket, object_from.absolute_path, bucket, object_to.absolute_path, size, object_to_attributes);
    }
    else
    {
        copyObjectImpl(
-            bucket, object_from.absolute_path, bucket, object_to.absolute_path, head, object_to_attributes);
+            bucket, object_from.absolute_path, bucket, object_to.absolute_path, size, object_to_attributes);
    }
 }

--- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h
+++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h
@ -172,7 +172,7 @@ private:
        const String & src_key,
        const String & dst_bucket,
        const String & dst_key,
-        std::optional<Aws::S3::Model::HeadObjectResult> head = std::nullopt,
+        size_t size,
        std::optional<ObjectAttributes> metadata = std::nullopt) const;

    void copyObjectMultipartImpl(
@ -180,13 +180,14 @@ private:
        const String & src_key,
        const String & dst_bucket,
        const String & dst_key,
-        std::optional<Aws::S3::Model::HeadObjectResult> head = std::nullopt,
+        size_t size,
        std::optional<ObjectAttributes> metadata = std::nullopt) const;

    void removeObjectImpl(const StoredObject & object, bool if_exists);
    void removeObjectsImpl(const StoredObjects & objects, bool if_exists);

-    Aws::S3::Model::HeadObjectOutcome requestObjectHeadData(const std::string & bucket_from, const std::string & key) const;
+    size_t getObjectSize(const std::string & bucket_from, const std::string & key) const;
+    void checkObjectExists(const std::string & bucket_from, const std::string & key, std::string_view description) const;

    std::string bucket;

--- a/src/Disks/tests/gtest_disk_encrypted.cpp
+++ b/src/Disks/tests/gtest_disk_encrypted.cpp
@ -55,9 +55,9 @@ protected:
        return temp_dir->path() + "/";
    }

-    String getFileContents(const String & file_name)
+    String getFileContents(const String & file_name, std::optional<size_t> file_size = {})
    {
-        auto buf = encrypted_disk->readFile(file_name, /* settings= */ {}, /* read_hint= */ {}, /* file_size= */ {});
+        auto buf = encrypted_disk->readFile(file_name, /* settings= */ {}, /* read_hint= */ {}, file_size);
        String str;
        readStringUntilEOF(str, *buf);
        return str;
@ -108,6 +108,10 @@ TEST_F(DiskEncryptedTest, WriteAndRead)
    EXPECT_EQ(getFileContents("a.txt"), "Some text");
    checkBinaryRepresentation(getDirectory() + "a.txt", kHeaderSize + 9);

+    /// Read the file with specified file size.
+    EXPECT_EQ(getFileContents("a.txt", 9), "Some text");
+    checkBinaryRepresentation(getDirectory() + "a.txt", kHeaderSize + 9);
+
    /// Remove the file.
    encrypted_disk->removeFile("a.txt");

--- a/src/Formats/FormatFactory.cpp
+++ b/src/Formats/FormatFactory.cpp
@ -90,7 +90,9 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings)
    format_settings.input_allow_errors_ratio = settings.input_format_allow_errors_ratio;
    format_settings.json.array_of_rows = settings.output_format_json_array_of_rows;
    format_settings.json.escape_forward_slashes = settings.output_format_json_escape_forward_slashes;
-    format_settings.json.named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
+    format_settings.json.write_named_tuples_as_objects = settings.output_format_json_named_tuples_as_objects;
+    format_settings.json.read_named_tuples_as_objects = settings.input_format_json_named_tuples_as_objects;
+    format_settings.json.defaults_for_missing_elements_in_named_tuple = settings.input_format_json_defaults_for_missing_elements_in_named_tuple;
    format_settings.json.quote_64bit_integers = settings.output_format_json_quote_64bit_integers;
    format_settings.json.quote_64bit_floats = settings.output_format_json_quote_64bit_floats;
    format_settings.json.quote_denormals = settings.output_format_json_quote_denormals;
--- a/src/Formats/FormatSettings.h
+++ b/src/Formats/FormatSettings.h
@ -153,7 +153,9 @@ struct FormatSettings
        bool quote_denormals = true;
        bool quote_decimals = false;
        bool escape_forward_slashes = true;
-        bool named_tuples_as_objects = false;
+        bool read_named_tuples_as_objects = false;
+        bool write_named_tuples_as_objects = false;
+        bool defaults_for_missing_elements_in_named_tuple = false;
        bool serialize_as_strings = false;
        bool read_bools_as_numbers = true;
        bool read_numbers_as_strings = true;
--- a/src/Functions/MatchImpl.h
+++ b/src/Functions/MatchImpl.h
@ -118,6 +118,16 @@ struct MatchImpl
        if (haystack_offsets.empty())
            return;

+        /// Shortcut for the silly but practical case that the pattern matches everything/nothing independently of the haystack:
+        /// - col [not] [i]like '%' / '%%'
+        /// - match(col, '.*') / match(col, '.*?')
+        /// In that case every element of the result is set to `!negate` and no matching is performed at all.
+        if ((is_like && (needle == "%" || needle == "%%")) || (!is_like && (needle == ".*" || needle == ".*?")))
+        {
+            for (auto & x : res)
+                x = !negate;
+            return;
+        }
+
        /// Special case that the [I]LIKE expression reduces to finding a substring in a string
        String strstr_pattern;
        if (is_like && impl::likePatternIsSubstring(needle, strstr_pattern))
@ -267,6 +277,16 @@ struct MatchImpl
        if (haystack.empty())
            return;

+        /// Shortcut for the silly but practical case that the pattern matches everything/nothing independently of the haystack:
+        /// - col [not] [i]like '%' / '%%'
+        /// - match(col, '.*') / match(col, '.*?')
+        /// In that case every element of the result is set to `!negate` and no matching is performed at all.
+        if ((is_like && (needle == "%" || needle == "%%")) || (!is_like && (needle == ".*" || needle == ".*?")))
+        {
+            for (auto & x : res)
+                x = !negate;
+            return;
+        }
+
        /// Special case that the [I]LIKE expression reduces to finding a substring in a string
        String strstr_pattern;
        if (is_like && impl::likePatternIsSubstring(needle, strstr_pattern))
--- a/src/IO/ReadBufferFromS3.cpp
+++ b/src/IO/ReadBufferFromS3.cpp
@ -250,7 +250,7 @@ size_t ReadBufferFromS3::getFileSize()
    if (file_size)
        return *file_size;

-    auto object_size = S3::getObjectSize(*client_ptr, bucket, key, version_id, true, read_settings.for_object_storage);
+    auto object_size = S3::getObjectSize(*client_ptr, bucket, key, version_id, /* for_disk_s3= */ read_settings.for_object_storage);

    file_size = object_size;
    return *file_size;
--- a/src/IO/ReadHelpers.cpp
+++ b/src/IO/ReadHelpers.cpp
@ -39,8 +39,8 @@ namespace ErrorCodes
    extern const int BAD_ARGUMENTS;
 }

-template <typename IteratorSrc, typename IteratorDst>
-void parseHex(IteratorSrc src, IteratorDst dst, const size_t num_bytes)
+template <size_t num_bytes, typename IteratorSrc, typename IteratorDst>
+inline void parseHex(IteratorSrc src, IteratorDst dst)
 {
    size_t src_pos = 0;
    size_t dst_pos = 0;
@ -52,18 +52,18 @@ void parseUUID(const UInt8 * src36, UInt8 * dst16)
 {
    /// If string is not like UUID - implementation specific behaviour.

-    parseHex(&src36[0], &dst16[0], 4);
-    parseHex(&src36[9], &dst16[4], 2);
-    parseHex(&src36[14], &dst16[6], 2);
-    parseHex(&src36[19], &dst16[8], 2);
-    parseHex(&src36[24], &dst16[10], 6);
+    parseHex<4>(&src36[0], &dst16[0]);
+    parseHex<2>(&src36[9], &dst16[4]);
+    parseHex<2>(&src36[14], &dst16[6]);
+    parseHex<2>(&src36[19], &dst16[8]);
+    parseHex<6>(&src36[24], &dst16[10]);
 }

 void parseUUIDWithoutSeparator(const UInt8 * src36, UInt8 * dst16)
 {
    /// If string is not like UUID - implementation specific behaviour.

-    parseHex(&src36[0], &dst16[0], 16);
+    parseHex<16>(&src36[0], &dst16[0]);
 }

 /** Function used when byte ordering is important when parsing uuid
@ -74,11 +74,11 @@ void parseUUID(const UInt8 * src36, std::reverse_iterator<UInt8 *> dst16)
    /// If string is not like UUID - implementation specific behaviour.

    /// FIXME This code looks like trash.
-    parseHex(&src36[0], dst16 + 8, 4);
-    parseHex(&src36[9], dst16 + 12, 2);
-    parseHex(&src36[14], dst16 + 14, 2);
-    parseHex(&src36[19], dst16, 2);
-    parseHex(&src36[24], dst16 + 2, 6);
+    parseHex<4>(&src36[0], dst16 + 8);
+    parseHex<2>(&src36[9], dst16 + 12);
+    parseHex<2>(&src36[14], dst16 + 14);
+    parseHex<2>(&src36[19], dst16);
+    parseHex<6>(&src36[24], dst16 + 2);
 }

 /** Function used when byte ordering is important when parsing uuid
@ -88,8 +88,8 @@ void parseUUIDWithoutSeparator(const UInt8 * src36, std::reverse_iterator<UInt8
 {
    /// If string is not like UUID - implementation specific behaviour.

-    parseHex(&src36[0], dst16 + 8, 8);
-    parseHex(&src36[16], dst16, 8);
+    parseHex<8>(&src36[0], dst16 + 8);
+    parseHex<8>(&src36[16], dst16);
 }

 void NO_INLINE throwAtAssertionFailed(const char * s, ReadBuffer & buf)
--- a/src/IO/S3Common.cpp
+++ b/src/IO/S3Common.cpp
@ -27,6 +27,8 @@
 #    include <aws/core/utils/UUID.h>
 #    include <aws/core/http/HttpClientFactory.h>
 #    include <aws/s3/S3Client.h>
+#    include <aws/s3/model/GetObjectAttributesRequest.h>
+#    include <aws/s3/model/GetObjectRequest.h>
 #    include <aws/s3/model/HeadObjectRequest.h>

 #    include <IO/S3/PocoHTTPClientFactory.h>
@ -40,7 +42,11 @@

 namespace ProfileEvents
 {
+    extern const Event S3GetObjectAttributes;
+    extern const Event S3GetObjectMetadata;
    extern const Event S3HeadObject;
+    extern const Event DiskS3GetObjectAttributes;
+    extern const Event DiskS3GetObjectMetadata;
    extern const Event DiskS3HeadObject;
 }

@ -699,6 +705,92 @@ public:
    }
 };

+/// Extracts the endpoint from a constructed S3 client.
+/// Returns an empty string if the client's endpoint provider is not an `S3DefaultEpProviderBase`.
+String getEndpoint(const Aws::S3::S3Client & client)
+{
+    /// The client is taken by const reference, so constness is cast away just to call accessEndpointProvider()
+    /// (presumably that accessor is non-const in the SDK - TODO confirm). The provider itself is only read
+    /// through a pointer-to-const below.
+    const auto * endpoint_provider = dynamic_cast<const Aws::S3::Endpoint::S3DefaultEpProviderBase *>(const_cast<Aws::S3::S3Client &>(client).accessEndpointProvider().get());
+    if (!endpoint_provider)
+        return {};
+    String endpoint;
+    /// NOTE(review): the result of GetString() is ignored, so `endpoint` presumably stays empty
+    /// when the "Endpoint" parameter is absent or is not a string - confirm against the SDK.
+    endpoint_provider->GetBuiltInParameters().GetParameter("Endpoint").GetString(endpoint);
+    return endpoint;
+}
+
+/// Performs a request to get the size and last modification time of an object.
+/// The function performs either HeadObject or GetObjectAttributes request depending on the endpoint.
+/// Returns a pair: on success the first element holds the object info and the second one is a default-constructed error;
+/// on failure the first element is std::nullopt and the second one is the error reported by the request.
+/// If `for_disk_s3` is set, the corresponding DiskS3* profile event is incremented in addition to the S3* one.
+/// An empty `version_id` means that no version is set on the request.
+std::pair<std::optional<DB::S3::ObjectInfo>, Aws::S3::S3Error> tryGetObjectInfo(
+    const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3)
+{
+    auto endpoint = getEndpoint(client);
+    /// The endpoint is classified by a plain substring search for ".amazonaws.com".
+    /// If getEndpoint() returned an empty string, the search fails and `HeadObject` is used.
+    bool use_get_object_attributes_request = (endpoint.find(".amazonaws.com") != String::npos);
+
+    if (use_get_object_attributes_request)
+    {
+        /// It's better not to use `HeadObject` requests for AWS S3 because they don't work well with the global region.
+        /// Details: `HeadObject` request never returns a response body (even if there is an error) however
+        /// if the request was sent without specifying a region in the endpoint (i.e. for example "https://test.s3.amazonaws.com/mydata.csv"
+        /// instead of "https://test.s3-us-west-2.amazonaws.com/mydata.csv") then that response body is one of the main ways
+        /// to determine the correct region and try to repeat the request again with the correct region.
+        /// For any other request type (`GetObject`, `ListObjects`, etc.) AWS SDK does that because they have response bodies,
+        /// but for `HeadObject` there is no response body so this way doesn't work. That's why we use `GetObjectAttributes` request instead.
+        /// See https://github.com/aws/aws-sdk-cpp/issues/1558 and also the function S3ErrorMarshaller::ExtractRegion() for more information.
+
+        ProfileEvents::increment(ProfileEvents::S3GetObjectAttributes);
+        if (for_disk_s3)
+            ProfileEvents::increment(ProfileEvents::DiskS3GetObjectAttributes);
+
+        Aws::S3::Model::GetObjectAttributesRequest req;
+        req.SetBucket(bucket);
+        req.SetKey(key);
+
+        if (!version_id.empty())
+            req.SetVersionId(version_id);
+
+        /// Only the object size is requested as an attribute; the last modification time is read from the same result below.
+        req.SetObjectAttributes({Aws::S3::Model::ObjectAttributes::ObjectSize});
+
+        auto outcome = client.GetObjectAttributes(req);
+        if (outcome.IsSuccess())
+        {
+            const auto & result = outcome.GetResult();
+            DB::S3::ObjectInfo object_info;
+            object_info.size = static_cast<size_t>(result.GetObjectSize());
+            /// Milliseconds are converted to whole seconds.
+            object_info.last_modification_time = result.GetLastModified().Millis() / 1000;
+            return {object_info, {}};
+        }
+
+        return {std::nullopt, outcome.GetError()};
+    }
+    else
+    {
+        /// By default we use `HeadObject` requests.
+        /// We cannot just use `GetObjectAttributes` requests always because some S3 providers (e.g. Minio)
+        /// don't support `GetObjectAttributes` requests.
+
+        ProfileEvents::increment(ProfileEvents::S3HeadObject);
+        if (for_disk_s3)
+            ProfileEvents::increment(ProfileEvents::DiskS3HeadObject);
+
+        Aws::S3::Model::HeadObjectRequest req;
+        req.SetBucket(bucket);
+        req.SetKey(key);
+
+        if (!version_id.empty())
+            req.SetVersionId(version_id);
+
+        auto outcome = client.HeadObject(req);
+        if (outcome.IsSuccess())
+        {
+            const auto & result = outcome.GetResult();
+            DB::S3::ObjectInfo object_info;
+            /// Here the size comes from the content length of the response.
+            object_info.size = static_cast<size_t>(result.GetContentLength());
+            /// Milliseconds are converted to whole seconds.
+            object_info.last_modification_time = result.GetLastModified().Millis() / 1000;
+            return {object_info, {}};
+        }
+
+        return {std::nullopt, outcome.GetError()};
+    }
+}
+
 }


@ -894,54 +986,33 @@ namespace S3
        return error == Aws::S3::S3Errors::RESOURCE_NOT_FOUND || error == Aws::S3::S3Errors::NO_SUCH_KEY;
    }

-    Aws::S3::Model::HeadObjectOutcome headObject(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3)
+    ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, bool throw_on_error)
    {
-        ProfileEvents::increment(ProfileEvents::S3HeadObject);
-        if (for_disk_s3)
-            ProfileEvents::increment(ProfileEvents::DiskS3HeadObject);
-
-        Aws::S3::Model::HeadObjectRequest req;
-        req.SetBucket(bucket);
-        req.SetKey(key);
-
-        if (!version_id.empty())
-            req.SetVersionId(version_id);
-
-        return client.HeadObject(req);
-    }
-
-    S3::ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool throw_on_error, bool for_disk_s3)
-    {
-        auto outcome = headObject(client, bucket, key, version_id, for_disk_s3);
-
-        if (outcome.IsSuccess())
+        auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, for_disk_s3);
+        if (object_info)
        {
-            auto read_result = outcome.GetResultWithOwnership();
-            return {.size = static_cast<size_t>(read_result.GetContentLength()), .last_modification_time = read_result.GetLastModified().Millis() / 1000};
+            return *object_info;
        }
        else if (throw_on_error)
        {
-            const auto & error = outcome.GetError();
            throw DB::Exception(ErrorCodes::S3_ERROR,
-                "Failed to HEAD object: {}. HTTP response code: {}",
+                "Failed to get object attributes: {}. HTTP response code: {}",
                error.GetMessage(), static_cast<size_t>(error.GetResponseCode()));
        }
        return {};
    }

-    size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool throw_on_error, bool for_disk_s3)
+    size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, bool throw_on_error)
    {
-        return getObjectInfo(client, bucket, key, version_id, throw_on_error, for_disk_s3).size;
+        return getObjectInfo(client, bucket, key, version_id, for_disk_s3, throw_on_error).size;
    }

    bool objectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3)
    {
-        auto outcome = headObject(client, bucket, key, version_id, for_disk_s3);
-
-        if (outcome.IsSuccess())
+        auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, for_disk_s3);
+        if (object_info)
            return true;

-        const auto & error = outcome.GetError();
        if (isNotFoundError(error.GetErrorType()))
            return false;

@ -949,6 +1020,48 @@ namespace S3
            "Failed to check existence of key {} in bucket {}: {}",
            key, bucket, error.GetMessage());
    }
+
+    /// Checks that the object can be found in the bucket and returns silently if it can.
+    /// Otherwise throws S3Exception carrying the S3 error type of the failed request.
+    /// A non-empty `description` is put in front of the error message, followed by ": ".
+    /// If `for_disk_s3` is set, the request is additionally counted in the DiskS3* profile events.
+    void checkObjectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, std::string_view description)
+    {
+        auto [object_info, error] = tryGetObjectInfo(client, bucket, key, version_id, for_disk_s3);
+        if (object_info)
+            return;
+
+        const String prefix = description.empty() ? String{} : (String(description) + ": ");
+
+        /// Only claim that the object disappeared if S3 really reported it as missing.
+        if (isNotFoundError(error.GetErrorType()))
+            throw S3Exception(error.GetErrorType(), "{}Object {} in bucket {} suddenly disappeared: {}",
+                              prefix, key, bucket, error.GetMessage());
+
+        /// Any other failure (e.g. no permission, network problem) says nothing about the object's existence,
+        /// so don't report it as a disappearance - that would send the investigation in the wrong direction.
+        throw S3Exception(error.GetErrorType(), "{}Failed to check existence of object {} in bucket {}: {}",
+                          prefix, key, bucket, error.GetMessage());
+    }
+
+    /// Returns the metadata of an object, obtained with a ranged `GetObject` request.
+    /// On failure either throws S3Exception carrying the S3 error type (if `throw_on_error` is set)
+    /// or returns an empty map, which then cannot be told apart from an object without metadata.
+    /// If `for_disk_s3` is set, the DiskS3GetObjectMetadata profile event is incremented as well.
+    std::map<String, String> getObjectMetadata(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool for_disk_s3, bool throw_on_error)
+    {
+        ProfileEvents::increment(ProfileEvents::S3GetObjectMetadata);
+        if (for_disk_s3)
+            ProfileEvents::increment(ProfileEvents::DiskS3GetObjectMetadata);
+
+        /// We must not use the `HeadObject` request, see the comment about `HeadObjectRequest` in S3Common.h.
+
+        Aws::S3::Model::GetObjectRequest req;
+        req.SetBucket(bucket);
+        req.SetKey(key);
+
+        /// Only the first byte will be read.
+        /// We don't need that first byte but the range should be set otherwise the entire object will be read.
+        /// NOTE(review): for a zero-length object the range "bytes=0-0" may be rejected by the server
+        /// as not satisfiable, which would surface here as an error - confirm and handle if needed.
+        req.SetRange("bytes=0-0");
+
+        /// An empty `version_id` means that no version is set on the request.
+        if (!version_id.empty())
+            req.SetVersionId(version_id);
+
+        auto outcome = client.GetObject(req);
+
+        if (outcome.IsSuccess())
+            return outcome.GetResult().GetMetadata();
+
+        if (!throw_on_error)
+            return {};
+
+        const auto & error = outcome.GetError();
+        throw S3Exception(error.GetErrorType(),
+            "Failed to get metadata of key {} in bucket {}: {}",
+            key, bucket, error.GetMessage());
+    }
 }

 }
--- a/src/IO/S3Common.h
+++ b/src/IO/S3Common.h
@ -11,15 +11,15 @@
 #if USE_AWS_S3

 #include <base/types.h>
-#include <aws/core/Aws.h>
-#include <aws/core/client/ClientConfiguration.h>
-#include <aws/s3/S3Client.h>
-#include <aws/s3/S3Errors.h>
-#include <Poco/URI.h>
-
 #include <Common/Exception.h>
 #include <Common/Throttler_fwd.h>

+#include <Poco/URI.h>
+#include <aws/core/Aws.h>
+#include <aws/s3/S3Errors.h>
+
+
+namespace Aws::S3 { class S3Client; }

 namespace DB
 {
@ -121,22 +121,29 @@ struct URI
    static void validateBucket(const String & bucket, const Poco::URI & uri);
 };

+/// WARNING: Don't use `HeadObjectRequest`! Use the functions below instead.
+/// For explanation see the comment about `HeadObject` request in the function tryGetObjectInfo().
+
 struct ObjectInfo
 {
    size_t size = 0;
    time_t last_modification_time = 0;
 };

-bool isNotFoundError(Aws::S3::S3Errors error);
+ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, bool throw_on_error = true);

-Aws::S3::Model::HeadObjectOutcome headObject(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false);
-
-S3::ObjectInfo getObjectInfo(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool throw_on_error, bool for_disk_s3);
-
-size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id, bool throw_on_error, bool for_disk_s3);
+size_t getObjectSize(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, bool throw_on_error = true);

 bool objectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false);

+/// Throws an exception if a specified object doesn't exist. `description` is used as a part of the error message.
+void checkObjectExists(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, std::string_view description = {});
+
+bool isNotFoundError(Aws::S3::S3Errors error);
+
+/// Returns the object's metadata.
+std::map<String, String> getObjectMetadata(const Aws::S3::S3Client & client, const String & bucket, const String & key, const String & version_id = "", bool for_disk_s3 = false, bool throw_on_error = true);
+
 }
 #endif

--- a/src/IO/WriteBufferFromS3.cpp
+++ b/src/IO/WriteBufferFromS3.cpp
@ -182,12 +182,8 @@ void WriteBufferFromS3::finalizeImpl()
    if (check_objects_after_upload)
    {
        LOG_TRACE(log, "Checking object {} exists after upload", key);
-
-        auto response = S3::headObject(*client_ptr, bucket, key, "", write_settings.for_object_storage);
-        if (!response.IsSuccess())
-            throw S3Exception(fmt::format("Object {} from bucket {} disappeared immediately after upload, it's a bug in S3 or S3 API.", key, bucket), response.GetError().GetErrorType());
-        else
-            LOG_TRACE(log, "Object {} exists after upload", key);
+        S3::checkObjectExists(*client_ptr, bucket, key, {}, /* for_disk_s3= */ write_settings.for_object_storage, "Immediately after upload");
+        LOG_TRACE(log, "Object {} exists after upload", key);
    }
 }

--- a/Show More
+++ b/Show More